def parse_user_item(self, items):
    # items maps user id_str -> raw user JSON; wrap each entry in a User item.
    for k, v in items.items():
        # assert k == v['id_str'], (k, v)
        user = User()
        user['id_'] = k
        user['raw_data'] = v
        yield user
def parse_user_item(self, items):
    # The user ID is determined from the user information.
    for k, v in items.items():
        # assert k == v['id_str'], (k, v)
        user = User()
        user['id_'] = k
        user['raw_data'] = v
        daten = user['raw_data']
        benutzername = daten['screen_name']
        if benutzername == unternehmensname:
            self.benutzerId = daten['id_str']
            # This is a generator, so a returned value would be discarded;
            # the matched ID is kept on self.benutzerId and iteration stops here.
            return
        yield user
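# A minimal sketch of the same company-account lookup as a plain helper,
# assuming the items mapping and the target screen name are passed in
# explicitly (the name `find_user_id` and its arguments are hypothetical).
# Unlike the generator above, a plain function can return the matched ID
# directly to its caller.
def find_user_id(items, target_screen_name):
    """Return the id_str of the user whose screen_name matches, or None."""
    for _, data in items.items():
        if data.get('screen_name') == target_screen_name:
            return data.get('id_str')
    return None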
def parse_tweet_item(self, items):
    for item in items:
        try:
            tweet = self.get_tweet(item)
            yield tweet
            if self.crawl_user:
                ### get user info
                user = User()
                user['ID'] = tweet['user_id']
                user['name'] = item.xpath('.//@data-name').extract()[0]
                user['screen_name'] = item.xpath('.//@data-screen-name').extract()[0]
                user['avatar'] = \
                    item.xpath('.//div[@class="content"]/div[@class="stream-item-header"]/a/img/@src').extract()[0]
                yield user
        except Exception:
            logger.error("Error tweet:\n%s" % item.xpath('.').extract()[0])
def parse_user_item(self, user_items):
    for _, user_item in user_items.items():
        user = User()
        user["created_at"] = datetime.strptime(
            user_item["created_at"], "%a %b %d %H:%M:%S %z %Y",
        )
        user["collected_at"] = datetime.now()
        user["user_id"] = user_item["id"]
        user["user_profile_url"] = f'https://twitter.com/{user_item["screen_name"]}'
        user["user_name"] = user_item["screen_name"]
        user["user_screen_name"] = user_item["name"]
        user["user_bio"] = user_item["description"]
        user["user_followers"] = user_item["followers_count"]
        user["user_following"] = user_item["friends_count"]
        user["user_listed"] = user_item["listed_count"]
        try:
            # Resolve the free-text profile location to country/state/city via
            # the Google Places API. Passing the query through `params` lets
            # requests URL-encode the location string safely.
            location_str = user_item["location"]
            response = requests.get(
                "https://maps.googleapis.com/maps/api/place/findplacefromtext/json",
                params={
                    "key": self.googleMap_api_key,
                    "input": location_str,
                    "inputtype": "textquery",
                    "language": "en",
                    "fields": "place_id",
                },
            )
            place_id = response.json()["candidates"][0]["place_id"]
            response = requests.get(
                "https://maps.googleapis.com/maps/api/place/details/json",
                params={
                    "key": self.googleMap_api_key,
                    "place_id": place_id,
                    "language": "en",
                },
            )
            place_details = response.json()["result"]["address_components"]
            for item in place_details:
                long_name = item["long_name"]
                if item["types"][0] == "country":
                    user["country"] = long_name
                elif item["types"][0] == "administrative_area_level_1":
                    user["state"] = long_name
                elif item["types"][0] == "administrative_area_level_2":
                    user["city"] = long_name
        except IndexError:
            # No geocoding candidate was found for the location string.
            user["country"] = None
            user["state"] = None
            user["city"] = None
        yield user
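# A minimal standalone sketch of the two-step Google Places lookup used
# above, assuming a valid Maps API key; the endpoints and the
# country/state/city mapping mirror the parser, while the helper name
# `resolve_location` is hypothetical.
import requests

def resolve_location(api_key, location_str):
    """Map a free-text location to a {'country', 'state', 'city'} dict (values may stay None)."""
    result = {'country': None, 'state': None, 'city': None}
    found = requests.get(
        "https://maps.googleapis.com/maps/api/place/findplacefromtext/json",
        params={"key": api_key, "input": location_str,
                "inputtype": "textquery", "language": "en", "fields": "place_id"},
    ).json()
    candidates = found.get("candidates", [])
    if not candidates:
        return result  # nothing matched the free-text location
    details = requests.get(
        "https://maps.googleapis.com/maps/api/place/details/json",
        params={"key": api_key, "place_id": candidates[0]["place_id"], "language": "en"},
    ).json()
    for component in details["result"]["address_components"]:
        kind = component["types"][0]
        if kind == "country":
            result['country'] = component["long_name"]
        elif kind == "administrative_area_level_1":
            result['state'] = component["long_name"]
        elif kind == "administrative_area_level_2":
            result['city'] = component["long_name"]
    return result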
def parse_profile_page(self, response):
    response = urlopen(self.profile_url)
    page, result = self.check_if_account_protected(response)
    temp = []
    temp_list = []
    if not result:
        # Collect the ProfileNav labels and their data-count values.
        items_labels = page.xpath('//span[@class="ProfileNav-label"]/text()')
        items = page.xpath('//span/@data-count')
        for i in range(0, len(items_labels) - 1):
            temp.append(items_labels[i].extract())
        for i in range(0, len(items)):
            temp_list.append(items[i].extract())
        if "Followers" in temp:
            index = temp.index("Followers")
            number = int(temp_list[index])
            # Only emit accounts with more than 100,000 followers.
            if number > 100000:
                user = User()
                user['ID'] = page.xpath('.//div/@data-user-id').extract()[0]
                user['name'] = page.xpath('.//div/@data-name').extract()[0]
                user['screen_name'] = page.xpath('.//div/@data-screen-name').extract()[0]
                for i in range(0, len(items_labels) - 1):
                    label = items_labels[i].extract()
                    if label != "Lists" and label != "Moments":
                        user[label] = items.extract()[i]
                yield user
    url = self.url % (quote(self.query), '')
    yield http.Request(url, callback=self.parse_page)
    self.scrap_following()
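# A minimal sketch of the label/count pairing built with temp/temp_list
# above, assuming the ProfileNav labels and data-count values line up
# index-for-index in the markup (the helper name `nav_counts` is
# hypothetical).
def nav_counts(labels, counts):
    """Zip ProfileNav labels ('Tweets', 'Following', 'Followers', ...) with their integer counts."""
    return {label: int(count) for label, count in zip(labels, counts)}

# e.g. nav_counts(["Tweets", "Followers"], ["120", "250000"])["Followers"] == 250000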
def parse_tweet_item(self, items):
    logger.debug(f'items received: {items}')
    tw_xpaden = {
        'ID': './/@data-tweet-id',
        'user_id': './/@data-user-id',
        'text': './/div[@class="js-tweet-text-container"]/p',
        'url': './/@data-permalink-path',
        'timestamp': './/div[@class="stream-item-header"]/small[@class="time"]/a/span/@data-time',
        'retweets': './/span[contains(@class, "ProfileTweet-action--retweet")]//@data-tweet-stat-count',
        'favorites': './/span[contains(@class, "ProfileTweet-action--favorite")]//@data-tweet-stat-count',
        'replies': './/span[contains(@class, "ProfileTweet-action--reply")]//@data-tweet-stat-count',
        'conversation_id': '//@data-conversation-id',
    }
    user_xpaden = {
        'ID': './/@data-user-id',
        'screenname': './/@data-screen-name',
        'name': './/@data-name',
    }
    for item in items:
        tweet = ItemLoader(Tweet(), item)
        for key, xpath in tw_xpaden.items():
            tweet.add_xpath(key, xpath)
        # 'lang' is a literal value, not an XPath expression, so it must be
        # added with add_value rather than add_xpath.
        tweet.add_value('lang', self.lang)
        yield tweet.load_item()
        if self.crawl_user:
            user = ItemLoader(User(), item)
            for key, xpath in user_xpaden.items():
                user.add_xpath(key, xpath)
            yield user.load_item()
    logger.info('page fully parsed for tweets')
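# A self-contained sketch of the add_xpath / add_value split used above,
# run against a tiny HTML fragment. It relies on Scrapy's Selector and
# ItemLoader APIs; `_loader_demo` is a hypothetical name, and Tweet is
# this project's item class.
from scrapy.selector import Selector
from scrapy.loader import ItemLoader

def _loader_demo():
    sel = Selector(text='<div data-tweet-id="42"></div>')
    loader = ItemLoader(Tweet(), sel)
    loader.add_xpath('ID', '//@data-tweet-id')  # value extracted from the markup
    loader.add_value('lang', 'en')              # literal value, no XPath needed
    return loader.load_item()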
def parse_tweet_item(self, items):
    for item in items:
        try:
            tweet = Tweet()

            tweet['usernameTweet'] = item.xpath('.//span[@class="username u-dir"]/b/text()').extract()[0]

            ID = item.xpath('.//@data-tweet-id').extract()
            if not ID:
                continue
            tweet['ID'] = ID[0]

            ### get text content
            tweet['text'] = ' '.join(
                item.xpath('.//div[@class="js-tweet-text-container"]/p//text()').extract()
            ).replace(' # ', '#').replace(' @ ', '@')
            if tweet['text'] == '':
                # If there is no text, we ignore the tweet
                continue

            ### get meta data
            tweet['url'] = item.xpath('.//@data-permalink-path').extract()[0]

            nbr_retweet = item.css('span.ProfileTweet-action--retweet > span.ProfileTweet-actionCount').xpath(
                '@data-tweet-stat-count').extract()
            tweet['nbr_retweet'] = int(nbr_retweet[0]) if nbr_retweet else 0

            nbr_favorite = item.css('span.ProfileTweet-action--favorite > span.ProfileTweet-actionCount').xpath(
                '@data-tweet-stat-count').extract()
            tweet['nbr_favorite'] = int(nbr_favorite[0]) if nbr_favorite else 0

            nbr_reply = item.css('span.ProfileTweet-action--reply > span.ProfileTweet-actionCount').xpath(
                '@data-tweet-stat-count').extract()
            tweet['nbr_reply'] = int(nbr_reply[0]) if nbr_reply else 0

            tweet['datetime'] = datetime.fromtimestamp(int(
                item.xpath('.//div[@class="stream-item-header"]/small[@class="time"]/a/span/@data-time').extract()[0]
            )).strftime('%Y-%m-%d %H:%M:%S')

            ### get photo
            has_cards = item.xpath('.//@data-card-type').extract()
            if has_cards and has_cards[0] == 'photo':
                tweet['has_image'] = True
                tweet['images'] = item.xpath('.//*/div/@data-image-url').extract()
            elif has_cards:
                logger.debug('Not handled "data-card-type":\n%s' % item.xpath('.').extract()[0])

            ### get animated gif / video / other card2 media
            has_cards = item.xpath('.//@data-card2-type').extract()
            if has_cards:
                if has_cards[0] == 'animated_gif':
                    tweet['has_video'] = True
                    tweet['videos'] = item.xpath('.//*/source/@video-src').extract()
                elif has_cards[0] in ('player', 'summary_large_image', 'amplify', 'summary'):
                    tweet['has_media'] = True
                    tweet['medias'] = item.xpath('.//*/div/@data-card-url').extract()
                elif has_cards[0] == '__entity_video':
                    pass  # TODO
                    # tweet['has_media'] = True
                    # tweet['medias'] = item.xpath('.//*/div/@data-src').extract()
                else:
                    # there are many other types of card2
                    logger.debug('Not handled "data-card2-type":\n%s' % item.xpath('.').extract()[0])

            is_reply = item.xpath('.//div[@class="ReplyingToContextBelowAuthor"]').extract()
            tweet['is_reply'] = is_reply != []

            is_retweet = item.xpath('.//span[@class="js-retweet-text"]').extract()
            tweet['is_retweet'] = is_retweet != []

            tweet['user_id'] = item.xpath('.//@data-user-id').extract()[0]
            yield tweet

            if self.crawl_user:
                ### get user info
                user = User()
                user['ID'] = tweet['user_id']
                user['name'] = item.xpath('.//@data-name').extract()[0]
                user['screen_name'] = item.xpath('.//@data-screen-name').extract()[0]
                user['avatar'] = \
                    item.xpath('.//div[@class="content"]/div[@class="stream-item-header"]/a/img/@src').extract()[0]
                yield user
        except Exception:
            logger.error("Error tweet:\n%s" % item.xpath('.').extract()[0])
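# The three retweet/favorite/reply blocks above share one pattern; a
# minimal sketch of that pattern as a helper (the name `action_count` is
# hypothetical, the CSS/XPath mirrors the parser):
def action_count(item, action):
    """Return the ProfileTweet stat count for 'retweet', 'favorite' or 'reply', defaulting to 0."""
    values = item.css(
        'span.ProfileTweet-action--%s > span.ProfileTweet-actionCount' % action
    ).xpath('@data-tweet-stat-count').extract()
    return int(values[0]) if values else 0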
def parse_tweet_item(self, items):
    for item in items:
        try:
            tweet = Tweet()

            tweet['usernameTweet'] = item.xpath(
                './/span[@class="username u-dir"]/b/text()').extract()[0]

            ID = item.xpath('.//@data-tweet-id').extract()
            if not ID:
                continue
            tweet['ID'] = ID[0]

            tweet['text'] = ' '.join(
                item.xpath('.//div[@class="js-tweet-text-container"]/p//text()').extract()
            ).replace(' # ', '#').replace(' @ ', '@')
            logger.debug(tweet['text'])
            if tweet['text'] == '':
                continue

            tweet['url'] = item.xpath('.//@data-permalink-path').extract()[0]

            nbr_retweet = item.xpath(
                './/button[@data-modal="ProfileTweet-retweet"]/span/span/text()').extract()
            tweet['nbr_retweet'] = int(nbr_retweet[0]) if nbr_retweet else 0

            nbr_favorite = item.xpath(
                './/button[@class="ProfileTweet-actionButton js-actionButton js-actionFavorite"]/span/span/text()'
            ).extract()
            tweet['nbr_favorite'] = int(nbr_favorite[0]) if nbr_favorite else 0

            nbr_reply = item.xpath(
                './/button[@class="ProfileTweet-actionButton js-actionButton js-actionReply"]/span/span/text()'
            ).extract()
            tweet['nbr_reply'] = int(nbr_reply[0]) if nbr_reply else 0

            tweet['datetime'] = datetime.fromtimestamp(int(
                item.xpath(
                    './/div[@class="stream-item-header"]/small[@class="time"]/a/span/@data-time'
                ).extract()[0]
            )).strftime('%Y-%m-%d %H:%M:%S')

            has_cards = item.xpath('.//@data-card-type').extract()
            if has_cards and has_cards[0] == 'photo':
                tweet['has_image'] = True
                tweet['images'] = item.xpath('.//*/div/@data-image-url').extract()
            elif has_cards:
                logger.debug('Not handled "data-card-type":\n%s' % item.xpath('.').extract()[0])

            has_cards = item.xpath('.//@data-card2-type').extract()
            if has_cards:
                if has_cards[0] == 'animated_gif':
                    tweet['has_video'] = True
                    tweet['videos'] = item.xpath('.//*/source/@video-src').extract()
                elif has_cards[0] in ('player', 'summary_large_image', 'amplify', 'summary'):
                    tweet['has_media'] = True
                    tweet['medias'] = item.xpath('.//*/div/@data-card-url').extract()
                elif has_cards[0] == '__entity_video':
                    pass
                else:
                    logger.debug('error: "data-card2-type":\n%s' % item.xpath('.').extract()[0])

            is_reply = item.xpath('.//div[@class="ReplyingToContextBelowAuthor"]').extract()
            tweet['is_reply'] = is_reply != []

            is_retweet = item.xpath('.//span[@class="js-retweet-text"]').extract()
            tweet['is_retweet'] = is_retweet != []

            tweet['user_id'] = item.xpath('.//@data-user-id').extract()[0]
            yield tweet

            if self.crawl_user:
                # get user info
                user = User()
                user['ID'] = tweet['user_id']
                user['name'] = item.xpath('.//@data-name').extract()[0]
                user['screen_name'] = item.xpath('.//@data-screen-name').extract()[0]
                user['avatar'] = \
                    item.xpath('.//div[@class="content"]/div[@class="stream-item-header"]/a/img/@src').extract()[0]
                yield user
        except Exception:
            logger.error("error:\n%s" % item.xpath('.').extract()[0])