Example #1
 def parse_user_item(self, items):
     # Build a User item for each (user id -> raw user data) pair.
     for k, v in items.items():
         # assert k == v['id_str'], (k,v)
         user = User()
         user['id_'] = k
         user['raw_data'] = v
         yield user
Example #2
 def parse_user_item(self, items):
     # Determine the account's user id from the user information.
     for k, v in items.items():
         # assert k == v['id_str'], (k,v)
         user = User()
         user['id_'] = k
         user['raw_data'] = v
         daten = user['raw_data']
         benutzername = daten['screen_name']
         # `unternehmensname` (the company's screen name) must be defined in the surrounding scope.
         if benutzername == unternehmensname:
             self.benutzerId = daten['id_str']
             # A generator cannot return a value to its caller, so store the
             # id on the spider and stop iterating instead.
             return
         yield user
Example #3
    def parse_tweet_item(self, items):
        for item in items:
            try:
                tweet = self.get_tweet(item)
                yield tweet

                if self.crawl_user:
                    ### get user info
                    user = User()
                    user['ID'] = tweet['user_id']
                    user['name'] = item.xpath('.//@data-name').extract()[0]
                    user['screen_name'] = item.xpath('.//@data-screen-name').extract()[0]
                    user['avatar'] = \
                        item.xpath('.//div[@class="content"]/div[@class="stream-item-header"]/a/img/@src').extract()[0]
                    yield user
            except Exception:
                logger.error("Error tweet:\n%s" % item.xpath('.').extract()[0])
Example #4
    def parse_user_item(self, user_items):
        for _, user_item in user_items.items():
            user = User()
            user["created_at"] = datetime.strptime(
                user_item["created_at"],
                "%a %b %d %H:%M:%S %z %Y",
            )
            user["collected_at"] = datetime.now()

            user["user_id"] = user_item["id"]
            user["user_profile_url"] = f'https://twitter.com/{user_item["screen_name"]}'
            user["user_name"] = user_item["screen_name"]
            user["user_screen_name"] = user_item["name"]
            user["user_bio"] = user_item["description"]
            user["user_followers"] = user_item["followers_count"]
            user["user_following"] = user_item["friends_count"]
            user["user_listed"] = user_item["listed_count"]

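            # Resolve the free-text profile location to country/state/city
            # via two Google Places API requests (find-place, then details).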
            try:
                location_str = user_item["location"]

                response = requests.get(
                    f"https://maps.googleapis.com/maps/api/place/findplacefromtext/json?key={self.googleMap_api_key}&input={location_str}&inputtype=textquery&language=en&fields=place_id"
                )
                place_id = response.json()["candidates"][0]["place_id"]
                response = requests.get(
                    f"https://maps.googleapis.com/maps/api/place/details/json?key={self.googleMap_api_key}&place_id={place_id}&language=en"
                )
                place_details = response.json()["result"]["address_components"]

                for item in place_details:
                    long_name = item["long_name"]
                    if item["types"][0] == "country":
                        user["country"] = long_name
                    elif item["types"][0] == "administrative_area_level_1":
                        user["state"] = long_name
                    elif item["types"][0] == "administrative_area_level_2":
                        user["city"] = long_name
            except IndexError:
                user["country"] = None
                user["state"] = None
                user["city"] = None

            yield user
Example #5
    def parse_profile_page(self, response):
        # The Scrapy response argument is ignored; the profile page is re-fetched directly.
        response = urlopen(self.profile_url)
        page, result = self.check_if_account_protected(response)
        temp = []
        temp_list = []

        if not result:
            # data = response.read().decode("utf-8")
            # page = Selector(text=data)
            items_labels = page.xpath(
                '//span[@class="ProfileNav-label"]/text()')
            items = page.xpath('//span/@data-count')

            for i in range(0, len(items_labels) - 1):
                temp.append(items_labels[i].extract())

            for i in range(0, len(items)):
                temp_list.append(items[i].extract())

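            # Only continue if the profile reports a follower count above 100,000.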
            if "Followers" in temp:
                index = temp.index("Followers")
                number = int(temp_list[index])

                if number > 100000:
                    user = User()
                    user['ID'] = page.xpath(
                        './/div/@data-user-id').extract()[0]
                    user['name'] = page.xpath('.//div/@data-name').extract()[0]
                    user['screen_name'] = page.xpath(
                        './/div/@data-screen-name').extract()[0]

                    for i in range(0, len(items_labels) - 1):
                        label = items_labels[i].extract()
                        if label != "Lists" and label != "Moments":
                            user[label] = items.extract()[i]
                    yield user

                    url = self.url % (quote(self.query), '')
                    yield http.Request(url, callback=self.parse_page)
                    self.scrap_following()
Example #6
    def parse_tweet_item(self, items):
        logger.debug(f'items received {items}')
        tw_xpaden = {
            'ID': './/@data-tweet-id',
            'user_id': './/@data-user-id',
            'text': './/div[@class="js-tweet-text-container"]/p',
            'url': './/@data-permalink-path',
            'timestamp':
            './/div[@class="stream-item-header"]/small[@class="time"]/a/span/@data-time',
            'retweets':
            './/span[contains(@class, "ProfileTweet-action--retweet")]//@data-tweet-stat-count',
            'favorites':
            './/span[contains(@class, "ProfileTweet-action--favorite")]//@data-tweet-stat-count',
            'replies':
            './/span[contains(@class, "ProfileTweet-action--reply")]//@data-tweet-stat-count',
            # Relative XPath so the conversation id comes from this item, not the whole page.
            'conversation_id': './/@data-conversation-id',
        }

        user_xpaden = {
            'ID': './/@data-user-id',
            'screenname': './/@data-screen-name',
            'name': './/@data-name'
        }

        for item in items:
            tweet = ItemLoader(Tweet(), item)
            for keys, values in tw_xpaden.items():
                tweet.add_xpath(keys, values)
            # 'lang' is a plain value rather than an XPath, so add it directly.
            tweet.add_value('lang', self.lang)
            yield tweet.load_item()

            if self.crawl_user:
                user = ItemLoader(User(), item)
                for keys, values in user_xpaden.items():
                    user.add_xpath(keys, values)
                yield user.load_item()
        logger.info('page fully parsed for tweets')
Example #7
    def parse_tweet_item(self, items):
        for item in items:
            try:
                tweet = Tweet()

                tweet['usernameTweet'] = item.xpath('.//span[@class="username u-dir"]/b/text()').extract()[0]

                ID = item.xpath('.//@data-tweet-id').extract()
                if not ID:
                    continue
                tweet['ID'] = ID[0]

                ### get text content
                tweet['text'] = ' '.join(
                    item.xpath('.//div[@class="js-tweet-text-container"]/p//text()').extract()
                ).replace(' # ', '#').replace(' @ ', '@')
                if tweet['text'] == '':
                    # If there is no text, we ignore the tweet
                    continue

                ### get meta data
                tweet['url'] = item.xpath('.//@data-permalink-path').extract()[0]

                nbr_retweet = item.css('span.ProfileTweet-action--retweet > span.ProfileTweet-actionCount').xpath(
                    '@data-tweet-stat-count').extract()
                if nbr_retweet:
                    tweet['nbr_retweet'] = int(nbr_retweet[0])
                else:
                    tweet['nbr_retweet'] = 0

                nbr_favorite = item.css('span.ProfileTweet-action--favorite > span.ProfileTweet-actionCount').xpath(
                    '@data-tweet-stat-count').extract()
                if nbr_favorite:
                    tweet['nbr_favorite'] = int(nbr_favorite[0])
                else:
                    tweet['nbr_favorite'] = 0

                nbr_reply = item.css('span.ProfileTweet-action--reply > span.ProfileTweet-actionCount').xpath(
                    '@data-tweet-stat-count').extract()
                if nbr_reply:
                    tweet['nbr_reply'] = int(nbr_reply[0])
                else:
                    tweet['nbr_reply'] = 0

                tweet['datetime'] = datetime.fromtimestamp(int(
                    item.xpath('.//div[@class="stream-item-header"]/small[@class="time"]/a/span/@data-time').extract()[
                        0])).strftime('%Y-%m-%d %H:%M:%S')

                ### get photo
                has_cards = item.xpath('.//@data-card-type').extract()
                if has_cards and has_cards[0] == 'photo':
                    tweet['has_image'] = True
                    tweet['images'] = item.xpath('.//*/div/@data-image-url').extract()
                elif has_cards:
                    logger.debug('Unhandled "data-card-type":\n%s' % item.xpath('.').extract()[0])

                ### get animated_gif
                has_cards = item.xpath('.//@data-card2-type').extract()
                if has_cards:
                    if has_cards[0] == 'animated_gif':
                        tweet['has_video'] = True
                        tweet['videos'] = item.xpath('.//*/source/@video-src').extract()
                    elif has_cards[0] in ('player', 'summary_large_image', 'amplify', 'summary'):
                        tweet['has_media'] = True
                        tweet['medias'] = item.xpath('.//*/div/@data-card-url').extract()
                    elif has_cards[0] == '__entity_video':
                        pass  # TODO
                        # tweet['has_media'] = True
                        # tweet['medias'] = item.xpath('.//*/div/@data-src').extract()
                    else:  # there are many other types of card2
                        logger.debug('Unhandled "data-card2-type":\n%s' % item.xpath('.').extract()[0])

                is_reply = item.xpath('.//div[@class="ReplyingToContextBelowAuthor"]').extract()
                tweet['is_reply'] = is_reply != []

                is_retweet = item.xpath('.//span[@class="js-retweet-text"]').extract()
                tweet['is_retweet'] = is_retweet != []

                tweet['user_id'] = item.xpath('.//@data-user-id').extract()[0]
                yield tweet

                if self.crawl_user:
                    ### get user info
                    user = User()
                    user['ID'] = tweet['user_id']
                    user['name'] = item.xpath('.//@data-name').extract()[0]
                    user['screen_name'] = item.xpath('.//@data-screen-name').extract()[0]
                    user['avatar'] = \
                        item.xpath('.//div[@class="content"]/div[@class="stream-item-header"]/a/img/@src').extract()[0]
                    yield user
            except Exception:
                logger.error("Error tweet:\n%s" % item.xpath('.').extract()[0])
Example #8
    def parse_tweet_item(self, items):
        for item in items:
            try:
                tweet = Tweet()
                tweet['usernameTweet'] = item.xpath(
                    './/span[@class="username u-dir"]/b/text()').extract()[0]
                ID = item.xpath('.//@data-tweet-id').extract()
                if not ID:
                    continue
                tweet['ID'] = ID[0]

                tweet['text'] = ' '.join(
                    item.xpath('.//div[@class="js-tweet-text-container"]/p//text()').extract()
                ).replace(' # ', '#').replace(' @ ', '@')
                logger.debug(tweet['text'])
                if tweet['text'] == '':
                    continue
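                # Collect tweet metadata: permalink, retweet/favorite/reply counts, timestamp.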
                tweet['url'] = item.xpath(
                    './/@data-permalink-path').extract()[0]
                nbr_retweet = item.xpath(
                    './/button[@data-modal="ProfileTweet-retweet"]/span/span/text()'
                ).extract()
                if nbr_retweet:
                    tweet['nbr_retweet'] = int(nbr_retweet[0])
                else:
                    tweet['nbr_retweet'] = 0

                nbr_favorite = item.xpath(
                    './/button[@class="ProfileTweet-actionButton js-actionButton js-actionFavorite"]/span/span/text()'
                ).extract()
                if nbr_favorite:
                    tweet['nbr_favorite'] = int(nbr_favorite[0])
                else:
                    tweet['nbr_favorite'] = 0

                nbr_reply = item.xpath(
                    './/button[@class="ProfileTweet-actionButton js-actionButton js-actionReply"]/span/span/text()'
                ).extract()
                if nbr_reply:
                    tweet['nbr_reply'] = int(nbr_reply[0])
                else:
                    tweet['nbr_reply'] = 0

                tweet['datetime'] = datetime.fromtimestamp(
                    int(
                        item.xpath(
                            './/div[@class="stream-item-header"]/small[@class="time"]/a/span/@data-time'
                        ).extract()[0])).strftime('%Y-%m-%d %H:%M:%S')
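                # Inspect card metadata to classify attached media (photo, gif, video player, link summary).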
                has_cards = item.xpath('.//@data-card-type').extract()
                if has_cards and has_cards[0] == 'photo':
                    tweet['has_image'] = True
                    tweet['images'] = item.xpath(
                        './/*/div/@data-image-url').extract()
                elif has_cards:
                    logger.debug('Unhandled "data-card-type":\n%s' %
                                 item.xpath('.').extract()[0])

                has_cards = item.xpath('.//@data-card2-type').extract()

                if has_cards:
                    if has_cards[0] == 'animated_gif':
                        tweet['has_video'] = True
                        tweet['videos'] = item.xpath(
                            './/*/source/@video-src').extract()
                    elif has_cards[0] in ('player', 'summary_large_image',
                                          'amplify', 'summary'):
                        tweet['has_media'] = True
                        tweet['medias'] = item.xpath(
                            './/*/div/@data-card-url').extract()
                    elif has_cards[0] == '__entity_video':
                        pass
                    else:
                        logger.debug('Unhandled "data-card2-type":\n%s' %
                                     item.xpath('.').extract()[0])
                is_reply = item.xpath(
                    './/div[@class="ReplyingToContextBelowAuthor"]').extract()
                tweet['is_reply'] = is_reply != []

                is_retweet = item.xpath(
                    './/span[@class="js-retweet-text"]').extract()
                tweet['is_retweet'] = is_retweet != []

                tweet['user_id'] = item.xpath('.//@data-user-id').extract()[0]
                yield tweet

                if self.crawl_user:
                    # get user info
                    user = User()

                    user['ID'] = tweet['user_id']
                    user['name'] = item.xpath('.//@data-name').extract()[0]
                    user['screen_name'] = item.xpath(
                        './/@data-screen-name').extract()[0]
                    user['avatar'] = \
                        item.xpath('.//div[@class="content"]/div[@class="stream-item-header"]/a/img/@src').extract()[0]
                    yield user
            except Exception:
                logger.error("Error tweet:\n%s" % item.xpath('.').extract()[0])