コード例 #1
0
    def extract_post(self, json_string_dict, user_pkey=None):
        """Build an lbsn.Post record from a single Twitter status dict.

        Context records found along the way (the user record, the record
        returned by ``self.extract_place`` and the mentioned-user records)
        are appended to ``self.lbsn_records``. The post record itself is
        returned to the caller and is not appended here.

        Args:
            json_string_dict: Decoded JSON dict of one tweet. Keys read
                include ``id_str``, ``user``, ``coordinates``, ``place``,
                ``source``, ``created_at``, ``lang``, ``truncated``,
                ``extended_tweet``, ``entities`` and ``extended_entities``.
            user_pkey: Optional key object with an ``id`` attribute; used
                to create the lbsn.User record when the dict has no
                ``user`` entry.

        Returns:
            The populated lbsn.Post record, or ``None`` when the post is
            skipped. A post is skipped when:

            * ``HF.check_notice_empty_post_guid`` rejects the post guid,
            * no geo information is available and
              ``self.ignore_non_geotagged`` is ``True``,
            * ``HF.geoacc_within_threshold`` rejects the geoaccuracy for
              ``self.min_geoaccuracy`` (counted in
              ``self.skipped_low_geoaccuracy``),
            * the cleaned source is in ``self.ignore_sources_set``
              (counted in ``self.skipped_ignore_list``).
        """
        post_guid = json_string_dict.get('id_str')
        if not HF.check_notice_empty_post_guid(post_guid):
            # All other exits return a single value; a (None, None) tuple
            # would be truthy for callers that test the result.
            return None
        post_record = HF.new_lbsn_record_with_id(
            lbsn.Post(), post_guid, self.origin)

        user_record = None
        user_info = json_string_dict.get('user')
        if user_info:
            # Get lbsn.Post/Reaction Details of lbsn.User
            user_record = self.extract_user(user_info)
        elif user_pkey:
            # userPkey is already available for posts that are statuses
            user_record = HF.new_lbsn_record_with_id(
                lbsn.User(), user_pkey.id, self.origin)
        if user_record:
            self.lbsn_records.append(user_record)
        else:
            self.log.warning(f'Record {self.lbsn_records.count_glob_total}: '
                             f'No lbsn.User record found for post: {post_guid} '
                             f'(post saved without userid)..')
            print(f'Record {self.lbsn_records.count_glob_total}', end='\r')

        # Exact coordinates: index 0 is read as longitude, index 1 as
        # latitude
        post_coordinates = json_string_dict.get('coordinates')
        if post_coordinates:
            lng_lat = post_coordinates.get('coordinates')
            l_lng, l_lat = lng_lat[0], lng_lat[1]
            post_record.post_geoaccuracy = lbsn.Post.LATLNG
            post_record.post_latlng = f"POINT({l_lng} {l_lat})"

        # Check if lbsn.Place is mentioned
        post_place_json = json_string_dict.get('place')
        if post_place_json:
            # extract_place needs information from the post record
            # (current geoaccuracy, user language) and in turn yields
            # values that update the post record, so both directions are
            # handled here
            user_lang = user_record.user_language if user_record else None
            place_record, post_geoacc, post_country = self.extract_place(
                post_place_json, post_record.post_geoaccuracy, user_lang)
            if not post_record.post_geoaccuracy:
                post_record.post_geoaccuracy = post_geoacc
            self.lbsn_records.append(place_record)
            if post_country:
                post_record.country_pkey.CopyFrom(post_country.pkey)
            # either city or place, Twitter user cannot attach both (?)
            if isinstance(place_record, lbsn.City):
                post_record.city_pkey.CopyFrom(place_record.pkey)
            elif isinstance(place_record, lbsn.Place):
                post_record.place_pkey.CopyFrom(place_record.pkey)
            # substitute post lat/lng from the place record if not
            # already set
            if not post_record.post_latlng:
                # Note: this will also substitute lbsn.Country lat/lng in
                # post; this information is also available by query of
                # country_guid in posts. Use input arg min_geoaccuracy to
                # exclude country geo-posts
                post_record.post_latlng = place_record.geom_center

        # if still no geoinformation, send post to Null-Island
        if not post_record.post_latlng:
            if self.ignore_non_geotagged is True:
                return None
            self.null_island += 1
            post_record.post_latlng = "POINT(0 0)"
        if self.min_geoaccuracy and not HF.geoacc_within_threshold(
                post_record.post_geoaccuracy, self.min_geoaccuracy):
            self.skipped_low_geoaccuracy += 1
            return None

        # Process attributes of twitter post
        post_source = json_string_dict.get('source')
        if post_source:
            post_record.input_source = HF.cleanhtml(post_source)
            if self.ignore_sources_set and \
                    post_record.input_source in self.ignore_sources_set:
                # skip entry if in ignore list
                self.skipped_ignore_list += 1
                return None
        post_record.post_publish_date.CopyFrom(
            HF.json_date_string_to_proto(json_string_dict.get('created_at')))
        if user_record:
            post_record.user_pkey.CopyFrom(user_record.pkey)
        post_record.post_quote_count = HF.value_count(
            json_string_dict.get('quote_count'))
        post_record.post_comment_count = HF.value_count(
            json_string_dict.get('reply_count'))
        post_record.post_share_count = HF.value_count(
            json_string_dict.get('retweet_count'))
        post_record.post_like_count = HF.value_count(
            json_string_dict.get('favorite_count'))
        post_record.post_url = f'https://twitter.com/statuses/{post_guid}'
        language_str = json_string_dict.get('lang')
        if language_str:
            post_language = lbsn.Language()
            post_language.language_short = language_str
            post_record.post_language.CopyFrom(post_language)

        # If the "truncated" field is set and an "extended_tweet" object
        # is available, the extended object becomes the source for the
        # text and for all entity data read below
        is_truncated = json_string_dict.get('truncated')
        if is_truncated and 'extended_tweet' in json_string_dict:
            json_string_dict = json_string_dict.get('extended_tweet')
            post_record.post_body = json_string_dict.get('full_text')
        elif 'full_text' in json_string_dict:
            post_record.post_body = json_string_dict.get('full_text')
        else:
            post_record.post_body = json_string_dict.get('text')

        # entities section is expected to exist and includes meta
        # information such as hashtags or user_mentions; fall back to an
        # empty dict so a missing section does not raise
        entities_json = json_string_dict.get('entities') or {}
        # extract hashtags
        hashtags_json = entities_json.get('hashtags')
        if hashtags_json:
            post_record.hashtags.extend(
                [hashtag.get("text") for hashtag in hashtags_json])
        # Look for mentioned userRecords
        user_mentions_json = entities_json.get('user_mentions')
        if user_mentions_json:
            ref_user_records = HF.get_mentioned_users(
                user_mentions_json, self.origin)
            self.lbsn_records.append(ref_user_records)
            post_record.user_mentions_pkey.extend(
                [user_ref.pkey for user_ref in ref_user_records])
            # relations need the id of the posting user; skip them when
            # the post was saved without a user record
            if self.map_full_relations and user_record:
                self.extract_mentioned_users(
                    ref_user_records, user_record.pkey.id)

        # sometimes, extended_entities section exists and includes
        # additional information on media, but never hashtags or
        # user_mentions. Since the media type metadata in the
        # extended_entities section correctly indicates the media type
        # ('photo', 'video' or 'animated_gif'), and supports up to
        # 4 photos, it is the preferred metadata source for native media.
        # See:
        # https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/extended-entities-object.html#extended-entities-object
        if 'extended_entities' in json_string_dict:
            entities_json = json_string_dict.get('extended_entities') or {}
        media_json = entities_json.get('media')
        if media_json:
            post_record.post_type = HF.assign_media_post_type(media_json)
        else:
            post_record.post_type = lbsn.Post.TEXT
        post_record.emoji.extend(HF.extract_emoji(post_record.post_body))
        return post_record
    def extract_post(
        self, json_string_dict: Dict[str, Any], place_record: lbsn.Place = None):
        """Build an lbsn.Post record from an Instagram media dict and store it.

        On success the post record is appended to ``self.lbsn_records``,
        after the user record if one could be extracted. Nothing is
        returned; every exit yields ``None``.

        Args:
            json_string_dict: Decoded JSON dict of one post. Keys read
                include ``id``, ``owner``, ``taken_at_timestamp``,
                ``edge_media_to_comment``, ``edge_liked_by``,
                ``shortcode``, ``thumbnail_src``,
                ``edge_media_to_caption``, ``is_video`` and
                ``video_view_count``.
            place_record: Optional place record from upstream. When given,
                the post takes its place key and ``geom_center`` from it
                and gets ``lbsn.Post.PLACE`` geoaccuracy.

        Returns:
            None. The post is skipped (not appended) when:

            * ``HF.check_notice_empty_post_guid`` rejects the post guid,
            * no geo information is available and
              ``self.ignore_non_geotagged`` is ``True``,
            * ``HF.geoacc_within_threshold`` rejects the geoaccuracy for
              ``self.min_geoaccuracy`` (counted in
              ``self.skipped_low_geoaccuracy``).
        """
        post_guid = json_string_dict.get('id')
        if not HF.check_notice_empty_post_guid(post_guid):
            return None
        post_record = HF.new_lbsn_record_with_id(
            lbsn.Post(), post_guid, self.origin)

        user_record = None
        user_info = json_string_dict.get('owner')
        if user_info:
            # Get Post/Reaction Details of User
            user_record = self.extract_user(user_info)
        if user_record:
            self.lbsn_records.append(user_record)
        else:
            self.log.warning(
                f'No User record found for post: {post_guid} '
                f'(post saved without userid)..')

        # Check from upstream to update post attrs
        if place_record:
            # assign place accuracy, by default
            post_record.post_geoaccuracy = lbsn.Post.PLACE
            post_record.place_pkey.CopyFrom(place_record.pkey)
            post_record.post_latlng = place_record.geom_center
        else:
            # NOTE(review): if lbsn.Post is a protobuf message, assigning
            # None to a scalar/enum field may raise TypeError -- confirm
            # and consider ClearField('post_geoaccuracy') instead
            post_record.post_geoaccuracy = None

        # if still no geoinformation, send post to Null-Island
        if not post_record.post_latlng:
            if self.ignore_non_geotagged is True:
                return None
            self.null_island += 1
            post_record.post_latlng = "POINT(0 0)"
        if self.min_geoaccuracy and not HF.geoacc_within_threshold(
                post_record.post_geoaccuracy, self.min_geoaccuracy):
            self.skipped_low_geoaccuracy += 1
            return None

        post_record.post_publish_date.CopyFrom(
            HF.json_date_timestamp_to_proto(
                json_string_dict.get('taken_at_timestamp')))
        if user_record:
            post_record.user_pkey.CopyFrom(user_record.pkey)

        def value_count(x):
            """Return 0 for a missing (None) count, else the count."""
            return 0 if x is None else x

        def edge_count(edge_key):
            """Return the 'count' of an edge object, 0 if edge is absent."""
            # a missing or null edge must not raise; treat it as count 0
            edge = json_string_dict.get(edge_key) or {}
            return value_count(edge.get('count'))

        post_record.post_comment_count = edge_count('edge_media_to_comment')
        post_record.post_like_count = edge_count('edge_liked_by')
        post_shortcode = json_string_dict.get('shortcode')
        post_record.post_url = f'http://www.instagram.com/p/{post_shortcode}'
        thumbnail_src = json_string_dict.get("thumbnail_src")
        if thumbnail_src:
            post_record.post_thumbnail_url = thumbnail_src

        # the caption text is taken from the first caption edge, if any
        post_caption_edge = json_string_dict.get('edge_media_to_caption')
        if post_caption_edge:
            post_caption_edge_edges = post_caption_edge.get("edges")
            if post_caption_edge_edges:
                post_caption = post_caption_edge_edges[0]["node"]["text"]
                # flatten line breaks in the stored body; hashtags are
                # extracted from the unmodified caption
                post_record.post_body = post_caption.replace(
                    '\n', ' ').replace('\r', '')
                hashtags = HF.extract_hashtags_from_string(post_caption)
                if hashtags:
                    post_record.hashtags.extend(hashtags)

        if json_string_dict.get('is_video'):
            post_record.post_type = lbsn.Post.VIDEO
            post_record.post_views_count = value_count(
                json_string_dict.get('video_view_count'))
        else:
            post_record.post_type = lbsn.Post.IMAGE
        post_record.emoji.extend(HF.extract_emoji(post_record.post_body))
        self.lbsn_records.append(post_record)