def extract_post(self, json_string_dict, user_pkey=None):
    """Map one Twitter status JSON object to an lbsn.Post record.

    Context records found along the way (the author's lbsn.User, the
    record returned by ``self.extract_place`` and any mentioned users)
    are appended to ``self.lbsn_records``; only the post itself is
    returned.

    Args:
        json_string_dict: Parsed JSON dict of a single status.
        user_pkey: Optional key of an already known author; used to
            build a minimal lbsn.User when the status carries no
            ``user`` object.

    Returns:
        The populated lbsn.Post, or ``None`` when the post is skipped
        (empty guid, no geo information while
        ``self.ignore_non_geotagged`` is True, geoaccuracy below
        ``self.min_geoaccuracy``, or source in
        ``self.ignore_sources_set``).
    """
    post_guid = json_string_dict.get('id_str')
    if not HF.check_notice_empty_post_guid(post_guid):
        # A single None, like every other skip path below; a
        # (None, None) tuple would be truthy.
        return None
    post_record = HF.new_lbsn_record_with_id(
        lbsn.Post(), post_guid, self.origin)

    # --- author ---------------------------------------------------------
    user_record = None
    user_info = json_string_dict.get('user')
    if user_info:
        # Get lbsn.Post/Reaction details of lbsn.User
        user_record = self.extract_user(user_info)
    elif user_pkey:
        # user_pkey is already available for posts that are statuses
        user_record = HF.new_lbsn_record_with_id(
            lbsn.User(), user_pkey.id, self.origin)
    if user_record:
        self.lbsn_records.append(user_record)
    else:
        self.log.warning(f'Record {self.lbsn_records.count_glob_total}: '
                         f'No lbsn.User record found for post: {post_guid} '
                         f'(post saved without userid)..')
        print(f'Record {self.lbsn_records.count_glob_total}', end='\r')

    # --- geo information ------------------------------------------------
    post_coordinates = json_string_dict.get('coordinates')
    if post_coordinates:
        # the first two entries are used as (lng, lat), in that order
        lng_lat = post_coordinates.get('coordinates')
        l_lng = lng_lat[0]
        l_lat = lng_lat[1]
        post_record.post_geoaccuracy = lbsn.Post.LATLNG
        post_record.post_latlng = "POINT(%s %s)" % (l_lng, l_lat)

    # Check if lbsn.Place is mentioned
    post_place_json = json_string_dict.get('place')
    if post_place_json:
        # extract_place needs some information from the post record
        # (user language, geoaccuracy) and in turn yields values that
        # update the post record (geoaccuracy, country, lat/lng)
        user_lang = user_record.user_language if user_record else None
        place_record, post_geoacc, post_country = self.extract_place(
            post_place_json, post_record.post_geoaccuracy, user_lang)
        if not post_record.post_geoaccuracy:
            post_record.post_geoaccuracy = post_geoacc
        self.lbsn_records.append(place_record)
        if post_country:
            post_record.country_pkey.CopyFrom(post_country.pkey)
        # either city or place, Twitter user cannot attach both (?)
        if isinstance(place_record, lbsn.City):
            post_record.city_pkey.CopyFrom(place_record.pkey)
        elif isinstance(place_record, lbsn.Place):
            post_record.place_pkey.CopyFrom(place_record.pkey)
        if not post_record.post_latlng:
            # Substitute the place's center if the post has no lat/lng.
            # Note: this will also substitute lbsn.Country lat/lng in
            # the post; use min_geoaccuracy to exclude country geo-posts
            post_record.post_latlng = place_record.geom_center

    # if still no geoinformation, send post to Null-Island
    if not post_record.post_latlng:
        if self.ignore_non_geotagged is True:
            return None
        self.null_island += 1
        post_record.post_latlng = "POINT(%s %s)" % (0, 0)
    if self.min_geoaccuracy and not HF.geoacc_within_threshold(
            post_record.post_geoaccuracy, self.min_geoaccuracy):
        self.skipped_low_geoaccuracy += 1
        return None

    # --- plain attributes of the twitter post ---------------------------
    post_source = json_string_dict.get('source')
    if post_source:
        post_record.input_source = HF.cleanhtml(post_source)
        if self.ignore_sources_set and \
                post_record.input_source in self.ignore_sources_set:
            # skip entry if in ignore list
            self.skipped_ignore_list += 1
            return None
    post_record.post_publish_date.CopyFrom(
        HF.json_date_string_to_proto(json_string_dict.get('created_at')))
    if user_record:
        post_record.user_pkey.CopyFrom(user_record.pkey)
    post_record.post_quote_count = HF.value_count(
        json_string_dict.get('quote_count'))
    post_record.post_comment_count = HF.value_count(
        json_string_dict.get('reply_count'))
    post_record.post_share_count = HF.value_count(
        json_string_dict.get('retweet_count'))
    post_record.post_like_count = HF.value_count(
        json_string_dict.get('favorite_count'))
    post_record.post_url = f'https://twitter.com/statuses/{post_guid}'
    language_str = json_string_dict.get('lang')
    if language_str:
        post_language = lbsn.Language()
        post_language.language_short = language_str
        post_record.post_language.CopyFrom(post_language)

    # --- text and entities ----------------------------------------------
    # If the status is truncated and an "extended_tweet" object is
    # available, that object becomes the source for the text and for
    # all entity lookups below.
    is_truncated = json_string_dict.get('truncated')
    if is_truncated and 'extended_tweet' in json_string_dict:
        json_string_dict = json_string_dict.get('extended_tweet')
        post_record.post_body = json_string_dict.get('full_text')
    elif 'full_text' in json_string_dict:
        post_record.post_body = json_string_dict.get('full_text')
    else:
        post_record.post_body = json_string_dict.get('text')

    # The entities section holds meta information such as hashtags or
    # user_mentions; tolerate its absence instead of raising.
    entities_json = json_string_dict.get('entities') or {}
    hashtags_json = entities_json.get('hashtags')
    if hashtags_json:
        for hashtag in hashtags_json:
            post_record.hashtags.append(hashtag.get("text"))

    # Look for mentioned user records
    user_mentions_json = entities_json.get('user_mentions')
    if user_mentions_json:
        ref_user_records = HF.get_mentioned_users(
            user_mentions_json, self.origin)
        self.lbsn_records.append(ref_user_records)
        post_record.user_mentions_pkey.extend(
            [user_ref.pkey for user_ref in ref_user_records])
        # a relation needs the author's id; skip when no author is known
        if self.map_full_relations and user_record:
            self.extract_mentioned_users(
                ref_user_records, user_record.pkey.id)

    # Sometimes an extended_entities section exists with additional
    # information on media (but never hashtags or user_mentions). Its
    # media type metadata ('photo', 'video' or 'animated_gif') is the
    # preferred source for native media. See:
    # https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/extended-entities-object.html#extended-entities-object
    extended_entities_json = json_string_dict.get('extended_entities')
    if extended_entities_json is not None:
        entities_json = extended_entities_json
    media_json = entities_json.get('media')
    if media_json:
        post_record.post_type = HF.assign_media_post_type(media_json)
    else:
        post_record.post_type = lbsn.Post.TEXT
    post_record.emoji.extend(HF.extract_emoji(post_record.post_body))
    return post_record
def extract_post(
        self, json_string_dict: Dict[str, Any],
        place_record: lbsn.Place = None):
    """Map one Instagram media JSON object to an lbsn.Post record.

    The finished post (and the owner's user record, if one could be
    extracted) is appended to ``self.lbsn_records``; nothing is
    returned.

    Args:
        json_string_dict: Parsed JSON dict of a single media node.
        place_record: Optional place record supplied by the caller;
            when given, the post takes its key and ``geom_center`` and
            is marked with ``lbsn.Post.PLACE`` accuracy.

    The post is silently dropped (nothing appended for it) when the
    guid is empty, when it has no geo information while
    ``self.ignore_non_geotagged`` is True, or when its geoaccuracy is
    below ``self.min_geoaccuracy``.
    """

    def value_count(x):
        """Return 0 for a missing count, otherwise the count itself."""
        return 0 if x is None else x

    def edge_count(edge_name):
        """Read ``<edge_name>.count``, tolerating a missing edge object."""
        edge = json_string_dict.get(edge_name) or {}
        return value_count(edge.get('count'))

    post_guid = json_string_dict.get('id')
    if not HF.check_notice_empty_post_guid(post_guid):
        return
    post_record = HF.new_lbsn_record_with_id(
        lbsn.Post(), post_guid, self.origin)

    # --- owner ----------------------------------------------------------
    user_record = None
    user_info = json_string_dict.get('owner')
    if user_info:
        # Get Post/Reaction details of User
        user_record = self.extract_user(user_info)
    if user_record:
        self.lbsn_records.append(user_record)
    else:
        self.log.warning(
            f'No User record found for post: {post_guid} '
            f'(post saved without userid)..')

    # --- geo information (taken from the upstream place, if any) --------
    if place_record:
        # assign place accuracy, by default
        post_record.post_geoaccuracy = lbsn.Post.PLACE
        post_record.place_pkey.CopyFrom(place_record.pkey)
        post_record.post_latlng = place_record.geom_center
    else:
        # NOTE(review): assigning None to a protobuf scalar/enum field
        # may raise TypeError depending on the protobuf runtime - confirm
        post_record.post_geoaccuracy = None
    # if still no geoinformation, send post to Null-Island
    if not post_record.post_latlng:
        if self.ignore_non_geotagged is True:
            return
        self.null_island += 1
        post_record.post_latlng = "POINT(%s %s)" % (0, 0)
    if self.min_geoaccuracy and not HF.geoacc_within_threshold(
            post_record.post_geoaccuracy, self.min_geoaccuracy):
        self.skipped_low_geoaccuracy += 1
        return

    # --- plain attributes -----------------------------------------------
    post_record.post_publish_date.CopyFrom(
        HF.json_date_timestamp_to_proto(
            json_string_dict.get('taken_at_timestamp')))
    if user_record:
        post_record.user_pkey.CopyFrom(user_record.pkey)
    post_record.post_comment_count = edge_count('edge_media_to_comment')
    post_record.post_like_count = edge_count('edge_liked_by')
    post_shortcode = json_string_dict.get('shortcode')
    post_record.post_url = f'http://www.instagram.com/p/{post_shortcode}'
    thumbnail_src = json_string_dict.get("thumbnail_src")
    if thumbnail_src:
        post_record.post_thumbnail_url = thumbnail_src

    # --- caption: body text and hashtags --------------------------------
    post_caption_edge = json_string_dict.get('edge_media_to_caption')
    if post_caption_edge:
        caption_edges = post_caption_edge.get("edges")
        if caption_edges:
            # only the first caption edge is used
            post_caption = caption_edges[0]["node"]["text"]
            # flatten the caption to a single line for the post body
            post_record.post_body = post_caption.replace(
                '\n', ' ').replace('\r', '')
            hashtags = HF.extract_hashtags_from_string(post_caption)
            if hashtags:
                post_record.hashtags.extend(hashtags)

    # --- media type -----------------------------------------------------
    if json_string_dict.get('is_video'):
        post_record.post_type = lbsn.Post.VIDEO
        post_record.post_views_count = value_count(
            json_string_dict.get('video_view_count'))
    else:
        post_record.post_type = lbsn.Post.IMAGE
    post_record.emoji.extend(HF.extract_emoji(post_record.post_body))
    self.lbsn_records.append(post_record)