def dict_type_switcher(desc_name):
    """Create a fresh protoBuf message instance for the given type name.

    Returns None if desc_name matches no known lbsn message type.
    """
    message_classes = (
        lbsn.Country, lbsn.City, lbsn.Place, lbsn.User,
        lbsn.UserGroup, lbsn.Post, lbsn.PostReaction, lbsn.Relationship,
    )
    # build name -> instance lookup from the class list
    lookup = {}
    for msg_cls in message_classes:
        instance = msg_cls()
        lookup[instance.DESCRIPTOR.name] = instance
    return lookup.get(desc_name)
def dict_selector(self, record):
    """Get dictionary by record type name"""
    storage_pairs = (
        (lbsn.Post, self.lbsn_post_dict),
        (lbsn.Country, self.lbsn_country_dict),
        (lbsn.City, self.lbsn_city_dict),
        (lbsn.Place, self.lbsn_place_dict),
        (lbsn.PostReaction, self.lbsn_post_reaction_dict),
        (lbsn.User, self.lbsn_user_dict),
        (lbsn.UserGroup, self.lbsn_user_group_dict),
        (lbsn.Origin, self.lbsn_origin_dict),
    )
    # map protobuf type name -> type-specific storage dict
    lookup = {msg_cls().DESCRIPTOR.name: store
              for msg_cls, store in storage_pairs}
    return lookup.get(record.DESCRIPTOR.name)
def __init__(self, record=None):
    """Flatten a lbsn.Post protobuf record into plain Python attributes.

    Args:
        record: lbsn.Post message; if None, an empty lbsn.Post is used so
            every attribute receives its null/default value.
    """
    if record is None:
        record = lbsn.Post()
    # composite primary key: origin id + post guid
    self.origin_id = record.pkey.origin.origin_id
    self.guid = record.pkey.id
    self.post_latlng = HF.null_geom_check(record.post_latlng)
    # spatial references (composite keys of place/city/country)
    self.place_guid = HF.null_check(record.place_pkey.id)
    self.city_guid = HF.null_check(record.city_pkey.id)
    self.country_guid = HF.null_check(record.country_pkey.id)
    # enum int value -> lowercase enum name (e.g. "latlng")
    self.post_geoaccuracy = HF.turn_lower(
        HF.null_check(lbsn.Post().PostGeoaccuracy.Name(
            record.post_geoaccuracy)))
    self.user_guid = HF.null_check(record.user_pkey.id)
    self.post_create_date = HF.null_check_datetime(record.post_create_date)
    self.post_publish_date = HF.null_check_datetime(
        record.post_publish_date)
    self.post_body = HF.null_check(record.post_body)
    self.post_language = HF.null_check(record.post_language.language_short)
    # deduplicate repeated protobuf fields via set()
    self.user_mentions = list(
        set([pkey.id for pkey in record.user_mentions_pkey]))
    self.hashtags = list(set(record.hashtags))
    self.emoji = list(set(record.emoji))
    self.post_like_count = HF.null_check(record.post_like_count)
    self.post_comment_count = HF.null_check(record.post_comment_count)
    self.post_views_count = HF.null_check(record.post_views_count)
    self.post_title = HF.null_check(record.post_title)
    self.post_thumbnail_url = HF.null_check(record.post_thumbnail_url)
    self.post_url = HF.null_check(record.post_url)
    # enum int value -> lowercase enum name (e.g. "image")
    self.post_type = HF.turn_lower(
        HF.null_check(lbsn.Post().PostType.Name(record.post_type)))
    self.post_filter = HF.null_check(record.post_filter)
    self.post_quote_count = HF.null_check(record.post_quote_count)
    self.post_share_count = HF.null_check(record.post_share_count)
    self.input_source = HF.null_check(record.input_source)
    self.post_content_license = HF.null_check(record.post_content_license)
    # optional: zero-initialized; presumably filled later by callers
    self.latitude = 0
    self.longitude = 0
def type_sql_mapper(cls):
    """Assigns record types to SQL Insert SQLs"""
    sql_pairs = (
        (lbsn.Origin, cls.origin_insertsql),
        (lbsn.Country, cls.country_insertsql),
        (lbsn.City, cls.city_insertsql),
        (lbsn.Place, cls.place_insertsql),
        (lbsn.User, cls.user_insertsql),
        (lbsn.UserGroup, cls.usergroup_insertsql),
        (lbsn.Post, cls.post_insertsql),
        (lbsn.Event, cls.event_insertsql),
        (lbsn.PostReaction, cls.postreaction_insertsql),
    )
    # key by protobuf type name so callers can look up by DESCRIPTOR.name
    return {msg_cls().DESCRIPTOR.name: insert_sql
            for msg_cls, insert_sql in sql_pairs}
def get_hll_metrics(cls, record) -> hll.HllMetrics:
    """Extracts hll metrics based on record type"""
    extractor_pairs = (
        (lbsn.Origin, cls.get_origin_metrics),
        (lbsn.Country, cls.get_country_metrics),
        (lbsn.City, cls.get_city_metrics),
        (lbsn.Place, cls.get_place_metrics),
        (lbsn.User, cls.get_user_metrics),
        (lbsn.UserGroup, cls.get_usergroup_metrics),
        (lbsn.Post, cls.get_post_metrics),
        (lbsn.PostReaction, cls.get_postreaction_metrics),
        (lbsn.Relationship, cls.get_relationship_metrics),
    )
    # dispatch on the record's protobuf type name
    extractors = {msg_cls().DESCRIPTOR.name: func
                  for msg_cls, func in extractor_pairs}
    extract_function = extractors.get(record.DESCRIPTOR.name)
    return extract_function(record)
def func_prepare_selector(self, record):
    """Select correct prepare function according to record type"""
    handler_pairs = (
        (lbsn.Origin, self.prepare_lbsn_origin),
        (lbsn.Country, self.prepare_lbsn_country),
        (lbsn.City, self.prepare_lbsn_city),
        (lbsn.Place, self.prepare_lbsn_place),
        (lbsn.User, self.prepare_lbsn_user),
        (lbsn.UserGroup, self.prepare_lbsn_usergroup),
        (lbsn.Post, self.prepare_lbsn_post),
        (lbsn.Event, self.prepare_lbsn_event),
        (lbsn.PostReaction, self.prepare_lbsn_postreaction),
        (lbsn.Relationship, self.prepare_lbsn_relation),
    )
    # dispatch on the record's protobuf type name
    handlers = {msg_cls().DESCRIPTOR.name: func
                for msg_cls, func in handler_pairs}
    prepare_function = handlers.get(record.DESCRIPTOR.name)
    return prepare_function(record)
def get_func_record(cls, record: Dict[str, Any],
                    input_type: Optional[str] = None):
    """Returns mapping function for input_type"""
    extractors = {
        lbsn.Origin().DESCRIPTOR.name: cls.extract_origin,
        lbsn.Country().DESCRIPTOR.name: cls.extract_country,
        lbsn.City().DESCRIPTOR.name: cls.extract_city,
        lbsn.Place().DESCRIPTOR.name: cls.extract_place,
        lbsn.UserGroup().DESCRIPTOR.name: cls.extract_usergroup,
        lbsn.User().DESCRIPTOR.name: cls.extract_user,
        lbsn.Post().DESCRIPTOR.name: cls.extract_post,
        lbsn.PostReaction().DESCRIPTOR.name: cls.extract_postreaction,
        lbsn.Event().DESCRIPTOR.name: cls.extract_event,
    }
    extract_func = extractors.get(input_type)
    # origin is always constructed the same way, regardless of input type
    origin = lbsn.Origin()
    origin.origin_id = record.get('origin_id')
    return extract_func(record, origin)
def __init__(self):
    """Initialize per-type record stores, dedup hashes and counters."""
    self.lbsn_origin_dict = {}
    self.lbsn_country_dict = {}
    self.lbsn_city_dict = {}
    self.lbsn_place_dict = {}
    self.lbsn_user_group_dict = {}
    self.lbsn_user_dict = {}
    self.lbsn_post_dict = {}
    self.lbsn_post_reaction_dict = {}
    self.lbsn_relationship_dict = {}
    # one hash set per tracked record type (insertion order preserved)
    tracked_types = (
        lbsn.Origin, lbsn.Post, lbsn.Country, lbsn.City, lbsn.Place,
        lbsn.UserGroup, lbsn.User, lbsn.PostReaction, lbsn.Relationship,
    )
    self.key_hashes = {msg_cls.DESCRIPTOR.name: set()
                       for msg_cls in tracked_types}
    self.count_glob = 0  # total number of records added
    self.count_glob_total = 0
    self.count_dup_merge = 0  # number of duplicate records merged
    self.count_dup_merge_total = 0
    # returns all recordsDicts in correct order,
    # with names as references (tuple)
    ordered_stores = (
        (self.lbsn_origin_dict, lbsn.Origin),
        (self.lbsn_country_dict, lbsn.Country),
        (self.lbsn_city_dict, lbsn.City),
        (self.lbsn_place_dict, lbsn.Place),
        (self.lbsn_user_group_dict, lbsn.UserGroup),
        (self.lbsn_user_dict, lbsn.User),
        (self.lbsn_post_dict, lbsn.Post),
        (self.lbsn_post_reaction_dict, lbsn.PostReaction),
        (self.lbsn_relationship_dict, lbsn.Relationship),
    )
    self.all_dicts = [(store, msg_cls().DESCRIPTOR.name)
                      for store, msg_cls in ordered_stores]
def update_post_with_place(self, post_record: lbsn.Post = None,
                           post_guid: str = None,
                           place_records: List[lbsn.Place] = None):
    """Update post record with entries from place record"""
    if post_record is None:
        # no post yet: a guid is required to create one
        if post_guid is None:
            raise ValueError("Cannot create lbsn.Post without post_guid")
        post_record = HF.new_lbsn_record_with_id(
            lbsn.Post(), post_guid, self.origin)
    if place_records is None:
        return post_record
    for place_rec in place_records:
        if isinstance(place_rec, lbsn.Country):
            post_record.country_pkey.CopyFrom(place_rec.pkey)
        if isinstance(place_rec, lbsn.City):
            post_record.city_pkey.CopyFrom(place_rec.pkey)
        # either city or place, Twitter user cannot attach both (?)
        elif isinstance(place_rec, lbsn.Place):
            post_record.place_pkey.CopyFrom(place_rec.pkey)
    return post_record
def extract_post(cls, record, origin):
    """Map a flat input record (dict) to a lbsn.Post protobuf message.

    Args:
        record: dict with lbsn-style attribute keys
            (post_guid, post_latlng, user_guid, ...)
        origin: lbsn.Origin of the input data, used for composite keys

    Returns:
        lbsn.Post with all available attributes assigned.
    """
    post = HF.new_lbsn_record_with_id(lbsn.Post(),
                                      record.get('post_guid'),
                                      origin)
    post_latlng = record.get("post_latlng")
    if post_latlng:
        setattr(post, "post_latlng", parse_geom(post_latlng))
    # optional spatial references (place/city/country composite keys)
    place_guid = record.get('place_guid')
    if place_guid:
        set_lbsn_pkey(post.place_pkey, lbsn.Place(), place_guid, origin)
    city_guid = record.get('city_guid')
    if city_guid:
        set_lbsn_pkey(post.city_pkey, lbsn.City(), city_guid, origin)
    country_guid = record.get('country_guid')
    if country_guid:
        set_lbsn_pkey(post.country_pkey, lbsn.Country(),
                      country_guid, origin)
    set_lbsn_pkey(post.user_pkey, lbsn.User(),
                  record.get('user_guid'), origin)
    pub_date = record.get('post_publish_date')
    if pub_date:
        copydate_lbsn_attr(post.post_publish_date, pub_date)
    set_lbsn_attr(post, "post_body", record)
    # BUGFIX: removed stray no-op expression statement
    # (`post.post_geoaccuracy` on its own line had no effect)
    geo_acc = record.get("post_geoaccuracy")
    if geo_acc:
        # map accuracy string to enum value
        post.post_geoaccuracy = lbsn.Post.PostGeoaccuracy.Value(
            geo_acc.upper())
    set_lbsn_attr(post, "hashtags", record)
    set_lbsn_attr(post, "emoji", record)
    set_lbsn_attr(post, "post_like_count", record)
    set_lbsn_attr(post, "post_comment_count", record)
    set_lbsn_attr(post, "post_views_count", record)
    set_lbsn_attr(post, "post_title", record)
    crt_date = record.get('post_create_date')
    if crt_date:
        copydate_lbsn_attr(post.post_create_date, crt_date)
    set_lbsn_attr(post, "post_thumbnail_url", record)
    set_lbsn_attr(post, "post_url", record)
    post_type = record.get("post_type")
    if post_type:
        # map type string to enum value
        post.post_type = lbsn.Post.PostType.Value(post_type.upper())
    set_lbsn_attr(post, "post_filter", record)
    set_lbsn_attr(post, "post_quote_count", record)
    set_lbsn_attr(post, "post_share_count", record)
    lang = record.get('post_language')
    if lang:
        ref_post_language = lbsn.Language()
        ref_post_language.language_short = lang
        post.post_language.CopyFrom(ref_post_language)
    set_lbsn_attr(post, "input_source", record)
    user_mentions = record.get("user_mentions")
    if user_mentions:
        # only the composite keys of mentioned users are stored
        post.user_mentions_pkey.extend([
            HF.new_lbsn_record_with_id(lbsn.User(), user_id, origin).pkey
            for user_id in user_mentions
        ])
    set_lbsn_attr(post, "post_content_license", record)
    return post
def extract_flickr_post(self, record):
    """Main function for processing Flickr YFCC100M CSV entry.

    This method is adapted to a special structure, adapt if needed.

    To Do:
        - parameterize column numbers and structure
        - provide external config-file for specific CSV structures
        - currently not included in lbsn mapping are MachineTags,
          GeoContext (indoors, outdoors), WoeId and some extra
          attributes only present for Flickr

    Overview of available columns and examples:
    0 row-number - 0
    1 Photo/video identifier - 6985418911
    2 lbsn.User NSID(PostID?) - 4e2f7a26a1dfbf165a7e30bdabf7e72a
    3 lbsn.User ID - 39089491@N00
    4 lbsn.User nickname - gnuckx
    5 Date taken - 2012-02-16 09:56:37.0
    6 Date uploaded - 1331840483
    7 Capture device - Canon+PowerShot+ELPH+310+HS
    8 Title - IMG_0520
    9 Description - My vacation
    10 tags (comma-separated) - canon,canon+powershot+hs+310
    11 Machine tags (comma-separated) - landscape, hills, water
    12 Longitude - -81.804885
    13 Latitude - 24.550558
    14 Accuracy - 12
    15 Photo/video page URL -
       http://www.flickr.com/photos/39089491@N00/6985418911/
    16 Photo/video download URL -
       http://farm8.staticflickr.com/7205/6985418911_df7747990d.jpg
    17 License name - Attribution-NonCommercial-NoDerivs License
    18 License URL - http://creativecommons.org/licenses/by-nc-nd/2.0/
    19 Photo/video server identifier - 7205
    20 Photo/video farm identifier - 8
    21 Photo/video secret - df7747990d
    22 Photo/video secret original - 692d7e0a7f
    23 Extension of the original photo - jpg
    24 Marker (0 = photo, 1 = video) - 0
    if concat:
    25 Photo/video identifier
    26 Place references (null to multiple)
    """
    # note that one input record may contain many lbsn records
    # therefore, return list of processed records
    lbsn_records = []
    # start mapping input to lbsn_records
    post_guid = record[1]
    if not HF.check_notice_empty_post_guid(post_guid):
        return None
    post_record = HF.new_lbsn_record_with_id(lbsn.Post(),
                                             post_guid,
                                             self.origin)
    user_record = HF.new_lbsn_record_with_id(lbsn.User(),
                                             record[3],
                                             self.origin)
    # YFCC100M values are URL-encoded; '+' stands for a space
    user_record.user_name = unquote(record[4]).replace('+', ' ')
    user_record.url = f'http://www.flickr.com/photos/{user_record.pkey.id}/'
    if user_record:
        post_record.user_pkey.CopyFrom(user_record.pkey)
        lbsn_records.append(user_record)
    post_record.post_latlng = self.flickr_extract_postlatlng(record)
    geoaccuracy = importer.flickr_map_geoaccuracy(record[14])
    if geoaccuracy:
        post_record.post_geoaccuracy = geoaccuracy
    # place record available in separate yfcc100m dataset;
    # see extract_flickr_place handling at the end of this method
    post_record.post_publish_date.CopyFrom(
        HF.parse_timestamp_string_to_protobuf(record[6]))
    post_created_date = HF.parse_csv_datestring_to_protobuf(
        record[5], t_format='%Y-%m-%d %H:%M:%S.%f')
    if post_created_date:
        post_record.post_create_date.CopyFrom(post_created_date)
    # counts are not part of the YFCC100M dump; initialize to zero
    post_record.post_views_count = 0
    post_record.post_comment_count = 0
    post_record.post_like_count = 0
    post_record.post_url = record[15]
    # YFCC100M dataset contains HTML codes (%20) and
    # space character is replaced by +
    post_record.post_body = unquote(record[9]).replace('+', ' ')
    post_record.post_title = unquote(record[8]).replace('+', ' ')
    post_record.post_thumbnail_url = record[16]  # note: fullsize url!
    # split tags by , and + because by lbsn-spec,
    # tags are limited to single word
    record_tags_list = list(
        set(
            filter(None, [
                HF.remove_prefix(unquote(tag), "#")
                for tag in re.split("[,+]+", record[10])
            ])))
    if record_tags_list:
        for tag in record_tags_list:
            tag = importer.clean_tags_from_flickr(tag)
            post_record.hashtags.append(tag)
    record_machine_tags = list(
        set(
            filter(
                None,
                [unquote(mtag) for mtag in re.split("[,+]+", record[11])])))
    if 'video' in record_machine_tags:
        # all videos appear to have 'video' in machine tags
        post_record.post_type = lbsn.Post.VIDEO
    else:
        post_record.post_type = lbsn.Post.IMAGE
    # replace text-string of content license by integer-id
    # NOTE(review): CSV fields are typically str, never None — this
    # guard presumably always passes; confirm against reader
    if record[17] is not None:
        post_record.post_content_license = \
            self.get_license_number_from_license_name(record[17])
    # place record available in separate yfcc100m dataset
    # if records parsed as joined urls, length is larger than 25
    if len(record) > 25:
        post_plus_place_records = self.extract_flickr_place(
            record[25:], post_record=post_record)
        if post_plus_place_records is None:
            lbsn_records.append(post_record)
        else:
            lbsn_records.extend(post_plus_place_records)
    else:
        lbsn_records.append(post_record)
    return lbsn_records
Module for db input connection sql mapping """ import enum from typing import Union, Optional, List, Tuple from lbsnstructure import lbsnstructure_pb2 as lbsn """Schema convention from lbsn db spec""" LBSN_SCHEMA = [ (lbsn.Origin().DESCRIPTOR.name, "social", "origin", "origin_id"), (lbsn.Country().DESCRIPTOR.name, "spatial", "country", "country_guid"), (lbsn.City().DESCRIPTOR.name, "spatial", "city", "city_guid"), (lbsn.Place().DESCRIPTOR.name, "spatial", "place", "place_guid"), (lbsn.UserGroup().DESCRIPTOR.name, "social", "user_groups", "usergroup_guid"), (lbsn.User().DESCRIPTOR.name, "social", "user", "user_guid"), (lbsn.Post().DESCRIPTOR.name, "topical", "post", "post_guid"), (lbsn.PostReaction().DESCRIPTOR.name, "topical", "post_reaction", "reaction_guid"), ] def optional_schema_override( LBSN_SCHEMA: List[Tuple[str, str, str, str]], schema_table_overrides: List[Tuple[str, str]]) -> List[Tuple[str, str, str, str]]: """Override schema and table name for selected lbsn objects.""" LBSN_SCHEMA_OVERRIDE = [] for lbsn_type, schema_name, table_name, key_col in LBSN_SCHEMA: for schema_table_override in schema_table_overrides: lbsn_object_ref, schema_table_override = schema_table_override try: schema_override, table_override = schema_table_override.split(
def extract_post(self, json_string_dict, user_pkey=None):
    """Returns tuple of lbsn.Post() and List of post_context_records

    e.g.: (lbsn.Post(), [lbsn.Country(), lbsn.City(), lbsn.Place(),
    lbsn.User()])

    NOTE(review): only the empty-guid guard returns a 2-tuple
    (None, None); every other path returns a single lbsn.Post or None —
    confirm intended contract (parse_json_post treats the result as
    single-valued).
    """
    post_guid = json_string_dict.get('id_str')
    if not HF.check_notice_empty_post_guid(post_guid):
        return None, None
    post_record = HF.new_lbsn_record_with_id(lbsn.Post(),
                                             post_guid,
                                             self.origin)
    post_geoacc = None
    user_record = None
    user_info = json_string_dict.get('user')
    if user_info:
        # Get lbsn.Post/Reaction Details of lbsn.User
        user_record = self.extract_user(json_string_dict.get('user'))
    elif user_pkey:
        # userPkey is already available for posts that are statuses
        user_record = HF.new_lbsn_record_with_id(lbsn.User(),
                                                 user_pkey.id,
                                                 self.origin)
    if user_record:
        self.lbsn_records.append(user_record)
    else:
        # post is kept even without a resolvable user; warn only
        self.log.warning(f'Record {self.lbsn_records.count_glob_total}: '
                         f'No lbsn.User record found for post: {post_guid} '
                         f'(post saved without userid)..')
        print(f'Record {self.lbsn_records.count_glob_total}', end='\r')
    # Some preprocessing for all types:
    post_coordinates = json_string_dict.get('coordinates')
    if post_coordinates:
        # GeoJSON coordinate order is (lng, lat)
        l_lng = post_coordinates.get('coordinates')[0]
        l_lat = post_coordinates.get('coordinates')[1]
        post_record.post_geoaccuracy = lbsn.Post.LATLNG
        post_record.post_latlng = "POINT(%s %s)" % (l_lng, l_lat)
    # Check if lbsn.Place is mentioned
    post_place_json = json_string_dict.get('place')
    if post_place_json:
        # we need some information from postRecord to create placeRecord
        # (e.g. user language, geoaccuracy, post_latlng)
        # some of the information from place will also modify postRecord
        # attributes; therefore return both
        if user_record:
            user_lang = user_record.user_language
        else:
            user_lang = None
        place_record, \
            post_geoacc, \
            post_country = self.extract_place(post_place_json,
                                              post_record.post_geoaccuracy,
                                              user_lang)
        if not post_record.post_geoaccuracy:
            post_record.post_geoaccuracy = post_geoacc
        self.lbsn_records.append(place_record)
        if post_country:
            post_record.country_pkey.CopyFrom(post_country.pkey)
        if isinstance(place_record, lbsn.City):
            post_record.city_pkey.CopyFrom(place_record.pkey)
        # either city or place, Twitter user cannot attach both (?)
        elif isinstance(place_record, lbsn.Place):
            post_record.place_pkey.CopyFrom(place_record.pkey)
        # substitute postRecord LatLng Coordinates from placeRecord,
        # if not already set
        if not post_record.post_latlng:
            # Note: this will also substitute lbsn.Country lat/lng in post
            # this information is also available by query of
            # country_guid in posts
            # use input arg min_geoaccuracy to exclude country geo-posts
            post_record.post_latlng = place_record.geom_center
    # if still no geoinformation, send post to Null-Island
    if not post_record.post_latlng:
        if self.ignore_non_geotagged is True:
            return None
        self.null_island += 1
        post_record.post_latlng = "POINT(%s %s)" % (0, 0)
    if self.min_geoaccuracy:
        if not HF.geoacc_within_threshold(post_record.post_geoaccuracy,
                                          self.min_geoaccuracy):
            self.skipped_low_geoaccuracy += 1
            return None
    # Process attributes of twitter post
    post_source = json_string_dict.get('source')
    if post_source:
        post_record.input_source = HF.cleanhtml(
            json_string_dict.get('source'))
        if self.ignore_sources_set and \
                post_record.input_source in self.ignore_sources_set:
            # skip entry if in ignore list
            self.skipped_ignore_list += 1
            return None
    post_record.post_publish_date.CopyFrom(
        HF.json_date_string_to_proto(json_string_dict.get('created_at')))
    if user_record:
        post_record.user_pkey.CopyFrom(user_record.pkey)
    post_record.post_quote_count = HF.value_count(
        json_string_dict.get('quote_count'))
    post_record.post_comment_count = HF.value_count(
        json_string_dict.get('reply_count'))
    post_record.post_share_count = HF.value_count(
        json_string_dict.get('retweet_count'))
    post_record.post_like_count = HF.value_count(
        json_string_dict.get('favorite_count'))
    post_record.post_url = f'https://twitter.com/statuses/{post_guid}'
    language_str = json_string_dict.get('lang')
    if language_str:
        post_language = lbsn.Language()
        post_language.language_short = json_string_dict.get('lang')
        post_record.post_language.CopyFrom(post_language)
    # If Extended_tweet object is available,
    # process entities and post_body (text) data from extended object
    is_truncated = json_string_dict.get('truncated')
    if is_truncated and 'extended_tweet' in json_string_dict:
        # if the "truncated" field is set to true,
        # and the "extended_tweet" object provides complete
        # "full_text" and "entities" Tweet metadata
        # Source for all data is extended object, if available
        # (note: rebinds the local json_string_dict name)
        json_string_dict = json_string_dict.get('extended_tweet')
        post_record.post_body = json_string_dict.get('full_text')
    else:
        if 'full_text' in json_string_dict:
            post_record.post_body = json_string_dict.get('full_text')
        else:
            post_record.post_body = json_string_dict.get('text')
    # entities section always exists and includes meta information
    # such as hashtags or user_mentions
    entities_json = json_string_dict.get('entities')
    # extract hashtags
    hashtags_json = entities_json.get('hashtags')
    if hashtags_json:
        for hashtag in hashtags_json:  # iterate over the list
            post_record.hashtags.append(hashtag.get("text"))
    # Look for mentioned userRecords
    user_mentions_json = entities_json.get('user_mentions')
    if user_mentions_json:
        ref_user_records = HF.get_mentioned_users(user_mentions_json,
                                                  self.origin)
        # NOTE(review): a list of records is appended here; presumably
        # self.lbsn_records.append accepts lists — confirm
        self.lbsn_records.append(ref_user_records)
        post_record.user_mentions_pkey.extend(
            [user_ref.pkey for user_ref in ref_user_records])
        if self.map_full_relations:
            # NOTE(review): user_record may be None on this path (see
            # warning branch above) — pkey.id would raise; confirm
            self.extract_mentioned_users(
                ref_user_records, user_record.pkey.id)
    # sometimes, extended_entities section exists and includes
    # additional information on media, but never hashtags or user_mentions
    # Since the media type metadata in the extended_entities section
    # correctly indicates the media type
    # ('photo', 'video' or 'animated_gif'),
    # and supports up to 4 photos, it is the preferred metadata
    # source for native media. See:
    # https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/extended-entities-object.html#extended-entities-object
    if 'extended_entities' in json_string_dict:
        entities_json = json_string_dict.get('extended_entities')
    media_json = entities_json.get('media')
    if media_json:
        post_record.post_type = HF.assign_media_post_type(media_json)
    else:
        post_record.post_type = lbsn.Post.TEXT
    post_record.emoji.extend(HF.extract_emoji(post_record.post_body))
    # because standard print statement will produce escaped text,
    # we can use protobuf text_format to give us a human friendly
    # version of the text
    # log.debug(f'lbsn.Post record: '
    #           f'{text_format.MessageToString(postRecord, as_utf8=True)}')
    return post_record
def parse_json_post(self, json_string_dict, user_pkey=None):
    """Extract json post retrieved from Twitter API

    The process is nested, but pretty linear:
    1. Extract all relevant lbsn.Post Attributes
       1.a extract post coordinates
       1.b extract user attributes
       1.c extract place attributes (poi, city, neigborhood, admin, country)
       1.d extract extended tweet, if available, and extended entities,
           if available
    2. decide if post is reaction (reply, quote, share, see
       https://developer.twitter.com/en/docs/tweets/data-dictionary/
       overview/entities-object.html)
    3. if post is reaction, copy reduced reaction attributes from
       extracted lbsn.Post
    4. add post/reaction to recordDict
    5. process all referenced posts
       5.a Retweet(=Share) and Quote Tweets are special kinds of Tweets
           that contain the original Tweet as an embedded object.
       5.b Retweets have a top-level "retweeted_status" object, and
           Quoted Tweets have a "quoted_status" object

    Note: one input record may contain many lbsn records, therefore
    records are first added to self.lbsn_records to be later returned
    together.
    """
    post_record = self.extract_post(json_string_dict, user_pkey)
    if not post_record:
        # no post record extracted (e.g. non_geotagged clause)
        return
    # Assignment Step: check if post is reaction to other post.
    # reaction means: reduced structure compared to post; reactions
    # often include the complete original post, therefore nested
    # processing is necessary
    if HF.is_post_reaction(json_string_dict):
        if self.map_reactions is False:
            return
        post_reaction_record = self.map_postrecord_to_postreactionrecord(
            post_record)
        refuser_pkey = None
        # BUGFIX: ref_post_record was previously unbound (NameError)
        # when none of the known reaction kinds below matched
        ref_post_record = None
        if 'quoted_status' in json_string_dict:
            # Note: Quote is both: Share & Reply
            if 'user' not in json_string_dict.get('quoted_status'):
                refuser_pkey = \
                    HF.substitute_referenced_user(json_string_dict,
                                                  self.origin,
                                                  self.log)
            post_reaction_record.reaction_type = lbsn.PostReaction.QUOTE
            # NOTE: refuser_pkey is computed above but not passed here
            # (quirk preserved from original behavior)
            ref_post_record = self.extract_post(
                json_string_dict.get('quoted_status'))
        elif 'retweeted_status' in json_string_dict:
            # Note: No retweets are available when data is queried
            # using Bounding Box because of Geo-Tweet limitation:
            # "Note that native Retweets are not matched by this
            # parameter. While the original Tweet may have a location,
            # the Retweet will not"
            # see https://developer.twitter.com/en/docs/
            # tweets/filter-realtime/guides/basic-stream-parameters.html
            if 'user' not in json_string_dict.get('retweeted_status'):
                # Current issue with Twitter search: the retweeting
                # user is not returned in retweeted_status
                # but we can get this from other information,
                # such as user_mentions field from the retweet
                refuser_pkey = \
                    HF.substitute_referenced_user(json_string_dict,
                                                  self.origin,
                                                  self.log)
            post_reaction_record.reaction_type = lbsn.PostReaction.SHARE
            retweet_post = json_string_dict.get('retweeted_status')
            ref_post_record = self.extract_post(retweet_post, refuser_pkey)
        elif json_string_dict.get('in_reply_to_status_id_str'):
            # if reply, original tweet is not available (?)
            post_reaction_record.reaction_type = lbsn.PostReaction.COMMENT
            ref_post_record = \
                HF.new_lbsn_record_with_id(
                    lbsn.Post(),
                    json_string_dict.get('in_reply_to_status_id_str'),
                    self.origin)
            ref_user_record = \
                HF.new_lbsn_record_with_id(
                    lbsn.User(),
                    json_string_dict.get('in_reply_to_user_id_str'),
                    self.origin)
            ref_user_record.user_name = json_string_dict.get(
                'in_reply_to_screen_name')  # Needs to be saved
            self.lbsn_records.append(ref_user_record)
            ref_post_record.user_pkey.CopyFrom(ref_user_record.pkey)
        # add referenced post pkey to reaction
        # BUGFIX: guard against ref_post_record being None (extract_post
        # may return None, or no reaction kind matched) — previously
        # this crashed on .pkey access
        if ref_post_record is not None and \
                not self.disable_reaction_post_referencing:
            post_reaction_record.referencedPost_pkey.CopyFrom(
                ref_post_record.pkey)
        # ToDo: if a Reaction refers to another reaction (Information
        # Spread): this information is currently not available from
        # Twitter ("retweets of retweets do not show representations of
        # the intermediary retweet"); would be added to
        # postReactionRecord.referencedPostReaction_pkey
        if ref_post_record:
            self.lbsn_records.append(ref_post_record)
        # add postReactionRecord to Dict
        self.lbsn_records.append(post_reaction_record)
    else:
        # otherwise add post to self.lbsn_records, which already
        # includes all other entries (lbsn.User, lbsn.City, lbsn.Place etc.)
        self.lbsn_records.append(post_record)
def extract_post(
        self, json_string_dict: Dict[str, Any],
        place_record: lbsn.Place = None):
    """Extract lbsn.Post from an Instagram json dict.

    Args:
        json_string_dict: Instagram media node (GraphQL-style keys)
        place_record: optional lbsn.Place the post is attached to;
            supplies geoaccuracy, place key and coordinates

    Returns:
        None if the post has no guid, is skipped by geo filters,
        or after successfully appending the record to self.lbsn_records.
    """
    post_guid = json_string_dict.get('id')
    if not HF.check_notice_empty_post_guid(post_guid):
        return None
    post_record = HF.new_lbsn_record_with_id(lbsn.Post(),
                                             post_guid,
                                             self.origin)
    user_record = None
    user_info = json_string_dict.get('owner')
    if user_info:
        # Get Post/Reaction Details of User
        user_record = self.extract_user(user_info)
    if user_record:
        self.lbsn_records.append(user_record)
    else:
        self.log.warning(
            f'No User record found for post: {post_guid} '
            f'(post saved without userid)..')
    # Check from upstream to update post attrs
    if place_record:
        # assign place accuracy, by default
        post_record.post_geoaccuracy = lbsn.Post.PLACE
        post_record.place_pkey.CopyFrom(place_record.pkey)
        post_record.post_latlng = place_record.geom_center
    # BUGFIX: removed `post_record.post_geoaccuracy = None` in the
    # else-branch — assigning None to a protobuf scalar enum field
    # raises TypeError; the field now simply keeps its proto default
    # if still no geoinformation, send post to Null-Island
    if not post_record.post_latlng:
        if self.ignore_non_geotagged is True:
            return None
        self.null_island += 1
        post_record.post_latlng = "POINT(%s %s)" % (0, 0)
    if self.min_geoaccuracy:
        if not HF.geoacc_within_threshold(
                post_record.post_geoaccuracy, self.min_geoaccuracy):
            self.skipped_low_geoaccuracy += 1
            return None
    post_record.post_publish_date.CopyFrom(
        HF.json_date_timestamp_to_proto(
            json_string_dict.get('taken_at_timestamp')))
    if user_record:
        post_record.user_pkey.CopyFrom(user_record.pkey)

    def value_count(x):
        # treat missing counts as zero
        return 0 if x is None else x

    post_record.post_comment_count = value_count(
        json_string_dict.get('edge_media_to_comment').get('count'))
    post_record.post_like_count = value_count(
        json_string_dict.get('edge_liked_by').get('count'))
    post_shortcode = json_string_dict.get('shortcode')
    post_record.post_url = f'http://www.instagram.com/p/{post_shortcode}'
    if json_string_dict.get("thumbnail_src"):
        post_record.post_thumbnail_url = json_string_dict.get(
            "thumbnail_src")
    # caption lives in a nested edges[0].node.text structure
    post_caption_edge = json_string_dict.get('edge_media_to_caption')
    if post_caption_edge:
        post_caption_edge_edges = post_caption_edge.get("edges")
        if post_caption_edge_edges and not len(
                post_caption_edge_edges) == 0:
            post_caption = post_caption_edge[
                "edges"][0]["node"]["text"]
            # collapse newlines so post_body is a single line
            post_record.post_body = post_caption.replace(
                '\n', ' ').replace('\r', '')
            hashtags = HF.extract_hashtags_from_string(post_caption)
            if hashtags:
                for hashtag in hashtags:
                    post_record.hashtags.append(hashtag)
    is_video = json_string_dict.get('is_video')
    if is_video:
        post_record.post_type = lbsn.Post.VIDEO
        post_record.post_views_count = value_count(
            json_string_dict.get('video_view_count'))
    else:
        post_record.post_type = lbsn.Post.IMAGE
    post_record.emoji.extend(HF.extract_emoji(post_record.post_body))
    self.lbsn_records.append(post_record)