def dict_type_switcher(desc_name):
    """Return a fresh (empty) protobuf message for the given descriptor name.

    Returns None when desc_name matches no known lbsn record type.
    """
    prototypes = (
        lbsn.Country, lbsn.City, lbsn.Place, lbsn.User,
        lbsn.UserGroup, lbsn.Post, lbsn.PostReaction, lbsn.Relationship,
    )
    # map descriptor name -> new empty message instance
    lookup = {proto().DESCRIPTOR.name: proto() for proto in prototypes}
    return lookup.get(desc_name)
def __init__(self, record=None):
    """Flatten an lbsn.User protobuf record into plain attributes.

    A default (empty) lbsn.User is used when no record is supplied.
    """
    if record is None:
        record = lbsn.User()
    self.origin_id = record.pkey.origin.origin_id
    self.guid = record.pkey.id
    # simple scalar fields all share the same null-check path
    simple_fields = (
        'user_name', 'user_fullname', 'follows', 'followed',
        'group_count', 'biography', 'post_count', 'url',
        'is_private', 'is_available', 'user_location',
        'user_location_geom', 'liked_count', 'profile_image_url',
        'user_timezone', 'user_utc_offset',
    )
    for field_name in simple_fields:
        setattr(self, field_name, HF.null_check(getattr(record, field_name)))
    # nested / typed fields need dedicated handling
    self.user_language = HF.null_check(record.user_language.language_short)
    self.active_since = HF.null_check_datetime(record.active_since)
    # de-duplicate group references while returning plain lists
    self.user_groups_member = list(set(record.user_groups_member))
    self.user_groups_follows = list(set(record.user_groups_follows))
def extract_related_users(
        self, related_user_list, input_lbsn_type, user_record):
    """Extract related users from user list

    Creates one lbsn.User record per related user id and one
    lbsn.Relationship record (type isCONNECTED) that links it
    to user_record. All records are stored in self.lbsn_records.

    Args:
        related_user_list: iterable of related user ids
        input_lbsn_type: 'friendslist' or 'followerslist'; selects
            the direction of the created relationship
        user_record: the lbsn.User record the related users refer to
    """
    for related_user in related_user_list:
        related_record = HF.new_lbsn_record_with_id(lbsn.User(),
                                                    str(related_user),
                                                    self.origin)
        self.lbsn_records.append(related_record)
        # note the switch of order here,
        # direction is important for 'isConnected',
        # and the different list each give us a
        # different view on this relationship
        if input_lbsn_type == 'friendslist':
            relationship_record = \
                HF.new_lbsn_relation_with_id(lbsn.Relationship(),
                                             user_record.pkey.id,
                                             related_record.pkey.id,
                                             self.origin)
        elif input_lbsn_type == 'followerslist':
            relationship_record = \
                HF.new_lbsn_relation_with_id(lbsn.Relationship(),
                                             related_record.pkey.id,
                                             user_record.pkey.id,
                                             self.origin)
        # NOTE(review): relationship_record would be unbound (NameError)
        # if input_lbsn_type were neither value above — callers appear to
        # pass only these two strings; confirm at call sites
        relationship_record.relationship_type = \
            lbsn.Relationship.isCONNECTED
        self.lbsn_records.add_relationship_to_dict(
            relationship_record)
def extract_user(cls, record, origin):
    """Map a flat input record onto a new lbsn.User protobuf message."""
    user = HF.new_lbsn_record_with_id(
        lbsn.User(), record.get('user_guid'), origin)
    # plain attributes copied 1:1 from the input record (when present)
    for attr_name in (
            "user_name", "user_fullname", "follows", "followed",
            "biography", "post_count", "url", "is_private",
            "is_available"):
        set_lbsn_attr(user, attr_name, record)
    # language is a nested message and needs CopyFrom
    lang = record.get('user_language')
    if lang:
        ref_user_language = lbsn.Language()
        ref_user_language.language_short = lang
        user.user_language.CopyFrom(ref_user_language)
    set_lbsn_attr(user, "user_location", record)
    # geometry must be parsed before assignment
    user_location_geom = record.get("user_location_geom")
    if user_location_geom:
        setattr(user, "user_location_geom", parse_geom(user_location_geom))
    set_lbsn_attr(user, "liked_count", record)
    # dates are copied via the protobuf timestamp helper
    active_since = record.get('active_since')
    if active_since:
        copydate_lbsn_attr(user.active_since, active_since)
    for attr_name in (
            "profile_image_url", "user_timezone", "user_utc_offset",
            "user_groups_member", "user_groups_follows", "group_count"):
        set_lbsn_attr(user, attr_name, record)
    return user
def extract_user(self, json_string_dict):
    """Create a minimal lbsn.User record keyed by the json 'id' field."""
    user_id = json_string_dict.get('id')
    return HF.new_lbsn_record_with_id(lbsn.User(), user_id, self.origin)
def get_mentioned_users(userMentions_jsonString, origin):
    """Return list of mentioned users from json"""
    mentioned_users_list = []
    for mention in userMentions_jsonString:
        # one lbsn.User record per mention entry
        mention_record = HelperFunctions.new_lbsn_record_with_id(
            lbsn.User(), mention.get('id_str'), origin)
        # full name needs to be saved alongside the screen name
        mention_record.user_fullname = mention.get('name')
        mention_record.user_name = mention.get('screen_name')
        mentioned_users_list.append(mention_record)
    return mentioned_users_list
def dict_selector(self, record):
    """ Get dictionary by record type name"""
    # pair each lbsn message type with its storage dict
    type_dict_pairs = (
        (lbsn.Post, self.lbsn_post_dict),
        (lbsn.Country, self.lbsn_country_dict),
        (lbsn.City, self.lbsn_city_dict),
        (lbsn.Place, self.lbsn_place_dict),
        (lbsn.PostReaction, self.lbsn_post_reaction_dict),
        (lbsn.User, self.lbsn_user_dict),
        (lbsn.UserGroup, self.lbsn_user_group_dict),
        (lbsn.Origin, self.lbsn_origin_dict),
    )
    lookup = {proto().DESCRIPTOR.name: target
              for proto, target in type_dict_pairs}
    return lookup.get(record.DESCRIPTOR.name)
def type_sql_mapper(cls):
    """Assigns record types to SQL Insert SQLs"""
    # pair each lbsn message type with its insert statement
    sql_by_type = (
        (lbsn.Origin, cls.origin_insertsql),
        (lbsn.Country, cls.country_insertsql),
        (lbsn.City, cls.city_insertsql),
        (lbsn.Place, cls.place_insertsql),
        (lbsn.User, cls.user_insertsql),
        (lbsn.UserGroup, cls.usergroup_insertsql),
        (lbsn.Post, cls.post_insertsql),
        (lbsn.Event, cls.event_insertsql),
        (lbsn.PostReaction, cls.postreaction_insertsql),
    )
    return {proto().DESCRIPTOR.name: insert_sql
            for proto, insert_sql in sql_by_type}
def extract_usergroup(cls, record, origin):
    """Map a flat input record onto a new lbsn.UserGroup message.

    Optional fields are only assigned when present: assigning None
    (the .get() default for a missing key) to a protobuf scalar field
    raises TypeError, so every value is guarded before assignment —
    consistent with the other extract_* mappers in this module.
    """
    usergroup = HF.new_lbsn_record_with_id(lbsn.UserGroup(),
                                           record.get('usergroup_guid'),
                                           origin)
    usergroup_name = record.get('usergroup_name')
    if usergroup_name:
        usergroup.usergroup_name = usergroup_name
    usergroup_description = record.get('usergroup_description')
    if usergroup_description:
        usergroup.usergroup_description = usergroup_description
    member_count = record.get('member_count')
    if member_count:
        usergroup.member_count = member_count
    usergroup_createdate = record.get('usergroup_createdate')
    if usergroup_createdate:
        usergroup.usergroup_createdate = usergroup_createdate
    # fetch user_owner once (the original queried the record twice);
    # also reference the owner's pkey when an owner id is available
    user_owner = record.get('user_owner')
    if user_owner:
        usergroup.user_owner = user_owner
        usergroup.user_owner_pkey.CopyFrom(
            HF.new_lbsn_record_with_id(lbsn.User(),
                                       user_owner,
                                       origin).pkey)
    return usergroup
def get_hll_metrics(cls, record) -> hll.HllMetrics:
    """Extracts hll metrics based on record type"""
    # pair each lbsn message type with its metrics extractor
    metric_getters = (
        (lbsn.Origin, cls.get_origin_metrics),
        (lbsn.Country, cls.get_country_metrics),
        (lbsn.City, cls.get_city_metrics),
        (lbsn.Place, cls.get_place_metrics),
        (lbsn.User, cls.get_user_metrics),
        (lbsn.UserGroup, cls.get_usergroup_metrics),
        (lbsn.Post, cls.get_post_metrics),
        (lbsn.PostReaction, cls.get_postreaction_metrics),
        (lbsn.Relationship, cls.get_relationship_metrics),
    )
    dispatch = {proto().DESCRIPTOR.name: getter
                for proto, getter in metric_getters}
    extract_function = dispatch.get(record.DESCRIPTOR.name)
    return extract_function(record)
def func_prepare_selector(self, record):
    """Select correct prepare function according to record type"""
    # pair each lbsn message type with its prepare method
    prepare_pairs = (
        (lbsn.Origin, self.prepare_lbsn_origin),
        (lbsn.Country, self.prepare_lbsn_country),
        (lbsn.City, self.prepare_lbsn_city),
        (lbsn.Place, self.prepare_lbsn_place),
        (lbsn.User, self.prepare_lbsn_user),
        (lbsn.UserGroup, self.prepare_lbsn_usergroup),
        (lbsn.Post, self.prepare_lbsn_post),
        (lbsn.Event, self.prepare_lbsn_event),
        (lbsn.PostReaction, self.prepare_lbsn_postreaction),
        (lbsn.Relationship, self.prepare_lbsn_relation),
    )
    dispatch = {proto().DESCRIPTOR.name: func
                for proto, func in prepare_pairs}
    prepare_function = dispatch.get(record.DESCRIPTOR.name)
    return prepare_function(record)
def extract_event(cls, record, origin):
    """Map a flat input record onto a new lbsn.Event message."""
    event = HF.new_lbsn_record_with_id(lbsn.Event(),
                                       record.get('event_guid'),
                                       origin)
    set_lbsn_attr(event, "name", record)
    # geometry fields are parsed before assignment
    for geom_attr in ("event_latlng", "event_area"):
        geom_value = record.get(geom_attr)
        if geom_value:
            setattr(event, geom_attr, parse_geom(geom_value))
    set_lbsn_attr(event, "event_website", record)
    # date fields are copied via the protobuf timestamp helper
    for date_attr in ("event_date", "event_date_start", "event_date_end"):
        date_value = record.get(date_attr)
        if date_value:
            copydate_lbsn_attr(getattr(event, date_attr), date_value)
    duration = record.get('duration')
    if duration:
        copyduration_lbsn_attr(event.duration, duration)
    # optional references to other lbsn records, only when a guid exists
    for guid_key, pkey_field, proto in (
            ('place_guid', event.place_pkey, lbsn.Place),
            ('city_guid', event.city_pkey, lbsn.City),
            ('country_guid', event.country_pkey, lbsn.Country)):
        guid = record.get(guid_key)
        if guid:
            set_lbsn_pkey(pkey_field, proto(), guid, origin)
    # user reference is always set (unguarded in the original, too)
    set_lbsn_pkey(event.user_pkey, lbsn.User(),
                  record.get('user_guid'), origin)
    # remaining plain attributes copied 1:1 when present
    for attr_name in (
            "event_description", "event_type", "event_share_count",
            "event_like_count", "event_comment_count",
            "event_views_count", "event_engage_count"):
        set_lbsn_attr(event, attr_name, record)
    return event
def get_func_record(cls, record: Dict[str, Any],
                    input_type: Optional[str] = None):
    """Returns mapping function for input_type"""
    # pair each lbsn message type with its extractor classmethod
    extractor_pairs = (
        (lbsn.Origin, cls.extract_origin),
        (lbsn.Country, cls.extract_country),
        (lbsn.City, cls.extract_city),
        (lbsn.Place, cls.extract_place),
        (lbsn.UserGroup, cls.extract_usergroup),
        (lbsn.User, cls.extract_user),
        (lbsn.Post, cls.extract_post),
        (lbsn.PostReaction, cls.extract_postreaction),
        (lbsn.Event, cls.extract_event),
    )
    dispatch = {proto().DESCRIPTOR.name: func
                for proto, func in extractor_pairs}
    mapping_func = dispatch.get(input_type)
    # origin is constructed the same way for every record type
    origin = lbsn.Origin()
    origin.origin_id = record.get('origin_id')
    return mapping_func(record, origin)
def __init__(self):
    """Initialize per-type record stores, dedup hash sets and counters."""
    # one storage dict per lbsn record type
    self.lbsn_origin_dict = {}
    self.lbsn_country_dict = {}
    self.lbsn_city_dict = {}
    self.lbsn_place_dict = {}
    self.lbsn_user_group_dict = {}
    self.lbsn_user_dict = {}
    self.lbsn_post_dict = {}
    self.lbsn_post_reaction_dict = {}
    self.lbsn_relationship_dict = {}
    # per-type sets of record key hashes (duplicate detection)
    hashed_types = (
        lbsn.Origin, lbsn.Post, lbsn.Country, lbsn.City, lbsn.Place,
        lbsn.UserGroup, lbsn.User, lbsn.PostReaction, lbsn.Relationship)
    self.key_hashes = {rt.DESCRIPTOR.name: set() for rt in hashed_types}
    self.count_glob = 0  # total number of records added
    self.count_glob_total = 0
    self.count_dup_merge = 0  # number of duplicate records merged
    self.count_dup_merge_total = 0
    # all record dicts in correct (insert) order,
    # paired with their type names as references (tuple)
    ordered_dicts = (
        (self.lbsn_origin_dict, lbsn.Origin),
        (self.lbsn_country_dict, lbsn.Country),
        (self.lbsn_city_dict, lbsn.City),
        (self.lbsn_place_dict, lbsn.Place),
        (self.lbsn_user_group_dict, lbsn.UserGroup),
        (self.lbsn_user_dict, lbsn.User),
        (self.lbsn_post_dict, lbsn.Post),
        (self.lbsn_post_reaction_dict, lbsn.PostReaction),
        (self.lbsn_relationship_dict, lbsn.Relationship),
    )
    self.all_dicts = [
        (rec_dict, rec_type().DESCRIPTOR.name)
        for rec_dict, rec_type in ordered_dicts]
def extract_post(cls, record, origin):
    """Map a flat input record onto a new lbsn.Post message.

    Optional attributes are only assigned when present in the record;
    enum-valued fields (geoaccuracy, post type) are resolved through
    the protobuf enum ``Value()`` lookup.

    Fix: removed a stray no-op expression statement
    (``post.post_geoaccuracy`` on its own line) that had no effect.
    """
    post = HF.new_lbsn_record_with_id(lbsn.Post(),
                                      record.get('post_guid'),
                                      origin)
    post_latlng = record.get("post_latlng")
    if post_latlng:
        setattr(post, "post_latlng", parse_geom(post_latlng))
    # optional references to other lbsn records
    place_guid = record.get('place_guid')
    if place_guid:
        set_lbsn_pkey(post.place_pkey, lbsn.Place(),
                      record.get('place_guid'), origin)
    city_guid = record.get('city_guid')
    if city_guid:
        set_lbsn_pkey(post.city_pkey, lbsn.City(),
                      record.get('city_guid'), origin)
    country_guid = record.get('country_guid')
    if country_guid:
        set_lbsn_pkey(post.country_pkey, lbsn.Country(),
                      record.get('country_guid'), origin)
    set_lbsn_pkey(post.user_pkey, lbsn.User(),
                  record.get('user_guid'), origin)
    pub_date = record.get('post_publish_date')
    if pub_date:
        copydate_lbsn_attr(post.post_publish_date, pub_date)
    set_lbsn_attr(post, "post_body", record)
    geo_acc = record.get("post_geoaccuracy")
    if geo_acc:
        # get enum value
        post.post_geoaccuracy = lbsn.Post.PostGeoaccuracy.Value(
            geo_acc.upper())
    set_lbsn_attr(post, "hashtags", record)
    set_lbsn_attr(post, "emoji", record)
    set_lbsn_attr(post, "post_like_count", record)
    set_lbsn_attr(post, "post_comment_count", record)
    set_lbsn_attr(post, "post_views_count", record)
    set_lbsn_attr(post, "post_title", record)
    crt_date = record.get('post_create_date')
    if crt_date:
        copydate_lbsn_attr(post.post_create_date, crt_date)
    set_lbsn_attr(post, "post_thumbnail_url", record)
    set_lbsn_attr(post, "post_url", record)
    post_type = record.get("post_type")
    if post_type:
        # get enum value
        post.post_type = lbsn.Post.PostType.Value(post_type.upper())
    set_lbsn_attr(post, "post_filter", record)
    set_lbsn_attr(post, "post_quote_count", record)
    set_lbsn_attr(post, "post_share_count", record)
    lang = record.get('post_language')
    if lang:
        ref_post_language = lbsn.Language()
        ref_post_language.language_short = lang
        post.post_language.CopyFrom(ref_post_language)
    set_lbsn_attr(post, "input_source", record)
    user_mentions = record.get("user_mentions")
    if user_mentions:
        mentioned_users_list = []
        for user_id in user_mentions:  # iterate over the list
            ref_user_record = HF.new_lbsn_record_with_id(
                lbsn.User(), user_id, origin)
            mentioned_users_list.append(ref_user_record)
        post.user_mentions_pkey.extend(
            [user_ref.pkey for user_ref in mentioned_users_list])
    set_lbsn_attr(post, "post_content_license", record)
    return post
def parse_json_post(self, json_string_dict, user_pkey=None):
    """Extract json post retrieved from Twitter API

    The process is nested, but pretty linear:
    1. Extract all relevant lbsn.Post Attributes
       1.a extract post coordinates
       1.b extract user attributes
       1.c extract place attributes (poi, city, neigborhood, admin, country)
       1.d extract extended tweet, if available,
           and extended entities, if available
    2. decide if post is reaction
       (reply, quote, share, see https://developer.twitter.com/
       en/docs/tweets/data-dictionary/overview/entities-object.html)
    3. if post is reaction, copy reduced reaction attributes
       from extracted lbsn.Post
    4. add post/reaction to recordDict
    5. process all referenced posts
       5.a Retweet(=Share) and Quote Tweets are special kinds of Tweets
           that contain the original Tweet as an embedded object.
       5.b Retweets have a top-level "retweeted_status" object,
           and Quoted Tweets have a "quoted_status" object

    Note: one input record may contain many lbsn records,
    therefore records are first added to self.lbsn_records
    to be later returned together
    """
    post_record = self.extract_post(
        json_string_dict, user_pkey)
    if not post_record:
        # in case no post record has been extracted
        # (e.g. non_geotagged clause)
        return
    # Assignment Step
    # check if post is reaction to other post
    # reaction means: reduced structure compared to post;
    # reactions often include the complete original post,
    # therefore nested processing necessary
    if HF.is_post_reaction(json_string_dict):
        if self.map_reactions is False:
            # reactions are disabled by configuration
            return
        post_reaction_record = self.map_postrecord_to_postreactionrecord(
            post_record)
        refuser_pkey = None
        if 'quoted_status' in json_string_dict:
            # Note: Quote is both: Share & Reply
            if 'user' not in json_string_dict.get('quoted_status'):
                # referenced user missing from nested json; substitute
                refuser_pkey = \
                    HF.substitute_referenced_user(json_string_dict,
                                                  self.origin,
                                                  self.log)
            post_reaction_record.reaction_type = lbsn.PostReaction.QUOTE
            ref_post_record = self.extract_post(
                json_string_dict.get('quoted_status'))
        elif 'retweeted_status' in json_string_dict:
            # Note: No retweets are available when data is queried
            # using Bounding Box because of Geo-Tweet limitation:
            # "Note that native Retweets are not matched by this
            # parameter. While the original Tweet may have a location,
            # the Retweet will not"
            # see https://developer.twitter.com/en/docs/
            # tweets/filter-realtime/guides/basic-stream-parameters.html
            if 'user' not in json_string_dict.get('retweeted_status'):
                # Current issue with Twitter search: the retweeting
                # user is not returned in retweeted_status
                # but we can get this from other information,
                # such as user_mentions field from the retweet
                # https://twittercommunity.com/t/status-retweeted-
                # status-quoted-status-user-missing-from-search-tweets-json-response/63355
                refuser_pkey = \
                    HF.substitute_referenced_user(json_string_dict,
                                                  self.origin,
                                                  self.log)
            post_reaction_record.reaction_type = lbsn.PostReaction.SHARE
            retweet_post = json_string_dict.get('retweeted_status')
            ref_post_record = self.extract_post(retweet_post, refuser_pkey)
        elif json_string_dict.get('in_reply_to_status_id_str'):
            # if reply, original tweet is not available (?)
            post_reaction_record.reaction_type = lbsn.PostReaction.COMMENT
            ref_post_record = \
                HF.new_lbsn_record_with_id(
                    lbsn.Post(),
                    json_string_dict.get('in_reply_to_status_id_str'),
                    self.origin)
            ref_user_record = \
                HF.new_lbsn_record_with_id(
                    lbsn.User(),
                    json_string_dict.get('in_reply_to_user_id_str'),
                    self.origin)
            ref_user_record.user_name = json_string_dict.get(
                'in_reply_to_screen_name')  # Needs to be saved
            self.lbsn_records.append(ref_user_record)
            ref_post_record.user_pkey.CopyFrom(ref_user_record.pkey)
        # add referenced post pkey to reaction
        # NOTE(review): ref_post_record would be unbound here if
        # is_post_reaction() matched none of the three branches above —
        # presumably it tests exactly these keys; confirm in HF
        if not self.disable_reaction_post_referencing:
            post_reaction_record.referencedPost_pkey.CopyFrom(
                ref_post_record.pkey)
            # ToDo: if a Reaction refers to another
            # reaction (Information Spread)
            # This information is currently not
            # [available from Twitter](https://developer.twitter.com/
            # en/docs/tweets/data-dictionary/overview/tweet-object):
            # "Note that retweets of retweets do not show
            # representations of the intermediary retweet [...]"
            # would be added to
            # postReactionRecord.referencedPostReaction_pkey
            if ref_post_record:
                self.lbsn_records.append(ref_post_record)
        # add postReactionRecord to Dict
        self.lbsn_records.append(post_reaction_record)
    else:
        # otherwise add post to self.lbsn_records
        # which already includes all other entries
        # (lbsn.User, lbsn.City, lbsn.Place etc.)
        self.lbsn_records.append(post_record)
def extract_user(self, json_string_dict):
    """Extract lbsn.User from Twitter json

    Fixes: Twitter's user ``lang`` attribute is deprecated (returned as
    null since 2019); assigning None to a protobuf string field raises
    TypeError, so it is now guarded. The profile image url check is
    guarded the same way (the previous ``if not url == default`` test
    passed for None and then assigned None).
    """
    user = json_string_dict
    user_record = HF.new_lbsn_record_with_id(
        lbsn.User(), user.get('id_str'), self.origin)
    # get additional information about the user, if available
    # NOTE(review): these direct assignments assume the fields exist in
    # the Twitter user object — true for standard API responses
    user_record.user_fullname = user.get('name')
    user_record.follows = user.get('friends_count')
    user_record.is_private = user.get('protected')
    user_record.followed = user.get('followers_count')
    user_bio = user.get('description')
    if user_bio:
        user_record.biography = user_bio
    user_record.user_name = user.get('screen_name')
    listed_count = user.get('listed_count')
    if listed_count:
        user_record.group_count = listed_count
    user_record.post_count = user.get('statuses_count')
    user_record.url = f'https://twitter.com/intent/user?user_id=' \
        f'{user_record.pkey.id}'
    # `lang` may be None (deprecated by Twitter) — guard before assign
    lang = user.get('lang')
    if lang:
        ref_user_language = lbsn.Language()
        ref_user_language.language_short = lang
        user_record.user_language.CopyFrom(ref_user_language)
    user_location = user.get('location')
    if user_location:
        user_record.user_location = user_location
        # resolve free-text location via pre-loaded geocode table
        if self.geocodes and user_record.user_location in self.geocodes:
            l_lat = self.geocodes[user_record.user_location][0]
            l_lng = self.geocodes[user_record.user_location][1]
            user_record.user_location_geom = "POINT(%s %s)" % (
                l_lng, l_lat)
    # userGeoLocation = user.get('profile_location') # todo!
    user_record.liked_count = user.get('favourites_count')
    user_record.active_since.CopyFrom(
        HF.json_date_string_to_proto(user.get('created_at')))
    # skip the default ("egg") avatar; also guard against a missing url
    default_profile_img = ('http://abs.twimg.com/sticky/'
                           'default_profile_images/'
                           'default_profile_normal.png')
    user_profile_image_url = user.get('profile_image_url')
    if user_profile_image_url and \
            user_profile_image_url != default_profile_img:
        user_record.profile_image_url = user_profile_image_url
    user_timezone = user.get('time_zone')
    if user_timezone:
        user_record.user_timezone = user_timezone
    user_utc_offset = user.get('utc_offset')
    if user_utc_offset:
        user_record.user_utc_offset = user_utc_offset
    # user group membership (e.g. lbsn.UserGroup relationships) cannot
    # be extracted from twitter post data — see git history for a
    # worked example of how such relations would be mapped
    return user_record
def extract_post(self, json_string_dict, user_pkey=None):
    """Extract lbsn.Post and referenced records from a tweet json.

    Referenced records (lbsn.User, lbsn.Place, lbsn.City, lbsn.Country)
    are appended to self.lbsn_records as a side effect.

    Returns:
        The extracted lbsn.Post record, or None when the post cannot or
        should not be extracted (empty guid, non-geotagged while
        ignore_non_geotagged is set, below min_geoaccuracy, or from an
        ignored source).

    Fix: the empty-guid branch previously did ``return None, None``;
    every caller checks the result with ``if not post_record`` and a
    (None, None) tuple is truthy, so the guard never fired. All exit
    paths now consistently return a single value; the docstring no
    longer claims a tuple return.
    """
    post_guid = json_string_dict.get('id_str')
    if not HF.check_notice_empty_post_guid(post_guid):
        return None
    post_record = HF.new_lbsn_record_with_id(lbsn.Post(),
                                             post_guid,
                                             self.origin)
    post_geoacc = None
    user_record = None
    user_info = json_string_dict.get('user')
    if user_info:
        # get lbsn.Post/Reaction details of lbsn.User
        user_record = self.extract_user(json_string_dict.get('user'))
    elif user_pkey:
        # userPkey is already available for posts that are statuses
        user_record = HF.new_lbsn_record_with_id(lbsn.User(),
                                                 user_pkey.id,
                                                 self.origin)
    if user_record:
        self.lbsn_records.append(user_record)
    else:
        self.log.warning(f'Record {self.lbsn_records.count_glob_total}: '
                         f'No lbsn.User record found for post: {post_guid} '
                         f'(post saved without userid)..')
        print(f'Record {self.lbsn_records.count_glob_total}', end='\r')
    # Some preprocessing for all types:
    post_coordinates = json_string_dict.get('coordinates')
    if post_coordinates:
        l_lng = post_coordinates.get('coordinates')[0]
        l_lat = post_coordinates.get('coordinates')[1]
        post_record.post_geoaccuracy = lbsn.Post.LATLNG
        post_record.post_latlng = "POINT(%s %s)" % (l_lng, l_lat)
    # Check if lbsn.Place is mentioned
    post_place_json = json_string_dict.get('place')
    if post_place_json:
        # we need some information from postRecord to create placeRecord
        # (e.g. user language, geoaccuracy, post_latlng)
        # some of the information from place will also modify postRecord
        # attributes; therefore return both
        if user_record:
            user_lang = user_record.user_language
        else:
            user_lang = None
        place_record, \
            post_geoacc, \
            post_country = self.extract_place(post_place_json,
                                              post_record.post_geoaccuracy,
                                              user_lang)
        if not post_record.post_geoaccuracy:
            post_record.post_geoaccuracy = post_geoacc
        self.lbsn_records.append(place_record)
        if post_country:
            post_record.country_pkey.CopyFrom(post_country.pkey)
        if isinstance(place_record, lbsn.City):
            post_record.city_pkey.CopyFrom(place_record.pkey)
        # either city or place, Twitter user cannot attach both (?)
        elif isinstance(place_record, lbsn.Place):
            post_record.place_pkey.CopyFrom(place_record.pkey)
        # substitute postRecord LatLng coordinates from placeRecord,
        # if not already set
        if not post_record.post_latlng:
            # Note: this will also substitute lbsn.Country lat/lng in
            # post; this information is also available by query of
            # country_guid in posts
            # use input arg min_geoaccuracy to exclude country geo-posts
            post_record.post_latlng = place_record.geom_center
    # if still no geoinformation, send post to Null-Island
    if not post_record.post_latlng:
        if self.ignore_non_geotagged is True:
            return None
        self.null_island += 1
        post_record.post_latlng = "POINT(%s %s)" % (0, 0)
    if self.min_geoaccuracy:
        if not HF.geoacc_within_threshold(post_record.post_geoaccuracy,
                                          self.min_geoaccuracy):
            self.skipped_low_geoaccuracy += 1
            return None
    # Process attributes of twitter post
    post_source = json_string_dict.get('source')
    if post_source:
        post_record.input_source = HF.cleanhtml(
            json_string_dict.get('source'))
        if self.ignore_sources_set and \
                post_record.input_source in self.ignore_sources_set:
            # skip entry if in ignore list
            self.skipped_ignore_list += 1
            return None
    post_record.post_publish_date.CopyFrom(
        HF.json_date_string_to_proto(json_string_dict.get('created_at')))
    if user_record:
        post_record.user_pkey.CopyFrom(user_record.pkey)
    post_record.post_quote_count = HF.value_count(
        json_string_dict.get('quote_count'))
    post_record.post_comment_count = HF.value_count(
        json_string_dict.get('reply_count'))
    post_record.post_share_count = HF.value_count(
        json_string_dict.get('retweet_count'))
    post_record.post_like_count = HF.value_count(
        json_string_dict.get('favorite_count'))
    post_record.post_url = f'https://twitter.com/statuses/{post_guid}'
    language_str = json_string_dict.get('lang')
    if language_str:
        post_language = lbsn.Language()
        post_language.language_short = json_string_dict.get('lang')
        post_record.post_language.CopyFrom(post_language)
    # If extended_tweet object is available,
    # process entities and post_body (text) data from extended object
    is_truncated = json_string_dict.get('truncated')
    if is_truncated and 'extended_tweet' in json_string_dict:
        # if the "truncated" field is set to true,
        # the "extended_tweet" object provides complete
        # "full_text" and "entities" tweet metadata;
        # source for all data is the extended object, if available
        json_string_dict = json_string_dict.get('extended_tweet')
        post_record.post_body = json_string_dict.get('full_text')
    else:
        if 'full_text' in json_string_dict:
            post_record.post_body = json_string_dict.get('full_text')
        else:
            post_record.post_body = json_string_dict.get('text')
    # entities section always exists and includes meta information
    # such as hashtags or user_mentions
    entities_json = json_string_dict.get('entities')
    # extract hashtags
    hashtags_json = entities_json.get('hashtags')
    if hashtags_json:
        for hashtag in hashtags_json:  # iterate over the list
            post_record.hashtags.append(hashtag.get("text"))
    # look for mentioned userRecords
    user_mentions_json = entities_json.get('user_mentions')
    if user_mentions_json:
        ref_user_records = HF.get_mentioned_users(user_mentions_json,
                                                  self.origin)
        self.lbsn_records.append(ref_user_records)
        post_record.user_mentions_pkey.extend(
            [user_ref.pkey for user_ref in ref_user_records])
        if self.map_full_relations:
            # NOTE(review): user_record may still be None here when
            # neither 'user' json nor user_pkey was available —
            # confirm upstream guarantees before relying on pkey
            self.extract_mentioned_users(
                ref_user_records, user_record.pkey.id)
    # sometimes, extended_entities section exists and includes
    # additional information on media, but never hashtags or
    # user_mentions. Since the media type metadata in the
    # extended_entities section correctly indicates the media type
    # ('photo', 'video' or 'animated_gif'), and supports up to
    # 4 photos, it is the preferred metadata source for native media.
    # See: https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/extended-entities-object.html#extended-entities-object
    if 'extended_entities' in json_string_dict:
        entities_json = json_string_dict.get('extended_entities')
    media_json = entities_json.get('media')
    if media_json:
        post_record.post_type = HF.assign_media_post_type(media_json)
    else:
        post_record.post_type = lbsn.Post.TEXT
    post_record.emoji.extend(HF.extract_emoji(post_record.post_body))
    return post_record
def parse_json_record(self, json_string_dict, input_lbsn_type=None):
    """Parse a Twitter API json object into a list of LBSN records.

    A single Twitter json may nest many LBSN entities: the lbsn.Post
    itself, the posting lbsn.User and its attributes, linked
    coordinates/places/cities/countries, the post language, shared or
    retweeted posts with their own attributes, @-mentioned users, and
    special jsons from other API endpoints (e.g. user groups). All
    extracted records are accumulated in self.lbsn_records and returned
    as one list so that db key relations hold on insert; the extraction
    order follows the order of db inserts. Default mappings may need
    adjustment for specific cases.
    """
    # clear any records from previous run
    self.lbsn_records.clear()
    # decide if main object is post or user json
    if input_lbsn_type and input_lbsn_type in ('friendslist',
                                               'followerslist'):
        # json maps user ids to lists of related user ids
        for user, related_user_list in json_string_dict.items():
            user_record = HF.new_lbsn_record_with_id(
                lbsn.User(), str(user), self.origin)
            self.lbsn_records.append(user_record)
            self.extract_related_users(related_user_list,
                                       input_lbsn_type, user_record)
    elif (input_lbsn_type and input_lbsn_type == 'profile') \
            or 'screen_name' in json_string_dict:
        # main object is a user profile
        user_record = self.extract_user(json_string_dict)
        self.lbsn_records.append(user_record)
        if not user_record.is_private:
            # if user profile is private, we cannot access posts;
            # otherwise pick the first available status variant
            user_status = None
            for status_key in ('status', 'quoted_status',
                               'retweeted_status'):
                if status_key in json_string_dict:
                    user_status = json_string_dict.get(status_key)
                    break
            # in case user status is available
            if user_status:
                self.parse_json_post(
                    user_status, user_pkey=user_record.pkey)
    else:
        # otherwise, parse post
        self.parse_json_post(json_string_dict)
    # finally, return list of all extracted records
    return self.lbsn_records
def extract_flickr_post(self, record):
    """Map one Flickr YFCC100M CSV row to a list of LBSN records.

    Tailored to the fixed YFCC100M column layout documented below;
    adapt if the CSV structure changes.

    To Do:
        - parameterize column numbers and structure
        - provide external config-file for specific CSV structures
        - MachineTags, GeoContext (indoors, outdoors), WoeId and some
          extra Flickr-only attributes are currently not mapped

    Column layout (index - content - example):
        0  row-number - 0
        1  Photo/video identifier - 6985418911
        2  lbsn.User NSID(PostID?) - 4e2f7a26a1dfbf165a7e30bdabf7e72a
        3  lbsn.User ID - 39089491@N00
        4  lbsn.User nickname - gnuckx
        5  Date taken - 2012-02-16 09:56:37.0
        6  Date uploaded - 1331840483
        7  Capture device - Canon+PowerShot+ELPH+310+HS
        8  Title - IMG_0520
        9  Description - My vacation
        10 tags (comma-separated) - canon,canon+powershot+hs+310
        11 Machine tags (comma-separated) - landscape, hills, water
        12 Longitude - -81.804885
        13 Latitude - 24.550558
        14 Accuracy - 12
        15 Photo/video page URL
        16 Photo/video download URL
        17 License name - Attribution-NonCommercial-NoDerivs License
        18 License URL
        19 Photo/video server identifier - 7205
        20 Photo/video farm identifier - 8
        21 Photo/video secret - df7747990d
        22 Photo/video secret original - 692d7e0a7f
        23 Extension of the original photo - jpg
        24 Marker (0 = photo, 1 = video) - 0
        if concat:
        25 Photo/video identifier
        26 Place references (null to multiple)
    """
    # a single input row can yield several lbsn records, so collect
    # them and return the whole list
    records = []
    post_guid = record[1]
    if not HF.check_notice_empty_post_guid(post_guid):
        return None
    post_record = HF.new_lbsn_record_with_id(lbsn.Post(),
                                             post_guid,
                                             self.origin)
    user_record = HF.new_lbsn_record_with_id(lbsn.User(),
                                             record[3],
                                             self.origin)
    # YFCC100M is URL-encoded and uses '+' for spaces
    user_record.user_name = unquote(record[4]).replace('+', ' ')
    user_record.url = f'http://www.flickr.com/photos/{user_record.pkey.id}/'
    if user_record:
        post_record.user_pkey.CopyFrom(user_record.pkey)
        records.append(user_record)
    post_record.post_latlng = self.flickr_extract_postlatlng(record)
    geoaccuracy = importer.flickr_map_geoaccuracy(record[14])
    if geoaccuracy:
        post_record.post_geoaccuracy = geoaccuracy
    # place record available in separate yfcc100m dataset
    # if record[1]:
    # we need some information from postRecord to create placeRecord
    # (e.g. user language, geoaccuracy, post_latlng)
    # some of the information from place will also modify postRecord
    # place_record = HF.new_lbsn_record_with_id(lbsn.Place(),
    #                                           record[1],
    #                                           self.origin)
    # records.append(place_record)
    # post_record.place_pkey.CopyFrom(place_record.pkey)
    post_record.post_publish_date.CopyFrom(
        HF.parse_timestamp_string_to_protobuf(record[6]))
    created_date = HF.parse_csv_datestring_to_protobuf(
        record[5], t_format='%Y-%m-%d %H:%M:%S.%f')
    if created_date:
        post_record.post_create_date.CopyFrom(created_date)
    post_record.post_views_count = 0
    post_record.post_comment_count = 0
    post_record.post_like_count = 0
    post_record.post_url = record[15]
    # YFCC100M dataset contains HTML codes (%20) and
    # space character is replaced by +
    post_record.post_body = unquote(record[9]).replace('+', ' ')
    post_record.post_title = unquote(record[8]).replace('+', ' ')
    post_record.post_thumbnail_url = record[16]  # note: fullsize url!
    # split tags on ',' and '+' because by lbsn-spec,
    # tags are limited to single words; deduplicate and
    # drop empties produced by the split
    hashtag_set = {
        HF.remove_prefix(unquote(tag), "#")
        for tag in re.split("[,+]+", record[10])}
    hashtag_set.discard('')
    for hashtag in hashtag_set:
        post_record.hashtags.append(
            importer.clean_tags_from_flickr(hashtag))
    machine_tags = {
        unquote(mtag) for mtag in re.split("[,+]+", record[11])}
    # all videos appear to have 'video' in machine tags
    post_record.post_type = (
        lbsn.Post.VIDEO if 'video' in machine_tags else lbsn.Post.IMAGE)
    # replace text-string of content license by integer-id
    if record[17] is not None:
        post_record.post_content_license = \
            self.get_license_number_from_license_name(record[17])
    # place record available in separate yfcc100m dataset;
    # if records were parsed as joined urls, length is larger than 25
    if len(record) > 25:
        combined_records = self.extract_flickr_place(
            record[25:], post_record=post_record)
        if combined_records is None:
            records.append(post_record)
        else:
            records.extend(combined_records)
    else:
        records.append(post_record)
    return records
""" Module for db input connection sql mapping """ import enum from typing import Union, Optional, List, Tuple from lbsnstructure import lbsnstructure_pb2 as lbsn """Schema convention from lbsn db spec""" LBSN_SCHEMA = [ (lbsn.Origin().DESCRIPTOR.name, "social", "origin", "origin_id"), (lbsn.Country().DESCRIPTOR.name, "spatial", "country", "country_guid"), (lbsn.City().DESCRIPTOR.name, "spatial", "city", "city_guid"), (lbsn.Place().DESCRIPTOR.name, "spatial", "place", "place_guid"), (lbsn.UserGroup().DESCRIPTOR.name, "social", "user_groups", "usergroup_guid"), (lbsn.User().DESCRIPTOR.name, "social", "user", "user_guid"), (lbsn.Post().DESCRIPTOR.name, "topical", "post", "post_guid"), (lbsn.PostReaction().DESCRIPTOR.name, "topical", "post_reaction", "reaction_guid"), ] def optional_schema_override( LBSN_SCHEMA: List[Tuple[str, str, str, str]], schema_table_overrides: List[Tuple[str, str]]) -> List[Tuple[str, str, str, str]]: """Override schema and table name for selected lbsn objects.""" LBSN_SCHEMA_OVERRIDE = [] for lbsn_type, schema_name, table_name, key_col in LBSN_SCHEMA: for schema_table_override in schema_table_overrides: lbsn_object_ref, schema_table_override = schema_table_override try: