def extract_related_users(
        self, related_user_list, input_lbsn_type, user_record):
    """Extract related users from user list

    For every entry of related_user_list a new lbsn.User record is
    created from str(entry) and self.origin and appended to
    self.lbsn_records. An isCONNECTED relationship between user_record
    and that new user is then handed to
    self.lbsn_records.add_relationship_to_dict().

    Args:
        related_user_list: Iterable of user identifiers; each entry is
            passed through str() before use.
        input_lbsn_type: 'friendslist' or 'followerslist'; decides the
            order in which the two user ids are put into the
            relationship.
        user_record: Record whose pkey.id identifies the user the list
            belongs to.

    Raises:
        ValueError: If related_user_list is not empty and
            input_lbsn_type is not one of the two supported values.
    """
    is_supported_type = input_lbsn_type in ('friendslist', 'followerslist')
    for related_user in related_user_list:
        if not is_supported_type:
            # Without this guard neither branch below would bind a
            # relationship record and the loop would die with an
            # UnboundLocalError after already appending a user record.
            raise ValueError(
                f"Unsupported input_lbsn_type {input_lbsn_type!r}; "
                f"expected 'friendslist' or 'followerslist'")
        related_record = HF.new_lbsn_record_with_id(
            lbsn.User(), str(related_user), self.origin)
        self.lbsn_records.append(related_record)
        # note the switch of order here,
        # direction is important for 'isConnected',
        # and the different list each give us a
        # different view on this relationship
        if input_lbsn_type == 'friendslist':
            first_id = user_record.pkey.id
            second_id = related_record.pkey.id
        else:
            # 'followerslist'
            first_id = related_record.pkey.id
            second_id = user_record.pkey.id
        relationship_record = HF.new_lbsn_relation_with_id(
            lbsn.Relationship(), first_id, second_id, self.origin)
        relationship_record.relationship_type = \
            lbsn.Relationship.isCONNECTED
        self.lbsn_records.add_relationship_to_dict(relationship_record)
def __init__(self, relationship=None):
    """Copy the key fields of a relationship record onto this object.

    Args:
        relationship: Relationship record to read from. When None, a
            freshly created lbsn.Relationship() is used instead.
    """
    if relationship is None:
        relationship = lbsn.Relationship()
    pkey = relationship.pkey
    # origin and guid are taken from the "to" side of the relation,
    # guid_rel from the "from" side
    self.origin_id = pkey.relation_to.origin.origin_id
    self.guid = pkey.relation_to.id
    self.guid_rel = pkey.relation_from.id
    # store the relationship type as the lower-cased name of its
    # enum value
    type_name = lbsn.Relationship().RelationshipType.Name(
        relationship.relationship_type)
    self.rel_type = HF.null_check(type_name).lower()
def dict_type_switcher(desc_name):
    """Create protoBuf messages by name

    Returns a new, empty message whose descriptor name equals
    desc_name, or None when desc_name matches none of the types
    listed below.
    """
    message_types = (
        lbsn.Country,
        lbsn.City,
        lbsn.Place,
        lbsn.User,
        lbsn.UserGroup,
        lbsn.Post,
        lbsn.PostReaction,
        lbsn.Relationship,
    )
    messages_by_name = {}
    for message_type in message_types:
        message = message_type()
        messages_by_name[message.DESCRIPTOR.name] = message
    return messages_by_name.get(desc_name)
def update_key_hash(self, record, key=None):
    """Register a record in the per-type set of known keys

    The sets hold the pkeys seen for each record type, so they can be
    used to detect duplicates or to count the unique records of a type
    (number of unique Users, Countries, Places etc.).
    This assumes that origin_id remains the same in each program
    iteration!

    Args:
        record: Record to register; its DESCRIPTOR.name selects the
            set that is updated.
        key: Optional ready-made key. When given it is stored as-is
            and nothing is derived from the record.
    """
    known_keys = self.key_hashes[record.DESCRIPTOR.name]
    if key is not None:
        known_keys.add(key)
        return
    if record.DESCRIPTOR.name != lbsn.Relationship().DESCRIPTOR.name:
        # all other entities can be globally uniquely
        # identified by their local guid
        known_keys.add(record.pkey.id)
        return
    # we need the complete uuid of both entities for
    # relationships because they can span different origin_ids
    relation_to = record.pkey.relation_to
    relation_from = record.pkey.relation_from
    known_keys.add(
        f'{relation_to.origin.origin_id}'
        f'{relation_to.id}'
        f'{relation_from.origin.origin_id}'
        f'{relation_from.id}'
        f'{record.relationship_type}')
def extract_mentioned_users(self, ref_user_records, user_record_id):
    """Append a MENTIONS_USER relationship for each referenced user

    Args:
        ref_user_records: Iterable of records; the pkey.id of each one
            is used as the second id of a new relationship.
        user_record_id: Id used as the first id of every relationship.
    """
    for mentioned_user in ref_user_records:
        mention = HF.new_lbsn_relation_with_id(
            lbsn.Relationship(),
            user_record_id,
            mentioned_user.pkey.id,
            self.origin)
        mention.relationship_type = lbsn.Relationship.MENTIONS_USER
        # this variant stores the relationship via plain append()
        self.lbsn_records.append(mention)
def extract_mentioned_users(self, ref_user_records, user_record_id):
    """Extract mentioned user from ref user records list

    One MENTIONS_USER relationship is created per entry of
    ref_user_records and passed to
    self.lbsn_records.add_relationship_to_dict().

    Args:
        ref_user_records: Iterable of records whose pkey.id is used as
            the second id of the relationship.
        user_record_id: Id used as the first id of every relationship.
    """
    for ref_user in ref_user_records:
        mentioned_id = ref_user.pkey.id
        mention_relation = HF.new_lbsn_relation_with_id(
            lbsn.Relationship(), user_record_id, mentioned_id, self.origin)
        mention_relation.relationship_type = lbsn.Relationship.MENTIONS_USER
        self.lbsn_records.add_relationship_to_dict(mention_relation)
def get_hll_metrics(cls, record) -> hll.HllMetrics:
    """Extracts hll metrics based on record type

    The extractor is chosen by record.DESCRIPTOR.name and called with
    the record; its result is returned unchanged.

    Args:
        record: Record whose DESCRIPTOR.name identifies its type.

    Raises:
        TypeError: If no extractor is registered for the record type.
    """
    # Descriptor names are read from the message classes, so no
    # throwaway message instances are created on every call.
    dict_switcher = {
        lbsn.Origin.DESCRIPTOR.name: cls.get_origin_metrics,
        lbsn.Country.DESCRIPTOR.name: cls.get_country_metrics,
        lbsn.City.DESCRIPTOR.name: cls.get_city_metrics,
        lbsn.Place.DESCRIPTOR.name: cls.get_place_metrics,
        lbsn.User.DESCRIPTOR.name: cls.get_user_metrics,
        lbsn.UserGroup.DESCRIPTOR.name: cls.get_usergroup_metrics,
        lbsn.Post.DESCRIPTOR.name: cls.get_post_metrics,
        lbsn.PostReaction.DESCRIPTOR.name: cls.get_postreaction_metrics,
        lbsn.Relationship.DESCRIPTOR.name: cls.get_relationship_metrics,
    }
    record_type = record.DESCRIPTOR.name
    extract_function = dict_switcher.get(record_type)
    if extract_function is None:
        # Calling the None returned by .get() would also raise
        # TypeError, but without saying which record type was the
        # problem.
        raise TypeError(
            f"No hll metrics extractor for record type '{record_type}'")
    return extract_function(record)
def func_prepare_selector(self, record):
    """Select correct prepare function according to record type

    The prepare function is chosen by record.DESCRIPTOR.name and
    called with the record; its result is returned unchanged.

    Args:
        record: Record whose DESCRIPTOR.name identifies its type.

    Raises:
        TypeError: If no prepare function is registered for the
            record type.
    """
    # Descriptor names are read from the message classes, so no
    # throwaway message instances are created on every call.
    dict_switcher = {
        lbsn.Origin.DESCRIPTOR.name: self.prepare_lbsn_origin,
        lbsn.Country.DESCRIPTOR.name: self.prepare_lbsn_country,
        lbsn.City.DESCRIPTOR.name: self.prepare_lbsn_city,
        lbsn.Place.DESCRIPTOR.name: self.prepare_lbsn_place,
        lbsn.User.DESCRIPTOR.name: self.prepare_lbsn_user,
        lbsn.UserGroup.DESCRIPTOR.name: self.prepare_lbsn_usergroup,
        lbsn.Post.DESCRIPTOR.name: self.prepare_lbsn_post,
        lbsn.Event.DESCRIPTOR.name: self.prepare_lbsn_event,
        lbsn.PostReaction.DESCRIPTOR.name: self.prepare_lbsn_postreaction,
        lbsn.Relationship.DESCRIPTOR.name: self.prepare_lbsn_relation,
    }
    record_type = record.DESCRIPTOR.name
    prepare_function = dict_switcher.get(record_type)
    if prepare_function is None:
        # Calling the None returned by .get() would also raise
        # TypeError, but without saying which record type was the
        # problem.
        raise TypeError(
            f"No prepare function for record type '{record_type}'")
    return prepare_function(record)
def __init__(self):
    """Set up empty record dicts, key sets and counters

    One dict is created per record type, plus a set of already seen
    keys per type (key_hashes) and the counters for added and merged
    records.
    """
    self.lbsn_origin_dict = {}
    self.lbsn_country_dict = {}
    self.lbsn_city_dict = {}
    self.lbsn_place_dict = {}
    self.lbsn_user_group_dict = {}
    self.lbsn_user_dict = {}
    self.lbsn_post_dict = {}
    self.lbsn_post_reaction_dict = {}
    self.lbsn_relationship_dict = {}

    # one set of known keys per record type
    hashed_types = (
        lbsn.Origin,
        lbsn.Post,
        lbsn.Country,
        lbsn.City,
        lbsn.Place,
        lbsn.UserGroup,
        lbsn.User,
        lbsn.PostReaction,
        lbsn.Relationship,
    )
    self.key_hashes = {
        record_type.DESCRIPTOR.name: set() for record_type in hashed_types}

    # total number of records added
    self.count_glob = 0
    self.count_glob_total = 0
    # number of duplicate records merged
    self.count_dup_merge = 0
    self.count_dup_merge_total = 0

    # returns all recordsDicts in correct order,
    # with names as references (tuple)
    self.all_dicts = [
        (self.lbsn_origin_dict, lbsn.Origin.DESCRIPTOR.name),
        (self.lbsn_country_dict, lbsn.Country.DESCRIPTOR.name),
        (self.lbsn_city_dict, lbsn.City.DESCRIPTOR.name),
        (self.lbsn_place_dict, lbsn.Place.DESCRIPTOR.name),
        (self.lbsn_user_group_dict, lbsn.UserGroup.DESCRIPTOR.name),
        (self.lbsn_user_dict, lbsn.User.DESCRIPTOR.name),
        (self.lbsn_post_dict, lbsn.Post.DESCRIPTOR.name),
        (self.lbsn_post_reaction_dict, lbsn.PostReaction.DESCRIPTOR.name),
        (self.lbsn_relationship_dict, lbsn.Relationship.DESCRIPTOR.name),
    ]
def submit_lbsn_relationships(self):
    """submit relationships of different types

    record[1] is the PostgresQL formatted list of values,
    record[0] is the type of relationship that determines
        the table selection

    For each relationship type, the matching batched records are
    written to CSV (when self.store_csv is set) and inserted into the
    matching interlinkage table (when self.db_cursor is set). Types
    without any batched record are skipped.
    """
    # (relationship type, target table, ON CONFLICT columns),
    # listed in the order in which they are submitted
    relationship_targets = (
        ("isfriend", "_user_friends_user",
         "origin_id, user_guid, friend_guid"),
        ("isconnected", "_user_connectsto_user",
         "origin_id, user_guid, connectedto_user_guid"),
        ("ingroup", "_user_memberof_group",
         "origin_id, user_guid, group_guid"),
        ("followsgroup", "_user_follows_group",
         "origin_id, user_guid, group_guid"),
        ("mentions_user", "_user_mentions_user",
         "origin_id, user_guid, mentioneduser_guid"),
    )
    relationship_name = lbsn.Relationship().DESCRIPTOR.name
    for rel_type, table_name, conflict_columns in relationship_targets:
        # the batch is re-read for every type, so each selection sees
        # the state left behind by the previous submit
        selected_values = [
            relationship[1]
            for relationship in self.batched_lbsn_records[relationship_name]
            if relationship[0] == rel_type
        ]
        if not selected_values:
            continue
        if self.store_csv:
            self.csv_output.store_append_batch_to_csv(
                selected_values, self.count_round, table_name)
        if self.db_cursor:
            # NOTE(review): the values are spliced into the statement
            # as text; this relies on them having been escaped when
            # they were formatted upstream - confirm.
            values_sql = ','.join(selected_values)
            insert_sql = \
                f'''
                INSERT INTO interlinkage."{table_name}" (
                    {self.typeNamesHeaderDict[table_name]})
                VALUES {values_sql}
                ON CONFLICT ({conflict_columns})
                DO NOTHING
                '''
            self.submit_batch(insert_sql)