def __init__(self, record: lbsn.Post = None, emoji: str = None):
    """Initialize emoji/lat-lng keyed base structure from an lbsn Post.

    All key/attr/metric slots start empty; they are only filled when
    both an emoji and a record are supplied.
    """
    super().__init__()
    self.key['latitude'] = None
    self.key['longitude'] = None
    self.key['emoji'] = None
    self.attrs['latlng_geom'] = None
    self.metrics['pud_hll'] = set()
    if emoji is None:
        # init empty
        return
    self.key['emoji'] = emoji
    if record is None:
        # init empty
        return
    if not isinstance(record, lbsn.Post):
        raise ValueError("Parsing of LatLngBase only supported "
                         "from lbsn.Post")
    coordinates_geom = record.post_latlng
    post_coords = HF.get_coordinates_from_ewkt(coordinates_geom)
    self.key['latitude'] = post_coords.lat
    self.key['longitude'] = post_coords.lng
    # additional (optional) attributes,
    # formatted ready for sql upsert
    self.attrs['latlng_geom'] = HF.return_ewkb_from_geotext(
        coordinates_geom)
def __init__(self, record: lbsn.Post = None, hashtag: str = None):
    """Initialize year/month + lat-lng keyed base structure from an
    lbsn Post.

    Keys stay empty when no record is given or the post has no
    usable date.
    """
    super().__init__()
    self.key['year'] = None
    self.key['month'] = None
    self.key['latitude'] = None
    self.key['longitude'] = None
    self.attrs['latlng_geom'] = None
    if record is None:
        # init empty
        return
    if not isinstance(record, lbsn.Post):
        raise ValueError("Parsing of MonthLatLngBase only supported "
                         "from lbsn.Post")
    post_date_time = HLF.merge_dates_post(record)
    if post_date_time is None:
        # no create/publish date available: leave keys empty
        return
    post_date = post_date_time.date()
    self.key['year'] = post_date.year
    self.key['month'] = post_date.month
    coordinates_geom = record.post_latlng
    post_coords = HF.get_coordinates_from_ewkt(coordinates_geom)
    self.key['latitude'] = post_coords.lat
    self.key['longitude'] = post_coords.lng
    # additional (optional) attributes,
    # formatted ready for sql upsert
    self.attrs['latlng_geom'] = HF.return_ewkb_from_geotext(
        coordinates_geom)
def merge_records(self, duplicate_record_lines, type_name):
    """Merge multiple proto buf records to one, eliminating
    duplicates and merging information.

    Single-element input is returned as-is; identical lines collapse
    to one; otherwise records are decoded, deep-merged and re-encoded.
    """
    if len(duplicate_record_lines) <= 1:
        # nothing to merge
        return duplicate_record_lines[0]
    # first do a simple compare/unique
    unique_records = set(duplicate_record_lines)
    if len(unique_records) == 1:
        # all lines byte-identical: take the single unique element
        return next(iter(unique_records))
    # more than one unique record info:
    # get first and deep-compare-merge with following
    merged = self.get_record_from_base64_encoded_string(
        duplicate_record_lines[0], type_name)
    for encoded_line in duplicate_record_lines[1:]:
        # merge current record with previous until no more found
        current = self.get_record_from_base64_encoded_string(
            encoded_line, type_name)
        # will modify/overwrite merged record in place
        HF.merge_existing_records(merged, current)
    return self.serialize_encode_record(merged)
def hll_concat_latlng(record: lbsn.Post) -> str:
    """Concat post lat lng coordinates to string

    Posts without 'latlng' geoaccuracy map to Null Island ('0:0').
    """
    if record.post_geoaccuracy != 'latlng':
        return '0:0'
    coordinates_geom = HF.null_check(record.post_latlng)
    post_coords = HF.get_coordinates_from_ewkt(coordinates_geom)
    return f'{post_coords.lat}:{post_coords.lng}'
def __init__(self, record: Union[lbsn.Post, lbsn.Place, lbsn.City,
                                 lbsn.Country] = None):
    """Initialize place-keyed base structure from an lbsn record.

    For posts the guid is a lat:lng concat; for place/city/country
    records the record's own origin guid is used.
    """
    super().__init__()
    self.key["guid"] = None
    self.attrs['name'] = None
    self.attrs['geom_center'] = None
    self.attrs['geom_area'] = None
    self.metrics['pud_hll'] = set()
    self.metrics['utl_hll'] = set()
    self.metrics['latlng_hll'] = set()
    if record is None:
        # init empty
        return
    name = None
    geom_area = None
    if isinstance(record, lbsn.Post):
        coordinates_geom = record.post_latlng
        post_coords = HF.get_coordinates_from_ewkt(coordinates_geom)
        # use concat lat:lng as key if no place_key available;
        # this should later implement assignment based on area
        # intersection
        self.key["guid"] = HLF.hll_concat(
            [post_coords.lat, post_coords.lng])
    elif isinstance(record, (lbsn.Place, lbsn.City, lbsn.Country)):
        name = HF.null_check(record.name)
        coordinates_geom = record.geom_center
        geom_area = record.geom_area
        # use key from place, city or country record
        self.key["guid"] = HLF.hll_concat_origin_guid(record)
    self.attrs['name'] = name
    self.attrs['geom_center'] = HF.return_ewkb_from_geotext(
        coordinates_geom)
    self.attrs['geom_area'] = HF.return_ewkb_from_geotext(geom_area)
def extract_related_users( self, related_user_list, input_lbsn_type, user_record): """Extract related users from user list""" for related_user in related_user_list: related_record = HF.new_lbsn_record_with_id(lbsn.User(), str(related_user), self.origin) self.lbsn_records.append(related_record) # note the switch of order here, # direction is important for 'isConnected', # and the different list each give us a # different view on this relationship if input_lbsn_type == 'friendslist': relationship_record =\ HF.new_lbsn_relation_with_id(lbsn.Relationship(), user_record.pkey.id, related_record.pkey.id, self.origin) elif input_lbsn_type == 'followerslist': relationship_record = \ HF.new_lbsn_relation_with_id(lbsn.Relationship(), related_record.pkey.id, user_record.pkey.id, self.origin) relationship_record.relationship_type = \ lbsn.Relationship.isCONNECTED self.lbsn_records.add_relationship_to_dict( relationship_record)
def merge_dates_post(record: lbsn.Post = None) -> Optional[dt.datetime]:
    """Merge post_publish and post_created attributes

    Prefers post_create_date; falls back to post_publish_date
    (which may itself be None).
    """
    post_create_date = HF.null_check_datetime(record.post_create_date)
    post_publish_date = HF.null_check_datetime(record.post_publish_date)
    return post_create_date if post_create_date is not None \
        else post_publish_date
def _open_input_files(self) -> Iterator[IO[str]]:
    """Loops input filelist and yields opened file handles

    Side effects per file: increments self.continue_number and
    records self.current_source.
    """
    # process localfiles
    for source_path in self.filelist:
        self.continue_number += 1
        self.current_source = source_path
        HF.log_main_debug(f'Current file: {ntpath.basename(source_path)}')
        yield open(
            source_path, 'r', encoding="utf-8", errors='replace')
def hll_concat_upt_hll(record: lbsn.Post) -> List[str]:
    """Concat all post terms (body, title, hashtags) and return list

    Hashtags shorter than 3 characters are dropped; all tags are
    lower-cased before the union.
    """
    tag_terms = {tag.lower() for tag in record.hashtags if len(tag) > 2}
    all_post_terms = set().union(
        HF.select_terms(record.post_body),
        HF.select_terms(record.post_title),
        tag_terms)
    user_hll = HLLFunctions.hll_concat_user(record)
    return HLLFunctions.hll_concat_user_terms(user_hll, all_post_terms)
def prepare_lbsn_country(cls, record):
    """Get common attributes for records of type lbsn.Country"""
    shared = PlaceBaseAttrShared(record)
    geom_center_ewkb = HF.return_ewkb_from_geotext(shared.geom_center)
    geom_area_ewkb = HF.return_ewkb_from_geotext(shared.geom_area)
    # tuple order matches the target sql upsert statement
    return (
        shared.origin_id,
        shared.guid,
        shared.name,
        shared.name_alternatives,
        geom_center_ewkb,
        geom_area_ewkb,
        shared.url,
    )
def __init__(self, record=None):
    """Extract shared attributes from an lbsn.UserGroup record.

    A missing record initializes an empty structure.
    """
    if record is None:
        record = lbsn.UserGroup()
    nc = HF.null_check
    self.origin_id = record.pkey.origin.origin_id
    self.guid = record.pkey.id
    self.usergroup_name = nc(record.usergroup_name)
    self.usergroup_description = nc(record.usergroup_description)
    self.member_count = nc(record.member_count)
    self.usergroup_createdate = HF.null_check_datetime(
        record.usergroup_createdate)
    self.user_owner = nc(record.user_owner_pkey.id)
def prepare_lbsn_city(cls, record):
    """Get common attributes for records of type lbsn.City"""
    shared = PlaceBaseAttrShared(record)
    country_guid = HF.null_check(record.country_pkey.id)
    sub_type = HF.null_check(record.sub_type)
    geom_center_ewkb = HF.return_ewkb_from_geotext(shared.geom_center)
    geom_area_ewkb = HF.return_ewkb_from_geotext(shared.geom_area)
    # tuple order matches the target sql upsert statement
    return (
        shared.origin_id,
        shared.guid,
        shared.name,
        shared.name_alternatives,
        geom_center_ewkb,
        geom_area_ewkb,
        shared.url,
        country_guid,
        sub_type,
    )
def __init__(self, record=None):
    """Extract shared attributes from lbsn.Place-like records.

    Fix: duplicates in name_alternatives are now deduplicated with a
    deterministic sorted() order instead of list(set(...)), whose
    order varies between runs — consistent with the sorted dedupe in
    sort_clean_proto_repeated_field.
    """
    if record is None:
        record = lbsn.Place()
        # init empty structure
    self.origin_id = record.pkey.origin.origin_id  # = 3
    self.guid = record.pkey.id
    self.name = HF.null_check(record.name)
    # because ProtoBuf Repeated Field does not support distinct rule,
    # we remove any duplicates in list fields prior to submission here;
    # sorted() keeps the output deterministic
    self.name_alternatives = sorted(set(record.name_alternatives))
    if self.name and self.name in self.name_alternatives:
        # the primary name is not repeated among alternatives
        self.name_alternatives.remove(self.name)
    self.url = HF.null_check(record.url)
    self.geom_center = HF.null_check(record.geom_center)
    self.geom_area = HF.null_check(record.geom_area)
def extract_user(self, json_string_dict):
    """Create an lbsn.User record keyed by the json 'id' field."""
    user_id = json_string_dict.get('id')
    return HF.new_lbsn_record_with_id(lbsn.User(), user_id, self.origin)
def prepare_lbsn_event(cls, record):
    """Get common attributes for records of type lbsn.Event"""
    event = EventAttrShared(record)
    latlng_ewkb = HF.return_ewkb_from_geotext(event.event_latlng)
    area_ewkb = HF.return_ewkb_from_geotext(event.event_area)
    # tuple order matches the target sql upsert statement
    return (
        event.origin_id,
        event.event_guid,
        event.name,
        latlng_ewkb,
        area_ewkb,
        event.event_website,
        event.event_date,
        event.event_date_start,
        event.event_date_end,
        event.duration,
        event.place_guid,
        event.city_guid,
        event.country_guid,
        event.user_guid,
        event.event_description,
        event.event_type,
        event.event_share_count,
        event.event_like_count,
        event.event_comment_count,
        event.event_views_count,
        event.event_engage_count,
    )
def extract_place(self, postplace_json):
    """Extract an lbsn.Place record from a post's place json.

    Fixes:
    - removed a duplicated name-cleaning line
    - the cleaned place name is now assigned to the record (it was
      computed but never stored)
    - guards against a missing 'name' key (previously AttributeError)

    Returns None when the place json has no id (entry is skipped).
    """
    place = postplace_json
    place_id = place.get('id')
    if not place_id:
        self.log.warning(f'No PlaceGuid\n\n{place}')
        input("Press Enter to continue... (entry will be skipped)")
        return
    lon_center = place.get('lng')
    lat_center = place.get('lat')
    if lon_center is None or lat_center is None:
        # assign place to Null Island
        lon_center = 0
        lat_center = 0
    # place_guid
    # For POIs, City is not available on Twitter
    place_record = HF.new_lbsn_record_with_id(
        lbsn.Place(), place_id, self.origin)
    place_record.geom_center = "POINT(%s %s)" % (lon_center, lat_center)
    # for some reason, twitter place entities sometimes contain
    # linebreaks or whitespaces. We don't want this.
    place_name = (place.get('name') or '').replace('\n\r', '')
    # remove multiple whitespace
    place_name = re.sub(' +', ' ', place_name)
    if place_name:
        place_record.name = place_name
    place_slug = place.get('slug')
    if place_slug:
        # NOTE(review): the comment above refers to Twitter, but this
        # URL points to Instagram's location explorer — confirm origin
        place_record.url = (
            f"https://www.instagram.com/explore/locations/"
            f"{place_id}/{place_slug}")
    return place_record
def sort_clean_proto_repeated_field(cls, record):
    """Remove duplicate values in repeated field, sort alphabetically

    ProtocolBuffers has no unique list field type. This function
    will remove duplicates, which is needed for unique compare.

    There is a 'bug' in Python implementation of ProtocolBuffers:
    - depending on the implementation type in use, it is possible to
      spot either 'RepeatedCompositeFieldContainer' or
      'RepeatedCompositeContainer'
    - solution here: import and compare to both types
    - this is not ideal, since both types are internal to PB and
      subject to change
    - see [proto-bug](https://github.com/protocolbuffers/
      protobuf/issues/3870)
    """
    # walk every field declared on the message and process only the
    # repeated ones
    for descriptor in record.DESCRIPTOR.fields:
        if descriptor.label == descriptor.LABEL_REPEATED:
            x_attr = getattr(record, descriptor.name)
            # skip empty fields and composite (message-typed)
            # containers, which are not hashable/sortable as scalars
            if x_attr and not HF.is_composite_field_container(x_attr):
                x_attr_cleaned = set(x_attr)
                x_attr_sorted = sorted(x_attr_cleaned)
                # Complete clear of repeated field
                # (repeated fields cannot be assigned directly,
                # so drain element by element)
                for _ in range(0, len(x_attr)):
                    x_attr.pop()
                # add sorted list
                x_attr.extend(x_attr_sorted)
def extract_user(cls, record, origin):
    """Create an lbsn.User from a mapped record dict.

    Plain attributes are copied when present; language, location
    geometry and active_since need special handling.
    """
    user = HF.new_lbsn_record_with_id(lbsn.User(),
                                      record.get('user_guid'),
                                      origin)
    # plain attributes: copied verbatim when present in record
    for attr_name in ("user_name", "user_fullname", "follows",
                      "followed", "biography", "post_count", "url",
                      "is_private", "is_available"):
        set_lbsn_attr(user, attr_name, record)
    lang = record.get('user_language')
    if lang:
        # wrap language short code in lbsn.Language sub-message
        ref_user_language = lbsn.Language()
        ref_user_language.language_short = lang
        user.user_language.CopyFrom(ref_user_language)
    set_lbsn_attr(user, "user_location", record)
    user_location_geom = record.get("user_location_geom")
    if user_location_geom:
        user.user_location_geom = parse_geom(user_location_geom)
    set_lbsn_attr(user, "liked_count", record)
    active_since = record.get('active_since')
    if active_since:
        copydate_lbsn_attr(user.active_since, active_since)
    for attr_name in ("profile_image_url", "user_timezone",
                      "user_utc_offset", "user_groups_member",
                      "user_groups_follows", "group_count"):
        set_lbsn_attr(user, attr_name, record)
    return user
def set_lbsn_pkey(lbsn_obj_pkey, pkey_obj, pkey_val, origin_val):
    """Sets value for lbsn_obj_pkey of pkey_obj if pkey_val is not None"""
    if pkey_val is None:
        # nothing to set
        return
    keyed_record = HF.new_lbsn_record_with_id(
        pkey_obj, pkey_val, origin_val)
    lbsn_obj_pkey.CopyFrom(keyed_record.pkey)
def extract_place(cls, record, origin):
    """Create an lbsn.Place from a mapped record dict.

    Fix: reuse the already-fetched city_guid value instead of reading
    the same key from record a second time.
    """
    place = HF.new_lbsn_record_with_id(lbsn.Place(),
                                       record.get('place_guid'),
                                       origin)
    set_lbsn_attr(place, "name", record)
    set_lbsn_attr(place, "post_count", record)
    set_lbsn_attr(place, "url", record)
    geom_center = record.get("geom_center")
    if geom_center:
        setattr(place, "geom_center", parse_geom(geom_center))
    geom_area = record.get("geom_area")
    if geom_area:
        setattr(place, "geom_area", parse_geom(geom_area))
    city_guid = record.get('city_guid')
    if city_guid:
        # reference to the enclosing city, fetched once above
        set_lbsn_pkey(place.city_pkey, lbsn.City(),
                      city_guid, origin)
    set_lbsn_attr(place, "name_alternatives", record)
    set_lbsn_attr(place, "place_description", record)
    set_lbsn_attr(place, "place_website", record)
    set_lbsn_attr(place, "place_phone", record)
    set_lbsn_attr(place, "address", record)
    set_lbsn_attr(place, "zip_code", record)
    set_lbsn_attr(place, "attributes", record)
    set_lbsn_attr(place, "checkin_count", record)
    set_lbsn_attr(place, "like_count", record)
    set_lbsn_attr(place, "parent_places", record)
    return place
def extract_usergroup(cls, record, origin):
    """Create an lbsn.UserGroup from a mapped record dict.

    Fix: record.get('user_owner') was read three times; it is now
    fetched once and reused.

    NOTE(review): the scalar assignments assume the respective keys
    are always present — record.get() returning None would be
    rejected by protobuf scalar fields. Confirm input guarantees.
    """
    usergroup = HF.new_lbsn_record_with_id(lbsn.UserGroup(),
                                           record.get('usergroup_guid'),
                                           origin)
    usergroup.usergroup_name = record.get('usergroup_name')
    usergroup.usergroup_description = record.get('usergroup_description')
    usergroup.member_count = record.get('member_count')
    usergroup.usergroup_createdate = record.get('usergroup_createdate')
    user_owner = record.get('user_owner')
    usergroup.user_owner = user_owner
    if user_owner:
        # reference the owning user by pkey
        usergroup.user_owner_pkey.CopyFrom(
            HF.new_lbsn_record_with_id(lbsn.User(),
                                       user_owner,
                                       origin).pkey)
    return usergroup
def __init__(self, relationship=None):
    """Extract shared attributes from an lbsn.Relationship record.

    A missing relationship initializes an empty structure.
    """
    if relationship is None:
        relationship = lbsn.Relationship()
    pkey = relationship.pkey
    self.origin_id = pkey.relation_to.origin.origin_id
    self.guid = pkey.relation_to.id
    self.guid_rel = pkey.relation_from.id
    # enum value name, lower-cased for sql
    rel_type_name = lbsn.Relationship().RelationshipType.Name(
        relationship.relationship_type)
    self.rel_type = HF.null_check(rel_type_name).lower()
def process_place_record(place_record, origin):
    """Assignment of Flickr place types to lbsnstructure hierarchy:
    lbsn.Country, lbsn.City, lbsn.Place

    Original Flickr place types, which are more detailed, are stored
    in sub_type field
    """
    fields = place_record.split(":")
    if len(fields) != 3:
        raise ValueError(f'Malformed place entry:\n'
                         f'place_record: {place_record}')
    place_guid = unquote(fields[0])
    place_name = unquote(fields[1]).replace('+', ' ')
    place_type = unquote(fields[2])
    place_type_lw = place_type.lower()
    type_tokens = place_type_lw.split("/")
    # assignment: choose lbsn structure level from Flickr type tokens
    if any(token in FLICKR_COUNTRY_MATCH for token in type_tokens):
        lbsn_place_record = HF.new_lbsn_record_with_id(
            lbsn.Country(), place_guid, origin)
    elif any(token in FLICKR_CITY_MATCH for token in type_tokens):
        lbsn_place_record = HF.new_lbsn_record_with_id(
            lbsn.City(), place_guid, origin)
    elif any(token in FLICKR_PLACE_MATCH for token in type_tokens):
        lbsn_place_record = HF.new_lbsn_record_with_id(
            lbsn.Place(), place_guid, origin)
    else:
        logging.getLogger('__main__').debug(
            f'Could not assign place type {place_type_lw}\n'
            f'found in place_record: {place_record}\n'
            f'Will assign default "lbsn.Place"')
        lbsn_place_record = HF.new_lbsn_record_with_id(
            lbsn.Place(), place_guid, origin)
    lbsn_place_record.name = place_name
    if isinstance(lbsn_place_record, lbsn.City):
        # record sub types only for city and place
        lbsn_place_record.sub_type = place_type
    elif isinstance(lbsn_place_record, lbsn.Place):
        lbsn_place_record.place_description = place_type
    # place_record.url (not provided)
    # need to consult post data for lat/lng coordinates;
    # set to null island first
    lbsn_place_record.geom_center = "POINT(%s %s)" % (0, 0)
    return lbsn_place_record
def extract_mentioned_users(self, ref_user_records, user_record_id):
    """Create MENTIONS_USER relationships from mentioned user records."""
    for mentioned in ref_user_records:
        relation_record = HF.new_lbsn_relation_with_id(
            lbsn.Relationship(),
            user_record_id,
            mentioned.pkey.id,
            self.origin)
        relation_record.relationship_type = \
            lbsn.Relationship.MENTIONS_USER
        # NOTE(review): a sibling implementation routes relationship
        # records through add_relationship_to_dict(); confirm that
        # plain append() is intended here
        self.lbsn_records.append(relation_record)
def extract_city(cls, record, origin):
    """Create an lbsn.City from a mapped record dict.

    Fix: reuse the already-fetched country_guid value instead of
    reading the same key from record a second time.
    """
    city = HF.new_lbsn_record_with_id(lbsn.City(),
                                      record.get('city_guid'),
                                      origin)
    set_lbsn_attr(city, "name", record)
    geom_center = record.get("geom_center")
    if geom_center:
        setattr(city, "geom_center", parse_geom(geom_center))
    geom_area = record.get("geom_area")
    if geom_area:
        setattr(city, "geom_area", parse_geom(geom_area))
    country_guid = record.get('country_guid')
    if country_guid:
        # reference to the enclosing country, fetched once above
        city.country_pkey.CopyFrom(
            HF.new_lbsn_record_with_id(lbsn.Country(),
                                       country_guid,
                                       origin).pkey)
    set_lbsn_attr(city, "url", record)
    set_lbsn_attr(city, "name_alternatives", record)
    set_lbsn_attr(city, "sub_type", record)
    return city
def extract_mentioned_users(self, ref_user_records, user_record_id):
    """Extract mentioned user from ref user records list"""
    for mentioned in ref_user_records:
        relation_record = HF.new_lbsn_relation_with_id(
            lbsn.Relationship(),
            user_record_id,
            mentioned.pkey.id,
            self.origin)
        relation_record.relationship_type = \
            lbsn.Relationship.MENTIONS_USER
        # relationships are collected via the dedicated dict helper
        self.lbsn_records.add_relationship_to_dict(relation_record)
def close_log():
    """Closes log and writes to archive file

    Fixes: malformed docstring quotes; the `outfile` name was
    shadowed (first a Path, then a file handle) — now two distinct
    names; f-string without placeholders replaced by plain literal.
    """
    logging.shutdown()
    # rename log file for archive purposes
    today = HF.get_str_formatted_today()
    archive_path = Path(f"{today}.log")
    with open(archive_path, 'a+') as archive_file:
        with open('log.log') as infile:
            # separate runs with a blank line
            archive_file.write('\n')
            for line in infile:
                archive_file.write(line)
def __init__(self, record=None):
    """Extract shared attributes from an lbsn.PostReaction record.

    A missing record initializes an empty structure.
    """
    if record is None:
        record = lbsn.PostReaction()
    nc = HF.null_check
    self.origin_id = record.pkey.origin.origin_id
    self.guid = record.pkey.id
    self.reaction_latlng = HF.null_geom_check(record.reaction_latlng)
    self.user_guid = nc(record.user_pkey.id)
    self.referenced_post = nc(record.referencedPost_pkey.id)
    self.referenced_postreaction = nc(
        record.referencedPostreaction_pkey.id)
    # enum value name, lower-cased for sql
    self.reaction_type = HF.turn_lower(
        nc(lbsn.PostReaction().ReactionType.Name(
            record.reaction_type)))
    self.reaction_date = HF.null_check_datetime(record.reaction_date)
    self.reaction_content = nc(record.reaction_content)
    self.reaction_like_count = nc(record.reaction_like_count)
    # deduplicate mentioned user ids
    self.user_mentions = list(
        {pkey.id for pkey in record.user_mentions_pkey})
def extract_country(cls, record, origin):
    """Create an lbsn.Country from a mapped record dict."""
    country = HF.new_lbsn_record_with_id(
        lbsn.Country(), record.get('country_guid'), origin)
    set_lbsn_attr(country, "name", record)
    # geometries need explicit parsing before assignment
    for geom_field in ("geom_center", "geom_area"):
        geom_value = record.get(geom_field)
        if geom_value:
            setattr(country, geom_field, parse_geom(geom_value))
    set_lbsn_attr(country, "url", record)
    set_lbsn_attr(country, "name_alternatives", record)
    return country
def fetch_json_data_from_file(self, file_handle):
    """Read json entries from file.

    Typical form is [{json1},{json2}],
    if is_stacked_json is True:
    will process stacked jsons in the form of {json1}{json2}

    If is_line_separated_json is true:
    {json1}
    {json2}
    ...

    Fix: the three input formats are now mutually exclusive
    (if/elif/else). Previously a stacked-json file also fell through
    to the default branch and re-read the exhausted file handle.

    Yields each record, then a final None sentinel.
    """
    if self.is_stacked_json:
        # Stacked JSON is a simple file with many concatenated jsons,
        # e.g. {json1}{json2} etc.
        # note: this requires loading file completely first,
        # not streaming optimized yet
        for record in HF.json_read_wrapper(
                HF.decode_stacked(file_handle.read())):
            yield record
    elif self.is_line_separated_json:
        # json's separated by line ending
        for line in file_handle:
            yield HF.json_load_wrapper(line, single=True)
    else:
        # normal json nesting, e.g. [{record1},{record2}]
        records = HF.json_load_wrapper(file_handle)
        if records:
            if isinstance(records, list):
                for record in records:
                    yield record
            else:
                yield records
    # sentinel: signals end of this file to the consumer
    yield None