def dict_type_switcher(desc_name):
    """Create a new, empty protobuf message by descriptor name.

    Args:
        desc_name: Value of ``DESCRIPTOR.name`` of the wanted lbsn
            message type.

    Returns:
        A freshly constructed message of the matching type, or ``None``
        if ``desc_name`` does not match one of the types listed below.
    """
    # Map names to message classes (not instances), so that only the
    # requested message is constructed instead of one of every type
    # on each call.
    message_types = (
        lbsn.Country,
        lbsn.City,
        lbsn.Place,
        lbsn.User,
        lbsn.UserGroup,
        lbsn.Post,
        lbsn.PostReaction,
        lbsn.Relationship,
    )
    types_by_name = {
        message_type.DESCRIPTOR.name: message_type
        for message_type in message_types
    }
    message_type = types_by_name.get(desc_name)
    if message_type is None:
        return None
    return message_type()
def extract_place(cls, record, origin):
    """Build an lbsn.Place message from the values found in ``record``.

    Args:
        record: Object read through ``.get(key)``; supplies
            ``place_guid``, the optional geometries, ``city_guid``
            and the plain attributes copied below.
        origin: Origin passed on to the helpers that create the
            record id and the referenced city key.

    Returns:
        The populated lbsn.Place message.
    """
    place = HF.new_lbsn_record_with_id(
        lbsn.Place(), record.get('place_guid'), origin)

    for attr_name in ("name", "post_count", "url"):
        set_lbsn_attr(place, attr_name, record)

    # geometries are only set when a (truthy) value is present
    for geom_name in ("geom_center", "geom_area"):
        geom_value = record.get(geom_name)
        if geom_value:
            setattr(place, geom_name, parse_geom(geom_value))

    # reference to the parent city is optional
    if record.get('city_guid'):
        set_lbsn_pkey(
            place.city_pkey, lbsn.City(), record.get('city_guid'), origin)

    remaining_attrs = (
        "name_alternatives",
        "place_description",
        "place_website",
        "place_phone",
        "address",
        "zip_code",
        "attributes",
        "checkin_count",
        "like_count",
        "parent_places",
    )
    for attr_name in remaining_attrs:
        set_lbsn_attr(place, attr_name, record)
    return place
def dict_selector(self, record):
    """Return the record dictionary that belongs to the type of ``record``.

    The type is identified by ``record.DESCRIPTOR.name``. ``None`` is
    returned for types without an entry in the table below.
    """
    dicts_by_type = (
        (lbsn.Post, self.lbsn_post_dict),
        (lbsn.Country, self.lbsn_country_dict),
        (lbsn.City, self.lbsn_city_dict),
        (lbsn.Place, self.lbsn_place_dict),
        (lbsn.PostReaction, self.lbsn_post_reaction_dict),
        (lbsn.User, self.lbsn_user_dict),
        (lbsn.UserGroup, self.lbsn_user_group_dict),
        (lbsn.Origin, self.lbsn_origin_dict),
    )
    wanted_name = record.DESCRIPTOR.name
    for message_type, record_dict in dicts_by_type:
        if message_type.DESCRIPTOR.name == wanted_name:
            return record_dict
    return None
def type_sql_mapper(cls):
    """Return a dict that maps record type names to their insert SQL.

    Keys are the ``DESCRIPTOR.name`` values of the lbsn message types,
    values are the matching ``*_insertsql`` attributes of ``cls``.
    """
    sql_by_type = (
        (lbsn.Origin, cls.origin_insertsql),
        (lbsn.Country, cls.country_insertsql),
        (lbsn.City, cls.city_insertsql),
        (lbsn.Place, cls.place_insertsql),
        (lbsn.User, cls.user_insertsql),
        (lbsn.UserGroup, cls.usergroup_insertsql),
        (lbsn.Post, cls.post_insertsql),
        (lbsn.Event, cls.event_insertsql),
        (lbsn.PostReaction, cls.postreaction_insertsql),
    )
    return {
        message_type.DESCRIPTOR.name: insert_sql
        for message_type, insert_sql in sql_by_type
    }
def get_hll_metrics(cls, record) -> hll.HllMetrics:
    """Extract hll metrics with the extractor matching the record type.

    Args:
        record: lbsn message; its ``DESCRIPTOR.name`` selects the
            extractor function.

    Returns:
        The result of the selected ``get_*_metrics`` function.

    Raises:
        TypeError: If no extractor is registered for the record type.
            The exception type matches what the failed call on ``None``
            raised before, but the message now names the type.
    """
    dict_switcher = {
        lbsn.Origin().DESCRIPTOR.name: cls.get_origin_metrics,
        lbsn.Country().DESCRIPTOR.name: cls.get_country_metrics,
        lbsn.City().DESCRIPTOR.name: cls.get_city_metrics,
        lbsn.Place().DESCRIPTOR.name: cls.get_place_metrics,
        lbsn.User().DESCRIPTOR.name: cls.get_user_metrics,
        lbsn.UserGroup().DESCRIPTOR.name: cls.get_usergroup_metrics,
        lbsn.Post().DESCRIPTOR.name: cls.get_post_metrics,
        lbsn.PostReaction().DESCRIPTOR.name: cls.get_postreaction_metrics,
        lbsn.Relationship().DESCRIPTOR.name: cls.get_relationship_metrics,
    }
    record_type = record.DESCRIPTOR.name
    extract_function = dict_switcher.get(record_type)
    if extract_function is None:
        # fail with a readable message instead of calling None
        raise TypeError(
            f'No hll metrics extractor available '
            f'for record type "{record_type}"')
    return extract_function(record)
def func_prepare_selector(self, record):
    """Select the prepare function for the record type and apply it.

    Args:
        record: lbsn message; its ``DESCRIPTOR.name`` selects the
            ``prepare_lbsn_*`` method.

    Returns:
        The result of the selected prepare method called with ``record``.

    Raises:
        TypeError: If no prepare method is registered for the record
            type. The exception type matches what the failed call on
            ``None`` raised before, but the message now names the type.
    """
    dict_switcher = {
        lbsn.Origin().DESCRIPTOR.name: self.prepare_lbsn_origin,
        lbsn.Country().DESCRIPTOR.name: self.prepare_lbsn_country,
        lbsn.City().DESCRIPTOR.name: self.prepare_lbsn_city,
        lbsn.Place().DESCRIPTOR.name: self.prepare_lbsn_place,
        lbsn.User().DESCRIPTOR.name: self.prepare_lbsn_user,
        lbsn.UserGroup().DESCRIPTOR.name: self.prepare_lbsn_usergroup,
        lbsn.Post().DESCRIPTOR.name: self.prepare_lbsn_post,
        lbsn.Event().DESCRIPTOR.name: self.prepare_lbsn_event,
        lbsn.PostReaction().DESCRIPTOR.name: self.prepare_lbsn_postreaction,
        lbsn.Relationship().DESCRIPTOR.name: self.prepare_lbsn_relation,
    }
    record_type = record.DESCRIPTOR.name
    prepare_function = dict_switcher.get(record_type)
    if prepare_function is None:
        # fail with a readable message instead of calling None
        raise TypeError(
            f'No prepare function available '
            f'for record type "{record_type}"')
    return prepare_function(record)
def extract_event(cls, record, origin):
    """Build an lbsn.Event message from the values found in ``record``.

    Args:
        record: Object read through ``.get(key)``; supplies
            ``event_guid``, optional geometries, dates, duration,
            referenced guids and the plain attributes copied below.
        origin: Origin passed on to the helpers that create the
            record id and the referenced keys.

    Returns:
        The populated lbsn.Event message.
    """
    event = HF.new_lbsn_record_with_id(
        lbsn.Event(), record.get('event_guid'), origin)
    set_lbsn_attr(event, "name", record)

    # geometries are only set when a (truthy) value is present
    for geom_name in ("event_latlng", "event_area"):
        geom_value = record.get(geom_name)
        if geom_value:
            setattr(event, geom_name, parse_geom(geom_value))

    set_lbsn_attr(event, "event_website", record)

    for date_name in ('event_date', 'event_date_start', 'event_date_end'):
        date_value = record.get(date_name)
        if date_value:
            copydate_lbsn_attr(getattr(event, date_name), date_value)

    duration_value = record.get('duration')
    if duration_value:
        copyduration_lbsn_attr(event.duration, duration_value)

    # optional references: (key in record, key field on event, type)
    optional_refs = (
        ('place_guid', 'place_pkey', lbsn.Place),
        ('city_guid', 'city_pkey', lbsn.City),
        ('country_guid', 'country_pkey', lbsn.Country),
    )
    for guid_name, pkey_name, ref_type in optional_refs:
        if record.get(guid_name):
            set_lbsn_pkey(
                getattr(event, pkey_name), ref_type(),
                record.get(guid_name), origin)

    # the user reference is always set, without a presence check
    set_lbsn_pkey(
        event.user_pkey, lbsn.User(), record.get('user_guid'), origin)

    plain_attrs = (
        "event_description",
        "event_type",
        "event_share_count",
        "event_like_count",
        "event_comment_count",
        "event_views_count",
        "event_engage_count",
    )
    for attr_name in plain_attrs:
        set_lbsn_attr(event, attr_name, record)
    return event
def process_place_record(place_record, origin):
    """Map a Flickr place entry to lbsn.Country, lbsn.City or lbsn.Place.

    ``place_record`` is expected to consist of three ":"-separated,
    url-quoted parts: guid, name and place type. The place type may
    hold several "/"-separated entries; the first of country, city and
    place match lists that contains any of them decides the record
    type. Types that match nothing are stored as lbsn.Place.

    The original (more detailed) Flickr place type is kept in
    ``sub_type`` for cities and in ``place_description`` for places.

    Raises:
        ValueError: If ``place_record`` does not split into three parts.
    """
    parts = place_record.split(":")
    if len(parts) != 3:
        raise ValueError(
            f'Malformed place entry:\n'
            f'place_record: {place_record}')
    raw_guid, raw_name, raw_type = parts
    place_guid = unquote(raw_guid)
    place_name = unquote(raw_name).replace('+', ' ')
    place_type = unquote(raw_type)
    place_type_lw = place_type.lower()
    type_entries = place_type_lw.split("/")

    # assignment: first match list containing any entry wins
    candidates = (
        (FLICKR_COUNTRY_MATCH, lbsn.Country),
        (FLICKR_CITY_MATCH, lbsn.City),
        (FLICKR_PLACE_MATCH, lbsn.Place),
    )
    for match_list, record_type in candidates:
        if any(entry in match_list for entry in type_entries):
            break
    else:
        logging.getLogger('__main__').debug(
            f'Could not assign place type {place_type_lw}\n'
            f'found in place_record: {place_record}\n'
            f'Will assign default "lbsn.Place"')
        record_type = lbsn.Place
    lbsn_place_record = HF.new_lbsn_record_with_id(
        record_type(), place_guid, origin)

    lbsn_place_record.name = place_name
    # record sub types only for city and place
    if isinstance(lbsn_place_record, lbsn.City):
        lbsn_place_record.sub_type = place_type
    elif isinstance(lbsn_place_record, lbsn.Place):
        lbsn_place_record.place_description = place_type
    # place_record.url (not provided)
    # need to consult post data for lat/lng coordinates
    # set to null island first
    lbsn_place_record.geom_center = "POINT(%s %s)" % (0, 0)
    return lbsn_place_record
def get_func_record(cls, record: Dict[str, Any],
                    input_type: Optional[str] = None):
    """Map ``record`` with the extract function registered for input_type.

    Args:
        record: Values of one input record; ``origin_id`` is read from
            it to create the origin handed to the extract function.
        input_type: ``DESCRIPTOR.name`` of the lbsn type that ``record``
            represents. The default ``None`` matches no type.

    Returns:
        The result of the selected ``extract_*`` function.

    Raises:
        TypeError: If no extract function is registered for
            ``input_type``. The exception type matches what the failed
            call on ``None`` raised before, but the message now names
            the input type.
    """
    func_by_type = {
        lbsn.Origin().DESCRIPTOR.name: cls.extract_origin,
        lbsn.Country().DESCRIPTOR.name: cls.extract_country,
        lbsn.City().DESCRIPTOR.name: cls.extract_city,
        lbsn.Place().DESCRIPTOR.name: cls.extract_place,
        lbsn.UserGroup().DESCRIPTOR.name: cls.extract_usergroup,
        lbsn.User().DESCRIPTOR.name: cls.extract_user,
        lbsn.Post().DESCRIPTOR.name: cls.extract_post,
        lbsn.PostReaction().DESCRIPTOR.name: cls.extract_postreaction,
        lbsn.Event().DESCRIPTOR.name: cls.extract_event,
    }
    func_map = func_by_type.get(input_type)
    if func_map is None:
        # fail early with a readable message instead of calling None
        raise TypeError(
            f'No mapping function available for input type "{input_type}"')
    # create origin always the same
    origin = lbsn.Origin()
    origin.origin_id = record.get('origin_id')
    return func_map(record, origin)
def extract_city(cls, record, origin):
    """Build an lbsn.City message from the values found in ``record``.

    Args:
        record: Object read through ``.get(key)``; supplies
            ``city_guid``, the optional geometries, ``country_guid``
            and the plain attributes copied below.
        origin: Origin passed on to the helper that creates the
            record id and the referenced country record.

    Returns:
        The populated lbsn.City message.
    """
    city = HF.new_lbsn_record_with_id(
        lbsn.City(), record.get('city_guid'), origin)
    set_lbsn_attr(city, "name", record)

    # geometries are only set when a (truthy) value is present
    for geom_name in ("geom_center", "geom_area"):
        geom_value = record.get(geom_name)
        if geom_value:
            setattr(city, geom_name, parse_geom(geom_value))

    # reference to the parent country is optional
    if record.get('country_guid'):
        country_ref = HF.new_lbsn_record_with_id(
            lbsn.Country(), record.get('country_guid'), origin)
        city.country_pkey.CopyFrom(country_ref.pkey)

    for attr_name in ("url", "name_alternatives", "sub_type"):
        set_lbsn_attr(city, attr_name, record)
    return city
def __init__(self):
    """Set up empty record dictionaries, key hash sets and counters."""
    # one dictionary per lbsn record type
    self.lbsn_origin_dict = {}
    self.lbsn_country_dict = {}
    self.lbsn_city_dict = {}
    self.lbsn_place_dict = {}
    self.lbsn_user_group_dict = {}
    self.lbsn_user_dict = {}
    self.lbsn_post_dict = {}
    self.lbsn_post_reaction_dict = {}
    self.lbsn_relationship_dict = {}

    # one (initially empty) set per record type name
    hashed_types = (
        lbsn.Origin,
        lbsn.Post,
        lbsn.Country,
        lbsn.City,
        lbsn.Place,
        lbsn.UserGroup,
        lbsn.User,
        lbsn.PostReaction,
        lbsn.Relationship,
    )
    self.key_hashes = {
        message_type.DESCRIPTOR.name: set()
        for message_type in hashed_types
    }

    # total number of records added
    self.count_glob = 0
    self.count_glob_total = 0
    # number of duplicate records merged
    self.count_dup_merge = 0
    self.count_dup_merge_total = 0

    # all record dicts in correct order,
    # with names as references (tuple)
    ordered_dicts = (
        (self.lbsn_origin_dict, lbsn.Origin),
        (self.lbsn_country_dict, lbsn.Country),
        (self.lbsn_city_dict, lbsn.City),
        (self.lbsn_place_dict, lbsn.Place),
        (self.lbsn_user_group_dict, lbsn.UserGroup),
        (self.lbsn_user_dict, lbsn.User),
        (self.lbsn_post_dict, lbsn.Post),
        (self.lbsn_post_reaction_dict, lbsn.PostReaction),
        (self.lbsn_relationship_dict, lbsn.Relationship),
    )
    self.all_dicts = [
        (record_dict, message_type.DESCRIPTOR.name)
        for record_dict, message_type in ordered_dicts
    ]
def extract_post(cls, record, origin):
    """Build an lbsn.Post message from the values found in ``record``.

    Args:
        record: Object read through ``.get(key)``; supplies
            ``post_guid``, optional geometry, referenced guids, dates,
            enum names, language, user mentions and the plain
            attributes copied below.
        origin: Origin passed on to the helpers that create the
            record id and the referenced keys.

    Returns:
        The populated lbsn.Post message.

    Raises:
        ValueError: Raised by the protobuf enum wrappers if
            ``post_geoaccuracy`` or ``post_type`` (upper-cased) is not
            a name of the respective enum.
    """
    post = HF.new_lbsn_record_with_id(
        lbsn.Post(), record.get('post_guid'), origin)
    post_latlng = record.get("post_latlng")
    if post_latlng:
        setattr(post, "post_latlng", parse_geom(post_latlng))

    # optional references to place, city and country
    place_guid = record.get('place_guid')
    if place_guid:
        set_lbsn_pkey(post.place_pkey, lbsn.Place(), place_guid, origin)
    city_guid = record.get('city_guid')
    if city_guid:
        set_lbsn_pkey(post.city_pkey, lbsn.City(), city_guid, origin)
    country_guid = record.get('country_guid')
    if country_guid:
        set_lbsn_pkey(
            post.country_pkey, lbsn.Country(), country_guid, origin)
    # the user reference is always set, without a presence check
    set_lbsn_pkey(
        post.user_pkey, lbsn.User(), record.get('user_guid'), origin)

    pub_date = record.get('post_publish_date')
    if pub_date:
        copydate_lbsn_attr(post.post_publish_date, pub_date)
    set_lbsn_attr(post, "post_body", record)
    geo_acc = record.get("post_geoaccuracy")
    if geo_acc:
        # get enum value
        post.post_geoaccuracy = lbsn.Post.PostGeoaccuracy.Value(
            geo_acc.upper())
    for attr_name in (
            "hashtags", "emoji", "post_like_count", "post_comment_count",
            "post_views_count", "post_title"):
        set_lbsn_attr(post, attr_name, record)
    crt_date = record.get('post_create_date')
    if crt_date:
        copydate_lbsn_attr(post.post_create_date, crt_date)
    set_lbsn_attr(post, "post_thumbnail_url", record)
    set_lbsn_attr(post, "post_url", record)
    post_type = record.get("post_type")
    if post_type:
        # get enum value
        post.post_type = lbsn.Post.PostType.Value(post_type.upper())
    set_lbsn_attr(post, "post_filter", record)
    set_lbsn_attr(post, "post_quote_count", record)
    set_lbsn_attr(post, "post_share_count", record)
    lang = record.get('post_language')
    if lang:
        ref_post_language = lbsn.Language()
        ref_post_language.language_short = lang
        post.post_language.CopyFrom(ref_post_language)
    set_lbsn_attr(post, "input_source", record)
    user_mentions = record.get("user_mentions")
    if user_mentions:
        # only the keys of the mentioned users are stored on the post
        post.user_mentions_pkey.extend([
            HF.new_lbsn_record_with_id(lbsn.User(), user_id, origin).pkey
            for user_id in user_mentions])
    set_lbsn_attr(post, "post_content_license", record)
    return post
# -*- coding: utf-8 -*- """ Module for db input connection sql mapping """ import enum from typing import Union, Optional, List, Tuple from lbsnstructure import lbsnstructure_pb2 as lbsn """Schema convention from lbsn db spec""" LBSN_SCHEMA = [ (lbsn.Origin().DESCRIPTOR.name, "social", "origin", "origin_id"), (lbsn.Country().DESCRIPTOR.name, "spatial", "country", "country_guid"), (lbsn.City().DESCRIPTOR.name, "spatial", "city", "city_guid"), (lbsn.Place().DESCRIPTOR.name, "spatial", "place", "place_guid"), (lbsn.UserGroup().DESCRIPTOR.name, "social", "user_groups", "usergroup_guid"), (lbsn.User().DESCRIPTOR.name, "social", "user", "user_guid"), (lbsn.Post().DESCRIPTOR.name, "topical", "post", "post_guid"), (lbsn.PostReaction().DESCRIPTOR.name, "topical", "post_reaction", "reaction_guid"), ] def optional_schema_override( LBSN_SCHEMA: List[Tuple[str, str, str, str]], schema_table_overrides: List[Tuple[str, str]]) -> List[Tuple[str, str, str, str]]: """Override schema and table name for selected lbsn objects.""" LBSN_SCHEMA_OVERRIDE = [] for lbsn_type, schema_name, table_name, key_col in LBSN_SCHEMA: for schema_table_override in schema_table_overrides:
def extract_place(
        self, postplace_json, post_geoaccuracy, user_language=None):
    """Extract lbsn.Country, lbsn.City or lbsn.Place from twitter json

    Args:
        postplace_json: Place entity, read through ``.get(key)``.
        post_geoaccuracy: Geoaccuracy determined so far for the post;
            only raised here to the level of the extracted place
            (country -> city -> place), never lowered.
        user_language: Optional language message. Its
            ``language_short`` decides whether names are stored as
            main name or appended to ``name_alternatives``.

    Returns:
        Tuple ``(place_record, post_geoaccuracy, ref_country_record)``.
        ``place_record`` is ``None`` if the entry is skipped (no id,
        country without country_code, or unknown place type).
        ``ref_country_record`` is the referenced country of a city or
        place, if a country_code is available, else ``None``; it is
        also appended to ``self.lbsn_records``.
    """
    place = postplace_json
    place_id = place.get('id')
    if not place_id:
        self.log.warning(f'No PlaceGuid\n\n{place}')
        # NOTE(review): blocks until confirmed on stdin - confirm this
        # is still wanted for non-interactive runs
        input("Press Enter to continue... (entry will be skipped)")
        return None, post_geoaccuracy, None

    # True when names are to be used as main (English) reference
    use_as_main_name = (
        user_language is None
        or not user_language.language_short
        or user_language.language_short in ('en', 'und'))

    lon_center = 0
    lat_center = 0
    bound_coordinates = None
    bounding_box = place.get('bounding_box')
    if bounding_box:
        bound_coordinates = bounding_box.get('coordinates')
        if bound_coordinates:
            bounding_box_points = bound_coordinates[0]
            lim_y_min, lim_y_max, lim_x_min, lim_x_max = \
                HF.get_rectangle_bounds(bounding_box_points)
            bound_points_shapely = \
                geometry.MultiPoint([(lim_x_min, lim_y_min),
                                     (lim_x_max, lim_y_max)])
            # True centroid (coords may be multipoint)
            centroid = bound_points_shapely.centroid.coords[0]
            lon_center = centroid[0]
            lat_center = centroid[1]

    place_type = place.get('place_type')
    if place_type == "country":
        # country_guid
        # in case of country,
        # we do not need to save the GUID from Twitter
        # - country_code is already unique
        country_code = place.get('country_code')
        if not country_code:
            self.log.warning(
                f'No country_code\n\n{place}. '
                f'PlaceEntry will be skipped..')
            return None, post_geoaccuracy, None
        place_record = HF.new_lbsn_record_with_id(
            lbsn.Country(), country_code, self.origin)
        if not post_geoaccuracy:
            post_geoaccuracy = lbsn.Post.COUNTRY
    elif place_type in ("city", "neighborhood", "admin"):
        # city_guid
        place_record = HF.new_lbsn_record_with_id(
            lbsn.City(), place_id, self.origin)
        if place_type != "city":
            place_record.sub_type = place_type
        if not post_geoaccuracy or post_geoaccuracy == lbsn.Post.COUNTRY:
            post_geoaccuracy = lbsn.Post.CITY
    elif place_type == "poi":
        # place_guid
        # For POIs, lbsn.City is not available on Twitter
        place_record = HF.new_lbsn_record_with_id(
            lbsn.Place(), place_id, self.origin)
        if not post_geoaccuracy or post_geoaccuracy in (
                lbsn.Post.COUNTRY, lbsn.Post.CITY):
            post_geoaccuracy = lbsn.Post.PLACE
    else:
        self.log.warning(f'No lbsn.Place Type Detected: {place}')
        # No record was created for this type: skip the entry like
        # the other skip paths instead of failing on the unbound
        # place_record below
        return None, post_geoaccuracy, None

    # for some reason, twitter place entities sometimes contain
    # linebreaks or whitespaces. We don't want this.
    place_name = place.get('name').replace('\n\r', '')
    # remove multiple whitespace
    place_name = re.sub(' +', ' ', place_name)
    if place_type == "poi" or use_as_main_name:
        # At the moment, English name are the main references;
        # all other language specific references are stored in
        # name_alternatives - except for places, where twitter
        # has no alternative place names
        # Bugfix necessary: some English names get still saved
        # as name_alternatives
        place_record.name = place_name
    else:
        place_record.name_alternatives.append(place_name)
    place_record.url = place.get('url')
    place_record.geom_center = "POINT(%s %s)" % (lon_center, lat_center)
    if bound_coordinates:
        # prints: 'POLYGON ((0 0, 1 0, 1 1, 0 1, 0 0))'
        place_record.geom_area = Polygon(bounding_box_points).wkt

    ref_country_record = None
    if not isinstance(place_record, lbsn.Country):
        ref_country_code = place.get('country_code')
        if ref_country_code:
            ref_country_record = \
                HF.new_lbsn_record_with_id(lbsn.Country(),
                                           ref_country_code,
                                           self.origin)
            # At the moment, only English name references are processed
            if use_as_main_name:
                # Needs to be saved
                ref_country_record.name = place.get('country')
            else:
                alt_name = place.get('country')
                ref_country_record.name_alternatives.append(alt_name)
            self.lbsn_records.append(ref_country_record)
    if post_geoaccuracy == lbsn.Post.CITY and ref_country_record:
        # country_pkey only on lbsn.City(), lbsn.Place() has city_pkey,
        # but this is not available for Twitter
        place_record.country_pkey.CopyFrom(ref_country_record.pkey)
    return place_record, post_geoaccuracy, ref_country_record