Ejemplo n.º 1
0
 def dict_type_switcher(desc_name):
     """Create an empty ProtoBuf message for a descriptor name.

     Args:
         desc_name: ``DESCRIPTOR.name`` of the wanted lbsn message type.

     Returns:
         A newly created message of the matching type, or None if
         ``desc_name`` matches none of the types listed below.
     """
     message_types = (
         lbsn.Country,
         lbsn.City,
         lbsn.Place,
         lbsn.User,
         lbsn.UserGroup,
         lbsn.Post,
         lbsn.PostReaction,
         lbsn.Relationship,
     )
     # one fresh instance per type, keyed by the type's descriptor name
     messages_by_name = {
         message_type().DESCRIPTOR.name: message_type()
         for message_type in message_types
     }
     return messages_by_name.get(desc_name)
Ejemplo n.º 2
0
 def extract_place(cls, record, origin):
     """Map a flat record onto a new lbsn.Place.

     Args:
         record: Mapping that is queried with ``.get``; 'place_guid'
             provides the id, 'city_guid' the optional city reference.
         origin: Origin passed on to the record and key helpers.

     Returns:
         The lbsn.Place record created by HF.new_lbsn_record_with_id.
     """
     place = HF.new_lbsn_record_with_id(
         lbsn.Place(), record.get('place_guid'), origin)
     for attr_name in ("name", "post_count", "url"):
         set_lbsn_attr(place, attr_name, record)
     # geometries are only parsed and set when a value is present
     for geom_field in ("geom_center", "geom_area"):
         geom_value = record.get(geom_field)
         if geom_value:
             setattr(place, geom_field, parse_geom(geom_value))
     city_guid = record.get('city_guid')
     if city_guid:
         set_lbsn_pkey(place.city_pkey, lbsn.City(), city_guid, origin)
     remaining_attrs = (
         "name_alternatives",
         "place_description",
         "place_website",
         "place_phone",
         "address",
         "zip_code",
         "attributes",
         "checkin_count",
         "like_count",
         "parent_places",
     )
     for attr_name in remaining_attrs:
         set_lbsn_attr(place, attr_name, record)
     return place
 def extract_place(self, postplace_json):
     """Build an lbsn.Place record from a place json entry.

     Args:
         postplace_json: Mapping describing the place; the keys 'id',
             'lng', 'lat', 'name' and 'slug' are read.

     Returns:
         The lbsn.Place record, or None if the entry has no 'id'
         (a warning is logged and the user is prompted to continue).
     """
     place = postplace_json
     place_id = place.get('id')
     if not place_id:
         self.log.warning(f'No PlaceGuid\n\n{place}')
         input("Press Enter to continue... (entry will be skipped)")
         return None
     lon_center = place.get('lng')
     lat_center = place.get('lat')
     if lon_center is None or lat_center is None:
         # assign place to Null Island
         lon_center = 0
         lat_center = 0
     # no city reference is set for the place here
     place_record = HF.new_lbsn_record_with_id(
         lbsn.Place(), place_id, self.origin)
     place_record.geom_center = "POINT(%s %s)" % (lon_center, lat_center)
     place_name = place.get('name')
     if place_name:
         # place names sometimes contain linebreaks or repeated
         # whitespace. We don't want this.
         place_name = place_name.replace('\n\r', '')
         # remove multiple whitespace
         place_name = re.sub(' +', ' ', place_name)
         # NOTE(review): the cleaned name was previously computed but
         # never stored on the record; storing it is the presumed intent.
         place_record.name = place_name
     place_slug = place.get('slug')
     if place_slug:
         place_record.url = (
             f"https://www.instagram.com/explore/locations/"
             f"{place_id}/{place_slug}")
     return place_record
Ejemplo n.º 4
0
 def process_place_record(place_record, origin):
     """Assign a Flickr place entry to the lbsnstructure hierarchy.

     The entry is a string of three ":"-separated, url-quoted parts:
     guid, name and place type. The place type is matched against the
     hierarchy lbsn.Country, lbsn.City, lbsn.Place (in that order);
     unmatched types fall back to lbsn.Place. The original, more
     detailed Flickr place type is kept in ``sub_type`` (cities) or
     ``place_description`` (places).

     Args:
         place_record: The raw place entry string.
         origin: Origin passed on to HF.new_lbsn_record_with_id.

     Returns:
         The created lbsn record with name and a (0, 0) geom_center.

     Raises:
         ValueError: If the entry does not consist of exactly 3 parts.
     """
     parts = place_record.split(":")
     if len(parts) != 3:
         raise ValueError(f'Malformed place entry:\n'
                          f'place_record: {place_record}')
     place_guid, place_name, place_type = (unquote(part) for part in parts)
     place_name = place_name.replace('+', ' ')
     place_type_lw = place_type.lower()
     type_tokens = place_type_lw.split("/")
     # ordered from coarsest to finest; the first match wins
     hierarchy = (
         (FLICKR_COUNTRY_MATCH, lbsn.Country),
         (FLICKR_CITY_MATCH, lbsn.City),
         (FLICKR_PLACE_MATCH, lbsn.Place),
     )
     for known_types, candidate_type in hierarchy:
         if any(token in known_types for token in type_tokens):
             record_type = candidate_type
             break
     else:
         logging.getLogger('__main__').debug(
             f'Could not assign place type {place_type_lw}\n'
             f'found in place_record: {place_record}\n'
             f'Will assign default "lbsn.Place"')
         record_type = lbsn.Place
     lbsn_place_record = HF.new_lbsn_record_with_id(
         record_type(), place_guid, origin)
     lbsn_place_record.name = place_name
     # the detailed Flickr type is only recorded for cities and places
     if isinstance(lbsn_place_record, lbsn.City):
         lbsn_place_record.sub_type = place_type
     elif isinstance(lbsn_place_record, lbsn.Place):
         lbsn_place_record.place_description = place_type
     # no url is provided; coordinates must come from post data,
     # so the record starts out on null island
     lbsn_place_record.geom_center = "POINT(%s %s)" % (0, 0)
     return lbsn_place_record
Ejemplo n.º 5
0
 def dict_selector(self, record):
     """Return the record dictionary that belongs to a record's type.

     Args:
         record: lbsn message; its ``DESCRIPTOR.name`` selects the dict.

     Returns:
         The matching ``self.lbsn_*_dict``, or None for a type that is
         not listed below.
     """
     stores_by_type = (
         (lbsn.Post, self.lbsn_post_dict),
         (lbsn.Country, self.lbsn_country_dict),
         (lbsn.City, self.lbsn_city_dict),
         (lbsn.Place, self.lbsn_place_dict),
         (lbsn.PostReaction, self.lbsn_post_reaction_dict),
         (lbsn.User, self.lbsn_user_dict),
         (lbsn.UserGroup, self.lbsn_user_group_dict),
         (lbsn.Origin, self.lbsn_origin_dict),
     )
     wanted_name = record.DESCRIPTOR.name
     for message_type, store in stores_by_type:
         if message_type().DESCRIPTOR.name == wanted_name:
             return store
     return None
 def __init__(self, record=None):
     """Copy the attributes of an lbsn.Place record onto this object.

     Args:
         record: Source record; an empty lbsn.Place() is used if None.
     """
     if record is None:
         # init empty structure
         record = lbsn.Place()
     pkey = record.pkey
     self.origin_id = pkey.origin.origin_id
     self.guid = pkey.id
     self.name = HF.null_check(record.name)
     # ProtoBuf repeated fields do not support a distinct rule, so
     # duplicates in list fields are removed here prior to submission
     alternatives = list(set(record.name_alternatives))
     if self.name:
         # the main name must not show up among its own alternatives
         alternatives = [
             alternative for alternative in alternatives
             if alternative != self.name]
     self.name_alternatives = alternatives
     self.url = HF.null_check(record.url)
     self.geom_center = HF.null_check(record.geom_center)
     self.geom_area = HF.null_check(record.geom_area)
Ejemplo n.º 7
0
 def type_sql_mapper(cls):
     """Map record type names to their insert SQL attributes.

     Returns:
         Dict keyed by ``DESCRIPTOR.name`` of each lbsn type, with the
         corresponding ``cls.*_insertsql`` attribute as value.
     """
     insertsql_by_type = (
         (lbsn.Origin, cls.origin_insertsql),
         (lbsn.Country, cls.country_insertsql),
         (lbsn.City, cls.city_insertsql),
         (lbsn.Place, cls.place_insertsql),
         (lbsn.User, cls.user_insertsql),
         (lbsn.UserGroup, cls.usergroup_insertsql),
         (lbsn.Post, cls.post_insertsql),
         (lbsn.Event, cls.event_insertsql),
         (lbsn.PostReaction, cls.postreaction_insertsql),
     )
     return {
         message_type().DESCRIPTOR.name: insert_sql
         for message_type, insert_sql in insertsql_by_type
     }
Ejemplo n.º 8
0
 def get_hll_metrics(cls, record) -> hll.HllMetrics:
     """Extract hll metrics with the function matching the record type.

     Args:
         record: lbsn message; its ``DESCRIPTOR.name`` selects the
             ``cls.get_*_metrics`` function that is applied to it.

     Returns:
         Whatever the selected metrics function returns for ``record``.
     """
     extractors_by_type = (
         (lbsn.Origin, cls.get_origin_metrics),
         (lbsn.Country, cls.get_country_metrics),
         (lbsn.City, cls.get_city_metrics),
         (lbsn.Place, cls.get_place_metrics),
         (lbsn.User, cls.get_user_metrics),
         (lbsn.UserGroup, cls.get_usergroup_metrics),
         (lbsn.Post, cls.get_post_metrics),
         (lbsn.PostReaction, cls.get_postreaction_metrics),
         (lbsn.Relationship, cls.get_relationship_metrics),
     )
     extractors = {
         message_type().DESCRIPTOR.name: extractor
         for message_type, extractor in extractors_by_type
     }
     # note: an unlisted record type yields None here and fails on call
     return extractors.get(record.DESCRIPTOR.name)(record)
 def func_prepare_selector(self, record):
     """Apply the prepare function that matches the record type.

     Args:
         record: lbsn message; its ``DESCRIPTOR.name`` selects the
             ``self.prepare_lbsn_*`` method that is applied to it.

     Returns:
         Whatever the selected prepare method returns for ``record``.
     """
     prepare_by_type = (
         (lbsn.Origin, self.prepare_lbsn_origin),
         (lbsn.Country, self.prepare_lbsn_country),
         (lbsn.City, self.prepare_lbsn_city),
         (lbsn.Place, self.prepare_lbsn_place),
         (lbsn.User, self.prepare_lbsn_user),
         (lbsn.UserGroup, self.prepare_lbsn_usergroup),
         (lbsn.Post, self.prepare_lbsn_post),
         (lbsn.Event, self.prepare_lbsn_event),
         (lbsn.PostReaction, self.prepare_lbsn_postreaction),
         (lbsn.Relationship, self.prepare_lbsn_relation),
     )
     prepare_functions = {
         message_type().DESCRIPTOR.name: prepare
         for message_type, prepare in prepare_by_type
     }
     # note: an unlisted record type yields None here and fails on call
     return prepare_functions.get(record.DESCRIPTOR.name)(record)
Ejemplo n.º 10
0
 def extract_event(cls, record, origin):
     """Map a flat record onto a new lbsn.Event.

     Args:
         record: Mapping that is queried with ``.get``; 'event_guid'
             provides the id, '*_guid' keys the referenced records.
         origin: Origin passed on to the record and key helpers.

     Returns:
         The lbsn.Event record created by HF.new_lbsn_record_with_id.
     """
     event = HF.new_lbsn_record_with_id(
         lbsn.Event(), record.get('event_guid'), origin)
     set_lbsn_attr(event, "name", record)
     # geometries are only parsed and set when a value is present
     for geom_field in ("event_latlng", "event_area"):
         geom_value = record.get(geom_field)
         if geom_value:
             setattr(event, geom_field, parse_geom(geom_value))
     set_lbsn_attr(event, "event_website", record)
     for date_field in ("event_date", "event_date_start", "event_date_end"):
         date_value = record.get(date_field)
         if date_value:
             copydate_lbsn_attr(getattr(event, date_field), date_value)
     duration = record.get('duration')
     if duration:
         copyduration_lbsn_attr(event.duration, duration)
     # optional references: (record key, event key field, referenced type)
     optional_refs = (
         ('place_guid', 'place_pkey', lbsn.Place),
         ('city_guid', 'city_pkey', lbsn.City),
         ('country_guid', 'country_pkey', lbsn.Country),
     )
     for guid_key, pkey_field, ref_type in optional_refs:
         ref_guid = record.get(guid_key)
         if ref_guid:
             set_lbsn_pkey(
                 getattr(event, pkey_field), ref_type(), ref_guid, origin)
     # the user reference is set unconditionally
     set_lbsn_pkey(
         event.user_pkey, lbsn.User(), record.get('user_guid'), origin)
     trailing_attrs = (
         "event_description",
         "event_type",
         "event_share_count",
         "event_like_count",
         "event_comment_count",
         "event_views_count",
         "event_engage_count",
     )
     for attr_name in trailing_attrs:
         set_lbsn_attr(event, attr_name, record)
     return event
Ejemplo n.º 11
0
 def get_func_record(cls,
                     record: Dict[str, Any],
                     input_type: Optional[str] = None):
     """Apply the extract function registered for ``input_type``.

     Args:
         record: Flat record; 'origin_id' is read for the origin and the
             whole record is handed to the extract function.
         input_type: ``DESCRIPTOR.name`` of the lbsn type to extract.

     Returns:
         Whatever the selected ``cls.extract_*`` function returns.
     """
     extractors_by_type = (
         (lbsn.Origin, cls.extract_origin),
         (lbsn.Country, cls.extract_country),
         (lbsn.City, cls.extract_city),
         (lbsn.Place, cls.extract_place),
         (lbsn.UserGroup, cls.extract_usergroup),
         (lbsn.User, cls.extract_user),
         (lbsn.Post, cls.extract_post),
         (lbsn.PostReaction, cls.extract_postreaction),
         (lbsn.Event, cls.extract_event),
     )
     extractors = {
         message_type().DESCRIPTOR.name: extractor
         for message_type, extractor in extractors_by_type
     }
     extract = extractors.get(input_type)
     # the origin is always created the same way
     origin = lbsn.Origin()
     origin.origin_id = record.get('origin_id')
     return extract(record, origin)
Ejemplo n.º 12
0
 def __init__(self):
     """Set up empty record dictionaries, key hash sets and counters."""
     self.lbsn_origin_dict = {}
     self.lbsn_country_dict = {}
     self.lbsn_city_dict = {}
     self.lbsn_place_dict = {}
     self.lbsn_user_group_dict = {}
     self.lbsn_user_dict = {}
     self.lbsn_post_dict = {}
     self.lbsn_post_reaction_dict = {}
     self.lbsn_relationship_dict = {}
     # one empty set per record type, keyed by descriptor name
     hashed_types = (
         lbsn.Origin,
         lbsn.Post,
         lbsn.Country,
         lbsn.City,
         lbsn.Place,
         lbsn.UserGroup,
         lbsn.User,
         lbsn.PostReaction,
         lbsn.Relationship,
     )
     self.key_hashes = {
         message_type.DESCRIPTOR.name: set()
         for message_type in hashed_types
     }
     # total number of records added
     self.count_glob = 0
     self.count_glob_total = 0
     # number of duplicate records merged
     self.count_dup_merge = 0
     self.count_dup_merge_total = 0
     # all record dicts in correct order, each paired with
     # its type name as reference (tuple)
     ordered_stores = (
         (self.lbsn_origin_dict, lbsn.Origin),
         (self.lbsn_country_dict, lbsn.Country),
         (self.lbsn_city_dict, lbsn.City),
         (self.lbsn_place_dict, lbsn.Place),
         (self.lbsn_user_group_dict, lbsn.UserGroup),
         (self.lbsn_user_dict, lbsn.User),
         (self.lbsn_post_dict, lbsn.Post),
         (self.lbsn_post_reaction_dict, lbsn.PostReaction),
         (self.lbsn_relationship_dict, lbsn.Relationship),
     )
     self.all_dicts = [
         (store, message_type().DESCRIPTOR.name)
         for store, message_type in ordered_stores
     ]
Ejemplo n.º 13
0
 def extract_post(cls, record, origin):
     """Map a flat record onto a new lbsn.Post.

     Args:
         record: Mapping that is queried with ``.get``; 'post_guid'
             provides the id, '*_guid' keys the referenced records and
             'user_mentions' an iterable of mentioned user ids.
         origin: Origin passed on to the record and key helpers.

     Returns:
         The lbsn.Post record created by HF.new_lbsn_record_with_id.
     """
     post = HF.new_lbsn_record_with_id(
         lbsn.Post(), record.get('post_guid'), origin)
     post_latlng = record.get("post_latlng")
     if post_latlng:
         setattr(post, "post_latlng", parse_geom(post_latlng))
     # optional references are only set when a guid is present
     place_guid = record.get('place_guid')
     if place_guid:
         set_lbsn_pkey(post.place_pkey, lbsn.Place(), place_guid, origin)
     city_guid = record.get('city_guid')
     if city_guid:
         set_lbsn_pkey(post.city_pkey, lbsn.City(), city_guid, origin)
     country_guid = record.get('country_guid')
     if country_guid:
         set_lbsn_pkey(
             post.country_pkey, lbsn.Country(), country_guid, origin)
     # the user reference is set unconditionally
     set_lbsn_pkey(
         post.user_pkey, lbsn.User(), record.get('user_guid'), origin)
     pub_date = record.get('post_publish_date')
     if pub_date:
         copydate_lbsn_attr(post.post_publish_date, pub_date)
     set_lbsn_attr(post, "post_body", record)
     geo_acc = record.get("post_geoaccuracy")
     if geo_acc:
         # get enum value by its upper-cased name
         post.post_geoaccuracy = lbsn.Post.PostGeoaccuracy.Value(
             geo_acc.upper())
     for attr_name in ("hashtags", "emoji", "post_like_count",
                       "post_comment_count", "post_views_count",
                       "post_title"):
         set_lbsn_attr(post, attr_name, record)
     crt_date = record.get('post_create_date')
     if crt_date:
         copydate_lbsn_attr(post.post_create_date, crt_date)
     set_lbsn_attr(post, "post_thumbnail_url", record)
     set_lbsn_attr(post, "post_url", record)
     post_type = record.get("post_type")
     if post_type:
         # get enum value by its upper-cased name
         post.post_type = lbsn.Post.PostType.Value(post_type.upper())
     for attr_name in ("post_filter", "post_quote_count",
                       "post_share_count"):
         set_lbsn_attr(post, attr_name, record)
     lang = record.get('post_language')
     if lang:
         ref_post_language = lbsn.Language()
         ref_post_language.language_short = lang
         post.post_language.CopyFrom(ref_post_language)
     set_lbsn_attr(post, "input_source", record)
     user_mentions = record.get("user_mentions")
     if user_mentions:
         # only the keys of the referenced user records are stored
         post.user_mentions_pkey.extend([
             HF.new_lbsn_record_with_id(lbsn.User(), user_id, origin).pkey
             for user_id in user_mentions])
     set_lbsn_attr(post, "post_content_license", record)
     return post
Ejemplo n.º 14
0
# -*- coding: utf-8 -*-

"""
Module for db input connection sql mapping
"""

import enum
from typing import Union, Optional, List, Tuple
from lbsnstructure import lbsnstructure_pb2 as lbsn

"""Schema convention from lbsn db spec"""
# Each entry is a 4-tuple of
# (lbsn type name, db schema name, db table name, key column name).
LBSN_SCHEMA = [
    (lbsn.Origin().DESCRIPTOR.name, "social", "origin", "origin_id"),
    (lbsn.Country().DESCRIPTOR.name, "spatial", "country", "country_guid"),
    (lbsn.City().DESCRIPTOR.name, "spatial", "city", "city_guid"),
    (lbsn.Place().DESCRIPTOR.name, "spatial", "place", "place_guid"),
    (lbsn.UserGroup().DESCRIPTOR.name, "social", "user_groups", "usergroup_guid"),
    (lbsn.User().DESCRIPTOR.name, "social", "user", "user_guid"),
    (lbsn.Post().DESCRIPTOR.name, "topical", "post", "post_guid"),
    (lbsn.PostReaction().DESCRIPTOR.name,
     "topical", "post_reaction", "reaction_guid"),
]


def optional_schema_override(
        LBSN_SCHEMA: List[Tuple[str, str, str, str]],
        schema_table_overrides: List[Tuple[str, str]]) -> List[Tuple[str, str, str, str]]:
    """Override schema and table name for selected lbsn objects."""
    LBSN_SCHEMA_OVERRIDE = []
    for lbsn_type, schema_name, table_name, key_col in LBSN_SCHEMA:
        for schema_table_override in schema_table_overrides:
 def extract_place(
         self, postplace_json,
         post_geoaccuracy, user_language=None):
     """Extract lbsn.Country, lbsn.City or lbsn.Place from twitter json.

     Args:
         postplace_json: Mapping describing the place; the keys 'id',
             'bounding_box', 'place_type', 'country_code', 'country',
             'name' and 'url' are read.
         post_geoaccuracy: Geoaccuracy known so far; it is replaced when
             unset or coarser than the level of the extracted place.
         user_language: Optional object with a ``language_short``
             attribute; decides whether names are stored as main name
             or in ``name_alternatives``.

     Returns:
         Tuple ``(place_record, post_geoaccuracy, ref_country_record)``.
         ``place_record`` and ``ref_country_record`` are None when the
         entry is skipped (no id, country without country_code, or
         unknown place type). A referenced country record is also
         appended to ``self.lbsn_records``.
     """
     place = postplace_json
     place_id = place.get('id')
     if not place_id:
         self.log.warning(f'No PlaceGuid\n\n{place}')
         input("Press Enter to continue... (entry will be skipped)")
         return None, post_geoaccuracy, None
     lon_center = 0
     lat_center = 0
     # stays None unless the bounding box provides coordinates
     bounding_box_points = None
     bounding_box = place.get('bounding_box')
     if bounding_box:
         bound_coordinates = bounding_box.get('coordinates')
         if bound_coordinates:
             bounding_box_points = bound_coordinates[0]
     if bounding_box_points is not None:
         lim_y_min, lim_y_max, lim_x_min, lim_x_max = \
             HF.get_rectangle_bounds(bounding_box_points)
         bound_points_shapely = \
             geometry.MultiPoint([(lim_x_min, lim_y_min),
                                  (lim_x_max, lim_y_max)])
         # True centroid (coords may be multipoint)
         centroid_coords = bound_points_shapely.centroid.coords[0]
         lon_center = centroid_coords[0]
         lat_center = centroid_coords[1]
     country_code = place.get('country_code')
     place_type = place.get('place_type')
     if place_type == "country":
         # country_guid
         # in case of country,
         # we do not need to save the GUID from Twitter
         # - country_code is already unique
         if not country_code:
             self.log.warning(
                 f'No country_code\n\n{place}. '
                 f'PlaceEntry will be skipped..')
             return None, post_geoaccuracy, None
         place_record = HF.new_lbsn_record_with_id(
             lbsn.Country(), country_code, self.origin)
         if not post_geoaccuracy:
             post_geoaccuracy = lbsn.Post.COUNTRY
     elif place_type in ("city", "neighborhood", "admin"):
         # city_guid
         place_record = HF.new_lbsn_record_with_id(
             lbsn.City(), place_id, self.origin)
         if place_type != "city":
             place_record.sub_type = place_type
         if not post_geoaccuracy or post_geoaccuracy == lbsn.Post.COUNTRY:
             post_geoaccuracy = lbsn.Post.CITY
     elif place_type == "poi":
         # place_guid
         # For POIs, lbsn.City is not available on Twitter
         place_record = HF.new_lbsn_record_with_id(
             lbsn.Place(), place_id, self.origin)
         if not post_geoaccuracy or post_geoaccuracy in (
                 lbsn.Post.COUNTRY, lbsn.Post.CITY):
             post_geoaccuracy = lbsn.Post.PLACE
     else:
         # no record can be created without a known type, so the entry
         # is skipped like the other unusable entries above
         self.log.warning(f'No lbsn.Place Type Detected: {place}')
         return None, post_geoaccuracy, None
     # At the moment, English names are the main references
     english_reference = (
         user_language is None
         or not user_language.language_short
         or user_language.language_short in ('en', 'und'))
     place_name = place.get('name')
     if place_name is not None:
         # for some reason, twitter place entities sometimes contain
         # linebreaks or whitespaces. We don't want this.
         place_name = place_name.replace('\n\r', '')
         # remove multiple whitespace
         place_name = re.sub(' +', ' ', place_name)
         if place_type == "poi" or english_reference:
             # all other language specific references are stored in
             # name_alternatives - except for places, where twitter
             # has no alternative place names
             # Bugfix necessary: some English names get still saved
             # as name_alternatives
             place_record.name = place_name
         else:
             place_record.name_alternatives.append(place_name)
     place_url = place.get('url')
     if place_url is not None:
         place_record.url = place_url
     place_record.geom_center = "POINT(%s %s)" % (lon_center, lat_center)
     if bounding_box_points is not None:
         # prints: 'POLYGON ((0 0, 1 0, 1 1, 0 1, 0 0))'
         place_record.geom_area = Polygon(bounding_box_points).wkt
     ref_country_record = None
     if not isinstance(place_record, lbsn.Country) and country_code:
         ref_country_record = HF.new_lbsn_record_with_id(
             lbsn.Country(), country_code, self.origin)
         country_name = place.get('country')
         if country_name is not None:
             # only English name references become the main name
             if english_reference:
                 ref_country_record.name = country_name
             else:
                 ref_country_record.name_alternatives.append(country_name)
         # the referenced country needs to be saved, too
         self.lbsn_records.append(ref_country_record)
     if post_geoaccuracy == lbsn.Post.CITY and ref_country_record:
         # country_pkey only on lbsn.City(), lbsn.Place() has city_pkey,
         # but this is not available for Twitter
         place_record.country_pkey.CopyFrom(ref_country_record.pkey)
     return place_record, post_geoaccuracy, ref_country_record