def __init__(self,
              disableReactionPostReferencing=False,
              geocodes=False,
              mapFullRelations=False,
              map_reactions=True,
              ignore_non_geotagged=False,
              ignore_sources_set=None,
              min_geoaccuracy=None):
     """Initialise the mapping state for Twitter input.

     Every argument is stored unchanged on the instance. The two
     camelCase arguments are stored under snake_case attribute names
     (``disable_reaction_post_referencing``, ``map_full_relations``).
     """
     # Mapping options, kept exactly as passed in.
     self.disable_reaction_post_referencing = disableReactionPostReferencing
     self.geocodes = geocodes
     self.map_full_relations = mapFullRelations
     self.map_reactions = map_reactions
     self.ignore_non_geotagged = ignore_non_geotagged
     self.ignore_sources_set = ignore_sources_set
     self.min_geoaccuracy = min_geoaccuracy

     # Counters, all starting at zero.
     self.null_island = 0
     self.skipped_low_geoaccuracy = 0
     self.skipped_ignore_list = 0

     # All mapped data is collected in this list.
     self.lbsn_records = []

     # Reuse the logger registered under '__main__'.
     self.log = logging.getLogger('__main__')

     # This class only handles Twitter data, so the Origin message is
     # created once here; it is required for all CompositeKeys.
     twitter_origin = lbsn.Origin()
     twitter_origin.origin_id = lbsn.Origin.TWITTER
     self.origin = twitter_origin
Beispiel #2
0
 def get_func_record(cls,
                     record: Dict[str, Any],
                     input_type: Optional[str] = None):
     """Map ``record`` with the extract function registered for input_type.

     Args:
         record: Raw input record. Its ``'origin_id'`` entry is copied
             to a freshly created ``lbsn.Origin`` message.
         input_type: Protobuf descriptor name of the lbsn record type
             (for example ``lbsn.Post.DESCRIPTOR.name``).

     Returns:
         The result of calling the selected ``cls.extract_*`` function
         with ``(record, origin)``.

     Raises:
         TypeError: If ``input_type`` is ``None`` or has no registered
             extract function. TypeError is kept (instead of e.g.
             ValueError) because the previous implementation failed
             with a TypeError by calling ``None`` in this situation.
     """
     # DESCRIPTOR is available on the generated classes themselves, so
     # no throwaway message instances are needed to read the names.
     func_map = {
         lbsn.Origin.DESCRIPTOR.name: cls.extract_origin,
         lbsn.Country.DESCRIPTOR.name: cls.extract_country,
         lbsn.City.DESCRIPTOR.name: cls.extract_city,
         lbsn.Place.DESCRIPTOR.name: cls.extract_place,
         lbsn.UserGroup.DESCRIPTOR.name: cls.extract_usergroup,
         lbsn.User.DESCRIPTOR.name: cls.extract_user,
         lbsn.Post.DESCRIPTOR.name: cls.extract_post,
         lbsn.PostReaction.DESCRIPTOR.name: cls.extract_postreaction,
         lbsn.Event.DESCRIPTOR.name: cls.extract_event,
     }
     extract_func = func_map.get(input_type)
     if extract_func is None:
         raise TypeError(
             f"No extract function registered for input_type "
             f"{input_type!r}; expected one of {sorted(func_map)}")
     # The origin is always built the same way, from the record itself.
     origin = lbsn.Origin()
     origin.origin_id = record.get('origin_id')
     return extract_func(record, origin)
 def dict_selector(self, record):
     """Return the record dictionary registered for the type of ``record``.

     The lookup key is ``record.DESCRIPTOR.name``. ``None`` is returned
     when no dictionary is registered under that name.
     """
     dicts_by_type = {
         lbsn.Origin.DESCRIPTOR.name: self.lbsn_origin_dict,
         lbsn.Country.DESCRIPTOR.name: self.lbsn_country_dict,
         lbsn.City.DESCRIPTOR.name: self.lbsn_city_dict,
         lbsn.Place.DESCRIPTOR.name: self.lbsn_place_dict,
         lbsn.UserGroup.DESCRIPTOR.name: self.lbsn_user_group_dict,
         lbsn.User.DESCRIPTOR.name: self.lbsn_user_dict,
         lbsn.Post.DESCRIPTOR.name: self.lbsn_post_dict,
         lbsn.PostReaction.DESCRIPTOR.name: self.lbsn_post_reaction_dict,
     }
     record_type = record.DESCRIPTOR.name
     return dicts_by_type.get(record_type)
Beispiel #4
0
 def type_sql_mapper(cls):
     """Return a dict from lbsn record type name to insert SQL.

     Keys are protobuf descriptor names; each value is the matching
     ``*_insertsql`` attribute of ``cls``.
     """
     return {
         lbsn.Origin.DESCRIPTOR.name: cls.origin_insertsql,
         lbsn.Country.DESCRIPTOR.name: cls.country_insertsql,
         lbsn.City.DESCRIPTOR.name: cls.city_insertsql,
         lbsn.Place.DESCRIPTOR.name: cls.place_insertsql,
         lbsn.User.DESCRIPTOR.name: cls.user_insertsql,
         lbsn.UserGroup.DESCRIPTOR.name: cls.usergroup_insertsql,
         lbsn.Post.DESCRIPTOR.name: cls.post_insertsql,
         lbsn.Event.DESCRIPTOR.name: cls.event_insertsql,
         lbsn.PostReaction.DESCRIPTOR.name: cls.postreaction_insertsql,
     }
Beispiel #5
0
 def get_hll_metrics(cls, record) -> hll.HllMetrics:
     """Extract hll metrics with the function registered for the record type.

     Args:
         record: lbsn protobuf record. Its ``DESCRIPTOR.name`` selects
             the ``cls.get_*_metrics`` function that is called.

     Returns:
         The result of the selected metrics function for ``record``.

     Raises:
         TypeError: If no metrics function is registered for the type
             of ``record`` (``lbsn.Event``, for instance, has no entry
             in the table below).
     """
     # DESCRIPTOR is read from the generated classes directly, so no
     # temporary message instances are created on each call.
     metrics_funcs = {
         lbsn.Origin.DESCRIPTOR.name: cls.get_origin_metrics,
         lbsn.Country.DESCRIPTOR.name: cls.get_country_metrics,
         lbsn.City.DESCRIPTOR.name: cls.get_city_metrics,
         lbsn.Place.DESCRIPTOR.name: cls.get_place_metrics,
         lbsn.User.DESCRIPTOR.name: cls.get_user_metrics,
         lbsn.UserGroup.DESCRIPTOR.name: cls.get_usergroup_metrics,
         lbsn.Post.DESCRIPTOR.name: cls.get_post_metrics,
         lbsn.PostReaction.DESCRIPTOR.name: cls.get_postreaction_metrics,
         lbsn.Relationship.DESCRIPTOR.name: cls.get_relationship_metrics,
     }
     record_type = record.DESCRIPTOR.name
     extract_function = metrics_funcs.get(record_type)
     if extract_function is None:
         # Without this check the lookup result None would be called,
         # failing with "'NoneType' object is not callable".
         raise TypeError(
             f"No hll metrics function registered for record type "
             f"{record_type!r}")
     return extract_function(record)
 def func_prepare_selector(self, record):
     """Prepare ``record`` with the function registered for its type.

     Args:
         record: lbsn protobuf record. Its ``DESCRIPTOR.name`` selects
             the ``self.prepare_lbsn_*`` method that is called.

     Returns:
         The result of the selected prepare method for ``record``.

     Raises:
         TypeError: If no prepare method is registered for the type of
             ``record``.
     """
     # DESCRIPTOR is read from the generated classes directly, so no
     # temporary message instances are created on each call.
     prepare_funcs = {
         lbsn.Origin.DESCRIPTOR.name: self.prepare_lbsn_origin,
         lbsn.Country.DESCRIPTOR.name: self.prepare_lbsn_country,
         lbsn.City.DESCRIPTOR.name: self.prepare_lbsn_city,
         lbsn.Place.DESCRIPTOR.name: self.prepare_lbsn_place,
         lbsn.User.DESCRIPTOR.name: self.prepare_lbsn_user,
         lbsn.UserGroup.DESCRIPTOR.name: self.prepare_lbsn_usergroup,
         lbsn.Post.DESCRIPTOR.name: self.prepare_lbsn_post,
         lbsn.Event.DESCRIPTOR.name: self.prepare_lbsn_event,
         lbsn.PostReaction.DESCRIPTOR.name: self.prepare_lbsn_postreaction,
         lbsn.Relationship.DESCRIPTOR.name: self.prepare_lbsn_relation,
     }
     record_type = record.DESCRIPTOR.name
     prepare_function = prepare_funcs.get(record_type)
     if prepare_function is None:
         # Without this check the lookup result None would be called,
         # failing with "'NoneType' object is not callable".
         raise TypeError(
             f"No prepare function registered for record type "
             f"{record_type!r}")
     return prepare_function(record)
 def store_origin(self, origin_id, name):
     """Store the origin of the input source.

     Nothing is written when ``self.dry_run`` is set. When
     ``self.store_csv`` is set, an ``lbsn.Origin`` carrying only
     ``origin_id`` is appended to the CSV output. Otherwise a row is
     inserted into ``social."origin"``; an existing row with the same
     ``origin_id`` is left untouched (``ON CONFLICT ... DO NOTHING``).

     Args:
         origin_id: Identifier of the origin.
         name: Name of the origin; only used for the database insert.
     """
     if self.dry_run:
         return
     if self.store_csv:
         origin = lbsn.Origin()
         origin.origin_id = origin_id
         self.csv_output.store_append_batch_to_csv(
             [origin], 0, lbsn.Origin.DESCRIPTOR.name)
         return
     # Values are bound by the database driver instead of being pasted
     # into the statement, so a name containing a quote can neither
     # break nor alter the SQL.
     # NOTE(review): assumes db_cursor uses the DB-API 'format'/'pyformat'
     # paramstyle (%s placeholders, as psycopg2 does) - confirm.
     insert_sql = '''
         INSERT INTO social."origin" (
             origin_id, name)
         VALUES (%s, %s)
         ON CONFLICT (origin_id)
         DO NOTHING
         '''
     self.db_cursor.execute(insert_sql, (origin_id, name))
Beispiel #8
0
 def __init__(self,
              disable_reaction_post_referencing=False,
              geocodes=False,
              map_full_relations=False,
              map_reactions=True,
              ignore_non_geotagged=False,
              ignore_sources_set=None,
              min_geoaccuracy=None):
     """Initialise the mapping state for LBSN input.

     NOTE(review): none of the constructor arguments is stored or read
     in this method - confirm that this is intended.
     """
     # Reuse the logger registered under '__main__'.
     self.log = logging.getLogger('__main__')

     # Counters, all starting at zero.
     self.null_island = 0
     self.skipped_count = 0
     self.skipped_low_geoaccuracy = 0

     # This class only handles LBSN data, so the Origin message is
     # created once here; it is required for all CompositeKeys.
     lbsn_origin = lbsn.Origin()
     lbsn_origin.origin_id = lbsn.Origin.LBSN
     self.origin = lbsn_origin
 def __init__(self):
     """Create the empty per-type record dictionaries and counters."""
     # One dictionary per lbsn record type.
     self.lbsn_origin_dict = {}
     self.lbsn_country_dict = {}
     self.lbsn_city_dict = {}
     self.lbsn_place_dict = {}
     self.lbsn_user_group_dict = {}
     self.lbsn_user_dict = {}
     self.lbsn_post_dict = {}
     self.lbsn_post_reaction_dict = {}
     self.lbsn_relationship_dict = {}

     # One independent, initially empty set per record type name.
     hashed_types = (
         lbsn.Origin,
         lbsn.Post,
         lbsn.Country,
         lbsn.City,
         lbsn.Place,
         lbsn.UserGroup,
         lbsn.User,
         lbsn.PostReaction,
         lbsn.Relationship,
     )
     self.key_hashes = {
         record_type.DESCRIPTOR.name: set() for record_type in hashed_types
     }

     # Counters, all starting at zero.
     self.count_glob = 0  # total number of records added
     self.count_glob_total = 0
     self.count_dup_merge = 0  # number of duplicate records merged
     self.count_dup_merge_total = 0

     # All record dictionaries in the required order, each paired with
     # the name of its record type as a (dict, name) tuple.
     self.all_dicts = [
         (self.lbsn_origin_dict, lbsn.Origin.DESCRIPTOR.name),
         (self.lbsn_country_dict, lbsn.Country.DESCRIPTOR.name),
         (self.lbsn_city_dict, lbsn.City.DESCRIPTOR.name),
         (self.lbsn_place_dict, lbsn.Place.DESCRIPTOR.name),
         (self.lbsn_user_group_dict, lbsn.UserGroup.DESCRIPTOR.name),
         (self.lbsn_user_dict, lbsn.User.DESCRIPTOR.name),
         (self.lbsn_post_dict, lbsn.Post.DESCRIPTOR.name),
         (self.lbsn_post_reaction_dict, lbsn.PostReaction.DESCRIPTOR.name),
         (self.lbsn_relationship_dict, lbsn.Relationship.DESCRIPTOR.name),
     ]
Beispiel #10
0
 def __init__(self,
              disable_reaction_post_referencing=False,
              geocodes=False,
              map_full_relations=False,
              map_reactions=True,
              ignore_non_geotagged=False,
              ignore_sources_set=None,
              min_geoaccuracy=None):
     """Initialise the mapping state for Flickr input.

     Besides setting instance attributes, this raises the csv module's
     field size limit to 500000.

     NOTE(review): none of the constructor arguments is stored or read
     in this method - confirm that this is intended.
     """
     # Reuse the logger registered under '__main__'.
     self.log = logging.getLogger('__main__')

     # Counters, all starting at zero.
     self.null_island = 0
     self.skipped_count = 0
     self.skipped_low_geoaccuracy = 0

     # This class only handles Flickr data, so the Origin message is
     # created once here; it is required for all CompositeKeys.
     flickr_origin = lbsn.Origin()
     flickr_origin.origin_id = lbsn.Origin.FLICKR
     self.origin = flickr_origin

     # Some records in YFCC100m are larger than Python's default csv
     # field size limit of 131072.
     csv.field_size_limit(500000)

     # License name -> integer code; the code is the position of the
     # name in this tuple (0..10).
     license_names = (
         "All Rights Reserved",
         "Attribution-NonCommercial-ShareAlike License",
         "Attribution-NonCommercial License",
         "Attribution-NonCommercial-NoDerivs License",
         "Attribution License",
         "Attribution-ShareAlike License",
         "Attribution-NoDerivs License",
         "No known copyright restrictions",
         "United States Government Work",
         "Public Domain Dedication (CC0)",
         "Public Domain Mark",
     )
     self.lic_dict = {
         lic_name: lic_code
         for lic_code, lic_name in enumerate(license_names)
     }
Beispiel #11
0
# -*- coding: utf-8 -*-

"""
Module for db input connection sql mapping
"""

import enum
from typing import Union, Optional, List, Tuple
from lbsnstructure import lbsnstructure_pb2 as lbsn

"""Schema convention from lbsn db spec"""
# Each entry is (lbsn record type name, schema name, table name, key column).
LBSN_SCHEMA = [
    (lbsn.Origin.DESCRIPTOR.name, "social", "origin", "origin_id"),
    (lbsn.Country.DESCRIPTOR.name, "spatial", "country", "country_guid"),
    (lbsn.City.DESCRIPTOR.name, "spatial", "city", "city_guid"),
    (lbsn.Place.DESCRIPTOR.name, "spatial", "place", "place_guid"),
    (lbsn.UserGroup.DESCRIPTOR.name,
     "social", "user_groups", "usergroup_guid"),
    (lbsn.User.DESCRIPTOR.name, "social", "user", "user_guid"),
    (lbsn.Post.DESCRIPTOR.name, "topical", "post", "post_guid"),
    (lbsn.PostReaction.DESCRIPTOR.name,
     "topical", "post_reaction", "reaction_guid"),
]


def optional_schema_override(
        LBSN_SCHEMA: List[Tuple[str, str, str, str]],
        schema_table_overrides: List[Tuple[str, str]]) -> List[Tuple[str, str, str, str]]:
    """Override schema and table name for selected lbsn objects."""
    LBSN_SCHEMA_OVERRIDE = []
    for lbsn_type, schema_name, table_name, key_col in LBSN_SCHEMA:
        for schema_table_override in schema_table_overrides: