def dict_type_switcher(desc_name):
     """ Create protoBuf messages by name"""
     dict_switcher = {
         lbsn.Country().DESCRIPTOR.name: lbsn.Country(),
         lbsn.City().DESCRIPTOR.name: lbsn.City(),
         lbsn.Place().DESCRIPTOR.name: lbsn.Place(),
         lbsn.User().DESCRIPTOR.name: lbsn.User(),
         lbsn.UserGroup().DESCRIPTOR.name: lbsn.UserGroup(),
         lbsn.Post().DESCRIPTOR.name: lbsn.Post(),
         lbsn.PostReaction().DESCRIPTOR.name: lbsn.PostReaction(),
         lbsn.Relationship().DESCRIPTOR.name: lbsn.Relationship()
     }
     return dict_switcher.get(desc_name)
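
# Hedged usage sketch (added for illustration, not part of the original
# source): dict_type_switcher() returns an empty protobuf message for a type
# name that is, e.g., read as a string from input data. Assumes `lbsn`
# (lbsnstructure) is imported as in the snippets below.
example_type_name = lbsn.Post().DESCRIPTOR.name
new_record = dict_type_switcher(example_type_name)
if new_record is None:
    raise ValueError(f"Unknown lbsn record type: {example_type_name}")
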
 def __init__(self, record=None):
     if record is None:
         record = lbsn.User()
     self.origin_id = record.pkey.origin.origin_id
     self.guid = record.pkey.id
     self.user_name = HF.null_check(record.user_name)
     self.user_fullname = HF.null_check(record.user_fullname)
     self.follows = HF.null_check(record.follows)
     self.followed = HF.null_check(record.followed)
     self.group_count = HF.null_check(record.group_count)
     self.biography = HF.null_check(record.biography)
     self.post_count = HF.null_check(record.post_count)
     self.url = HF.null_check(record.url)
     self.is_private = HF.null_check(record.is_private)
     self.is_available = HF.null_check(record.is_available)
     self.user_language = HF.null_check(record.user_language.language_short)
     self.user_location = HF.null_check(record.user_location)
     self.user_location_geom = HF.null_check(record.user_location_geom)
     self.liked_count = HF.null_check(record.liked_count)
     self.active_since = HF.null_check_datetime(record.active_since)
     self.profile_image_url = HF.null_check(record.profile_image_url)
     self.user_timezone = HF.null_check(record.user_timezone)
     self.user_utc_offset = HF.null_check(record.user_utc_offset)
     self.user_groups_member = list(set(record.user_groups_member))
     self.user_groups_follows = list(set(record.user_groups_follows))
 def extract_related_users(
         self, related_user_list, input_lbsn_type, user_record):
     """Extract related users from user list"""
     for related_user in related_user_list:
         related_record = HF.new_lbsn_record_with_id(lbsn.User(),
                                                     str(related_user),
                                                     self.origin)
         self.lbsn_records.append(related_record)
         # note the switched order of ids here:
         # direction is important for 'isConnected',
         # and the two lists each give us a
         # different view of this relationship
         if input_lbsn_type == 'friendslist':
             relationship_record =\
                 HF.new_lbsn_relation_with_id(lbsn.Relationship(),
                                              user_record.pkey.id,
                                              related_record.pkey.id,
                                              self.origin)
         elif input_lbsn_type == 'followerslist':
             relationship_record = \
                 HF.new_lbsn_relation_with_id(lbsn.Relationship(),
                                              related_record.pkey.id,
                                              user_record.pkey.id,
                                              self.origin)
         relationship_record.relationship_type = \
             lbsn.Relationship.isCONNECTED
         self.lbsn_records.add_relationship_to_dict(
             relationship_record)
Example #4
    def extract_user(cls, record, origin):
        user = HF.new_lbsn_record_with_id(lbsn.User(), record.get('user_guid'),
                                          origin)
        set_lbsn_attr(user, "user_name", record)
        set_lbsn_attr(user, "user_fullname", record)
        set_lbsn_attr(user, "follows", record)
        set_lbsn_attr(user, "followed", record)
        set_lbsn_attr(user, "biography", record)
        set_lbsn_attr(user, "post_count", record)
        set_lbsn_attr(user, "url", record)
        set_lbsn_attr(user, "is_private", record)
        set_lbsn_attr(user, "is_available", record)

        lang = record.get('user_language')
        if lang:
            ref_user_language = lbsn.Language()
            ref_user_language.language_short = lang
            user.user_language.CopyFrom(ref_user_language)

        set_lbsn_attr(user, "user_location", record)
        user_location_geom = record.get("user_location_geom")
        if user_location_geom:
            setattr(user, "user_location_geom", parse_geom(user_location_geom))
        set_lbsn_attr(user, "liked_count", record)
        active_since = record.get('active_since')
        if active_since:
            copydate_lbsn_attr(user.active_since, active_since)
        set_lbsn_attr(user, "profile_image_url", record)
        set_lbsn_attr(user, "user_timezone", record)
        set_lbsn_attr(user, "user_utc_offset", record)
        set_lbsn_attr(user, "user_groups_member", record)
        set_lbsn_attr(user, "user_groups_follows", record)
        set_lbsn_attr(user, "group_count", record)
        return user
 def extract_user(self, json_string_dict):
     user = json_string_dict
     user_record = HF.new_lbsn_record_with_id(
         lbsn.User(), user.get('id'), self.origin)
     return user_record
 def get_mentioned_users(userMentions_jsonString, origin):
     """Return list of mentioned users from json"""
     mentioned_users_list = []
     for user_mention in userMentions_jsonString:  # iterate over the list
         ref_user_record = \
             HelperFunctions.new_lbsn_record_with_id(
                 lbsn.User(), user_mention.get('id_str'), origin)
         ref_user_record.user_fullname = \
             user_mention.get('name')  # Needs to be saved
         ref_user_record.user_name = user_mention.get('screen_name')
         mentioned_users_list.append(ref_user_record)
     return mentioned_users_list
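
# Hedged usage example (illustration only): user_mention dicts follow the
# Twitter v1.1 'entities.user_mentions' format; `origin` is assumed to be the
# lbsn.Origin reference used elsewhere in these snippets.
# example_mentions = [
#     {"id_str": "2244994945", "name": "Twitter Dev",
#      "screen_name": "TwitterDev"}
# ]
# mentioned = get_mentioned_users(example_mentions, origin)
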
 def dict_selector(self, record):
     """ Get dictionary by record type name"""
     dict_switcher = {
         lbsn.Post().DESCRIPTOR.name: self.lbsn_post_dict,
         lbsn.Country().DESCRIPTOR.name: self.lbsn_country_dict,
         lbsn.City().DESCRIPTOR.name: self.lbsn_city_dict,
         lbsn.Place().DESCRIPTOR.name: self.lbsn_place_dict,
         lbsn.PostReaction().DESCRIPTOR.name: self.lbsn_post_reaction_dict,
         lbsn.User().DESCRIPTOR.name: self.lbsn_user_dict,
         lbsn.UserGroup().DESCRIPTOR.name: self.lbsn_user_group_dict,
         lbsn.Origin().DESCRIPTOR.name: self.lbsn_origin_dict
     }
     return dict_switcher.get(record.DESCRIPTOR.name)
Example #8
 def type_sql_mapper(cls):
     """Assigns record types to SQL Insert SQLs"""
     type_sql_mapping = {
         lbsn.Origin().DESCRIPTOR.name: cls.origin_insertsql,
         lbsn.Country().DESCRIPTOR.name: cls.country_insertsql,
         lbsn.City().DESCRIPTOR.name: cls.city_insertsql,
         lbsn.Place().DESCRIPTOR.name: cls.place_insertsql,
         lbsn.User().DESCRIPTOR.name: cls.user_insertsql,
         lbsn.UserGroup().DESCRIPTOR.name: cls.usergroup_insertsql,
         lbsn.Post().DESCRIPTOR.name: cls.post_insertsql,
         lbsn.Event().DESCRIPTOR.name: cls.event_insertsql,
         lbsn.PostReaction().DESCRIPTOR.name: cls.postreaction_insertsql,
     }
     return type_sql_mapping
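
# Hedged usage sketch (assumption, not from the source): the returned mapping
# can be used to look up the prepared insert SQL for any record by its
# protobuf descriptor name, e.g.:
# insert_sql = cls.type_sql_mapper().get(record.DESCRIPTOR.name)
# if insert_sql is None:
#     raise ValueError(f"No insert SQL defined for {record.DESCRIPTOR.name}")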
Example #9
 def extract_usergroup(cls, record, origin):
     usergroup = HF.new_lbsn_record_with_id(lbsn.UserGroup(),
                                            record.get('usergroup_guid'),
                                            origin)
     usergroup.usergroup_name = record.get('usergroup_name')
     usergroup.usergroup_description = record.get('usergroup_description')
     usergroup.member_count = record.get('member_count')
     usergroup.usergroup_createdate = record.get('usergroup_createdate')
     # the group owner is referenced via user_owner_pkey below
     user_owner = record.get('user_owner')
     if user_owner:
         usergroup.user_owner_pkey.CopyFrom(
             HF.new_lbsn_record_with_id(lbsn.User(),
                                        record.get('user_owner'),
                                        origin).pkey)
     return usergroup
Example #10
 def get_hll_metrics(cls, record) -> hll.HllMetrics:
     """Extracts hll metrics based on record type"""
     dict_switcher = {
         lbsn.Origin().DESCRIPTOR.name: cls.get_origin_metrics,
         lbsn.Country().DESCRIPTOR.name: cls.get_country_metrics,
         lbsn.City().DESCRIPTOR.name: cls.get_city_metrics,
         lbsn.Place().DESCRIPTOR.name: cls.get_place_metrics,
         lbsn.User().DESCRIPTOR.name: cls.get_user_metrics,
         lbsn.UserGroup().DESCRIPTOR.name: cls.get_usergroup_metrics,
         lbsn.Post().DESCRIPTOR.name: cls.get_post_metrics,
         lbsn.PostReaction().DESCRIPTOR.name: cls.get_postreaction_metrics,
         lbsn.Relationship().DESCRIPTOR.name: cls.get_relationship_metrics
     }
     extract_function = dict_switcher.get(record.DESCRIPTOR.name)
     record_hll_metrics = extract_function(record)
     return record_hll_metrics
 def func_prepare_selector(self, record):
     """Select correct prepare function according to record type"""
     dict_switcher = {
         lbsn.Origin().DESCRIPTOR.name: self.prepare_lbsn_origin,
         lbsn.Country().DESCRIPTOR.name: self.prepare_lbsn_country,
         lbsn.City().DESCRIPTOR.name: self.prepare_lbsn_city,
         lbsn.Place().DESCRIPTOR.name: self.prepare_lbsn_place,
         lbsn.User().DESCRIPTOR.name: self.prepare_lbsn_user,
         lbsn.UserGroup().DESCRIPTOR.name: self.prepare_lbsn_usergroup,
         lbsn.Post().DESCRIPTOR.name: self.prepare_lbsn_post,
         lbsn.Event().DESCRIPTOR.name: self.prepare_lbsn_event,
         lbsn.PostReaction().DESCRIPTOR.name:
         self.prepare_lbsn_postreaction,
         lbsn.Relationship().DESCRIPTOR.name: self.prepare_lbsn_relation
     }
     prepare_function = dict_switcher.get(record.DESCRIPTOR.name)
     return prepare_function(record)
Example #12
 def extract_event(cls, record, origin):
     event = HF.new_lbsn_record_with_id(lbsn.Event(),
                                        record.get('event_guid'), origin)
     set_lbsn_attr(event, "name", record)
     event_latlng = record.get("event_latlng")
     if event_latlng:
         setattr(event, "event_latlng", parse_geom(event_latlng))
     event_area = record.get("event_area")
     if event_area:
         setattr(event, "event_area", parse_geom(event_area))
     set_lbsn_attr(event, "event_website", record)
     event_date = record.get('event_date')
     if event_date:
         copydate_lbsn_attr(event.event_date, event_date)
     event_date_start = record.get('event_date_start')
     if event_date_start:
         copydate_lbsn_attr(event.event_date_start, event_date_start)
     event_date_end = record.get('event_date_end')
     if event_date_end:
         copydate_lbsn_attr(event.event_date_end, event_date_end)
     duration = record.get('duration')
     if duration:
         copyduration_lbsn_attr(event.duration, duration)
     place_guid = record.get('place_guid')
     if place_guid:
         set_lbsn_pkey(event.place_pkey, lbsn.Place(), place_guid, origin)
     city_guid = record.get('city_guid')
     if city_guid:
         set_lbsn_pkey(event.city_pkey, lbsn.City(), city_guid, origin)
     country_guid = record.get('country_guid')
     if country_guid:
         set_lbsn_pkey(event.country_pkey, lbsn.Country(),
                       country_guid, origin)
     set_lbsn_pkey(event.user_pkey, lbsn.User(), record.get('user_guid'),
                   origin)
     set_lbsn_attr(event, "event_description", record)
     set_lbsn_attr(event, "event_type", record)
     set_lbsn_attr(event, "event_share_count", record)
     set_lbsn_attr(event, "event_like_count", record)
     set_lbsn_attr(event, "event_comment_count", record)
     set_lbsn_attr(event, "event_views_count", record)
     set_lbsn_attr(event, "event_engage_count", record)
     return event
Example #13
 def get_func_record(cls,
                     record: Dict[str, Any],
                     input_type: Optional[str] = None):
     """Returns mapping function for input_type"""
     FUNC_MAP = {
         lbsn.Origin().DESCRIPTOR.name: cls.extract_origin,
         lbsn.Country().DESCRIPTOR.name: cls.extract_country,
         lbsn.City().DESCRIPTOR.name: cls.extract_city,
         lbsn.Place().DESCRIPTOR.name: cls.extract_place,
         lbsn.UserGroup().DESCRIPTOR.name: cls.extract_usergroup,
         lbsn.User().DESCRIPTOR.name: cls.extract_user,
         lbsn.Post().DESCRIPTOR.name: cls.extract_post,
         lbsn.PostReaction().DESCRIPTOR.name: cls.extract_postreaction,
         lbsn.Event().DESCRIPTOR.name: cls.extract_event,
     }
     func_map = FUNC_MAP.get(input_type)
     if func_map is None:
         raise ValueError(
             f"No mapping function found for input_type: {input_type}")
     # the origin record is always created the same way
     origin = lbsn.Origin()
     origin.origin_id = record.get('origin_id')
     return func_map(record, origin)
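
# Hedged usage sketch (hypothetical names, not from the source): a flat input
# record is dispatched to the matching extract_* function by type name, with
# `origin_id` always expected in the record; `CSVMapper` is a placeholder for
# the class that defines get_func_record().
# example_record = {"origin_id": 3, "post_guid": "12345", "post_body": "hello"}
# post = CSVMapper.get_func_record(
#     example_record, input_type=lbsn.Post().DESCRIPTOR.name)
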
 def __init__(self):
     self.lbsn_origin_dict = dict()
     self.lbsn_country_dict = dict()
     self.lbsn_city_dict = dict()
     self.lbsn_place_dict = dict()
     self.lbsn_user_group_dict = dict()
     self.lbsn_user_dict = dict()
     self.lbsn_post_dict = dict()
     self.lbsn_post_reaction_dict = dict()
     self.lbsn_relationship_dict = dict()
     self.key_hashes = {
         lbsn.Origin.DESCRIPTOR.name: set(),
         lbsn.Post.DESCRIPTOR.name: set(),
         lbsn.Country.DESCRIPTOR.name: set(),
         lbsn.City.DESCRIPTOR.name: set(),
         lbsn.Place.DESCRIPTOR.name: set(),
         lbsn.UserGroup.DESCRIPTOR.name: set(),
         lbsn.User.DESCRIPTOR.name: set(),
         lbsn.PostReaction.DESCRIPTOR.name: set(),
         lbsn.Relationship.DESCRIPTOR.name: set()
     }
     self.count_glob = 0  # total number of records added
     self.count_glob_total = 0
     self.count_dup_merge = 0  # number of duplicate records merged
     self.count_dup_merge_total = 0
     # all record dicts in db insert order,
     # each paired with its type name (list of tuples)
     self.all_dicts = [
         (self.lbsn_origin_dict, lbsn.Origin().DESCRIPTOR.name),
         (self.lbsn_country_dict, lbsn.Country().DESCRIPTOR.name),
         (self.lbsn_city_dict, lbsn.City().DESCRIPTOR.name),
         (self.lbsn_place_dict, lbsn.Place().DESCRIPTOR.name),
         (self.lbsn_user_group_dict, lbsn.UserGroup().DESCRIPTOR.name),
         (self.lbsn_user_dict, lbsn.User().DESCRIPTOR.name),
         (self.lbsn_post_dict, lbsn.Post().DESCRIPTOR.name),
         (self.lbsn_post_reaction_dict,
          lbsn.PostReaction().DESCRIPTOR.name),
         (self.lbsn_relationship_dict, lbsn.Relationship().DESCRIPTOR.name)
     ]
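
# Hedged usage sketch (illustration only): because self.all_dicts preserves
# the db insert order, consumers can iterate it to emit records type by type;
# `submit_record` is a hypothetical sink function.
# for record_dict, type_name in self.all_dicts:
#     for record in record_dict.values():
#         submit_record(type_name, record)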
Example #15
 def extract_post(cls, record, origin):
     post = HF.new_lbsn_record_with_id(lbsn.Post(), record.get('post_guid'),
                                       origin)
     post_latlng = record.get("post_latlng")
     if post_latlng:
         setattr(post, "post_latlng", parse_geom(post_latlng))
     place_guid = record.get('place_guid')
     if place_guid:
         set_lbsn_pkey(post.place_pkey, lbsn.Place(), place_guid, origin)
     city_guid = record.get('city_guid')
     if city_guid:
         set_lbsn_pkey(post.city_pkey, lbsn.City(), city_guid, origin)
     country_guid = record.get('country_guid')
     if country_guid:
         set_lbsn_pkey(post.country_pkey, lbsn.Country(),
                       country_guid, origin)
     set_lbsn_pkey(post.user_pkey, lbsn.User(), record.get('user_guid'),
                   origin)
     pub_date = record.get('post_publish_date')
     if pub_date:
         copydate_lbsn_attr(post.post_publish_date, pub_date)
     set_lbsn_attr(post, "post_body", record)
     geo_acc = record.get("post_geoaccuracy")
     if geo_acc:
         # get enum value
         post.post_geoaccuracy = lbsn.Post.PostGeoaccuracy.Value(
             geo_acc.upper())
     set_lbsn_attr(post, "hashtags", record)
     set_lbsn_attr(post, "emoji", record)
     set_lbsn_attr(post, "post_like_count", record)
     set_lbsn_attr(post, "post_comment_count", record)
     set_lbsn_attr(post, "post_views_count", record)
     set_lbsn_attr(post, "post_title", record)
     crt_date = record.get('post_create_date')
     if crt_date:
         copydate_lbsn_attr(post.post_create_date, crt_date)
     set_lbsn_attr(post, "post_thumbnail_url", record)
     set_lbsn_attr(post, "post_url", record)
     post_type = record.get("post_type")
     if post_type:
         # get enum value
         post.post_type = lbsn.Post.PostType.Value(post_type.upper())
     set_lbsn_attr(post, "post_filter", record)
     set_lbsn_attr(post, "post_quote_count", record)
     set_lbsn_attr(post, "post_share_count", record)
     lang = record.get('post_language')
     if lang:
         ref_post_language = lbsn.Language()
         ref_post_language.language_short = lang
         post.post_language.CopyFrom(ref_post_language)
     set_lbsn_attr(post, "input_source", record)
     user_mentions = record.get("user_mentions")
     if user_mentions:
         mentioned_users_list = []
         for user_id in user_mentions:  # iterate over the list
             ref_user_record = \
                 HF.new_lbsn_record_with_id(
                     lbsn.User(), user_id, origin)
             mentioned_users_list.append(ref_user_record)
         post.user_mentions_pkey.extend(
             [user_ref.pkey for user_ref in mentioned_users_list])
     set_lbsn_attr(post, "post_content_license", record)
     return post
    def parse_json_post(self, json_string_dict, user_pkey=None):
        """Extract json post retrieved from Twitter API

        The process is nested, but pretty linear:
        1. Extract all relevant lbsn.Post attributes
           1.a extract post coordinates
           1.b extract user attributes
           1.c extract place attributes
               (poi, city, neighborhood, admin, country)
           1.d extract extended tweet, if available,
               and extended entities, if available
        2. Decide if post is a reaction
           (reply, quote, share, see https://developer.twitter.com/
           en/docs/tweets/data-dictionary/overview/entities-object.html)
        3. If post is a reaction, copy reduced reaction
           attributes from the extracted lbsn.Post
        4. Add post/reaction to recordDict
        5. Process all referenced posts
           5.a Retweets (=Share) and Quote Tweets are special kinds
               of Tweets that contain the original Tweet as an embedded object.
           5.b Retweets have a top-level "retweeted_status" object,
               and Quoted Tweets have a "quoted_status" object;
               process the embedded tweet-post object

        Note: one input record may contain many lbsn records;
        therefore, records are first added to self.lbsn_records
        to be returned together later.
        """
        post_record = self.extract_post(
            json_string_dict, user_pkey)

        if not post_record:
            # in case no post record has been extracted
            # (e.g. non_geotagged clause)
            return
        # Assignment Step
        # check if post is reaction to other post
        # reaction means: reduced structure compared to post;
        # reactions often include the complete original post,
        # therefore nested processing necessary
        if HF.is_post_reaction(json_string_dict):
            if self.map_reactions is False:
                return
            post_reaction_record = self.map_postrecord_to_postreactionrecord(
                post_record)
            refuser_pkey = None
            ref_post_record = None
            if 'quoted_status' in json_string_dict:
                # Note: Quote is both: Share & Reply
                if 'user' not in json_string_dict.get('quoted_status'):
                    refuser_pkey = \
                        HF.substitute_referenced_user(json_string_dict,
                                                      self.origin,
                                                      self.log)
                post_reaction_record.reaction_type = lbsn.PostReaction.QUOTE
                ref_post_record = self.extract_post(
                    json_string_dict.get('quoted_status'))
            elif 'retweeted_status' in json_string_dict:
                # Note: No retweets are available when data is queried
                # using Bounding Box because of Geo-Tweet limitation:
                # "Note that native Retweets are not matched by this
                # parameter. While the original Tweet may have a location,
                # the Retweet will not"
                # see https://developer.twitter.com/en/docs/
                # tweets/filter-realtime/guides/basic-stream-parameters.html
                if 'user' not in json_string_dict.get('retweeted_status'):
                    # Current issue with Twitter search: the retweeting
                    # user is not returned in retweeted_status
                    # but we can get this from other information,
                    # such as user_mentions field from the retweet
                    # https://twittercommunity.com/t/status-retweeted-
                    # status-quoted-status-user-missing-from-search-tweets-json-response/63355
                    refuser_pkey = \
                        HF.substitute_referenced_user(json_string_dict,
                                                      self.origin,
                                                      self.log)
                post_reaction_record.reaction_type = lbsn.PostReaction.SHARE
                retweet_post = json_string_dict.get('retweeted_status')
                ref_post_record = self.extract_post(retweet_post, refuser_pkey)

            elif json_string_dict.get('in_reply_to_status_id_str'):
                # if reply, original tweet is not available (?)
                post_reaction_record.reaction_type = lbsn.PostReaction.COMMENT
                ref_post_record = \
                    HF.new_lbsn_record_with_id(
                        lbsn.Post(), json_string_dict.get(
                            'in_reply_to_status_id_str'),
                        self.origin)
                ref_user_record = \
                    HF.new_lbsn_record_with_id(
                        lbsn.User(),
                        json_string_dict.get(
                            'in_reply_to_user_id_str'),
                        self.origin)
                ref_user_record.user_name = json_string_dict.get(
                    'in_reply_to_screen_name')  # Needs to be saved
                self.lbsn_records.append(ref_user_record)
                ref_post_record.user_pkey.CopyFrom(ref_user_record.pkey)

            # add referenced post pkey to reaction
            if not self.disable_reaction_post_referencing and ref_post_record:
                post_reaction_record.referencedPost_pkey.CopyFrom(
                    ref_post_record.pkey)
                # ToDo: if a Reaction refers to another
                # reaction (Information Spread),
                # this information is currently not
                # [available from Twitter](https://developer.twitter.com/
                # en/docs/tweets/data-dictionary/overview/tweet-object):
                # "Note that retweets of retweets do not show
                # representations of the intermediary retweet [...]"
                # It would be added to
                # post_reaction_record.referencedPostReaction_pkey
                self.lbsn_records.append(ref_post_record)
            # add postReactionRecord to Dict
            self.lbsn_records.append(post_reaction_record)
        else:
            # otherwise add post to self.lbsn_records
            # which already includes all other entries (lbsn.User, lbsn.City, lbsn.Place etc.)
            self.lbsn_records.append(post_record)
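
# Hedged illustration (simplified, not from the source): a retweet json such
# as {"id_str": "1", "user": {...}, "retweeted_status": {"id_str": "2", ...}}
# yields an lbsn.PostReaction of type SHARE that references post "2", while
# the embedded original tweet is extracted as its own lbsn.Post; a json with
# "in_reply_to_status_id_str" yields a COMMENT reaction instead.
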
 def extract_user(self, json_string_dict):
     """Extract lbsn.User from Twitter json"""
     user = json_string_dict
     user_record = HF.new_lbsn_record_with_id(
         lbsn.User(), user.get('id_str'), self.origin)
     # get additional information about the user, if available
     user_record.user_fullname = user.get('name')
     user_record.follows = user.get('friends_count')
     user_record.is_private = user.get('protected')
     user_record.followed = user.get('followers_count')
     user_bio = user.get('description')
     if user_bio:
         user_record.biography = user_bio
     user_record.user_name = user.get('screen_name')
     listed_count = user.get('listed_count')
     if listed_count:
         user_record.group_count = listed_count
     user_record.post_count = user.get('statuses_count')
     user_record.url = f'https://twitter.com/intent/user?user_id=' \
                       f'{user_record.pkey.id}'
     user_lang = user.get('lang')
     if user_lang:
         ref_user_language = lbsn.Language()
         ref_user_language.language_short = user_lang
         user_record.user_language.CopyFrom(ref_user_language)
     user_location = user.get('location')
     if user_location:
         user_record.user_location = user_location
         if self.geocodes and user_record.user_location in self.geocodes:
             l_lat = self.geocodes[user_record.user_location][0]
             l_lng = self.geocodes[user_record.user_location][1]
             user_record.user_location_geom = "POINT(%s %s)" % (
                 l_lng, l_lat)
     # userGeoLocation = user.get('profile_location') # todo!
     user_record.liked_count = user.get('favourites_count')
     user_record.active_since.CopyFrom(
         HF.json_date_string_to_proto(user.get('created_at')))
     user_profile_image_url = user.get('profile_image_url')
     default_profile_image = ('http://abs.twimg.com/sticky/'
                              'default_profile_images/'
                              'default_profile_normal.png')
     if user_profile_image_url and \
             user_profile_image_url != default_profile_image:
         user_record.profile_image_url = user_profile_image_url
     user_timezone = user.get('time_zone')
     if user_timezone:
         user_record.user_timezone = user_timezone
     user_utc_offset = user.get('utc_offset')
     if user_utc_offset:
         user_record.user_utc_offset = user_utc_offset
     # the following example demonstrates specific information
     # that cannot be extracted from twitter post data
     # deutscherBundestagGroup = \
     # HF.createNewLBSNRecord_with_id(lbsn.UserGroup(),
     #                               "MdB (Bundestag)",
     #                               self.origin)
     # userRecord.user_groups_member.append(
     #    deutscherBundestagGroup.pkey.id)
     # if self.mapFullRelations:
     #       relationshipRecord = \
     #       HF.createNewLBSNRelationship_with_id(lbsn.Relationship(),
     #                                            userRecord.pkey.id,
     #                                            deutscherBundestagGroup.pkey.id,
     #                                            self.origin)
     #       relationshipRecord.relationship_type = lbsn.Relationship.inGROUP
     #       self.lbsn_records.AddRelationshipToDict(relationshipRecord)
     # userRecord.user_groups_follows = []
     return user_record
    def extract_post(self, json_string_dict, user_pkey=None):
        """Extract lbsn.Post from Twitter json.

        Referenced context records (e.g. lbsn.Country, lbsn.City,
        lbsn.Place, lbsn.User) are appended to self.lbsn_records;
        returns None if the post cannot be extracted.
        """
        post_guid = json_string_dict.get('id_str')

        if not HF.check_notice_empty_post_guid(post_guid):
            return None
        post_record = HF.new_lbsn_record_with_id(lbsn.Post(),
                                                 post_guid,
                                                 self.origin)
        post_geoacc = None
        user_record = None
        user_info = json_string_dict.get('user')
        if user_info:
            # Get lbsn.Post/Reaction Details of lbsn.User
            user_record = self.extract_user(json_string_dict.get('user'))
        elif user_pkey:
            # userPkey is already available for posts that are statuses
            user_record = HF.new_lbsn_record_with_id(lbsn.User(),
                                                     user_pkey.id,
                                                     self.origin)
        if user_record:
            # self.lbsn_records.append(user_record)
            self.lbsn_records.append(user_record)
        else:
            self.log.warning(f'Record {self.lbsn_records.count_glob_total}: '
                             f'No lbsn.User record found for post: {post_guid} '
                             f'(post saved without userid)..')
            print(f'Record {self.lbsn_records.count_glob_total}', end='\r')

        # Some preprocessing for all types:
        post_coordinates = json_string_dict.get('coordinates')
        if post_coordinates:
            l_lng = post_coordinates.get('coordinates')[0]
            l_lat = post_coordinates.get('coordinates')[1]
            post_record.post_geoaccuracy = lbsn.Post.LATLNG
            post_record.post_latlng = "POINT(%s %s)" % (l_lng, l_lat)

        # Check if lbsn.Place is mentioned
        post_place_json = json_string_dict.get('place')
        if post_place_json:
            # we need some information from postRecord to create placeRecord
            # (e.g. user language, geoaccuracy, post_latlng)
            # some of the information from place will also modify postRecord
            # attributes; therefore return both
            if user_record:
                user_lang = user_record.user_language
            else:
                user_lang = None
            place_record, \
                post_geoacc, \
                post_country = self.extract_place(post_place_json,
                                                  post_record.post_geoaccuracy,
                                                  user_lang)
            if not post_record.post_geoaccuracy:
                post_record.post_geoaccuracy = post_geoacc
            # postRecord.post_geoaccuracy = twitterPostAttributes.geoaccuracy
            # self.lbsn_records.append(place_record)
            self.lbsn_records.append(place_record)
            if post_country:
                post_record.country_pkey.CopyFrom(post_country.pkey)
            if isinstance(place_record, lbsn.City):
                post_record.city_pkey.CopyFrom(place_record.pkey)
            # either city or place, Twitter user cannot attach both (?)
            elif isinstance(place_record, lbsn.Place):
                post_record.place_pkey.CopyFrom(place_record.pkey)
            # substitute postRecord LatLng Coordinates from placeRecord,
            # if not already set
            if not post_record.post_latlng:
                # Note: this will also substitute lbsn.Country lat/lng in post
                # this information is also available by query of
                # country_guid in posts
                # use input arg min_geoaccuracy to exclude country geo-posts
                post_record.post_latlng = place_record.geom_center
        # if still no geoinformation, send post to Null-Island
        if not post_record.post_latlng:
            if self.ignore_non_geotagged is True:
                return None
            self.null_island += 1
            post_record.post_latlng = "POINT(%s %s)" % (0, 0)
        if self.min_geoaccuracy:
            if not HF.geoacc_within_threshold(post_record.post_geoaccuracy,
                                              self.min_geoaccuracy):
                self.skipped_low_geoaccuracy += 1
                return None
        # Process attributes of twitter post
        post_source = json_string_dict.get('source')
        if post_source:
            post_record.input_source = HF.cleanhtml(
                json_string_dict.get('source'))
            if self.ignore_sources_set and \
                    post_record.input_source in self.ignore_sources_set:
                # skip entry if in ignore list
                self.skipped_ignore_list += 1
                return None
        post_record.post_publish_date.CopyFrom(
            HF.json_date_string_to_proto(json_string_dict.get('created_at')))
        if user_record:
            post_record.user_pkey.CopyFrom(user_record.pkey)
        post_record.post_quote_count = HF.value_count(
            json_string_dict.get('quote_count'))
        post_record.post_comment_count = HF.value_count(
            json_string_dict.get('reply_count'))
        post_record.post_share_count = HF.value_count(
            json_string_dict.get('retweet_count'))
        post_record.post_like_count = HF.value_count(
            json_string_dict.get('favorite_count'))
        post_record.post_url = f'https://twitter.com/statuses/{post_guid}'
        language_str = json_string_dict.get('lang')
        if language_str:
            post_language = lbsn.Language()
            post_language.language_short = json_string_dict.get('lang')
            post_record.post_language.CopyFrom(post_language)
        # If Extended_tweet object is available,
        # process entities and post_body (text) data from extended object
        is_truncated = json_string_dict.get('truncated')
        if is_truncated and 'extended_tweet' in json_string_dict:
            # if the "truncated" field is set to true,
            # and the "extended_tweet" object provides complete
            # "full_text" and "entities" Tweet metadata
            # Source for all data is extended object, if available
            json_string_dict = json_string_dict.get('extended_tweet')
            post_record.post_body = json_string_dict.get('full_text')
            # else:
            #    self.log.warning(f'Truncated but no extended_tweet:'
            #                     f'{json_string_dict}')
            #    input("Press Enter to continue... (entry will be skipped)")
            #    return None
        else:
            if 'full_text' in json_string_dict:
                post_record.post_body = json_string_dict.get('full_text')
            else:
                post_record.post_body = json_string_dict.get('text')
        # entities section always exists and includes meta information
        # such as hashtags or user_mentions
        entities_json = json_string_dict.get('entities')
        # extract hashtags
        hashtags_json = entities_json.get('hashtags')
        if hashtags_json:
            for hashtag in hashtags_json:  # iterate over the list
                post_record.hashtags.append(hashtag.get("text"))
        # Look for mentioned userRecords
        user_mentions_json = entities_json.get('user_mentions')
        if user_mentions_json:
            ref_user_records = HF.get_mentioned_users(user_mentions_json,
                                                      self.origin)
            # self.lbsn_records.append(ref_user_records)
            self.lbsn_records.append(ref_user_records)
            post_record.user_mentions_pkey.extend(
                [user_ref.pkey for user_ref in ref_user_records])
            if self.map_full_relations:
                self.extract_mentioned_users(
                    ref_user_records, user_record.pkey.id)
        # sometimes, extended_entities section exists and includes
        # additional information on media, but never hashtags or user_mentions
        # Since the media type metadata in the extended_entities section
        # correctly indicates the media type
        # (‘photo’, ‘video’ or ‘animated_gif’),
        # and supports up to 4 photos, it is the preferred metadata
        # source for native media. See:
        # https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/extended-entities-object.html#extended-entities-object
        if 'extended_entities' in json_string_dict:
            entities_json = json_string_dict.get('extended_entities')
        media_json = entities_json.get('media')
        if media_json:
            post_record.post_type = HF.assign_media_post_type(media_json)
        else:
            post_record.post_type = lbsn.Post.TEXT
        post_record.emoji.extend(HF.extract_emoji(post_record.post_body))
        # because standard print statement will produce escaped text,
        # we can use protobuf text_format to give us a human friendly
        # version of the text
        # log.debug(f'lbsn.Post record: '
        #           f'{text_format.MessageToString(postRecord, as_utf8=True)}')
        # log.debug(f'lbsn.Post record: {postRecord}')
        return post_record
    def parse_json_record(self, json_string_dict, input_lbsn_type=None):
        """Parse Twitter json retrieved from the Twitter API and
        return a list of LBSN records.

        Fully extracting Twitter jsons to a flat relational db structure
        is challenging because Twitter jsons may consist of deeply nested
        structures, which can include many LBSN record entities, e.g.:
        - the lbsn.Post itself
        - the lbsn.User who posted, and its attributes
        - Coordinates, Places, Cities, Countries linked to the post
        - lbsn.Language of the post
        - shared or retweeted Posts and their attributes
          (Users, Places, Cities etc.)
        - mentioned users in the post ("@-mentions")
        - special jsons retrieved from other API endpoints, e.g.
          groups of users etc.

        This method tries to do all of this automatically, but default
        values may need adjustment for specific cases. All extracted
        LBSN records are added subsequently to self.lbsn_records and
        finally returned as a single list of records. This guarantees
        that db-key-relations are acknowledged when submitting records
        to the db. The order of LBSN record type extraction follows
        the order of db inserts.
        """
        # clear any records from previous run
        self.lbsn_records.clear()
        # decide if main object is post or user json
        if input_lbsn_type and input_lbsn_type in ('friendslist',
                                                   'followerslist'):
            for user, related_user_list in json_string_dict.items():
                user_record = HF.new_lbsn_record_with_id(
                    lbsn.User(), str(user), self.origin)
                self.lbsn_records.append(user_record)
                self.extract_related_users(related_user_list,
                                           input_lbsn_type, user_record)
        elif (input_lbsn_type and input_lbsn_type == 'profile') \
                or 'screen_name' in json_string_dict:
            # user
            user_record = self.extract_user(json_string_dict)
            self.lbsn_records.append(user_record)
            # sys.exit(f'lbsn.Post record: {text_format.MessageToString(userRecord,
            #                                                      as_utf8=True)}')
            if not user_record.is_private:
                # if user profile is private, we cannot access posts
                user_status = None
                if 'status' in json_string_dict:
                    user_status = json_string_dict.get('status')
                elif 'quoted_status' in json_string_dict:
                    user_status = json_string_dict.get('quoted_status')
                elif 'retweeted_status' in json_string_dict:
                    user_status = json_string_dict.get('retweeted_status')
                # in case user status is available
                if user_status:
                    self.parse_json_post(
                        user_status, user_pkey=user_record.pkey)
        else:
            # otherwise, parse post
            self.parse_json_post(json_string_dict)

        # finally, return list of all extracted records
        return self.lbsn_records
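
# Hedged usage sketch (hypothetical mapper instance, minimal tweet json):
# mapper = TwitterMapper(...)  # class name assumed for illustration
# tweet = {
#     "id_str": "1", "text": "hello world",
#     "created_at": "Mon Jan 01 00:00:00 +0000 2018",
#     "user": {"id_str": "42", "screen_name": "example_user",
#              "name": "Example"},
#     "entities": {"hashtags": [], "user_mentions": []},
# }
# lbsn_records = mapper.parse_json_record(tweet)  # list of lbsn records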
Example #20
    def extract_flickr_post(self, record):
        """Main function for processing Flickr YFCC100M CSV entry.
           This mothod is adapted to a special structure, adapt if needed.

        To Do:
            - parameterize column numbers and structure
            - provide external config-file for specific CSV structures
            - currently not included in lbsn mapping are MachineTags,
              GeoContext (indoors, outdoors), WoeId
              and some extra attributes only present for Flickr

        Overview of available columns and examples:
        0 row-number    -   0
        1 Photo/video identifier    -   6985418911
        2 lbsn.User NSID(PostID?)     -   4e2f7a26a1dfbf165a7e30bdabf7e72a
        3 lbsn.User ID     -   39089491@N00
        4 lbsn.User nickname     -   gnuckx
        5 Date taken    -   2012-02-16 09:56:37.0
        6 Date uploaded     -   1331840483
        7 Capture device    -   Canon+PowerShot+ELPH+310+HS
        8 Title     -   IMG_0520
        9 Description      -     My vacation
        10 tags (comma-separated)   -   canon,canon+powershot+hs+310
        11 Machine tags (comma-separated)   - landscape, hills, water
        12 Longitude    -   -81.804885
        13 Latitude     -   24.550558
        14 Accuracy -   12
        15 Photo/video page URL -   http://www.flickr.com/photos/39089491@N00/6985418911/
        16 Photo/video download URL -   http://farm8.staticflickr.com/7205/6985418911_df7747990d.jpg
        17 License name -   Attribution-NonCommercial-NoDerivs License
        18 License URL  -   http://creativecommons.org/licenses/by-nc-nd/2.0/
        19 Photo/video server identifier    -   7205
        20 Photo/video farm identifier  -   8
        21 Photo/video secret   -   df7747990d
        22 Photo/video secret original  -   692d7e0a7f
        23 Extension of the original photo  -   jpg
        24 Marker (0 = photo, 1 = video)    -   0
        if concat:
            25 Photo/video identifier
            26 Place references (null to multiple)
        """
        # note that one input record may contain many lbsn records
        # therefore, return list of processed records
        lbsn_records = []
        # start mapping input to lbsn_records
        post_guid = record[1]
        if not HF.check_notice_empty_post_guid(post_guid):
            return None
        post_record = HF.new_lbsn_record_with_id(lbsn.Post(), post_guid,
                                                 self.origin)
        user_record = HF.new_lbsn_record_with_id(lbsn.User(), record[3],
                                                 self.origin)
        user_record.user_name = unquote(record[4]).replace('+', ' ')
        user_record.url = f'http://www.flickr.com/photos/{user_record.pkey.id}/'
        if user_record:
            post_record.user_pkey.CopyFrom(user_record.pkey)
        lbsn_records.append(user_record)
        post_record.post_latlng = self.flickr_extract_postlatlng(record)
        geoaccuracy = importer.flickr_map_geoaccuracy(record[14])
        if geoaccuracy:
            post_record.post_geoaccuracy = geoaccuracy
        # place record available in separate yfcc100m dataset
        # if record[1]:
        # we need some information from postRecord to create placeRecord
        # (e.g.  user language, geoaccuracy, post_latlng)
        # some of the information from place will also modify postRecord
        # place_record = HF.new_lbsn_record_with_id(lbsn.Place(),
        #                                           record[1],
        #                                           self.origin)
        # lbsn_records.append(place_record)
        # post_record.place_pkey.CopyFrom(place_record.pkey)
        post_record.post_publish_date.CopyFrom(
            HF.parse_timestamp_string_to_protobuf(record[6]))
        post_created_date = HF.parse_csv_datestring_to_protobuf(
            record[5], t_format='%Y-%m-%d %H:%M:%S.%f')
        if post_created_date:
            post_record.post_create_date.CopyFrom(post_created_date)
        post_record.post_views_count = 0
        post_record.post_comment_count = 0
        post_record.post_like_count = 0
        post_record.post_url = record[15]
        # YFCC100M dataset contains HTML codes (%20) and
        # space character is replaced by +
        post_record.post_body = unquote(record[9]).replace('+', ' ')
        post_record.post_title = unquote(record[8]).replace('+', ' ')
        post_record.post_thumbnail_url = record[16]  # note: fullsize url!
        # split tags by , and + because by lbsn-spec,
        # tags are limited to single word
        record_tags_list = list(
            set(
                filter(None, [
                    HF.remove_prefix(unquote(tag), "#")
                    for tag in re.split("[,+]+", record[10])
                ])))
        if record_tags_list:
            for tag in record_tags_list:
                tag = importer.clean_tags_from_flickr(tag)
                post_record.hashtags.append(tag)
        record_machine_tags = list(
            set(
                filter(
                    None,
                    [unquote(mtag)
                     for mtag in re.split("[,+]+", record[11])])))
        if 'video' in record_machine_tags:
            # all videos appear to have 'video' in machine tags
            post_record.post_type = lbsn.Post.VIDEO
        else:
            post_record.post_type = lbsn.Post.IMAGE
        # replace text-string of content license by integer-id
        if record[17] is not None:
            post_record.post_content_license = \
                self.get_license_number_from_license_name(record[17])
        # place record available in separate yfcc100m dataset
        # if records parsed as joined urls, length is larger than 25
        if len(record) > 25:
            post_plus_place_records = self.extract_flickr_place(
                record[25:], post_record=post_record)
            if post_plus_place_records is None:
                lbsn_records.append(post_record)
            else:
                lbsn_records.extend(post_plus_place_records)
        else:
            lbsn_records.append(post_record)
        return lbsn_records
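
# Hedged illustration (values taken from the column examples in the docstring
# above; `mapper` is a hypothetical instance): the expected input is one
# YFCC100M CSV row, already split into a list of column values.
# example_row = [
#     "0", "6985418911", "4e2f7a26a1dfbf165a7e30bdabf7e72a", "39089491@N00",
#     "gnuckx", "2012-02-16 09:56:37.0", "1331840483",
#     "Canon+PowerShot+ELPH+310+HS", "IMG_0520", "My+vacation",
#     "canon,canon+powershot+hs+310", "",
#     "-81.804885", "24.550558", "12",
#     "http://www.flickr.com/photos/39089491@N00/6985418911/",
#     "http://farm8.staticflickr.com/7205/6985418911_df7747990d.jpg",
#     "Attribution-NonCommercial-NoDerivs License",
#     "http://creativecommons.org/licenses/by-nc-nd/2.0/",
#     "7205", "8", "df7747990d", "692d7e0a7f", "jpg", "0",
# ]
# lbsn_records = mapper.extract_flickr_post(example_row)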
Example #21
"""
Module for db input connection sql mapping
"""

import enum
from typing import Union, Optional, List, Tuple
from lbsnstructure import lbsnstructure_pb2 as lbsn

"""Schema convention from lbsn db spec"""
LBSN_SCHEMA = [
    (lbsn.Origin().DESCRIPTOR.name, "social", "origin", "origin_id"),
    (lbsn.Country().DESCRIPTOR.name, "spatial", "country", "country_guid"),
    (lbsn.City().DESCRIPTOR.name, "spatial", "city", "city_guid"),
    (lbsn.Place().DESCRIPTOR.name, "spatial", "place", "place_guid"),
    (lbsn.UserGroup().DESCRIPTOR.name, "social", "user_groups", "usergroup_guid"),
    (lbsn.User().DESCRIPTOR.name, "social", "user", "user_guid"),
    (lbsn.Post().DESCRIPTOR.name, "topical", "post", "post_guid"),
    (lbsn.PostReaction().DESCRIPTOR.name,
     "topical", "post_reaction", "reaction_guid"),
]
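
# Hedged example (illustration only): LBSN_SCHEMA can be used to derive fully
# qualified table names per lbsn object, e.g. "spatial.place" for lbsn.Place.
QUALIFIED_TABLES = {
    lbsn_type: f"{schema_name}.{table_name}"
    for lbsn_type, schema_name, table_name, _key_col in LBSN_SCHEMA
}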


def optional_schema_override(
        LBSN_SCHEMA: List[Tuple[str, str, str, str]],
        schema_table_overrides: List[Tuple[str, str]]) -> List[Tuple[str, str, str, str]]:
    """Override schema and table name for selected lbsn objects."""
    LBSN_SCHEMA_OVERRIDE = []
    for lbsn_type, schema_name, table_name, key_col in LBSN_SCHEMA:
        for schema_table_override in schema_table_overrides:
            lbsn_object_ref, schema_table_override = schema_table_override
            try: