Esempio n. 1
0
 def __init__(self, record: lbsn.Post = None, emoji: str = None):
     """Initialize keys, attributes and metrics from a post and an emoji.

     All entries start out empty. The emoji key is only set when
     ``emoji`` is given, and the coordinates are only read when both
     ``emoji`` and ``record`` are given.

     Args:
         record: Post whose ``post_latlng`` provides the coordinates.
         emoji: Value stored under ``key['emoji']``.

     Raises:
         ValueError: If ``record`` is given but is not an lbsn.Post.
     """
     super().__init__()
     # empty defaults
     for key_name in ('latitude', 'longitude', 'emoji'):
         self.key[key_name] = None
     self.attrs['latlng_geom'] = None
     self.metrics['pud_hll'] = set()
     if emoji is None:
         # nothing to key on: stay empty
         return
     self.key['emoji'] = emoji
     if record is None:
         # no post given: only the emoji key is populated
         return
     if not isinstance(record, lbsn.Post):
         raise ValueError("Parsing of LatLngBase only supported "
                          "from lbsn.Post")
     latlng_ewkt = record.post_latlng
     point = HF.get_coordinates_from_ewkt(latlng_ewkt)
     self.key['latitude'] = point.lat
     self.key['longitude'] = point.lng
     # additional (optional) attribute, formatted ready for sql upsert
     self.attrs['latlng_geom'] = HF.return_ewkb_from_geotext(latlng_ewkt)
Esempio n. 2
0
    def __init__(self, record: lbsn.Post = None, hashtag: str = None):
        """Initialize year/month and lat/lng keys from a post.

        All entries start out empty. If the post has no usable date
        (``HLF.merge_dates_post`` returns None), initialization stops and
        the coordinates stay empty as well.

        Args:
            record: Post to read the date and ``post_latlng`` from.
            hashtag: Not used by this initializer.

        Raises:
            ValueError: If ``record`` is given but is not an lbsn.Post.
        """
        super().__init__()
        # empty defaults
        for key_name in ('year', 'month', 'latitude', 'longitude'):
            self.key[key_name] = None
        self.attrs['latlng_geom'] = None
        if record is None:
            # init empty
            return
        if not isinstance(record, lbsn.Post):
            raise ValueError("Parsing of MonthLatLngBase only supported "
                             "from lbsn.Post")
        merged_datetime = HLF.merge_dates_post(record)
        if merged_datetime is None:
            return
        post_day = merged_datetime.date()
        self.key['year'] = post_day.year
        self.key['month'] = post_day.month

        latlng_ewkt = record.post_latlng
        point = HF.get_coordinates_from_ewkt(latlng_ewkt)
        self.key['latitude'] = point.lat
        self.key['longitude'] = point.lng
        # additional (optional) attribute, formatted ready for sql upsert
        self.attrs['latlng_geom'] = HF.return_ewkb_from_geotext(latlng_ewkt)
Esempio n. 3
0
 def merge_records(self, duplicate_record_lines, type_name):
     """Collapse several encoded records into a single encoded record.

     Args:
         duplicate_record_lines: Non-empty list of encoded records.
         type_name: Passed to ``get_record_from_base64_encoded_string``
             when decoding.

     Returns:
         The single input if there is only one, one of the inputs if all
         are equal, otherwise the re-serialized result of merging every
         following record into the first one.
     """
     if len(duplicate_record_lines) <= 1:
         return duplicate_record_lines[0]
     # cheap check first: identical lines need no decoding at all
     distinct_lines = set(duplicate_record_lines)
     if len(distinct_lines) <= 1:
         return next(iter(distinct_lines))
     # decode the first record and deep-merge all following ones into it
     base_record = self.get_record_from_base64_encoded_string(
         duplicate_record_lines[0], type_name)
     for encoded_line in duplicate_record_lines[1:]:
         next_record = self.get_record_from_base64_encoded_string(
             encoded_line, type_name)
         # modifies base_record in place
         HF.merge_existing_records(base_record, next_record)
     return self.serialize_encode_record(base_record)
Esempio n. 4
0
 def hll_concat_latlng(record: lbsn.Post) -> str:
     """Return the post coordinates as a ``'lat:lng'`` string.

     Posts whose ``post_geoaccuracy`` is not ``'latlng'`` yield ``'0:0'``.
     """
     if record.post_geoaccuracy != 'latlng':
         return '0:0'
     latlng_ewkt = HF.null_check(record.post_latlng)
     point = HF.get_coordinates_from_ewkt(latlng_ewkt)
     return f'{point.lat}:{point.lng}'
Esempio n. 5
0
    def __init__(self,
                 record: Union[lbsn.Post, lbsn.Place, lbsn.City,
                               lbsn.Country] = None):
        """Initialize guid key, attributes and metrics from a record.

        For an lbsn.Post the guid is built from the post coordinates; for
        lbsn.Place, lbsn.City and lbsn.Country it is taken from the record
        itself, together with its name and geometries.

        Args:
            record: Record to initialize from; None leaves everything
                empty.

        Raises:
            ValueError: If ``record`` is of any other type.
        """
        super().__init__()
        self.key["guid"] = None
        self.attrs['name'] = None
        self.attrs['geom_center'] = None
        self.attrs['geom_area'] = None
        self.metrics['pud_hll'] = set()
        self.metrics['utl_hll'] = set()
        self.metrics['latlng_hll'] = set()
        if record is None:
            # init empty
            return
        name = None
        geom_area = None
        if isinstance(record, lbsn.Post):
            coordinates_geom = record.post_latlng
            coordinates = HF.get_coordinates_from_ewkt(coordinates_geom)
            # use concat lat:lng as key if no place key is available;
            # this should later implement assignment based on area
            # intersection
            self.key["guid"] = HLF.hll_concat(
                [coordinates.lat, coordinates.lng])
        elif isinstance(record, (lbsn.Place, lbsn.City, lbsn.Country)):
            name = HF.null_check(record.name)
            coordinates_geom = record.geom_center
            geom_area = record.geom_area
            # use key from place, city or country record
            self.key["guid"] = HLF.hll_concat_origin_guid(record)
        else:
            # without this branch, coordinates_geom stays unbound and the
            # code below fails with an UnboundLocalError
            raise ValueError(
                f"Parsing of {type(self).__name__} only supported from "
                "lbsn.Post, lbsn.Place, lbsn.City and lbsn.Country")

        self.attrs['name'] = name
        self.attrs['geom_center'] = HF.return_ewkb_from_geotext(
            coordinates_geom)
        self.attrs['geom_area'] = HF.return_ewkb_from_geotext(geom_area)
 def extract_related_users(
         self, related_user_list, input_lbsn_type, user_record):
     """Extract related users and their relationship to ``user_record``.

     For every entry of ``related_user_list`` a lbsn.User record is
     appended to ``self.lbsn_records`` and an ``isCONNECTED`` relationship
     is added via ``self.lbsn_records.add_relationship_to_dict``.

     Args:
         related_user_list: Iterable of user ids (converted with ``str``).
         input_lbsn_type: Either ``'friendslist'`` or ``'followerslist'``;
             decides the order in which the two ids are passed to the
             relationship.
         user_record: Record whose ``pkey.id`` is the other end of each
             relationship.

     Raises:
         ValueError: If ``input_lbsn_type`` is neither of the two
             supported values and there is at least one related user.
     """
     for related_user in related_user_list:
         if input_lbsn_type not in ('friendslist', 'followerslist'):
             # fail explicitly, before anything is stored; otherwise
             # relationship_record would be unbound (or left over from a
             # previous iteration)
             raise ValueError(
                 f'Unsupported input_lbsn_type: {input_lbsn_type}')
         related_record = HF.new_lbsn_record_with_id(
             lbsn.User(), str(related_user), self.origin)
         self.lbsn_records.append(related_record)
         # note the switch of order here,
         # direction is important for 'isConnected',
         # and the different lists each give us a
         # different view on this relationship
         if input_lbsn_type == 'friendslist':
             first_id = user_record.pkey.id
             second_id = related_record.pkey.id
         else:
             first_id = related_record.pkey.id
             second_id = user_record.pkey.id
         relationship_record = HF.new_lbsn_relation_with_id(
             lbsn.Relationship(), first_id, second_id, self.origin)
         relationship_record.relationship_type = \
             lbsn.Relationship.isCONNECTED
         self.lbsn_records.add_relationship_to_dict(
             relationship_record)
Esempio n. 7
0
 def merge_dates_post(record: lbsn.Post = None) -> Optional[dt.datetime]:
     """Merge post_create_date and post_publish_date of a post.

     Args:
         record: Post to read both dates from.

     Returns:
         The create date if ``HF.null_check_datetime`` yields one,
         otherwise the result for the publish date. None if ``record``
         is None.
     """
     if record is None:
         # None is the declared default; without this guard the attribute
         # access below raises an AttributeError
         return None
     post_create_date = HF.null_check_datetime(record.post_create_date)
     post_publish_date = HF.null_check_datetime(record.post_publish_date)
     if post_create_date is None:
         return post_publish_date
     return post_create_date
Esempio n. 8
0
 def _open_input_files(self) -> Iterator[IO[str]]:
     """Loop over ``self.filelist`` and yield one open handle per file.

     Before each file is opened, ``self.continue_number`` is incremented
     and ``self.current_source`` is set to the file name. Files are opened
     as UTF-8 text with undecodable bytes replaced.

     Each handle is closed when the consumer asks for the next file, or
     when the generator is closed, so it must be consumed before
     advancing.
     """
     # process localfiles
     for file_name in self.filelist:
         self.continue_number += 1
         self.current_source = file_name
         HF.log_main_debug(f'Current file: {ntpath.basename(file_name)}')
         # context manager guarantees the handle is released even if the
         # consumer stops early or raises
         with open(file_name, 'r', encoding="utf-8",
                   errors='replace') as file_handle:
             yield file_handle
Esempio n. 9
0
 def hll_concat_upt_hll(record: lbsn.Post) -> List[str]:
     """Combine the user of a post with all terms of that post.

     Terms are the union of ``HF.select_terms`` for body and title and
     the lower-cased hashtags longer than two characters. The result is
     whatever ``HLLFunctions.hll_concat_user_terms`` returns for the
     user and these terms.
     """
     terms_per_source = (
         HF.select_terms(record.post_body),
         HF.select_terms(record.post_title),
         {tag.lower() for tag in record.hashtags if len(tag) > 2},
     )
     post_terms = set.union(*terms_per_source)
     return HLLFunctions.hll_concat_user_terms(
         HLLFunctions.hll_concat_user(record), post_terms)
 def prepare_lbsn_country(cls, record):
     """Flatten a country record into a tuple of shared place attributes.

     Returns:
         Tuple of (origin_id, guid, name, name_alternatives, geom_center,
         geom_area, url); both geometries are passed through
         ``HF.return_ewkb_from_geotext``.
     """
     shared = PlaceBaseAttrShared(record)
     center_ewkb = HF.return_ewkb_from_geotext(shared.geom_center)
     area_ewkb = HF.return_ewkb_from_geotext(shared.geom_area)
     return (shared.origin_id, shared.guid, shared.name,
             shared.name_alternatives, center_ewkb, area_ewkb,
             shared.url)
 def __init__(self, record=None):
     """Copy the attributes of a user group record onto this object.

     Args:
         record: Record to read from; an empty ``lbsn.UserGroup()`` is
             used when omitted.
     """
     if record is None:
         record = lbsn.UserGroup()  # init empty structure
     primary_key = record.pkey
     self.origin_id = primary_key.origin.origin_id
     self.guid = primary_key.id
     self.usergroup_name = HF.null_check(record.usergroup_name)
     description = record.usergroup_description
     self.usergroup_description = HF.null_check(description)
     self.member_count = HF.null_check(record.member_count)
     created = record.usergroup_createdate
     self.usergroup_createdate = HF.null_check_datetime(created)
     self.user_owner = HF.null_check(record.user_owner_pkey.id)
 def prepare_lbsn_city(cls, record):
     """Flatten a city record into a tuple of place and city attributes.

     Returns:
         Tuple of (origin_id, guid, name, name_alternatives, geom_center,
         geom_area, url, country_guid, sub_type); both geometries are
         passed through ``HF.return_ewkb_from_geotext``.
     """
     shared = PlaceBaseAttrShared(record)
     country_guid = HF.null_check(record.country_pkey.id)
     sub_type = HF.null_check(record.sub_type)
     center_ewkb = HF.return_ewkb_from_geotext(shared.geom_center)
     area_ewkb = HF.return_ewkb_from_geotext(shared.geom_area)
     place_part = (shared.origin_id, shared.guid, shared.name,
                   shared.name_alternatives, center_ewkb, area_ewkb,
                   shared.url)
     return place_part + (country_guid, sub_type)
 def __init__(self, record=None):
     """Copy the attributes shared by place-like records onto this object.

     Args:
         record: Record to read from; an empty ``lbsn.Place()`` is used
             when omitted.
     """
     if record is None:
         record = lbsn.Place()  # init empty structure
     primary_key = record.pkey
     self.origin_id = primary_key.origin.origin_id
     self.guid = primary_key.id
     self.name = HF.null_check(record.name)
     # ProtoBuf repeated fields have no distinct rule, so duplicates are
     # removed here; the primary name is not kept as an alternative
     alternatives = set(record.name_alternatives)
     if self.name:
         alternatives.discard(self.name)
     self.name_alternatives = list(alternatives)
     self.url = HF.null_check(record.url)
     self.geom_center = HF.null_check(record.geom_center)
     self.geom_area = HF.null_check(record.geom_area)
 def extract_user(self, json_string_dict):
     """Create a lbsn.User record from the 'id' entry of a user dict."""
     return HF.new_lbsn_record_with_id(
         lbsn.User(), json_string_dict.get('id'), self.origin)
 def prepare_lbsn_event(cls, record):
     """Flatten an event record into a tuple of its attributes.

     The attributes are read from ``EventAttrShared(record)``; the two
     geometries (event_latlng, event_area) are passed through
     ``HF.return_ewkb_from_geotext``.
     """
     event = EventAttrShared(record)
     latlng_ewkb = HF.return_ewkb_from_geotext(event.event_latlng)
     area_ewkb = HF.return_ewkb_from_geotext(event.event_area)
     return (
         event.origin_id,
         event.event_guid,
         event.name,
         latlng_ewkb,
         area_ewkb,
         event.event_website,
         event.event_date,
         event.event_date_start,
         event.event_date_end,
         event.duration,
         event.place_guid,
         event.city_guid,
         event.country_guid,
         event.user_guid,
         event.event_description,
         event.event_type,
         event.event_share_count,
         event.event_like_count,
         event.event_comment_count,
         event.event_views_count,
         event.event_engage_count,
     )
 def extract_place(self, postplace_json):
     """Build a lbsn.Place record from a place dictionary.

     Args:
         postplace_json: Mapping read via ``.get`` for the keys 'id',
             'lng', 'lat', 'name' and 'slug'.

     Returns:
         The place record, or None if the place has no id (a warning is
         logged and the user is asked to confirm before skipping).
     """
     place = postplace_json
     place_id = place.get('id')
     if not place_id:
         self.log.warning(f'No PlaceGuid\n\n{place}')
         input("Press Enter to continue... (entry will be skipped)")
         return None
     lon_center = place.get('lng')
     lat_center = place.get('lat')
     if lon_center is None or lat_center is None:
         # assign place to Null Island
         lon_center = 0
         lat_center = 0
     place_record = HF.new_lbsn_record_with_id(
         lbsn.Place(), place_id, self.origin)
     place_record.geom_center = "POINT(%s %s)" % (lon_center, lat_center)
     place_name = place.get('name')
     if place_name:
         # place names sometimes contain linebreaks or runs of
         # whitespace; we don't want this
         place_name = place_name.replace('\n\r', '')
         place_name = re.sub(' +', ' ', place_name)
         # store the cleaned name on the record (it was computed but
         # never assigned before)
         place_record.name = place_name
     place_slug = place.get('slug')
     if place_slug:
         place_record.url = (
             f"https://www.instagram.com/explore/locations/"
             f"{place_id}/{place_slug}")
     return place_record
Esempio n. 17
0
    def sort_clean_proto_repeated_field(cls, record):
        """Deduplicate and sort every non-composite repeated field in place.

        ProtocolBuffers has no unique list field type, so duplicates are
        removed here, which is needed for unique compare.

        Composite containers are skipped. Detecting them is delegated to
        ``HF.is_composite_field_container`` because, depending on the
        ProtocolBuffers implementation in use, the container type is
        either 'RepeatedCompositeFieldContainer' or
        'RepeatedCompositeContainer'. Both are internal to PB and subject
        to change, see
        https://github.com/protocolbuffers/protobuf/issues/3870
        """
        for field in record.DESCRIPTOR.fields:
            if field.label != field.LABEL_REPEATED:
                continue
            values = getattr(record, field.name)
            if not values or HF.is_composite_field_container(values):
                continue
            unique_sorted = sorted(set(values))
            # empty the repeated field completely, one item at a time
            for _ in range(len(values)):
                values.pop()
            # refill with the sorted, unique values
            values.extend(unique_sorted)
Esempio n. 18
0
    def extract_user(cls, record, origin):
        """Create a lbsn.User from a flat record of user attributes.

        Plain attributes are copied with ``set_lbsn_attr``; language,
        location geometry and ``active_since`` are only set when the
        record holds a truthy value for them.

        Args:
            record: Mapping read via ``record.get``.
            origin: Origin passed to ``HF.new_lbsn_record_with_id``.

        Returns:
            The populated user record.
        """
        user = HF.new_lbsn_record_with_id(
            lbsn.User(), record.get('user_guid'), origin)
        for attr_name in (
                "user_name", "user_fullname", "follows", "followed",
                "biography", "post_count", "url", "is_private",
                "is_available"):
            set_lbsn_attr(user, attr_name, record)

        language_short = record.get('user_language')
        if language_short:
            language = lbsn.Language()
            language.language_short = language_short
            user.user_language.CopyFrom(language)

        set_lbsn_attr(user, "user_location", record)
        location_geom = record.get("user_location_geom")
        if location_geom:
            user.user_location_geom = parse_geom(location_geom)
        set_lbsn_attr(user, "liked_count", record)
        active_since = record.get('active_since')
        if active_since:
            copydate_lbsn_attr(user.active_since, active_since)
        for attr_name in (
                "profile_image_url", "user_timezone", "user_utc_offset",
                "user_groups_member", "user_groups_follows",
                "group_count"):
            set_lbsn_attr(user, attr_name, record)
        return user
Esempio n. 19
0
def set_lbsn_pkey(lbsn_obj_pkey, pkey_obj, pkey_val, origin_val):
    """Copy a newly built primary key into ``lbsn_obj_pkey``.

    The key is taken from the record that ``HF.new_lbsn_record_with_id``
    creates from ``pkey_obj``, ``pkey_val`` and ``origin_val``. Nothing
    happens if ``pkey_val`` is None.
    """
    if pkey_val is not None:
        keyed_record = HF.new_lbsn_record_with_id(
            pkey_obj, pkey_val, origin_val)
        lbsn_obj_pkey.CopyFrom(keyed_record.pkey)
Esempio n. 20
0
 def extract_place(cls, record, origin):
     """Create a lbsn.Place from a flat record of place attributes.

     Plain attributes are copied with ``set_lbsn_attr``; geometries and
     the city key are only set when the record holds a truthy value.

     Args:
         record: Mapping read via ``record.get``.
         origin: Origin passed to ``HF.new_lbsn_record_with_id``.

     Returns:
         The populated place record.
     """
     place = HF.new_lbsn_record_with_id(
         lbsn.Place(), record.get('place_guid'), origin)
     for attr_name in ("name", "post_count", "url"):
         set_lbsn_attr(place, attr_name, record)
     for geom_attr in ("geom_center", "geom_area"):
         geom_value = record.get(geom_attr)
         if geom_value:
             setattr(place, geom_attr, parse_geom(geom_value))
     city_guid = record.get('city_guid')
     if city_guid:
         set_lbsn_pkey(place.city_pkey, lbsn.City(), city_guid, origin)
     for attr_name in (
             "name_alternatives", "place_description", "place_website",
             "place_phone", "address", "zip_code", "attributes",
             "checkin_count", "like_count", "parent_places"):
         set_lbsn_attr(place, attr_name, record)
     return place
Esempio n. 21
0
 def extract_usergroup(cls, record, origin):
     """Create a lbsn.UserGroup from a flat record of group attributes.

     Args:
         record: Mapping read via ``record.get``.
         origin: Origin passed to ``HF.new_lbsn_record_with_id``.

     Returns:
         The populated user group record.
     """
     usergroup = HF.new_lbsn_record_with_id(
         lbsn.UserGroup(), record.get('usergroup_guid'), origin)
     # NOTE(review): values are assigned directly, including None for
     # missing entries; sibling extractors go through set_lbsn_attr
     # instead - confirm that direct assignment is intended here
     for field_name in (
             'usergroup_name', 'usergroup_description', 'member_count',
             'usergroup_createdate', 'user_owner'):
         setattr(usergroup, field_name, record.get(field_name))
     owner_guid = record.get('user_owner')
     if owner_guid:
         owner = HF.new_lbsn_record_with_id(
             lbsn.User(), owner_guid, origin)
         usergroup.user_owner_pkey.CopyFrom(owner.pkey)
     return usergroup
 def __init__(self, relationship=None):
     """Copy the attributes of a relationship record onto this object.

     Args:
         relationship: Record to read from; an empty
             ``lbsn.Relationship()`` is used when omitted.
     """
     if relationship is None:
         relationship = lbsn.Relationship()  # init empty structure
     target_key = relationship.pkey.relation_to
     self.origin_id = target_key.origin.origin_id
     self.guid = target_key.id
     self.guid_rel = relationship.pkey.relation_from.id
     # store the enum value by its lower-cased name
     type_name = lbsn.Relationship().RelationshipType.Name(
         relationship.relationship_type)
     self.rel_type = HF.null_check(type_name).lower()
Esempio n. 23
0
 def process_place_record(place_record, origin):
     """Map a Flickr place entry to the lbsn structure hierarchy.

     The entry has the form ``guid:name:type`` (url-quoted parts). The
     '/'-separated, lower-cased type decides whether a lbsn.Country,
     lbsn.City or lbsn.Place is created; unknown types default to
     lbsn.Place. The original, more detailed Flickr type is kept in
     ``sub_type`` (city) or ``place_description`` (place).

     Args:
         place_record: The raw ``guid:name:type`` string.
         origin: Origin passed to ``HF.new_lbsn_record_with_id``.

     Returns:
         The new record, with its center set to Null Island.

     Raises:
         ValueError: If the entry does not consist of exactly three
             ':'-separated parts.
     """
     parts = place_record.split(":")
     if len(parts) != 3:
         raise ValueError(f'Malformed place entry:\n'
                          f'place_record: {place_record}')
     place_guid, place_name, place_type = (unquote(part) for part in parts)
     place_name = place_name.replace('+', ' ')
     place_type_lw = place_type.lower()
     type_tokens = place_type_lw.split("/")
     # assignment: the first matching level wins
     if any(token in FLICKR_COUNTRY_MATCH for token in type_tokens):
         record_type = lbsn.Country
     elif any(token in FLICKR_CITY_MATCH for token in type_tokens):
         record_type = lbsn.City
     elif any(token in FLICKR_PLACE_MATCH for token in type_tokens):
         record_type = lbsn.Place
     else:
         logging.getLogger('__main__').debug(
             f'Could not assign place type {place_type_lw}\n'
             f'found in place_record: {place_record}\n'
             f'Will assign default "lbsn.Place"')
         record_type = lbsn.Place
     lbsn_place_record = HF.new_lbsn_record_with_id(
         record_type(), place_guid, origin)
     lbsn_place_record.name = place_name
     # record sub types only for city and place
     if isinstance(lbsn_place_record, lbsn.City):
         lbsn_place_record.sub_type = place_type
     elif isinstance(lbsn_place_record, lbsn.Place):
         lbsn_place_record.place_description = place_type
     # no url is provided; coordinates have to come from post data
     # later, so the center is set to Null Island first
     lbsn_place_record.geom_center = "POINT(%s %s)" % (0, 0)
     return lbsn_place_record
 def extract_mentioned_users(self, ref_user_records, user_record_id):
     """Append a MENTIONS_USER relationship for each referenced user.

     Args:
         ref_user_records: Iterable of records with a ``pkey.id``.
         user_record_id: Id of the mentioning user.
     """
     for mentioned_user in ref_user_records:
         mention = HF.new_lbsn_relation_with_id(
             lbsn.Relationship(), user_record_id,
             mentioned_user.pkey.id, self.origin)
         mention.relationship_type = lbsn.Relationship.MENTIONS_USER
         self.lbsn_records.append(mention)
Esempio n. 25
0
 def extract_city(cls, record, origin):
     """Create a lbsn.City from a flat record of city attributes.

     Plain attributes are copied with ``set_lbsn_attr``; geometries and
     the country key are only set when the record holds a truthy value.

     Args:
         record: Mapping read via ``record.get``.
         origin: Origin passed to ``HF.new_lbsn_record_with_id``.

     Returns:
         The populated city record.
     """
     city = HF.new_lbsn_record_with_id(
         lbsn.City(), record.get('city_guid'), origin)
     set_lbsn_attr(city, "name", record)
     for geom_attr in ("geom_center", "geom_area"):
         geom_value = record.get(geom_attr)
         if geom_value:
             setattr(city, geom_attr, parse_geom(geom_value))
     country_guid = record.get('country_guid')
     if country_guid:
         country = HF.new_lbsn_record_with_id(
             lbsn.Country(), country_guid, origin)
         city.country_pkey.CopyFrom(country.pkey)
     for attr_name in ("url", "name_alternatives", "sub_type"):
         set_lbsn_attr(city, attr_name, record)
     return city
 def extract_mentioned_users(self, ref_user_records, user_record_id):
     """Register a MENTIONS_USER relationship for each referenced user.

     Args:
         ref_user_records: Iterable of records with a ``pkey.id``.
         user_record_id: Id of the mentioning user.
     """
     for mentioned_user in ref_user_records:
         mention = HF.new_lbsn_relation_with_id(
             lbsn.Relationship(), user_record_id,
             mentioned_user.pkey.id, self.origin)
         mention.relationship_type = lbsn.Relationship.MENTIONS_USER
         self.lbsn_records.add_relationship_to_dict(mention)
Esempio n. 27
0
 def close_log():
     """Shut down logging and append log.log to an archive file.

     The archive file is named after ``HF.get_str_formatted_today()``
     with a ``.log`` suffix; a newline is written before the copied
     lines.
     """
     logging.shutdown()
     # archive file for today's log
     today = HF.get_str_formatted_today()
     archive_path = Path(f"{today}.log")
     with open(archive_path, 'a+') as archive, open('log.log') as current:
         archive.write('\n')
         archive.writelines(current)
 def __init__(self, record=None):
     """Copy the attributes of a post reaction record onto this object.

     Args:
         record: Record to read from; an empty ``lbsn.PostReaction()``
             is used when omitted.
     """
     if record is None:
         record = lbsn.PostReaction()  # init empty structure
     primary_key = record.pkey
     self.origin_id = primary_key.origin.origin_id
     self.guid = primary_key.id
     self.reaction_latlng = HF.null_geom_check(record.reaction_latlng)
     self.user_guid = HF.null_check(record.user_pkey.id)
     self.referenced_post = HF.null_check(record.referencedPost_pkey.id)
     referenced_reaction_id = record.referencedPostreaction_pkey.id
     self.referenced_postreaction = HF.null_check(referenced_reaction_id)
     # store the enum value by its lower-cased name
     type_name = lbsn.PostReaction().ReactionType.Name(
         record.reaction_type)
     self.reaction_type = HF.turn_lower(HF.null_check(type_name))
     self.reaction_date = HF.null_check_datetime(record.reaction_date)
     self.reaction_content = HF.null_check(record.reaction_content)
     self.reaction_like_count = HF.null_check(record.reaction_like_count)
     # distinct ids of all mentioned users
     self.user_mentions = list(
         {pkey.id for pkey in record.user_mentions_pkey})
Esempio n. 29
0
 def extract_country(cls, record, origin):
     """Create a lbsn.Country from a flat record of country attributes.

     Plain attributes are copied with ``set_lbsn_attr``; geometries are
     only set when the record holds a truthy value for them.

     Args:
         record: Mapping read via ``record.get``.
         origin: Origin passed to ``HF.new_lbsn_record_with_id``.

     Returns:
         The populated country record.
     """
     country = HF.new_lbsn_record_with_id(
         lbsn.Country(), record.get('country_guid'), origin)
     set_lbsn_attr(country, "name", record)
     for geom_attr in ("geom_center", "geom_area"):
         geom_value = record.get(geom_attr)
         if geom_value:
             setattr(country, geom_attr, parse_geom(geom_value))
     for attr_name in ("url", "name_alternatives"):
         set_lbsn_attr(country, attr_name, record)
     return country
Esempio n. 30
0
    def fetch_json_data_from_file(self, file_handle):
        """Read json entries from file and yield them one by one.

        Exactly one of three formats is processed:

        - ``self.is_stacked_json``: concatenated jsons in the form of
          {json1}{json2}
        - ``self.is_line_separated_json``: one json per line
          {json1}
          {json2}
          ...
        - otherwise: a regular json document, typically
          [{json1},{json2}]; a list is yielded item by item, any other
          truthy result as a single record, followed by a final None.

        Args:
            file_handle: Open text handle to read from.

        Yields:
            The records as returned by the ``HF`` json helpers.
        """
        if self.is_stacked_json:
            # note: this requires loading the file completely first,
            # not streaming optimized yet
            for record in HF.json_read_wrapper(
                    HF.decode_stacked(file_handle.read())):
                yield record
        elif self.is_line_separated_json:
            # elif, not if: stacked input must not fall through to the
            # regular branch below, which would try to parse the already
            # exhausted handle a second time
            for line in file_handle:
                yield HF.json_load_wrapper(line, single=True)
        else:
            # regular json document, e.g. [{record1},{record2}]
            records = HF.json_load_wrapper(file_handle)
            if records:
                if isinstance(records, list):
                    yield from records
                else:
                    yield records
            yield None