def dict_type_switcher(desc_name):
    """Create a fresh protoBuf message instance for the given type name.

    Returns None if desc_name matches no known lbsn message type.
    """
    message_classes = (
        lbsn.Country, lbsn.City, lbsn.Place, lbsn.User,
        lbsn.UserGroup, lbsn.Post, lbsn.PostReaction, lbsn.Relationship,
    )
    # build name -> instance lookup from the class list
    lookup = {}
    for msg_cls in message_classes:
        instance = msg_cls()
        lookup[instance.DESCRIPTOR.name] = instance
    return lookup.get(desc_name)
def dict_selector(self, record):
    """Get dictionary by record type name"""
    storage_pairs = (
        (lbsn.Post, self.lbsn_post_dict),
        (lbsn.Country, self.lbsn_country_dict),
        (lbsn.City, self.lbsn_city_dict),
        (lbsn.Place, self.lbsn_place_dict),
        (lbsn.PostReaction, self.lbsn_post_reaction_dict),
        (lbsn.User, self.lbsn_user_dict),
        (lbsn.UserGroup, self.lbsn_user_group_dict),
        (lbsn.Origin, self.lbsn_origin_dict),
    )
    # map protobuf type name -> type-specific storage dict
    lookup = {msg_cls().DESCRIPTOR.name: store
              for msg_cls, store in storage_pairs}
    return lookup.get(record.DESCRIPTOR.name)
def __init__(self, record=None):
    """Flatten a lbsn.Post protobuf record into plain Python attributes.

    Args:
        record: lbsn.Post message; if None, an empty lbsn.Post is used so
            every attribute receives its null/default value.
    """
    if record is None:
        record = lbsn.Post()
    # composite primary key: origin id + post guid
    self.origin_id = record.pkey.origin.origin_id
    self.guid = record.pkey.id
    self.post_latlng = HF.null_geom_check(record.post_latlng)
    # spatial references (composite keys of place/city/country)
    self.place_guid = HF.null_check(record.place_pkey.id)
    self.city_guid = HF.null_check(record.city_pkey.id)
    self.country_guid = HF.null_check(record.country_pkey.id)
    # enum int value -> lowercase enum name (e.g. "latlng")
    self.post_geoaccuracy = HF.turn_lower(
        HF.null_check(lbsn.Post().PostGeoaccuracy.Name(
            record.post_geoaccuracy)))
    self.user_guid = HF.null_check(record.user_pkey.id)
    self.post_create_date = HF.null_check_datetime(record.post_create_date)
    self.post_publish_date = HF.null_check_datetime(
        record.post_publish_date)
    self.post_body = HF.null_check(record.post_body)
    self.post_language = HF.null_check(record.post_language.language_short)
    # deduplicate repeated protobuf fields via set()
    self.user_mentions = list(
        set([pkey.id for pkey in record.user_mentions_pkey]))
    self.hashtags = list(set(record.hashtags))
    self.emoji = list(set(record.emoji))
    self.post_like_count = HF.null_check(record.post_like_count)
    self.post_comment_count = HF.null_check(record.post_comment_count)
    self.post_views_count = HF.null_check(record.post_views_count)
    self.post_title = HF.null_check(record.post_title)
    self.post_thumbnail_url = HF.null_check(record.post_thumbnail_url)
    self.post_url = HF.null_check(record.post_url)
    # enum int value -> lowercase enum name (e.g. "image")
    self.post_type = HF.turn_lower(
        HF.null_check(lbsn.Post().PostType.Name(record.post_type)))
    self.post_filter = HF.null_check(record.post_filter)
    self.post_quote_count = HF.null_check(record.post_quote_count)
    self.post_share_count = HF.null_check(record.post_share_count)
    self.input_source = HF.null_check(record.input_source)
    self.post_content_license = HF.null_check(record.post_content_license)
    # optional: zero-initialized; presumably filled later by callers
    self.latitude = 0
    self.longitude = 0
def type_sql_mapper(cls):
    """Assigns record types to SQL Insert SQLs"""
    sql_pairs = (
        (lbsn.Origin, cls.origin_insertsql),
        (lbsn.Country, cls.country_insertsql),
        (lbsn.City, cls.city_insertsql),
        (lbsn.Place, cls.place_insertsql),
        (lbsn.User, cls.user_insertsql),
        (lbsn.UserGroup, cls.usergroup_insertsql),
        (lbsn.Post, cls.post_insertsql),
        (lbsn.Event, cls.event_insertsql),
        (lbsn.PostReaction, cls.postreaction_insertsql),
    )
    # key by protobuf type name so callers can look up by DESCRIPTOR.name
    return {msg_cls().DESCRIPTOR.name: insert_sql
            for msg_cls, insert_sql in sql_pairs}
def get_hll_metrics(cls, record) -> hll.HllMetrics:
    """Extracts hll metrics based on record type"""
    extractor_pairs = (
        (lbsn.Origin, cls.get_origin_metrics),
        (lbsn.Country, cls.get_country_metrics),
        (lbsn.City, cls.get_city_metrics),
        (lbsn.Place, cls.get_place_metrics),
        (lbsn.User, cls.get_user_metrics),
        (lbsn.UserGroup, cls.get_usergroup_metrics),
        (lbsn.Post, cls.get_post_metrics),
        (lbsn.PostReaction, cls.get_postreaction_metrics),
        (lbsn.Relationship, cls.get_relationship_metrics),
    )
    # dispatch on the record's protobuf type name
    extractors = {msg_cls().DESCRIPTOR.name: func
                  for msg_cls, func in extractor_pairs}
    extract_function = extractors.get(record.DESCRIPTOR.name)
    return extract_function(record)
def func_prepare_selector(self, record):
    """Select correct prepare function according to record type"""
    handler_pairs = (
        (lbsn.Origin, self.prepare_lbsn_origin),
        (lbsn.Country, self.prepare_lbsn_country),
        (lbsn.City, self.prepare_lbsn_city),
        (lbsn.Place, self.prepare_lbsn_place),
        (lbsn.User, self.prepare_lbsn_user),
        (lbsn.UserGroup, self.prepare_lbsn_usergroup),
        (lbsn.Post, self.prepare_lbsn_post),
        (lbsn.Event, self.prepare_lbsn_event),
        (lbsn.PostReaction, self.prepare_lbsn_postreaction),
        (lbsn.Relationship, self.prepare_lbsn_relation),
    )
    # dispatch on the record's protobuf type name
    handlers = {msg_cls().DESCRIPTOR.name: func
                for msg_cls, func in handler_pairs}
    prepare_function = handlers.get(record.DESCRIPTOR.name)
    return prepare_function(record)
def get_func_record(cls, record: Dict[str, Any],
                    input_type: Optional[str] = None):
    """Returns mapping function for input_type"""
    extractors = {
        lbsn.Origin().DESCRIPTOR.name: cls.extract_origin,
        lbsn.Country().DESCRIPTOR.name: cls.extract_country,
        lbsn.City().DESCRIPTOR.name: cls.extract_city,
        lbsn.Place().DESCRIPTOR.name: cls.extract_place,
        lbsn.UserGroup().DESCRIPTOR.name: cls.extract_usergroup,
        lbsn.User().DESCRIPTOR.name: cls.extract_user,
        lbsn.Post().DESCRIPTOR.name: cls.extract_post,
        lbsn.PostReaction().DESCRIPTOR.name: cls.extract_postreaction,
        lbsn.Event().DESCRIPTOR.name: cls.extract_event,
    }
    extract_func = extractors.get(input_type)
    # origin is always constructed the same way, regardless of input type
    origin = lbsn.Origin()
    origin.origin_id = record.get('origin_id')
    return extract_func(record, origin)
def __init__(self):
    """Initialize per-type record stores, dedup hashes and counters."""
    self.lbsn_origin_dict = {}
    self.lbsn_country_dict = {}
    self.lbsn_city_dict = {}
    self.lbsn_place_dict = {}
    self.lbsn_user_group_dict = {}
    self.lbsn_user_dict = {}
    self.lbsn_post_dict = {}
    self.lbsn_post_reaction_dict = {}
    self.lbsn_relationship_dict = {}
    # one hash set per tracked record type (insertion order preserved)
    tracked_types = (
        lbsn.Origin, lbsn.Post, lbsn.Country, lbsn.City, lbsn.Place,
        lbsn.UserGroup, lbsn.User, lbsn.PostReaction, lbsn.Relationship,
    )
    self.key_hashes = {msg_cls.DESCRIPTOR.name: set()
                       for msg_cls in tracked_types}
    self.count_glob = 0  # total number of records added
    self.count_glob_total = 0
    self.count_dup_merge = 0  # number of duplicate records merged
    self.count_dup_merge_total = 0
    # returns all recordsDicts in correct order,
    # with names as references (tuple)
    ordered_stores = (
        (self.lbsn_origin_dict, lbsn.Origin),
        (self.lbsn_country_dict, lbsn.Country),
        (self.lbsn_city_dict, lbsn.City),
        (self.lbsn_place_dict, lbsn.Place),
        (self.lbsn_user_group_dict, lbsn.UserGroup),
        (self.lbsn_user_dict, lbsn.User),
        (self.lbsn_post_dict, lbsn.Post),
        (self.lbsn_post_reaction_dict, lbsn.PostReaction),
        (self.lbsn_relationship_dict, lbsn.Relationship),
    )
    self.all_dicts = [(store, msg_cls().DESCRIPTOR.name)
                      for store, msg_cls in ordered_stores]
def update_post_with_place(self, post_record: lbsn.Post = None,
                           post_guid: str = None,
                           place_records: List[lbsn.Place] = None):
    """Update post record with entries from place record"""
    if post_record is None:
        # no post yet: a guid is required to create one
        if post_guid is None:
            raise ValueError("Cannot create lbsn.Post without post_guid")
        post_record = HF.new_lbsn_record_with_id(
            lbsn.Post(), post_guid, self.origin)
    if place_records is None:
        return post_record
    for place_rec in place_records:
        if isinstance(place_rec, lbsn.Country):
            post_record.country_pkey.CopyFrom(place_rec.pkey)
        if isinstance(place_rec, lbsn.City):
            post_record.city_pkey.CopyFrom(place_rec.pkey)
        # either city or place, Twitter user cannot attach both (?)
        elif isinstance(place_rec, lbsn.Place):
            post_record.place_pkey.CopyFrom(place_rec.pkey)
    return post_record
def extract_post(cls, record, origin):
    """Map a flat input record (dict) to a lbsn.Post protobuf message.

    Args:
        record: dict with lbsn-style attribute keys
            (post_guid, post_latlng, user_guid, ...)
        origin: lbsn.Origin of the input data, used for composite keys

    Returns:
        lbsn.Post with all available attributes assigned.
    """
    post = HF.new_lbsn_record_with_id(lbsn.Post(),
                                      record.get('post_guid'),
                                      origin)
    post_latlng = record.get("post_latlng")
    if post_latlng:
        setattr(post, "post_latlng", parse_geom(post_latlng))
    # optional spatial references (place/city/country composite keys)
    place_guid = record.get('place_guid')
    if place_guid:
        set_lbsn_pkey(post.place_pkey, lbsn.Place(), place_guid, origin)
    city_guid = record.get('city_guid')
    if city_guid:
        set_lbsn_pkey(post.city_pkey, lbsn.City(), city_guid, origin)
    country_guid = record.get('country_guid')
    if country_guid:
        set_lbsn_pkey(post.country_pkey, lbsn.Country(),
                      country_guid, origin)
    set_lbsn_pkey(post.user_pkey, lbsn.User(),
                  record.get('user_guid'), origin)
    pub_date = record.get('post_publish_date')
    if pub_date:
        copydate_lbsn_attr(post.post_publish_date, pub_date)
    set_lbsn_attr(post, "post_body", record)
    # BUGFIX: removed stray no-op expression statement
    # (`post.post_geoaccuracy` on its own line had no effect)
    geo_acc = record.get("post_geoaccuracy")
    if geo_acc:
        # map accuracy string to enum value
        post.post_geoaccuracy = lbsn.Post.PostGeoaccuracy.Value(
            geo_acc.upper())
    set_lbsn_attr(post, "hashtags", record)
    set_lbsn_attr(post, "emoji", record)
    set_lbsn_attr(post, "post_like_count", record)
    set_lbsn_attr(post, "post_comment_count", record)
    set_lbsn_attr(post, "post_views_count", record)
    set_lbsn_attr(post, "post_title", record)
    crt_date = record.get('post_create_date')
    if crt_date:
        copydate_lbsn_attr(post.post_create_date, crt_date)
    set_lbsn_attr(post, "post_thumbnail_url", record)
    set_lbsn_attr(post, "post_url", record)
    post_type = record.get("post_type")
    if post_type:
        # map type string to enum value
        post.post_type = lbsn.Post.PostType.Value(post_type.upper())
    set_lbsn_attr(post, "post_filter", record)
    set_lbsn_attr(post, "post_quote_count", record)
    set_lbsn_attr(post, "post_share_count", record)
    lang = record.get('post_language')
    if lang:
        ref_post_language = lbsn.Language()
        ref_post_language.language_short = lang
        post.post_language.CopyFrom(ref_post_language)
    set_lbsn_attr(post, "input_source", record)
    user_mentions = record.get("user_mentions")
    if user_mentions:
        # only the composite keys of mentioned users are stored
        post.user_mentions_pkey.extend([
            HF.new_lbsn_record_with_id(lbsn.User(), user_id, origin).pkey
            for user_id in user_mentions
        ])
    set_lbsn_attr(post, "post_content_license", record)
    return post
def extract_flickr_post(self, record):
    """Main function for processing Flickr YFCC100M CSV entry.

    This method is adapted to a special structure, adapt if needed.

    To Do:
        - parameterize column numbers and structure
        - provide external config-file for specific CSV structures
        - currently not included in lbsn mapping are MachineTags,
          GeoContext (indoors, outdoors), WoeId and some extra
          attributes only present for Flickr

    Overview of available columns and examples:
    0 row-number - 0
    1 Photo/video identifier - 6985418911
    2 lbsn.User NSID(PostID?) - 4e2f7a26a1dfbf165a7e30bdabf7e72a
    3 lbsn.User ID - 39089491@N00
    4 lbsn.User nickname - gnuckx
    5 Date taken - 2012-02-16 09:56:37.0
    6 Date uploaded - 1331840483
    7 Capture device - Canon+PowerShot+ELPH+310+HS
    8 Title - IMG_0520
    9 Description - My vacation
    10 tags (comma-separated) - canon,canon+powershot+hs+310
    11 Machine tags (comma-separated) - landscape, hills, water
    12 Longitude - -81.804885
    13 Latitude - 24.550558
    14 Accuracy - 12
    15 Photo/video page URL -
       http://www.flickr.com/photos/39089491@N00/6985418911/
    16 Photo/video download URL -
       http://farm8.staticflickr.com/7205/6985418911_df7747990d.jpg
    17 License name - Attribution-NonCommercial-NoDerivs License
    18 License URL - http://creativecommons.org/licenses/by-nc-nd/2.0/
    19 Photo/video server identifier - 7205
    20 Photo/video farm identifier - 8
    21 Photo/video secret - df7747990d
    22 Photo/video secret original - 692d7e0a7f
    23 Extension of the original photo - jpg
    24 Marker (0 = photo, 1 = video) - 0
    if concat:
    25 Photo/video identifier
    26 Place references (null to multiple)
    """
    # note that one input record may contain many lbsn records
    # therefore, return list of processed records
    lbsn_records = []
    # start mapping input to lbsn_records
    post_guid = record[1]
    if not HF.check_notice_empty_post_guid(post_guid):
        return None
    post_record = HF.new_lbsn_record_with_id(lbsn.Post(),
                                             post_guid,
                                             self.origin)
    user_record = HF.new_lbsn_record_with_id(lbsn.User(),
                                             record[3],
                                             self.origin)
    # YFCC100M values are URL-encoded; '+' stands for a space
    user_record.user_name = unquote(record[4]).replace('+', ' ')
    user_record.url = f'http://www.flickr.com/photos/{user_record.pkey.id}/'
    if user_record:
        post_record.user_pkey.CopyFrom(user_record.pkey)
        lbsn_records.append(user_record)
    post_record.post_latlng = self.flickr_extract_postlatlng(record)
    geoaccuracy = importer.flickr_map_geoaccuracy(record[14])
    if geoaccuracy:
        post_record.post_geoaccuracy = geoaccuracy
    # place record available in separate yfcc100m dataset;
    # see extract_flickr_place handling at the end of this method
    post_record.post_publish_date.CopyFrom(
        HF.parse_timestamp_string_to_protobuf(record[6]))
    post_created_date = HF.parse_csv_datestring_to_protobuf(
        record[5], t_format='%Y-%m-%d %H:%M:%S.%f')
    if post_created_date:
        post_record.post_create_date.CopyFrom(post_created_date)
    # counts are not part of the YFCC100M dump; initialize to zero
    post_record.post_views_count = 0
    post_record.post_comment_count = 0
    post_record.post_like_count = 0
    post_record.post_url = record[15]
    # YFCC100M dataset contains HTML codes (%20) and
    # space character is replaced by +
    post_record.post_body = unquote(record[9]).replace('+', ' ')
    post_record.post_title = unquote(record[8]).replace('+', ' ')
    post_record.post_thumbnail_url = record[16]  # note: fullsize url!
    # split tags by , and + because by lbsn-spec,
    # tags are limited to single word
    record_tags_list = list(
        set(
            filter(None, [
                HF.remove_prefix(unquote(tag), "#")
                for tag in re.split("[,+]+", record[10])
            ])))
    if record_tags_list:
        for tag in record_tags_list:
            tag = importer.clean_tags_from_flickr(tag)
            post_record.hashtags.append(tag)
    record_machine_tags = list(
        set(
            filter(
                None,
                [unquote(mtag) for mtag in re.split("[,+]+", record[11])])))
    if 'video' in record_machine_tags:
        # all videos appear to have 'video' in machine tags
        post_record.post_type = lbsn.Post.VIDEO
    else:
        post_record.post_type = lbsn.Post.IMAGE
    # replace text-string of content license by integer-id
    # NOTE(review): CSV fields are typically str, never None — this
    # guard presumably always passes; confirm against reader
    if record[17] is not None:
        post_record.post_content_license = \
            self.get_license_number_from_license_name(record[17])
    # place record available in separate yfcc100m dataset
    # if records parsed as joined urls, length is larger than 25
    if len(record) > 25:
        post_plus_place_records = self.extract_flickr_place(
            record[25:], post_record=post_record)
        if post_plus_place_records is None:
            lbsn_records.append(post_record)
        else:
            lbsn_records.extend(post_plus_place_records)
    else:
        lbsn_records.append(post_record)
    return lbsn_records
Module for db input connection sql mapping """ import enum from typing import Union, Optional, List, Tuple from lbsnstructure import lbsnstructure_pb2 as lbsn """Schema convention from lbsn db spec""" LBSN_SCHEMA = [ (lbsn.Origin().DESCRIPTOR.name, "social", "origin", "origin_id"), (lbsn.Country().DESCRIPTOR.name, "spatial", "country", "country_guid"), (lbsn.City().DESCRIPTOR.name, "spatial", "city", "city_guid"), (lbsn.Place().DESCRIPTOR.name, "spatial", "place", "place_guid"), (lbsn.UserGroup().DESCRIPTOR.name, "social", "user_groups", "usergroup_guid"), (lbsn.User().DESCRIPTOR.name, "social", "user", "user_guid"), (lbsn.Post().DESCRIPTOR.name, "topical", "post", "post_guid"), (lbsn.PostReaction().DESCRIPTOR.name, "topical", "post_reaction", "reaction_guid"), ] def optional_schema_override( LBSN_SCHEMA: List[Tuple[str, str, str, str]], schema_table_overrides: List[Tuple[str, str]]) -> List[Tuple[str, str, str, str]]: """Override schema and table name for selected lbsn objects.""" LBSN_SCHEMA_OVERRIDE = [] for lbsn_type, schema_name, table_name, key_col in LBSN_SCHEMA: for schema_table_override in schema_table_overrides: lbsn_object_ref, schema_table_override = schema_table_override try: schema_override, table_override = schema_table_override.split(
def extract_post(self, json_string_dict, user_pkey=None):
    """Returns tuple of lbsn.Post() and List of post_context_records

    e.g.: (lbsn.Post(), [lbsn.Country(), lbsn.City(), lbsn.Place(),
    lbsn.User()])

    NOTE(review): only the empty-guid guard returns a 2-tuple
    (None, None); every other path returns a single lbsn.Post or None —
    confirm intended contract (parse_json_post treats the result as
    single-valued).
    """
    post_guid = json_string_dict.get('id_str')
    if not HF.check_notice_empty_post_guid(post_guid):
        return None, None
    post_record = HF.new_lbsn_record_with_id(lbsn.Post(),
                                             post_guid,
                                             self.origin)
    post_geoacc = None
    user_record = None
    user_info = json_string_dict.get('user')
    if user_info:
        # Get lbsn.Post/Reaction Details of lbsn.User
        user_record = self.extract_user(json_string_dict.get('user'))
    elif user_pkey:
        # userPkey is already available for posts that are statuses
        user_record = HF.new_lbsn_record_with_id(lbsn.User(),
                                                 user_pkey.id,
                                                 self.origin)
    if user_record:
        self.lbsn_records.append(user_record)
    else:
        # post is kept even without a resolvable user; warn only
        self.log.warning(f'Record {self.lbsn_records.count_glob_total}: '
                         f'No lbsn.User record found for post: {post_guid} '
                         f'(post saved without userid)..')
        print(f'Record {self.lbsn_records.count_glob_total}', end='\r')
    # Some preprocessing for all types:
    post_coordinates = json_string_dict.get('coordinates')
    if post_coordinates:
        # GeoJSON coordinate order is (lng, lat)
        l_lng = post_coordinates.get('coordinates')[0]
        l_lat = post_coordinates.get('coordinates')[1]
        post_record.post_geoaccuracy = lbsn.Post.LATLNG
        post_record.post_latlng = "POINT(%s %s)" % (l_lng, l_lat)
    # Check if lbsn.Place is mentioned
    post_place_json = json_string_dict.get('place')
    if post_place_json:
        # we need some information from postRecord to create placeRecord
        # (e.g. user language, geoaccuracy, post_latlng)
        # some of the information from place will also modify postRecord
        # attributes; therefore return both
        if user_record:
            user_lang = user_record.user_language
        else:
            user_lang = None
        place_record, \
            post_geoacc, \
            post_country = self.extract_place(post_place_json,
                                              post_record.post_geoaccuracy,
                                              user_lang)
        if not post_record.post_geoaccuracy:
            post_record.post_geoaccuracy = post_geoacc
        self.lbsn_records.append(place_record)
        if post_country:
            post_record.country_pkey.CopyFrom(post_country.pkey)
        if isinstance(place_record, lbsn.City):
            post_record.city_pkey.CopyFrom(place_record.pkey)
        # either city or place, Twitter user cannot attach both (?)
        elif isinstance(place_record, lbsn.Place):
            post_record.place_pkey.CopyFrom(place_record.pkey)
        # substitute postRecord LatLng Coordinates from placeRecord,
        # if not already set
        if not post_record.post_latlng:
            # Note: this will also substitute lbsn.Country lat/lng in post
            # this information is also available by query of
            # country_guid in posts
            # use input arg min_geoaccuracy to exclude country geo-posts
            post_record.post_latlng = place_record.geom_center
    # if still no geoinformation, send post to Null-Island
    if not post_record.post_latlng:
        if self.ignore_non_geotagged is True:
            return None
        self.null_island += 1
        post_record.post_latlng = "POINT(%s %s)" % (0, 0)
    if self.min_geoaccuracy:
        if not HF.geoacc_within_threshold(post_record.post_geoaccuracy,
                                          self.min_geoaccuracy):
            self.skipped_low_geoaccuracy += 1
            return None
    # Process attributes of twitter post
    post_source = json_string_dict.get('source')
    if post_source:
        post_record.input_source = HF.cleanhtml(
            json_string_dict.get('source'))
        if self.ignore_sources_set and \
                post_record.input_source in self.ignore_sources_set:
            # skip entry if in ignore list
            self.skipped_ignore_list += 1
            return None
    post_record.post_publish_date.CopyFrom(
        HF.json_date_string_to_proto(json_string_dict.get('created_at')))
    if user_record:
        post_record.user_pkey.CopyFrom(user_record.pkey)
    post_record.post_quote_count = HF.value_count(
        json_string_dict.get('quote_count'))
    post_record.post_comment_count = HF.value_count(
        json_string_dict.get('reply_count'))
    post_record.post_share_count = HF.value_count(
        json_string_dict.get('retweet_count'))
    post_record.post_like_count = HF.value_count(
        json_string_dict.get('favorite_count'))
    post_record.post_url = f'https://twitter.com/statuses/{post_guid}'
    language_str = json_string_dict.get('lang')
    if language_str:
        post_language = lbsn.Language()
        post_language.language_short = json_string_dict.get('lang')
        post_record.post_language.CopyFrom(post_language)
    # If Extended_tweet object is available,
    # process entities and post_body (text) data from extended object
    is_truncated = json_string_dict.get('truncated')
    if is_truncated and 'extended_tweet' in json_string_dict:
        # if the "truncated" field is set to true,
        # and the "extended_tweet" object provides complete
        # "full_text" and "entities" Tweet metadata
        # Source for all data is extended object, if available
        # (note: rebinds the local json_string_dict name)
        json_string_dict = json_string_dict.get('extended_tweet')
        post_record.post_body = json_string_dict.get('full_text')
    else:
        if 'full_text' in json_string_dict:
            post_record.post_body = json_string_dict.get('full_text')
        else:
            post_record.post_body = json_string_dict.get('text')
    # entities section always exists and includes meta information
    # such as hashtags or user_mentions
    entities_json = json_string_dict.get('entities')
    # extract hashtags
    hashtags_json = entities_json.get('hashtags')
    if hashtags_json:
        for hashtag in hashtags_json:  # iterate over the list
            post_record.hashtags.append(hashtag.get("text"))
    # Look for mentioned userRecords
    user_mentions_json = entities_json.get('user_mentions')
    if user_mentions_json:
        ref_user_records = HF.get_mentioned_users(user_mentions_json,
                                                  self.origin)
        # NOTE(review): a list of records is appended here; presumably
        # self.lbsn_records.append accepts lists — confirm
        self.lbsn_records.append(ref_user_records)
        post_record.user_mentions_pkey.extend(
            [user_ref.pkey for user_ref in ref_user_records])
        if self.map_full_relations:
            # NOTE(review): user_record may be None on this path (see
            # warning branch above) — pkey.id would raise; confirm
            self.extract_mentioned_users(
                ref_user_records, user_record.pkey.id)
    # sometimes, extended_entities section exists and includes
    # additional information on media, but never hashtags or user_mentions
    # Since the media type metadata in the extended_entities section
    # correctly indicates the media type
    # ('photo', 'video' or 'animated_gif'),
    # and supports up to 4 photos, it is the preferred metadata
    # source for native media. See:
    # https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/extended-entities-object.html#extended-entities-object
    if 'extended_entities' in json_string_dict:
        entities_json = json_string_dict.get('extended_entities')
    media_json = entities_json.get('media')
    if media_json:
        post_record.post_type = HF.assign_media_post_type(media_json)
    else:
        post_record.post_type = lbsn.Post.TEXT
    post_record.emoji.extend(HF.extract_emoji(post_record.post_body))
    # because standard print statement will produce escaped text,
    # we can use protobuf text_format to give us a human friendly
    # version of the text
    # log.debug(f'lbsn.Post record: '
    #           f'{text_format.MessageToString(postRecord, as_utf8=True)}')
    return post_record
def parse_json_post(self, json_string_dict, user_pkey=None):
    """Extract json post retrieved from Twitter API

    The process is nested, but pretty linear:
    1. Extract all relevant lbsn.Post Attributes
       1.a extract post coordinates
       1.b extract user attributes
       1.c extract place attributes (poi, city, neigborhood, admin, country)
       1.d extract extended tweet, if available, and extended entities,
           if available
    2. decide if post is reaction (reply, quote, share, see
       https://developer.twitter.com/en/docs/tweets/data-dictionary/
       overview/entities-object.html)
    3. if post is reaction, copy reduced reaction attributes from
       extracted lbsn.Post
    4. add post/reaction to recordDict
    5. process all referenced posts
       5.a Retweet(=Share) and Quote Tweets are special kinds of Tweets
           that contain the original Tweet as an embedded object.
       5.b Retweets have a top-level "retweeted_status" object, and
           Quoted Tweets have a "quoted_status" object

    Note: one input record may contain many lbsn records, therefore
    records are first added to self.lbsn_records to be later returned
    together.
    """
    post_record = self.extract_post(json_string_dict, user_pkey)
    if not post_record:
        # no post record extracted (e.g. non_geotagged clause)
        return
    # Assignment Step: check if post is reaction to other post.
    # reaction means: reduced structure compared to post; reactions
    # often include the complete original post, therefore nested
    # processing is necessary
    if HF.is_post_reaction(json_string_dict):
        if self.map_reactions is False:
            return
        post_reaction_record = self.map_postrecord_to_postreactionrecord(
            post_record)
        refuser_pkey = None
        # BUGFIX: ref_post_record was previously unbound (NameError)
        # when none of the known reaction kinds below matched
        ref_post_record = None
        if 'quoted_status' in json_string_dict:
            # Note: Quote is both: Share & Reply
            if 'user' not in json_string_dict.get('quoted_status'):
                refuser_pkey = \
                    HF.substitute_referenced_user(json_string_dict,
                                                  self.origin,
                                                  self.log)
            post_reaction_record.reaction_type = lbsn.PostReaction.QUOTE
            # NOTE: refuser_pkey is computed above but not passed here
            # (quirk preserved from original behavior)
            ref_post_record = self.extract_post(
                json_string_dict.get('quoted_status'))
        elif 'retweeted_status' in json_string_dict:
            # Note: No retweets are available when data is queried
            # using Bounding Box because of Geo-Tweet limitation:
            # "Note that native Retweets are not matched by this
            # parameter. While the original Tweet may have a location,
            # the Retweet will not"
            # see https://developer.twitter.com/en/docs/
            # tweets/filter-realtime/guides/basic-stream-parameters.html
            if 'user' not in json_string_dict.get('retweeted_status'):
                # Current issue with Twitter search: the retweeting
                # user is not returned in retweeted_status
                # but we can get this from other information,
                # such as user_mentions field from the retweet
                refuser_pkey = \
                    HF.substitute_referenced_user(json_string_dict,
                                                  self.origin,
                                                  self.log)
            post_reaction_record.reaction_type = lbsn.PostReaction.SHARE
            retweet_post = json_string_dict.get('retweeted_status')
            ref_post_record = self.extract_post(retweet_post, refuser_pkey)
        elif json_string_dict.get('in_reply_to_status_id_str'):
            # if reply, original tweet is not available (?)
            post_reaction_record.reaction_type = lbsn.PostReaction.COMMENT
            ref_post_record = \
                HF.new_lbsn_record_with_id(
                    lbsn.Post(),
                    json_string_dict.get('in_reply_to_status_id_str'),
                    self.origin)
            ref_user_record = \
                HF.new_lbsn_record_with_id(
                    lbsn.User(),
                    json_string_dict.get('in_reply_to_user_id_str'),
                    self.origin)
            ref_user_record.user_name = json_string_dict.get(
                'in_reply_to_screen_name')  # Needs to be saved
            self.lbsn_records.append(ref_user_record)
            ref_post_record.user_pkey.CopyFrom(ref_user_record.pkey)
        # add referenced post pkey to reaction
        # BUGFIX: guard against ref_post_record being None (extract_post
        # may return None, or no reaction kind matched) — previously
        # this crashed on .pkey access
        if ref_post_record is not None and \
                not self.disable_reaction_post_referencing:
            post_reaction_record.referencedPost_pkey.CopyFrom(
                ref_post_record.pkey)
        # ToDo: if a Reaction refers to another reaction (Information
        # Spread): this information is currently not available from
        # Twitter ("retweets of retweets do not show representations of
        # the intermediary retweet"); would be added to
        # postReactionRecord.referencedPostReaction_pkey
        if ref_post_record:
            self.lbsn_records.append(ref_post_record)
        # add postReactionRecord to Dict
        self.lbsn_records.append(post_reaction_record)
    else:
        # otherwise add post to self.lbsn_records, which already
        # includes all other entries (lbsn.User, lbsn.City, lbsn.Place etc.)
        self.lbsn_records.append(post_record)
def extract_post(
        self, json_string_dict: Dict[str, Any],
        place_record: lbsn.Place = None):
    """Extract lbsn.Post from an Instagram json dict.

    Args:
        json_string_dict: Instagram media node (GraphQL-style keys)
        place_record: optional lbsn.Place the post is attached to;
            supplies geoaccuracy, place key and coordinates

    Returns:
        None if the post has no guid, is skipped by geo filters,
        or after successfully appending the record to self.lbsn_records.
    """
    post_guid = json_string_dict.get('id')
    if not HF.check_notice_empty_post_guid(post_guid):
        return None
    post_record = HF.new_lbsn_record_with_id(lbsn.Post(),
                                             post_guid,
                                             self.origin)
    user_record = None
    user_info = json_string_dict.get('owner')
    if user_info:
        # Get Post/Reaction Details of User
        user_record = self.extract_user(user_info)
    if user_record:
        self.lbsn_records.append(user_record)
    else:
        self.log.warning(
            f'No User record found for post: {post_guid} '
            f'(post saved without userid)..')
    # Check from upstream to update post attrs
    if place_record:
        # assign place accuracy, by default
        post_record.post_geoaccuracy = lbsn.Post.PLACE
        post_record.place_pkey.CopyFrom(place_record.pkey)
        post_record.post_latlng = place_record.geom_center
    # BUGFIX: removed `post_record.post_geoaccuracy = None` in the
    # else-branch — assigning None to a protobuf scalar enum field
    # raises TypeError; the field now simply keeps its proto default
    # if still no geoinformation, send post to Null-Island
    if not post_record.post_latlng:
        if self.ignore_non_geotagged is True:
            return None
        self.null_island += 1
        post_record.post_latlng = "POINT(%s %s)" % (0, 0)
    if self.min_geoaccuracy:
        if not HF.geoacc_within_threshold(
                post_record.post_geoaccuracy, self.min_geoaccuracy):
            self.skipped_low_geoaccuracy += 1
            return None
    post_record.post_publish_date.CopyFrom(
        HF.json_date_timestamp_to_proto(
            json_string_dict.get('taken_at_timestamp')))
    if user_record:
        post_record.user_pkey.CopyFrom(user_record.pkey)

    def value_count(x):
        # treat missing counts as zero
        return 0 if x is None else x

    post_record.post_comment_count = value_count(
        json_string_dict.get('edge_media_to_comment').get('count'))
    post_record.post_like_count = value_count(
        json_string_dict.get('edge_liked_by').get('count'))
    post_shortcode = json_string_dict.get('shortcode')
    post_record.post_url = f'http://www.instagram.com/p/{post_shortcode}'
    if json_string_dict.get("thumbnail_src"):
        post_record.post_thumbnail_url = json_string_dict.get(
            "thumbnail_src")
    # caption lives in a nested edges[0].node.text structure
    post_caption_edge = json_string_dict.get('edge_media_to_caption')
    if post_caption_edge:
        post_caption_edge_edges = post_caption_edge.get("edges")
        if post_caption_edge_edges and not len(
                post_caption_edge_edges) == 0:
            post_caption = post_caption_edge[
                "edges"][0]["node"]["text"]
            # collapse newlines so post_body is a single line
            post_record.post_body = post_caption.replace(
                '\n', ' ').replace('\r', '')
            hashtags = HF.extract_hashtags_from_string(post_caption)
            if hashtags:
                for hashtag in hashtags:
                    post_record.hashtags.append(hashtag)
    is_video = json_string_dict.get('is_video')
    if is_video:
        post_record.post_type = lbsn.Post.VIDEO
        post_record.post_views_count = value_count(
            json_string_dict.get('video_view_count'))
    else:
        post_record.post_type = lbsn.Post.IMAGE
    post_record.emoji.extend(HF.extract_emoji(post_record.post_body))
    self.lbsn_records.append(post_record)