Code Example #1
import concurrent.futures

# apicache, pushshift, tag_util, and the _media_info_worker helper are assumed to be
# imported at module level elsewhere in the project.
def _topic_story_page_with_media(user_key, topics_id, link_id, **kwargs):
    media_lookup = {}

    include_media_metadata = ('media_metadata' in kwargs) and (kwargs['media_metadata'] is True)
    include_story_tags = ('story_tags' in kwargs) and (kwargs['story_tags'] is True)
    include_reddit_submissions = ('reddit_submissions' in kwargs) and (kwargs['reddit_submissions'] is True)

    args = kwargs.copy()   # need to make sure invalid params don't make it to API call
    optional_args = ['media_metadata', 'story_limit', 'reddit_submissions', 'story_tags', 'include_fb_date']
    for key in optional_args:
        if key in args:
            del args[key]
    story_page = apicache.topic_story_list_by_page(user_key, topics_id, link_id=link_id, **args)

    if len(story_page['stories']) > 0:  # be careful to not construct malformed query if no story ids

        # build a media lookup table in parallel so it is faster
        if include_media_metadata:
            with concurrent.futures.ProcessPoolExecutor() as executor:
                jobs = [{'user_key': user_key, 'media_id': s['media_id']} for s in story_page['stories']]
                job_results = executor.map(_media_info_worker, jobs)  # blocks until they are all done
                media_lookup = {j['media_id']: j for j in job_results}

        if include_story_tags:
            story_ids = [str(s['stories_id']) for s in story_page['stories']]
            stories_with_tags = apicache.story_list(user_key, 'stories_id:(' + " ".join(story_ids) + ")", args['limit'])

        # update story info for each story in the page, put it into the [stories] field, send updated page with
        # stories back
        for s in story_page['stories']:

            # add in media metadata to the story (from page-level cache built earlier)
            if include_media_metadata:
                media = media_lookup[s['media_id']]
                # add in media metadata items
                for k, v in media['metadata'].items():
                    s['media_{}'.format(k)] = v['label'] if v is not None else None

            # build lookup for id => story for all stories in stories with tags (non topic results)
            if include_story_tags:
                for st in stories_with_tags:
                    if s['stories_id'] == st['stories_id']:
                        s.update(st)
                        foci_names = [f['name'] for f in s['foci']]
                        s['subtopics'] = ", ".join(foci_names)
                        s['themes'] = ''
                        story_tag_ids = [t['tags_id'] for t in s['story_tags']]
                        if tag_util.NYT_LABELER_1_0_0_TAG_ID in story_tag_ids:
                            story_tag_ids = [t['tag'] for t in s['story_tags']
                                             if t['tag_sets_id'] == tag_util.NYT_LABELS_TAG_SET_ID]
                            s['themes'] = ", ".join(story_tag_ids)

        # now add in reddit share data if requested
        if include_reddit_submissions:
            story_reddit_submissions = pushshift.reddit_url_submission_counts(story_page['stories'])
            for s in story_page['stories']:
                s['reddit_submissions'] = story_reddit_submissions[s['stories_id']]

    return story_page
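
Code example #1 copies the incoming kwargs and strips the view-level switches ('media_metadata', 'story_limit', 'reddit_submissions', 'story_tags', 'include_fb_date') before forwarding everything else to the API call, so invalid parameters never reach it. Below is a minimal, self-contained sketch of that pattern; fetch_page and page_with_switches are illustrative stand-ins, not names from the project.

VIEW_ONLY_ARGS = ['media_metadata', 'story_limit', 'reddit_submissions', 'story_tags', 'include_fb_date']


def fetch_page(**api_args):
    # stand-in for apicache.topic_story_list_by_page: just echo what it received
    return {'api_args': api_args, 'stories': []}


def page_with_switches(**kwargs):
    # read the view-level switches first, then drop them so they never reach the API call
    include_media_metadata = kwargs.get('media_metadata') is True
    api_args = {k: v for k, v in kwargs.items() if k not in VIEW_ONLY_ARGS}
    return include_media_metadata, fetch_page(**api_args)


if __name__ == '__main__':
    print(page_with_switches(media_metadata=True, limit=100, sort='inlink'))
    # -> (True, {'api_args': {'limit': 100, 'sort': 'inlink'}, 'stories': []})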
Code Example #2
from multiprocessing import Pool

# topic_story_list_by_page, story_list, tag_util, MEDIA_INFO_POOL_SIZE, and the
# _media_info_worker helper are assumed to be module-level names elsewhere in the project.
def _topic_story_page_with_media(user_key, topics_id, link_id, **kwargs):
    add_media_fields = False  # switch for including all the media metadata in each row (i.e. story)

    story_page = topic_story_list_by_page(user_key,
                                          topics_id,
                                          link_id=link_id,
                                          **kwargs)

    story_ids = [str(s['stories_id']) for s in story_page['stories']]
    stories_with_tags = story_list(user_key,
                                   'stories_id:(' + " ".join(story_ids) + ")",
                                   kwargs['limit'])

    # build a media lookup table in parallel so it is faster
    if add_media_fields:
        pool = Pool(processes=MEDIA_INFO_POOL_SIZE)
        jobs = [{
            'user_key': user_key,
            'media_id': s['media_id']
        } for s in story_page['stories']]
        job_results = pool.map(_media_info_worker, jobs)  # blocks until they are all done
        media_lookup = {j['media_id']: j for j in job_results}
        pool.terminate()

    # update story info for each story in the page, put it into the [stories] field, send updated page with stories back
    for s in story_page['stories']:

        # add in media metadata to the story (from page-level cache built earlier)
        if add_media_fields:
            media = media_lookup[s['media_id']]

            # add in media metadata items
            for k, v in media['metadata'].items():
                s['media_{}'.format(k)] = v['label'] if v is not None else None

        # build lookup for id => story for all stories in stories with tags (non topic results)
        for st in stories_with_tags:

            if s['stories_id'] == st['stories_id']:
                s.update(st)

                foci_names = [f['name'] for f in s['foci']]
                s['subtopics'] = ", ".join(foci_names)

                s['themes'] = ''
                story_tag_ids = [t['tags_id'] for t in s['story_tags']]
                if tag_util.NYT_LABELER_1_0_0_TAG_ID in story_tag_ids:
                    story_tag_ids = [
                        t['tag'] for t in s['story_tags']
                        if t['tag_sets_id'] == tag_util.NYT_LABELS_TAG_SET_ID
                    ]
                    s['themes'] = ", ".join(story_tag_ids)

    return story_page  # need links too
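
Examples #2 and #3 build the media lookup table by fanning the per-story jobs out to a multiprocessing.Pool and keying the results by media_id. Below is a minimal runnable sketch of that fan-out; the toy worker and the pool-size value are placeholders for the project's _media_info_worker and MEDIA_INFO_POOL_SIZE.

from multiprocessing import Pool

MEDIA_INFO_POOL_SIZE = 4  # placeholder value


def _toy_media_info_worker(job):
    # stand-in worker: must return a dict with 'media_id' plus whatever fields the caller reads
    return {'media_id': job['media_id'], 'metadata': {}}


if __name__ == '__main__':
    jobs = [{'user_key': 'KEY', 'media_id': media_id} for media_id in (1, 2, 3)]
    with Pool(processes=MEDIA_INFO_POOL_SIZE) as pool:
        job_results = pool.map(_toy_media_info_worker, jobs)  # blocks until all jobs finish
    media_lookup = {j['media_id']: j for j in job_results}
    print(media_lookup)  # -> {1: {...}, 2: {...}, 3: {...}}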
Code Example #3
from multiprocessing import Pool

# apicache, tag_util, MEDIA_INFO_POOL_SIZE, and the _media_info_worker helper are
# assumed to be module-level names elsewhere in the project.
def _topic_story_page_with_media(user_key, topics_id, link_id, **kwargs):
    add_media_fields = False  # switch for including all the media metadata in each row (i.e. story)
    media_lookup = {}

    story_page = apicache.topic_story_list_by_page(user_key, topics_id, link_id=link_id, **kwargs)

    if len(story_page['stories']) > 0:  # be careful to not construct malformed query if no story ids

        story_ids = [str(s['stories_id']) for s in story_page['stories']]
        stories_with_tags = apicache.story_list(user_key, 'stories_id:(' + " ".join(story_ids) + ")", kwargs['limit'])

        # build a media lookup table in parallel so it is faster
        if add_media_fields:
            pool = Pool(processes=MEDIA_INFO_POOL_SIZE)
            jobs = [{'user_key': user_key, 'media_id': s['media_id']} for s in story_page['stories']]
            job_results = pool.map(_media_info_worker, jobs)  # blocks until they are all done
            media_lookup = {j['media_id']: j for j in job_results}
            pool.terminate()

        # update story info for each story in the page, put it into the [stories] field, send updated page with
        # stories back
        for s in story_page['stories']:

            # add in media metadata to the story (from page-level cache built earlier)
            if add_media_fields:
                media = media_lookup[s['media_id']]

                # add in media metadata items
                for k, v in media['metadata'].items():
                    s['media_{}'.format(k)] = v['label'] if v is not None else None

            # build lookup for id => story for all stories in stories with tags (non topic results)
            for st in stories_with_tags:

                if s['stories_id'] == st['stories_id']:
                    s.update(st)

                    foci_names = [f['name'] for f in s['foci']]
                    s['subtopics'] = ", ".join(foci_names)

                    s['themes'] = ''
                    story_tag_ids = [t['tags_id'] for t in s['story_tags']]
                    if tag_util.NYT_LABELER_1_0_0_TAG_ID in story_tag_ids:
                        story_tag_ids = [t['tag'] for t in s['story_tags']
                                         if t['tag_sets_id'] == tag_util.NYT_LABELS_TAG_SET_ID]
                        s['themes'] = ", ".join(story_tag_ids)

    return story_page  # need links too
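
All three versions derive the 'themes' field the same way: if the NYT labeler version tag is attached to a story, the labels from the NYT labels tag set are joined into a comma-separated string. Here is a standalone sketch of that step; the two constants are placeholders for tag_util.NYT_LABELER_1_0_0_TAG_ID and tag_util.NYT_LABELS_TAG_SET_ID.

NYT_LABELER_1_0_0_TAG_ID = 100  # placeholder for the real tag_util value
NYT_LABELS_TAG_SET_ID = 200     # placeholder for the real tag_util value


def themes_for_story(story):
    story_tag_ids = [t['tags_id'] for t in story['story_tags']]
    if NYT_LABELER_1_0_0_TAG_ID not in story_tag_ids:
        return ''  # story was never run through the labeler, so it has no themes
    labels = [t['tag'] for t in story['story_tags']
              if t['tag_sets_id'] == NYT_LABELS_TAG_SET_ID]
    return ", ".join(labels)


if __name__ == '__main__':
    story = {'story_tags': [
        {'tags_id': NYT_LABELER_1_0_0_TAG_ID, 'tag': 'nyt_labeler_1.0.0', 'tag_sets_id': 999},
        {'tags_id': 1, 'tag': 'economy', 'tag_sets_id': NYT_LABELS_TAG_SET_ID},
        {'tags_id': 2, 'tag': 'politics', 'tag_sets_id': NYT_LABELS_TAG_SET_ID},
    ]}
    print(themes_for_story(story))  # -> economy, politics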