import concurrent.futures

# `apicache`, `pushshift`, and `tag_util` are project-local modules this function
# depends on; they are assumed to be imported at the top of this package.


def _topic_story_page_with_media(user_key, topics_id, link_id, **kwargs):
    media_lookup = {}
    include_media_metadata = ('media_metadata' in kwargs) and (kwargs['media_metadata'] is True)
    include_story_tags = ('story_tags' in kwargs) and (kwargs['story_tags'] is True)
    include_reddit_submissions = ('reddit_submissions' in kwargs) and (kwargs['reddit_submissions'] is True)
    args = kwargs.copy()  # need to make sure invalid params don't make it to API call
    optional_args = ['media_metadata', 'story_limit', 'reddit_submissions', 'story_tags', 'include_fb_date']
    for key in optional_args:
        if key in args:
            del args[key]
    story_page = apicache.topic_story_list_by_page(user_key, topics_id, link_id=link_id, **args)
    if len(story_page['stories']) > 0:  # be careful to not construct malformed query if no story ids
        # build a media lookup table in parallel so it is faster
        if include_media_metadata:
            with concurrent.futures.ProcessPoolExecutor() as executor:
                jobs = [{'user_key': user_key, 'media_id': s['media_id']} for s in story_page['stories']]
                job_results = executor.map(_media_info_worker, jobs)  # blocks until they are all done
                media_lookup = {j['media_id']: j for j in job_results}
        if include_story_tags:
            story_ids = [str(s['stories_id']) for s in story_page['stories']]
            stories_with_tags = apicache.story_list(user_key, 'stories_id:(' + " ".join(story_ids) + ")",
                                                    args['limit'])
        # update story info for each story in the page, put it into the [stories] field, send updated page with
        # stories back
        for s in story_page['stories']:
            # add in media metadata to the story (from page-level cache built earlier)
            if include_media_metadata:
                media = media_lookup[s['media_id']]
                # add in media metadata items
                for k, v in media['metadata'].items():
                    s['media_{}'.format(k)] = v['label'] if v is not None else None
            # add in tag-based fields for any story that also appears in the non-topic results
            if include_story_tags:
                for st in stories_with_tags:
                    if s['stories_id'] == st['stories_id']:
                        s.update(st)
                        foci_names = [f['name'] for f in s['foci']]
                        s['subtopics'] = ", ".join(foci_names)
                        s['themes'] = ''
                        story_tag_ids = [t['tags_id'] for t in s['story_tags']]
                        if tag_util.NYT_LABELER_1_0_0_TAG_ID in story_tag_ids:
                            story_tag_ids = [t['tag'] for t in s['story_tags']
                                             if t['tag_sets_id'] == tag_util.NYT_LABELS_TAG_SET_ID]
                            s['themes'] = ", ".join(story_tag_ids)
        # now add in reddit share data if requested
        if include_reddit_submissions:
            story_reddit_submissions = pushshift.reddit_url_submission_counts(story_page['stories'])
            for s in story_page['stories']:
                s['reddit_submissions'] = story_reddit_submissions[s['stories_id']]
    return story_page
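
# `_media_info_worker` is mapped over the job dicts above but is not defined in
# this file. A minimal sketch of its presumed shape, inferred from how
# `media_lookup` is built: it must return a dict containing at least 'media_id'
# and a 'metadata' dict. The `apicache.media(user_key, media_id)` call is an
# assumption, not confirmed by this file.
def _media_info_worker(job):
    return apicache.media(job['user_key'], job['media_id'])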
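
# Hypothetical usage sketch: fetch one page of topic stories with tag-derived
# fields and Reddit submission counts attached. The key, topic id, and limit
# are placeholder values; link_id=None requests the first page. Note that
# story_tags=True requires a `limit` kwarg, since it is read as args['limit'].
if __name__ == '__main__':
    page = _topic_story_page_with_media(
        'MY_API_KEY', 1234, None,
        limit=100,                 # forwarded to the underlying story-list calls
        story_tags=True,           # adds 'subtopics' and NYT 'themes' per story
        reddit_submissions=True,   # adds per-story Reddit submission counts
    )
    print(len(page['stories']))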