def _topic_media_link_list_by_page_as_csv_row(user_mc_key, topics_id, props, **kwargs):
    # having issues with calling the apicache call.. so trying the client directly
    local_mc = user_admin_mediacloud_client(user_mc_key)
    yield u','.join(props) + u'\n'  # first send the column names
    all_media = []
    more_media = True
    link_id = 0
    params = kwargs
    params['limit'] = 1000  # an arbitrary value to let us page through with big pages
    while more_media:
        media_link_page = apicache.topic_media_link_list_by_page(TOOL_API_KEY, topics_id, link_ids=link_id, **kwargs)
        media_list = media_link_page['links']
        media_src_ids = [str(s['source_media_id']) for s in media_link_page['links']]
        media_ref_ids = [str(s['ref_media_id']) for s in media_link_page['links']]
        media_src_ids = media_src_ids + media_ref_ids
        # user_mc_key isn't working
        q = "media_id:{[61164, 4434, 18380]}"
        params['q'] = q
        # TODO - CSB this currently doesn't work the way we need it to. waiting for hal to respond
        media_info = local_mc.topicMediaList(topics_id, **params)
        for m in media_link_page['links']:
            for m_info in media_info['media']:
                if m['source_media_id'] == m_info['media_id']:
                    m['source_info'] = m_info
                if m['ref_media_id'] == m_info['media_id']:
                    m['ref_info'] = m_info
        if 'next' in media_link_page['link_ids']:
            link_id = media_link_page['link_ids']['next']
        else:
            more_media = False
        for s in media_link_page['links']:
            cleaned_source_info = csv.dict2row(TOPIC_MEDIA_CSV_PROPS, s['source_info'])
            cleaned_ref_info = csv.dict2row(TOPIC_MEDIA_CSV_PROPS, s['ref_info'])
            row_string = u','.join(cleaned_source_info) + ',' + u','.join(cleaned_ref_info) + u'\n'
            yield row_string
def _story_list_by_page_as_csv_row(api_key, q, fq, stories_per_page, sort, page_limit, props):
    yield ','.join(props) + '\n'  # first send the column names
    for page in _story_list_by_page(api_key, q, fq, stories_per_page, sort, page_limit):
        for story in page:
            cleaned_row = csv.dict2row(props, story)
            row_string = ','.join(cleaned_row) + '\n'
            yield row_string
def _story_list_by_page_as_csv_row(q, fq, stories_per_page, sort, page_limit, props):
    yield ','.join(props) + '\n'  # first send the column names
    for page in _story_list_by_page(q, fq, stories_per_page, sort, page_limit):
        for story in page:
            cleaned_row = csv.dict2row(props, story)
            row_string = ','.join(cleaned_row) + '\n'
            yield row_string
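# A minimal sketch of the _story_list_by_page pager that the two variants above rely on,
# written against the api_key-taking signature. The user_mediacloud_client factory, the
# storyList call, and paging on last_processed_stories_id follow the public mediacloud
# Python client, but treat the details as assumptions rather than this project's code.
def _story_list_by_page(api_key, q, fq, stories_per_page, sort, page_limit):
    mc = user_mediacloud_client(api_key)  # assumed per-user client factory
    last_processed_id = 0
    pages_fetched = 0
    while (page_limit is None) or (pages_fetched < page_limit):
        stories = mc.storyList(q, fq, last_processed_stories_id=last_processed_id,
                               rows=stories_per_page, sort=sort)
        if len(stories) == 0:
            break  # no more results to page through
        yield stories
        last_processed_id = stories[-1]['processed_stories_id']
        pages_fetched += 1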
def _topic_story_link_list_by_page_as_csv_row(user_key, topics_id, props, **kwargs):
    local_mc = user_admin_mediacloud_client(user_key)
    yield u','.join(props) + u'\n'  # first send the column names
    link_id = 0
    more_pages = True
    while more_pages:
        story_link_page = topic_story_link_list_by_page(user_key, topics_id, link_ids=link_id, **kwargs)
        # get all source and ref story ids and fetch them with topicStoryList
        story_src_ids = [str(s['source_stories_id']) for s in story_link_page['links']]
        story_ref_ids = [str(s['ref_stories_id']) for s in story_link_page['links']]
        story_src_ids = story_src_ids + story_ref_ids
        # TODO there is a cached topic story list... but paging is different...
        stories_info_list = local_mc.topicStoryList(topics_id, stories_id=story_src_ids)
        if 'next' in story_link_page['link_ids']:
            link_id = story_link_page['link_ids']['next']
        else:
            more_pages = False
        for s in stories_info_list['stories']:
            cleaned_row = csv.dict2row(props, s)
            row_string = u','.join(cleaned_row) + u'\n'
            yield row_string
def _topic_media_link_list_by_page_as_csv_row(user_mc_key, topics_id, props, **kwargs):
    yield ','.join(props) + '\n'  # first send the column names
    more_media = True
    link_id = 0
    basic_media_props = ['media_id', 'name', 'url']
    while more_media:
        # fetch one page of data
        media_link_page = apicache.topic_media_link_list_by_page(TOOL_API_KEY, topics_id, link_id, **kwargs)
        # get the media info for all the media sources
        media_src_ids = [str(s['source_media_id']) for s in media_link_page['links']]
        media_ref_ids = [str(s['ref_media_id']) for s in media_link_page['links']]
        media_src_ids = set(media_src_ids + media_ref_ids)  # make it distinct
        # TODO: PARALLELIZE - can't use executor here, because we are out of the context?
        media_lookup = {int(mid): _media_info_worker({'key': user_mc_key, 'media_id': mid})
                        for mid in media_src_ids}
        # connect the link data to the media info data
        for link_pair in media_link_page['links']:
            link_pair['source_info'] = media_lookup[int(link_pair['source_media_id'])]
            link_pair['ref_info'] = media_lookup[int(link_pair['ref_media_id'])]
        # stream this page's results to the client
        for s in media_link_page['links']:
            cleaned_source_info = csv.dict2row(basic_media_props, s['source_info'])
            cleaned_ref_info = csv.dict2row(basic_media_props, s['ref_info'])
            row_string = ','.join(cleaned_source_info) + ',' + ','.join(cleaned_ref_info) + '\n'
            yield row_string
        # set up to grab the next page
        if 'next' in media_link_page['link_ids']:
            link_id = media_link_page['link_ids']['next']
        else:
            more_media = False
def _topic_media_link_list_by_page_as_csv_row(user_mc_key, topics_id, props, **kwargs):
    yield ','.join(props) + '\n'  # first send the column names
    more_media = True
    use_pool = True
    link_id = 0
    basic_media_props = ['media_id', 'name', 'url']
    while more_media:
        # fetch one page of data
        media_link_page = apicache.topic_media_link_list_by_page(TOOL_API_KEY, topics_id, link_id, **kwargs)
        # get the media info for all the media sources
        media_src_ids = [str(s['source_media_id']) for s in media_link_page['links']]
        media_ref_ids = [str(s['ref_media_id']) for s in media_link_page['links']]
        media_src_ids = set(media_src_ids + media_ref_ids)  # make it distinct
        if use_pool:
            # for editing users, add in last scrape and active feed count (if requested)
            jobs = [{'key': user_mc_key, 'media_id': mid} for mid in media_src_ids]
            pool = Pool(processes=15)
            page_media_info = pool.map(_media_info_worker, jobs)  # blocks until they are all done
            pool.terminate()  # clean up the worker processes before the next page
            media_lookup = {int(m['media_id']): m for m in page_media_info}
        else:
            media_lookup = {int(mid): _media_info_worker({'key': user_mc_key, 'media_id': mid})
                            for mid in media_src_ids}
        # connect the link data to the media info data
        for link_pair in media_link_page['links']:
            link_pair['source_info'] = media_lookup[int(link_pair['source_media_id'])]
            link_pair['ref_info'] = media_lookup[int(link_pair['ref_media_id'])]
        # stream this page's results to the client
        for s in media_link_page['links']:
            cleaned_source_info = csv.dict2row(basic_media_props, s['source_info'])
            cleaned_ref_info = csv.dict2row(basic_media_props, s['ref_info'])
            row_string = ','.join(cleaned_source_info) + ',' + ','.join(cleaned_ref_info) + '\n'
            yield row_string
        # set up to grab the next page
        if 'next' in media_link_page['link_ids']:
            link_id = media_link_page['link_ids']['next']
        else:
            more_media = False
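# A minimal sketch of the _media_info_worker referenced above, which is not shown in this
# file. The user_mediacloud_client factory and the mc.media() lookup are assumptions about
# how the per-media fetch might be done; the real worker likely pulls additional fields
# (last scrape date, active feed count) when requested.
def _media_info_worker(job):
    # job is a dict like {'key': <api key>, 'media_id': <id>}, so it can be pool.map-ed
    user_mc = user_mediacloud_client(job['key'])
    return user_mc.media(job['media_id'])  # the media source's metadata dict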
def _topic_story_link_list_by_page_as_csv_row(user_key, topics_id, props, **kwargs):
    local_mc = user_admin_mediacloud_client(user_key)
    spec_props = [
        'source_stories_id', 'source_publish_date', 'source_title', 'source_url', 'source_language',
        'source_ap_syndicated', 'source_inlink_count', 'source_outlink_count',
        'ref_stories_id', 'ref_publish_date', 'ref_title', 'ref_url', 'ref_language',
        'ref_ap_syndicated', 'ref_inlink_count', 'ref_outlink_count'
        # 'media_pub_country', 'media_pub_state', 'media_language', 'media_about_country', 'media_media_type'
    ]
    yield u','.join(spec_props) + u'\n'  # first send the column names
    link_id = 0
    more_pages = True
    while more_pages:
        story_link_page = topic_story_link_list_by_page(user_key, topics_id, link_ids=link_id, **kwargs)
        story_src_ids = [str(s['source_stories_id']) for s in story_link_page['links']]
        story_ref_ids = [str(s['ref_stories_id']) for s in story_link_page['links']]
        story_src_ids = story_src_ids + story_ref_ids
        stories_info_list = local_mc.topicStoryList(topics_id, stories_id=story_src_ids)
        for s in story_link_page['links']:
            for s_info in stories_info_list['stories']:
                if s['source_stories_id'] == s_info['stories_id']:
                    s['source_info'] = s_info
                if s['ref_stories_id'] == s_info['stories_id']:
                    s['ref_info'] = s_info
        if 'next' in story_link_page['link_ids']:
            link_id = story_link_page['link_ids']['next']
        else:
            more_pages = False
        for s in story_link_page['links']:
            cleaned_source_info = csv.dict2row(props, s['source_info'])
            cleaned_ref_info = csv.dict2row(props, s['ref_info'])
            row_string = u','.join(cleaned_source_info) + ',' + u','.join(cleaned_ref_info) + u'\n'
            yield row_string
def _stream_media_by_page(user_mc_key, topics_id, props, **kwargs):
    yield ','.join(props) + '\n'  # first send the column names
    more_media = True
    while more_media:
        page = apicache.topic_media_list_page(user_mc_key, topics_id, **kwargs)
        media = page['media']
        for m in media:
            row = csv.dict2row(props, m)
            row_string = ','.join(row) + '\n'
            yield row_string
        if 'next' in page['link_ids']:
            kwargs['link_id'] = page['link_ids']['next']
            more_media = True
        else:
            more_media = False
def _topic_story_list_by_page_as_csv_row(user_key, topics_id, props, **kwargs):
    yield u','.join(props) + u'\n'  # first send the column names
    link_id = 0
    more_pages = True
    while more_pages:
        page = _topic_story_page_with_media(user_key, topics_id, link_id, **kwargs)
        if 'next' in page['link_ids']:
            link_id = page['link_ids']['next']
        else:
            more_pages = False
        for s in page['stories']:
            cleaned_row = csv.dict2row(props, s)
            row_string = u','.join(cleaned_row) + u'\n'
            yield row_string
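# A minimal sketch of the _topic_story_page_with_media helper used by the story-list
# generators above. The apicache.topic_story_list_by_page call and the idea of merging
# media metadata onto each story are assumptions about what "with media" means here;
# the real helper likely attaches more media fields than shown.
def _topic_story_page_with_media(user_key, topics_id, link_id, **kwargs):
    story_page = apicache.topic_story_list_by_page(user_key, topics_id, link_id=link_id, **kwargs)
    user_mc = user_mediacloud_client(user_key)
    media_lookup = {}  # fetch each media source at most once per page
    for s in story_page['stories']:
        if s['media_id'] not in media_lookup:
            media_lookup[s['media_id']] = user_mc.media(s['media_id'])
        s['media_name'] = media_lookup[s['media_id']]['name']
        s['media_url'] = media_lookup[s['media_id']]['url']
    return story_page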
def _topic_story_link_list_by_page_as_csv_row(user_key, topics_id, props, **kwargs):
    local_mc = user_admin_mediacloud_client(user_key)
    spec_props = [
        'source_stories_id', 'source_publish_date', 'source_title', 'source_url', 'source_language',
        'source_ap_syndicated', 'source_inlink_count', 'source_outlink_count',
        'ref_stories_id', 'ref_publish_date', 'ref_title', 'ref_url', 'ref_language',
        'ref_ap_syndicated', 'ref_inlink_count', 'ref_outlink_count'
        # 'media_pub_country', 'media_pub_state', 'media_language', 'media_about_country', 'media_media_type'
    ]
    yield ','.join(spec_props) + '\n'  # first send the column names
    link_id = 0
    more_pages = True
    while more_pages:
        story_link_page = apicache.topic_story_link_list_by_page(user_key, topics_id, link_id=link_id, **kwargs)
        story_src_ids = [str(s['source_stories_id']) for s in story_link_page['links']]
        story_ref_ids = [str(s['ref_stories_id']) for s in story_link_page['links']]
        story_src_ids = story_src_ids + story_ref_ids
        stories_info_list = local_mc.topicStoryList(topics_id, stories_id=story_src_ids)
        for s in story_link_page['links']:
            for s_info in stories_info_list['stories']:
                if s['source_stories_id'] == s_info['stories_id']:
                    s['source_info'] = s_info
                if s['ref_stories_id'] == s_info['stories_id']:
                    s['ref_info'] = s_info
        if 'next' in story_link_page['link_ids']:
            link_id = story_link_page['link_ids']['next']
        else:
            more_pages = False
        for s in story_link_page['links']:
            cleaned_source_info = csv.dict2row(props, s['source_info'])
            cleaned_ref_info = csv.dict2row(props, s['ref_info'])
            row_string = ','.join(cleaned_source_info) + ',' + ','.join(cleaned_ref_info) + '\n'
            yield row_string
def _topic_story_list_by_page_as_csv_row(user_key, topics_id, props, **kwargs):
    yield u','.join(props) + u'\n'  # first send the column names
    link_id = 0
    more_pages = True
    while more_pages:
        page = _topic_story_page_with_media(user_key, topics_id, link_id, **kwargs)
        if 'next' in page['link_ids']:
            link_id = page['link_ids']['next']
        else:
            more_pages = False
        for s in page['stories']:
            # first, filter foci down to just the readable names
            s['subtopics'] = [u"{}: {}".format(f['focal_set_name'], f['name']) for f in s['foci']]
            cleaned_row = csv.dict2row(props, s)
            row_string = u','.join(cleaned_row) + u'\n'
            yield row_string
def _topic_story_list_by_page_as_csv_row(user_key, topics_id, props, **kwargs):
    yield ','.join(props) + '\n'  # first send the column names
    link_id = 0
    more_pages = True
    while more_pages:
        page = _topic_story_page_with_media(user_key, topics_id, link_id, **kwargs)
        if 'next' in page['link_ids']:
            link_id = page['link_ids']['next']
        else:
            more_pages = False
        for s in page['stories']:
            # first, filter foci down to just the readable names
            s['subtopics'] = ["{}: {}".format(f['focal_set_name'], f['name']) for f in s['foci']]
            cleaned_row = csv.dict2row(props, s)
            row_string = ','.join(cleaned_row) + '\n'
            yield row_string
def _topic_story_list_by_page_as_csv_row(user_key, topics_id, props, **kwargs):
    yield ','.join(props) + '\n'  # first send the column names
    include_all_url_shares = kwargs['include_all_url_shares'] if 'include_all_url_shares' in kwargs else False
    story_count = 0
    link_id = 0
    more_pages = True
    yet_to_hit_story_limit = True
    has_story_limit = ('story_limit' in kwargs) and (kwargs['story_limit'] is not None)
    # page through the story list results, until we run out or we hit the user's desired limit
    while more_pages and ((not has_story_limit) or (has_story_limit and yet_to_hit_story_limit)):
        page = _topic_story_page_with_media(user_key, topics_id, link_id, **kwargs)
        if 'next' in page['link_ids']:
            link_id = page['link_ids']['next']
        else:
            more_pages = False
        for s in page['stories']:
            if include_all_url_shares:
                topic_seed_queries = kwargs['topic_seed_queries']
                # add in each header col
                for item in s['url_sharing_counts']:
                    seed_query = [tsq for tsq in topic_seed_queries
                                  if tsq['topic_seed_queries_id'] == item['topic_seed_queries_id']][0]
                    prefix = platform_csv_column_header_prefix(seed_query)
                    s[prefix + "post_count"] = item['post_count']
                    s[prefix + "channel_count"] = item['channel_count']
                    s[prefix + "author_count"] = item['author_count']
            # first, filter foci down to just the readable names
            s['subtopics'] = ["{}: {}".format(f['focal_set_name'], f['name']) for f in s['foci']]
            cleaned_row = csv.dict2row(props, s)
            row_string = ','.join(cleaned_row) + '\n'
            yield row_string
        story_count += len(page['stories'])
        yet_to_hit_story_limit = has_story_limit and (story_count < int(kwargs['story_limit']))
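# A minimal sketch of what platform_csv_column_header_prefix (used above) might return: a
# distinct column prefix per seed query so each platform's sharing counts get their own
# CSV columns. The 'platform' and 'source' keys are assumptions about the seed-query dict,
# not a confirmed part of this project's schema.
def platform_csv_column_header_prefix(topic_seed_query):
    return "{}_{}_".format(topic_seed_query['platform'], topic_seed_query['source'])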
def _topic_story_list_by_page_as_csv_row(user_key, topics_id, props, **kwargs):
    yield ','.join(props) + '\n'  # first send the column names
    story_count = 0
    link_id = 0
    more_pages = True
    yet_to_hit_story_limit = True
    has_story_limit = ('story_limit' in kwargs) and (kwargs['story_limit'] is not None)
    while more_pages and ((not has_story_limit) or (has_story_limit and yet_to_hit_story_limit)):
        page = _topic_story_page_with_media(user_key, topics_id, link_id, **kwargs)
        if 'next' in page['link_ids']:
            link_id = page['link_ids']['next']
        else:
            more_pages = False
        for s in page['stories']:
            # first, filter foci down to just the readable names
            s['subtopics'] = ["{}: {}".format(f['focal_set_name'], f['name']) for f in s['foci']]
            cleaned_row = csv.dict2row(props, s)
            row_string = ','.join(cleaned_row) + '\n'
            yield row_string
        story_count += len(page['stories'])
        yet_to_hit_story_limit = has_story_limit and (story_count < int(kwargs['story_limit']))
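# A minimal usage sketch, assuming these generators sit behind a Flask endpoint. The
# stream_story_list_csv wrapper and the filename format are illustrative assumptions; the
# point is that the generator can be handed straight to Response so rows stream to the
# client as each page is fetched, instead of buffering the whole CSV in memory.
from flask import Response

def stream_story_list_csv(user_key, topics_id, props, **kwargs):
    rows = _topic_story_list_by_page_as_csv_row(user_key, topics_id, props, **kwargs)
    headers = {'Content-Disposition': 'attachment; filename=topic-{}-stories.csv'.format(topics_id)}
    return Response(rows, mimetype='text/csv', headers=headers)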