Example #1
def _topic_media_link_list_by_page_as_csv_row(user_mc_key, topics_id, props,
                                              **kwargs):
    # having issues calling through apicache, so call the client directly
    local_mc = user_admin_mediacloud_client(user_mc_key)
    yield u','.join(props) + u'\n'  # first send the column names
    all_media = []
    more_media = True
    link_id = 0
    params = kwargs
    params['limit'] = 1000  # an arbitrary value to let us page through with big pages
    while more_media:
        media_link_page = apicache.topic_media_link_list_by_page(
            TOOL_API_KEY, topics_id, link_ids=link_id, **kwargs)
        media_list = media_link_page['links']

        media_src_ids = [
            str(s['source_media_id']) for s in media_link_page['links']
        ]
        media_ref_ids = [
            str(s['ref_media_id']) for s in media_link_page['links']
        ]
        media_src_ids = media_src_ids + media_ref_ids
        # user_mc_key isn't working

        for m in media_link_page['links']:
            q = "media_id:{[61164, 4434, 18380]}"
            params['q'] = q
            # TODO (CSB): this currently doesn't work the way we need it to; waiting for hal to respond
            media_info = local_mc.topicMediaList(topics_id, **params)
            for m_info in media_info['media']:
                if m['source_media_id'] == m_info['media_id']:
                    m['source_info'] = m_info
                if m['ref_media_id'] == m_info['media_id']:
                    m['ref_info'] = m_info

        if 'next' in media_link_page['link_ids']:
            link_id = media_link_page['link_ids']['next']
        else:
            more_media = False
            for s in media_link_page['links']:
                cleaned_source_info = csv.dict2row(TOPIC_MEDIA_CSV_PROPS,
                                                   s['source_info'])
                cleaned_ref_info = csv.dict2row(TOPIC_MEDIA_CSV_PROPS,
                                                s['ref_info'])
                row_string = u','.join(cleaned_source_info) + ',' + u','.join(
                    cleaned_ref_info) + u'\n'
                yield row_string
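
Every example in this listing leans on a project-local csv.dict2row helper that isn't reproduced here. As a rough, hypothetical sketch only (the real helper almost certainly handles more edge cases), it presumably picks the requested props out of a result dict, in order, and returns them as stringified CSV cells:

def dict2row(props, item):
    # hypothetical sketch of the csv.dict2row helper used throughout these examples
    cells = []
    for prop in props:
        value = item.get(prop, '')                  # missing keys become empty cells
        text = '' if value is None else str(value)  # assumed: every value is stringified
        if ',' in text or '\n' in text or '"' in text:
            text = '"' + text.replace('"', '""') + '"'  # assumed CSV quoting
        cells.append(text)
    return cells

The callers then simply ','.join() the returned list to build one CSV row.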
Example #2
def _story_list_by_page_as_csv_row(api_key, q, fq, stories_per_page, sort, page_limit, props):
    yield ','.join(props) + '\n'  # first send the column names
    for page in _story_list_by_page(api_key, q, fq, stories_per_page, sort, page_limit):
        for story in page:
            cleaned_row = csv.dict2row(props, story)
            row_string = ','.join(cleaned_row) + '\n'
            yield row_string
def _story_list_by_page_as_csv_row(q, fq, stories_per_page, sort, page_limit, props):
    yield ','.join(props) + '\n'  # first send the column names
    for page in _story_list_by_page(q, fq, stories_per_page, sort, page_limit):
        for story in page:
            cleaned_row = csv.dict2row(props, story)
            row_string = ','.join(cleaned_row) + '\n'
            yield row_string
def _topic_story_link_list_by_page_as_csv_row(user_key, topics_id, props,
                                              **kwargs):
    local_mc = user_admin_mediacloud_client(user_key)
    yield u','.join(props) + u'\n'  # first send the column names
    link_id = 0
    more_pages = True
    while more_pages:
        story_link_page = topic_story_link_list_by_page(user_key,
                                                        topics_id,
                                                        link_ids=link_id,
                                                        **kwargs)

        story_src_ids = [
            str(s['source_stories_id']) for s in story_link_page['links']
        ]
        story_ref_ids = [
            str(s['ref_stories_id']) for s in story_link_page['links']
        ]
        story_src_ids = story_src_ids + story_ref_ids

        # TODO there is a cached topic story list... but paging is different...
        storiesInfoList = local_mc.topicStoryList(topics_id,
                                                  stories_id=story_src_ids)
        # get all source and ref story link ids and fetch them with topicStoryList

        if 'next' in story_link_page['link_ids']:
            link_id = story_link_page['link_ids']['next']
        else:
            more_pages = False
        for s in storiesInfoList['stories']:
            cleaned_row = csv.dict2row(props, s)
            row_string = u','.join(cleaned_row) + u'\n'
            yield row_string
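
All of these generators share the same cursor-based paging protocol: start with link_id = 0, fetch a page, and keep going while the page's link_ids dict contains a 'next' cursor. Stripped of the CSV details, the shared loop looks roughly like this (fetch_page is a stand-in for whichever apicache or client call a given example uses):

def page_through(fetch_page, topics_id, **kwargs):
    # generic sketch of the paging loop the examples above and below repeat
    link_id = 0
    more_pages = True
    while more_pages:
        page = fetch_page(topics_id, link_ids=link_id, **kwargs)
        yield page
        if 'next' in page['link_ids']:
            link_id = page['link_ids']['next']  # advance the cursor to the next page
        else:
            more_pages = False                  # no cursor means this was the last page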
Example #5
def _topic_media_link_list_by_page_as_csv_row(user_mc_key, topics_id, props,
                                              **kwargs):
    yield ','.join(props) + '\n'  # first send the column names
    more_media = True
    link_id = 0
    basic_media_props = ['media_id', 'name', 'url']
    while more_media:
        # fetch one page of data
        media_link_page = apicache.topic_media_link_list_by_page(
            TOOL_API_KEY, topics_id, link_id, **kwargs)
        # get the media info for all the media sources
        media_src_ids = [
            str(s['source_media_id']) for s in media_link_page['links']
        ]
        media_ref_ids = [
            str(s['ref_media_id']) for s in media_link_page['links']
        ]
        media_src_ids = set(media_src_ids + media_ref_ids)  # make it distinct
        # TODO: PARALLELIZE - can't use executor here, because we are out of the context?
        media_lookup = {
            int(mid): _media_info_worker({
                'key': user_mc_key,
                'media_id': mid
            })
            for mid in media_src_ids
        }
        # connect the link data to the media info data
        for link_pair in media_link_page['links']:
            link_pair['source_info'] = media_lookup[int(
                link_pair['source_media_id'])]
            link_pair['ref_info'] = media_lookup[int(
                link_pair['ref_media_id'])]
        # stream this page's results to the client
        for s in media_link_page['links']:
            cleaned_source_info = csv.dict2row(basic_media_props,
                                               s['source_info'])
            cleaned_ref_info = csv.dict2row(basic_media_props, s['ref_info'])
            row_string = ','.join(cleaned_source_info) + ',' + ','.join(
                cleaned_ref_info) + '\n'
            yield row_string
        # set up to grab the next page
        if 'next' in media_link_page['link_ids']:
            link_id = media_link_page['link_ids']['next']
        else:
            more_media = False
Example #6
def _topic_media_link_list_by_page_as_csv_row(user_mc_key, topics_id, props, **kwargs):
    yield ','.join(props) + '\n'  # first send the column names
    more_media = True
    use_pool = True
    link_id = 0
    basic_media_props = ['media_id', 'name', 'url']
    while more_media:
        # fetch one page of data
        media_link_page = apicache.topic_media_link_list_by_page(TOOL_API_KEY, topics_id, link_id, **kwargs)
        # get the media info for all the media sources
        media_src_ids = [str(s['source_media_id']) for s in media_link_page['links']]
        media_ref_ids = [str(s['ref_media_id']) for s in media_link_page['links']]
        media_src_ids = set(media_src_ids + media_ref_ids)  # make it distinct
        if use_pool:
            # for editing users, add in last scrape and active feed count (if requested)
            jobs = [{'key': user_mc_key, 'media_id': mid} for mid in media_src_ids]
            pool = Pool(processes=15)
            page_media_info = pool.map(_media_info_worker, jobs)  # blocks until they are all done
            media_lookup = {int(m['media_id']): m for m in page_media_info}
        else:
            media_lookup = {int(mid): _media_info_worker({'key': user_mc_key, 'media_id': mid})
                            for mid in media_src_ids}
        # connect the link data to the media info data
        for link_pair in media_link_page['links']:
            link_pair['source_info'] = media_lookup[int(link_pair['source_media_id'])]
            link_pair['ref_info'] = media_lookup[int(link_pair['ref_media_id'])]
        # stream this page's results to the client
        for s in media_link_page['links']:
            cleaned_source_info = csv.dict2row(basic_media_props, s['source_info'])
            cleaned_ref_info = csv.dict2row(basic_media_props, s['ref_info'])
            row_string = ','.join(cleaned_source_info) + ',' + ','.join(cleaned_ref_info) + '\n'
            yield row_string
        # set up to grab the next page
        if 'next' in media_link_page['link_ids']:
            link_id = media_link_page['link_ids']['next']
        else:
            more_media = False
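
Examples #5 and #6 both delegate the per-source lookup to a _media_info_worker helper that isn't shown; Example #6 simply fans those lookups out over a multiprocessing Pool instead of running them one at a time. A plausible sketch of the worker, assuming the same user_admin_mediacloud_client factory seen in the other examples and the standard mediacloud client's single-media lookup, might be:

def _media_info_worker(job):
    # hypothetical sketch: 'key' is the caller's API key, 'media_id' the source to fetch
    user_mc = user_admin_mediacloud_client(job['key'])
    return user_mc.media(job['media_id'])  # assumed to return one media source's info dict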
Example #7
def _topic_story_link_list_by_page_as_csv_row(user_key, topics_id, props, **kwargs):
    local_mc = user_admin_mediacloud_client(user_key)
    spec_props = [
        'source_stories_id', 'source_publish_date', 'source_title', 'source_url', 'source_language', 'source_ap_syndicated',
        'source_inlink_count', 'source_outlink_count', 'ref_stories_id', 'ref_publish_date', 'ref_title', 'ref_url', 'ref_language',
        'ref_ap_syndicated', 'ref_inlink_count', 'ref_outlink_count'
        # 'media_pub_country', 'media_pub_state', 'media_language', 'media_about_country', 'media_media_type'
    ]
    yield u','.join(spec_props) + u'\n'  # first send the column names
    link_id = 0
    more_pages = True
    while more_pages:
        story_link_page = topic_story_link_list_by_page(user_key, topics_id, link_ids=link_id, **kwargs)

        story_src_ids = [str(s['source_stories_id']) for s in story_link_page['links']]
        story_ref_ids = [str(s['ref_stories_id']) for s in story_link_page['links']]
        story_src_ids = story_src_ids + story_ref_ids

        stories_info_list = local_mc.topicStoryList(topics_id, stories_id=story_src_ids)

        for s in story_link_page['links']:
            for s_info in stories_info_list['stories']:
                if s['source_stories_id'] == s_info['stories_id']:
                    s['source_info'] = s_info
                if s['ref_stories_id'] == s_info['stories_id']:
                    s['ref_info'] = s_info

        if 'next' in story_link_page['link_ids']:
            link_id = story_link_page['link_ids']['next']
        else:
            more_pages = False
            for s in story_link_page['links']:
                cleaned_source_info = csv.dict2row(props, s['source_info'])
                cleaned_ref_info = csv.dict2row(props, s['ref_info'])
                row_string = u','.join(cleaned_source_info) + ',' + u','.join(cleaned_ref_info) + u'\n'
                yield row_string
Example #8
def _stream_media_by_page(user_mc_key, topics_id, props, **kwargs):
    yield ','.join(props) + '\n'  # first send the column names
    more_media = True
    while more_media:
        page = apicache.topic_media_list_page(user_mc_key, topics_id, **kwargs)
        media = page['media']
        for m in media:
            row = csv.dict2row(props, m)
            row_string = ','.join(row) + '\n'
            yield row_string
        if 'next' in page['link_ids']:
            kwargs['link_id'] = page['link_ids']['next']
            more_media = True
        else:
            more_media = False
def _topic_story_list_by_page_as_csv_row(user_key, topics_id, props, **kwargs):
    yield u','.join(props) + u'\n'  # first send the column names
    link_id = 0
    more_pages = True
    while more_pages:
        page = _topic_story_page_with_media(user_key, topics_id, link_id,
                                            **kwargs)
        if 'next' in page['link_ids']:
            link_id = page['link_ids']['next']
        else:
            more_pages = False
        for s in page['stories']:
            cleaned_row = csv.dict2row(props, s)
            row_string = u','.join(cleaned_row) + u'\n'
            yield row_string
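
The second function in Example #8, and Examples #11 through #14, all build on a _topic_story_page_with_media helper that isn't reproduced in this listing. Its exact behavior is unknown here; as a heavily hedged guess at its shape, it fetches one page of topic stories for the given cursor and fills in the fields the callers expect:

def _topic_story_page_with_media(user_key, topics_id, link_id, **kwargs):
    # hypothetical sketch only; the real helper likely merges in much more media metadata
    local_mc = user_admin_mediacloud_client(user_key)
    story_page = local_mc.topicStoryList(topics_id, link_id=link_id, **kwargs)
    for story in story_page['stories']:
        story.setdefault('foci', [])  # callers above iterate story['foci']
    return story_page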
Example #10
def _topic_story_link_list_by_page_as_csv_row(user_key, topics_id, props, **kwargs):
    local_mc = user_admin_mediacloud_client(user_key)
    spec_props = [
        'source_stories_id', 'source_publish_date', 'source_title', 'source_url', 'source_language',
        'source_ap_syndicated', 'source_inlink_count', 'source_outlink_count', 'ref_stories_id', 'ref_publish_date',
        'ref_title', 'ref_url', 'ref_language', 'ref_ap_syndicated', 'ref_inlink_count', 'ref_outlink_count'
        # 'media_pub_country', 'media_pub_state', 'media_language', 'media_about_country', 'media_media_type'
    ]
    yield ','.join(spec_props) + '\n'  # first send the column names
    link_id = 0
    more_pages = True
    while more_pages:
        story_link_page = apicache.topic_story_link_list_by_page(user_key, topics_id, link_id=link_id, **kwargs)

        story_src_ids = [str(s['source_stories_id']) for s in story_link_page['links']]
        story_ref_ids = [str(s['ref_stories_id']) for s in story_link_page['links']]
        story_src_ids = story_src_ids + story_ref_ids

        stories_info_list = local_mc.topicStoryList(topics_id, stories_id=story_src_ids)

        for s in story_link_page['links']:
            for s_info in stories_info_list['stories']:
                if s['source_stories_id'] == s_info['stories_id']:
                    s['source_info'] = s_info
                if s['ref_stories_id'] == s_info['stories_id']:
                    s['ref_info'] = s_info

        if 'next' in story_link_page['link_ids']:
            link_id = story_link_page['link_ids']['next']
        else:
            more_pages = False
            for s in story_link_page['links']:
                cleaned_source_info = csv.dict2row(props, s['source_info'])
                cleaned_ref_info = csv.dict2row(props, s['ref_info'])
                row_string = ','.join(cleaned_source_info) + ',' + ','.join(cleaned_ref_info) + '\n'
                yield row_string
Example #11
def _topic_story_list_by_page_as_csv_row(user_key, topics_id, props, **kwargs):
    yield u','.join(props) + u'\n'  # first send the column names
    link_id = 0
    more_pages = True
    while more_pages:
        page = _topic_story_page_with_media(user_key, topics_id, link_id, **kwargs)
        if 'next' in page['link_ids']:
            link_id = page['link_ids']['next']
        else:
            more_pages = False
        for s in page['stories']:
            # first, reduce foci down to just the readable names
            s['subtopics'] = [u"{}: {}".format(f['focal_set_name'], f['name']) for f in s['foci']]
            cleaned_row = csv.dict2row(props, s)
            row_string = u','.join(cleaned_row) + u'\n'
            yield row_string
Example #12
def _topic_story_list_by_page_as_csv_row(user_key, topics_id, props, **kwargs):
    yield ','.join(props) + '\n'  # first send the column names
    link_id = 0
    more_pages = True
    while more_pages:
        page = _topic_story_page_with_media(user_key, topics_id, link_id, **kwargs)
        if 'next' in page['link_ids']:
            link_id = page['link_ids']['next']
        else:
            more_pages = False
        for s in page['stories']:
            # first, reduce foci down to just the readable names
            s['subtopics'] = ["{}: {}".format(f['focal_set_name'], f['name']) for f in s['foci']]
            cleaned_row = csv.dict2row(props, s)
            row_string = ','.join(cleaned_row) + '\n'
            yield row_string
Example #13
def _topic_story_list_by_page_as_csv_row(user_key, topics_id, props, **kwargs):
    yield ','.join(props) + '\n'  # first send the column names
    include_all_url_shares = kwargs.get('include_all_url_shares', False)
    story_count = 0
    link_id = 0
    more_pages = True
    yet_to_hit_story_limit = True
    has_story_limit = ('story_limit' in kwargs) and (kwargs['story_limit'] is not None)
    # page through the story list results, until we run out or we hit the user's desired limit
    while more_pages and ((not has_story_limit) or
                          (has_story_limit and yet_to_hit_story_limit)):
        page = _topic_story_page_with_media(user_key, topics_id, link_id,
                                            **kwargs)
        if 'next' in page['link_ids']:
            link_id = page['link_ids']['next']
        else:
            more_pages = False
        for s in page['stories']:
            if include_all_url_shares:
                topic_seed_queries = kwargs['topic_seed_queries']
                # add in each header col
                for item in s['url_sharing_counts']:
                    seed_query = [
                        tsq for tsq in topic_seed_queries
                        if tsq['topic_seed_queries_id'] == item['topic_seed_queries_id']
                    ][0]
                    prefix = platform_csv_column_header_prefix(seed_query)
                    s[prefix + "post_count"] = item['post_count']
                    s[prefix + "channel_count"] = item['channel_count']
                    s[prefix + "author_count"] = item['author_count']
            # first, reduce foci down to just the readable names
            s['subtopics'] = [
                "{}: {}".format(f['focal_set_name'], f['name'])
                for f in s['foci']
            ]
            cleaned_row = csv.dict2row(props, s)
            row_string = ','.join(cleaned_row) + '\n'
            yield row_string
        story_count += len(page['stories'])
        yet_to_hit_story_limit = has_story_limit and (story_count < int(kwargs['story_limit']))
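
Example #13 layers two extras onto the basic loop: an optional story_limit kwarg that stops paging once enough stories have been written out, and include_all_url_shares, which expects topic_seed_queries in kwargs and appends per-platform post/channel/author counts as additional columns. A hedged usage sketch (the API key, topic id, and props list here are made up purely for illustration):

# hypothetical call: cap the export at 5,000 stories, no per-platform share columns
user_key = 'YOUR-MEDIACLOUD-API-KEY'  # placeholder
rows = _topic_story_list_by_page_as_csv_row(
    user_key,
    topics_id=1234,  # made-up topic id
    props=['stories_id', 'title', 'url', 'publish_date', 'subtopics'],
    story_limit=5000,
    include_all_url_shares=False,
)
for row in rows:
    print(row, end='')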
Example #14
def _topic_story_list_by_page_as_csv_row(user_key, topics_id, props, **kwargs):
    yield ','.join(props) + '\n'  # first send the column names
    story_count = 0
    link_id = 0
    more_pages = True
    yet_to_hit_story_limit = True
    has_story_limit = ('story_limit' in kwargs) and (kwargs['story_limit'] is not None)
    while more_pages and ((not has_story_limit) or (has_story_limit and yet_to_hit_story_limit)):
        page = _topic_story_page_with_media(user_key, topics_id, link_id, **kwargs)
        if 'next' in page['link_ids']:
            link_id = page['link_ids']['next']
        else:
            more_pages = False
        for s in page['stories']:
            # first, reduce foci down to just the readable names
            s['subtopics'] = ["{}: {}".format(f['focal_set_name'], f['name']) for f in s['foci']]
            cleaned_row = csv.dict2row(props, s)
            row_string = ','.join(cleaned_row) + '\n'
            yield row_string
        story_count += len(page['stories'])
        yet_to_hit_story_limit = has_story_limit and (story_count < int(kwargs['story_limit']))
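
All of the variants above are generators that yield one CSV line at a time, which implies they're meant to be streamed straight into an HTTP response rather than buffered in memory. Assuming the surrounding app is Flask (the framework isn't shown anywhere in this listing), the wiring would look something like:

from flask import Response

def stream_topic_stories_csv(user_key, topics_id, props, **kwargs):
    # hypothetical wiring: hand the generator to Flask so the CSV streams to the client
    generator = _topic_story_list_by_page_as_csv_row(user_key, topics_id, props, **kwargs)
    headers = {'Content-Disposition': 'attachment; filename="topic-stories.csv"'}
    return Response(generator, mimetype='text/csv', headers=headers)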