Example #1
0
def stream_media_link_list_csv(user_mc_key, filename, topics_id, **kwargs):
    """Stream a topic's media-to-media link list as a CSV attachment.

    :param user_mc_key: the requesting user's MediaCloud API key
    :param filename: base name for the download (timestamped before use)
    :param topics_id: id of the topic whose media links are listed
    :param kwargs: extra query params forwarded to the paging helper
    :return: a streaming Flask ``Response`` with text/csv content
    """
    params = kwargs.copy()
    merged_args = {
        'snapshots_id': request.args['snapshotId'],
        'timespans_id': request.args['timespanId'],
        # BUGFIX: guard on the same key we read ('focusId'); the old guard on
        # 'foci_id' never matched the key actually accessed
        'foci_id': request.args['focusId'] if 'focusId' in request.args else None,
    }
    params.update(merged_args)
    if 'q' in params:
        # BUGFIX: compare the *value* of q, not the literal string 'q'
        # (the old test was always true, so 'null'/'undefined' leaked through)
        params['q'] = params['q'] if params['q'] not in [
            None, '', 'null', 'undefined'
        ] else None
    params['limit'] = 1000  # an arbitrary value to let us page through with big topics

    timestamped_filename = csv.safe_filename(filename)
    headers = {
        "Content-Disposition": "attachment;filename=" + timestamped_filename
    }
    columns = [
        'src_media_id', 'src_media_name', 'src_media_url', 'ref_media_id',
        'ref_media_name', 'ref_media_url'
    ]
    return Response(_topic_media_link_list_by_page_as_csv_row(
        user_mc_key, topics_id, columns, **params),
                    mimetype='text/csv; charset=utf-8',
                    headers=headers)
Example #2
0
def stream_story_link_list_csv(user_key, filename, topics_id, **kwargs):
    """Stream a topic's story-to-story link list as a CSV attachment.

    :param user_key: the requesting user's MediaCloud API key
    :param filename: base name for the download (timestamped before use)
    :param topics_id: id of the topic whose story links are listed
    :param kwargs: extra query params forwarded to the paging helper
    :return: a streaming Flask ``Response`` with text/csv content
    """
    params = kwargs.copy()

    merged_args = {
        'snapshots_id': request.args['snapshotId'],
        'timespans_id': request.args['timespanId'],
        # BUGFIX: guard on the same key we read ('focusId'); the old guard on
        # 'foci_id' never matched the key actually accessed
        'foci_id': request.args['focusId'] if 'focusId' in request.args else None,
    }
    params.update(merged_args)
    if 'q' in params:
        # BUGFIX: compare the *value* of q, not the literal string 'q'
        # (the old test was always true, so 'null'/'undefined' leaked through)
        params['q'] = params['q'] if params['q'] not in [None, '', 'null', 'undefined'] else None
    params['limit'] = 100  # an arbitrary value to let us page through with big topics

    props = [
        'stories_id', 'publish_date', 'title', 'url', 'language', 'ap_syndicated',
        'inlink_count', 'outlink_count'
        # 'media_pub_country', 'media_pub_state', 'media_language', 'media_about_country', 'media_media_type'
    ]

    timestamped_filename = csv.safe_filename(filename)
    headers = {
        "Content-Disposition": "attachment;filename=" + timestamped_filename
    }
    return Response(_topic_story_link_list_by_page_as_csv_row(user_key, topics_id, props, **params),
                    mimetype='text/csv; charset=utf-8', headers=headers)
Example #3
0
def _stream_story_list_csv(filename,
                           q,
                           fq,
                           stories_per_page=500,
                           sort=MediaCloud.SORT_PROCESSED_STORIES_ID,
                           page_limit=None):
    """Stream the stories matching q/fq as a CSV attachment.

    Media-metadata columns are appended only when the
    ``INCLUDE_MEDIA_METADATA_IN_CSV`` flag is set (they require extra queries).
    """
    base_cols = [
        'stories_id', 'publish_date', 'title', 'url', 'language',
        'ap_syndicated', 'themes', 'media_id', 'media_name', 'media_url'
    ]
    metadata_cols = [
        'media_pub_country', 'media_pub_state', 'media_language',
        'media_about_country', 'media_media_type'
    ]
    columns = base_cols + (metadata_cols if INCLUDE_MEDIA_METADATA_IN_CSV else [])
    attachment_name = csv.safe_filename(filename)
    row_generator = _story_list_by_page_as_csv_row(user_mediacloud_key(), q, fq,
                                                   stories_per_page, sort,
                                                   page_limit, columns)
    return Response(row_generator,
                    mimetype='text/csv; charset=utf-8',
                    headers={
                        "Content-Disposition": "attachment;filename=" + attachment_name
                    })
Example #4
0
def stream_story_list_csv(user_key, filename, topics_id, **kwargs):
    """Stream a topic's story list as a CSV attachment.

    :param user_key: the requesting user's MediaCloud API key
    :param filename: base name for the download (timestamped before use)
    :param topics_id: id of the topic whose stories are listed
    :param kwargs: optional flags — ``fb_data`` adds a facebook_collection_date
        column; remaining entries are forwarded to the paging helper
    :return: a streaming Flask ``Response`` with text/csv content
    """
    as_attachment = kwargs['as_attachment'] if 'as_attachment' in kwargs else True
    fb_data = kwargs['fb_data'] if 'fb_data' in kwargs else False
    all_stories = []
    params = kwargs.copy()

    merged_args = {
        'snapshots_id': request.args['snapshotId'],
        'timespans_id': request.args['timespanId'],
        # BUGFIX: guard on the same key we read ('focusId'); the old guard on
        # 'foci_id' never matched the key actually accessed
        'foci_id': request.args['focusId'] if 'focusId' in request.args else None,
        'q': request.args['q'] if 'q' in request.args else None,
        'sort': request.args['sort'] if 'sort' in request.args else None,
    }
    params.update(merged_args)
    # these flags are local-only and must not be forwarded to the API helper
    if 'as_attachment' in params:
        del params['as_attachment']
    if 'fb_data' in params:
        del params['fb_data']
    if 'q' in params:
        # BUGFIX: compare the *value* of q, not the literal string 'q'
        # (the old test was always true, so 'null'/'undefined' leaked through)
        params['q'] = params['q'] if params['q'] not in [None, '', 'null', 'undefined'] else None
    params['limit'] = 100  # an arbitrary value to let us page through with big topics

    props = [
        'stories_id', 'publish_date', 'title', 'url', 'language', 'ap_syndicated',
        'themes', 'subtopics',
        'inlink_count', 'facebook_share_count', 'outlink_count', 'media_inlink_count',
        'media_id', 'media_name', 'media_url',
        # 'media_pub_country', 'media_pub_state', 'media_language', 'media_about_country', 'media_media_type'
    ]

    if fb_data:
        # page through all the facebook-count data for this topic
        all_fb_count = []
        more_fb_count = True
        link_id = 0
        local_mc = user_admin_mediacloud_client()
        while more_fb_count:
            fb_page = local_mc.topicStoryListFacebookData(topics_id, limit=100, link_id=link_id)

            all_fb_count = all_fb_count + fb_page['counts']
            if 'next' in fb_page['link_ids']:
                link_id = fb_page['link_ids']['next']
                more_fb_count = True
            else:
                more_fb_count = False

        # NOTE(review): all_stories is always empty at this point, so this loop
        # never attaches facebook_collection_date to any row — looks like a
        # latent bug; kept as-is to avoid changing observable behavior
        for s in all_stories:
            for fb_item in all_fb_count:
                if int(fb_item['stories_id']) == int(s['stories_id']):
                    s['facebook_collection_date'] = fb_item['facebook_api_collect_date']
        props.append('facebook_collection_date')

    timestamped_filename = csv.safe_filename(filename)
    headers = {
        "Content-Disposition": "attachment;filename=" + timestamped_filename
    }
    return Response(_topic_story_list_by_page_as_csv_row(user_key, topics_id, props, **params),
                    mimetype='text/csv; charset=utf-8', headers=headers)
Example #5
0
def stream_media_list_csv(user_mc_key, topic, filename, **kwargs):
    """Stream a topic's media list as a CSV attachment.

    :param user_mc_key: the requesting user's MediaCloud API key
    :param topic: the topic dict (uses 'name', 'topics_id', 'topic_seed_queries')
    :param filename: base name for the download; prefixed with the topic name
    :param kwargs: optional flags — 'media_metadata', 'include_platform_url_shares',
        'include_all_url_shares'; remaining entries go to the paging helper
    :return: a streaming Flask ``Response`` with text/csv content
    """
    filename = topic['name'] + '-' + filename
    # we have to make a separate call to the media info if the user wants to include the media metadata
    include_media_metadata = ('media_metadata'
                              in kwargs) and (kwargs['media_metadata'] == '1')
    # if the focusId is a URL Sharing subtopic, then we have platform-specific post/author/channel share counts
    include_platform_url_shares = kwargs[
        'include_platform_url_shares'] if 'include_platform_url_shares' in kwargs else False
    # if this topic includes platforms, then we have URL sharing counts (post/author/channel) for each platform
    include_all_url_shares = kwargs[
        'include_all_url_shares'] if 'include_all_url_shares' in kwargs else False
    params = kwargs.copy()
    snapshots_id, timespans_id, foci_id, q = filters_from_args(request.args)
    merged_args = {
        'timespans_id': timespans_id,
        'snapshots_id': snapshots_id,
        'foci_id': foci_id,
        'q': q,
        'sort': request.args.get('sort') if 'sort' in request.args else None,
    }
    params.update(merged_args)
    # do a check to see if the user has added in a real query or not
    if 'q' in params:
        # BUGFIX: compare the *value* of q, not the literal string 'q'
        # (the old test was always true, so 'null'/'undefined' leaked through)
        params['q'] = params['q'] if params['q'] not in [
            None, '', 'null', 'undefined'
        ] else None
    params['limit'] = 1000  # an arbitrary value to let us page through with big topics (note, this is the page size)
    # set up the dict keys / column headers that the user cares about for this download
    # BUGFIX: copy the shared constant — the old `props = TOPIC_MEDIA_CSV_PROPS`
    # followed by `props += ...` mutated the module-level list in place, so the
    # extra columns accumulated across requests
    props = list(TOPIC_MEDIA_CSV_PROPS)
    if include_platform_url_shares:
        props += ['post_count', 'channel_count', 'author_count']
    if include_all_url_shares:
        # if the user requested to download all the url sharing counts by platform, we need to grab the config for that
        # which is held in the platform seed query objects
        topic_seed_queries = topic['topic_seed_queries']
        extra_columns = []
        for tsq in topic_seed_queries:
            prefix = platform_csv_column_header_prefix(tsq)
            extra_columns += [
                prefix + 'post_count', prefix + 'channel_count',
                prefix + 'author_count'
            ]
        props += extra_columns
        params['topic_seed_queries'] = topic_seed_queries
    if include_media_metadata:
        props += [
            'media_pub_country', 'media_pub_state', 'media_language',
            'media_about_country', 'media_media_type'
        ]
    timestamped_filename = csv.safe_filename(filename)
    headers = {
        "Content-Disposition": "attachment;filename=" + timestamped_filename
    }
    return Response(_stream_media_by_page(user_mc_key, topic['topics_id'],
                                          props, **params),
                    mimetype='text/csv; charset=utf-8',
                    headers=headers)
def _stream_story_list_csv(filename, q, fq, stories_per_page=500, sort=MediaCloud.SORT_PROCESSED_STORIES_ID,
                           page_limit=None):
    """Stream the stories matching q/fq (with media metadata columns) as a CSV attachment."""
    columns = ['stories_id', 'publish_date', 'title', 'url', 'language', 'ap_syndicated',
               'themes', 'media_id', 'media_name', 'media_url',
               'media_pub_country', 'media_pub_state', 'media_language', 'media_about_country',
               'media_media_type']
    attachment_name = csv.safe_filename(filename)
    row_generator = _story_list_by_page_as_csv_row(q, fq, stories_per_page, sort, page_limit, columns)
    return Response(row_generator,
                    mimetype='text/csv; charset=utf-8',
                    headers={"Content-Disposition": "attachment;filename=" + attachment_name})
Example #7
0
def stream_media_link_list_csv(user_mc_key, filename, topics_id, **kwargs):
    """Stream a topic's media-to-media link list as a CSV attachment.

    :param user_mc_key: the requesting user's MediaCloud API key
    :param filename: base name for the download (timestamped before use)
    :param topics_id: id of the topic whose media links are listed
    :param kwargs: extra query params forwarded to the paging helper
    :return: a streaming Flask ``Response`` with text/csv content
    """
    params = kwargs.copy()
    merged_args = {
        'snapshots_id': request.args['snapshotId'],
        'timespans_id': request.args['timespanId'],
        # BUGFIX: guard on the same key we read ('focusId'); the old guard on
        # 'foci_id' never matched the key actually accessed
        'foci_id': request.args['focusId'] if 'focusId' in request.args else None,
    }
    params.update(merged_args)
    if 'q' in params:
        # BUGFIX: compare the *value* of q, not the literal string 'q'
        # (the old test was always true, so 'null'/'undefined' leaked through)
        params['q'] = params['q'] if params['q'] not in [None, '', 'null', 'undefined'] else None
    params['limit'] = 1000  # an arbitrary value to let us page through with big topics

    timestamped_filename = csv.safe_filename(filename)
    headers = {
        "Content-Disposition": "attachment;filename=" + timestamped_filename
    }
    columns = ['src_media_id', 'src_media_name', 'src_media_url', 'ref_media_id', 'ref_media_name', 'ref_media_url']
    return Response(_topic_media_link_list_by_page_as_csv_row(user_mc_key, topics_id, columns, **params),
                    mimetype='text/csv; charset=utf-8', headers=headers)
Example #8
0
def stream_story_link_list_csv(user_key, filename, topics_id, **kwargs):
    """Stream a topic's story-to-story link list as a CSV attachment.

    :param user_key: the requesting user's MediaCloud API key
    :param filename: base name for the download (timestamped before use)
    :param topics_id: id of the topic whose story links are listed
    :param kwargs: extra query params forwarded to the paging helper
    :return: a streaming Flask ``Response`` with text/csv content
    """
    params = kwargs.copy()
    merged_args = {
        'snapshots_id': request.args['snapshotId'],
        'timespans_id': request.args['timespanId'],
        # BUGFIX: guard on the same key we read ('focusId'); the old guard on
        # 'foci_id' never matched the key actually accessed
        'foci_id': request.args['focusId'] if 'focusId' in request.args else None,
    }
    params.update(merged_args)
    if 'q' in params:
        # BUGFIX: compare the *value* of q, not the literal string 'q'
        # (the old test was always true, so 'null'/'undefined' leaked through)
        params['q'] = params['q'] if params['q'] not in [None, '', 'null', 'undefined'] else None
    params['limit'] = 100  # an arbitrary value to let us page through with big topics

    props = [
        'stories_id', 'publish_date', 'title', 'url', 'language', 'ap_syndicated',
        'inlink_count', 'outlink_count'
        # 'media_pub_country', 'media_pub_state', 'media_language', 'media_about_country', 'media_media_type'
    ]

    timestamped_filename = csv.safe_filename(filename)
    headers = {
        "Content-Disposition": "attachment;filename=" + timestamped_filename
    }
    return Response(_topic_story_link_list_by_page_as_csv_row(user_key, topics_id, props, **params),
                    mimetype='text/csv; charset=utf-8', headers=headers)
Example #9
0
def stream_story_list_csv(user_key, topic, **kwargs):
    """Stream a topic's story list as a CSV attachment named after the topic.

    :param user_key: the requesting user's MediaCloud API key
    :param topic: the topic dict (uses 'name', 'topics_id', 'ch_monitor_id')
    :param kwargs: optional flags — 'media_metadata', 'story_tags',
        'reddit_submissions', 'fb_data'; remaining entries go to the paging helper
    :return: a streaming Flask ``Response`` with text/csv content
    """
    filename = topic['name']+'-stories'
    has_twitter_data = (topic['ch_monitor_id'] is not None) and (topic['ch_monitor_id'] != 0)

    # as_attachment = kwargs['as_attachment'] if 'as_attachment' in kwargs else True
    include_media_metadata = ('media_metadata' in kwargs) and (kwargs['media_metadata'] is True)
    include_story_tags = ('story_tags' in kwargs) and (kwargs['story_tags'] is True)
    include_reddit_submissions = ('reddit_submissions' in kwargs) and (kwargs['reddit_submissions'] is True)
    include_fb_date = kwargs['fb_data'] if 'fb_data' in kwargs else False
    all_stories = []
    params = kwargs.copy()

    merged_args = {
        'snapshots_id': request.args['snapshotId'],
        'timespans_id': request.args['timespanId'],
        'foci_id': request.args['focusId'] if 'focusId' in request.args else None,
        'q': request.args['q'] if 'q' in request.args else None,
        'sort': request.args['sort'] if 'sort' in request.args else None,
    }
    params.update(merged_args)

    story_count = apicache.topic_story_count(user_mediacloud_key(), topic['topics_id'],
                                             snapshots_id=params['snapshots_id'], timespans_id=params['timespans_id'],
                                             foci_id=params['foci_id'], q=params['q'])
    logger.info("Total stories to download: {}".format(story_count['count']))

    # these flags are local-only and must not be forwarded to the API helper
    if 'as_attachment' in params:
        del params['as_attachment']
    if 'fb_data' in params:
        del params['fb_data']
    if 'q' in params:
        # BUGFIX: compare the *value* of q, not the literal string 'q'
        # (the old test was always true, so 'null'/'undefined' leaked through)
        params['q'] = params['q'] if params['q'] not in [None, '', 'null', 'undefined'] else None
    params['limit'] = 1000  # an arbitrary value to let us page through with big topics

    # determine which props the user actually wants to download
    props = [
        'stories_id', 'publish_date', 'title', 'url', 'language', 'ap_syndicated', 'inlink_count',
        'facebook_share_count',
    ]
    if has_twitter_data:
        props.append('simple_tweet_count')
    if include_reddit_submissions:
        props.append('reddit_submissions')
    if include_fb_date:
        props.append('facebook_collection_date')
    if include_story_tags:
        props += ['themes', 'subtopics']
    props += ['outlink_count', 'media_inlink_count', 'media_id', 'media_name', 'media_url']
    if include_media_metadata:
        props += ['media_pub_country', 'media_pub_state', 'media_language', 'media_about_country', 'media_media_type']

    if include_fb_date:
        # page through all the facebook-count data for this topic
        all_fb_count = []
        more_fb_count = True
        link_id = 0
        local_mc = user_admin_mediacloud_client()
        while more_fb_count:
            fb_page = local_mc.topicStoryListFacebookData(topic['topics_id'], limit=100, link_id=link_id)

            all_fb_count = all_fb_count + fb_page['counts']
            if 'next' in fb_page['link_ids']:
                link_id = fb_page['link_ids']['next']
                more_fb_count = True
            else:
                more_fb_count = False

        # NOTE(review): all_stories is always empty at this point, so this loop
        # never attaches facebook_collection_date to any row — looks like a
        # latent bug; kept as-is to avoid changing observable behavior
        for s in all_stories:
            for fb_item in all_fb_count:
                if int(fb_item['stories_id']) == int(s['stories_id']):
                    s['facebook_collection_date'] = fb_item['facebook_api_collect_date']

    timestamped_filename = csv.safe_filename(filename)
    headers = {
        "Content-Disposition": "attachment;filename=" + timestamped_filename
    }
    return Response(_topic_story_list_by_page_as_csv_row(user_key, topic['topics_id'], props, **params),
                    mimetype='text/csv; charset=utf-8', headers=headers)
Example #10
0
def stream_story_list_csv(user_key, filename, topics_id, **kwargs):
    """Stream a topic's story list as a CSV attachment.

    :param user_key: the requesting user's MediaCloud API key
    :param filename: base name for the download (timestamped before use)
    :param topics_id: id of the topic whose stories are listed
    :param kwargs: optional flags — ``fb_data`` adds a facebook_collection_date
        column; remaining entries are forwarded to the paging helper
    :return: a streaming Flask ``Response`` with text/csv content
    """
    user_mc = user_mediacloud_client(user_key)
    topic = user_mc.topic(topics_id)
    has_twitter_data = topic['ch_monitor_id'] is not None

    # as_attachment = kwargs['as_attachment'] if 'as_attachment' in kwargs else True
    include_fb_date = kwargs['fb_data'] if 'fb_data' in kwargs else False
    all_stories = []
    params = kwargs.copy()

    merged_args = {
        'snapshots_id': request.args['snapshotId'],
        'timespans_id': request.args['timespanId'],
        'foci_id': request.args['focusId'] if 'focusId' in request.args else None,
        'q': request.args['q'] if 'q' in request.args else None,
        'sort': request.args['sort'] if 'sort' in request.args else None,
    }
    params.update(merged_args)

    story_count = apicache.topic_story_count(user_mediacloud_key(), topics_id,
                                             snapshots_id=params['snapshots_id'], timespans_id=params['timespans_id'],
                                             foci_id=params['foci_id'], q=params['q'])
    logger.info("Total stories to download: {}".format(story_count))

    # these flags are local-only and must not be forwarded to the API helper
    if 'as_attachment' in params:
        del params['as_attachment']
    if 'fb_data' in params:
        del params['fb_data']
    if 'q' in params:
        # BUGFIX: compare the *value* of q, not the literal string 'q'
        # (the old test was always true, so 'null'/'undefined' leaked through)
        params['q'] = params['q'] if params['q'] not in [None, '', 'null', 'undefined'] else None
    params['limit'] = 1000  # an arbitrary value to let us page through with big topics

    # determine which props the user actually wants to download
    props = [
        'stories_id', 'publish_date', 'title', 'url', 'language', 'ap_syndicated',
        'themes', 'subtopics',
        'inlink_count', 'facebook_share_count',
        # removed media metadata here because it takes too long to query for it
        # 'media_pub_country', 'media_pub_state', 'media_language', 'media_about_country', 'media_media_type'
    ]
    if has_twitter_data:
        props.append('simple_tweet_count')
    if include_fb_date:
        props.append('facebook_collection_date')
    props += ['outlink_count', 'media_inlink_count', 'media_id', 'media_name', 'media_url']

    if include_fb_date:
        # page through all the facebook-count data for this topic
        all_fb_count = []
        more_fb_count = True
        link_id = 0
        local_mc = user_admin_mediacloud_client()
        while more_fb_count:
            fb_page = local_mc.topicStoryListFacebookData(topics_id, limit=100, link_id=link_id)

            all_fb_count = all_fb_count + fb_page['counts']
            if 'next' in fb_page['link_ids']:
                link_id = fb_page['link_ids']['next']
                more_fb_count = True
            else:
                more_fb_count = False

        # NOTE(review): all_stories is always empty at this point, so this loop
        # never attaches facebook_collection_date to any row — looks like a
        # latent bug; kept as-is to avoid changing observable behavior
        for s in all_stories:
            for fb_item in all_fb_count:
                if int(fb_item['stories_id']) == int(s['stories_id']):
                    s['facebook_collection_date'] = fb_item['facebook_api_collect_date']

    timestamped_filename = csv.safe_filename(filename)
    headers = {
        "Content-Disposition": "attachment;filename=" + timestamped_filename
    }
    return Response(_topic_story_list_by_page_as_csv_row(user_key, topics_id, props, **params),
                    mimetype='text/csv; charset=utf-8', headers=headers)