def create_nyt_theme_focal_set(topics_id):
    user_mc = user_mediacloud_client()

    # grab the focalSetName and focalSetDescription and then make one
    focal_set_name = request.form['focalSetName']
    focal_set_description = request.form['focalSetDescription']
    theme_data = json.loads(request.form['data[]'])
    focal_technique = FOCAL_TECHNIQUE_BOOLEAN_QUERY  # is this right?
    new_focal_set = user_mc.topicFocalSetDefinitionCreate(
        topics_id, focal_set_name, focal_set_description, focal_technique)
    if 'focal_set_definitions_id' not in new_focal_set:
        return json_error_response('Unable to create the subtopic set')
    # now make the foci in it - one for each country
    for tag in theme_data:
        params = {
            'name': tag['label'],
            'description': "Stories about {}".format(tag['label']),
            'query': "tags_id_stories:{}".format(tag['tags_id']),
            'focal_set_definitions_id':
            new_focal_set['focal_set_definitions_id'],
        }
        user_mc.topicFocusDefinitionCreate(topics_id, **params)

    return {'success': True}
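# For reference, the 'data[]' form field parsed above is a JSON-encoded list of
# theme tags. A minimal sketch of the payload shape (the labels and tags_id
# values are hypothetical, not real NYT theme tags):
import json

example_theme_payload = json.dumps([
    {'label': 'Elections', 'tags_id': 9360836},
    {'label': 'Immigration', 'tags_id': 9360837},
])
# each entry becomes one focus definition in the new set:
#   name=label, query="tags_id_stories:<tags_id>"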
def media_search(search_str, tags_id=None, **kwargs):
    mc = user_mediacloud_client()
    return mc.mediaList(name_like=search_str,
                        tags_id=tags_id,
                        rows=MAX_SOURCES,
                        sort="num_stories",
                        **kwargs)
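# A usage sketch for media_search, assuming a logged-in request context so that
# user_mediacloud_client() resolves to the current user's client; the search
# string and tags_id below are made-up illustrations.
#
#   matches = media_search('nytimes')
#   collection_matches = media_search('news', tags_id=1234)  # hypothetical collection id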
def _cached_collection_source_representation(mc_api_key,
                                             collection_id,
                                             sample_size=1000,
                                             fq=''):
    # have to respect the api key here because only some folks can see private collections
    user_mc = user_mediacloud_client(mc_api_key)
    stories = user_mc.storyList('tags_id_media:{}'.format(collection_id),
                                fq,
                                rows=sample_size,
                                sort=mc.SORT_RANDOM)
    media_representation = {}
    for s in stories:
        if s['media_id'] not in media_representation:
            media_representation[s['media_id']] = {
                'media_id': s['media_id'],
                'media_name': s['media_name'],
                'media_url': s['media_url'],
                'sample_size': sample_size,
                'stories': 0
            }
        media_representation[s['media_id']]['stories'] += 1
    for media_id in media_representation:
        media_representation[media_id]['story_pct'] = float(
            media_representation[media_id]['stories']) / float(sample_size)
    return sorted(list(media_representation.values()),
                  key=operator.itemgetter('stories'))
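# Worked example of the representation math above: with the default
# sample_size=1000, a source that contributed 150 of the randomly sampled
# stories gets story_pct = 150 / 1000 = 0.15, i.e. roughly 15% of the
# collection's sample came from that one source.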
def _mc_client(admin=False):
    # return the user's client handler, or a tool one if not logged in
    if is_user_logged_in():
        client_to_use = user_mediacloud_client() if not admin else user_admin_mediacloud_client()
    else:
        client_to_use = mc
    return client_to_use
def _cached_topic_tag_counts(user_mc_key, topics_id, tag_sets_id, sample_size,
                             query):
    user_mc = user_mediacloud_client()
    # we don't need to use topics_id here because the timespans_id is in the query argument
    tag_counts = user_mc.storyTagCount(query, tag_sets_id=tag_sets_id)
    return tag_counts
def get_topic_platforms(topics_id):
    user_mc = user_mediacloud_client()
    available_platforms = _available_platforms()
    topic = user_mc.topic(topics_id)
    # and add in the open web query, which isn't stored in topic_seed_queries for historical reasons :-(
    if topic_has_seed_query(topic):
        for item in available_platforms:
            if (item['platform']
                    == PLATFORM_OPEN_WEB) and (item['source']
                                               == PLATFORM_SOURCE_MEDIA_CLOUD):
                real_web_query = platform_for_web_seed_query(topic)
                for key in real_web_query:
                    item[key] = real_web_query[key]
                break
    # now fill in with any seed queries that have been created
    for seed_query in topic['topic_seed_queries']:
        match = [
            p for p in available_platforms
            if (p['platform'] == seed_query['platform']) and (
                p['source'] == seed_query['source'])
        ]
        if len(match) == 1:
            match[0]['query'] = seed_query['query']
            match[0]['topic_seed_queries_id'] = seed_query[
                'topic_seed_queries_id']
    return jsonify({'results': available_platforms})
def cached_media_with_tag_page(tags_id, max_media_id):
    '''
    We have to do this on the page, not the full list because memcache has a 1MB cache upper limit,
    and some of the collections have TONS of sources
    '''
    user_mc = user_mediacloud_client()
    return user_mc.mediaList(tags_id=tags_id, last_media_id=max_media_id, rows=100)
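# A sketch of how a caller might assemble a full collection from the paged
# helper above, following the last_media_id cursor until a short page signals
# the end (assumes the same 100-row page size).
def all_media_with_tag(tags_id):
    all_media = []
    last_media_id = 0
    while True:
        page = cached_media_with_tag_page(tags_id, last_media_id)
        all_media += page
        if len(page) < 100:
            break
        last_media_id = page[-1]['media_id']
    return all_media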
def story_counts_by_snapshot(topics_id):
    user_mc = user_mediacloud_client(user_mediacloud_key())
    snapshots = user_mc.topicSnapshotList(topics_id)
    counts = {}
    for s in snapshots:
        # get the count of stories in the overall timespan for this snapshot
        timespans = apicache.cached_topic_timespan_list(user_mediacloud_key(), topics_id,
                                                        snapshots_id=s['snapshots_id'], foci_id=None)
        try:
            total = timespans[0]['story_count']
        except mediacloud.error.MCException:
            total = 0
        except IndexError:  # this doesn't have any snapshots (ie. it failed to generate correctly)
            total = 0
        # search by tag to find out how many stories were spidered
        spidered = 0
        try:
            spidered = apicache.topic_story_count(user_mediacloud_key(), topics_id,
                                                  snapshots_id=s['snapshots_id'], foci_id=None,
                                                  timespans_id=timespans[0]['timespans_id'],
                                                  q="* AND tags_id_stories:{}".format(TAG_SPIDERED_STORY))['count']
        except mediacloud.error.MCException:
            spidered = 0
        except IndexError:  # this doesn't have any snapshots (ie. it failed to generate correctly)
            spidered = 0
        seeded = total - spidered
        counts[s['snapshots_id']] = {'total': total, 'spidered': spidered, 'seeded': seeded}
    return jsonify(counts)
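# Worked example of the counts above: a snapshot whose overall timespan holds
# 10,000 stories, 7,500 of them carrying the spidered-story tag, is reported as
#   {'total': 10000, 'spidered': 7500, 'seeded': 2500}
# i.e. seeded is simply the stories that were not discovered by spidering.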
def get_topic_media_links_csv(topics_id):
    user_mc = user_mediacloud_client()
    topic = user_mc.topic(topics_id)

    # page through results for timespans
    return stream_media_link_list_csv(user_mediacloud_key(),
                                      topic['name'] + '-stories', topics_id)
def api_collection_source_representation_csv(collection_id):
    user_mc = user_mediacloud_client()
    info = user_mc.tag(collection_id)
    source_representation = apicache.collection_source_representation(user_mediacloud_key(), collection_id)
    props = ['media_id', 'media_name', 'media_url', 'stories', 'sample_size', 'story_pct']
    filename = info['label'] + "-source sentence counts.csv"
    return csv.stream_response(source_representation, props, filename)
def api_collection_sources_csv(collection_id):
    user_mc = user_mediacloud_client()
    collection = user_mc.tag(collection_id)    # not cached because props can change often
    all_media = media_with_tag(user_mediacloud_key(), collection_id)
    file_prefix = "Collection {} ({}) - sources ".format(collection_id, collection['tag'])
    properties_to_include = SOURCE_LIST_CSV_EDIT_PROPS
    return csv.download_media_csv(all_media, file_prefix, properties_to_include)
def mc_client(admin=False):
    # return the user's client handler, or a tool one if not logged in
    if is_user_logged_in():
        client_to_use = user_mediacloud_client() if not admin else user_admin_mediacloud_client()
    else:
        client_to_use = mc
    return client_to_use
def topic_favorites():
    user_mc = user_mediacloud_client()
    favorite_topic_ids = user_db.get_users_lists(user_name(), 'favoriteTopics')
    favorited_topics = [user_mc.topic(tid) for tid in favorite_topic_ids]
    for t in favorited_topics:
        t['isFavorite'] = True
    return jsonify({'topics': favorited_topics})
def add_retweet_partisanship_to_topic(topics_id, focal_set_name,
                                      focal_set_description):
    user_mc = user_mediacloud_client()
    focal_technique = FOCAL_TECHNIQUE_BOOLEAN_QUERY
    new_focal_set = user_mc.topicFocalSetDefinitionCreate(
        topics_id, focal_set_name, focal_set_description, focal_technique)
    if 'focal_set_definitions_id' not in new_focal_set:
        return json_error_response('Unable to create the subtopic set')
    # now make the foci in it - one for each partisanship quintile
    partisanship_tags = _cached_media_tags(
        TAG_SETS_ID_RETWEET_PARTISANSHIP_2016)
    for tag in partisanship_tags:
        name = tag['label']
        description = "Media sources that were retweeted more often during the 2016 US election " \
                      "season by people on the {}".format(tag['label'])
        query = tag['query']
        focal_set_definitions_id = new_focal_set['focal_set_definitions_id']
        # create a new boolean query subtopic based on the tag sets
        new_focus = user_mc.topicFocusDefinitionCreate(
            topics_id,
            name=name,
            description=description,
            query=query,
            focal_set_definitions_id=focal_set_definitions_id)
        if (len(new_focus) == 0) or ('focus_definitions_id' not in new_focus[0]):
            return json_error_response(
                'Unable to create the {} subtopic'.format(name))
    return {'success': True}
def _cached_topic_story_list(user_mc_key, topics_id, **kwargs):
    """
    Internal helper - don't call this; call topic_story_list instead. This needs user_mc_key in the
    function signature to make sure the caching is keyed correctly.
    """
    local_mc = user_mediacloud_client(user_mc_key)
    return local_mc.topicStoryList(topics_id, **kwargs)
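# The docstring above describes the caching contract used throughout these
# helpers: the cache key is built from the function's arguments, so user_mc_key
# must appear in the signature even though the client is rebuilt inside,
# otherwise one user's results could be served to another. A minimal sketch of
# the pattern with functools.lru_cache (the real app presumably uses its own
# cache decorator; _cached_lookup is a hypothetical helper):
import functools

@functools.lru_cache(maxsize=256)
def _cached_lookup(user_mc_key, query):
    # user_mc_key participates in the cache key even though only query
    # drives the API call
    return user_mediacloud_client(user_mc_key).storyCount(query)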
def topic_focus_definition_update_or_create(topics_id):
    user_mc = user_mediacloud_client()
    name = request.form['focusName']
    description = request.form['focusDescription']
    query = request.form['keywords']
    # update if it has an id, create if new
    if 'foci_id' in request.form:
        # you can't change the focal set a focus is in
        foci_id = request.form['foci_id']
        focus = user_mc.topicFocusDefinitionUpdate(topics_id, foci_id, name=name, description=description,
                                                   query=query)
    else:
        # if new focal set, then create that first
        if int(request.form['focalSetDefinitionId']) == NEW_FOCAL_SET_PLACEHOLDER_ID:
            name = request.form['focalSetName']
            description = request.form['focalSetDescription']
            focal_technique = request.form['focalTechnique']
            new_focal_set = user_mc.topicFocalSetDefinitionCreate(topics_id, name, description, focal_technique)
            focal_set_definitions_id = new_focal_set['focal_set_definitions_id']
        else:
            focal_set_definitions_id = request.form['focalSetDefinitionId']
        # create focus, pointing at focal set
        focus = user_mc.topicFocusDefinitionCreate(topics_id, name=name, description=description,
                                                   query=query, focal_set_definitions_id=focal_set_definitions_id)
    return jsonify(focus)
def _topic_snapshot_list(topic):
    if access_public_topic(topic['topics_id']):
        local_mc = mc
        api_key = TOOL_API_KEY
    elif is_user_logged_in():
        local_mc = user_mediacloud_client()
        api_key = user_mediacloud_key()
    else:
        return {}  # prob something smarter we can do here
    snapshots = local_mc.topicSnapshotList(topic['topics_id'])
    snapshots = sorted(snapshots, key=itemgetter('snapshots_id'))
    # add in any missing version numbers
    for idx in range(0, len(snapshots)):
        if snapshots[idx]['note'] in [None, '']:
            snapshots[idx]['note'] = idx + ARRAY_BASE_ONE
    # seed_query story count
    topic['seed_query_story_count'] = _topic_seed_story_count(topic)
    # add foci_count for display
    snapshots = _add_snapshot_foci_count(api_key, topic['topics_id'],
                                         snapshots)
    snapshots = sorted(snapshots, key=lambda d: d['snapshot_date'])
    # extra stuff
    snapshot_status = mc.topicSnapshotGenerateStatus(
        topic['topics_id'])['job_states']  # need to know if one is running
    latest = snapshots[-1] if len(snapshots) > 0 else None
    return {
        'list': snapshots,
        'jobStatus': snapshot_status,
        'latestVersion': latest['note'] if latest else 1,
    }
def topic_update_platform(topics_id, platform_id):
    user_mc = user_mediacloud_client()
    channel = request.form.get('platform_channel')
    source = request.form.get('platform_source')
    query = request.form.get('platform_query')
    platform = request.form['platform_type']
    result = {}
    if platform == PLATFORM_OPEN_WEB:
        # here we need to parse the sources and collections out of the 'channel'
        sources, collections = parse_open_web_media_from_channel(channel)
        user_mc.topicUpdate(topics_id,
                            media_ids=sources,
                            media_tags_ids=collections,
                            solr_seed_query=query)
        result['success'] = 1
        result['id'] = platform_id  # web_shim_ui
    else:
        result = user_mc.topicRemoveSeedQuery(
            topics_id, topic_seed_queries_id=platform_id)
        # Fake an update operation here by removing and then adding again
        if platform == PLATFORM_REDDIT:
            # TODO: update this merge with correct info from Jason/Pushshift library
            query = "{} AND {}".format(query, channel)
        result = user_mc.topicAddSeedQuery(topics_id, platform, source, query)

        result['success'] = 1 if 'topic_seed_query' in result else 0
        result['id'] = result['topic_seed_query']['topic_seed_queries_id']

    return result  # topic_seed_queries_id
def favorite_collections():
    user_mc = user_mediacloud_client()
    user_favorited = db.get_users_lists(user_name(), 'favoriteCollections')
    favorited_collections = [user_mc.tag(tag_id) for tag_id in user_favorited]
    for s in favorited_collections:
        s['isFavorite'] = True
    return jsonify({'list': favorited_collections})
def topic_focus_definition_update_or_create(topics_id):
    user_mc = user_mediacloud_client()
    name = request.form['focusName']
    description = request.form['focusDescription']
    query = request.form['keywords']
    # update if it has an id, create if new
    if 'foci_id' in request.form:
        # you can't change the focal set a focus is in
        foci_id = request.form['foci_id']
        focus = user_mc.topicFocusDefinitionUpdate(topics_id,
                                                   foci_id,
                                                   name=name,
                                                   description=description,
                                                   query=query)
    else:
        # if new focal set, then create that first
        if int(request.form['focalSetDefinitionId']) == NEW_FOCAL_SET_PLACEHOLDER_ID:
            fs_name = request.form['focalSetName']
            fs_description = request.form['focalSetDescription']
            focal_technique = request.form['focalTechnique']
            new_focal_set = user_mc.topicFocalSetDefinitionCreate(
                topics_id, fs_name, fs_description, focal_technique)
            focal_set_definitions_id = new_focal_set[
                'focal_set_definitions_id']
        else:
            focal_set_definitions_id = request.form['focalSetDefinitionId']
        # create focus, pointing at focal set
        focus = user_mc.topicFocusDefinitionCreate(
            topics_id,
            name=name,
            description=description,
            query=query,
            focal_set_definitions_id=focal_set_definitions_id)
    return jsonify(focus)
def favorite_sources():
    user_mc = user_mediacloud_client()
    user_favorited = db.get_users_lists(user_name(), 'favoriteSources')
    favorited_s = [user_mc.media(media_id) for media_id in user_favorited]
    for s in favorited_s:
        s['isFavorite'] = True
    return jsonify({'list': favorited_s})
def cached_topic_timespan_list(topics_id, snapshots_id=None, foci_id=None):
    # this includes the user_mc_key as a first param so the cache works right
    user_mc = user_mediacloud_client()
    timespans = user_mc.topicTimespanList(topics_id,
                                          snapshots_id=snapshots_id,
                                          foci_id=foci_id)
    return timespans
def _topic_snapshot_list(topic):
    local_mc = user_mediacloud_client()
    api_key = user_mediacloud_key()
    snapshots = local_mc.topicSnapshotList(topic['topics_id'])
    snapshots = sorted(snapshots, key=itemgetter('snapshots_id'))
    # add in any missing version numbers
    for idx in range(0, len(snapshots)):
        if snapshots[idx]['note'] in [None, '']:
            snapshots[idx]['note'] = idx + ARRAY_BASE_ONE
    # format any web seed queries as platforms objects
    for s in snapshots:
        platforms = []
        if (s['seed_queries'] is not None) and ('topic' in s['seed_queries']):
            p = platform_for_web_seed_query(s['seed_queries'])
            platforms.append(p)
            platforms += s['seed_queries']['topic_seed_queries']
        else:
            if topic_has_seed_query(topic):
                p = platform_for_web_seed_query(topic)
                platforms.append(p)
        s['platform_seed_queries'] = platforms
    # add foci_count for display
    snapshots = _add_snapshot_foci_count(api_key, topic['topics_id'],
                                         snapshots)
    snapshots = sorted(snapshots, key=lambda d: d['snapshot_date'])
    # extra stuff
    snapshot_status = mc.topicSnapshotGenerateStatus(
        topic['topics_id'])['job_states']  # need to know if one is running
    latest = snapshots[-1] if len(snapshots) > 0 else None
    topic['seed_query_story_count'] = _topic_seed_story_count(topic)
    return {
        'list': snapshots,
        'jobStatus': snapshot_status,
        'latestVersion': latest['note'] if latest else 1,
    }
def topic_create():
    user_mc = user_mediacloud_client()
    name = request.form['name']
    description = request.form['description']
    solr_seed_query = request.form['solr_seed_query']
    start_date = request.form['start_date']
    end_date = request.form['end_date']
    optional_args = {
        'max_iterations': request.form['max_iterations']
            if 'max_iterations' in request.form and request.form['max_iterations'] != 'null' else None,
        'max_stories': request.form['max_stories']
            if 'max_stories' in request.form and request.form['max_stories'] != 'null'
            else flask_login.current_user.profile['limits']['max_topic_stories'],
    }
    try:
        topic_result = user_mc.topicCreate(name=name, description=description, solr_seed_query=solr_seed_query,
                                           start_date=start_date, end_date=end_date,
                                           media_tags_ids=[COLLECTION_US_TOP_ONLINE],  # HACK: can't save without one of these in place (for now)
                                           **optional_args,
                                           )['topics'][0]
        topics_id = topic_result['topics_id']
        logger.info("Created new topic \"{}\" as {}".format(name, topics_id))
        # if this includes any of the US-centric collections, add the retweet partisanship subtopic by default
        # client will either make an empty snapshot, or a spidering one
        return topic_summary(topics_id)
    except mediacloud.error.MCException as e:
        logging.error("Topic creation failed {}".format(name))
        logging.exception(e)
        return json_error_response(e.message, e.status_code)
    except Exception as e:
        logging.error("Topic creation failed {}".format(name))
        logging.exception(e)
        return json_error_response(str(e), 500)
def api_collection_sources_feed_status_csv(collection_id, source_type):
    user_mc = user_mediacloud_client()
    collection = user_mc.tag(collection_id)
    list_type = str(source_type).lower()
    media_in_collection = media_with_tag(user_mediacloud_key(), collection_id)
    media_info_in_collection = _media_list_edit_job.map(media_in_collection)
    if list_type == 'review':
        filtered_media = [
            m for m in media_info_in_collection if m['active_feed_count'] > 0
            and m['num_stories_90'] == 0 and m['num_stories_last_year'] > 0
        ]
    elif list_type == 'remove':
        filtered_media = [
            m for m in media_info_in_collection if m['active_feed_count'] > 0
            and m['num_stories_90'] == 0 and m['num_stories_last_year'] == 0
            and m['latest_scrape_job.state'] == 'failed'
        ]
    elif list_type == 'unscrapeable':
        filtered_media = [
            m for m in media_info_in_collection
            if m['active_feed_count'] == 0 and m['num_stories_90'] > 0
        ]
    elif list_type == 'working':
        filtered_media = [
            m for m in media_info_in_collection
            if m['active_feed_count'] > 0 and m['num_stories_last_year'] > 0
        ]
    else:
        filtered_media = media_info_in_collection
    file_prefix = "Collection {} ({}) - sources feed {}".format(
        collection_id, collection['tag'], source_type)
    properties_to_include = SOURCE_FEED_LIST_CSV_PROPS
    return csv.download_media_csv(filtered_media, file_prefix,
                                  properties_to_include)
def cached_entities(user_mediacloud_key, stories_id):
    user_mc = user_mediacloud_client()
    nlp_results = user_mc.storyCoreNlpList(story_id_list=[stories_id])
    if nlp_results[0]['corenlp'] == "story is not annotated":
        return None
    story_nlp = nlp_results[0]['corenlp']['_']['corenlp']
    # set up for entity counting
    entities = []
    current_entity_words = []
    current_entity_type = None  # entities can be split across multiple consecutive words
    # iterate through the words collecting any named entities
    for sentence in story_nlp['sentences']:
        for token in sentence['tokens']:
            if (len(token['ne']) > 1):
                current_entity_type = token['ne']
                current_entity_words.append(token['word'])
            else:  # found a non-entity, so check if the preceding word(s) were an entity and add it to the mix
                if current_entity_type is not None:
                    entities.append({'type': current_entity_type,
                                     'name': " ".join(current_entity_words),
                                     'words': len(current_entity_words)})
                current_entity_words = []
                current_entity_type = None
    # turn the lists into counts
    unique_entities = {}
    for entity in entities:
        unique_key = entity['type'] + entity['name'] + str(entity['words'])
        if unique_key in unique_entities.keys():
            unique_entities[unique_key]['frequency'] += 1
        else:
            unique_entities[unique_key] = entity
            unique_entities[unique_key]['frequency'] = 1
    unique_entities = list(unique_entities.values())
    unique_entities = sorted(unique_entities, key=itemgetter('frequency'), reverse=True)
    return unique_entities
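# Worked example of the grouping above: CoreNLP tokens tagged
#   "Barack"/PERSON, "Obama"/PERSON, "visited"/O, "Paris"/LOCATION, "."/O
# yield two entities, because the consecutive PERSON tokens are merged until a
# non-entity token ("visited") flushes them:
#   {'type': 'PERSON', 'name': 'Barack Obama', 'words': 2, 'frequency': 1}
#   {'type': 'LOCATION', 'name': 'Paris', 'words': 1, 'frequency': 1}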
def topic_favorites():
    user_mc = user_mediacloud_client()
    favorite_topic_ids = db.get_users_lists(user_name(), 'favoriteTopics')
    favorited_topics = [user_mc.topic(tid) for tid in favorite_topic_ids]
    for t in favorited_topics:
        t['isFavorite'] = True
        # t['detailInfo'] = get_topic_info_per_snapshot_timespan(t['topics_id'])
    return jsonify({'topics': favorited_topics})
def sorted_public_topic_list():
    # needs to support logged in or not
    if is_user_logged_in():
        local_mc = user_mediacloud_client()
    else:
        local_mc = mc
    public_topics = local_mc.topicList(public=True, limit=51)['topics']
    return sorted(public_topics, key=lambda t: t['name'].lower())
def _cached_tag_page(mc_api_key, tag_sets_id, last_tags_id, rows, public_only):
    # user agnostic here because the list of tags in a collection only changes for users based on public_only
    local_mc = user_mediacloud_client(mc_api_key)
    tag_list = local_mc.tagList(tag_sets_id=tag_sets_id,
                                last_tags_id=last_tags_id,
                                rows=rows,
                                public_only=public_only)
    return tag_list
def sorted_public_topic_list():
    # needs to support logged in or not
    if is_user_logged_in():
        local_mc = user_mediacloud_client()
    else:
        local_mc = mc
    public_topics_list = local_mc.topicList(public=True)['topics']
    return sorted(public_topics_list, key=lambda t: t['name'].lower())
def topic_favorites():
    user_mc = user_mediacloud_client()
    favorite_topic_ids = user_db.get_users_lists(user_name(), 'favoriteTopics')
    favorited_topics = [user_mc.topic(tid) for tid in favorite_topic_ids]
    for t in favorited_topics:
        t['isFavorite'] = True
        # t['detailInfo'] = get_topic_info_per_snapshot_timespan(t['topics_id'])
    return jsonify({'topics': favorited_topics})
def _cached_top_tags(q, fq, tag_sets_id, sample_size=None):
    # post it so long queries work
    user_mc = user_mediacloud_client()
    return user_mc.storyTagCount(q,
                                 fq,
                                 tag_sets_id=tag_sets_id,
                                 limit=sample_size,
                                 http_method='POST')
def topic_provider_stories_csv(topics_id):
    optional_args = _parse_stories_optional_arguments()
    user_mc = user_mediacloud_client()
    topic = user_mc.topic(topics_id)
    del optional_args['link_id']  # we do this to make sure this helper can page through the results
    return stream_story_list_csv(user_mediacloud_key(), 'stories', topic,
                                 **optional_args)
def topic_create():
    user_mc = user_mediacloud_client()
    name = request.form['name']
    description = request.form['description']
    solr_seed_query = request.form['solr_seed_query']
    start_date = request.form['start_date']
    end_date = request.form['end_date']

    optional_args = {
        'is_public': request.form['is_public'] if 'is_public' in request.form else None,
        'is_logogram': request.form['is_logogram'] if 'is_logogram' in request.form else None,
        'ch_monitor_id': request.form['ch_monitor_id']
            if len(request.form['ch_monitor_id']) > 0 and request.form['ch_monitor_id'] != 'null' else None,
        'max_iterations': request.form['max_iterations'] if 'max_iterations' in request.form else None,
        'max_stories': request.form['max_stories']
            if 'max_stories' in request.form and request.form['max_stories'] != 'null'
            else flask_login.current_user.profile['max_topic_stories'],
    }

    # parse out any sources and collections to add
    media_ids_to_add = ids_from_comma_separated_str(request.form['sources[]'])
    tag_ids_to_add = ids_from_comma_separated_str(
        request.form['collections[]'])

    try:
        topic_result = user_mc.topicCreate(name=name,
                                           description=description,
                                           solr_seed_query=solr_seed_query,
                                           start_date=start_date,
                                           end_date=end_date,
                                           media_ids=media_ids_to_add,
                                           media_tags_ids=tag_ids_to_add,
                                           **optional_args)['topics'][0]

        topics_id = topic_result['topics_id']
        logger.info("Created new topic \"{}\" as {}".format(name, topics_id))
        # if this includes any of the US-centric collections, add the retweet partisanship subtopic by default
        if set(tag_ids_to_add).intersection(US_COLLECTIONS):
            add_retweet_partisanship_to_topic(
                topic_result['topics_id'], 'Retweet Partisanship',
                'Subtopics driven by our analysis of Twitter followers of Trump and Clinton during the 2016 election season.  Each media source is scored based on the ratio of retweets of their stories in those two groups.'
            )
        # client will either make an empty snapshot, or a spidering one
        return topic_summary(topics_id)
    except mediacloud.error.MCException as e:
        # catch the API-specific error first; a generic handler listed before it would make this branch unreachable
        logging.error("Topic creation failed {}".format(name))
        logging.exception(e)
        return json_error_response(e.message, e.status_code)
    except Exception as e:
        logging.error("Topic creation failed {}".format(name))
        logging.exception(e)
        return json_error_response(str(e), 500)
def _cached_tag_coverage_pct(query, tag_sets_id):
    user_mc = user_mediacloud_client()
    story_count = source_story_count(user_mediacloud_key(), query)
    tagged_story_counts = user_mc.storyTagCount(solr_query=query, tag_sets_id=tag_sets_id)
    # sum tagged articles because there are different versions
    tagged_sum = sum([tag_info['count'] for tag_info in tagged_story_counts])
    # compute coverage ratio (protect against div by zero)
    ratio = float(tagged_sum) / float(story_count) if story_count > 0 else 0
    return ratio
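# Worked example of the coverage math above: if the query matches 2,000 stories
# and the per-version tag counts sum to 500, the ratio is 500 / 2000 = 0.25,
# i.e. a quarter of the matching stories carry a tag from this tag set; a zero
# story_count short-circuits to 0 rather than dividing by zero.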
def _cached_split_story_counts(q='*', fq=''):
    # sources are open to everyone, so no need for user-specific cache
    # Helper to fetch split story counts over a timeframe for an arbitrary query
    user_mc = user_mediacloud_client()
    results = user_mc.storyCount(solr_query=q,
                                 solr_filter=fq,
                                 split=True,
                                 split_period='day')
    return results
def _cached_topic_story_count(user_mc_key, topics_id, **kwargs):
    '''
    Internal helper - don't call this; call topic_story_count instead. This needs user_mc_key in the
    function signature to make sure the caching is keyed correctly.
    '''
    if user_mc_key == TOOL_API_KEY:
        local_mc = mc
    else:
        local_mc = user_mediacloud_client()
    return local_mc.topicStoryCount(topics_id, **kwargs)
def _cached_topic_tag_counts(user_mc_key, topics_id, tag_sets_id, sample_size, query):
    user_mc = user_mediacloud_client()
    # we don't need to use topics_id here because the timespans_id is in the query argument
    tag_counts = user_mc.storyTagCount(query, tag_sets_id=tag_sets_id)
    # filter out bad themes; build a new list rather than calling remove() on
    # the list being iterated over, which skips elements
    tag_counts = [t for t in tag_counts if not is_bad_theme(t['tags_id'])]
    return tag_counts
def story_info(stories_id):
    user_mc = user_mediacloud_client()
    admin_mc = user_admin_mediacloud_client()
    if stories_id in [None, 'NaN']:
        return jsonify({'error': 'bad value'})
    if 'text' in request.args and request.args['text'] == 'true':
        story = admin_mc.story(stories_id, text=True)
    else:
        story = user_mc.story(stories_id)
    story["media"] = user_mc.media(story["media_id"])
    return jsonify({'info': story})
def api_topics_preview_story_sample():
    user_mc = user_mediacloud_client()

    solr_query = concatenate_query_for_solr(
        solr_seed_query=request.form['q'],
        media_ids=ids_from_comma_separated_str(request.form['sources[]']) if 'sources[]' in request.form else None,
        tags_ids=ids_from_comma_separated_str(request.form['collections[]']) if 'collections[]' in request.form else None)

    fq = concatenate_solr_dates(start_date=request.form['start_date'],
                                end_date=request.form['end_date'])
    num_stories = request.form['rows']
    story_count_result = user_mc.storyList(solr_query=solr_query, solr_filter=fq, sort=user_mc.SORT_RANDOM, rows=num_stories)
    return jsonify(story_count_result)
def _cached_last_year_split_story_count(q='*'):
    # sources are open to everyone, so no need for user-specific cache
    # Helper to fetch split story counts over a timeframe for an arbitrary query
    user_mc = user_mediacloud_client()
    last_n_days = 365
    start_date = datetime.date.today()-datetime.timedelta(last_n_days)
    end_date = datetime.date.today()-datetime.timedelta(1)  # yesterday
    fq = user_mc.publish_date_query(start_date, end_date)
    results = user_mc.storyCount(solr_query=q, solr_filter=fq, split=True, split_period='day')
    results['counts'] = add_missing_dates_to_split_story_counts(results['counts'], start_date, end_date)
    results['total_story_count'] = sum([r['count'] for r in results['counts']])
    return results
def _cached_sentence_list(mc_api_key, q, fq, rows, include_stories=True):
    # need to get an admin client with the tool key so they have sentence read permissions
    tool_mc = user_admin_mediacloud_client(mc_api_key)
    sentences = tool_mc.sentenceList(q, fq)[:rows]
    stories_id_list = [str(s['stories_id']) for s in sentences]
    if (len(stories_id_list) > 0) and include_stories:
        # this is the fastest way to get a list of stories by id
        stories = user_mediacloud_client().storyList("stories_id:({})".format(" ".join(stories_id_list)))
        stories_by_id = {s['stories_id']: s for s in stories}  # build a quick lookup table by stories_id
        for s in sentences:
            s['story'] = stories_by_id[s['stories_id']]
    return sentences
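# The bulk lookup above leans on Solr field grouping: one storyList call with a
# query like "stories_id:(1 2 3)" fetches all three stories at once instead of
# issuing one API request per stories_id.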
def _remove_word_source_from_network(ms_name, word_list):
    user_mc = user_mediacloud_client()
    ms = user_mc.mediaList(name_like=ms_name)
    if len(ms) == 1:
        try:
            del word_list[ms[0]['media_id']]
        except KeyError:
            logger.debug('Media Source not present in list.')
    elif len(ms) == 0:
        logger.debug('No match for %s.' % ms_name)
    else:
        logger.debug('Multiple matches for Media Source. No action taken.')
def does_user_have_a_running_topic():
    user_mc = user_mediacloud_client()
    queued_and_running_topics = []
    more_topics = True
    link_id = None
    while more_topics:
        results = user_mc.topicList(link_id=link_id, limit=100)
        topics = results['topics']
        queued_and_running_topics += [t for t in topics if t['state'] in ['running', 'queued']
                                      and t['user_permission'] in ['admin']]
        more_topics = 'next' in results['link_ids']
        if more_topics:
            link_id = results['link_ids']['next']
    return jsonify(queued_and_running_topics)
def api_sources_name_exists():
    '''Check if a source with this name/url already exists
    :return: boolean indicating whether a source with this name exists (case-insensitive check)
    '''
    mc = user_mediacloud_client()
    search_str = request.args['searchStr']
    id = int(request.args['id']) if 'id' in request.args else None
    matching_sources = mc.mediaList(name_like=search_str)[:MAX_SOURCES]
    if id:
        matching_source_names = [s['name'].lower().strip() for s in matching_sources
                                 if s['media_id'] != id and s['name'].strip().lower() != search_str.strip().lower()]
    else:
        matching_source_names = [s['name'].lower().strip() for s in matching_sources]

    name_in_use = search_str.lower() in matching_source_names
    return jsonify({'nameInUse': name_in_use})
def api_collection_details(collection_id):
    add_in_sources = False
    if ('getSources' in request.args) and (request.args['getSources'] == 'true'):
        add_in_sources = True

    user_mc = user_mediacloud_client()
    info = user_mc.tag(collection_id)
    add_user_favorite_flag_to_collections([info])
    info['id'] = collection_id
    info['tag_set'] = _tag_set_info(user_mediacloud_key(), info['tag_sets_id'])
    if add_in_sources:
        media_in_collection = media_with_tag(user_mediacloud_key(), collection_id)
        info['sources'] = media_in_collection
    analytics_db.increment_count(analytics_db.TYPE_COLLECTION, collection_id, analytics_db.ACTION_SOURCE_MGR_VIEW)
    return jsonify({'results': info})
def api_collections_name_exists():
    '''Check if a collection with this name already exists
    :return: boolean indicating whether a collection with this name exists (case-insensitive check)
    '''
    mc = user_mediacloud_client()
    search_str = request.args['searchStr']
    id = int(request.args['id']) if 'id' in request.args else None
    matching_collections = mc.tagList(name_like=search_str)[:MAX_SOURCES]
    if id:
        matching_collections_names = [s['label'].lower().strip() for s in matching_collections if s['tags_id'] != id]
    else:
        matching_collections_names = [s['label'].lower().strip() for s in matching_collections]

    name_in_use = search_str.lower() in matching_collections_names
    return jsonify({'nameInUse': name_in_use})
def story(topics_id, stories_id):
    if is_user_logged_in():
        local_mc = user_mediacloud_client()
        story_topic_info = apicache.topic_story_list(user_mediacloud_key(), topics_id, stories_id=stories_id)
        story_topic_info = story_topic_info['stories'][0]
        '''
        all_fb_count = []
        more_fb_count = True
        link_id = 0
        while more_fb_count:
            fb_page = local_mc.topicStoryListFacebookData(topics_id, limit=100, link_id=link_id)

            all_fb_count = all_fb_count + fb_page['counts']
            if 'next' in fb_page['link_ids']:
                link_id = fb_page['link_ids']['next']
                more_fb_count = True
            else:
                more_fb_count = False

        for fb_item in all_fb_count:
            if int(fb_item['stories_id']) == int(stories_id):
                story_topic_info['facebook_collection_date'] = fb_item['facebook_api_collect_date']
        '''
    else:
        return jsonify({'status': 'Error', 'message': 'Invalid attempt'})

    try:
        story_info = local_mc.story(stories_id)  # add in other fields from regular call
        for k in story_info.keys():
            story_topic_info[k] = story_info[k]
        for tag in story_info['story_tags']:
            if tag['tag_sets_id'] == tag_util.GEO_TAG_SET:
                geonames_id = int(tag['tag'][9:])
                try:
                    tag['geoname'] = _cached_geoname(geonames_id)
                except Exception as e:
                    # query to CLIFF failed :-( handle it gracefully
                    logger.exception(e)
                    tag['geoname'] = {}
    except MCException:
        logger.warning("Story {} wasn't found in a regular story API call, but is it topic {}".format(
            stories_id, topics_id
        ))
    return jsonify(story_topic_info)
def api_collection_sources_feed_status_csv(collection_id, source_type):
    user_mc = user_mediacloud_client()
    collection = user_mc.tag(collection_id)
    list_type = str(source_type).lower()
    media_in_collection = media_with_tag(user_mediacloud_key(), collection_id)
    media_info_in_collection = _fetch_collection_source_feed_info(media_in_collection)
    if list_type == 'review':
        filtered_media = [m for m in media_info_in_collection if m['active_feed_count'] > 0 and m['num_stories_90'] == 0 and m['num_stories_last_year'] > 0]
    elif list_type == 'remove':
        filtered_media = [m for m in media_info_in_collection if m['active_feed_count'] > 0 and m['num_stories_90'] == 0 and m['num_stories_last_year'] == 0 and m['latest_scrape_job.state'] == 'failed']
    elif list_type == 'unscrapeable':
        filtered_media = [m for m in media_info_in_collection if m['active_feed_count'] == 0 and m['num_stories_90'] > 0]
    elif list_type == 'working':
        filtered_media = [m for m in media_info_in_collection if m['active_feed_count'] > 0 and m['num_stories_last_year'] > 0]
    else:
        filtered_media = media_info_in_collection
    file_prefix = "Collection {} ({}) - sources feed {}".format(collection_id, collection['tag'], source_type)
    properties_to_include = SOURCE_FEED_LIST_CSV_PROPS
    return csv.download_media_csv(filtered_media, file_prefix, properties_to_include)
def _cached_topic_split_story_counts(user_mc_key, topics_id, **kwargs):
    '''
    Internal helper - don't call this; call topic_split_story_counts instead. This needs user_mc_key in the
    function signature to make sure the caching is keyed correctly.
    '''
    local_mc = None
    if user_mc_key == TOOL_API_KEY:
        local_mc = mc
    else:
        local_mc = user_mediacloud_client()

    results = local_mc.topicStoryCount(topics_id,
                                       split=True,
                                       **kwargs)
    total_stories = 0
    for c in results['counts']:
        total_stories += c['count']
    results['total_story_count'] = total_stories
    return results
def _cached_collection_source_representation(mc_api_key, collection_id):
    # have to respect the api key here because only some folks can see private collections
    user_mc = user_mediacloud_client(mc_api_key)
    sample_size = 1000
    stories = user_mc.storyList('tags_id_media:{}'.format(collection_id), rows=sample_size, sort=mc.SORT_RANDOM)
    media_representation = {}
    for s in stories:
        if s['media_id'] not in media_representation:
            media_representation[s['media_id']] = {
                'media_id': s['media_id'],
                'media_name': s['media_name'],
                'media_url': s['media_url'],
                'sample_size': sample_size,
                'stories': 0
            }
        media_representation[s['media_id']]['stories'] += 1
    for media_id in media_representation:
        media_representation[media_id]['story_pct'] = float(media_representation[media_id]['stories']) / float(
            sample_size)
    return sorted(list(media_representation.values()), key=operator.itemgetter('stories'))
def create_media_type_focal_set(topics_id):
    user_mc = user_mediacloud_client()
    # grab the focalSetName and focalSetDescription and then make one
    focal_set_name = request.form['focalSetName']
    focal_set_description = request.form['focalSetDescription']
    media_type_tags = tags_in_tag_set(TOOL_API_KEY, TAG_SETS_ID_MEDIA_TYPE)
    focal_technique = FOCAL_TECHNIQUE_BOOLEAN_QUERY
    new_focal_set = user_mc.topicFocalSetDefinitionCreate(topics_id, focal_set_name, focal_set_description, focal_technique)
    if 'focal_set_definitions_id' not in new_focal_set:
        return json_error_response('Unable to create the subtopic set')
    # now make the foci in it - one for each media type
    focus_def_results = []
    for tag in media_type_tags:
        params = {
            'name': tag['label'],
            'description': "Stories from {} sources".format(tag['label']),
            'query': "tags_id_media:{}".format(tag['tags_id']),
            'focal_set_definitions_id': new_focal_set['focal_set_definitions_id'],
        }
        result = user_mc.topicFocusDefinitionCreate(topics_id, **params)
        focus_def_results.append(result)
    return {'success': True}
def cached_source_story_count(query):
    # sources are open to everyone, so no need for user-specific cache
    user_mc = user_mediacloud_client()
    return user_mc.storyCount(query)['count']
def _tag_set_info(user_mc_key, tag_sets_id):
    user_mc = user_mediacloud_client()
    return user_mc.tagSet(tag_sets_id)
def collection_search(search_str, public_only, tag_sets_id_list):
    user_mc = user_mediacloud_client()
    return user_mc.tagList(tag_sets_id_list, public_only=public_only, name_like=search_str)
def media_search(search_str, tags_id=None):
    mc = user_mediacloud_client()
    return mc.mediaList(name_like=search_str, tags_id=tags_id, rows=MAX_SOURCES, sort="num_stories")
def _cached_timeperiod_story_count(q='*', time_period=QUERY_LAST_MONTH):
    # sources are open to everyone, so no need for user-specific cache
    # Helper to fetch split story counts over a timeframe for an arbitrary query
    user_mc = user_mediacloud_client()
    results = user_mc.storyCount(solr_query=q, solr_filter=time_period)
    return results
def get_topic_media_links_csv(topics_id):
    user_mc = user_mediacloud_client()
    topic = user_mc.topic(topics_id)
    # page through results for timespans
    return stream_media_link_list_csv(user_mediacloud_key(), topic['name'] + '-stories', topics_id)