Example #1
def fetch_topic_url(db: DatabaseHandler, topic_fetch_urls_id: int, domain_timeout: typing.Optional[int] = None) -> None:
    """Fetch a url for a topic and create a media cloud story from it if its content matches the topic pattern.

    Update the following fields in the topic_fetch_urls row:

    code - the status code of the http response
    fetch_date - the current time
    state - one of the FETCH_STATE_* constants
    message - message related to the state (e.g. HTTP message for FETCH_STATE_REQUEST_FAILED)
    stories_id - the id of the story generated from the fetched content, or null if no story was created

    If topic_links_id is present in the topic_fetch_url and if a story was added or matched, assign the resulting
    topic_fetch_urls.stories_id to topic_links.ref_stories_id.

    If the state is anything but FETCH_STATE_PENDING or FETCH_STATE_REQUEUED, return without doing anything.

    If there is content for the corresponding url and topics_id in topic_seed_urls, use that content instead of
    fetching the url.

    This function catches almost all possible exceptions and stashes them in topic_fetch_urls along with a state of
    FETCH_STATE_PYTHON_ERROR.

    Arguments:
    db - db handle
    topic_fetch_urls_id - id of topic_fetch_urls row
    domain_timeout - pass through to fetch_link

    Returns:
    None

    """
    topic_fetch_url = db.require_by_id('topic_fetch_urls', topic_fetch_urls_id)

    try:
        log.info("fetch_link: %s" % topic_fetch_url['url'])
        _try_fetch_topic_url(db=db, topic_fetch_url=topic_fetch_url, domain_timeout=domain_timeout)

        if 'stories_id' in topic_fetch_url and topic_fetch_url['stories_id'] is not None:
            story = db.require_by_id('stories', topic_fetch_url['stories_id'])
            topic = db.require_by_id('topics', topic_fetch_url['topics_id'])
            redirect_url = topic_fetch_url['url']
            assume_match = topic_fetch_url['assume_match']
            if _is_not_topic_story(db, topic_fetch_url):
                if _story_matches_topic(db, story, topic, redirect_url=redirect_url, assume_match=assume_match):
                    _add_to_topic_stories(db, story, topic)

        if topic_fetch_url['topic_links_id'] and topic_fetch_url['stories_id']:
            try_update_topic_link_ref_stories_id(db, topic_fetch_url)

    except McThrottledDomainException as ex:
        raise ex

    except Exception as ex:
        log.error("Error while fetching URL {}: {}".format(topic_fetch_url, ex))

        topic_fetch_url['state'] = FETCH_STATE_PYTHON_ERROR
        topic_fetch_url['message'] = traceback.format_exc()
        log.warning('topic_fetch_url %s failed: %s' % (topic_fetch_url['url'], topic_fetch_url['message']))

    db.update_by_id('topic_fetch_urls', topic_fetch_url['topic_fetch_urls_id'], topic_fetch_url)
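
A minimal usage sketch for fetch_topic_url above. connect_to_db(), the 60-second timeout, and the driver loop itself are illustrative assumptions; in the real system pending rows are normally handed to this function by a queued job rather than a loop like this.

db = connect_to_db()  # assumed helper returning a DatabaseHandler

pending_ids = db.query(
    "SELECT topic_fetch_urls_id FROM topic_fetch_urls WHERE state = %(state)s",
    {'state': FETCH_STATE_PENDING}
).flat()

for topic_fetch_urls_id in pending_ids:
    # Each call updates code, fetch_date, state, message and stories_id on its row.
    fetch_topic_url(db, topic_fetch_urls_id, domain_timeout=60)
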
def _store_posts_for_day(db: DatabaseHandler, topic_post_day: dict, posts: list) -> None:
    """
    Store posts for a single day.

    Arguments:
    db - db handle
    topic_post_day - topic_post_day dict
    posts - list of posts found for day

    Return:
    None
    """
    log.info("adding %d posts for day %s" % (len(posts), topic_post_day['day']))

    tsq = db.require_by_id('topic_seed_queries', topic_post_day['topic_seed_queries_id'])
    topic = db.require_by_id('topics', tsq['topics_id'])
    posts = list(filter(lambda p: content_matches_topic(p['content'], topic), posts))

    num_posts_fetched = len(posts)

    log.info(f"{num_posts_fetched} posts remaining after match")

    db.begin()

    db.query("SET LOCAL citus.multi_shard_modify_mode TO 'sequential'")

    log.debug("inserting into topic_posts ...")

    [_store_post_and_urls(db, topic_post_day, meta_tweet) for meta_tweet in posts]

    db.query(
        """
        UPDATE topic_post_days SET
            posts_fetched = true,
            num_posts_stored = %(num_posts_stored)s,
            num_posts_fetched = %(num_posts_fetched)s
        WHERE
            topics_id = %(topics_id)s AND
            topic_post_days_id = %(topic_post_days_id)s
        """,
        {
            'num_posts_stored': len(posts),
            'num_posts_fetched': num_posts_fetched,
            'topics_id': topic_post_day['topics_id'],
            'topic_post_days_id': topic_post_day['topic_post_days_id'],
        }
    )

    db.commit()

    log.debug("done inserting into topic_posts")
Example #3
def get_default_size_attribute(db: DatabaseHandler, timespans_id: int) -> str:
    """Return size attribute based on whether the timespan belongs to a url sharing subtopic."""
    timespan = db.require_by_id('timespans', timespans_id)

    if timespan['foci_id'] is None:
        return 'media_inlink_count'

    focus = db.require_by_id('foci', timespan['foci_id'])
    focal_set = db.require_by_id('focal_sets', focus['focal_sets_id'])

    if focal_set['focal_technique'] == 'URL Sharing':
        return 'author_count'
    else:
        return 'media_inlink_count'
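
A short usage sketch for get_default_size_attribute; connect_to_db() and the timespans_id value are illustrative assumptions.

db = connect_to_db()  # assumed helper returning a DatabaseHandler

# URL Sharing subtopics size nodes by author_count, everything else by media_inlink_count.
size_attribute = get_default_size_attribute(db, timespans_id=42)
log.info("sizing map nodes by %s" % size_attribute)
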
Example #4
def _store_posts_for_day(db: DatabaseHandler, topic_post_day: dict,
                         posts: list) -> None:
    """
    Store posts for a single day.

    Arguments:
    db - db handle
    topic_post_day - topic_post_day dict
    posts - list of posts found for day

    Return:
    None
    """
    log.info("adding %d posts for day %s" %
             (len(posts), topic_post_day['day']))

    tsq = db.require_by_id('topic_seed_queries',
                           topic_post_day['topic_seed_queries_id'])
    topic = db.require_by_id('topics', tsq['topics_id'])
    posts = list(
        filter(lambda p: content_matches_topic(p['content'], topic), posts))

    num_posts_fetched = len(posts)

    log.info(f"{num_posts_fetched} posts remaining after match")

    db.begin()

    log.debug("inserting into topic_posts ...")

    [
        _store_post_and_urls(db, topic_post_day, meta_tweet)
        for meta_tweet in posts
    ]

    db.query(
        """
        update topic_post_days set posts_fetched = true, num_posts_stored = %(a)s, num_posts_fetched = %(b)s
            where topic_post_days_id = %(c)s
        """, {
            'a': len(posts),
            'b': num_posts_fetched,
            'c': topic_post_day['topic_post_days_id']
        })

    db.commit()

    log.debug("done inserting into topic_posts")
Example #5
    def update_job_state_args(self, db: DatabaseHandler,
                              args: Dict[str, Any]) -> None:
        """Update the args field for the current "job_states" row."""
        args = decode_object_from_bytes_if_needed(args)

        job_state = db.require_by_id(table='job_states',
                                     object_id=self.__job_states_id)

        try:

            # job_states.args got changed from JSON to JSONB while sharding the
            # database, and there's no way to disable decoding JSONB (as
            # opposed to JSON) in psycopg2, so "args" might be a JSON string or
            # a pre-decoded dictionary
            maybe_json_db_args = job_state.get('args', '')
            if isinstance(maybe_json_db_args, dict):
                db_args = maybe_json_db_args
            else:
                db_args = decode_json(maybe_json_db_args)

        except Exception as ex:
            log.error(
                f"Unable to decode args from job state {job_state}: {ex}")
            db_args = {}

        db_args = {**db_args, **args}

        args_json = encode_json(db_args)

        db.update_by_id(table='job_states',
                        object_id=self.__job_states_id,
                        update_hash={
                            'args': args_json,
                        })
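
The JSON-versus-JSONB guard above is the reusable part of this method. A self-contained sketch of the same pattern using the standard json module (decode_json/encode_json in these examples are assumed to be thin wrappers around it):

import json
from typing import Any, Dict, Union

def merge_job_args(stored: Union[str, Dict[str, Any], None], new_args: Dict[str, Any]) -> str:
    """Merge new args into whatever the driver returned: a JSONB column may come
    back pre-decoded as a dict, while a JSON column comes back as a string."""
    try:
        db_args = stored if isinstance(stored, dict) else json.loads(stored or '{}')
    except Exception:
        db_args = {}

    return json.dumps({**db_args, **new_args})

# merge_job_args('{"attempt": 1}', {'attempt': 2, 'stage': 'fetch'})
#   -> '{"attempt": 2, "stage": "fetch"}'
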
Example #6
def _get_deduped_medium(db: DatabaseHandler, media_id: int) -> dict:
    """Get either the referenced medium or the deduped version of the medium by recursively following dup_media_id."""
    medium = db.require_by_id('media', media_id)
    if medium['dup_media_id'] is None:
        return medium
    else:
        return _get_deduped_medium(db, medium['dup_media_id'])
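
The recursion in _get_deduped_medium can equally be written as a loop; a behavior-preserving sketch (like the recursive version, it assumes dup_media_id chains are acyclic):

def _get_deduped_medium_iterative(db: DatabaseHandler, media_id: int) -> dict:
    # Follow dup_media_id pointers until reaching a medium that is not marked as a duplicate.
    medium = db.require_by_id('media', media_id)
    while medium['dup_media_id'] is not None:
        medium = db.require_by_id('media', medium['dup_media_id'])
    return medium
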
def regenerate_post_urls(db: DatabaseHandler, topic: dict) -> None:
    """Reparse the tweet json for a given topic and try to reinsert all tweet urls."""
    topic_posts_ids = db.query(
        """
        SELECT
            topic_posts.topic_posts_id
        FROM topic_posts
            INNER JOIN topic_post_days ON
                topic_posts.topics_id = topic_post_days.topics_id AND
                topic_posts.topic_post_days_id = topic_post_days.topic_post_days_id
            INNER JOIN topic_seed_queries ON
                topic_post_days.topics_id = topic_seed_queries.topics_id AND
                topic_post_days.topic_seed_queries_id = topic_seed_queries.topic_seed_queries_id
        WHERE
            topics_id = %(topics_id)s
        """, {
            'topics_id': topic['topics_id'],
        }
    ).flat()

    for (i, topic_posts_id) in enumerate(topic_posts_ids):
        if i % 1000 == 0:
            log.info('regenerate tweet urls: %d/%d' % (i, len(topic_posts_ids)))

        topic_post = db.require_by_id('topic_posts', topic_posts_id)
        data = decode_json(topic_post['data'])
        urls = get_tweet_urls(data['data']['tweet'])
        _insert_post_urls(db, topic_post, urls)
def get_story_date_tag(db: DatabaseHandler, story: dict) -> Optional[tuple]:
    """Return the tag tag_sets dict associated with the story guess method tag sets."""
    tags = db.query(
        """
        select t.*
            from tags t
                join tag_sets ts using ( tag_sets_id )
                join stories_tags_map stm using ( tags_id )
            where
                ts.name = any(%(a)s) and
                stm.stories_id = %(b)s
        """, {
            'a': [GUESS_METHOD_TAG_SET, INVALID_TAG_SET],
            'b': story['stories_id']
        }).hashes()

    assert len(tags) < 2

    if len(tags) == 1:
        tag = tags[0]
    else:
        return None, None

    tag_set = db.require_by_id('tag_sets', tag['tag_sets_id'])

    return tag, tag_set
Example #9
def fetch_topic_posts(db: DatabaseHandler, topic_seed_query: dict) -> None:
    """For each day within the topic dates, fetch and store posts returned by the topic_seed_query.

    This is the core function that fetches and stores data for sharing topics.  This function will break the
    date range for the topic into individual days and fetch posts matching the topic_seed_query
    for each day.  This function will create a topic_post_day row for each day of posts fetched,
    a topic_post row for each post fetched, and a topic_post_url row for each url found in a post.

    Arguments:
    db - database handle
    topic_seed_query - topic_seed_queries row dict

    Return:
    None
    """
    topic = db.require_by_id('topics', topic_seed_query['topics_id'])

    date = datetime.datetime.strptime(topic['start_date'], '%Y-%m-%d')

    end_date = datetime.datetime.strptime(topic['end_date'], '%Y-%m-%d')
    while date <= end_date:
        log.debug("fetching posts for %s" % date)
        if not _topic_post_day_fetched(db, topic_seed_query, date):
            posts = fetch_posts(topic_seed_query, date)
            topic_post_day = _add_topic_post_single_day(
                db, topic_seed_query, len(posts), date)
            _store_posts_for_day(db, topic_post_day, posts)

        date = date + datetime.timedelta(days=1)
Example #11
    def update_job_state_message(self, db: DatabaseHandler, message: str) -> None:
        """
        Update the message field for the current "job_states" row.

        This is a public method intended to be used by code anywhere up the stack from run() to publish
        messages updating the progress of a long-running job.
        """
        message = decode_object_from_bytes_if_needed(message)

        # Verify that the job_states row exists before updating it
        db.require_by_id(table='job_states', object_id=self.__job_states_id)

        job_state = db.update_by_id(table='job_states', object_id=self.__job_states_id, update_hash={
            'message': message,
            'last_updated': sql_now(),
        })

        self.__update_table_state(db=db, job_state=job_state)
Example #12
def _fetch_tweets_for_day(db: DatabaseHandler,
                          topic_tweet_day: dict,
                          meta_tweets: list,
                          max_tweets: typing.Optional[int] = None) -> None:
    """
    Fetch tweets for a single day.

    If tweets_fetched is false for the given topic_tweet_days row, fetch the tweets for the given day by querying
    the list of tweets and then fetching each tweet from twitter.

    Arguments:
    db - db handle
    topic_tweet_day - topic_tweet_day dict
    meta_tweets - list of meta tweets found for day
    max_tweets - max tweets to fetch for a single day

    Return:
    None
    """
    if (max_tweets is not None):
        meta_tweets = meta_tweets[0:max_tweets]

    topics_id = topic_tweet_day['topics_id']
    log.info("adding %d tweets for topic %s, day %s" %
             (len(meta_tweets), topics_id, topic_tweet_day['day']))

    # we can only get 100 posts at a time from twitter
    for i in range(0, len(meta_tweets), 100):
        _add_tweets_to_meta_tweets(meta_tweets[i:i + 100])

    topic = db.require_by_id('topics', topic_tweet_day['topics_id'])
    meta_tweets = list(
        filter(lambda p: _tweet_matches_pattern(topic, p), meta_tweets))

    log.info("%d tweets remaining after match" % (len(meta_tweets)))

    db.begin()

    log.debug("inserting into topic_tweets ...")

    [
        _store_tweet_and_urls(db, topic_tweet_day, meta_tweet)
        for meta_tweet in meta_tweets
    ]

    topic_tweet_day['num_tweets'] = len(meta_tweets)

    db.query(
        "update topic_tweet_days set tweets_fetched = true, num_tweets = %(a)s where topic_tweet_days_id = %(b)s",
        {
            'a': topic_tweet_day['num_tweets'],
            'b': topic_tweet_day['topic_tweet_days_id']
        })

    db.commit()

    log.debug("done inserting into topic_tweets")
Example #13
def _get_dup_story_groups(db: DatabaseHandler, topic: dict) -> list:
    """Return a list of duplicate story groups.

    Find all stories within a topic that have duplicate normalized titles within a given day and media_id.  Return a
    list of story lists.  Each story list is a list of stories that are duplicates of each other.
    """
    story_pairs = db.query(
        """
            SELECT
                a.stories_id AS stories_id_a,
                b.stories_id AS stories_id_b
            FROM
                snap.live_stories AS a,
                snap.live_stories AS b
            WHERE
                a.topics_id = %(topics_id)s AND
                a.topics_id = b.topics_id AND
                a.stories_id < b.stories_id AND
                a.media_id = b.media_id AND
                a.normalized_title_hash = b.normalized_title_hash AND
                date_trunc('day', a.publish_date) = date_trunc('day', b.publish_date)
            ORDER BY
                stories_id_a,
                stories_id_b
        """,
        {
            'topics_id': topic['topics_id'],
        }
    ).hashes()

    story_groups = {}
    ignore_stories = {}
    for story_pair in story_pairs:
        if story_pair['stories_id_b'] in ignore_stories:
            continue

        story_a = db.require_by_id('stories', story_pair['stories_id_a'])
        story_b = db.require_by_id('stories', story_pair['stories_id_b'])

        story_groups.setdefault(story_a['stories_id'], [story_a])
        story_groups[story_a['stories_id']].append(story_b)

        ignore_stories[story_b['stories_id']] = True

    return list(story_groups.values())
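
The pair-to-group accumulation at the end of _get_dup_story_groups is easier to follow on plain ids; a minimal sketch of the same setdefault/ignore pattern with toy data and no database:

def group_duplicate_pairs(pairs: list) -> list:
    """Given (a, b) pairs with a < b, return lists of ids belonging to the same
    duplicate group, keyed by the smallest member encountered first."""
    groups = {}
    ignore = set()
    for a, b in pairs:
        if b in ignore:
            continue
        groups.setdefault(a, [a]).append(b)
        ignore.add(b)
    return list(groups.values())

# group_duplicate_pairs([(1, 2), (1, 3), (4, 5)]) -> [[1, 2, 3], [4, 5]]
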
Example #14
def fetch_topic_tweets(
        db: DatabaseHandler,
        topics_id: int,
        max_tweets_per_day: typing.Optional[int] = None) -> None:
    """For each day within the topic dates, fetch and store the tweets.

    This is the core function that fetches and stores data for twitter topics.  This function will break the
    date range for the topic into individual days and fetch tweets matching the twitter seed query for the
    topic for each day.  This function will create a topic_tweet_day row for each day of tweets fetched,
    a topic_tweet row for each tweet fetched, and a topic_tweet_url row for each url found in a tweet.

    This function pulls metadata about the matching tweets from a search source (such as crimson hexagon or
    archive.org, as determined by the topic_seed_queries.source field) and then fetches the tweets returned
    by the search from the twitter api in batches of 100.

    Arguments:
    db - database handle
    topics_id - topic id
    max_tweets_per_day - max tweets to fetch each day

    Return:
    None
    """
    topic = db.require_by_id('topics', topics_id)

    if topic['platform'] != 'twitter':
        raise (
            McFetchTopicTweetsDataException("Topic platform is not 'twitter'"))

    date = datetime.datetime.strptime(topic['start_date'], '%Y-%m-%d')
    end_date = datetime.datetime.strptime(topic['end_date'], '%Y-%m-%d')
    while date <= end_date:
        try:
            log.info("fetching tweets for %s" % date)
            if not _topic_tweet_day_fetched(db, topic, date):
                meta_tweets = fetch_meta_tweets(db, topic, date)
                topic_tweet_day = _add_topic_tweet_single_day(
                    db, topic, len(meta_tweets), date)
                _fetch_tweets_for_day(db, topic_tweet_day, meta_tweets,
                                      max_tweets_per_day)
        except McFetchTopicTweetDateFetchedException:
            pass

        date = date + datetime.timedelta(days=1)
def fetch_topic_tweets(
        db: DatabaseHandler,
        topics_id: int,
        twitter_class: typing.Type[AbstractTwitter] = Twitter,
        ch_class: typing.Type[AbstractCrimsonHexagon] = CrimsonHexagon
) -> None:
    """
    Fetch list of tweets within a Crimson Hexagon monitor based on the ch_monitor_id of the given topic.

    Crimson Hexagon returns up to 10k randomly sampled tweets per posts fetch, and each posts fetch can be restricted
    down to a single day.  This call fetches tweets from CH day by day, up to a total of 1 million tweets for a single
    topic for the whole date range combined.  The call normalizes the number of tweets returned for each day so that
    each day has the same percentage of all tweets found on that day.  So if there were 20,000 tweets found on the
    busiest day, each day will use at most 50% of the returned tweets for the day.

    One call to this function takes care of both fetching the list of all tweets from CH and fetching each of those
    tweets from twitter (CH does not provide the tweet content, only the url).  Each day's worth of tweets will be
    recorded in topic_tweet_days, and subsequent calls to the function will not refetch a given day for a given topic,
    but each call will fetch any days newly included in the date range of the topic if the topic's dates change.

    If there is no ch_monitor_id for the topic, do nothing.

    Arguments:
    db - db handle
    topics_id - topic id
    twitter_class - optional implementation of AbstractTwitter class;
        defaults to one that fetches data from twitter with config from mediawords.yml
    ch_class - optional implementation of AbstractCrimsonHexagon class;
        defaults to one that fetches data from crimson hexagon with config from mediawords.yml

    Return:
    None
    """
    topic = db.require_by_id('topics', topics_id)
    ch_monitor_id = topic['ch_monitor_id']

    if ch_monitor_id is None:
        log.debug(
            "returning after noop because topic topics_id has a null ch_monitor_id"
        )
        return

    _add_topic_tweet_days(db, topic, twitter_class, ch_class)
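
A worked sketch of the per-day normalization described in the docstring above. The 10,000 figure is the Crimson Hexagon per-fetch sample cap the docstring mentions; the function name and numbers are illustrative, not the project's actual code.

CH_SAMPLE_CAP = 10000  # tweets returned per CH posts fetch, per the docstring

def tweets_to_keep(returned_for_day: int, found_on_busiest_day: int) -> int:
    # Keep the same fraction of returned tweets for every day, scaled by the
    # busiest day: 20,000 tweets found on the busiest day means every day keeps
    # at most 10,000 / 20,000 = 50% of the tweets returned for that day.
    ratio = min(1.0, CH_SAMPLE_CAP / found_on_busiest_day)
    return int(returned_for_day * ratio)

# tweets_to_keep(10000, 20000) == 5000
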
Example #16
    def update_job_state_args(self, db: DatabaseHandler, args: Dict[str, Any]) -> None:
        """Update the args field for the current "job_states" row."""
        args = decode_object_from_bytes_if_needed(args)

        job_state = db.require_by_id(table='job_states', object_id=self.__job_states_id)

        try:
            db_args = decode_json(job_state.get('args', '{}'))
        except Exception as ex:
            log.error(f"Unable to decode args from job state {job_state}: {ex}")
            db_args = {}

        db_args = {**db_args, **args}

        args_json = encode_json(db_args)

        db.update_by_id(table='job_states', object_id=self.__job_states_id, update_hash={
            'args': args_json,
        })
Example #17
def fetch_topic_posts(db: DatabaseHandler, topics_id: int) -> None:
    """For each day within the topic dates, fetch and store posts returned by the topic_seed_query.

    This is the core function that fetches and stores data for sharing topics.  This function will break the
    date range for the topic into individual days and fetch posts matching the topic_seed_query
    for each day.  This function will create a topic_post_day row for each day of posts fetched,
    a topic_post row for each post fetched, and a topic_post_url row for each url found in a post.

    Arguments:
    db - database handle
    topics_id - topic id

    Return:
    None
    """
    topic = db.require_by_id('topics', topics_id)

    if topic['mode'] != 'url_sharing':
        raise McFetchTopicPostsDataException("Topic mode is not 'url_sharing'")

    topic_seed_queries = db.query(
        "select * from topic_seed_queries where topics_id = %(a)s", {
            'a': topics_id
        }).hashes()

    if not len(topic_seed_queries) == 1:
        raise McFetchTopicPostsDataException(
            "Topic must have exactly one topic_seed_queries row")

    topic_seed_query = topic_seed_queries[0]

    date = datetime.datetime.strptime(topic['start_date'], '%Y-%m-%d')
    end_date = datetime.datetime.strptime(topic['end_date'], '%Y-%m-%d')
    log.warning("%s - %s" % (str(date), str(end_date)))
    while date <= end_date:
        log.debug("fetching posts for %s" % date)
        if not _topic_post_day_fetched(db, topic, date):
            posts = fetch_posts(topic_seed_query, date)
            topic_post_day = _add_topic_post_single_day(
                db, topic, len(posts), date)
            _store_posts_for_day(db, topic_post_day, posts)

        date = date + datetime.timedelta(days=1)
def fetch_topic_tweets(
        db: DatabaseHandler,
        topics_id: int,
        twitter_class: typing.Type[AbstractTwitter] = Twitter,
        ch_class: typing.Type[AbstractCrimsonHexagon] = CrimsonHexagon) -> None:
    """
    Fetch list of tweets within a Crimson Hexagon monitor based on the ch_monitor_id of the given topic.

    Crimson Hexagon returns up to 10k randomly sampled tweets per posts fetch, and each posts fetch can be restricted
    down to a single day.  This call fetches tweets from CH day by day, up to a total of 1 million tweets for a single
    topic for the whole date range combined.  The call normalizes the number of tweets returned for each day so that
    each day has the same percentage of all tweets found on that day.  So if there were 20,000 tweets found on the
    busiest day, each day will use at most 50% of the returned tweets for the day.

    One call to this function takes care of both fetching the list of all tweets from CH and fetching each of those
    tweets from twitter (CH does not provide the tweet content, only the url).  Each day's worth of tweets will be
    recorded in topic_tweet_days, and subsequent calls to the function will not refetch a given day for a given topic,
    but each call will fetch any days newly included in the date range of the topic if the topic's dates change.

    If there is no ch_monitor_id for the topic, do nothing.

    Arguments:
    db - db handle
    topics_id - topic id
    twitter_class - optional implementation of AbstractTwitter class;
        defaults to one that fetches data from twitter with config from mediawords.yml
    ch_class - optional implementation of AbstractCrimsonHexagon class;
        defaults to one that fetches data from crimson hexagon with config from mediawords.yml

    Return:
    None
    """
    topic = db.require_by_id('topics', topics_id)
    ch_monitor_id = topic['ch_monitor_id']

    if ch_monitor_id is None:
        log.debug("returning after noop because topic topics_id has a null ch_monitor_id")
        return

    _add_topic_tweet_days(db, topic, twitter_class, ch_class)
Example #19
def regenerate_post_urls(db: DatabaseHandler, topic: dict) -> None:
    """Reparse the tweet json for a given topic and try to reinsert all tweet urls."""
    topic_posts_ids = db.query(
        """
        select tt.topic_posts_id
            from topic_posts tt
                join topic_post_days ttd using ( topic_post_days_id )
            where
                topics_id = %(a)s
        """, {
            'a': topic['topics_id']
        }).flat()

    for (i, topic_posts_id) in enumerate(topic_posts_ids):
        if i % 1000 == 0:
            log.info('regenerate tweet urls: %d/%d' %
                     (i, len(topic_posts_ids)))

        topic_post = db.require_by_id('topic_posts', topic_posts_id)
        data = decode_json(topic_post['data'])
        urls = get_tweet_urls(data['data']['tweet'])
        _insert_post_urls(db, topic_post, urls)
def _try_fetch_topic_url(
        db: DatabaseHandler,
        topic_fetch_url: dict,
        domain_timeout: typing.Optional[int] = None) -> None:
    """Implement the logic of fetch_topic_url without the try: or the topic_fetch_url update."""

    log.warning("_try_fetch_topic_url: %s" % topic_fetch_url['url'])

    # don't reprocess already processed urls
    if topic_fetch_url['state'] not in (FETCH_STATE_PENDING, FETCH_STATE_REQUEUED):
        return

    _update_tfu_message(db, topic_fetch_url, "checking ignore links")
    if _ignore_link_pattern(topic_fetch_url['url']):
        topic_fetch_url['state'] = FETCH_STATE_IGNORED
        topic_fetch_url['code'] = 403
        return

    _update_tfu_message(db, topic_fetch_url, "checking failed url")
    failed_url = _get_failed_url(db, topic_fetch_url['topics_id'], topic_fetch_url['url'])
    if failed_url:
        topic_fetch_url['state'] = failed_url['state']
        topic_fetch_url['code'] = failed_url['code']
        topic_fetch_url['message'] = failed_url['message']
        return

    _update_tfu_message(db, topic_fetch_url, "checking self linked domain")
    if mediawords.tm.domains.skip_self_linked_domain(db, topic_fetch_url):
        topic_fetch_url['state'] = FETCH_STATE_SKIPPED
        topic_fetch_url['code'] = 403
        return

    topic = db.require_by_id('topics', topic_fetch_url['topics_id'])
    topic_fetch_url['fetch_date'] = datetime.datetime.now()

    story_match = None

    # this match is relatively expensive, so only do it on the first 'pending' request and not the potentially
    # spammy 'requeued' requests
    _update_tfu_message(db, topic_fetch_url, "checking story match")
    if topic_fetch_url['state'] == FETCH_STATE_PENDING:
        story_match = mediawords.tm.stories.get_story_match(db=db, url=topic_fetch_url['url'])

        # try to match the story before doing the expensive fetch
        if story_match is not None:
            topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
            topic_fetch_url['code'] = 200
            topic_fetch_url['stories_id'] = story_match['stories_id']
            return

    # check whether we want to delay fetching for another job, eg. fetch_twitter_urls
    pending_state = _get_pending_state(topic_fetch_url)
    if pending_state:
        topic_fetch_url['state'] = pending_state
        return

    # get content from either the seed or by fetching it
    _update_tfu_message(db, topic_fetch_url, "checking seeded content")
    response = _get_seeded_content(db, topic_fetch_url)
    if response is None:
        _update_tfu_message(db, topic_fetch_url, "fetching content")
        response = _fetch_url(db, topic_fetch_url['url'], domain_timeout=domain_timeout)
        log.debug("%d response returned for url: %s" % (response.code, topic_fetch_url['url']))
    else:
        log.debug("seeded content found for url: %s" % topic_fetch_url['url'])

    content = response.content

    fetched_url = topic_fetch_url['url']
    response_url = response.last_requested_url

    if fetched_url != response_url:
        if _ignore_link_pattern(response_url):
            topic_fetch_url['state'] = FETCH_STATE_IGNORED
            topic_fetch_url['code'] = 403
            return

        _update_tfu_message(db, topic_fetch_url, "checking story match for redirect_url")
        story_match = mediawords.tm.stories.get_story_match(db=db, url=fetched_url, redirect_url=response_url)

    topic_fetch_url['code'] = response.code

    assume_match = topic_fetch_url['assume_match']

    _update_tfu_message(db, topic_fetch_url, "checking content match")
    if not response.is_success:
        topic_fetch_url['state'] = FETCH_STATE_REQUEST_FAILED
        topic_fetch_url['message'] = response.message
    elif story_match is not None:
        topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
        topic_fetch_url['stories_id'] = story_match['stories_id']
    elif not content_matches_topic(content=content, topic=topic, assume_match=assume_match):
        topic_fetch_url['state'] = FETCH_STATE_CONTENT_MATCH_FAILED
    else:
        try:
            _update_tfu_message(db, topic_fetch_url, "generating story")
            url = response_url if response_url is not None else fetched_url
            story = mediawords.tm.stories.generate_story(db=db, content=content, url=url)

            topic_fetch_url['stories_id'] = story['stories_id']
            topic_fetch_url['state'] = FETCH_STATE_STORY_ADDED

        except mediawords.tm.stories.McTMStoriesDuplicateException:
            # may get a unique constraint error for the story addition within the media source.  that's fine
            # because it means the story is already in the database and we just need to match it again.
            _update_tfu_message(db, topic_fetch_url, "checking for story match on unique constraint error")
            topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
            story_match = mediawords.tm.stories.get_story_match(db=db, url=fetched_url, redirect_url=response_url)
            if story_match is None:
                raise McTMFetchLinkException("Unable to find matching story after unique constraint error.")
            topic_fetch_url['stories_id'] = story_match['stories_id']

    _update_tfu_message(db, topic_fetch_url, "_try_fetch_url done")
Example #21
def _try_fetch_topic_url(
        db: DatabaseHandler,
        topic_fetch_url: dict,
        domain_timeout: typing.Optional[int] = None) -> None:
    """Implement the logic of fetch_topic_url without the try: or the topic_fetch_url update."""

    log.warning("_try_fetch_topic_url: %s" % topic_fetch_url['url'])

    # don't reprocess already processed urls
    if topic_fetch_url['state'] not in (FETCH_STATE_PENDING, FETCH_STATE_REQUEUED):
        return

    _update_tfu_message(db, topic_fetch_url, "checking ignore links")
    if _ignore_link_pattern(topic_fetch_url['url']):
        topic_fetch_url['state'] = FETCH_STATE_IGNORE
        topic_fetch_url['code'] = 403
        return

    _update_tfu_message(db, topic_fetch_url, "checking failed url")
    failed_url = get_failed_url(db, topic_fetch_url['topics_id'], topic_fetch_url['url'])
    if failed_url:
        topic_fetch_url['state'] = failed_url['state']
        topic_fetch_url['code'] = failed_url['code']
        topic_fetch_url['message'] = failed_url['message']
        return

    _update_tfu_message(db, topic_fetch_url, "checking self linked domain")
    if mediawords.tm.domains.skip_self_linked_domain(db, topic_fetch_url):
        topic_fetch_url['state'] = FETCH_STATE_SKIPPED
        topic_fetch_url['code'] = 403
        return

    topic = db.require_by_id('topics', topic_fetch_url['topics_id'])
    topic_fetch_url['fetch_date'] = datetime.datetime.now()

    story_match = None

    # this match is relatively expensive, so only do it on the first 'pending' request and not the potentially
    # spammy 'requeued' requests
    _update_tfu_message(db, topic_fetch_url, "checking story match")
    if topic_fetch_url['state'] == FETCH_STATE_PENDING:
        story_match = mediawords.tm.stories.get_story_match(db=db, url=topic_fetch_url['url'])

        # try to match the story before doing the expensive fetch
        if story_match is not None:
            topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
            topic_fetch_url['code'] = 200
            topic_fetch_url['stories_id'] = story_match['stories_id']
            return

    # get content from either the seed or by fetching it
    _update_tfu_message(db, topic_fetch_url, "checking seeded content")
    response = get_seeded_content(db, topic_fetch_url)
    if response is None:
        _update_tfu_message(db, topic_fetch_url, "fetching content")
        response = fetch_url(db, topic_fetch_url['url'], domain_timeout=domain_timeout)
        log.debug("%d response returned for url: %s" % (response.code(), topic_fetch_url['url']))
    else:
        log.debug("seeded content found for url: %s" % topic_fetch_url['url'])

    content = response.decoded_content()

    fetched_url = topic_fetch_url['url']
    response_url = response.request().url() if response.request() else None

    if fetched_url != response_url:
        if _ignore_link_pattern(response_url):
            topic_fetch_url['state'] = FETCH_STATE_IGNORE
            topic_fetch_url['code'] = 403
            return

        _update_tfu_message(db, topic_fetch_url, "checking story match for redirect_url")
        story_match = mediawords.tm.stories.get_story_match(db=db, url=fetched_url, redirect_url=response_url)

    topic_fetch_url['code'] = response.code()

    assume_match = topic_fetch_url['assume_match']

    _update_tfu_message(db, topic_fetch_url, "checking content match")
    if not response.is_success():
        topic_fetch_url['state'] = FETCH_STATE_REQUEST_FAILED
        topic_fetch_url['message'] = response.message()
    elif story_match is not None:
        topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
        topic_fetch_url['stories_id'] = story_match['stories_id']
    elif not _content_matches_topic(content=content, topic=topic, assume_match=assume_match):
        topic_fetch_url['state'] = FETCH_STATE_CONTENT_MATCH_FAILED
    else:
        try:
            _update_tfu_message(db, topic_fetch_url, "generating story")
            url = response_url if response_url is not None else fetched_url
            story = mediawords.tm.stories.generate_story(db=db, content=content, url=url)

            topic_fetch_url['stories_id'] = story['stories_id']
            topic_fetch_url['state'] = FETCH_STATE_STORY_ADDED

        except mediawords.tm.stories.McTMStoriesDuplicateException:
            # may get a unique constraint error for the story addition within the media source.  that's fine
            # because it means the story is already in the database and we just need to match it again.
            _update_tfu_message(db, topic_fetch_url, "checking for story match on unique constraint error")
            topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
            story_match = mediawords.tm.stories.get_story_match(db=db, url=fetched_url, redirect_url=response_url)
            if story_match is None:
                raise McTMFetchLinkException("Unable to find matching story after unique constraint error.")
            topic_fetch_url['stories_id'] = story_match['stories_id']

    _update_tfu_message(db, topic_fetch_url, "_try_fetch_url done")
Example #22
def _try_fetch_topic_url(db: DatabaseHandler,
                         topic_fetch_url: dict,
                         domain_timeout: Optional[int] = None) -> None:
    """Implement the logic of fetch_topic_url without the try: or the topic_fetch_url update."""

    log.info(f"Trying to fetch topic URL {topic_fetch_url['url']}...")

    # don't reprocess already processed urls
    if topic_fetch_url['state'] not in (FETCH_STATE_PENDING,
                                        FETCH_STATE_REQUEUED):
        log.info(
            f"URL's state '{topic_fetch_url['state']}' is not pending or requeued, not refetching"
        )
        return

    log.info("Checking ignore links...")
    _update_tfu_message(db, topic_fetch_url, "checking ignore links")
    if _ignore_link_pattern(topic_fetch_url['url']):
        log.info("Link is to be ignored, returning")
        topic_fetch_url['state'] = FETCH_STATE_IGNORED
        topic_fetch_url['code'] = 403
        return

    log.info("Checking failed URL...")
    _update_tfu_message(db, topic_fetch_url, "checking failed url")
    failed_url = _get_failed_url(db, topic_fetch_url['topics_id'],
                                 topic_fetch_url['url'])
    if failed_url:
        log.info("URL is failed, returning")
        topic_fetch_url['state'] = failed_url['state']
        topic_fetch_url['code'] = failed_url['code']
        topic_fetch_url['message'] = failed_url['message']
        return

    log.info("Checking self-linked domain...")
    _update_tfu_message(db, topic_fetch_url, "checking self linked domain")
    if skip_self_linked_domain(db, topic_fetch_url):
        log.info("Link is self-linked domain, returning")
        topic_fetch_url['state'] = FETCH_STATE_SKIPPED
        topic_fetch_url['code'] = 403
        return

    log.info(f"Fetching topic {topic_fetch_url['topics_id']}...")
    topic = db.require_by_id('topics', topic_fetch_url['topics_id'])
    topic_fetch_url['fetch_date'] = datetime.datetime.now()

    story_match = None

    # this match is relatively expensive, so only do it on the first 'pending' request and not the potentially
    # spammy 'requeued' requests
    log.info("Checking story match...")
    _update_tfu_message(db, topic_fetch_url, "checking story match")
    if topic_fetch_url['state'] == FETCH_STATE_PENDING:
        log.info("URL is in pending state, getting story match...")
        story_match = get_story_match(db=db, url=topic_fetch_url['url'])

        # try to match the story before doing the expensive fetch
        if story_match is not None:
            log.info(f"Matched story {story_match['stories_id']}, returning")
            topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
            topic_fetch_url['code'] = 200
            topic_fetch_url['stories_id'] = story_match['stories_id']
            return

    # check whether we want to delay fetching for another job, eg. fetch_twitter_urls
    log.info("Checking for pending state...")
    pending_state = _get_pending_state(topic_fetch_url)
    if pending_state:
        log.info("URL is in pending state, returning")
        topic_fetch_url['state'] = pending_state
        return

    # get content from either the seed or by fetching it
    log.info("Checking seeded content...")
    _update_tfu_message(db, topic_fetch_url, "checking seeded content")
    response = _get_seeded_content(db, topic_fetch_url)
    if response is None:
        log.info("Seeded content found, fetching URL...")
        _update_tfu_message(db, topic_fetch_url, "fetching content")
        response = _fetch_url(db,
                              topic_fetch_url['url'],
                              domain_timeout=domain_timeout)
        log.info(f"{response.code} response returned")
    else:
        log.debug(f"Seeded content found for URL: {topic_fetch_url['url']}")

    content = response.content

    fetched_url = topic_fetch_url['url']
    response_url = response.last_requested_url

    if fetched_url != response_url:
        log.info(
            f"Fetched URL {fetched_url} is not the same as response URL {response_url}, testing for ignore link pattern"
        )
        if _ignore_link_pattern(response_url):
            log.info("Ignore link pattern matched, returning")
            topic_fetch_url['state'] = FETCH_STATE_IGNORED
            topic_fetch_url['code'] = 403
            return

        log.info("Checking story match for redirect URL...")
        _update_tfu_message(db, topic_fetch_url,
                            "checking story match for redirect_url")
        story_match = get_story_match(db=db,
                                      url=fetched_url,
                                      redirect_url=response_url)

    topic_fetch_url['code'] = response.code

    assume_match = topic_fetch_url['assume_match']

    log.info("Checking content match...")
    _update_tfu_message(db, topic_fetch_url, "checking content match")
    if not response.is_success:
        log.info("Request failed")
        topic_fetch_url['state'] = FETCH_STATE_REQUEST_FAILED
        topic_fetch_url['message'] = response.message
    elif story_match is not None:
        log.info(f"Story {story_match['stories_id']} matched")
        topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
        topic_fetch_url['stories_id'] = story_match['stories_id']
    elif not content_matches_topic(
            content=content, topic=topic, assume_match=assume_match):
        log.info("Content matched")
        topic_fetch_url['state'] = FETCH_STATE_CONTENT_MATCH_FAILED
    else:
        log.info("Nothing matched, generating story...")

        try:
            _update_tfu_message(db, topic_fetch_url, "generating story")
            url = response_url if response_url is not None else fetched_url

            log.info("Creating story...")
            story = generate_story(db=db, content=content, url=url)
            log.info(f"Created story {story['stories_id']}")

            topic_fetch_url['stories_id'] = story['stories_id']
            topic_fetch_url['state'] = FETCH_STATE_STORY_ADDED

        except McTMStoriesDuplicateException:

            log.info(
                "Duplicate story found, checking for story match on unique constraint error..."
            )

            # may get a unique constraint error for the story addition within the media source.  that's fine
            # because it means the story is already in the database and we just need to match it again.
            _update_tfu_message(
                db, topic_fetch_url,
                "checking for story match on unique constraint error")
            topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
            story_match = get_story_match(db=db,
                                          url=fetched_url,
                                          redirect_url=response_url)
            if story_match is None:
                message = "Unable to find matching story after unique constraint error."
                log.error(message)
                raise McTMFetchLinkException(message)

            log.info(f"Matched story {story_match['stories_id']}")
            topic_fetch_url['stories_id'] = story_match['stories_id']

        log.info("Done generating story")

    _update_tfu_message(db, topic_fetch_url, "_try_fetch_url done")

    log.info(f"Done trying to fetch topic URL {topic_fetch_url['url']}.")
def extract_links_for_topic_story(
    db: DatabaseHandler,
    stories_id: int,
    topics_id: int,
    test_throw_exception: bool = False,
) -> None:
    """
    Extract links from a story and insert them into the topic_links table for the given topic.

    After the story is processed, set topic_stories.spidered to true for that story.  Calls _get_links_from_story()
    on each story.

    Almost all errors are caught by this function and saved in topic_stories.link_mine_error.  In the case of an
    error, topic_stories.link_mined is also set to true.

    Arguments:
    db - db handle
    stories_id - id of the story to mine for links
    topics_id - id of the topic
    test_throw_exception - if True, raise a test exception so that error logging can be verified

    Returns:
    None

    """
    story = db.require_by_id(table='stories', object_id=stories_id)
    topic = db.require_by_id(table='topics', object_id=topics_id)

    try:
        if test_throw_exception:
            raise McExtractLinksForTopicStoryTestException(
                "Testing whether errors get logged.")

        log.info("mining %s %s for topic %s .." %
                 (story['title'], story['url'], topic['name']))
        links = _get_links_from_story(db, story)

        for link in links:
            if skip_self_linked_domain_url(db, topic['topics_id'],
                                           story['url'], link):
                log.debug("skipping self linked domain url...")
                continue

            topic_link = {
                'topics_id': topic['topics_id'],
                'stories_id': story['stories_id'],
                'url': link
            }

            db.create('topic_links', topic_link)
            increment_domain_links(db, topic_link)

        link_mine_error = ''
    except Exception as ex:
        log.error(f"Link mining error: {ex}")
        link_mine_error = traceback.format_exc()

    db.query(
        """
        update topic_stories set link_mined = 't', link_mine_error = %(c)s
            where stories_id = %(a)s and topics_id = %(b)s
        """, {
            'a': story['stories_id'],
            'b': topic['topics_id'],
            'c': link_mine_error
        })
Example #24
def update(db: DatabaseHandler, media_id: int, client: SimilarWebClient):
    """Updates a media_id in the database, along with the summary table.

    Parameters
    ----------
    db : DatabaseHandler
        Connection to the database

    media_id : int
        Media id to fetch audience data for

    client : SimilarWebClient
        client to use when querying SimilarWeb
    """
    # MC_REWRITE_TO_PYTHON: remove after rewrite to Python
    if isinstance(media_id, bytes):
        media_id = decode_object_from_bytes_if_needed(media_id)

    media_id = int(media_id)
    try:
        media_data = db.require_by_id('media', media_id)
    except McRequireByIDException:
        raise ValueError('No media found with id {}'.format(media_id))

    url = media_data['url']
    similarweb_data = client.get(url)

    meta = similarweb_data['meta']
    domain = meta['request']['domain']
    is_domain_exact_match = check_if_is_domain_exact_match(url, domain)

    if 'visits' in similarweb_data:
        visits = []
        for row in similarweb_data['visits']:
            visits.append(row['visits'])
            if visits[-1] is not None:
                month_visits = int(visits[-1])
            else:
                month_visits = None
            db.query(
                """
                INSERT INTO similarweb_metrics (domain, month, visits)
                VALUES (%(domain)s, %(month)s, %(visits)s)
                ON CONFLICT (domain, month) DO UPDATE
                SET domain = %(domain)s, month=%(month)s
            """, {
                    'domain': domain,
                    'month': row['date'],
                    'visits': month_visits,
                })
        if len(visits) == 0:
            monthly_audience = 0
        else:
            # careful of None values
            monthly_audience = int(
                sum(j if j else 0 for j in visits) / len(visits))
        db.query(
            """
            INSERT INTO similarweb_media_metrics (similarweb_domain, domain_exact_match, monthly_audience, media_id)
            VALUES (%(similarweb_domain)s, %(domain_exact_match)s, %(monthly_audience)s, %(media_id)s)
            ON CONFLICT (media_id) DO UPDATE
            SET similarweb_domain = %(similarweb_domain)s,
                domain_exact_match = %(domain_exact_match)s,
                monthly_audience = %(monthly_audience)s
        """, {
                'similarweb_domain': domain,
                'domain_exact_match': is_domain_exact_match,
                'monthly_audience': monthly_audience,
                'media_id': media_id,
            })
    elif 'error_message' in meta:
        raise SimilarWebException(meta['error_message'])
    else:
        raise SimilarWebException(
            'Was not able to fetch SimilarWeb data for {} for unknown reason'.
            format(url))
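
A small worked sketch of the monthly_audience averaging above: months with a None visit count contribute zero to the sum but still count toward the number of months.

def monthly_audience_from_visits(visits: list) -> int:
    # Mirrors the guard in update(): None entries count as zero visits.
    if not visits:
        return 0
    return int(sum(v if v else 0 for v in visits) / len(visits))

# monthly_audience_from_visits([1200, None, 1800]) == 1000
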
def fetch_topic_url(db: DatabaseHandler, topic_fetch_urls_id: int, domain_timeout: typing.Optional[int] = None) -> None:
    """Fetch a url for a topic and create a media cloud story from it if its content matches the topic pattern.

    Update the following fields in the topic_fetch_urls row:

    code - the status code of the http response
    fetch_date - the current time
    state - one of the FETCH_STATE_* constants
    message - message related to the state (e.g. HTTP message for FETCH_STATE_REQUEST_FAILED)
    stories_id - the id of the story generated from the fetched content, or null if no story was created

    If topic_links_id is present in the topic_fetch_url and if a story was added or matched, assign the resulting
    topic_fetch_urls.stories_id to topic_links.ref_stories_id.

    If the state is anything but FETCH_STATE_PENDING or FETCH_STATE_REQUEUED, return without doing anything.

    If there is content for the corresponding url and topics_id in topic_seed_urls, use that content instead of
    fetching the url.

    This function catches almost all possible exceptions and stashes them in topic_fetch_urls along with a state of
    FETCH_STATE_PYTHON_ERROR.

    Arguments:
    db - db handle
    topic_fetch_urls_id - id of topic_fetch_urls row
    domain_timeout - pass through to fetch_link

    Returns:
    None

    """
    topic_fetch_url = db.require_by_id('topic_fetch_urls', topic_fetch_urls_id)

    try:
        log.info("fetch_link: %s" % topic_fetch_url['url'])
        _try_fetch_topic_url(db=db, topic_fetch_url=topic_fetch_url, domain_timeout=domain_timeout)

        if topic_fetch_url['topic_links_id'] and topic_fetch_url['stories_id']:
            try_update_topic_link_ref_stories_id(db, topic_fetch_url)

        if 'stories_id' in topic_fetch_url and topic_fetch_url['stories_id'] is not None:
            story = db.require_by_id('stories', topic_fetch_url['stories_id'])
            topic = db.require_by_id('topics', topic_fetch_url['topics_id'])
            redirect_url = topic_fetch_url['url']
            assume_match = topic_fetch_url['assume_match']
            if _is_not_topic_story(db, topic_fetch_url):
                if _story_matches_topic(db, story, topic, redirect_url=redirect_url, assume_match=assume_match):
                    mediawords.tm.stories.add_to_topic_stories(db, story, topic)

        if topic_fetch_url['topic_links_id'] and topic_fetch_url['stories_id']:
            try_update_topic_link_ref_stories_id(db, topic_fetch_url)

    except McThrottledDomainException as ex:
        raise ex

    except Exception as ex:
        log.error("Error while fetching URL {}: {}".format(topic_fetch_url, ex))

        topic_fetch_url['state'] = FETCH_STATE_PYTHON_ERROR
        topic_fetch_url['message'] = traceback.format_exc()
        log.warning('topic_fetch_url %s failed: %s' % (topic_fetch_url['url'], topic_fetch_url['message']))

    db.update_by_id('topic_fetch_urls', topic_fetch_url['topic_fetch_urls_id'], topic_fetch_url)