def test_content_matches_topic():
    """Test content_matches_topic()."""
    assert content_matches_topic('foo', {'topics_id': 1, 'pattern': 'foo'})
    assert content_matches_topic('FOO', {'topics_id': 1, 'pattern': 'foo'})
    assert content_matches_topic('FOO', {'topics_id': 1, 'pattern': ' foo '})
    assert not content_matches_topic('foo', {'topics_id': 1, 'pattern': 'bar'})
    assert content_matches_topic('foo', {'topics_id': 1, 'pattern': 'bar'}, assume_match=True)
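
The implementation under test isn't shown on this page. A minimal sketch consistent with the assertions above (case-insensitive matching, pattern stripped of surrounding whitespace, assume_match short-circuit) might look like this; it is an assumption, not the actual library code:

import re
from typing import Any, Dict


def content_matches_topic(content: str, topic: Dict[str, Any], assume_match: bool = False) -> bool:
    """Return True if content matches the topic['pattern'] regex (hypothetical sketch)."""
    if assume_match:
        return True

    # strip the pattern and ignore case, as the assertions above imply
    return re.search(topic['pattern'].strip(), content, flags=re.I) is not None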
Example 2
def _try_fetch_tweets_chunk(db: DatabaseHandler,
                            topic: Dict[str, Any],
                            topic_fetch_urls: List[Dict[str, Any]]) -> None:
    """Fetch up to URLS_CHUNK_SIZE topic_fetch_urls from twitter api as statuses and add them as topic stories.

    Throw any errors up the stack.
    """
    status_lookup = {}
    for topic_fetch_url in topic_fetch_urls:
        status_id = parse_status_id_from_url(topic_fetch_url['url'])
        status_lookup.setdefault(status_id, []).append(topic_fetch_url)

    status_ids = list(status_lookup.keys())

    log.info(f"fetching tweets for {len(status_ids)} status_ids ...")
    tweets = fetch_100_tweets(status_ids)

    for tweet in tweets:
        try:
            topic_fetch_urls = status_lookup.pop(str(tweet['id']))
        except KeyError:
            raise KeyError(f"can't find tweet '{tweet['id']}' in ids: {status_ids}")

        if content_matches_topic(tweet['text'], topic):
            _add_tweet_story(db, topic, tweet, topic_fetch_urls)
        else:
            for topic_fetch_url in topic_fetch_urls:
                _log_content_match_failed(db, topic_fetch_url)

    # any status ids not returned by the api are logged as missing tweets
    for topic_fetch_urls in status_lookup.values():
        for topic_fetch_url in topic_fetch_urls:
            _log_tweet_missing(db, topic_fetch_url)
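
parse_status_id_from_url() is not shown here. A plausible sketch, assuming status URLs of the form https://twitter.com/<user>/status/<id>; the regex is an assumption:

import re
from typing import Optional


def parse_status_id_from_url(url: str) -> Optional[str]:
    """Extract the numeric status id from a tweet URL (hypothetical sketch)."""
    match = re.search(r'twitter\.com/[^/]+/status(?:es)?/(\d+)', url)
    return match.group(1) if match else None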
Example 3
def _try_fetch_users_chunk(db: DatabaseHandler, topic: Dict[str, Any], topic_fetch_urls: List[Dict[str, Any]]) -> None:
    """Fetch up to URLS_CHUNK_SIZE topic_fetch_urls from twitter api as users and add them as topic stories.

    Throw any errors up the stack.
    """
    url_lookup = {}
    for topic_fetch_url in topic_fetch_urls:
        screen_name = parse_screen_name_from_user_url(topic_fetch_url['url']).lower()
        url_lookup.setdefault(screen_name, []).append(topic_fetch_url)

    screen_names = list(url_lookup.keys())

    log.info(f"fetching users for {len(screen_names)} screen_names ...")
    users = fetch_100_users(screen_names)

    for user in users:
        try:
            screen_name = user['screen_name'].lower()
            topic_fetch_urls = url_lookup.pop(screen_name)
        except KeyError:
            raise KeyError(f"can't find user '{user['screen_name']}' in urls: {screen_names}")

        content = f"{user['name']} {user['screen_name']} {user['description']}"
        if content_matches_topic(content, topic):
            _add_user_story(db, topic, user, topic_fetch_urls)
        else:
            for topic_fetch_url in topic_fetch_urls:
                _log_content_match_failed(db, topic_fetch_url)

    # any screen names not returned by the api are logged as missing
    for topic_fetch_urls in url_lookup.values():
        for topic_fetch_url in topic_fetch_urls:
            _log_tweet_missing(db, topic_fetch_url)
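
Likewise, parse_screen_name_from_user_url() is assumed; a sketch under the assumption that user URLs look like https://twitter.com/<screen_name>:

import re
from typing import Optional


def parse_screen_name_from_user_url(url: str) -> Optional[str]:
    """Extract the screen name from a twitter user URL (hypothetical sketch)."""
    match = re.search(r'twitter\.com/(?:#!/)?([^/?#]+)', url)
    return match.group(1) if match else None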
Example 4
def _story_matches_topic(db: DatabaseHandler,
                         story: dict,
                         topic: dict,
                         assume_match: bool = False,
                         redirect_url: str = None) -> bool:
    """Test whether the story sentences or metadata of the story match the topic['pattern'] regex.

    Arguments:
    db - database handle
    story - story to match against topic pattern
    topic - topic to match against
    redirect_url - alternate url for story


    Return:
    True if the story matches the topic pattern

    """
    if assume_match:
        return True

    for field in ['title', 'description', 'url']:
        if content_matches_topic(story[field], topic):
            return True

    if redirect_url and content_matches_topic(redirect_url, topic):
        return True

    story_text = db.query(
        """
        select string_agg(sentence, ' ') as text
            from story_sentences ss
                join topics c on ( c.topics_id = %(a)s )
            where
                ss.stories_id = %(b)s
        """, {
            'a': topic['topics_id'],
            'b': story['stories_id']
        }).hash()

    return content_matches_topic(story_text['text'] or '', topic)
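
A hypothetical call site for _story_matches_topic(); the ids are placeholders:

# hypothetical ids for illustration
stories_id = 123
topics_id = 456

story = db.require_by_id('stories', stories_id)
topic = db.require_by_id('topics', topics_id)

# metadata fields are checked first; the story_sentences query runs only as a fallback
if _story_matches_topic(db, story, topic):
    log.info(f"story {stories_id} matches topic {topics_id}")

Example 5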
def _store_posts_for_day(db: DatabaseHandler, topic_post_day: dict, posts: list) -> None:
    """
    Store posts for a single day.

    Arguments:
    db - db handle
    topic_post_day - topic_post_day dict
    posts - list of posts found for day

    Return:
    None
    """
    log.info("adding %d posts for day %s" % (len(posts), topic_post_day['day']))

    tsq = db.require_by_id('topic_seed_queries', topic_post_day['topic_seed_queries_id'])
    topic = db.require_by_id('topics', tsq['topics_id'])
    posts = list(filter(lambda p: content_matches_topic(p['content'], topic), posts))

    num_posts_fetched = len(posts)

    log.info(f"{num_posts_fetched} posts remaining after match")

    db.begin()

    db.query("SET LOCAL citus.multi_shard_modify_mode TO 'sequential'")

    log.debug("inserting into topic_posts ...")

    for post in posts:
        _store_post_and_urls(db, topic_post_day, post)

    db.query(
        """
        UPDATE topic_post_days SET
            posts_fetched = true,
            num_posts_stored = %(num_posts_stored)s,
            num_posts_fetched = %(num_posts_fetched)s
        WHERE
            topics_id = %(topics_id)s AND
            topic_post_days_id = %(topic_post_days_id)s
        """,
        {
            'num_posts_stored': len(posts),
            'num_posts_fetched': num_posts_fetched,
            'topics_id': topic_post_day['topics_id'],
            'topic_post_days_id': topic_post_day['topic_post_days_id'],
        }
    )

    db.commit()

    log.debug("done inserting into topic_posts")
Example 6
def _story_matches_topic(
        db: DatabaseHandler,
        story: dict,
        topic: dict,
        assume_match: bool = False,
        redirect_url: str = None) -> bool:
    """Test whether the story sentences or metadata of the story match the topic['pattern'] regex.

    Arguments:
    db - database handle
    story - story to match against topic pattern
    topic - topic to match against
    redirect_url - alternate url for story


    Return:
    True if the story matches the topic pattern

    """
    if assume_match:
        return True

    for field in ['title', 'description', 'url']:
        if content_matches_topic(story[field], topic):
            return True

    if redirect_url and content_matches_topic(redirect_url, topic):
        return True

    sentences = db.query(
        "select sentence from story_sentences where stories_id = %(a)s",
        {'a': story['stories_id']}).flat()

    text = ' '.join(sentences)

    return content_matches_topic(text, topic)
Example 7
def _store_posts_for_day(db: DatabaseHandler, topic_post_day: dict,
                         posts: list) -> None:
    """
    Store posts for a single day.

    Arguments:
    db - db handle
    topic_post_day - topic_post_day dict
    posts - list of posts found for day

    Return:
    None
    """
    log.info("adding %d posts for day %s" %
             (len(posts), topic_post_day['day']))

    tsq = db.require_by_id('topic_seed_queries',
                           topic_post_day['topic_seed_queries_id'])
    topic = db.require_by_id('topics', tsq['topics_id'])
    posts = list(
        filter(lambda p: content_matches_topic(p['content'], topic), posts))

    num_posts_fetched = len(posts)

    log.info(f"{num_posts_fetched} posts remaining after match")

    db.begin()

    log.debug("inserting into topic_posts ...")

    for post in posts:
        _store_post_and_urls(db, topic_post_day, post)

    db.query(
        """
        update topic_post_days set posts_fetched = true, num_posts_stored = %(a)s, num_posts_fetched = %(b)s
            where topic_post_days_id = %(c)s
        """, {
            'a': len(posts),
            'b': num_posts_fetched,
            'c': topic_post_day['topic_post_days_id']
        })

    db.commit()

    log.debug("done inserting into topic_posts")
Example 8
def _try_fetch_topic_url(db: DatabaseHandler,
                         topic_fetch_url: dict,
                         domain_timeout: Optional[int] = None) -> None:
    """Implement the logic of fetch_topic_url without the try: or the topic_fetch_url update."""

    log.info(f"Trying to fetch topic URL {topic_fetch_url['url']}...")

    # don't reprocess already processed urls
    if topic_fetch_url['state'] not in (FETCH_STATE_PENDING,
                                        FETCH_STATE_REQUEUED):
        log.info(
            f"URL's state '{topic_fetch_url['state']}' is not pending or requeued, not refetching"
        )
        return

    log.info("Checking ignore links...")
    _update_tfu_message(db, topic_fetch_url, "checking ignore links")
    if _ignore_link_pattern(topic_fetch_url['url']):
        log.info("Link is to be ignored, returning")
        topic_fetch_url['state'] = FETCH_STATE_IGNORED
        topic_fetch_url['code'] = 403
        return

    log.info("Checking failed URL...")
    _update_tfu_message(db, topic_fetch_url, "checking failed url")
    failed_url = _get_failed_url(db, topic_fetch_url['topics_id'],
                                 topic_fetch_url['url'])
    if failed_url:
        log.info("URL is failed, returning")
        topic_fetch_url['state'] = failed_url['state']
        topic_fetch_url['code'] = failed_url['code']
        topic_fetch_url['message'] = failed_url['message']
        return

    log.info("Checking self-linked domain...")
    _update_tfu_message(db, topic_fetch_url, "checking self linked domain")
    if skip_self_linked_domain(db, topic_fetch_url):
        log.info("Link is self-linked domain, returning")
        topic_fetch_url['state'] = FETCH_STATE_SKIPPED
        topic_fetch_url['code'] = 403
        return

    log.info(f"Fetching topic {topic_fetch_url['topics_id']}...")
    topic = db.require_by_id('topics', topic_fetch_url['topics_id'])
    topic_fetch_url['fetch_date'] = datetime.datetime.now()

    story_match = None

    # this match is relatively expensive, so only do it on the first 'pending' request and not the potentially
    # spammy 'requeued' requests
    log.info("Checking story match...")
    _update_tfu_message(db, topic_fetch_url, "checking story match")
    if topic_fetch_url['state'] == FETCH_STATE_PENDING:
        log.info("URL is in pending state, getting story match...")
        story_match = get_story_match(db=db, url=topic_fetch_url['url'])

        # try to match the story before doing the expensive fetch
        if story_match is not None:
            log.info(f"Matched story {story_match['stories_id']}, returning")
            topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
            topic_fetch_url['code'] = 200
            topic_fetch_url['stories_id'] = story_match['stories_id']
            return

    # check whether we want to delay fetching for another job, eg. fetch_twitter_urls
    log.info("Checking for pending state...")
    pending_state = _get_pending_state(topic_fetch_url)
    if pending_state:
        log.info(f"URL is to be fetched by another job (state '{pending_state}'), returning")
        topic_fetch_url['state'] = pending_state
        return

    # get content from either the seed or by fetching it
    log.info("Checking seeded content...")
    _update_tfu_message(db, topic_fetch_url, "checking seeded content")
    response = _get_seeded_content(db, topic_fetch_url)
    if response is None:
        log.info("No seeded content found, fetching URL...")
        _update_tfu_message(db, topic_fetch_url, "fetching content")
        response = _fetch_url(db,
                              topic_fetch_url['url'],
                              domain_timeout=domain_timeout)
        log.info(f"{response.code} response returned")
    else:
        log.debug(f"Seeded content found for URL: {topic_fetch_url['url']}")

    content = response.content

    fetched_url = topic_fetch_url['url']
    response_url = response.last_requested_url

    if fetched_url != response_url:
        log.info(
            f"Fetched URL {fetched_url} is not the same as response URL {response_url}, testing for ignore link pattern"
        )
        if _ignore_link_pattern(response_url):
            log.info("Ignore link pattern matched, returning")
            topic_fetch_url['state'] = FETCH_STATE_IGNORED
            topic_fetch_url['code'] = 403
            return

        log.info("Checking story match for redirect URL...")
        _update_tfu_message(db, topic_fetch_url,
                            "checking story match for redirect_url")
        story_match = get_story_match(db=db,
                                      url=fetched_url,
                                      redirect_url=response_url)

    topic_fetch_url['code'] = response.code

    assume_match = topic_fetch_url['assume_match']

    log.info("Checking content match...")
    _update_tfu_message(db, topic_fetch_url, "checking content match")
    if not response.is_success:
        log.info("Request failed")
        topic_fetch_url['state'] = FETCH_STATE_REQUEST_FAILED
        topic_fetch_url['message'] = response.message
    elif story_match is not None:
        log.info(f"Story {story_match['stories_id']} matched")
        topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
        topic_fetch_url['stories_id'] = story_match['stories_id']
    elif not content_matches_topic(
            content=content, topic=topic, assume_match=assume_match):
        log.info("Content match failed")
        topic_fetch_url['state'] = FETCH_STATE_CONTENT_MATCH_FAILED
    else:
        log.info("Nothing matched, generating story...")

        try:
            _update_tfu_message(db, topic_fetch_url, "generating story")
            url = response_url if response_url is not None else fetched_url

            log.info("Creating story...")
            story = generate_story(db=db, content=content, url=url)
            log.info(f"Created story {story['stories_id']}")

            topic_fetch_url['stories_id'] = story['stories_id']
            topic_fetch_url['state'] = FETCH_STATE_STORY_ADDED

        except McTMStoriesDuplicateException:

            log.info(
                "Duplicate story found, checking for story match on unique constraint error..."
            )

            # may get a unique constraint error for the story addition within the media source.  that's fine
            # because it means the story is already in the database and we just need to match it again.
            _update_tfu_message(
                db, topic_fetch_url,
                "checking for story match on unique constraint error")
            topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
            story_match = get_story_match(db=db,
                                          url=fetched_url,
                                          redirect_url=response_url)
            if story_match is None:
                message = "Unable to find matching story after unique constraint error."
                log.error(message)
                raise McTMFetchLinkException(message)

            log.info(f"Matched story {story_match['stories_id']}")
            topic_fetch_url['stories_id'] = story_match['stories_id']

        log.info("Done generating story")

    _update_tfu_message(db, topic_fetch_url, "_try_fetch_url done")

    log.info(f"Done trying to fetch topic URL {topic_fetch_url['url']}.")
Example 9
def _try_fetch_topic_url(db: DatabaseHandler,
                         topic_fetch_url: dict,
                         domain_timeout: Optional[int] = None) -> None:
    """Implement the logic of fetch_topic_url without the try: or the topic_fetch_url update."""

    log.warning("_try_fetch_topic_url: %s" % topic_fetch_url['url'])

    # don't reprocess already processed urls
    if topic_fetch_url['state'] not in (FETCH_STATE_PENDING,
                                        FETCH_STATE_REQUEUED):
        return

    _update_tfu_message(db, topic_fetch_url, "checking ignore links")
    if _ignore_link_pattern(topic_fetch_url['url']):
        topic_fetch_url['state'] = FETCH_STATE_IGNORED
        topic_fetch_url['code'] = 403
        return

    _update_tfu_message(db, topic_fetch_url, "checking failed url")
    failed_url = _get_failed_url(db, topic_fetch_url['topics_id'],
                                 topic_fetch_url['url'])
    if failed_url:
        topic_fetch_url['state'] = failed_url['state']
        topic_fetch_url['code'] = failed_url['code']
        topic_fetch_url['message'] = failed_url['message']
        return

    _update_tfu_message(db, topic_fetch_url, "checking self linked domain")
    if skip_self_linked_domain(db, topic_fetch_url):
        topic_fetch_url['state'] = FETCH_STATE_SKIPPED
        topic_fetch_url['code'] = 403
        return

    topic = db.require_by_id('topics', topic_fetch_url['topics_id'])
    topic_fetch_url['fetch_date'] = datetime.datetime.now()

    story_match = None

    # this match is relatively expensive, so only do it on the first 'pending' request and not the potentially
    # spammy 'requeued' requests
    _update_tfu_message(db, topic_fetch_url, "checking story match")
    if topic_fetch_url['state'] == FETCH_STATE_PENDING:
        story_match = get_story_match(db=db, url=topic_fetch_url['url'])

        # try to match the story before doing the expensive fetch
        if story_match is not None:
            topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
            topic_fetch_url['code'] = 200
            topic_fetch_url['stories_id'] = story_match['stories_id']
            return

    # check whether we want to delay fetching for another job, eg. fetch_twitter_urls
    pending_state = _get_pending_state(topic_fetch_url)
    if pending_state:
        topic_fetch_url['state'] = pending_state
        return

    # get content from either the seed or by fetching it
    _update_tfu_message(db, topic_fetch_url, "checking seeded content")
    response = _get_seeded_content(db, topic_fetch_url)
    if response is None:
        _update_tfu_message(db, topic_fetch_url, "fetching content")
        response = _fetch_url(db,
                              topic_fetch_url['url'],
                              domain_timeout=domain_timeout)
        log.debug("%d response returned for url: %s" %
                  (response.code, topic_fetch_url['url']))
    else:
        log.debug("seeded content found for url: %s" % topic_fetch_url['url'])

    content = response.content

    fetched_url = topic_fetch_url['url']
    response_url = response.last_requested_url

    if fetched_url != response_url:
        if _ignore_link_pattern(response_url):
            topic_fetch_url['state'] = FETCH_STATE_IGNORED
            topic_fetch_url['code'] = 403
            return

        _update_tfu_message(db, topic_fetch_url,
                            "checking story match for redirect_url")
        story_match = get_story_match(db=db,
                                      url=fetched_url,
                                      redirect_url=response_url)

    topic_fetch_url['code'] = response.code

    assume_match = topic_fetch_url['assume_match']

    _update_tfu_message(db, topic_fetch_url, "checking content match")
    if not response.is_success:
        topic_fetch_url['state'] = FETCH_STATE_REQUEST_FAILED
        topic_fetch_url['message'] = response.message
    elif story_match is not None:
        topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
        topic_fetch_url['stories_id'] = story_match['stories_id']
    elif not content_matches_topic(
            content=content, topic=topic, assume_match=assume_match):
        topic_fetch_url['state'] = FETCH_STATE_CONTENT_MATCH_FAILED
    else:
        try:
            _update_tfu_message(db, topic_fetch_url, "generating story")
            url = response_url if response_url is not None else fetched_url
            story = generate_story(db=db, content=content, url=url)

            topic_fetch_url['stories_id'] = story['stories_id']
            topic_fetch_url['state'] = FETCH_STATE_STORY_ADDED

        except McTMStoriesDuplicateException:
            # may get a unique constraint error for the story addition within the media source.  that's fine
            # because it means the story is already in the database and we just need to match it again.
            _update_tfu_message(
                db, topic_fetch_url,
                "checking for story match on unique constraint error")
            topic_fetch_url['state'] = FETCH_STATE_STORY_MATCH
            story_match = get_story_match(db=db,
                                          url=fetched_url,
                                          redirect_url=response_url)
            if story_match is None:
                raise McTMFetchLinkException(
                    "Unable to find matching story after unique constraint error."
                )
            topic_fetch_url['stories_id'] = story_match['stories_id']

    _update_tfu_message(db, topic_fetch_url, "_try_fetch_url done")
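
The fetch_topic_url() wrapper itself is not shown; per the docstring it would supply the try/except and the row update around _try_fetch_topic_url(). A minimal sketch, assuming a FETCH_STATE_PYTHON_ERROR constant and an update_by_id() helper on DatabaseHandler:

def fetch_topic_url(db: DatabaseHandler,
                    topic_fetch_urls_id: int,
                    domain_timeout: Optional[int] = None) -> None:
    """Fetch a topic url, catching errors and saving the result (hypothetical sketch)."""
    topic_fetch_url = db.require_by_id('topic_fetch_urls', topic_fetch_urls_id)

    try:
        _try_fetch_topic_url(db, topic_fetch_url, domain_timeout=domain_timeout)
    except Exception as ex:
        # record the failure on the row instead of letting the job die
        topic_fetch_url['state'] = FETCH_STATE_PYTHON_ERROR  # assumed state constant
        topic_fetch_url['message'] = str(ex)

    db.update_by_id('topic_fetch_urls', topic_fetch_url['topic_fetch_urls_id'], topic_fetch_url)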