コード例 #1
0
def _add_tweet_story(db: DatabaseHandler,
                     topic: Dict[str, Any],
                     tweet: dict,
                     topic_fetch_urls: List[Dict[str, Any]]) -> dict:
    """Turn one tweet (in twitter api form) into a story attached to the topic.

    The story is built from the tweet author and text, added to the topic's stories, every given
    topic_fetch_url is logged against the new story, and one topic link is created for each url
    found in the tweet (except those rejected by skip_self_linked_domain_url()).

    Arguments:
    db - db handle
    topic - topic dict; 'topics_id' is read from it
    tweet - tweet dict; 'user' -> 'screen_name', 'text', 'created_at' and 'id' are read from it
    topic_fetch_urls - topic_fetch_url dicts to log against the generated story

    Returns:
    the story dict returned by generate_story()

    """
    author = tweet['user']['screen_name']
    text = tweet['text']
    story_title = f"{author}: {text}"
    published = tweet['created_at']
    status_url = f"https://twitter.com/{author}/status/{tweet['id']}"

    story = generate_story(
        db=db,
        url=status_url,
        content=text,
        title=story_title,
        publish_date=published,
    )
    add_to_topic_stories(db=db, story=story, topic=topic, link_mined=True)

    # record the new story on each fetch url, then point the originating topic link at it
    for pending_fetch_url in topic_fetch_urls:
        logged_fetch_url = _log_tweet_added(db, pending_fetch_url, story)
        try_update_topic_link_ref_stories_id(db, logged_fetch_url)

    # every url mentioned in the tweet becomes an outgoing topic link of the story
    for linked_url in get_tweet_urls(tweet):
        if skip_self_linked_domain_url(db, topic['topics_id'], story['url'], linked_url):
            log.debug("skipping self linked domain url...")
            continue

        new_link = dict(
            topics_id=topic['topics_id'],
            stories_id=story['stories_id'],
            url=linked_url,
        )

        db.create('topic_links', new_link)
        increment_domain_links(db, new_link)

    return story
コード例 #2
0
def _add_user_story(db: DatabaseHandler, topic: dict, user: dict, topic_fetch_urls: list) -> dict:
    """Turn one twitter user (in twitter api form) into a story attached to the topic.

    The story url is the user's twitter page, its publish date is the current time, and the story
    is tagged with the tag returned by _get_undateable_tag().

    Arguments:
    db - db handle
    topic - topic dict, passed through to add_to_topic_stories()
    user - user dict; 'name', 'screen_name' and 'description' are read from it
    topic_fetch_urls - topic_fetch_url dicts to log against the generated story

    Returns:
    the story dict returned by generate_story()

    """
    display_name = user['name']
    handle = user['screen_name']

    story = generate_story(
        db=db,
        url=f"https://twitter.com/{handle}",
        content=f"{display_name} ({handle}): {user['description']}",
        title=f"{display_name} ({handle}) | Twitter",
        publish_date=sql_now(),
    )
    add_to_topic_stories(db=db, story=story, topic=topic, link_mined=True)

    # record the new story on each fetch url, then point the originating topic link at it
    for pending_fetch_url in topic_fetch_urls:
        logged_fetch_url = _log_tweet_added(db, pending_fetch_url, story)
        try_update_topic_link_ref_stories_id(db, logged_fetch_url)

    # twitter user pages are undateable because there is never a consistent version of the page
    tag = _get_undateable_tag(db)

    # ON CONFLICT ... DO NOTHING keeps the insert harmless when the mapping already exists
    tag_insert_sql = """
        INSERT INTO public.stories_tags_map (stories_id, tags_id)
        VALUES (%(stories_id)s, %(tags_id)s)
        ON CONFLICT (stories_id, tags_id) DO NOTHING
    """
    db.query(tag_insert_sql, dict(stories_id=story['stories_id'], tags_id=tag['tags_id']))

    return story
コード例 #3
0
def _add_user_story(db: DatabaseHandler, topic: dict, user: dict, topic_fetch_urls: list) -> dict:
    """Turn one twitter user (in twitter api form) into a story attached to the topic.

    The story url is the user's twitter page, its publish date is the current time, and the story
    is tagged with the tag returned by _get_undateable_tag().

    Arguments:
    db - db handle
    topic - topic dict, passed through to add_to_topic_stories()
    user - user dict; 'name', 'screen_name' and 'description' are read from it
    topic_fetch_urls - topic_fetch_url dicts to log against the generated story

    Returns:
    the story dict returned by generate_story()

    """
    display_name = user['name']
    handle = user['screen_name']
    bio = user['description']

    story_content = f"{display_name} ({handle}): {bio}"
    story_title = f"{display_name} ({handle}) | Twitter"
    fetched_at = sql_now()
    profile_url = f"https://twitter.com/{handle}"

    story = generate_story(
        db=db,
        url=profile_url,
        content=story_content,
        title=story_title,
        publish_date=fetched_at,
    )
    add_to_topic_stories(db=db, story=story, topic=topic, link_mined=True)

    # record the new story on each fetch url, then point the originating topic link at it
    for pending_fetch_url in topic_fetch_urls:
        logged_fetch_url = _log_tweet_added(db, pending_fetch_url, story)
        try_update_topic_link_ref_stories_id(db, logged_fetch_url)

    # twitter user pages are undateable because there is never a consistent version of the page
    tag = _get_undateable_tag(db)

    # NOTE(review): plain insert with no duplicate handling -- confirm that the same story / tag
    # pair can never be inserted twice here
    tag_params = {'a': story['stories_id'], 'b': tag['tags_id']}
    db.query("insert into stories_tags_map (stories_id, tags_id) values (%(a)s, %(b)s)", tag_params)

    return story
コード例 #4
0
def test_try_update_topic_link_ref_stories_id():
    """Test try_update_topic_link_ref_stories_id().

    Covers three cases: a first update that sets topic_links.ref_stories_id, a second update for
    an identical link that must be silently ignored, and a malformed topic_fetch_url that must
    raise McUpdateByIDException.
    """
    db = connect_to_db()

    medium = create_test_medium(db, 'foo')
    feed = create_test_feed(db, label='foo', medium=medium)
    source_story = create_test_story(db, label='source story', feed=feed)
    target_story = create_test_story(db, label='target story a', feed=feed)

    topic = create_test_topic(db, 'foo')

    db.create('topic_stories', {
        'topics_id': topic['topics_id'],
        'stories_id': source_story['stories_id']})

    # first update should work
    topic_link_a = db.create('topic_links', {
        'topics_id': topic['topics_id'],
        'stories_id': source_story['stories_id'],
        'url': 'http://foo.com'})

    topic_fetch_url_a = db.create('topic_fetch_urls', {
        'topics_id': topic['topics_id'],
        'url': 'http://foo.com',
        'topic_links_id': topic_link_a['topic_links_id'],
        'state': FETCH_STATE_STORY_ADDED,
        'stories_id': target_story['stories_id']})

    try_update_topic_link_ref_stories_id(db, topic_fetch_url_a)

    topic_link_a = db.require_by_id('topic_links', topic_link_a['topic_links_id'])

    assert topic_link_a['ref_stories_id'] == target_story['stories_id']

    # second one should silently fail: link b has the same topic and source story as link a, so
    # pointing it at the same target story is expected to be rejected and the rejection swallowed
    # (presumably by a uniqueness constraint on topic_links -- confirm against the schema)
    topic_link_b = db.create('topic_links', {
        'topics_id': topic['topics_id'],
        'stories_id': source_story['stories_id'],
        'url': 'http://foo.com'})

    # the fetch url has to reference link b (not link a), otherwise the update never touches
    # link b and the assertion below passes without exercising the failure path
    topic_fetch_url_b = db.create('topic_fetch_urls', {
        'topics_id': topic['topics_id'],
        'url': 'http://foo.com',
        'topic_links_id': topic_link_b['topic_links_id'],
        'state': FETCH_STATE_STORY_ADDED,
        'stories_id': target_story['stories_id']})

    try_update_topic_link_ref_stories_id(db, topic_fetch_url_b)

    topic_link_b = db.require_by_id('topic_links', topic_link_b['topic_links_id'])

    assert topic_link_b['ref_stories_id'] is None

    # now generate a non-unique error and make sure we get an error
    bogus_tfu = {'topic_links_id': 0, 'topics_id': 'nan', 'stories_id': 'nan'}

    with pytest.raises(McUpdateByIDException):
        try_update_topic_link_ref_stories_id(db, bogus_tfu)
コード例 #5
0
def fetch_topic_url(db: DatabaseHandler,
                    topic_fetch_urls_id: int,
                    domain_timeout: Optional[int] = None) -> None:
    """Fetch a url for a topic and create a media cloud story from it if its content matches the topic pattern.

    Update the following fields in the topic_fetch_urls row:

    code - the status code of the http response
    fetch_date - the current time
    state - one of the FETCH_STATE_* constants
    message - message related to the state (eg. HTTP message for FETCH_STATE_REQUEST_FAILED)
    stories_id - the id of the story generated from the fetched content, or null if no story created

    If topic_links_id is present in the topic_fetch_url and if a story was added or matched, assign the resulting
    topic_fetch_urls.stories_id to topic_links.ref_stories_id.

    If the state is anything but FETCH_STATE_PENDING or FETCH_STATE_REQUEUED, no fetch is attempted.
    NOTE(review): that check is not made in this function -- presumably it lives in _try_fetch_topic_url();
    the topic_fetch_urls row is still written back at the end of this function either way.

    If there is content for the corresponding url and topics_id in topic_seed_urls, use that content instead of
    fetching the url.

    This function catches almost all possible exceptions and stashes them in topic_fetch_urls along with a state of
    FETCH_STATE_PYTHON_ERROR.  The one exception is McThrottledDomainException, which is re-raised to the caller
    without the topic_fetch_urls row being updated.

    Arguments:
    db - db handle
    topic_fetch_urls_id - id of topic_fetch_urls row
    domain_timeout - pass through to fetch_link

    Returns:
    None

    Raises:
    McThrottledDomainException - passed through unchanged from the fetch

    """
    topic_fetch_url = db.require_by_id('topic_fetch_urls', topic_fetch_urls_id)

    try:
        log.info("fetch_link: %s" % topic_fetch_url['url'])

        # the results of the fetch are read back from topic_fetch_url below, so the helper is expected to
        # fill them in on the dict it is given
        _try_fetch_topic_url(db=db,
                             topic_fetch_url=topic_fetch_url,
                             domain_timeout=domain_timeout)

        if topic_fetch_url['topic_links_id'] and topic_fetch_url['stories_id']:
            try_update_topic_link_ref_stories_id(db, topic_fetch_url)

        if 'stories_id' in topic_fetch_url and topic_fetch_url['stories_id'] is not None:
            story = db.require_by_id('stories', topic_fetch_url['stories_id'])
            topic = db.require_by_id('topics', topic_fetch_url['topics_id'])
            redirect_url = topic_fetch_url['url']
            assume_match = topic_fetch_url['assume_match']

            if _is_not_topic_story(db, topic_fetch_url) and _story_matches_topic(db,
                                                                                 story,
                                                                                 topic,
                                                                                 redirect_url=redirect_url,
                                                                                 assume_match=assume_match):
                add_to_topic_stories(db, story, topic)

            # add redirect_url as a lookup url for the story, if it is different from the story url.
            # this must compare against story['url']: redirect_url is itself topic_fetch_url['url'], so
            # comparing those two could never be true and the lookup url would never be inserted
            if redirect_url != story['url']:
                insert_story_urls(db, story, redirect_url)

        # NOTE(review): this repeats the topic link update made right after the fetch; it is kept as is
        # because it is not clear from here whether the intervening calls can change its outcome
        if topic_fetch_url['topic_links_id'] and topic_fetch_url['stories_id']:
            try_update_topic_link_ref_stories_id(db, topic_fetch_url)

    except McThrottledDomainException:
        # throttling is not an error of this url; let the caller deal with it (bare raise keeps the
        # original traceback)
        raise

    except Exception as ex:
        log.error("Error while fetching URL {}: {}".format(
            topic_fetch_url, ex))

        # stash the failure in the row itself so that it is persisted by the update below
        topic_fetch_url['state'] = FETCH_STATE_PYTHON_ERROR
        topic_fetch_url['message'] = traceback.format_exc()
        log.warning('topic_fetch_url %s failed: %s' %
                    (topic_fetch_url['url'], topic_fetch_url['message']))

    db.update_by_id('topic_fetch_urls', topic_fetch_url['topic_fetch_urls_id'],
                    topic_fetch_url)