Example 1
def _call_function_on_url_chunks(db: DatabaseHandler, topic: dict, urls: List,
                                 chunk_function: Callable) -> None:
    """Call chunk_function on chunks of up to URLS_CHUNK_SIZE urls at a time.

    Catch any exceptions raised and save them in the topic_fetch_urls for the given chunk.
    """
    i = 0
    while i < len(urls):
        chunk_urls = urls[i:i + URLS_CHUNK_SIZE]

        try:
            chunk_function(db, topic, chunk_urls)
        except Exception as ex:
            log.warning("error fetching twitter data: {}".format(ex))

            # record the error only for the URLs in the failed chunk
            topic_fetch_urls_ids = [u['topic_fetch_urls_id'] for u in chunk_urls]
            db.query(
                "update topic_fetch_urls set state = %(a)s, message = %(b)s where topic_fetch_urls_id = any(%(c)s)",
                {
                    'a': FETCH_STATE_PYTHON_ERROR,
                    'b': traceback.format_exc(),
                    'c': topic_fetch_urls_ids
                })

        i += URLS_CHUNK_SIZE
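
For illustration, a minimal self-contained sketch of that chunking behaviour (URLS_CHUNK_SIZE and the failing chunk are assumptions; db and topic are stubbed out): a failure in one chunk is recorded without stopping later chunks.

URLS_CHUNK_SIZE = 2  # assumed value for illustration; the real constant is defined in the module

def _demo_chunk_function(db, topic, chunk_urls):
    # Simulated fetcher: fail whenever the chunk contains ID 3.
    if any(u['topic_fetch_urls_id'] == 3 for u in chunk_urls):
        raise ValueError("simulated twitter API failure")
    print("fetched:", [u['topic_fetch_urls_id'] for u in chunk_urls])

urls = [{'topic_fetch_urls_id': i} for i in range(1, 6)]
i = 0
while i < len(urls):
    chunk_urls = urls[i:i + URLS_CHUNK_SIZE]
    try:
        _demo_chunk_function(None, None, chunk_urls)
    except Exception as ex:
        print("chunk failed:", ex)  # the real code stores this in topic_fetch_urls
    i += URLS_CHUNK_SIZE
# Output: [1, 2] fetched; [3, 4] fails; [5] is still fetched afterwards.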
Example 2
def _add_user_story(db: DatabaseHandler, topic: dict, user: dict, topic_fetch_urls: list) -> dict:
    """Generate a story based on the given user, as returned by the twitter api."""
    content = f"{user['name']} ({user['screen_name']}): {user['description']}"
    title = f"{user['name']} ({user['screen_name']}) | Twitter"
    tweet_date = sql_now()
    url = f"https://twitter.com/{user['screen_name']}"

    story = generate_story(db=db, url=url, content=content, title=title, publish_date=tweet_date)
    add_to_topic_stories(db=db, story=story, topic=topic, link_mined=True)

    for topic_fetch_url in topic_fetch_urls:
        topic_fetch_url = _log_tweet_added(db, topic_fetch_url, story)
        try_update_topic_link_ref_stories_id(db, topic_fetch_url)

    # twitter user pages are undateable because there is never a consistent version of the page
    undateable_tag = _get_undateable_tag(db)

    stories_id = story['stories_id']
    tags_id = undateable_tag['tags_id']

    db.query("""
        INSERT INTO public.stories_tags_map (stories_id, tags_id)
        VALUES (%(stories_id)s, %(tags_id)s)
        ON CONFLICT (stories_id, tags_id) DO NOTHING
    """, {
        'stories_id': stories_id,
        'tags_id': tags_id,
    })

    return story
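
The content, title, and url templates above can be exercised with a hypothetical Twitter API user payload (the dict below is invented for illustration; only the three fields the function reads are included):

user = {
    'name': 'Media Cloud',
    'screen_name': 'mediacloud',
    'description': 'Open source media analysis platform.',
}

content = f"{user['name']} ({user['screen_name']}): {user['description']}"
title = f"{user['name']} ({user['screen_name']}) | Twitter"
url = f"https://twitter.com/{user['screen_name']}"

assert content == 'Media Cloud (mediacloud): Open source media analysis platform.'
assert title == 'Media Cloud (mediacloud) | Twitter'
assert url == 'https://twitter.com/mediacloud'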
Example 3
def increment_domain_links(db: DatabaseHandler, topic_link: dict) -> None:
    """Given a topic link, increment the self_links and all_links counts in the corresponding topic_domains row.

    Increment self_links if the domain of the story at topic_links.stories_id is the same as the domain of
    topic_links.url or topic_links.redirect_url.  Always increment all_links.
    """
    story = db.require_by_id('stories', topic_link['stories_id'])
    story_domain = mediawords.util.url.get_url_distinctive_domain(story['url'])

    url_domain = mediawords.util.url.get_url_distinctive_domain(topic_link['url'])

    redirect_url = topic_link.get('redirect_url', topic_link['url'])
    redirect_url_domain = mediawords.util.url.get_url_distinctive_domain(redirect_url)

    self_link = 1 if story_domain in (url_domain, redirect_url_domain) else 0

    db.query(
        """
        insert into topic_domains (topics_id, domain, self_links, all_links)
            values(%(topics_id)s, %(domain)s, %(self_link)s, 1)
            on conflict (topics_id, md5(domain))
                do update set
                    self_links = topic_domains.self_links + %(self_link)s,
                    all_links = topic_domains.all_links + 1
        """,
        {'topics_id': topic_link['topics_id'], 'domain': redirect_url_domain, 'self_link': self_link})
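
The self_link flag hinges on comparing distinctive domains. A toy illustration with a simplified stand-in for get_url_distinctive_domain() (the real helper also strips subdomains and special-cases hosting providers):

from urllib.parse import urlparse

def _toy_domain(url: str) -> str:
    # Simplified stand-in for mediawords.util.url.get_url_distinctive_domain().
    host = urlparse(url).netloc.lower()
    return host[4:] if host.startswith('www.') else host

story_url = 'https://www.example.com/post/1'
link_url = 'https://example.com/post/2'   # same distinctive domain as the story
redirect_url = 'https://other.org/page'

story_domain = _toy_domain(story_url)
self_link = 1 if story_domain in (_toy_domain(link_url), _toy_domain(redirect_url)) else 0
assert self_link == 1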
Example 4
def __print_table_csv_to_stdout(db: DatabaseHandler, table: str) -> None:
    """Print table dump to STDOUT."""

    column_names = db.query("SELECT * FROM %s LIMIT 0" % table).columns()
    primary_key_column = db.primary_key_column(table=table)

    print("""
--
-- Table '%(table)s'
--

    """ % {'table': table})

    # Python's "csv" module doesn't bother to differentiate between empty strings and "None" values:
    #
    # http://stackoverflow.com/a/11379550/200603
    #
    # ...so we're exporting the table in "TEXT" format with a cumbersome "\\N" (two-backslashes-N) mark for NULL values.
    print(
        "COPY %(table)s (%(column_names)s) FROM STDIN WITH (FORMAT TEXT, NULL '\\\\N');"
        % {
            'table': table,
            'column_names': ', '.join(column_names),
        })

    csv_writer = csv.writer(sys.stdout,
                            delimiter="\t",
                            escapechar="\\",
                            quoting=csv.QUOTE_NONE)

    res = db.query("SELECT * FROM %(table)s ORDER BY %(primary_key_column)s" %
                   {
                       'table': table,
                       'primary_key_column': primary_key_column,
                   })

    postgresql_null_value = '\\N'
    postgresql_end_of_data = r'\.'
    while True:
        row = res.array()
        if row is None:
            break
        else:
            csv_writer.writerow(
                [postgresql_null_value if val is None else val for val in row])

    print(postgresql_end_of_data)

    print("""

-- Update sequence head
SELECT setval(
    pg_get_serial_sequence('%(table)s', '%(primary_key_column)s'),
    (SELECT max(%(primary_key_column)s)+1 FROM %(table)s)
);

    """ % {
        'table': table,
        'primary_key_column': primary_key_column,
    })
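
A standalone demonstration of that NULL convention (a toy using only the stdlib): None becomes the '\N' marker, and with QUOTE_NONE the writer escapes the backslash, producing the "two-backslashes-N" form that the COPY statement above declares as its NULL string.

import csv
import io

buf = io.StringIO()
csv_writer = csv.writer(buf, delimiter="\t", escapechar="\\", quoting=csv.QUOTE_NONE)

postgresql_null_value = '\\N'
for row in [(1, 'hello', None), (2, '', 'world')]:
    csv_writer.writerow([postgresql_null_value if val is None else val for val in row])

print(buf.getvalue(), end='')
# Prints (tabs shown as <TAB>):
#   1<TAB>hello<TAB>\\N
#   2<TAB><TAB>world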
Example 5
def _call_function_on_url_chunks(db: DatabaseHandler,
                                 topic: Dict[str, Any],
                                 urls: List[Dict[str, Any]],
                                 chunk_function: Callable) -> None:
    """Call chunk_function on chunks of up to URLS_CHUNK_SIZE urls at a time.

    Catch any exceptions raised and save them in the topic_fetch_urls for the given chunk.
    """
    i = 0
    while i < len(urls):
        chunk_urls = urls[i:i + URLS_CHUNK_SIZE]

        try:
            chunk_function(db, topic, chunk_urls)
        except Exception as ex:
            log.warning(f"error fetching twitter data: {ex}")

            # record the error only for the URLs in the failed chunk
            topic_fetch_urls_ids = [u['topic_fetch_urls_id'] for u in chunk_urls]

            db.query("""
                UPDATE topic_fetch_urls SET
                    state = %(state)s,
                    message = %(message)s
                WHERE
                    topics_id = %(topics_id)s AND
                    topic_fetch_urls_id = ANY(%(topic_fetch_urls_ids)s)
            """, {
                'state': FETCH_STATE_PYTHON_ERROR,
                'message': str(ex),
                'topics_id': topic['topics_id'],
                'topic_fetch_urls_ids': topic_fetch_urls_ids,
            })

        i += URLS_CHUNK_SIZE
Example 6
def _add_user_story(db: DatabaseHandler, topic: dict, user: dict,
                    topic_fetch_urls: list) -> dict:
    """Generate a story based on the given user, as returned by the twitter api."""
    content = '%s (%s): %s' % (user['name'], user['screen_name'],
                               user['description'])
    title = '%s (%s) | Twitter' % (user['name'], user['screen_name'])
    tweet_date = mediawords.util.sql.sql_now()
    url = 'https://twitter.com/%s' % user['screen_name']

    story = mediawords.tm.stories.generate_story(db=db,
                                                 url=url,
                                                 content=content,
                                                 title=title,
                                                 publish_date=tweet_date)
    mediawords.tm.stories.add_to_topic_stories(db=db,
                                               story=story,
                                               topic=topic,
                                               link_mined=True)

    for topic_fetch_url in topic_fetch_urls:
        topic_fetch_url = _log_tweet_added(db, topic_fetch_url, story)
        mediawords.tm.fetch_link.try_update_topic_link_ref_stories_id(
            db, topic_fetch_url)

    # twitter user pages are undateable because there is never a consistent version of the page
    undateable_tag = _get_undateable_tag(db)
    db.query(
        "insert into stories_tags_map (stories_id, tags_id) values (%(a)s, %(b)s)",
        {
            'a': story['stories_id'],
            'b': undateable_tag['tags_id']
        })

    return story
Example 7
def _log_tweet_missing(db: DatabaseHandler, topic_fetch_url: dict) -> None:
    """Update topic_fetch_url state to tweet missing."""
    db.query(
        "update topic_fetch_urls set state = %(a)s, fetch_date = now() where topic_fetch_urls_id = %(b)s",
        {
            'a': FETCH_STATE_TWEET_MISSING,
            'b': topic_fetch_url['topic_fetch_urls_id']
        })
Example 8
def _log_content_match_failed(db: DatabaseHandler,
                              topic_fetch_url: dict) -> None:
    """Update topic_fetch_url state to content match failed."""
    db.query(
        "update topic_fetch_urls set state = %(a)s, fetch_date = now() where topic_fetch_urls_id = %(b)s",
        {
            'a': FETCH_STATE_CONTENT_MATCH_FAILED,
            'b': topic_fetch_url['topic_fetch_urls_id']
        })
Example 9
def __kill_connections_to_database(db: DatabaseHandler, database_name: str) -> None:
    """Kill all active connections to the database."""
    # If multiple Python test files get run in a sequence and one of them fails, the test apparently doesn't call
    # tearDown(), and the connection to the test database persists
    db.query("""
        SELECT pg_terminate_backend(pg_stat_activity.pid)
        FROM pg_catalog.pg_stat_activity
        WHERE datname = %(template_db_name)s
          AND pid != pg_backend_pid()
    """, {'template_db_name': database_name})
Example 10
def __kill_connections_to_database(db: DatabaseHandler,
                                   database_name: str) -> None:
    """Kill all active connections to the database."""
    # If multiple Python test files get run in a sequence and one of them fails, the test apparently doesn't call
    # tearDown(), and the connection to the test database persists
    db.query(
        """
        SELECT pg_terminate_backend(pg_stat_activity.pid)
        FROM pg_catalog.pg_stat_activity
        WHERE datname = %(template_db_name)s
          AND pid != pg_backend_pid()
    """, {'template_db_name': database_name})
Example 11
def __print_table_csv_to_stdout(db: DatabaseHandler, table: str) -> None:
    """Print table dump to STDOUT."""

    column_names = db.query("SELECT * FROM %s LIMIT 0" % table).columns()
    primary_key_column = db.primary_key_column(table)

    print("""
--
-- Table '%(table)s'
--

    """ % {'table': table})

    # Python's "csv" module doesn't bother to differentiate between empty strings and "None" values:
    #
    # http://stackoverflow.com/a/11379550/200603
    #
    # ...so we're exporting the table in "TEXT" format with a cumbersome "\\N" (two-backslashes-N) mark for NULL values.
    print("COPY %(table)s (%(column_names)s) FROM STDIN WITH (FORMAT TEXT, NULL '\\\\N');" % {
        'table': table,
        'column_names': ', '.join(column_names),
    })

    csv_writer = csv.writer(sys.stdout, delimiter="\t", escapechar="\\", quoting=csv.QUOTE_NONE)

    res = db.query("SELECT * FROM %(table)s ORDER BY %(primary_key_column)s" % {
        'table': table,
        'primary_key_column': primary_key_column,
    })

    postgresql_null_value = '\\N'
    postgresql_end_of_data = r'\.'
    while True:
        row = res.array()
        if row is None:
            break
        else:
            csv_writer.writerow([postgresql_null_value if val is None else val for val in row])

    print(postgresql_end_of_data)

    print("""

-- Update sequence head
SELECT setval(
    pg_get_serial_sequence('%(table)s', '%(primary_key_column)s'),
    (SELECT max(%(primary_key_column)s)+1 FROM %(table)s)
);

    """ % {
        'table': table,
        'primary_key_column': primary_key_column,
    })
Example 12
def fetch_twitter_urls_update_state(db: DatabaseHandler, topic_fetch_urls_ids: List[int]) -> None:
    """Try fetch_twitter_urls(), update state."""
    try:
        fetch_twitter_urls(db=db, topic_fetch_urls_ids=topic_fetch_urls_ids)
    except Exception as ex:
        log.error("Error while fetching URL with ID {}: {}".format(topic_fetch_urls_ids, str(ex)))
        db.query("""
            update topic_fetch_urls
            set state = %(a)s,
                message = %(b)s,
                fetch_date = now()
            where topic_fetch_urls_id = any(%(c)s)
        """, {'a': FETCH_STATE_PYTHON_ERROR, 'b': traceback.format_exc(), 'c': topic_fetch_urls_ids})
Example 13
def _log_tweet_missing(db: DatabaseHandler, topic_fetch_url: dict) -> None:
    """Update topic_fetch_url state to tweet missing."""

    db.query("""
        UPDATE topic_fetch_urls SET
            state = %(state)s,
            fetch_date = NOW()
        WHERE
            topics_id = %(topics_id)s AND
            topic_fetch_urls_id = %(topic_fetch_urls_id)s
    """, {
        'state': FETCH_STATE_TWEET_MISSING,
        'topics_id': topic_fetch_url['topics_id'],
        'topic_fetch_urls_id': topic_fetch_url['topic_fetch_urls_id'],
    })
Example 14
def _log_content_match_failed(db: DatabaseHandler, topic_fetch_url: dict) -> None:
    """Update topic_fetch_url state to content match failed."""

    db.query("""
        UPDATE topic_fetch_urls SET
            state = %(state)s,
            fetch_date = NOW()
        WHERE
            topics_id = %(topics_id)s AND
            topic_fetch_urls_id = %(topic_fetch_urls_id)s
    """, {
        'state': FETCH_STATE_CONTENT_MATCH_FAILED,
        'topics_id': topic_fetch_url['topics_id'],
        'topic_fetch_urls_id': topic_fetch_url['topic_fetch_urls_id'],
    })
Example 15
def skip_self_linked_domain_url(db: DatabaseHandler, topics_id: int,
                                source_url: str, ref_url: str) -> bool:
    """Return true if the url should be skipped because it is a self linked domain within the topic.

    Return true if the domain of the ref_url is the same as the domain of the source_url and one of the following
    is true:
    * the topic_domains.self_links value for the domain has reached MAX_SELF_LINKS, or
    * ref_url matches SKIP_SELF_LINK_RE.
    """
    source_domain = get_url_distinctive_domain(source_url)
    ref_domain = get_url_distinctive_domain(ref_url)

    if source_domain != ref_domain:
        return False

    if re.search(SKIP_SELF_LINK_RE, ref_url, flags=re.I):
        return True

    topic_domain = db.query(
        "select * from topic_domains where topics_id = %(a)s and md5(domain) = md5(%(b)s)",
        {
            'a': topics_id,
            'b': ref_domain
        }).hash()

    if topic_domain and topic_domain['self_links'] >= MAX_SELF_LINKS:
        return True

    return False
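
The module constants are not shown here; a hypothetical sketch of how they interact (both values below are invented for illustration, not the real ones):

import re

SKIP_SELF_LINK_RE = r'/(tag|category|author|search)/'  # hypothetical pattern
MAX_SELF_LINKS = 200                                   # hypothetical threshold

ref_url = 'https://example.com/tag/politics/'
# A matching ref_url is skipped regardless of the topic_domains counter:
assert re.search(SKIP_SELF_LINK_RE, ref_url, flags=re.I) is not None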
Example 16
def _get_undateable_tag(db: DatabaseHandler) -> dict:
    """Return the date_invalid:undateable tag."""
    tag_name = 'undateable'
    tag_set_name = 'date_invalid'

    invalid_tag = db.query(
        """
        select t.*
            from tags t
                join tag_sets ts using ( tag_sets_id )
            where
                t.tag = %(a)s and
                ts.name = %(b)s
        """, {
            'a': tag_name,
            'b': tag_set_name
        }).hash()

    if invalid_tag is None:
        tag_set = db.find_or_create('tag_sets', {'name': tag_set_name})
        invalid_tag = db.find_or_create('tags', {
            'tag': tag_name,
            'tag_sets_id': tag_set['tag_sets_id']
        })

    return invalid_tag
Example 17
def skip_self_linked_domain(db: DatabaseHandler,
                            topic_fetch_url: dict) -> bool:
    """Given a topic_fetch_url, return true if the url should be skipped because it is a self linked domain.

    Return skip_self_linked_domain_url() for the topic, source url and ref url of the given topic_fetch_url.

    Always return false if topic_fetch_url['topic_links_id'] is None or not in the dict.
    """
    if topic_fetch_url.get('topic_links_id') is None:
        return False

    topic_link = db.query(
        """
        SELECT *
        FROM topic_links
        WHERE
            topics_id = %(topics_id)s AND
            topic_links_id = %(topic_links_id)s
    """, {
            'topics_id': topic_fetch_url['topics_id'],
            'topic_links_id': topic_fetch_url['topic_links_id'],
        }).hash()
    if not topic_link:
        raise Exception(
            f"Topic link ID {topic_fetch_url['topic_links_id']} was not found."
        )

    story = db.require_by_id('stories', topic_link['stories_id'])

    url = topic_link.get('redirect_url', topic_link['url'])

    return skip_self_linked_domain_url(db, topic_fetch_url['topics_id'],
                                       story['url'], url)
Example 18
def print_long_running_job_states(db: DatabaseHandler):
    """Print a count of job_states that have been running or queued for longer than a day."""
    counts = db.query("""
        select
            class,
            count(*) as count,
            min(last_updated::date) as min_date,
            max(last_updated::date) as max_date
        from (
            select
                *,
                rank() over (
                    partition by
                        class,
                        (args->>'media_id')::int
                    order by last_updated desc
                ) as media_rank,
                args->>'media_id' as media_id
            from job_states
        ) as q
        where
            media_rank = 1 and
            state in ('queued', 'running') and
            last_updated < now() - interval '1 day'
        group by class
        order by class
    """).hashes()

    if len(counts) > 0:
        print("Long Running Jobs:\n")

        for count in counts:
            print("%s: %d (%s - %s)" % (count['class'], count['count'], count['min_date'], count['max_date']))
Example 19
def skip_self_linked_domain_url(db: DatabaseHandler, topics_id: int, source_url: str, ref_url: str) -> bool:
    """Return true if the url should be skipped because it is a self linked domain within the topic.

    Return true if the domain of the ref_url is the same as the domain of the source_url and one of the following
    is true:
    * the topic_domains.self_links value for the domain has reached MAX_SELF_LINKS, or
    * ref_url matches SKIP_SELF_LINK_RE.
    """
    source_domain = mediawords.util.url.get_url_distinctive_domain(source_url)
    ref_domain = mediawords.util.url.get_url_distinctive_domain(ref_url)

    if source_domain != ref_domain:
        return False

    if re.search(SKIP_SELF_LINK_RE, ref_url, flags=re.I):
        return True

    topic_domain = db.query(
        "select * from topic_domains where topics_id = %(a)s and md5(domain) = md5(%(b)s)",
        {'a': topics_id, 'b': ref_domain}).hash()

    if topic_domain and topic_domain['self_links'] >= MAX_SELF_LINKS:
        return True

    return False
Example 20
def _log_tweet_added(db: DatabaseHandler, topic_fetch_url: dict, story: dict) -> dict:
    """Update topic_fetch_url state to tweet added."""
    return db.query(
        """
        update topic_fetch_urls set state=%(a)s, stories_id=%(b)s, fetch_date=now() where topic_fetch_urls_id=%(c)s
            returning *
        """,
        {'a': FETCH_STATE_TWEET_ADDED, 'b': story['stories_id'], 'c': topic_fetch_url['topic_fetch_urls_id']}).hash()
Example 21
def increment_domain_links(db: DatabaseHandler, topic_link: dict) -> None:
    """Given a topic link, increment the self_links count is necessary n the corresponding topic_domains row.

    Increment self_links if the domain of the story at topic_links.stories_id is the same as the domain of
    topic_links.url or topic_links.redirect_url.
    """
    story = db.require_by_id('stories', topic_link['stories_id'])
    story_domain = mediawords.util.url.get_url_distinctive_domain(story['url'])

    url_domain = mediawords.util.url.get_url_distinctive_domain(topic_link['url'])

    redirect_url = topic_link.get('redirect_url', topic_link['url'])
    redirect_url_domain = mediawords.util.url.get_url_distinctive_domain(redirect_url)

    if story_domain not in (url_domain, redirect_url_domain):
        return

    topic_domain = db.query(
        """
        insert into topic_domains (topics_id, domain, self_links)
            values(%(topics_id)s, %(domain)s, 1)
            on conflict (topics_id, md5(domain))
                do nothing
            returning *
        """,
        {
            'topics_id': topic_link['topics_id'],
            'domain': redirect_url_domain
        }
    ).hash()

    # do this update separately instead of as an upsert because the upsert was occasionally deadlocking
    if not topic_domain:
        db.query(
            """
            update topic_domains set
                    self_links = topic_domains.self_links + 1
                where
                    topics_id = %(topics_id)s and
                    domain = %(domain)s
            """,
            {
                'topics_id': topic_link['topics_id'],
                'domain': redirect_url_domain
            }
        )
Example 22
def _log_python_error(db: DatabaseHandler, topic_fetch_url: dict, message: str) -> dict:
    """Update topic_fetch_url state to python error."""
    return db.query(
        """
        update topic_fetch_urls set state=%(a)s, fetch_date=now(), message = %(b)s  where topic_fetch_urls_id=%(c)s
            returning *
        """,
        {'a': FETCH_STATE_PYTHON_ERROR, 'b': message, 'c': topic_fetch_url['topic_fetch_urls_id']}).hash()
Example 23
def _log_tweet_added(db: DatabaseHandler, topic_fetch_url: dict, story: dict) -> dict:
    """Update topic_fetch_url stat to tweet added."""
    return db.query("""
        UPDATE topic_fetch_urls
        SET state = %(a)s,
            stories_id = %(b)s,
            fetch_date = NOW()
        WHERE topic_fetch_urls_id = %(c)s
        RETURNING *
    """, {'a': FETCH_STATE_TWEET_ADDED, 'b': story['stories_id'], 'c': topic_fetch_url['topic_fetch_urls_id']}).hash()
Example 24
def add_content_to_test_story(db: DatabaseHandler, story: dict, feed: dict) -> dict:
    """Add 'download' and 'content' fields to the given test story and store the content in the download
    store. Use the story['content'] field if present, otherwise generate the content using _get_test_content()."""

    story = decode_object_from_bytes_if_needed(story)
    feed = decode_object_from_bytes_if_needed(feed)

    if 'content' in story:
        content = story['content']
    else:
        content = _get_test_content()

    if story.get('full_text_rss', None):
        story['full_text_rss'] = False
        db.update_by_id(
            table='stories',
            object_id=story['stories_id'],
            update_hash={'full_text_rss': False},
        )

    host = get_url_host(feed['url'])

    download = db.create(
        table='downloads',
        insert_hash={
            'feeds_id': feed['feeds_id'],
            'url': story['url'],
            'host': host,
            'type': 'content',
            'sequence': 1,
            'state': 'fetching',
            'priority': 1,
            'extracted': False,
            'stories_id': story['stories_id'],
        }
    )

    download = store_content(db=db, download=download, content=content)

    story['download'] = download
    story['content'] = content

    extract_and_process_story(db=db, story=story)

    story['download_text'] = db.query("""
        SELECT *
        FROM download_texts
        WHERE downloads_id = %(downloads_id)s
    """, {'downloads_id': download['downloads_id']}).hash()

    if not story['download_text']:
        raise McAddContentToTestStoryException("Unable to find download_text")

    return story
Example 25
def _call_function_on_url_chunks(db: DatabaseHandler, topic: dict, urls: List, chunk_function: Callable) -> None:
    """Call chunk_function on chunks of up to URLS_CHUNK_SIZE urls at a time.

    Catch any exceptions raised and save them in the topic_fetch_urls for the given chunk.
    """
    i = 0
    while i < len(urls):
        chunk_urls = urls[i:i + URLS_CHUNK_SIZE]

        try:
            chunk_function(db, topic, chunk_urls)
        except Exception as ex:
            log.warning("error fetching twitter data: {}".format(ex))

            # record the error only for the URLs in the failed chunk
            topic_fetch_urls_ids = [u['topic_fetch_urls_id'] for u in chunk_urls]
            db.query(
                "update topic_fetch_urls set state = %(a)s, message = %(b)s where topic_fetch_urls_id = any(%(c)s)",
                {'a': FETCH_STATE_PYTHON_ERROR, 'b': traceback.format_exc(), 'c': topic_fetch_urls_ids})

        i += URLS_CHUNK_SIZE
Example 26
def add_content_to_test_story(db: DatabaseHandler, story: dict, feed: dict) -> dict:
    """Add 'download' and 'content' fields to the given test story and store the content in the download
    store. Use the story['content'] field if present, otherwise generate the content using _get_test_content()."""

    story = decode_object_from_bytes_if_needed(story)
    feed = decode_object_from_bytes_if_needed(feed)

    if 'content' in story:
        content = story['content']
    else:
        content = _get_test_content()

    if story.get('full_text_rss', None):
        story['full_text_rss'] = False
        db.update_by_id(
            table='stories',
            object_id=story['stories_id'],
            update_hash={'full_text_rss': False},
        )

    host = get_url_host(feed['url'])

    download = db.create(
        table='downloads',
        insert_hash={
            'feeds_id': feed['feeds_id'],
            'url': story['url'],
            'host': host,
            'type': 'content',
            'sequence': 1,
            'state': 'fetching',
            'priority': 1,
            'extracted': False,
            'stories_id': story['stories_id'],
        }
    )

    download = store_content(db=db, download=download, content=content)

    story['download'] = download
    story['content'] = content

    extract_and_process_story(db=db, story=story)

    story['download_text'] = db.query("""
        SELECT *
        FROM download_texts
        WHERE downloads_id = %(downloads_id)s
    """, {'downloads_id': download['downloads_id']}).hash()

    if not story['download_text']:
        raise McAddContentToTestStoryException("Unable to find download_text")

    return story
Example 27
def increment_domain_links(db: DatabaseHandler, topic_link: dict) -> None:
    """Given a topic link, increment the self_links count is necessary n the corresponding topic_domains row.

    Increment self_links if the domain of the story at topic_links.stories_id is the same as the domain of
    topic_links.url or topic_links.redirect_url.
    """
    story = db.require_by_id('stories', topic_link['stories_id'])
    story_domain = get_url_distinctive_domain(story['url'])

    url_domain = get_url_distinctive_domain(topic_link['url'])

    redirect_url = topic_link.get('redirect_url', topic_link['url'])
    redirect_url_domain = get_url_distinctive_domain(redirect_url)

    if story_domain not in (url_domain, redirect_url_domain):
        return

    topic_domain = db.query(
        """
            INSERT INTO topic_domains (topics_id, domain, self_links)
            VALUES (%(topics_id)s, %(domain)s, 1)
            ON CONFLICT (topics_id, md5(domain)) DO NOTHING
            RETURNING *
        """, {
            'topics_id': topic_link['topics_id'],
            'domain': redirect_url_domain
        }).hash()

    # do this update separately instead of as an upsert because the upsert was occasionally deadlocking
    if not topic_domain:
        db.query(
            """
            UPDATE topic_domains SET
                self_links = topic_domains.self_links + 1
            WHERE
                topics_id = %(topics_id)s AND
                domain = %(domain)s
            """, {
                'topics_id': topic_link['topics_id'],
                'domain': redirect_url_domain
            })
Example 28
def _add_user_story(db: DatabaseHandler, topic: dict, user: dict, topic_fetch_urls: list) -> dict:
    """Generate a story based on the given user, as returned by the twitter api."""
    content = '%s (%s): %s' % (user['name'], user['screen_name'], user['description'])
    title = '%s (%s) | Twitter' % (user['name'], user['screen_name'])
    tweet_date = mediawords.util.sql.sql_now()
    url = 'https://twitter.com/%s' % user['screen_name']

    story = mediawords.tm.stories.generate_story(db=db, url=url, content=content, title=title, publish_date=tweet_date)
    mediawords.tm.stories.add_to_topic_stories(db=db, story=story, topic=topic, link_mined=True)

    for topic_fetch_url in topic_fetch_urls:
        topic_fetch_url = _log_tweet_added(db, topic_fetch_url, story)
        mediawords.tm.fetch_link.try_update_topic_link_ref_stories_id(db, topic_fetch_url)

    # twitter user pages are undateable because there is never a consistent version of the page
    undateable_tag = _get_undateable_tag(db)
    db.query(
        "insert into stories_tags_map (stories_id, tags_id) values (%(a)s, %(b)s)",
        {'a': story['stories_id'], 'b': undateable_tag['tags_id']})

    return story
Example 29
def _log_tweet_added(db: DatabaseHandler, topic_fetch_url: dict,
                     story: dict) -> dict:
    """Update topic_fetch_url stat to tweet added."""
    return db.query(
        """
        update topic_fetch_urls set state=%(a)s, stories_id=%(b)s, fetch_date=now() where topic_fetch_urls_id=%(c)s
            returning *
        """, {
            'a': FETCH_STATE_TWEET_ADDED,
            'b': story['stories_id'],
            'c': topic_fetch_url['topic_fetch_urls_id']
        }).hash()
Example 30
def _log_python_error(db: DatabaseHandler, topic_fetch_url: dict,
                      message: str) -> dict:
    """Update topic_fetch_url state to python error."""
    return db.query(
        """
        update topic_fetch_urls set state=%(a)s, fetch_date=now(), message = %(b)s  where topic_fetch_urls_id=%(c)s
            returning *
        """, {
            'a': FETCH_STATE_PYTHON_ERROR,
            'b': message,
            'c': topic_fetch_url['topic_fetch_urls_id']
        }).hash()
Example 31
def fetch_twitter_urls_update_state(db: DatabaseHandler,
                                    topics_id: int,
                                    topic_fetch_urls_ids: List[int]) -> None:
    """Try fetch_twitter_urls(), update state."""
    try:
        fetch_twitter_urls(db=db, topics_id=topics_id, topic_fetch_urls_ids=topic_fetch_urls_ids)
    except Exception as ex:
        log.error(f"Error while fetching URL with ID {topic_fetch_urls_ids}: {ex}")

        db.query("""
            UPDATE topic_fetch_urls SET
                state = %(state)s,
                message = %(message)s,
                fetch_date = NOW()
            WHERE
                topics_id = %(topics_id)s AND
                topic_fetch_urls_id = ANY(%(topic_fetch_urls_ids)s)
        """, {
            'state': FETCH_STATE_PYTHON_ERROR,
            'message': traceback.format_exc(),
            'topics_id': topics_id,
            'topic_fetch_urls_ids': topic_fetch_urls_ids,
        })
Example 32
def create_test_topic_stories(
        db: DatabaseHandler,
        topic: dict,
        num_media: int = 10,
        num_stories_per_medium: int = 10) -> None:
    """Fill topic with test stories in num_media with one cross media link per medium."""
    for mi in range(num_media):
        medium = create_test_medium(db, f'medium {mi}')
        feed = create_test_feed(db, f'feed {mi}', medium)
        
        for si in range(num_stories_per_medium):
            story = create_test_story(db, f'story {mi} {si}', feed)
            db.update_by_id('stories', story['stories_id'], {'publish_date': topic['start_date']})
            db.create('topic_stories', {'topics_id': topic['topics_id'], 'stories_id': story['stories_id']})

    db.query(
        """
        insert into topic_links ( topics_id, stories_id, url, ref_stories_id )
            select %(a)s, a.stories_id, b.url, b.stories_id
                from stories a
                    join stories b on ( a.media_id <> b.media_id )
                limit %(b)s
        """,
        {'a': topic['topics_id'], 'b': num_media * num_stories_per_medium})
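
A hypothetical usage sketch (create_test_topic is assumed to be the usual test factory for the topics table; the exact helper name may differ):

topic = create_test_topic(db, 'test topic')  # assumed factory helper
create_test_topic_stories(db, topic, num_media=2, num_stories_per_medium=3)

story_count = db.query(
    "select count(*) from topic_stories where topics_id = %(a)s",
    {'a': topic['topics_id']}).flat()[0]
assert story_count == 6  # num_media * num_stories_per_medium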
Example 33
def fetch_twitter_urls(db: DatabaseHandler, topic_fetch_urls_ids: List) -> None:
    """Fetch topic_fetch_urls from twitter api as statuses and users in chunks of up to 100."""
    if len(topic_fetch_urls_ids) == 0:
        return

    topic_fetch_urls = db.query(
        "select * from topic_fetch_urls where topic_fetch_urls_id = any(%(a)s)",
        {'a': topic_fetch_urls_ids}).hashes()

    topic = db.require_by_id('topics', topic_fetch_urls[0]['topics_id'])

    (user_urls, status_urls) = _split_urls_into_users_and_statuses(topic_fetch_urls)

    _call_function_on_url_chunks(db, topic, user_urls, _try_fetch_users_chunk)
    _call_function_on_url_chunks(db, topic, status_urls, _try_fetch_tweets_chunk)
Example 34
def fetch_twitter_urls(db: DatabaseHandler, topic_fetch_urls_ids: List[int]) -> None:
    """Fetch topic_fetch_urls from twitter api as statuses and users in chunks of up to 100."""
    if len(topic_fetch_urls_ids) == 0:
        return

    topic_fetch_urls = db.query(
        "select * from topic_fetch_urls where topic_fetch_urls_id = any(%(a)s)",
        {'a': topic_fetch_urls_ids}).hashes()

    topic = db.require_by_id('topics', topic_fetch_urls[0]['topics_id'])

    (user_urls, status_urls) = _split_urls_into_users_and_statuses(topic_fetch_urls)

    _call_function_on_url_chunks(db, topic, user_urls, _try_fetch_users_chunk)
    _call_function_on_url_chunks(db, topic, status_urls, _try_fetch_tweets_chunk)
Example 35
def skip_self_linked_domain(db: DatabaseHandler, topic_fetch_url: dict) -> bool:
    """Given a topic_fetch_url, return true if the url should be skipped because it is a self linked domain.

    Return true if the domain of the linked url is the same as the domain of the linking story and one of the following
    is true:
    * the topic_domains.self_links value for the domain is greater than MAX_SELF_LINKS, or
    * the linked url matches SKIP_SELF_LINK_RE.

    Always return false if topic_fetch_url['topic_links_id'] is None or not in the dict.
    """
    if 'topic_links_id' not in topic_fetch_url or topic_fetch_url['topic_links_id'] is None:
        return False

    topic_link = db.require_by_id('topic_links', topic_fetch_url['topic_links_id'])

    story = db.require_by_id('stories', topic_link['stories_id'])
    story_domain = mediawords.util.url.get_url_distinctive_domain(story['url'])

    url_domain = mediawords.util.url.get_url_distinctive_domain(topic_link['url'])

    redirect_url = topic_link.get('redirect_url', topic_link['url'])

    if redirect_url is None:
        redirect_url = topic_link['url']

    redirect_url_domain = mediawords.util.url.get_url_distinctive_domain(redirect_url)

    link_domain = redirect_url_domain if redirect_url_domain else url_domain

    if story_domain not in (url_domain, redirect_url_domain):
        return False

    for url in (topic_link['url'], redirect_url):
        if re.search(SKIP_SELF_LINK_RE, url, flags=re.I):
            return True

    topic_domain = db.query(
        "select * from topic_domains where topics_id = %(a)s and md5(domain) = md5(%(b)s)",
        {'a': topic_fetch_url['topics_id'], 'b': link_domain}).hash()

    if topic_domain and topic_domain['self_links'] > MAX_SELF_LINKS:
        return True

    return False
Example 36
def reset_all_schemas(db_: DatabaseHandler) -> None:
    """Recreate all schemas."""
    schemas = db_.query("""
        SELECT schema_name
        FROM information_schema.schemata
        WHERE schema_name NOT LIKE %(schema_pattern)s
          AND schema_name != 'information_schema'
        ORDER BY schema_name
    """, {'schema_pattern': 'pg_%'}).flat()

    # When dropping schemas, PostgreSQL spits out a lot of notices which break the "no warnings" unit test
    db_.query('SET client_min_messages=WARNING')

    for schema in schemas:
        db_.query('DROP SCHEMA IF EXISTS %s CASCADE' % schema)

    db_.query('SET client_min_messages=NOTICE')
Example 37
def _get_undateable_tag(db: DatabaseHandler) -> dict:
    """Return the date_invalid:undateable tag."""
    tag_name = 'undateable'
    tag_set_name = 'date_invalid'

    invalid_tag = db.query(
        """
        select t.*
            from tags t
                join tag_sets ts using ( tag_sets_id )
            where
                t.tag = %(a)s and
                ts.name = %(b)s
        """,
        {'a': tag_name, 'b': tag_set_name}).hash()

    if invalid_tag is None:
        tag_set = db.find_or_create('tag_sets', {'name': tag_set_name})
        invalid_tag = db.find_or_create('tags', {'tag': tag_name, 'tag_sets_id': tag_set['tag_sets_id']})

    return invalid_tag
Example 38
def postgres_regex_match(db: DatabaseHandler, strings: List[str],
                         regex: str) -> bool:
    """Run the regex through the PostgreSQL engine against a given list of strings.

    Return True if any string matches the given regex.

    This is necessary because very occasionally the wrong combination of text and complex boolean regex will cause Perl
    (Python too?) to hang."""

    strings = decode_object_from_bytes_if_needed(strings)
    regex = decode_object_from_bytes_if_needed(regex)

    if not isinstance(strings, list):
        raise McPostgresRegexMatch("Strings must be a list, but is: %s" %
                                   str(strings))

    if len(strings) == 0:
        return False

    if not isinstance(strings[0], str):
        raise McPostgresRegexMatch(
            "Strings must be a list of strings, but is: %s" % str(strings))

    full_regex = '(?isx)%s' % regex
    match = db.query(
        """
        SELECT 1
        FROM UNNEST(%(strings)s) AS string
        WHERE string ~ %(regex)s
        LIMIT 1
    """,
        {
            'strings': strings,  # list gets converted to PostgreSQL's ARRAY[]
            'regex': full_regex,
        }).hash()

    if match is not None:
        return True
    else:
        return False
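
A usage sketch, assuming db is an open DatabaseHandler. Because the function prepends (?isx), patterns are case-insensitive and whitespace inside them is ignored; evaluation happens inside PostgreSQL, so a pathological regex cannot hang the Python process itself:

assert postgres_regex_match(db, ['The QUICK brown fox'], r'quick \s+ brown') is True
assert postgres_regex_match(db, ['The quick brown fox'], r'lazy \s+ dog') is False
assert postgres_regex_match(db, [], r'anything') is False  # empty input short-circuits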
Example 39
def _get_undateable_tag(db: DatabaseHandler) -> dict:
    """Return the date_invalid:undateable tag."""
    tag_name = 'undateable'
    tag_set_name = 'date_invalid'

    invalid_tag = db.query("""
        SELECT t.*
        FROM tags AS t
            INNER JOIN tag_sets AS ts USING (tag_sets_id)
        WHERE
            t.tag = %(tag_name)s AND
            ts.name = %(tag_set_name)s
    """, {
        'tag_name': tag_name,
        'tag_set_name': tag_set_name,
    }).hash()

    if invalid_tag is None:
        tag_set = db.find_or_create('tag_sets', {'name': tag_set_name})
        invalid_tag = db.find_or_create('tags', {'tag': tag_name, 'tag_sets_id': tag_set['tag_sets_id']})

    return invalid_tag
Example 40
def fetch_twitter_urls(db: DatabaseHandler, topics_id: int, topic_fetch_urls_ids: List[int]) -> None:
    """Fetch topic_fetch_urls from twitter api as statuses and users in chunks of up to 100."""
    if len(topic_fetch_urls_ids) == 0:
        return

    topic_fetch_urls = db.query("""
        SELECT *
        FROM topic_fetch_urls
        WHERE
            topics_id = %(topics_id)s AND
            topic_fetch_urls_id = ANY(%(topic_fetch_urls_ids)s)
    """, {
        'topics_id': topics_id,
        'topic_fetch_urls_ids': topic_fetch_urls_ids,
    }).hashes()

    topic = db.require_by_id('topics', topics_id)

    (user_urls, status_urls) = _split_urls_into_users_and_statuses(topic_fetch_urls)

    _call_function_on_url_chunks(db, topic, user_urls, _try_fetch_users_chunk)
    _call_function_on_url_chunks(db, topic, status_urls, _try_fetch_tweets_chunk)
Example 41
def reset_all_schemas(db_: DatabaseHandler) -> None:
    """Recreate all schemas."""
    schemas = db_.query(
        """
        SELECT schema_name
        FROM information_schema.schemata
        WHERE schema_name NOT LIKE %(schema_pattern)s
          AND schema_name != 'information_schema'
        ORDER BY schema_name
    """, {
            'schema_pattern': 'pg_%'
        }).flat()

    # When dropping schemas, PostgreSQL spits out a lot of notices which break the "no warnings" unit test
    db_.query('SET client_min_messages=WARNING')

    for schema in schemas:
        db_.query('DROP SCHEMA IF EXISTS %s CASCADE' % schema)

    db_.query('SET client_min_messages=NOTICE')
Example 42
def __validate_table_foreign_keys(db: DatabaseHandler, table: str) -> None:
    """Validate all table's foreign keys; raise McValidateTableForeignKeysException if any of the keys are invalid.

    If table's constraints aren't right, SQL would be pretty much invalid."""

    foreign_keys = db.query("""
        SELECT
            tc.constraint_name,
            tc.table_schema,
            tc.table_name,
            kcu.column_name,
            ccu.table_schema AS foreign_table_schema,
            ccu.table_name AS foreign_table_name,
            ccu.column_name AS foreign_column_name

        FROM information_schema.table_constraints AS tc
            JOIN information_schema.key_column_usage AS kcu
                ON tc.constraint_name = kcu.constraint_name
            JOIN information_schema.constraint_column_usage AS ccu
                ON ccu.constraint_name = tc.constraint_name
        WHERE constraint_type = 'FOREIGN KEY'
          AND tc.table_schema = 'public'
          AND tc.table_name = %(table)s
    """, {'table': table}).hashes()

    foreign_key_errors = []

    for foreign_key in foreign_keys:
        constraint_name = foreign_key['constraint_name']

        log.info("Validating foreign key '%s' for table '%s'..." % (constraint_name, table))

        sql = """
            SELECT DISTINCT a.%(column_name)s
            FROM %(table_schema)s.%(table_name)s AS a
                LEFT JOIN %(foreign_table_schema)s.%(foreign_table_name)s AS b
                    ON a.%(column_name)s = b.%(foreign_column_name)s
            WHERE a.%(column_name)s IS NOT NULL
              AND b.%(foreign_column_name)s IS NULL
            ORDER BY a.%(column_name)s
        """ % {
            'column_name': foreign_key['column_name'],
            'table_schema': foreign_key['table_schema'],
            'table_name': table,
            'foreign_table_schema': foreign_key['foreign_table_schema'],
            'foreign_table_name': foreign_key['foreign_table_name'],
            'foreign_column_name': foreign_key['foreign_column_name'],
        }

        unreferenced_rows = db.query(sql).flat()

        if len(unreferenced_rows) > 0:
            error = """
                Table '%(table)s' has unreferenced rows for constraint '%(constraint_name)s':
                %(unreferenced_rows)s; SQL: %(sql)s
            """ % {
                'table': table,
                'constraint_name': constraint_name,
                'unreferenced_rows': ', '.join(str(row) for row in unreferenced_rows),
                'sql': sql,
            }
            foreign_key_errors.append(error)
            log.warning(error)
        else:
            log.info("Foreign key '%s' for table '%s' looks fine." % (constraint_name, table))

    if len(foreign_key_errors) > 0:
        raise McValidateTableForeignKeysException(
            "One or more foreign key checks failed for table '%(table)s': %(foreign_key_errors)s" % {
                'table': table,
                'foreign_key_errors': "\n".join(foreign_key_errors)
            }
        )
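
For a hypothetical foreign key stories.media_id referencing media.media_id, the interpolated per-constraint query above would come out roughly as follows (schema names are supplied by information_schema at runtime):

SELECT DISTINCT a.media_id
FROM public.stories AS a
    LEFT JOIN public.media AS b
        ON a.media_id = b.media_id
WHERE a.media_id IS NOT NULL
  AND b.media_id IS NULL
ORDER BY a.media_id

Any rows returned are media_id values in stories that reference a nonexistent media row.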
Example 43
def _log_content_match_failed(db: DatabaseHandler, topic_fetch_url: dict) -> None:
    """Update topic_fetch_url state to content match failed."""
    db.query(
        "update topic_fetch_urls set state = %(a)s, fetch_date = now() where topic_fetch_urls_id = %(b)s",
        {'a': FETCH_STATE_CONTENT_MATCH_FAILED, 'b': topic_fetch_url['topic_fetch_urls_id']})
Example 44
def _log_tweet_missing(db: DatabaseHandler, topic_fetch_url: dict) -> None:
    """Update topic_fetch_url state to tweet missing."""
    db.query(
        "update topic_fetch_urls set state = %(a)s, fetch_date = now() where topic_fetch_urls_id = %(b)s",
        {'a': FETCH_STATE_TWEET_MISSING, 'b': topic_fetch_url['topic_fetch_urls_id']})
Example 45
def connect_to_db(
        label: typing.Optional[str] = None,
        do_not_check_schema_version: bool = False,
        is_template: bool = False) -> DatabaseHandler:
    """Connect to PostgreSQL.

    Arguments:
    label - db config section label for mediawords.yml
    do_not_check_schema_version - if false, throw an error if the versions in mediawords.yml and the db do not match
    is_template - if true, connect to a db called <db_name>_template instead of <db_name>

    """
    label = decode_str_from_bytes_if_needed(label)

    # If this is Catalyst::Test run, force the label to the test database
    if using_test_database():
        label = 'test'

    config = py_get_config()

    if 'database' not in config:
        raise McConnectToDBException("No database connections are configured")

    all_settings = config['database']
    if all_settings is None:
        raise McConnectToDBException("No database connections are configured")

    settings = None
    if label is not None:
        for configured_database in all_settings:
            if configured_database['label'] == label:
                settings = configured_database
                break
        if settings is None:
            raise McConnectToDBException("No database connection settings labeled '%s'." % label)
    else:
        if len(all_settings) == 0:
            raise McConnectToDBException("No default connection settings found.")

        settings = all_settings[0]

    if settings is None:
        raise McConnectToDBException("Settings are undefined.")

    if 'host' not in settings or 'db' not in settings:
        raise McConnectToDBException("Settings are incomplete ('db' and 'host' must both be set).")

    host = settings['host']
    port = int(settings['port'])
    username = settings['user']
    password = settings['pass']
    database = settings['db']

    if is_template:
        database = database + "_template"

    try:
        ret = DatabaseHandler(
            host=host,
            port=port,
            username=username,
            password=password,
            database=database,
            do_not_check_schema_version=do_not_check_schema_version
        )
    except Exception as ex:
        raise McConnectToDBException(
            "Unable to connect to database %(username)s@%(host)s:%(port)d/%(database)s: %(exception)s" % {
                'username': username,
                'host': host,
                'port': port,
                'database': database,
                'exception': str(ex)
            })

    if ret is None:
        raise McConnectToDBException("Error while connecting to the database.")

    if 'db_statement_timeout' in config['mediawords']:
        db_statement_timeout = config['mediawords']['db_statement_timeout']

        ret.query('SET statement_timeout TO %(db_statement_timeout)s' % {'db_statement_timeout': db_statement_timeout})

    return ret
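
connect_to_db() expects py_get_config() to yield a structure along these lines (a hypothetical sketch of the parsed mediawords.yml; field names match what the function reads, values are invented):

config = {
    'database': [
        {
            'label': 'test',
            'host': 'localhost',
            'port': 5432,
            'user': 'mediaclouduser',
            'pass': 'secret',
            'db': 'mediacloud',
        },
    ],
    'mediawords': {
        # optional; passed to PostgreSQL's statement_timeout (milliseconds)
        'db_statement_timeout': 600000,
    },
}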