Ejemplo n.º 1
0
def skip_self_linked_domain(db: DatabaseHandler,
                            topic_fetch_url: dict) -> bool:
    """Given a topic_fetch_url, return True if the url should be skipped because it is a self linked domain.

    Looks up the topic link referenced by the topic_fetch_url (restricted to the same topics_id), loads the
    story that contains the link, and returns skip_self_linked_domain_url() for the topic, the story
    (source) url and the linked (ref) url.  The ref url is the link's redirect_url when one is set,
    otherwise the link's url.

    Always return False if topic_fetch_url['topic_links_id'] is None or not in the dict.

    Raises:
        Exception: if no topic_links row matches the topics_id / topic_links_id pair.
    """
    if topic_fetch_url.get('topic_links_id') is None:
        return False

    topic_link = db.query(
        """
        SELECT *
        FROM topic_links
        WHERE
            topics_id = %(topics_id)s AND
            topic_links_id = %(topic_links_id)s
    """, {
            'topics_id': topic_fetch_url['topics_id'],
            'topic_links_id': topic_fetch_url['topic_links_id'],
        }).hash()
    if not topic_link:
        raise Exception(
            f"Topic link ID {topic_fetch_url['topic_links_id']} was not found."
        )

    story = db.require_by_id('stories', topic_link['stories_id'])

    # dict.get() only uses its default when the key is absent; the key may be present with a None value
    # (e.g. a NULL redirect_url column), so fall back to the plain url explicitly.
    url = topic_link.get('redirect_url')
    if url is None:
        url = topic_link['url']

    return skip_self_linked_domain_url(db, topic_fetch_url['topics_id'],
                                       story['url'], url)
Ejemplo n.º 2
0
def increment_domain_links(db: DatabaseHandler, topic_link: dict) -> None:
    """Given a topic link, increment the self_links and all_links counts in the corresponding topic_domains row.

    Increment self_links if the domain of the story at topic_links.stories_id is the same as the domain of
    topic_links.url or topic_links.redirect_url.  Always increment all_links.

    The topic_domains row is identified by the topic link's topics_id and the domain of the redirect url
    (or of the plain url when the link has no redirect url); it is created if it does not exist yet.
    """
    story = db.require_by_id('stories', topic_link['stories_id'])
    story_domain = mediawords.util.url.get_url_distinctive_domain(story['url'])

    url_domain = mediawords.util.url.get_url_distinctive_domain(topic_link['url'])

    # dict.get() only uses its default when the key is absent; the key may be present with a None value
    # (e.g. a NULL redirect_url column), so fall back to the plain url explicitly.
    redirect_url = topic_link.get('redirect_url')
    if redirect_url is None:
        redirect_url = topic_link['url']
    redirect_url_domain = mediawords.util.url.get_url_distinctive_domain(redirect_url)

    self_link = 1 if story_domain in (url_domain, redirect_url_domain) else 0

    # single upsert: create the row with the initial counts, or add to the existing counts on conflict
    db.query(
        """
        insert into topic_domains (topics_id, domain, self_links, all_links)
            values(%(topics_id)s, %(domain)s, %(self_link)s, 1)
            on conflict (topics_id, md5(domain))
                do update set
                    self_links = topic_domains.self_links + %(self_link)s,
                    all_links = topic_domains.all_links + 1
        """,
        {'topics_id': topic_link['topics_id'], 'domain': redirect_url_domain, 'self_link': self_link})
Ejemplo n.º 3
0
def skip_self_linked_domain(db: DatabaseHandler, topic_fetch_url: dict) -> bool:
    """Given a topic_fetch_url, return True if the url should be skipped because it is a self linked domain.

    Loads the topic link referenced by the topic_fetch_url and the story that contains the link, then
    returns skip_self_linked_domain_url() for the topic, the story (source) url and the linked (ref) url.
    The ref url is the link's redirect_url when one is set, otherwise the link's url.

    Always return False if topic_fetch_url['topic_links_id'] is None or not in the dict.
    """
    if topic_fetch_url.get('topic_links_id') is None:
        return False

    topic_link = db.require_by_id('topic_links', topic_fetch_url['topic_links_id'])

    story = db.require_by_id('stories', topic_link['stories_id'])

    # dict.get() only uses its default when the key is absent; the key may be present with a None value
    # (e.g. a NULL redirect_url column), so fall back to the plain url explicitly.
    url = topic_link.get('redirect_url')
    if url is None:
        url = topic_link['url']

    return skip_self_linked_domain_url(db, topic_fetch_url['topics_id'], story['url'], url)
Ejemplo n.º 4
0
def skip_self_linked_domain(db: DatabaseHandler, topic_fetch_url: dict) -> bool:
    """Decide whether a topic_fetch_url should be skipped because it is a self linked domain.

    True is returned only when the domain of the linking story equals the domain of the link's url or of
    its redirect url, and in addition one of the following holds:
    * the link's url or redirect url matches SKIP_SELF_LINK_RE (case-insensitive), or
    * the topic_domains.self_links value for the link's domain is greater than MAX_SELF_LINKS.

    False is always returned when topic_fetch_url['topic_links_id'] is None or not in the dict.
    """
    if topic_fetch_url.get('topic_links_id') is None:
        return False

    link = db.require_by_id('topic_links', topic_fetch_url['topic_links_id'])
    linking_story = db.require_by_id('stories', link['stories_id'])

    domain_of = mediawords.util.url.get_url_distinctive_domain

    story_domain = domain_of(linking_story['url'])
    url_domain = domain_of(link['url'])

    # use the plain url whenever no redirect url is recorded (key missing or value None)
    redirect_url = link.get('redirect_url')
    if redirect_url is None:
        redirect_url = link['url']
    redirect_url_domain = domain_of(redirect_url)

    # links pointing to a different domain than the linking story are never self links
    if story_domain not in (url_domain, redirect_url_domain):
        return False

    candidates = (link['url'], redirect_url)
    if any(re.search(SKIP_SELF_LINK_RE, candidate, flags=re.I) for candidate in candidates):
        return True

    # the self link counter is looked up under the redirect domain, or the url domain if that is empty
    link_domain = redirect_url_domain or url_domain
    topic_domain = db.query(
        "select * from topic_domains where topics_id = %(a)s and md5(domain) = md5(%(b)s)",
        {'a': topic_fetch_url['topics_id'], 'b': link_domain}).hash()

    return bool(topic_domain and topic_domain['self_links'] > MAX_SELF_LINKS)
Ejemplo n.º 5
0
def fetch_twitter_urls(db: DatabaseHandler, topic_fetch_urls_ids: List[int]) -> None:
    """Fetch topic_fetch_urls from twitter api as statuses and users in chunks of up to 100.

    Loads the topic_fetch_urls rows for the given ids, splits them into user urls and status urls, and
    hands each group to _call_function_on_url_chunks() with the matching fetch function.  The topic is
    taken from the first fetched row.

    Does nothing if topic_fetch_urls_ids is empty or if none of the ids match a topic_fetch_urls row.
    """
    if not topic_fetch_urls_ids:
        return

    topic_fetch_urls = db.query(
        "select * from topic_fetch_urls where topic_fetch_urls_id = any(%(a)s)",
        {'a': topic_fetch_urls_ids}).hashes()

    # no matching rows means there is nothing to fetch; without this guard the [0] lookup below
    # would fail with a bare IndexError
    if not topic_fetch_urls:
        return

    topic = db.require_by_id('topics', topic_fetch_urls[0]['topics_id'])

    (user_urls, status_urls) = _split_urls_into_users_and_statuses(topic_fetch_urls)

    _call_function_on_url_chunks(db, topic, user_urls, _try_fetch_users_chunk)
    _call_function_on_url_chunks(db, topic, status_urls, _try_fetch_tweets_chunk)
Ejemplo n.º 6
0
def fetch_twitter_urls(db: DatabaseHandler, topic_fetch_urls_ids: List) -> None:
    """Fetch topic_fetch_urls from twitter api as statuses and users in chunks of up to 100.

    Loads the topic_fetch_urls rows for the given ids, splits them into user urls and status urls, and
    hands each group to _call_function_on_url_chunks() with the matching fetch function.  The topic is
    taken from the first fetched row.

    Does nothing if topic_fetch_urls_ids is empty or if none of the ids match a topic_fetch_urls row.
    """
    if not topic_fetch_urls_ids:
        return

    topic_fetch_urls = db.query(
        "select * from topic_fetch_urls where topic_fetch_urls_id = any(%(a)s)",
        {'a': topic_fetch_urls_ids}).hashes()

    # no matching rows means there is nothing to fetch; without this guard the [0] lookup below
    # would fail with a bare IndexError
    if not topic_fetch_urls:
        return

    topic = db.require_by_id('topics', topic_fetch_urls[0]['topics_id'])

    (user_urls, status_urls) = _split_urls_into_users_and_statuses(topic_fetch_urls)

    _call_function_on_url_chunks(db, topic, user_urls, _try_fetch_users_chunk)
    _call_function_on_url_chunks(db, topic, status_urls, _try_fetch_tweets_chunk)
Ejemplo n.º 7
0
def increment_domain_links(db: DatabaseHandler, topic_link: dict) -> None:
    """Given a topic link, increment the self_links count if necessary in the corresponding topic_domains row.

    Increment self_links if the domain of the story at topic_links.stories_id is the same as the domain of
    topic_links.url or topic_links.redirect_url.  Do nothing if the link is not a self link.

    The topic_domains row is identified by the topic link's topics_id and the domain of the redirect url
    (or of the plain url when the link has no redirect url).
    """
    story = db.require_by_id('stories', topic_link['stories_id'])
    story_domain = mediawords.util.url.get_url_distinctive_domain(story['url'])

    url_domain = mediawords.util.url.get_url_distinctive_domain(topic_link['url'])

    # dict.get() only uses its default when the key is absent; the key may be present with a None value
    # (e.g. a NULL redirect_url column), so fall back to the plain url explicitly.
    redirect_url = topic_link.get('redirect_url')
    if redirect_url is None:
        redirect_url = topic_link['url']
    redirect_url_domain = mediawords.util.url.get_url_distinctive_domain(redirect_url)

    if story_domain not in (url_domain, redirect_url_domain):
        return

    # try to create the row with self_links = 1; on conflict nothing is inserted and no row is returned
    topic_domain = db.query(
        """
        insert into topic_domains (topics_id, domain, self_links)
            values(%(topics_id)s, %(domain)s, 1)
            on conflict (topics_id, md5(domain))
                do nothing
            returning *
        """,
        {
            'topics_id': topic_link['topics_id'],
            'domain': redirect_url_domain
        }
    ).hash()

    # do this update separately instead of as an upsert because the upsert was occasionally deadlocking
    if not topic_domain:
        # the row already existed, so bump its counter instead
        db.query(
            """
            update topic_domains set
                    self_links = topic_domains.self_links + 1
                where
                    topics_id = %(topics_id)s and
                    domain = %(domain)s
            """,
            {
                'topics_id': topic_link['topics_id'],
                'domain': redirect_url_domain
            }
        )
Ejemplo n.º 8
0
def increment_domain_links(db: DatabaseHandler, topic_link: dict) -> None:
    """Given a topic link, increment the self_links count if necessary in the corresponding topic_domains row.

    Increment self_links if the domain of the story at topic_links.stories_id is the same as the domain of
    topic_links.url or topic_links.redirect_url.  Do nothing if the link is not a self link.

    The topic_domains row is identified by the topic link's topics_id and the domain of the redirect url
    (or of the plain url when the link has no redirect url).
    """
    story = db.require_by_id('stories', topic_link['stories_id'])
    story_domain = get_url_distinctive_domain(story['url'])

    url_domain = get_url_distinctive_domain(topic_link['url'])

    # dict.get() only uses its default when the key is absent; the key may be present with a None value
    # (e.g. a NULL redirect_url column), so fall back to the plain url explicitly.
    redirect_url = topic_link.get('redirect_url')
    if redirect_url is None:
        redirect_url = topic_link['url']
    redirect_url_domain = get_url_distinctive_domain(redirect_url)

    if story_domain not in (url_domain, redirect_url_domain):
        return

    # try to create the row with self_links = 1; on conflict nothing is inserted and no row is returned
    topic_domain = db.query(
        """
            INSERT INTO topic_domains (topics_id, domain, self_links)
            VALUES (%(topics_id)s, %(domain)s, 1)
            ON CONFLICT (topics_id, md5(domain)) DO NOTHING
            RETURNING *
        """, {
            'topics_id': topic_link['topics_id'],
            'domain': redirect_url_domain
        }).hash()

    # do this update separately instead of as an upsert because the upsert was occasionally deadlocking
    if not topic_domain:
        # the row already existed, so bump its counter instead
        db.query(
            """
            UPDATE topic_domains SET
                self_links = topic_domains.self_links + 1
            WHERE
                topics_id = %(topics_id)s AND
                domain = %(domain)s
            """, {
                'topics_id': topic_link['topics_id'],
                'domain': redirect_url_domain
            })
Ejemplo n.º 9
0
def fetch_twitter_urls(db: DatabaseHandler, topics_id: int, topic_fetch_urls_ids: List[int]) -> None:
    """Fetch the given topic_fetch_urls of a topic from the twitter api, as users and statuses.

    The rows matching both topics_id and the given ids are loaded, split into user urls and status urls,
    and each group is passed to _call_function_on_url_chunks() together with its fetch function
    (chunks of up to 100, per the original description).  Returns immediately when no ids are given.
    """
    if not len(topic_fetch_urls_ids):
        return

    query_params = {
        'topics_id': topics_id,
        'topic_fetch_urls_ids': topic_fetch_urls_ids,
    }
    rows = db.query("""
        SELECT *
        FROM topic_fetch_urls
        WHERE
            topics_id = %(topics_id)s AND
            topic_fetch_urls_id = ANY(%(topic_fetch_urls_ids)s)
    """, query_params).hashes()

    topic = db.require_by_id('topics', topics_id)

    split_urls = _split_urls_into_users_and_statuses(rows)
    user_urls, status_urls = split_urls

    # users first, then statuses
    _call_function_on_url_chunks(db, topic, user_urls, _try_fetch_users_chunk)
    _call_function_on_url_chunks(db, topic, status_urls, _try_fetch_tweets_chunk)