def skip_self_linked_domain(db: DatabaseHandler, topic_fetch_url: dict) -> bool:
    """Given a topic_fetch_url, return true if the url should be skipped because it is a self linked domain.

    Return skip_self_linked_domain_url() for the topic, source url and ref url of the given topic_fetch_url.

    Always return false if topic_fetch_url['topic_links_id'] is None or not in the dict.

    Raises an Exception if the referenced topic_links row does not exist.
    """
    # a fetch url with no associated topic link can never be a self link
    if topic_fetch_url.get('topic_links_id') is None:
        return False

    topic_link = db.query(
        """
            SELECT *
            FROM topic_links
            WHERE
                topics_id = %(topics_id)s AND
                topic_links_id = %(topic_links_id)s
        """,
        {
            'topics_id': topic_fetch_url['topics_id'],
            'topic_links_id': topic_fetch_url['topic_links_id'],
        }).hash()

    if not topic_link:
        raise Exception(f"Topic link ID {topic_fetch_url['topic_links_id']} was not found.")

    story = db.require_by_id('stories', topic_link['stories_id'])

    # BUGFIX: .get() with a default does not cover an explicit None value, and the database
    # returns redirect_url as None when the link did not redirect — fall back to the raw url
    # in that case too instead of passing None downstream.
    url = topic_link.get('redirect_url') or topic_link['url']

    return skip_self_linked_domain_url(db, topic_fetch_url['topics_id'], story['url'], url)
def increment_domain_links(db: DatabaseHandler, topic_link: dict) -> None:
    """Given a topic link, increment the self_links and all_links counts in the corresponding topic_domains row.

    Increment self_links if the domain of the story at topic_links.stories_id is the same as the domain of
    topic_links.url or topic_links.redirect_url.  Always increment all_links.
    """
    story = db.require_by_id('stories', topic_link['stories_id'])
    story_domain = mediawords.util.url.get_url_distinctive_domain(story['url'])

    url_domain = mediawords.util.url.get_url_distinctive_domain(topic_link['url'])

    # BUGFIX: a dict default does not cover an explicit None redirect_url, which is what the
    # database returns when the link did not redirect; fall back to the raw url in that case too.
    redirect_url = topic_link.get('redirect_url') or topic_link['url']
    redirect_url_domain = mediawords.util.url.get_url_distinctive_domain(redirect_url)

    # a link is a self link if the linking story shares a domain with either form of the linked url
    self_link = 1 if story_domain in (url_domain, redirect_url_domain) else 0

    db.query(
        """
        insert into topic_domains (topics_id, domain, self_links, all_links)
            values (%(topics_id)s, %(domain)s, %(self_link)s, 1)
            on conflict (topics_id, md5(domain)) do update set
                self_links = topic_domains.self_links + %(self_link)s,
                all_links = topic_domains.all_links + 1
        """,
        {'topics_id': topic_link['topics_id'], 'domain': redirect_url_domain, 'self_link': self_link})
def skip_self_linked_domain(db: DatabaseHandler, topic_fetch_url: dict) -> bool:
    """Given a topic_fetch_url, return true if the url should be skipped because it is a self linked domain.

    Return skip_self_linked_domain_url() for the topic, source url and ref url of the given topic_fetch_url.

    Always return false if topic_fetch_url['topic_links_id'] is None or not in the dict.
    """
    # a fetch url with no associated topic link can never be a self link
    if topic_fetch_url.get('topic_links_id') is None:
        return False

    topic_link = db.require_by_id('topic_links', topic_fetch_url['topic_links_id'])
    story = db.require_by_id('stories', topic_link['stories_id'])

    # BUGFIX: .get() with a default does not cover an explicit None value, and the database
    # returns redirect_url as None when the link did not redirect — fall back to the raw url
    # in that case too instead of passing None downstream.
    url = topic_link.get('redirect_url') or topic_link['url']

    return skip_self_linked_domain_url(db, topic_fetch_url['topics_id'], story['url'], url)
def skip_self_linked_domain(db: DatabaseHandler, topic_fetch_url: dict) -> bool:
    """Given a topic_fetch_url, return true if the url should be skipped because it is a self linked domain.

    Return true if the domain of the linked url is the same as the domain of the linking story and one of the
    following is true:

    * the topic_domains.self_links value for the domain is greater than MAX_SELF_LINKS; or
    * the linked url matches SKIP_SELF_LINK_RE.

    Always return false if topic_fetch_url['topic_links_id'] is None or not in the dict.
    """
    topic_links_id = topic_fetch_url.get('topic_links_id')
    if topic_links_id is None:
        return False

    topic_link = db.require_by_id('topic_links', topic_links_id)
    story = db.require_by_id('stories', topic_link['stories_id'])

    story_domain = mediawords.util.url.get_url_distinctive_domain(story['url'])
    url_domain = mediawords.util.url.get_url_distinctive_domain(topic_link['url'])

    # the redirect_url field may be missing or explicitly None; fall back to the raw url
    redirect_url = topic_link.get('redirect_url')
    if redirect_url is None:
        redirect_url = topic_link['url']
    redirect_url_domain = mediawords.util.url.get_url_distinctive_domain(redirect_url)

    # prefer the domain of the final (redirected) url when one could be derived
    link_domain = redirect_url_domain or url_domain

    # not a self link at all, so never skip
    if story_domain != url_domain and story_domain != redirect_url_domain:
        return False

    # skip self links whose url looks like boilerplate (matched case-insensitively)
    if any(re.search(SKIP_SELF_LINK_RE, candidate, flags=re.I) for candidate in (topic_link['url'], redirect_url)):
        return True

    topic_domain = db.query(
        "select * from topic_domains where topics_id = %(a)s and md5(domain) = md5(%(b)s)",
        {'a': topic_fetch_url['topics_id'], 'b': link_domain}).hash()

    # skip self links from a domain that has already exceeded the per-topic self link cap
    return bool(topic_domain and topic_domain['self_links'] > MAX_SELF_LINKS)
def fetch_twitter_urls(db: DatabaseHandler, topic_fetch_urls_ids: List[int]) -> None:
    """Fetch topic_fetch_urls from twitter api as statuses and users in chunks of up to 100."""
    if not topic_fetch_urls_ids:
        return

    topic_fetch_urls = db.query(
        "select * from topic_fetch_urls where topic_fetch_urls_id = any(%(a)s)",
        {'a': topic_fetch_urls_ids}).hashes()

    topic = db.require_by_id('topics', topic_fetch_urls[0]['topics_id'])

    user_urls, status_urls = _split_urls_into_users_and_statuses(topic_fetch_urls)

    # user urls and status urls hit different twitter api endpoints, so process each set separately
    _call_function_on_url_chunks(db, topic, user_urls, _try_fetch_users_chunk)
    _call_function_on_url_chunks(db, topic, status_urls, _try_fetch_tweets_chunk)
def fetch_twitter_urls(db: DatabaseHandler, topic_fetch_urls_ids: List[int]) -> None:
    """Fetch topic_fetch_urls from twitter api as statuses and users in chunks of up to 100.

    A no-op when topic_fetch_urls_ids is empty.
    """
    # CONSISTENCY: tightened the bare `List` annotation to `List[int]` to match the sibling
    # fetch_twitter_urls signatures in this file; the ids are primary keys.
    if len(topic_fetch_urls_ids) == 0:
        return

    topic_fetch_urls = db.query(
        "select * from topic_fetch_urls where topic_fetch_urls_id = any(%(a)s)",
        {'a': topic_fetch_urls_ids}).hashes()

    # all fetch urls in one call belong to the same topic, so derive it from the first row
    topic = db.require_by_id('topics', topic_fetch_urls[0]['topics_id'])

    (user_urls, status_urls) = _split_urls_into_users_and_statuses(topic_fetch_urls)

    # user urls and status urls hit different twitter api endpoints, so process each set separately
    _call_function_on_url_chunks(db, topic, user_urls, _try_fetch_users_chunk)
    _call_function_on_url_chunks(db, topic, status_urls, _try_fetch_tweets_chunk)
def increment_domain_links(db: DatabaseHandler, topic_link: dict) -> None:
    """Given a topic link, increment the self_links count if necessary in the corresponding topic_domains row.

    Increment self_links if the domain of the story at topic_links.stories_id is the same as the domain of
    topic_links.url or topic_links.redirect_url.
    """
    story = db.require_by_id('stories', topic_link['stories_id'])
    story_domain = mediawords.util.url.get_url_distinctive_domain(story['url'])

    url_domain = mediawords.util.url.get_url_distinctive_domain(topic_link['url'])

    # BUGFIX: a dict default does not cover an explicit None redirect_url, which is what the
    # database returns when the link did not redirect; fall back to the raw url in that case too.
    redirect_url = topic_link.get('redirect_url') or topic_link['url']
    redirect_url_domain = mediawords.util.url.get_url_distinctive_domain(redirect_url)

    # only self links are counted, so bail out for cross-domain links
    if story_domain not in (url_domain, redirect_url_domain):
        return

    topic_domain = db.query(
        """
        insert into topic_domains (topics_id, domain, self_links)
            values (%(topics_id)s, %(domain)s, 1)
            on conflict (topics_id, md5(domain)) do nothing
            returning *
        """,
        {
            'topics_id': topic_link['topics_id'],
            'domain': redirect_url_domain
        }).hash()

    # do this update separately instead of as an upsert because the upsert was occasionally deadlocking;
    # the insert above returns a row only when it actually inserted, so an empty result means the row
    # already existed and must be updated instead
    if not topic_domain:
        db.query(
            """
            update topic_domains set
                self_links = topic_domains.self_links + 1
                where
                    topics_id = %(topics_id)s and
                    domain = %(domain)s
            """,
            {
                'topics_id': topic_link['topics_id'],
                'domain': redirect_url_domain
            })
def increment_domain_links(db: DatabaseHandler, topic_link: dict) -> None:
    """Given a topic link, increment the self_links count if necessary in the corresponding topic_domains row.

    Increment self_links if the domain of the story at topic_links.stories_id is the same as the domain of
    topic_links.url or topic_links.redirect_url.
    """
    story = db.require_by_id('stories', topic_link['stories_id'])
    story_domain = get_url_distinctive_domain(story['url'])

    url_domain = get_url_distinctive_domain(topic_link['url'])

    # BUGFIX: a dict default does not cover an explicit None redirect_url, which is what the
    # database returns when the link did not redirect; fall back to the raw url in that case too.
    redirect_url = topic_link.get('redirect_url') or topic_link['url']
    redirect_url_domain = get_url_distinctive_domain(redirect_url)

    # only self links are counted, so bail out for cross-domain links
    if story_domain not in (url_domain, redirect_url_domain):
        return

    topic_domain = db.query(
        """
        INSERT INTO topic_domains (topics_id, domain, self_links)
        VALUES (%(topics_id)s, %(domain)s, 1)
        ON CONFLICT (topics_id, md5(domain)) DO NOTHING
        RETURNING *
        """,
        {
            'topics_id': topic_link['topics_id'],
            'domain': redirect_url_domain
        }).hash()

    # do this update separately instead of as an upsert because the upsert was occasionally deadlocking;
    # the insert above returns a row only when it actually inserted, so an empty result means the row
    # already existed and must be updated instead
    if not topic_domain:
        db.query(
            """
            UPDATE topic_domains SET
                self_links = topic_domains.self_links + 1
            WHERE
                topics_id = %(topics_id)s AND
                domain = %(domain)s
            """,
            {
                'topics_id': topic_link['topics_id'],
                'domain': redirect_url_domain
            })
def fetch_twitter_urls(db: DatabaseHandler, topics_id: int, topic_fetch_urls_ids: List[int]) -> None:
    """Fetch topic_fetch_urls from twitter api as statuses and users in chunks of up to 100."""
    if not topic_fetch_urls_ids:
        return

    query_params = {
        'topics_id': topics_id,
        'topic_fetch_urls_ids': topic_fetch_urls_ids,
    }
    topic_fetch_urls = db.query(
        """
            SELECT *
            FROM topic_fetch_urls
            WHERE
                topics_id = %(topics_id)s AND
                topic_fetch_urls_id = ANY(%(topic_fetch_urls_ids)s)
        """,
        query_params).hashes()

    topic = db.require_by_id('topics', topics_id)

    user_urls, status_urls = _split_urls_into_users_and_statuses(topic_fetch_urls)

    # user urls and status urls hit different twitter api endpoints, so process each set separately
    _call_function_on_url_chunks(db, topic, user_urls, _try_fetch_users_chunk)
    _call_function_on_url_chunks(db, topic, status_urls, _try_fetch_tweets_chunk)