def skip_self_linked_domain_url(db: DatabaseHandler, topics_id: int, source_url: str, ref_url: str) -> bool:
    """Return True if ref_url should be skipped because it is a self linked domain within the topic.

    Return True if the domain of ref_url is the same as the domain of source_url and one of the
    following is true:

    * the topic_domains.self_links value for the domain is greater than or equal to MAX_SELF_LINKS, or
    * ref_url matches SKIP_SELF_LINK_RE.
    """
    source_domain = get_url_distinctive_domain(source_url)
    ref_domain = get_url_distinctive_domain(ref_url)

    # links between different domains are never self links
    if source_domain != ref_domain:
        return False

    # ref urls matching the skip regex are always skipped, regardless of counts
    if re.search(SKIP_SELF_LINK_RE, ref_url, flags=re.I):
        return True

    # compare via md5() so the lookup can use the md5(domain) functional index on topic_domains
    topic_domain = db.query(
        "select * from topic_domains where topics_id = %(a)s and md5(domain) = md5(%(b)s)",
        {'a': topics_id, 'b': ref_domain}).hash()

    if topic_domain and topic_domain['self_links'] >= MAX_SELF_LINKS:
        return True

    return False
def _url_domain_matches_medium(medium: dict, urls: list) -> bool:
    """Return True if the domain of any of the story urls matches the domain of the medium url.

    :param medium: media row dict; only medium['url'] is read
    :param urls: list of story url strings
    """
    medium_domain = get_url_distinctive_domain(medium['url'])

    # any() short-circuits on the first match instead of building and counting a full list
    return any(get_url_distinctive_domain(u) == medium_domain for u in urls)
def print_long_running_job_states(db: DatabaseHandler, limit: int):
    """Group media by distinctive domain and rebuild the media_dups table from multi-media domains.

    Fetches up to `limit` non-duplicate media (joined with media_health), groups each medium and
    its duplicates by get_url_distinctive_domain(), then drops and recreates the media_dups table
    with one (domain, media_id) row per medium for every domain that holds more than one medium.

    :param db: database handle
    :param limit: maximum number of non-duplicate media rows to process
    """
    media = db.query(
        """
        select m.*, mh.*
            from media m
                join media_health mh using ( media_id )
            where dup_media_id is null
            order by m.media_id asc limit %(a)s
        """,
        {'a': limit}).hashes()

    media_groups = {}
    num_media = len(media)
    for i, medium in enumerate(media):
        domain = get_url_distinctive_domain(medium['url'])
        log.warning("%s [%d/%d]" % (domain, i, num_media))

        media_groups.setdefault(domain, []).append(medium)
        medium['medium_domain'] = domain
        medium['dup_domain_matches'] = True

        dup_media = db.query(
            "select m.*, mh.* from media m join media_health mh using ( media_id ) where dup_media_id = %(a)s",
            {'a': medium['media_id']}
        ).hashes()

        media_groups[domain].extend(dup_media)
        for dup_medium in dup_media:
            dup_domain = get_url_distinctive_domain(dup_medium['url'])
            # NOTE(review): these assignments overwrite the parent medium's fields on every
            # duplicate, so only the *last* duplicate's domain/match survives.  Preserved as-is
            # because nothing below reads the fields, but this looks like a latent bug — confirm
            # whether an aggregate (e.g. all-duplicates-match) was intended.
            medium['medium_domain'] = dup_domain
            medium['dup_domain_matches'] = domain == dup_domain

    db.query("DROP TABLE IF EXISTS media_dups")
    db.query(
        """
        CREATE TABLE media_dups (
            domain TEXT,
            media_id BIGINT
        )
        """)

    db.begin()
    num_domains = len(media_groups)
    for i, (domain, domain_media) in enumerate(media_groups.items()):
        log.warning("domain %s [%d/%d]" % (domain, i, num_domains))
        # only domains with more than one medium are recorded as duplicates
        if len(domain_media) > 1:
            for m in domain_media:
                db.query(
                    """
                    insert into media_dups (domain, media_id)
                        values (%(a)s, %(b)s)
                    """,
                    {'a': domain, 'b': m['media_id']})
    db.commit()
def test_skip_self_links(self):
    """Verify that extract_links_for_topic_story() keeps at most MAX_SELF_LINKS self links."""
    story_domain = get_url_distinctive_domain(self.test_story['url'])

    topic = create_test_topic(self.db, 'links')
    self.db.create('topic_stories', {
        'topics_id': topic['topics_id'],
        'stories_id': self.test_story['stories_id']})

    # generate twice as many self links as the extractor should accept
    num_links = MAX_SELF_LINKS * 2
    filler = "Sample sentence to make sure the links get extracted" * 10
    paragraphs = []
    for i in range(num_links):
        url = "http://%s/%d" % (story_domain, i)
        paragraphs.append("<p>%s <a href='%s'>link</a></p>\n\n" % (filler, url))
    content = ''.join(paragraphs)

    store_content(self.db, self.test_download, content)

    extract_links_for_topic_story(
        db=self.db,
        stories_id=self.test_story['stories_id'],
        topics_id=topic['topics_id'])

    topic_links = self.db.query(
        "select * from topic_links where topics_id = %(a)s",
        {'a': topic['topics_id']}).hashes()

    assert len(topic_links) == MAX_SELF_LINKS
def test_skip_self_linked_domain(self) -> None:
    """Exercise skip_self_linked_domain() over regex skips, under-limit and over-limit self links."""
    # a topic link with no topic_links_id is never skipped
    assert skip_self_linked_domain(self.db, {}) is False

    story_domain = get_url_distinctive_domain(self.story['url'])

    # search-type pages are always skipped via the regex, regardless of link counts
    regex_suffixes = ('search', 'author', 'tag')
    for suffix in regex_suffixes:
        url = 'http://%s/%s' % (story_domain, suffix)
        link = self.create_topic_link(self.topic, self.story, url, url)
        assert skip_self_linked_domain(self.db, link) is True

    self_domain_url = 'http://%s/foo/bar' % story_domain

    # self links below the MAX_SELF_LINKS threshold are kept
    # (the regex-skipped links above already counted toward the domain total)
    for i in range(MAX_SELF_LINKS - len(regex_suffixes) - 1):
        url = self_domain_url + str(i)
        link = self.create_topic_link(self.topic, self.story, url, url)
        assert skip_self_linked_domain(self.db, link) is False

    # once the domain is at the limit, every further self link is skipped
    for _ in range(10):
        link = self.create_topic_link(self.topic, self.story, self_domain_url, self_domain_url)
        assert skip_self_linked_domain(self.db, link) is True

    # links to a different domain are never skipped, no matter how many exist
    other_domain_url = 'http://other.domain/foo/bar'
    for _ in range(10):
        link = self.create_topic_link(self.topic, self.story, other_domain_url, other_domain_url)
        assert skip_self_linked_domain(self.db, link) is False
def test_get_url_distinctive_domain():
    """Spot-check get_url_distinctive_domain() against known URLs."""
    # FIXME - some resulting domains look funny, not sure if I can change them easily though
    assert mc_url.get_url_distinctive_domain('http://www.nytimes.com/') == 'nytimes.com'
    assert mc_url.get_url_distinctive_domain('http://cyber.law.harvard.edu/') == 'law.harvard'
    assert mc_url.get_url_distinctive_domain('http://www.gazeta.ru/') == 'gazeta.ru'
    # BUG FIX: this line was `assert f(...), 'www.whitehouse'` — the comma made the string an
    # assert *message*, so only truthiness was checked and the value never compared.  The
    # expected value 'www.whitehouse' follows this revision's convention (cf. 'law.harvard'
    # above) — confirm against the current implementation.
    assert mc_url.get_url_distinctive_domain('http://www.whitehouse.gov/') == 'www.whitehouse'
    assert mc_url.get_url_distinctive_domain('http://info.info/') == 'info.info'
    assert mc_url.get_url_distinctive_domain('http://blog.yesmeck.com/jquery-jsonview/') == 'yesmeck.com'
    assert mc_url.get_url_distinctive_domain('http://status.livejournal.org/') == 'livejournal.org'

    # ".(gov|org|com).XX" exception
    assert mc_url.get_url_distinctive_domain('http://www.stat.gov.lt/') == 'stat.gov.lt'

    # "wordpress.com|blogspot|..." exception
    assert mc_url.get_url_distinctive_domain('https://en.blog.wordpress.com/') == 'en.blog.wordpress.com'
def increment_domain_links(db: DatabaseHandler, topic_link: dict) -> None:
    """Given a topic link, increment the self_links count if necessary in the corresponding topic_domains row.

    Increment self_links if the domain of the story at topic_links.stories_id is the same as the
    domain of topic_links.url or topic_links.redirect_url.
    """
    story = db.require_by_id('stories', topic_link['stories_id'])
    story_domain = get_url_distinctive_domain(story['url'])

    url_domain = get_url_distinctive_domain(topic_link['url'])

    # fall back to the link url when redirect_url is absent *or* None — a plain dict.get()
    # default would pass an explicit None through to get_url_distinctive_domain()
    redirect_url = topic_link.get('redirect_url') or topic_link['url']
    redirect_url_domain = get_url_distinctive_domain(redirect_url)

    if story_domain not in (url_domain, redirect_url_domain):
        return

    # try to create the row with an initial count of 1; returns nothing if the row already exists
    topic_domain = db.query(
        """
        insert into topic_domains (topics_id, domain, self_links)
            values(%(topics_id)s, %(domain)s, 1)
            on conflict (topics_id, md5(domain))
                do nothing
            returning *
        """,
        {'topics_id': topic_link['topics_id'], 'domain': redirect_url_domain}).hash()

    # do this update separately instead of as an upsert because the upsert was occasionally deadlocking
    if not topic_domain:
        db.query(
            """
            update topic_domains set
                    self_links = topic_domains.self_links + 1
                where
                    topics_id = %(topics_id)s and
                    domain = %(domain)s
            """,
            {'topics_id': topic_link['topics_id'], 'domain': redirect_url_domain})
def increment_domain_links(db: DatabaseHandler, topic_link: dict) -> None:
    """Given a topic link, increment the self_links count if necessary in the corresponding topic_domains row.

    Increment self_links if the domain of the story at topic_links.stories_id is the same as the
    domain of topic_links.url or topic_links.redirect_url.
    """
    story = db.require_by_id('stories', topic_link['stories_id'])
    story_domain = get_url_distinctive_domain(story['url'])

    url_domain = get_url_distinctive_domain(topic_link['url'])

    # fall back to the link url when redirect_url is absent *or* None — a plain dict.get()
    # default would pass an explicit None through to get_url_distinctive_domain()
    redirect_url = topic_link.get('redirect_url') or topic_link['url']
    redirect_url_domain = get_url_distinctive_domain(redirect_url)

    if story_domain not in (url_domain, redirect_url_domain):
        return

    # try to create the row with an initial count of 1; returns nothing if the row already exists
    topic_domain = db.query(
        """
        INSERT INTO topic_domains (topics_id, domain, self_links)
        VALUES (%(topics_id)s, %(domain)s, 1)
        ON CONFLICT (topics_id, md5(domain)) DO NOTHING
        RETURNING *
        """,
        {'topics_id': topic_link['topics_id'], 'domain': redirect_url_domain}).hash()

    # do this update separately instead of as an upsert because the upsert was occasionally deadlocking
    if not topic_domain:
        db.query(
            """
            UPDATE topic_domains
            SET self_links = topic_domains.self_links + 1
            WHERE topics_id = %(topics_id)s
              AND domain = %(domain)s
            """,
            {'topics_id': topic_link['topics_id'], 'domain': redirect_url_domain})
def __url_with_http_auth(url: str) -> str:
    """If there are HTTP auth credentials for the requested site, add them to the URL."""
    url = decode_object_from_bytes_if_needed(url)

    auth_lookup = UserAgent.__get_domain_http_auth_lookup()
    domain = get_url_distinctive_domain(url=url).lower()

    # no credentials configured for this domain: return the URL untouched
    if domain not in auth_lookup:
        return url

    credentials = auth_lookup[domain]

    # https://stackoverflow.com/a/21629125/200603
    authed_uri = furl(url)
    authed_uri.username = credentials['user']
    authed_uri.password = credentials['password']

    return authed_uri.url
def __url_with_http_auth(url: str) -> str:
    """If there are HTTP auth credentials for the requested site, add them to the URL."""
    url = decode_object_from_bytes_if_needed(url)

    credentials_by_domain = UserAgent.__get_domain_http_auth_lookup()
    url_domain = get_url_distinctive_domain(url=url).lower()

    if url_domain in credentials_by_domain:
        site_auth = credentials_by_domain[url_domain]

        # https://stackoverflow.com/a/21629125/200603
        rewritten = furl(url)
        rewritten.username = site_auth['user']
        rewritten.password = site_auth['password']
        url = rewritten.url

    return url
def test_increment_domain_links(self) -> None:
    """Test increment_domain_links()."""
    # docstring previously read "incremeber_domain_links()" — typo fixed
    nomatch_domain = 'no.match'
    story_domain = get_url_distinctive_domain(self.story['url'])

    # links whose url matches the story domain bump self_links for the redirect domain's row
    num_url_matches = 3
    for i in range(num_url_matches):
        self.create_topic_link(self.topic, self.story, story_domain, nomatch_domain)
        td = self.get_topic_domain(self.topic, nomatch_domain)
        assert td is not None
        assert td['self_links'] == i + 1

    # links whose redirect_url matches the story domain bump that domain's own row
    num_redirect_matches = 3
    for i in range(num_redirect_matches):
        self.create_topic_link(self.topic, self.story, nomatch_domain, story_domain)
        td = self.get_topic_domain(self.topic, story_domain)
        assert td is not None
        assert td['self_links'] == i + 1
def test_get_url_distinctive_domain():
    """Spot-check get_url_distinctive_domain() against known URLs."""
    # FIXME - some resulting domains look funny, not sure if I can change them easily though
    assert mc_url.get_url_distinctive_domain(
        'http://www.nytimes.com/') == 'nytimes.com'
    assert mc_url.get_url_distinctive_domain(
        'http://cyber.law.harvard.edu/') == 'harvard.edu'
    assert mc_url.get_url_distinctive_domain(
        'http://www.gazeta.ru/') == 'gazeta.ru'
    # BUG FIX: this line ended `...'), 'whitehouse.gov'` — the comma made the string an assert
    # *message*, so only truthiness was checked and the value never compared
    assert mc_url.get_url_distinctive_domain(
        'http://www.whitehouse.gov/') == 'whitehouse.gov'
    assert mc_url.get_url_distinctive_domain(
        'http://info.info/') == 'info.info'
    assert mc_url.get_url_distinctive_domain(
        'http://blog.yesmeck.com/jquery-jsonview/') == 'yesmeck.com'
    assert mc_url.get_url_distinctive_domain(
        'http://status.livejournal.org/') == 'livejournal.org'
    assert mc_url.get_url_distinctive_domain(
        'http://www.republicoftogo.com/') == 'republicoftogo.com'
    assert mc_url.get_url_distinctive_domain('http://www.fbi.gov') == 'fbi.gov'
    assert mc_url.get_url_distinctive_domain(
        'http://shrb.dzwww.com/') == 'dzwww.com'
    assert mc_url.get_url_distinctive_domain(
        'http://www.thecwsandiego.com/') == 'thecwsandiego.com'

    # assert mc_url.get_url_distinctive_domain('https://www.gov.uk/') == 'gov.uk'
    assert mc_url.get_url_distinctive_domain(
        'https://www.dailymail.co.uk/home/index.html') == 'dailymail.co.uk'

    # ".(gov|org|com).XX" exception
    assert mc_url.get_url_distinctive_domain(
        'http://www.stat.gov.lt/') == 'stat.gov.lt'

    # "wordpress.com|blogspot|..." exception
    assert mc_url.get_url_distinctive_domain(
        'https://en.blog.wordpress.com/') == 'en.blog.wordpress.com'
def test_get_url_distinctive_domain():
    """Spot-check get_url_distinctive_domain() against known URLs."""
    # FIXME - some resulting domains look funny, not sure if I can change them easily though
    assert mc_url.get_url_distinctive_domain('http://www.nytimes.com/') == 'nytimes.com'
    assert mc_url.get_url_distinctive_domain('http://cyber.law.harvard.edu/') == 'harvard.edu'
    assert mc_url.get_url_distinctive_domain('http://www.gazeta.ru/') == 'gazeta.ru'
    # BUG FIX: this line ended `...'), 'whitehouse.gov'` — the comma made the string an assert
    # *message*, so only truthiness was checked and the value never compared
    assert mc_url.get_url_distinctive_domain('http://www.whitehouse.gov/') == 'whitehouse.gov'
    assert mc_url.get_url_distinctive_domain('http://info.info/') == 'info.info'
    assert mc_url.get_url_distinctive_domain('http://blog.yesmeck.com/jquery-jsonview/') == 'yesmeck.com'
    assert mc_url.get_url_distinctive_domain('http://status.livejournal.org/') == 'livejournal.org'
    assert mc_url.get_url_distinctive_domain('http://www.republicoftogo.com/') == 'republicoftogo.com'
    assert mc_url.get_url_distinctive_domain('http://www.fbi.gov') == 'fbi.gov'
    assert mc_url.get_url_distinctive_domain('http://shrb.dzwww.com/') == 'dzwww.com'
    assert mc_url.get_url_distinctive_domain('http://www.thecwsandiego.com/') == 'thecwsandiego.com'

    # assert mc_url.get_url_distinctive_domain('https://www.gov.uk/') == 'gov.uk'
    assert mc_url.get_url_distinctive_domain('https://www.dailymail.co.uk/home/index.html') == 'dailymail.co.uk'

    # ".(gov|org|com).XX" exception
    assert mc_url.get_url_distinctive_domain('http://www.stat.gov.lt/') == 'stat.gov.lt'

    # "wordpress.com|blogspot|..." exception
    assert mc_url.get_url_distinctive_domain('https://en.blog.wordpress.com/') == 'en.blog.wordpress.com'