# Note: these excerpts assume Python 2 (urlparse module, print statement) and a
# project helper URLUtility providing get_tld / validate_link / normalize.
def evaluate_recall(result_file, test_file):
    """Compute host-level recall: count how many hosts from the test file
    also appear among the hosts of the urls in the result file."""
    # Collect the set of hosts in the test file (one url per line).
    test_host = set()
    with open(test_file) as lines:
        for line in lines:
            url = line.strip()
            #url = url_normalize(url)
            host = URLUtility.get_tld(url)
            test_host.add(host)
    # Collect the set of hosts in the result file (url in the second column).
    found_host = set()
    with open(result_file) as lines:
        for line in lines:
            values = line.strip().split()
            url = values[1]
            #url = url_normalize(url)
            host = URLUtility.get_tld(url)
            found_host.add(host)
    # Count hosts that occur in both sets.
    found = 0
    for host in found_host:
        if host in test_host:
            found += 1
            print host, found
    print found, len(test_host)
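# A minimal sketch (not part of the original module) of the same host-level
# recall written as a set intersection; 'found_hosts' and 'test_hosts' are
# hypothetical sets of hosts as produced by URLUtility.get_tld above.
def host_recall(found_hosts, test_hosts):
    '''Return the number of test hosts found and the test-set size.'''
    overlap = found_hosts & test_hosts
    return len(overlap), len(test_hosts)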
def extract_external_links(self, url, html):
    ''' Extract external outlinks, i.e. links that point to other websites
    Returns:
        - list of unique urls
    '''
    try:
        soup = BeautifulSoup(html, 'lxml')
        links = set()
        tld = URLUtility.get_tld(url)
        for tag in soup.findAll('a', href=True):
            link = tag['href']
            values = urlparse.urlparse(link)
            # Skip relative links and links that stay on the same site
            # (netloc empty, equal to the site tld, or containing it).
            if (values.netloc == "") or (values.netloc == tld) or (tld in values.netloc):
                continue
            link = URLUtility.validate_link(link)
            if link:
                link = URLUtility.normalize(link)
                if link:
                    links.add(link)
        return list(links)
    except:
        traceback.print_exc()
        return []
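# Illustrative sketch of the external/in-site test used above, factored out as
# a standalone helper; the example values are hypothetical and 'urlparse' is
# the Python 2 module this file already relies on.
def _is_external(link, tld):
    '''True if link points to a different website than the given tld.'''
    netloc = urlparse.urlparse(link).netloc
    return netloc != "" and netloc != tld and tld not in netloc

# e.g. _is_external('http://other.com/page', 'example.com')        -> True
#      _is_external('http://blog.example.com/post', 'example.com') -> False
#      _is_external('/relative/path', 'example.com')               -> False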
def extract_insite_links(self, url, html):
    '''
    Returns:
        - list of in-site urls that are different from the input url
    '''
    try:
        soup = BeautifulSoup(html, 'html.parser')
        #soup = BeautifulSoup(html, 'lxml')  # Couldn't parse http://www.gunsinternational.com/
        links = set()
        tld = URLUtility.get_tld(url)
        for tag in soup.findAll('a', href=True):
            link = tag['href']
            try:
                # Resolve relative hrefs against the page url.
                link = urlparse.urljoin(url, link)
            except:
                traceback.print_exc()
                continue
            values = urlparse.urlparse(link)
            if tld in values.netloc:
                link = URLUtility.validate_link(link)
                if link:
                    link = URLUtility.normalize(link)
                    if link and link != url:
                        links.add(link)
        return list(links)
    except:
        print "Parsing with BeautifulSoup failed"
        return []
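# Note that urljoin resolves relative hrefs against the page url before the
# in-site test, e.g. (illustrative values):
#   urlparse.urljoin('http://example.com/a/b.html', '../c.html')
#   # -> 'http://example.com/c.html'
#   urlparse.urljoin('http://example.com/a/b.html', 'http://other.com/')
#   # -> 'http://other.com/' (absolute hrefs are kept as-is)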
def update_seeds(self, seed_urls):
    '''Add new seed urls to the current seed list, then fetch them.'''
    # Keep only urls whose host has not been seen before.
    new_seed_urls = []
    for url in seed_urls:
        host = URLUtility.get_tld(url)
        if host not in self.host:
            self.host.add(host)
            new_seed_urls.append(url)
    urls, text = self.fetcher.fetch_urls(new_seed_urls)
    self.similarity.update_seeds(urls, text)
    # K is half the number of fetched seed pages, floored at 10.
    self.K = max(len(self.similarity.seed_pages) / 2, 10)
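# Hypothetical usage (the seed host set, fetcher and similarity model are
# assumed collaborators of this class): urls whose host is already tracked
# are skipped, and K follows roughly half of the fetched seed pages.
#
#   crawler.update_seeds(['http://example.org/', 'http://example.net/'])
#   # crawler.K is now max(len(crawler.similarity.seed_pages) / 2, 10)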
def get_tld(self):
    return URLUtility.get_tld(self.url)