def _format_hyperlinks(self, links, exclude_nofollow=False):
    """ Formats a list of hyperlinks for return """

    if self.analysis.get("base_url"):
        base_url = URL(self.analysis["base_url"])
    else:
        base_url = self.source_url

    hyperlinks = []
    for href, txt, rel in links:
        url = URL(base_url.urljoin(href), check_encoding=True)

        if (
                # Probably a forgotten mailto:
                b"@" not in url.parsed.path and

                # Probably an html error
                not href.startswith(b"<") and

                not (exclude_nofollow and rel == "nofollow") and

                # This regex catches several things we don't want to follow:
                # invalid hosts, TLDs, usernames, ..
                _RE_VALID_NETLOC.match(url.parsed.netloc)
        ):
            hyperlinks.append({
                "href": url,
                "text": txt.strip()
            })

    return hyperlinks
def test_simple_insert_and_search(indexer, searcher):
    from cosrlib.url import URL

    indexed = indexer.client.index_document(
        """<html><title>hello world</title><body>hello body</body></html>""",
        url=URL("http://example.com")
    )

    indexed2 = indexer.client.index_document(
        """<html><title>another world</title><body>document body</body></html>""",
        url=URL("http://example2.com/page2")
    )

    indexer.client.flush()
    indexer.client.refresh()

    assert indexed["docid"]
    assert indexed["url"].url == "http://example.com"
    assert indexed["rank"] > 0
    assert indexed2["docid"] != indexed["docid"]

    search_results = searcher.client.search("hello")
    assert len(search_results["hits"]) == 1
    assert search_results["hits"][0]["docid"] == indexed["docid"]
    assert search_results["hits"][0]["score"] > 0

    search_results = searcher.client.search("world")
    assert len(search_results["hits"]) == 2

    search_results = searcher.client.search("world", domain="example2.com")
    assert len(search_results["hits"]) == 1
    assert search_results["hits"][0]["docid"] == indexed2["docid"]
def parse_canonical_url(self):
    """ Look for a valid link rel="canonical" tag in the HEAD of the page.

        Returns an absolute URL in all cases """

    # TODO: should we use og:url as canonical url? http://ogp.me/

    meta_rel_canonical = [
        x for x in self.analysis.get("head_links", [])
        if (x.get("rel") or "").lower() == "canonical"
    ]

    if len(meta_rel_canonical) > 0:
        canonical_url = meta_rel_canonical[0].get("href") or None

        if not canonical_url:
            return None

        canonical_url = URL(canonical_url, check_encoding=True)

        # If the canonical URL is relative, make it absolute based on the source_url
        if not canonical_url.domain or not canonical_url.parsed.scheme:
            canonical_url = URL(urlparse.urljoin(self.source_url.url or "", canonical_url.url))

        # For now we force canonical URLs to be on the same domain. May relax carefully in the future
        elif self.source_url and canonical_url.domain != self.source_url.domain:
            return None

        return canonical_url
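# Editor's sketch (not part of the source): illustrates how a relative
# rel="canonical" href is made absolute by parse_canonical_url above, using only
# the stdlib urlparse.urljoin call the function relies on. The URLs below are
# hypothetical examples.
def _example_canonical_resolution():
    import urlparse  # Python 2 stdlib, as used elsewhere in this codebase

    source_url = "http://example.com/articles/post.html?utm=1"
    relative_canonical = "/articles/post"

    # urljoin keeps the scheme/host of the source URL and replaces the path
    assert urlparse.urljoin(source_url, relative_canonical) == "http://example.com/articles/post"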
def _format_hyperlinks(self, links):
    """ Formats a list of hyperlinks for return """

    if self.analysis.get("base_url"):
        base_url = URL(self.analysis["base_url"])
    else:
        base_url = self.source_url

    hyperlinks = []
    for href, words in links:
        url = URL(base_url.urljoin(href), check_encoding=True)

        if (
                # Probably a forgotten mailto:
                "@" not in url.parsed.path and

                # Probably an html error
                not href.startswith("<") and

                # This regex catches several things we don't want to follow:
                # invalid hosts, TLDs, usernames, ..
                _RE_VALID_NETLOC.match(url.parsed.netloc)
        ):
            hyperlinks.append({
                "href": url,
                "words": self._split_words(words)
            })

    return hyperlinks
def __init__(self, source_data, url=None, headers=None, index_level=2):
    self.source_data = source_data
    self.source_headers = headers or {}
    self.index_level = index_level

    if not url:
        self.source_url = URL("")
    elif is_basestring(url):
        self.source_url = URL(url)
    else:
        self.source_url = url

    self._word_groups = []
def __init__(self, source_data, url=None, headers=None):
    self.source_data = source_data
    self.source_headers = headers or {}

    if not url:
        self.source_url = URL("")
    elif isinstance(url, basestring):
        self.source_url = URL(url)
    else:
        self.source_url = url

    self._title = None
    self._hyperlinks = []
    self._word_groups = []
def iter_items(self, partition):
    """ Yields objects in the source's native format """

    warc_stream = self.open_warc_stream(partition["path"])

    for record in warc_stream:

        if not record.url:
            continue

        if record['Content-Type'] != 'application/http; msgtype=response':
            continue

        url = URL(record.url, check_encoding=True)

        do_parse, index_level = self.qualify_url(url)
        if not do_parse:
            continue

        payload = record.payload.read()
        parser = HttpParser()
        parser.execute(payload, len(payload))

        headers = parser.get_headers()

        if 'text/html' not in headers.get("content-type", ""):
            # print "Not HTML?", record.url, headers
            continue

        yield url, headers, "html", index_level, parser.recv_body()
def get_domain_ids(self, urls):
    """ Return a list of domain IDs for these URLs """
    ret = []
    for u in urls:
        url = URL(u)
        ret.append(datasources["ids"].domain_id(url))
    return ret
def get_domain_ids(self, urls):
    """ Return a list of domain IDs for these URLs """
    ret = []
    for u in urls:
        url = URL(u)
        ret.append(make_domain_id(url))
    return ret
def import_row(self, i, row):
    """ Returns a (key, value) pair for this row from the dump file """

    event, elem = row
    if event != "end":
        return

    if elem.tag == "{http://dmoz.org/rdf/}ExternalPage":
        url = URL(elem.attrib["about"].encode("utf-8")).normalized

        title = elem.find("{http://purl.org/dc/elements/1.0/}Title")
        description = elem.find("{http://purl.org/dc/elements/1.0/}Description")

        if url:
            yield url, {
                "dmoz_title": (title.text or "") if (title is not None) else "",
                "dmoz_description": (description.text or "") if (description is not None) else ""
            }

        self.clear_xml_elements(title, description, elem)

    elif elem.tag not in ("{http://purl.org/dc/elements/1.0/}Title",
                          "{http://purl.org/dc/elements/1.0/}Description"):
        self.clear_xml_elements(elem)
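# Editor's sketch (assumption, not from the source): import_row above consumes
# (event, elem) pairs shaped like the output of ElementTree.iterparse over the
# DMOZ RDF dump. A minimal driver illustrating that shape; the file name and
# helper name are hypothetical.
def _example_iter_dump(path="content.rdf.u8"):
    import xml.etree.ElementTree as ET

    for i, row in enumerate(ET.iterparse(path, events=("start", "end"))):
        # row is an (event, elem) tuple; import_row() only acts on "end" events
        # for {http://dmoz.org/rdf/}ExternalPage elements.
        yield i, row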
def iter_rows(self):

    if config["TESTDATA"] == "1":
        extract_dir = self.dump_testdata
        clean = False
    else:
        extract_dir = tempfile.mkdtemp(suffix="cosr-ut1-import")
        clean = True

        os.system("curl %s > %s/blacklists.tar.gz" % (self.dump_url, extract_dir))
        os.system("cd %s && tar zxf blacklists.tar.gz" % extract_dir)
        extract_dir += "/blacklists"

    data = defaultdict(list)

    for fp in os.listdir(extract_dir):
        fullpath = os.path.join(extract_dir, fp)
        if os.path.isdir(fullpath) and not os.path.islink(fullpath):
            cnt = 0

            with open(fullpath + "/domains", 'r') as f:
                for line in f.readlines():
                    url = URL(line.strip()).normalized
                    if url:
                        data[url].append(fp)
                        cnt += 1

            if os.path.isfile(fullpath + "/urls"):
                with open(fullpath + "/urls", 'r') as f:
                    for line in f.readlines():
                        url = URL(line.strip()).normalized
                        if url:
                            data[url].append(fp)
                            cnt += 1

            print("Done %s (%s entries)" % (fp, cnt))

    if clean:
        shutil.rmtree(os.path.dirname(extract_dir))

    for key, value in data.iteritems():
        yield key, {"ut1_blacklist": value}
def import_dump(self):
    """ Read a dump from a URL or a local file, and merge its data into RocksDB """

    db = Storage(read_only=False)
    write_batch = db.write_batch(None)

    start_time = time.time()
    done = 0

    for url, values in self.iter_rows():

        # TODO: RocksDB merge operator?
        existing_value = db.get(url)
        existing_pb = urlserver_pb2.UrlMetadata()

        if existing_value is not None:
            existing_pb.ParseFromString(existing_value)
        else:
            # In order to send the protobuf message untouched via RPC, we pre-compute the ID
            existing_pb.id = make_url_id(URL(url))

        for k, v in values.iteritems():
            if k in ("ut1_blacklist", ):
                for elt in v:
                    existing_pb.ut1_blacklist.append(elt)  # pylint: disable=no-member
            else:
                setattr(existing_pb, k, v)

        # print "IMPORT", key, existing_pb

        write_batch.put(url, existing_pb.SerializeToString())

        done += 1
        if self.dump_batch_size and (done % self.dump_batch_size) == 0:

            eta = 0
            if self.dump_count_estimate:
                eta = float(self.dump_count_estimate - done) / (
                    3600.0 * done / (time.time() - start_time)
                )

            print("Done %s (%s/s, ~%0.2f%%, ETA %0.2fh)" % (
                done,
                int(done / (time.time() - start_time)),
                (float(done * 100) / self.dump_count_estimate) if self.dump_count_estimate else 0,
                eta
            ))

            write_batch = db.write_batch(write_batch)

    print("Total rows: %s" % done)

    db.write_batch(write_batch)
    db.close()
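# Editor's sketch (assumptions flagged inline): reads back one record written by
# import_dump above, assuming the same Storage and urlserver_pb2 helpers are
# imported as in that function. The read_only=True flag and the "example.com"
# key are hypothetical.
def _example_read_back():
    db = Storage(read_only=True)  # assumption: Storage also accepts read_only=True
    raw = db.get("example.com")   # hypothetical normalized-URL key
    if raw is not None:
        pb = urlserver_pb2.UrlMetadata()
        pb.ParseFromString(raw)
        print("%s %s" % (pb.id, list(pb.ut1_blacklist)))
    db.close()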
def add_hyperlink(self, href, words):
    """ Validate, then add a hyperlink parsed from the document """

    # Resolve relative links. Some might have a weird syntax so we need to
    # catch exceptions.
    try:
        url = URL((self.base_url or self.source_url).urljoin(href), check_encoding=True)
    except ValueError:
        return

    if url.parsed.scheme in ("http", "https"):
        self._hyperlinks.append({"href": url, "words": words})
def iter_domain(record):
    """ Transforms Row(domain=www.example.com) into tuple([int64 ID], "example.com") """

    domain = record["domain"]

    if not domain or not domain.strip():
        return []

    name = URL("http://" + domain).normalized_domain

    try:
        _id = _fast_make_domain_id(name)
    except Exception:  # pylint: disable=broad-except
        return []

    return [(py2_long(_id), str(name))]
def import_dump(self):
    """ Read a dump from a URL or a local file, and merge its data into RocksDB """

    db = Storage(read_only=False)
    write_batch = db.write_batch(None)

    batch_time = time.time()
    done = 0

    for i, row in self.iter_dump():

        for key, values in self.import_row(i, row):

            url = key.encode("utf-8")

            # TODO: RocksDB merge operator?
            existing_value = db.get(url)
            existing_pb = urlserver_pb2.UrlMetadata()

            if existing_value is not None:
                existing_pb.ParseFromString(existing_value)
            else:
                # In order to send the protobuf message untouched via RPC, we pre-compute the ID
                existing_pb.id = make_url_id(URL(url))

            for k, v in values.iteritems():
                if k in ("ut1_blacklist", ):
                    for elt in v:
                        existing_pb.ut1_blacklist.append(elt)  # pylint: disable=no-member
                else:
                    setattr(existing_pb, k, v)

            # print "IMPORT", key, existing_pb

            write_batch.put(url, existing_pb.SerializeToString())

            done += 1
            if self.dump_batch_size and (done % self.dump_batch_size) == 0:
                print("Done %s (%s/s)" % (done, int(done / (time.time() - batch_time))))
                write_batch = db.write_batch(write_batch)
                batch_time = time.time()

    print("Total rows: %s" % done)

    db.write_batch(write_batch)
    db.close()
def get_metadata(self, urls):
    """ Return a list of tuples of metadata for these *normalized* URLs """
    ret = []
    for url in urls:
        data = db.get(url)

        # If the URL has been in none of our static databases, we still want to return an ID
        if data is None:
            obj = urlserver_pb2.UrlMetadata()
            obj.id = make_url_id(URL(url))
            data = obj.SerializeToString()

        ret.append(data)
    return ret
def get_metadata(self, urls):
    """ Return a list of tuples of metadata for these URLs """
    ret = []
    for u in urls:
        url = URL(u)
        ret.append((
            datasources["ids"].url_id(url),
            datasources["ids"].domain_id(url),
            datasources["alexa_top1m"].rank(url),
            datasources["dmoz_url"].exists(url),
            datasources["dmoz_domain"].exists(url),
            datasources["ut1_blacklist"].classes(url),
            datasources["webdatacommons_hc"].rank(url)
        ))
    return ret
def iter_items(self, partition):
    """ Partition can be either a single raw document, or a filepath to a JSON file """

    if partition.get("path"):
        with open(partition["path"], "r") as f:
            docs = json.load(f)
    else:
        docs = [partition["doc"]]

    for doc in docs:
        url = URL(doc["url"].encode("utf-8"))

        do_parse, index_level = self.qualify_url(url)

        if do_parse:
            yield (url, {"Content-Type": "text/html"}, "html", index_level, doc["content"].encode("utf-8"))
def import_row(self, i, row):
    """ Maps a raw data row into a list of (key, values) pairs """

    # https://www.wikidata.org/wiki/Property:P856
    official_website = None
    for claim in row["claims"].get("P856", []):
        if (claim["type"] == "statement" and
                claim["mainsnak"]["datatype"] == "url" and
                claim["mainsnak"].get("datavalue")):
            official_website = URL(claim["mainsnak"]["datavalue"]["value"]).normalized

    # TODO: other languages!
    label_en = row["labels"].get("en", {}).get("value") or ""
    description_en = row["descriptions"].get("en", {}).get("value") or ""

    if official_website:
        yield official_website, {
            "wikidata_title": label_en,
            "wikidata_description": description_en,
            "wikidata_sitelinks": len(row.get("sitelinks") or [])
        }
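# Editor's example (constructed, not a verbatim Wikidata record): a row shaped
# the way import_row above expects, using only the keys that the code reads.
# All values are hypothetical.
EXAMPLE_WIKIDATA_ROW = {
    "claims": {
        # https://www.wikidata.org/wiki/Property:P856 (official website)
        "P856": [{
            "type": "statement",
            "mainsnak": {
                "datatype": "url",
                "datavalue": {"value": "https://www.example.org"}
            }
        }]
    },
    "labels": {"en": {"value": "Example organization"}},
    "descriptions": {"en": {"value": "A hypothetical organization"}},
    "sitelinks": {}
}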
def endElement(self, name):

    if name == "ExternalPage":

        url = URL(self._current_page["url"])

        # TODO: Import "cool" flag, like IMDb has in http://www.dmoz.org/Arts/Movies/Databases/
        db_urls_wb.put(
            url.normalized.encode("utf-8"),
            json.dumps([
                self._current_page.get("d:Title"),
                self._current_page.get("d:Description")
                # , self._current_page.get("topic")
            ]).encode("utf-8"))

        self._current_page = None

        db_domains_wb.put(url.normalized_domain.encode("utf-8"), "1")  # TODO put total count?

        self.count += 1
        if self.count % 100000 == 0:
            print("Done %s" % self.count)

    self._current_key = None
def _start_tag(self, level, tag_name, attrs=None):
    """ We parsed a new element node in the document tree. """

    attrs = attrs or {}

    # TODO: different weights (h1-h6 tags for instance)
    weight = 1

    # Add this new element to the stack
    self.current_stack.append((tag_name, weight))

    # TODO: If we see role=presentation, should we transform the tag_name into a simple span?
    # https://www.w3.org/TR/wai-aria/roles#presentation

    if tag_name == "head":
        self.level_head = level
        return

    if tag_name == "article":
        self.level_article = level

    # If we are in the HEAD, only metadata should be interesting
    if self.level_head is not None:

        if tag_name == "link":
            self.head_links.append(attrs)

        elif tag_name == "meta":
            meta_name = (attrs.get("name") or attrs.get("property") or "").lower()
            if meta_name in defs.META_WHITELIST:
                self.head_metas[meta_name] = (attrs.get("content") or "").strip()

        elif tag_name == "base" and attrs.get("href") and self.base_url is None:
            self.base_url = URL(attrs["href"])

    # If we are in the BODY, inspect things a bit more
    else:

        if tag_name == "img":
            self._close_word_group()
            if attrs.get("alt"):
                self.add_word_group(attrs["alt"], tag="img")
            if attrs.get("src"):
                self.add_word_group(" ".join(self._split_filename_words(attrs["src"])), tag="img")

        # Does this element start a hidden subtree?
        if self.level_hidden is None and self._guess_element_hidden(attrs):
            self.level_hidden = level

        # Are we in a boilerplate subtree with a bypass tag? (like a title in a header)
        if self.level_boilerplate is not None and self.level_boilerplate_bypass is None:
            if tag_name in defs.TAGS_BOILERPLATE_BYPASS:
                self.level_boilerplate_bypass = level

        # Does this element start a boilerplate subtree?
        if self.level_boilerplate is None and self._guess_element_boilerplate(attrs):
            self.level_boilerplate = level

        # Does this element start a hyperlink subtree?
        # TODO how to deal with nested a?
        if tag_name == "a" and attrs.get("rel") != "nofollow" and attrs.get("href"):
            self.level_hyperlink = level
            self.link_words = []
            self.link_href = attrs.get("href")

        # Does this element break word groups?
        is_separator = tag_name in defs.TAGS_SEPARATORS

        # Do we want to start collecting words?
        # TODO: would we avoid this for elements with no children?
        if self.level_hidden is None and self.level_boilerplate is None and is_separator:
            toplevel_word_group = {"tag": tag_name, "weight": weight}
            self._start_word_group(toplevel_word_group)
def test_pickling():
    dumped = pickle.dumps(URL("http://sub.test.co.uk/azerza/azer.html?x=a#b"))
    url = pickle.loads(dumped)
    assert url.url == "http://sub.test.co.uk/azerza/azer.html?x=a#b"
def test_normalize(url, normalized_domain, normalized):
    _url = URL(url)
    assert _url.normalized_domain == normalized_domain
    assert _url.normalized == normalized
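# Editor's sketch (assumption, not from the source): test_normalize above takes
# its inputs as arguments and so looks parametrized; this shows how it could be
# driven with pytest.mark.parametrize, reusing values that test_url below
# asserts directly.
import pytest
from cosrlib.url import URL  # same import used in the indexer test above

@pytest.mark.parametrize("url,normalized_domain,normalized", [
    ("https://www.test.com", "test.com", "test.com"),
    ("https://www.test.com?", "test.com", "test.com"),
])
def test_normalize_examples(url, normalized_domain, normalized):
    _url = URL(url)
    assert _url.normalized_domain == normalized_domain
    assert _url.normalized == normalized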
def test_url():
    assert URL("https://www.test.com").normalized == "test.com"
    assert URL("https://www.test.com?").normalized == "test.com"
    assert URL("https://www.test.com?").normalized_domain == "test.com"
    assert URL("https://www.test.com?").domain == "www.test.com"
    assert URL(u"https://www.test.com?").domain == "www.test.com"
    assert type(URL(u"https://www.test.com?").domain) == str
    assert type(URL(u"https://www.test.com?").normalized_domain) == str

    assert URL("https://.test.com").domain == ".test.com"
    assert URL("https://test.com.").domain == "test.com."
    assert URL("https://.test.com.").domain == ".test.com."
    assert URL("https://www.test.com.").normalized_domain == "test.com"
    assert URL("https://.www.test.com").normalized_domain == "test.com"
    assert URL("https://www.www.test.com").normalized_domain == "test.com"
    assert URL("https://.www.www.test.com").normalized_domain == "test.com"

    assert URL("https://.test.com").normalized_subdomain == ""
    assert URL("https://.www.test.com").normalized_subdomain == ""
    assert URL("https://.example.test.com").normalized_subdomain == "example"

    assert URL("http://sub.test.com/?x=a#b").normalized == "sub.test.com/?x=a"
    assert URL("http://sub.test.co.uk?x=a#b").normalized == "sub.test.co.uk/?x=a"
    assert URL("http://sub.test.co.uk/page1?x=a#b").normalized == "sub.test.co.uk/page1?x=a"
    assert URL("http://sub.test.co.uk/page1/?x=a#b").normalized == "sub.test.co.uk/page1/?x=a"
    assert URL("http://sub.test.co.uk?x=a#b").normalized_without_query == "sub.test.co.uk"
    assert URL("http://sub.test.co.uk?x=a#b").suffix == "co.uk"
    assert URL("http://sub.test.co.uk?x=a#b").pld == "test.co.uk"
    assert URL("http://www.sub.test.co.uk?x=a#b").subdomain == "www.sub"
    assert URL("http://www.sub.test.co.uk?x=a#b").normalized_subdomain == "sub"
    assert URL("http://sub.test.co.uk/azerza/azer.html?x=a#b").homepage == "http://sub.test.co.uk"

    assert URL(
        'http://dc.weber.edu/\xc3\xaf\xc2\xbf\xc2\xbd/field/?a=b&c=d&e=\xc3\xaf\xc2\xbf\xc2\xbd#qq',
        check_encoding=True
    ).url == "http://dc.weber.edu/%C3%AF%C2%BF%C2%BD/field/?a=b&c=d&e=%C3%AF%C2%BF%C2%BD#qq"

    assert URL("http://nord.gouv.fr").normalized == "nord.gouv.fr"