Example #1
    def _format_hyperlinks(self, links, exclude_nofollow=False):
        """ Formats a list of hyperlinks for return """

        if self.analysis.get("base_url"):
            base_url = URL(self.analysis["base_url"])
        else:
            base_url = self.source_url

        hyperlinks = []
        for href, txt, rel in links:
            url = URL(base_url.urljoin(href), check_encoding=True)

            if (
                    # Probably a forgotten mailto:
                    b"@" not in url.parsed.path and

                    # Probably an html error
                    not href.startswith(b"<") and

                    not (exclude_nofollow and rel == "nofollow") and

                    # This regex catches several things we don't want to follow:
                    # invalid hosts, TLDs, usernames, ..
                    _RE_VALID_NETLOC.match(url.parsed.netloc)
            ):
                hyperlinks.append({
                    "href": url,
                    "text": txt.strip()
                })

        return hyperlinks
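The check in Example #1 relies on a module-level _RE_VALID_NETLOC pattern that is not part of the snippet. The project's actual expression may differ; a rough, hypothetical stand-in that would let the snippet run on its own could look like this:

import re

# Hypothetical stand-in for _RE_VALID_NETLOC: require one or more dotted host
# labels followed by an alphabetic TLD and an optional port. Netlocs that still
# contain user-info ("user@host") or other stray characters fail the full match.
_RE_VALID_NETLOC = re.compile(
    br"^([a-z0-9]([a-z0-9-]*[a-z0-9])?\.)+[a-z]{2,}(:\d+)?$",
    re.IGNORECASE
)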
Example #2
def test_simple_insert_and_search(indexer, searcher):
    from cosrlib.url import URL

    indexed = indexer.client.index_document(
        """<html><title>hello world</title><body>hello body</body></html>""",
        url=URL("http://example.com"))

    indexed2 = indexer.client.index_document(
        """<html><title>another world</title><body>document body</body></html>""",
        url=URL("http://example2.com/page2"))
    indexer.client.flush()
    indexer.client.refresh()

    assert indexed["docid"]
    assert indexed["url"].url == "http://example.com"
    assert indexed["rank"] > 0

    assert indexed2["docid"] != indexed["docid"]

    search_results = searcher.client.search("hello")
    assert len(search_results["hits"]) == 1
    assert search_results["hits"][0]["docid"] == indexed["docid"]
    assert search_results["hits"][0]["score"] > 0

    search_results = searcher.client.search("world")
    assert len(search_results["hits"]) == 2

    search_results = searcher.client.search("world", domain="example2.com")
    assert len(search_results["hits"]) == 1
    assert search_results["hits"][0]["docid"] == indexed2["docid"]
Example #3
    def parse_canonical_url(self):
        """ Look for a valid <link rel="canonical"> tag in the HEAD of the page.
            Returns an absolute URL when one is found, None otherwise """

        # TODO: should we use og:url as canonical url? http://ogp.me/

        meta_rel_canonical = [
            x for x in self.analysis.get("head_links", [])
            if (x.get("rel") or "").lower() == "canonical"
        ]
        if len(meta_rel_canonical) > 0:
            canonical_url = meta_rel_canonical[0].get("href") or None

            if not canonical_url:
                return None

            canonical_url = URL(canonical_url, check_encoding=True)

            # If the canonical URL is relative, make it absolute based on the source_url
            if not canonical_url.domain or not canonical_url.parsed.scheme:
                canonical_url = URL(
                    urlparse.urljoin(self.source_url.url or "",
                                     canonical_url.url))

            # For now we force canonical URLs to be on the same domain. May relax carefully in the future
            elif self.source_url and canonical_url.domain != self.source_url.domain:
                return None

            return canonical_url
Example #4
    def _format_hyperlinks(self, links):
        """ Formats a list of hyperlinks for return """

        if self.analysis.get("base_url"):
            base_url = URL(self.analysis["base_url"])
        else:
            base_url = self.source_url

        hyperlinks = []
        for href, words in links:
            url = URL(base_url.urljoin(href), check_encoding=True)

            if (
                    # Probably a forgotten mailto:
                    "@" not in url.parsed.path and

                    # Probably an html error
                    not href.startswith("<") and

                    # This regex catches several things we don't want to follow:
                    # invalid hosts, TLDs, usernames, ..
                    _RE_VALID_NETLOC.match(url.parsed.netloc)
            ):
                hyperlinks.append({
                    "href": url,
                    "words": self._split_words(words)
                })

        return hyperlinks
Example #5
    def _format_hyperlinks(self, links, exclude_nofollow=False):
        """ Formats a list of hyperlinks for return """

        if self.analysis.get("base_url"):
            base_url = URL(self.analysis["base_url"])
        else:
            base_url = self.source_url

        hyperlinks = []
        for href, txt, rel in links:
            url = URL(base_url.urljoin(href), check_encoding=True)

            if (
                    # Probably a forgotten mailto:
                    b"@" not in url.parsed.path and

                    # Probably an html error
                    not href.startswith(b"<") and

                    not (exclude_nofollow and rel == "nofollow") and

                    # This regex catches several things we don't want to follow:
                    # invalid hosts, TLDs, usernames, ..
                    _RE_VALID_NETLOC.match(url.parsed.netloc)):
                hyperlinks.append({"href": url, "text": txt.strip()})

        return hyperlinks
Example #6
    def __init__(self, source_data, url=None, headers=None, index_level=2):
        self.source_data = source_data
        self.source_headers = headers or {}
        self.index_level = index_level

        if not url:
            self.source_url = URL("")
        elif is_basestring(url):
            self.source_url = URL(url)
        else:
            self.source_url = url

        self._word_groups = []
Example #7
    def __init__(self, source_data, url=None, headers=None):
        self.source_data = source_data
        self.source_headers = headers or {}

        if not url:
            self.source_url = URL("")
        elif isinstance(url, basestring):
            self.source_url = URL(url)
        else:
            self.source_url = url

        self._title = None
        self._hyperlinks = []
        self._word_groups = []
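Examples #6 and #7 differ mainly in how they detect a string URL: Example #7 uses the Python 2-only basestring builtin, while Example #6 delegates to an is_basestring helper. The project's own helper is not shown here; a minimal sketch of such a Python 2/3-compatible check, under that assumption, might be:

import sys

def is_basestring(value):
    """ Sketch: True for text or byte strings on both Python 2 and Python 3 """
    if sys.version_info[0] >= 3:
        return isinstance(value, (str, bytes))
    return isinstance(value, basestring)  # noqa: F821 - Python 2 builtin only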
Example #8
    def iter_items(self, partition):
        """ Yields objects in the source's native format """

        warc_stream = self.open_warc_stream(partition["path"])

        for record in warc_stream:

            if not record.url:
                continue

            if record['Content-Type'] != 'application/http; msgtype=response':
                continue

            url = URL(record.url, check_encoding=True)

            do_parse, index_level = self.qualify_url(url)

            if not do_parse:
                continue

            payload = record.payload.read()
            parser = HttpParser()
            parser.execute(payload, len(payload))

            headers = parser.get_headers()

            if 'text/html' not in headers.get("content-type", ""):
                # print "Not HTML?", record.url, headers
                continue

            yield url, headers, "html", index_level, parser.recv_body()
Example #9
    def get_domain_ids(self, urls):
        """ Return a list of domain IDs for these URLs """
        ret = []
        for u in urls:
            url = URL(u)
            ret.append(datasources["ids"].domain_id(url))
        return ret
Example #10
    def get_domain_ids(self, urls):
        """ Return a list of domain IDs for these URLs """
        ret = []
        for u in urls:
            url = URL(u)
            ret.append(make_domain_id(url))
        return ret
Example #11
    def import_row(self, i, row):
        """ Yields (key, values) pairs for this row from the dump file """

        event, elem = row

        if event != "end":
            return

        if elem.tag == "{http://dmoz.org/rdf/}ExternalPage":
            url = URL(elem.attrib["about"].encode("utf-8")).normalized
            title = elem.find("{http://purl.org/dc/elements/1.0/}Title")
            description = elem.find(
                "{http://purl.org/dc/elements/1.0/}Description")

            if url:
                yield url, {
                    "dmoz_title": (title.text or "") if
                    (title is not None) else "",
                    "dmoz_description": (description.text or "") if
                    (description is not None) else ""
                }

            self.clear_xml_elements(title, description, elem)

        elif elem.tag not in ("{http://purl.org/dc/elements/1.0/}Title",
                              "{http://purl.org/dc/elements/1.0/}Description"):
            self.clear_xml_elements(elem)
Example #12
    def iter_rows(self):
        if config["TESTDATA"] == "1":
            extract_dir = self.dump_testdata
            clean = False
        else:
            extract_dir = tempfile.mkdtemp(suffix="cosr-ut1-import")
            clean = True

            os.system("curl %s > %s/blacklists.tar.gz" %
                      (self.dump_url, extract_dir))
            os.system("cd %s && tar zxf blacklists.tar.gz" % extract_dir)
            extract_dir += "/blacklists"

        data = defaultdict(list)

        for fp in os.listdir(extract_dir):
            fullpath = os.path.join(extract_dir, fp)

            if os.path.isdir(fullpath) and not os.path.islink(fullpath):

                cnt = 0

                with open(fullpath + "/domains", 'r') as f:
                    for line in f.readlines():
                        url = URL(line.strip()).normalized
                        if url:
                            data[url].append(fp)
                            cnt += 1

                if os.path.isfile(fullpath + "/urls"):
                    with open(fullpath + "/urls", 'r') as f:
                        for line in f.readlines():
                            url = URL(line.strip()).normalized
                            if url:
                                data[url].append(fp)
                                cnt += 1

                print("Done %s (%s entries)" % (fp, cnt))

        if clean:
            shutil.rmtree(os.path.dirname(extract_dir))

        for key, value in data.iteritems():
            yield key, {"ut1_blacklist": value}
Example #13
    def import_dump(self):
        """ Read a dump from a URL or a local file, and merge its data into RocksDB """

        db = Storage(read_only=False)

        write_batch = db.write_batch(None)
        start_time = time.time()

        done = 0

        for url, values in self.iter_rows():

            # TODO: RocksDB merge operator?
            existing_value = db.get(url)
            existing_pb = urlserver_pb2.UrlMetadata()

            if existing_value is not None:
                existing_pb.ParseFromString(existing_value)
            else:
                # In order to send the protobuf message untouched via RPC, we pre-compute the ID
                existing_pb.id = make_url_id(URL(url))

            for k, v in values.iteritems():
                if k in ("ut1_blacklist", ):
                    for elt in v:
                        existing_pb.ut1_blacklist.append(elt)  # pylint: disable=no-member
                else:
                    setattr(existing_pb, k, v)

            # print "IMPORT", key, existing_pb

            write_batch.put(url, existing_pb.SerializeToString())

            done += 1

            if self.dump_batch_size and (done % self.dump_batch_size) == 0:

                eta = 0
                if self.dump_count_estimate:
                    eta = float(
                        self.dump_count_estimate - done
                    ) / (
                        3600.0 * done / (time.time() - start_time)
                    )

                print("Done %s (%s/s, ~%0.2f%%, ETA %0.2fh)" % (
                    done,
                    int(done / (time.time() - start_time)),
                    (float(done * 100) / self.dump_count_estimate) if self.dump_count_estimate else 0,
                    eta
                ))
                write_batch = db.write_batch(write_batch)

        print("Total rows: %s" % done)
        db.write_batch(write_batch)
        db.close()
Example #14
    def add_hyperlink(self, href, words):
        """ Validate then add a hyperlink parsed from the document """

        # Resolve relative links. Some might have a weird syntax so we need to
        # catch exceptions.
        try:
            url = URL((self.base_url or self.source_url).urljoin(href),
                      check_encoding=True)
        except ValueError:
            return

        if url.parsed.scheme in ("http", "https"):
            self._hyperlinks.append({"href": url, "words": words})
Example #15
        def iter_domain(record):
            """ Transforms Row(domain=www.example.com) into tuple([int64 ID], "example.com") """

            domain = record["domain"]
            if not domain or not domain.strip():
                return []

            name = URL("http://" + domain).normalized_domain

            try:
                _id = _fast_make_domain_id(name)
            except Exception:  # pylint: disable=broad-except
                return []

            return [(py2_long(_id), str(name))]
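Example #15 wraps the computed ID in py2_long, presumably so the value becomes a Python 2 long (matching an int64 column) while staying a plain int on Python 3. Assuming that intent, a hedged sketch of the helper could be:

import sys

def py2_long(value):
    """ Sketch: return a Python 2 ``long`` when available, otherwise an ``int`` """
    if sys.version_info[0] < 3:
        return long(value)  # noqa: F821 - Python 2 builtin only
    return int(value)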
Example #16
    def import_dump(self):
        """ Read a dump from a URL or a local file, and merge its data into RocksDB """

        db = Storage(read_only=False)

        write_batch = db.write_batch(None)
        batch_time = time.time()

        done = 0
        for i, row in self.iter_dump():

            for key, values in self.import_row(i, row):

                url = key.encode("utf-8")

                # TODO: RocksDB merge operator?
                existing_value = db.get(url)
                existing_pb = urlserver_pb2.UrlMetadata()
                if existing_value is not None:
                    existing_pb.ParseFromString(existing_value)
                else:
                    # In order to send the protobuf message untouched via RPC, we pre-compute the ID
                    existing_pb.id = make_url_id(URL(url))

                for k, v in values.iteritems():
                    if k in ("ut1_blacklist", ):
                        for elt in v:
                            existing_pb.ut1_blacklist.append(elt)  # pylint: disable=no-member
                    else:
                        setattr(existing_pb, k, v)

                # print "IMPORT", key, existing_pb

                write_batch.put(url, existing_pb.SerializeToString())

                done += 1

                if self.dump_batch_size and (done % self.dump_batch_size) == 0:
                    print "Done %s (%s/s)" % (done,
                                              int(done /
                                                  (time.time() - batch_time)))
                    write_batch = db.write_batch(write_batch)
                    batch_time = time.time()

        print "Total rows: %s" % done
        db.write_batch(write_batch)
        db.close()
Example #17
    def get_metadata(self, urls):
        """ Return a list of tuples of metadata for these *normalized* URLs """

        ret = []
        for url in urls:

            data = db.get(url)

            # If the URL has been in none of our static databases, we still want to return an ID
            if data is None:
                obj = urlserver_pb2.UrlMetadata()
                obj.id = make_url_id(URL(url))
                data = obj.SerializeToString()

            ret.append(data)

        return ret
Example #18
    def get_metadata(self, urls):
        """ Return a list of tuples of metadata for these URLs """

        ret = []
        for u in urls:

            url = URL(u)
            ret.append((
                datasources["ids"].url_id(url),
                datasources["ids"].domain_id(url),
                datasources["alexa_top1m"].rank(url),
                datasources["dmoz_url"].exists(url),
                datasources["dmoz_domain"].exists(url),
                datasources["ut1_blacklist"].classes(url),
                datasources["webdatacommons_hc"].rank(url)
            ))

        return ret
Example #19
    def iter_items(self, partition):
        """ Partition can be either a single raw document, or a filepath to a JSON file """

        if partition.get("path"):
            with open(partition["path"], "r") as f:
                docs = json.load(f)
        else:
            docs = [partition["doc"]]

        for doc in docs:

            url = URL(doc["url"].encode("utf-8"))

            do_parse, index_level = self.qualify_url(url)

            if do_parse:

                yield (url, {
                    "Content-Type": "text/html"
                }, "html", index_level, doc["content"].encode("utf-8"))
Example #20
    def import_row(self, i, row):
        """ Maps a raw data row into a list of (key, values) pairs """

        # https://www.wikidata.org/wiki/Property:P856
        official_website = None
        for claim in row["claims"].get("P856", []):
            if (claim["type"] == "statement"
                    and claim["mainsnak"]["datatype"] == "url"
                    and claim["mainsnak"].get("datavalue")):
                official_website = URL(
                    claim["mainsnak"]["datavalue"]["value"]).normalized

        # TODO: other languages!
        label_en = row["labels"].get("en", {}).get("value") or ""
        description_en = row["descriptions"].get("en", {}).get("value") or ""

        if official_website:
            yield official_website, {
                "wikidata_title": label_en,
                "wikidata_description": description_en,
                "wikidata_sitelinks": len(row.get("sitelinks") or [])
            }
Example #21
    def endElement(self, name):
        if name == "ExternalPage":
            url = URL(self._current_page["url"])

            # TODO: Import "cool" flag, like IMDb has in http://www.dmoz.org/Arts/Movies/Databases/
            db_urls_wb.put(
                url.normalized.encode("utf-8"),
                json.dumps([
                    self._current_page.get("d:Title"),
                    self._current_page.get("d:Description")
                    # , self._current_page.get("topic")
                ]).encode("utf-8"))
            self._current_page = None

            db_domains_wb.put(url.normalized_domain.encode("utf-8"),
                              "1")  # TODO put total count?

            self.count += 1
            if self.count % 100000 == 0:
                print "Done %s" % self.count

        self._current_key = None
Example #22
    def _start_tag(self, level, tag_name, attrs=None):
        """ We parsed a new element node in the document tree. """

        attrs = attrs or {}

        # TODO: different weights (h1-h6 tags for instance)
        weight = 1

        # Add this new element to the stack
        self.current_stack.append((tag_name, weight))

        # TODO: If we see role=presentation, should we transform the tag_name into a simple span?
        # https://www.w3.org/TR/wai-aria/roles#presentation

        if tag_name == "head":
            self.level_head = level
            return

        if tag_name == "article":
            self.level_article = level

        # If we are in the HEAD, only metadata should be interesting
        if self.level_head is not None:

            if tag_name == "link":
                self.head_links.append(attrs)

            elif tag_name == "meta":
                meta_name = (attrs.get("name") or attrs.get("property")
                             or "").lower()
                if meta_name in defs.META_WHITELIST:
                    self.head_metas[meta_name] = (attrs.get("content")
                                                  or "").strip()

            elif tag_name == "base" and attrs.get(
                    "href") and self.base_url is None:
                self.base_url = URL(attrs["href"])

        # If we are in the BODY, inspect things a bit more
        else:

            if tag_name == "img":
                self._close_word_group()
                if attrs.get("alt"):
                    self.add_word_group(attrs["alt"], tag="img")
                if attrs.get("src"):
                    self.add_word_group(" ".join(
                        self._split_filename_words(attrs["src"])),
                                        tag="img")

            # Does this element start a hidden subtree?
            if self.level_hidden is None and self._guess_element_hidden(attrs):
                self.level_hidden = level

            # Are we in a boilerplate subtree with a bypass tag? (like a title in a header)
            if self.level_boilerplate is not None and self.level_boilerplate_bypass is None:
                if tag_name in defs.TAGS_BOILERPLATE_BYPASS:
                    self.level_boilerplate_bypass = level

            # Does this element start a boilerplate subtree?
            if self.level_boilerplate is None and self._guess_element_boilerplate(
                    attrs):
                self.level_boilerplate = level

            # Does this element start a hyperlink subtree?
            # TODO how to deal with nested a?
            if tag_name == "a" and attrs.get(
                    "rel") != "nofollow" and attrs.get("href"):
                self.level_hyperlink = level
                self.link_words = []
                self.link_href = attrs.get("href")

            # does this element break word groups?
            is_separator = tag_name in defs.TAGS_SEPARATORS

            # Do we want to start collecting words?
            # TODO: would we avoid this for elements with no children?
            if self.level_hidden is None and self.level_boilerplate is None and is_separator:
                toplevel_word_group = {"tag": tag_name, "weight": weight}
                self._start_word_group(toplevel_word_group)
Example #23
def test_pickling():

    dumped = pickle.dumps(URL("http://sub.test.co.uk/azerza/azer.html?x=a#b"))
    url = pickle.loads(dumped)
    assert url.url == "http://sub.test.co.uk/azerza/azer.html?x=a#b"
Example #24
def test_normalize(url, normalized_domain, normalized):
    _url = URL(url)
    assert _url.normalized_domain == normalized_domain
    assert _url.normalized == normalized
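Example #24 shows only the test body; it is presumably driven by a pytest.mark.parametrize decorator that supplies the (url, normalized_domain, normalized) triples. A hypothetical parametrization, using cases confirmed by the assertions in Example #25 below, could look like this:

import pytest

from cosrlib.url import URL


# Hypothetical cases; the project's real parameter list is not shown here.
@pytest.mark.parametrize("url, normalized_domain, normalized", [
    ("https://www.test.com", "test.com", "test.com"),
    ("https://www.test.com?", "test.com", "test.com"),
])
def test_normalize(url, normalized_domain, normalized):
    _url = URL(url)
    assert _url.normalized_domain == normalized_domain
    assert _url.normalized == normalized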
Example #25
def test_url():
    assert URL("https://www.test.com").normalized == "test.com"
    assert URL("https://www.test.com?").normalized == "test.com"
    assert URL("https://www.test.com?").normalized_domain == "test.com"
    assert URL("https://www.test.com?").domain == "www.test.com"

    assert URL(u"https://www.test.com?").domain == "www.test.com"
    assert type(URL(u"https://www.test.com?").domain) == str
    assert type(URL(u"https://www.test.com?").normalized_domain) == str

    assert URL("https://.test.com").domain == ".test.com"
    assert URL("https://test.com.").domain == "test.com."
    assert URL("https://.test.com.").domain == ".test.com."

    assert URL("https://www.test.com.").normalized_domain == "test.com"
    assert URL("https://.www.test.com").normalized_domain == "test.com"
    assert URL("https://www.www.test.com").normalized_domain == "test.com"
    assert URL("https://.www.www.test.com").normalized_domain == "test.com"

    assert URL("https://.test.com").normalized_subdomain == ""
    assert URL("https://.www.test.com").normalized_subdomain == ""
    assert URL("https://.example.test.com").normalized_subdomain == "example"

    assert URL("http://sub.test.com/?x=a#b").normalized == "sub.test.com/?x=a"
    assert URL(
        "http://sub.test.co.uk?x=a#b").normalized == "sub.test.co.uk/?x=a"

    assert URL("http://sub.test.co.uk/page1?x=a#b"
               ).normalized == "sub.test.co.uk/page1?x=a"
    assert URL("http://sub.test.co.uk/page1/?x=a#b"
               ).normalized == "sub.test.co.uk/page1/?x=a"

    assert URL("http://sub.test.co.uk?x=a#b"
               ).normalized_without_query == "sub.test.co.uk"

    assert URL("http://sub.test.co.uk?x=a#b").suffix == "co.uk"
    assert URL("http://sub.test.co.uk?x=a#b").pld == "test.co.uk"

    assert URL("http://www.sub.test.co.uk?x=a#b").subdomain == "www.sub"
    assert URL("http://www.sub.test.co.uk?x=a#b").normalized_subdomain == "sub"

    assert URL("http://sub.test.co.uk/azerza/azer.html?x=a#b"
               ).homepage == "http://sub.test.co.uk"

    assert URL(
        'http://dc.weber.edu/\xc3\xaf\xc2\xbf\xc2\xbd/field/?a=b&c=d&e=\xc3\xaf\xc2\xbf\xc2\xbd#qq',
        check_encoding=True
    ).url == "http://dc.weber.edu/%C3%AF%C2%BF%C2%BD/field/?a=b&c=d&e=%C3%AF%C2%BF%C2%BD#qq"

    assert URL("http://nord.gouv.fr").normalized == "nord.gouv.fr"