Beispiel #1
0
 def print_refs(self, url, out=True):
     """Write the references of `url` to the output, one per line.

     `url` is first checked with `self.assert_in_web`. When `out` is
     truthy the outgoing references of the url's index node are listed,
     otherwise its incoming references.
     """
     self.assert_in_web(url)
     node = self.index.get(url)
     # pick the direction of the references to list
     refs = node.outgoing if out else node.incoming
     for ref in refs:
         io.write_out("%s\n" % ref)
Beispiel #2
0
 def print_refs(self, url, out=True):
     """Print one line per reference recorded for `url`.

     The url is validated through `self.assert_in_web` and its node is
     looked up in `self.index`. `out` selects the direction: truthy
     lists `node.outgoing`, falsy lists `node.incoming`.
     """
     self.assert_in_web(url)
     entry = self.index.get(url)
     if out:
         linked = entry.outgoing
     else:
         linked = entry.incoming
     for target in linked:
         line = "%s\n" % target
         io.write_out(line)
Beispiel #3
0
def qualify_urls(ref_url, urls, rule, newqueue, wb):
    """Classify each url against `rule` and queue the ones worth visiting.

    For every url the rule's "dump", "fetch" and "spider" patterns are
    applied with `recipe.apply_mask`; a spider match additionally has to
    pass `recipe.apply_hostfilter` with the rule's "host_filter".

    Urls not yet contained in `wb` are written to the output when they
    match "dump", and appended to `newqueue` as a
    ``{"url": ..., "mode": ...}`` record when they match "fetch" and/or
    "spider". Every url that matched anything is passed to
    `wb.add_url(ref_url, [url])`, whether or not it was already in `wb`.

    Returns the tuple ``(newqueue, wb)`` -- the same objects passed in.
    """
    for candidate in urls:
        # evaluate the rule's patterns; spidering also needs the host filter
        wants_dump = bool(recipe.apply_mask(rule.get("dump"), candidate))
        wants_fetch = bool(recipe.apply_mask(rule.get("fetch"), candidate))
        wants_spider = bool(
            recipe.apply_mask(rule.get("spider"), candidate)
            and recipe.apply_hostfilter(rule.get("host_filter"), candidate)
        )
        wants_visit = wants_fetch or wants_spider

        # only urls the web does not contain yet are printed and queued
        if candidate not in wb:
            if wants_dump:
                io.write_out("%s\n" % candidate)
            if wants_visit:
                if not wants_spider:
                    mode = fetch.Fetcher.FETCH
                elif not wants_fetch:
                    mode = fetch.Fetcher.SPIDER
                else:
                    mode = fetch.Fetcher.SPIDER_FETCH
                newqueue.append({"url": candidate, "mode": mode})

        # any match at all gets the url registered under its referrer
        if wants_dump or wants_visit:
            wb.add_url(ref_url, [candidate])

    return newqueue, wb
Beispiel #4
0
def qualify_urls(ref_url, urls, rule, newqueue, wb):
    """Sort `urls` found on `ref_url` into dump / fetch / spider actions.

    Each url is tested against the "dump", "fetch" and "spider" entries
    of `rule` via `recipe.apply_mask`. The spider flag is only set when
    `recipe.apply_hostfilter` with the rule's "host_filter" also accepts
    the url.

    For a url that is not in `wb`:
      * a dump match writes the url followed by a newline to the output;
      * a fetch and/or spider match appends a record with the keys
        "url" and "mode" to `newqueue`, where mode is one of
        `fetch.Fetcher.SPIDER_FETCH`, `FETCH` or `SPIDER`.

    Any url matched by at least one pattern is handed to
    `wb.add_url(ref_url, [url])`.

    Returns ``(newqueue, wb)``, the objects that were passed in.
    """
    for link in urls:
        dumpable = True if recipe.apply_mask(rule.get("dump"), link) else False
        fetchable = True if recipe.apply_mask(rule.get("fetch"), link) else False

        # the host filter is only consulted once the spider mask matched
        spiderable = False
        if recipe.apply_mask(rule.get("spider"), link):
            if recipe.apply_hostfilter(rule.get("host_filter"), link):
                spiderable = True

        already_known = link in wb

        if not already_known and dumpable:
            io.write_out("%s\n" % link)

        if not already_known and (fetchable or spiderable):
            entry = {"url": link}
            if fetchable and spiderable:
                entry["mode"] = fetch.Fetcher.SPIDER_FETCH
            elif fetchable:
                entry["mode"] = fetch.Fetcher.FETCH
            else:
                entry["mode"] = fetch.Fetcher.SPIDER
            newqueue.append(entry)

        # record the link in the web whenever some pattern matched it
        if dumpable or fetchable or spiderable:
            wb.add_url(ref_url, [link])

    return newqueue, wb
Beispiel #5
0
 def print_aliases(self, url):
     """Write each alias of `url` to the output, one per line.

     `url` is checked with `self.assert_in_web` before its node is
     looked up in `self.index`.
     """
     self.assert_in_web(url)
     node = self.index.get(url)
     for alias in node.aliases:
         io.write_out("%s\n" % alias)
Beispiel #6
0
 def dump(self):
     """Write every item yielded by iterating `self.index`, one per line."""
     for item in self.index:
         line = "%s\n" % item
         io.write_out(line)
Beispiel #7
0
 def print_aliases(self, url):
     """List the aliases stored on the index node of `url`.

     The url is validated through `self.assert_in_web` first; each alias
     is then written to the output followed by a newline.
     """
     self.assert_in_web(url)
     known_aliases = self.index.get(url).aliases
     for name in known_aliases:
         line = "%s\n" % name
         io.write_out(line)
Beispiel #8
0
 def dump(self):
     """Output the contents of `self.index`, one entry per line.

     Entries are whatever iterating `self.index` yields (presumably the
     urls it is keyed by -- confirm against the index type).
     """
     for entry in self.index:
         io.write_out("%s\n" % entry)