def print_refs(self, url, out=True):
    """Write the references recorded for *url*, one per line.

    When ``out`` is truthy the node's ``outgoing`` collection is written,
    otherwise its ``incoming`` collection. ``self.assert_in_web(url)`` is
    called first (presumably it rejects URLs that are not in the web --
    confirm against its definition).
    """
    self.assert_in_web(url)
    node = self.index.get(url)
    refs = node.outgoing if out else node.incoming
    for ref in refs:
        io.write_out("%s\n" % ref)
def qualify_urls(ref_url, urls, rule, newqueue, wb):
    """Classify each url against *rule* and queue/record the matches.

    For every url the "dump", "fetch" and "spider" masks of ``rule`` are
    applied (the spider match additionally has to pass the rule's
    "host_filter").  Urls not yet contained in ``wb`` are written out when
    they match "dump" and appended to ``newqueue`` as a
    ``{"url": ..., "mode": ...}`` record when they match "fetch" and/or
    "spider".  Any url matched by at least one mask is registered in ``wb``
    as referenced from ``ref_url``.

    Returns the (mutated) ``newqueue`` and ``wb`` as a tuple.
    """
    for url in urls:
        # Evaluate the masks in a fixed order; the host filter is only
        # consulted when the spider mask itself matched.
        wants_dump = bool(recipe.apply_mask(rule.get("dump"), url))
        wants_fetch = bool(recipe.apply_mask(rule.get("fetch"), url))
        wants_spider = bool(
            recipe.apply_mask(rule.get("spider"), url)
            and recipe.apply_hostfilter(rule.get("host_filter"), url)
        )

        # Only urls that are new to the web get dumped or queued.
        if url not in wb:
            if wants_dump:
                io.write_out("%s\n" % url)
            if wants_fetch or wants_spider:
                if wants_fetch and wants_spider:
                    mode = fetch.Fetcher.SPIDER_FETCH
                elif wants_fetch:
                    mode = fetch.Fetcher.FETCH
                else:
                    mode = fetch.Fetcher.SPIDER
                newqueue.append({"url": url, "mode": mode})

        # Record the link in the web if any mask matched, new url or not.
        if wants_dump or wants_fetch or wants_spider:
            wb.add_url(ref_url, [url])

    return newqueue, wb
def qualify_urls(ref_url, urls, rule, newqueue, wb):
    """Sort *urls* into dump / fetch / spider work according to *rule*.

    Each url is tested against the rule's "dump", "fetch" and "spider"
    masks; a spider match also requires the rule's "host_filter" to accept
    the url.  A url that is not already in ``wb`` is written out if it
    matched "dump", and is appended to ``newqueue`` as a record with keys
    "url" and "mode" if it matched "fetch" and/or "spider".  Every url that
    matched anything is added to ``wb`` with ``ref_url`` as its referrer.

    Returns ``(newqueue, wb)``; both are modified in place.
    """
    for url in urls:
        # The three masks are always applied in this order; the host
        # filter runs only after a positive spider mask.
        is_dump = bool(recipe.apply_mask(rule.get("dump"), url))
        is_fetch = bool(recipe.apply_mask(rule.get("fetch"), url))
        is_spider = bool(
            recipe.apply_mask(rule.get("spider"), url)
            and recipe.apply_hostfilter(rule.get("host_filter"), url)
        )
        matched = is_dump or is_fetch or is_spider

        # Dumping and queueing apply to previously unseen urls only.
        if url not in wb:
            if is_dump:
                io.write_out("%s\n" % url)
            if is_fetch or is_spider:
                if is_fetch and is_spider:
                    mode = fetch.Fetcher.SPIDER_FETCH
                elif is_fetch:
                    mode = fetch.Fetcher.FETCH
                else:
                    mode = fetch.Fetcher.SPIDER
                newqueue.append({"url": url, "mode": mode})

        # Matched urls are linked into the web even when already known.
        if matched:
            wb.add_url(ref_url, [url])

    return newqueue, wb
def print_aliases(self, url):
    """Write every entry of the ``aliases`` collection of *url*'s node.

    ``self.assert_in_web(url)`` is called before the lookup (presumably to
    reject URLs that are not in the web -- confirm against its definition).
    Each alias is written on its own line.
    """
    self.assert_in_web(url)
    node = self.index.get(url)
    aliases = node.aliases
    for alias in aliases:
        io.write_out("%s\n" % alias)
def dump(self):
    """Write every item yielded by iterating ``self.index``, one per line.

    The items are presumably the URLs the index is keyed by -- confirm
    against the index type.
    """
    for entry in self.index:
        io.write_out("%s\n" % entry)