def qualify_urls(self, ref_url, urls, rule, newqueue):
    """Classify each url against ``rule`` and queue the ones worth visiting.

    For every url three independent checks are made with the ``recipe``
    helpers: the rule's "dump" mask, its "fetch" mask, and its "spider"
    mask (the latter additionally requires the rule's "host_filter" to
    accept the url).

    Urls not yet present in ``self.session.wb`` are written out when the
    dump mask matched, and appended to ``newqueue`` as a
    ``{"url": ..., "mode": ...}`` record when the fetch and/or spider
    checks matched. Any url matched by at least one check is registered
    in the web via ``self.session.wb.add_url(ref_url, [url])``.

    Args:
        ref_url: Url passed to ``add_url`` as the referrer of each match.
        urls: Iterable of candidate urls.
        rule: Mapping queried with ``.get`` for the keys "dump", "fetch",
            "spider" and "host_filter".
        newqueue: List that receives the new records; mutated in place.

    Returns:
        The same ``newqueue`` object that was passed in.
    """
    for candidate in urls:
        # Evaluate the rule's patterns; the host filter is only consulted
        # when the spider mask itself matched.
        wants_dump = bool(recipe.apply_mask(rule.get("dump"), candidate))
        wants_fetch = bool(recipe.apply_mask(rule.get("fetch"), candidate))
        wants_spider = bool(
            recipe.apply_mask(rule.get("spider"), candidate)
            and recipe.apply_hostfilter(rule.get("host_filter"), candidate)
        )

        entry = {"url": candidate}
        already_in_web = candidate in self.session.wb

        if not already_in_web:
            if wants_dump:
                ioutils.write_out("%s\n" % candidate)

            # Only urls that must be fetched or spidered are queued.
            if wants_fetch or wants_spider:
                if wants_fetch and wants_spider:
                    entry["mode"] = fetch.Fetcher.SPIDER_FETCH
                elif wants_fetch:
                    entry["mode"] = fetch.Fetcher.FETCH
                else:
                    entry["mode"] = fetch.Fetcher.SPIDER
                newqueue.append(entry)

        # Record the url in the web if any of the checks matched it.
        if wants_dump or wants_fetch or wants_spider:
            self.session.wb.add_url(ref_url, [candidate])

    return newqueue
def print_refs(self, url, out=True):
    """Write the references of ``url``'s node, one per line.

    ``url`` is first checked with ``self.assert_in_web``; its node is then
    looked up in ``self.index``.

    Args:
        url: Url whose node should be inspected.
        out: When truthy (the default) the node's ``outgoing`` collection
            is written; otherwise its ``incoming`` collection is.
    """
    self.assert_in_web(url)
    node = self.index.get(url)
    refs = node.outgoing if out else node.incoming
    for ref in refs:
        ioutils.write_out("%s\n" % ref)
def print_aliases(self, url):
    """Write every alias of ``url``'s node, one per line.

    ``url`` is first checked with ``self.assert_in_web``; the aliases are
    read from the ``aliases`` attribute of its entry in ``self.index``.

    Args:
        url: Url whose aliases should be written.
    """
    self.assert_in_web(url)
    node = self.index.get(url)
    for alias in node.aliases:
        ioutils.write_out("%s\n" % alias)
def dump(self):
    """Write every item yielded by iterating ``self.index``, one per line."""
    for url in self.index:
        line = "%s\n" % url
        ioutils.write_out(line)