def page_crawled(self, response, links):
    """Mark ``response`` as crawled and score the links not yet seen.

    ``response.meta['state']`` is set to the CRAWLED state id. Every link
    whose ``meta['state']`` is ``None`` is resolved through
    ``self.canonicalsolver``, scored with ``self.get_score`` and switched
    to the QUEUED state id; links with any other state are left untouched.

    Returns a dict mapping fingerprint -> score for the newly queued links.
    """
    response.meta['state'] = _state.get_id('CRAWLED')
    link_scores = {}
    for candidate in links:
        # Only links without an assigned state are scored and queued.
        if candidate.meta['state'] is not None:
            continue
        resolved = self.canonicalsolver.get_canonical_url(candidate)
        canonical_url, fprint, _ignored = resolved
        link_scores[fprint] = self.get_score(canonical_url)
        candidate.meta['state'] = _state.get_id('QUEUED')
    return link_scores
def page_crawled(self, response, links):
    """Mark ``response`` as crawled and score the links not yet seen.

    ``response.meta['state']`` is set to the CRAWLED state id. Every link
    whose ``meta['state']`` is ``None`` is resolved through
    ``self.canonicalsolver``, scored with ``self.get_score`` and switched
    to the QUEUED state id; links with any other state are left untouched.

    A debug line with the response URL and the links is written to stdout.

    Returns a dict mapping fingerprint -> score for the newly queued links.
    """
    scores = {}
    response.meta['state'] = _state.get_id('CRAWLED')
    for link in links:
        if link.meta['state'] is None:
            url, fingerprint, _ = self.canonicalsolver.get_canonical_url(link)
            scores[fingerprint] = self.get_score(url)
            link.meta['state'] = _state.get_id('QUEUED')
    # Parenthesised single-argument form: prints the same text under the
    # Python 2 print statement and the Python 3 print() function.
    print("$$ inside page_crawled : value -> {0}, links -> {1}".format(response.url, links))
    return scores
def add_seeds(self, seeds):
    """Queue the seeds that have no state yet, each with a score of 1.0.

    A seed whose ``meta['state']`` is ``None`` is resolved through
    ``self.canonicalsolver``, given the score ``1.0`` and switched to the
    QUEUED state id. Seeds with any other state are skipped.

    Returns a dict mapping fingerprint -> 1.0 for the newly queued seeds.
    """
    seed_scores = {}
    for entry in seeds:
        # Seeds that already carry a state are not re-queued.
        if entry.meta['state'] is not None:
            continue
        _url, fprint, _ignored = self.canonicalsolver.get_canonical_url(entry)
        seed_scores[fprint] = 1.0
        entry.meta['state'] = _state.get_id('QUEUED')
    return seed_scores
def add_seeds(self, seeds):
    """Queue the seeds that have no state yet, each with a score of 1.0.

    A seed whose ``meta['state']`` is ``None`` is resolved through
    ``self.canonicalsolver``, given the score ``1.0`` and switched to the
    QUEUED state id. Seeds with any other state are skipped.

    Debug output (the seeds, the resulting scores and every seed's
    ``meta``) is written to stdout.

    Returns a dict mapping fingerprint -> 1.0 for the newly queued seeds.
    """
    scores = {}
    for seed in seeds:
        if seed.meta['state'] is None:
            _url, fingerprint, _ = self.canonicalsolver.get_canonical_url(seed)
            scores[fingerprint] = 1.0
            seed.meta['state'] = _state.get_id('QUEUED')
    # Parenthesised single-argument form: prints the same text under the
    # Python 2 print statement and the Python 3 print() function.
    print("$$ inside add_seeds : seeds -> {0}, scores -> {1}".format(seeds, scores))
    print("$$ printing seed.meta")
    for seed in seeds:
        print(seed.meta)
    return scores
def page_error(self, request, error):
    """Mark ``request`` as errored and give its fingerprint a zero score.

    The request is resolved through ``self.canonicalsolver`` and its
    ``meta['state']`` is set to the ERROR state id. ``error`` is only used
    in the debug line written to stdout.

    Returns a single-entry dict ``{fingerprint: 0.0}``.
    """
    _url, fingerprint, _ = self.canonicalsolver.get_canonical_url(request)
    request.meta['state'] = _state.get_id('ERROR')
    # Parenthesised single-argument form: prints the same text under the
    # Python 2 print statement and the Python 3 print() function.
    print("$$ inside page_error : url -> {0}, error_reason -> {1}".format(request.url, error))
    return {fingerprint: 0.0}
def page_error(self, request, error):
    """Mark ``request`` as errored and give its fingerprint a zero score.

    The request is resolved through ``self.canonicalsolver`` and its
    ``meta['state']`` is set to the ERROR state id. ``error`` is accepted
    but not used.

    Returns a single-entry dict ``{fingerprint: 0.0}``.
    """
    resolved = self.canonicalsolver.get_canonical_url(request)
    _url, fprint, _ignored = resolved
    request.meta['state'] = _state.get_id('ERROR')
    zeroed = {}
    zeroed[fprint] = 0.0
    return zeroed