def test_param_value(reverse_update, reverse_test, is_param):
    dupe_predictor = DupePredictor()
    random.seed(1)

    def gen_urls(page):
        random_start = random.randint(1, 100)
        if is_param:
            tpls = [
                '{}/?page={}',
                '{}/?page={}&start=0',
                '{}/?page={}&start=%s' % random_start]
        else:
            tpls = [
                '{}/{}',
                '{}/{}?start=0',
                '{}/{}?start=%s' % random_start]
        return [tpl.format('http://foo.com', page) for tpl in tpls]

    for i in range(100):
        urls = gen_urls(i)
        with_contents = list(zip(
            urls,
            ['a{}'.format(i)] * 2 + ['r{}'.format(random.randint(1, 100))]))
        if reverse_update:
            with_contents.reverse()
        for url, content in with_contents:
            dupe_predictor.update_model(url, content)
    dupe_predictor.log_dupstats(min_dup=1)

    url1, url2, url3 = gen_urls('b')
    if reverse_test:
        url1, url2 = url2, url1  # url3 stays the same
    dupe_predictor.update_model(url1, 'b')
    assert dupe_predictor.get_dupe_prob(url2) > 0.97
    assert dupe_predictor.get_dupe_prob(url3) < 0.3
    for url in gen_urls('c'):
        assert dupe_predictor.get_dupe_prob(url) < 0.3
def learn_duplicates(name, f, verbose=False):
    print(name)
    logging.basicConfig(level=logging.DEBUG)
    texts_sample = [
        item['extracted_text'] for item in item_reader(f, name, limit=300)]
    dupe_predictor = DupePredictor(texts_sample)

    lsh = MinHashLSH(threshold=0.9, num_perm=128)  # separate from dupe_predictor
    too_common_shingles = dupe_predictor.too_common_shingles
    threshold = 0.98
    y_pred, y_true = [], []

    def _report_pr():
        tp = sum(p > threshold and d for p, d in zip(y_pred, y_true))
        fp = sum(p > threshold and not d for p, d in zip(y_pred, y_true))
        fn = sum(p < threshold and d for p, d in zip(y_pred, y_true))
        n_dup = tp + fn
        print('precision: %.3f, recall %.3f at %.2f threshold '
              '(%d duplicates)' % (
                  tp / (tp + fp) if tp else 0.,
                  tp / n_dup if n_dup else 0.,
                  threshold, n_dup))

    for i, item in enumerate(item_reader(f, name)):
        dupe_prob = dupe_predictor.get_dupe_prob(item['url'])
        y_pred.append(dupe_prob)
        min_hash = get_min_hash(item['extracted_text'], too_common_shingles)
        if dupe_prob < threshold:
            duplicates = [url for url, _ in dupe_predictor.update_model(
                item['url'], item['extracted_text'])]
        else:
            # We think this is a duplicate: replicate crawling
            # and do not update the model.
            duplicates = list(lsh.query(min_hash))
        key = canonicalize_url(item['url'])
        if key in lsh:
            lsh.remove(key)
        lsh.insert(key, min_hash)
        y_true.append(bool(duplicates))
        if verbose:
            if duplicates and dupe_prob < threshold:
                path = _full_path(item['url'])
                sample = [url for url in duplicates
                          if _full_path(url) == path] or duplicates
                print('false negative %s (%s, %d more)' % (
                    item['url'], sample[0], len(sample) - 1))
            elif not duplicates and dupe_prob > threshold:
                print('false positive', item['url'])
        if i % 100 == 0:
            _report_pr()
    _report_pr()
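# A minimal driver sketch for learn_duplicates(), not part of the original
# source. It assumes crawl dumps are gzipped JSON-lines files that
# item_reader() can consume, with 'url' and 'extracted_text' fields as used
# above; the gzip handling and file layout are illustrative assumptions.
import gzip
import os.path
import sys


def main():
    for path in sys.argv[1:]:
        name = os.path.basename(path)
        with gzip.open(path, 'rt') as f:  # assumption: dumps are gzipped
            learn_duplicates(name, f, verbose=True)


if __name__ == '__main__':
    main()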
def test_path():
    dupe_predictor = DupePredictor()

    def gen_urls():
        return [
            'http://foo.com/d?p{0}={0}'.format(random.randint(1, 100)),
            'http://foo.com/nd?p{0}={0}'.format(random.randint(1, 100))]

    for _ in range(100):
        url1, url2 = gen_urls()
        dupe_predictor.update_model(url1, 'd')
        dupe_predictor.update_model(url2, 'd{}'.format(random.randint(1, 100)))
    dupe_predictor.log_dupstats(min_dup=1)

    url1, url2 = gen_urls()
    assert dupe_predictor.get_dupe_prob(url1) > 0.97
    assert dupe_predictor.get_dupe_prob(url2) < 0.97
def test_param(reverse_update, reverse_test, is_param):
    dupe_predictor = DupePredictor()

    def gen_urls(page):
        tpls = (['{}/?page={}', '{}/?page={}&start=0'] if is_param
                else ['{}/{}', '{}/{}?start=0'])
        return [tpl.format('http://foo.com', page) for tpl in tpls]

    for i in range(100):
        urls = gen_urls(i)
        if reverse_update:
            urls.reverse()
        for url in urls:
            dupe_predictor.update_model(url, 'a{}'.format(i))
    dupe_predictor.log_dupstats(min_dup=1)

    url1, url2 = gen_urls('b')
    if reverse_test:
        url1, url2 = url2, url1
    dupe_predictor.update_model(url1, 'b')
    assert dupe_predictor.get_dupe_prob(url2) > 0.97
    for url in gen_urls('c'):
        assert dupe_predictor.get_dupe_prob(url) < 0.1
class AvoidDupContentMiddleware(object):
    """ Avoid requests for duplicate content. During crawling this middleware
    learns which parameters are important (they influence content) and which
    can be safely ignored. Once it is confident, it starts dropping most
    requests that are unlikely to bring new content. Some requests are still
    downloaded to make crawling more robust against changes in site structure.
    It is applied only to requests with "avoid_dup_content" in meta.
    """
    def __init__(self, initial_queue_limit, threshold, exploration):
        self.dupe_predictor = None
        # We initialize the dupe detector only after gathering enough pages:
        # it needs them for better duplicate detection, to know which content
        # is common to a lot of pages, and which is unique.
        self.initial_queue = []  # (url, text)
        self.initial_queue_limit = initial_queue_limit
        self.threshold = threshold
        self.exploration = exploration

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('AVOID_DUP_CONTENT_ENABLED'):
            raise NotConfigured
        s = crawler.settings
        return cls(
            initial_queue_limit=s.getint(
                'AVOID_DUP_CONTENT_INITIAL_QUEUE_LIMIT', 300),
            threshold=s.getfloat('AVOID_DUP_CONTENT_THRESHOLD', 0.98),
            exploration=s.getfloat('AVOID_DUP_CONTENT_EXPLORATION', 0.05))

    def process_request(self, request, spider):
        if not self.dupe_predictor or self.skip(request):
            return
        url = request.url
        t0 = time.time()
        dupe_prob = self.dupe_predictor.get_dupe_prob(url)
        t = time.time() - t0
        if t > 0.01:
            logger.debug('get_dupe_prob took %.4f s for %s', t, url)
        if dupe_prob > self.threshold:
            if random.random() < self.exploration:
                logger.debug('Exploring a likely duplicate %s with prob %.3f',
                             url, dupe_prob)
            else:
                logger.debug('Ignoring a likely duplicate %s with prob %.3f',
                             url, dupe_prob)
                raise IgnoreRequest

    def process_response(self, request, response, spider):
        if (not isinstance(response, scrapy.http.response.text.TextResponse)
                or self.skip(request)):
            return response
        url, text = response.url, extract_text(response)
        t0 = time.time()
        if self.dupe_predictor:
            self.dupe_predictor.update_model(url, text)
            t = time.time() - t0
            if t > 0.01:
                logger.debug('Updated model in %.4f s for %s', t, url)
        else:
            self.initial_queue.append((url, text))
            if len(self.initial_queue) >= self.initial_queue_limit:
                logger.debug(
                    'Gathered enough initial pages, building DupePredictor')
                self.dupe_predictor = DupePredictor(
                    texts_sample=[text for _, text in self.initial_queue])
                # Update the model with all the pages gathered so far.
                for url, text in self.initial_queue:
                    self.dupe_predictor.update_model(url, text)
                self.initial_queue = None
                logger.debug('Built DupePredictor in %.4f s',
                             time.time() - t0)
        return response

    def skip(self, request):
        return not request.meta.get('avoid_dup_content')
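# Example wiring sketch (an assumption, not part of the original source). The
# setting names and the 'avoid_dup_content' meta flag are exactly those read
# by the middleware above; the module path 'myproject.middleware' and the
# priority value 200 are hypothetical placeholders.

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'myproject.middleware.AvoidDupContentMiddleware': 200,  # hypothetical path
}
AVOID_DUP_CONTENT_ENABLED = True
AVOID_DUP_CONTENT_INITIAL_QUEUE_LIMIT = 300  # pages gathered before building DupePredictor
AVOID_DUP_CONTENT_THRESHOLD = 0.98           # drop requests above this dupe probability
AVOID_DUP_CONTENT_EXPLORATION = 0.05         # fraction of likely duplicates still crawled

# In a spider: opt a request in via meta, since skip() ignores requests
# without the 'avoid_dup_content' flag.
import scrapy


def make_request(url):
    return scrapy.Request(url, meta={'avoid_dup_content': True})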