Example #1
def test_param_value(reverse_update, reverse_test, is_param):
    dupe_predictor = DupePredictor()
    random.seed(1)

    def gen_urls(page):
        random_start = random.randint(1, 100)
        if is_param:
            tpls = [
                '{}/?page={}', '{}/?page={}&start=0',
                '{}/?page={}&start=%s' % random_start
            ]
        else:
            tpls = ['{}/{}', '{}/{}?start=0', '{}/{}?start=%s' % random_start]
        return [tpl.format('http://foo.com', page) for tpl in tpls]

    for i in range(100):
        urls = gen_urls(i)
        # The first two URLs share the same content; the third gets random
        # content, so only the start=0 variation should be learned as a dupe.
        with_contents = list(
            zip(urls, ['a{}'.format(i)] * 2 +
                ['r{}'.format(random.randint(1, 100))]))
        if reverse_update:
            with_contents.reverse()
        for url, content in with_contents:
            dupe_predictor.update_model(url, content)
    dupe_predictor.log_dupstats(min_dup=1)
    url1, url2, url3 = gen_urls('b')
    if reverse_test:
        url1, url2 = url2, url1  # url3 stays the same
    dupe_predictor.update_model(url1, 'b')
    assert dupe_predictor.get_dupe_prob(url2) > 0.97
    assert dupe_predictor.get_dupe_prob(url3) < 0.3
    for url in gen_urls('c'):
        assert dupe_predictor.get_dupe_prob(url) < 0.3
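
The three boolean flags above are presumably supplied by pytest parametrization,
which is not shown in the snippet. A minimal sketch of how such a setup could
look (the decorator and the flag combinations are assumptions for illustration):

import itertools
import pytest

@pytest.mark.parametrize(
    'reverse_update,reverse_test,is_param',
    list(itertools.product([False, True], repeat=3)))
def test_param_value(reverse_update, reverse_test, is_param):
    ...  # body as in the example above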
Example #2
def learn_duplicates(name, f, verbose=False):
    print(name)
    logging.basicConfig(level=logging.DEBUG)
    texts_sample = [
        item['extracted_text'] for item in item_reader(f, name, limit=300)]
    dupe_predictor = DupePredictor(texts_sample)

    # Independent MinHash LSH index over page texts, used as ground truth
    # (kept separate from the index inside dupe_predictor).
    lsh = MinHashLSH(threshold=0.9, num_perm=128)
    too_common_shingles = dupe_predictor.too_common_shingles
    threshold = 0.98
    y_pred, y_true = [], []

    def _report_pr():
        tp = sum(p > threshold and d for p, d in zip(y_pred, y_true))
        fp = sum(p > threshold and not d for p, d in zip(y_pred, y_true))
        fn = sum(p < threshold and d for p, d in zip(y_pred, y_true))
        n_dup = tp + fn
        print('precision: %.3f, recall %.3f at %.2f threshold '
              '(%d duplicates)' % (
                  tp / (tp + fp) if tp else 0.,
                  tp / n_dup if n_dup else 0., threshold, n_dup))
    for i, item in enumerate(item_reader(f, name)):
        dupe_prob = dupe_predictor.get_dupe_prob(item['url'])
        y_pred.append(dupe_prob)
        min_hash = get_min_hash(item['extracted_text'], too_common_shingles)
        if dupe_prob < threshold:
            duplicates = [url for url, _ in dupe_predictor.update_model(
                item['url'], item['extracted_text'])]
        else:
            # We think this is a duplicate: replicate crawling
            # and do not update the model.
            duplicates = list(lsh.query(min_hash))
        key = canonicalize_url(item['url'])
        if key in lsh:
            lsh.remove(key)
        lsh.insert(key, min_hash)
        y_true.append(bool(duplicates))
        if verbose:
            if duplicates and dupe_prob < threshold:
                path = _full_path(item['url'])
                sample = [url for url in duplicates
                          if _full_path(url) == path] or duplicates
                print('false negative %s (%s, %d more)' % (
                    item['url'], sample[0], len(sample) - 1))
            elif not duplicates and dupe_prob > threshold:
                print('false positive', item['url'])
        if i % 100 == 0:
            _report_pr()
    _report_pr()
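
learn_duplicates evaluates DupePredictor's URL-based predictions against
text-based ground truth kept in the separate MinHash LSH index. The helper
get_min_hash is not shown here; a minimal sketch of what it might look like
with datasketch (the 4-word shingling and the filtering details are
assumptions, not the project's actual implementation):

from datasketch import MinHash

def get_min_hash(text, too_common_shingles, num_perm=128):
    # Hash every 4-word shingle that is not in the "too common" set into
    # a MinHash signature; num_perm must match the MinHashLSH index (128).
    min_hash = MinHash(num_perm=num_perm)
    words = text.lower().split()
    for i in range(max(0, len(words) - 3)):
        shingle = ' '.join(words[i:i + 4])
        if shingle not in too_common_shingles:
            min_hash.update(shingle.encode('utf8'))
    return min_hash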
Example #3
def test_path():
    dupe_predictor = DupePredictor()

    def gen_urls():
        return [
            'http://foo.com/d?p{0}={0}'.format(random.randint(1, 100)),
            'http://foo.com/nd?p{0}={0}'.format(random.randint(1, 100))
        ]

    for _ in range(100):
        url1, url2 = gen_urls()
        # The /d URL always returns identical content; the /nd URL returns
        # random content, so it should not be predicted as a duplicate.
        dupe_predictor.update_model(url1, 'd')
        dupe_predictor.update_model(url2, 'd{}'.format(random.randint(1, 100)))
    dupe_predictor.log_dupstats(min_dup=1)
    url1, url2 = gen_urls()
    assert dupe_predictor.get_dupe_prob(url1) > 0.97
    assert dupe_predictor.get_dupe_prob(url2) < 0.97
Example #4
def test_param(reverse_update, reverse_test, is_param):
    dupe_predictor = DupePredictor()

    def gen_urls(page):
        if is_param:
            tpls = ['{}/?page={}', '{}/?page={}&start=0']
        else:
            tpls = ['{}/{}', '{}/{}?start=0']
        return [tpl.format('http://foo.com', page) for tpl in tpls]

    for i in range(100):
        urls = gen_urls(i)
        if reverse_update:
            urls.reverse()
        for url in urls:
            dupe_predictor.update_model(url, 'a{}'.format(i))
    dupe_predictor.log_dupstats(min_dup=1)
    url1, url2 = gen_urls('b')
    if reverse_test:
        url1, url2 = url2, url1
    dupe_predictor.update_model(url1, 'b')
    assert dupe_predictor.get_dupe_prob(url2) > 0.97
    for url in gen_urls('c'):
        assert dupe_predictor.get_dupe_prob(url) < 0.1
Example #5
    def process_response(self, request, response, spider):
        if not isinstance(response, scrapy.http.response.text.TextResponse) or self.skip(request):
            return response
        url, text = response.url, extract_text(response)
        t0 = time.time()
        if self.dupe_predictor:
            self.dupe_predictor.update_model(url, text)
            t = time.time() - t0
            if t > 0.01:
                logger.debug('Updated model in %.4f s for %s', t, url)
        else:
            self.initial_queue.append((url, text))
            if len(self.initial_queue) >= self.initial_queue_limit:
                logger.debug(
                    'Gathered enough initial pages, building DupePredictor')
                self.dupe_predictor = DupePredictor(
                    texts_sample=[text for _, text in self.initial_queue])
                # Update model with all the pages we have missed
                for url, text in self.initial_queue:
                    self.dupe_predictor.update_model(url, text)
                self.initial_queue = None
                logger.debug('Built DupePredictor in %.4f s', time.time() - t0)
        return response
Example #6
class AvoidDupContentMiddleware(object):
    """
    Avoid requests for duplicate content. During crawling this middleware
    learns which URL parameters are important (i.e. influence the content)
    and which can be safely ignored. Once it is confident, it starts dropping
    most requests that are unlikely to return new content. Some requests are
    still downloaded to keep crawling robust against changes in site structure.
    The middleware is applied only to requests with "avoid_dup_content" in meta.
    """
    def __init__(self, initial_queue_limit, threshold, exploration):
        self.dupe_predictor = None
        # The dupe predictor is initialized only after enough pages have been
        # gathered: it needs a text sample to learn which content is common
        # to a lot of pages and which is unique to a page.
        self.initial_queue = []  # (url, text)
        self.initial_queue_limit = initial_queue_limit
        self.threshold = threshold
        self.exploration = exploration

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('AVOID_DUP_CONTENT_ENABLED'):
            raise NotConfigured
        s = crawler.settings
        return cls(
            initial_queue_limit=s.getint(
                'AVOID_DUP_CONTENT_INITIAL_QUEUE_LIMIT', 300),
            threshold=s.getfloat('AVOID_DUP_CONTENT_THRESHOLD', 0.98),
            exploration=s.getfloat('AVOID_DUP_CONTENT_EXPLORATION', 0.05))

    def process_request(self, request, spider):
        if not self.dupe_predictor or self.skip(request):
            return
        url = request.url
        t0 = time.time()
        dupe_prob = self.dupe_predictor.get_dupe_prob(url)
        t = time.time() - t0
        if t > 0.01:
            logger.debug('get_dupe_prob took %.4f s for %s', t, url)
        if dupe_prob > self.threshold:
            if random.random() < self.exploration:
                logger.debug('Exploring a likely duplicate %s with prob %.3f',
                             url, dupe_prob)
            else:
                logger.debug('Ignoring a likely duplicate %s with prob %.3f',
                             url, dupe_prob)
                raise IgnoreRequest

    def process_response(self, request, response, spider):
        if not isinstance(response, scrapy.http.response.text.TextResponse) or self.skip(request):
            return response
        url, text = response.url, extract_text(response)
        t0 = time.time()
        if self.dupe_predictor:
            self.dupe_predictor.update_model(url, text)
            t = time.time() - t0
            if t > 0.01:
                logger.debug('Updated model in %.4f s for %s', t, url)
        else:
            self.initial_queue.append((url, text))
            if len(self.initial_queue) >= self.initial_queue_limit:
                logger.debug(
                    'Gathered enough initial pages, building DupePredictor')
                self.dupe_predictor = DupePredictor(
                    texts_sample=[text for _, text in self.initial_queue])
                # Update model with all the pages we have missed
                for url, text in self.initial_queue:
                    self.dupe_predictor.update_model(url, text)
                self.initial_queue = None
                logger.debug('Built DupePredictor in %.4f s', time.time() - t0)
        return response

    def skip(self, request):
        return not request.meta.get('avoid_dup_content')
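
A sketch of how this middleware could be enabled in a Scrapy project. The
setting names come from from_crawler above; the module path, the middleware
priority, and the spider are assumptions for illustration only:

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.AvoidDupContentMiddleware': 543,
}
AVOID_DUP_CONTENT_ENABLED = True
AVOID_DUP_CONTENT_INITIAL_QUEUE_LIMIT = 300
AVOID_DUP_CONTENT_THRESHOLD = 0.98
AVOID_DUP_CONTENT_EXPLORATION = 0.05

# A spider has to opt in per request, because skip() ignores any request
# without the "avoid_dup_content" meta flag.
import scrapy

class ExampleSpider(scrapy.Spider):
    name = 'example'
    start_urls = ['http://foo.com/']

    def parse(self, response):
        for href in response.css('a::attr(href)').getall():
            yield response.follow(
                href, callback=self.parse,
                meta={'avoid_dup_content': True})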