Beispiel #1
0
class NearDuplicatesDetection(object):
    """Filter responses through a ``NearDuplicatesIndex``.

    Each response is offered to the index together with the terms stored
    on its request; responses the index does not accept are counted and
    dropped.

    NOTE(review): the method signature matches a Scrapy downloader
    middleware -- confirm against the project's middleware settings.
    """

    def __init__(self):
        """Create the index that later responses are checked against."""
        self.index = NearDuplicatesIndex()

    def process_response(self, request, response, spider):
        """Return ``response`` if the index accepts it, otherwise drop it.

        Args:
            request: Object whose ``meta['terms']`` entry holds the document
                passed to the index. A missing key raises ``KeyError``.
            response: Object whose ``url`` is passed to the index alongside
                the document.
            spider: Unused.

        Returns:
            The unchanged ``response`` when ``index.appendif`` returns a
            truthy value.

        Raises:
            IgnoreRequest: When ``index.appendif`` returns a falsy value.
        """
        terms = request.meta['terms']
        # The third argument is presumably a similarity threshold --
        # TODO confirm against NearDuplicatesIndex.appendif.
        accepted = self.index.appendif(terms, response.url, 0.0)

        if not accepted:
            # Record the rejection before dropping the response.
            stats.inc_value('downloader/near_duplicates')
            raise IgnoreRequest

        return response
Beispiel #2
0
 def __init__(self):
     """Create a fresh ``NearDuplicatesIndex`` and store it on ``self.index``."""
     near_duplicates_index = NearDuplicatesIndex()
     self.index = near_duplicates_index