class CosineBfsSpider(BfsSpider):
    """BfsSpider subclass that only emits items for pages passing a relevance check.

    Each parsed response is scored through a ``CosineSimilarity`` instance
    configured from ``settings.INDEX_FILE`` and
    ``settings.RELEVANCY_THRESHOLD``. Responses the scorer rejects are
    counted under the ``recipe/filtered_out`` stat and produce no item.
    """

    name = 'cosinebfs'

    def __init__(self, *a, **kwargs):
        """Create the relevance scorer, then run the base-class initializer.

        All positional and keyword arguments are forwarded unchanged to
        ``BfsSpider.__init__``.
        """
        # The scorer is attached before the parent initializer runs, so it
        # is already available on the instance during base-class setup.
        self.sim = CosineSimilarity(
            indexfile=settings.INDEX_FILE,
            threshold=settings.RELEVANCY_THRESHOLD,
        )
        super(CosineBfsSpider, self).__init__(*a, **kwargs)

    def parse_item(self, response):
        """Return an item holding the response URL, or ``None`` if filtered out.

        Args:
            response: the crawled response. Its ``meta['terms']`` entry is
                handed to the relevance scorer (presumably populated by an
                earlier stage of the crawl -- not visible in this block).

        Returns:
            A ``RecipebotItem`` with ``'url'`` set to ``response.url`` when
            the scorer accepts the terms; otherwise ``None``.

        Raises:
            KeyError: if ``response.meta`` has no ``'terms'`` entry.
        """
        recipe = RecipebotItem()
        terms = response.meta['terms']

        # Decide whether the page is interesting enough to keep.
        if self.sim.is_relevant(terms):
            recipe['url'] = response.url
            return recipe

        # Probably not a recipe page: record the rejection and emit nothing.
        stats.inc_value('recipe/filtered_out')
        return None
def __init__(self, *a, **kwargs):
    """Attach a cosine-similarity scorer to the spider, then initialize the base.

    The scorer is built from ``settings.INDEX_FILE`` and
    ``settings.RELEVANCY_THRESHOLD`` and stored on ``self.sim``. Every
    positional and keyword argument is passed through unchanged to the
    parent initializer.
    """
    index_path = settings.INDEX_FILE
    cutoff = settings.RELEVANCY_THRESHOLD
    # Assigned before the parent initializer runs, matching the original order.
    self.sim = CosineSimilarity(indexfile=index_path, threshold=cutoff)
    super(CosineBfsSpider, self).__init__(*a, **kwargs)