Esempio n. 1
0
class CosineBfsSpider(BfsSpider):
    """BfsSpider variant that only emits pages accepted by a CosineSimilarity check.

    Each parsed page is scored against the index configured in the project
    settings; pages the scorer rejects are counted in the crawl stats and
    produce no item.
    """

    name = 'cosinebfs'

    def __init__(self, *a, **kwargs):
        """Create the relevance scorer, then delegate to the base spider.

        All positional and keyword arguments are forwarded unchanged to the
        parent constructor.
        """
        # The scorer is configured from settings: index file + relevancy threshold.
        self.sim = CosineSimilarity(
            indexfile=settings.INDEX_FILE,
            threshold=settings.RELEVANCY_THRESHOLD,
        )
        super(CosineBfsSpider, self).__init__(*a, **kwargs)

    def parse_item(self, response):
        """Return an item carrying the page URL, or None for irrelevant pages.

        The page content is read from ``response.meta['terms']`` (presumably
        attached by an earlier stage of the crawl -- verify against the
        caller) and handed to the scorer's ``is_relevant`` check.
        """
        item = RecipebotItem()
        terms = response.meta['terms']

        if self.sim.is_relevant(terms):
            item['url'] = response.url
            return item

        # Rejected by the scorer: probably not a recipe page, so only count it.
        stats.inc_value('recipe/filtered_out')
        return None
Esempio n. 2
0
 def __init__(self, *a, **kwargs):
     """Attach a settings-configured CosineSimilarity scorer, then init the base class.

     Positional and keyword arguments are passed through untouched to the
     parent constructor.
     """
     # Build the scorer first so it is already on the instance when the
     # parent constructor runs.
     scorer = CosineSimilarity(
         indexfile=settings.INDEX_FILE,
         threshold=settings.RELEVANCY_THRESHOLD,
     )
     self.sim = scorer
     super(CosineBfsSpider, self).__init__(*a, **kwargs)