def identify_bio_pages(self, firm):
    """Return the URLs of ``firm``'s pages that the page model labels 'bio'.

    Looks up the firm's training set, opens a ``Spider`` over
    ``firm.domain`` backed by ``<scrape_dir>/<training-set id>.db``, loads
    the ``PageClassifier`` stored at ``<model_dir>/<training-set id>_page.tgm``
    and runs it over every URL exposed by ``spider.urls``.

    Progress messages and each matching URL are written to ``self.stdout``.

    :param firm: object exposing ``domain``; also passed to
        ``FirmTrainingSet.get_for_firm``.
    :returns: list of URLs whose predicted category, converted with
        ``str()``, equals ``'bio'``, in the order ``spider.urls`` yields them.
    """
    training_set = FirmTrainingSet.get_for_firm(firm)

    # Both the scrape store and the model file are keyed by the training
    # set's id.
    scrape_db_path = os.path.join(self.scrape_dir, str(training_set.id) + ".db")
    spider = Spider(
        firm.domain,
        scrape_db_path,
        workers=4,
        retry_attempts=2,
    )

    self.stdout.write("Identifying bio pages...\n")

    model_path = os.path.join(self.model_dir, str(training_set.id) + "_page.tgm")
    classifier = PageClassifier(model_path)
    classifier.load()

    self.stdout.write('Retrieving page features...\n')

    matches = []
    for page_url in spider.urls:
        fetched = spider.get(page_url)
        label = str(classifier.predict(fetched))
        if label != 'bio':
            continue
        # Echo each hit as it is found so progress is visible on long runs.
        self.stdout.write(' * ' + page_url)
        matches.append(page_url)

    self.stdout.write('Done.')
    return matches