def test_scrap(self):
    """Replay a recorded cassette and check the documents the scraper yields."""
    nb_doc = 4  # to keep test short
    curr_doc = 0
    scraper = Scraper(disconnected=True)
    directory = os.path.dirname(os.path.abspath(__file__))
    with vcr.use_cassette(directory + '/vcr_cassettes/test_run_scraper.yaml',
                          record_mode='none', ignore_localhost=True):
        for doc in scraper.scrap():
            self.assertIsInstance(doc.url, unicode)
            self.assertIsInstance(doc.title, unicode)
            self.assertIsInstance(doc.content, unicode)
            self.assertNotIn(u'.gif', doc.url)  # check extension filter
            self.assertNotIn(u'youtu', doc.url)  # check regex filter
            curr_doc += 1
            if curr_doc == nb_doc:
                break
        else:  # for-else: the cassette ran out before yielding nb_doc documents
            self.fail('error: not enough docs extracted from cassette, should be '
                      + str(nb_doc) + ', was ' + str(curr_doc))
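# For reference, a hypothetical helper sketching how the cassette replayed by
# test_scrap could be (re)recorded against the live service. It assumes the
# same Scraper API in connected mode; record_mode='once' is standard vcrpy
# behavior (record on the first run, replay on subsequent runs).
def record_cassette(nb_doc=4):
    directory = os.path.dirname(os.path.abspath(__file__))
    scraper = Scraper()  # connected mode, hits the real endpoints
    with vcr.use_cassette(directory + '/vcr_cassettes/test_run_scraper.yaml',
                          record_mode='once', ignore_localhost=True):
        for curr_doc, doc in enumerate(scraper.scrap(), start=1):
            if curr_doc == nb_doc:
                break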
import codecs
import datetime
import logging
from time import sleep

import jsonpickle


def run():
    """Scrape continuously, dumping each document to a timestamped JSON file."""
    jsonpickle.set_encoder_options('simplejson', indent=4, ensure_ascii=False)
    scraper = Scraper()
    folder = '/media/nico/SAMSUNG/devs/gator/scraping reddit 10-01-2016'
    log_file = folder + '/run_scraper-' + str(datetime.datetime.utcnow()) + '.log'
    logging.basicConfig(format=u'%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO, filename=log_file)
    while True:
        try:
            for scraper_document in scraper.scrap():
                filename = folder + '/' + str(datetime.datetime.utcnow()) + '.json'
                json_doc = jsonpickle.encode(scraper_document)
                with codecs.open(filename=filename, mode='w', encoding='utf-8') as file_desc:
                    file_desc.write(json_doc)
        except Exception as exception:  # pylint: disable=broad-except
            logging.error("The orchestrator crashed! Starting it over ...")
            logging.exception(exception)
            sleep(30)
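# A minimal sketch of reading one of the dumped files back; load_document is a
# hypothetical helper, but jsonpickle.decode is the standard counterpart of
# jsonpickle.encode and restores the original document object.
def load_document(filename):
    with codecs.open(filename=filename, mode='r', encoding='utf-8') as file_desc:
        return jsonpickle.decode(file_desc.read())


if __name__ == '__main__':
    run()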