def test_scrap(self):
    nb_doc = 4  # only check a handful of documents to keep the test short
    curr_doc = 0
    scraper = Scraper(disconnected=True)
    directory = os.path.dirname(os.path.abspath(__file__))
    # record_mode='none': replay the pre-recorded cassette only, never hit the network
    with vcr.use_cassette(directory + '/vcr_cassettes/test_run_scraper.yaml', record_mode='none', ignore_localhost=True):
        for doc in scraper.scrap():
            self.assertIsInstance(doc.url, unicode)
            self.assertIsInstance(doc.title, unicode)
            self.assertIsInstance(doc.content, unicode)
            self.assertNotIn(u'.gif', doc.url)  # check the extension filter
            self.assertNotIn(u'youtu', doc.url)  # check the regex filter

            curr_doc += 1
            if curr_doc == nb_doc:
                break
        else:
            # the else clause runs only when the loop ends without break,
            # i.e. the cassette yielded fewer documents than expected
            self.fail('error: not enough docs extracted from cassette, should be '
                      + str(nb_doc) + ', was ' + str(curr_doc))
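
# A minimal sketch (an assumption, not part of the original test) of how the
# cassette replayed above could be captured with vcrpy: record_mode='once'
# records real HTTP traffic on the first run and replays it on later runs.
# 'Scraper' and its 'scrap()' generator come from the project under test.
import os
import vcr

def record_cassette():
    directory = os.path.dirname(os.path.abspath(__file__))
    cassette = directory + '/vcr_cassettes/test_run_scraper.yaml'
    with vcr.use_cassette(cassette, record_mode='once', ignore_localhost=True):
        scraper = Scraper()  # assumption: the default mode performs live HTTP requests
        for _ in scraper.scrap():
            break  # one document is enough to populate the cassette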

# Example 2: a long-running scraper loop that persists each document as JSON

import codecs
import datetime
import logging
from time import sleep

import jsonpickle


def run():
    # pretty-print the JSON output and keep non-ASCII characters readable
    jsonpickle.set_encoder_options('simplejson', indent=4, ensure_ascii=False)

    scraper = Scraper()  # 'Scraper' is defined elsewhere in the project

    folder = '/media/nico/SAMSUNG/devs/gator/scraping reddit 10-01-2016'

    log_file = folder + '/run_scraper-' + str(datetime.datetime.utcnow()) + '.log'

    logging.basicConfig(format=u'%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO, filename=log_file)

    while True:
        try:
            # persist every scraped document to its own timestamped JSON file
            for scraper_document in scraper.scrap():
                filename = folder + '/' + str(datetime.datetime.utcnow()) + '.json'
                serialized = jsonpickle.encode(scraper_document)
                with codecs.open(filename=filename, mode='w', encoding='utf-8') as file_desc:
                    file_desc.write(serialized)

        except Exception as exception:  # pylint: disable=broad-except
            # log the failure, wait a bit, then let the outer loop restart the scraper
            logging.error('The orchestrator crashed! Restarting it ...')
            logging.exception(exception)
            sleep(30)
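
# A minimal sketch (hypothetical helper, not part of the original module) of
# reading one of the files written by run() back into an object with
# jsonpickle.decode, the inverse of the encode call above.
def load_document(filename):
    with codecs.open(filename=filename, mode='r', encoding='utf-8') as file_desc:
        return jsonpickle.decode(file_desc.read())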