def scrape(gush_id, no_queue=False):
    log = logging.getLogger(__name__)
    log.info("Scraping gush: %s", gush_id)

    # find gush/im
    if gush_id == "all":
        """
        This is how we combat the low amount of memory available with the free redis
        instance - sort the gushim by descending last_checked_at timestamps. This is
        done because when redis reaches maxmemory it blocks ALL writes, so rq cannot
        work anymore until someone manually deletes stuff (even dequeueing doesn't
        work). So the current solution is setting redis's maxmemory-policy to
        allkeys-lru; that way the earliest jobs will just be discarded, and since we
        have the gushim in descending last-checked order, the later-checked gushim
        will be discarded one by one so we can scrape the gushim that have not been
        scraped recently.
        """
        gushim = db.gushim.find().sort(u'last_checked_at', -1)
    else:
        gushim = [db.gushim.find_one({"gush_id": gush_id})]

    log.debug("Found gushim: %s", gushim)

    if no_queue:
        log.warning("redis queuing is disabled, this is not recommended")
        for g in gushim:
            log.info("Scraping gush %s", g['gush_id'])
            scrape_gush(g, False, app.config['TESTING'])
    else:
        # enqueue them
        q = Queue(connection=redis_conn)
        for g in gushim:
            log.info("Queuing gush %s for scraping", g['gush_id'])
            q.enqueue(scrape_gush, g, False, app.config['TESTING'])
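# A minimal sketch (not part of the original module) of the redis setup the
# docstring above relies on: capping memory and switching the eviction policy
# to allkeys-lru so that, once maxmemory is hit, the least-recently-used keys
# are evicted instead of redis rejecting all writes. The connection parameters
# and the 256mb limit are illustrative assumptions; in practice this would
# normally be set in redis.conf or by the hosting provider.
import redis

def configure_redis_eviction(host='localhost', port=6379, maxmemory='256mb'):
    conn = redis.StrictRedis(host=host, port=port)
    conn.config_set('maxmemory', maxmemory)
    conn.config_set('maxmemory-policy', 'allkeys-lru')
    return conn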
def test_scrape_empty_result():
    # scraping a gush that has no plans should return an empty result
    fixture['gush_id'] = 'empty'
    data = scrape_gush(fixture, RUN_FOLDER, app.config['TESTING'])
    eq_(len(data), 0)
def test_scrape_wellformed_json():
    # the result should stay well-formed JSON even when a quote mark appears
    # in the middle of a string
    fixture['gush_id'] = '30649'
    data = scrape_gush(fixture, RUN_FOLDER, app.config['TESTING'])
    eq_(len(data), 35)
    eq_(data[0]['year'], 2006)
    eq_(len(data[0]['files_link']), 1)
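# A hedged sketch (not from the original test module) of the setup the two
# tests above assume: eq_ comes from nose.tools, `app` is the application
# whose TESTING flag is passed through, `fixture` is a mutable gush document
# the tests overwrite, and RUN_FOLDER is a scratch folder for scrape output.
# The concrete values and import paths below are assumptions for illustration.
from nose.tools import eq_

from app import app                    # assumed location of the application object
from scrape import scrape_gush         # assumed location of the scraper function

RUN_FOLDER = 'test_run'                # assumed scratch folder used by the tests
fixture = {'gush_id': 'placeholder'}   # minimal gush document; each test sets gush_id

# With this module-level setup in place, the tests would typically be run with
# nose, e.g.:
#   nosetests tests/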