Example #1
# Note: db, redis_conn and app are module-level objects defined elsewhere in the
# project this example comes from; logging and rq's Queue are the imports needed here.
import logging

from rq import Queue


def scrape(gush_id, no_queue=False):
    log = logging.getLogger(__name__)
    log.info("Scraping gush: %s", gush_id)

    # find gush/im
    if gush_id == "all":
        """
        this is how we combat the low amount of memory available with the free redis instance -
        sort the gushim by descending last_checked_at timestamps. this is done because
        when redis reaches maxmemory it blocks ALL writes, so rq cannot work anymore
        until someone manually deletes stuff (even dequeueing doesn't work). so current
        solution is setting redis's maxmemory-policy to allkeys-lru, and that way the
        earliest jobs will just be discarded, and since we have the gushim by descending
        last checked order, the later-checked gushim will be discarded one by one so we
        can scrape the gushim that have not been scraped recently.
        """
        gushim = db.gushim.find().sort(u'last_checked_at', -1)
    else:
        gushim = [db.gushim.find_one({"gush_id": gush_id})]
    log.debug("Found gushim: %s", gushim)

    if no_queue:
        log.warning("redis queuing is disabled, this is not recommended")
        for g in gushim:
            log.info("Scraping gush %s", g['gush_id'])
            scrape_gush(g, False, app.config['TESTING'])
    else:
        # enqueue them
        q = Queue(connection=redis_conn)
        for g in gushim:
            log.info("Queuing gush %s for scraping", g['gush_id'])
            q.enqueue(scrape_gush, g, False, app.config['TESTING'])
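
The allkeys-lru behaviour described in the docstring above is a Redis server setting rather than something rq configures for you. Below is a minimal sketch of applying it with redis-py; the connection parameters are assumptions for a local setup, and the real project would build its connection from its own configuration.

import redis

# Connect to the Redis instance backing the rq queue (host/port are placeholder
# values for a local instance).
redis_conn = redis.Redis(host="localhost", port=6379)

# Evict the least-recently-used keys when maxmemory is reached instead of
# blocking all writes, matching the policy described in scrape() above.
# CONFIG SET only changes the running server; persist it in redis.conf
# ("maxmemory-policy allkeys-lru") to survive restarts.
redis_conn.config_set("maxmemory-policy", "allkeys-lru")

Note that some hosted Redis providers disable the CONFIG command, in which case the policy has to be set through the provider's own configuration. Jobs enqueued with q.enqueue() are executed by a separate worker process, typically started with the rq worker command pointed at the same Redis connection.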
Example #4
def test_scrape_empty_result():
    #when quote mark appears in the middle of a string
    fixture['gush_id'] = 'empty'
    data = scrape_gush(fixture, RUN_FOLDER, app.config['TESTING'])
    eq_(len(data), 0)
Example #5
def test_scrape_wellformed_json():
    fixture['gush_id'] = '30649'
    data = scrape_gush(fixture, RUN_FOLDER, app.config['TESTING'])
    eq_(len(data), 35)
    eq_(data[0]['year'], 2006)
    eq_(len(data[0]['files_link']), 1)
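
These test examples rely on module-level setup that is not shown in the snippets: fixture, RUN_FOLDER, app and scrape_gush come from the surrounding test module, and eq_ is presumably nose's eq_ assertion helper. The sketch below is a hypothetical illustration of that scaffolding; the fixture fields, folder name and app import are assumptions, not the project's actual code.

import os

from nose.tools import eq_  # the assertion helper used in the tests above

# Hypothetical stand-ins for the module-level names the tests depend on.
RUN_FOLDER = os.path.join(os.path.dirname(__file__), 'test_runs')  # assumed folder name

fixture = {
    'gush_id': '30649',       # each test overwrites this before calling scrape_gush
    'last_checked_at': None,  # assumed field, mirroring the sort key used in scrape()
}

# app would be the project's Flask application, e.g.
#   from app import app
#   app.config['TESTING'] = True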