import datetime
import httplib
import itertools
import json
import subprocess
import time
import urllib2

# settings, Doc, parsed_search, and result_to_model are assumed to be provided
# by the surrounding scraper package.

def add_new_docs(cache_wrapper, now):
    print 'Adding new documents to the database...'

    cache = cache_wrapper.get_pickle_connection()

    new = 0
    for id in cache.keys():
        doc = cache.get(id)

        # skip documents that have been withdrawn from regulations.gov
        if doc.get('documentStatus', None) == "Withdrawn":
            continue

        db_doc = result_to_model(doc, now=now)

        try:
            db_doc.save()
            new += 1
        except Exception:
            print "Failed to save document %s" % db_doc.id

    written = new
    print 'Wrote %s new documents.' % written

    return written
def run(options, args):
    print 'Retrieving current document IDs...'
    
    # HACK - pull ids via shell because doing it in Python is slow
    count_proc = subprocess.Popen(
        ["mongo", settings.DB_NAME] +\
            list(itertools.chain.from_iterable([("--%s" % key, str(value)) for key, value in settings.DB_SETTINGS.items()])) +\
            ["--quiet", "--eval", "printjson(db.docs.find({source:'regulations.gov',deleted:false},{_id:1}).map(function(i){return i._id;}))"],
        stdout=subprocess.PIPE
    )
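    # parse the JSON array of document _ids printed by the mongo shell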
    ids = set(json.load(count_proc.stdout))

    now = datetime.datetime.now()

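    # pick the starting date: an explicit --since value, or the most recent
    # Date_Posted already recorded in the database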
    if options.since:
        most_recent = datetime.datetime.strptime(options.since, "%Y-%m-%d")
        print "Done; start date manually set to %s and total documents indexed is %s." % (most_recent.isoformat(), len(ids))
    else:
        print "Retrieving date of most recent document..."
        recent_agg = Doc._get_collection().aggregate([
            {
                "$match": {
                    "source": "regulations.gov",
                    "deleted": False
                }
            },
            {
                "$group": {
                    "_id": 0,
                    "max": {
                        "$max": "$details.Date_Posted"
                    }
                }
            }
        ])
        most_recent = recent_agg['result'][0]['max']

        print "Done; last document is from %s and total documents indexed is %s." % (most_recent.isoformat(), len(ids))
        
        if most_recent > now:
            most_recent = now
            print "Overriding most recent to now."
        
    search_args = {
        # date range from one day before the most recent until one day after now
        "pd": "-".join([d.strftime("%m/%d/%y") for d in (most_recent - datetime.timedelta(days=1), now + datetime.timedelta(days=1))]),

        # order ascending by posted date to reduce pagination errors
        "sb": "postedDate",
        "so": "ASC"
    }

    # start new dumps
    position = 0
    increment = 1000
    stats = {'pages_downloaded': 0, 'new_records': 0, 'existing_records': 0, 'failed_saves': 0}
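    # a one-record probe query returns the total match count for pagination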
    total = parsed_search(1, 0, **search_args)['totalNumRecords']
    while position <= total:
        page = None
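        # up to three attempts per page, sleeping longer if rate-limited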
        for i in range(3):
            try:
                current_page = (position / increment) + 1
                total_pages = '?' if total == 1 else (total / increment) + 1
                print "Downloading page %s of %s..." % (current_page, total_pages)
                
                page = parsed_search(increment, position, **search_args)

                stats['pages_downloaded'] += 1
                break
            except (urllib2.HTTPError, httplib.HTTPException) as e:
                if i < 2:
                    if hasattr(e, 'code') and e.code in (503, 429) and 'rate' in e.read().lower():
                        print 'Download failed because of rate limiting; will retry in an hour...'
                        time.sleep(3600)
                    else:
                        print 'Download failed; will retry in 10 seconds...'
                        time.sleep(10)
                else:
                    print 'System troubles; giving up.'
                    raise

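        # index anything not already in the database, skipping withdrawn documents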
        for result in page.get('documents', []):
            if result['documentId'] in ids:
                stats['existing_records'] += 1
            else:
                if result.get('documentStatus', None) == "Withdrawn":
                    continue
                
                db_doc = result_to_model(result, now=now)
                
                try:
                    db_doc.save()
                    stats['new_records'] += 1
                except Exception:
                    print "Failed to save document %s" % db_doc.id
                    stats['failed_saves'] += 1
        
        position += increment
    
    print "Wrote %s new records, encountered %s existing records, and had %s failed saves." % (stats['new_records'], stats['existing_records'], stats['failed_saves'])
    
    return stats