def check_if_scraping_done(record):
    """Return True if a completed scrape ops record matching *record* exists.

    record -- dict of fields identifying the scrape (matched against the
              ops table's 'content' field with running=False/complete=True)

    BUGFIX: the original mutated the caller's dict in place (set
    record['running']/record['complete']); we probe with a copy so the
    argument is left untouched.
    """
    probe = dict(record, running=False, complete=True)
    result = idc.search_ops_table({'content': probe, 'isa': 'scrape'})
    # search_ops_table returns None when nothing matches
    return result is not None
def get_live_scrapes_older_than(min_hours_old=DEFAULT_MAX_TIME, db_id=None, coll_id=None):
    """Get all scrapes that are not marked complete and are at least
    min_hours_old hours old.

    Generally we expect scrapes to take only a few hours, so an entire DB
    scrape should not take more than 4-6 hours at worst.

    min_hours_old -- Find only records older than this many hours
    Optional:
    db_id -- Find only records relating to this db id
    coll_id -- Find only records relating to this coll id

    Returns a (possibly empty) list of ops records; returns [] on any
    lookup error (best-effort, callers treat failure like "no results").
    """
    try:
        start = datetime.datetime.now() - datetime.timedelta(hours=min_hours_old)
        # Currently this is enough to identify scrape records
        rec_test = {'time': {"$lt": start}, 'complete': False}
        if db_id:
            rec_test['db'] = db_id
        if coll_id:
            rec_test['coll'] = coll_id
        curs = idc.search_ops_table(rec_test)
        return list(curs)
    except Exception:
        # BUGFIX: was a bare except, which also swallowed
        # KeyboardInterrupt/SystemExit; keep the best-effort [] fallback.
        return []
def get_progress(uid):
    """Get progress of scrape with uid.

    Returns a dict with:
      n_colls             -- total scrape records found for this uid
      curr_coll           -- how many of them are marked complete
      progress_in_current -- percent progress of the running collection
    """
    # NOTE what follows _is_ vulnerable to races but
    # this will only affect the progress meter, and should be rare.
    # Assume all ops records with correct uid and containing 'running' are relevant.
    # BUGFIX: the original counted via scrapes.count() with a fallback of
    # len(scrapes()) -- calling the cursor object, which always raised and
    # silently forced the count to 0.  Materialize the cursor once so the
    # count and the iteration below agree.
    try:
        scrapes = list(idc.search_ops_table(
            {'isa': 'scrape', 'content': {'uid': uuid.UUID(uid)}}))
    except Exception:
        scrapes = []
    n_scrapes = len(scrapes)
    curr_coll = 0
    curr_item = None
    for item in scrapes:
        if item['content']['complete']:
            curr_coll = curr_coll + 1
        if item['content']['running']:
            curr_item = item
    if curr_item:
        try:
            prog_in_curr = get_progress_from_db(uid, curr_item['db'], curr_item['coll'])
        except Exception:
            # Web front or user can't do anything about errors here. If the
            # process is failing, it will become evident later.
            prog_in_curr = 0
    else:
        # Nothing running. If not yet started, prog=0. If done prog=100.
        prog_in_curr = 100 * (curr_coll == n_scrapes)
    return {'n_colls': n_scrapes,
            'curr_coll': curr_coll,
            'progress_in_current': prog_in_curr}
def get_latest_report():
    """Return the most recent report record (by scan_date).

    The record's 'content' field is renamed to 'report'.  Returns
    {'scan_date': 'NaN'} when no report exists or a record is malformed.

    BUGFIX: the original try wrapped only `return rept`, which cannot
    raise; the operations that can fail (indexing an empty result,
    missing 'scan_date'/'content' keys) ran outside it, so the fallback
    was unreachable and an empty table crashed with IndexError.
    """
    reports = idc.search_ops_table({'isa': 'report'})
    try:
        sorted_reports = sorted(reports, key=lambda s: s['scan_date'])
        rept = sorted_reports[-1]
        rept['report'] = rept.pop('content')
        return rept
    except (IndexError, KeyError, TypeError):
        return {'scan_date': 'NaN'}
def get_completed_scrapes(n_days=7):
    """Get successfully completed scrapes from the last n days.

    n_days -- look-back window in days (default 7)

    Returns a (possibly empty) list of ops records; returns [] on any
    lookup error (best-effort, consistent with get_live_scrapes_older_than).
    """
    try:
        start = datetime.datetime.now() - datetime.timedelta(days=n_days)
        # Currently this is enough to identify scrape records
        rec_test = {'time': {"$gt": start}, 'complete': True}
        curs = idc.search_ops_table(rec_test)
        return list(curs)
    except Exception:
        # BUGFIX: was a bare except, which also swallowed
        # KeyboardInterrupt/SystemExit.
        return []
def get_lockout_state():
    """Get global lockout status.

    Returns the 'lockout' value of the most recent lockout record, or
    False when no such record exists or the lookup fails.

    BUGFIX: when the query matched nothing the original got back an
    empty (non-None) cursor and `res[0]['lockout']` raised IndexError
    OUTSIDE the try block, crashing instead of returning False.
    """
    try:
        rec_find = {'lockout': {"$exists": True}}
        # Get latest lockout record
        res = idc.search_ops_table(rec_find).sort('_id', -1).limit(1)
        return res[0]['lockout']
    except Exception:
        # No record, empty result, or lookup failure: treat as not locked out.
        return False
def collate_orphans_by_uid(uid):
    """Fetch all orphans with given uid and return summary.

    Returns {db_name: {coll_name: split_orphans(record), ...}}.
    """
    # All orphans records for this uid
    record = {'uid': uuid.UUID(uid), 'orphans': {"$exists": True}}
    records = idc.search_ops_table(record)
    orph_data = {}
    db_name = ''
    try:
        db_name = idc.get_db_name(records[0]['db'])['name']
    except Exception:
        # Couldn't resolve the db name from an orphan record; fall back
        # to any ops record with this uid.
        tmp_record = idc.search_ops_table({'uid': uuid.UUID(uid)})
        try:
            # BUGFIX: the original indexed the cursor with a string key
            # (tmp_record['db']), which always failed silently; take the
            # first record instead.
            db_name = idc.get_db_name(tmp_record[0]['db'])['name']
        except Exception:
            # Best-effort: leave db_name as '' if nothing resolves.
            pass
    orph_data[db_name] = {}
    for entry in records:
        coll = idc.get_coll_name(entry['coll'])['name']
        orph_data[db_name][coll] = split_orphans(entry)
    return orph_data
def collate_orphans():
    """Fetch all orphans and return summary.

    Returns {db_name: {coll_name: split_orphans(record), ...}}.

    BUGFIX: the original reset orph_data[db_name] = {} on EVERY loop
    iteration, so when several records shared a db name only the last
    collection survived; setdefault preserves earlier entries.
    """
    # All orphans records
    record = {'orphans': {"$exists": True}}
    records = idc.search_ops_table(record)
    orph_data = {}
    for entry in records:
        db_name = idc.get_db_name(entry['db'])['name']
        coll = idc.get_coll_name(entry['coll'])['name']
        orph_data.setdefault(db_name, {})[coll] = split_orphans(entry)
    return orph_data