import datetime
import gzip
import pickle
import re

# Project-local modules (util, date_util, gae_download,
# ka_download_coordinator) and the module logger g_logger are assumed to be
# imported/defined elsewhere in this file.


def main():
    options = get_cmd_line_args()
    config = util.load_unstripped_json(options.config)
    g_logger.info("Fetching failed jobs from progress db")
    start_dt = date_util.from_date_iso(options.start_date)
    mongo = gae_download.open_db_conn(config)
    coordinator_cfg = config["coordinator_cfg"]

    # Don't touch tasks that were started recently.
    # TODO(yunfang): parameterize this
    two_hours_ago = datetime.datetime.now() - datetime.timedelta(hours=2)

    results = ka_download_coordinator.get_failed_jobs(mongo, coordinator_cfg)
    if not results:
        g_logger.info("Empty result set. Nothing to reprocess.")
        exit(0)

    for rec in results:
        if rec["history"]["1"] < start_dt:
            # Job started before the requested start date; skip it.
            continue
        if rec["history"]["1"] >= two_hours_ago:
            # Started less than 2 hours ago; it may still be running.
            continue
        # Reprocess the failed job over its original window.
        fetch_interval = config["kinds"][rec["kind"]][1]
        gae_download.fetch_and_process_data(rec["kind"], rec["start_dt"],
            rec["end_dt"], fetch_interval, config)
    g_logger.info("Done reprocessing!!")
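
# main() relies on get_cmd_line_args(), which is not shown in this section.
# The sketch below only illustrates the assumed interface: stdlib optparse
# exposing the two attributes main() actually reads, options.config and
# options.start_date (an ISO date, per date_util.from_date_iso). The flag
# names and help text are guesses, not the real definition, so it is left
# commented out to avoid shadowing the actual implementation.
#
#   import optparse
#
#   def get_cmd_line_args():
#       parser = optparse.OptionParser(usage="%prog [options]")
#       parser.add_option("-c", "--config",
#           help="JSON config file with 'kinds' and 'coordinator_cfg'")
#       parser.add_option("-s", "--start_date",
#           help="ISO date (e.g. 2012-05-01); older failed jobs are skipped")
#       options, _ = parser.parse_args()
#       return options
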
def gz_pickle_to_mongo(config, gzfile):
    """Load a gzipped pickle of downloaded entities into mongo.

    The file name is expected to contain a YYYY-MM-DD date stamp, which is
    used to derive the [start_dt, end_dt) window for the load.
    """
    mongo = gae_download.open_db_conn(config)
    (year, month, day) = re.match(
        r'.*(\d{4})-(\d{2})-(\d{2})', gzfile).groups()
    start_dt = datetime.datetime(int(year), int(month), int(day))
    end_dt = start_dt + datetime.timedelta(days=1)
    with gzip.open(gzfile, "rb") as f:
        entity_list = pickle.load(f)
    g_logger.info("Loading %s entries to db" % len(entity_list))
    gae_download.load_pbufs_to_db(config, mongo, entity_list, start_dt, end_dt)
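
# Example of how gz_pickle_to_mongo() is meant to be driven. The config file
# name and dump path below are hypothetical; they only illustrate the
# required YYYY-MM-DD stamp in the file name:
#
#   config = util.load_unstripped_json("download_config.json")
#   gz_pickle_to_mongo(config, "/data/gae_dumps/ProblemLog-2012-05-01.pickle.gz")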