import datetime
import gzip
import pickle
import re
import sys

# Assumed importable from the surrounding project: util, date_util,
# gae_download, ka_download_coordinator, g_logger (a module-level
# logger), and get_cmd_line_args.


def main():
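    """Reprocess failed download jobs recorded in the progress db.

    Skips jobs that started before options.start_date, as well as jobs
    that started within the last two hours (likely still running).
    """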
    options = get_cmd_line_args()
    config = util.load_unstripped_json(options.config)
    g_logger.info("Fetching failed jobs from progress db")
    start_dt = date_util.from_date_iso(options.start_date)
    mongo = gae_download.open_db_conn(config)
    coordinator_cfg = config["coordinator_cfg"]
    # Don't touch tasks that were recently started
    # TODO(yunfang): parameterize this
    two_hours_ago = datetime.datetime.now() - datetime.timedelta(hours=2)
    results = ka_download_coordinator.get_failed_jobs(mongo, coordinator_cfg)
    if not results:
        g_logger.info("Empty result set. Nothing to reprocess.")
        sys.exit(0)
    for rec in results:
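        # rec["history"]["1"] appears to hold the job's start time;
        # the two checks below compare it against the requested start
        # date and the two-hour cutoff.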
        if rec["history"]["1"] < start_dt:
            continue
        if rec["history"]["1"] >= two_hours_ago:
            # Started less than 2 hours ago
            continue
        # Reprocess
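        # The second element of the per-kind config entry is taken to
        # be the fetch interval (an assumption based on how it is
        # indexed here).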
        fetch_interval = config['kinds'][rec['kind']][1]
        gae_download.fetch_and_process_data(rec["kind"], rec["start_dt"],
                                            rec["end_dt"], fetch_interval,
                                            config)
    g_logger.info("Done reprocessing!!")


def gz_pickle_to_mongo(config, gzfile):
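    """Load a gzipped pickle of entity pbufs into mongo.

    The one-day [start_dt, end_dt) window is parsed from the
    YYYY-MM-DD portion of the gzfile name.
    """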
    mongo = gae_download.open_db_conn(config)
    (year, month, day) = re.match(r'.*(\d{4})-(\d{2})-(\d{2})',
                                  gzfile).groups()
    start_dt = datetime.datetime(int(year), int(month), int(day))
    end_dt = start_dt + datetime.timedelta(days=1)
    with gzip.open(gzfile, "rb") as f:
        entity_list = pickle.load(f)
    g_logger.info("Loading %s entries to db" % len(entity_list))
    gae_download.load_pbufs_to_db(config, mongo, entity_list, start_dt, end_dt) 
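

# Minimal usage sketch (the file path and config file name are
# hypothetical; assumes the JSON config has the same shape as the one
# main() loads):
#
#     config = util.load_unstripped_json("download_config.json")
#     gz_pickle_to_mongo(config, "/data/UserData-2012-01-15.pkl.gz")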