Example 1
def fetch_and_process_data(kind, start_dt_arg, end_dt_arg, fetch_interval, config):
    """Main function: fetching data and load it to mongodb."""
    mongo = open_db_conn(config)
    kdc.record_progress(mongo, config["coordinator_cfg"], kind, start_dt_arg, end_dt_arg, kdc.DownloadStatus.STARTED)

    # fetch
    g_logger.info("Downloading data for %s from %s to %s starts" % (kind, start_dt_arg, end_dt_arg))
    is_ndb = bool(config["kinds"][kind][3])
    entity_list = fetch_entities.download_entities(
        kind,
        is_ndb,
        start_dt_arg,
        end_dt_arg,
        fetch_interval,
        config["max_logs"],
        config["max_tries"],
        "backup_timestamp",  # TODO(jace): make configurable
        verbose=False,
    )
    g_logger.info(
        "Data downloaded for %s from %s to %s. # rows: %d" % (kind, start_dt_arg, end_dt_arg, len(entity_list))
    )
    kdc.record_progress(mongo, config["coordinator_cfg"], kind, start_dt_arg, end_dt_arg, kdc.DownloadStatus.FETCHED)

    # save to a file
    # TODO(yunfang): revisit if we should save the pickled pb
    archived_file = get_archive_file_name(config, kind, start_dt_arg, end_dt_arg, "pickle")
    with open(archived_file, "wb") as f:
        pickle.dump(entity_list, f)
    ret = subprocess.call(["gzip", "-f", archived_file])
    if ret == 0:
        g_logger.info("%s rows saved to %s.gz" % (len(entity_list), archived_file))
    else:
        g_logger.error("Cannot gzip %s" % (archived_file))

    # jsonize the entities
    json_filename = get_archive_file_name(config, kind, start_dt_arg, end_dt_arg, "json")
    json_key = config["kinds"][kind][4]
    with open(json_filename, "wb") as f:
        for pb in entity_list:
            doc = load_pbufs_to_hive.pb_to_dict(pb)
            json_str = json.dumps(doc)
            print >> f, "%s\t%s" % (doc[json_key], json_str)
    ret = subprocess.call(["gzip", "-f", json_filename])
    if ret == 0:
        g_logger.info("%s rows saved to %s.gz" % (len(entity_list), json_filename))
    else:
        g_logger.error("Cannot gzip %s" % (json_filename))

    kdc.record_progress(mongo, config["coordinator_cfg"], kind, start_dt_arg, end_dt_arg, kdc.DownloadStatus.SAVED)

    # We no longer load into MongoDB, but we still set the flag.
    kdc.record_progress(mongo, config["coordinator_cfg"], kind, start_dt_arg, end_dt_arg, kdc.DownloadStatus.LOADED)
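
Both archives above are gzipped after writing; the JSON file holds one line per entity in the form "<json_key>\t<json document>". A minimal sketch of the consuming side (this reader is not part of the examples; `path` is assumed to be whatever ".json.gz" file the step above produced):

import gzip
import json

def read_jsonized_archive(path):
    # Sketch only: read back the "<key>\t<json>" lines written above and
    # yield (key, dict) pairs.
    with gzip.open(path, "rb") as f:
        for line in f:
            key, _, json_str = line.rstrip("\n").partition("\t")
            yield key, json.loads(json_str)
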
Example 2
def jsonify_downloaded_file(kind, config):
    """Dump the sqlite download for a kind to a gzipped key/JSON file."""
    file_beginning = filename(kind, config)
    dat_filename = "%s.dat" % file_beginning
    json_filename = "%s.json" % file_beginning

    sqlite_conn = sqlite3.connect(dat_filename, isolation_level=None)
    sqlstring = 'SELECT id, value FROM result'
    cursor = sqlite_conn.cursor()
    cursor.execute(sqlstring)
    with open(json_filename, 'wb') as f:
        for unused_entity_id, pb in cursor:
            doc = load_pbufs_to_hive.pb_to_dict(pb)
            json_str = json.dumps(doc)
            print >> f, "%s\t%s" % (doc['key'], json_str)
    sqlite_conn.close()

    echo_system("gzip -f %s" % json_filename)
Example 3
def jsonify_downloaded_file(kind, config):
    """Dump the sqlite download for a kind to a gzipped key/JSON file."""
    file_beginning = filename(kind, config)
    dat_filename = "%s.dat" % file_beginning
    json_filename = "%s.json" % file_beginning

    sqlite_conn = sqlite3.connect(dat_filename, isolation_level=None)
    sqlstring = 'SELECT id, value FROM result'
    cursor = sqlite_conn.cursor()
    cursor.execute(sqlstring)
    with open(json_filename, 'wb') as f:
        for unused_entity_id, pb in cursor:
            doc = load_pbufs_to_hive.pb_to_dict(pb, parent=True)
            json_str = json.dumps(doc)
            print >> f, "%s\t%s" % (doc['key'], json_str)
    sqlite_conn.close()

    echo_system("gzip -f %s" % json_filename)
Example 4
def fetch_and_process_data(kind, start_dt_arg, end_dt_arg,
                           fetch_interval, config):
    """Fetch data for a kind, archive it, and optionally record progress."""
    if config['dbhost']:
        mongo = open_db_conn(config)
        kdc.record_progress(mongo, config['coordinator_cfg'],
            kind, start_dt_arg, end_dt_arg, kdc.DownloadStatus.STARTED)

    # fetch
    g_logger.info("Downloading data for %s from %s to %s starts" % (
        kind, start_dt_arg, end_dt_arg))
    is_ndb = bool(config['kinds'][kind][3])
    entity_list = fetch_entities.download_entities(
                      kind,
                      is_ndb,
                      start_dt_arg, end_dt_arg,
                      fetch_interval,
                      config['max_logs'], config['max_tries'],
                      "backup_timestamp",  # TODO(jace): make configurable
                      verbose=False)
    g_logger.info(
        "Data downloaded for %s from %s to %s. # rows: %d" % (
            kind, start_dt_arg, end_dt_arg, len(entity_list)))

    if config['dbhost']:
        kdc.record_progress(mongo, config['coordinator_cfg'],
            kind, start_dt_arg, end_dt_arg, kdc.DownloadStatus.FETCHED)

    # save to a file
    # TODO(yunfang): revisit if we should save the pickled pb
    archived_file = get_archive_file_name(config, kind,
        start_dt_arg, end_dt_arg, 'pickle')
    with open(archived_file, 'wb') as f:
        pickle.dump(entity_list, f)
    ret = subprocess.call(["gzip", "-f", archived_file])
    if ret == 0:
        g_logger.info("%s rows saved to %s.gz" % (len(entity_list),
            archived_file))
    else:
        g_logger.error("Cannot gzip %s" % (archived_file))

    # jsonize the entities
    json_filename = get_archive_file_name(config, kind,
        start_dt_arg, end_dt_arg, 'json')
    json_key = config['kinds'][kind][4]
    with open(json_filename, 'wb') as f:
        for pb in entity_list:
            doc = load_pbufs_to_hive.pb_to_dict(pb)

            # TODO(mattfaus): Make configurable, like for download_entities() above
            log_timestamp_outside_window(
                kind, doc.get('backup_timestamp'), start_dt_arg, end_dt_arg)

            json_str = json.dumps(doc)
            print >> f, "%s\t%s" % (doc[json_key], json_str)
    ret = subprocess.call(["gzip", "-f", json_filename])
    if ret == 0:
        g_logger.info("%s rows saved to %s.gz" % (len(entity_list),
            json_filename))
    else:
        g_logger.error("Cannot gzip %s" % (json_filename))

    if config['dbhost']:
        kdc.record_progress(mongo, config['coordinator_cfg'],
            kind, start_dt_arg, end_dt_arg, kdc.DownloadStatus.SAVED)

        # Well, we didn't actually load the data with this script, but mark
        # it as such anyway.
        kdc.record_progress(mongo, config['coordinator_cfg'],
            kind, start_dt_arg, end_dt_arg, kdc.DownloadStatus.LOADED)
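
Example 4 additionally calls log_timestamp_outside_window(), which is not shown in any of the examples. A minimal sketch, assuming it only logs a warning via the module's g_logger when an entity's backup_timestamp falls outside the requested window (the boundary semantics and message wording are assumptions):

def log_timestamp_outside_window(kind, backup_timestamp, start_dt, end_dt):
    # Hypothetical sketch: the real helper is project code not included
    # in these examples.
    if backup_timestamp is None:
        return
    if backup_timestamp < start_dt or backup_timestamp >= end_dt:
        g_logger.warning(
            "%s entity has backup_timestamp %s outside [%s, %s)" % (
                kind, backup_timestamp, start_dt, end_dt))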