def fetch_and_process_data(kind, start_dt_arg, end_dt_arg,
                           fetch_interval, config):
    """Main function: fetch data and archive it to disk (the MongoDB load
    is no longer done here)."""
    mongo = open_db_conn(config)
    kdc.record_progress(mongo, config["coordinator_cfg"],
                        kind, start_dt_arg, end_dt_arg,
                        kdc.DownloadStatus.STARTED)

    # Fetch the entities from the datastore.
    g_logger.info("Downloading data for %s from %s to %s starts" % (
        kind, start_dt_arg, end_dt_arg))
    is_ndb = bool(config["kinds"][kind][3])
    entity_list = fetch_entities.download_entities(
        kind, is_ndb,
        start_dt_arg, end_dt_arg,
        fetch_interval,
        config["max_logs"], config["max_tries"],
        "backup_timestamp",  # TODO(jace): make configurable
        verbose=False)
    g_logger.info(
        "Data downloaded for %s from %s to %s finishes. # rows: %d" % (
            kind, start_dt_arg, end_dt_arg, len(entity_list)))
    kdc.record_progress(mongo, config["coordinator_cfg"],
                        kind, start_dt_arg, end_dt_arg,
                        kdc.DownloadStatus.FETCHED)

    # Save the pickled protobufs to a gzipped archive file.
    # TODO(yunfang): revisit if we should save the pickled pb
    archived_file = get_archive_file_name(config, kind,
                                          start_dt_arg, end_dt_arg, "pickle")
    with open(archived_file, "wb") as f:
        pickle.dump(entity_list, f)
    ret = subprocess.call(["gzip", "-f", archived_file])
    if ret == 0:
        g_logger.info("%s rows saved to %s.gz" % (len(entity_list),
                                                  archived_file))
    else:
        g_logger.error("Cannot gzip %s" % archived_file)

    # Jsonize the entities into a "<json_key>\t<json doc>" archive.
    json_filename = get_archive_file_name(config, kind,
                                          start_dt_arg, end_dt_arg, "json")
    json_key = config["kinds"][kind][4]
    with open(json_filename, "wb") as f:
        for pb in entity_list:
            doc = load_pbufs_to_hive.pb_to_dict(pb)
            json_str = json.dumps(doc)
            print >> f, "%s\t%s" % (doc[json_key], json_str)
    ret = subprocess.call(["gzip", "-f", json_filename])
    if ret == 0:
        g_logger.info("%s rows saved to %s.gz" % (len(entity_list),
                                                  json_filename))
    else:
        g_logger.error("Cannot gzip %s" % json_filename)
    kdc.record_progress(mongo, config["coordinator_cfg"],
                        kind, start_dt_arg, end_dt_arg,
                        kdc.DownloadStatus.SAVED)

    # We used to load into MongoDB; we don't anymore, but still set the flag.
    kdc.record_progress(mongo, config["coordinator_cfg"],
                        kind, start_dt_arg, end_dt_arg,
                        kdc.DownloadStatus.LOADED)
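# Illustrative sketch, not part of the original module: one way to read back
# the gzipped "<json_key>\t<json doc>" archive written by the jsonize step
# above.  The path argument is a placeholder; json is also imported at the
# top of this module (it is used above), but is repeated here so the sketch
# stands alone.
import gzip
import json


def read_json_archive(path):
    """Yield (key, doc) pairs from a gzipped key<TAB>json archive."""
    with gzip.open(path, 'rb') as f:
        for line in f:
            key, _, json_str = line.rstrip('\n').partition('\t')
            yield key, json.loads(json_str)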
def fetch_and_process_data(kind, start_dt_arg, end_dt_arg,
                           fetch_interval, config):
    """Main function: fetch data and load it into MongoDB."""
    mongo = open_db_conn(config)
    kdc.record_progress(mongo, config['coordinator_cfg'],
                        kind, start_dt_arg, end_dt_arg,
                        kdc.DownloadStatus.STARTED)

    # Fetch the entities from the datastore.
    start_dt = start_dt_arg
    end_dt = end_dt_arg
    g_logger.info("Downloading data for %s from %s to %s starts" % (
        kind, start_dt_arg, end_dt_arg))
    entity_list = fetch_entities.download_entities(
        kind, start_dt_arg, end_dt_arg,
        fetch_interval,
        config['max_logs'], config['max_tries'],
        verbose=False)
    g_logger.info(
        "Data downloaded for %s from %s to %s finishes. # rows: %d" % (
            kind, start_dt_arg, end_dt_arg, len(entity_list)))
    kdc.record_progress(mongo, config['coordinator_cfg'],
                        kind, start_dt_arg, end_dt_arg,
                        kdc.DownloadStatus.FETCHED)

    # Save the pickled protobufs to a gzipped archive file.
    archived_file = get_archive_file_name(config, kind,
                                          start_dt_arg, end_dt_arg)
    with open(archived_file, 'wb') as f:
        pickle.dump(entity_list, f)
    ret = subprocess.call(["gzip", "-f", archived_file])
    if ret == 0:
        g_logger.info("%s rows saved to %s.gz" % (len(entity_list),
                                                  archived_file))
    else:
        g_logger.error("Cannot gzip %s" % archived_file)
    kdc.record_progress(mongo, config['coordinator_cfg'],
                        kind, start_dt_arg, end_dt_arg,
                        kdc.DownloadStatus.SAVED)

    # Load the downloaded entities into MongoDB.
    load_pbufs_to_db(config, mongo, entity_list, start_dt_arg, end_dt_arg,
                     kind)
def fetch_and_process_data(kind, start_dt_arg, end_dt_arg,
                           fetch_interval, config):
    """Main function: fetch data and archive it to disk (the MongoDB load
    is no longer done here)."""
    if config['dbhost']:
        mongo = open_db_conn(config)
        kdc.record_progress(mongo, config['coordinator_cfg'],
                            kind, start_dt_arg, end_dt_arg,
                            kdc.DownloadStatus.STARTED)

    # Fetch the entities from the datastore.
    g_logger.info("Downloading data for %s from %s to %s starts" % (
        kind, start_dt_arg, end_dt_arg))
    is_ndb = bool(config['kinds'][kind][3])
    entity_list = fetch_entities.download_entities(
        kind, is_ndb,
        start_dt_arg, end_dt_arg,
        fetch_interval,
        config['max_logs'], config['max_tries'],
        "backup_timestamp",  # TODO(jace): make configurable
        verbose=False)
    g_logger.info(
        "Data downloaded for %s from %s to %s finishes. # rows: %d" % (
            kind, start_dt_arg, end_dt_arg, len(entity_list)))
    if config['dbhost']:
        kdc.record_progress(mongo, config['coordinator_cfg'],
                            kind, start_dt_arg, end_dt_arg,
                            kdc.DownloadStatus.FETCHED)

    # Save the pickled protobufs to a gzipped archive file.
    # TODO(yunfang): revisit if we should save the pickled pb
    archived_file = get_archive_file_name(config, kind,
                                          start_dt_arg, end_dt_arg, 'pickle')
    with open(archived_file, 'wb') as f:
        pickle.dump(entity_list, f)
    ret = subprocess.call(["gzip", "-f", archived_file])
    if ret == 0:
        g_logger.info("%s rows saved to %s.gz" % (len(entity_list),
                                                  archived_file))
    else:
        g_logger.error("Cannot gzip %s" % archived_file)

    # Jsonize the entities into a "<json_key>\t<json doc>" archive.
    json_filename = get_archive_file_name(config, kind,
                                          start_dt_arg, end_dt_arg, 'json')
    json_key = config['kinds'][kind][4]
    with open(json_filename, 'wb') as f:
        for pb in entity_list:
            doc = load_pbufs_to_hive.pb_to_dict(pb)
            # TODO(mattfaus): Make configurable, like for
            # download_entities() above
            log_timestamp_outside_window(
                kind, doc.get('backup_timestamp'), start_dt_arg, end_dt_arg)
            json_str = json.dumps(doc)
            print >> f, "%s\t%s" % (doc[json_key], json_str)
    ret = subprocess.call(["gzip", "-f", json_filename])
    if ret == 0:
        g_logger.info("%s rows saved to %s.gz" % (len(entity_list),
                                                  json_filename))
    else:
        g_logger.error("Cannot gzip %s" % json_filename)

    if config['dbhost']:
        kdc.record_progress(mongo, config['coordinator_cfg'],
                            kind, start_dt_arg, end_dt_arg,
                            kdc.DownloadStatus.SAVED)
        # Well, we didn't actually load the data with this script, but mark
        # it as such anyway.
        kdc.record_progress(mongo, config['coordinator_cfg'],
                            kind, start_dt_arg, end_dt_arg,
                            kdc.DownloadStatus.LOADED)
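# Illustrative sketch, not part of the original module: an example of the
# config dict fetch_and_process_data() expects.  The schema is inferred from
# the config[...] lookups above and every concrete value is a placeholder;
# get_archive_file_name() and open_db_conn() may read additional keys that
# are not shown here.
sample_config = {
    'dbhost': None,          # falsy -> skip the kdc progress bookkeeping
    'coordinator_cfg': {},   # passed straight through to kdc.record_progress
    'max_logs': 1000,        # forwarded to fetch_entities.download_entities
    'max_tries': 3,
    'kinds': {
        # kind name -> tuple; this function only reads index 3 (the is_ndb
        # flag) and index 4 (the field used as the key column of the json
        # archive).
        'ProblemLog': (None, None, None, 0, 'key'),
    },
}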