def fetch_and_process_data(kind, start_dt_arg, end_dt_arg,
                           fetch_interval, config):
    """Main function: fetch data and archive it to disk (the MongoDB load
    is no longer done here)."""
    mongo = open_db_conn(config)
    kdc.record_progress(mongo, config["coordinator_cfg"],
                        kind, start_dt_arg, end_dt_arg,
                        kdc.DownloadStatus.STARTED)

    # Fetch the entities from the datastore.
    g_logger.info("Downloading data for %s from %s to %s starts" % (
        kind, start_dt_arg, end_dt_arg))
    is_ndb = bool(config["kinds"][kind][3])
    entity_list = fetch_entities.download_entities(
        kind, is_ndb,
        start_dt_arg, end_dt_arg,
        fetch_interval,
        config["max_logs"], config["max_tries"],
        "backup_timestamp",  # TODO(jace): make configurable
        verbose=False)
    g_logger.info(
        "Data downloaded for %s from %s to %s finishes. # rows: %d" % (
            kind, start_dt_arg, end_dt_arg, len(entity_list)))
    kdc.record_progress(mongo, config["coordinator_cfg"],
                        kind, start_dt_arg, end_dt_arg,
                        kdc.DownloadStatus.FETCHED)

    # Save the pickled protobufs to a gzipped archive file.
    # TODO(yunfang): revisit if we should save the pickled pb
    archived_file = get_archive_file_name(config, kind,
                                          start_dt_arg, end_dt_arg, "pickle")
    with open(archived_file, "wb") as f:
        pickle.dump(entity_list, f)
    ret = subprocess.call(["gzip", "-f", archived_file])
    if ret == 0:
        g_logger.info("%s rows saved to %s.gz" % (len(entity_list),
                                                  archived_file))
    else:
        g_logger.error("Cannot gzip %s" % archived_file)

    # Jsonize the entities into a "<json_key>\t<json doc>" archive.
    json_filename = get_archive_file_name(config, kind,
                                          start_dt_arg, end_dt_arg, "json")
    json_key = config["kinds"][kind][4]
    with open(json_filename, "wb") as f:
        for pb in entity_list:
            doc = load_pbufs_to_hive.pb_to_dict(pb)
            json_str = json.dumps(doc)
            print >> f, "%s\t%s" % (doc[json_key], json_str)
    ret = subprocess.call(["gzip", "-f", json_filename])
    if ret == 0:
        g_logger.info("%s rows saved to %s.gz" % (len(entity_list),
                                                  json_filename))
    else:
        g_logger.error("Cannot gzip %s" % json_filename)
    kdc.record_progress(mongo, config["coordinator_cfg"],
                        kind, start_dt_arg, end_dt_arg,
                        kdc.DownloadStatus.SAVED)

    # We used to load into MongoDB; we don't anymore, but still set the flag.
    kdc.record_progress(mongo, config["coordinator_cfg"],
                        kind, start_dt_arg, end_dt_arg,
                        kdc.DownloadStatus.LOADED)
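# Illustrative sketch, not part of the original module: one way to read back
# the gzipped "<json_key>\t<json doc>" archive written by the jsonize step
# above.  The path argument is a placeholder; json is also imported at the
# top of this module (it is used above), but is repeated here so the sketch
# stands alone.
import gzip
import json


def read_json_archive(path):
    """Yield (key, doc) pairs from a gzipped key<TAB>json archive."""
    with gzip.open(path, 'rb') as f:
        for line in f:
            key, _, json_str = line.rstrip('\n').partition('\t')
            yield key, json.loads(json_str)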
def fetch_and_process_data(kind, start_dt_arg, end_dt_arg,
                           fetch_interval, config):
    """Main function: fetch data and load it into MongoDB."""
    mongo = open_db_conn(config)
    kdc.record_progress(mongo, config['coordinator_cfg'],
                        kind, start_dt_arg, end_dt_arg,
                        kdc.DownloadStatus.STARTED)

    # Fetch the entities from the datastore.
    start_dt = start_dt_arg
    end_dt = end_dt_arg
    g_logger.info("Downloading data for %s from %s to %s starts" % (
        kind, start_dt_arg, end_dt_arg))
    entity_list = fetch_entities.download_entities(
        kind, start_dt_arg, end_dt_arg,
        fetch_interval,
        config['max_logs'], config['max_tries'],
        verbose=False)
    g_logger.info(
        "Data downloaded for %s from %s to %s finishes. # rows: %d" % (
            kind, start_dt_arg, end_dt_arg, len(entity_list)))
    kdc.record_progress(mongo, config['coordinator_cfg'],
                        kind, start_dt_arg, end_dt_arg,
                        kdc.DownloadStatus.FETCHED)

    # Save the pickled protobufs to a gzipped archive file.
    archived_file = get_archive_file_name(config, kind,
                                          start_dt_arg, end_dt_arg)
    with open(archived_file, 'wb') as f:
        pickle.dump(entity_list, f)
    ret = subprocess.call(["gzip", "-f", archived_file])
    if ret == 0:
        g_logger.info("%s rows saved to %s.gz" % (len(entity_list),
                                                  archived_file))
    else:
        g_logger.error("Cannot gzip %s" % archived_file)
    kdc.record_progress(mongo, config['coordinator_cfg'],
                        kind, start_dt_arg, end_dt_arg,
                        kdc.DownloadStatus.SAVED)

    # Load the downloaded entities into MongoDB.
    load_pbufs_to_db(config, mongo, entity_list, start_dt_arg, end_dt_arg,
                     kind)
def fetch_and_process_data(kind, start_dt_arg, end_dt_arg,
                           fetch_interval, config):
    """Main function: fetch data and archive it to disk (the MongoDB load
    is no longer done here)."""
    if config['dbhost']:
        mongo = open_db_conn(config)
        kdc.record_progress(mongo, config['coordinator_cfg'],
                            kind, start_dt_arg, end_dt_arg,
                            kdc.DownloadStatus.STARTED)

    # Fetch the entities from the datastore.
    g_logger.info("Downloading data for %s from %s to %s starts" % (
        kind, start_dt_arg, end_dt_arg))
    is_ndb = bool(config['kinds'][kind][3])
    entity_list = fetch_entities.download_entities(
        kind, is_ndb,
        start_dt_arg, end_dt_arg,
        fetch_interval,
        config['max_logs'], config['max_tries'],
        "backup_timestamp",  # TODO(jace): make configurable
        verbose=False)
    g_logger.info(
        "Data downloaded for %s from %s to %s finishes. # rows: %d" % (
            kind, start_dt_arg, end_dt_arg, len(entity_list)))
    if config['dbhost']:
        kdc.record_progress(mongo, config['coordinator_cfg'],
                            kind, start_dt_arg, end_dt_arg,
                            kdc.DownloadStatus.FETCHED)

    # Save the pickled protobufs to a gzipped archive file.
    # TODO(yunfang): revisit if we should save the pickled pb
    archived_file = get_archive_file_name(config, kind,
                                          start_dt_arg, end_dt_arg, 'pickle')
    with open(archived_file, 'wb') as f:
        pickle.dump(entity_list, f)
    ret = subprocess.call(["gzip", "-f", archived_file])
    if ret == 0:
        g_logger.info("%s rows saved to %s.gz" % (len(entity_list),
                                                  archived_file))
    else:
        g_logger.error("Cannot gzip %s" % archived_file)

    # Jsonize the entities into a "<json_key>\t<json doc>" archive.
    json_filename = get_archive_file_name(config, kind,
                                          start_dt_arg, end_dt_arg, 'json')
    json_key = config['kinds'][kind][4]
    with open(json_filename, 'wb') as f:
        for pb in entity_list:
            doc = load_pbufs_to_hive.pb_to_dict(pb)
            # TODO(mattfaus): Make configurable, like for
            # download_entities() above
            log_timestamp_outside_window(
                kind, doc.get('backup_timestamp'), start_dt_arg, end_dt_arg)
            json_str = json.dumps(doc)
            print >> f, "%s\t%s" % (doc[json_key], json_str)
    ret = subprocess.call(["gzip", "-f", json_filename])
    if ret == 0:
        g_logger.info("%s rows saved to %s.gz" % (len(entity_list),
                                                  json_filename))
    else:
        g_logger.error("Cannot gzip %s" % json_filename)

    if config['dbhost']:
        kdc.record_progress(mongo, config['coordinator_cfg'],
                            kind, start_dt_arg, end_dt_arg,
                            kdc.DownloadStatus.SAVED)
        # Well, we didn't actually load the data with this script, but mark
        # it as such anyway.
        kdc.record_progress(mongo, config['coordinator_cfg'],
                            kind, start_dt_arg, end_dt_arg,
                            kdc.DownloadStatus.LOADED)
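# Illustrative sketch, not part of the original module: an example of the
# config dict fetch_and_process_data() expects.  The schema is inferred from
# the config[...] lookups above and every concrete value is a placeholder;
# get_archive_file_name() and open_db_conn() may read additional keys that
# are not shown here.
sample_config = {
    'dbhost': None,          # falsy -> skip the kdc progress bookkeeping
    'coordinator_cfg': {},   # passed straight through to kdc.record_progress
    'max_logs': 1000,        # forwarded to fetch_entities.download_entities
    'max_tries': 3,
    'kinds': {
        # kind name -> tuple; this function only reads index 3 (the is_ndb
        # flag) and index 4 (the field used as the key column of the json
        # archive).
        'ProblemLog': (None, None, None, 0, 'key'),
    },
}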