def main(json_json_files_to_process, runtime_json, collection_name_override=None):
    """Bulk-load a batch of JSON data files into a MongoDB collection.

    Parameters
    ----------
    json_json_files_to_process : str
        Path to a JSON file listing the data files to load; each entry is
        a dict with a "data_json_file" key.
    runtime_json : str
        Path to a JSON runtime config holding a "mongo_db_config" section
        with "connection_string", "database_name", "collection_name" and
        "refresh_collection" keys.
    collection_name_override : str, optional
        If given, load into this collection instead of the configured one.
    """
    with open(runtime_json, "r") as fj:
        runtime_config = json.load(fj)

    mongo_db_config = runtime_config["mongo_db_config"]
    connection_string = mongo_db_config["connection_string"]
    database_name = mongo_db_config["database_name"]

    if collection_name_override is None:
        collection_name = mongo_db_config["collection_name"]
    else:
        collection_name = collection_name_override

    refresh_collection = mongo_db_config["refresh_collection"]

    print("Connecting to MongoDB '%s'" % connection_string)
    client = pymongo.MongoClient(connection_string)
    database = client[database_name]
    collection = database[collection_name]

    # Optionally wipe the collection before loading.
    if refresh_collection:
        collection.delete_many({})

    # Collection.count() was deprecated in pymongo 3.7 and removed in 4.0;
    # count_documents({}) is the supported replacement.
    initial_collection_count = collection.count_documents({})

    with open(json_json_files_to_process, "r") as fj:
        json_files_to_to_process = json.load(fj)

    j = 0  # total documents inserted across all files
    for json_file_dict in json_files_to_to_process:
        json_file_name = json_file_dict["data_json_file"]
        print("Loading '%s'" % json_file_name)
        data_dict = data_dict_load(json_file_name)
        i = 0  # documents inserted from the current file
        for datum_key in data_dict:
            # Collection.insert() was removed in pymongo 4.0; insert_one()
            # is the replacement.  pymongo 4 no longer performs client-side
            # key checking, so documents whose keys contain "." or "$" are
            # accepted (the original passed check_keys=False for the same
            # reason).
            collection.insert_one(data_dict[datum_key])
            if i > 0 and i % 500 == 0:
                print("Inserted '%s' documents" % i)
            i += 1
        j += i

    print("Inserted '%s' total documents" % j)
    print("Added %s items in collection '%s' housed in database '%s'"
          % (collection.count_documents({}) - initial_collection_count,
             collection_name, database_name))
def main(json_json_files_to_process, runtime_json, collection_name_override=None):
    """Insert the contents of every listed JSON data file into MongoDB.

    The target collection comes from the runtime config unless
    collection_name_override is supplied.
    """
    # Pull the MongoDB settings out of the runtime configuration file.
    with open(runtime_json, "r") as config_fp:
        mongo_settings = json.load(config_fp)["mongo_db_config"]

    connection_string = mongo_settings["connection_string"]
    database_name = mongo_settings["database_name"]
    if collection_name_override is not None:
        collection_name = collection_name_override
    else:
        collection_name = mongo_settings["collection_name"]
    refresh_collection = mongo_settings["refresh_collection"]

    print("Connecting to MongoDB '%s'" % connection_string)
    client = pymongo.MongoClient(connection_string)
    collection = client[database_name][collection_name]

    # Optionally wipe the collection before loading anything.
    if refresh_collection:
        collection.delete_many({})
    initial_collection_count = collection.count()

    # Read the batch manifest listing which data files to load.
    with open(json_json_files_to_process, "r") as batch_fp:
        batch_entries = json.load(batch_fp)

    total_inserted = 0
    for batch_entry in batch_entries:
        data_file_name = batch_entry["data_json_file"]
        print("Loading '%s'" % data_file_name)
        loaded_data = data_dict_load(data_file_name)

        inserted_in_file = 0
        for key in loaded_data:
            # check_keys=False: document keys may contain "." or "$".
            collection.insert(loaded_data[key], check_keys=False)
            if inserted_in_file > 0 and inserted_in_file % 500 == 0:
                print("Inserted '%s' documents" % inserted_in_file)
            inserted_in_file += 1
        total_inserted += inserted_in_file

    print("Inserted '%s' total documents" % total_inserted)
    print("Added %s items in collection '%s' housed in database '%s'"
          % (collection.count() - initial_collection_count,
             collection_name, database_name))
def main(hdf5_base_name, batch_json_file_name, data_template_json, refresh_template=True, output_directory=None):
    """Convert batches of JSON data files into HDF5 matrices using a template.

    Parameters
    ----------
    hdf5_base_name : str
        Prefix used for all generated template/HDF5 file names.
    batch_json_file_name : str
        JSON manifest; each entry has "data_json_file", "batch_id" and
        optionally "sort_order_file_name".
    data_template_json : str
        JSON template describing how to translate the data dicts.
    refresh_template : bool, optional
        NOTE(review): accepted for interface compatibility but unused in
        this function's body.
    output_directory : str, optional
        Where generated files are written; defaults to the directory of
        the first data file.
    """
    # Load names of files to process.
    with open(batch_json_file_name) as fj:
        batch_list_dict = json.load(fj)

    data_json_files = [x["data_json_file"] for x in batch_list_dict]

    # Collect batch ids and any per-batch sort-order files.
    sort_order_dict = {}
    batch_ids = []
    for list_dict in batch_list_dict:
        batch_id = list_dict["batch_id"]
        batch_ids += [batch_id]
        if "sort_order_file_name" in list_dict:
            sort_order_dict[batch_id] = list_dict["sort_order_file_name"]

    if output_directory is None:
        output_directory = os.path.abspath(os.path.split(data_json_files[0])[0])

    # Pass 1: generate a translation template for each batch.
    generated_data_templates_names = []
    for batch_number, data_json_file in zip(batch_ids, data_json_files):
        data_dict = data_dict_load(data_json_file)
        with open(data_template_json, "r") as f:
            data_template_dict = json.load(f)
        data_template_dict = expand_template_dict(data_dict, data_template_dict)
        data_translate_dict = build_translation_dict(data_dict, data_template_dict)
        data_translate_dict = add_offsets_to_translation_dict(data_translate_dict)

        data_translate_dict_json_name = os.path.join(
            output_directory,
            hdf5_base_name + "_" + str(batch_number) + "_data_template.json")
        generated_data_templates_names += [data_translate_dict_json_name]
        print("Generated: '%s'" % data_translate_dict_json_name)
        with open(data_translate_dict_json_name, "w") as fjw:
            try:
                json.dump(data_translate_dict, fjw, indent=4,
                          separators=(", ", ": "), sort_keys=True)
            except TypeError:
                # Was a bare "except:"; narrowed to TypeError to match the
                # master-template dump below — pretty printing with
                # sort_keys fails with TypeError on unorderable keys.
                json.dump(data_translate_dict, fjw)

    # Pass 2: combine the per-batch templates into a single master template.
    master_data_translate_dict = []
    for data_template_name in generated_data_templates_names:
        print("Merging '%s' to master data template" % data_template_name)
        with open(data_template_name) as fj:
            data_translate_dict = json.load(fj)
        master_data_translate_dict = merge_data_translate_dicts(
            master_data_translate_dict, data_translate_dict)

    master_data_translate_dict_name = os.path.join(
        output_directory, hdf5_base_name + "_master_data_template.json")
    with open(master_data_translate_dict_name, "w") as fjw:
        try:
            json.dump(master_data_translate_dict, fjw, indent=4,
                      separators=(", ", ": "), sort_keys=True)
        except TypeError:
            json.dump(master_data_translate_dict, fjw)

    # Pass 3: export each batch into its own HDF5 matrix.
    generated_hdf5_file_names = []
    total_number_of_rows = 0
    for batch_number, data_json_file in zip(batch_ids, data_json_files):
        data_dict = data_dict_load(data_json_file)
        total_number_of_rows += len(data_dict)

        hdf5_file_name = os.path.join(
            output_directory, hdf5_base_name + "_" + str(batch_number) + ".hdf5")
        generated_hdf5_file_names += [hdf5_file_name]

        if batch_number in sort_order_dict:
            with open(sort_order_dict[batch_number], "r") as fj:
                sort_order_list = json.load(fj)
        else:
            sort_order_list = None

        # Close the per-batch HDF5 handle when done (the original leaked it).
        f5p = h5py.File(hdf5_file_name, "w")
        try:
            build_hdf5_matrix(f5p, data_dict, master_data_translate_dict,
                              sort_order_list)
        finally:
            f5p.close()

    print("Exported %s rows across %s files" % (total_number_of_rows, len(data_json_files)))

    # Finally combine the per-batch HDF5 files into one file.
    all_hdf5_file_name = os.path.join(output_directory, hdf5_base_name + "_combined.hdf5")
    combined_hdf5 = h5py.File(all_hdf5_file_name, "w")
    try:
        combine_exported_hdf5_files_into_single_file(
            combined_hdf5, generated_hdf5_file_names, total_number_of_rows)
    finally:
        combined_hdf5.close()
def main(hdf5_base_name, batch_json_file_name, data_template_json, refresh_template=True, output_directory=None):
    """Convert a JSON file to a HDF5 matrix format using a template"""
    # Read the batch manifest: data file names plus per-batch metadata.
    with open(batch_json_file_name) as manifest_fp:
        manifest = json.load(manifest_fp)

    data_json_files = [entry["data_json_file"] for entry in manifest]

    batch_ids = []
    sort_order_dict = {}
    for entry in manifest:
        current_batch_id = entry["batch_id"]
        batch_ids.append(current_batch_id)
        if "sort_order_file_name" in entry:
            sort_order_dict[current_batch_id] = entry["sort_order_file_name"]

    # Default the output directory to where the first data file lives.
    if output_directory is None:
        output_directory = os.path.abspath(os.path.split(data_json_files[0])[0])

    # Pass 1: build one translation template per batch.
    generated_data_templates_names = []
    for index, data_json_file in enumerate(data_json_files):
        batch_number = batch_ids[index]
        data_dict = data_dict_load(data_json_file)
        with open(data_template_json, "r") as template_fp:
            template = json.load(template_fp)
        template = expand_template_dict(data_dict, template)
        translation = build_translation_dict(data_dict, template)
        translation = add_offsets_to_translation_dict(translation)

        template_out_name = os.path.join(
            output_directory,
            hdf5_base_name + "_" + str(batch_number) + "_data_template.json")
        generated_data_templates_names.append(template_out_name)
        print("Generated: '%s'" % template_out_name)
        with open(template_out_name, "w") as out_fp:
            try:
                json.dump(translation, out_fp, indent=4,
                          separators=(", ", ": "), sort_keys=True)
            except:
                # Pretty-printing can fail; fall back to a compact dump.
                json.dump(translation, out_fp)

    # Pass 2: merge the per-batch templates into a master template.
    master_data_translate_dict = []
    for template_name in generated_data_templates_names:
        print("Merging '%s' to master data template" % template_name)
        with open(template_name) as template_fp:
            translation = json.load(template_fp)
        master_data_translate_dict = merge_data_translate_dicts(
            master_data_translate_dict, translation)

    master_template_name = os.path.join(
        output_directory, hdf5_base_name + "_master_data_template.json")
    with open(master_template_name, "w") as out_fp:
        try:
            json.dump(master_data_translate_dict, out_fp, indent=4,
                      separators=(", ", ": "), sort_keys=True)
        except TypeError:
            json.dump(master_data_translate_dict, out_fp)

    # Pass 3: export each batch into its own HDF5 matrix.
    generated_hdf5_file_names = []
    total_number_of_rows = 0
    for index, data_json_file in enumerate(data_json_files):
        batch_number = batch_ids[index]
        data_dict = data_dict_load(data_json_file)
        total_number_of_rows += len(data_dict)

        hdf5_file_name = os.path.join(
            output_directory, hdf5_base_name + "_" + str(batch_number) + ".hdf5")
        generated_hdf5_file_names.append(hdf5_file_name)
        f5p = h5py.File(hdf5_file_name, "w")

        if batch_number in sort_order_dict:
            with open(sort_order_dict[batch_number], "r") as sort_fp:
                sort_order_list = json.load(sort_fp)
        else:
            sort_order_list = None

        build_hdf5_matrix(f5p, data_dict, master_data_translate_dict,
                          sort_order_list)

    print("Exported %s rows across %s files" % (total_number_of_rows, len(data_json_files)))

    # Finally stitch all per-batch HDF5 files into one combined file.
    all_hdf5_file_name = os.path.join(output_directory, hdf5_base_name + "_combined.hdf5")
    combined_hdf5 = h5py.File(all_hdf5_file_name, "w")
    combine_exported_hdf5_files_into_single_file(
        combined_hdf5, generated_hdf5_file_names, total_number_of_rows)