Ejemplo n.º 1
0
def main(json_json_files_to_process,
         runtime_json,
         collection_name_override=None):
    """Load the documents of several JSON data files into a MongoDB collection.

    Args:
        json_json_files_to_process: Path to a JSON file containing a list of
            objects; the "data_json_file" entry of each object names a data
            file that is loaded with ``data_dict_load``.
        runtime_json: Path to a JSON file whose "mongo_db_config" object
            supplies "connection_string", "database_name",
            "refresh_collection" and (unless overridden) "collection_name".
        collection_name_override: When not None, used as the collection name
            instead of the configured "collection_name".

    Every value of each loaded data dictionary is inserted as one document.
    When "refresh_collection" is truthy the collection is emptied first.
    """
    with open(runtime_json, "r") as fj:
        runtime_config = json.load(fj)

    mongo_db_config = runtime_config["mongo_db_config"]
    connection_string = mongo_db_config["connection_string"]
    database_name = mongo_db_config["database_name"]

    if collection_name_override is None:
        collection_name = mongo_db_config["collection_name"]
    else:
        collection_name = collection_name_override

    refresh_collection = mongo_db_config["refresh_collection"]
    print("Connecting to MongoDB '%s'" % connection_string)
    client = pymongo.MongoClient(connection_string)

    # Release the connection even when loading or inserting fails.
    try:
        collection = client[database_name][collection_name]

        if refresh_collection:
            collection.delete_many({})

        # NOTE(review): Collection.count() and Collection.insert() are
        # deprecated in PyMongo 3.x and removed in 4.0 -- confirm the target
        # driver version before upgrading.
        initial_collection_count = collection.count()

        # Only the parsed list is needed below, so the file is closed before
        # the (potentially long) insert loop starts.
        with open(json_json_files_to_process, "r") as fj:
            json_files_to_to_process = json.load(fj)

        total_inserted = 0
        for json_file_dict in json_files_to_to_process:
            json_file_name = json_file_dict["data_json_file"]
            print("Loading '%s'" % json_file_name)

            data_dict = data_dict_load(json_file_name)

            inserted_from_file = 0
            # Counting from 1 makes the progress message report the number of
            # documents actually inserted so far.
            for inserted_from_file, datum_key in enumerate(data_dict, start=1):
                # check_keys=False turns off the driver's key validation, so
                # the data itself must not use keys containing "." or "$".
                collection.insert(data_dict[datum_key], check_keys=False)

                if inserted_from_file % 500 == 0:
                    print("Inserted '%s' documents" % inserted_from_file)
            total_inserted += inserted_from_file

        print("Inserted '%s' total documents" % total_inserted)

        print("Added %s items in collection '%s' housed in database '%s'" %
              (collection.count() - initial_collection_count, collection_name,
               database_name))
    finally:
        client.close()
def main(json_json_files_to_process, runtime_json, collection_name_override=None):
    """Insert the contents of a batch of JSON data files into MongoDB.

    Args:
        json_json_files_to_process: Path to a JSON file holding a list of objects; each object's "data_json_file" entry is passed to ``data_dict_load``.
        runtime_json: Path to a JSON file with a "mongo_db_config" object that provides "connection_string", "database_name", "refresh_collection" and, when no override is given, "collection_name".
        collection_name_override: Collection name to use instead of the configured one; None means use the configured name.

    Each value of every loaded data dictionary becomes one inserted document. A truthy "refresh_collection" empties the collection before loading.
    """
    with open(runtime_json, "r") as fj:
        runtime_config = json.load(fj)

    mongo_db_config = runtime_config["mongo_db_config"]
    connection_string = mongo_db_config["connection_string"]
    database_name = mongo_db_config["database_name"]

    if collection_name_override is None:
        collection_name = mongo_db_config["collection_name"]
    else:
        collection_name = collection_name_override

    refresh_collection = mongo_db_config["refresh_collection"]
    print("Connecting to MongoDB '%s'" % connection_string)
    client = pymongo.MongoClient(connection_string)

    try:
        database = client[database_name]
        collection = database[collection_name]

        if refresh_collection:
            collection.delete_many({})

        # NOTE(review): count() and insert() are deprecated in PyMongo 3.x and removed in 4.0 -- confirm the driver version in use.
        initial_collection_count = collection.count()

        # Parse the batch list and close its file before the insert loop begins.
        with open(json_json_files_to_process, "r") as fj:
            json_files_to_to_process = json.load(fj)

        grand_total = 0
        for json_file_dict in json_files_to_to_process:
            json_file_name = json_file_dict["data_json_file"]
            print("Loading '%s'" % json_file_name)

            data_dict = data_dict_load(json_file_name)

            file_total = 0
            for datum_key in data_dict:
                # Key validation is disabled here, so keys must not contain "." or "$".
                collection.insert(data_dict[datum_key], check_keys=False)
                file_total += 1

                # Report after the increment so the number matches the documents inserted so far.
                if file_total % 500 == 0:
                    print("Inserted '%s' documents" % file_total)
            grand_total += file_total

        print("Inserted '%s' total documents" % grand_total)

        print("Added %s items in collection '%s' housed in database '%s'" % (collection.count() - initial_collection_count, collection_name, database_name))
    finally:
        # Always release the connection, including when a load or insert raises.
        client.close()
def main(hdf5_base_name, batch_json_file_name, data_template_json, refresh_template=True, output_directory=None):
    """Convert batches of JSON data files to HDF5 matrices using a template.

    Steps: (1) for every batch, build a translation template and write it as "<hdf5_base_name>_<batch_id>_data_template.json"; (2) merge the per-batch templates into "<hdf5_base_name>_master_data_template.json"; (3) export every batch to "<hdf5_base_name>_<batch_id>.hdf5"; (4) combine the exported files into "<hdf5_base_name>_combined.hdf5".

    Args:
        hdf5_base_name: Prefix used for every generated file name.
        batch_json_file_name: Path to a JSON list of objects with "data_json_file" and "batch_id" entries and an optional "sort_order_file_name" entry.
        data_template_json: Path to the JSON template passed to ``expand_template_dict``.
        refresh_template: Not used by this function; kept so existing callers keep working.
        output_directory: Directory for the generated files. Defaults to the directory of the first data JSON file.
    """

    def _write_json(obj, path):
        # Serialize to a string first: json.dump streams into the file, so a
        # sort_keys failure part-way through would leave partial output that
        # the fallback then appends to, producing invalid JSON.
        try:
            text = json.dumps(obj, indent=4, separators=(", ", ": "), sort_keys=True)
        except TypeError:
            text = json.dumps(obj)
        with open(path, "w") as fjw:
            fjw.write(text)

    with open(batch_json_file_name) as fj:  # Load names of files to process
        batch_list_dict = json.load(fj)

    data_json_files = [x["data_json_file"] for x in batch_list_dict]

    sort_order_dict = {}
    batch_ids = []
    for list_dict in batch_list_dict:
        batch_id = list_dict["batch_id"]
        batch_ids.append(batch_id)
        if "sort_order_file_name" in list_dict:
            sort_order_dict[batch_id] = list_dict["sort_order_file_name"]

    if output_directory is None:
        output_directory = os.path.abspath(os.path.split(data_json_files[0])[0])

    generated_data_templates_names = []
    for batch_number, data_json_file in zip(batch_ids, data_json_files):  # For each subset generate a template
        data_dict = data_dict_load(data_json_file)

        # Re-read the template for every batch so each expansion starts from a fresh copy.
        with open(data_template_json, "r") as f:
            data_template_dict = json.load(f)

        data_template_dict = expand_template_dict(data_dict, data_template_dict)
        data_translate_dict = build_translation_dict(data_dict, data_template_dict)
        data_translate_dict = add_offsets_to_translation_dict(data_translate_dict)
        data_translate_dict_json_name = os.path.join(output_directory, hdf5_base_name + "_" + str(batch_number) + "_data_template.json")

        generated_data_templates_names.append(data_translate_dict_json_name)

        print("Generated: '%s'" % data_translate_dict_json_name)

        _write_json(data_translate_dict, data_translate_dict_json_name)

    master_data_translate_dict = []
    for data_template_name in generated_data_templates_names:  # Combine templates into a single master template
        print("Merging '%s' to master data template" % data_template_name)
        with open(data_template_name) as fj:
            data_translate_dict = json.load(fj)

        master_data_translate_dict = merge_data_translate_dicts(master_data_translate_dict, data_translate_dict)

    master_data_translate_dict_name = os.path.join(output_directory, hdf5_base_name + "_master_data_template.json")
    _write_json(master_data_translate_dict, master_data_translate_dict_name)

    generated_hdf5_file_names = []
    total_number_of_rows = 0
    for batch_number, data_json_file in zip(batch_ids, data_json_files):  # Export each subset into a HDF5 matrix
        data_dict = data_dict_load(data_json_file)

        total_number_of_rows += len(data_dict)

        hdf5_file_name = os.path.join(output_directory, hdf5_base_name + "_" + str(batch_number) + ".hdf5")
        generated_hdf5_file_names.append(hdf5_file_name)

        if batch_number in sort_order_dict:
            with open(sort_order_dict[batch_number], "r") as fj:
                sort_order_list = json.load(fj)
        else:
            sort_order_list = None

        # Close each per-batch file as soon as it is written; the combine step below receives the files by name.
        with h5py.File(hdf5_file_name, "w") as f5p:
            build_hdf5_matrix(f5p, data_dict, master_data_translate_dict, sort_order_list)

    print("Exported %s rows across %s files" % (total_number_of_rows, len(data_json_files)))

    all_hdf5_file_name = os.path.join(output_directory, hdf5_base_name + "_combined.hdf5")
    with h5py.File(all_hdf5_file_name, "w") as combined_hdf5:
        combine_exported_hdf5_files_into_single_file(combined_hdf5, generated_hdf5_file_names, total_number_of_rows)
Ejemplo n.º 4
0
def main(hdf5_base_name,
         batch_json_file_name,
         data_template_json,
         refresh_template=True,
         output_directory=None):
    """Convert a JSON file to a HDF5 matrix format using a template.

    The function generates one translation template per batch, merges them
    into a master template, exports each batch to its own HDF5 file and
    finally combines the exported files into a single HDF5 file. All
    generated file names start with ``hdf5_base_name`` and are placed in
    ``output_directory``.

    Args:
        hdf5_base_name: Prefix used for every generated file name.
        batch_json_file_name: Path to a JSON list of objects with
            "data_json_file" and "batch_id" entries and an optional
            "sort_order_file_name" entry.
        data_template_json: Path to the JSON template passed to
            ``expand_template_dict``.
        refresh_template: Not used by this function; kept so existing
            callers keep working.
        output_directory: Directory for the generated files. Defaults to
            the directory of the first data JSON file.
    """

    def _write_json(obj, path):
        # Build the text before touching the file. json.dump writes
        # incrementally, so when sort_keys raised part-way through, the
        # fallback dump used to be appended after partial output.
        try:
            text = json.dumps(obj,
                              indent=4,
                              separators=(", ", ": "),
                              sort_keys=True)
        except TypeError:
            text = json.dumps(obj)
        with open(path, "w") as fjw:
            fjw.write(text)

    with open(batch_json_file_name) as fj:  # Load names of files to process
        batch_list_dict = json.load(fj)

    data_json_files = [x["data_json_file"] for x in batch_list_dict]

    sort_order_dict = {}
    batch_ids = []
    for list_dict in batch_list_dict:
        batch_id = list_dict["batch_id"]
        batch_ids.append(batch_id)
        if "sort_order_file_name" in list_dict:
            sort_order_dict[batch_id] = list_dict["sort_order_file_name"]

    if output_directory is None:
        output_directory = os.path.abspath(
            os.path.split(data_json_files[0])[0])

    # For each subset generate a template
    generated_data_templates_names = []
    for batch_number, data_json_file in zip(batch_ids, data_json_files):
        data_dict = data_dict_load(data_json_file)

        # Re-read the template for every batch so each expansion starts
        # from a fresh copy.
        with open(data_template_json, "r") as f:
            data_template_dict = json.load(f)

        data_template_dict = expand_template_dict(data_dict,
                                                  data_template_dict)
        data_translate_dict = build_translation_dict(data_dict,
                                                     data_template_dict)
        data_translate_dict = add_offsets_to_translation_dict(
            data_translate_dict)
        data_translate_dict_json_name = os.path.join(
            output_directory,
            hdf5_base_name + "_" + str(batch_number) + "_data_template.json")

        generated_data_templates_names.append(data_translate_dict_json_name)

        print("Generated: '%s'" % data_translate_dict_json_name)

        _write_json(data_translate_dict, data_translate_dict_json_name)

    # Combine templates into a single master template
    master_data_translate_dict = []
    for data_template_name in generated_data_templates_names:
        print("Merging '%s' to master data template" % data_template_name)
        with open(data_template_name) as fj:
            data_translate_dict = json.load(fj)

        master_data_translate_dict = merge_data_translate_dicts(
            master_data_translate_dict, data_translate_dict)

    master_data_translate_dict_name = os.path.join(
        output_directory, hdf5_base_name + "_master_data_template.json")
    _write_json(master_data_translate_dict, master_data_translate_dict_name)

    # Export each subset into a HDF5 matrix
    generated_hdf5_file_names = []
    total_number_of_rows = 0
    for batch_number, data_json_file in zip(batch_ids, data_json_files):
        data_dict = data_dict_load(data_json_file)

        total_number_of_rows += len(data_dict)

        hdf5_file_name = os.path.join(
            output_directory,
            hdf5_base_name + "_" + str(batch_number) + ".hdf5")
        generated_hdf5_file_names.append(hdf5_file_name)

        if batch_number in sort_order_dict:
            with open(sort_order_dict[batch_number], "r") as fj:
                sort_order_list = json.load(fj)
        else:
            sort_order_list = None

        # Close each per-batch file once written; the combine step below
        # is handed the files by name.
        with h5py.File(hdf5_file_name, "w") as f5p:
            build_hdf5_matrix(f5p, data_dict, master_data_translate_dict,
                              sort_order_list)

    print("Exported %s rows across %s files" %
          (total_number_of_rows, len(data_json_files)))

    all_hdf5_file_name = os.path.join(output_directory,
                                      hdf5_base_name + "_combined.hdf5")
    with h5py.File(all_hdf5_file_name, "w") as combined_hdf5:
        combine_exported_hdf5_files_into_single_file(
            combined_hdf5, generated_hdf5_file_names, total_number_of_rows)