def convert_and_save_source_data_to_eggroll_table(config_data):
    """Split a source data file into guest/host parts and save both as eggroll tables.

    The following keys are read from ``config_data`` with ``.get``:
    ``file_path``, ``overlap_ratio``, ``guest_split_ratio``,
    ``n_feature_guest``, ``num_samples`` and ``balanced``. All of them are
    forwarded to ``load_guest_host_generators_for_UCI_Credit_Card``.

    Side effects:
        * Saves the guest and host data to two tables named
          ``<table_name>_guest`` and ``<table_name>_host`` in one namespace.
        * Writes the returned metadata to ``./guest_host_table_metadata.json``.
        * Prints progress/debug information to stdout.

    Returns:
        A dict with the namespace, name and row count of both tables, the
        configured ``n_feature_guest`` and an ``overlap_index_range`` entry
        holding the first and last element of the overlap indexes as ints.

    Raises:
        SystemExit: with exit status 1 when ``file_path`` is missing from the
            configuration or does not exist on disk.
    """
    file_path = config_data.get("file_path")
    overlap_ratio = config_data.get("overlap_ratio")
    guest_split_ratio = config_data.get("guest_split_ratio")
    n_feature_guest = config_data.get("n_feature_guest")
    num_samples = config_data.get("num_samples")
    balanced = config_data.get("balanced")

    # A missing "file_path" key yields None, which os.path.exists rejects with
    # a TypeError; treat it the same way as a path that does not exist.
    if file_path is None or not os.path.exists(file_path):
        print(file_path, "is not exist, please check the configure")
        # Exit with a non-zero status so callers and shells see the failure.
        sys.exit(1)

    guest_data_generator, host_data_generator, overlap_indexes = load_guest_host_generators_for_UCI_Credit_Card(
        file_path=file_path,
        num_samples=num_samples,
        overlap_ratio=overlap_ratio,
        guest_split_ratio=guest_split_ratio,
        guest_feature_num=n_feature_guest,
        balanced=balanced)

    # Both tables share one namespace and differ only by their name suffix.
    namespace, table_name = generate_table_namespace_n_name(file_path)
    guest_table_name = table_name + "_guest"
    host_table_name = table_name + "_host"
    guest_table = save_data_to_eggroll_table(data=guest_data_generator,
                                             namespace=namespace,
                                             table_name=guest_table_name)
    host_table = save_data_to_eggroll_table(data=host_data_generator,
                                            namespace=namespace,
                                            table_name=host_table_name)

    guest_table_count = guest_table.count()
    host_table_count = host_table.count()

    # Debug output for the two values that end up in "overlap_index_range".
    print("overlap_indexes[0]", overlap_indexes[0], type(overlap_indexes[0]))
    print("overlap_indexes[-1]", overlap_indexes[-1], type(overlap_indexes[-1]))

    # Metadata describing the saved tables; int() keeps the index values
    # serializable by json.dump below.
    output = {
        "guest_table_namespace": namespace,
        "guest_table_name": guest_table_name,
        "guest_table_count": guest_table_count,
        "n_feature_guest": n_feature_guest,
        "host_table_namespace": namespace,
        "host_table_name": host_table_name,
        "host_table_count": host_table_count,
        "overlap_index_range": {
            "start": int(overlap_indexes[0]),
            "end": int(overlap_indexes[-1])
        },
    }

    # save data meta to a json file
    with open('./guest_host_table_metadata.json', 'w') as outfile:
        json.dump(output, outfile)

    print("------------save data finish!-----------------")
    print("namespace:%s, guest_table_name:%s, host_table_name:%s" %
          (namespace, guest_table_name, host_table_name))
    print(output)

    return output
Beispiel #2
0
    def test_save_data_to_same_eggroll_table(self):
        """Save the same rows twice under one namespace and table name.

        The test contains no assertions; it passes as long as neither save
        call raises.
        """
        rows = [(1, 111), (3, 333), (4, 444), (6, 666)]
        unique_namespace = str(uuid.uuid1())
        name = "table_name"

        # Two identical writes to the same (namespace, table name) pair.
        for _ in range(2):
            save_data_to_eggroll_table(rows, unique_namespace, name, partition=1)
Beispiel #3
0
    def test_save_data_to_eggroll_table(self):
        """Save rows to a table, read the table back and compare with the input."""
        expected_rows = [(1, 111), (3, 333), (4, 444), (6, 666)]
        unique_namespace = str(uuid.uuid1())
        name = "table_name"
        save_data_to_eggroll_table(expected_rows, unique_namespace, name, partition=1)

        # Re-open the table by name and namespace, then index what it holds
        # by the first element of every collected item.
        stored_table = table(name, unique_namespace)
        stored = {entry[0]: entry[1] for entry in stored_table.collect()}

        assert len(expected_rows) == len(stored)

        # Every saved key must map to the value it was saved with.
        for key, value in expected_rows:
            assert value == stored[key]