def convert_and_save_source_data_to_eggroll_table(config_data):
    """Split a source data file into guest/host tables and record their metadata.

    Reads the following keys from ``config_data`` (via ``.get``):
    ``file_path``, ``overlap_ratio``, ``guest_split_ratio``,
    ``n_feature_guest``, ``num_samples`` and ``balanced``. All but
    ``file_path`` are forwarded unchanged to
    ``load_guest_host_generators_for_UCI_Credit_Card``.

    The guest and host data are stored with ``save_data_to_eggroll_table``
    under a shared namespace, in tables named ``<table_name>_guest`` and
    ``<table_name>_host``. A metadata summary is written to
    ``./guest_host_table_metadata.json`` and also returned.

    Args:
        config_data: Mapping holding the configuration keys listed above.

    Returns:
        dict: Namespaces, table names and row counts of both tables,
        ``n_feature_guest``, and ``overlap_index_range`` with the first and
        last entries of the overlap indexes converted to ``int``.

    Raises:
        SystemExit: With status 1 when ``file_path`` is missing from the
            configuration or does not exist on disk.
    """
    file_path = config_data.get("file_path")
    overlap_ratio = config_data.get("overlap_ratio")
    guest_split_ratio = config_data.get("guest_split_ratio")
    n_feature_guest = config_data.get("n_feature_guest")
    num_samples = config_data.get("num_samples")
    balanced = config_data.get("balanced")

    # A missing key yields None, which os.path.exists() rejects with a
    # TypeError; treat it the same as a path that does not exist.
    if not file_path or not os.path.exists(file_path):
        print(file_path, "is not exist, please check the configure")
        # Exit with a non-zero status so callers and shells see the failure.
        sys.exit(1)

    guest_data_generator, host_data_generator, overlap_indexes = \
        load_guest_host_generators_for_UCI_Credit_Card(
            file_path=file_path,
            num_samples=num_samples,
            overlap_ratio=overlap_ratio,
            guest_split_ratio=guest_split_ratio,
            guest_feature_num=n_feature_guest,
            balanced=balanced)

    namespace, table_name = generate_table_namespace_n_name(file_path)
    guest_table_name = table_name + "_guest"
    host_table_name = table_name + "_host"

    guest_table = save_data_to_eggroll_table(data=guest_data_generator,
                                             namespace=namespace,
                                             table_name=guest_table_name)
    host_table = save_data_to_eggroll_table(data=host_data_generator,
                                            namespace=namespace,
                                            table_name=host_table_name)

    guest_table_count = guest_table.count()
    host_table_count = host_table.count()

    # Debug output for the two boundary entries of the overlap indexes.
    print("overlap_indexes[0]", overlap_indexes[0], type(overlap_indexes[0]))
    print("overlap_indexes[-1]", overlap_indexes[-1], type(overlap_indexes[-1]))

    # Save data meta to a json file. The boundaries are cast to int before
    # being handed to json.dump.
    output = {
        "guest_table_namespace": namespace,
        "guest_table_name": guest_table_name,
        "guest_table_count": guest_table_count,
        "n_feature_guest": n_feature_guest,
        "host_table_namespace": namespace,
        "host_table_name": host_table_name,
        "host_table_count": host_table_count,
        "overlap_index_range": {
            "start": int(overlap_indexes[0]),
            "end": int(overlap_indexes[-1]),
        },
    }
    with open('./guest_host_table_metadata.json', 'w') as outfile:
        json.dump(output, outfile)

    print("------------save data finish!-----------------")
    print("namespace:%s, guest_table_name:%s, host_table_name:%s"
          % (namespace, guest_table_name, host_table_name))
    print(output)
    return output
def test_save_data_to_same_eggroll_table(self):
    """Write the same records twice to one table in a fresh namespace.

    There are no assertions: the test passes as long as neither call to
    ``save_data_to_eggroll_table`` raises.
    """
    records = [(1, 111), (3, 333), (4, 444), (6, 666)]
    # A uuid1-based namespace keeps this run separate from earlier runs.
    unique_namespace = str(uuid.uuid1())
    target_table = "table_name"

    # Two identical saves against the same namespace and table name.
    for _ in range(2):
        save_data_to_eggroll_table(records, unique_namespace, target_table, partition=1)
def test_save_data_to_eggroll_table(self):
    """Save key/value pairs and check they can be read back unchanged.

    The pairs are written with ``save_data_to_eggroll_table``, the table is
    reopened through ``table(table_name, namespace)``, and everything it
    collects is compared against the input by key.
    """
    expected_pairs = [(1, 111), (3, 333), (4, 444), (6, 666)]
    unique_namespace = str(uuid.uuid1())
    target_table = "table_name"

    save_data_to_eggroll_table(expected_pairs, unique_namespace, target_table, partition=1)

    stored_table = table(target_table, unique_namespace)
    # Index the collected entries by their first element for lookup below.
    stored_by_key = {entry[0]: entry[1] for entry in stored_table.collect()}

    assert len(expected_pairs) == len(stored_by_key)
    for pair in expected_pairs:
        assert pair[1] == stored_by_key[pair[0]]