def test_dense_output_format(self):
    """Dense input with no explicit output format is read back as dense rows.

    Runs ``DataIO`` as guest on ``self.args1`` with only ``input_format``
    configured, then checks the rows keyed ``'a'`` and ``'b'``: both must be
    ``Instance`` objects, and row ``'a'`` must carry weight 1.0, no label and
    a 6-element float64 ``ndarray`` of features.
    """
    reader = DataIO()
    reader.set_tracker(self.tracker)
    component_params = {
        "DataIOParam": {
            "input_format": "dense",
        },
        "role": {
            "guest": [9999],
            "host": [10000],
            "arbiter": [10000],
        },
        "local": {
            "role": "guest",
            "party_id": 9999,
        },
    }
    reader.run(component_params, self.args1)
    result = dict(reader.save_data().collect())

    self.assertTrue(type(result['a']).__name__ == "Instance")
    self.assertTrue(type(result['b']).__name__ == "Instance")

    vala = result['a']
    features = vala.features
    self.assertTrue(np.abs(vala.weight - 1.0) < consts.FLOAT_ZERO)
    self.assertTrue(type(features).__name__ == "ndarray")
    # Identity check instead of ``== None``: equality can be overloaded
    # (e.g. by array-like labels) and would then not test for "no label".
    self.assertIsNone(vala.label)
    self.assertTrue(features.shape[0] == 6)
    self.assertTrue(features.dtype == "float64")
def test_sparse_output_format(self):
    """Dense input converted with ``output_format == "sparse"``.

    Inspects the row keyed ``'a'`` produced from ``self.args1``: its features
    must be a ``SparseVector`` holding 4 stored entries with a shape of 6.
    """
    io_conf = {"output_format": "sparse", "input_format": "dense"}
    params = {
        "DataIOParam": io_conf,
        "role": {"guest": [9999], "host": [10000], "arbiter": [10000]},
        "local": {"role": "guest", "party_id": 9999},
    }

    dataio = DataIO()
    dataio.set_tracker(self.tracker)
    dataio.run(params, self.args1)

    rows_by_key = dict(dataio.save_data().collect())
    sparse_features = rows_by_key['a'].features

    self.assertTrue(type(sparse_features).__name__ == "SparseVector")
    stored_entries = len(sparse_features.sparse_vec)
    self.assertTrue(stored_entries == 4)
    self.assertTrue(sparse_features.shape == 6)
def test_with_label(self):
    """Label extraction by column name (``label_name == "x3"``).

    After running on ``self.args1`` with ``with_label`` enabled, the row keyed
    ``'a'`` must have label -1 and 5 remaining feature values.
    """
    params = {
        "DataIOParam": {
            "output_format": "dense",
            "input_format": "dense",
            "with_label": True,
            "label_name": "x3",
        },
        "role": {"guest": [9999], "host": [10000], "arbiter": [10000]},
        "local": {"role": "guest", "party_id": 9999},
    }

    dataio = DataIO()
    dataio.set_tracker(self.tracker)
    dataio.run(params, self.args1)

    instance_a = dict(dataio.save_data().collect())['a']
    extracted_label = instance_a.label
    remaining = instance_a.features

    self.assertTrue(extracted_label == -1)
    self.assertTrue(remaining.shape[0] == 5)
def test_sparse_output_format(self):
    """Sparse ("label fid:val ...") input kept in sparse output format.

    For each of the first 100 collected rows, checks the feature shape
    (``self.max_feature + 1``), the label (``i % 2``) and that the stored
    sparse mapping equals the ``fid -> value`` pairs parsed from
    ``self.data``.
    """
    reader = DataIO()
    reader.set_tracker(self.tracker)
    component_params = {
        "DataIOParam": {
            "output_format": "sparse",
            "input_format": "sparse",
            "delimitor": ' ',
            # NOTE(review): key is spelled "defualt_value" in the original;
            # it looks like a typo for "default_value" but is kept as-is
            # because the effect of correcting it cannot be confirmed here.
            "defualt_value": 2**30,
        },
        "role": {
            "guest": [9999],
            "host": [10000],
            "arbiter": [10000],
        },
        "local": {
            "role": "guest",
            "party_id": 9999,
        },
    }
    reader.run(component_params, self.args)

    # The original bound the result to ``data`` but then read ``insts``,
    # raising NameError; materialise it as an indexable list under the name
    # the assertions actually use.
    insts = list(reader.save_data().collect())

    for i in range(100):
        inst = insts[i][1]
        self.assertTrue(inst.features.get_shape() == self.max_feature + 1)
        self.assertTrue(inst.label == i % 2)

        # First token of the raw row is skipped; the rest are "fid:val".
        row = self.data[i][1].split(" ")
        original_feat = {}
        for token in row[1:]:
            fid, val = token.split(":", -1)
            original_feat[int(fid)] = float(val)

        self.assertTrue(original_feat == inst.features.sparse_vec)
def test_sparse_output_format(self):
    """Sparse input, sparse output: verify every parsed value per row.

    Walks the first 100 collected rows and asserts the feature container is a
    ``SparseVector`` of shape ``self.max_feature + 1``, the label equals
    ``i % 2``, and each ``fid:val`` token of the raw row is retrievable via
    ``get_data`` within ``consts.FLOAT_ZERO``.
    """
    params = {
        "DataIOParam": {
            "output_format": "sparse",
            "input_format": "sparse",
            "delimitor": ' ',
        },
        "role": {"guest": [9999], "host": [10000], "arbiter": [10000]},
        "local": {"role": "guest", "party_id": 9999},
    }

    component = DataIO()
    component.set_tracker(TrackerMock())
    component.run(params, self.args)
    collected = list(component.save_data().collect())

    expected_shape_of = lambda: self.max_feature + 1
    for idx in range(100):
        vec = collected[idx][1].features
        self.assertTrue(type(vec).__name__ == "SparseVector")
        self.assertTrue(vec.get_shape() == expected_shape_of())
        self.assertTrue(collected[idx][1].label == idx % 2)

        tokens = self.data[idx][1].split(" ")
        # tokens[0] is skipped, exactly as the index loop starting at 1 did.
        for token in tokens[1:]:
            fid, val = token.split(":", -1)
            gap = np.fabs(vec.get_data(int(fid)) - float(val))
            self.assertTrue(gap < consts.FLOAT_ZERO)
def test_missing_value_fill(self):
    """Designated missing-value fill with ``default_value == 100``.

    Runs on ``self.args2`` with ``missing_fill`` enabled and the
    ``"designated"`` method, then asserts that positions 1 through 4 of the
    features of row ``'a'`` all read back as 100.
    """
    fill_conf = {
        "output_format": "sparse",
        "input_format": "dense",
        "default_value": 100,
        "with_label": False,
        "missing_fill": True,
        "missing_fill_method": "designated",
        "data_type": "int",
    }
    params = {
        "DataIOParam": fill_conf,
        "role": {"guest": [9999], "host": [10000], "arbiter": [10000]},
        "local": {"role": "guest", "party_id": 9999},
    }

    dataio = DataIO()
    dataio.set_tracker(self.tracker)
    dataio.run(params, self.args2)

    filled = dict(dataio.save_data().collect())['a'].features
    for position in (1, 2, 3, 4):
        self.assertTrue(filled.get_data(position) == 100)
def test_dense_output_format(self):
    """Sparse ("label fid:val ...") input expanded to dense ``ndarray`` rows.

    For each of the first 100 collected rows, checks the container type, the
    length (``self.max_feature + 1``), the label (``i % 2``) and that every
    dense value matches the vector rebuilt from ``self.data`` within
    ``consts.FLOAT_ZERO``.
    """
    dataio = DataIO()
    dataio.set_tracker(TrackerMock())
    component_params = {
        "output_format": "dense",
        "input_format": "sparse",
        "delimitor": ' ',
    }
    cpn_input = get_cpn_input(self.dataset, component_params)
    dataio.run(cpn_input)
    insts = list(dataio.save_data().collect())

    for i in range(100):
        features = insts[i][1].features
        self.assertTrue(type(features).__name__ == "ndarray")
        self.assertTrue(features.shape[0] == self.max_feature + 1)
        self.assertTrue(insts[i][1].label == i % 2)

        # Rebuild the expected dense row: zeros everywhere except the
        # positions named by the "fid:val" tokens (first token is skipped).
        row = self.data[i][1].split(" ")
        ori_feat = np.zeros(self.max_feature + 1, dtype="float64")
        for token in row[1:]:
            fid, val = token.split(":", -1)
            ori_feat[int(fid)] = float(val)

        # The original wrote ``np.abs(diff).any() < FLOAT_ZERO``, which
        # compares a bool against the tolerance instead of comparing each
        # difference; apply the tolerance element-wise and require all pass.
        self.assertTrue(
            np.all(np.abs(ori_feat - features) < consts.FLOAT_ZERO))
def test_tag_with_value_dense_output_format(self):
    """Tag-with-value input ("tag:val ...") expanded to dense rows.

    Builds the expected column layout by sorting every distinct tag found in
    ``self.data_with_value``, rebuilds each expected dense row from the raw
    text, and compares it with the corresponding collected feature vector
    within ``consts.FLOAT_ZERO``.
    """
    dataio = DataIO()
    dataio.set_tracker(TrackerMock())
    component_params = {
        "DataIOParam": {
            "output_format": "dense",
            "input_format": "tag",
            "delimitor": ' ',
            "data_type": "float",
            "with_label": False,
            "tag_with_value": True,
        },
        "role": {
            "guest": [9999],
            "host": [10000],
            "arbiter": [10000],
        },
        "local": {
            "role": "guest",
            "party_id": 9999,
        },
    }
    dataio.run(component_params, self.args2)
    tag_insts = dataio.save_data()
    features = [inst.features for _, inst in tag_insts.collect()]

    # Column index of a tag is its rank among all distinct tags, sorted.
    tags = set()
    for row in self.data_with_value:
        tags |= {item.split(":")[0] for item in row[1].split(" ", -1)}
    tags = sorted(tags)
    tag_dict = dict(zip(tags, range(len(tags))))

    for i in range(len(self.data_with_value)):
        ori_feature = np.zeros(len(tags), dtype='float64')
        for tag_with_value in self.data_with_value[i][1].split(" ", -1):
            parts = tag_with_value.split(":", -1)
            ori_feature[tag_dict[parts[0]]] = float(parts[1])

        # Two fixes versus the original assertion:
        #  * compare against this row's vector (``features[i]``), not the
        #    whole list of vectors;
        #  * apply the tolerance element-wise; ``.all() < FLOAT_ZERO``
        #    compared a bool with the tolerance and verified almost nothing.
        self.assertTrue(
            np.all(np.abs(ori_feature - features[i]) < consts.FLOAT_ZERO))
def test_sparse_output_format(self):
    """Dense input emitted as a ``SparseVector`` (params without role info).

    Looks at the row keyed ``'a'`` from ``self.args1`` and asserts the
    feature type name, 4 stored sparse entries, and a shape of 6.
    """
    params = {
        "DataIOParam": {
            "output_format": "sparse",
            "input_format": "dense",
        },
    }

    dataio = DataIO()
    dataio.set_tracker(self.tracker)
    dataio.run(params, self.args1)

    collected = dict(dataio.save_data().collect())
    vec = collected['a'].features

    type_name = type(vec).__name__
    self.assertTrue(type_name == "SparseVector")
    self.assertTrue(len(vec.sparse_vec) == 4)
    self.assertTrue(vec.shape == 6)
def test_with_label(self):
    """Label extraction by column position (``label_idx == 2``).

    After running on ``self.args1`` with ``with_label`` enabled, the row keyed
    ``'a'`` must have label -1 and 5 remaining feature values.
    """
    label_conf = {
        "output_format": "dense",
        "input_format": "dense",
        "with_label": True,
        "label_idx": 2,
    }

    dataio = DataIO()
    dataio.set_tracker(self.tracker)
    dataio.run({"DataIOParam": label_conf}, self.args1)

    instance_a = dict(dataio.save_data().collect())['a']
    extracted_label = instance_a.label
    remaining = instance_a.features

    self.assertTrue(extracted_label == -1)
    self.assertTrue(remaining.shape[0] == 5)
def test_tag_sparse_output_format(self):
    """Plain tag input (no values) emitted in sparse format.

    The expected sparse mapping for each row sets 1 at the index of every tag
    on that row, where a tag's index is its rank among all distinct tags in
    ``self.data`` after sorting. Each mapping must equal the collected
    ``sparse_vec`` at the same position.
    """
    params = {
        "DataIOParam": {
            "output_format": "sparse",
            "input_format": "tag",
            "delimitor": ' ',
            "data_type": "int",
            "with_label": False,
            "tag_with_value": False,
        },
        "role": {"guest": [9999], "host": [10000], "arbiter": [10000]},
        "local": {"role": "guest", "party_id": 9999},
    }

    component = DataIO()
    component.set_tracker(TrackerMock())
    component.run(params, self.args1)
    collected = component.save_data()
    vectors = [instance.features for _, instance in collected.collect()]

    # Gather the global tag vocabulary, then rank it alphabetically.
    vocabulary = set()
    for record in self.data:
        vocabulary.update(record[1].split(" ", -1))
    ordered_tags = sorted(vocabulary)
    index_of = {tag: rank for rank, tag in enumerate(ordered_tags)}

    for position, record in enumerate(self.data):
        expected = {
            index_of.get(tag): 1 for tag in record[1].split(" ", -1)
        }
        self.assertTrue(expected == vectors[position].sparse_vec)
def test_tag_with_value_sparse_output_format(self):
    """Tag-with-value input ("tag:val ...") emitted in sparse format.

    A tag's index is its rank among all distinct tags in
    ``self.data_with_value`` after sorting. For every ``tag:val`` token of
    every raw row, the value read back through ``get_data`` at that index
    must match ``val`` within ``consts.FLOAT_ZERO``.
    """
    dataio = DataIO()
    dataio.set_tracker(TrackerMock())
    component_params = {
        "output_format": "sparse",
        "input_format": "tag",
        "delimitor": ' ',
        "data_type": "float",
        "with_label": False,
        "tag_with_value": True,
        "tag_value_delimitor": ":",
    }
    cpn_input = get_cpn_input(self.dataset2, component_params)
    dataio.run(cpn_input)
    tag_insts = dataio.save_data()
    features = [inst.features for _, inst in tag_insts.collect()]

    tags = set()
    for row in self.data_with_value:
        tags |= {item.split(":")[0] for item in row[1].split(" ", -1)}
    tags = sorted(tags)
    tag_dict = dict(zip(tags, range(len(tags))))

    # The unused ``ori_feature`` dict from the original loop is dropped, and
    # each token is split once instead of twice.
    for i in range(len(self.data_with_value)):
        for tag_with_value in self.data_with_value[i][1].split(" ", -1):
            parts = tag_with_value.split(":")
            idx = tag_dict.get(parts[0])
            val = float(parts[1])
            self.assertTrue(
                np.abs(val - features[i].get_data(idx)) < consts.FLOAT_ZERO)
def test_sparse_output_format(self):
    """Sparse input, sparse output, driven through ``get_cpn_input``.

    Walks the first 100 collected rows and asserts the feature container is a
    ``SparseVector`` of shape ``self.max_feature + 1``, the label equals
    ``i % 2``, and each ``fid:val`` token of the raw row is retrievable via
    ``get_data`` within ``consts.FLOAT_ZERO``.
    """
    params = {
        "output_format": "sparse",
        "input_format": "sparse",
        "delimitor": ' ',
    }

    component = DataIO()
    component.set_tracker(TrackerMock())
    component.run(get_cpn_input(self.dataset, params))
    collected = list(component.save_data().collect())

    for idx in range(100):
        vec = collected[idx][1].features
        self.assertTrue(type(vec).__name__ == "SparseVector")
        self.assertTrue(vec.get_shape() == self.max_feature + 1)
        self.assertTrue(collected[idx][1].label == idx % 2)

        tokens = self.data[idx][1].split(" ")
        # tokens[0] is skipped, exactly as the index loop starting at 1 did.
        for token in tokens[1:]:
            fid, val = token.split(":", -1)
            gap = np.fabs(vec.get_data(int(fid)) - float(val))
            self.assertTrue(gap < consts.FLOAT_ZERO)