def test_sparse_abnormal_data(self):
    """Run binning on sparse instances that each contain one 'nan' value.

    Builds ``self.data_num`` instances. Each holds a ``SparseVector`` whose
    first ``self.feature_num`` positions are filled with ``0..feature_num-1``,
    except that one position per row (``i % self.feature_num``) is replaced
    by the string ``'nan'``. The vectors are declared 15 entries wider than
    the populated range, so the trailing positions are left unset.

    The binning object is created with ``abnormal_list=['nan']``, fitted on
    the table and then used to transform it to bin numbers.

    NOTE: this test only prints intermediate results; it makes no assertions.
    """
    final_result = []
    dense_rows = []
    sparse_inst_shape = self.feature_num + 15

    for i in range(self.data_num):
        # Deterministic feature values with a single abnormal entry per row.
        tmp = list(range(self.feature_num))
        tmp[i % self.feature_num] = 'nan'
        data_index = list(range(self.feature_num))

        sparse_inst = SparseVector(data_index, tmp, shape=sparse_inst_shape)
        if i == 0:
            aa = sparse_inst.get_data(0, 'a')
            print('in for loop: {}, type: {}'.format(aa, type(aa)))

        inst = Instance(inst_id=i, features=sparse_inst, label=0)
        final_result.append((str(i), inst))

        # Dense mirror of the sparse row: positions without an entry become 0.
        value_at = dict(zip(data_index, tmp))
        dense_rows.append(
            [value_at.get(col, 0) for col in range(sparse_inst_shape)]
        )

    abnormal_value = final_result[0][1].features.get_data(0, 'a')
    print('abnormal_value: {}, type: {}'.format(abnormal_value, type(abnormal_value)))

    table = session.parallelize(final_result, include_key=True, partition=1)
    header = ['x' + str(i) for i in range(sparse_inst_shape)]
    numpy_table = np.array(dense_rows)
    table.schema = {'header': header}
    # Registered on the test case; presumably cleaned up in tearDown -- confirm.
    self.used_data_set.append(table)

    bin_obj = self._bin_obj_generator(abnormal_list=['nan'])
    split_points = bin_obj.fit_split_points(table)
    print('split_points: {}'.format(split_points))
    print(numpy_table)

    trans_result = bin_obj.transform(table, transform_cols_idx=-1, transform_type='bin_num')
    trans_result = trans_result.collect()
    print('transform result: ')
    for k, v in trans_result:
        value = v.features.get_all_data()
        value_list = [(value_k, value_v) for value_k, value_v in value]
        print(k, value_list)
def test_instance(self):
    """Check SparseVector construction, counts and element access.

    A vector of shape 100 is populated at the square indices 1, 4, ..., 81
    with the cubes 1, 8, ..., 729. The test verifies:

    * the reported shape and the number of stored entries,
    * ``count_zeros`` / ``count_non_zeros``,
    * ``get_data`` returns the stored value for populated indices,
    * ``get_data`` returns the supplied default for unpopulated indices,
    * ``get_all_data`` yields exactly the stored (index, value) pairs.
    """
    indices = [i * i for i in range(1, 10)]
    data = [i ** 3 for i in range(1, 10)]
    shape = 100
    sparse_data = SparseVector(indices, data, shape)

    # assertEqual reports both operands on failure, unlike assertTrue(a == b).
    self.assertEqual(sparse_data.shape, shape)
    self.assertEqual(len(sparse_data.sparse_vec), 9)
    self.assertEqual(sparse_data.count_zeros(), 91)
    self.assertEqual(sparse_data.count_non_zeros(), 9)

    for idx, val in zip(indices, data):
        self.assertEqual(sparse_data.get_data(idx), val)

    # Every index that was not populated must fall back to the given default.
    populated = set(indices)
    for i in range(100):
        if i in populated:
            continue
        self.assertEqual(sparse_data.get_data(i, i ** 4), i ** 4)

    self.assertEqual(dict(sparse_data.get_all_data()), dict(zip(indices, data)))