def fea_slt(i_f, t_f, o_f, k):
    """Select the top-k ranked features and write the reduced data set.

    Parameters
    ----------
    i_f : str
        Input CSV file; last column is the label (read via read_data).
    t_f : str
        CSV file holding an integer feature-ranking (indices, best first).
    o_f : str
        Output CSV path for the selected features plus the label column.
    k : int
        Number of top-ranked features to keep.
    """
    # FIX: np.int was removed in NumPy 1.24 — use the builtin int dtype.
    spt = np.loadtxt(t_f, int, delimiter=',')
    X, y = read_data(i_f)
    # Keep the first k ranked indices; sort them so column order matches X.
    X_new = X[:, np.sort(spt[:k])]
    # Re-attach the label as the last column before saving.
    X_new = np.c_[X_new, y]
    np.savetxt(o_f, X_new, fmt='%s', delimiter=',')
def get_mat(i_f, o_f, func):
    """Create one or more coding matrices and save them as CSV.

    Parameters
    ----------
    i_f : str
        Input data file (read via read_data).
    o_f : str or list of str
        Single output path, or a list of paths. With a list, ``func`` is
        called once per path — presumably ``func`` is stochastic so each
        file gets a different matrix (TODO confirm with callers).
    func : callable
        ``func(X, y) -> (matrix, info)``; only the matrix is saved.

    Raises
    ------
    TypeError
        If ``o_f`` is neither a str nor a list. (Was a bare ``Exception``;
        TypeError is more precise and is still caught by ``except Exception``.)
    """
    X, y = read_data(i_f)
    if isinstance(o_f, str):
        m, _ = func(X, y)  # second return value (info) is unused here
        np.savetxt(o_f, m, fmt='%d', delimiter=',')
    elif isinstance(o_f, list):
        for oo in o_f:
            m, _ = func(X, y)
            np.savetxt(oo, m, fmt='%d', delimiter=',')
    else:
        raise TypeError('Unknown value of o_f')
def norm(norm_obj, i_f, o_f):
    """Fit-transform the data in *i_f* with *norm_obj* and save to *o_f*.

    The label column is appended back after normalization so the output
    file keeps the same (features..., label) layout as the input.
    """
    features, labels = read_data(i_f)
    transformed = norm_obj.fit_transform(features, labels)
    with_labels = np.c_[transformed, labels]
    np.savetxt(o_f, with_labels, fmt='%s', delimiter=',')
def fea_slt_number(bcn, file_names, fs_tags, matrix_types, nor_dir, mat_dir, spt_dir, fea_dir, exp_num):
    """Determine the size of feature subsets.

    For every experiment repetition, data set, coding matrix and matrix
    column: rank features with each feature-selection tag, evaluate the
    base classifier over a grid of subset sizes on the validation split,
    and save the best subset size per tag to a ``*_fea_num.csv`` file.

    Parameters
    ----------
    bcn : str
        Base-classifier name (passed to cly_data and used in file names).
    file_names : iterable of str
        Data-set names; ``{nor_dir}{name}_{idx}_train/valid.csv`` must exist.
    fs_tags : iterable of str
        Feature-selection method tags (passed to fea_rank).
    matrix_types : iterable of str
        Coding-matrix type tags; matrices are read from ``mat_dir``.
    nor_dir, mat_dir, spt_dir, fea_dir : str
        Directories (with trailing separator) for normalized data, coding
        matrices, saved feature rankings, and subset-size outputs.
    exp_num : int
        Number of experiment repetitions.
    """
    if not os.path.exists(spt_dir):
        os.mkdir(spt_dir)
    if not os.path.exists(fea_dir):
        os.mkdir(fea_dir)
    alpha = 0.05  # iteration step (fraction of total feature count)
    beta = 0.02   # size of comparison interval
    for idx in range(exp_num):
        for fn in file_names:
            X_trn, y_trn = read_data('%s%s_%d_train.csv' % (nor_dir, fn, idx))
            X_vld, y_vld = read_data('%s%s_%d_valid.csv' % (nor_dir, fn, idx))
            fea_num = X_trn.shape[1]
            fea_step = math.ceil(fea_num * alpha)
            # Candidate subset sizes: fea_step, 2*fea_step, ... < fea_num.
            fea_k_list = list(range(fea_step, fea_num, fea_step))
            y_names = np.unique(y_trn)
            y_index = dict((c, i) for i, c in enumerate(y_names))
            for mc in matrix_types:
                print('processing %s %s %d...' % (fn, mc, idx))
                mat_file = '%s%s_%s_%d.csv' % (mat_dir, fn, mc, idx)
                # FIX: np.int was removed in NumPy 1.24 — use builtin int.
                mat = np.loadtxt(mat_file, int, delimiter=',')
                mat_len = mat.shape[1]
                for col_i in range(mat_len):
                    fea_file = '%s%s_%s_%s_%d_%d_fea_num.csv' % (
                        fea_dir, bcn, fn, mc, idx, col_i)
                    # Relabel samples with this column's code; 0 means the
                    # class does not take part in this binary problem.
                    y_code = mat[:, col_i]
                    ytrn_ = np.array([y_code[y_index[y]] for y in y_trn])
                    yvld_ = np.array([y_code[y_index[y]] for y in y_vld])
                    X_train = X_trn[ytrn_ != 0]
                    X_valid = X_vld[yvld_ != 0]
                    y_train = ytrn_[ytrn_ != 0]
                    y_valid = yvld_[yvld_ != 0]
                    clock_start = time.time()
                    fs_nums = []
                    fs_accs = []  # NOTE(review): collected but never saved — TODO confirm intent
                    for fs in fs_tags:
                        spt_file = '%s%s_%s_%s_%s_%d_%d.csv' % (
                            spt_dir, bcn, fn, mc, fs, idx, col_i)
                        rank_ind = fea_rank(X_train, y_train, fs)
                        np.savetxt(spt_file, rank_ind.T, delimiter=',', fmt='%d')
                        # FIX: dtype=np.int — removed in NumPy 1.24; use int.
                        fea = np.loadtxt(spt_file, delimiter=',', dtype=int)
                        accs = []
                        for fea_k in fea_k_list:
                            fea_ind = fea[:fea_k]
                            X_, y_ = X_train[:, fea_ind], y_train
                            X__, y__ = X_valid[:, fea_ind], y_valid
                            acc_ = cly_data(X_, y_, X__, y__, bcn)
                            accs.append(acc_)
                        # Pick the best subset size from the accuracy curve.
                        fs_num, fs_acc = find_peak(accs, fea_step, fea_num, beta)
                        fs_nums.append(fs_num)
                        fs_accs.append(fs_acc)
                    np.savetxt(fea_file, fs_nums, delimiter=',', fmt='%d')
                    clock_stop = time.time()
                    print('%s %s %d fs iteration cost %s seconds.'
                          % (fn, mc, col_i, clock_stop - clock_start))
def do_exp(idx, bcn, fname, feature_space, matrix_code):
    """2019-10-21 Experiments on UCI data sets.

    For one experiment repetition: trains one binary ECOC classifier per
    matrix column per feature space, weights each by validation accuracy,
    then for every test sample picks (per column) the feature space with
    the largest fisher-distance * weight score and decodes the ensemble
    prediction. Results are written to ``out_file`` as CSV.

    Relies on module globals: ``out_dir``, ``dat_dir``, ``decoder_code``,
    ``decoder`` — TODO confirm these are defined at import time.

    Parameters
    ----------
    idx : int
        Experiment repetition index (part of all file names).
    bcn : str
        Base-classifier name (passed to get_base_clf).
    fname : str
        Data-set name.
    feature_space : list of str
        Feature-selection tags; one trained classifier per (tag, column).
    matrix_code : str
        Coding-matrix type tag.
    """
    if not os.path.exists(out_dir):
        print('Creating %s' % out_dir)
        os.mkdir(out_dir)
    print('%s %s %s %d starts...' % (fname, bcn, matrix_code, idx))
    data_set = {}
    point_num_test = -1
    mat_len = -1
    y_names, y_index = None, None
    out_file = '%s%s_%s_%s_%d_res.csv' % (out_dir, bcn, fname, matrix_code, idx)
    if os.path.exists(out_file):
        # Result already computed — skip this configuration.
        print('%s exist!' % out_file)
        return None
    mat_file = 'data/exp_mat/%s_%s_%d.csv' % (fname, matrix_code, idx)
    # FIX: np.int was removed in NumPy 1.24 — use the builtin int dtype.
    mat = np.loadtxt(mat_file, int, delimiter=',')
    for fs in feature_space:
        if mat_len == -1:
            mat_len = mat.shape[1]
        elif mat.shape[1] != mat_len:
            raise ValueError('The length of matrix is not the same.')
        for col_i in range(mat_len):
            s_col_i = str(col_i)
            train_file = '%s%s_%s_%s_%s_%d_%d_train.csv' % (
                dat_dir, bcn, fname, matrix_code, fs, idx, col_i)
            X_train, y_train = read_data(train_file)
            valid_file = '%s%s_%s_%s_%s_%d_%d_valid.csv' % (
                dat_dir, bcn, fname, matrix_code, fs, idx, col_i)
            X_valid, y_valid = read_data(valid_file)
            test_file = '%s%s_%s_%s_%s_%d_%d_test.csv' % (
                dat_dir, bcn, fname, matrix_code, fs, idx, col_i)
            X_test, y_test = read_data(test_file)
            if y_names is None:
                y_names = np.unique(y_train)
                y_index = dict((c, i) for i, c in enumerate(y_names))
            len_train, len_test = len(y_train), len(y_test)
            if point_num_test == -1:
                point_num_test = len_test
            elif point_num_test != len_test:
                raise ValueError('Different length of test data.')
            # Recode the validation labels for this column; 0 entries are
            # classes excluded from this binary problem.
            y_code = mat[:, col_i]
            y_vld_ = np.array([y_code[y_index[y]] for y in y_valid])
            # Training ECOC classifier
            ecoc = SimpleECOCClassifier(get_base_clf(bcn), mat, decoder_code)
            ecoc.fit(X_train, y_train)
            pred_valid = ecoc.predict_(X_valid[y_vld_ != 0], col_i)
            estimators = ecoc.estimators_[col_i]
            # Validation accuracy becomes this classifier's ensemble weight.
            weight = accuracy_score(y_vld_[y_vld_ != 0], pred_valid)
            print('%s %d: %f' % (fs, col_i, weight))
            data_set[fs + s_col_i] = {
                'X_train': X_train, 'y_train': y_train,
                'X_valid': X_valid, 'y_valid': y_valid,
                'X_test': X_test, 'y_test': y_test,
                'len_train': len_train, 'len_test': len_test,
                'estimator': estimators, 'weight': weight
            }
    # Fisher distance of every (column, feature space) pair on the test data.
    distances = []
    for col_i in range(mat_len):
        s_col_i = str(col_i)
        dis_col = []
        for fs in feature_space:
            y_code = mat[:, col_i]
            y_trn_ = np.array(
                [y_code[y_index[y]] for y in data_set[fs + s_col_i]['y_train']])
            dis_col.append(fisher_measure(
                data_set[fs + s_col_i]['X_train'], y_trn_,
                data_set[fs + s_col_i]['X_test'], y_code))
        distances.append(dis_col)
    distances = np.array(distances)
    pred = []
    for test_i in range(point_num_test):
        sample_analyses = 'Sample %d/%d, label %s' % (
            test_i + 1, point_num_test, y_test[test_i])
        print(sample_analyses)
        matrix_, fstags_, estimators_, classifier_weight = [], [], [], []
        distance = distances[:, :, test_i]
        for col_i in range(mat_len):
            s_col_i = str(col_i)
            dis_ = distance[col_i, :]
            # get the weight of each feature space (translated comment)
            minus_weights = np.array(
                [data_set[k + s_col_i]['weight'] for k in feature_space])
            score_ = np.array(dis_) * minus_weights
            # Select the feature space with the highest weighted distance.
            fs_ = feature_space[score_.argmax()]
            classifier_weight.append(data_set[fs_ + s_col_i]['weight'])
            fstags_.append(fs_)
            matrix_.append(mat[:, col_i])
            estimators_.append(data_set[fs_ + s_col_i]['estimator'])
        # NOTE(review): classifier_weight is built but not used in decoding.
        classifier_weight = np.array([classifier_weight])
        matrix_ = np.array(matrix_).T
        Y = np.array([predict_binary(
            estimators_[i],
            data_set[fstags_[i] + str(i)]['X_test'][[test_i]])
            for i in range(len(estimators_))]).T
        dd_ = decoder.decode(Y, matrix_)
        p_ = dd_.argmin(axis=1)[0]
        pred.append(y_names[p_])
    """ The output file structure is as follows:
                 pred   true
        sample1  C1     C1
        sample2  C2     C1
        sample3  C2     C3
        ...      ...    ...
        accuracy 90%    100%
    """
    # NOTE: 'fs' here is the last tag from the loops above; all feature
    # spaces share the same true test labels, so any tag's test file works.
    test_file = '%s%s_%s_%s_%s_%d_%d_test.csv' % (
        dat_dir, bcn, fname, matrix_code, fs, idx, 0)
    X_test, y_test = read_data(test_file)
    pred_col = ['ensemble', 'real']
    acc = [round(accuracy_score(pred, y_test), 4), 1.]  # keep four decimals
    pred = np.array([pred, y_test]).T
    pred = np.r_['0,2', pred_col, pred, acc]
    np.savetxt(out_file, pred, delimiter=',', fmt='%s')