Example #1
def fea_slt(i_f, t_f, o_f, k):
    """Select the top-k ranked features of i_f (ranking read from t_f) and save them, with labels, to o_f."""
    spt = np.loadtxt(t_f, dtype=int, delimiter=',')   # feature indices sorted by rank
    X, y = read_data(i_f)
    X_new = X[:, np.sort(spt[:k])]                    # keep the k best features in original column order
    X_new = np.c_[X_new, y]                           # re-attach the label column
    np.savetxt(o_f, X_new, fmt='%s', delimiter=',')
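A minimal usage sketch; the file names and the value of k are placeholders, and read_data is assumed (as above) to return the feature matrix and label vector of a comma-separated data file:

# Keep the 20 top-ranked features of iris_train.csv, using the ranking stored in iris_rank.csv.
fea_slt('iris_train.csv', 'iris_rank.csv', 'iris_train_top20.csv', 20)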
Example #2
def get_mat(i_f, o_f, func):
    """Create a coding matrix with func and save it to one or more output files."""
    X, y = read_data(i_f)
    if isinstance(o_f, str):
        m, i = func(X, y)
        np.savetxt(o_f, m, fmt='%d', delimiter=',')
    elif isinstance(o_f, list):
        for oo in o_f:
            m, i = func(X, y)                     # regenerate the matrix for every output file
            np.savetxt(oo, m, fmt='%d', delimiter=',')
    else:
        raise TypeError('Unknown type of o_f: expected str or list')
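get_mat expects func to return a (matrix, info) tuple and only keeps the matrix. A hedged sketch with a hypothetical one-vs-all builder (ova_matrix and the file names are illustrative, not part of the original code):

def ova_matrix(X, y):
    """Hypothetical coding-matrix builder: +1 for the target class, -1 for the rest."""
    classes = np.unique(y)
    m = -np.ones((len(classes), len(classes)), dtype=int)
    np.fill_diagonal(m, 1)
    return m, dict((c, i) for i, c in enumerate(classes))

get_mat('iris_train.csv', 'iris_ova_mat.csv', ova_matrix)
# A list of output paths writes one (regenerated) matrix per path:
get_mat('iris_train.csv', ['iris_mat_0.csv', 'iris_mat_1.csv'], ova_matrix)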
Example #3
def norm(norm_obj, i_f, o_f):
    """Normalize the features of i_f with the given scaler and save them, with labels, to o_f."""
    X, y = read_data(i_f)
    X_new = norm_obj.fit_transform(X, y)
    X_new = np.c_[X_new, y]                           # re-attach the label column
    np.savetxt(o_f, X_new, fmt='%s', delimiter=',')
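A usage sketch assuming norm_obj is any scikit-learn style transformer exposing fit_transform (here MinMaxScaler; the file names are placeholders):

from sklearn.preprocessing import MinMaxScaler

# Scale every feature of iris_train.csv into [0, 1] and keep the label column.
norm(MinMaxScaler(), 'iris_train.csv', 'iris_train_norm.csv')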
Example #4
def fea_slt_number(bcn, file_names, fs_tags, matrix_types, nor_dir, mat_dir,
                   spt_dir, fea_dir, exp_num):
    """Determine the size of feature subsets"""
    if not os.path.exists(spt_dir):
        os.mkdir(spt_dir)
    if not os.path.exists(fea_dir):
        os.mkdir(fea_dir)

    alpha = 0.05  # iteration step: each candidate subset adds ceil(alpha * fea_num) features
    beta = 0.02  # size of the comparison interval used when locating the accuracy peak
    for idx in range(exp_num):
        for fn in file_names:
            X_trn, y_trn = read_data('%s%s_%d_train.csv' % (nor_dir, fn, idx))
            X_vld, y_vld = read_data('%s%s_%d_valid.csv' % (nor_dir, fn, idx))
            fea_num = X_trn.shape[1]
            fea_step = math.ceil(fea_num * alpha)
            fea_k_list = list(range(fea_step, fea_num, fea_step))

            y_names = np.unique(y_trn)
            y_index = dict((c, i) for i, c in enumerate(y_names))
            for mc in matrix_types:
                print('processing %s %s %d...' % (fn, mc, idx))
                mat_file = '%s%s_%s_%d.csv' % (mat_dir, fn, mc, idx)
                mat = np.loadtxt(mat_file, dtype=int, delimiter=',')
                mat_len = mat.shape[1]
                for col_i in range(mat_len):
                    fea_file = '%s%s_%s_%s_%d_%d_fea_num.csv' % (
                        fea_dir, bcn, fn, mc, idx, col_i)
                    y_code = mat[:, col_i]
                    ytrn_ = np.array([y_code[y_index[y]] for y in y_trn])
                    yvld_ = np.array([y_code[y_index[y]] for y in y_vld])
                    X_train = X_trn[ytrn_ != 0]
                    X_valid = X_vld[yvld_ != 0]
                    y_train = ytrn_[ytrn_ != 0]
                    y_valid = yvld_[yvld_ != 0]

                    clock_start = time.time()
                    fs_nums = []
                    fs_accs = []
                    for fs in fs_tags:
                        spt_file = '%s%s_%s_%s_%s_%d_%d.csv' % (
                            spt_dir, bcn, fn, mc, fs, idx, col_i)
                        rank_ind = fea_rank(X_train, y_train, fs)
                        np.savetxt(spt_file,
                                   rank_ind.T,
                                   delimiter=',',
                                   fmt='%d')
                        fea = np.loadtxt(spt_file, delimiter=',', dtype=int)
                        accs = []
                        for fea_k in fea_k_list:
                            fea_ind = fea[:fea_k]
                            X_, y_ = X_train[:, fea_ind], y_train
                            X__, y__ = X_valid[:, fea_ind], y_valid
                            acc_ = cly_data(X_, y_, X__, y__, bcn)
                            accs.append(acc_)
                        fs_num, fs_acc = find_peak(accs, fea_step, fea_num,
                                                   beta)
                        fs_nums.append(fs_num)
                        fs_accs.append(fs_acc)
                    np.savetxt(fea_file, fs_nums, delimiter=',', fmt='%d')
                    clock_stop = time.time()
                    print('%s %s %d fs iteration cost %s seconds.' %
                          (fn, mc, col_i, clock_stop - clock_start))
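find_peak is not shown in these examples. Judging from its arguments (the validation accuracies per candidate subset size, the step between sizes, the total feature count, and the comparison interval beta), a plausible reading is that it returns the smallest subset size whose accuracy is within beta of the best one. A hedged sketch of that behaviour, not the original implementation:

def find_peak(accs, fea_step, fea_num, beta):
    """Assumed behaviour: smallest feature count whose accuracy is within beta of the best."""
    accs = np.asarray(accs)
    best = accs.max()
    for i, acc in enumerate(accs):
        if best - acc <= beta:
            # fea_k_list starts at fea_step, so index i maps to (i + 1) * fea_step features.
            return min((i + 1) * fea_step, fea_num), acc
    return fea_num, best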
Example #5
def do_exp(idx, bcn, fname, feature_space, matrix_code):
    """2019-10-21 Experiments on UCI data sets"""
    if not os.path.exists(out_dir):
        print('Creating %s' % out_dir)
        os.mkdir(out_dir)
    
    print('%s %s %s %d starts...' % (fname, bcn, matrix_code, idx))
    data_set = {}
    point_num_test = -1             # number of test samples (checked for consistency across columns)
    mat_len = -1                    # number of coding-matrix columns (checked for consistency)
    y_names, y_index = None, None   # class names and their position in the coding matrix
    out_file = '%s%s_%s_%s_%d_res.csv' % (out_dir, bcn, fname, matrix_code, idx)
    if os.path.exists(out_file):
        print('%s exists!' % out_file)
        return None  

    mat_file = 'data/exp_mat/%s_%s_%d.csv' % (fname, matrix_code, idx)
    mat = np.loadtxt(mat_file, dtype=int, delimiter=',')
    for fs in feature_space:
        if mat_len == -1:
            mat_len = mat.shape[1]
        elif mat.shape[1] != mat_len:
            raise ValueError('Coding matrix column count is not consistent.')
        
        for col_i in range(mat_len):
            s_col_i = str(col_i) 
            train_file = '%s%s_%s_%s_%s_%d_%d_train.csv' % (dat_dir, bcn, fname, matrix_code, fs, idx, col_i)
            X_train, y_train = read_data(train_file)
            valid_file = '%s%s_%s_%s_%s_%d_%d_valid.csv' % (dat_dir, bcn, fname, matrix_code, fs, idx, col_i)
            X_valid, y_valid = read_data(valid_file)
            test_file = '%s%s_%s_%s_%s_%d_%d_test.csv' % (dat_dir, bcn, fname, matrix_code, fs, idx, col_i)
            X_test, y_test = read_data(test_file)
            if y_names is None:
                y_names = np.unique(y_train)
                y_index = dict((c,i) for i,c in enumerate(y_names))
            len_train, len_test = len(y_train), len(y_test)
            if point_num_test == -1:
                point_num_test = len_test
            elif point_num_test != len_test:
                raise ValueError('Different length of test data.')
            
            y_code = mat[:, col_i]
            y_trn_ = np.array([y_code[y_index[y]] for y in y_train])
            y_vld_ = np.array([y_code[y_index[y]] for y in y_valid])
            y_tst_ = np.array([y_code[y_index[y]] for y in y_test])
            
            # Train the ECOC classifier
            ecoc = SimpleECOCClassifier(get_base_clf(bcn), mat, decoder_code)
            ecoc.fit(X_train, y_train)
            pred_valid = ecoc.predict_(X_valid[y_vld_!=0], col_i)
            estimators = ecoc.estimators_[col_i]
            weight = accuracy_score(y_vld_[y_vld_!=0], pred_valid)
            print('%s %d: %f' % (fs, col_i, weight))

            data_set[fs+s_col_i] = {
                'X_train': X_train,
                'y_train': y_train,
                'X_valid': X_valid,
                'y_valid': y_valid,
                'X_test': X_test,
                'y_test': y_test,
                'len_train': len_train,
                'len_test': len_test,
                'estimator': estimators,
                'weight': weight
            }       
    
    distances = []
    for col_i in range(mat_len):
        s_col_i = str(col_i)
        dis_col = []
        for fs in feature_space:
            d_i = feature_space.index(fs)
            y_code = mat[:, col_i]
            y_trn_ = np.array([y_code[y_index[y]] for y in data_set[fs+s_col_i]['y_train']])
            y_tst_ = np.array([y_code[y_index[y]] for y in data_set[fs+s_col_i]['y_test']])
            dis_col.append(fisher_measure(data_set[fs+s_col_i]['X_train'], y_trn_, data_set[fs+s_col_i]['X_test'], y_code))
        distances.append(dis_col)
    distances = np.array(distances)

    pred = []
    for test_i in range(point_num_test):
        sample_analyses = 'Sample %d/%d, label %s' % (test_i+1, point_num_test, y_test[test_i])
        print(sample_analyses)
        
        matrix_, fstags_, estimators_, classifier_weight = [], [], [], []
        distance = distances[:, :, test_i]
        for col_i in range(mat_len):
            s_col_i = str(col_i)
            dis_ = distance[col_i, :]
            minus_weights = np.array([data_set[k+s_col_i]['weight'] for k in feature_space])    # get the weight of each fs
            score_ = np.array(dis_) * minus_weights
            fs_ = feature_space[score_.argmax()]
            classifier_weight.append(data_set[fs_+s_col_i]['weight'])
            fstags_.append(fs_)
            matrix_.append(mat[:, col_i])
            estimators_.append(data_set[fs_+s_col_i]['estimator'])
        classifier_weight = np.array([classifier_weight])
        matrix_ = np.array(matrix_).T
        Y = np.array([predict_binary(estimators_[i], data_set[fstags_[i]+str(i)]['X_test'][[test_i]]) for i in
                        range(len(estimators_))]).T
        dd_ = decoder.decode(Y, matrix_)
        p_ = dd_.argmin(axis=1)[0]
        pred.append(y_names[p_])
    """
    The output file structure is as follows:
                pred    true
    sample1     C1      C1
    sample2     C2      C1
    sample3     C2      C3
    ...         ...     ...
    accuracy    90%    100%
    """
    test_file = '%s%s_%s_%s_%s_%d_%d_test.csv' % (dat_dir, bcn, fname, matrix_code, fs, idx, 0)
    X_test, y_test = read_data(test_file)
    pred_col = ['ensemble', 'real']
    acc = [round(accuracy_score(pred, y_test), 4), 1.]  # keep four decimal places
    pred = np.array([pred, y_test]).T
    pred = np.r_['0,2', pred_col, pred, acc]
    np.savetxt(out_file, pred, delimiter=',', fmt='%s')
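A hedged driver sketch for do_exp, assuming out_dir, dat_dir and the helpers used above (read_data, get_base_clf, SimpleECOCClassifier, the decoder, fisher_measure, predict_binary) are defined at module level as the snippet implies; the data set names, feature-selection tags and the 'OVO' matrix code below are placeholders:

feature_space = ['variance', 'chi2', 'mrmr']      # hypothetical feature-selection tags
for idx in range(5):                              # five train/valid/test splits
    for fname in ['iris', 'wine', 'glass']:       # hypothetical UCI data set names
        do_exp(idx, 'svm', fname, feature_space, 'OVO')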