def solve(self, pos):
        self.pos = pos

        # Split the multi-class data into one positive class vs. the rest.
        ret1 = OvR.separate(self.matx, self.label, pos, self.m, self.n)

        # Rebalance the positive class with SMOTE before fitting.
        mat, vec = Smote.genNew(ret1[0], ret1[1], self.n)
        # mat, vec = BorderlineSmote.genNew(ret1[0], ret1[1], self.n)

        beta = Descent.solve(mat, vec)
        # Alternative: average the coefficients over several EasyEnsemble draws.
        # ensembleN = 5
        # beta = np.zeros((self.n))
        # for i in range(ensembleN):
        #     mat, vec = EasyEnsemble.genNew(ret1[0], ret1[1], self.n)
        #     beta = beta + Descent.solve(mat, vec)
        # beta = beta / ensembleN
        print('beta', beta)

        return beta
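A hedged usage sketch: solve() looks like one leg of a one-vs-rest scheme, so a driver would presumably loop over the class indices. `model` and `num_classes` below are assumed names, not identifiers from this codebase.

# Hypothetical one-vs-rest driver (model and num_classes are assumed names).
betas = [model.solve(pos) for pos in range(num_classes)]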
Example #2
    doc = open(project + algorithmName + "SMOTE", 'a')
    doc.write("project:" + project + " " + "algorithm: " + algorithmName +
              " sampleMethod: \n")
    doc.write("-----------------------\n")
    for i in range(N):
        # Repeat stratified 10-fold cross-validation N times.
        skf = StratifiedKFold(n_splits=10, shuffle=True)
        for train_index, test_index in skf.split(x, y):
            #print("TRAIN:", train_index, "TEST:", test_index)
            x_train, x_test = x[train_index], x[test_index]
            y_train, y_test = y[train_index], y[test_index]

            #x_train, y_train = BorderlineOverSampling(x_train, y_train, 0.5, 10).sampling()
            #x_train, y_train = Adasyn(x_train, y_train, 0.5, 10).sampling()
            #x_train, y_train = MAHAKIL().fit_sample(x_train, y_train)
            #x_train, y_train = IMAHAKIL().fit_sample(x_train, y_train)
            x_train, y_train = Smote(x_train, y_train, 50, 10).over_sampling()

            clf.fit(x_train, y_train)

            y_pred = clf.predict(x_test)
            acc.append(accuracy_score(y_test, y_pred))
            precision.append(precision_score(y_test, y_pred))
            recall.append(recall_score(y_test, y_pred))
            f1.append(f1_score(y_test, y_pred))
            tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
            matrix.append(ConfusionMatrix(tn, fp, fn, tp))
            # G-mean: geometric mean of sensitivity and specificity.
            gMean.append(((tp / (tp + fn)) * (tn / (fp + tn))) ** 0.5)
            #print(count)
            print(str(count) + "," + project)
            count = count + 1
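For cross-checking, the hand-rolled G-mean above matches imbalanced-learn's metric; this sketch assumes the imbalanced-learn package is available and is not part of the original script.

# Equivalent G-mean via imbalanced-learn (assumed dependency).
from imblearn.metrics import geometric_mean_score

# For binary labels this computes the same sqrt(sensitivity * specificity)
# as the manual confusion-matrix formula above.
g_mean = geometric_mean_score(y_test, y_pred)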
Example #3
    k, data_0, data_1, headers = data_get()

    # Choose which class to oversample from the class ratio k.
    if k >= 2:
        n = int(k - 1)
        data = data_1
        smote_class = '1'
    elif 1 / k >= 2:
        n = int(1 / k - 1)
        data = data_0
        smote_class = '0'
        
    #data = data[0:5]
    data = np_array(data)
    print(data)

    s = Smote(data, N=100)
    s = s.over_sampling()
    s = s.tolist()

    smote = []
    for line in s:
        # Prepend a 0 in the first column, then round every synthetic
        # attribute to the nearest integer.
        l = [0] + line
        for i in range(1, len(line) + 1):
            l[i] = int(line[i - 1] + 0.5)
        print(l)
        smote.append(l)
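The int(line[i - 1] + 0.5) rounding maps SMOTE's interpolated values back to integer-coded attributes. A vectorised sketch with NumPy, assuming non-negative attribute values (where floor(x + 0.5) matches the round-half-up behaviour above):

# Vectorised alternative to the rounding loop (assumes non-negative values).
import numpy as np
arr = np.asarray(s)
rounded = np.floor(arr + 0.5).astype(int)
# Prepend the 0 column, mirroring l = [0] + line above.
smote_np = np.hstack([np.zeros((len(rounded), 1), dtype=int), rounded]).tolist()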
    
    f = FileProcess()
    headers, d = f.file_get_data_row(filename)
    
Example #4
from smote import Smote
import numpy as np
from sklearn.model_selection import train_test_split

dataset = np.loadtxt('I:\\tools\\SoftwarePrediction\\dataset\\kc2.txt',
                     delimiter=",")
length = len(dataset[0])
x = dataset[:, 0:length - 1]
y = dataset[:, length - 1]
x_train = x
y_train = y
# Oversample with SMOTE (arguments presumably: oversampling amount 90
# and neighbour count 5).
data_new, label_new = Smote(x_train, y_train, 90, 5).over_sampling()

# Append the labels as the last column of the oversampled data.
label_new = label_new.reshape(1, label_new.shape[0]).T
data_new = np.concatenate((data_new, label_new), axis=1)
doc = open("test.txt", 'w')
for row in data_new:
    doc.write(
        str(int(row[0])) + "," + str(int(row[1])) + "," + str(int(row[2])) +
        "\n")
doc.close()
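The Smote class used throughout these examples is project-local and not shown. For reference, here is a minimal sketch of classic SMOTE (Chawla et al., 2002) with a compatible over_sampling() entry point; treating N as an oversampling percentage, k as the neighbour count, and using scikit-learn's NearestNeighbors are assumptions, not the project's actual implementation (which evidently also resamples the labels passed in alongside x_train, unlike this sketch).

# Minimal classic-SMOTE sketch (assumed semantics: N is a percentage,
# k is the neighbour count, and only minority-class rows are passed in).
import numpy as np
from sklearn.neighbors import NearestNeighbors


class SimpleSmote:
    def __init__(self, samples, N=100, k=5):
        self.samples = np.asarray(samples, dtype=float)
        self.N = N
        self.k = k

    def over_sampling(self):
        n_samples, n_attrs = self.samples.shape
        # Synthetic points per original point; classic SMOTE handles
        # N < 100 by sampling a random subset instead (omitted here).
        per_sample = int(self.N / 100)
        nn = NearestNeighbors(n_neighbors=self.k + 1).fit(self.samples)
        synthetic = []
        for i in range(n_samples):
            # k nearest neighbours, skipping the point itself at index 0.
            neighbours = nn.kneighbors(self.samples[i:i + 1],
                                       return_distance=False)[0][1:]
            for _ in range(per_sample):
                j = np.random.choice(neighbours)
                gap = np.random.rand()
                # Interpolate between the sample and a random neighbour.
                synthetic.append(self.samples[i] +
                                 gap * (self.samples[j] - self.samples[i]))
        return np.array(synthetic)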
Example #5
    # Keep the first 32 attributes from the Gini ranking.
    new_attributes = []
    new_attributes_num = 32
    #new_attributes_num = len(X_train.columns.values)
    for item in gini_list[:new_attributes_num]:
        new_attributes.append(str(item[0]))
    X_train = X_train[new_attributes]

    # Begin SMOTE: oversample only the rows where target_key == 1.
    new_train_df = pandas.concat([X_train, y_train], axis=1)
    smote_processor = Smote(new_train_df[new_train_df[target_key] == 1],
                            N=200,
                            k=5)
    train_df_sample = smote_processor.over_sampling()
    #X_sample,y_sample = smote_processor.over_sampling()
    sample = DataFrame(train_df_sample, columns=new_train_df.columns.values)
    #sample_datapreprocessing = DataPreprocessing(sample,sample.drop(target_key,axis=1,inplace=False).columns.values,target_key)
    #sample_datapreprocessing.data_summary()
    X_train = pandas.concat([X_train, sample[X_train.columns.values]], axis=0)
    y_train = pandas.concat([
        y_train.to_frame().rename(columns={0: target_key}),
        sample[target_key].to_frame().rename(columns={0: target_key})
    ],
                            axis=0)[target_key]
    X_train = X_train.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    merged_train_datapreprocessing = DataPreprocessing(
Example #6
print("recall", recall.mean())
print("f1", f1.mean())
print("gMean", gMean.mean())
#################################################################################
acc = []
precision = []
recall = []
f1 = []
gMean = []
matrix = []
#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4,random_state=3)
#x_train, y_train = BorderlineOverSampling(x_train, y_train, 0.5, 10).sampling()
#x_train, y_train = Adasyn(x_train, y_train, 0.5, 10).sampling()
#x_train, y_train = MAHAKIL().fit_sample(x_train, y_train)
#x_train, y_train = IMAHAKIL().fit_sample(x_train, y_train)
x_train, y_train = Smote(before_x_train, before_y_train, pfp,
                         10).over_sampling()
# Dump the oversampled training set for inspection.
doc.write(
    "\n" * 5 +
    "##################################Smote############################################"
    + "\n")
for i in range(len(x_train)):
    for j in range(len(x_train[i])):
        doc.write(str(round(x_train[i][j], 2)) + ",")
    doc.write(str(y_train[i]) + "\n")

clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)
acc.append(accuracy_score(y_test, y_pred))
precision.append(precision_score(y_test, y_pred))
recall.append(recall_score(y_test, y_pred))
Example #7
# Randomly split the data into training and test sets by proportion.
train_X, m, train_Y, n = fft_split_data(np.array(data_X), data_Y, 0., 0., 0.)
train_X = StandardScaler().fit_transform(train_X)

# Test set
data_file2 = r"G:\声发射试验\pjx-节段胶拼压弯AE试验\AE数据\试件2-0324\正式加载\wave-2欠采样.csv"
bb, initial_test_y = fft_four_tensor_frequency(data_file2, 550, 2200)
wavelet_file = "G:\\声发射试验\\pjx-节段胶拼压弯AE试验\\AE数据\\试件2-0324\\正式加载\\wavelet.csv"
with open(wavelet_file, 'r') as fl:
    initial_test_x = pd.read_csv(fl)

test_X, a, test_Y, b = fft_split_data(np.array(initial_test_x), initial_test_y, 0., 0., 0.)
test_X = StandardScaler().fit_transform(test_X)

# Create new samples with SMOTE
s0 = Smote(train_X[:430], N=54, k=8)
new_samples0 = s0.over_sampling()
s2 = Smote(train_X[23954:], N=4, k=5)
new_samples2 = s2.over_sampling()
# Append the new samples
train_X = np.concatenate((train_X, new_samples0, new_samples2), axis=0)
train_Y = np.concatenate((train_Y, np.array([[1, 0, 0]] * len(new_samples0)),
                          np.array([[0, 0, 1]] * len(new_samples2))), axis=0)

# print("训练集不同类别信号的个数:", list(train_Y).count([1, 0, 0]), list(train_Y).count([0, 1, 0]),
#       list(train_Y).count([0, 0, 1]), '总数:', len(train_Y))

print(train_X.shape, test_X.shape, train_Y.shape, test_Y.shape)

# Normalization
rows, columns = train_X.shape
Example #8
def f_start_calculate():
    # Randomly split the data into training and test sets by proportion.
    train_X, test_X, train_Y, test_Y = split_data(np.array(data_X), data_Y, 0.,
                                                  0., 0.)
    train_X = StandardScaler().fit_transform(train_X)

    # print(train_X)
    # Create new samples with SMOTE
    s0 = Smote(train_X[:430], N=54, k=10)
    new_samples0 = s0.over_sampling()
    s2 = Smote(train_X[23954:], N=4, k=5)
    new_samples2 = s2.over_sampling()
    # Append the new samples
    train_X = np.concatenate((train_X, new_samples0, new_samples2), axis=0)
    train_Y = np.concatenate((train_Y, np.array(
        [0] * len(new_samples0)), np.array([2] * len(new_samples2))),
                             axis=0)

    # Test set
    data_file2 = r"G:\声发射试验\pjx-节段胶拼压弯AE试验\AE数据\试件2-0324\正式加载\t滤波后-7200.csv"
    # frequency_data_file2 = r"G:\声发射试验\pjx-节段胶拼压弯AE试验\AE数据\试件2-0324\正式加载\fft.csv"
    initial_test_x, initial_test_y = four_tensor_data(data_file2, 550, 2200)
    # wave_file = r"G:\声发射试验\pjx-节段胶拼压弯AE试验\AE数据\试件2-0324\正式加载\wave.csv"
    # with open(wave_file, 'r') as fl:
    #     waves = pd.read_csv(fl)
    # waves = waves.iloc[:, 2:]
    # fc = FeatureCreation(np.array(waves))
    # data = pd.DataFrame(initial_test_x)
    # data['average_x'] = fc.average_x()
    # data['p_x'] = fc.p_x()
    # data['rms_x'] = fc.rms_x()
    # data['std_x'] = fc.std_x()
    # data['sk_x'] = fc.sk_x()
    # data['kv_x'] = fc.kv_x()
    # data['sf_x'] = fc.sf_x()
    # data['cf_x'] = fc.cf_x()

    wavelet_file = "G:\\声发射试验\\pjx-节段胶拼压弯AE试验\\AE数据\\试件2-0324\\正式加载\\wavelet.csv"
    with open(wavelet_file, 'r') as fl:
        wavelet = pd.read_csv(fl)
    # data = pd.concat((data, wavelet), axis=1)
    initial_test_x = np.array(wavelet)

    test_X, a, test_Y, b = split_data(np.array(initial_test_x), initial_test_y,
                                      0., 0., 0.)
    test_X = StandardScaler().fit_transform(test_X)

    # Normalization
    # rows, columns = train_X.shape
    # print(train_X.shape, test_X.shape)
    # scale_X = StandardScaler().fit_transform(np.concatenate((train_X, test_X), axis=0))
    # train_scale_X = scale_X[:rows]
    # test_scale_X = scale_X[rows:]

    ss = StandardScaler().fit(train_X)
    train_scale_X = ss.transform(train_X)
    test_scale_X = ss.transform(test_X)

    print("训练集不同类别信号的个数:",
          list(train_Y).count(0),
          list(train_Y).count(1),
          list(train_Y).count(2), '信号总数:', len(train_Y))

    # clf_nn = RandomForestClassifier(n_estimators=500, min_samples_leaf=4, max_depth=4)

    # clf_nn = GradientBoostingClassifier(n_estimators=50, min_samples_leaf=10, learning_rate=0.1, random_state=1)

    clf_nn = MLPClassifier(solver='adam',
                           alpha=1.,
                           activation='relu',
                           hidden_layer_sizes=(600, ),
                           max_iter=2000,
                           verbose=1,
                           learning_rate='adaptive',
                           early_stopping=True,
                           random_state=6)

    clf_nn.fit(train_scale_X, train_Y)
    predictions = clf_nn.predict(test_scale_X)
    print("测试集不同类别信号的个数:",
          list(test_Y).count(0),
          list(test_Y).count(1),
          list(test_Y).count(2), '信号总数:', len(test_Y))
    print('     test_Y:', list(test_Y))
    print('predictions:', list(predictions))
    # print("各类准确率:", accuracy(test_Y, predictions))
    # print("综合准确率:", accuracy_score(test_Y, predictions))

    test_prob = clf_nn.predict_proba(test_scale_X)
    train_prob = clf_nn.predict_proba(train_scale_X)

    # clf_nn_prob = MLPClassifier(solver='adam',
    #                             alpha=1.,
    #                             activation='relu',
    #                             hidden_layer_sizes=(100),
    #                             max_iter=2000,
    #                             verbose=0,
    #                             learning_rate='adaptive',
    #                             early_stopping=True,
    #                             random_state=9)
    # clf_nn_prob = SVC(class_weight={0:1., 1:1., 2:1.5})

    # clf_nn_prob = DecisionTreeClassifier(max_depth=10, min_samples_leaf=2, random_state=1)

    # clf_nn_prob = GradientBoostingClassifier(n_estimators=20, min_samples_leaf=4, learning_rate=0.1, random_state=1)

    # clf_nn_prob = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=500, random_state=0)

    # clf_nn_prob.fit(train_scale_X, train_Y)
    # nn_prob_predictions = clf_nn_prob.predict(test_scale_X)
    # print("predictions:", list(nn_prob_predictions))
    # print("各类准确率:", accuracy(test_Y, nn_prob_predictions))

    output = []

    # return
    # Group consecutive signals into blocks of num_samples and classify
    # each block by majority vote over the per-signal predictions.
    for num_samples in range(1, 21):
        test_y = []
        for i in range(0, len(test_Y), num_samples):
            test_y.append(np.argmax(np.bincount(test_Y[i:i + num_samples])))
        predict_y = []
        for i in range(0, len(predictions), num_samples):
            predict_y.append(
                np.argmax(np.bincount(predictions[i:i + num_samples])))

        # print("测试集区块的数量:", list(test_y).count(0), list(test_y).count(1), list(test_y).count(2),
        #       '总数:', len(test_y))
        # print("   test_y:", test_y)
        # print("predict_y:", predict_y)
        # print("各类准确率:", accuracy(test_y, predict_y))
        # print("综合准确率:", accuracy_score(test_y, predict_y))
        # print("基于概率缩放后各类准确率:", accuracy(test_y, prediction_use_prob(test_prob, num_samples)))
        # print("基于概率缩放后综合准确率:", accuracy_score(test_y, prediction_use_prob(test_prob, num_samples)))

        output.append(
            [accuracy_score(test_y, predict_y)] +
            list(accuracy(test_y, predict_y)) + [
                accuracy_score(test_y,
                               prediction_use_prob(test_prob, num_samples))
            ] +
            list(accuracy(test_y, prediction_use_prob(test_prob, num_samples)))
        )

    print(
        pd.DataFrame(data=output,
                     columns=[
                         "overall accuracy", "p0", 'p1', 'p2',
                         "prob-based overall accuracy", 'wp0', 'wp1', 'wp2'
                     ]))
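The helper prediction_use_prob() is referenced above but never shown. A plausible reconstruction, consistent with how it is called (per-block probability averaging followed by argmax); this is a hypothetical sketch, not the original helper:

# Hypothetical reconstruction of prediction_use_prob: average the class
# probabilities within each block of num_samples signals, then argmax.
import numpy as np

def prediction_use_prob(prob, num_samples):
    preds = []
    for i in range(0, len(prob), num_samples):
        preds.append(int(np.argmax(prob[i:i + num_samples].mean(axis=0))))
    return preds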
Example #9
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn import linear_model

from Imahakil import IMAHAKIL
from smote import Smote

dataset = np.loadtxt('I:\\tools\\SoftwarePrediction\\dataset\\pc1.txt',
                     delimiter=",")
length = len(dataset[0])
x = dataset[:, 0:length - 1]
y = dataset[:, length - 1]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4)

x_train, y_train = Smote(x_train, y_train, 90, 5).over_sampling()

clf = linear_model.LogisticRegression(solver='liblinear', max_iter=10000)
clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)
# count = 0
# for i in range(len(result)):
#     if result[i] == y_test[i]:
#         count = count + 1
# print(count / len(result))
print("acc", accuracy_score(y_test, y_pred))
# Print the indices of the true positives.
for i in range(len(y_pred)):
    if y_test[i] == 1 and y_pred[i] == 1:
        print(i)
print("precision", precision_score(y_test, y_pred))
Example #10
#!/usr/bin/env python
#-*- coding:utf-8 -*-

import pandas
from smote import Smote

if __name__ == "__main__":
    #file_fullpath='/home/login01/Workspaces/python/dataset/module_data_stg1/sample'
    file_fullpath = '/home/login01/Workspaces/python/dataset/cs.csv'
    cs = pandas.read_csv(file_fullpath,
                         sep=',',
                         index_col=0,
                         na_values='NA',
                         low_memory=False)
    cs_mean_MonthlyIncome = cs.MonthlyIncome.mean(skipna=True)
    cs_mean_NumberOfDependents = cs.NumberOfDependents.mean(skipna=True)
    # .ix was removed from pandas; .loc is the label-based equivalent.
    cs.loc[:, 'MonthlyIncome'] = cs.MonthlyIncome.fillna(cs_mean_MonthlyIncome,
                                                         inplace=False)
    cs.loc[:, 'NumberOfDependents'] = cs.NumberOfDependents.fillna(
        cs_mean_NumberOfDependents, inplace=False)
    ismote = Smote(cs, 20, 6)
    print(ismote.n_samples)
    print(ismote.n_attrs)
    mysample = ismote.over_sampling()
    print(mysample)
Example #11
data = {}
t = len(data_all) / sum(DATA_PROPORTION.values())
train_count = int(DATA_PROPORTION['train'] * t)
test_count = int(DATA_PROPORTION['test'] * t)
data['train'] = data_all[:train_count]
data['test'] = data_all[train_count:train_count + test_count]
data['verify'] = data_all[train_count + test_count:]

print('Creating more samples by SMOTE...')
sys.stdout.flush()
cover_shape = (3, IMAGE_RESIZE_SIZE, IMAGE_RESIZE_SIZE)
# Flatten each positive-class cover image to a 1-D vector so SMOTE can
# interpolate between images.
positive_covers = [
    record[0].reshape(-1) for record in data['train'] if record[1]
]
smote = Smote(np.array(positive_covers), N=OVER_SIMPLING_N, k=5)
over_samples = smote.over_sampling()
data['train'] += [(np.reshape(sample, cover_shape), True)
                  for sample in over_samples]

print('Shuffling data...')
sys.stdout.flush()
for d in data.values():
    random.shuffle(d)

print('Exporting data.info ...')
sys.stdout.flush()
with open('data.info', 'w') as f:
    f.write(str({k: len(v) for k, v in data.items()}))

print('Preparing export recordio file...')