def solve(self, pos):
    self.pos = pos
    ret1 = OvR.separate(self.matx, self.label, pos, self.m, self.n)
    mat, vec = Smote.genNew(ret1[0], ret1[1], self.n)
    # mat, vec = BorderlineSmote.genNew(ret1[0], ret1[1], self.n)
    beta = Descent.solve(mat, vec)
    '''
    ensembleN = 5
    beta = np.zeros((self.n))
    for i in range(ensembleN):
        mat, vec = EasyEnsemble.genNew(ret1[0], ret1[1], self.n)
        beta = beta + Descent.solve(mat, vec)
    beta = beta / ensembleN
    '''
    print('beta', beta)
    return beta
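# Reference sketch (not part of the project's Smote class): classic SMOTE
# synthesizes each new minority sample by interpolating between a minority
# point and a randomly chosen one of its k nearest minority neighbors.
# The function name, the k parameter, and the use of sklearn's NearestNeighbors
# are illustrative assumptions about what a genNew-style oversampler does.
import numpy as np
from sklearn.neighbors import NearestNeighbors

def smote_interpolate(minority, n_new, k=5, seed=0):
    """Generate n_new synthetic rows from a 2-D array of minority samples."""
    rng = np.random.default_rng(seed)
    nn = NearestNeighbors(n_neighbors=k + 1).fit(minority)
    _, idx = nn.kneighbors(minority)          # idx[:, 0] is the point itself
    synthetic = []
    for _ in range(n_new):
        i = rng.integers(len(minority))       # pick a minority sample
        j = idx[i, rng.integers(1, k + 1)]    # pick one of its k neighbors
        gap = rng.random()                    # interpolation factor in [0, 1)
        synthetic.append(minority[i] + gap * (minority[j] - minority[i]))
    return np.vstack(synthetic)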
doc = open(project + algorithmName + "SMOTE", 'a')
doc.write("project:" + project + " " + "algorithm: " + algorithmName + " sampleMethod: \n")
doc.write("-----------------------\n")
for i in range(N):
    skf = StratifiedKFold(n_splits=10, shuffle=True)
    for train_index, test_index in skf.split(x, y):
        #print("TRAIN:", train_index, "TEST:", test_index)
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        #x_train, y_train = BorderlineOverSampling(x_train, y_train, 0.5, 10).sampling()
        #x_train, y_train = Adasyn(x_train, y_train, 0.5, 10).sampling()
        #x_train, y_train = MAHAKIL().fit_sample(x_train, y_train)
        #x_train, y_train = IMAHAKIL().fit_sample(x_train, y_train)
        x_train, y_train = Smote(x_train, y_train, 50, 10).over_sampling()
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        acc.append(accuracy_score(y_test, y_pred))
        precision.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))
        f1.append(f1_score(y_test, y_pred))
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        matrix.append(ConfusionMatrix(tn, fp, fn, tp))
        gMean.append(((tp / (tp + fn)) * (tn / (fp + tn))) ** 0.5)
        #print(count)
        print(str(count) + "," + project)
        count = count + 1
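# The fold loop above computes the G-mean inline from the flattened confusion
# matrix. A minimal helper with the same arithmetic (the name g_mean is ours,
# not from the original script): geometric mean of sensitivity and specificity.
import numpy as np
from sklearn.metrics import confusion_matrix

def g_mean(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    sensitivity = tp / (tp + fn)   # recall on the positive (defective) class
    specificity = tn / (tn + fp)   # recall on the negative (clean) class
    return np.sqrt(sensitivity * specificity)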
k, data_0, data_1, headers = data_get()
if k >= 2:
    n = int(k - 1)
    data = data_1
    smote_class = '1'
elif 1 / k >= 2:
    n = int(1 / k - 1)
    data = data_0
    smote_class = '0'
#data = data[0:5]
data = np_array(data)
print(data)
s = Smote(data, N=100)
s = s.over_sampling()
s = s.tolist()
smote = []
for line in s:
    l = [0] + line
    for i in range(1, len(line) + 1):
        l[i] = int(line[i - 1] + 0.5)
    print(l)
    smote.append(l)
f = FileProcess()
headers, d = f.file_get_data_row(filename)
from smote import Smote
import numpy as np
from sklearn.model_selection import train_test_split

dataset = np.loadtxt('I:\\tools\\SoftwarePrediction\\dataset\\kc2.txt', delimiter=",")
length = len(dataset[0])
x = dataset[:, 0:length - 1]
y = dataset[:, length - 1]
x_train = x
y_train = y
data_new, label_new = Smote(x_train, y_train, 90, 5).over_sampling()
label_new = label_new.reshape(1, label_new.shape[0]).T
data_new = np.concatenate((data_new, label_new), axis=1)
doc = open("test.txt", 'w')
for row in data_new:
    doc.write(str(int(row[0])) + "," + str(int(row[1])) + "," + str(int(row[2])) + "\n")
doc.close()
new_attributes = []
new_attributes_num = 32
#new_attributes_num = len(X_train.columns.values)
i = 0
for item in gini_list:
    #print(type(item))
    #print(item)
    if i < new_attributes_num:
        new_attributes.append(str(item[0]))
    i = i + 1
X_train = X_train[new_attributes]

# Begin: smote
new_train_df = pandas.concat([X_train, y_train], axis=1)
smote_processor = Smote(new_train_df[new_train_df[target_key] == 1], N=200, k=5)
train_df_sample = smote_processor.over_sampling()
#X_sample,y_sample = smote_processor.over_sampling()
sample = DataFrame(train_df_sample, columns=new_train_df.columns.values)
#sample_datapreprocessing = DataPreprocessing(sample,sample.drop(target_key,axis=1,inplace=False).columns.values,target_key)
#sample_datapreprocessing.data_summary()
X_train = pandas.concat([X_train, sample[X_train.columns.values]], axis=0)
y_train = pandas.concat([
    y_train.to_frame().rename(columns={0: target_key}),
    sample[target_key].to_frame().rename(columns={0: target_key})
], axis=0)[target_key]
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
merged_train_datapreprocessing = DataPreprocessing(
print("recall", recall.mean()) print("f1", f1.mean()) print("gMean", gMean.mean()) ################################################################################# acc = [] precision = [] recall = [] f1 = [] gMean = [] matrix = [] #x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4,random_state=3) #x_train, y_train = BorderlineOverSampling(x_train, y_train, 0.5, 10).sampling() #x_train, y_train = Adasyn(x_train, y_train, 0.5, 10).sampling() #x_train, y_train = MAHAKIL().fit_sample(x_train, y_train) #x_train, y_train = IMAHAKIL().fit_sample(x_train, y_train) x_train, y_train = Smote(before_x_train, before_y_train, pfp, 10).over_sampling() doc.write( "\n" + "\n" + "\n" + "\n" + "\n" + "##################################Smote############################################" + "\n") for i in range(len(x_train)): for j in range(len(x_train[i])): doc.write(str(round(x_train[i][j], 2)) + ",") doc.write(str(y_train[i]) + "\n") clf.fit(x_train, y_train) y_pred = clf.predict(x_test) acc.append(accuracy_score(y_test, y_pred)) precision.append(precision_score(y_test, y_pred)) recall.append(recall_score(y_test, y_pred))
# Randomly split into training and test sets according to the given proportions
train_X, m, train_Y, n = fft_split_data(np.array(data_X), data_Y, 0., 0., 0.)
train_X = StandardScaler().fit_transform(train_X)

# Test set
data_file2 = r"G:\声发射试验\pjx-节段胶拼压弯AE试验\AE数据\试件2-0324\正式加载\wave-2欠采样.csv"
bb, initial_test_y = fft_four_tensor_frequency(data_file2, 550, 2200)
wavelet_file = "G:\\声发射试验\\pjx-节段胶拼压弯AE试验\\AE数据\\试件2-0324\\正式加载\\wavelet.csv"
with open(wavelet_file, 'r') as fl:
    initial_test_x = pd.read_csv(fl)
test_X, a, test_Y, b = fft_split_data(np.array(initial_test_x), initial_test_y, 0., 0., 0.)
test_X = StandardScaler().fit_transform(test_X)

# Create new samples with SMOTE
s0 = Smote(train_X[:430], N=54, k=8)
new_samples0 = s0.over_sampling()
s2 = Smote(train_X[23954:], N=4, k=5)
new_samples2 = s2.over_sampling()

# Append the new samples
train_X = np.concatenate((train_X, new_samples0, new_samples2), axis=0)
train_Y = np.concatenate((train_Y,
                          np.array([[1, 0, 0]] * len(new_samples0)),
                          np.array([[0, 0, 1]] * len(new_samples2))), axis=0)
# print("Number of signals per class in the training set:", list(train_Y).count([1, 0, 0]), list(train_Y).count([0, 1, 0]),
#       list(train_Y).count([0, 0, 1]), 'total:', len(train_Y))
print(train_X.shape, test_X.shape, train_Y.shape, test_Y.shape)

# Standardization
rows, columns = train_X.shape
def f_start_calculate():
    # Randomly split into training and test sets according to the given proportions
    train_X, test_X, train_Y, test_Y = split_data(np.array(data_X), data_Y, 0., 0., 0.)
    train_X = StandardScaler().fit_transform(train_X)
    # print(train_X)

    # Create new samples with SMOTE
    s0 = Smote(train_X[:430], N=54, k=10)
    new_samples0 = s0.over_sampling()
    s2 = Smote(train_X[23954:], N=4, k=5)
    new_samples2 = s2.over_sampling()

    # Append the new samples
    train_X = np.concatenate((train_X, new_samples0, new_samples2), axis=0)
    train_Y = np.concatenate((train_Y,
                              np.array([0] * len(new_samples0)),
                              np.array([2] * len(new_samples2))), axis=0)

    # Test set
    data_file2 = r"G:\声发射试验\pjx-节段胶拼压弯AE试验\AE数据\试件2-0324\正式加载\t滤波后-7200.csv"
    # frequency_data_file2 = r"G:\声发射试验\pjx-节段胶拼压弯AE试验\AE数据\试件2-0324\正式加载\fft.csv"
    initial_test_x, initial_test_y = four_tensor_data(data_file2, 550, 2200)
    # wave_file = r"G:\声发射试验\pjx-节段胶拼压弯AE试验\AE数据\试件2-0324\正式加载\wave.csv"
    # with open(wave_file, 'r') as fl:
    #     waves = pd.read_csv(fl)
    # waves = waves.iloc[:, 2:]
    # fc = FeatureCreation(np.array(waves))
    # data = pd.DataFrame(initial_test_x)
    # data['average_x'] = fc.average_x()
    # data['p_x'] = fc.p_x()
    # data['rms_x'] = fc.rms_x()
    # data['std_x'] = fc.std_x()
    # data['sk_x'] = fc.sk_x()
    # data['kv_x'] = fc.kv_x()
    # data['sf_x'] = fc.sf_x()
    # data['cf_x'] = fc.cf_x()
    wavelet_file = "G:\\声发射试验\\pjx-节段胶拼压弯AE试验\\AE数据\\试件2-0324\\正式加载\\wavelet.csv"
    with open(wavelet_file, 'r') as fl:
        wavelet = pd.read_csv(fl)
    # data = pd.concat((data, wavelet), axis=1)
    initial_test_x = np.array(wavelet)
    test_X, a, test_Y, b = split_data(np.array(initial_test_x), initial_test_y, 0., 0., 0.)
    test_X = StandardScaler().fit_transform(test_X)

    # Standardization
    # rows, columns = train_X.shape
    # print(train_X.shape, test_X.shape)
    # scale_X = StandardScaler().fit_transform(np.concatenate((train_X, test_X), axis=0))
    # train_scale_X = scale_X[:rows]
    # test_scale_X = scale_X[rows:]
    ss = StandardScaler().fit(train_X)
    train_scale_X = ss.transform(train_X)
    test_scale_X = ss.transform(test_X)

    print("Number of signals per class in the training set:", list(train_Y).count(0),
          list(train_Y).count(1), list(train_Y).count(2), 'total signals:', len(train_Y))

    # clf_nn = RandomForestClassifier(n_estimators=500, min_samples_leaf=4, max_depth=4)
    # clf_nn = GradientBoostingClassifier(n_estimators=50, min_samples_leaf=10, learning_rate=0.1, random_state=1)
    clf_nn = MLPClassifier(solver='adam',
                           alpha=1.,
                           activation='relu',
                           hidden_layer_sizes=(600, ),
                           max_iter=2000,
                           verbose=1,
                           learning_rate='adaptive',
                           early_stopping=True,
                           random_state=6)
    clf_nn.fit(train_scale_X, train_Y)
    predictions = clf_nn.predict(test_scale_X)

    print("Number of signals per class in the test set:", list(test_Y).count(0),
          list(test_Y).count(1), list(test_Y).count(2), 'total signals:', len(test_Y))
    print(' test_Y:', list(test_Y))
    print('predictions:', list(predictions))
    # print("Per-class accuracy:", accuracy(test_Y, predictions))
    # print("Overall accuracy:", accuracy_score(test_Y, predictions))

    test_prob = clf_nn.predict_proba(test_scale_X)
    train_prob = clf_nn.predict_proba(train_scale_X)
    # clf_nn_prob = MLPClassifier(solver='adam',
    #                             alpha=1.,
    #                             activation='relu',
    #                             hidden_layer_sizes=(100),
    #                             max_iter=2000,
    #                             verbose=0,
    #                             learning_rate='adaptive',
    #                             early_stopping=True,
    #                             random_state=9)
    # clf_nn_prob = SVC(class_weight={0:1., 1:1., 2:1.5})
    # clf_nn_prob = DecisionTreeClassifier(max_depth=10, min_samples_leaf=2, random_state=1)
    # clf_nn_prob = GradientBoostingClassifier(n_estimators=20, min_samples_leaf=4, learning_rate=0.1, random_state=1)
    # clf_nn_prob = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=500, random_state=0)
    # clf_nn_prob.fit(train_scale_X, train_Y)
    # nn_prob_predictions = clf_nn_prob.predict(test_scale_X)
    # print("predictions:", list(nn_prob_predictions))
    # print("Per-class accuracy:", accuracy(test_Y, nn_prob_predictions))

    output = []
    # # return
    for num_samples in range(1, 21):
        test_y = []
        for i in range(0, len(test_Y), num_samples):
            test_y.append(np.argmax(np.bincount(test_Y[i:i + num_samples])))
        predict_y = []
        for i in range(0, len(predictions), num_samples):
            predict_y.append(np.argmax(np.bincount(predictions[i:i + num_samples])))
        # print("Number of test-set blocks per class:", list(test_y).count(0), list(test_y).count(1), list(test_y).count(2),
        #       'total:', len(test_y))
        # print(" test_y:", test_y)
        # print("predict_y:", predict_y)
        # print("Per-class accuracy:", accuracy(test_y, predict_y))
        # print("Overall accuracy:", accuracy_score(test_y, predict_y))
        # print("Per-class accuracy after probability-based scaling:", accuracy(test_y, prediction_use_prob(test_prob, num_samples)))
        # print("Overall accuracy after probability-based scaling:", accuracy_score(test_y, prediction_use_prob(test_prob, num_samples)))
        output.append(
            [accuracy_score(test_y, predict_y)] +
            list(accuracy(test_y, predict_y)) +
            [accuracy_score(test_y, prediction_use_prob(test_prob, num_samples))] +
            list(accuracy(test_y, prediction_use_prob(test_prob, num_samples))))
    print(pd.DataFrame(data=output,
                       columns=["overall accuracy", "p0", 'p1', 'p2',
                                "prob-based overall accuracy", 'wp0', 'wp1', 'wp2']))
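# prediction_use_prob is defined elsewhere; judging from its call signature
# (test_prob, num_samples) it presumably pools the per-signal class
# probabilities over each block of num_samples consecutive signals and returns
# one label per block. A sketch of that assumed behaviour, not the original:
import numpy as np

def prediction_use_prob_sketch(prob, num_samples):
    labels = []
    for i in range(0, len(prob), num_samples):
        # sum the class probabilities within the block, then take the argmax
        labels.append(int(np.argmax(prob[i:i + num_samples].sum(axis=0))))
    return labels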
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn import linear_model
from Imahakil import IMAHAKIL
from smote import Smote

dataset = np.loadtxt('I:\\tools\\SoftwarePrediction\\dataset\\pc1.txt', delimiter=",")
length = len(dataset[0])
x = dataset[:, 0:length - 1]
y = dataset[:, length - 1]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4)
x_train, y_train = Smote(x_train, y_train, 90, 5).over_sampling()
clf = linear_model.LogisticRegression(solver='liblinear', max_iter=10000)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
# count=0
# for i in range(len(result)):
#     if result[i]==y_test[i]:
#         count=count+1
# print(count/len(result))
print("acc", accuracy_score(y_test, y_pred))
for i in range(len(y_pred)):
    if y_test[i] == 1 and y_pred[i] == 1:
        print(i)
print("precision", precision_score(y_test, y_pred))
#!/usr/bin/env python
#-*- coding:utf-8 -*-
import pandas

from smote import Smote

if __name__ == "__main__":
    #file_fullpath='/home/login01/Workspaces/python/dataset/module_data_stg1/sample'
    file_fullpath = '/home/login01/Workspaces/python/dataset/cs.csv'
    cs = pandas.read_csv(file_fullpath, sep=',', index_col=0, na_values='NA', low_memory=False)
    cs_mean_MonthlyIncome = cs.MonthlyIncome.mean(skipna=True)
    cs_mean_NumberOfDependents = cs.NumberOfDependents.mean(skipna=True)
    # .ix has been removed from recent pandas; .loc does the same label-based assignment
    cs.loc[:, 'MonthlyIncome'] = cs.MonthlyIncome.fillna(cs_mean_MonthlyIncome, inplace=False)
    cs.loc[:, 'NumberOfDependents'] = cs.NumberOfDependents.fillna(cs_mean_NumberOfDependents, inplace=False)
    ismote = Smote(cs, 20, 6)
    print(ismote.n_samples)
    print(ismote.n_attrs)
    mysample = ismote.over_sampling()
    print(mysample)
data = {}
t = len(data_all) / sum(DATA_PROPORTION.values())
train_count = int(DATA_PROPORTION['train'] * t)
test_count = int(DATA_PROPORTION['test'] * t)
data['train'] = data_all[:train_count]
data['test'] = data_all[train_count:train_count + test_count]
data['verify'] = data_all[train_count + test_count:]

print('Creating more samples by SMOTE...')
sys.stdout.flush()
cover_shape = (3, IMAGE_RESIZE_SIZE, IMAGE_RESIZE_SIZE)
positive_covers = [record[0].reshape(-1) for record in data['train'] if record[1]]
smote = Smote(np.array(positive_covers), N=OVER_SIMPLING_N, k=5)
over_simples = smote.over_sampling()
data['train'] += [(np.reshape(simple, cover_shape), True) for simple in over_simples]

print('Shuffling data...')
sys.stdout.flush()
for d in data.values():
    random.shuffle(d)

print('Exporting data.info ...')
sys.stdout.flush()
with open('data.info', 'w') as f:
    f.write(str({k: len(v) for k, v in data.items()}))

print('Preparing export recordio file...')