import numpy as np
from imblearn.over_sampling import SMOTE


def over_sample(buys, sells, nones, seed):
    """Balance the three classes (none=0, buy=1, sell=2) with SMOTE."""
    features = np.concatenate((buys, sells, nones), axis=0)
    labels = np.array([1] * len(buys) + [2] * len(sells) + [0] * len(nones),
                      dtype=np.int32)
    # SMOTE interpolates in a flat feature space, so flatten each 3-D sample
    # into a single vector before resampling.
    all_features = features.reshape(
        features.shape[0],
        features.shape[1] * features.shape[2] * features.shape[3])
    sampled_features, sampled_labels = SMOTE(random_state=seed).fit_resample(
        all_features, labels)
    # Restore the original per-sample shape.
    sampled_features = sampled_features.reshape(
        sampled_features.shape[0],
        features.shape[1], features.shape[2], features.shape[3])
    # Split the balanced set back into the three classes.
    sampled_buys = sampled_features[sampled_labels == 1].astype(np.float32)
    sampled_sells = sampled_features[sampled_labels == 2].astype(np.float32)
    sampled_nones = sampled_features[sampled_labels == 0].astype(np.float32)
    return sampled_buys, sampled_sells, sampled_nones
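# A minimal, self-contained sanity check for over_sample. The shapes below are
# illustrative assumptions (stacks of 8x8 single-channel windows), not taken
# from the original data.
if __name__ == '__main__':
    rng = np.random.default_rng(0)
    demo_buys = rng.random((30, 8, 8, 1), dtype=np.float32)
    demo_sells = rng.random((20, 8, 8, 1), dtype=np.float32)
    demo_nones = rng.random((100, 8, 8, 1), dtype=np.float32)
    b, s, n = over_sample(demo_buys, demo_sells, demo_nones, seed=42)
    print(len(b), len(s), len(n))  # every class grows to the majority count: 100 100 100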
import numpy as np
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold


def run_save_model(save_folder, spec, model_no, X_train, y_train, model_fn):
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=2)
    cvscores = []
    f1scores = []
    for train, val in kfold.split(X_train, y_train):
        # create model using the model_fn parameter
        model = model_fn(spec, X_train)
        if model is None:
            return  # returns if there was a mistake in the specifications
        # oversample only the k-fold training split, then fit the model on it;
        # SMOTE(kind='borderline1') and fit_sample were removed from imblearn,
        # so BorderlineSMOTE with fit_resample is the current equivalent
        num_examples, dx, dy = X_train[train].shape
        X_resampled, y_resampled = BorderlineSMOTE(
            kind='borderline-1', random_state=1).fit_resample(
                X_train[train].reshape((num_examples, dx * dy)), y_train[train])
        num_total_examples, _ = X_resampled.shape
        X_resampled_reshaped = X_resampled.reshape(num_total_examples, dx, dy)
        model.fit(x=X_resampled_reshaped, y=y_resampled,
                  epochs=10, batch_size=16, verbose=0)
        # evaluate model on the untouched validation split
        scores = model.evaluate(X_train[val], y_train[val], verbose=0)
        print('Accuracy: {}%'.format(scores[1] * 100))
        cvscores.append(scores[1])
        # get F1 by thresholding the sigmoid output at 0.5
        f1 = f1_score(y_train[val], model.predict(X_train[val]) > 0.5)
        print('F1 score: {}'.format(f1))
        f1scores.append(f1)
    mean_acc = 'Mean Accuracy: {}% +/- {}%'.format(
        np.mean(cvscores) * 100, np.std(cvscores) * 100)
    mean_f1 = 'Mean F1 score: {} +/- {}'.format(np.mean(f1scores), np.std(f1scores))
    print(mean_acc)
    print(mean_f1)
    # modelfile = save_folder + 'model' + str(model_no) + '.h5'
    # save_model(model, modelfile)
    # print('model saved')
    txtfile = save_folder + 'model' + str(model_no) + '.txt'
    with open(txtfile, 'w') as f:
        f.write(mean_acc + '\n')
        f.write(mean_f1 + '\n')
        f.write('\n')
        f.writelines(spec)
    print('specs saved')
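# A hedged sketch of a `model_fn` compatible with run_save_model. The layer
# sizes are placeholders and the `spec` handling is omitted, since the real
# spec format is not shown in this snippet. The contract is: return a compiled
# Keras model, or None when the spec is invalid.
def example_model_fn(spec, X_train):
    from keras.models import Sequential
    from keras.layers import LSTM, Dense
    model = Sequential([
        LSTM(32, input_shape=X_train.shape[1:]),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model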
import os
import numpy as np
import imblearn
from collections import Counter
from random import shuffle
from imblearn.over_sampling import (SMOTE, ADASYN, KMeansSMOTE,
                                    RandomOverSampler, SVMSMOTE)


def target_training_data(targetclass):
    ##### target_training_data(targetclass) generates the second half of the
    ##### training data set; generation of the evaluation data set is
    ##### outsourced to all_target_training_data(Nnofs, Nnofs_evaluate, fractrain).
    ##### fractrain, Nnofs, npycountt and sampler_train are module-level globals.
    import dictionary
    dictionary = dictionary.dict
    print(' ')
    print('Resampling training set for class', targetclass)
    X1 = 3; X2 = 40  # components of each high-dimensional point used for visualisation
    classes = [keys for keys in dictionary]
    classdir = classes
    traincontainer = []; traincontainer_y = []
    origcontainer = []; origcontainer_y = []
    origcontainer_ynn = []; traincontainer_ynn = []
    appendsecondhalf = []
    for i in range(len(classdir)):
        cl = classdir[i]
        dirinclass = os.listdir(cl)
        lendirinclass = len(dirinclass)
        dirinclass = [os.path.join(cl, dirinclass[i], 'ta.npy')
                      for i in range(lendirinclass)]
        #################### targetclass ############
        fnorig = str(targetclass) + '.orig.npy'
        fnorig_y = str(targetclass) + '_y.orig.npy'
        fnorig_ynn = str(targetclass) + '_ynn.orig.npy'
        fntrain = str(targetclass) + '.train.npy'
        fntrain_y = str(targetclass) + '_y.train.npy'
        fntrain_ynn = str(targetclass) + '_ynn.train.npy'
        if cl == targetclass:
            # shuffle the target-class files, keep the first fraction as the
            # held-out original set and the rest for training (label 0)
            shuffle(dirinclass)
            firsthalf = dirinclass[0:int(fractrain * len(dirinclass))]
            secondhalf = dirinclass[int(fractrain * len(dirinclass)):]
            appendsecondhalf.append(secondhalf)
            for k in range(len(firsthalf)):
                datain = np.load(firsthalf[k])
                origcontainer.append(datain)
                origcontainer_y.append(0)
                origcontainer_ynn.append([0, 1])
            for k in range(len(secondhalf)):
                datain = np.load(secondhalf[k])
                traincontainer.append(datain)
                traincontainer_y.append(0)
                traincontainer_ynn.append([0, 1])
        else:
            # every other class contributes training data with label 1
            for k in range(len(dirinclass)):
                nnpy = dirinclass[k]
                if os.path.isfile(nnpy):
                    datain = np.load(nnpy)
                    traincontainer.append(datain)
                    traincontainer_y.append(1)
                    # [y, 1-y] one-hot; the original appended [0, 1] here,
                    # which is inconsistent with the y_resnn encoding below
                    traincontainer_ynn.append([1, 0])
    origcontainer = np.array(origcontainer)
    origcontainer_y = np.array(origcontainer_y)
    traincontainer = np.array(traincontainer)
    traincontainer_y = np.array(traincontainer_y)
    origcontainer_ynn = np.array(origcontainer_ynn)
    traincontainer_ynn = np.array(traincontainer_ynn)
    # np.save(fnorig, origcontainer)
    # np.save(fnorig_y, origcontainer_y)
    # np.save(fnorig_ynn, origcontainer_ynn)
    np.save(fntrain, traincontainer)
    np.save(fntrain_y, traincontainer_y)
    np.save(fntrain_ynn, traincontainer_ynn)
    #################### end of targetclass ############
    ####################################################
    # oversampling of the training set
    X = traincontainer
    y = traincontainer_y
    # flatten each square sample into a vector for the oversamplers; the
    # original used X.shape[2]*X.shape[2], which only works because the
    # samples are square -- X.shape[1]*X.shape[2] is the general form
    X = np.reshape(X, (X.shape[0], X.shape[1] * X.shape[2]))
    ####### scatter plot of X and y (optional visual check)
    # plt.scatter(X[:, X1], X[:, X2], marker='o',
    #             c=y, s=25, edgecolor='k', cmap=plt.cm.coolwarm)
    # plt.show()

    #### creating sampling_strategy #####
    sampling_strategy = {}
    print('npycountt:', npycountt)
    sampling_strategy[0] = Nnofs * npycountt
    sampling_strategy[1] = Nnofs * npycountt
    print('sampling_strategy (training set) =', sampling_strategy)

    ##### implementing oversampling ####
    if sampler_train == 'SMOTE':
        k = 2; seed = 10; n_jobs = -1
        X_res, y_res = SMOTE(sampling_strategy=sampling_strategy,
                             k_neighbors=k - 1, random_state=seed,
                             n_jobs=n_jobs).fit_resample(X, y)
    elif sampler_train == 'BorderlineSMOTE':
        k = 2; seed = 10; n_jobs = -1
        X_res, y_res = imblearn.over_sampling.BorderlineSMOTE(
            sampling_strategy=sampling_strategy, random_state=seed,
            k_neighbors=k, n_jobs=n_jobs).fit_resample(X, y)
    elif sampler_train == 'ADASYN':
        k = 3; seed = 10; n_jobs = -1
        X_res, y_res = ADASYN(random_state=seed,
                              sampling_strategy=sampling_strategy,
                              n_neighbors=k + 1, n_jobs=n_jobs).fit_resample(X, y)
    elif sampler_train == 'KMeansSMOTE':
        k = 2; seed = 10; n_jobs = -1
        X_res, y_res = KMeansSMOTE(sampling_strategy=sampling_strategy,
                                   random_state=seed, k_neighbors=k + 2,
                                   n_jobs=n_jobs).fit_resample(X, y)
    elif sampler_train == 'RandomOverSampler':
        seed = 10
        X_res, y_res = RandomOverSampler(sampling_strategy=sampling_strategy,
                                         random_state=seed).fit_resample(X, y)
    elif sampler_train == 'SVMSMOTE':
        k = 4; m_neighbors = 2 * k; n_jobs = -1; seed = 10
        # the original computed m_neighbors but never passed it; it is
        # forwarded here so the setting takes effect
        X_res, y_res = SVMSMOTE(sampling_strategy=sampling_strategy,
                                random_state=seed, k_neighbors=k,
                                m_neighbors=m_neighbors,
                                n_jobs=n_jobs).fit_resample(X, y)
    #### end of oversampling ####

    # two-column one-hot labels: [y, 1 - y]
    y_resnn = [[y_res[i], 1 - y_res[i]] for i in range(len(y_res))]

    # restore each flattened vector to its square (dim x dim) shape
    dim = int(X_res.shape[1] ** 0.5)
    X_res = X_res.reshape(X_res.shape[0], dim, dim)

    ### report sizes of the data before and after oversampling
    print("counter before oversampling (training set) = ",
          sorted(Counter(y).items())[0])
    norigsample = sorted(Counter(y).items())[0][1]
    print("counter after oversampling (training set) = ",
          sorted(Counter(y_res).items()))
    print('Total number of data before oversampling:', norigsample)
    novsp = Counter(y_res)[0]
    print('Total number of data after oversampling:', novsp)
    print('Ratio of number of data after and before oversampling of training data:',
          novsp / norigsample)

    ### save oversampled data
    fnres = targetclass + '_ovsp.train.npy'
    fnres_y = targetclass + '_y_ovsp.train.npy'
    fnres_ynn = targetclass + '_ynn_ovsp.train.npy'
    np.save(fnres_y, y_res)
    np.save(fnres_ynn, y_resnn)
    np.save(fnres, X_res)
    ####################################################
    return firsthalf, appendsecondhalf
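# A hedged usage sketch for target_training_data. The globals it reads must be
# set at module level first; the values below are placeholders, not taken from
# the original source, and 'classA' stands for a real class directory name.
# fractrain = 0.5
# Nnofs = 4
# npycountt = 100
# sampler_train = 'SMOTE'
# firsthalf, secondhalves = target_training_data('classA')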
# randomly divide train & test data; test data size = 300
X_train, X_test, Y_train, Y_test, X_train_image, X_test_image, Y_train_image, Y_test_image = pickle_import.func_import(
    300)

# imbalanced data handling
import collections
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN

# Apply SMOTE to the training data; image data must be flattened to 2-D for
# the resampler and reshaped back to (n, 50, 50, 3) afterwards
X_train_SMOTE, Y_train_SMOTE = SMOTE(random_state=0).fit_resample(
    X_train, Y_train)
X_train_image_SMOTE, Y_train_image_SMOTE = SMOTE(random_state=0).fit_resample(
    X_train_image.reshape(X_train_image.shape[0], -1), Y_train_image)
X_train_image_SMOTE = X_train_image_SMOTE.reshape(X_train_image_SMOTE.shape[0],
                                                  50, 50, 3)

# Apply ADASYN to the training data, with the same flatten/reshape round trip
X_train_ADASYN, Y_train_ADASYN = ADASYN(random_state=0).fit_resample(
    X_train, Y_train)
X_train_image_ADASYN, Y_train_image_ADASYN = ADASYN(
    random_state=0).fit_resample(
        X_train_image.reshape(X_train_image.shape[0], -1), Y_train_image)
X_train_image_ADASYN = X_train_image_ADASYN.reshape(
    X_train_image_ADASYN.shape[0], 50, 50, 3)

# 0 - non-fire image, 1 - fire image
print("Original data:", collections.Counter(Y_train))
print("After SMOTE  :", collections.Counter(Y_train_SMOTE))
print("After ADASYN :", collections.Counter(Y_train_ADASYN))
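# Hedged sanity check: the flatten -> resample -> reshape round trip should
# leave the image shape intact, and the image label counts should now be
# balanced just like the tabular ones printed above.
assert X_train_image_SMOTE.shape[1:] == (50, 50, 3)
print("After SMOTE (images):", collections.Counter(Y_train_image_SMOTE))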
# plt.ylabel('y')
# plt.scatter(X_res[:, X1], X_res[:, X2], marker='o',
#             c=y_res, s=25, edgecolor='k', cmap=plt.cm.coolwarm)
# plt.show()

# build k-wide one-hot labels from y_res (k, the number of classes, is
# defined earlier in the module)
y_res_3 = []
for i in range(len(y_res)):
    dummy = [0 for j in range(k)]
    dummy[y_res[i]] = 1
    y_res_3.append(dummy)

print("counter after oversampling = ", sorted(Counter(y_res).items()))

# restore each flattened vector to its square (dim x dim) shape
dim = int(X_res.shape[1] ** 0.5)
X_res = X_res.reshape(X_res.shape[0], dim, dim)

# save every resampled item as '<class>.<running index>.npy', grouped by the
# class name looked up through cdict (label -> class name, defined earlier)
print('')
for j in classdir:
    countfn2s = 0
    for i in range(len(y_res)):
        if cdict[y_res[i]] == j:
            countfn2s = countfn2s + 1
            fn2s = j + '.' + str(countfn2s) + '.npy'
            np.save(fn2s, X_res[i])
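# Hedged companion sketch: reload the per-class files written above, matching
# the '<class>.<index>.npy' naming used in the loop (glob pattern assumed).
import glob

def load_class_samples(classname):
    return [np.load(fn) for fn in sorted(glob.glob(classname + '.*.npy'))]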
train_features = pd.DataFrame(pca.fit_transform(train_features))
test_features = pd.DataFrame(pca.transform(test_features))
explained_variance = pca.explained_variance_ratio_
pca_stats = pd.DataFrame(explained_variance)
df_feature_list = pd.DataFrame(feature_list, columns=['feature'])
# note: this join pairs the i-th component's variance ratio with the i-th
# original feature name purely by position; PCA components are mixtures of
# all input features, so read the table with care
pca_stats = pca_stats.join(df_feature_list)
# Change the path accordingly
# pca_stats.to_csv(r'T:\tbase\short_lstm\pca_stats.csv')

# =============================================================================
# Reshaping the dataframes into 3 dimensional vectors for LSTM input
# =============================================================================
train_features = np.array(train_features)
train_features = train_features.reshape((4310, 1, 168))
train_labels = np.array(train_labels)
train_labels = train_labels.reshape((4310, 1, 1))
test_features = np.array(test_features)
# Change to 1438 when oversampled
test_features = test_features.reshape((730, 1, 168))
test_labels = np.array(test_labels)
test_labels = test_labels.reshape((730, 1, 1))

# =============================================================================
# BI-LSTM
# =============================================================================
import keras
import tensorflow as tf
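# A hedged sketch of the Bi-LSTM this section sets up (the actual architecture
# is not shown in this excerpt; the layer size is a placeholder, matched to the
# (1, 168) inputs reshaped above).
from keras.models import Sequential
from keras.layers import Bidirectional, LSTM, Dense

model = Sequential([
    Bidirectional(LSTM(64), input_shape=(1, 168)),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])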
import numpy as np
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from keras.utils import to_categorical


def resampleTrainSets(data, labels, ttResampleType='Stratified'):
    # https://www.kaggle.com/rafjaa/resampling-strategies-for-imbalanced-datasets
    # labels are two-column one-hot arrays with the FAIL flag in column 0;
    # if 'Stratified' is chosen, the data passes through unchanged
    trainData = data
    trainLabels = labels
    if ttResampleType == 'Under':
        # split into FAIL records
        failRecs = trainLabels[:, 0].astype('bool')
        failLabels = trainLabels[failRecs]
        failData = trainData[failRecs]
        numFails = len(failLabels)
        # split into PASS records
        passRecs = ~failRecs
        passLabels = trainLabels[passRecs]
        passData = trainData[passRecs]
        ### UNDERSAMPLE PASS data: shuffle, then keep only numFails records
        n_samples = passLabels.shape[0]
        mix = np.random.permutation(n_samples)
        passLabels = passLabels[mix]
        passData = passData[mix]
        passLabels = passLabels[:numFails]
        passData = passData[:numFails]
        # paste FAIL and PASS subsets back together
        trainLabels = np.concatenate((passLabels, failLabels))
        trainData = np.concatenate((passData, failData))
    elif ttResampleType == 'Over':
        # split into FAIL records
        failRecs = trainLabels[:, 0].astype('bool')
        failLabels = trainLabels[failRecs]
        failData = trainData[failRecs]
        numFails = len(failLabels)
        # split into PASS records
        passRecs = ~failRecs
        passLabels = trainLabels[passRecs]
        passData = trainData[passRecs]
        numPasses = len(passLabels)
        ### OVERSAMPLE FAIL data: draw numPasses indices, with replacement,
        ### from the available FAIL records
        mix = np.random.choice(numFails, numPasses)
        newFailLabels = [failLabels[x] for x in mix]
        newFailData = [failData[x] for x in mix]
        # paste FAIL and PASS subsets back together
        trainLabels = np.concatenate((passLabels, newFailLabels))
        trainData = np.concatenate((passData, newFailData))
    elif ttResampleType == 'SMOTE':
        # undo the reshaping of the input data so it works with SMOTE
        trainLabels = trainLabels[:, 1]
        trainData = trainData.reshape((trainData.shape[0], trainData.shape[1]))
        # do the SMOTEing
        trainData, trainLabels = SMOTE().fit_resample(trainData, trainLabels)
        # redo the reshaping of the input data
        trainLabels = to_categorical(trainLabels)
        trainData = trainData.reshape(
            (trainData.shape[0], trainData.shape[1], 1))
    elif ttResampleType == 'NearMiss':
        # undo the reshaping of the input data so it works with NearMiss
        trainLabels = trainLabels[:, 1]
        trainData = trainData.reshape((trainData.shape[0], trainData.shape[1]))
        # do the NearMissing
        nm = NearMiss(version=1)
        trainData, trainLabels = nm.fit_resample(trainData, trainLabels)
        # redo the reshaping of the input data
        trainLabels = to_categorical(trainLabels)
        trainData = trainData.reshape(
            (trainData.shape[0], trainData.shape[1], 1))
    # remix training data and labels
    n_samples = trainLabels.shape[0]
    mix = np.random.permutation(n_samples)
    trainLabels = trainLabels[mix]
    trainData = trainData[mix]
    return trainData, trainLabels
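# Minimal demo of resampleTrainSets (hedged: the shapes and FAIL rate below
# are illustrative assumptions). Labels are one-hot with FAIL in column 0.
if __name__ == '__main__':
    np.random.seed(0)
    demo_data = np.random.rand(200, 20, 1)
    fail_flags = np.random.rand(200) < 0.2  # roughly 20% FAIL records
    demo_labels = np.stack([fail_flags, ~fail_flags], axis=1).astype('float32')
    d, l = resampleTrainSets(demo_data, demo_labels, ttResampleType='SMOTE')
    print(d.shape, l[:, 0].sum(), l[:, 1].sum())  # FAIL/PASS counts now equal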
from numpy import hstack
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE

# labelencoder and the raw `values` array are defined earlier in the script
values[:, 24] = labelencoder.fit_transform(values[:, 24])
values = values.astype('float32')

# Feature Scaling
scaler = MinMaxScaler(feature_range=(0, 1))
values[:, 0:24] = scaler.fit_transform(values[:, 0:24])

# Define X and Y
XFinal = values[:, 0:24]
Y = values[:, 24]

# Fix imbalance in the event labels; note that resampling before the
# chronological split below mixes synthetic points into the test rows
XFinal, Y = SMOTE().fit_resample(XFinal, Y.ravel())

# Set inputs and outputs
inputs = XFinal.reshape((len(XFinal), 24))
output = Y.reshape((len(Y), 1))

# horizontally stack columns
dataset = hstack((inputs, output))

# split into train and test sets
n_train_minutes = 360 * 3 * 256  # train on first 18 minutes
train = dataset[:n_train_minutes, :]
test = dataset[n_train_minutes:, :]

# split into input and outputs
X_train, y_train = train[:, :-1], train[:, -1]
X_test, y_test = test[:, :-1], test[:, -1]
# reshape input to be 3D [samples, timesteps, features]
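# A minimal completion of the reshape announced in the comment above, using
# one timestep per sample (an assumption; the original listing is cut off here).
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))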
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import LSTM, Dropout, Dense

data = data.dropna()
temp_c = list(data.columns)
temp_c.remove('label')
y = data['label']
X = data[features]
# shuffle=False keeps the split chronological
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    random_state=42,
                                                    shuffle=False)
# oversample only the training split; fit_resample may return a DataFrame
# depending on the imblearn version, so convert explicitly before reshaping
X_train, y_train = SMOTE().fit_resample(X_train, y_train)
X_train = np.asarray(X_train)
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = X_test.values.reshape(
    (X_test.values.shape[0], 1, X_test.values.shape[1]))
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

model = Sequential()
model.add(LSTM(100, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])
print(model.summary())

# the original listing is cut off mid-call here; the epochs and batch size
# below are placeholder values
history = model.fit(X_train, np.asarray(y_train),
                    epochs=10, batch_size=32, verbose=1)
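# A minimal follow-up sketch: evaluate on the untouched chronological test
# split (no synthetic samples were added to it above).
loss, acc = model.evaluate(X_test, np.asarray(y_test), verbose=0)
print('test accuracy: {:.3f}'.format(acc))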
# #### 3. LSTM on normal data with SMOTE

# In[14]:

from imblearn.over_sampling import SMOTE

X_sm = np.array(X_in)
Y_sm = np.array(Y_in)
X_sm = X_sm.reshape(-1, 1)
Y_sm = Y_sm.reshape(-1, 1)
X_sm, Y_sm = SMOTE(random_state=2).fit_resample(X_sm, Y_sm.ravel())
# reshape for the LSTM: (samples, timesteps=1, features=1)
trainX_sm = X_sm.reshape(X_sm.shape[0], 1, 1)
trainY_sm = Y_sm.reshape(Y_sm.shape[0], 1, 1)

model = Sequential()
model.add(LSTM(4, input_shape=(trainX_sm.shape[1], trainX_sm.shape[2])))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['binary_accuracy'])
print(model.summary())
model.fit(trainX_sm, Y_sm, epochs=200, batch_size=10, verbose=2)

# In[15]:

# predict on the original (pre-SMOTE) inputs trainX from the earlier cells
ypred_sm = model.predict(trainX)
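# In[16]:

# Hedged follow-up: threshold the sigmoid outputs at 0.5 and score them
# against the original (pre-SMOTE) labels Y_in from the earlier cells,
# assuming trainX and Y_in are aligned.
from sklearn.metrics import classification_report
print(classification_report(np.array(Y_in), (ypred_sm > 0.5).astype(int)))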