Example 1
import numpy as np
from imblearn.over_sampling import SMOTE


def over_sample(buys, sells, nones, seed):
    # stack the three classes and build the matching label vector
    features = np.concatenate((buys, sells, nones), axis=0)
    labels = np.array([1] * len(buys) + [2] * len(sells) + [0] * len(nones),
                      dtype=np.int32)
    # SMOTE expects 2-D input, so flatten each (d1, d2, d3) sample
    all_features = features.reshape(features.shape[0], -1)
    sampled_features, sampled_labels = SMOTE(random_state=seed).fit_resample(
        all_features, labels)
    # restore the original per-sample shape
    sampled_features = sampled_features.reshape(
        -1, features.shape[1], features.shape[2], features.shape[3])
    # split the resampled pool back into the three classes
    sampled_buys = sampled_features[sampled_labels == 1].astype(np.float32)
    sampled_sells = sampled_features[sampled_labels == 2].astype(np.float32)
    sampled_nones = sampled_features[sampled_labels == 0].astype(np.float32)
    return sampled_buys, sampled_sells, sampled_nones
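
# A minimal usage sketch of over_sample (the array shapes below are illustrative
# assumptions, not from the original):
rng = np.random.default_rng(0)
buys = rng.random((30, 8, 8, 1), dtype=np.float32)
sells = rng.random((20, 8, 8, 1), dtype=np.float32)
nones = rng.random((100, 8, 8, 1), dtype=np.float32)
b, s, n = over_sample(buys, sells, nones, seed=42)
print(len(b), len(s), len(n))  # SMOTE balances every class to the majority count
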
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold


def run_save_model(save_folder, spec, model_no, X_train, y_train, model_fn):
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=2)
    cvscores = []
    f1scores = []
    for train, val in kfold.split(X_train, y_train):
        # create model using the model_fn parameter
        model = model_fn(spec, X_train)
        if model is None:
            return  # bail out if the specifications were invalid
        # fit model to k-split of training data
        num_examples, dx, dy = X_train[train].shape
        # SMOTE(kind='borderline1') and fit_sample were removed from imblearn;
        # BorderlineSMOTE with fit_resample is the current equivalent
        X_resampled, y_resampled = BorderlineSMOTE(kind='borderline-1', random_state=1).fit_resample(
            X_train[train].reshape((num_examples, dx * dy)), y_train[train])
        num_total_examples, _ = X_resampled.shape
        X_resampled_reshaped = X_resampled.reshape(num_total_examples, dx, dy)
        model.fit(x=X_resampled_reshaped, y=y_resampled, epochs=10, batch_size=16, verbose=0)
        # evaluate model
        scores = model.evaluate(X_train[val], y_train[val], verbose=0)
        print('Accuracy: {}%'.format(scores[1] * 100))
        cvscores.append(scores[1])
        # get f1
        y_pred = (model.predict(X_train[val]) > 0.5).astype(int).ravel()
        f1 = f1_score(y_train[val], y_pred)
        print('F1 score: {}'.format(f1))
        f1scores.append(f1)

    mean_acc = 'Mean Accuracy: {:.2f}% +/- {:.2f}%'.format(np.mean(cvscores) * 100, np.std(cvscores) * 100)
    mean_f1 = 'Mean F1 score: {:.4f} +/- {:.4f}'.format(np.mean(f1scores), np.std(f1scores))
    print(mean_acc)
    print(mean_f1)

    # modelfile = save_folder + 'model' + str(model_no) + '.h5'
    # save_model(model, modelfile)
    # print('model saved')

    txtfile = save_folder + 'model' + str(model_no) + '.txt'
    with open(txtfile, 'w') as f:
        f.write(mean_acc + '\n')
        f.write(mean_f1 + '\n')
        f.write('\n')
        f.writelines(spec)
        print('specs saved')
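
# A sketch of a compatible model_fn (the architecture and the spec check are
# illustrative assumptions; the original builder is not shown):
def simple_model_fn(spec, X_train):
    from tensorflow.keras import layers, models
    if not spec:
        return None  # run_save_model treats None as invalid specifications
    model = models.Sequential([
        layers.LSTM(32, input_shape=X_train.shape[1:]),
        layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model

# run_save_model('results/', 'lstm-32', 0, X_train, y_train, simple_model_fn)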
Example 3
def target_training_data(targetclass):
    ##### target_training_data(targetclass) generates the second half of the training data set
    ##### generation of the evaluation data set is outsourced to all_target_training_data(Nnofs, Nnofs_evaluate, fractrain)
    ##### relies on module-level globals: fractrain, Nnofs, npycountt, sampler_train
    
    import dictionary
    dictionary = dictionary.dict  # rebind the module name to its class dict
    print(' ')
#    print('working on training set:')
#    print('targetclass=',targetclass)
    print('Resampling training set for class', targetclass)
    X1=3;X2=40  ##### components in the high-dimensional data point to be displayed for visualisation
    #dict_classes=gen_dictionary.gen_dictionary()
    #dict_classes=dictionary
    classes = list(dictionary)
    classdir = classes
    traincontainer = []
    traincontainer_y = []
    origcontainer = []
    origcontainer_y = []
    origcontainer_ynn = []
    traincontainer_ynn = []
    appendsecondhalf = []
    lensh = 0
    for i in range(len(classdir)):
        cl = classdir[i]
        dirinclass = os.listdir(cl)
        dirinclass = [os.path.join(cl, d, 'ta.npy') for d in dirinclass]
        
    #################### targetclass ############
        fnorig=str(targetclass)+'.orig.npy'
        fnorig_y=str(targetclass)+'_y.orig.npy'
        fnorig_ynn=str(targetclass)+'_ynn.orig.npy'
        fntrain=str(targetclass)+'.train.npy'
        fntrain_y=str(targetclass)+'_y.train.npy'
        fntrain_ynn=str(targetclass)+'_ynn.train.npy'
        
        if cl==targetclass:
#            print('{i, cl }=',{i,cl})
#            print('in target class',targetclass)
            #print('dirinclass=',dirinclass)        
            shuffle(dirinclass)
            firsthalf=dirinclass[0:int(fractrain*len(dirinclass))]
            secondhalf=dirinclass[int(fractrain*len(dirinclass)):]
            appendsecondhalf.append(secondhalf)
#            print('firsthalf=',firsthalf)
#            print('secondhalf=',secondhalf)       
            #####################################
            for k in range(len(firsthalf)):
                datain = np.load(firsthalf[k])
                origcontainer.append(datain)
                origcontainer_y.append(0)
                origcontainer_ynn.append([0,1])  # one-hot as [y, 1-y]

            for k in range(len(secondhalf)):
                datain = np.load(secondhalf[k])
                traincontainer.append(datain)
                traincontainer_y.append(0)
                traincontainer_ynn.append([0,1])
            #####################################            
                    
        else:
#            print('{i, cl }=',{i,cl})
#            print('classes other than targetclass',targetclass)
    
            for k in range(len(dirinclass)):
                nnpy = dirinclass[k]
                if os.path.isfile(nnpy):
                    datain = np.load(nnpy)
                    traincontainer.append(datain)
                    traincontainer_y.append(1)
                    traincontainer_ynn.append([1,0])  # one-hot as [y, 1-y], matching y_resnn below
    
    origcontainer=np.array(origcontainer)
    origcontainer_y=np.array(origcontainer_y)
    
    traincontainer=np.array(traincontainer)
    traincontainer_y=np.array(traincontainer_y)
    
    origcontainer_ynn=np.array(origcontainer_ynn)
    traincontainer_ynn=np.array(traincontainer_ynn)
    
#    np.save(fnorig, origcontainer)                            
#    np.save(fnorig_y,origcontainer_y)
#    np.save(fnorig_ynn,origcontainer_ynn)
    np.save(fntrain, traincontainer)                            
    np.save(fntrain_y,traincontainer_y)
    np.save(fntrain_ynn,traincontainer_ynn)
    #################### end of targetclass ############
        
    ####################################################
    #print('Begin of oversampling for training set')
    #k=len(classdir)
    k = 2
    seed = 10
    X = traincontainer
    y = traincontainer_y
    #ynn = traincontainer_ynn
    
    #print('1',X.shape)
    X = np.reshape(X, (X.shape[0], X.shape[1] * X.shape[2]))  # flatten each square image
    #print('2',X.shape)
    
    ####### scatter plot of X and y
    #plt.xlabel('x')
    #plt.ylabel('y')
    #plt.scatter(X[:, X1], X[:, X2], marker='o', 
    #               c=y, s=25, edgecolor='k', cmap=plt.cm.coolwarm)
    #plt.show()
    
    #### creating sampling_strategy #####
    #lensh=max(lensh,len(secondhalf))
    #print('maxlensh  ======== ',lensh)
    sampling_strategy = {}
    print('npycountt:', npycountt)
    sampling_strategy[0] = Nnofs * npycountt
    sampling_strategy[1] = Nnofs * npycountt

    print('sampling_strategy (training set) = ',sampling_strategy)
    
  #  print("counter before oversampling = ", sorted(Counter(y).items()))
    
    
    ##### implementing oversampling ####
    n_jobs = -1
    if sampler_train == 'SMOTE':
        k = 2
        X_res, y_res = SMOTE(sampling_strategy=sampling_strategy, k_neighbors=k - 1,
                             random_state=seed, n_jobs=n_jobs).fit_resample(X, y)

    elif sampler_train == 'BorderlineSMOTE':
        k = 2
        X_res, y_res = imblearn.over_sampling.BorderlineSMOTE(
            sampling_strategy=sampling_strategy, random_state=seed,
            k_neighbors=k, n_jobs=n_jobs).fit_resample(X, y)

    elif sampler_train == 'ADASYN':
        k = 3
        X_res, y_res = ADASYN(sampling_strategy=sampling_strategy, random_state=seed,
                              n_neighbors=k + 1, n_jobs=n_jobs).fit_resample(X, y)

    elif sampler_train == 'KMeansSMOTE':
        k = 2
        X_res, y_res = KMeansSMOTE(sampling_strategy=sampling_strategy, random_state=seed,
                                   k_neighbors=k + 2, n_jobs=n_jobs).fit_resample(X, y)

    elif sampler_train == 'RandomOverSampler':
        X_res, y_res = RandomOverSampler(sampling_strategy=sampling_strategy,
                                         random_state=seed).fit_resample(X, y)

    elif sampler_train == 'SVMSMOTE':
        k = 4
        X_res, y_res = SVMSMOTE(sampling_strategy=sampling_strategy, random_state=seed,
                                k_neighbors=k, n_jobs=n_jobs).fit_resample(X, y)
    
    
    #### end of oversampling ####
    y_resnn = [[y_res[i], 1 - y_res[i]] for i in range(len(y_res))]  # one-hot as [y, 1-y]
    
    
    #plt.xlabel('x')
    #plt.ylabel('y')
    #plt.scatter(X_res[:, X1], X_res[:, X2], marker='o', 
    #               c=y_res, s=25, edgecolor='k', cmap=plt.cm.coolwarm)
    #plt.show()
 #   print("counter before oversampling (trainning set) = ", sorted(Counter(y).items()))
 #   print("counter after oversampling (trainning set) = ", sorted(Counter(y_res).items()))
    dim=int(X_res.shape[1]**0.5)
    X_res=X_res.reshape(X_res.shape[0],dim,dim)
       
    ### report sizes of data before and after oversampling

    print("counter before oversampling (trainning set) = ", sorted(Counter(y).items())[0])
    norigsample=sorted(Counter(y).items())[0][1]
    print("counter after oversampling (trainning set) = ", sorted(Counter(y_res).items()))
    norig=sum([ Counter(y)[keys] for keys in Counter(y) ])
    print('Total number of data before oversampling:',norigsample)
    #novsp=sum([ Counter(y_res)[keys] for keys in Counter(y_res) ])
    novsp=Counter(y_res)[0]
    print('Total number of data after oversampling:', novsp)
    print('Ratio of number of data after and before oversampling of trainning data:',novsp/norigsample)

    ### save oversampled data
    fnres=targetclass+'_ovsp.train.npy'
    fnres_y=targetclass + '_y_ovsp.train.npy'
    fnres_ynn=targetclass + '_ynn_ovsp.train.npy'
    np.save(fnres_y, y_res)
    np.save(fnres_ynn, y_resnn)
    np.save(fnres, X_res)
    ####################################################
    return firsthalf,appendsecondhalf
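
# A hypothetical invocation (target_training_data reads the module-level globals
# fractrain, Nnofs, npycountt and sampler_train, and expects one subdirectory per
# class, each holding <sample>/ta.npy files):
#   fractrain, Nnofs, npycountt, sampler_train = 0.5, 2, 50, 'SMOTE'
#   firsthalf, secondhalves = target_training_data('classA')
#   # the oversampled arrays land in classA_ovsp.train.npy and friends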
Example 4
# randomly split the data into train & test sets (300 test samples)
X_train, X_test, Y_train, Y_test, X_train_image, X_test_image, Y_train_image, Y_test_image = pickle_import.func_import(
    300)

#imbalanced data handling
import collections
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN

# Apply SMOTE method on training data
X_train_SMOTE, Y_train_SMOTE = SMOTE(random_state=0).fit_resample(
    X_train, Y_train)
X_train_image_SMOTE, Y_train_image_SMOTE = SMOTE(random_state=0).fit_resample(
    X_train_image.reshape(X_train_image.shape[0], -1), Y_train_image)
X_train_image_SMOTE = X_train_image_SMOTE.reshape(X_train_image_SMOTE.shape[0],
                                                  50, 50, 3)

# Apply ADASYN method on training data
X_train_ADASYN, Y_train_ADASYN = ADASYN(random_state=0).fit_resample(
    X_train, Y_train)
X_train_image_ADASYN, Y_train_image_ADASYN = ADASYN(
    random_state=0).fit_resample(
        X_train_image.reshape(X_train_image.shape[0], -1), Y_train_image)
X_train_image_ADASYN = X_train_image_ADASYN.reshape(
    X_train_image_ADASYN.shape[0], 50, 50, 3)

# 0 - non-fire image, 1 - fire image
print("Original data:", collections.Counter(Y_train))
print("After SMOTE  :", collections.Counter(Y_train_SMOTE))
print("After ADASYN :", collections.Counter(Y_train_ADASYN))
Example 5
#plt.ylabel('y')
#plt.scatter(X_res[:, X1], X_res[:, X2], marker='o',
#               c=y_res, s=25, edgecolor='k', cmap=plt.cm.coolwarm)
#plt.show()

y_res_3 = []
for i in range(len(y_res)):
    dummy = [0 for j in range(k)]
    dummy[y_res[i]] = 1
    #print(y_res[i],dummy)
    y_res_3.append(dummy)
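
# Equivalent vectorized form of the loop above (assumes y_res holds integer
# labels in range(k)):
y_res_3 = np.eye(k, dtype=int)[np.asarray(y_res)].tolist()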

print("counter after oversampling = ", sorted(Counter(y_res).items()))

dim = int(X_res.shape[1]**0.5)
X_res = X_res.reshape(X_res.shape[0], dim, dim)

print('')
for j in classdir:
    countfn2s = 0
    for i in range(len(y_res)):
        # cdict maps an integer label back to its class name
        if cdict[y_res[i]] == j:
            countfn2s = countfn2s + 1
            fn2s = j + '.' + str(countfn2s) + '.npy'
            np.save(fn2s, X_res[i])
train_features = pd.DataFrame(pca.fit_transform(train_features))
test_features = pd.DataFrame(pca.transform(test_features))
explained_variance = pca.explained_variance_ratio_

pca_stats = pd.DataFrame(explained_variance)
df_feature_list = pd.DataFrame(feature_list, columns= ['feature'] )
pca_stats = pca_stats.join(df_feature_list)
#Change the path accordingly
#pca_stats.to_csv(r'T:\tbase\short_lstm\pca_stats.csv')

# =============================================================================
# Reshaping the dataframes into 3 dimensional vectors for LSTM input
# =============================================================================

train_features = np.array(train_features)
train_features = train_features.reshape((train_features.shape[0], 1, 168))
train_labels = np.array(train_labels)
train_labels = train_labels.reshape((train_labels.shape[0], 1, 1))
test_features = np.array(test_features)
# -1 lets numpy infer the row count (730 here, 1438 when oversampled)
test_features = test_features.reshape((-1, 1, 168))
test_labels = np.array(test_labels)
test_labels = test_labels.reshape((-1, 1, 1))



# =============================================================================
# BI-LSTM 
# =============================================================================
import keras
import tensorflow as tf
Example 7
import numpy as np
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from tensorflow.keras.utils import to_categorical


def resampleTrainSets(data, labels, ttResampleType='Stratified'):
    ''' https://www.kaggle.com/rafjaa/resampling-strategies-for-imbalanced-datasets '''

    # if 'Stratified' is chosen, pass the data through unchanged
    trainData = data
    trainLabels = labels

    if (ttResampleType == 'Under'):
        # split into FAIL records
        failRecs = trainLabels[:, 0].astype('bool')
        failLabels = trainLabels[failRecs]
        failData = trainData[failRecs]
        numFails = len(failLabels)

        # split into PASS records
        passRecs = ~failRecs
        passLabels = trainLabels[passRecs]
        passData = trainData[passRecs]

        ### UNDERSAMPLE PASS data
        # shuffle PASS data and sample
        # generate random index array
        n_samples = passLabels.shape[0]
        mix = np.random.permutation(n_samples)
        # create shuffled datasets
        passLabels = passLabels[mix]
        passData = passData[mix]
        # limit to numFails records
        passLabels = passLabels[:numFails]
        passData = passData[:numFails]

        # paste FAIL and PASS subset back together
        trainLabels = np.concatenate((passLabels, failLabels))
        trainData = np.concatenate((passData, failData))

    elif (ttResampleType == 'Over'):
        # split into FAIL records
        failRecs = trainLabels[:, 0].astype('bool')
        failLabels = trainLabels[failRecs]
        failData = trainData[failRecs]
        numFails = len(failLabels)

        # split into PASS records
        passRecs = ~failRecs
        passLabels = trainLabels[passRecs]
        passData = trainData[passRecs]
        numPasses = len(passLabels)

        ### OVERSAMPLE FAIL data
        # generate an index array numPasses long, sampled with replacement from the FAIL rows
        mix = np.random.choice(numFails, numPasses)
        newFailLabels = failLabels[mix]
        newFailData = failData[mix]

        # paste FAIL and PASS subset back together
        trainLabels = np.concatenate((passLabels, newFailLabels))
        trainData = np.concatenate((passData, newFailData))

    elif (ttResampleType == 'SMOTE'):
        # undo reshaping of input data to work with SMOTE
        trainLabels = trainLabels[:, 1]
        trainData = trainData.reshape((trainData.shape[0], trainData.shape[1]))

        # do SMOTEing
        trainData, trainLabels = SMOTE().fit_resample(trainData, trainLabels)

        # redo reshaping of input data
        trainLabels = to_categorical(trainLabels)
        trainData = trainData.reshape(
            (trainData.shape[0], trainData.shape[1], 1))

    elif (ttResampleType == 'NearMiss'):
        # undo reshaping of input data to work with NearMiss
        trainLabels = trainLabels[:, 1]
        trainData = trainData.reshape((trainData.shape[0], trainData.shape[1]))

        # do NearMissing
        nm = NearMiss(version=1)
        trainData, trainLabels = nm.fit_resample(trainData, trainLabels)

        # redo reshaping of input data
        trainLabels = to_categorical(trainLabels)
        trainData = trainData.reshape(
            (trainData.shape[0], trainData.shape[1], 1))

    # remix training data and labels
    n_samples = trainLabels.shape[0]
    mix = np.random.permutation(n_samples)
    trainLabels = trainLabels[mix]
    trainData = trainData[mix]

    return trainData, trainLabels
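
# A minimal usage sketch (illustrative shapes: two-class one-hot labels with the
# FAIL flag in column 0, and a trailing channel axis as the SMOTE branch expects):
y = np.random.randint(0, 2, 100)
labels = np.stack([y, 1 - y], axis=1).astype('float32')  # column 0 marks FAIL
data = np.random.rand(100, 16, 1).astype('float32')
balData, balLabels = resampleTrainSets(data, labels, ttResampleType='SMOTE')
print(balData.shape, balLabels.shape)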
Example 8
import numpy as np
from numpy import hstack
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from imblearn.over_sampling import SMOTE

# encode the categorical target column (column 24)
labelencoder = LabelEncoder()
values[:, 24] = labelencoder.fit_transform(values[:, 24])
values = values.astype('float32')

# Feature Scaling
scaler = MinMaxScaler(feature_range=(0, 1))
values[:, 0:24] = scaler.fit_transform(values[:, 0:24])

#Define X and Y
XFinal = values[:, 0:24]
Y = values[:, 24]

#Fix imbalance in event
XFinal, Y = SMOTE().fit_resample(XFinal, Y.ravel())

#Set inputs and outputs
inputs = XFinal.reshape((len(XFinal), 24))
output = Y.reshape((len(Y), 1))

# horizontally stack columns
dataset = hstack((inputs, output))

# split into train and test sets
n_train_minutes = 360 * 3 * 256  #train on first 18 minutes
train = dataset[:n_train_minutes, :]
test = dataset[n_train_minutes:, :]

# split into input and outputs
X_train, y_train = train[:, :-1], train[:, -1]
X_test, y_test = test[:, :-1], test[:, -1]

# reshape input to be 3D [samples, timesteps, features]
# (the snippet breaks off here; an assumed completion with one timestep per sample)
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

# --- a separate fragment: an LSTM on a labelled DataFrame ---
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense

data = data.dropna()

temp_c = list(data.columns)
temp_c.remove('label')

y = data['label']
X = data[temp_c]  # all columns except the label

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=42,
                                                    shuffle=False)
X_train, y_train = SMOTE().fit_resample(X_train, y_train)

# newer imblearn versions return a DataFrame here, so go through numpy before reshaping
X_train = np.asarray(X_train)
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = X_test.values.reshape(
    (X_test.values.shape[0], 1, X_test.values.shape[1]))
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

model = Sequential()
model.add(LSTM(100, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print(model.summary())

# the original call is truncated here; an assumed minimal completion
history = model.fit(X_train, y_train, epochs=10, batch_size=64, verbose=2)

# #### 3. LSTM on normal data with SMOTE


from imblearn.over_sampling import SMOTE

X_sm = np.array(X_in)
Y_sm = np.array(Y_in)

X_sm = X_sm.reshape(-1, 1)
Y_sm = Y_sm.reshape(-1, 1)

X_sm, Y_sm = SMOTE(random_state=2).fit_resample(X_sm, Y_sm.ravel())

trainX_sm = X_sm.reshape(X_sm.shape[0], 1, 1)

model = Sequential()
model.add(LSTM(4, input_shape=(trainX_sm.shape[1], trainX_sm.shape[2])))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['binary_accuracy'])
print(model.summary())
model.fit(trainX_sm, Y_sm, epochs=200, batch_size=10, verbose=2)


# trainX here comes from an earlier cell (the un-resampled inputs)
ypred_sm = model.predict(trainX)
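
# A short evaluation sketch one might append (assumes Y_in holds the true labels
# for trainX, both defined in the earlier cells):
from sklearn.metrics import classification_report
ypred_labels = (ypred_sm > 0.5).astype(int).ravel()
print(classification_report(np.array(Y_in), ypred_labels))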