def localWords(feed1, feed0):
    docList = []; classList = []; fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)  # NY is class 1
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = PrepareData.createVocabList(docList)  # create vocabulary
    top30Words = calcMostFreq(vocabList, fullText)  # remove the 30 most frequent words
    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])
    trainingSet = list(range(2 * minLen)); testSet = []  # create test set
    for i in range(20):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:  # train the classifier (get probs) with trainNB0
        trainMat.append(PrepareData.bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = NaiveBayesianModel.trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:  # classify the remaining items
        wordVector = PrepareData.bagOfWords2VecMN(vocabList, docList[docIndex])
        if NaiveBayesianModel.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is:', float(errorCount) / len(testSet))
    return vocabList, p0V, p1V
def main():
    if len(sys.argv) == 1:
        n_gram_list = [1]
    else:
        n_gram_list = sys.argv[1:len(sys.argv)]
        n_gram_list = list(map(int, n_gram_list))
    x = PrepareData.feat_extraction(n_gram_list, x_one_hot)
    x_validate = PrepareData.feat_extraction(n_gram_list, x_validate_one_hot)
    n_feat = x.shape[1]
    raw_data_size = (n_feat, text_length, 1)
    # train_seq, test_seq = data_set_split(x.shape[0], 0.2)
    n_classes = y.shape[1]
    k = 5
    k_fold_sequence = data_set_k_fold_separation(x.shape[0], k)
    output_train = open('train_acc.txt', 'w')
    output_test = open('test_acc.txt', 'w')
    y_validate = []
    for i in range(k):
        test_seq = k_fold_sequence[i]
        train_seq = []
        for j in range(k):
            if i != j:
                train_seq.extend(k_fold_sequence[j])
        nc = NC(input_size=raw_data_size, n_classes=n_classes, raw_feature_dim=n_feat)
        xtrain = x[train_seq]
        ytrain = y[train_seq]
        nc.fit([xtrain, xtrain], ytrain)
        eval_train_result = nc.evaluation(xtrain, ytrain)
        print(eval_train_result)
        print([k, eval_train_result], file=output_train)
        eval_test_result = nc.evaluation(x[test_seq], y[test_seq])
        print(eval_test_result)
        print([k, eval_test_result], file=output_test)
        y_validate_k = nc.predict(x_validate)
        y_validate_k = y_validate_k.argmax(axis=1)
    # NOTE: as written, these "averages" and the saved predictions only reflect
    # the last fold; the k-fold variant further below accumulates per-fold
    # results first.
    print(['average', np.mean(eval_train_result, axis=0)], file=output_train)
    print(['average', np.mean(eval_test_result, axis=0)], file=output_test)
    y_validate_file_path = 'ytest.txt'
    np.savetxt(fname=y_validate_file_path, X=np.asarray(y_validate_k), fmt='%1.2f')
def __init__(self):
    self.data = PrepareData()
    self.dataset = Seq2SeqDataset()
    self.data_loader = DataLoader(dataset=self.dataset, batch_size=1, shuffle=True)
    # read from the instance's own PrepareData rather than the module-level one
    self.lang_1 = self.data.lang_1
    self.lang_2 = self.data.lang_2
    self.char2index = self.data.char2index
    self.index2char = self.data.index2char
    self.input_size = 100
    self.hidden_size = 64
    self.output_size = 100
    self.learning_rate = 0.01
    self.num_epoch = 500
    self.teacher_forcing = True
    self.use_cuda = torch.cuda.is_available()
    self.device = 'cuda:0' if self.use_cuda else 'cpu'
    self.encoder = EncoderRNN(input_size=self.input_size, hidden_size=self.hidden_size)
    self.decoder = DecoderRNN(output_size=self.output_size, hidden_size=self.hidden_size)
    self.attn_decoder = AttnDecoder(self.hidden_size, self.output_size)
    if self.use_cuda:
        self.encoder = self.encoder.to(self.device)
        self.decoder = self.decoder.to(self.device)
    self.encoder_optimizer = torch.optim.Adam(self.encoder.parameters(), lr=self.learning_rate)
    self.decoder_optimizer = torch.optim.Adam(self.decoder.parameters(), lr=self.learning_rate)
    self.loss_function = nn.NLLLoss()
def testFutureData(self):
    fakePrice = np.array(range(1, 16))  # 1-15
    fakevolume = np.array(range(15))
    fakeDate = np.array(range(15))
    dataFrame = pd.DataFrame({'price': fakePrice}, index=fakeDate)
    dataFrame['volume'] = pd.Series(fakevolume, index=fakeDate)
    myfilter = np.array([0, 0.5, 0.5])
    myfilter = myfilter[::-1]
    retFr = PrepareData.getFutureFiltered(dataFrame, myfilter)
    # check that the length of the dataframe is unchanged
    self.assertEqual(len(retFr), 15)
    filteredFrame = retFr[np.isfinite(retFr['FutureFilter'])]
    # Checking length and content of returned vectors
    self.assertEqual(len(filteredFrame['FutureFilter']), 13)
    self.assertEqual(retFr['price'][12], 13)
    self.assertEqual(retFr['volume'][12], 12)
    # Checking resulting vector
    self.assertEqual(retFr['FutureFilter'][0], 1.5)  # (2+3)/2 = 2.5 => 2.5/1 - 1.0 = 1.5
    self.assertEqual(retFr['FutureFilter'][12], 14.5 / 13 - 1.0)  # (14+15)/2 = 14.5 => 14.5/13 - 1.0
def testingNB():
    listOPosts, listClasses = LoadData.loadDataSet()
    myVocabList = PrepareData.createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(PrepareData.setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = NaiveBayesianModel.trainNB0(array(trainMat), array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(PrepareData.setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as:', NaiveBayesianModel.classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = array(PrepareData.setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as:', NaiveBayesianModel.classifyNB(thisDoc, p0V, p1V, pAb))
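# For reference, a minimal sketch of what PrepareData.setOfWords2Vec is assumed
# to do here (the classic set-of-words model): mark 1 for every vocabulary word
# present in the document. The real helper may differ, e.g. it may warn on
# out-of-vocabulary words.
def setOfWords2Vec_sketch(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
    return returnVec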
def get_link_index(usecase, sourcecode):
    test_meta_data = PrepareData.get_test_meta_data()
    for index in range(len(test_meta_data)):
        usecase_item = test_meta_data[index]["link name"][0]
        sourcecode_item = test_meta_data[index]["link name"][1]
        if usecase == usecase_item and sourcecode == sourcecode_item:
            return index
    return -1
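# An equivalent, slightly more idiomatic variant of the lookup above, shown
# only as a sketch; names and the "link name" layout are taken from
# get_link_index itself.
def get_link_index_alt(usecase, sourcecode):
    test_meta_data = PrepareData.get_test_meta_data()
    return next((i for i, item in enumerate(test_meta_data)
                 if item["link name"][0] == usecase
                 and item["link name"][1] == sourcecode), -1)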
def testUseHistFilter(self):
    price = range(1, 20)
    volume = range(1, 20)
    timestamp = range(1, 20)
    dataFrame = pd.DataFrame({'price': price, 'volume': volume}, index=timestamp)
    myfilter = PrepareData.getHistoryFilter([5, 4, 2])
    retFrame = PrepareData.getHistoryFiltered(dataFrame, myfilter)
    for i in range(5, len(retFrame)):
        meanP = (price[i - 5] + price[i - 4] + price[i - 2]) / 3.0
        change = meanP / price[i] - 1.0
        print(change)
        print(retFrame['HistoryFilter'].iloc[i])
        self.assertAlmostEqual(change, retFrame['HistoryFilter'].iloc[i])
def testFutureFilter(self):
    f = PrepareData.getFutureFilter([2, 4, 6, 8])
    self.assertEqual(len(f), 9)
    self.assertEqual(f[-2], 0.25)
    self.assertEqual(f[-4], 0.25)
    self.assertEqual(f[-6], 0.25)
    self.assertEqual(f[-8], 0.25)
    self.assertEqual(np.sum(f), 1.0)
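# The assertions above pin down getFutureFilter's contract: for offsets
# [2, 4, 6, 8] it returns a length-9 array with equal weights 1/4 sitting at
# f[-k] for each offset k, summing to 1. A minimal sketch satisfying exactly
# that contract (the real PrepareData implementation may differ):
def getFutureFilter_sketch(offsets):
    f = np.zeros(max(offsets) + 1)
    for k in offsets:
        f[-k] = 1.0 / len(offsets)  # equal weight at each future offset
    return f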
def select_images(self):
    all_conditions_list = {
        "DENSITY": self.get_checked_boxes(self.den_var_categories, "DENSITY"),
        "SUBTLETY": self.get_checked_boxes(self.subt_var_categories, "SUBTLETY"),
        "ASSESSMENT": self.get_checked_boxes(self.assesm_var_categories, "ASSESSMENT"),
        "LESION_TYPE": self.get_checked_boxes(self.lesion_var_categories, "LESION_TYPE")
    }
    selected_data = PrepareData(all_conditions_list)
    selected_data.count_values()
    return all_conditions_list
def CompareInitData(OldTmsIP, NewTmsIP, OldPort, NewPort):
    OldInitData = ParseInit(OldTmsIP, OldPort)
    NewInitData = ParseInit(NewTmsIP, NewPort)
    OldInitData = PrepareData.PrepareData(OldInitData)
    NewInitData = PrepareData.PrepareData(NewInitData)
    # print(OldInitData)
    # print(NewInitData)
    for tableOld in OldInitData:
        TableIdOld = tableOld[0]
        TableDataOld = tableOld[1]
        for tableNew in NewInitData:
            TableIdNew = tableNew[0]
            TableDataNew = tableNew[1]
            if TableIdOld == TableIdNew:
                if TableDataOld == TableDataNew:
                    print("TableID: " + str(TableIdOld) + "\t comparison succeeded")
                    # print(TableDataOld + "\n" + TableDataNew)
                else:
                    print("TableID: " + str(TableIdOld) + "\t comparison FAILED")
                    print(TableDataOld + "\n" + TableDataNew)
def testUseFutureFilter(self):
    price = range(1, 20)
    volume = range(1, 20)
    timestamp = range(0, 19)
    dataFrame = pd.DataFrame({'price': price}, index=timestamp)
    dataFrame['volume'] = pd.Series(volume, index=timestamp)
    myfilter = PrepareData.getFutureFilter([1, 3, 5])
    newFrame = PrepareData.getFutureFiltered(dataFrame, myfilter)
    length = len(newFrame['price']) - len(myfilter)
    for i in range(length):
        meanP = (price[i + 1] + price[i + 3] + price[i + 5]) / 3.0
        change = meanP / price[i] - 1.0
        self.assertAlmostEqual(change, newFrame['FutureFilter'][i])
def Run(args):
    cid = plt.gcf().canvas.mpl_connect('key_press_event', closePlot)
    # get the combined data - cleaned
    Data = ReadData.Run(args)
    measure_cols = ['DNI', 'GHI', 'DHI']
    # sort out the scale info
    scale_map = {'Hourly': 'Hour', 'Daily': 'Date'}
    scale = scale_map[args['scale']]
    window = 30
    # take the sum on the given time scale
    Data_sum = PrepareData.aggregateDf(Data, scale[0], 'sum')
    # get the correlation between the measure columns
    print('Correlation in {}'.format(measure_cols))
    print(Data_sum.corr(), '\n')
    # plot Data
    # plot(['DNI'], Data_sum, '-')
    # make the dataset & dump
    dump_dir = 'Dumped Dataset/Suny/{} {}'.format('Cont', window)
    site = dump_dir.split('/')[1].lower()
    if not os.path.exists(dump_dir):
        os.makedirs(dump_dir)
    PrepareData.createDataSets(Data_sum, 'Cont',
                               input_measure_cols=['DNI'],
                               output_measure_cols=['DNI'],
                               split=True, window=window, dump_dir=dump_dir)
    train_in, train_out, test_in, test_out = PrepareData.loadDumpedData(dump_dir)
    with open(os.path.join('Params', '{}.json'.format(site))) as paramsF:
        paramsFactory = json.load(paramsF)
    runModels(train_in, train_out, test_in, test_out, scale, paramsFactory)
def capture_more_link():
    test_x = []
    test_y = []
    test_meta_data = PrepareData.get_test_meta_data()
    PrepareData.get_ml_data(test_meta_data, test_x, test_y)
    # print("test_meta_data =", len(test_meta_data), test_meta_data)
    predict_y_phase1 = Phase3Classifier.train()
    # print("predict_y =", len(predict_y), predict_y)
    predict_y_phase1 = list(predict_y_phase1)
    predict_y_phase2 = predict_y_phase1.copy()
    sourcecode_dict = ProcessSourceCode.get_sourcecode_infomation()
    for index in range(len(predict_y_phase1)):
        if predict_y_phase1[index] == 1:
            link_name = test_meta_data[index]["link name"]
            usecase = link_name[0]
            sourcecode = link_name[1]
            Implements_Extends_Recovery.capture_implements_link(
                usecase, sourcecode, predict_y_phase2, test_meta_data, sourcecode_dict)
            Implements_Extends_Recovery.capture_extends_link(
                usecase, sourcecode, predict_y_phase2, test_meta_data, sourcecode_dict)
    # print("test_data[20] =", len(test_data), test_data[20])
    print("phase 1 : ")
    Result.get_result(test_y, predict_y_phase1)
    tag_1_count = predict_y_phase1.count(1)
    print("capture link =", tag_1_count)
    print("phase 2 : ")
    Result.get_result(test_y, predict_y_phase2)
    tag_1_count = predict_y_phase2.count(1)
    print("capture link =", tag_1_count)
def selectFile(self):
    print("Selecting CSV/Excel file")
    # Current directory
    self.filename = QFileDialog.getOpenFileName()
    path = self.filename[0]
    # If a file is selected
    if len(path) > 0:
        # Store the file's base name in filename
        self.filename = os.path.basename(path)
        if self.filename.endswith(".xlsx"):
            print("[*]\tConverting excel spreadsheet to csv")
            # Read and store the content of the excel file
            try:
                self.df = pd.read_excel(self.filename)  # sheet_name is optional
                # Replace the .xlsx extension with .csv before converting
                self.filename = self.filename.replace('.xlsx', '')
                self.filename = f"{self.filename}.csv"
                # Write the dataframe object into a csv file;
                # index=False prevents pandas from writing the row index
                self.df.to_csv(self.filename, index=False)
            except Exception as e:
                print(e)
        # Check if it's a csv file
        if self.filename.endswith(".csv"):
            print("[*]\tCSV File selected")
            try:
                o.main(self.filename)
                self.filename = "reformatted_" + self.filename
            except Exception as e:
                print(f"[*]\tFail\t{e}")
        else:
            print(f"Not sure what format this is...\"{self.filename}\"")
        self.dFileName = self.filename
def loadImages(dirName):
    from os import listdir
    hwLabels = []
    trainingFileList = listdir(dirName)  # load the training set
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]
        fileStr = fileNameStr.split('.')[0]  # take off .txt
        classNumStr = int(fileStr.split('_')[0])
        if classNumStr == 9:
            hwLabels.append(-1)
        else:
            hwLabels.append(1)
        trainingMat[i, :] = pred.img2vector('%s/%s' % (dirName, fileNameStr))
    return trainingMat, hwLabels
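# loadImages assumes pred.img2vector flattens each training file into a 1x1024
# row. A minimal sketch of that helper, assuming the usual 32x32 grid of
# '0'/'1' text characters per file (the actual pred module may differ):
def img2vector_sketch(filename):
    returnVect = zeros((1, 1024))
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect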
def plot_avalanche_activity_index():
    '''
    :def: This function creates a box plot showing the Avalanche Activity Index (AAI)
          per day when AAI > 0 (the only data available from the dataset)
    :return: void
    '''
    # Get data to plot
    data = []
    for i in sorted(df['max.danger.corr'].unique()):
        # Take the subset with this specific avalanche danger level
        df_av_freq = df[df['max.danger.corr'] == i]
        # Gather the data to create the box plot
        data.append(df_av_freq['x'].value_counts().values)
    # Box plot
    utils.create_box_plot(df=data,
                          xlabel='Avalanche Danger Level',
                          ylabel='Avalanche Activity Index (AAI)')
    # Plot the avalanche danger level with respect to the four year seasons
    utils.create_stacked_bar_plot(df, 'max.danger.corr', 'season')
def testParseOrderdepth(self):
    config = ConfigParser.ConfigParser()
    config.read('config.cfg')
    filename = config.get('Section1', 'orderdepthfilename')
    directory = config.get('Section1', 'bitcoinHistDataFolder')
    a = PrepareData.getOrderDepthRatio(directory + filename, 60)
    size = np.shape(a)
    self.assertEqual(size[1], 1)
    self.assertEqual(size[0], 613)
    indexes = a.index
    startTime = indexes[0]
    a = np.array(a)
    indexes = a[:, 0] - startTime
    frame = pd.DataFrame(a)
    frame.to_csv('test.csv')
def performPredictions():
    model_path = Parameters.model_path + 'model.json'
    weights_path = Parameters.weights_path
    prepare = PrepareData.prepareData()
    test, test_label, test_names = prepare.generateTestingSamples()
    # load the model file
    json_file = open(model_path, 'r')
    loaded_json_file = json_file.read()
    json_file.close()
    model = model_from_json(loaded_json_file)
    # load the latest weights file
    lastWeights = 0
    for f in listdir(weights_path):
        epoch = (weights_path + f).split('/')[-1].split('.')[0]
        if int(epoch) > lastWeights:
            lastWeights = int(epoch)
    model.load_weights(weights_path + str(lastWeights) + '.h5')
    optimizer = optimizers.rmsprop(lr=Parameters.learning_rate)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    # perform predictions
    loss, acc = model.evaluate(test, test_label)
    score = model.predict(test)
    # Print the number of 0s and 1s predicted
    predicted_label = model.predict_classes(test)
    unique, counts = np.unique(predicted_label, return_counts=True)
    print("\tPredicted labels:", dict(zip(unique, counts)))
    # ROC
    roc = roc_auc_score(test_label, score)
    print('Run:', Parameters.run)
    print('Performance of ProDec-BLSTM: roc:', roc)
    # save predictions to disk
    writeResults(test_names, test_label, score, predicted_label,
                 Parameters.run, roc, acc, True)
def testHistData(self):
    fakePrice = np.array(range(1, 16))  # 1-15
    fakevolume = np.array(range(15))
    fakeDate = np.array(range(15))
    myfilter = np.array([0.5, 0.5, 0])
    myfilter = myfilter[::-1]
    dataframe = pd.DataFrame(columns=['price', 'volume', 'date'])
    dataframe['price'] = fakePrice
    dataframe['volume'] = fakevolume
    dataframe['date'] = fakeDate
    df = PrepareData.getHistoryFiltered(dataframe, myfilter)
    # Checking length and content of returned vectors
    self.assertEqual(len(df['HistoryFilter']), 15)
    self.assertTrue(math.isnan(df['HistoryFilter'][0]))
    self.assertTrue(math.isnan(df['HistoryFilter'][1]))
    print(df['HistoryFilter'])
    # Checking resulting vector
    self.assertEqual(df['HistoryFilter'][2], -0.5)  # (1+2)/2 = 1.5 => 1.5/3 - 1.0 = -0.5
    self.assertEqual(df['HistoryFilter'][14], 13.5 / 15 - 1.0)  # (13+14)/2 = 13.5 => 13.5/15 - 1.0
def test_classifier_model():
    train_x = []
    train_y = []
    test_x = []
    test_y = []
    train_meta_data = PrepareData.get_train_meta_data()
    test_metadata = PrepareData.get_test_meta_data()
    PrepareData.get_ml_data(train_meta_data, train_x, train_y)
    PrepareData.get_ml_data(test_metadata, test_x, test_y)
    print("train correct link =", train_y.count(1))
    print("test correct link =", test_y.count(1))
    # print("precision", "\t"*3, "recall", "\t"*3, "accuracy", "\t"*3, "f1-score")
    train_x = np.array(train_x)
    train_y = np.array(train_y)
    test_x = np.array(test_x)
    test_y = np.array(test_y)
    # NOTE: the original snippet used predict_y without defining it; a training/
    # prediction step is assumed here, mirroring capture_more_link() above.
    predict_y = Phase3Classifier.train()
    print("capture link =", list(predict_y).count(1))
def PlotResults(setupClient, model, X_train, X_test, y_train, y_test,
                w_train, w_test, ix_train, ix_test):
    print(Fore.BLUE + "--------------------------")
    print(Back.BLUE + "         RESULTS          ")
    print(Fore.BLUE + "--------------------------")
    if setupClient.runMode == 'binary' or setupClient.runMode == 'param' or setupClient.runMode == 'SimpleRNN':
        print('Evaluating model on X_test, y_test')
        score = model.evaluate(X_test, y_test, batch_size=setupClient.Params['BatchSize'])
        # testLoss = 'Test loss:%0.3f' % score[0]
        # testAccuracy = 'Test accuracy:%0.3f' % score[1]
        print('{:<35} {:<25.3f}'.format(Fore.GREEN + 'Test loss', score[0]))
        print('{:<35} {:<25.3f}'.format(Fore.GREEN + 'Test accuracy', score[1]))

    # get the architecture as a json string
    arch = model.to_json()
    with open(os.path.join(setupClient.ModelSavePath, 'architecture.json'), 'w') as arch_file:
        print('Saving model as json', os.path.join(setupClient.ModelSavePath, 'architecture.json'))
        arch_file.write(arch)
    # now save the weights as an HDF5 file
    model.save_weights(os.path.join(setupClient.ModelSavePath, 'ModelWeights.h5'), overwrite=True)

    if not os.path.isfile(setupClient.TrainedModelPath + '/DNN_Setup'):
        print("Pickle file not found!")
        quit()
    with open(setupClient.TrainedModelPath + 'DNN_Setup', "rb") as setup_file:
        savedSetup = pickle.load(setup_file)
    minusMean = np.multiply(-1, savedSetup.Scaler.mean_)
    OneOverStd = np.divide(1, np.sqrt(savedSetup.Scaler.var_))
    with open(os.path.join(setupClient.ModelSavePath, 'Scaling.txt'), 'w') as scaleFileOut:
        scaleFileOut.write(str(setupClient.InputDNNVariables[setupClient.VarSet]) + '\n')
        scaleFileOut.write('Mean\n' + str(savedSetup.Scaler.mean_) + '\n')
        scaleFileOut.write('minusMean\n' + str(minusMean) + '\n')
        scaleFileOut.write('Var\n' + str(savedSetup.Scaler.var_) + '\n')
        scaleFileOut.write('sqrtVar\n' + str(np.sqrt(savedSetup.Scaler.var_)) + '\n')
        scaleFileOut.write('OneOverStd\n' + str(OneOverStd) + '\n')

    print('\nRunning model prediction on X train/test samples')
    yResult_test = model.predict(X_test, verbose=True, batch_size=setupClient.Params['BatchSize'])
    yResult_train = model.predict(X_train, verbose=True, batch_size=setupClient.Params['BatchSize'])
    # insert the score result back into the original file
    # ix_test['DNN_Score'] = yResult_test
    # ix_train['DNN_Score'] = yResult_train
    # ix_test.to_pickle(setupClient.ModelSavePath+'/ResultsTestPD.pkl', protocol=2)
    # ix_train.to_pickle(setupClient.ModelSavePath+'/ResultsTrainPD.pkl', protocol=2)
    # np.save(os.path.join(setupClient.ModelSavePath, "ResultsTestPD.npy"), ix_test)  # antonio
    # np.save(os.path.join(setupClient.ModelSavePath, "ResultsTrainPD.npy"), ix_train)  # antonio
    # np.save(os.path.join(setupClient.ModelSavePath, "rootBranchSubSample.npy"), ix_test.columns.values)  # antonio
    if setupClient.runMode == 'multi':
        yResult_test_cls = np.argmax(yResult_test, axis=1)    # stores the element with max score
        yResult_train_cls = np.argmax(yResult_train, axis=1)  # stores the element with max score
        theClasses = ['Zjets', 'Signal', 'Diboson', 'Top']
    else:
        yResult_test_cls = np.array([int(round(x[0])) for x in yResult_test])
        yResult_train_cls = np.array([int(round(x[0])) for x in yResult_train])
        theClasses = ['Background', 'Signal']
    # print(X_test[:20])
    # print(ix_test[:20])
    # print(yResult_test)
    # print(yResult_test_cls)
    # print(yResult_train)
    # print(yResult_train_cls)

    if setupClient.doConfusionMatrix:
        # Plot the confusion matrix. The sklearn signature is:
        # sklearn.metrics.confusion_matrix(y_true, y_pred, labels=None, sample_weight=None)
        plt.clf()
        cnf_matrix = confusion_matrix(y_test, yResult_test_cls, sample_weight=w_test)
        np.set_printoptions(precision=2)
        plot_confusion_matrix(setupClient, cnf_matrix, classes=theClasses,
                              normalize=True, title='Normalized confusion matrix')

    if setupClient.doEfficiency:
        print('Calculating Efficiencies on Test sample')
        if setupClient.runMode == 'binary' or setupClient.runMode == 'SimpleRNN':
            s_eff = w_test[(y_test == 1) & (yResult_test_cls > 0.5)].sum() / w_test[y_test == 1].sum()
            b_eff = w_test[(y_test != 1) & (yResult_test_cls > 0.5)].sum() / w_test[y_test != 1].sum()
            print(" ")
            print('{:<35} {:<25.3f}'.format(Fore.GREEN + 'Signal efficiency', s_eff))
            print('{:<35} {:<25.3f}'.format(Fore.GREEN + 'Background efficiency:', b_eff))
            print('{:<35} {:<25.3f}'.format(Fore.GREEN + 'Background rejection:', 1.0 / b_eff))
        if setupClient.runMode == 'multi':
            channelEffi = channelDic.copy()
            for channel, i in channelDic.items():
                channelEffi[channel] = w_test[(y_test == i) & (yResult_test_cls == 1)].sum() / w_test[y_test == i].sum()
            for channel, eff in channelEffi.items():
                print('{:<35} {:<25.3f}'.format(Fore.GREEN + channel + ' efficiency', eff))
            b_eff = w_test[(y_test != 1) & (yResult_test_cls == 1)].sum() / w_test[y_test != 1].sum()
            print('{:<30} {:<20.3f}'.format('Background efficiency', b_eff))
            print('{:<30} {:<20.3f}'.format('Background rejection', 1.0 / b_eff))
            print(" ")

    if setupClient.doScore:
        if setupClient.runMode == 'binary' or setupClient.runMode == 'SimpleRNN' or setupClient.runMode == 'param':
            # First create one sample of X_train only from signal and one only from background events
            Xtrain_signal = X_train[y_train == 1]
            Xtrain_background = X_train[y_train != 1]
            # Then do the same for X_test
            Xtest_signal = X_test[y_test == 1]
            Xtest_background = X_test[y_test != 1]
            # Get predictions of the model on these -train- samples
            print('Running model prediction on Xtrain_signal')
            yhat_train_signal = model.predict(Xtrain_signal, batch_size=setupClient.Params['BatchSize'])
            print('Running model prediction on Xtrain_background')
            yhat_train_background = model.predict(Xtrain_background, batch_size=setupClient.Params['BatchSize'])
            # Get predictions of the model on these -test- samples
            print('Running model prediction on Xtest_signal')
            yhat_test_signal = model.predict(Xtest_signal, batch_size=setupClient.Params['BatchSize'])
            print('Running model prediction on Xtest_background')
            yhat_test_background = model.predict(Xtest_background, batch_size=setupClient.Params['BatchSize'])

            hasData = False
            if setupClient.runMode == 'binary' and setupClient.unblind:
                # Get the data PD file
                dataFileName = setupClient.PDPath + setupClient.MixPD_TrainTestTag + '_Data.pkl'
                if os.path.isfile(dataFileName):
                    hasData = True
                    print('Reading Data file:', dataFileName)
                    data_full = pd.read_pickle(dataFileName)
                    data_full_matrix = data_full[setupClient.InputDNNVariables[setupClient.VarSet]].as_matrix()
                    print('{:<45} {:<15}'.format('Getting Scaler of Training sample from file',
                                                 Fore.GREEN + setupClient.TrainedModelPath + 'DNN_Setup'))
                    if not os.path.isfile(setupClient.TrainedModelPath + '/DNN_Setup'):
                        print("Pickle file not found!")
                        quit()
                    f = open(setupClient.TrainedModelPath + 'DNN_Setup', "rb")
                    savedSetupClient = pickle.load(f)
                    data_full_matrix = savedSetupClient.Scaler.transform(data_full_matrix)
                    # Get predictions on data
                    print('Running model prediction on data')
                    yhat_data = model.predict(data_full_matrix, verbose=True,
                                              batch_size=setupClient.Params['BatchSize'])
                    yhat_data_rounded = np.array([round(x[0]) for x in yhat_data])
                    # Save as numpy array
                    # np.save(os.path.join(setupClient.ModelSavePath, "yhat_data.npy"), yhat_data)
                else:
                    print('Data file:', dataFileName, ' not found. Will proceed to MC only')

            if setupClient.runMode == 'SimpleRNN':  # antonio
                for ifile in setupClient.InputFilesSB['Data']:
                    dataFileName = setupClient.PDPath + ifile + '_FullNoRandom.pkl'
                    if os.path.isfile(dataFileName):
                        hasData = False  # note: data scores are saved below rather than plotted
                        print('Reading Data file:', dataFileName)
                        data_full = pd.read_pickle(dataFileName)
                        VariablesSet = setupClient.InputDNNVariables[setupClient.VarSet]
                        data_full_matrix = data_full[VariablesSet].copy()
                        var_names = data_full_matrix.keys()
                        new_data_full_matrix = np.zeros((data_full_matrix.shape[0], 6, 4))
                        for i in range(0, data_full_matrix.shape[0]):
                            for j in range(0, data_full_matrix.shape[1]):
                                new_data_full_matrix[i, int(j / 4), j % 4] = data_full_matrix.iloc[i, j]
                        data_full_matrix = new_data_full_matrix
                        # apply scaling to the test set
                        PrepareData.scale(data_full_matrix, ['pt', 'eta', 'phi', 'E'], False, setupClient)
                        # Get predictions on data
                        print('Running model prediction on data')
                        yhat_data = model.predict(data_full_matrix, verbose=True,
                                                  batch_size=setupClient.Params['BatchSize'])
                        data_full['RNN_Score'] = yhat_data
                        print(data_full.shape)
                        np.save(os.path.join(setupClient.ModelSavePath,
                                             "ResultsDataMLPD_" + ifile + ".npy"), data_full)  # antonio
                        np.save(os.path.join(setupClient.ModelSavePath,
                                             "rootBranchSubSampleForDataML_" + ifile + ".npy"),
                                data_full.columns.values)  # antonio
                    else:
                        print('Data file:', dataFileName, ' not found. Will proceed to MC only')

            sns.set_palette("coolwarm", 4)
            # Plot scores
            bins = np.linspace(0, 1, 50)
            plt.hist(yhat_train_signal, bins=bins, histtype='step', lw=2,
                     alpha=0.5, label=[r'Signal Train'], normed=True)
            plt.hist(yhat_test_signal, bins=bins, histtype='stepfilled', lw=2,
                     alpha=0.5, label=[r'Signal Test'], normed=True)
            plt.hist(yhat_test_background, bins=bins, histtype='stepfilled', lw=2,
                     alpha=0.5, label=[r'Background Test'], normed=True)
            plt.hist(yhat_train_background, bins=bins, histtype='step', lw=2,
                     alpha=0.5, label=[r'Background Train'], normed=True)
            if hasData and setupClient.unblind:
                # Plot the data as well. Using skh_plt because matplotlib does not
                # come with markers for its hist class
                skh_plt.hist(yhat_data, bins=bins, errorbars=True, histtype='marker',
                             label='Data', color='black', normed=True)
            plt.ylabel('Norm. Entries')
            plt.xlabel('DNN score')
            plt.legend(loc="upper center")
            plt.savefig(setupClient.ModelSavePath + "/MC_Data_TrainTest_Score.png")
            plt.yscale('log')
            plt.savefig(setupClient.ModelSavePath + "/MC_Data_TrainTest_Score_log.png")
            plt.clf()

    if setupClient.doROC:
        if setupClient.runMode == 'binary' or setupClient.runMode == 'SimpleRNN' or setupClient.runMode == 'param':
            # Get the 'Receiver operating characteristic' (ROC)
            fpr, tpr, thresholds = roc_curve(y_test, yResult_test)
            # Compute the Area Under the Curve (AUC) from prediction scores
            roc_auc = auc(fpr, tpr)
            print('{:<35} {:<25.3f}'.format(Fore.GREEN + 'ROC AUC', roc_auc))
            plt.plot(fpr, tpr, color='darkorange', lw=2,
                     label='Full curve (area = %0.2f)' % roc_auc)
            plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # diagonal reference line
            plt.xlim([-0.05, 1.0])
            plt.ylim([0.0, 1.05])
            plt.ylabel('True Positive Rate')
            plt.xlabel('False Positive Rate')
            plt.title('ROC curves for Signal vs Background')
            plt.legend(loc="lower right")
            # plt.plot([0.038], [0.45], marker='*', color='red', markersize=5, label="Cut-based", linestyle="None")
            # plt.plot([0.038, 0.038], [0, 1], color='red', lw=1, linestyle='--')  # same background rejection point
            plt.savefig(setupClient.ModelSavePath + "/ROC.png")
            plt.clf()

            # Now try the weighted ROC curve
            fpr_w, tpr_w, thresholds_w = roc_curve(y_test, yResult_test, sample_weight=w_test)
            roc_auc_w = auc(fpr_w, tpr_w, reorder=True)
            print('{:<35} {:<25.3f}'.format(Fore.GREEN + 'ROC AUC weighted', roc_auc_w))
            plt.plot(fpr_w, tpr_w, color='darkorange', lw=2,
                     label='Full curve (area = %0.2f)' % roc_auc_w)
            plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # diagonal reference line
            plt.xlim([-0.05, 1.0])
            plt.ylim([0.0, 1.05])
            plt.ylabel('True Positive Rate (weighted)')
            plt.xlabel('False Positive Rate (weighted)')
            plt.title('ROC curve for Signal vs Background')
            plt.legend(loc="lower right")
            plt.savefig(setupClient.ModelSavePath + "/ROC_weighted.png")
            plt.clf()

            np.save(os.path.join(setupClient.ModelSavePath, "tpr_w.npy"), tpr_w)
            np.save(os.path.join(setupClient.ModelSavePath, "fpr_w.npy"), fpr_w)
            np.save(os.path.join(setupClient.ModelSavePath, "thresholds_w.npy"), thresholds_w)
            np.save(os.path.join(setupClient.ModelSavePath, "thresholds.npy"), thresholds)
            np.save(os.path.join(setupClient.ModelSavePath, "tpr.npy"), tpr)
            np.save(os.path.join(setupClient.ModelSavePath, "fpr.npy"), fpr)
            np.save(os.path.join(setupClient.ModelSavePath, "AUC.npy"), roc_auc)
            np.save(os.path.join(setupClient.ModelSavePath, "AUC_w.npy"), roc_auc_w)
import Clusterisation, LoadData, MakePlot, PDF_maker, PrepareData, DataFromClusters

n_clusters = 156  # or -1 for clusters == crime types, capped at 156
primaryType = True  # == !FBI code
# the dictionary only exists when primaryType == True
X, y, data, dictionary_crimes = PrepareData.prepareData(LoadData.load_from_csv(primaryType), primaryType)
k_means, n_clusters = Clusterisation.make_k_means(n_clusters, data)
MakePlot.make_plot(k_means, X, y, n_clusters)
data_clusters = DataFromClusters.get_data_clusters(k_means, X, y)
if primaryType:
    n = 1
    for i in data_clusters:
        PDF_maker.make_pdf_clusters(i, dictionary_crimes, n)
        n += 1
else:
    n = 1
    for i in data_clusters:
        PDF_maker.make_pdf_clusters_without_descr(i, n)
        n += 1
PDF_maker.makeCounter(data, dictionary_crimes, primaryType)
        # (fragment of NeuralNetwork.train: the backpropagation step)
        slope_output_layer = self.sigmoid_derivative(output_layer_output)
        slope_hidden_layer = self.sigmoid_derivative(hidden_layer_output)
        d_output = E * slope_output_layer
        Error_at_hidden_layer = dot(d_output, self.weights1.T)
        d_hiddenlayer = Error_at_hidden_layer * slope_hidden_layer
        # GRADIENT DESCENT
        self.weights1 = self.weights1 + dot(hidden_layer_output.T, d_output) * learning_rate
        self.weights0 = self.weights0 + dot(input_layer.T, d_hiddenlayer) * learning_rate

    def prediction(self, pred):
        layer1 = self.sigmoid(dot(pred, self.weights0))
        layer2 = self.sigmoid(dot(layer1, self.weights1))
        return layer2


ann = NeuralNetwork()
PD.prepare_training_data()
nn_train_inp = array(PD.training_input)
nn_train_oup = expand_dims(PD.training_output, axis=1)
ann.train(nn_train_inp, nn_train_oup, 10000)
PD.prepare_testing_data()
nn_test_inp = array(PD.testing_input)
nn_test_oup = ann.prediction(nn_test_inp)
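# The slopes above call self.sigmoid_derivative on layer *outputs*, which
# implies the derivative is expressed in terms of the sigmoid's output value.
# A minimal sketch of the two helpers under that assumption (exp is assumed to
# come from the same numpy star-import as dot/array/expand_dims above):
def sigmoid_sketch(x):
    return 1.0 / (1.0 + exp(-x))

def sigmoid_derivative_sketch(s):
    # s is already sigmoid(x), so sigma'(x) = s * (1 - s)
    return s * (1.0 - s)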
import random

import torch
from torch import nn
from torch.autograd import Variable
from torch.utils.data import DataLoader

from DecoderRNN import DecoderRNN
from EncoderRNN import EncoderRNN
from PrepareData import *
from attn import Attn
from attn_decoder import AttnDecoder
from seq2seq_dataset import Seq2SeqDataset

data = PrepareData()
SOS = 0
use_cuda = torch.cuda.is_available()


class Translate():
    def __init__(self):
        self.data = PrepareData()
        self.dataset = Seq2SeqDataset()
        self.data_loader = DataLoader(dataset=self.dataset, batch_size=1, shuffle=True)
        # read from the instance's own PrepareData rather than the module-level one
        self.lang_1 = self.data.lang_1
        self.lang_2 = self.data.lang_2
        self.char2index = self.data.char2index
        self.index2char = self.data.index2char
if setupClient.ConvertRootToPD:
    print(Fore.BLUE + "--------------------------")
    print(Back.BLUE + ' CONVERTING ROOT-->PANDAS ')
    print(Fore.BLUE + "--------------------------")
    print('{:<45} {:<15}'.format("Input Flat Ntuples directory",
                                 Fore.GREEN + setupClient.InputMLNtuplePath))
    print('{:<45} {:<15}'.format('Output Pandas Dataframe directory',
                                 Fore.GREEN + setupClient.PDPath),
          checkCreateDir(setupClient.PDPath))
    print('{:<45} {:<15}'.format('Branches to keep from ROOT file',
                                 Fore.GREEN + str(setupClient.rootBranchSubSample)))
    PrepareData.convertToPanda(setupClient)
elif setupClient.CreateTrainTestPD:
    print(Fore.BLUE + "--------------------------")
    print(Back.BLUE + ' CREATING TRAIN/TEST PDs  ')
    print(Fore.BLUE + "--------------------------")
    print('{:<45} {:<15}'.format('InputFilesSB', Fore.GREEN + str(InputFilesSB)))
    print('{:<45} {:<15}'.format('I/O Pandas Dataframe directory',
                                 Fore.GREEN + setupClient.PDPath),
          checkCreateDir(setupClient.PDPath))
    print('{:<45} {:<15}'.format('PD Train/Test Name Tag',
                                 Fore.MAGENTA + setupClient.MixPD_TrainTestTag))
    print('{:<45} {:<15}'.format('PreselectionCuts',
                                 Fore.MAGENTA + setupClient.PreselectionCuts))
from sentence_transformers import SentenceTransformer
import scipy.spatial
import numpy as np

import PrepareData

embedder = SentenceTransformer('output/training_tf-idf_word_embeddings-2020-06-19_15-54-05')

# Corpus with example sentences
corpus = PrepareData.load_data()

corpus_embeddings = []
for document in corpus:
    sentences_embeddings = embedder.encode(document)
    sentences_embeddings = np.array(sentences_embeddings)
    # average the sentence embeddings to get one vector per document
    document_embedding = np.mean(sentences_embeddings, axis=0)
    corpus_embeddings.append(document_embedding)

# Query sentences:
#
# similarity_matrix = []
# for first_doc in corpus_embeddings:
#     similarity_vector = []
#     for second_doc in corpus_embeddings:
#         similarity_vector.append(1 - scipy.spatial.distance.cosine(first_doc, second_doc))
#     similarity_matrix.append(similarity_vector)
#
# similarity_matrix = np.array(similarity_matrix)
# print(similarity_matrix)

# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
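# A vectorized alternative to the commented-out double loop above: compute the
# full document-to-document cosine-similarity matrix in one call. A sketch
# using scipy's cdist; it assumes corpus_embeddings is non-empty.
emb = np.array(corpus_embeddings)
similarity_matrix = 1 - scipy.spatial.distance.cdist(emb, emb, metric='cosine')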
import numpy as np

import libs.Utils as utils
import PrepareData as data

# Load dataset
df = data.get_clean_dataset()


def plot_avalanche_activity_per_year():
    '''
    :def: This function creates a bar plot displaying the avalanche activity
          (number of avalanches) per year of study (21 years in total).
    :return: void
    '''
    # Plot the number of avalanches per year
    utils.create_bar_plot(df, 'year', 'Year', 'Num avalanches', True)
    # From this plot we can see there hasn't been a clear pattern of increasing or
    # decreasing avalanche activity over time. In addition, the data collected in
    # the first seven years is much sparser than in the years after, so
    # comparisons between years should take the entire data set into account.


def plot_avalanche_feature_correlation():
    '''
    :def: This function creates the correlation map between pre-selected columns,
          including categorical variables.
    :return: void
    '''
The guide notebook is saved at http://nbviewer.jupyter.org/github/BlueBirdHouse/CarND-TensorFlow-Lab/blob/master/lab.ipynb
"""
from sklearn.utils import resample
from sklearn.model_selection import train_test_split

import PrepareData

#A3 = np.array([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 255],[255, 255, 255, 255, 255, 255, 255, 255, 255,255,255,255]])
#train_features1,NG = PrepareData.normalize_grayscale(A3)
#Temp = 0

#%% Run the data-import step
# Load the training set
train_features, train_labels = PrepareData.uncompress_features_labels('notMNIST_train.zip')
# Load the test set
test_features, test_labels = PrepareData.uncompress_features_labels('notMNIST_test.zip')

#%% Problem 1: data normalization
# Problem 1 - Implement Min-Max scaling for grayscale image data
train_features, train_features_NG = PrepareData.normalize_grayscale(train_features)
test_features, test_features_NG = PrepareData.normalize_grayscale(test_features)

#%% Remove problematic data
train_features, train_labels = PrepareData.deleteNan(train_features, train_labels, train_features_NG)
test_features, test_labels = PrepareData.deleteNan(test_features, test_labels, test_features_NG)

#%% Take a subset of the original data; the original author did this to fit within the virtual machine's memory limits
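# For reference: a minimal sketch of the Min-Max scaling step named in
# Problem 1 above. The target range [0.1, 0.9] is the one used in the original
# CarND TensorFlow lab; PrepareData.normalize_grayscale may differ (e.g. in the
# second value it returns, which appears to flag bad rows for deleteNan).
def normalize_grayscale_sketch(image_data, a=0.1, b=0.9):
    x_min, x_max = 0.0, 255.0
    return a + (image_data - x_min) * (b - a) / (x_max - x_min)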
def plot_avalanche_activity_vs_aspect():
    # - Is the orientation of the avalanche important?
    # Plot the avalanche activity vs the aspect degree
    df_north, df_northeast, df_east, df_southeast, df_south, df_southwest, df_west, df_northwest = data.get_df_aspect()
    labels = [
        'N\n' + str(len(df_north)), 'NE\n' + str(len(df_northeast)),
        'E\n' + str(len(df_east)), 'SE\n' + str(len(df_southeast)),
        'S\n' + str(len(df_south)), 'SW\n' + str(len(df_southwest)),
        'W\n' + str(len(df_west)), 'NW\n' + str(len(df_northwest))
    ]
    coordinates = [
        'North', 'North-East', 'East', 'South-East',
        'South', 'South-West', 'West', 'North-West'
    ]
    # For better readability each aspect gets the same angular space in the pie
    pie_weights = [1 / 8] * 8
    # It is the color of each wedge that encodes the weight (number of avalanches)
    weight = [
        df_north.shape[0], df_northeast.shape[0], df_east.shape[0],
        df_southeast.shape[0], df_south.shape[0], df_southwest.shape[0],
        df_west.shape[0], df_northwest.shape[0]
    ]
    weight_cmap = np.true_divide(weight, len(df_northeast))
    # Take the percentage of each orientation
    total = sum(weight)
    percentage = np.around(np.true_divide(weight, total), 2)
    percentage_ = np.multiply(percentage, 100)
    percentage_int = [int(i) for i in percentage_]
    percentage_str = [
        str(len(df_north)) + '\n' + str(percentage_int[0]) + '%',
        str(len(df_northeast)) + '\n' + str(percentage_int[1]) + '%',
        str(len(df_east)) + '\n' + str(percentage_int[2]) + '%',
        str(len(df_southeast)) + '\n' + str(percentage_int[3]) + '%',
        str(len(df_south)) + '\n' + str(percentage_int[4]) + '%',
        str(len(df_southwest)) + '\n' + str(percentage_int[5]) + '%',
        str(len(df_west)) + '\n' + str(percentage_int[6]) + '%',
        str(len(df_northwest)) + '\n' + str(percentage_int[7]) + '%'
    ]
    #utils.create_pie(sizes=pie_weights, labels=labels, colorweight=weight_cmap, startangle=90+22)
    utils.create_two_pie(sizes1=pie_weights, sizes2=pie_weights,
                         labels1=coordinates, labels2=percentage_str,
                         colorweight=weight_cmap, startangle=90 + 22)
        X_train, X_test, y_train, y_test = train_test_split(Train.X, Train.y, test_size=0.25, random_state=4)
        clf = xgb.XGBModel(max_depth=8, n_estimators=100, objective="reg:linear",
                           random_state=17, n_jobs=-1)
        clf.fit(X_train, y_train, eval_metric='rmse', verbose=True,
                eval_set=[(X_train, y_train), (X_test, y_test)])
        clf.save_model('./model/XGBoost.model')
        pickle.dump(clf, open("XGBoost.pickle.dat", "wb"))


if __name__ == "__main__":
    aa = MakeSeperateFile.MakeSeperateFile(sys.argv[1])
    aa.makeFile()
    with open('BusIdList.csv', 'rb') as f:
        bus_id_list = pickle.load(f)  # renamed from 'list' to avoid shadowing the builtin
    data = PrepareData.PrepareData(bus_id_list, 2000)
    data.prepareData()
    col = ["Lat1", "Lat2", "Long1", "Long2", "hour", "minute", "second",
           "timeTaken", "week", 'sin1', 'sin2', 'sin3', 'sin4', 'sin5', 'sin6',
           'sin7', 'sin8', 'sin9', 'sin10', 'sin11', 'sin12', 'sin13', 'sin14',
           'cos1', 'cos2', 'cos3', 'cos4', 'cos5', 'cos6', 'cos7', 'cos8', 'cos9',
           'cos10', 'cos11', 'cos12', 'cos13', 'cos14']
    final_data = pd.read_csv('./final_data/finalData.csv', names=col)
    train = Train(final_data)
    train.Linear_Regression()
    train.RandomForest()
    train.XGBoost()
    test_df = pd.read_csv(sys.argv[2])
    model = pickle.load(open('./model/LinearRegression.pickle', 'rb'))
def TrainingModel():
    nb_epochs = Parameters.nb_of_epochs
    prepare = PrepareData.prepareData()
    train, train_label, test, test_label, test_names = prepare.generateInputData()
    best_roc = -1
    # Construct the neural network
    model = get_model()
    print(model.summary())
    # Save the structure of the neural network
    model_json = model.to_json()
    with open(Parameters.model_path + 'model' + ".json", "w") as json_file:
        json_file.write(model_json)
    writeParamsModel()
    train_acc = []
    train_loss = []
    test_acc = []
    test_loss = []
    # Train
    for epoch in range(nb_epochs):
        print('\nEPOCH:', epoch + 1, 'of', nb_epochs)
        history = model.fit(train, train_label, epochs=1, shuffle=True,
                            batch_size=Parameters.batch_size)
        loss, acc = model.evaluate(test, test_label)
        # Save training accuracy and loss
        train_acc.append(history.history.get('acc')[0])
        train_loss.append(history.history.get('loss')[0])
        # Print and save test acc and loss
        print('\tTest - loss: ', loss, '- acc:', acc)
        test_acc.append(acc)
        test_loss.append(loss)
        # Predict
        score = model.predict(test)
        predicted_label = model.predict_classes(test)
        unique, counts = np.unique(predicted_label, return_counts=True)
        # Print the number of 0s and 1s predicted
        print("\tPredicted labels:", dict(zip(unique, counts)))
        # ROC
        roc = roc_auc_score(test_label, score)
        rocValues.append(roc)
        print('\tROC =', roc)
        # Keep the epoch with the best performance
        if roc >= best_roc:
            best_roc = roc
            save_epoch = epoch
            best_score = score
            model.save_weights(Parameters.weights_path + str(save_epoch) + ".h5")
            best_predicted_label = predicted_label
            bestAcc = acc
        # Stop if too much overfitting appears
        save_plots(train_acc, train_loss, test_acc, test_loss, rocValues)
        if history.history.get('acc')[0] - acc > 0.2:
            break
    print('Run:', Parameters.run)
    print('Performance of ProDec-BLSTM: roc:', best_roc)
    writeResults(test_names, test_label, best_score, best_predicted_label,
                 Parameters.run, best_roc, bestAcc, False)
    return train_acc, train_loss, test_acc, test_loss
import matplotlib.pyplot as PLT

import PrepareData as PD
import Accuracy as ACC

accuracy = ACC.count_accuracy()
PD.original_graph_data()

# PLOTTING ORIGINAL FEATURES
PLT.scatter(PD.p1x, PD.p1y, label="setosa", color="red", marker="o", s=50)
PLT.scatter(PD.p2x, PD.p2y, label="versicolor", color="green", marker="o", s=50)
PLT.scatter(PD.p3x, PD.p3y, label="virginica", color="blue", marker="o", s=50)

# PLOTTING PREDICTED FEATURES
PLT.scatter(ACC.pred_x, ACC.pred_y, label="prediction", color="black", marker="|", s=100)

PLT.xlabel('sepal length')
PLT.ylabel('sepal width')
PLT.title(accuracy)
PLT.legend()
PLT.show()
# Extract the dataset
train_folders = ImportData.maybe_extract(train_filename)
test_folders = ImportData.maybe_extract(test_filename)

# seed the random generator to get the same random series
np.random.seed(133)

# Problem 1: display samples of the images to check that they look good
print("Problem 1: display samples of images to check that they look good")
if not SKIP:
    DataCheckers.displayLettersAsImage(test_folders)

# create classes, normalize the dataset & put the files in a manageable format
train_dataset = PrepareData.maybe_pickle(train_folders, 15000)
test_datasets = PrepareData.maybe_pickle(test_folders, 1800)

# Problem 2: display letters from the dataset array to check that they look good
print("Problem 2: display letters from the dataset array to check that they look good")
if not SKIP:
    DataCheckers.plotRandomLettersFromDataset(test_datasets)
    DataCheckers.plotRandomLettersFromDataset(train_dataset)

# Problem 3: check that the class repartition is even
print("Problem 3: check that the class repartition is even")
if not SKIP:
    print("Variance test dataset: " + str(DataCheckers.checkRepartition(test_datasets)))
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
'''
import PrepareData
import Models
import quandl
import h2o

# Please replace this key with your own if some improbable problem with quandl occurs
quandl.ApiConfig.api_key = "rFhqT3ot2z_6AnzpB9nU"

# Get all the data needed from Quandl, run Preprocess I and II, and have the features DataFrame ready to train
X = PrepareData.PrepareFeatures()
# Prepare the label to predict CRISIS 6 months ahead and get the labels Series
y = PrepareData.PrepareLabel(months_ahead=6, dates_index=X.index)
# Create one dataset
dataset = PrepareData.MergeDataset(X, y)
# Train our preliminary model as documented in the pdf
testframe, prel_model = Models.TrainPreliminarModel(dataset)
# Get the performance summary of the preliminary model
prel_model.model_performance(testframe)
# Train our final model as documented in the pdf, with 5-fold CV
final_model = Models.TrainCrossValidation5FoldFinalModel(dataset)
char_file_path = 'char.json'
with open(char_file_path, 'r') as char_file:
    char_list = js.load(char_file)
char_dic = Build_Char_One_Hot_Dic.one_hot_encoding(char_list)
one_hot_feature_dim = len(char_list)

train_file_path = str(Path().resolve().parent) + '/Offline-Challenge/test/xtrain_obfuscated.txt'
with open(train_file_path, 'r') as raw_data_file:
    raw_data = read_raw_data(raw_data_file)

validate_file_path = str(Path().resolve().parent) + '/Offline-Challenge/xtest_obfuscated.txt'
with open(validate_file_path, 'r') as validate_raw_file:
    validate_raw = read_raw_data(validate_raw_file)

x_one_hot, text_length = PrepareData.prepare_data(raw_data, char_dic, one_hot_feature_dim)
x_validate_one_hot, text_length = PrepareData.prepare_data(
    validate_raw, char_dic, one_hot_feature_dim, text_length=text_length)

label_file_path = str(Path().resolve().parent) + '/Offline-Challenge/test/ytrain.txt'
with open(label_file_path, 'r') as label_file:
    label_data = read_label(label_file)
y = label_data

main()
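# For reference: a minimal sketch of the char -> one-hot mapping that
# Build_Char_One_Hot_Dic.one_hot_encoding is assumed to produce above, i.e. a
# dict from each character to a unit vector of length len(char_list). The real
# module may return lists or use another layout.
import numpy as np

def one_hot_encoding_sketch(char_list):
    dim = len(char_list)
    return {c: np.eye(dim, dtype=np.float32)[i] for i, c in enumerate(char_list)}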
def main():
    # you can input a list of integers for the raw n-gram combination
    if len(sys.argv) == 1:
        n_gram_list = [5]
    else:
        n_gram_list = sys.argv[1:len(sys.argv)]
        n_gram_list = list(map(int, n_gram_list))
    x = PrepareData.feat_extraction(n_gram_list, x_one_hot)
    x_validate = PrepareData.feat_extraction(n_gram_list, x_validate_one_hot)
    n_feat = x.shape[1]
    raw_data_size = (n_feat, text_length, 1)
    n_classes = y.shape[1]
    k = 5
    k_fold_sequence = data_set_k_fold_separation(x.shape[0], k)
    output_train = open('train_acc.txt', 'w')
    output_test = open('test_acc.txt', 'w')
    y_validate = []
    train_loss_acc = []
    test_loss_acc = []
    for i in range(k):
        test_seq = k_fold_sequence[i]
        train_seq = []
        for j in range(k):
            if i != j:
                train_seq.extend(k_fold_sequence[j])
        nc = NC(input_size=raw_data_size, n_classes=n_classes, raw_feature_dim=n_feat)
        xtrain = x[train_seq]
        ytrain = y[train_seq]
        nc.fit([xtrain, xtrain], ytrain)
        eval_train_result = nc.evaluation([xtrain, xtrain], ytrain)
        train_loss_acc.append(eval_train_result)
        print(eval_train_result)
        print([k, eval_train_result], file=output_train)
        xtest = x[test_seq]
        ytest = y[test_seq]
        eval_test_result = nc.evaluation([xtest, xtest], ytest)
        test_loss_acc.append(eval_test_result)
        print(eval_test_result)
        print([k, eval_test_result], file=output_test)
        y_validate_k = nc.predict([x_validate, x_validate])
        y_validate_k = y_validate_k.argmax(axis=1)
        y_validate.append(y_validate_k)
    print(['average', np.mean(np.asarray(train_loss_acc), axis=0)], file=output_train)
    print(['average', np.mean(np.asarray(test_loss_acc), axis=0)], file=output_test)
    y_validate_file_path = 'ytest_all.txt'
    np.savetxt(fname=y_validate_file_path, X=np.asarray(y_validate), fmt='%i')
    # majority vote across the k folds for the final prediction
    Y = np.asarray(y_validate).transpose()
    y_final = []
    for tmp in Y:
        y_final.append(np.bincount(tmp).argmax())
    y_final_validate_file_path = 'ytest.txt'
    np.savetxt(fname=y_final_validate_file_path, X=y_final, fmt='%i')