def TrainBuildTransformer():
    with open("Model/Config.json") as Fd:
        ConfigDict = json.load(Fd)
    MaxLength = ConfigDict["MaxLength"]
    BatchSize = ConfigDict["BatchSize"]
    EmbeddingSize = ConfigDict["EmbeddingSize"]
    HeadNum = ConfigDict["HeadNum"]
    EnLayer = ConfigDict["EnLayer"]
    DeLayer = ConfigDict["DeLayer"]
    Dropout = ConfigDict["Dropout"]

    print("Loading Tgt vocab")
    TgtDict = DLoad.LoadVocabulary("Data/tgt.vocab")
    print("Tgt vocab loading finished")
    print("Loading Src vocab")
    SrcDict = DLoad.LoadVocabulary("Data/src.vocab")
    print("Src vocab loading finished")

    # SrcIndSentences, SrcLength, SrcDict = DLoad.LoadData(
    #     "Data/src.sents", "Data/src.vocab", MaxLength)
    # TgtIndSentences, TgtLength, TgtDict = DLoad.LoadData(
    #     "Data/tgt.sents", "Data/tgt.vocab", MaxLength)
    # TrainDataset = DLoad.TrainCorpusDataset(
    #     SrcIndSentences, SrcLength, TgtIndSentences, TgtLength)
    # BatchDatas = DLoad.TrainDataLoaderCreator(TrainDataset, BatchSize)

    SrcVocabularySize = SrcDict.VocabularySize()
    TgtVocabularySize = TgtDict.VocabularySize()

    print("Building Model")
    Trans = TransformerNMTModel(HeadNum, EmbeddingSize, SrcVocabularySize,
                                TgtVocabularySize, MaxLength, EnLayer, DeLayer,
                                Dropout)
    print("Model building finished")

    # return Trans, BatchDatas, SrcDict, TgtDict, MaxLength, EmbeddingSize
    return Trans, BatchSize, SrcDict, TgtDict, MaxLength, EmbeddingSize
def Test15():
    Sentences = DL.LoadSentences("src.sents")
    PaddedSentences, Length = DL.PaddingSentences(Sentences, 30)
    for Sent in PaddedSentences:
        print(Sent)
    for L in Length:
        print(L)
def Test16():
    Sentences = DL.LoadSentences("src.sents")
    PaddedSentences, Length = DL.PaddingSentences(Sentences, 30)
    Dict = DL.LoadVocabulary("src.vocab")
    IndSentences = DL.ChangePaddedSentencesToInd(PaddedSentences, Dict)
    print(Dict.VocabularySize())
    for Sent in IndSentences:
        print(Sent)
def train_model(self, training_data, training_label, testing_data, usps,
                NUM_EPOC=5000, BATCH_SIZE=128):
    """
    Train neural network model.
    :param training_data: features for training
    :param training_label: training target
    :param testing_data: testing dataset
    :param usps: USPS dataset (features, labels)
    :param NUM_EPOC: number of epochs
    :param BATCH_SIZE: size of each batch
    :return: predicted test labels, training accuracy per epoch, predicted USPS labels
    """
    training_accuracy = []
    with tf.Session() as sess:
        # Initialize global variables
        tf.global_variables_initializer().run()
        # ------------ training started -------------
        for epoch in range(NUM_EPOC):
            # Shuffle the training dataset at each epoch
            p = np.random.permutation(range(len(training_data)))
            training_data = training_data[p]
            training_label = training_label[p]
            # Start batch training
            for start in range(0, len(training_data), BATCH_SIZE):
                end = start + BATCH_SIZE
                sess.run(self.training,
                         feed_dict={
                             self.inputTensor: training_data[start:end],
                             self.outputTensor: training_label[start:end]
                         })
            # Append training accuracy for the current epoch
            training_accuracy.append(
                np.mean(
                    np.argmax(training_label, axis=1) == sess.run(
                        self.prediction,
                        feed_dict={
                            self.inputTensor: training_data,
                            self.outputTensor: training_label
                        })))
        # Testing
        predicted_test_label = sess.run(
            self.prediction, feed_dict={self.inputTensor: testing_data})
        DataLoad.write_to_csv("nn_test.csv", predicted_test_label)
        predicted_usps_label = sess.run(
            self.prediction, feed_dict={self.inputTensor: usps[0]})
        DataLoad.write_to_csv("nn_test_usps.csv", predicted_usps_label)
    return predicted_test_label, training_accuracy, predicted_usps_label
def predict1(self, users_df, sessions, products):
    users = copy.deepcopy(users_df)
    users = data.favourite_products(users, sessions, products)
    users = data.spendings(users, sessions, products)
    users = data.discounts_stats(users, sessions)
    users = users.set_index('user_id')
    users = users.drop(['name', 'city', 'street'], axis=1)
    users = users.fillna(0)
    return self.clf.predict(users)
def train(self, users_df, sessions, products):
    users = copy.deepcopy(users_df)
    users = data.favourite_products(users, sessions, products)
    users = data.spendings(users, sessions, products)
    users = data.discounts_stats(users, sessions)
    users = data.discounts_label(users, sessions)
    users = users.set_index('user_id')
    users = users.drop(['name', 'city', 'street'], axis=1)
    users = users.fillna(0)
    y_train = users['label']
    X_train = users.drop('label', axis=1)
    self.clf.fit(X_train, y_train)
def get_agl_pdf(filename):
    pdf = Aglomerative.create_pdf(DataLoad.get_transformed_data(filename), 10)
    response = make_response(pdf)
    response.headers['Content-Disposition'] = (
        'attachment; filename="police_stops_report.pdf"')
    response.mimetype = 'application/pdf'
    return response
def train_model(self, feature_d, target, learning_rate, num_epoch=400, theta=0.1):
    """
    Train logistic regression model.
    :param feature_d: features
    :param target: target values for the given features
    :param learning_rate: learning rate for the model
    :param num_epoch: number of epochs
    :param theta: temperature parameter passed to the softmax
    :return: list of accuracy at each epoch
    """
    # Weight initialization
    self.weights = np.zeros((feature_d.shape[1], target.shape[1]))
    training_accuracy = []
    for i in range(num_epoch):
        z = np.dot(feature_d, self.weights)
        # hypothesis = self.sigmoid(z)
        hypothesis = self.softmax(z, theta=theta)
        gradient = np.dot(feature_d.T, (hypothesis - target)) / target.shape[0]
        self.weights -= learning_rate * gradient
        # print("hypothesis shape", hypothesis.shape, "target shape", target.shape,
        #       "weights", self.weights.shape, "grad shape", gradient.shape,
        #       "new weights", self.weights.shape)
        training_accuracy.append(
            DataLoad.get_accuracy_logistic(np.round(hypothesis), target))
    return training_accuracy
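# The self.softmax(z, theta=theta) call above is not defined in this file.
# The sketch below is only an assumption of what a temperature-scaled,
# numerically stable softmax matching that call site might look like; the
# name softmax_sketch and its exact behaviour are illustrative, not the
# project's actual implementation.
import numpy as np

def softmax_sketch(z, theta=1.0):
    """Row-wise softmax with temperature theta applied to the logits."""
    scaled = theta * z
    # Subtract the row maximum before exponentiating to avoid overflow.
    scaled -= np.max(scaled, axis=1, keepdims=True)
    exp_z = np.exp(scaled)
    return exp_z / np.sum(exp_z, axis=1, keepdims=True)

# Example: softmax_sketch(np.array([[1.0, 2.0, 3.0]])) returns one row
# of probabilities that sums to 1.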
def start_neural_network(self, learning_rate=0.002, num_epoch=50, show_graph=False):
    """
    Load the dataset, train the neural network model, and print accuracy
    on the test and USPS datasets.
    :param learning_rate: learning rate
    :param num_epoch: number of epochs
    :param show_graph: whether to display the training accuracy graph
    :return: None
    """
    print("Neural Network")
    train, validation, test, usps = DataLoad.create_dataset()
    train_target = DataLoad.convert_target(train[1])
    test_target = DataLoad.convert_target(test[1])
    print("start define model")
    self.define_model(num_features=train[0].shape[1],
                      num_buckets=train_target.shape[1],
                      learning_rate=learning_rate)
    print("start training model")
    predicted_test_label, training_accuracy, usps_test_label = self.train_model(
        training_data=train[0],
        training_label=train_target,
        testing_data=test[0],
        usps=usps,
        NUM_EPOC=num_epoch,
        BATCH_SIZE=100)
    # xpred = DataLoad.pick_max_result(predicted_test_label)
    print("Testing Accuracy: ",
          DataLoad.get_accuracy(predicted_test_label, test[1]))
    print(confusion_matrix(test[1], predicted_test_label))
    # xpred = DataLoad.pick_max_result(usps_test_label)
    print("Testing USPS Accuracy: ",
          DataLoad.get_accuracy(usps_test_label, usps[1]))
    print(confusion_matrix(usps[1], usps_test_label))
    if show_graph:
        import matplotlib.pyplot as plt
        plt.plot(training_accuracy)
        plt.show()
def tRBF():
    '''Radial basis function test'''
    Loc, POI, Prec = DataLoad.lcsv(r'TestData\GaugeLoc.csv',
                                   r'TestData\InterpPts.csv',
                                   r'TestData\Dataset.csv')
    Z, Zavg = RBF.Interp_bat(Loc, POI, Prec, 0, 20)
    return 'Radial Basis Function Interpolation working fine!'
def tCubic():
    '''Cubic interpolator test'''
    Loc, POI, Prec = DataLoad.lcsv(r'TestData\GaugeLoc.csv',
                                   r'TestData\InterpPts.csv',
                                   r'TestData\Dataset.csv')
    Z, Zavg = Cubic.Interp_bat(Loc, POI, Prec, 0, 20)
    return 'Cubic Interpolation working fine!'
def tLinear():
    '''Linear interpolator test'''
    Loc, POI, Prec = DataLoad.lcsv(r'TestData\GaugeLoc.csv',
                                   r'TestData\InterpPts.csv',
                                   r'TestData\Dataset.csv')
    Z, Zavg = Linear.Interp_bat(Loc, POI, Prec, 0, 20)
    return 'Linear Interpolation working fine!'
def tIDW():
    '''IDW test'''
    Loc, POI, Prec = DataLoad.lcsv(r'TestData\GaugeLoc.csv',
                                   r'TestData\InterpPts.csv',
                                   r'TestData\Dataset.csv')
    Z, Zavg = IDW.Interp_bat(Loc, POI, Prec, 2.0, 0.00001, 0, 20)
    return 'IDW working fine!'
def create_pdf(count_klast, filename):
    data = DataLoad.get_transformed_data(filename).as_matrix()
    k_means = KMeans(n_clusters=count_klast, random_state=1)
    k_means.fit(data)
    centers = k_means.cluster_centers_
    klusters = []
    for j in range(count_klast):
        klusters.append([])
    for i, la in enumerate(k_means.labels_):
        klusters[la].append(data[i])
    output = cStringIO.StringIO()
    p = canvas.Canvas(output)
    number = 1
    it = 1
    it2 = 1
    for k in klusters:
        if (800 - it * 20) < 20:
            it = 1
            it2 = 1
            p.showPage()
        it = it + 1
        it2 = it2 + 1
        p.drawString(
            100, 800 - it * 20,
            "center of klusters num" + str(number) + " age: " +
            str(int(centers[number - 1][0])) + " time: " +
            str(datetime.timedelta(seconds=int(centers[number - 1][2]))))
        it = it + 1
        it2 = it2 + 1
        p.drawString(50, 800 - it * 20, "Men")
        p.drawString(200, 800 - it * 20, "Woman")
        it = it + 1
        it2 = it2 + 1
        for i in k:
            if int(i[1]) == 0:
                if (800 - it * 20) < 20:
                    it = 1
                    it2 = 1
                    p.showPage()
                p.drawString(
                    50, 800 - it * 20,
                    "age: " + str(int(i[0])) + " time: " +
                    str(datetime.timedelta(seconds=int(i[2]))))
                it = it + 1
            else:
                if (800 - it2 * 20) < 20:
                    it = 1
                    it2 = 1
                    p.showPage()
                p.drawString(
                    200, 800 - it2 * 20,
                    "age: " + str(int(i[0])) + " time: " +
                    str(datetime.timedelta(seconds=int(i[2]))))
                it2 = it2 + 1
        number = number + 1
    p.save()
    pdf_out = output.getvalue()
    output.close()
    return pdf_out
def combine_model(data_=MNIST):
    _, _, test, usps = DataLoad.create_dataset()
    logistic = load_data(data_[0])
    ran_for = load_data(data_[1])
    svm = load_data(data_[2])
    nn = load_data(data_[3])
    combine_result = []
    for i in range(len(logistic)):
        combine_result.append(get_mode([logistic[i], ran_for[i], svm[i], nn[i]]))
    if data_ == MNIST:
        cm = confusion_matrix(test[1], combine_result)
        print(DataLoad.get_accuracy(test[1], combine_result))
    else:
        cm = confusion_matrix(usps[1], combine_result)
        print(DataLoad.get_accuracy(usps[1], combine_result))
    print(cm)
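# get_mode above is used as a majority vote over the four classifiers'
# predictions for a single sample, but its definition is not shown here.
# A minimal sketch under that assumption (the name get_mode_sketch and the
# tie-breaking rule are illustrative, not the project's actual helper):
from collections import Counter

def get_mode_sketch(predictions):
    """Return the most frequent prediction; ties go to the smallest label."""
    counts = Counter(predictions)
    best_count = max(counts.values())
    return min(label for label, c in counts.items() if c == best_count)

# Example: get_mode_sketch([3, 3, 7, 1]) -> 3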
def TestBuildTransformer():
    with open("Model/Config.json") as Fd:
        ConfigDict = json.load(Fd)
    MaxLength = ConfigDict["MaxLength"]
    BatchSize = ConfigDict["BatchSize"]
    EmbeddingSize = ConfigDict["EmbeddingSize"]
    HeadNum = ConfigDict["HeadNum"]
    EnLayer = ConfigDict["EnLayer"]
    DeLayer = ConfigDict["DeLayer"]
    SrcIndSentences, SrcLength, SrcDict = DLoad.LoadData(
        "Data/test.sents", "Data/src.vocab", MaxLength)
    TgtDict = DLoad.LoadVocabulary("Data/tgt.vocab")
    TestDataset = DLoad.TestCorpusDataset(SrcIndSentences, SrcLength)
    BatchDatas = DLoad.TestDataLoaderCreator(TestDataset, BatchSize)
    SrcVocabularySize = SrcDict.VocabularySize()
    TgtVocabularySize = TgtDict.VocabularySize()
    print("Building Model")
    Trans = TransformerNMTModel(HeadNum, EmbeddingSize, SrcVocabularySize,
                                TgtVocabularySize, MaxLength, EnLayer, DeLayer)
    print("Model building finished")
    return Trans, BatchDatas, SrcDict, TgtDict, MaxLength
def Test25():
    MaxLength = 30
    BatchSize = 2
    EmbeddingSize = 4
    HeadNum = 2
    SrcIndSentences, SrcLength, SrcDict = DL.LoadData("src.sents", "src.vocab",
                                                      MaxLength)
    TgtIndSentences, TgtLength, TgtDict = DL.LoadData("tgt.sents", "tgt.vocab",
                                                      MaxLength)
    TrainDataset = DL.TrainCorpusDataset(SrcIndSentences, SrcLength,
                                         TgtIndSentences, TgtLength)
    BatchDatas = DL.TrainDataLoaderCreator(TrainDataset, BatchSize)
    for Batch in BatchDatas:
        SrcSent = Batch["SrcSent"]
        print(SrcSent)
        SrcLength = Batch["SrcLength"]
        print(SrcLength)
        TgtSent = Batch["TgtSent"]
        print(TgtSent)
        TgtLength = Batch["TgtLength"]
        print(TgtLength)
def get_klasters(count_klast):
    data = DataLoad.get_transformed_data().as_matrix()
    k_means = KMeans(n_clusters=count_klast)
    fits = k_means.fit(data)
    centrx = k_means.cluster_centers_
    klusters = []
    for j in range(count_klast):
        klusters.append([])
    for i, la in enumerate(k_means.labels_):
        klusters[la].append(data[i])
    print "silhouette:", metrics.silhouette_score(data, k_means.labels_)
    return klusters
def recomm(text, data_path, pip_path, lda_path, recomm_num=5):
    input_text = DT.loadInput(text, spark, sc)
    pred_df, pred_dis, pred_index = MD.Model().ldaPredict(
        input_text, pip_path=pip_path, lda_path=lda_path)
    data_withTopic = DT.loadTopicData(data_path, topic=pred_index, spark=spark)
    data_withDis = CR.calSimi(pred_dis, data_withTopic)
    data_sort = data_withDis.sort("dis")
    text_list = list()
    source = data_sort.select("text").rdd.take(recomm_num)
    for i in range(recomm_num):
        text_list.append(source[i]["text"])
    return data_sort, text_list
def Test21():
    MaxLength = 30

    def CollateFunction(Batch):
        # print(len(Batch))
        OutputBatch = {
            "SrcSent": [],
            "SrcLength": [],
            "TgtSent": [],
            "TgtLength": []
        }
        for Elem in Batch:
            # print(Elem[0][0])
            OutputBatch["SrcSent"].append(Elem[0][0])
            OutputBatch["SrcLength"].append(Elem[0][1])
            OutputBatch["TgtSent"].append(Elem[1][0])
            OutputBatch["TgtLength"].append(Elem[1][1])
        # print(OutputBatch["SrcSent"])
        OutputBatch["SrcSent"] = t.LongTensor(OutputBatch["SrcSent"])
        OutputBatch["TgtSent"] = t.LongTensor(OutputBatch["TgtSent"])
        return OutputBatch

    SrcIndSentences, SrcLength, SrcDict = DL.LoadData("src.sents", "src.vocab",
                                                      MaxLength)
    TgtIndSentences, TgtLength, TgtDict = DL.LoadData("tgt.sents", "tgt.vocab",
                                                      MaxLength)
    TrainDataset = DL.TrainCorpusDataset(SrcIndSentences, SrcLength,
                                         TgtIndSentences, TgtLength)
    z = DL.TrainDataLoaderCreator(TrainDataset, 4)
    Count = 0
    while True:
        if Count == 100:
            break
        Count = Count + 1
        for x in z:
            print("Batch")
            print(x["SrcSent"].size())
def cargarCondicionesA(self):
    fname = self.QFileDialog.getOpenFileName(None, 'Open file', "*.xlsx")
    if fname[0] != "":
        # self.pushButton.setText("Button is clicked")
        ruta = str(fname[0])
        load = ld.DataLoad(ruta)
        load.crearCondicionesA()
        self.listaNombreRegionCA = load.getNombreRegionesCA()
        self.listaCualificacionCA = load.getCualificaionCA()
        # Set the flag indicating that the A conditions were loaded
        self.siCondicionesA = True
    else:
        self.siCondicionesA = False
def Test26():
    MaxLength = 30
    BatchSize = 2
    EmbeddingSize = 4
    HeadNum = 2
    EnLayer = 2
    DeLayer = 2
    SrcIndSentences, SrcLength, SrcDict = DL.LoadData("src.sents", "src.vocab",
                                                      MaxLength)
    TgtIndSentences, TgtLength, TgtDict = DL.LoadData("tgt.sents", "tgt.vocab",
                                                      MaxLength)
    TrainDataset = DL.TrainCorpusDataset(SrcIndSentences, SrcLength,
                                         TgtIndSentences, TgtLength)
    BatchDatas = DL.TrainDataLoaderCreator(TrainDataset, BatchSize)
    SrcVocabularySize = SrcDict.VocabularySize()
    TgtVocabularySize = TgtDict.VocabularySize()
    Trans = T.TransformerNMTModel(HeadNum, EmbeddingSize, SrcVocabularySize,
                                  TgtVocabularySize, MaxLength, EnLayer, DeLayer)
    for BatchInd, Batch in enumerate(BatchDatas):
        print("BeginBatch")
        SrcSent = Batch["SrcSent"]
        print(SrcSent.size())
        SrcLength = Batch["SrcLength"]
        # print(SrcLength.size())
        TgtSent = Batch["TgtSent"]
        print(TgtSent.size())
        TgtLength = Batch["TgtLength"]
        # print(TgtLength.size())
        SrcMask = T.BatchLengthToBoolTensorMask(SrcLength, MaxLength)
        TgtMask = T.BatchLengthToBoolTensorMask(TgtLength, MaxLength)
        Output = Trans(SrcSent, TgtSent, SrcMask, TgtMask)
        print("Step")
        print(BatchInd + 1)
        print(Output.size())
        print(Output[0][2])
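# T.BatchLengthToBoolTensorMask above turns a batch of sentence lengths into
# a padding mask used by the Transformer. Its real implementation is not in
# this file; the sketch below is only an assumption of how such a helper is
# commonly written in PyTorch (True marks real tokens, False marks padding).
import torch

def length_to_bool_mask_sketch(lengths, max_length):
    """Return a (batch, max_length) bool mask where position j is True iff j < length."""
    lengths = torch.as_tensor(lengths, dtype=torch.long)
    positions = torch.arange(max_length).unsqueeze(0)  # shape (1, max_length)
    return positions < lengths.unsqueeze(1)            # broadcast to (batch, max_length)

# Example: length_to_bool_mask_sketch([2, 4], 5)
# -> tensor([[ True,  True, False, False, False],
#            [ True,  True,  True,  True, False]])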
def tKrigP():
    '''Kriging test'''
    Loc, POIC, Prec = DataLoad.lcsv(r'TestData\GaugeLoc.csv',
                                    r'TestData\InterpPts.csv',
                                    r'TestData\Dataset.csv')
    Loc = numpy.array(Loc) / 1000.0
    POIC = numpy.array(POIC) / 1000.0
    SVExp, CovMea = KrigingP.exp_semivariogram(Prec, Loc)
    xopt, ModOpt, VarFunArr = KrigingP.theor_variogram(SVExp)
    Z, SP, ZAvg = KrigingP.Krig(10.0, POIC, Loc, Prec, CovMea, ModOpt, xopt,
                                VarFunArr, 10, 11, 'Ord')
    print Z
    print ZAvg
    return
def run_svm(kernal=k, train_size=50000):
    train, valid, test, usps = DataLoad.create_dataset()
    svclassifier = SVC(kernel=kernal, gamma=1)
    svclassifier.fit(train[0][:train_size], train[1][:train_size])
    y_pred = svclassifier.predict(test[0])
    DataLoad.write_to_csv("svm_test.csv", y_pred)
    print("accuracy test ", DataLoad.get_accuracy(y_pred, test[1]))
    print(confusion_matrix(test[1], y_pred))
    # -------------------------
    y_pred = svclassifier.predict(usps[0])
    DataLoad.write_to_csv("svm_test_usps.csv", y_pred)
    print("accuracy usps test ", DataLoad.get_accuracy(y_pred, usps[1]))
    print(confusion_matrix(usps[1], y_pred))
def run_random_forest():
    train, val, test, usps = DataLoad.create_dataset()
    classifier = RandomForestClassifier(n_estimators=10)
    classifier.fit(train[0], train[1])
    y_pred = classifier.predict(test[0])
    print("accuracy test ", DataLoad.get_accuracy(y_pred, test[1]))
    print(confusion_matrix(test[1], y_pred))
    DataLoad.write_to_csv("random_forest_test.csv", y_pred)
    # -------------------------
    y_pred = classifier.predict(usps[0])
    print("accuracy usps ", DataLoad.get_accuracy(y_pred, usps[1]))
    print(confusion_matrix(usps[1], y_pred))
    DataLoad.write_to_csv("random_forest_test_usps.csv", y_pred)
def wraperMethod(*args, **kwargs):
    try:
        self = args[0]
        self.session = self.repository.findByStatusAndCommit(
            FrameworkStatus[FrameworkConstant.ACTIVE], Session)
        if self.session:
            self.globals.success(
                f'"{self.session.key}" session loaded successfully')
        else:
            self.session = DataLoad.getBasicSession(self)
            print(f'self.session = {self.session}')
            self.globals.failure(
                f'''couldn't find any active session. Running most recent version of "{self.session.key}" session.''',
                self.globals.NOTHING)
    except Exception as exception:
        print(
            f'''{Constant.WRAPPER}{LoadSession.__name__} failed to load framework session. Cause: {str(exception)}'''
        )
    return function(*args, **kwargs)
def Test31():
    TgtDict = DL.LoadVocabulary("Model/tgt.vocab")
    Out = TT.TranslateOutput(TgtDict, 5).Init(4)
    print(Out.IndexSent)
    Out.Add([1, 2, 3, 4])
    print(Out.IndexSent)
    Out.Add([2, 3, 4, 5])
    print(Out.IndexSent)
    Out.Add([2, 3, 4, 5])
    print(Out.AllFinish())
    print(Out.IndexSent)
    Out.Add([2, 3, 4, 5])
    print(Out.AllFinish())
    Out.Add([2, 3, 4, 5])
    Out.Add([2, 3, 4, 5])
    print(Out.AllFinish())
    print(Out.GetCurrentIndexTensor())
    print(Out.IndexSent)
    print(Out.GetWordSent())
    print(Out.ToFile("Output/predict"))
def make_diagam(count_klast, filename):
    data = DataLoad.get_transformed_data(filename).as_matrix()
    k_means = KMeans(n_clusters=count_klast, random_state=1)
    k_means.fit(data)
    centers = k_means.cluster_centers_
    print "silhouette:", metrics.silhouette_score(data, k_means.labels_)
    klusters = []
    for j in range(count_klast):
        klusters.append([])
    for i, la in enumerate(k_means.labels_):
        klusters[la].append(data[i])
    # Build the legend
    legend = []
    for center in centers:
        legend.append('age:' + str(int(center[0])) + '\ntime ' +
                      str(datetime.timedelta(seconds=int(center[2]))))
    # Build the data for the pie chart
    klust_sizes = []
    for kluster in klusters:
        klust_sizes.append(len(kluster))
    plt.figure(num=1, figsize=(6, 6))
    plt.axes(aspect=1)
    plt.title('Size of klasters', size=14)
    plt.pie(klust_sizes, labels=legend)
    img = io.BytesIO()
    plt.savefig('static/kmeans2d.png')
    plt.show()
    plt.clf()
    # fig = plt.figure()
    # ax2 = Axes3D(fig)
    # ax2.scatter(data[:, 0], data[:, 1], data[:, 2], c=klusters, cmap='prism')
    # ax2.set_xlabel('driver_age')
    # ax2.set_ylabel('driver_gender')
    # ax2.set_zlabel('stop_time')
    # plt.savefig('static/kmeans3d.png')
    return img
def main():
    data_train = dataload.load_data_train()
    modeltrain.Train_Model(data_train)
    data_pre = dataload.load_data_pre()
    pre_result = dataprediction.Predict_Data(data_pre)
    print("Ground truth is 930291366.85, predicted result: %f" % pre_result)
def get_default_test_predictions():
    return DataLoad.get_transformed_data().as_matrix()