def TestCTRNN(angs, model, criterion, device):
    dataProcessor = DataPreprocessor(angs, sample_length=700, normalize=True)
    initdir = torch.from_numpy(dataProcessor.GetInitialInput()).float().to(device)
    input = torch.from_numpy(dataProcessor.GetTrainingInputs()).float().to(device)
    output = torch.from_numpy(dataProcessor.GetTrainingOutputs()).float().to(device)

    pred, h = model(initdir, input)
    loss = criterion(pred, output)
    print(f"Loss on real data: {loss.item():.4f}")

    # Decode the model's (sin, cos) outputs back into a continuous angle.
    pred = np.transpose(pred.detach().cpu().numpy(), (2, 1, 0))
    pred = np.reshape(pred, (pred.shape[0], -1))
    radsOut = np.unwrap(np.arctan2(pred[0], pred[1]))

    plt.plot(angs[1], label='ground truth')
    plt.plot(radsOut, label='predicted')
    plt.xlabel('Timestep (ms)')
    plt.ylabel('Angle (rad)')
    plt.title('Prediction Visualization')
    plt.legend()
    plt.savefig('performance_fakeAngs.png')

    del initdir
    del input
    del output
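# A small, self-contained sketch of the (sin, cos) -> angle decoding that
# TestCTRNN performs above. Synthetic data only; nothing project-specific is
# assumed beyond numpy.
import numpy as np

true_angle = np.cumsum(np.full(1000, 0.02))                  # steadily rotating angle
sincos = np.stack([np.sin(true_angle), np.cos(true_angle)])  # a model's ideal output
decoded = np.unwrap(np.arctan2(sincos[0], sincos[1]))        # continuous angle again
assert np.allclose(decoded, true_angle)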
def TestCTRNN(angs, model, criterion, device, training_outputs):
    dataProcessor = DataPreprocessor(angs, sample_length=700, normalize=True)
    initdir = torch.from_numpy(dataProcessor.GetInitialInput()).float().to(device)
    input = torch.from_numpy(dataProcessor.GetTrainingInputs()).float().to(device)
    output = torch.from_numpy(dataProcessor.GetTrainingOutputs()).float().to(device)

    pred = model(initdir, input)
    loss = criterion(pred, output)
    print(f"Loss on real data: {loss.item():.4f}")

    pred = np.transpose(pred.detach().cpu().numpy(), (2, 1, 0))
    pred = np.reshape(pred, (pred.shape[0], -1))
    radsOut = np.unwrap(np.arctan2(pred[0], pred[1]))

    print("Graphing test performance to performance.png...")
    for i, output in enumerate(training_outputs):
        out = np.transpose(output.detach().cpu().numpy(), (2, 1, 0))
        out = np.reshape(out, (out.shape[0], -1))
        plt.plot(np.unwrap(np.arctan2(out[0], out[1])), label=f'fake batch {i}')
    plt.plot(angs[1], label='ground truth', color='blue')
    plt.plot(radsOut, label='predicted', color='orange')
    plt.xlabel('Timestep (ms)')
    plt.ylabel('Angle (rad)')
    plt.title('Prediction Visualization')
    plt.legend(prop={'size': 6})
    plt.savefig('performance.png')
def build_model():
    '''
    request body{
        name: model name,
        path: path to model
    }
    '''
    jsondata = request.get_json()
    name = jsondata["name"]
    if name not in model.keys():
        _model = support_vector_machine()
        model[name] = _model
    if name not in preprocessor.keys():
        _preprocessor = DataPreprocessor()
        preprocessor[name] = _preprocessor
    # Use the cached preprocessor rather than constructing a throwaway one.
    dataX, datay = preprocessor[name].getData()
    trainX, testX, trainy, testy = train_test_split(dataX, datay, test_size=0.2)
    model[name].train(trainX, trainy)
    score = model[name].score(testX, testy)
    # payload = {"test R square": str(score),
    #            "result": "success",
    #            }
    return str(score)
def main():
    realAngs = np.load('angs_smooth.npy') - 2 * np.pi
    plt.plot(np.unwrap(realAngs[1]), label='real angs')

    diffs = realAngs[1][1:] - realAngs[1][:-1]
    print("realAngs avg diff: ", np.mean(diffs))
    print("realAngs stddev: ", np.std(diffs))
    print("realAngs range: ", np.min(realAngs[1]), np.max(realAngs[1]))

    for i in range(10):
        datagen = AVDataGenerator(T=realAngs.shape[1], dt=25,
                                  mean=np.mean(diffs) / 100,
                                  sigma=np.std(diffs) / 10, momentum=0)
        # if i == 0:
        #     fakeAngs = realAngs
        # else:
        fakeAngs = datagen.GenerateAngs()
        dataProcessor = DataPreprocessor(fakeAngs, sample_length=700, normalize=False)
        fakeOutputs = dataProcessor.GetTrainingOutputs()
        fakeOutputs = np.transpose(fakeOutputs, (2, 1, 0))
        fakeOutputs = np.reshape(fakeOutputs, (fakeOutputs.shape[0], -1))
        fakeOutputs = np.unwrap(np.arctan2(fakeOutputs[0], fakeOutputs[1]))
        plt.plot(fakeOutputs, label='fake angs')

    # fakeAngs = datagen.GenerateAngs()
    # plt.plot(fakeAngs[1], label='fake angs 2')
    plt.xlabel('Timestep (ms)')
    plt.ylabel('Angle (rad)')
    plt.title('AVDataGenerator Visualization')
    plt.legend(prop={'size': 6})
    plt.savefig('datagen.png')
def getData(CSVFile):
    smoother = DataPreprocessor()
    data = read_csv(CSVFile)
    data = data[::-1]  # reverse
    ohclv_data = np.c_[data['Open'], data['High'], data['Low'],
                       data['Close'], data['Volume']]
    smoothened_ohclv_data = smoother.PandaSmoother(ohclv_data)
    return smoothened_ohclv_data, np.array(data["Close"]), list(data["Date"])
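# Minimal runnable sketch of the column handling in getData, with a synthetic
# DataFrame in place of the CSV; PandaSmoother is project-specific, so the
# smoothing step is omitted here.
import numpy as np
import pandas as pd

demo = pd.DataFrame({'Date': ['2020-01-02', '2020-01-01'],
                     'Open': [1.0, 2.0], 'High': [1.5, 2.5],
                     'Low': [0.5, 1.5], 'Close': [1.2, 2.2],
                     'Volume': [100, 200]})
demo = demo[::-1]  # rows arrive newest-first; reverse into chronological order
ohclv = np.c_[demo['Open'], demo['High'], demo['Low'], demo['Close'], demo['Volume']]
print(ohclv.shape)  # (2, 5)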
def write_counts_of_each_phrase(csv_reader, csv_writer, patterns):
    print(patterns)
    data_preprocessor = DataPreprocessor()
    for row in csv_reader:
        cleaned_row = data_preprocessor.remove_punctuation(''.join(row))
        resulting_row = []
        for pattern in patterns:
            number_of_occurrences = len(re.findall(pattern, cleaned_row))
            resulting_row.append(number_of_occurrences)
        csv_writer.writerow(resulting_row)
def _count_bigrams_by_frequency(file_name):
    with open(file_name, 'r', encoding='mac-roman', newline='\r\n') as file:
        bigrams = {}
        unigrams = {}
        data_preprocessor = DataPreprocessor()
        for row in file.readlines():
            row = data_preprocessor.clean_row(row)
            for index in range(len(row) - 1):
                # Count unigrams the same way as bigrams; the original
                # `unigrams.get(row[index])` dropped the default and increment.
                unigrams[row[index]] = unigrams.get(row[index], 0) + 1
                bigrams[(row[index], row[index + 1])] = bigrams.get(
                    (row[index], row[index + 1]), 0) + 1
        return bigrams
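# Self-contained toy version of the counting loop above (clean_row is
# project-specific, so plain token lists stand in for cleaned rows).
def count_ngrams(rows):
    unigrams, bigrams = {}, {}
    for row in rows:
        for i in range(len(row) - 1):
            unigrams[row[i]] = unigrams.get(row[i], 0) + 1
            bigrams[(row[i], row[i + 1])] = bigrams.get((row[i], row[i + 1]), 0) + 1
    return unigrams, bigrams

print(count_ngrams([['the', 'cat', 'sat'], ['the', 'cat']]))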
def find_instances_near_two_class_centroid(self):
    data = DataPreprocessor().select_all_features()
    # data = data.drop(['target'], axis=1)

    # Append each class centroid as an extra row, so the last row of each
    # per-class distance matrix holds distances to that centroid.
    mean_class_0 = data.loc[data['target'] == 0].mean()
    mean_class_1 = data.loc[data['target'] == 1].mean()
    means = pd.concat([mean_class_0, mean_class_1], axis=1)
    means = means.T
    data = pd.concat([data, means], ignore_index=True)  # DataFrame.append is deprecated

    class_0 = data.loc[data['target'] == 0]
    class_1 = data.loc[data['target'] == 1]

    dist_condensed_class_0 = pdist(class_0.values)
    distance_mat_0 = pd.DataFrame(squareform(dist_condensed_class_0),
                                  index=class_0.index, columns=class_0.index)
    distance_class_0 = distance_mat_0.iloc[(distance_mat_0.shape[0] - 1)]
    distance_class_0.sort_values(ascending=True, inplace=True)
    distance_class_0.drop(labels=[distance_class_0.index[0]], inplace=True)  # drop the centroid itself
    sort_by_distance_class_0 = distance_class_0.index
    nearest_to_centroid_class_0 = sort_by_distance_class_0[:len(sort_by_distance_class_0) // 2]
    not_near_to_centroid_0 = set(class_0.index) - set(nearest_to_centroid_class_0)

    dist_condensed_class_1 = pdist(class_1.values, metric='euclidean')
    distance_mat_1 = pd.DataFrame(squareform(dist_condensed_class_1),
                                  index=class_1.index, columns=class_1.index)
    distance_class_1 = distance_mat_1.iloc[(distance_mat_1.shape[0] - 1)]
    distance_class_1.sort_values(ascending=True, inplace=True)
    distance_class_1.drop(labels=[distance_class_1.index[0]], inplace=True)
    sort_by_distance_class_1 = distance_class_1.index
    nearest_to_centroid_class_1 = sort_by_distance_class_1[:len(sort_by_distance_class_1) // 2]
    not_near_to_centroid_1 = set(class_1.index) - set(nearest_to_centroid_class_1)

    # .ix was removed from pandas; .loc with explicit label lists replaces it.
    new_data_class_0 = data.loc[list(not_near_to_centroid_0)]
    new_data_class_1 = data.loc[nearest_to_centroid_class_1]
    useful_data = pd.concat([new_data_class_0, new_data_class_1])
    useful_data_index = np.concatenate([
        np.array(list(not_near_to_centroid_0)), nearest_to_centroid_class_1
    ])
    remain_data_index = set(data.index) - set(useful_data_index)
    remain_data = data.loc[list(remain_data_index)]
    return useful_data, remain_data
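# Toy, runnable version of the centroid-distance ranking above: append the
# class centroid as an extra row, build the pairwise distance matrix, then
# read its centroid row to rank samples by distance to the centroid.
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform

cls = pd.DataFrame(np.random.default_rng(0).normal(size=(5, 3)))
cls.loc['centroid'] = cls.mean()
dmat = pd.DataFrame(squareform(pdist(cls.values)), index=cls.index, columns=cls.index)
dist_to_centroid = dmat.loc['centroid'].drop('centroid').sort_values()
nearest_half = dist_to_centroid.index[:len(dist_to_centroid) // 2]
print(list(nearest_half))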
def get_clean_ad_tweet_data(self, tweet_file, ad_file):
    annotations_data = pd.read_csv(ad_file, index_col=0)
    annotations_data['Keywords'] = annotations_data['Brand Name']\
        .str.cat(annotations_data['Ad Name'], sep=" ")\
        .str.cat(annotations_data['KeyTerms_Edited'], sep=" ")
    df = annotations_data.drop_duplicates()
    print(df.shape)

    man_ann_data = pd.read_csv(tweet_file)
    annotations_data['keywords_clean'] = annotations_data['Keywords'].apply(
        lambda ad: DataPreprocessor.cleanTweet(ad))
    man_ann_data['tweet_clean'] = man_ann_data['tweet_text'].apply(
        lambda twt: DataPreprocessor.cleanTweet(twt))
    return annotations_data, man_ann_data
def init_model(workflow, client):
    global train_start_time
    global train_end_time

    initReq = workflow + "#" + client
    name, _, __ = requestHandler.parseReq(initReq, "fwf")
    print("workflow name : " + name)
    if name not in model.keys():
        _model = support_vector_machine()
        model[name] = _model
    if name not in preprocessor.keys():
        _preprocessor = DataPreprocessor()
        preprocessor[name] = _preprocessor

    train_start_time = time.time()
    dataX, datay = preprocessor[name].getData(workflow + "_" + client)
    trainX, testX, trainy, testy = train_test_split(dataX, datay, test_size=0.2)
    model[name].train(trainX, trainy)
    train_end_time = time.time()

    score = model[name].score(testX, testy)
    print("test score : " + str(score))
def GenerateFakeAngs():
    realAngs = np.load('angs_smooth.npy') - 2 * np.pi
    diffs = realAngs[1][1:] - realAngs[1][:-1]
    sigmas = [
        np.std(diffs) / 30, np.std(diffs) / 10, np.std(diffs) / 6,
        np.std(diffs) / 4, np.std(diffs) / 3, np.std(diffs) / 2,
        np.std(diffs)
    ]
    # Appends to module-level lists: angs, initdirs, inputs, outputs and
    # their test_* counterparts.
    for sigma in sigmas:
        timesteps = 100000
        datagen = AVDataGenerator(T=timesteps, dt=25,
                                  mean=np.mean(diffs) / 100,
                                  sigma=sigma, momentum=0)
        generatedAngs = datagen.GenerateAngs()
        plt.plot(generatedAngs[1])
        print("Generated angs shape: ", generatedAngs.shape)

        trainsize = int(0.8 * timesteps)
        dataProcessor = DataPreprocessor(generatedAngs[:, :trainsize],
                                         sample_length=700, normalize=True)
        # torch.from_numpy(dataProcessor.GetInitialInput()).float().to(device)
        angs.append(generatedAngs[1, :trainsize])
        initdirs.append(dataProcessor.GetInitialInput())
        inputs.append(dataProcessor.GetTrainingInputs())
        outputs.append(dataProcessor.GetTrainingOutputs())

        testDataProcessor = DataPreprocessor(generatedAngs[:, trainsize:],
                                             sample_length=700, normalize=True)
        # torch.from_numpy(dataProcessor.GetInitialInput()).float().to(device)
        test_angs.append(generatedAngs[1, trainsize:])
        test_initdirs.append(testDataProcessor.GetInitialInput())
        test_inputs.append(testDataProcessor.GetTrainingInputs())
        test_outputs.append(testDataProcessor.GetTrainingOutputs())

    plt.savefig('fakeangs.png')
    plt.clf()
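# Hedged sketch of what an AVDataGenerator-style random walk plausibly does:
# integrate Gaussian angular-velocity steps into a head-direction trace.
# Parameter names mirror the calls above, but the internals are an assumption.
import numpy as np

def generate_fake_angs(T, dt, mean, sigma, seed=0):
    rng = np.random.default_rng(seed)
    dtheta = rng.normal(mean, sigma, size=T)  # per-step angular velocity
    angle = np.cumsum(dtheta)                 # integrate into an angle trace
    times = np.arange(T) * dt                 # timestamps in ms
    return np.stack([times, angle])           # shape (2, T), like realAngs

print(generate_fake_angs(T=1000, dt=25, mean=0.0, sigma=0.05).shape)  # (2, 1000)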
def build_model():
    '''
    request body{
        name: model name,
        path: path to model
    }
    '''
    global model
    global preprocessor
    global requestHandler
    global train_start_time
    global train_end_time

    jsondata = request.get_json()
    app.logger.info("json data : " + str(jsondata))
    name, _, __ = requestHandler.parseReq(jsondata, "nwf")
    # name, _, __ = requestHandler.parseReq(initReq, "fwf")
    # name = jsondata["workflow"]+"#"+jsondata["client_name"]
    _client = jsondata["client_name"]
    _workflow = jsondata["workflow"]
    client = _workflow + "_" + _client

    if name not in model.keys():
        _model = support_vector_machine()
        model[name] = _model
        app.logger.info("name key error ")
    if name not in preprocessor.keys():
        _preprocessor = DataPreprocessor()
        preprocessor[name] = _preprocessor
        app.logger.info("name key error ")

    train_start_time = time.time()
    app.logger.info("model start training")
    # Use the cached preprocessor instead of constructing a throwaway one.
    dataX, datay = preprocessor[name].getData(client)
    trainX, testX, trainy, testy = train_test_split(dataX, datay, test_size=0.2)
    model[name].train(trainX, trainy)
    app.logger.info("training success")
    score = model[name].score(testX, testy)
    train_end_time = time.time()
    # payload = {"test R square": str(score),
    #            "result": "success",
    #            }
    res = {"start_time": train_start_time, "end_time": train_end_time}
    return jsonify(res)
def calculate_total_loss(self, x, y):
    loss = 0
    for i in range(len(y)):
        if i % (len(y) // 4) == 0:  # progress marker roughly every quarter
            print("(" + str(i) + "/" + str(len(y)) + ") ", end='')
            sys.stdout.flush()
        # One-hot encode here, per sequence, so that we don't run out of memory
        xoh = DataPreprocessor.one_hot_vector(x[i], self.input_dim)
        yoh = DataPreprocessor.one_hot_vector(y[i], self.input_dim)
        o, _ = self.forward_propogation(xoh)
        yoh = np.array(yoh)
        correct_characters_predicted = o[np.arange(len(yoh)), np.argmax(yoh, axis=1)]
        loss += self.cross_entropy_sum(correct_characters_predicted)
    return loss
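# Small numpy illustration of the fancy-indexing trick used above: pick, for
# each timestep, the predicted probability of the correct (one-hot) target,
# then sum the negative logs (cross_entropy_sum is project-specific;
# -sum(log(p)) is the usual definition and an assumption here).
import numpy as np

probs = np.array([[0.7, 0.2, 0.1],
                  [0.1, 0.8, 0.1]])            # predictions for 2 timesteps
targets_oh = np.array([[1, 0, 0], [0, 1, 0]])  # one-hot targets
picked = probs[np.arange(len(targets_oh)), np.argmax(targets_oh, axis=1)]  # [0.7, 0.8]
print(-np.sum(np.log(picked)))                 # summed cross-entropy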
def lemma_token_pos(self, text, allowed_pos):
    text = text.lower()  # lowercase so capitalized words match GloVe entries
    doc = nlp(text)
    lemma_list = []
    for token in doc:
        if token.is_stop is False:
            # if (token.pos_ == 'NOUN' or token.pos_ == 'VERB' or token.pos_ == 'ADJ' or token.pos_ == 'adv'):
            if token.pos_ in allowed_pos:
                token_preprocessed = DataPreprocessor.preprocessor(token.lemma_)
                if token_preprocessed != '':
                    lemma_list.append(token_preprocessed)
    return lemma_list
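# Hedged spaCy sketch of the stop-word/POS/lemma filtering done above; assumes
# the small English model is installed (python -m spacy download en_core_web_sm).
import spacy

_nlp = spacy.load("en_core_web_sm")
_doc = _nlp("the cats were running quickly")
print([t.lemma_ for t in _doc
       if not t.is_stop and t.pos_ in {"NOUN", "VERB", "ADJ"}])  # e.g. ['cat', 'run']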
def bptt(self, x, y):
    T = len(y)
    xoh = DataPreprocessor.one_hot_vector(x, self.input_dim)
    yoh = DataPreprocessor.one_hot_vector(y, self.input_dim)
    # Forward pass on the one-hot input, mirroring calculate_total_loss
    # (the original passed the raw `x` here).
    o, s = self.forward_propogation(xoh)
    dLdU = np.zeros(self.U.shape)
    dLdV = np.zeros(self.V.shape)
    dLdW = np.zeros(self.W.shape)
    delta_o = o
    delta_o[np.arange(len(yoh)), np.argmax(yoh, axis=1)] -= 1
    # Go backwards through time ([::-1] reverses the time order)
    for t in np.arange(T)[::-1]:
        dLdV += np.outer(delta_o[t], s[t].T)
        delta_t = self.V.T.dot(delta_o[t]) * (1 - (s[t] ** 2))
        for bptt_step in np.arange(max(0, t - self.bptt_truncate), t + 1)[::-1]:
            dLdW += np.outer(delta_t, s[bptt_step - 1])
            dLdU[:, np.argmax(xoh[bptt_step])] += delta_t
            delta_t = self.W.T.dot(delta_t) * (1 - s[bptt_step - 1] ** 2)
    return [dLdU, dLdV, dLdW]
def lemma_token(self, text):
    text = text.lower()  # to take care of capital case word in glove
    tokenizer = nlp.Defaults.create_tokenizer(nlp)
    tokens = tokenizer(text)
    token_list = []
    lemma_list = []
    for token in tokens:
        if token.is_stop is False:
            token_preprocessed = DataPreprocessor.preprocessor(token.lemma_)
            if token_preprocessed != '':
                lemma_list.append(token_preprocessed)
                token_list.append(token.text)
    return lemma_list
def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Get the data
    angs = np.load('angs_smooth.npy')
    angs[1] -= 2 * np.pi

    # Preprocess the data
    dataProcessor = DataPreprocessor(angs, sample_length=700, normalize=True)
    initdir = torch.from_numpy(dataProcessor.GetInitialInput()).float().to(device)
    input = torch.from_numpy(dataProcessor.GetTrainingInputs()).float().to(device)
    output = torch.from_numpy(dataProcessor.GetTrainingOutputs()).float().to(device)

    # Define the model and optimizer
    model = SingleLayerCTRNN(store_h=True)
    optimizer = torch.optim.Adam(model.parameters(), lr=.005, weight_decay=1e-6)
    criterion = nn.MSELoss()
    if torch.cuda.is_available():
        model = model.cuda()
        criterion = criterion.cuda()

    # Train
    losses = []
    hidden_states = None
    for epoch in range(NUM_EPOCHS):
        optimizer.zero_grad()
        pred, h = model(initdir, input)
        loss = criterion(pred, output)
        hidden_states = np.array(h.cpu().detach().numpy())
        print(f'Epoch [{epoch+1}/{NUM_EPOCHS}], Loss: {loss.item():.4f}')
        losses.append(loss.item())
        loss.backward()
        optimizer.step()

    # Graph the losses
    print(f"Losses: {losses}")
    plt.plot(losses)
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.title('Training loss history')
    plt.savefig('loss_ctrnn.png')
    plt.clf()

    # Test
    testAngs = np.load('angs_smooth.npy') - 2 * np.pi
    TestCTRNN(testAngs, model, criterion, device)

    from scipy.stats import binned_statistic, binned_statistic_2d
    seqlen = hidden_states.shape[0] * hidden_states.shape[1]
    velocities = input.cpu().detach().numpy().reshape(seqlen, -1)[:, 0]
    headdirs = testAngs[1][:seqlen] % (2 * np.pi)
    print("Hidden states shape: ", hidden_states.shape)
    print("Inputs[i].shape: ", velocities.shape)
    print("Angs[i].shape: ", headdirs.shape)

    _, ax = plt.subplots(10, 10, figsize=(20, 15))
    for cell in range(100):
        x, y = np.divmod(cell, 10)
        curr_ax = ax[x, y]
        activations = torch.relu(torch.tanh(torch.from_numpy(
            hidden_states.reshape(seqlen, -1)[:, cell])))
        # activations = hidden_states.reshape(seqlen, -1)[:, cell]
        # print("No nonlin")
        print(f"Doing it for cell={cell}")
        bs = binned_statistic_2d(headdirs, velocities, activations, bins=[30, 30])
        curr_ax.pcolormesh(bs[1], bs[2], bs[0])
        curr_ax.set_yticks([])
        curr_ax.set_xticks([])
    plt.savefig("activations_realAngs.png")
    plt.clf()

    _, ax = plt.subplots(10, 10, figsize=(20, 15))
    for cell in range(100):
        x, y = np.divmod(cell, 10)
        curr_ax = ax[x, y]
        activations = torch.relu(torch.tanh(torch.from_numpy(
            hidden_states.reshape(seqlen, -1)[:, cell])))
        # activations = hidden_states.reshape(seqlen, -1)[:, cell]
        # import pdb; pdb.set_trace()
        print(f"Doing it for cell={cell}, no bins")
        bs = binned_statistic(headdirs, activations)
        curr_ax.plot((bs[1][1:] + bs[1][:-1]) / 2, bs[0])
        curr_ax.set_yticks([])
        curr_ax.set_xticks([])
    plt.savefig("activations_realAngs_headdirs.png")
    plt.clf()

    _, ax = plt.subplots(10, 10, figsize=(20, 15))
    for cell in range(100):
        x, y = np.divmod(cell, 10)
        curr_ax = ax[x, y]
        activations = torch.relu(torch.tanh(torch.from_numpy(
            hidden_states.reshape(seqlen, -1)[:, cell])))
        # activations = hidden_states.reshape(seqlen, -1)[:, cell]
        # print("No nonlin")
        # import pdb; pdb.set_trace()
        print(f"Doing it for cell={cell}")
        bs = binned_statistic(velocities, activations)
        curr_ax.plot((bs[1][1:] + bs[1][:-1]) / 2, bs[0])
        curr_ax.set_yticks([])
        curr_ax.set_xticks([])
    plt.savefig("activations_realAngs_vels.png")
    plt.clf()

    del initdir
    del input
    del output
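# Stand-alone sketch of the tuning-curve binning used above: binned_statistic_2d
# averages a unit's activation over (head direction, velocity) bins. Synthetic
# data; only numpy/scipy/matplotlib are assumed.
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import binned_statistic_2d

rng = np.random.default_rng(0)
hd = rng.uniform(0, 2 * np.pi, 5000)
vel = rng.normal(0, 1, 5000)
act = np.cos(hd) + 0.1 * rng.standard_normal(5000)  # a fake direction-tuned unit

stat, hd_edges, vel_edges, _ = binned_statistic_2d(hd, vel, act,
                                                   statistic='mean', bins=[30, 30])
plt.pcolormesh(hd_edges, vel_edges, stat.T)  # statistic is (x_bins, y_bins); transpose for pcolormesh
plt.xlabel('Head direction (rad)')
plt.ylabel('Angular velocity')
plt.savefig('tuning_curve_demo.png')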
from FreshRNN import RNN
from DataPreprocessor import DataPreprocessor
import numpy as np

datasetsPath = 'data'
vocab = 256
sequence_length = 100
trainx = []
trainy = []
testx = []
testy = []

if __name__ == "__main__":
    dataset = []
    DataPreprocessor.get_dataset(dataset, datasetsPath, clean=True)
    del dataset[0]
    trainx, trainy = DataPreprocessor.data_targets(dataset, sequence_length)
    trainx = np.array(trainx)
    trainy = np.array(trainy)
    print(trainx[0])
    print(trainy[0])
    model = RNN(vocab)
    # The original call was cut off after `nepoch=5,`; any further keyword
    # arguments are unknown, so the call is closed here.
    losses = model.train_with_sgd(model, trainx[:500], trainy[:500], nepoch=5)
def main():
    dataprep = DataPreprocessor()
    dataprep.preprocess()
    runLogisticRegression(dataprep)
def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    print("Processing training data...")
    realAngs = np.load('angs_smooth.npy') - 2 * np.pi
    diffs = realAngs[1][1:] - realAngs[1][:-1]
    initdirs = []
    inputs = []
    outputs = []
    datameanmean = np.mean(diffs) / 100
    datameansigma = np.std(diffs) / 10
    for i in range(TRAINING_BATCHES):
        # if i == 0:
        #     dataProcessor = DataPreprocessor(realAngs, sample_length=700, normalize=True)
        # else:
        datagen = AVDataGenerator(T=realAngs.shape[1], dt=25,
                                  mean=np.random.uniform(datameanmean / 2, datameanmean * 1.5),
                                  sigma=np.random.uniform(datameansigma / 2, datameansigma * 1.5),
                                  momentum=0)
        dataProcessor = DataPreprocessor(datagen.GenerateAngs(), sample_length=700, normalize=True)
        initdirs.append(torch.from_numpy(dataProcessor.GetInitialInput()).float())
        inputs.append(torch.from_numpy(dataProcessor.GetTrainingInputs()).float())
        outputs.append(torch.from_numpy(dataProcessor.GetTrainingOutputs()).float())
        print(f"Sample initdirs for fake batch {i}: ", initdirs[i][0][0])
    initdirs = torch.stack(initdirs).to(device)
    inputs = torch.stack(inputs).to(device)
    outputs = torch.stack(outputs).to(device)

    print("Defining the model...")
    model = SingleLayerCTRNN(input_dim=2)
    optimizer = torch.optim.Adam(model.parameters(), lr=.005, weight_decay=1e-6)
    criterion = nn.MSELoss()
    if torch.cuda.is_available():
        model = model.cuda()
        criterion = criterion.cuda()

    print("Training...")
    losses = []
    for epoch in range(NUM_EPOCHS):
        for batch in range(TRAINING_BATCHES):
            optimizer.zero_grad()
            pred = model(initdirs[batch], inputs[batch])
            loss = criterion(pred, outputs[batch])
            print(f'Epoch [{epoch+1}/{NUM_EPOCHS}] Batch [{batch+1}/{TRAINING_BATCHES}] Loss: {loss.item():.4f}')
            losses.append(loss.item())
            loss.backward()
            optimizer.step()

    print("Graphing the losses to loss_ctrnn.png...")
    print(f"Losses: {losses}")
    plt.plot(losses)
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.title('Training loss history')
    plt.savefig('loss_ctrnn.png')
    plt.clf()

    print("Testing the model...")
    testAngs = np.load('angs_smooth.npy') - 2 * np.pi
    # datagen = AVDataGenerator(T=realAngs.shape[1], dt=25, mean=datameanmean,
    #                           sigma=datameansigma, momentum=0)
    # testAngs = datagen.GenerateAngs()
    TestCTRNN(testAngs, model, criterion, device, outputs)
import json
from TextProcessor import TextProcessor
from DataPreprocessor import DataPreprocessor
import configparser
import sys
from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.spatial.distance import cdist
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

if __name__ == '__main__':
    data_preprocessor = DataPreprocessor()
    text_processor = TextProcessor()
    data = data_preprocessor.readFile('neberitrubku_output.csv')
    data = data_preprocessor.cleanData(data)

    sse = {}
    for k in range(1, 15):
        kmeans = text_processor.make_clusters(data, k)
        # data["clusters"] = kmeans.labels_
        # print(data["clusters"])
        sse[k] = kmeans.inertia_  # Inertia: sum of distances of samples to their closest cluster center
    plt.figure()
    plt.plot(list(sse.keys()), list(sse.values()))
    plt.xlabel("Number of clusters")
    plt.ylabel("SSE")
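# Runnable elbow-method sketch mirroring the loop above; make_clusters is
# project-specific, so plain sklearn KMeans on synthetic blobs stands in.
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X_demo, _ = make_blobs(n_samples=300, centers=4, random_state=0)
sse_demo = {}
for k in range(1, 15):
    km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X_demo)
    sse_demo[k] = km.inertia_  # sum of squared distances to the closest center
plt.figure()
plt.plot(list(sse_demo.keys()), list(sse_demo.values()))
plt.xlabel("Number of clusters")
plt.ylabel("SSE")
plt.savefig("elbow_demo.png")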
def login():
    message = None
    print("is called")
    if request.method == 'POST':
        print("is posted")
        datafromjs = request.form['mydata']
        print(datafromjs)
        vars = datafromjs.split(',')
        print(vars)
        vec_start = [int(i) for i in vars[0:-4]]
        time = vars[-4]
        vec_end = [int(i) for i in vars[-3:-1]]
        print("vec start: " + str(vec_start))
        print("vec end: " + str(vec_end))
        rel_path = vars[-1].strip()

        processor = DataPreprocessor()
        # #outputs = processor.preprocess(processor, datafromjs)
        dir_path = os.path.dirname(os.path.realpath(__file__))
        file_path = os.path.join(dir_path, '../website/' + rel_path + '.csv')
        path = os.path.abspath(os.path.realpath(file_path))
        print(path)

        model_input = []
        with open(path) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            line_count = 0
            for row in csv_reader:
                vector = []
                row = [int(i) for i in row]
                vector += vec_start
                vector.append(time)
                vector += row
                vector += vec_end
                vector.append(1)
                vector.append(1)
                print("vector: " + str(len(vector)))
                model_input.append(vector)
        # print("length: " + str(len(model_input)))

        dir_path = os.path.dirname(os.path.realpath(__file__))
        file_path = os.path.join(dir_path, '../models/severity.h5')
        path = os.path.abspath(os.path.realpath(file_path))
        model = load_model(path)

        outputs = []
        processor.preprocess(data=model_input)
        X, y = processor.getDataForSeverity()
        for inputvec in X:
            print(inputvec.shape)
            inputvec = inputvec.reshape(1, 98)
            outputs.append(model.predict(inputvec, steps=1))
        print("size outputs " + str(len(outputs)))
        print(outputs[0])

        results = []
        for r in outputs:
            o = list(r[0])
            print(o)
            print(max(o))
            res = o.index(max(o))
            results.append(res)
        print(str(results))
        # if res == 0:
        #     result = 'Low'
        # if res == 1:
        #     result = 'Mid'
        # else:
        #     result = 'High'
        resultsStr = str(results)
        finalResult = resultsStr[1:-1]
        print(finalResult)
        resp = make_response(finalResult)
        resp.headers['Content-Type'] = "application/json"
        return resp
from DataPreprocessor import DataPreprocessor
from TrainTestPipeline import TrainTestPipeline
import pandas as pd
import os

# if not os.path.exists('MicrotracMinMax'):
#     os.mkdir('MicrotracMinMax')

data_folder = './MicrotracDataFilesPT'
flow_values_excel = './TrueFlowValues_.xlsx'
dp = DataPreprocessor(data_folder, flow_values_excel, root_folder_='.')
success = dp.prepare_df(preproc_type='yeo-johnson')
assert success

# Pearson-correlation, full dataset with augmented features
x_filt, columns = dp.get_feature_selection_x(method='pearson', threshold=0.8,
                                             heldout_cols=['Density'])
y_regr = dp.get_regression_y()
all_samples = dp.get_samples()

pipeline = TrainTestPipeline(x_data=x_filt, y_data=y_regr,
                             all_samples=all_samples,
                             model_name='RandomForestRegressor',
                             heldout_samples='random', num_heldout=4)
tr_test_ = pipeline.do_train_test(cv=False)
def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Get the data
    print("Processing training data...")
    realAngs = np.load('angs_smooth.npy') - 2 * np.pi
    diffs = realAngs[1][1:] - realAngs[1][:-1]
    datameanmean = np.mean(diffs) / 100
    datameansigma = np.std(diffs) / 10
    datagen = AVDataGenerator(T=realAngs.shape[1], dt=25,
                              mean=datameanmean, sigma=datameansigma, momentum=0)
    initdirs = []
    inputs = []
    outputs = []
    for i in range(TRAINING_BATCHES):
        dataProcessor = DataPreprocessor(datagen.GenerateAngs(), sample_length=700, normalize=True)
        initdirs.append(torch.from_numpy(dataProcessor.GetInitialInput()).float())
        inputs.append(torch.from_numpy(dataProcessor.GetTrainingInputs()).float())
        outputs.append(torch.from_numpy(dataProcessor.GetTrainingOutputs()).float())
        print(f"Sample initdirs for fake batch {i}: ", initdirs[i][0][0])
    initdirs = torch.stack(initdirs).to(device)
    inputs = torch.stack(inputs).to(device)
    outputs = torch.stack(outputs).to(device)

    # Define the model and optimizer
    model = SingleLayerCTRNN(store_h=True)
    optimizer = torch.optim.Adam(model.parameters(), lr=.005, weight_decay=1e-6)
    criterion = nn.MSELoss()
    if torch.cuda.is_available():
        model = model.cuda()
        criterion = criterion.cuda()

    # Train
    print("Training...")
    losses = []
    hidden_states = None
    for epoch in range(NUM_EPOCHS):
        for batch in range(TRAINING_BATCHES):
            optimizer.zero_grad()
            pred, h = model(initdirs[batch], inputs[batch])
            loss = criterion(pred, outputs[batch])
            hidden_states = np.array(h.cpu().detach().numpy())
            print(f'Epoch [{epoch+1}/{NUM_EPOCHS}] Batch [{batch+1}/{TRAINING_BATCHES}] Loss: {loss.item():.4f}')
            losses.append(loss.item())
            loss.backward()
            optimizer.step()
    print(f"Losses: {losses}")

    # Test
    testAngs = np.load('angs_smooth.npy') - 2 * np.pi
    datagen = AVDataGenerator(T=realAngs.shape[1], dt=25, mean=datameanmean,
                              sigma=datameansigma, momentum=0)
    testAngs = datagen.GenerateAngs()
    TestCTRNN(testAngs, model, criterion, device)

    from scipy.stats import binned_statistic, binned_statistic_2d
    seqlen = hidden_states.shape[0] * hidden_states.shape[1]
    velocities = inputs[0].cpu().detach().numpy().reshape(seqlen, -1)[:, 0]
    # import pdb; pdb.set_trace()  # debugging stop, disabled
    headdirs = np.unwrap(np.arctan2(outputs[0][0], outputs[0][1]))[:seqlen] % (2 * np.pi)
    print("Hidden states shape: ", hidden_states.shape)
    print("Inputs[i].shape: ", velocities.shape)
    print("Angs[i].shape: ", headdirs.shape)

    _, ax = plt.subplots(10, 10, figsize=(20, 15))
    for cell in range(100):
        x, y = np.divmod(cell, 10)
        curr_ax = ax[x, y]
        activations = torch.relu(torch.tanh(torch.from_numpy(
            hidden_states.reshape(seqlen, -1)[:, cell])))
        # activations = hidden_states.reshape(seqlen, -1)[:, cell]
        # print("No nonlin")
        print(f"Doing it for cell={cell}")
        bs = binned_statistic_2d(headdirs, velocities, activations, bins=[30, 30])
        curr_ax.pcolormesh(bs[1], bs[2], bs[0])
        curr_ax.set_yticks([])
        curr_ax.set_xticks([])
    plt.savefig("activations_fakeAngs.png")
    plt.clf()

    _, ax = plt.subplots(10, 10, figsize=(20, 15))
    for cell in range(100):
        x, y = np.divmod(cell, 10)
        curr_ax = ax[x, y]
        activations = torch.relu(torch.tanh(torch.from_numpy(
            hidden_states.reshape(seqlen, -1)[:, cell])))
        # activations = hidden_states.reshape(seqlen, -1)[:, cell]
        # import pdb; pdb.set_trace()
        print(f"Doing it for cell={cell}, no bins")
        bs = binned_statistic(headdirs, activations)
        curr_ax.plot((bs[1][1:] + bs[1][:-1]) / 2, bs[0])
        curr_ax.set_yticks([])
        curr_ax.set_xticks([])
    plt.savefig("activations_fakeAngs_headdirs.png")
    plt.clf()

    _, ax = plt.subplots(10, 10, figsize=(20, 15))
    for cell in range(100):
        x, y = np.divmod(cell, 10)
        curr_ax = ax[x, y]
        activations = torch.relu(torch.tanh(torch.from_numpy(
            hidden_states.reshape(seqlen, -1)[:, cell])))
        # activations = hidden_states.reshape(seqlen, -1)[:, cell]
        # print("No nonlin")
        # import pdb; pdb.set_trace()
        print(f"Doing it for cell={cell}")
        bs = binned_statistic(velocities, activations)
        curr_ax.plot((bs[1][1:] + bs[1][:-1]) / 2, bs[0])
        curr_ax.set_yticks([])
        curr_ax.set_xticks([])
    plt.savefig("activations_fakeAngs_vels.png")
    plt.clf()

    del initdirs
    del inputs
    del outputs
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from DataPreprocessor import DataPreprocessor
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from scipy.stats import uniform
from FeatureSelector import *
from sklearn.utils import shuffle

preprocessor = DataPreprocessor()


def mse_error(y_true, y_pred):
    error = y_true - y_pred
    return np.sum(error**2) / len(y_pred)


X_train, y_train = preprocessor.get_train_test_data(norm=True, test=0)

# The original constructor call was cut off after `momentum=0.3,`; any
# remaining keyword arguments are unknown, so the call is closed here.
BClass = MLPRegressor(max_iter=10000,
                      hidden_layer_sizes=[50, 50, 50, 50],
                      early_stopping=True,
                      validation_fraction=.1,
                      n_iter_no_change=300,
                      activation='tanh',
                      alpha=0.00001,
                      learning_rate='adaptive',
                      momentum=0.3)
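# Hedged follow-up showing how a regressor like BClass is typically scored
# with cross_val_score and the mse_error defined above; synthetic data via
# make_regression (an addition, not part of the original pipeline).
from sklearn.datasets import make_regression

X_demo, y_demo = make_regression(n_samples=200, n_features=10, noise=0.1, random_state=0)
mse_scorer = make_scorer(mse_error, greater_is_better=False)
print(cross_val_score(KNeighborsRegressor(n_neighbors=5), X_demo, y_demo,
                      cv=5, scoring=mse_scorer))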
from LoadSplit import LoadSplit
from DataPreprocessor import DataPreprocessor
from UserParams import UserParams

# load and split the data
user_params = UserParams(
    dataset_path="data\\santander_customer_transaction_prediction_target.csv",
    target_name="target",
    train_test_path="train_test_splited\\")
load_splitter = LoadSplit(user_params)
X_train, X_test, Y_train, Y_test = load_splitter.load_and_split()

# preprocess the data
data_preprocessor = DataPreprocessor(user_params, X_train, X_test, Y_train, Y_test)
X_train, X_test = data_preprocessor.fit_transform()
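# DataPreprocessor.fit_transform presumably follows the standard
# fit-on-train/transform-both pattern; a minimal sklearn equivalent as a
# sketch (StandardScaler is an assumption, not necessarily what the class
# uses internally).
import numpy as np
from sklearn.preprocessing import StandardScaler

Xtr = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
Xte = np.array([[1.5, 15.0]])
scaler = StandardScaler().fit(Xtr)  # statistics come from the training split only
print(scaler.transform(Xtr))
print(scaler.transform(Xte))        # test data reuses the training statistics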
def get_features(self):
    self.data = DataPreprocessor().select_all_features()
    # self.data = getData()
    self.cluster_data_before_classify(self.data)