class Fitted_Activation:

    def __init__(self, data_file, width=0.04):
        """
        Constructor requires activation fit data

        :param data_file: path to .csv file containing activation data
        :param width: optional, set Gaussian width
        """
        data = np.loadtxt(data_file, delimiter=',')
        self.walking_cycle_percent = data[:, 0]
        self.activation = data[:, 1]
        self.time = np.arange(0, 1, 0.002)
        self.centers = np.arange(0, 1, 0.005)
        self.width = width
        self.model = Regression(self.walking_cycle_percent, self.activation,
                                self.centers, self.width)

    def show_curves(self):
        plt.figure()
        plt.plot(self.walking_cycle_percent, self.activation, 'k')
        plt.plot(self.time, self.get_activation(self.time), 'r')
        plt.xlabel('Walking Cycle (%)')
        plt.ylabel('Activation')
        plt.legend(['Data', 'Regression'])
        plt.show()

    def get_activation(self, t):
        """
        :param t: time point at which to evaluate activation, in percentage of
            walking cycle
        :return: activation level at that time
        """
        a = self.model.eval(t)
        # clip very small regression values to zero
        a[a < 0.002] = 0
        return a
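# Usage sketch (not part of the original code): the CSV path below is a
# hypothetical example; any two-column file of (walking cycle, activation)
# in the format Fitted_Activation expects should work.
#
#     soleus = Fitted_Activation('curve_datasets/soleus_activation.csv', width=0.04)
#     soleus.show_curves()
#     activation = soleus.get_activation(np.arange(0, 1, 0.01))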
def get_fitted_ankle_angle(time, norm=False):
    """
    :param time: time point(s) at which to evaluate the ankle angle, as a
        fraction of the walking cycle
    :param norm: optional, normalize the angle to its maximum value
    :return: fitted ankle angle over the swing-stance cycle
    """
    data = np.loadtxt('curve_datasets/ankle_angle.csv', delimiter=',')
    centres = np.arange(0, 1, 0.005)
    sample_time = data[:, 0] / max(data[:, 0])

    if norm:
        sample_angle = data[:, 1] / max(data[:, 1])
    else:
        sample_angle = data[:, 1]

    model = Regression(sample_time, sample_angle, centres, 0.04)
    swing_stance_aa = model.eval(time)

    return swing_stance_aa
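# Usage sketch (assumed, not from the original module): evaluate the fitted
# ankle-angle curve over one normalized gait cycle, raw and normalized.
#
#     t = np.arange(0, 1, 0.002)
#     angle = get_fitted_ankle_angle(t)
#     angle_norm = get_fitted_ankle_angle(t, norm=True)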
def get_fitted_natural_stimulation(time, scale=1):
    """
    :param time: time point(s) at which to evaluate the stimulation, as a
        fraction of the walking cycle
    :param scale: optional, how much to scale the voltage (normalized to max)
    :return: stimulation profile over the swing-stance cycle
    """
    data = np.loadtxt('curve_datasets/processed_natural_stimulation_2.csv',
                      delimiter=',')
    centers = np.arange(0, 1, 0.005)
    model = Regression(data[:, 0], data[:, 1], centers, 0.092)
    # plt.plot(data[:, 0], data[:, 1])

    swing_stance_stim = model.eval(time)
    # clip negative regression values
    swing_stance_stim[swing_stance_stim < 0] = 0

    return scale * swing_stance_stim / max(swing_stance_stim)
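# Usage sketch (assumed): build a stimulation profile whose peak is scaled to
# 80% of the normalized maximum; negative values are clipped inside the function.
#
#     t = np.arange(0, 1, 0.002)
#     stim = get_fitted_natural_stimulation(t, scale=0.8)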
def load(FLAGS):
    """
    Load all data and store it in either a list (old) or in a dataset class (new)
    """
    questions = []
    queries = []
    answers = []
    impression_lvls = []
    engagement_lvls = []
    click_probs = []

    np.random.seed(42)

    filename_dataset = (f"Data/dataset_filename={FLAGS.filename}"
                        f"_expanded={FLAGS.expanded}"
                        f"_balance={FLAGS.balance}"
                        f"_impression={FLAGS.impression}"
                        f"_reduced_classes={FLAGS.reduced_classes}"
                        f"_embedder={FLAGS.embedder}"
                        f"_negative_samples={FLAGS.negative_samples}.p")

    # Check if the loadable file exists
    if not os.path.exists(FLAGS.folder):
        raise OSError(f"Folder {FLAGS.folder} does not exist")
    if not os.path.exists(FLAGS.folder + FLAGS.filename):
        raise OSError(f"File {FLAGS.folder + FLAGS.filename} does not exist")

    N = 500

    with open(FLAGS.folder + FLAGS.filename) as tsvfile:
        tsvreader = csv.reader(tsvfile, delimiter="\t")

        # skip the first line (consists of labels)
        next(tsvreader, None)

        for i, line in enumerate(tsvreader):
            # skip the instances that have a low impression level
            if FLAGS.impression and line[7] == "low":
                continue
            # if i == N:
            #     break

            # Add values to the data lists
            queries.append(line[0])
            questions.append(line[1])
            answers.append([line[j] for j in range(2, 7)])
            impression_lvls.append(line[7])
            if FLAGS.reduced_classes:
                engagement_lvls.append(0 if int(line[8]) == 0 else 1)
            else:
                engagement_lvls.append(int(line[8]))
            click_probs.append([float(line[j]) for j in range(9, 14)])

    # Attempt to fix the class imbalance, assuming class 0 is too large
    if FLAGS.balance:
        # Index the locations of zeros and non-zeros
        engagement_lvls = np.array(engagement_lvls)
        zero_indices = np.where(engagement_lvls == 0)[0]
        non_zero_indices = np.where(engagement_lvls != 0)[0]

        # Get the median size of the engagement levels
        if FLAGS.reduced_classes:
            median_size = int(Counter(engagement_lvls)[1])
        else:
            median_size = int(np.median(list(Counter(engagement_lvls).values())))

        # Return the to-be-used indices
        sampled_indices = np.random.choice(zero_indices, median_size,
                                           replace=False)
        indices = np.concatenate((sampled_indices, non_zero_indices))

        # Update the data lists based on the sampled indices
        queries = [queries[i] for i in indices]
        questions = [questions[i] for i in indices]
        answers = [answers[i] for i in indices]
        impression_lvls = [impression_lvls[i] for i in indices]
        engagement_lvls = [engagement_lvls[i] for i in indices]
        click_probs = [click_probs[i] for i in indices]

    if FLAGS.expanded and FLAGS.negative_samples:
        # Get values for sampling
        n_questions = len(questions)
        ranges = get_ranges(queries)
        sampled_question_indices = []

        for r in ranges:
            # Negative samples for each query range
            samples = np.random.choice(
                [i for i in range(n_questions) if i not in r],
                FLAGS.sample_size, replace=False)
            sampled_question_indices.append(samples)

            # Update the engagement levels: 2 for max engagement, 1 for the rest
            max_engagement = np.max([engagement_lvls[i] for i in r])
            for i in r:
                if engagement_lvls[i] == max_engagement:
                    engagement_lvls[i] = 2
                else:
                    engagement_lvls[i] = 1

    # set language model
    if FLAGS.embedder == "Bert":
        # Flatten to load into the embedder
        answers = [i for sublist in answers for i in sublist]

        embedder = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

        question_embeds = embedder.encode(questions, convert_to_tensor=False,
                                          show_progress_bar=True,
                                          batch_size=128, num_workers=4)
        query_embeds = embedder.encode(queries, convert_to_tensor=False,
                                       show_progress_bar=True,
                                       batch_size=128, num_workers=4)
        answer_embeds = embedder.encode(answers, convert_to_tensor=False,
                                        show_progress_bar=True,
                                        batch_size=128, num_workers=4)

        query_embeds = torch.from_numpy(query_embeds)
        question_embeds = torch.from_numpy(question_embeds)
        answer_embeds = torch.from_numpy(answer_embeds)

        print(query_embeds.shape)
        print(question_embeds.shape)
        print(answer_embeds.shape)

        # Regroup the flattened answers into tuples of 5 per question
        answers = list(zip(*[iter(answers)] * 5))

        if FLAGS.expanded and FLAGS.negative_samples:
            # Make lists so the embeddings can be extended
            answer_embeds = list(
                answer_embeds.reshape(query_embeds.shape[0], -1))
            question_embeds = list(question_embeds)
            query_embeds = list(query_embeds)

            # Extend the data with the negative samples
            for r, samples in zip(ranges, sampled_question_indices):
                queries.extend([queries[r[0]]] * len(samples))
                questions.extend([questions[i] for i in samples])
                answers.extend([answers[i] for i in samples])
                impression_lvls.extend([impression_lvls[i] for i in samples])
                engagement_lvls.extend([0] * len(samples))
                click_probs.extend([click_probs[i] for i in samples])

                query_embeds.extend([query_embeds[r[0]]] * len(samples))
                question_embeds.extend([question_embeds[i] for i in samples])
                answer_embeds.extend([answer_embeds[i] for i in samples])

            # Turn the embeddings back into torch tensors
            query_embeds = torch.stack(query_embeds)
            question_embeds = torch.stack(question_embeds)
            answer_embeds = torch.stack(answer_embeds)

            print(query_embeds.shape)
            print(question_embeds.shape)
            print(answer_embeds.shape)

    elif FLAGS.embedder == "TFIDF":
        # initialize the vectorizer
        if FLAGS.expanded:
            with open(f"{FLAGS.folder}TFIDF_vocab.p", "rb") as f:
                vocab = pkl.load(f)
            vectorizer = TfidfVectorizer(vocabulary=vocab)
        else:
            vectorizer = TfidfVectorizer()

        if FLAGS.expanded and FLAGS.negative_samples:
            # Extend the data with the negative samples
            for r, samples in zip(ranges, sampled_question_indices):
                queries.extend([queries[r[0]]] * len(samples))
                questions.extend([questions[i] for i in samples])
                answers.extend([answers[i] for i in samples])
                impression_lvls.extend([impression_lvls[i] for i in samples])
                engagement_lvls.extend([0] * len(samples))
                click_probs.extend([click_probs[i] for i in samples])

        # create the corpus: a list of strings, each string is one data instance
        corpus = [" ".join([queries[i], questions[i], " ".join(answers[i])])
                  for i in range(len(queries))]

        # this yields a sparse matrix
        X = vectorizer.fit_transform(corpus)

        if not FLAGS.expanded:
            with open(f"{FLAGS.folder}TFIDF_vocab.p", "wb") as f:
                pkl.dump(vectorizer.vocabulary_, f)

        # Convert the scipy sparse matrix to a torch sparse tensor, following
        # https://ray075hl.github.io/ray075hl.github.io/sparse_matrix_pytorch/
        X = X.tocoo().astype(np.float32)
        indices = torch.from_numpy(np.vstack((X.row, X.col))).long()
        values = torch.from_numpy(X.data)
        shape = torch.Size(X.shape)
        X = torch.sparse_coo_tensor(indices, values, shape)

        print(f"shape of X: {X.shape}")

    else:
        print(f"Embedder {FLAGS.embedder} does not exist")
        return

    # Either return the dataset for regression, with only questions, queries
    # and answers, or return with all attributes
    if FLAGS.expanded:
        # TODO: if statement if TFIDF or BERT
        # Load the neural net and perform a forward pass on the data,
        # yielding the predicted engagement levels
        if FLAGS.embedder == "Bert":
            answer_embeds = answer_embeds.reshape(query_embeds.shape[0], -1)
            input_matrix = torch.cat(
                (query_embeds, question_embeds, answer_embeds), dim=1)

            nn = Regression(n_inputs=input_matrix.shape[1],
                            n_hidden=[300, 32],
                            dropout_percentages=[0.0, 0.0],
                            n_classes=1,
                            batchnorm=True)
            nn.load_state_dict(torch.load("Models/Best_regression_model.pt"))
            nn.eval()

            with torch.no_grad():
                preds = nn(input_matrix).squeeze()

        elif FLAGS.embedder == "TFIDF":
            nn = Regression(n_inputs=X.shape[1],
                            n_hidden=[300, 32],
                            dropout_percentages=[0.0, 0.0],
                            n_classes=1,
                            batchnorm=True)
            # TODO Correct model
            nn.load_state_dict(torch.load(
                "Models/Regression_Bert_SGD_0.0001_1e-05_300, 32_0.0, 0.0_True_40.pt"))
            nn.eval()

            with torch.no_grad():
                preds = nn(X).squeeze()

        # Save in a Data object
        dataset = Data(queries, questions, answers, impression_lvls,
                       engagement_lvls, click_probs, preds)

        # save the dataset
        with open(filename_dataset, "wb") as f:
            pkl.dump(dataset, f, protocol=4)

    # return the dataset for regression
    else:
        dataset = []

        if FLAGS.embedder == "Bert":
            for i, (query, question) in tqdm(
                    enumerate(zip(query_embeds, question_embeds))):
                # reshape the five answer embeddings into one flat vector
                answer_embed = answer_embeds[i * 5:i * 5 + 5]
                answer_embed = answer_embed.reshape(-1)

                engagement_lvl = torch.Tensor([int(engagement_lvls[i])]).float()

                inp = torch.cat((query, question, answer_embed), 0)

                # Add the datapoint to the dataset
                dataset.append((inp, engagement_lvl))

        elif FLAGS.embedder == "TFIDF":
            for i, inp in enumerate(X):
                dataset.append(
                    (inp, torch.Tensor([int(engagement_lvls[i])]).float()))

        # save the dataset
        with open(filename_dataset, "wb") as f:
            pkl.dump(dataset, f)
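# Usage sketch (assumed, not part of the original module): `load` expects an
# argparse-style namespace. The attribute names below are the ones read in the
# function above; the values are illustrative guesses, not the project's defaults.
#
#     from argparse import Namespace
#
#     FLAGS = Namespace(folder="Data/", filename="dataset.tsv",
#                       expanded=False, balance=True, impression=True,
#                       reduced_classes=False, negative_samples=False,
#                       sample_size=5, embedder="Bert")
#     load(FLAGS)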