def load_data(file_name):
    """Load and return the object pickled in ``file_name``.

    A start message and an end message are printed around the load.

    NOTE: ``pickle.load`` can execute arbitrary code while unpickling, so
    this must only be used on files from a trusted source.
    """
    print("Start loading file[%s]" % file_name)
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(file_name, "rb") as F:
        data = pickle.load(F)
    print("End loading")
    return data
def save_rev_ids_for_predictions(y_pred, y_test, z, model_name):
    """Write the ids of minority-class samples to two text files.

    Sample ``i`` counts as minority class when ``y_test[i] == 0``. Its id
    ``z[i]`` is written to ``correct_<model_name>.txt`` when ``y_pred[i]``
    equals ``y_test[i]`` and to ``incorrect_<model_name>.txt`` otherwise,
    one id per line. Both files are created in the current working
    directory and overwritten if they already exist.
    """
    correct_minority = []
    incorrect_minority = []
    for i in range(len(y_test)):
        # Only samples whose true label is the minority class (0) matter.
        if y_test[i] != 0:
            continue
        if y_pred[i] == y_test[i]:
            correct_minority.append(z[i])
        else:
            incorrect_minority.append(z[i])

    def _write_ids(path, ids):
        # Encode at the file boundary: formatting an already-encoded bytes
        # object with "%s" would write its repr (b'...') instead of the id.
        with open(path, 'w', encoding='utf8') as f:
            for item in ids:
                f.write("%s\n" % item)

    _write_ids('correct_' + model_name + '.txt', correct_minority)
    _write_ids('incorrect_' + model_name + '.txt', incorrect_minority)
def __init__(self,list_path):
    """Build ``self.uttid_dic`` from the list file at ``list_path``.

    Every non-blank line is split once on its first space: the text before
    the space becomes the key and the remainder, converted with ``int``,
    becomes the value. Blank lines are skipped.

    Raises:
        IndexError: if a non-blank line contains no space.
        ValueError: if the text after the first space is not an integer.
    """
    self.uttid_dic = {}
    # ``with`` guarantees the file is closed even when a line fails to parse.
    with open(list_path, 'r') as f:
        for ln in f:
            if not ln.strip():
                continue
            temp_str = ln.split(' ', 1)
            self.uttid_dic[temp_str[0]] = int(temp_str[1])
def load_word_embeddings(directory, file, dictionary):
    """Read word vectors from ``directory/file`` for words in ``dictionary``.

    Each line is split once on its first space into a word and the rest of
    the line; the rest is split on whitespace and converted to floats.
    Only words for which ``word in dictionary`` holds are kept.

    Returns:
        dict mapping each kept word to a 1-D ``numpy`` array of floats.

    Raises:
        ValueError: if a line contains no space or a component is not a
            valid float.
    """
    embeddings_index = {}
    # ``with`` closes the file even if a malformed line raises mid-way.
    with open(os.path.join(directory, file)) as f:
        for line in f:
            word, vec = line.split(' ', 1)
            if word in dictionary:
                embeddings_index[word] = np.array(list(map(float, vec.split())))
    return embeddings_index
def __init__(self,train_dist_path, test_dist_path,dis_phone_num,prob_trans_path):
    """Load train/test distribution statistics and derive ratio tensors.

    Args:
        train_dist_path: text file whose first line holds the train
            probabilities and whose second line holds the train variances.
            On each line the first three space-separated tokens and the
            last token are discarded; the rest are parsed as floats.
        test_dist_path: file in the same layout for the test distribution.
        dis_phone_num: non-negative number subtracted from the phone count
            when computing the threshold probability.
        prob_trans_path: MATLAB ``.mat`` file providing the ``mu_ratio``
            and ``comp_wcum`` arrays.

    Sets ``train_prob``, ``train_var``, ``test_prob``, ``test_var``,
    ``prob_ratio``, ``std_var_ratio``, ``phone_prob``, ``prob_sum``,
    ``dis_phone_num``, ``prob_ratio_upper``, ``mu_ratio`` and ``comp_wcum``
    on ``self`` and prints several of them.

    Raises:
        AssertionError: if a probability ratio exceeds 1, a test variance
            is not positive, or ``dis_phone_num`` is negative.
    """

    def _read_prob_and_var(path):
        # Read the first two lines (probabilities, then variances); the
        # ``with`` block closes the file even if parsing fails afterwards.
        with open(path, 'r') as f:
            rows = [f.readline(), f.readline()]
        tensors = [
            torch.FloatTensor(list(map(float, ln.split(' ')[3:-1])))
            for ln in rows
        ]
        return tensors[0], tensors[1]

    self.train_prob, self.train_var = _read_prob_and_var(train_dist_path)
    self.test_prob, self.test_var = _read_prob_and_var(test_dist_path)

    # ratio, clipped element-wise to at most 1
    length = len(self.test_prob)
    self.prob_ratio = torch.min(self.test_prob/self.train_prob, torch.ones(length))
    print('self.prob_ratio:')
    print(self.prob_ratio)
    assert max(self.prob_ratio) <=1 , 'The prob ratio is larger than 1!?'
    assert min(self.test_var) > 0, 'The min of test var is <= 0!?'
    self.std_var_ratio = torch.sqrt(self.test_var/self.train_var)
    print('self.std_var_ratio:')
    print(self.std_var_ratio)
    print('self.test_var.sum():')
    print(self.test_var.sum())

    # The first 4 are zero unvoiced parts
    self.phone_prob = torch.FloatTensor([
        0, 0, 0, 0, 0.0202, 0.0522, 0.0917, 0.0153, 0.0088, 0.0483,
        0.0130, 0.0048, 0.0290, 0.0212, 0.0249, 0.0177, 0.0240, 0.0146,
        0.0093, 0.0194, 0.0490, 0.0457, 0.0050, 0.0296, 0.0367, 0.0407,
        0.0530, 0.0114, 0.0416, 0.0011, 0.0124, 0.0302, 0.0457, 0.0073,
        0.0571, 0.0064, 0.0047, 0.0249, 0.0123, 0.0191, 0.0287, 0.0230,
        0.0002,
    ])
    self.prob_sum = self.phone_prob.sum()
    phone_num = len(self.phone_prob)
    assert dis_phone_num >=0 , 'The dis_phone_num need to be non negtive!'
    self.dis_phone_num = dis_phone_num
    threshold_prob = self.prob_sum/((phone_num-dis_phone_num)*0.8)
    # Element-wise floor of 0.2 on the per-phone upper ratio.
    self.prob_ratio_upper = torch.max(self.phone_prob/threshold_prob, 0.2*torch.ones(phone_num))

    # load the GMM params
    temp_mat = sio.loadmat(prob_trans_path)
    self.mu_ratio = torch.from_numpy(temp_mat['mu_ratio']).float()
    # weight cumulation sum
    self.comp_wcum = torch.from_numpy(temp_mat['comp_wcum']).float()
def memory_read(self):
    """Return the object pickled in ``self.file_name``.

    Best-effort: if the file cannot be opened or unpickled, it is
    (re)created empty and an empty list is returned instead.

    NOTE: ``pickle.load`` can execute arbitrary code while unpickling, so
    the file must come from a trusted source.
    """
    try:
        # ``with`` closes the handle even when unpickling fails.
        with open(self.file_name, 'rb') as F:
            return pickle.load(F)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt
        # and SystemExit still propagate instead of wiping the file.
        with open(self.file_name, 'wb'):
            pass
        return list()
def test(net, testset, testloader, criterian, batch_size, n_class, log_file):
    '''Evaluate ``net`` on ``testloader`` and append the results to ``log_file``.

    Args:
        net: the network; it is switched to eval mode and called on each
            image batch.
        testset: only its length is used, to normalise loss and accuracy.
        testloader: iterable yielding ``(img, label)`` batches.
        criterian: loss callable invoked as ``criterian(output, label)``.
        batch_size: kept for interface compatibility; the per-class loop
            uses each batch's actual size so a short final batch is safe.
        n_class: number of classes tracked for the per-class accuracy.
        log_file: path of the log file, opened in append mode.

    Prints the overall loss/accuracy and a per-class accuracy and writes
    the same lines to the log file. Returns nothing.
    '''
    net.eval()
    testloss, testacc = 0., 0.
    class_correct = [0.] * n_class
    class_total = [0.] * n_class
    for (img, label) in testloader:
        img = var(img).cuda()
        label = var(label).cuda()
        # forward pass
        output = net(img)
        # loss
        # NOTE(review): ``.data[0]`` is kept from the original; newer
        # PyTorch releases use ``.item()`` for scalars -- confirm the
        # version this file targets before changing it.
        loss = criterian(output, label)
        testloss += loss.data[0]
        # prediction
        _, predict = torch.max(output, 1)
        num_correct = (predict == label).sum()
        testacc += num_correct.data[0]
        # per-class bookkeeping
        c = (predict == label).squeeze()
        # Use the real batch length: the last batch may hold fewer than
        # ``batch_size`` samples.
        for i in range(label.size(0)):
            l = label[i].data[0]
            class_correct[l] += c[i].data[0]
            class_total[l] += 1
    testloss /= len(testset)
    testacc /= len(testset)
    # ``with`` closes the log even if formatting a line raises.
    with open(log_file, 'a') as f:
        f.write('\n-------------------\n')
        print("Test: Loss: %.5f, Acc: %.2f %%" % (testloss, 100 * testacc))
        f.write("Test: Loss: %.5f, Acc: %.2f %%\n" % (testloss, 100 * testacc))
        for i in range(n_class):
            # A class with no samples is reported as 0 % instead of
            # raising ZeroDivisionError.
            if class_total[i]:
                class_acc = 100 * class_correct[i] / class_total[i]
            else:
                class_acc = 0
            print('Accuracy of %5d : %2d %%' % (i, class_acc))
            f.write('Accuracy of %5d : %2d %%\n' % (i, class_acc))
def save_bert_file(dict, output, dataset_name, model_name, hyper_num, oov_num, f_info_out, include_oov=True):
    """Append a summary row to ``f_info_out`` and dump ``dict`` as JSON.

    Args:
        dict: JSON-serialisable mapping to save (the name shadows the
            builtin but is kept for caller compatibility).
        output: base output directory.
        dataset_name: dataset file name; its extension is replaced by
            ``.json`` for the output file name.
        model_name: model identifier; ``/`` is replaced by ``-`` to form
            the sub-directory name under ``output``.
        hyper_num, oov_num, include_oov: values copied into the summary row.
        f_info_out: open, writable text file that receives one
            tab-separated summary line.

    The JSON is written UTF-8 encoded to
    ``<output>/<model_name with "/" -> "-">/<dataset stem>.json``; the
    directory is created when it does not exist yet.
    """
    logger.info("save info...")
    f_info_out.write(
        f'{model_name}\t{dataset_name}\t{len(dict)}\t{oov_num}\t{hyper_num}\t{include_oov}\n'
    )
    logger.info("save json...")
    dname = os.path.splitext(dataset_name)[0]
    # Serialise first so a serialisation error leaves no empty file behind.
    fjson = json.dumps(dict, ensure_ascii=False)
    target_dir = os.path.join(output, model_name.replace("/", "-"))
    os.makedirs(target_dir, exist_ok=True)
    with open(os.path.join(target_dir, dname + ".json"), mode="w", encoding="utf-8") as f:
        f.write(fjson)
def predict_perf(model, test_loader, predict_type, ans_dict, beer_index):
    """Run ``model`` over ``test_loader`` and report a bucketed accuracy.

    For ``predict_type == "MOVIE"`` the prediction is 1 when the model
    output exceeds 0.5 (else 0) and the reference is 1 when
    ``int(ids[i]) <= 1000`` (else 0). For any other type the prediction is
    the model output rounded to one decimal and the reference is
    ``ans_dict[ids[i]][beer_index]``. A prediction counts as correct when
    prediction and reference fall in the same ``// 0.2`` bucket.

    All predictions, keyed by ``ids[i]``, are pickled to
    ``perf_predict.pickle`` in the current working directory and the
    fraction of correct predictions is printed (0.0 when the loader yields
    no samples).
    """
    model.eval()
    # run predict
    iter_bar = tqdm(test_loader)
    cnt_true = 0
    cnt = 0
    out_ans = {}
    for x, ids, answers, attn_mask, token_type_ids in iter_bar:
        model.zero_grad()
        with torch.no_grad():
            sent = model(x.to(device), attn_mask.to(device),
                         token_type_ids.to(device), predict_type)
        for i in range(x.size(0)):
            if predict_type == "MOVIE":
                # binary
                ans = 1 if sent[i] > 0.5 else 0
                ans_ = 1 if int(ids[i]) <= 1000 else 0
            else:
                ans = round(sent[i].item(), 1)
                ans_ = ans_dict[ids[i]][beer_index]
            if ans // 0.2 == ans_ // 0.2:
                cnt_true += 1
            out_ans[ids[i]] = ans
            cnt += 1
    # The ``with`` block closes the file; no explicit close() is needed.
    with open('perf_predict.pickle', "wb") as F:
        pickle.dump(out_ans, F)
    # Guard against an empty loader instead of raising ZeroDivisionError.
    print("Performence:", cnt_true / cnt if cnt else 0.0)
def predict(model, test_loader, predict_name, tokenizer, predict_type):
    """Predict an answer span per sample and dump the answers as JSON.

    The model is always called in "SQUAD" mode and returns start and end
    scores. For each sample, a softmax is taken over the first ``c_len``
    positions, where ``c_len`` is the sum of that sample's
    ``token_type_ids``. The span is chosen from the argmax of the start and
    end distributions, and at most 40 tokens are decoded with ``tokenizer``.

    * "BEER": when both argmax positions are non-zero, an inverted span is
      repaired and the span is then widened while it covers less than
      ``want_len`` (a name defined outside this function) of the context.
    * "SQUAD" / "MOVIE": plain argmax; "MOVIE" first zeroes position 0.

    Position 0 as start or end yields an empty answer. For "MOVIE" the
    answers are collected per id as a list, otherwise one string per id.
    The result is written to ``predict_name`` and two statistics are
    printed.

    NOTE(review): the nesting below was reconstructed from a flattened
    source; confirm the indentation of the ``while`` body and of the
    ``total_ans_len`` update against the original file.
    """
    model.eval()
    #run predict
    iter_bar = tqdm(test_loader)
    all_predict = {}
    softmax = nn.Softmax(dim=0)
    no_ans = 0
    total_ans_len = 0
    # id of the previous sample when its answer reached the end of its
    # context, -1 otherwise (only used for "MOVIE")
    prev_last = -1
    for x, ids, answers, attn_mask, token_type_ids in iter_bar:
        batch_size = x.size(0)
        model.zero_grad()
        with torch.no_grad():
            out_ans_start, out_ans_end = model(x.to(device),
                                               attn_mask.to(device),
                                               token_type_ids.to(device),
                                               "SQUAD")
        for i in range(batch_size):
            # number of positions the softmax is restricted to
            c_len = int(torch.sum(token_type_ids[i]).item())
            si = softmax(out_ans_start[i][:c_len])
            ei = softmax(out_ans_end[i][:c_len])
            if predict_type == "BEER":
                ans_start = torch.argmax(si)
                ans_end = torch.argmax(ei)
                if ans_start == 0 or ans_end == 0:
                    new_ans = ""
                else:
                    si[0] = 0
                    ei[0] = 0
                    # end before start: keep the more confident endpoint and
                    # re-pick the other one on the valid side of it
                    if ans_end < ans_start:
                        if si[ans_start] > ei[ans_end]:
                            ei[ans_end] = 0
                            ans_end = ans_start + 1 + torch.argmax(
                                ei[ans_start + 1:])
                        else:
                            si[ans_start] = 0
                            ans_start = torch.argmax(si[:ans_end])
                    prev_start = ans_start
                    prev_end = ans_end
                    # widen the span while its share of the context is
                    # below want_len
                    while (ans_end >= ans_start and
                           (ans_end.item() - ans_start.item() + 1) /
                           torch.sum(token_type_ids[i]).item() < want_len):
                        s_big = si[ans_start]
                        e_big = ei[ans_end]
                        # remove the current endpoints so the next argmax
                        # yields the runner-up positions
                        si[ans_start] = 0
                        ei[ans_end] = 0
                        ans_start_2 = torch.argmax(si)
                        ans_end_2 = torch.argmax(ei)
                        s_sec = si[ans_start_2]
                        e_sec = ei[ans_end_2]
                        # the comparison only decides which endpoint is
                        # tried first; the length check is re-evaluated
                        # before the second move
                        if s_big - s_sec > e_big - e_sec:
                            if ans_end_2 > ans_end and (
                                    ans_end.item() - ans_start.item() + 1
                            ) / torch.sum(
                                    token_type_ids[i]).item() < want_len:
                                ans_end = ans_end_2
                            if ans_start > ans_start_2 and (
                                    ans_end.item() - ans_start.item() + 1
                            ) / torch.sum(
                                    token_type_ids[i]).item() < want_len:
                                ans_start = ans_start_2
                        else:
                            if ans_start > ans_start_2 and (
                                    ans_end.item() - ans_start.item() + 1
                            ) / torch.sum(
                                    token_type_ids[i]).item() < want_len:
                                ans_start = ans_start_2
                            if ans_end_2 > ans_end and (
                                    ans_end.item() - ans_start.item() + 1
                            ) / torch.sum(
                                    token_type_ids[i]).item() < want_len:
                                ans_end = ans_end_2
                        # neither endpoint moved: discard the runner-up
                        # candidates as well
                        if ans_end == prev_end and ans_start == prev_start:
                            si[ans_start_2] = 0
                            ei[ans_end_2] = 0
                        prev_start = ans_start
                        prev_end = ans_end
                    new_ans = tokenizer.decode(
                        (x[i][ans_start:ans_end + 1])[:40])
            elif predict_type == "SQUAD" or predict_type == "MOVIE":
                if predict_type == "MOVIE":
                    si[0] = 0
                    ei[0] = 0
                ans_start = torch.argmax(si)
                ans_end = torch.argmax(ei)
                if ans_start == 0 or ans_end == 0:
                    new_ans = ""
                else:
                    new_ans = tokenizer.decode(
                        (x[i][ans_start:ans_end + 1])[:40])
            total_ans_len += min(ans_end - ans_start + 1, 40)
            #print("------------- ans_start: %d ---------------" % ans_start)
            #print("------------- ans_end: %d ---------------" % ans_end)
            #print(new_ans)
            if new_ans == "":
                no_ans += 1
            # strip special tokens from the decoded text
            new_ans = new_ans.replace("[UNK]",
                                      "").replace("[CLS]",
                                                  "").replace("[SEP]", "")
            if predict_type == "MOVIE":
                if not (ids[i] in all_predict):
                    all_predict[ids[i]] = [""]
                if new_ans != "":
                    # continue the previous entry when the previous sample
                    # had the same id and its answer reached the end of
                    # its context
                    if prev_last == ids[i]:
                        all_predict[ids[i]][-1] += (" " + new_ans)
                    else:
                        all_predict[ids[i]].append(new_ans)
                if new_ans != "" and ans_end + 1 >= len(
                        ei) - 1:  #get last in sentence
                    prev_last = ids[i]
                else:
                    prev_last = -1
            else:
                all_predict[ids[i]] = new_ans
        iter_bar.set_description("Run iter")
    with open(predict_name, "w") as F:
        json.dump(all_predict, F)
        F.close()
    print("NO_ans: ", no_ans)
    print("AVG_len: ", total_ans_len / len(all_predict))
def write_to_file(predictions, path='test.pred'):
    """Write ``str(p)`` for every prediction to ``path``, one per line.

    Args:
        predictions: iterable of values to write.
        path: output file; defaults to ``test.pred`` in the current working
            directory. The file is overwritten if it exists.
    """
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(path, 'w') as f:
        f.writelines(str(p) + '\n' for p in predictions)
def main():
    """Fit one model per MPS size and save its test errors and weights.

    Datasets for system sizes 2, 4 and 7 are split 8000/1000/1000 into
    train/validation/test, size 6 is split 9000/1000 into validation/test
    and size 10 is used as a whole for testing. For each ``mps_size`` in
    2..10 the model returned by ``mps_fit`` is evaluated on every test
    loader and three files are written:

    * ``<mps_size>_test_err_.p``   -- pickled ``{system size: mean batch loss}``
    * ``<mps_size>_site_model.pt`` -- the model ``state_dict``
    * ``<mps_size>_dump_errors.p`` -- pickled error histories from ``mps_fit``
    """
    data_2 = '2_qubit_crit_data.npz'
    data_4 = '4_qubit_crit_data.npz'
    data_6 = '6_qubit_crit_data.npz'
    data_7 = '7_qubit_crit_data.npz'
    data_10 = '10_qubit_crit_data.npz'

    training_n_sizes = [2, 4, 7]
    validation_n_sizes = [6, 2, 4, 7]

    # The order of the get_dataset / random_split calls is kept unchanged
    # so seeded runs produce the same splits as before.
    training_data_2 = get_dataset(data_2, 2, 10000)
    training_data_4 = get_dataset(data_4, 4, 10000)
    training_data_7 = get_dataset(data_7, 7, 10000)
    training_data_2, test_data_2 = random_split(training_data_2, [9000, 1000])
    training_data_4, test_data_4 = random_split(training_data_4, [9000, 1000])
    training_data_7, test_data_7 = random_split(training_data_7, [9000, 1000])
    training_data_2, val_data_2 = random_split(training_data_2, [8000, 1000])
    training_data_4, val_data_4 = random_split(training_data_4, [8000, 1000])
    training_data_7, val_data_7 = random_split(training_data_7, [8000, 1000])
    training_data_10 = get_dataset(data_10, 10, 10000)

    datasets = [training_data_2, training_data_4, training_data_7]
    training_loaders = [
        DataLoader(x, batch_size=32, shuffle=True, num_workers=20)
        for x in datasets
    ]

    val_data_6 = get_dataset(data_6, 6, 10000)
    val_data_6, test_data_6 = random_split(val_data_6, [9000, 1000])
    val_datasets = [val_data_6, val_data_2, val_data_4, val_data_7]
    val_loaders = [
        DataLoader(x, batch_size=10000, num_workers=20) for x in val_datasets
    ]

    test_loader_10 = DataLoader(training_data_10,
                                batch_size=10000,
                                num_workers=20)
    test_datasets = [test_data_2, test_data_4, test_data_6, test_data_7]
    test_loaders = [
        DataLoader(x, batch_size=1000, num_workers=20) for x in test_datasets
    ]
    test_loaders.append(test_loader_10)
    # System size of each entry in test_loaders, in the same order.
    test_sizes = [2, 4, 6, 7, 10]

    loss_func = nn.MSELoss()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    for mps_size in range(2, 11):
        err_name = "{}_dump_errors.p".format(mps_size)
        model_name = "{}_site_model.pt".format(mps_size)
        model, tot_err, val_err, t_errs, val_errs = mps_fit(
            mps_size, training_loaders, training_n_sizes, val_loaders,
            validation_n_sizes)

        model.eval()
        test_errors = {}
        with torch.no_grad():
            for sys_size, loader in zip(test_sizes, test_loaders):
                temp = 0
                for fields, wf in loader:
                    gs = model(fields.to(device), sys_size)
                    loss = loss_func(gs, wf.to(device))
                    print(loss.item() * (2**sys_size))
                    print(gs[0])
                    print(gs[-1])
                    temp += loss.item()
                # Average the accumulated batch losses. The previous code
                # divided only the last batch's loss tensor, leaving the
                # running sum unused.
                test_errors[sys_size] = temp / len(loader)

        with open("{}_test_err_.p".format(mps_size), 'wb') as test_error_file:
            pickle.dump(test_errors, test_error_file)

        errs = [tot_err, val_err, t_errs, val_errs]
        torch.save(model.state_dict(), model_name)
        with open(err_name, 'wb') as f:
            pickle.dump(errs, f)
def run_experiment(testChrom, test_num, state_count, cross_fold, INITIAL_T,
                   INITIAL_E, INITIAL_T0):
    """Train an HMM on one chromosome and score held-back test sequences.

    Observations for ``testChrom`` are read from
    ``yeast_vector_sequence_for_HMM.txt`` and split into ``NUMBER_OF_FOLDS``
    chunks. One chunk is held out and halved: one half is passed to
    ``Baum_Welch_EM`` together with the training observations, the other
    half is the final test set. ``cross_fold`` values of 5 and above select
    chunk ``cross_fold - 5`` with the two halves swapped.

    Side effects: prints the initial and learned matrices and timing;
    appends the learned matrices to the module-level lists
    ``ts_for_experiment``, ``t0s_for_experiment`` and ``es_for_experiment``;
    appends the formatted CSV row to ``all_results``.

    Returns:
        A tuple of the likelihood of the whole final test set (the entry
        stored under length 0), ``model.T``, ``model.E`` and ``model.T0``.

    NOTE(review): the nesting below was reconstructed from a flattened
    source; confirm in particular whether ``print("Logreg Guess
    Statistics:")`` belongs inside the second length loop.
    """
    # initialize the string which will eventually be a formatted data row in a CSV file
    # (testChrom without its last character, then state count, test number and fold)
    data_string = testChrom[0:-1] + ", " + str(state_count) + ", " + str(
        test_num) + ", " + str(cross_fold)
    # initialize list which will be the list of observations in the training set
    observations = []
    # initialize observation count
    count = 0
    # flag telling whether lines are currently being collected; it starts
    # as True, so collection is active until a "chr" line without
    # testChrom switches it off
    start = True
    # open file with nucleosome vector sequences
    with open("yeast_vector_sequence_for_HMM.txt") as f:
        # for each line in the file
        for line in f:
            # if we've reached the beginning of the chromosome we're interested in
            if testChrom in line:
                # set flag indicating to start reading in the file
                start = True
            # if it's the right kind of line, and the start flag has been set to true, start processing
            if "K36me" not in line and start and "chr" not in line and (
                    "0" in line or "1" in line) and "{" not in line:
                # increment the observation count
                count = count + 1
                # get portion of the line which has the nucleosome vector
                line = line[0:51]
                # add the observation to the list of observations
                observations.append(line.replace(",", ""))
            # if we've reached a header line of another chromosome
            if start and testChrom not in line and "chr" in line:
                # stop collecting; note the loop itself keeps scanning the file
                start = False
        # close the file (redundant inside the with block)
        f.close()
    # is actually 10 because the test seq/validation seq are flipped half the time
    NUMBER_OF_FOLDS = 5
    half_string = "fullchromosome"
    flip_test_seq = False
    adjusted_cross_fold = cross_fold
    # folds 5..9 reuse chunks 0..4 with the held-out halves swapped
    if cross_fold >= 5:
        flip_test_seq = True
        adjusted_cross_fold = adjusted_cross_fold - 5
    chunks = list(
        get_chunks(observations, int(len(observations) / NUMBER_OF_FOLDS)))
    # merge surplus trailing chunks into the last regular chunk
    while len(chunks) > NUMBER_OF_FOLDS:
        chunks[-2] = chunks[-2] + chunks[-1]
        del chunks[-1]
    held_observations = chunks[adjusted_cross_fold]
    final_test_observations = []
    # rebuild the training observations from every chunk but the held-out one
    observations = []
    for i in range(len(chunks)):
        if i != adjusted_cross_fold:
            observations.extend(chunks[i])
    # split the held-out chunk in two; which half becomes the final test
    # set depends on the flip flag
    if not flip_test_seq:
        final_test_observations = held_observations[len(held_observations) //
                                                    2:]
        held_observations = held_observations[:len(held_observations) // 2]
    else:
        final_test_observations = held_observations[:len(held_observations) //
                                                    2]
        held_observations = held_observations[len(held_observations) // 2:]
    representedHistones = {}
    representedHistones[12] = True
    representedHistones[8] = True
    representedHistones[3] = True
    representedHistones[17] = True
    # S = the number of states
    S = state_count
    # initialize the model, using the initial matrices passed in
    model = HiddenMarkovModel(INITIAL_T,
                              INITIAL_E,
                              INITIAL_T0,
                              representedHistones,
                              maxStep=200,
                              epsilon=0.1)
    # record the time the run starts
    start_time = datetime.datetime.now()
    test_index = 12
    # run Baum-Welch EM and keep the learned matrices
    trans0, transition, emission, final_likelihood, c = model.Baum_Welch_EM(
        observations, held_observations, test_index)
    # print the initial and final matrices, in case they are needed later
    torch.set_printoptions(profile="full")
    print("Initial Matricies")
    print("INITIAL_T")
    print(INITIAL_T)
    print("INITIAL_E")
    print(INITIAL_E)
    print("INITIAL_T0")
    print(INITIAL_T0)
    print("Final Matricies")
    print("transition")
    print(transition)
    print("emission")
    print(emission)
    print("trans0")
    print(trans0)
    # add the final matrices to the module-level result lists
    ts_for_experiment.append(transition)
    t0s_for_experiment.append(trans0)
    es_for_experiment.append(emission)
    # calculate and print the total time it took to run
    total_time = datetime.datetime.now() - start_time
    print("total time: " + str(total_time))
    # add the total time and the likelihood returned by Baum_Welch_EM to the data string
    data_string = data_string + ", " + str(total_time) + ", " + str(
        final_likelihood)
    # create a list representing the lengths of the various test sequences I want
    test_sequence_lengths = [5, 10, 20, 100, 0]  # 0 for all
    # dictionary mapping a test sequence length to its likelihood
    heldBackLikelihoods = dict()
    # for each of these lengths
    for length in test_sequence_lengths:
        # initialize an empty test sequence
        test_seq = []
        # length 0 stands for the whole final test set
        if length == 0:
            test_seq = final_test_observations
        else:
            # otherwise use the first `length` final test observations
            test_seq = final_test_observations[0:length]
        # print the length we're using
        print("test seq length: " + str(len(test_seq)))
        # get the likelihood of the test sequence from the model
        held_back_likelihood = model.get_likelihood_of_seq(test_seq)
        # print the likelihood
        print("final test observations held_back_likelihood")
        print(held_back_likelihood)
        # add the likelihood to the data string
        data_string = data_string + ", " + str(held_back_likelihood.detach())
        # add the likelihood for the test sequence length to the dict
        heldBackLikelihoods[length] = held_back_likelihood
    # use a very similar loop to generate some stats for these predictions
    # this very similar loop is separate because of how I like to format the data
    for length in test_sequence_lengths:
        # initialize an empty test sequence
        test_seq = []
        # length 0 stands for the whole final test set
        if length == 0:
            test_seq = final_test_observations
        else:
            # otherwise use the first `length` final test observations
            test_seq = final_test_observations[0:length]
        # predict an emission sequence, using the guess_sequence_max method, and print the associated stats
        # NOTE(review): no prediction call is present here; test_seq is unused
        print("Logreg Guess Statistics:")
    data_string = data_string + "\n"
    all_results.append(data_string)
    # print the final data string
    print(data_string)
    # return the likelihood of the longest test seq and the model matrices
    return heldBackLikelihoods[0], model.T, model.E, model.T0
def run_experiment(testChrom, test_num, state_count, random, cross_fold):
    """Learn genomic weights for one chromosome and score a test chunk.

    Observations and their per-nucleosome A/T/C/G counts for ``testChrom``
    are read from ``yeast_vector_sequence_for_HMM.txt`` and split into
    ``NUMBER_OF_FOLDS`` chunks. Chunk ``cross_fold`` is the validation
    chunk; the next chunk (wrapping around to 0) is the final test chunk.
    The model learns on every remaining chunk in turn.

    Args:
        testChrom: chromosome marker searched for in the file; its last
            character is dropped when building the CSV row.
        test_num: copied into the CSV row.
        state_count: number of HMM states.
        random: when truthy, the initial matrices are drawn from Dirichlet
            distributions; otherwise ``get_premade_matricies`` supplies
            them. (The name shadows the ``random`` module inside this
            function.)
        cross_fold: index of the validation chunk.

    Prints progress, the learned weights, timing and the final CSV row.

    Returns:
        ``model.genomic_weights``.

    NOTE(review): the nesting below was reconstructed from a flattened
    source; confirm against the original file.
    """
    test_index = 12
    # initialize the string which will eventually be a formatted data row in a CSV file
    data_string = testChrom[0:-1] + ", " + str(state_count) + ", " + str(test_num) + ", " + str(cross_fold)
    # initialize list which will be the list of observations in the training set
    observations = []
    # initialize observation count
    count = 0
    # initialize list which will represent the genome data for the observation sequence
    observation_genome_data = []
    # flag telling whether lines are currently being collected; it starts
    # as True, so collection is active until a "chr" line without
    # testChrom switches it off
    start = True
    # running totals of the A/T/C/G counts, turned into averages after the loop
    avgA = 0
    avgT = 0
    avgC = 0
    avgG = 0
    # open file with nucleosome vector sequences
    with open("yeast_vector_sequence_for_HMM.txt") as f:
        # for each line in the file
        for line in f:
            # if we've reached the beginning of the chromosome we're interested in
            if testChrom in line:
                # set flag indicating to start reading in the file
                start = True
            # if it's the right kind of line, and the start flag has been set to true, start processing
            if "K36me" not in line and start and "chr" not in line and ("0" in line or "1" in line) and "{" not in line:
                # increment the observation count
                count = count + 1
                # parse the comma-separated integers after column 52 as
                # the A/T/C/G counts for this nucleosome
                genomeData = line[52:]
                genomeData = genomeData.strip()
                genomeData = genomeData.split(",")
                genomeData = list(map(int, genomeData))
                # update the total A/T/C/G counts
                # this will be used to compute the average A/T/C/G counts after this loop
                avgA = avgA + genomeData[0]
                avgT = avgT + genomeData[1]
                avgC = avgC + genomeData[2]
                avgG = avgG + genomeData[3]
                # get portion of the line which has the nucleosome vector
                line = line[0:51]
                # add the genome data to the list of genome data entries
                observation_genome_data.append(genomeData)
                # add the observation to the list of observations
                observations.append(line.replace(",", ""))
            # if we've reached a header line of another chromosome
            if start and testChrom not in line and "chr" in line:# and "chr2" not in line:
                # stop collecting; note the loop itself keeps scanning the file
                start = False
        # close the file (redundant inside the with block)
        f.close()
    # compute the average A/T/C/G counts for this chromosome
    avgA = avgA / len(observations)
    avgT = avgT / len(observations)
    avgC = avgC / len(observations)
    avgG = avgG / len(observations)
    NUMBER_OF_FOLDS = 10
    chunks = get_chunks(observations, int(len(observations)/NUMBER_OF_FOLDS), NUMBER_OF_FOLDS)
    genome_data_chunks = get_chunks(observation_genome_data, int(len(observation_genome_data)/NUMBER_OF_FOLDS), NUMBER_OF_FOLDS)
    # the final test chunk is the one after the validation chunk, wrapping around
    final_test_index = cross_fold + 1
    if(final_test_index == len(chunks)):
        final_test_index = 0
    held_observations = chunks[cross_fold]
    held_observations_genome_data = genome_data_chunks[cross_fold]
    final_test_observations = chunks[final_test_index]
    final_test_observations_genome_data = genome_data_chunks[final_test_index]
    # rebuild the flat lists from every chunk except the final test chunk
    observations = []
    observation_genome_data = []
    for i in range(len(chunks)):
        if i != final_test_index:
            observations.extend(chunks[i])
            observation_genome_data.extend(genome_data_chunks[i])
    # S = the number of states
    S = state_count
    # if the random flag was set when setting up this experiment, generate
    # random initial matrices instead of using the premade ones
    if random:
        # S rows, each drawn from a Dirichlet distribution so it sums to 1
        INITIAL_T = np.random.dirichlet(np.ones(S), 1)
        for i in range(S - 1):
            INITIAL_T = np.vstack([INITIAL_T, np.random.dirichlet(np.ones(S), 1)])
        # 26 rows (1 + 25), each drawn from a Dirichlet distribution so it sums to 1
        INITIAL_E = np.random.dirichlet(np.ones(S), 1)
        for i in range(25):
            INITIAL_E = np.vstack([INITIAL_E, np.random.dirichlet(np.ones(S), 1)])
        # a single row drawn from the Dirichlet distribution
        INITIAL_T0 = np.random.dirichlet(np.ones(S), 1)
    # if we don't want random initial matrices, then we use the premade ones
    else:
        INITIAL_T, INITIAL_E, INITIAL_T0 = get_premade_matricies(S, cross_fold)
    representedHistones = {}
    representedHistones[12] = True
    representedHistones[8] = True
    representedHistones[3] = True
    representedHistones[17] = True
    # initialize the model, using the initial matrices generated above
    model = HiddenMarkovModel(INITIAL_T, INITIAL_E, INITIAL_T0, representedHistones, maxStep=50, epsilon = 0.001)
    model.final_test = final_test_observations
    model.final_test_data = final_test_observations_genome_data
    # record the time the run starts
    start_time = datetime.datetime.now()
    # learn the genomic weights on every chunk that is neither the final
    # test chunk nor the validation chunk
    for c in range(len(chunks)):
        if c != final_test_index and c != cross_fold:
            chunk = chunks[c]
            data_chunk = genome_data_chunks[c]
            print("For sequence " + str(c) + " with test sequence " + str(final_test_index)+ " and with validation sequence " + str(cross_fold))
            model.learn_genomic_weights(chunk, data_chunk, avgA, avgT, avgC, avgG, held_observations, held_observations_genome_data, test_index)
    # print the learned genomic weights
    torch.set_printoptions(profile="full")
    print("Best Genomic Weights")
    print(model.genomic_weights)
    # calculate and print how much time it took to run
    total_time = datetime.datetime.now() - start_time
    print("total time: " + str(total_time))
    # add the total time and the last entry of model.scale_plot to the formatted csv row
    data_string = data_string + ", " + str(total_time) + ", " + str(model.scale_plot[-1].detach().numpy())
    # create a list representing the lengths of the various test sequences I want
    test_sequence_lengths = [5, 10, 20, 100, 0] # 0 for all
    # for each of these lengths
    for length in test_sequence_lengths:
        # initialize an empty test sequence
        test_seq = []
        # length 0 stands for the whole final test chunk
        if length == 0:
            test_seq = final_test_observations
            test_genome_data = final_test_observations_genome_data
        else:
            # otherwise use the first `length` entries of the final test chunk
            test_seq = final_test_observations[0:length]
            test_genome_data = final_test_observations_genome_data[0:length]
        # print the length we're using
        print("test seq length: " + str(len(test_seq)))
        # get the likelihood of the test sequence, given the relevant genome data
        held_back_likelihood = model.get_likelihood_of_seq(test_seq, test_genome_data)
        # print the likelihood
        print("held_back_likelihood")
        print(held_back_likelihood)
        # add the likelihood to the formatted string
        data_string = data_string + ", " + str(held_back_likelihood.detach())
    # use a very similar loop to generate some stats for these predictions
    # this very similar loop is separate because of how I like to format the data
    # NOTE(review): the loop below only rebinds test_seq / test_genome_data
    # and produces no output
    for length in test_sequence_lengths:
        # initialize an empty test sequence
        test_seq = []
        # length 0 stands for the whole final test chunk
        if length == 0:
            test_seq = final_test_observations
            test_genome_data = final_test_observations_genome_data
        else:
            # otherwise use the first `length` entries of the final test chunk
            test_seq = final_test_observations[0:length]
            test_genome_data = final_test_observations_genome_data[0:length]
    # print the final data string
    data_string = data_string + "\n"
    print(data_string)
    # if it's made it this far, then the experiment was successful
    return model.genomic_weights
def save_word_embeddings(directory, file, embeddings_index):
    """Write ``embeddings_index`` to ``directory/file`` as plain text.

    Each entry becomes one line: the word, a space, then the ``str`` of
    every vector component separated by single spaces. An existing file is
    overwritten.
    """
    # ``with`` closes the file even if formatting an entry raises.
    with open(os.path.join(directory, file), 'w') as f:
        for word, vec in embeddings_index.items():
            f.write(word + ' ' + ' '.join(str(x) for x in vec) + '\n')
def memory_write(self, memory_new):
    """Pickle ``memory_new`` into ``self.file_name``, replacing its content."""
    # ``with`` closes the file even if pickling raises.
    with open(self.file_name, 'wb') as F:
        pickle.dump(memory_new, F)
def Dict2File(Dict, filename):
    """Append ``str(Dict)`` to ``filename``, creating the file if needed.

    No separator or newline is written after the text.
    """
    # Plain append mode is enough (nothing is read back); ``with`` closes
    # the file even if the write raises.
    with open(filename, 'a') as F:
        F.write(str(Dict))