def main():
    """Train the partial-data tagger, write Viterbi predictions for the
    test sentences, and report precision/recall/F1 on both splits."""
    path = Path('../data/partial')
    (train_X, train_X_str, train_Y, train_Y_str,
     test_X, test_X_str, test_Y, test_Y_str,
     word2index, tag2index, index2tag) = get_xy(path)

    print('************Training Set Summary*************')
    T, V = len(tag2index), len(word2index)
    print('Number of tags: {}, Number of words: {}'.format(T, V))

    print('************Train*************')
    t_start = time.time()
    emission_weight = get_emission_weight(train_X, train_Y, word2index,
                                          tag2index)
    transition_weight = get_transition_weight(train_Y, tag2index)
    t_end = time.time()
    time_elapsed(t_start, t_end)

    print('************Saving Model Outputs*************')
    viterbi_output(path / 'dev.p2.out', test_X_str, test_X, tag2index,
                   emission_weight, transition_weight, link_weight_sum)

    # Score the trained weights on the training split, then the test split.
    prec, rec, f1 = eval(train_X, train_Y_str, tag2index, emission_weight,
                         transition_weight, link_weight_sum)
    print('precision, recall, f1 on training set: {0} {1} {2}'.format(
        prec, rec, f1))
    prec, rec, f1 = eval(test_X, test_Y_str, tag2index, emission_weight,
                         transition_weight, link_weight_sum)
    print('precision, recall, f1 on test set: {0} {1} {2}'.format(
        prec, rec, f1))
def UniProt(gene_list, gene_name_list, get_info=True):
    """
    This procedure searches UniProt human polymorphism and disease
    mutations (downloaded to computer as humsavar.txt) for information
    about disease, recording each disease found on the matching gene.

    Parameter gene_list: a list of genes
    Preconditon: gene_list is a list of genes

    Parameter gene_name_list: a list of gene names, parallel to gene_list
    Preconditon: gene_name_list is a list of Gene names [str]

    Parameter get_info: a boolean that says whether UniProt should search
    the web for dbSNP information
    Preconditon: get_info is a bool
    """
    if not get_info:
        print("Searching UniProt. This takes a few seconds.")
    else:
        print("Searching UniProt for mutation info. This takes a few minutes.")
    filename = helper.edit_filename(
        "humsavar.txt", "sibling", foldername='databases')
    with open(filename, "r") as g:
        data = g.read()
    start_time = time.time()
    i = 0  # counts diseases with full info; used to pace web requests
    for gene_name in gene_name_list:
        if '\n' + gene_name + " " in data:
            row_list = re.findall('\n' + gene_name + r" .+", data)
            helper.time_elapsed(start_time)
            # The index lookup does not depend on the row, so do it once
            # per gene instead of once per matching row.
            ind = gene_name_list.index(gene_name)
            gene = gene_list[ind]
            for row in row_list:
                if "Disease" in row:
                    disease_name = re.search(r"(rs\w+|-)\s+(.+)", row).group(2)
                    gene.set_disease(disease_name, gene_name, "UniProt",
                                     get_info)
                    if get_info:
                        disease = gene.disease_list()[-1]
                        dbSNP = re.search(r"(rs\w+|-)", row).group(1)
                        # Escape the dot: the pattern targets the literal
                        # "p." amino-acid-change prefix (e.g. p.Val600Glu);
                        # an unescaped '.' would match any character.
                        aa_change = re.search(r"(p\.\w+)", row).group(0)
                        disease.set_amino_change(aa_change)
                        disease.set_SNP(dbSNP)
                        disease.get_SNP_info()
                        if disease.has_full_info():
                            i = i + 1
                        if i == 6:
                            # Pause periodically so rapid-fire SNP lookups
                            # don't get throttled or blocked.
                            i = 0
                            time.sleep(random.uniform(0.5, 1.25))
    helper.time_elapsed(start_time)
    time.sleep(random.uniform(0.5, 1.25))
    print("\nUniProt database complete")
def main():
    """Train the full-data tagger with the decaying-rate perceptron
    (trainDecay), then report precision/recall/F1 on the training,
    development and test splits."""
    path = Path('../data/full')
    (train_X, train_X_str, train_Y, train_Y_str,
     dev_X, dev_X_str, dev_Y, dev_Y_str,
     test_X, test_X_str, test_Y, test_Y_str,
     word2index, tag2index, postag2index, index2tag) = get_xy(path)

    print('************Training Set Summary*************')
    T, V, POS = len(tag2index), len(word2index), len(postag2index)
    print('Number of tags: {}, Number of words: {}, Number of pos tags: {}'.
          format(T, V, POS))

    print('************Train*************')
    t_start = time.time()
    (optimal_transition_weight, optimal_emission_weight,
     optimal_emission_weight_pos, optimal_combination_weight,
     optimal_combination_weight_pos) = trainDecay(
        train_X, train_Y, dev_X, dev_Y_str, tag2index, word2index,
        postag2index, link_weight_sum, iteration=20, random_seed=1)
    t_end = time.time()
    time_elapsed(t_start, t_end)

    # NOTE: saving the learned weights to .npy files is currently disabled.

    print('************Evaluation*************')
    prec, rec, f1 = eval(train_X, train_Y_str, tag2index,
                         optimal_emission_weight, optimal_transition_weight,
                         optimal_emission_weight_pos,
                         optimal_combination_weight,
                         optimal_combination_weight_pos, link_weight_sum)
    print('precision, recall, f1 on training set: {0} {1} {2}'.format(
        prec, rec, f1))
    prec, rec, f1 = eval(dev_X, dev_Y_str, tag2index,
                         optimal_emission_weight, optimal_transition_weight,
                         optimal_emission_weight_pos,
                         optimal_combination_weight,
                         optimal_combination_weight_pos, link_weight_sum)
    print('precision, recall, f1 on development set: {0} {1} {2}'.format(
        prec, rec, f1))
    prec, rec, f1 = eval(test_X, test_Y_str, tag2index,
                         optimal_emission_weight, optimal_transition_weight,
                         optimal_emission_weight_pos,
                         optimal_combination_weight,
                         optimal_combination_weight_pos, link_weight_sum)
    print('precision, recall, f1 on test set: {0} {1} {2}'.format(
        prec, rec, f1))
def main():
    """Train a CRF on the full data set with L-BFGS, save the learned
    weight matrices, write Viterbi predictions, and report
    precision/recall/F1 on the training and test splits."""
    path = Path('../data/full')
    (train_X, train_X_str, train_Y, train_Y_str,
     test_X, test_X_str, test_Y, test_Y_str,
     word2index, tag2index, postag2index, index2tag) = get_xy(path)

    print('************Training Set Summary*************')
    T, V, POS = len(tag2index), len(word2index), len(postag2index)
    print('Number of tags: {}, Number of words: {}, Number of POS tags: {}'.
          format(T, V, POS))

    Lambda = 0.1  # L2 regularization strength
    n_trans = (T + 1) * (T + 1)  # size of the flattened transition matrix

    def _split(w):
        """Reshape the flat parameter vector into (transition, emission,
        emission_pos) weight matrices."""
        transition = w[:n_trans].reshape((T + 1, T + 1))
        emission = w[n_trans:n_trans + T * V].reshape((T, V))
        emission_pos = w[n_trans + T * V:].reshape((T, POS))
        return transition, emission, emission_pos

    def callbackF(w):
        # Called by the optimizer once per iteration: print the data loss
        # and the L2 penalty separately.
        loss = get_loss_grad(w)[0]
        transition_weight, emission_weight, emission_weight_pos = _split(w)
        loss_l2 = LossRegularization(emission_weight, transition_weight,
                                     emission_weight_pos, param=Lambda)
        print('Loss:{:.4f} L2 Loss:{:.4f}'.format(loss, loss_l2))

    def get_loss_grad(w):
        # Objective for fmin_l_bfgs_b: returns (loss, flattened gradient).
        with HiddenPrints():
            transition_weight, emission_weight, emission_weight_pos = \
                _split(w)
            loss = Loss(train_X, train_Y, tag2index, emission_weight,
                        transition_weight, emission_weight_pos, param=Lambda)
            grads_transition = GradientTransition(
                train_X, train_Y, tag2index, emission_weight,
                transition_weight, emission_weight_pos, param=Lambda)
            grads_emission = GradientEmission(
                train_X, train_Y, tag2index, word2index, emission_weight,
                transition_weight, emission_weight_pos, param=Lambda)
            grads_emission_pos = GradientEmissionPOS(
                train_X, train_Y, tag2index, postag2index, emission_weight,
                transition_weight, emission_weight_pos, param=Lambda)
            grads = np.concatenate((grads_transition.reshape(-1),
                                    grads_emission.reshape(-1),
                                    grads_emission_pos.reshape(-1)))
        return loss, grads

    print('************Train*************')
    t_start = time.time()
    init_w = np.zeros((n_trans + T * (V + POS), ))
    optimal_weight, final_loss, result_dict = fmin_l_bfgs_b(
        get_loss_grad, init_w, pgtol=0.01, callback=callbackF)
    t_end = time.time()
    time_elapsed(t_start, t_end)

    print('************Saving Model Parameters*************')
    (optimal_transition_weight, optimal_emission_weight,
     optimal_emission_weight_pos) = _split(optimal_weight)
    np.save(path / 'best_weight_features2_transition.npy',
            optimal_transition_weight)
    np.save(path / 'best_weight_features2_emission.npy',
            optimal_emission_weight)
    np.save(path / 'best_weight_features2_emission_pos.npy',
            optimal_emission_weight_pos)

    print('************Saving Model Outputs*************')
    viterbi_output(path / 'dev.p5.CRF.f3.out', test_X_str, test_X, tag2index,
                   optimal_emission_weight, optimal_transition_weight,
                   optimal_emission_weight_pos, link_weight_sum)

    print('************Evaluation*************')
    prec, rec, f1 = eval(train_X, train_Y_str, tag2index,
                         optimal_emission_weight, optimal_transition_weight,
                         optimal_emission_weight_pos, link_weight_sum)
    print('precision, recall, f1 on training set: {0} {1} {2}'.format(
        prec, rec, f1))
    prec, rec, f1 = eval(test_X, test_Y_str, tag2index,
                         optimal_emission_weight, optimal_transition_weight,
                         optimal_emission_weight_pos, link_weight_sum)
    print('precision, recall, f1 on test set: {0} {1} {2}'.format(
        prec, rec, f1))
def NLM(gene_list, gene_name_list, gene_name_set):
    """
    This procedure searches the GHR NLM database of genes and finds all
    of the genes that are in a list of genes. The database can be found
    here: https://ghr.nlm.nih.gov/gene

    Parameter gene_list: a list of genes
    Preconditon: gene_list is a list of genes

    Parameter gene_name_list: a list of gene names, parallel to gene_list
    Preconditon: gene_name_list is a list of Gene names [str]

    Parameter gene_name_set: a set of gene names (used for searching)
    Preconditon: gene_name_set is a set of gene names
    """
    url1 = "https://ghr.nlm.nih.gov/gene?initial="
    alphabet = list(ascii_lowercase)  # list of letters
    ua = UserAgent()  # Generates a random browser-like User-Agent so the
    # website won't block the access.
    this_list = []
    print("Searching NLM website. This takes a few seconds.")
    start_time = time.time()
    for let in alphabet:
        headers = {"User-Agent": ua.random}  # uses the generated agent
        request = Request(url1 + let, headers=headers)  # request website
        response = urlopen(request)  # open the website
        respData = response.read()  # read contents on the website
        response.close()  # close the website
        page_soup = soup(respData, "html.parser")  # get HTML from website
        raw_info = page_soup.findAll("ul",
                                     {"class": "browse-results"})[0].text
        # Raw string so \d and \w are regex classes, not string escapes.
        this_list = this_list + \
            re.findall("(" + let.upper() + r"[A-Z\d]+): \w", raw_info)
        helper.time_elapsed(start_time)
    # Loop-invariant base URL: build it once, not per gene.
    url2 = "https://ghr.nlm.nih.gov/gene/"
    for gene_name in this_list:
        helper.time_elapsed(start_time)
        if gene_name in gene_name_set:
            try:
                ind = gene_name_list.index(gene_name)
                gene = gene_list[ind]
                headers = {"User-Agent": ua.random}
                request = Request(url2 + gene_name + "#conditions",
                                  headers=headers)
                response = urlopen(request)
                respData = response.read()
                response.close()
                page_soup = soup(respData, "html.parser")
                dis1 = re.findall(
                    r"Health Conditions Related to Genetic Changes\s+"
                    r"(([\w+,-]+ )+)",
                    page_soup.text)
                disease = dis1[0][0]
                gene.set_disease(disease, gene_name, "NLM")
                diseases = re.findall(
                    r"More About This Health Condition\s+(([\w+,-]+ )+)",
                    page_soup.text)[0:-1]
                for disease in diseases:
                    gene.set_disease(disease[0], gene_name, "NLM")
            except IndexError:
                # Gene page exists but has no related-conditions section;
                # deliberately skip it (best-effort scrape).
                pass
    print("\nNLM database complete.")