def supplement_from_additional_preds(additional_preds_file, min_num_high_conf, high_confidence_ecs,
                                     high_and_low_confidence_ecs, all_ec_to_gene):
    ecs_supplemented = set()
    start_parsing = False
    with open(additional_preds_file) as reader:
        for line in reader:
            line = line.strip()
            if line == "":
                continue
            if not start_parsing:
                if line.startswith("Protein_name"):
                    start_parsing = True
                continue
            split = line.split("\t")
            protein, ec = "\t".join(split[0:-6]), split[-6]
            conf_to_tool, tool_to_conf = get_conf_to_tool(split[-5:])
            utils.add_to_dict(all_ec_to_gene, ec, protein)
            if min_num_high_conf > 0:
                if ("2" not in conf_to_tool) or (len(conf_to_tool["2"]) < min_num_high_conf):
                    continue
            else:
                # Take PRIAM high-confidence predictions by default.
                if tool_to_conf["PRIAM"] != "2":
                    continue
            if ec not in high_confidence_ecs:
                ecs_supplemented.add(ec)
            high_confidence_ecs.add(ec)
            high_and_low_confidence_ecs.add(ec)
    return high_confidence_ecs, high_and_low_confidence_ecs, ecs_supplemented, all_ec_to_gene
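# Illustrative note (inferred from the parsing above, not from a documented spec): the
# additional-predictions file is expected to start with a header line beginning with
# "Protein_name", followed by tab-separated rows whose last six columns are an EC number
# and then one confidence level per tool in the order CatFam, DETECT, EFICAz, EnzDP, PRIAM:
#   Protein_name<TAB>EC<TAB>CatFam<TAB>DETECT<TAB>EFICAz<TAB>EnzDP<TAB>PRIAM
#   geneA<TAB>1.1.1.1<TAB>0<TAB>2<TAB>1<TAB>2<TAB>2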
def format_rxn_to_gene_for_later(fasta_file, input_rxn_to_gene_file, output_rxn_to_gene_file):
    complete_seq_names = set()
    with open(fasta_file) as open_file:
        for line in open_file:
            line = line.strip()
            if (line == "") or (line[0] != ">"):
                continue
            complete_name = line[1:]
            complete_seq_names.add(complete_name)
    rxn_to_gene = {}
    with open(input_rxn_to_gene_file) as open_file:
        for line in open_file:
            line = line.strip()
            if (line == "") or (line[0] == "#"):
                continue
            split = line.split()
            rxn = split[0]
            genes = split[1].split(";")
            for gene in genes:
                complete_gene, _ = utils.get_seq_name(complete_seq_names, gene)
                utils.add_to_dict(rxn_to_gene, rxn, complete_gene)
    with open(output_rxn_to_gene_file, "w") as writer:
        for rxn, genes in rxn_to_gene.items():
            for gene in genes:
                writer.write(rxn + "\t" + gene + "\n")
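# Illustrative note (inferred from the code above): the input reaction-to-gene file is
# whitespace-delimited with "#" comment lines, one reaction per line followed by a
# semicolon-separated gene list, e.g.
#   RXN-123   geneA;geneB
# Each short gene name is mapped back to its full FASTA header via the project's
# utils.get_seq_name helper, and the output is one tab-separated "rxn<TAB>gene" pair per line.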
def get_conf_to_tool(list_of_confs):
    conf_to_tool, tool_to_conf = {}, {}
    for tool, conf_level in zip(["CatFam", "DETECT", "EFICAz", "EnzDP", "PRIAM"], list_of_confs):
        tool_to_conf[tool] = conf_level
        utils.add_to_dict(conf_to_tool, conf_level, tool)
    return conf_to_tool, tool_to_conf
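# Worked example (assuming utils.add_to_dict(d, key, value) collects values in a set):
#   conf_to_tool, tool_to_conf = get_conf_to_tool(["0", "2", "1", "2", "2"])
# yields
#   conf_to_tool == {"0": {"CatFam"}, "2": {"DETECT", "EnzDP", "PRIAM"}, "1": {"EFICAz"}}
#   tool_to_conf == {"CatFam": "0", "DETECT": "2", "EFICAz": "1", "EnzDP": "2", "PRIAM": "2"}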
def get_map_from_file(file_name, first_elem_is_key=True, need_values_in_set=True):
    key_value = {}
    with open(file_name) as infile:
        for line in infile:
            line = line.strip()
            if line == "":
                continue
            split = line.split("\t")
            if first_elem_is_key:
                key, value = split[0], split[1]
            else:
                key, value = split[1], split[0]
            utils.add_to_dict(key_value, key, value, need_values_in_set)
    return key_value
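# Usage sketch (hypothetical file name and contents): for a two-column, tab-separated file
# "ec_to_gene.tsv" with lines like "1.1.1.1<TAB>geneA",
#   get_map_from_file("ec_to_gene.tsv")                           # {"1.1.1.1": {"geneA"}, ...}
#   get_map_from_file("ec_to_gene.tsv", first_elem_is_key=False)  # {"geneA": {"1.1.1.1"}, ...}
# assuming utils.add_to_dict stores values in a set when need_values_in_set is True.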
def load_best_tools(training_data, method_arguments):
    ec_to_best_tools = {}
    if method_arguments == "all":
        keywords_of_int = ["High_confidence", "Low_confidence"]
    else:
        keywords_of_int = ["High_confidence"]
    with open(training_data) as infile:
        for line in infile:
            line = line.strip()
            if line == "":
                continue
            split = line.split()
            ec, tool, keyword = split[0], split[1], split[2]
            if keyword in keywords_of_int:
                utils.add_to_dict(ec_to_best_tools, ec, tool)
    return ec_to_best_tools
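# Illustrative note (inferred from the parsing above): training_data is expected to hold
# whitespace-separated lines of the form "EC tool keyword", e.g.
#   1.1.1.1  PRIAM  High_confidence
# Only rows whose keyword is High_confidence (or, when method_arguments == "all",
# Low_confidence as well) contribute to ec_to_best_tools.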
def read_and_split_conf_preds(ec_preds_file, high_cutoff, low_cutoff):
    high_conf_ecs = set()
    high_and_low_conf_ecs = set()
    all_ec_to_gene = {}
    low_ec_to_score_to_gene = {}
    with open(ec_preds_file) as infile:
        for line in infile:
            line = line.strip()
            if line == "":
                continue
            split = line.split("\t")
            if utils.is_num(split[-1]):
                ec, score = split[0], float(split[-1])
                gene = "\t".join(split[1:-1])
            else:
                ec, score = split[0], float(split[-2])
                gene = "\t".join(split[1:-2])
            if score > high_cutoff:
                high_conf_ecs.add(ec)
                utils.add_to_dict(all_ec_to_gene, ec, gene)
            if score > low_cutoff:
                high_and_low_conf_ecs.add(ec)
                utils.add_to_dict_key_score_value(low_ec_to_score_to_gene, ec, score, gene)
    # For the low-confidence predictions, only retain the genes predicting an EC with the highest score.
    for ec, score_to_gene in low_ec_to_score_to_gene.items():
        if ec in high_conf_ecs:
            continue
        max_score = max(score_to_gene.keys())
        for gene in score_to_gene[max_score]:
            utils.add_to_dict(all_ec_to_gene, ec, gene)
    return high_conf_ecs, high_and_low_conf_ecs, all_ec_to_gene
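# Worked example (inferred from the column handling above): for the tab-separated row
#   1.1.1.1<TAB>geneA<TAB>0.97
# the last field is numeric, so score = 0.97 and gene = "geneA"; for
#   1.1.1.1<TAB>geneA<TAB>0.97<TAB>some_trailing_label
# the last field is not numeric, so the score is read from the second-to-last column instead.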
# Relies on module-level imports/constants defined elsewhere in this script:
# sys, math.log, utils, get_parser, and the WORD_LABEL/SMALL_LABEL/LARGE_LABEL factor tags.
def main():
    ########## 0. parse command-line argument ##########
    # training file name only (required)
    parser = get_parser()
    args = vars(parser.parse_args())
    training_filename = args['training_file']

    ########## 1. get the unigram and bigram counts ##########
    ## need unigrams and bigrams for words and clusters
    # dictionaries to store the counts
    # format: {word0:{word1:count}}
    unigrams = {}
    small_clusters = {}
    large_clusters = {}
    bigrams_ww = {}
    bigrams_sw = {}
    bigrams_lw = {}
    # also get word factor files
    # convert from word to small cluster and small cluster to large cluster
    word_to_small = {}
    small_to_large = {}
    # will need total word count for unigram probs
    total_word_count = 0

    ## the ngram counts come from the training file
    with open(training_filename, 'r') as training_file:
        # read through line by line (one sentence per line)
        for line in training_file:
            # split the line into words (with clusters still attached)
            line_words = line.strip().split(' ')
            ## loop through the words in the sentence
            for index, word in enumerate(line_words):
                # increment total word count
                total_word_count += 1
                # get the word and its parts
                word2 = utils.get_part(word, WORD_LABEL)
                small2 = utils.get_part(word, SMALL_LABEL)
                large2 = utils.get_part(word, LARGE_LABEL)
                # add to mappings of words and factors
                utils.add_to_dict(word2, small2, word_to_small)
                utils.add_to_dict(small2, large2, small_to_large)
                # add to unigram count dictionaries
                utils.add_uni_counts(word2, unigrams)
                utils.add_uni_counts(small2, small_clusters)
                utils.add_uni_counts(large2, large_clusters)
                # if it is the second or later word, get prev word cluster
                # for first word, just consider unigrams (TO DO should be bigram with <s> first??)
                if index > 0:
                    word1 = utils.get_part(line_words[index-1], WORD_LABEL)
                    small1 = utils.get_part(line_words[index-1], SMALL_LABEL)
                    large1 = utils.get_part(line_words[index-1], LARGE_LABEL)
                    # add to bigram dictionaries
                    utils.add_bi_counts(word1, word2, bigrams_ww)
                    utils.add_bi_counts(small1, word2, bigrams_sw)
                    utils.add_bi_counts(large1, word2, bigrams_lw)
    sys.stderr.write('Finished getting ngram count dictionaries\n')

    ########## 3. calculate backoff probabilities for each ngram ##########
    ## get counts of counts for use in discounting
    count_unigrams = utils.get_counts_uni(unigrams)
    count_ww = utils.get_counts_bi(bigrams_ww)
    count_sw = utils.get_counts_bi(bigrams_sw)
    count_lw = utils.get_counts_bi(bigrams_lw)
    # will need vocab size for unk probs
    vocab_size = len(unigrams)
    sys.stderr.write('Finished getting counts of counts\n')

    ## get discounts based on simple Good-Turing
    # TO DO later make this more robust so I can use other smoothing methods
    # note Good-Turing depends on the counts, not on the ngram itself
    disc_uni = utils.calc_discount(count_unigrams)
    disc_ww = utils.calc_discount(count_ww)
    disc_sw = utils.calc_discount(count_sw)
    disc_lw = utils.calc_discount(count_lw)

    ## calculate log probability of each unigram and bigram
    # dictionaries to store probabilities
    prob_unigrams = utils.probs_uni(unigrams, total_word_count, disc_uni)
    prob_ww = utils.probs_bi(bigrams_ww, unigrams, disc_ww)
    prob_sw = utils.probs_bi(bigrams_sw, small_clusters, disc_sw)
    prob_lw = utils.probs_bi(bigrams_lw, large_clusters, disc_lw)
    # TO DO where to store unk? for now just make it a variable
    # unknowns (GT estimate): count(words appearing once) / |V| and store in variable
    prob_unk = log(count_unigrams[1], 10) - log(vocab_size, 10)
    sys.stderr.write('Finished getting probability dictionaries\n')

    ########## 4. calculate backoff (alpha) of each backoff step ##########
    # backoff from word to small cluster
    backoff_ws = utils.calc_backoff_bi(word_to_small, prob_ww, prob_sw)
    sys.stderr.write('Finished getting w2s backoff dictionary\n')
    # backoff from small cluster to large cluster
    backoff_sl = utils.calc_backoff_bi(small_to_large, prob_sw, prob_lw)
    sys.stderr.write('Finished getting s2l backoff dictionary\n')
    ## TO DO some of these (and w2s) are > 1 which shouldn't happen!
    # backoff from large cluster to unigram (ignore previous word altogether)
    backoff_l = utils.calc_backoff_uni(prob_lw, prob_unigrams)
    #### TO DO Something is wrong here because almost all are -1000!
    sys.stderr.write('Finished getting l2u backoff dictionary\n')
    sys.stderr.write('Finished getting backoff factor dictionaries\n')

    ########## 5. print probs and alphas to stdout ##########
    ## probabilities
    # start with unknown prob
    sys.stdout.write('\\unks:\n')
    sys.stdout.write(str(prob_unk) + '\t<unk>\n')
    # unigram probs
    sys.stdout.write('\\1-grams:\n')
    for unigram in prob_unigrams:
        sys.stdout.write(str(prob_unigrams[unigram]) + '\t' + unigram + '\n')
    # lw bigram probs
    sys.stdout.write('\\2-grams lw:\n')
    for large_cluster in prob_lw:
        for word in prob_lw[large_cluster]:
            sys.stdout.write(str(prob_lw[large_cluster][word]) + '\t' + large_cluster + ' ' + word + '\n')
    # sw bigram probs
    sys.stdout.write('\\2-grams sw:\n')
    for small_cluster in prob_sw:
        for word in prob_sw[small_cluster]:
            sys.stdout.write(str(prob_sw[small_cluster][word]) + '\t' + small_cluster + ' ' + word + '\n')
    # ww bigram probs
    sys.stdout.write('\\2-grams ww:\n')
    for prev_word in prob_ww:
        for word in prob_ww[prev_word]:
            sys.stdout.write(str(prob_ww[prev_word][word]) + '\t' + prev_word + ' ' + word + '\n')

    ## backoff weights
    # back off from lw to unigram
    sys.stdout.write('\\backoff l to unigram:\n')
    for cluster in backoff_l:
        sys.stdout.write(str(backoff_l[cluster]) + '\t' + cluster + '\n')
    # backoff from sw to lw
    sys.stdout.write('\\backoff s to l:\n')
    for cluster in backoff_sl:
        sys.stdout.write(str(backoff_sl[cluster]) + '\t' + cluster + '\n')
    # backoff from ww to sw
    sys.stdout.write('\\backoff w to s:\n')
    for cluster in backoff_ws:
        sys.stdout.write(str(backoff_ws[cluster]) + '\t' + cluster + '\n')
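# Output sketch (values and cluster names are illustrative only): main() writes a loosely
# ARPA-style listing to stdout, one "\<section>:" header per block followed by
# "log10_prob<TAB>ngram" or "backoff_weight<TAB>history" lines, e.g.
#   \unks:
#   -4.2	<unk>
#   \1-grams:
#   -2.31	the
#   \2-grams lw:
#   -1.05	C42 the
#   \backoff w to s:
#   -0.12	house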
def build_network(input_batch, num_samples, latent_units, hidden_units_q, hidden_units_p,
                  bias=None, data_type='binary'):
    input_tensor = tf.tile(input_batch, [num_samples, 1], name='tiled_input')

    # encoder
    layers_q = []
    samples_q = []
    input_cur = input_tensor
    #input_tensor = tf.Print(input_tensor, [input_tensor], message='input_tensor', summarize=785)
    samples_q.append(input_tensor)
    layer_iter = 1
    params = {}
    for hidden_units_cur, latent_units_cur in zip(hidden_units_q, latent_units):
        # build the dense hidden layers for this stochastic unit
        dense, variables = build_dense_layers(input_cur, hidden_units_cur,
                                              activation_function=tf.nn.tanh,
                                              layer_name='q_det_unit_' + str(layer_iter) + '_')
        # add variables to the params dict
        utils.add_to_dict(params, variables)
        # build the stochastic layer
        layer_q = GaussianStochLayer.build_stochastic_layer(dense, latent_units_cur,
                                                            layer_name='q_stoch_layer_' + str(layer_iter) + '_')
        utils.add_to_dict(params, layer_q.params)
        layers_q.append(layer_q)
        input_cur = layers_q[-1].get_samples()
        #input_cur = tf.Print(input_cur, [input_cur], message='samples for layer ' + str(layer_iter), summarize=100)
        samples_q.append(input_cur)
        layer_iter += 1

    # decoder
    layers_p = []
    layer_iter = 1
    rev_samples_q = list(reversed(samples_q))[:-1]
    rev_latent_units = list(reversed(latent_units))[1:]
    for hidden_units_cur, latent_units_cur, input_cur in zip(hidden_units_p[:-1], rev_latent_units, rev_samples_q[:-1]):
        # build the dense hidden layers for this stochastic unit
        dense, variables = build_dense_layers(input_cur, hidden_units_cur,
                                              activation_function=tf.nn.tanh,
                                              layer_name='p_det_unit_' + str(layer_iter) + '_')
        # add variables to the params dict
        utils.add_to_dict(params, variables)
        # build the stochastic layer
        layer_p = GaussianStochLayer.build_stochastic_layer(dense, latent_units_cur,
                                                            layer_name='p_stoch_layer_' + str(layer_iter) + '_')
        utils.add_to_dict(params, layer_p.params)
        layers_p.append(layer_p)
        layer_iter += 1

    # build the last dense layer for the decoder
    dense, variables = build_dense_layers(rev_samples_q[-1], hidden_units_p[-1],
                                          activation_function=tf.nn.tanh,
                                          layer_name='p_det_unit_' + str(layer_iter) + '_')
    # add variables to the params dict
    utils.add_to_dict(params, variables)
    # build the last stochastic layer
    if data_type == 'binary':
        layer_p = BernoulliStochLayer.build_stochastic_layer(dense, input_tensor.shape[1],
                                                             layer_name='p_stoch_layer_' + str(layer_iter) + '_',
                                                             mean_bias=bias)
        utils.add_to_dict(params, layer_p.params)
        layers_p.append(layer_p)
    elif data_type == 'continuous':
        layer_p = GaussianStochLayer.build_stochastic_layer(dense, input_tensor.shape[1],
                                                            layer_name='p_stoch_layer_' + str(layer_iter) + '_',
                                                            mean_bias=bias)
        utils.add_to_dict(params, layer_p.params)
        layers_p.append(layer_p)

    prior = UnitGaussianLayer(layers_q[-1].mean_layer.shape)
    return Network(layers_q, layers_p, samples_q, prior, num_samples, params)
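# Hypothetical usage sketch (TF1-style graph; the placeholder shape and layer sizes are
# illustrative, and build_dense_layers/GaussianStochLayer/BernoulliStochLayer come from the
# surrounding project):
#   x = tf.placeholder(tf.float32, shape=[None, 784])
#   net = build_network(x, num_samples=5,
#                       latent_units=[100, 50],
#                       hidden_units_q=[[200, 200], [100, 100]],
#                       hidden_units_p=[[100, 100], [200, 200]],
#                       data_type='binary')
# From the loop structure above, hidden_units_p appears to need one entry per latent layer,
# with its last entry sizing the final deterministic block that feeds the output layer.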