Example no. 1
0
def supplement_from_additional_preds(additional_preds_file, min_num_high_conf,
                                     high_confidence_ecs,
                                     high_and_low_confidence_ecs,
                                     all_ec_to_gene):

    ecs_supplemented = set()
    start_parsing = False
    with open(additional_preds_file) as reader:
        for line in reader:
            line = line.strip()
            if line == "":
                continue
            if not start_parsing:
                if line.startswith("Protein_name"):
                    start_parsing = True
                continue

            split = line.split("\t")
            protein, ec = "\t".join(split[0:-6]), split[-6]
            conf_to_tool, tool_to_conf = get_conf_to_tool(split[-5:])
            utils.add_to_dict(all_ec_to_gene, ec, protein)
            if min_num_high_conf > 0:
                if ("2" not in conf_to_tool) or (len(conf_to_tool["2"]) <
                                                 min_num_high_conf):
                    continue
            else:  # Take PRIAM high-confidence predictions by default.
                if tool_to_conf["PRIAM"] != "2":
                    continue
            if ec not in high_confidence_ecs:
                ecs_supplemented.add(ec)
                high_confidence_ecs.add(ec)
                high_and_low_confidence_ecs.add(ec)
    return high_confidence_ecs, high_and_low_confidence_ecs, ecs_supplemented, all_ec_to_gene
Example no. 2
0
def format_rxn_to_gene_for_later(fasta_file, input_rxn_to_gene_file,
                                 output_rxn_to_gene_file):

    complete_seq_names = set()
    with open(fasta_file) as open_file:
        for line in open_file:
            line = line.strip()
            if (line == "") or (line[0] != ">"):
                continue
            complete_name = line[1:]
            complete_seq_names.add(complete_name)

    rxn_to_gene = {}
    with open(input_rxn_to_gene_file) as open_file:
        for line in open_file:
            line = line.strip()
            if (line == "") or (line[0] == "#"):
                continue
            split = line.split()
            rxn = split[0]
            genes = split[1].split(";")
            for gene in genes:
                complete_gene, _ = utils.get_seq_name(complete_seq_names, gene)
                utils.add_to_dict(rxn_to_gene, rxn, complete_gene)

    with open(output_rxn_to_gene_file, "w") as writer:
        for rxn, genes in rxn_to_gene.items():
            for gene in genes:
                writer.write(rxn + "\t" + gene + "\n")
Example no. 3
0
def get_conf_to_tool(list_of_confs):

    conf_to_tool, tool_to_conf = {}, {}
    for tool, conf_level in zip(
        ["CatFam", "DETECT", "EFICAz", "EnzDP", "PRIAM"], list_of_confs):
        tool_to_conf[tool] = conf_level
        utils.add_to_dict(conf_to_tool, conf_level, tool)
    return conf_to_tool, tool_to_conf
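None of the examples include the project-specific utils.add_to_dict they call, and its signature differs between projects (the dictionary comes first in Examples 1-6, last in Example 7, and Example 8 passes whole dictionaries). Below is a hypothetical stand-in consistent with Examples 1-6, together with the mapping get_conf_to_tool would then produce; it is an illustration, not the actual helper.

# Hypothetical stand-in for the utils.add_to_dict used in Examples 1-6:
# collect each value into a set stored under the given key.
def add_to_dict(dictionary, key, value):
    dictionary.setdefault(key, set()).add(value)

# With confidence levels ["1", "2", "0", "2", "2"] (tool order: CatFam, DETECT,
# EFICAz, EnzDP, PRIAM), get_conf_to_tool would return
#   conf_to_tool == {"1": {"CatFam"}, "2": {"DETECT", "EnzDP", "PRIAM"}, "0": {"EFICAz"}}
#   tool_to_conf == {"CatFam": "1", "DETECT": "2", "EFICAz": "0", "EnzDP": "2", "PRIAM": "2"}
# Example 1 then checks len(conf_to_tool["2"]) to count tools calling an EC at high confidence.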
Example no. 4
0
def get_map_from_file(file_name, first_elem_is_key=True, need_values_in_set=True):

    key_value = {}
    with open(file_name) as input:
        for line in input:
            line = line.strip()
            if line == "":
                continue
            split = line.split("\t")
            if first_elem_is_key:
                key, value = split[0], split[1]
            else:
                key, value = split[1], split[0]
            utils.add_to_dict(key_value, key, value, need_values_in_set)
    return key_value
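A brief usage sketch for get_map_from_file, using a hypothetical two-column file and assuming utils.add_to_dict groups values into sets when need_values_in_set is true:

# mapping.tsv (hypothetical contents, tab-separated):
#   1.1.1.1    geneA
#   1.1.1.1    geneB
#   2.3.1.9    geneC
ec_to_genes = get_map_from_file("mapping.tsv")
# expected: {"1.1.1.1": {"geneA", "geneB"}, "2.3.1.9": {"geneC"}}
gene_to_ecs = get_map_from_file("mapping.tsv", first_elem_is_key=False)
# expected: {"geneA": {"1.1.1.1"}, "geneB": {"1.1.1.1"}, "geneC": {"2.3.1.9"}}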
Example no. 5
0
def load_best_tools(training_data, method_arguments):

    ec_to_best_tools = {}
    if method_arguments == "all":
        keywords_of_int = ["High_confidence", "Low_confidence"]
    else:
        keywords_of_int = ["High_confidence"]
    with open(training_data) as infile:
        for line in infile:
            line = line.strip()
            if line == "":
                continue
            split = line.split()
            ec, tool, keyword = split[0], split[1], split[2]
            if keyword in keywords_of_int:
                utils.add_to_dict(ec_to_best_tools, ec, tool)
    return ec_to_best_tools
Example no. 6
0
def read_and_split_conf_preds(ec_preds_file, high_cutoff, low_cutoff):

    high_conf_ecs = set()
    high_and_low_conf_ecs = set()
    all_ec_to_gene = {}
    low_ec_to_score_to_gene = {}

    with open(ec_preds_file) as input:
        for line in input:
            line = line.strip()
            if line == "":
                continue
            split = line.split("\t")

            if utils.is_num(split[-1]):
                ec, score = split[0], float(split[-1])
                gene = "\t".join(split[1:-1])
            else:
                ec, score = split[0], float(split[-2])
                gene = "\t".join(split[1:-2])

            if score > high_cutoff:
                high_conf_ecs.add(ec)
                utils.add_to_dict(all_ec_to_gene, ec, gene)

            if score > low_cutoff:
                high_and_low_conf_ecs.add(ec)
                utils.add_to_dict_key_score_value(low_ec_to_score_to_gene, ec,
                                                  score, gene)

    # For the low-confidence predictions, only retain the genes predicting an EC with the highest score.
    for ec, score_to_gene in low_ec_to_score_to_gene.items():
        if ec in high_conf_ecs:
            continue
        max_score = max(score_to_gene.keys())
        for gene in score_to_gene[max_score]:
            utils.add_to_dict(all_ec_to_gene, ec, gene)

    return high_conf_ecs, high_and_low_conf_ecs, all_ec_to_gene
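To make the retention rule in the final loop concrete, a minimal worked example with made-up EC numbers, scores, and gene names:

# Hypothetical parsed entry for an EC seen only at low confidence:
low_ec_to_score_to_gene = {"1.1.1.1": {0.4: {"geneA"}, 0.7: {"geneB", "geneC"}}}
max_score = max(low_ec_to_score_to_gene["1.1.1.1"])          # 0.7
kept_genes = low_ec_to_score_to_gene["1.1.1.1"][max_score]   # {"geneB", "geneC"}
# Only geneB and geneC would be copied into all_ec_to_gene; geneA (score 0.4) is
# dropped, and the loop skips any EC that already has a high-confidence call.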
Example no. 7
0
def main():
    ########## 0. parse command-line argument ##########
    # training file name only (required)
    parser = get_parser()
    args = vars(parser.parse_args())
    training_filename = args['training_file']
    
    
    ########## 1. get the unigram and bigram counts ##########
    ## need unigrams and bigrams for words and clusters
    # dictionaries to store the counts
    # format: {word0:{word1:count}}
    unigrams = {}
    small_clusters = {}
    large_clusters = {}
    bigrams_ww = {}
    bigrams_sw = {}
    bigrams_lw = {}
    
    # also get word factor files
    # convert from word to small cluster and small cluster to large cluster
    word_to_small = {}
    small_to_large = {}
    
    # will need total word count for unigram probs
    total_word_count = 0
    
    ## the ngram counts come from the training file
    with open(training_filename, 'r') as training_file:
        # read through line by line (one sentence per line)
        for line in training_file:
            # split the line into words (with clusters still attached)
            line_words = line.strip().split(' ')
            
            ## loop through the words in the sentence
            for index, word in enumerate(line_words):
                # increment total word count
                total_word_count += 1
                
                # get the word and its parts
                word2 = utils.get_part(word, WORD_LABEL)
                small2 = utils.get_part(word, SMALL_LABEL)
                large2 = utils.get_part(word, LARGE_LABEL)
                
                # add to mappings of words and factors
                utils.add_to_dict(word2, small2, word_to_small)
                utils.add_to_dict(small2, large2, small_to_large)
                
                # add to unigram count dictionaries
                utils.add_uni_counts(word2, unigrams)
                utils.add_uni_counts(small2, small_clusters)
                utils.add_uni_counts(large2, large_clusters)
                
                # if it is the second or later word, get prev word cluster
                # for first word, just consider unigrams (TO DO should be bigram with <s> first??)
                if index > 0:
                    word1 = utils.get_part(line_words[index-1], WORD_LABEL)
                    small1 = utils.get_part(line_words[index-1], SMALL_LABEL)
                    large1 = utils.get_part(line_words[index-1], LARGE_LABEL)
                    
                    # add to bigram dictionaries
                    utils.add_bi_counts(word1, word2, bigrams_ww)
                    utils.add_bi_counts(small1, word2, bigrams_sw)
                    utils.add_bi_counts(large1, word2, bigrams_lw)
    
    sys.stderr.write('Finished getting ngram count dictionaries\n')    
    
    ########## 3. calculate backoff probabilities for each ngram ##########
    ## get counts of counts for use in discounting
    count_unigrams = utils.get_counts_uni(unigrams)
    count_ww = utils.get_counts_bi(bigrams_ww)
    count_sw = utils.get_counts_bi(bigrams_sw)
    count_lw = utils.get_counts_bi(bigrams_lw)

    # will need vocab size for unk probs
    vocab_size = len(unigrams)
    sys.stderr.write('Finished getting counts of counts\n')    
    
    ## get discounts based on simple Good-Turing
    # TO DO later make this more robust so I can use other smoothing methods
    # note Good-Turing depends on the counts, not on the ngram itself
    disc_uni = utils.calc_discount(count_unigrams)
    disc_ww = utils.calc_discount(count_ww)
    disc_sw = utils.calc_discount(count_sw)
    disc_lw = utils.calc_discount(count_lw)
    
    ## calculate log probability of each unigram and bigram
    # dictionaries to store probabilities
    prob_unigrams = utils.probs_uni(unigrams, total_word_count, disc_uni)
    prob_ww = utils.probs_bi(bigrams_ww, unigrams, disc_ww)
    prob_sw = utils.probs_bi(bigrams_sw, small_clusters, disc_sw)
    prob_lw = utils.probs_bi(bigrams_lw, large_clusters, disc_lw)
    # TO DO where to store unk?
    # for now just make it a variable
    # unknowns (GT estimate): count(words appearing once) / |V| and store in variable
    prob_unk = log(count_unigrams[1], 10) - log(vocab_size, 10)

    sys.stderr.write('Finished getting probability dictionaries\n')    

    ########## 4. calculate backoff (alpha) of each backoff step ##########
    # backoff from word to small cluster
    backoff_ws = utils.calc_backoff_bi(word_to_small, prob_ww, prob_sw)
    sys.stderr.write('Finished getting w2s backoff dictionary\n')   
    
    # backoff from small cluster to large cluster
    backoff_sl = utils.calc_backoff_bi(small_to_large, prob_sw, prob_lw)
    sys.stderr.write('Finished getting s2l backoff dictionary\n')  
    ## TO DO some of these (and w2s) are > 1 which shouldn't happen!
    
    # backoff from large cluster to unigram (ignore previous word altogether)
    backoff_l = utils.calc_backoff_uni(prob_lw, prob_unigrams)
    #### TO DO Something is wrong here because almost all are -1000!

    sys.stderr.write('Finished getting l2u backoff dictionary\n')   
    sys.stderr.write('Finished getting backoff factor dictionaries\n')    
    

    ########## 5. print probs and alphas to stdout ##########
    ## probabilities
    # start with unknown prob
    sys.stdout.write('\\unks:\n')
    sys.stdout.write(str(prob_unk) + '\t<unk>\n')
    
    # unigram probs
    sys.stdout.write('\\1-grams:\n')
    for unigram in prob_unigrams:
        sys.stdout.write(str(prob_unigrams[unigram]) + '\t' + unigram + '\n')
    
    # lw bigram probs
    sys.stdout.write('\\2-grams lw:\n')
    for large_cluster in prob_lw:
        for word in prob_lw[large_cluster]:
            sys.stdout.write(str(prob_lw[large_cluster][word]) + '\t' + large_cluster + ' ' + word + '\n')
    
    # sw bigram probs
    sys.stdout.write('\\2-grams sw:\n')
    for small_cluster in prob_sw:
        for word in prob_sw[small_cluster]:
            sys.stdout.write(str(prob_sw[small_cluster][word]) + '\t' + small_cluster + ' ' + word + '\n')
    
    # ww bigram probs
    sys.stdout.write('\\2-grams ww:\n')
    for prev_word in prob_ww:
        for word in prob_ww[prev_word]:
            sys.stdout.write(str(prob_ww[prev_word][word]) + '\t' + prev_word + ' ' + word + '\n')
    
    ## backoff weights
    # back off from lw to unigram
    sys.stdout.write('\\backoff l to unigram:\n')
    for cluster in backoff_l:
        sys.stdout.write(str(backoff_l[cluster]) + '\t' + cluster + '\n')
    
    # backoff from sw to lw
    sys.stdout.write('\\backoff s to l:\n')
    for cluster in backoff_sl:
        sys.stdout.write(str(backoff_sl[cluster]) + '\t' + cluster + '\n')
    
    # backoff from ww to sw
    sys.stdout.write('\\backoff w to s:\n')
    for cluster in backoff_ws:
        sys.stdout.write(str(backoff_ws[cluster]) + '\t' + cluster + '\n')
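For context on how the printed tables fit together, here is a query-time sketch of the intended word -> small-cluster -> large-cluster -> unigram backoff chain. It assumes hypothetical lookup dictionaries shaped like the ones written above, with word_to_small and small_to_large resolving each key to a single cluster label; it illustrates the backoff idea and is not part of this script.

def lookup_logprob(prev_word, word,
                   prob_ww, prob_sw, prob_lw, prob_unigrams, prob_unk,
                   backoff_ws, backoff_sl, backoff_l,
                   word_to_small, small_to_large):
    # 1. Exact word-word bigram, if it was seen in training.
    if prev_word in prob_ww and word in prob_ww[prev_word]:
        return prob_ww[prev_word][word]
    # 2. Back off to the small-cluster history, paying the w->s penalty.
    small = word_to_small.get(prev_word)
    if small in prob_sw and word in prob_sw[small]:
        return backoff_ws.get(prev_word, 0.0) + prob_sw[small][word]
    # 3. Back off further to the large-cluster history.
    large = small_to_large.get(small)
    if large in prob_lw and word in prob_lw[large]:
        return (backoff_ws.get(prev_word, 0.0) + backoff_sl.get(small, 0.0)
                + prob_lw[large][word])
    # 4. Ignore the history entirely: unigram log probability, or <unk> if unseen.
    return (backoff_ws.get(prev_word, 0.0) + backoff_sl.get(small, 0.0)
            + backoff_l.get(large, 0.0) + prob_unigrams.get(word, prob_unk))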
Example no. 8
0
    def build_network(input_batch, num_samples, latent_units, hidden_units_q, hidden_units_p, bias=None, data_type='binary'):

        input_tensor = tf.tile(input_batch, [num_samples, 1], name='tiled_input')

        # encoder
        layers_q = []
        samples_q = []
        input_cur = input_tensor
        #input_tensor = tf.Print(input_tensor,[input_tensor],message='input_tensor', summarize=785)
        samples_q.append(input_tensor)
        layer_iter = 1
        params = {}
        for hidden_units_cur, latent_units_cur in zip(hidden_units_q, latent_units):
            # build the dense hidden layers for this stochastic unit
            dense, variables = build_dense_layers(input_cur, hidden_units_cur,
                                       activation_function=tf.nn.tanh,
                                       layer_name='q_det_unit_' + str(layer_iter) + '_')

            # add variables to the params dict
            utils.add_to_dict(params, variables)

            # build the stochastic layer
            layer_q = GaussianStochLayer.build_stochastic_layer(dense, latent_units_cur,
                                                                layer_name='q_stoch_layer_' + str(layer_iter) + '_')
            utils.add_to_dict(params, layer_q.params)
            layers_q.append(layer_q)
            input_cur = layers_q[-1].get_samples()
            #input_cur = tf.Print(input_cur, [input_cur], message='samples for layer '+str(layer_iter), summarize=100)
            samples_q.append(input_cur)
            layer_iter += 1

        # decoder
        layers_p = []
        layer_iter = 1
        rev_samples_q = list(reversed(samples_q))[:-1]
        rev_latent_units = list(reversed(latent_units))[1:]
        for hidden_units_cur, latent_units_cur, input_cur in zip(hidden_units_p[:-1], rev_latent_units, rev_samples_q[:-1]):
            # build the dense hidden layers for this stochastic unit
            dense, variables = build_dense_layers(input_cur, hidden_units_cur,
                                       activation_function=tf.nn.tanh,
                                       layer_name='p_det_unit_' + str(layer_iter) + '_')

            # add variables to the params dict
            utils.add_to_dict(params, variables)

            # build the stochastic layer
            layer_p = GaussianStochLayer.build_stochastic_layer(dense, latent_units_cur,layer_name='p_stoch_layer_' + str(layer_iter) + '_')
            utils.add_to_dict(params, layer_p.params)
            layers_p.append(layer_p)
            layer_iter += 1

        # build the last dense layer for the decoder
        dense, variables = build_dense_layers(rev_samples_q[-1], hidden_units_p[-1],
                                   activation_function=tf.nn.tanh,
                                   layer_name='p_det_unit_' + str(layer_iter) + '_')

        # add variables to the params dict
        utils.add_to_dict(params, variables)

        # build the last stochastic layer
        if data_type == 'binary':
            layer_p = BernoulliStochLayer.build_stochastic_layer(dense, input_tensor.shape[1],
                                                                      layer_name='p_stoch_layer_' + str(layer_iter) + '_',
                                                                      mean_bias=bias)
            utils.add_to_dict(params, layer_p.params)
            layers_p.append(layer_p)
        elif data_type == 'continuous':
            layer_p = GaussianStochLayer.build_stochastic_layer(dense, input_tensor.shape[1],
                                                                  layer_name='p_stoch_layer_' + str(layer_iter) + '_',
                                                                  mean_bias=bias)
            utils.add_to_dict(params, layer_p.params)
            layers_p.append(layer_p)
        prior = UnitGaussianLayer(layers_q[-1].mean_layer.shape)
        return Network(layers_q, layers_p, samples_q, prior, num_samples, params)