Example #1
0
def main(argv):
    """Predict NER labels for a CoNLL-format test file and write the result.

    Loads a saved NER model, tokenizes/encodes the test sentences, decodes
    the argmax predictions back to label strings, and writes them out.
    """
    args = argument_parser('predict').parse_args(argv[1:])

    ner_model, tokenizer, labels, config = load_ner_model(args.ner_model_dir)
    max_seq_len = config['max_seq_length']

    # Map prediction indices back to label strings.
    inv_label_map = dict(enumerate(labels))

    test_words, dummy_labels = read_conll(args.test_data, mode='test')
    test_data = process_sentences(
        test_words, dummy_labels, tokenizer, max_seq_len)

    test_x = encode(test_data.combined_tokens, tokenizer, max_seq_len)

    probs = ner_model.predict(test_x, batch_size=args.batch_size)
    preds = np.argmax(probs, axis=-1)

    # Skip the leading special-token position and trim each prediction to
    # the sentence's own token count before decoding.
    pred_labels = [
        [inv_label_map[t] for t in pred[1:len(test_data.tokens[i]) + 1]]
        for i, pred in enumerate(preds)
    ]

    write_result(args.output_file,
                 test_data.words,
                 test_data.lengths,
                 test_data.tokens,
                 test_data.labels,
                 pred_labels,
                 mode='predict')

    return 0
Example #2
0
 def tag(self, text, tokenized=False):
     """Tag *text* with NER labels and return the result lines as one string.

     When ``tokenized`` is true the input is split on whitespace; otherwise
     an approximate BasicTokenizer is applied.
     """
     seq_len = self.config['max_seq_length']
     idx_to_label = dict(enumerate(self.labels))
     words = text.split() if tokenized else tokenize(text)
     # Placeholder labels: the pipeline requires a label per word.
     placeholder = ['O'] * len(words)
     data = process_sentences([words], [placeholder], self.tokenizer, seq_len)
     x = encode(data.combined_tokens, self.tokenizer, seq_len)
     if self.session is None or self.graph is None:
         # assume singlethreaded
         probs = self.model.predict(x, batch_size=8)
     else:
         # Run inside the stored session/graph when one was captured.
         with self.session.as_default():
             with self.graph.as_default():
                 probs = self.model.predict(x, batch_size=8)
     preds = np.argmax(probs, axis=-1)
     # Drop the leading special-token position and trim to token count.
     pred_labels = [
         [idx_to_label[t] for t in pred[1:len(data.tokens[i]) + 1]]
         for i, pred in enumerate(preds)
     ]
     lines = write_result(
         'output.tsv', data.words, data.lengths,
         data.tokens, data.labels, pred_labels, mode='predict'
     )
     return ''.join(lines)
Example #3
0
def main(argv):
    """Train a NER model on CoNLL data, evaluate on the test set, and report
    conlleval metrics. Optionally saves the model and Viterbi probabilities.
    """
    args = argument_parser().parse_args(argv[1:])
    seq_len = args.max_seq_length

    pretrained_model, tokenizer = load_pretrained(args)

    train_words, train_tags = read_conll(args.train_data)
    test_words, test_tags = read_conll(args.test_data)
    train_data = process_sentences(train_words, train_tags, tokenizer, seq_len)
    test_data = process_sentences(test_words, test_tags, tokenizer, seq_len)

    label_list = get_labels(train_data.labels)
    tag_map = {label: idx for idx, label in enumerate(label_list)}
    inv_tag_map = {idx: label for label, idx in tag_map.items()}

    # Initial/transition probabilities for optional Viterbi decoding.
    init_prob, trans_prob = viterbi_probabilities(train_data.labels, tag_map)

    train_x = encode(train_data.combined_tokens, tokenizer, seq_len)
    test_x = encode(test_data.combined_tokens, tokenizer, seq_len)

    train_y, train_weights = label_encode(
        train_data.combined_labels, tag_map, seq_len)
    test_y, test_weights = label_encode(
        test_data.combined_labels, tag_map, seq_len)

    ner_model = create_ner_model(pretrained_model, len(tag_map))
    optimizer = create_optimizer(len(train_x[0]), args)

    ner_model.compile(optimizer,
                      loss='sparse_categorical_crossentropy',
                      sample_weight_mode='temporal',
                      metrics=['sparse_categorical_accuracy'])

    ner_model.fit(train_x, train_y,
                  sample_weight=train_weights,
                  epochs=args.num_train_epochs,
                  batch_size=args.batch_size)

    if args.ner_model_dir is not None:
        # Persist labels in index order alongside the trained weights.
        label_list = [v for k, v in sorted(inv_tag_map.items())]
        save_ner_model(ner_model, tokenizer, label_list, args)
        save_viterbi_probabilities(init_prob, trans_prob, inv_tag_map, args)

    probs = ner_model.predict(test_x, batch_size=args.batch_size)
    preds = np.argmax(probs, axis=-1)

    # Skip the leading special-token position and trim to token counts.
    pred_tags = [
        [inv_tag_map[t] for t in pred[1:len(test_data.tokens[i]) + 1]]
        for i, pred in enumerate(preds)
    ]

    lines = write_result(args.output_file, test_data.words, test_data.lengths,
                         test_data.tokens, test_data.labels, pred_tags)

    conlleval.report(conlleval.evaluate(lines))
    return 0
    print('ins', ins)
    print('outs', outs)

    contig_starts = [v for v, out in outs.items() if not (out in (0, 1) and ins[v] == 1)]
    print('contig_starts', contig_starts)

    contigs = []
    for start in contig_starts:
        while graph[start]:  # multiple edges
            path = [start]
            u = graph[start].pop()
            while ins[u] == outs[u] == 1:
                path.append(u)
                u = graph[u].pop()
            contigs.append(''.join(v[0] for v in path) + u)

    return contigs


if __name__ == '__main__':
    #dataset, test_contigs = big_example()
    dataset = read_dataset()

    # Normalize case before building the de Bruijn graph.
    dataset = [l.lower() for l in dataset]
    adjlist = debruijn_graph(dataset)
    contigs = sorted(assemble_contigs(adjlist))
    #print(len(contigs))
    #test_contigs = sorted(test_contigs)
    #print(len(test_contigs))
    # `contigs` is already sorted above; the previous extra sorted() call
    # on the join was redundant.
    write_result('\n'.join(contigs))
from common import small_example, big_example, read_dataset, write_result
from eulerian_cycle import eulerian_path

if __name__ == '__main__':
    dataset, _ = small_example()
    dataset = read_dataset()

    # Parse "node -> a,b,c" lines into an adjacency mapping.
    adjlist = {}
    for line in dataset:
        parts = line.split(' -> ')
        adjlist[parts[0]] = set(parts[1].split(','))
    print(adjlist)
    path = eulerian_path(adjlist)

    # Reconstruct the string: first character of every node on the path,
    # plus the remainder of the final node.
    output = ''.join(node[0] for node in path) + path[-1][1:]
    write_result(output)
from itertools import product
from common import small_example, big_example, read_dataset, write_result
from eulerian_cycle import eulerian_path, eulerian_cycle
from debruijn import debruijn_graph


if __name__ == '__main__':
    # Universe of all binary strings of length 17.
    kmers = (''.join(bits) for bits in product('01', repeat=17))
    adjlist = debruijn_graph(kmers)
    cycle = eulerian_cycle(adjlist)

    # Drop the repeated final node; keep one character per remaining node.
    output = ''.join(node[0] for node in cycle[:-1])
    write_result(output)
Example #7
0
                all_edges.append((node.i, next_node.i, pnuc))
            node = next_node

    return root, all_edges

    #edges = [(1, i, p[0]) for i, p in zip(count(0), patterns)]
    #print(edges)
    #
    #prev_level_edges = {n: t for (f, t, n) in edges}
    #for i, prev_ns, next_ns in zip(count(),
    #                               [p[1:] for p in patterns],
    #                               [p[:1] for p in patterns]):
    #    new_level_edges = []
    #    for pn, nn in zip(prev_ns, next_ns):
    #        new_level_edges[pn] =
    #
    #
    #    for l in i_letters:
    #        if
    #
    #    prev_level_edges = level_edges


if __name__ == '__main__':
    inp, out = small_example()
    #inp, out = big_example()
    inp = read_dataset()

    # Build the trie over all input patterns and dump its edge list.
    root, edges = trie(map(Seq, inp))
    lines = ('%d %d %s' % edge for edge in edges)
    write_result('\n'.join(lines))
Example #8
0
# Step budgets for the Bayesian and standard learners.
# NOTE(review): the pair below immediately overwrites the pair above —
# presumably a quick toggle between long and short runs; confirm intent.
n_steps_bayes = int(2e6)
n_steps_standard = int(1e5)

n_steps_bayes = int(2e5)
n_steps_standard = int(1e4)

# # -----------------------------------------------------------------------------------------------------------#
# # Create training tasks:
# -----------------------------------------------------------------------------------------------------------#
# Each training task is a pixel-permuted variant of the original dataset.
n_tasks_train = 5
train_tasks_data = []
for _ in xrange(n_tasks_train):  # NOTE(review): xrange implies Python 2
    train_tasks_data.append(cmn.permute_pixels(orig_data))
cmn.write_result(
    '---- Meta-Training set: {0} tasks of {1} training samples'.format(
        n_tasks_train, orig_data.train.num_examples), setting_name)
# # # -----------------------------------------------------------------------------------------------------------#
# # #  Meta-training
# # # -----------------------------------------------------------------------------------------------------------#
# Checkpoint path where the learned prior is saved for later phases.
prior_file_path = '/tmp/prior2.ckpt'
#
print('---- Meta-training ...')
# Time the meta-training phase end to end.
startRuntime = timeit.default_timer()
test_accuracy = learn_bayes_multitask.learn_tasks(
    train_tasks_data,
    objective_type='Variational_Bayes',
    prior_file_path=prior_file_path,
    mode='Meta_Training',
    n_steps=n_steps_bayes)
stopRuntime = timeit.default_timer()
Example #9
0
# epsilonStdDefault = 1  # For debug set epsilons with 0.0 -> recovers standard NN
#
# # Learning Parameters:
# learning_rate = 1e-4  # 1e-4
#
# # Ratio of of steps for first stage (learning the posterior mean only):
# steps_stage_1_ratio = 0.05  # 0.1
#
# # Ratio of of steps out of the second stage with epsilon = 1:
# steps_with_full_eps_ratio = 0.2  # 0.5
#
# batch_size = 128
# # Note we need large enough batch to reduce the variance of the gradient estimate
# # since we are using the "Local Reparameterization Trick" the variance is proportional to 1/batch_size
#
# # Variables init
# random_init_std = 0.1
# bias_init = 0

#  Bayes Learning
# -----------------------------------------------------------------------------------------------------------#

# Each experiment below: time a single-task training run with a given
# objective, then record the test error and runtime via cmn.write_result.
print('---- Bayes-no-prior  learning for a single task...')
startRuntime = timeit.default_timer()
test_accuracy = learn_bayes_single_task.learn_task(
    data1, objective_type='Bayes_No_Prior', n_steps=n_steps_bayes)
stopRuntime = timeit.default_timer()
cmn.write_result(
    'Bayes-no-prior  learning - Test Error: {0} %, Runtime: {1} [sec]'.format(
        100 * (1 - test_accuracy), stopRuntime - startRuntime), setting_name)
# NOTE(review): empty session block — possibly a leftover graph reset or
# placeholder; confirm whether it can be removed.
with tf.Session() as sess:
    pass

# --------------------------------------------------------------------------------------#
# Run experiments
# --------------------------------------------------------------------------------------#

# Variational Bayes Learning
# -----------------------------------------------------------------------------------------------------------#
print('---- Variational Bayes learning for a single task...')
startRuntime = timeit.default_timer()
test_accuracy = learn_bayes_single_task.learn_task(
    data1, objective_type='Variational_Bayes', n_steps=n_steps_bayes)
stopRuntime = timeit.default_timer()
cmn.write_result(
    'Variational Bayes learning - Test Error: {0} %, Runtime: {1} [sec]'.
    format(100 * (1 - test_accuracy),
           stopRuntime - startRuntime), setting_name)

# -----------------------------------------------------------------------------------------------------------#
# PAC-Bayes Learning
# -----------------------------------------------------------------------------------------------------------#
print('---- PAC-Bayesian McAllaster learning for a single task...')
startRuntime = timeit.default_timer()
test_accuracy = learn_bayes_single_task.learn_task(
    data1, objective_type='PAC_Bayes_McAllaster', n_steps=n_steps_bayes)
stopRuntime = timeit.default_timer()
cmn.write_result(
    'PAC-Bayesian McAllaster learning - Test Error: {0} %, Runtime: {1} [sec]'.
    format(100 * (1 - test_accuracy),
           stopRuntime - startRuntime), setting_name)
Example #11
0
# -----------------------------------------------------------------------------------------------------------#
# # standard learning
# # -----------------------------------------------------------------------------------------------------------#

# Phase 1: train a standard network on the source task and save its weights.
print('---- Standard learning on source task...')

startRuntime = timeit.default_timer()
test_accuracy_standard = learn_standard_single_task.learn_task(
    dataSet=source_task,
    weights_save_file=weights_file,
    dropout_flag=False,
    n_steps=n_steps_standard)
stopRuntime = timeit.default_timer()
cmn.write_result(
    'Source task - standard learning  - Average Test Error : {0} %, Runtime: {1} [sec]'
    .format(100 * (1 - test_accuracy_standard),
            stopRuntime - startRuntime), setting_name)

# Phase 2: train on the target task, initializing from the saved
# source-task weights (transfer learning baseline).
print(
    '---- Standard learning on target task using transfered initial point...')
startRuntime = timeit.default_timer()
test_accuracy_standard = learn_standard_single_task.learn_task(
    dataSet=target_task,
    weights_restore_file=weights_file,
    dropout_flag=False,
    n_steps=n_steps_standard)
stopRuntime = timeit.default_timer()
cmn.write_result(
    'Target task- standard learning - with transfer -  Average Test Error : {0} %, Runtime: {1} [sec]'
    .format(100 * (1 - test_accuracy_standard),
            stopRuntime - startRuntime), setting_name)
Example #12
0
    contig_starts = [
        v for v, out in outs.items() if not (out in (0, 1) and ins[v] == 1)
    ]
    print('contig_starts', contig_starts)

    contigs = []
    for start in contig_starts:
        while graph[start]:  # multiple edges
            path = [start]
            u = graph[start].pop()
            while ins[u] == outs[u] == 1:
                path.append(u)
                u = graph[u].pop()
            contigs.append(''.join(v[0] for v in path) + u)

    return contigs


if __name__ == '__main__':
    #dataset, test_contigs = big_example()
    dataset = read_dataset()

    # Normalize case before building the de Bruijn graph.
    dataset = [l.lower() for l in dataset]
    adjlist = debruijn_graph(dataset)
    contigs = sorted(assemble_contigs(adjlist))
    #print(len(contigs))
    #test_contigs = sorted(test_contigs)
    #print(len(test_contigs))
    # `contigs` is already sorted above; the previous extra sorted() call
    # on the join was redundant.
    write_result('\n'.join(contigs))
Example #13
0
        else:
            return ''


def trie_matching(text, trie):
    """Locate every occurrence of the trie's patterns inside *text*.

    Tries a prefix match at each start position and returns a mapping from
    matched pattern to the list of 0-based start indices where it occurs.
    """
    occurences = defaultdict(list)
    for start in range(len(text)):
        match = prefix_trie_matching(text[start:], trie)
        if match:
            occurences[match].append(start)
    return occurences


if __name__ == '__main__':
    inp, out = small_example()
    #inp, out = big_example()
    inp = read_dataset()

    # First input line is the text; the remaining lines are the patterns.
    root, edges = trie(map(Seq, inp[1:]))
    occurences = trie_matching(Seq(inp[0]), root)

    # One output line per pattern that occurred, listing its positions.
    chunks = []
    for pattern in inp[1:]:
        if pattern in occurences:
            chunks.append(' '.join(map(str, occurences[pattern])) + '\n')
    positions = ''.join(chunks)

    #positions = prefix_trie_matching(Seq(inp[0]), root)
    write_result(positions)
                    permuts.append(permut)
                    if -el < 0:
                        permut = permut[:pos] + [abs(permut[pos])] + permut[pos + 1:]
                        permuts.append(permut)
                    break

    return permuts


if __name__ == '__main__':
    inp, out = small_example()
    inp, out = big_example()
    inp = read_dataset()
    # Input looks like "(+1 -3 +2)": strip the parens, parse signed ints.
    premut = list(map(int, inp[1:-1].split()))
    permuts = greedy_sort(premut)
    # Format each permutation as "(+1 -3 +2)" with explicit plus signs.
    strs = ('(' + ' '.join(('+' if i > 0 else '') + str(i) for i in p) + ')' for p in permuts)

    #for s_mine, s_their in zip(strs, out):
    #    print(len(s_mine), len(s_their))
    #    for i, c_m, c_t in zip(count(), s_mine, s_their):
    #        if c_m != c_t:
    #            print(i)
    #            print(s_mine)
    #            print(s_their)
    #            print(c_m)
    #            print(c_t)
    #            input()

    # Renamed from `str` so the builtin is not shadowed.
    result = '\n'.join(strs)
    write_result(result)
Example #15
0
def main(argv):
    """Train (or load) a BERT-based NER model, predict on the test set with
    several context/ensembling strategies, report conlleval metrics, and
    write one CSV row of (precision, recall, f-score) per method.
    """
    argparser = argument_parser()
    args = argparser.parse_args(argv[1:])
    seq_len = args.max_seq_length    # abbreviation

    pretrained_model, tokenizer = load_pretrained(args)

    train_words, train_tags = read_conll(args.train_data)
    test_words, test_tags = read_conll(args.test_data)

    print(args.no_context)

    # Preprocessing modes: isolated sentences, whole documents, or
    # sentences combined with surrounding context.
    if args.no_context:
        train_data = process_no_context(train_words, train_tags, tokenizer, seq_len)
        test_data = process_no_context(test_words, test_tags, tokenizer, seq_len)
    elif args.documentwise:
        tr_docs, tr_doc_tags, tr_line_ids = split_to_documents(train_words, train_tags)
        te_docs, te_doc_tags, te_line_ids = split_to_documents(test_words, test_tags)
        train_data = process_docs(tr_docs, tr_doc_tags, tr_line_ids, tokenizer, seq_len)
        test_data = process_docs(te_docs, te_doc_tags, te_line_ids, tokenizer, seq_len)
    else:
        train_data = process_sentences(train_words, train_tags, tokenizer, seq_len, args.predict_position)
        test_data = process_sentences(test_words, test_tags, tokenizer, seq_len, args.predict_position)

    label_list = get_labels(train_data.labels)
    tag_map = { l: i for i, l in enumerate(label_list) }
    inv_tag_map = { v: k for k, v in tag_map.items() }

    train_x = encode(train_data.combined_tokens, tokenizer, seq_len)
    test_x = encode(test_data.combined_tokens, tokenizer, seq_len)
    train_y, train_weights = label_encode(train_data.combined_labels, tag_map, seq_len)
    test_y, test_weights = label_encode(test_data.combined_labels, tag_map, seq_len)

    if args.use_ner_model and (args.ner_model_dir is not None):
        # Reuse a saved model; its tokenizer/labels/config replace the above.
        ner_model, tokenizer, labels, config = load_ner_model(args.ner_model_dir)
    else:
        optimizer = create_optimizer(len(train_x[0]), args)
        model = create_ner_model(pretrained_model, len(tag_map))
        if args.num_gpus > 1:
            ner_model = multi_gpu_model(model, args.num_gpus)
        else:
            ner_model = model

        ner_model.compile(
            optimizer,
            loss='sparse_categorical_crossentropy',
            sample_weight_mode='temporal',
            metrics=['sparse_categorical_accuracy']
            )

        ner_model.fit(
            train_x,
            train_y,
            sample_weight=train_weights,
            epochs=args.num_train_epochs,
            batch_size=args.batch_size
            )
        if args.ner_model_dir is not None:
            # Persist labels in index order alongside the trained weights.
            label_list = [v for k, v in sorted(list(inv_tag_map.items()))]
            save_ner_model(ner_model, tokenizer, label_list, args)

    probs = ner_model.predict(test_x, batch_size=args.batch_size)
    preds = np.argmax(probs, axis=-1)

    # results[i] holds [precision, recall, f-score]; m_names[i] names the
    # method that produced it — the two lists must stay index-aligned for
    # the CSV writer at the bottom.
    results = []
    m_names = []
    if args.no_context:
        pr_ensemble, pr_test_first = get_predictions(preds, test_data.tokens, test_data.sentence_numbers)
        output_file = "output/{}-NC.tsv".format(args.output_file)
        m_names.append('NC')
        ensemble = []
        for i, pred in enumerate(pr_test_first):
            ensemble.append([inv_tag_map[t] for t in pred])
        lines_ensemble, sentences_ensemble = write_result(
            output_file, test_data.words, test_data.lengths,
            test_data.tokens, test_data.labels, ensemble
            )
        c = conlleval.evaluate(lines_ensemble)
        conlleval.report(c)
        overall = conlleval.metrics(c)[0]
        results.append([overall.prec, overall.rec, overall.fscore])
    else:
        # First tag then vote
        pr_ensemble, pr_test_first = get_predictions(preds, test_data.tokens, test_data.sentence_numbers)
        # Accumulate probabilities, then vote
        prob_ensemble, prob_test_first = get_predictions2(probs, test_data.tokens, test_data.sentence_numbers)
        ens = [pr_ensemble, prob_ensemble, pr_test_first, prob_test_first]
        if args.documentwise:
            # D-CMV: Documentwise CMV
            # D-CMVP: Documetwise CMV, probs summed, argmax after that
            # D-F: Documentwise First
            # D-FP: Same as D-FP
            method_names = ['D-CMV','D-CMVP','D-F','D-FP']
        else:
            method_names = ['CMV','CMVP','F','FP']
        # BUG FIX: m_names was previously extended inside the loop below,
        # appending all four names once per iteration (16 entries for 4
        # results) and misaligning m_names with results. Extend once here;
        # the loop appends results in the same order as method_names.
        m_names.extend(method_names)
        for i, ensem in enumerate(ens):
            ensemble = []
            for j, pred in enumerate(ensem):
                ensemble.append([inv_tag_map[t] for t in pred])
            output_file = "output/{}-{}.tsv".format(args.output_file, method_names[i])
            lines_ensemble, sentences_ensemble = write_result(
                    output_file, test_data.words, test_data.lengths,
                    test_data.tokens, test_data.labels, ensemble)
            print("Model trained: ", args.ner_model_dir)
            print("Seq-len: ", args.max_seq_length)
            print("Learning rate: ", args.learning_rate)
            print("Batch Size: ", args.batch_size)
            print("Epochs: ", args.num_train_epochs)
            print("Training data: ", args.train_data)
            print("Testing data: ", args.test_data)
            print("")
            print("Results with {}".format(method_names[i]))
            c = conlleval.evaluate(lines_ensemble)
            print("")
            conlleval.report(c)
            overall = conlleval.metrics(c)[0]
            results.append([overall.prec, overall.rec, overall.fscore])

    if args.sentence_in_context:
        # Evaluate sensitivity to where in the window prediction starts:
        # positions 1, 32, 64, ... up to seq_len.
        starting_pos = np.arange(0, seq_len + 1, 32)
        starting_pos[0] = 1
        m_names.extend(starting_pos)
        for start_p in starting_pos:
            tt_lines, tt_tags, line_nos, line_starts = combine_sentences2(test_data.tokens, test_data.labels, seq_len-1, start_p-1)
            tt_x = encode(tt_lines, tokenizer, seq_len)
            tt_y, train_weights = label_encode(tt_tags, tag_map, seq_len)
            probs = ner_model.predict(tt_x, batch_size=args.batch_size)
            preds = np.argmax(probs, axis=-1)

            # Recover each sentence's span inside its combined window.
            pred_tags = []
            for i, pred in enumerate(preds):
                idx = line_nos[i].index(i)
                pred_tags.append([inv_tag_map[t] for t in pred[line_starts[i][idx]+1:line_starts[i][idx]+len(test_data.tokens[i])+1]])

            output_file = "output/{}-{}.tsv".format(args.output_file, start_p)
            lines_first, sentences_first = write_result(
                output_file, test_data.words, test_data.lengths,
                test_data.tokens, test_data.labels, pred_tags
            )
            print("")
            print("Results with prediction starting position ", start_p)
            c = conlleval.evaluate(lines_first)
            conlleval.report(c)
            overall = conlleval.metrics(c)[0]
            results.append([overall.prec, overall.rec, overall.fscore])

    # One CSV row per evaluated method: run parameters, method name, scores.
    result_file = "./results/results-{}.csv".format(args.output_file)
    with open(result_file, 'w+') as f:
        for i, line in enumerate(results):
            params = "{},{},{},{},{},{},{},{},{}".format(args.output_file,
                                            args.max_seq_length,
                                            args.bert_config_file,
                                            args.num_train_epochs,
                                            args.learning_rate,
                                            args.batch_size,
                                            args.predict_position,
                                            args.train_data,
                                            args.test_data)
            f.write(params)
            f.write(",{}".format(m_names[i]))
            for item in line:
                f.write(",{}".format(item))
            f.write('\n')

    for i in results:
        print(i)
    return 0