def main(argv):
    """Command-line entry point: load a trained NER model and tag a test set.

    Reads the test file in CoNLL format, runs the saved model over it and
    writes the per-token predictions with write_result. Returns 0 on success.
    """
    parser = argument_parser('predict')
    options = parser.parse_args(argv[1:])

    # Restore the trained model together with its tokenizer, label set
    # and training-time configuration.
    ner_model, tokenizer, labels, config = load_ner_model(options.ner_model_dir)
    max_seq_len = config['max_seq_length']

    label_map = {label: idx for idx, label in enumerate(labels)}
    inv_label_map = {idx: label for label, idx in label_map.items()}

    # 'test' mode yields placeholder labels alongside the words.
    words, placeholder_tags = read_conll(options.test_data, mode='test')
    data = process_sentences(words, placeholder_tags, tokenizer, max_seq_len)
    features = encode(data.combined_tokens, tokenizer, max_seq_len)

    probabilities = ner_model.predict(features, batch_size=options.batch_size)
    predictions = np.argmax(probabilities, axis=-1)

    # Skip the leading [CLS] position and keep one prediction per token
    # of each sentence.
    predicted_tags = [
        [inv_label_map[tag_id] for tag_id in row[1:len(data.tokens[idx]) + 1]]
        for idx, row in enumerate(predictions)
    ]

    write_result(options.output_file, data.words, data.lengths,
                 data.tokens, data.labels, predicted_tags, mode='predict')
    return 0
def tag(self, text, tokenized=False):
    """Tag *text* with NER labels and return the tagged output as a string.

    When ``tokenized`` is true the caller's whitespace tokenization is
    trusted; otherwise an approximation of BERT's BasicTokenizer is used.
    """
    max_seq_len = self.config['max_seq_length']
    idx_to_label = dict(enumerate(self.labels))

    if tokenized:
        words = text.split()  # whitespace tokenization
    else:
        words = tokenize(text)  # approximate BasicTokenizer

    # Placeholder labels: the pipeline expects one label per word.
    placeholder = ['O'] * len(words)
    data = process_sentences([words], [placeholder], self.tokenizer, max_seq_len)
    features = encode(data.combined_tokens, self.tokenizer, max_seq_len)

    if self.session is None or self.graph is None:
        probs = self.model.predict(features, batch_size=8)  # assume singlethreaded
    else:
        # Run inside the stored TF session/graph (multithreaded serving).
        with self.session.as_default(), self.graph.as_default():
            probs = self.model.predict(features, batch_size=8)

    predictions = np.argmax(probs, axis=-1)
    # Drop the [CLS] position; keep one predicted label per token.
    tag_rows = [
        [idx_to_label[t] for t in row[1:len(data.tokens[i]) + 1]]
        for i, row in enumerate(predictions)
    ]

    lines = write_result(
        'output.tsv', data.words, data.lengths, data.tokens,
        data.labels, tag_rows, mode='predict'
    )
    return ''.join(lines)
def main(argv):
    """Train a NER model on CoNLL data, evaluate it and optionally save it.

    Also estimates initial/transition probabilities for Viterbi decoding
    from the training labels and stores them next to the saved model.
    Returns 0 on success.
    """
    parser = argument_parser()
    args = parser.parse_args(argv[1:])
    max_len = args.max_seq_length  # abbreviation

    pretrained_model, tokenizer = load_pretrained(args)

    train_words, train_tags = read_conll(args.train_data)
    test_words, test_tags = read_conll(args.test_data)
    train_data = process_sentences(train_words, train_tags, tokenizer, max_len)
    test_data = process_sentences(test_words, test_tags, tokenizer, max_len)

    labels = get_labels(train_data.labels)
    label_to_id = {label: i for i, label in enumerate(labels)}
    id_to_label = {i: label for label, i in label_to_id.items()}

    # Label transition statistics for optional Viterbi decoding.
    init_prob, trans_prob = viterbi_probabilities(train_data.labels, label_to_id)

    train_x = encode(train_data.combined_tokens, tokenizer, max_len)
    test_x = encode(test_data.combined_tokens, tokenizer, max_len)
    train_y, train_weights = label_encode(
        train_data.combined_labels, label_to_id, max_len)
    test_y, test_weights = label_encode(
        test_data.combined_labels, label_to_id, max_len)

    ner_model = create_ner_model(pretrained_model, len(label_to_id))
    optimizer = create_optimizer(len(train_x[0]), args)
    ner_model.compile(
        optimizer,
        loss='sparse_categorical_crossentropy',
        sample_weight_mode='temporal',
        metrics=['sparse_categorical_accuracy'],
    )
    ner_model.fit(
        train_x,
        train_y,
        sample_weight=train_weights,
        epochs=args.num_train_epochs,
        batch_size=args.batch_size,
    )

    if args.ner_model_dir is not None:
        # Persist labels in id order so loading restores the same mapping.
        ordered_labels = [label for _, label in sorted(id_to_label.items())]
        save_ner_model(ner_model, tokenizer, ordered_labels, args)
        save_viterbi_probabilities(init_prob, trans_prob, id_to_label, args)

    probs = ner_model.predict(test_x, batch_size=args.batch_size)
    predictions = np.argmax(probs, axis=-1)
    # Skip [CLS]; one predicted label per token of each test sentence.
    predicted_tags = [
        [id_to_label[t] for t in row[1:len(test_data.tokens[i]) + 1]]
        for i, row in enumerate(predictions)
    ]

    lines = write_result(args.output_file, test_data.words, test_data.lengths,
                         test_data.tokens, test_data.labels, predicted_tags)
    counts = conlleval.evaluate(lines)
    conlleval.report(counts)
    return 0
# NOTE(review): fragment — the enclosing function's `def` line is outside
# this view; the indented tail below appears to be the end of a
# contig-assembly routine over a de Bruijn graph (`graph`: node -> set of
# successor nodes, `ins`/`outs`: per-node in/out degree counts) — TODO confirm.
    print('ins', ins)    # debug output
    print('outs', outs)  # debug output
    # A contig starts at any node that is not a simple pass-through node,
    # i.e. anything other than "out degree 0 or 1 with in degree exactly 1".
    contig_starts = [v for v, out in outs.items()
                     if not (out in (0, 1) and ins[v] == 1)]
    print('contig_starts', contig_starts)
    contigs = []
    for start in contig_starts:
        while graph[start]:  # multiple edges: emit one contig per outgoing edge
            path = [start]
            u = graph[start].pop()
            # Extend the path through non-branching (1-in / 1-out) nodes.
            while ins[u] == outs[u] == 1:
                path.append(u)
                u = graph[u].pop()
            # Spell the contig: first character of every node on the path,
            # then the final node in full.
            contigs.append(''.join(v[0] for v in path) + u)
    return contigs


if __name__ == '__main__':
    #dataset, test_contigs = big_example()
    dataset = read_dataset()
    dataset = [l.lower() for l in dataset]
    adjlist = debruijn_graph(dataset)
    contigs = sorted(assemble_contigs(adjlist))
    #print(len(contigs))
    #test_contigs = sorted(test_contigs)
    #print(len(test_contigs))
    write_result('\n'.join(sorted(contigs)))
from common import small_example, big_example, read_dataset, write_result
from eulerian_cycle import eulerian_path

if __name__ == '__main__':
    # The example load is kept (side effects included) but immediately
    # replaced by the real dataset, matching the original toggle.
    dataset, _ = small_example()
    dataset = read_dataset()

    # Each input line has the shape "node -> succ1,succ2,...".
    adjlist = {}
    for line in dataset:
        parts = line.split(' -> ')
        adjlist[parts[0]] = set(parts[1].split(','))
    print(adjlist)

    # Reconstruct the string spelled by the Eulerian path: the first
    # character of every k-mer, plus the remainder of the final k-mer.
    path = eulerian_path(adjlist)
    output = ''.join(kmer[0] for kmer in path) + path[-1][1:]
    write_result(output)
from itertools import product
from common import small_example, big_example, read_dataset, write_result
from eulerian_cycle import eulerian_path, eulerian_cycle
from debruijn import debruijn_graph

if __name__ == '__main__':
    # All binary strings of length 17, produced lazily as k-mers.
    kmers = (''.join(bits) for bits in product('01', repeat=17))
    graph = debruijn_graph(kmers)
    # An Eulerian cycle of this de Bruijn graph spells a k-universal
    # circular string: drop the repeated final node, then take one
    # character per remaining node.
    cycle = eulerian_cycle(graph)
    output = ''.join(node[0] for node in cycle[:-1])
    write_result(output)
# NOTE(review): fragment — the enclosing `trie` builder's `def` line is
# outside this view; only its final statements are visible here. Each edge
# is recorded as (parent id, child id, edge label).
        all_edges.append((node.i, next_node.i, pnuc))
        node = next_node
    return root, all_edges

# Abandoned sketch of a level-by-level edge construction, kept verbatim.
#edges = [(1, i, p[0]) for i, p in zip(count(0), patterns)]
#print(edges)
#
#prev_level_edges = {n: t for (f, t, n) in edges}
#for i, prev_ns, next_ns in zip(count(),
#                               [p[1:] for p in patterns],
#                               [p[:1] for p in patterns]):
#    new_level_edges = []
#    for pn, nn in zip(prev_ns, next_ns):
#        new_level_edges[pn] =
#
#
#    for l in i_letters:
#        if
#
#    prev_level_edges = level_edges

if __name__ == '__main__':
    inp, out = small_example()
    #inp, out = big_example()
    inp = read_dataset()
    # Build the trie over all input patterns and dump its edges as
    # "parent child label" triples, one per line.
    root, edges = trie(map(Seq, inp))
    write_result('\n'.join('%d %d %s' % (f, t, n) for f, t, n in edges))
# NOTE(review): top-level experiment-script fragment (Python 2 — uses
# `xrange`); `orig_data`, `cmn`, `learn_bayes_multitask`, `timeit` and
# `setting_name` are defined outside this view.
n_steps_bayes = int(2e6)
n_steps_standard = int(1e5)
# The shorter step counts below override the values just above —
# presumably a quick-run/debug setting left enabled; verify before a
# full experiment.
n_steps_bayes = int(2e5)
n_steps_standard = int(1e4)

# -----------------------------------------------------------------------------------------------------------#
# Create training tasks:
# -----------------------------------------------------------------------------------------------------------#
n_tasks_train = 5
train_tasks_data = []
for _ in xrange(n_tasks_train):
    # Each task is the original dataset under a fresh pixel permutation.
    train_tasks_data.append(cmn.permute_pixels(orig_data))

cmn.write_result(
    '---- Meta-Training set: {0} tasks of {1} training samples'.format(
        n_tasks_train, orig_data.train.num_examples), setting_name)

# -----------------------------------------------------------------------------------------------------------#
# Meta-training
# -----------------------------------------------------------------------------------------------------------#
# Checkpoint path for the learned prior.
prior_file_path = '/tmp/prior2.ckpt'

print('---- Meta-training ...')
startRuntime = timeit.default_timer()
test_accuracy = learn_bayes_multitask.learn_tasks(
    train_tasks_data,
    objective_type='Variational_Bayes',
    prior_file_path=prior_file_path,
    mode='Meta_Training',
    n_steps=n_steps_bayes)
stopRuntime = timeit.default_timer()
# epsilonStdDefault = 1 # For debug set epsilons with 0.0 -> recovers standard NN # # # Learning Parameters: # learning_rate = 1e-4 # 1e-4 # # # Ratio of of steps for first stage (learning the posterior mean only): # steps_stage_1_ratio = 0.05 # 0.1 # # # Ratio of of steps out of the second stage with epsilon = 1: # steps_with_full_eps_ratio = 0.2 # 0.5 # # batch_size = 128 # # Note we need large enough batch to reduce the variance of the gradient estimate # # since we are using the "Local Reparameterization Trick" the variance is proportional to 1/batch_size # # # Variables init # random_init_std = 0.1 # bias_init = 0 # Bayes Learning # -----------------------------------------------------------------------------------------------------------# print('---- Bayes-no-prior learning for a single task...') startRuntime = timeit.default_timer() test_accuracy = learn_bayes_single_task.learn_task( data1, objective_type='Bayes_No_Prior', n_steps=n_steps_bayes) stopRuntime = timeit.default_timer() cmn.write_result( 'Bayes-no-prior learning - Test Error: {0} %, Runtime: {1} [sec]'.format( 100 * (1 - test_accuracy), stopRuntime - startRuntime), setting_name)
with tf.Session() as sess: pass # --------------------------------------------------------------------------------------# # Run experiments # --------------------------------------------------------------------------------------# # Variational Bayes Learning # -----------------------------------------------------------------------------------------------------------# print('---- Variational Bayes learning for a single task...') startRuntime = timeit.default_timer() test_accuracy = learn_bayes_single_task.learn_task( data1, objective_type='Variational_Bayes', n_steps=n_steps_bayes) stopRuntime = timeit.default_timer() cmn.write_result( 'Variational Bayes learning - Test Error: {0} %, Runtime: {1} [sec]'. format(100 * (1 - test_accuracy), stopRuntime - startRuntime), setting_name) # -----------------------------------------------------------------------------------------------------------# # PAC-Bayes Learning # -----------------------------------------------------------------------------------------------------------# print('---- PAC-Bayesian McAllaster learning for a single task...') startRuntime = timeit.default_timer() test_accuracy = learn_bayes_single_task.learn_task( data1, objective_type='PAC_Bayes_McAllaster', n_steps=n_steps_bayes) stopRuntime = timeit.default_timer() cmn.write_result( 'PAC-Bayesian McAllaster learning - Test Error: {0} %, Runtime: {1} [sec]'. format(100 * (1 - test_accuracy), stopRuntime - startRuntime), setting_name)
# -----------------------------------------------------------------------------------------------------------# # # standard learning # # -----------------------------------------------------------------------------------------------------------# print('---- Standard learning on source task...') startRuntime = timeit.default_timer() test_accuracy_standard = learn_standard_single_task.learn_task( dataSet=source_task, weights_save_file=weights_file, dropout_flag=False, n_steps=n_steps_standard) stopRuntime = timeit.default_timer() cmn.write_result( 'Source task - standard learning - Average Test Error : {0} %, Runtime: {1} [sec]' .format(100 * (1 - test_accuracy_standard), stopRuntime - startRuntime), setting_name) print( '---- Standard learning on target task using transfered initial point...') startRuntime = timeit.default_timer() test_accuracy_standard = learn_standard_single_task.learn_task( dataSet=target_task, weights_restore_file=weights_file, dropout_flag=False, n_steps=n_steps_standard) stopRuntime = timeit.default_timer() cmn.write_result( 'Target task- standard learning - with transfer - Average Test Error : {0} %, Runtime: {1} [sec]' .format(100 * (1 - test_accuracy_standard), stopRuntime - startRuntime), setting_name)
# NOTE(review): fragment — the enclosing function's `def` line is outside
# this view; the indented tail below is the end of a contig-assembly
# routine over a de Bruijn graph (`graph`: node -> set of successors,
# `ins`/`outs`: per-node in/out degrees) — TODO confirm.
    # A contig starts at any node that is not a simple pass-through node,
    # i.e. anything other than "out degree 0 or 1 with in degree exactly 1".
    contig_starts = [
        v for v, out in outs.items() if not (out in (0, 1) and ins[v] == 1)
    ]
    print('contig_starts', contig_starts)
    contigs = []
    for start in contig_starts:
        while graph[start]:  # multiple edges: one contig per outgoing edge
            path = [start]
            u = graph[start].pop()
            # Extend through non-branching (1-in / 1-out) nodes only.
            while ins[u] == outs[u] == 1:
                path.append(u)
                u = graph[u].pop()
            # Spell the contig: first char of each node plus the final node.
            contigs.append(''.join(v[0] for v in path) + u)
    return contigs


if __name__ == '__main__':
    #dataset, test_contigs = big_example()
    dataset = read_dataset()
    dataset = [l.lower() for l in dataset]
    adjlist = debruijn_graph(dataset)
    contigs = sorted(assemble_contigs(adjlist))
    #print(len(contigs))
    #test_contigs = sorted(test_contigs)
    #print(len(test_contigs))
    write_result('\n'.join(sorted(contigs)))
# NOTE(review): fragment — the `else: return ''` below is the tail of a
# function (apparently `prefix_trie_matching`, judging by the calls further
# down) whose `def` line is outside this view.
    else:
        return ''


def trie_matching(text, trie):
    """Find all occurrences of the trie's patterns in `text`.

    Returns a dict mapping each matched pattern to the list of start
    positions at which it occurs in `text`.
    """
    occurences = defaultdict(list)
    for i in range(len(text)):
        # Try to match some pattern as a prefix of text[i:].
        postfix = text[i:]
        path = prefix_trie_matching(postfix, trie)
        if path:
            occurences[path].append(i)
    return occurences


if __name__ == '__main__':
    inp, out = small_example()
    #inp, out = big_example()
    inp = read_dataset()
    # First input line is the text; the remaining lines are the patterns.
    root, edges = trie(map(Seq, inp[1:]))
    occurences = trie_matching(Seq(inp[0]), root)
    positions = ''
    for p in inp[1:]:
        if p in occurences:
            positions += ' '.join(map(str, occurences[p])) + '\n'
    #positions = prefix_trie_matching(Seq(inp[0]), root)
    write_result(positions)
# NOTE(review): fragment — the statements below are the tail of a loop in
# a greedy reversal-sort routine (`greedy_sort`, called further down) whose
# `def` line is outside this view.
            permuts.append(permut)
        if -el < 0:
            # Flip the sign of the element at `pos` and record the new state.
            permut = permut[:pos] + [abs(permut[pos])] + permut[pos + 1:]
            permuts.append(permut)
            break
    return permuts


if __name__ == '__main__':
    inp, out = small_example()
    inp, out = big_example()
    inp = read_dataset()
    # Strip the surrounding parentheses and parse the signed integers.
    # NOTE(review): assumes `read_dataset` returns a single string here
    # (if it returned a list, `.split()` on the slice would fail) — TODO
    # confirm.
    premut = list(map(int, inp[1:-1].split()))
    permuts = greedy_sort(premut)
    # Render each permutation as "(+1 -2 ...)" with explicit plus signs.
    strs = ('(' + ' '.join(('+' if i > 0 else '') + str(i) for i in p) + ')'
            for p in permuts)
    #for s_mine, s_their in zip(strs, out):
    #    print(len(s_mine), len(s_their))
    #    for i, c_m, c_t in zip(count(), s_mine, s_their):
    #        if c_m != c_t:
    #            print(i)
    #            print(s_mine)
    #            print(s_their)
    #            print(c_m)
    #            print(c_t)
    #            input()
    # NOTE(review): `str` shadows the builtin; harmless at the end of the
    # script but worth renaming.
    str = '\n'.join(strs)
    write_result(str)
def main(argv):
    """Train (or load) a BERT-based NER model and evaluate several
    context-combination strategies on the test set.

    Depending on the flags, sentences are processed without context
    (``--no_context``), documentwise (``--documentwise``) or with
    sentence context, and predictions are combined by several voting
    schemes (CMV / CMVP / F / FP and documentwise variants). Precision,
    recall and F-score per method are written to a CSV results file.
    Returns 0 on success.

    NOTE(review): reconstructed from a whitespace-mangled source; the
    indentation of some statements (e.g. whether compile/fit sit inside
    the "train from scratch" branch) is inferred — verify against the
    original file.
    """
    argparser = argument_parser()
    args = argparser.parse_args(argv[1:])
    seq_len = args.max_seq_length  # abbreviation
    pretrained_model, tokenizer = load_pretrained(args)
    train_words, train_tags = read_conll(args.train_data)
    test_words, test_tags = read_conll(args.test_data)
    print(args.no_context)
    # Choose how sentences are packed into model inputs.
    if args.no_context:
        train_data = process_no_context(train_words, train_tags, tokenizer, seq_len)
        test_data = process_no_context(test_words, test_tags, tokenizer, seq_len)
    elif args.documentwise:
        tr_docs, tr_doc_tags, tr_line_ids = split_to_documents(train_words, train_tags)
        te_docs, te_doc_tags, te_line_ids = split_to_documents(test_words, test_tags)
        train_data = process_docs(tr_docs, tr_doc_tags, tr_line_ids, tokenizer, seq_len)
        test_data = process_docs(te_docs, te_doc_tags, te_line_ids, tokenizer, seq_len)
    else:
        train_data = process_sentences(train_words, train_tags, tokenizer,
                                       seq_len, args.predict_position)
        test_data = process_sentences(test_words, test_tags, tokenizer,
                                      seq_len, args.predict_position)
    label_list = get_labels(train_data.labels)
    tag_map = {l: i for i, l in enumerate(label_list)}
    inv_tag_map = {v: k for k, v in tag_map.items()}
    train_x = encode(train_data.combined_tokens, tokenizer, seq_len)
    test_x = encode(test_data.combined_tokens, tokenizer, seq_len)
    train_y, train_weights = label_encode(train_data.combined_labels, tag_map, seq_len)
    test_y, test_weights = label_encode(test_data.combined_labels, tag_map, seq_len)
    # Either reuse a previously saved model or train one from scratch.
    if args.use_ner_model and (args.ner_model_dir is not None):
        ner_model, tokenizer, labels, config = load_ner_model(args.ner_model_dir)
    else:
        optimizer = create_optimizer(len(train_x[0]), args)
        model = create_ner_model(pretrained_model, len(tag_map))
        if args.num_gpus > 1:
            ner_model = multi_gpu_model(model, args.num_gpus)
        else:
            ner_model = model
        ner_model.compile(
            optimizer,
            loss='sparse_categorical_crossentropy',
            sample_weight_mode='temporal',
            metrics=['sparse_categorical_accuracy']
        )
        ner_model.fit(
            train_x, train_y,
            sample_weight=train_weights,
            epochs=args.num_train_epochs,
            batch_size=args.batch_size
        )
    if args.ner_model_dir is not None:
        # Persist labels in id order so loading restores the same mapping.
        label_list = [v for k, v in sorted(list(inv_tag_map.items()))]
        save_ner_model(ner_model, tokenizer, label_list, args)
    probs = ner_model.predict(test_x, batch_size=args.batch_size)
    preds = np.argmax(probs, axis=-1)
    results = []   # [precision, recall, fscore] per evaluated method
    m_names = []   # method names, parallel to `results`
    if args.no_context:
        pr_ensemble, pr_test_first = get_predictions(preds, test_data.tokens,
                                                     test_data.sentence_numbers)
        output_file = "output/{}-NC.tsv".format(args.output_file)
        m_names.append('NC')
        ensemble = []
        for i, pred in enumerate(pr_test_first):
            ensemble.append([inv_tag_map[t] for t in pred])
        lines_ensemble, sentences_ensemble = write_result(
            output_file, test_data.words, test_data.lengths,
            test_data.tokens, test_data.labels, ensemble
        )
        c = conlleval.evaluate(lines_ensemble)
        conlleval.report(c)
        results.append([conlleval.metrics(c)[0].prec,
                        conlleval.metrics(c)[0].rec,
                        conlleval.metrics(c)[0].fscore])
    else:
        # First tag then vote
        pr_ensemble, pr_test_first = get_predictions(preds, test_data.tokens,
                                                     test_data.sentence_numbers)
        # Accumulate probabilities, then vote
        prob_ensemble, prob_test_first = get_predictions2(probs, test_data.tokens,
                                                          test_data.sentence_numbers)
        ens = [pr_ensemble, prob_ensemble, pr_test_first, prob_test_first]
        if args.documentwise:
            # D-CMV: Documentwise CMV
            # D-CMVP: Documentwise CMV, probs summed, argmax after that
            # D-F: Documentwise First
            # D-FP: Same as D-F but probability-based
            method_names = ['D-CMV', 'D-CMVP', 'D-F', 'D-FP']
        else:
            method_names = ['CMV', 'CMVP', 'F', 'FP']
        for i, ensem in enumerate(ens):
            ensemble = []
            for j, pred in enumerate(ensem):
                ensemble.append([inv_tag_map[t] for t in pred])
            output_file = "output/{}-{}.tsv".format(args.output_file, method_names[i])
            lines_ensemble, sentences_ensemble = write_result(
                output_file, test_data.words, test_data.lengths,
                test_data.tokens, test_data.labels, ensemble)
            print("Model trained: ", args.ner_model_dir)
            print("Seq-len: ", args.max_seq_length)
            print("Learning rate: ", args.learning_rate)
            print("Batch Size: ", args.batch_size)
            print("Epochs: ", args.num_train_epochs)
            print("Training data: ", args.train_data)
            print("Testing data: ", args.test_data)
            print("")
            print("Results with {}".format(method_names[i]))
            c = conlleval.evaluate(lines_ensemble)
            print("")
            conlleval.report(c)
            results.append([conlleval.metrics(c)[0].prec,
                            conlleval.metrics(c)[0].rec,
                            conlleval.metrics(c)[0].fscore])
        m_names.extend(method_names)
    if args.sentence_in_context:
        # Sweep the prediction starting position inside the context window.
        starting_pos = np.arange(0, seq_len + 1, 32)
        starting_pos[0] = 1
        m_names.extend(starting_pos)
        for start_p in starting_pos:
            tt_lines, tt_tags, line_nos, line_starts = combine_sentences2(
                test_data.tokens, test_data.labels, seq_len - 1, start_p - 1)
            tt_x = encode(tt_lines, tokenizer, seq_len)
            tt_y, train_weights = label_encode(tt_tags, tag_map, seq_len)
            probs = ner_model.predict(tt_x, batch_size=args.batch_size)
            preds = np.argmax(probs, axis=-1)
            pred_tags = []
            for i, pred in enumerate(preds):
                # Locate sentence i inside its combined line and slice out
                # exactly its own tokens' predictions.
                idx = line_nos[i].index(i)
                pred_tags.append(
                    [inv_tag_map[t]
                     for t in pred[line_starts[i][idx] + 1:
                                   line_starts[i][idx] + len(test_data.tokens[i]) + 1]])
            output_file = "output/{}-{}.tsv".format(args.output_file, start_p)
            lines_first, sentences_first = write_result(
                output_file, test_data.words, test_data.lengths,
                test_data.tokens, test_data.labels, pred_tags
            )
            print("")
            print("Results with prediction starting position ", start_p)
            c = conlleval.evaluate(lines_first)
            conlleval.report(c)
            results.append([conlleval.metrics(c)[0].prec,
                            conlleval.metrics(c)[0].rec,
                            conlleval.metrics(c)[0].fscore])
    # One CSV row per evaluated method: run parameters, method name, metrics.
    result_file = "./results/results-{}.csv".format(args.output_file)
    with open(result_file, 'w+') as f:
        for i, line in enumerate(results):
            params = "{},{},{},{},{},{},{},{},{}".format(
                args.output_file, args.max_seq_length, args.bert_config_file,
                args.num_train_epochs, args.learning_rate, args.batch_size,
                args.predict_position, args.train_data, args.test_data)
            f.write(params)
            f.write(",{}".format(m_names[i]))
            for item in line:
                f.write(",{}".format(item))
            f.write('\n')
    for i in results:
        print(i)
    return 0