def load_sentences(path, lower, zeros):
    """
    Load sentences. A line must contain at least a word and its tag.
    Sentences are separated by empty lines.
    """
    sentences = []
    sentence = []
    num = 0
    for line in codecs.open(path, 'r', 'utf8'):
        num += 1
        line = zero_digits(line.rstrip()) if zeros else line.rstrip()
        if not line:
            if len(sentence) > 0:
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []
        else:
            # Escape a leading space so the token is not lost by split()
            if line[0] == " ":
                line = "$" + line[1:]
            word = line.split()
            assert len(word) >= 2, word
            sentence.append(word)
    if len(sentence) > 0:
        if 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)
    return sentences
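# Every loader in this section calls a zero_digits helper that is not shown.
# A minimal sketch, assuming the standard digit-to-zero normalization these
# taggers use:
import re

def zero_digits(s):
    """Replace every digit in s with 0, so that e.g. '24/7' and '12/3'
    both map to the normalized form '00/0'."""
    return re.sub(r'\d', '0', s)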
def load_sentences(path, lower, zeros, plus_tag=False):
    """
    Load sentences. A line must contain at least a word and its tag.
    Sentences are separated by empty lines.
    """
    sentences = []
    sentence = []
    for line in codecs.open(path, 'r', 'utf-8'):
        # If zeros is set, replace every digit with 0
        line = zero_digits(line.rstrip()) if zeros else line.rstrip()
        if not line:
            # Blank line between sentences
            if len(sentence) > 0:
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []
        else:
            # One word line of the sentence
            word = line.split()
            assert len(word) >= 2
            if plus_tag:
                # Prepend a combined word/tag token, e.g. 'dog/NN'
                word_tag = word[0] + '/' + word[1]
                word.insert(0, word_tag)
            sentence.append(word)
    if len(sentence) > 0:
        if 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)
    return sentences
def load_sentences(path, lower, zeros):
    """
    Load sentences. A line must contain at least a word and its tag.
    Sentences are separated by empty lines.
    """
    sentences = []
    sentence = []
    max_sentence_length = 0
    max_word_length = 0
    for line in codecs.open(path, 'r', 'utf8'):
        line = zero_digits(line.rstrip()) if zeros else line.rstrip()
        if not line:
            if len(sentence) > 0:
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                    if len(sentence) > max_sentence_length:
                        max_sentence_length = len(sentence)
                sentence = []
        else:
            word = line.split()
            assert len(word) >= 2
            sentence.append(word)
            if len(word[0]) > max_word_length:
                max_word_length = len(word[0])
    if len(sentence) > 0:
        if 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)
            if len(sentence) > max_sentence_length:
                max_sentence_length = len(sentence)
    return sentences, max_sentence_length, max_word_length
def load_sentences(path, lower, zeros):
    """
    Load sentences. A line must contain at least a word and its tag.
    Sentences are separated by empty lines.
    """
    sentences = []
    sentence = []
    for line in codecs.open(path, 'r', 'utf8'):
        line = zero_digits(line.rstrip()) if zeros else line.rstrip()
        if not line:
            if len(sentence) > 0:
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []
        else:
            word = line.split()
            if len(word) < 2:
                # Report the offending file and line before failing
                print(path)
                print(line)
                print(word)
            assert len(word) >= 2
            sentence.append(word)
    if len(sentence) > 0:
        if 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)
    return sentences
def load_sentences(path, lower, zeros):
    """
    Load sentences. A line must contain at least a word and its tag.
    Sentences are separated by empty lines.
    """
    sentences = []
    sentence = []
    ind = 0
    for line in codecs.open(path, 'r', 'utf8'):
        line = zero_digits(line.rstrip()) if zeros else line.rstrip()
        # Normalize the 'creative-work' label to a single token
        line = line.replace('creative-work', 'creativework')
        if not line:
            if len(sentence) > 0:
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []
        else:
            word = line.split()
            if len(word) < 6:
                print(line, ind, path)
            assert len(word) == 6
            sentence.append(word)
            ind += 1
    if len(sentence) > 0:
        if 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)
    return sentences
def load_sentences2(path, lower, zeros, line_idx):
    """
    Load sentences. A line must contain at least a word and its tag.
    Sentences are separated by empty lines.
    Reads the file in chunks starting at line_idx and returns the sentences
    found plus the index of the next unread line.
    """
    sentences = []
    sentence = []
    start_line_idx = line_idx
    read_lines = open(path, 'r', encoding='utf-8').readlines()
    leng = get_tot_length(path)
    # Originally: while len(sentences) < 300 and line_idx < leng:
    # now reads up to 10000 lines per call instead.
    while line_idx - start_line_idx <= 10000:
        line = read_lines[line_idx]
        line = zero_digits(line.rstrip()) if zeros else line.rstrip()
        if not line:
            if len(sentence) > 0:
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []
        else:
            word = line.split()
            assert len(word) >= 2
            sentence.append(word)
        line_idx += 1
    if len(sentence) > 0:
        if 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)
    return sentences, line_idx
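# load_sentences2 relies on a get_tot_length helper that is not shown.
# A plausible minimal sketch (hypothetical implementation that simply
# counts the lines in the file, matching its use as a bound on line_idx):
def get_tot_length(path):
    with open(path, 'r', encoding='utf-8') as f:
        return sum(1 for _ in f)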
def load_sentences(path, zeros):
    """
    Load sentences. A line must contain at least a word and its tag.
    Sentences are separated by empty lines.
    """
    sentences = []
    sentence = []
    label = []
    labels = []
    for line in codecs.open(path, 'r', 'utf-8'):
        line = zero_digits(line.rstrip()) if zeros else line.rstrip()
        if not line:
            if len(sentence) >= 2:
                sentences.append(sentence)
                labels.append(label)
            sentence = []
            label = []
        else:
            word = line.split()
            assert len(word) >= 2
            sentence.append(word[0])
            label.append(word[3])
    if len(sentence) >= 2:
        sentences.append(sentence)
        labels.append(label)
    return sentences, labels
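# This variant keeps only the surface form (column 0) and the fourth
# column as the label, which matches the CoNLL-2003 column layout
# (word, POS tag, chunk tag, NER tag). A minimal illustration of the
# indexing on one such line (hypothetical data):
line = "Germany NNP B-NP B-LOC"
word = line.split()
assert word[0] == "Germany"  # surface form, kept in `sentence`
assert word[3] == "B-LOC"    # NER tag, kept in `label`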
def read_CONLL(path, zeros=True, lower=True, pos=False):
    sentences = []
    sentence = []
    idx = 0
    for line in codecs.open(path, 'r', 'utf8'):
        idx += 1
        line = zero_digits(line.rstrip()) if zeros else line.rstrip()
        if not line:
            if len(sentence) > 0:
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []
        else:
            word = line.split()
            if len(word) < 2:
                print(idx, line)
            assert len(word) >= 2
            sentence.append(word)
    if len(sentence) > 0:
        if 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)
    words = [[x[0].lower() if lower else x[0] for x in s] for s in sentences]
    if pos:
        # Second-to-last column holds the POS tag in this layout
        tags = [[w[-2] for w in s] for s in sentences]
    else:
        # Last column holds the NER tag
        tags = [[w[-1] for w in s] for s in sentences]
    return words, tags
def load_sentences(input_file_path_or_list, zeros, file_format="conll"): """ Load sentences. A line must contain at least a word and its tag. Sentences are separated by empty lines. """ assert file_format in ["conll", "conllu"] sentences = [] sentence = [] max_sentence_length = 0 max_word_length = 0 if isinstance(input_file_path_or_list, str): input_f = codecs.open(input_file_path_or_list, 'r', 'utf8') else: input_f = input_file_path_or_list if file_format == "conllu": sep = '\t' elif file_format == "conll": sep = None for line in input_f: if file_format == "conllu" and line.startswith("#"): continue line = zero_digits(line.rstrip()) if zeros else line.rstrip() if not line: if len(sentence) > 0: if 'DOCSTART' not in sentence[0][0]: # print sentence # sys.exit() sentences.append(sentence) if len(sentence) > max_sentence_length: max_sentence_length = len(sentence) sentence = [] else: tokens = line.split(sep) if file_format == "conll": assert len(tokens) >= 2 elif file_format == "conllu": assert len(tokens) == 10, line + " " + " ".join( tokens) + " CONLL-U format requires exactly 10 columns" if "-" in tokens[ 0]: # skip if the first column contains '-' as this indicates that this line is irrelavant for us. continue sentence.append(tokens) if len(tokens[0]) > max_word_length: max_word_length = len(tokens[0]) if len(sentence) > 0: if 'DOCSTART' not in sentence[0][0]: sentences.append(sentence) if len(sentence) > max_sentence_length: max_sentence_length = len(sentence) return sentences, max_sentence_length, max_word_length
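# Since the first argument can be a path or any iterable of lines, the
# CoNLL-U branch can be exercised without touching disk. A minimal sketch
# with a hand-written two-token sentence (hypothetical data):
conllu_lines = [
    "# sent_id = 1",                               # comment line, skipped
    "1\tDogs\tdog\tNOUN\tNNS\t_\t2\tnsubj\t_\t_",  # 10 tab-separated columns
    "2\tbark\tbark\tVERB\tVBP\t_\t0\troot\t_\t_",
    "",                                            # blank line ends sentence
]
sents, max_sent_len, max_word_len = load_sentences(
    conllu_lines, zeros=False, file_format="conllu")
assert len(sents) == 1 and len(sents[0]) == 2 and max_sent_len == 2
# Note: max_word_length is measured on tokens[0], which in the CoNLL-U
# branch is the numeric ID column, so it is 1 here.
assert max_word_len == 1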
def tag():
    if request.method == 'POST':
        data = request.get_json()
        text = data['text']
        if data['split_sentences']:
            sentences = split_sentences(text)
        else:
            sentences = text
        if data['tokenize'] or data['split_sentences']:
            tokenized_sentences = [tokenize(s) for s in sentences]
        else:
            tokenized_sentences = text
        count = 0
        output = []
        for words in tokenized_sentences:
            if len(words) == 0:
                continue
            # Lowercase sentence (the original referenced an undefined
            # `line` here; the normalization belongs on `words`)
            if model.parameters['lower']:
                words = [w.lower() for w in words]
            # Replace all digits with zeros
            if model.parameters['zeros']:
                words = [zero_digits(w) for w in words]
            # Prepare input
            sentence = prepare_sentence(words, word_to_id, char_to_id,
                                        lower=model.parameters['lower'])
            input = create_input(sentence, model.parameters, False)
            # Decoding
            if model.parameters['crf']:
                y_preds = np.array(f_eval(*input))[1:-1]
            else:
                y_preds = f_eval(*input).argmax(axis=1)
            y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]
            # Output tags in the IOB2 format
            if model.parameters['tag_scheme'] == 'iobes':
                y_preds = iobes_iob(y_preds)
            # Write tags
            assert len(y_preds) == len(words), \
                "Predictions have different length than sentence. " \
                "Something went wrong."
            output.append(list(zip(words, y_preds)))
            count += 1
            if count % 100 == 0:
                logging.info(count)
        return jsonify(output)
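# A hedged client sketch for the handler above. The host, port, and route
# are assumptions; only the JSON keys ("text", "split_sentences",
# "tokenize") come from the handler itself.
import requests

resp = requests.post(
    "http://localhost:5000/tag",              # assumed host/port/route
    json={"text": "John lives in Berlin.",    # hypothetical input
          "split_sentences": True,
          "tokenize": True},
)
print(resp.json())  # list of (word, tag) pairs per sentence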
def tag(model, line):
    # Load existing model
    print("Loading model...")
    model = Model(model_path=model)
    parameters = model.parameters
    # Load reverse mappings
    word_to_id, char_to_id, tag_to_id = [
        {v: k for k, v in x.items()}
        for x in [model.id_to_word, model.id_to_char, model.id_to_tag]
    ]
    # Load the model
    _, f_eval = model.build(training=False, **parameters)
    model.reload()
    start = time.time()
    print('Tagging...')
    words_ini = line.rstrip().split()
    # Replace all digits with zeros
    if parameters['zeros']:
        line = zero_digits(line)
    words = line.rstrip().split()
    # Prepare input
    sentence = prepare_sentence(words, word_to_id, char_to_id,
                                lower=parameters['lower'])
    input = create_input(sentence, parameters, False)
    # Decoding
    if parameters['crf']:
        y_preds = np.array(f_eval(*input))[1:-1]
    else:
        y_preds = f_eval(*input).argmax(axis=1)
    y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]
    # Output tags in the IOB2 format
    if parameters['tag_scheme'] == 'iobes':
        y_preds = iobes_iob(y_preds)
    # Write tags
    assert len(y_preds) == len(words)
    print('---- sentence tagged in %.4fs ----' % (time.time() - start))
    return ' '.join(w + '__' + str(y) for w, y in zip(words_ini, y_preds))
def load_sentences(path, zeros):
    """
    Load sentences. A line must contain at least a word and its tag.
    Sentences are separated by empty lines.

    Returns a list of lists of lists:
    [
        [[sent1_word1, ..., sent1_tag1], ..., [sent1_wordn, ..., sent1_tagn]],
        ...
        [[sentl_word1, ..., sentl_tag1], ..., [sentl_wordn, ..., sentl_tagn]],
    ]
    """
    sentences = []
    sentence = []
    for line in codecs.open(path, 'r', 'utf8'):
        line = zero_digits(line.rstrip()) if zeros else line.rstrip()
        if not line:
            if len(sentence) > 0:
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []
        else:
            word = line.split()
            assert len(word) >= 2
            sentence.append(word)
    if len(sentence) > 0:
        if 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)
    return sentences
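# A minimal end-to-end sketch of the loader above, assuming the codecs
# import used throughout and a throwaway two-column file (file name and
# contents are hypothetical):
sample = ("-DOCSTART- O\n"
          "\n"
          "John B-PER\n"
          "lives O\n"
          "in O\n"
          "Berlin B-LOC\n")
with codecs.open("sample.conll", "w", "utf8") as f:  # hypothetical file
    f.write(sample)
sents = load_sentences("sample.conll", zeros=True)
# The DOCSTART pseudo-sentence is filtered out, leaving one real sentence
assert sents == [[["John", "B-PER"], ["lives", "O"],
                  ["in", "O"], ["Berlin", "B-LOC"]]]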
def tag_document(doc, parameters, model, f_eval, word_to_id, char_to_id):
    count = 0
    all_ypreds = list()
    all_tokens = list()
    for line in doc.sentences:
        toks_text = [x.orth_ for x in line.tokens]
        # WL edit: used to be 'if line', which crashed on '\n' lines
        if toks_text:
            # Lowercase sentence
            if parameters['lower']:
                toks_text = [t.lower() for t in toks_text]
            # Replace all digits with zeros
            if parameters['zeros']:
                toks_text = [zero_digits(t) for t in toks_text]
            # Prepare input
            sentence = prepare_sentence(toks_text, word_to_id, char_to_id,
                                        lower=parameters['lower'])
            input = create_input(sentence, parameters, False)
            # Decoding
            if parameters['crf']:
                y_preds = np.array(f_eval(*input))[1:-1]
            else:
                y_preds = f_eval(*input).argmax(axis=1)
            y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]
            # Output tags in the IOB2 format
            if parameters['tag_scheme'] == 'iobes':
                y_preds = iobes_iob(y_preds)
            # Write tags
            assert len(y_preds) == len(toks_text)
            # Strip IOB prefixes
            y_preds = [x.split('-')[-1] for x in y_preds]
            all_ypreds.append(y_preds)
            all_tokens.append(toks_text)
        count += 1
        if count % 100 == 0:
            print(count)
    return (all_ypreds, all_tokens)
def predicts(self, line):
    if line:
        # Save original bigrams
        bigram_sent = self.to_bigram(line, 0).strip().split()
        # Replace all digits with zeros
        line = zero_digits(line)
        input_seq = self.to_bigram(line, 0).strip().split()
        # Prepare input
        sentence = prepare_sentence(input_seq, self.word_to_id,
                                    self.char_to_id,
                                    lower=self.parameters['lower'])
        input = create_input(sentence, self.parameters, False)
        if self.parameters['crf']:
            y_preds = np.array(self.f_eval(*input))[1:-1]
        else:
            y_preds = self.f_eval(*input).argmax(axis=1)
        tags = [self.id_to_tag[y_pred] for y_pred in y_preds]
        # Output tags in the IOB2 format
        if self.parameters['tag_scheme'] == 'iobes':
            tags = iobes_iob(tags)
        print(tags)
        # Build the bracketed output form, e.g. '<word:LABEL>'
        out_form = ""
        unigram_sent = self.bigrams_to_unigrams(bigram_sent)
        for i in range(len(tags)):
            if tags[i].startswith('B'):
                out_form += '<' + unigram_sent[i]
            elif tags[i].startswith('I'):
                if i == len(tags) - 1:
                    out_form += unigram_sent[i] + ':' + tags[i][2:] + '>'
                elif tags[i + 1] == 'O':
                    out_form += unigram_sent[i] + ':' + tags[i][2:] + '>'
                else:
                    out_form += unigram_sent[i]
            else:
                out_form += unigram_sent[i]
        return out_form
def load_sentences(path, zeros, lower):
    """
    Load sentences. A line must contain at least a word and its tag.
    Sentences are separated by empty lines.
    """
    sentences = []
    sentence = []
    for line in codecs.open(path, 'r', 'utf8'):
        if not line.rstrip():
            if len(sentence) > 0:
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []
        else:
            # Normalize digits on the raw line, then split; the original
            # passed the token list to zero_digits, which expects a string
            word = (zero_digits(line.rstrip()).split()
                    if zeros else line.rstrip().split())
            assert len(word) >= 2
            sentence.append(word)
    if len(sentence) > 0:
        if 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)
    return sentences
def load_ner2line_sentences(path, lower=False, zeros=False):
    sentences = []
    tokens = []
    marks = []
    istoken = True
    len_mention = 0
    for line in codecs.open(path, 'r', 'utf-8'):
        line = line.strip()
        if line.startswith('#begin document'):
            logging.info('Skip line %s' % line)
            continue
        if len(line) > 0:
            if istoken:
                # First line of a pair: the tokens
                line = zero_digits(line) if zeros else line
                for tok in line.split():
                    tokens.append(tok)
                istoken = False
            else:
                # Second line of a pair: the markables
                for mark in line.split():
                    index = mark.split(',')
                    if len(index) < 4:
                        start, end, label, head = int(index[0]), int(index[1]), 'M', -1
                    else:
                        start, end, label, head = (int(index[0]), int(index[1]),
                                                   index[2], int(index[3]))
                    label = mention_type(label)
                    if end <= start or end > len(tokens):
                        logging.info(
                            "WARNING: markable boundary out of sentence, "
                            "sentence length: %d markable: %d, %d"
                            % (len(tokens), start, end))
                    else:
                        marks.append((start, end - 1, label, head))
                        len_mention += 1
        else:
            if len(tokens) > 0:
                sentences.append({'tokens': tokens, 'marks': marks})
            tokens = []
            marks = []
            istoken = True
    return sentences, len_mention
def load_sentences(path, lower, zeros):
    """
    Load sentences. When running the model, a line must contain only one
    word of the citation string; when training, it must contain the word
    and its corresponding tag. Sentences are separated by empty lines.
    """
    sentences = []
    sentence = []
    for line in codecs.open(path, 'r', 'cp850'):
        line = zero_digits(line.rstrip()) if zeros else line.rstrip()
        if not line:
            if len(sentence) > 0:
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []
        else:
            word = line.split()
            sentence.append(word)
    if len(sentence) > 0:
        if 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)
    return sentences
def load_sentences(path, lower, zeros):
    """
    Load sentences. A line must contain at least a word and its tag,
    separated by tabs. Sentences are separated by empty lines.
    """
    sentences = []
    sentence = []
    for line in codecs.open(path, 'r', 'utf8'):
        line = zero_digits(line.rstrip()) if zeros else line.rstrip()
        if not line:
            if len(sentence) > 0:
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []
        else:
            word = line.split('\t')
            assert len(word) >= 2
            sentence.append(word)
    if len(sentence) > 0:
        if 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)
    return sentences
def load_sentences(path, lower=False, zeros=False):
    """
    Load sentences. A line must contain at least a word and its tag.
    Sentences are separated by empty lines.
    """
    sentences = []
    sentence = []
    for line in open(path, 'r'):
        line = line.strip()
        if not line:
            if len(sentence) > 0:
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []
        else:
            word = line.split()
            # Only normalize digits in the word itself, not in the tags
            word[0] = zero_digits(word[0]) if zeros else word[0]
            # assert len(word) >= 2
            sentence.append(word)
    if len(sentence) > 0:
        if 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)
    return sentences
def load_sentences(path, lower, zeros):
    """
    Load sentences. A line must contain at least a word and its tag.
    Sentences are separated by empty lines.
    """
    sentences = []
    sentence = []
    for line in codecs.open(path, 'r', 'utf8'):
        if not line.rstrip():
            if len(sentence) > 0:
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []
        else:
            # Normalize digits on the raw line, then split; the original
            # passed the token list to zero_digits, which expects a string
            word = (zero_digits(line.rstrip()).split()
                    if zeros else line.rstrip().split())
            if lower:
                # Map every ASCII letter (upper or lower case) to 'a'
                word = [re.sub('[\u0061-\u007a]', 'a',
                               re.sub('[\u0041-\u005a]', 'a', word[0])),
                        word[1], word[2]]
            assert len(word) >= 2
            sentence.append(word)
    if len(sentence) > 0:
        if 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)
    return sentences
def load_data(path, zeros):
    """
    Load sentences from the data set at path. Sentences are separated by
    empty lines. If zeros is set, all digits are replaced with zeros.
    """
    sentences = []
    sentence = []
    for line in open(path, 'r', encoding='UTF-8'):
        line = zero_digits(line.rstrip()) if zeros else line.rstrip()
        if line:
            word = line.split()
            assert len(word) >= 2
            sentence.append(word)
        else:
            # Guard against consecutive blank lines, which would leave
            # `sentence` empty and make sentence[0][0] raise an IndexError
            if len(sentence) > 0 and 'DOCSTART' not in sentence[0][0]:
                sentences.append(sentence)
            sentence = []
    if len(sentence) > 0:
        if 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)
    return sentences
def load_sentences(path, lower, zeros):
    """
    Load sentences. A line must contain at least a word and its tag.
    Sentences are separated by empty lines.
    """
    sentences = []
    sentence = []
    for i, line in enumerate(codecs.open(path, 'r', 'utf8')):
        line = zero_digits(line.rstrip()) if zeros else line.rstrip()
        if not line:
            if len(sentence) > 0:
                if 'DOCSTART' not in sentence[0][0]:
                    sentences.append(sentence)
                sentence = []
        else:
            word = line.split()
            assert len(word) >= 2, \
                'Error parsing "%s", line %d in %s' % (line, i, path)
            sentence.append(word)
    if len(sentence) > 0:
        if 'DOCSTART' not in sentence[0][0]:
            sentences.append(sentence)
    return sentences
def evaluate(test_file, models, tmp_folder, result_file):
    """
    :param test_file: CoNLL-format file containing gold labels of all layers
    :param models: list of model paths, one per layer
    :param tmp_folder: temp folder
    :param result_file: open file object the evaluation report is written to
    """
    input = test_file
    file_name = os.path.basename(test_file)
    lines = common.get_all_lines(input)
    newlines = []
    nlayer = len(models)
    for line in lines:
        if line.strip() != "":
            # Remove all gold labels and append an 'O' tag
            newline = "\t".join(line.strip().split("\t")[:-nlayer] + ["O"])
            newlines.append(newline)
        else:
            newlines.append("")
    input_files = [tmp_folder + "/" + file_name + ".temp.layer" + str(i)
                   for i in range(nlayer + 1)]
    eval_files = [tmp_folder + "/" + file_name + ".eval" + str(i)
                  for i in range(nlayer)]
    result_files = [tmp_folder + "/" + file_name + ".result.layer" + str(i) + ".txt"
                    for i in range(nlayer)]
    save_all_lines(newlines, input_files[0])  # create input file of layer 0
    eval_lines_all = []
    print("#LAYER:", nlayer)
    for i in range(nlayer):
        predict_a_file(input_files[i], input_files[i + 1], models[i], True)
        lines_i = common.get_all_lines(input_files[i + 1])
        print("#LEN:", len(lines_i))
        # Convert the output CoNLL file for conlleval
        lines_eval = []
        for k in range(len(lines)):
            if lines[k].strip() != "":
                tokens1 = lines[k].strip().split("\t")
                tokens2 = lines_i[k].strip().split("\t")
                assert zero_digits(tokens1[0]) == zero_digits(tokens2[0])
                lines_eval.append(" ".join(
                    [tokens1[0], tokens1[i - nlayer], tokens2[-2]]))
            else:
                lines_eval.append("")
        save_all_lines(lines_eval, eval_files[i])
        eval_lines = call_conlleval(eval_files[i], result_files[i])
        eval_lines_all.append("=======================")
        eval_lines_all.append("test file: " + test_file)
        eval_lines_all.append("layer" + str(i))
        eval_lines_all.extend(eval_lines)
    result_file.write("\n".join(eval_lines_all))
f_output = codecs.open(opts.output, 'w', 'utf-8')
start = time.time()
print('Tagging...')
with codecs.open(opts.input, 'r', 'utf-8') as f_input:
    count = 0
    for line in f_input:
        words_ini = line.rstrip().split()
        if line:
            # Lowercase sentence
            if parameters['lower']:
                line = line.lower()
            # Replace all digits with zeros
            if parameters['zeros']:
                line = zero_digits(line)
            words = line.rstrip().split()
            # Prepare input
            sentence = prepare_sentence(words, word_to_id, char_to_id,
                                        lower=parameters['lower'])
            input = create_input(sentence, parameters, False)
            # Decoding
            if parameters['crf']:
                y_preds = np.array(f_eval(*input))[1:-1]
            else:
                y_preds = f_eval(*input).argmax(axis=1)
            y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]
            # Output tags in the IOB2 format
            if parameters['tag_scheme'] == 'iobes':
                y_preds = iobes_iob(y_preds)
            # Write tags
def run_tagging(model, f_eval, parameters, word_to_id, char_to_id, tag_to_id,
                opts_input="", opts_output="", opts_delimiter="__",
                opts_outputFormat=""):
    # Check parameters validity
    assert opts_delimiter
    assert os.path.isfile(opts_input)
    f_output = codecs.open(opts_output, 'w', 'utf-8')
    start = time.time()
    logger.info('Tagging...')
    with codecs.open(opts_input, 'r', 'utf-8') as f_input:
        count = 0
        for line in f_input:
            words_ini = line.rstrip().split()
            if line:
                # Lowercase sentence
                if parameters['lower']:
                    line = line.lower()
                # Replace all digits with zeros
                if parameters['zeros']:
                    line = zero_digits(line)
                words = line.rstrip().split()
                # Prepare input
                sentence = prepare_sentence(words, word_to_id, char_to_id,
                                            lower=parameters['lower'])
                input = create_input(sentence, parameters, False)
                # Decoding
                if parameters['crf']:
                    y_preds = np.array(f_eval(*input))[1:-1]
                else:
                    y_preds = f_eval(*input).argmax(axis=1)
                y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]
                # Output tags in the IOB2 format
                if parameters['tag_scheme'] == 'iobes':
                    y_preds = iobes_iob(y_preds)
                # Write tags
                assert len(y_preds) == len(words)
                if opts_outputFormat == 'json':
                    f_output.write(json.dumps({
                        "text": ' '.join(words),
                        "ranges": iob_ranges(y_preds)
                    }))
                else:
                    f_output.write('%s\n' % ' '.join(
                        '%s%s%s' % (w, opts_delimiter, y)
                        for w, y in zip(words_ini, y_preds)))
            else:
                f_output.write('\n')
            count += 1
    logger.info('---- %i lines tagged in %.4fs ----'
                % (count, time.time() - start))
    f_output.close()
    logger.info(opts_output)
    return opts_output + " has been tagged!"
_, f_eval = model.build(training=False, **parameters)
model.reload()
while True:
    sent = input("Type a query (type \"exit\" to exit):\n")
    if sent == 'exit':
        break
    # Lowercase sentence
    if parameters['lower']:
        sent = sent.lower()
    # Replace all digits with zeros
    if parameters['zeros']:
        sent = zero_digits(sent)
    # Split after normalization so the model sees the normalized tokens
    words = sent.rstrip().split()
    # Prepare input
    sentence = prepare_sentence(words, word_to_id, char_to_id,
                                lower=parameters['lower'])
    inputs = create_input(sentence, parameters, False)
    # Decoding
    y_preds = f_eval(*inputs).argmax(axis=1)
    y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]
    # Write tags, one word/tag pair per line
    assert len(y_preds) == len(words)
    print('%s\n\n' % '\n'.join(
        '%s%s%s' % (w, " ", y) for w, y in zip(words, y_preds)))
def ner():
    global model
    global f_eval
    global parameters
    global word_to_id
    global char_to_id
    global tag_to_id
    model_name = request.json["model"]
    words = request.json["words"]
    begin_end = request.json["begin_end"]
    if model is None:
        # Model loading
        print("Loading model " + model_name + "..")
        model = Model(model_path="models/" + models[model_name])
        parameters = model.parameters
        # Load reverse mappings
        word_to_id, char_to_id, tag_to_id = [
            {v: k for k, v in x.items()}
            for x in [model.id_to_word, model.id_to_char, model.id_to_tag]
        ]
        # Load the model
        _, f_eval = model.build(training=False, **parameters)
        model.reload()
    # Lowercase sentence
    if parameters['lower']:
        words = [w.lower() for w in words]
    # Replace all digits with zeros
    if parameters['zeros']:
        words = [zero_digits(w) for w in words]
    # Title-case fully uppercased words so they look like regular tokens
    words = [w if not w.isupper() else w.title() for w in words]
    # Prepare input
    sentence = prepare_sentence(words, word_to_id, char_to_id,
                                lower=parameters['lower'])
    input = create_input(sentence, parameters, False)
    # Decoding
    if parameters['crf']:
        y_preds = np.array(f_eval(*input))[1:-1]
    else:
        y_preds = f_eval(*input).argmax(axis=1)
    y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds]
    # Output tags in the IOB2 format
    if parameters['tag_scheme'] == 'iobes':
        y_preds = iobes_iob(y_preds)
    # Write tags
    assert len(y_preds) == len(words)  # TODO: remove assert?
    ents = [{"start_char": b, "end_char": e, "label": label}
            for (b, e), label in zip(begin_end, y_preds)
            if label != "O"]
    return json.dumps({"ents": ents})
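# A hedged client sketch for the handler above. The host, port, route, and
# model name are assumptions; only the JSON keys ("model", "words",
# "begin_end") and the response shape come from the handler itself.
import requests

words = ["John", "lives", "in", "Berlin", "."]
# Character offsets of each token in the original text (hypothetical)
begin_end = [(0, 4), (5, 10), (11, 13), (14, 20), (20, 21)]

resp = requests.post(
    "http://localhost:5000/ner",   # assumed host/port/route
    json={"model": "english",      # hypothetical model name
          "words": words,
          "begin_end": begin_end},
)
print(resp.json()["ents"])  # entities with start_char / end_char / label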