def tag(self, seq):
    """Split compound (multiword) tokens in *seq* into their component words.

    For each entry that is not already a compound-range entry, a classifier
    (`_predict_is_compound_entry`) decides whether the word should be split.
    If so, `_transduce` produces the parts, the original entry becomes a
    CoNLL-U range entry (e.g. index "3-5"), and one new ConllEntry is
    appended per part. Entries are re-indexed from 1.

    :param seq: iterable of ConllEntry-like objects (reads .is_compound_entry,
                .word; mutates .index in place)
    :return: new list of entries with compounds expanded
    """
    # Hoisted out of the loop: was re-imported on every compound part.
    from io_utils.conll import ConllEntry
    dy.renew_cg()
    new_seq = []
    index = 1
    for entry in seq:
        if not entry.is_compound_entry:
            ce_out, encoder_states = self._predict_is_compound_entry(
                unicode(entry.word, 'utf-8'), runtime=True)
            if np.argmax(ce_out.npvalue()) == 0:
                # Classifier says "not a compound": keep the entry as-is.
                entry.index = index
                new_seq.append(entry)
                index += 1
            else:
                compounds = self._transduce(unicode(entry.word, 'utf-8'),
                                            encoder_states)
                # _transduce may emit empty tokens; drop them so the range
                # arithmetic below stays valid.
                compounds = [t for t in compounds if t.strip() != ""]
                if len(compounds) <= 1:
                    # Splitting produced nothing usable: treat as plain word.
                    entry.index = index
                    new_seq.append(entry)
                    index += 1
                else:
                    # CoNLL-U ranges are inclusive: N parts starting at
                    # `index` end at index + N - 1 (not index + N).
                    entry.index = str(index) + '-' + str(index + len(compounds) - 1)
                    new_seq.append(entry)
                    for word in compounds:
                        part = ConllEntry(index, word.encode('utf-8'),
                                          word.encode('utf-8'), '_', '_', '_',
                                          '0', '_', '_', '')
                        new_seq.append(part)
                        index += 1
    return new_seq
def _get_tokens(self, input_string, space_after_end_of_sentence=True):
    """Segment *input_string* into ConllEntry tokens.

    A character is a token boundary when the tok model predicts class 1 for
    it. Each token records whether whitespace follows it (the CoNLL-U
    "SpaceAfter=No" MISC field); the final token's flag mirrors
    *space_after_end_of_sentence*.

    :param input_string: raw text of a single sentence
    :param space_after_end_of_sentence: whether the sentence is followed by
                                        whitespace in the original text
    :return: list of ConllEntry tokens, indexed from 1
    """
    y_pred, _, _ = self._predict_tok(input_string, runtime=True)
    tokens = []
    index = 0
    buf = ""
    last = len(input_string) - 1
    for i, ch in enumerate(input_string):
        buf += ch
        # Boundary predicted here and the buffer holds a real word.
        if np.argmax(y_pred[i].npvalue()) == 1 and buf.strip() != "":
            index += 1
            space_after = "SpaceAfter=No"
            if i < last and input_string[i + 1] in string.whitespace:
                space_after = "_"
            tokens.append(ConllEntry(index, str(buf).strip(), '_', "_", "_",
                                     "_", 0, "_", "_",
                                     space_after=space_after))
            buf = ""
    # Flush whatever remains after the last predicted boundary.
    if buf.strip() != "":
        index += 1
        tokens.append(ConllEntry(index, str(buf).strip(), '_', "_", "_", "_",
                                 0, "_", "_", ""))
    # The caller knows whether the sentence itself is followed by a space.
    if len(tokens) > 0:
        tokens[-1].space_after = (
            "SpaceAfter=No" if space_after_end_of_sentence == False else "_")
    return tokens
def _get_tokens(self, input_string):
    """Segment *input_string* into ConllEntry tokens (utf-8 encoded words).

    A character is a token boundary when the tok model predicts class 1 for
    it; whitespace-only buffers are never emitted as tokens.

    :param input_string: text of a single sentence
    :return: list of ConllEntry tokens, indexed from 1
    """
    y_pred, _, _ = self._predict_tok(input_string, runtime=True)
    tokens = []
    count = 0
    buf = ""
    for ch, pred in zip(input_string, y_pred):
        buf += ch
        # Emit a token at each predicted boundary, skipping empty buffers.
        if np.argmax(pred.npvalue()) == 1 and buf.strip() != "":
            count += 1
            tokens.append(ConllEntry(count, buf.strip().encode('utf-8'), '_',
                                     "_", "_", "_", 0, "_", "_", ""))
            buf = ""
    # Flush any trailing word left after the last boundary.
    if buf.strip() != "":
        count += 1
        tokens.append(ConllEntry(count, buf.strip().encode('utf-8'), '_',
                                 "_", "_", "_", 0, "_", "_", ""))
    return tokens
def tag(self, seq):
    """Split compound (multiword) tokens in *seq* into their component words.

    For each entry that is not already a compound-range entry, a classifier
    (`_predict_is_compound_entry`) decides whether the word should be split.
    If it should, `_transduce` produces the parts; empty parts are discarded
    (known _transduce quirk), and a split is only performed when more than
    one real part remains. The original entry then becomes a CoNLL-U range
    entry (e.g. index "3-5") with its morphological fields blanked, followed
    by one new ConllEntry per part. Entries are re-indexed from 1.

    :param seq: iterable of ConllEntry-like objects (reads .is_compound_entry,
                .word; mutates .index and annotation fields in place)
    :return: new list of entries with compounds expanded
    """
    # Hoisted: the original re-imported this inside the inner loop.
    from io_utils.conll import ConllEntry
    dy.renew_cg()
    new_seq = []
    index = 1
    for entry in seq:
        if not entry.is_compound_entry:
            ce_out, encoder_states = self._predict_is_compound_entry(
                unicode(entry.word, 'utf-8'), runtime=True)
            if np.argmax(ce_out.npvalue()) == 0:
                # Classifier says "not a compound": keep the entry as-is.
                entry.index = index
                new_seq.append(entry)
                index += 1
            else:
                compounds = self._transduce(unicode(entry.word, 'utf-8'),
                                            encoder_states)
                # _transduce may return empty tokens; keep only real ones.
                compounds = [token for token in compounds
                             if token.strip() != ""]
                if len(compounds) <= 1:
                    # Nothing to actually split: treat as a plain word.
                    entry.index = index
                    new_seq.append(entry)
                    index += 1
                else:
                    # Inclusive CoNLL-U range: N parts span index..index+N-1.
                    entry.index = str(index) + '-' + str(index + len(compounds) - 1)
                    entry.is_compound_entry = True
                    # Range entries carry no annotation of their own.
                    entry.upos = '_'
                    entry.xpos = '_'
                    entry.attrs = '_'
                    entry.label = '_'
                    entry.head = '_'
                    entry.deps = '_'
                    new_seq.append(entry)
                    # `part` (not `entry`) so the outer loop variable is
                    # not shadowed.
                    for word in compounds:
                        part = ConllEntry(index, word.encode('utf-8'),
                                          word.encode('utf-8'), '_', '_', '_',
                                          '0', '_', '_', '')
                        new_seq.append(part)
                        index += 1
    return new_seq
def tokenize(self, input_string):
    """Split raw text into sentences of ConllEntry tokens.

    *input_string* may contain several sentences. The text is consumed in
    windows of `config.tokenize_maximum_sequence_length` characters, each
    window growing until the character-level model predicts an
    end-of-sentence label ("SX") or the input is exhausted. Within a window,
    label "O" continues the current word and label "S" closes it.

    Progress (in 5% steps) is written to stdout as a side effect.

    :param input_string: bytes (Python 2) or text (Python 3) to tokenize
    :return: list of sentences, each a list of ConllEntry tokens
    """
    import sys
    if sys.version_info[0] == 2:
        uni_string = unicode(input_string, 'utf-8')
    else:
        uni_string = input_string
    offset = 0
    sentences = []
    last_proc = 0
    while offset < len(uni_string):
        # Report progress in 5% increments.
        proc = (offset + 1) * 100 / len(uni_string)
        while last_proc + 5 < proc:
            last_proc += 5
            sys.stdout.write(" " + str(last_proc))
            sys.stdout.flush()
        window = 0
        while True:
            # Extend the window until we find an end of sentence ("SX").
            window += self.config.tokenize_maximum_sequence_length
            X = uni_string[offset:min(len(uni_string), offset + window)]
            softmax, _, _ = self._predict(X)
            # Convert per-character softmax outputs to label strings.
            labels = [self.decoder_output_i2c[np.argmax(s.npvalue())]
                      for s in softmax]
            if "SX" in labels:
                break
            elif offset + len(labels) >= len(uni_string):
                # Reached end of input without an SX; exit as well.
                break
        offset += len(labels)
        # Create the sentence from the predicted labels.
        sentence = []
        word = ""
        cnt = 1
        for i in range(len(labels)):
            if "O" in labels[i]:
                word = word + X[i]
            if "S" in labels[i]:
                if X[i] in string.whitespace:
                    # Whitespace closes a pending word without joining it.
                    if word != "":
                        # BUGFIX: `word` is already text here, so the old
                        # word.decode('utf-8') crashed on Python 3
                        # (str has no .decode) and mangled non-ASCII on
                        # Python 2 (implicit ascii encode). Pass it as-is.
                        entry = ConllEntry(index=cnt,
                                           word=word,
                                           lemma="_", upos="_", xpos="_",
                                           attrs="_", head="0", label="_",
                                           deps="_", space_after="_")
                        sentence.append(entry)
                        cnt += 1
                        word = ""
                    continue
                word += X[i]
                space_after = "SpaceAfter=No"
                if i < len(X) - 1:
                    if X[i + 1] in string.whitespace:
                        space_after = "_"
                entry = ConllEntry(index=cnt,
                                   word=word,
                                   lemma="_", upos="_", xpos="_", attrs="_",
                                   head="0", label="_", deps="_",
                                   space_after=space_after)
                sentence.append(entry)
                cnt += 1
                word = ""
        sentences.append(sentence)
    return sentences