def vectorize_example(self, sentence, labels=None):
    """Convert a tokenized sentence (and optionally its labels) to ids.

    Each word becomes a two-element list:
      * the ``self.tok2id`` entry for ``normalize(word)``, falling back to
        the entry for ``UNK`` when the normalized word is not in the table;
      * the ``self.tok2id`` entry stored under ``P_CASE + casing(word)``.

    Args:
        sentence: iterable of words.
        labels: optional sequence of label values that occur in ``LBLS``.

    Returns:
        A ``(features, labels)`` pair. When ``labels`` is truthy the second
        element holds ``LBLS.index(label)`` for every label. Otherwise it
        holds ``LBLS[-1]`` repeated once per word.
        NOTE(review): the fallback yields the ``LBLS`` entry itself, not its
        index -- confirm callers expect that asymmetry.
    """
    features = []
    for token in sentence:
        word_id = self.tok2id.get(normalize(token), self.tok2id[UNK])
        case_id = self.tok2id[P_CASE + casing(token)]
        features.append([word_id, case_id])

    # No labels supplied: pair every word with the last entry of LBLS.
    if not labels:
        return features, [LBLS[-1] for _ in sentence]

    return features, [LBLS.index(tag) for tag in labels]
def get_chunks(seq, default=LBLS.index(NONE)):
    """Split a tag sequence into maximal runs of one non-default tag.

    Example (with ``default == 4``)::

        [4, 4, 4, 0, 0, 4, 0] -> [(0, 3, 5), (0, 6, 7)]

    Args:
        seq: sized sequence of tags.
        default: tag value that marks "not part of any chunk".

    Returns:
        List of ``(tag, start, end)`` tuples in order of appearance, where
        ``start`` is the index of the first token of the run and ``end`` is
        the index one past its last token.
    """
    spans = []
    open_type, open_start = None, None

    for pos, tag in enumerate(seq):
        if tag == default:
            # A default tag terminates whatever run is currently open.
            if open_type is not None:
                spans.append((open_type, open_start, pos))
                open_type, open_start = None, None
        elif tag != default:
            if open_type is None:
                # First token of a new run.
                open_type, open_start = tag, pos
            elif tag != open_type:
                # Tag changed with no default tag in between: close the
                # previous run and open a new one at the same position.
                spans.append((open_type, open_start, pos))
                open_type, open_start = tag, pos

    # A run that reaches the end of the sequence is closed at len(seq).
    if open_type is not None:
        spans.append((open_type, open_start, len(seq)))
    return spans
def get_chunks(seq, default=LBLS.index(NONE)):
    """Find all contiguous runs of identical non-default tags.

    Per the original note, this is used when computing entity-level
    precision / recall / F1.

    Example (with ``default == 4``)::

        [4, 4, 4, 0, 0, 4, 0] -> [(0, 3, 5), (0, 6, 7)]

    In ``(0, 3, 5)``: ``0`` is the chunk's tag, ``3`` is the index of its
    first token and ``5 - 1`` is the index of its last token (the end
    bound is exclusive).

    Args:
        seq: sized sequence of tags.
        default: tag value treated as "outside any chunk".

    Returns:
        List of ``(tag, start, end)`` tuples in order of appearance.
    """
    found = []
    kind, begin = None, None

    for idx, label in enumerate(seq):
        if kind is None:
            # Outside a chunk: only a non-default tag opens one.
            if label != default:
                kind, begin = label, idx
        elif label == default:
            # Inside a chunk and hit the default tag: close it.
            found.append((kind, begin, idx))
            kind, begin = None, None
        elif label != kind:
            # Inside a chunk and the tag changed: close it and start the
            # next chunk at this same index.
            found.append((kind, begin, idx))
            kind, begin = label, idx

    # Flush a chunk that is still open when the sequence ends.
    if kind is not None:
        found.append((kind, begin, len(seq)))
    return found
def vectorize_example(self, sentence, labels=None):
    """Convert a tokenized sentence (and optionally its labels) to ids.

    Each word is looked up verbatim in ``self.tok2id``; words missing from
    the table map to the entry stored under ``UNK``.

    Args:
        sentence: iterable of words.
        labels: optional sequence of label values that occur in ``LBLS``.

    Returns:
        A ``(token_ids, labels)`` pair. When ``labels`` is truthy the second
        element holds ``LBLS.index(label)`` for every label. Otherwise it
        holds ``LBLS[-1]`` repeated once per word.
        NOTE(review): the fallback yields the ``LBLS`` entry itself, not its
        index -- confirm callers expect that asymmetry.
    """
    token_ids = []
    for token in sentence:
        token_ids.append(self.tok2id.get(token, self.tok2id[UNK]))

    # No labels supplied: pair every word with the last entry of LBLS.
    if not labels:
        return token_ids, [LBLS[-1] for _ in sentence]

    return token_ids, [LBLS.index(tag) for tag in labels]
def vectorize_example(self, sentence, label):
    """Convert a tokenized sentence and its single label to ids.

    Each word becomes a two-element list:
      * the ``self.tok2id`` entry for ``normalize(word)``, falling back to
        the entry for ``UNK`` when the normalized word is not in the table;
      * the ``self.tok2id`` entry stored under ``P_CASE + casing(word)``.

    Args:
        sentence: iterable of words.
        label: one label value for the whole sentence; must occur in
            ``LBLS`` (``LBLS.index`` raises ``ValueError`` otherwise).

    Returns:
        ``(features, label_index)`` where ``label_index`` is
        ``LBLS.index(label)``.
    """
    features = [
        [
            self.tok2id.get(normalize(token), self.tok2id[UNK]),
            self.tok2id[P_CASE + casing(token)],
        ]
        for token in sentence
    ]
    return features, LBLS.index(label)
def vectorize_example(self, sentence, labels=None):
    """Convert a tokenized sentence (and optionally its labels) to ids.

    A word is represented by two features, its word index and its casing
    attribute, so the result has the shape
    ``[[f1, f2]_w1, ..., [f1, f2]_wn]``.

    Args:
        sentence: iterable of words.
        labels: optional sequence of label values that occur in ``LBLS``.

    Returns:
        A ``(features, labels)`` pair. When ``labels`` is truthy the second
        element holds ``LBLS.index(label)`` for every label. Otherwise it
        holds ``LBLS[-1]`` repeated once per word.
    """

    def featurize(token):
        # f1: id of the normalized word, or the UNK id when it is unknown.
        word_id = self.tok2id.get(normalize(token), self.tok2id[UNK])
        # f2: id stored under the casing key of the raw word.
        case_id = self.tok2id[P_CASE + casing(token)]
        return [word_id, case_id]

    features = [featurize(token) for token in sentence]

    if not labels:
        # No labels given: assign every word LBLS[-1], which the original
        # comment describes as the default "O" class.
        return features, [LBLS[-1] for _ in sentence]

    # Index of the named-entity label that corresponds to each word.
    return features, [LBLS.index(tag) for tag in labels]
def vectorize_example(self, sentence, labels=None):
    """Convert a tokenized sentence (and optionally its labels) to ids.

    Prints the input sentence and the computed features to stdout as a
    debugging aid.

    Each word becomes a two-element list:
      * the ``self.tok2id`` entry for ``normalize(word)``, falling back to
        the entry for ``UNK`` when the normalized word is not in the table;
      * the ``self.tok2id`` entry stored under ``P_CASE + casing(word)``.

    Args:
        sentence: iterable of words.
        labels: optional sequence of label values that occur in ``LBLS``.

    Returns:
        A ``(features, labels)`` pair. When ``labels`` is truthy the second
        element holds ``LBLS.index(label)`` for every label. Otherwise it
        holds ``LBLS[-1]`` repeated once per word.
    """
    # The argument is wrapped in a 1-tuple so that a tuple-valued sentence
    # is formatted as one value and not unpacked as several format
    # arguments. Single-argument print(...) behaves the same on Python 2
    # and Python 3.
    print("VECTORIZING %s" % (sentence,))
    sentence_ = [
        [
            self.tok2id.get(normalize(word), self.tok2id[UNK]),
            self.tok2id[P_CASE + casing(word)],
        ]
        for word in sentence
    ]
    # The original format string had no %s placeholder, so formatting a
    # list raised "TypeError: not all arguments converted" on Python 2.
    print("OUTPUT: %s" % (sentence_,))
    if labels:
        labels_ = [LBLS.index(l) for l in labels]
        return sentence_, labels_
    else:
        return sentence_, [LBLS[-1] for _ in sentence]