Example #1
 def vectorize_example(self, sentence, labels=None):
     sentence_ = [[
         self.tok2id.get(normalize(word), self.tok2id[UNK]),
         self.tok2id[P_CASE + casing(word)]
     ] for word in sentence]
     if labels:
         labels_ = [LBLS.index(l) for l in labels]
         return sentence_, labels_
     else:
         return sentence_, [LBLS[-1] for _ in sentence]
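Here, normalize, casing, tok2id, UNK, P_CASE and LBLS come from the surrounding assignment code and are not shown. The following is a minimal, self-contained sketch of how the method might be exercised; the constants and helper bodies below are illustrative assumptions, not the actual implementations.

UNK = "UUUNKKK"
P_CASE = "CASE:"
CASES = ["aa", "AA", "Aa", "aA"]
LBLS = ["PER", "ORG", "LOC", "MISC", "O"]  # assumption: null tag "O" is last

def normalize(word):
    # Assumed behavior: lowercase the word and collapse digits.
    return "".join("d" if c.isdigit() else c for c in word).lower()

def casing(word):
    # Assumed behavior: classify the word's capitalization pattern.
    if word.islower(): return "aa"
    if word.isupper(): return "AA"
    if word and word[0].isupper(): return "Aa"
    return "aA"

class ModelHelper(object):
    def __init__(self, tok2id):
        self.tok2id = tok2id

    def vectorize_example(self, sentence, labels=None):
        # Same logic as Example #1 above.
        sentence_ = [[self.tok2id.get(normalize(word), self.tok2id[UNK]),
                      self.tok2id[P_CASE + casing(word)]] for word in sentence]
        if labels:
            return sentence_, [LBLS.index(l) for l in labels]
        else:
            return sentence_, [LBLS[-1] for _ in sentence]

vocab = ["the", "john", UNK] + [P_CASE + c for c in CASES]
tok2id = {tok: i for i, tok in enumerate(vocab)}
helper = ModelHelper(tok2id)
print(helper.vectorize_example(["John", "sleeps"], ["PER", "O"]))
# -> ([[1, 5], [2, 3]], [0, 4]); "sleeps" is out of vocabulary and maps to UNK.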
Example #2
def get_chunks(seq, default=LBLS.index(NONE)):
    """Breaks input of 4 4 4 0 0 4 0 ->   (0, 4, 5), (0, 6, 7)"""
    chunks = []
    chunk_type, chunk_start = None, None
    for i, tok in enumerate(seq):
        # End of a chunk: a null tag closes the current chunk.
        if tok == default and chunk_type is not None:
            # Add a chunk.
            chunk = (chunk_type, chunk_start, i)
            chunks.append(chunk)
            chunk_type, chunk_start = None, None
        # End of a chunk + start of a chunk!
        elif tok != default:
            if chunk_type is None:
                chunk_type, chunk_start = tok, i
            elif tok != chunk_type:
                chunk = (chunk_type, chunk_start, i)
                chunks.append(chunk)
                chunk_type, chunk_start = tok, i
        else:
            pass
    # end condition
    if chunk_type is not None:
        chunk = (chunk_type, chunk_start, len(seq))
        chunks.append(chunk)
    return chunks
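To make the chunking concrete, here is a hedged usage sketch of the docstring's example, assuming the CS224N-style label set in which the null tag "O" has index 4 (so default = LBLS.index(NONE) = 4):

LBLS = ["PER", "ORG", "LOC", "MISC", "O"]  # assumed label set
NONE = "O"

seq = [4, 4, 4, 0, 0, 4, 0]  # 4 = "O" (null), 0 = "PER"
print(get_chunks(seq, default=LBLS.index(NONE)))
# -> [(0, 3, 5), (0, 6, 7)]: two PER chunks, spanning tokens 3-4 and token 6.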
Example #3
def get_chunks(seq, default=LBLS.index(NONE)):
    """
        This is used for calculating entity-level P/R/F1.
        Find all contiguous sequences of non-null tags with the same type.
        Breaks input of 4 4 4 0 0 4 0 -> (0, 3, 5), (0, 6, 7)
        (0, 3, 5): 0 is the type of the chunk, 3 is the start index of the chunk, and 5 is the exclusive end index (the last token of the chunk is 5 - 1 = 4)
    """
    chunks = []
    chunk_type, chunk_start = None, None
    for i, tok in enumerate(seq):
        # End of a chunk: a null tag closes the current chunk.
        if tok == default and chunk_type is not None:
            # Add a chunk.
            chunk = (chunk_type, chunk_start, i)
            chunks.append(chunk)
            chunk_type, chunk_start = None, None
        # End of a chunk + start of a chunk!
        elif tok != default:
            if chunk_type is None:
                chunk_type, chunk_start = tok, i
            elif tok != chunk_type:
                chunk = (chunk_type, chunk_start, i)
                chunks.append(chunk)
                chunk_type, chunk_start = tok, i
        else:
            pass
    # end condition
    if chunk_type is not None:
        chunk = (chunk_type, chunk_start, len(seq))
        chunks.append(chunk)
    return chunks
Example #4
 def vectorize_example(self, sentence, labels=None):
     # Note: unlike Example #1, this variant uses a single raw token id per
     # word, with no normalize() call and no capitalization feature.
     sentence_ = [
         self.tok2id.get(word, self.tok2id[UNK]) for word in sentence
     ]
     if labels:
         labels_ = [LBLS.index(l) for l in labels]
         return sentence_, labels_
     else:
         return sentence_, [LBLS[-1] for _ in sentence]
Example #5
 def vectorize_example(self, sentence, label):
     sentence_ = []
     for word in sentence:
         sentence_.append([
             self.tok2id.get(normalize(word), self.tok2id[UNK]),
             self.tok2id[P_CASE + casing(word)]
         ])
         #sentence_.append([self.tok2id.get(normalize(word), self.tok2id[UNK])])
     label_ = LBLS.index(label)
     return sentence_, label_
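Unlike the other examples, this variant takes a single label for the whole sentence and returns one label index. A hedged call sketch, assuming the hypothetical tok2id/helper setup from the sketch after Example #1 with this method swapped in:

features, label_ = helper.vectorize_example(["John", "sleeps"], "PER")
# features == [[1, 5], [2, 3]]; label_ == LBLS.index("PER") == 0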
Example #6
 def vectorize_example(self, sentence, labels=None):
     # Featurize each word as [token index, capitalization class], i.e. sentence_ = [[f1, f2]_w1, ..., [f1, f2]_wn]
     sentence_ = [[
         self.tok2id.get(normalize(word), self.tok2id[UNK]),
         self.tok2id[P_CASE + casing(word)]
     ] for word in sentence]
     if labels:
         labels_ = [LBLS.index(l) for l in labels]  # index of each word's named-entity label
         return sentence_, labels_
     else:
         # Every word's entity defaults to O, the null class; returns ([sentence features], [labels])
         return sentence_, [LBLS[-1] for _ in sentence]
Example #7
 def vectorize_example(self, sentence, labels=None):
     print "VECTORIZING %s" % sentence
     sentence_ = [[
         self.tok2id.get(normalize(word), self.tok2id[UNK]),
         self.tok2id[P_CASE + casing(word)]
     ] for word in sentence]
     print "OUTPUT: " % sentence_
     if labels:
         labels_ = [LBLS.index(l) for l in labels]
         return sentence_, labels_
     else:
         return sentence_, [LBLS[-1] for _ in sentence]