def _feature_extractor(self, doc):
    # Map each token to its vocabulary id; the vocabulary is expected to
    # return None for unknown tokens, which fall back to the OOV id (1)
    features = np.asarray(
        [self.word_vocab[w] if self.word_vocab[w] is not None else 1 for w in doc]
    )
    if self.char_vocab:
        # Build per-word character id sequences, again using 1 for unknown characters
        sentence_chars = []
        for w in doc:
            word_chars = []
            for c in w:
                _cid = self.char_vocab[c]
                word_chars.append(_cid if _cid is not None else 1)
            sentence_chars.append(word_chars)
        # Pad every word's character sequence to the model's maximum word length
        sentence_chars = pad_sentences(sentence_chars, self.model.max_word_len)
        features = (features, sentence_chars)
    return features
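The snippets on this page all lean on a pad_sentences helper that is not shown. Purely as an illustration of the assumed behavior (right-padding id sequences with zeros to a common length; the real helper's signature may differ), a minimal sketch:

import numpy as np


def pad_sentences(sequences, max_length=None, pad_value=0):
    """Right-pad variable-length id sequences into a dense 2-D array.

    A hypothetical stand-in for the helper used above; the actual
    implementation may pad differently or take other arguments.
    """
    if max_length is None:
        max_length = max(len(s) for s in sequences)
    padded = np.full((len(sequences), max_length), pad_value, dtype=np.int32)
    for i, seq in enumerate(sequences):
        trimmed = seq[:max_length]
        padded[i, : len(trimmed)] = trimmed
    return padded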
Example #2
import numpy as np


def vectorize(doc, w_vocab, c_vocab):
    # Word ids for a single sentence, shaped as a batch of one; unknown
    # (lowercased) tokens fall back to the OOV id (1)
    words = np.asarray(
        [w_vocab[w.lower()] if w.lower() in w_vocab else 1 for w in doc]
    ).reshape(1, -1)
    sentence_chars = []
    for w in doc:
        word_chars = [c_vocab[c] if c in c_vocab else 1 for c in w]
        sentence_chars.append(word_chars)
    # `pad_sentences` and `model` are assumed to be defined in the enclosing scope
    sentence_chars = np.expand_dims(pad_sentences(sentence_chars, model.word_length), axis=0)
    return words, sentence_chars
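A hedged usage sketch with toy inputs; the vocabularies, ids, and word_length value are all made up for illustration and assume the pad_sentences sketch above:

from types import SimpleNamespace

model = SimpleNamespace(word_length=12)          # assumed maximum word length
w_vocab = {"the": 2, "brown": 3, "fox": 4}       # toy word vocabulary
c_vocab = {c: i + 2 for i, c in enumerate("abcdefghijklmnopqrstuvwxyz")}

words, chars = vectorize(["The", "brown", "fox"], w_vocab, c_vocab)
print(words.shape)  # (1, 3)  -- one sentence of three word ids
print(chars.shape)  # (1, 3, 12)  -- per-word character ids padded to word_length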
def vectorize(doc, vocab, char_vocab=None):
    # Same lookup as above, but character features are optional here
    words = np.asarray(
        [vocab[w.lower()] if w.lower() in vocab else 1 for w in doc]
    ).reshape(1, -1)
    if char_vocab is not None:
        sentence_chars = []
        for w in doc:
            word_chars = [char_vocab[c] if c in char_vocab else 1 for c in w]
            sentence_chars.append(word_chars)
        # `pad_sentences` and `model` are again assumed to be in scope
        sentence_chars = np.expand_dims(pad_sentences(sentence_chars, model.word_length), axis=0)
        return [words, sentence_chars]
    return words
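The optional char_vocab makes the return shape signal the model architecture: a two-element list feeds a network with separate word and character inputs, while a bare array suits a word-only model. A caller can branch on isinstance(out, list) to decide which inputs to pass to the network.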
def __call__(self, doc: Doc) -> Doc:
    """Annotate the document with noun phrase spans."""
    spans = []
    doc_vecs = []
    doc_lens = []
    # Vectorize each sentence and record its true (unpadded) length
    for sentence in doc.sents:
        doc_vec = self._feature_extractor([t.text for t in sentence])
        doc_vecs.append(doc_vec)
        doc_lens.append(len(doc_vec))
    doc_vectors = pad_sentences(doc_vecs)
    # Run inference and turn the predicted chunk boundaries into spaCy spans
    np_indexes = self._infer_chunks(doc_vectors, doc_lens)
    for s, e in np_indexes:
        spans.append(Span(doc, s, e))
    spans = _NPPostprocessor.process(spans)
    set_noun_phrases(doc, spans)
    return doc
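`_infer_chunks` is not shown here; chunkers of this kind typically decode per-token BIO tags from the model into (start, end) index pairs. A hypothetical sketch of that decoding step, not the library's actual code:

def bio_to_spans(tags):
    """Decode BIO tags into half-open (start, end) index pairs (illustrative only)."""
    spans, start = [], None
    for i, tag in enumerate(tags):
        if tag.startswith("B"):      # a new chunk starts here
            if start is not None:
                spans.append((start, i))
            start = i
        elif tag == "O":             # outside any chunk
            if start is not None:
                spans.append((start, i))
            start = None
    if start is not None:
        spans.append((start, len(tags)))
    return spans

print(bio_to_spans(["B-NP", "I-NP", "O", "B-NP"]))  # [(0, 2), (3, 4)]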
Example #5
def vectorize(docs, w_vocab, c_vocab):
    data = []
    for doc in docs:
        # Here the vocabularies are indexed unconditionally, so they are
        # expected to return None (not raise) for unknown keys; 1 is the OOV id
        words = np.asarray(
            [w_vocab[w.lower()] if w_vocab[w.lower()] is not None else 1 for w in doc]
        ).reshape(1, -1)
        if c_vocab is not None:
            sentence_chars = []
            for w in doc:
                word_chars = []
                for c in w:
                    _cid = c_vocab[c]
                    word_chars.append(_cid if _cid is not None else 1)
                sentence_chars.append(word_chars)
            # `pad_sentences` and `word_length` are assumed to be in the enclosing scope
            sentence_chars = np.expand_dims(pad_sentences(sentence_chars, word_length), axis=0)
            data.append((words, sentence_chars))
        else:
            data.append(words)
    return data
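The `vocab[key] is not None` pattern above implies a mapping that returns None for unknown keys rather than raising KeyError. A minimal sketch of such a vocabulary type, with the reserved-id convention assumed:

class Vocabulary:
    """Hypothetical vocabulary returning None for out-of-vocabulary keys."""

    def __init__(self, tokens, first_id=2):
        # ids 0 and 1 are assumed reserved for padding and OOV
        self._idx = {tok: i + first_id for i, tok in enumerate(tokens)}

    def __getitem__(self, key):
        return self._idx.get(key)  # None when the key is unknown

w_vocab = Vocabulary(["the", "brown", "fox"])
print(w_vocab["the"], w_vocab["jumps"])  # 2 None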
Example #6
def __call__(self, doc: Doc) -> Doc:
    """Annotate the document with noun phrase spans."""
    spans = []
    doc_vecs = []
    doc_chars = []
    doc_lens = []
    if len(doc) < 1:
        return doc
    # Vectorize each sentence; the feature extractor returns a tuple when
    # character features are enabled
    for sentence in doc.sents:
        features = self._feature_extractor([t.text for t in sentence])
        if isinstance(features, tuple):
            doc_vec = features[0]
            doc_chars.append(features[1])
        else:
            doc_vec = features
        doc_vecs.append(doc_vec)
        doc_lens.append(len(doc_vec))
    doc_vectors = pad_sentences(np.asarray(doc_vecs))
    inputs = doc_vectors
    if self.char_vocab:
        # Pad the per-sentence character matrices to the common sentence length
        max_len = doc_vectors.shape[1]
        padded_chars = np.zeros((len(doc_chars), max_len, self.model.max_word_len))
        for idx, d in enumerate(doc_chars):
            d = d[:max_len]
            padded_chars[idx, -d.shape[0]:] = d
        inputs = [inputs, padded_chars]
    # Run inference and turn the predicted chunk boundaries into spaCy spans
    np_indexes = self._infer_chunks(inputs, doc_lens)
    for s, e in np_indexes:
        spans.append(Span(doc, s, e))
    spans = _NPPostprocessor.process(spans)
    set_noun_phrases(doc, spans)
    return doc
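Because `__call__` maps a `Doc` to a `Doc`, the annotator can be dropped into a spaCy pipeline as a custom component. A hedged usage sketch in spaCy 2.x style (how the annotator instance is constructed depends on the trained model files and is omitted):

import spacy

nlp = spacy.load("en_core_web_sm")
# `annotator` is an instance of the class defining __call__ above
nlp.add_pipe(annotator, last=True)  # spaCy 2.x API; spaCy 3.x registers factories by name

doc = nlp("The quick brown fox jumped over the lazy dog")
# noun phrase spans are now attached to `doc` via set_noun_phrases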