Example #1
            word = self.get_word_from_annotation(annotation)
            # Element.getchildren() was removed in Python 3.9;
            # iterating the element directly is the idiomatic equivalent
            for feature in annotation:
                if feature.find('Name').text == 'anas':
                    token = self.get_token_by_idx(word_index, doc)
                    # The 'anas' feature value may be empty; fall back to ''
                    anas = (feature.find('Value').text
                            if feature.find('Value').text is not None else '')
                    # Store the analyses and the derived lemma on the
                    # custom token extensions
                    token._.morph = set(anas.split(';'))
                    token._.lemma = self.get_lemma_from_morph(anas)
                    break

        return doc


if __name__ == "__main__":
    import spacy

    from Tokenizer import HuTokenizer

    debug_text = 'Jó, hogy ez az alma piros, mert az olyan almákat szeretem.'
    # debug_text = 'megszentségteleníthetetlenségeitekért meghalnak'
    remote_url = 'http://hlt.bme.hu/chatbot/gate/process?run='
    nlp = spacy.blank("en")
    nlp.tokenizer = HuTokenizer(nlp.vocab, url=remote_url)
    morph_analyzer = HuLemmaMorph(nlp, url=remote_url)
    nlp.add_pipe(morph_analyzer, last=True)  # spaCy v2-style: the component instance is passed directly

    doc = nlp(debug_text)
    for token in doc:
        print('Token is: ' + token.text)
        print(token._.lemma)
        print(token._.morph)
        print()
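
The fragment above relies on the custom attributes token._.lemma and
token._.morph, which spaCy only exposes after they have been registered with
Token.set_extension. A minimal sketch of how the surrounding HuLemmaMorph
component might declare them; the constructor signature and the default
values are assumptions, not taken from the original source:

from spacy.tokens import Token


class HuLemmaMorph(object):
    """Sketch: pipeline component filling token._.lemma / token._.morph."""

    def __init__(self, nlp, url):
        self.url = url  # assumed: endpoint of the remote analysis service
        # Register the extensions used in __call__; the guard avoids an
        # error if the component is instantiated more than once
        if not Token.has_extension('lemma'):
            Token.set_extension('lemma', default='')
        if not Token.has_extension('morph'):
            Token.set_extension('morph', default=set())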
Example #2
                # Parse the space-separated coordinates, skipping the
                # trailing newline token
                return np.array([float(x) for x in coords if x != '\n'])
        # Out-of-vocabulary fallback: a 300-dimensional zero vector
        print('Word not found in dictionary: ' + token.text)
        return np.zeros(300)

    def get_doc_vector(self, doc):
        # Document vector = element-wise mean of the token vectors
        return np.average([token.vector for token in doc], axis=0)


if __name__ == '__main__':
    import spacy
    from Tokenizer import HuTokenizer

    # Minimal stand-in for a spaCy Token: get_token_vector only reads .text
    class Container(object):
        pass

    comp = HUWordToVec()
    obj = Container()
    obj.text = 'alma'
    vec1 = comp.get_token_vector(obj)
    print(vec1)
    print(len(vec1))  # expected: 300

    nlp = spacy.blank('en')
    nlp.tokenizer = HuTokenizer(nlp.vocab)
    nlp.add_pipe(HUWordToVec(), last=True)
    doc = nlp('Alma körte barack')
    for token in doc:
        print('Token is: ' + token.text)
        print(token.vector[0])
    print(doc.vector)
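
Example #2 again shows only part of the class. One way the remaining pieces of
HUWordToVec could fit together is via spaCy's user_hooks / user_token_hooks
mechanism, which lets a pipeline component route token.vector and doc.vector
through its own lookups. The sketch below assumes this design; the vectors
file path and its word-per-line format are also assumptions:

import numpy as np


class HUWordToVec(object):
    """Sketch: 300-dimensional word vectors looked up from a text file."""

    def __init__(self, vectors_path='vectors.txt'):  # assumed path
        # Assumed file format: one entry per line, the word followed by
        # 300 space-separated floats
        self.vectors = {}
        with open(vectors_path, encoding='utf-8') as fh:
            for line in fh:
                word, *coords = line.split(' ')
                self.vectors[word] = np.array(
                    [float(x) for x in coords if x != '\n'])

    def get_token_vector(self, token):
        if token.text in self.vectors:
            return self.vectors[token.text]
        print('Word not found in dictionary: ' + token.text)
        return np.zeros(300)

    def get_doc_vector(self, doc):
        return np.average([token.vector for token in doc], axis=0)

    def __call__(self, doc):
        # Hook the lookups into spaCy so that token.vector and doc.vector
        # delegate to this component
        doc.user_token_hooks['vector'] = self.get_token_vector
        doc.user_hooks['vector'] = self.get_doc_vector
        return doc

With the hooks installed, the add_pipe(HUWordToVec(), last=True) call above is
enough for token.vector and doc.vector to return the component's vectors.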