            # Attach the morphological analyses from the GATE annotation to
            # the matching spaCy token.
            word = self.get_word_from_annotation(annotation)
            for feature in annotation.getchildren():
                if feature.find('Name').text == 'anas':
                    token = self.get_token_by_idx(word_index, doc)
                    # The 'anas' feature value may be missing; fall back to ''.
                    anas = (feature.find('Value').text
                            if feature.find('Value').text is not None else '')
                    token._.morph = set(anas.split(';'))
                    token._.lemma = self.get_lemma_from_morph(anas)
                    break
        return doc


if __name__ == "__main__":
    import spacy
    from Tokenizer import HuTokenizer

    debug_text = 'Jó, hogy ez az alma piros, mert az olyan almákat szeretem.'
    # debug_text = 'megszentségteleníthetetlenségeitekért meghalnak'
    remote_url = 'http://hlt.bme.hu/chatbot/gate/process?run='

    # Blank pipeline with the remote Hungarian tokenizer and the
    # lemma/morph analyzer added as the last component.
    nlp = spacy.blank("en")
    nlp.tokenizer = HuTokenizer(nlp.vocab, url=remote_url)
    morph_analyzer = HuLemmaMorph(nlp, url=remote_url)
    nlp.add_pipe(morph_analyzer, last=True)

    doc = nlp(debug_text)
    for token in doc:
        print('Token is: ' + token.text)
        print(token._.lemma)
        print(token._.morph)
        print()
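# Sketch (not in the original source): token._.lemma and token._.morph used
# above are custom spaCy extension attributes, and spaCy requires them to be
# registered before a component may assign them, presumably in
# HuLemmaMorph.__init__. A minimal registration, with assumed default values:

from spacy.tokens import Token

if not Token.has_extension('lemma'):
    Token.set_extension('lemma', default='')
if not Token.has_extension('morph'):
    Token.set_extension('morph', default=set())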
                return np.array([float(x) for x in coords if x != '\n'])
        # Fallback: the word is not in the embedding file, return a zero vector.
        print('Word not found in dictionary: ' + token.text)
        return np.zeros(300)

    def get_doc_vector(self, doc):
        # The document vector is the mean of its token vectors.
        return np.average([token.vector for token in doc], axis=0)


if __name__ == '__main__':
    import spacy
    from Tokenizer import HuTokenizer

    # Minimal stand-in for a spaCy Token: get_token_vector only needs .text.
    class Container(object):
        pass

    comp = HUWordToVec()
    obj = Container()
    obj.text = 'alma'
    vec1 = comp.get_token_vector(obj)
    print(vec1)
    print(len(vec1))  # expected: 300

    nlp = spacy.blank('en')
    nlp.tokenizer = HuTokenizer(nlp.vocab)
    nlp.add_pipe(HUWordToVec(), last=True)
    doc = nlp('Alma körte barack')
    for token in doc:
        print('Token is: ' + token.text)
        print(token.vector[0])
    print(doc.vector)
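# Sketch (not in the original source): for token.vector and doc.vector in the
# demo above to come from HUWordToVec, the component's __call__ presumably
# installs spaCy user hooks on the Doc. HUWordToVecSketch and its zero-vector
# lookup are illustrative placeholders, not the project's implementation.

import numpy as np


class HUWordToVecSketch(object):
    def __call__(self, doc):
        # Route token.vector and doc.vector through this component's methods.
        doc.user_token_hooks['vector'] = self.get_token_vector
        doc.user_hooks['vector'] = self.get_doc_vector
        return doc

    def get_token_vector(self, token):
        # Placeholder lookup; the real component reads 300-dimensional
        # embeddings from a word-vector file.
        return np.zeros(300)

    def get_doc_vector(self, doc):
        # Same averaging as HUWordToVec.get_doc_vector above.
        return np.average([token.vector for token in doc], axis=0)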