def weighted_vectorize(self, text):
    """Vectorize each sentence of *text* as a TextRank-weighted mean of word vectors.

    Each sentence becomes one vector: every in-vocabulary word contributes
    its embedding scaled by its (normalized) TextRank score; words without
    a TextRank score, and out-of-vocabulary words (which contribute a zero
    vector), fall back to a uniform 1/len(sentence) weight.

    Args:
        text: raw document text; split into sentences by ``tools.seperate_sentences``.

    Returns:
        list of sentence vectors, one per sentence, in document order.
    """
    res = []
    sentences = tools.seperate_sentences(text)
    # TextRank keyword scores computed once for the whole document.
    tr_text = self.tr.textrank(text)
    for sen in sentences:
        vecs = []
        weights = []
        sen_words = tools.seperate(sen)
        for w in sen_words:
            # NOTE(review): `wv.vocab` is the pre-gensim-4.0 API — confirm the
            # installed gensim version (4.x replaced it with `key_to_index`).
            if w in self.model.wv.vocab:  # idiomatic `in`, not __contains__
                vecs.append(self.model[w])  # idiomatic indexing, not __getitem__
                # TextRank score when available, else uniform fallback weight.
                weights.append(tr_text[w] if w in tr_text else 1 / len(sen_words))
            else:
                # Out-of-vocabulary word: zero vector, uniform weight.
                vecs.append([0] * self.vec_length)
                weights.append(1 / len(sen_words))
        # Hoisted out of the loop: the original recomputed sum(weights) for
        # every word, making normalization O(n^2) per sentence.
        total_weight = sum(weights)
        for i in range(len(vecs)):
            vecs[i] = tools.vector_multi(vecs[i], weights[i] / total_weight)
        sen_vec = tools.vector_add_multi(vecs)
        if len(sen_vec) == 0:
            # Kept from the original: surfaces sentences that yielded no vector.
            print(sen)
        res.append(sen_vec)
    return res
def unweighted_vectorize(self, text):
    """Vectorize each sentence of *text* as the plain average of its word vectors.

    Out-of-vocabulary words contribute a zero vector of length
    ``self.vec_length``.

    Args:
        text: raw document text; split into sentences by ``tools.seperate_sentences``.

    Returns:
        list of sentence vectors, one per sentence, in document order.
    """
    res = []
    sentences = tools.seperate_sentences(text)
    for line in sentences:
        words = tools.seperate(line)
        vecs = []
        for word in words:
            # NOTE(review): `wv.vocab` is the pre-gensim-4.0 API — confirm the
            # installed gensim version (4.x replaced it with `key_to_index`).
            if word in self.model.wv.vocab:  # idiomatic `in`, not __contains__
                vecs.append(self.model[word])  # idiomatic indexing, not __getitem__
            else:
                # Out-of-vocabulary word: zero vector placeholder.
                vecs.append([0] * self.vec_length)
        sen_vec = tools.vector_add_multi(vecs)
        # BUG FIX: the original rebound `tmp` to the *summed vector* before
        # dividing by `len(tmp)`, so it divided by the embedding dimension
        # (vec_length) instead of the word count and never truly averaged.
        # Divide by the number of words; guard empty sentences against
        # ZeroDivisionError (an empty sentence's sum is returned unscaled).
        if words:
            sen_vec = tools.vector_multi(sen_vec, 1 / len(words))
        res.append(sen_vec)
    return res