def most_similar_within_model(self, text, topn=10):
    """Print the stored documents most similar to ``text``.

    ``text`` is tokenised with ``cut``, turned into a vector by
    ``self.model.infer_vector`` and matched against
    ``self.model.docvecs``.  For every ``(tag, score)`` pair returned,
    the tag, the score and the ``"artist"`` field of the matching entry
    in ``self.docs`` are printed, followed by its ``"desc"`` field.

    Args:
        text: Raw query text.
        topn: Number of neighbours requested from ``most_similar``.

    Returns:
        None. Results are written to stdout only.
    """
    query_vector = self.model.infer_vector(cut(text))
    neighbours = self.model.docvecs.most_similar([query_vector], topn=topn)
    for tag, similarity in neighbours:
        # Tags are converted with int() before being used to index self.docs.
        doc = self.docs[int(tag)]
        print(tag, similarity, doc["artist"])
        print(doc["desc"], "\n")
def predict(captcha):
    """Return the characters recognised in a captcha image.

    The image opened from ``captcha`` is passed through ``regularized``
    and ``cut``; each resulting piece is converted with ``scan``, scaled
    by 1/255 and reshaped to ``(-1, 21, 12, 1)`` before being classified
    by the model loaded from ``'captcha_model'``.

    Args:
        captcha: Whatever ``Image.open`` accepts (path or file object).

    Returns:
        list[str]: One character per predicted class label.
    """

    def to_char(label):
        # Labels 0-9 become '0'-'9' (code point label + 48); every other
        # label is shifted by 87, so label 10 becomes 'a'.
        if 0 <= label <= 9:
            return chr(label + 48)
        return chr(label + 87)

    # NOTE(review): the opened image is never explicitly closed — confirm
    # whether `Image` is PIL and a `with` block would be safe here.
    pieces = cut(regularized(Image.open(captcha)))
    pixels = pd.DataFrame([scan(piece) for piece in pieces]) / 255.0
    batch = pixels.values.reshape(-1, 21, 12, 1)
    classifier = models.load_model('captcha_model', compile=True)
    # NOTE(review): if `models` is Keras, `predict_classes` is not available
    # in recent TensorFlow releases — confirm the pinned version.
    return [to_char(label) for label in classifier.predict_classes(batch)]
def test_d2v_with_source(text, model, topn=5):
    """Print the documents in ``model`` most similar to ``text``.

    The text is cleaned with ``replace_noise``, stripped of credit lines,
    tokenised with ``cut(..., join_en=False)`` and converted to a vector
    with ``model.infer_vector``.  The result of
    ``model.docvecs.most_similar`` is printed.

    Args:
        text: Raw text to look up.
        model: Object exposing ``infer_vector`` and ``docvecs.most_similar``
            (presumably a gensim Doc2Vec model — confirm against callers).
        topn: Number of neighbours to request.  Previously ignored in
            favour of a hard-coded 10.

    Returns:
        None. The neighbour list is written to stdout only.
    """
    text = replace_noise(text)
    # Remove everything from a credit keyword (lyricist, composer, arranger,
    # producer, recording, mixing/mastering, supervisor) to the end of its
    # line.  The keywords must be an alternation group: written as a
    # character class, the pattern matched any single listed character and
    # truncated ordinary lines that merely contained one of them.
    text = re.sub(
        r"( )*(?:作词|作曲|编曲|制作人|录音|混母带|监制).*\n", "", text)
    words = cut(text, join_en=False)
    vec = model.infer_vector(words)
    print(model.docvecs.most_similar([vec], topn=topn))
def __iter__(self):
    """Yield one ``TaggedDocument`` per ``(artist, source)`` in ``self.artists``.

    Every pass resets ``self.artists_documents`` and ``self.artists_flag``.
    For each artist a description is generated according to ``source``
    (``"KB"`` or ``"sup"``), tokenised with ``cut`` and yielded with a
    single tag: the 1-based running counter rendered as a string.  The
    ``(artist, source, text)`` triple is appended to
    ``self.artists_documents`` just before the document is yielded.

    Yields:
        models.doc2vec.TaggedDocument: tokens plus ``[str(counter)]``.

    Raises:
        ValueError: If ``source`` is neither ``"KB"`` nor ``"sup"``.
            Without this check the description of the previous artist
            would be silently reused (or the name would be unbound on the
            first item).
    """
    # Start from a clean slate so repeated iteration does not accumulate.
    self.artists_documents = []
    self.artists_flag = 0
    for ar, source in self.artists:
        ar = ar.lower().strip()
        if source == "KB":
            text = str(generate_description_KB(
                ar, self.KB_df, self.name_2_KB_ents, self.mode))
        elif source == "sup":
            text = str(generate_description_sup(ar, self.sup_data))
        else:
            raise ValueError(
                f"unknown description source {source!r} for artist {ar!r}")
        words = cut(text, stops_sup=self.stops_sup, filter_number=True)
        self.artists_documents.append((ar, source, text))
        # Increment before yielding so tags start at "1".
        self.artists_flag += 1
        yield models.doc2vec.TaggedDocument(words, [str(self.artists_flag)])
def count_similarity(self, text1, text2):
    """Print the similarity score of two raw texts.

    Both texts are tokenised with ``cut`` and compared through
    ``model.docvecs.similarity_unseen_docs``; the score is printed with
    three decimal places.

    Args:
        text1: First raw text.
        text2: Second raw text.

    Returns:
        None. The score is written to stdout only.
    """
    tokens_a = cut(text1)
    tokens_b = cut(text2)
    # NOTE(review): `model` is looked up outside this method rather than on
    # `self` — confirm that is intended.  Also verify the argument list of
    # `similarity_unseen_docs` against the installed gensim version.
    score = model.docvecs.similarity_unseen_docs(tokens_a, tokens_b)
    print("simi score: {:.3f}".format(score))
def preprocess(text):
    """Clean ``text`` and return its tokens.

    The text is passed through ``replace_noise``, credit lines are
    removed, and the remainder is tokenised with
    ``cut(..., join_en=False)``.

    Args:
        text: Raw text.

    Returns:
        Whatever ``cut`` returns for the cleaned text.
    """
    cleaned = replace_noise(text)
    # Remove everything from a credit keyword (lyricist, composer, arranger,
    # producer, recording, mixing/mastering, supervisor) to the end of its
    # line.  The keywords must be an alternation group: written as a
    # character class, the pattern matched any single listed character and
    # truncated ordinary lines that merely contained one of them.
    # NOTE(review): models trained on the old output may need retraining.
    cleaned = re.sub(
        r"( )*(?:作词|作曲|编曲|制作人|录音|混母带|监制).*\n", "", cleaned)
    return cut(cleaned, join_en=False)
def get_doc_vector(text, model):
    """Return the vector ``model`` infers for ``text``.

    The text is cleaned with ``replace_noise`` and tokenised with
    ``cut(..., join_en=False)`` before being handed to
    ``model.infer_vector``.

    Args:
        text: Raw text.
        model: Already-loaded object exposing ``infer_vector``.

    Returns:
        The value returned by ``model.infer_vector``.
    """
    tokens = cut(replace_noise(text), join_en=False)
    return model.infer_vector(doc_words=tokens)