コード例 #1
0
 def most_similar_within_model(self, text, topn=10):
     """Print the stored documents closest to ``text`` in the model's space.

     ``text`` is passed through ``cut``, a vector is inferred for the result
     with ``self.model.infer_vector``, and the ``topn`` nearest entries of
     ``self.model.docvecs`` are looked up.  For every hit, the tag, the
     similarity score and the ``"artist"`` field of the matching
     ``self.docs`` entry are printed, followed by its ``"desc"`` field.

     Args:
         text: Raw text to query with.
         topn: Number of neighbours to request from ``most_similar``.
     """
     tokens = cut(text)
     query_vec = self.model.infer_vector(tokens)
     neighbours = self.model.docvecs.most_similar([query_vec], topn=topn)
     for doc_id, similarity in neighbours:
         # Tags are converted with int() before indexing into self.docs.
         entry = self.docs[int(doc_id)]
         print(doc_id, similarity, entry["artist"])
         print(entry["desc"], "\n")
コード例 #2
0
ファイル: predict.py プロジェクト: fujiawei-dev/cnn-captcha
def predict(captcha):
    """Predict the characters shown in a captcha image.

    The image at ``captcha`` is opened, passed through ``regularized`` and
    split by ``cut``.  Each piece is converted with ``scan``; the resulting
    rows are divided by 255.0 and reshaped to ``(-1, 21, 12, 1)`` before
    being handed to the model loaded from ``'captcha_model'``.

    Args:
        captcha: Whatever ``Image.open`` accepts (path or file object).

    Returns:
        A list with one single-character string per predicted class.
    """

    def to_char(label):
        # Labels 0-9 map to '0'-'9' (code point 48 + label); any other
        # label is shifted by 87, so 10 becomes 'a'.
        if 0 <= label <= 9:
            return chr(label + 48)
        return chr(label + 87)

    picture = Image.open(captcha)
    pieces = cut(regularized(picture))
    rows = [scan(piece) for piece in pieces]

    scaled = pd.DataFrame(rows) / 255.0
    batch = scaled.values.reshape(-1, 21, 12, 1)

    # NOTE(review): the model is reloaded from disk on every call.
    model = models.load_model('captcha_model', compile=True)
    labels = model.predict_classes(batch)
    return [to_char(label) for label in labels]
コード例 #3
0
def test_d2v_with_source(text, model, topn=5):
    """Print the documents in ``model`` that are most similar to ``text``.

    ``text`` is cleaned with ``replace_noise`` and a regex substitution,
    passed through ``cut`` (with ``join_en=False``), turned into a vector
    with ``model.infer_vector`` and matched against ``model.docvecs``.
    The resulting list from ``most_similar`` is printed.

    Args:
        text: Raw text to query with.
        model: Object exposing ``infer_vector`` and ``docvecs.most_similar``.
        topn: Number of neighbours to request.  This used to be ignored in
            favour of a hard-coded 10; it is now honoured.
    """
    # NOTE(review): ``conn`` and ``source_tracks`` are not used by the code
    # visible here; they are kept in case construction/reading is relied on
    # elsewhere -- confirm and remove if truly dead.
    conn = MyConn()
    # Context manager so the file handle is closed deterministically.
    with open("../data_related/lyrics_valid_tracks.txt") as tracks_file:
        source_tracks = tracks_file.read().splitlines()

    text = replace_noise(text)
    # NOTE(review): ``[...]`` is a character class, so this matches from the
    # first occurrence of ANY single listed character (including ``|``) to
    # the end of that line, not just the listed multi-character labels.
    # Left unchanged because altering preprocessing may no longer match how
    # the model was trained -- confirm intent.
    text = re.sub(r"( )*[作词|作曲|编曲|制作人|录音|混母带|监制].*\n", "", text)
    words = cut(text, join_en=False)
    vec = model.infer_vector(words)
    s = model.docvecs.most_similar([vec], topn=topn)
    print(s)
コード例 #4
0
    def __iter__(self):
        """Yield one ``TaggedDocument`` per ``(artist, source)`` pair.

        Each iteration resets ``self.artists_documents`` and
        ``self.artists_flag``.  For every pair in ``self.artists`` the artist
        name is lower-cased and stripped, a description is generated
        according to ``source``, and the description is passed through
        ``cut``.  The ``(artist, source, text)`` triple is appended to
        ``self.artists_documents`` and the running counter
        ``self.artists_flag`` (1-based, as a string) is used as the tag.

        Yields:
            ``models.doc2vec.TaggedDocument`` built from the output of
            ``cut`` and the single tag ``str(self.artists_flag)``.

        Raises:
            ValueError: If ``source`` is neither ``"KB"`` nor ``"sup"``.
                Previously this either raised ``UnboundLocalError`` (first
                pair) or silently reused the previous artist's text.
        """
        self.artists_documents = []
        self.artists_flag = 0

        for ar, source in self.artists:
            ar = ar.lower().strip()
            if source == "KB":
                text = str(generate_description_KB(
                    ar, self.KB_df, self.name_2_KB_ents, self.mode))
            elif source == "sup":
                text = str(generate_description_sup(ar, self.sup_data))
            else:
                raise ValueError(
                    "unknown description source {!r} for artist {!r}".format(
                        source, ar))

            words = cut(text, stops_sup=self.stops_sup, filter_number=True)
            self.artists_documents.append((ar, source, text))
            # Increment before yielding so tags start at "1" and line up
            # with the position in artists_documents plus one.
            self.artists_flag += 1

            yield models.doc2vec.TaggedDocument(words, [str(self.artists_flag)])
コード例 #5
0
 def count_similarity(self, text1, text2):
     """Print a similarity score for two pieces of raw text.

     Both texts are passed through ``cut`` and the results are handed to
     ``model.docvecs.similarity_unseen_docs``; the score is printed with
     three decimal places.  Nothing is returned.

     Args:
         text1: First raw text.
         text2: Second raw text.
     """
     tokens_a = cut(text1)
     tokens_b = cut(text2)
     # NOTE(review): ``model`` is resolved from the surrounding scope, not
     # from ``self`` -- confirm this is intended.
     # NOTE(review): the expected arguments of ``similarity_unseen_docs``
     # differ between gensim releases -- verify against the installed one.
     score = model.docvecs.similarity_unseen_docs(tokens_a, tokens_b)
     print("simi score: {:.3f}".format(score))
コード例 #6
0
 def preprocess(text):
     """Clean ``text`` and return the result of ``cut`` on it.

     The text is first passed through ``replace_noise``; every match of the
     pattern below is then removed, and the remainder is handed to ``cut``
     with ``join_en=False``.

     Args:
         text: Raw input text.

     Returns:
         Whatever ``cut`` returns for the cleaned text.
     """
     denoised = replace_noise(text)
     # NOTE(review): ``[...]`` is a character class, so the pattern matches
     # from the first occurrence of any single listed character (including
     # ``|``) to the end of that line -- confirm this is the intent.
     stripped = re.sub(
         r"( )*[作词|作曲|编曲|制作人|录音|混母带|监制].*\n", "", denoised)
     return cut(stripped, join_en=False)
コード例 #7
0
def get_doc_vector(text, model):
    """Infer a document vector for ``text`` using an already-loaded ``model``.

    The text is cleaned with ``replace_noise`` and passed through ``cut``
    (``join_en=False``); the result is given to ``model.infer_vector`` as
    ``doc_words``.

    Args:
        text: Raw input text.
        model: Object exposing ``infer_vector(doc_words=...)``.

    Returns:
        The value returned by ``model.infer_vector``.
    """
    cleaned = replace_noise(text)
    tokens = cut(cleaned, join_en=False)
    return model.infer_vector(doc_words=tokens)