def __init__(
    self,
    num_optim_steps: int,
    num_topics: int,
    dictionary: Dictionary,
    alpha: float = 1.0,
    beta: float = 1.0,
):
    """Store the configuration and reset every fitted attribute.

    Args:
        num_optim_steps: Number of optimisation steps, stored as-is.
        num_topics: Number of topics, stored as-is.
        dictionary: Vocabulary object; only ``dictionary.values()`` is
            used here, to derive ``vocabulary_size``.
        alpha: Stored as ``self.alpha`` (presumably a prior
            hyper-parameter -- confirm against the reference).
        beta: Stored as ``self.beta`` (presumably a prior
            hyper-parameter -- confirm against the reference).
    """
    # Configuration supplied by the caller.
    self.num_topics = num_topics
    self.num_optim_steps = num_optim_steps
    self.alpha = alpha
    self.beta = beta

    # Vocabulary and its size (number of values held by the dictionary).
    self.dictionary = dictionary
    vocabulary = dictionary.values()
    self.vocabulary_size = len(vocabulary)

    # State that is not known yet at construction time.
    # `topic_assignments` is 'z' in the reference; per the original note it
    # is filled in by `find_topic_assignments`.
    self.topic_assignments = None
    self.document_topic_distribution = None
    self.word_topics_distribution = None
    self.document_mapping = {}
def wmdsimilarity(doc1, doc2, lang1, lang2, vecs, with_flow=False):
    """Compute the Word Mover's Distance between two documents.

    Both documents are tokenized with ``processing.tokenize`` (stopwords
    included), a joint ``Dictionary`` is built over the two token lists, and
    the earth mover's distance between their normalized bag-of-words vectors
    is computed over a Euclidean word-vector distance matrix.

    Args:
        doc1: First document, passed to ``processing.tokenize``.
        doc2: Second document, passed to ``processing.tokenize``.
        lang1: Language key of ``doc1``; used for tokenizing and to index
            ``vecs``.
        lang2: Language key of ``doc2``; used for tokenizing and to index
            ``vecs``.
        vecs: Indexed as ``vecs[lang][token]``; the results are subtracted,
            squared and summed (presumably numpy vectors of equal length --
            confirm against the caller).
        with_flow: When true, ``emd_with_flow`` is used and the result also
            carries a ``'flow'`` entry.

    Returns:
        ``0.0`` when the joint vocabulary holds exactly one token,
        ``float('inf')`` when the distance matrix sums to zero, otherwise a
        dict with keys ``'tokens'``, ``'pdf1'``, ``'pdf2'``, ``'wmd'``,
        ``'dist_matrix'`` and, if ``with_flow`` is set, ``'flow'``.
        NOTE(review): the mixed float/dict return type is kept for backward
        compatibility with existing callers.
    """
    tok1 = list(processing.tokenize(lang1, doc1, include_stopwords=True))
    tok2 = list(processing.tokenize(lang2, doc2, include_stopwords=True))
    print(tok1, tok2)

    dictionary = Dictionary(documents=[tok1, tok2])
    vocab_len = len(dictionary)
    if vocab_len == 1:
        # Both documents are composed of a single unique token.
        return 0.0

    # Sets for faster look-up.
    docset1 = set(tok1)
    docset2 = set(tok2)
    print(dictionary, docset1, docset2)

    # Compute the distance matrix. Only (row, column) pairs whose row token
    # occurs in doc1 and whose column token occurs in doc2 are filled; every
    # other cell stays zero.
    distance_matrix = np.zeros((vocab_len, vocab_len), dtype=np.double)
    vocabulary = list(dictionary.items())
    for i, t1 in vocabulary:
        if t1 not in docset1:
            continue
        for j, t2 in vocabulary:
            if t2 not in docset2:
                continue
            # Euclidean distance between the two word vectors.
            diff = vecs[lang1][t1] - vecs[lang2][t2]
            distance_matrix[i, j] = np.sqrt(np.sum(diff ** 2))

    if np.sum(distance_matrix) == 0.0:
        # `emd` gets stuck if the distance matrix contains only zeros.
        print('The distance matrix is all zeros. Aborting (returning inf).')
        return float('inf')

    def nbow(document):
        """Return the normalized bag-of-words vector of ``document``."""
        d = np.zeros(vocab_len, dtype=np.double)
        bow = dictionary.doc2bow(document)  # Word frequencies.
        doc_len = len(document)
        for idx, freq in bow:
            d[idx] = freq / float(doc_len)  # Normalized word frequencies.
        return d

    # Compute nBOW representation of documents.
    d1 = nbow(tok1)
    d2 = nbow(tok2)

    result = {
        'tokens': list(dictionary.values()),
        'pdf1': list(d1),
        'pdf2': list(d2),
    }

    # Compute WMD. The flow result is bound to its own names: assigning it to
    # a local called `emd` would shadow the `emd` function for the whole
    # function body and break the branch that calls it.
    if with_flow:
        distance, flow = emd_with_flow(d1, d2, distance_matrix)
        result['wmd'] = distance
        result['flow'] = flow
    else:
        result['wmd'] = emd(d1, d2, distance_matrix)

    result['dist_matrix'] = distance_matrix.tolist()
    return result
# %%
# Download the latest pretrained model from:
# https://github.com/singletongue/WikiEntVec/releases
# This run used the 20190520 release, jawiki.all_vectors.100d.txt.bz2
# (the path below points at a .txt file -- presumably the archive was
# decompressed beforehand; confirm).
model = KeyedVectors.load_word2vec_format('work/jawiki.all_vectors.100d.txt')

# %%
word = '理科'
# `load_word2vec_format` already returns a KeyedVectors object, so it is
# queried directly; the `.wv` alias on KeyedVectors is deprecated in
# gensim 3.x and removed in gensim 4.0.
results = model.most_similar(word)
print(word, "と類似度の高い単語")
for result in results:
    print(result)

# %%
# Keep only the dictionary words that the embedding model knows about.
data = [
    [token, model[token]]
    for token in dictionary.values()
    if token in model
]
df = pd.DataFrame(data, columns=['word', 'vectors'])

# %%
df.head()

# %%
df.shape

# %%
distortions = []
for i in tqdm(range(1, 21)):
    # `n_jobs` is not passed: it was deprecated in scikit-learn 0.23 and
    # removed in 1.0, where supplying it raises a TypeError.
    km = KMeans(n_clusters=i, verbose=1, random_state=42)
    km.fit(list(df['vectors']))