def main():
    mkdir(args.output_dir)
    assert os.path.exists(args.model_filename)
    cstm = model.cstm(args.model_filename)
    ndim = cstm.get_ndim_d()
    # Get the words with the highest occurrence counts across the entire training data
    common_words = cstm.get_high_freq_words(10000)  # top 10,000 words
    plot_words(common_words, ndim, args.output_dir, filename="words")
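# --- Sketch: the mkdir helper and the argparse driver are not part of this
# snippet. A minimal version, assuming args is a module-level global and the
# flag names/defaults below (both are assumptions, not the repo's actual CLI).
import argparse
import os

def mkdir(directory):
    # Create the output directory if it does not already exist
    if not os.path.exists(directory):
        os.makedirs(directory)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-m", "--model-filename", type=str, default="cstm.model")  # hypothetical default
    parser.add_argument("-o", "--output-dir", type=str, default="out")  # hypothetical default
    args = parser.parse_args()
    main()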
def main(args):
    assert os.path.exists(args.model_filename)
    cstm = model.cstm(args.model_filename)
    ndim_d = cstm.get_ndim_d()
    # Get word metadata
    words = cstm.get_words()
    for meta in words:
        # word ID, word, total occurrence count, word vector, IDs of the documents containing this word
        word_id, word, count, vector, doc_ids = meta
        vector = np.asarray(vector, dtype=np.float32)
        print word_id, word, count
def find_similar_words(args):
    assert os.path.exists(args.model_filename)
    cstm = model.cstm(args.model_filename)
    ndim_d = cstm.get_ndim_d()
    # Get the 20 words most similar to "apple"
    words = cstm.get_words_similar_to_word(u"apple", 20)
    print "word_id word count cosine"
    for meta in words:
        # word ID, word, total occurrence count, word vector, cosine similarity
        word_id, word, count, vector, cosine = meta
        vector = np.asarray(vector, dtype=np.float32)
        # Pad the word to 8 characters so the columns stay aligned
        word = word.encode(sys.stdout.encoding) + " " * max(0, 8 - len(word))
        print "{} {} {} {}".format(word_id, word, count, cosine)
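# --- Sketch: the reported score can be cross-checked against the raw vectors.
# This assumes the score is the standard cosine similarity (the original
# comment calls it an inner product, so the exact definition is an assumption)
# and reuses get_word_vector_by_word from the analogy snippet below.
query = np.asarray(cstm.get_word_vector_by_word(u"apple"), dtype=np.float32)
for word_id, word, count, vector, cosine in cstm.get_words_similar_to_word(u"apple", 20):
    vector = np.asarray(vector, dtype=np.float32)
    # Recompute the similarity from the raw vectors and compare with the reported value
    recomputed = np.inner(query, vector) / (np.linalg.norm(query) * np.linalg.norm(vector))
    print word_id, cosine, recomputed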
def main(args):
    assert args.font_path is not None
    mkdir(args.output_dir)
    assert os.path.exists(args.model_filename)
    cstm = model.cstm(args.model_filename)
    ndim = cstm.get_ndim_d()
    # Get the document vectors
    doc_vectors = np.asarray(cstm.get_doc_vectors(), dtype=np.float32)
    doc_ids = []
    if args.doc_id is None:
        num_docs = cstm.get_num_documents()
        for doc_id in xrange(num_docs):
            doc_ids.append(doc_id)
    else:
        # This bounds check is only meaningful when a doc_id was actually given
        assert args.doc_id < doc_vectors.shape[0]
        doc_ids.append(args.doc_id)
    # Generate one cloud per document
    words = cstm.get_words()
    for doc_id in doc_ids:
        doc_vector = doc_vectors[doc_id]
        # Compute f (the inner product of word vector and document vector) for each word
        dic = {}
        for meta in words:
            word = meta[1]
            count = meta[2]
            if count < args.min_occurence:
                continue
            word_vector = np.asarray(meta[3], dtype=np.float32)
            f = np.inner(word_vector, doc_vector)
            dic[word] = f
        dic = sorted(dic.items(), key=lambda x: -x[1])  # sorted() is ascending, so negate to sort descending
        max_count = min(args.max_num_word, len(dic))
        dic = dict(dic[:max_count])
        wordcloud = WordCloud(
            background_color="white",
            font_path=args.font_path,
            width=args.width,
            height=args.height,
            max_words=max_count,
            max_font_size=args.max_font_size).generate_from_frequencies(dic)
        color_funcs = [None, color_func_1, color_func_2, color_func_3, color_func_4]
        color_func = color_funcs[args.color]
        wordcloud.recolor(color_func=color_func)
        wordcloud.to_file("{}/cloud_f_{}.png".format(args.output_dir, doc_id))
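# --- Sketch: color_func_1 through color_func_4 are defined elsewhere. One
# possible implementation, following the color_func signature that wordcloud's
# recolor() accepts; the fixed-hue palette itself is an invention for illustration.
def color_func_1(word, font_size, position, orientation, random_state=None, **kwargs):
    # Larger (higher-f) words get darker shades of a single blue hue
    lightness = max(20, 70 - font_size / 4)
    return "hsl(210, 80%%, %d%%)" % lightness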
def main():
    mkdir(args.output_dir)
    assert os.path.exists(args.model_filename)
    cstm = model.cstm(args.model_filename)
    ndim_d = cstm.get_ndim_d()
    # Get the document vectors
    doc_vectors = np.asarray(cstm.get_doc_vectors(), dtype=np.float32)
    # Plot each pair of adjacent dimensions (i, i + 1)
    for i in xrange(ndim_d - 1):
        plot_kde(doc_vectors[:, i:i + 2], args.output_dir, filename="doc_kde_{}-{}".format(i, i + 1))
        plot_scatter(doc_vectors[:, i:i + 2], args.output_dir, filename="doc_scatter_{}-{}".format(i, i + 1))
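# --- Sketch: plot_kde and plot_scatter are defined elsewhere. A minimal
# matplotlib/seaborn version; shade= matches seaborn releases of the same era
# (newer versions spell it fill=), and the figure styling is an assumption.
import matplotlib
matplotlib.use("Agg")  # render to files without a display
import matplotlib.pyplot as plt
import seaborn as sns

def plot_scatter(vectors, output_dir, filename="scatter"):
    # Scatter plot of the first two columns
    fig = plt.figure()
    plt.scatter(vectors[:, 0], vectors[:, 1], s=10, alpha=0.5)
    plt.savefig("{}/{}.png".format(output_dir, filename))
    plt.close(fig)

def plot_kde(vectors, output_dir, filename="kde"):
    # 2D kernel density estimate over the first two columns
    fig = plt.figure()
    sns.kdeplot(vectors[:, 0], vectors[:, 1], shade=True)
    plt.savefig("{}/{}.png".format(output_dir, filename))
    plt.close(fig)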
def main():
    mkdir(args.output_dir)
    assert os.path.exists(args.model_filename)
    cstm = model.cstm(args.model_filename)
    ndim = cstm.get_ndim_d()
    # Get the vectors
    word_vectors = np.asarray(cstm.get_word_vectors(), dtype=np.float32)
    doc_vectors = np.asarray(cstm.get_doc_vectors(), dtype=np.float32)
    # Get the 10,000 most frequent words across the entire training data
    common_words = cstm.get_high_freq_words(10000)
    # Plot f for the specified document
    word_vector_pair = []
    for meta in common_words:
        word = meta[1]
        word_vector = np.asarray(meta[3], dtype=np.float32)
        word_vector_pair.append((word, word_vector))
    plot_f(word_vector_pair, args.doc_id, doc_vectors[args.doc_id], args.output_dir)
def get_analogies(args):
    assert os.path.exists(args.model_filename)
    cstm = model.cstm(args.model_filename)
    ndim_d = cstm.get_ndim_d()
    # Analogy arithmetic in the style of king - man + woman = queen
    king = np.asarray(cstm.get_word_vector_by_word(u"サーバルちゃん"), dtype=np.float32)
    man = np.asarray(cstm.get_word_vector_by_word(u"けもフレ"), dtype=np.float32)
    woman = np.asarray(cstm.get_word_vector_by_word(u"ごちうさ"), dtype=np.float32)
    queen = (king - man + woman).tolist()
    # Get the 20 words most similar to the resulting vector
    words = cstm.get_words_similar_to_vector(queen, 20)
    print "word_id word count cosine"
    for meta in words:
        # word ID, word, total occurrence count, word vector, cosine similarity
        word_id, word, count, vector, cosine = meta
        vector = np.asarray(vector, dtype=np.float32)
        # Pad the word to 8 characters so the columns stay aligned
        word = word.encode(sys.stdout.encoding) + " " * max(0, 8 - len(word))
        print "{} {} {} {}".format(word_id, word, count, cosine)
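# --- Sketch: if the stored vectors are not unit length, the king - man + woman
# query is dominated by whichever vector is longest. A hedged variant that
# normalizes first; whether this helps for CSTM vectors is untested here.
def normalize(vector):
    # Scale to unit length so each word contributes only a direction
    return vector / np.linalg.norm(vector)

queen = (normalize(king) - normalize(man) + normalize(woman)).tolist()
words = cstm.get_words_similar_to_vector(queen, 20)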
def main():
    mkdir(args.output_dir)
    assert os.path.exists(args.model_filename)
    cstm = model.cstm(args.model_filename)
    ndim = cstm.get_ndim_d()
    # Get the vectors
    word_vectors = np.asarray(cstm.get_word_vectors(), dtype=np.float32)
    doc_vectors = np.asarray(cstm.get_doc_vectors(), dtype=np.float32)
    # If the documents have categories, determine each category from the filename prefix
    categories = [
        "geforce",
        "gochiusa",
        "imas",
        "kemono",
        "macbook",
        "monst",
        "pad",
        "tekketsu",
        "win10",
    ]
    doc_filenames = cstm.get_doc_filenames()
    doc_vectors_for_category = []
    for category_id, category_name in enumerate(categories):
        doc_vectors_for_category.append([])
        for filename in doc_filenames:
            if filename.startswith(category_name):
                doc_id = cstm.get_doc_id_by_filename(filename)
                assert 0 <= doc_id < len(doc_vectors)
                doc_vectors_for_category[category_id].append(doc_vectors[doc_id])
    # Categories can hold different numbers of documents, so this is a ragged (object) array
    doc_vectors_for_category = np.asanyarray(doc_vectors_for_category)
    plot_scatter_category(doc_vectors_for_category, args.output_dir, filename="category")
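# --- Sketch: plot_scatter_category is defined elsewhere. A minimal matplotlib
# version, assuming it draws the first two dimensions with one color per
# category; the colormap and marker styling are assumptions.
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

def plot_scatter_category(doc_vectors_for_category, output_dir, filename="category"):
    fig = plt.figure()
    cmap = plt.get_cmap("jet")
    num_categories = len(doc_vectors_for_category)
    for category_id, vectors in enumerate(doc_vectors_for_category):
        vectors = np.asarray(vectors, dtype=np.float32)
        if len(vectors) == 0:
            continue  # no documents matched this category prefix
        plt.scatter(vectors[:, 0], vectors[:, 1], s=10,
                    color=cmap(category_id / float(num_categories)),
                    label=str(category_id))
    plt.legend()
    plt.savefig("{}/{}.png".format(output_dir, filename))
    plt.close(fig)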