Ejemplo n.º 1
0
def main():
    """Plot embeddings of the most frequent words in the training data."""
    # Prepare the output directory and make sure the trained model exists.
    mkdir(args.output_dir)
    assert os.path.exists(args.model_filename)

    trained = model.cstm(args.model_filename)
    num_dims = trained.get_ndim_d()

    # Fetch the 10,000 most frequent words corpus-wide and plot them.
    frequent_words = trained.get_high_freq_words(10000)
    plot_words(frequent_words, num_dims, args.output_dir, filename="words")
Ejemplo n.º 2
0
def main(args):
    assert os.path.exists(args.model_filename)
    cstm = model.cstm(args.model_filename)
    ndim_d = cstm.get_ndim_d()

    # 単語情報を取得
    words = cstm.get_words()
    for meta in words:
        # 単語ID、単語、総出現回数、単語ベクトル、この単語が含まれる文書ID
        word_id, word, count, vector, doc_ids = meta
        vector = np.asarray(vector, dtype=np.float32)
        print word_id, word, count
Ejemplo n.º 3
0
def find_similar_words(args):
    assert os.path.exists(args.model_filename)
    cstm = model.cstm(args.model_filename)
    ndim_d = cstm.get_ndim_d()

    # 単語情報を取得
    words = cstm.get_words_similar_to_word(u"apple", 20)
    print "word_id	word		count	cosine"
    for meta in words:
        # 単語ID、単語、総出現回数、単語ベクトル、内積
        word_id, word, count, vector, cosine = meta
        vector = np.asarray(vector, dtype=np.float32)
        word = word.encode(sys.stdout.encoding) + " " * max(0, 8 - len(word))
        print "{}	{}	{}	{}".format(word_id, word, count, cosine)
Ejemplo n.º 4
0
def main(args):
	"""Render one word cloud per document, weighting each word by
	f = <word vector, doc vector>.

	Renders every document when args.doc_id is None, otherwise only the
	requested document. Images are written to args.output_dir.
	"""
	assert args.font_path is not None
	mkdir(args.output_dir)
	assert os.path.exists(args.model_filename)
	cstm = model.cstm(args.model_filename)
	ndim = cstm.get_ndim_d()

	# Fetch all document vectors once.
	doc_vectors = np.asarray(cstm.get_doc_vectors(), dtype=np.float32)

	# Decide which documents to render.
	doc_ids = []
	if args.doc_id is None:
		num_docs = cstm.get_num_documents()
		for doc_id in xrange(num_docs):
			doc_ids.append(doc_id)
	else:
		# BUG FIX: this bounds check previously ran unconditionally,
		# before the None check above, comparing None against an int
		# when no doc id was supplied. Validate only an explicit id.
		assert args.doc_id < doc_vectors.shape[0]
		doc_ids.append(args.doc_id)

	# One word cloud per selected document.
	words = cstm.get_words()
	for doc_id in doc_ids:
		doc_vector = doc_vectors[doc_id]
		# Score every sufficiently frequent word by its inner product
		# with this document's vector.
		dic = {}
		for meta in words:
			# meta = (word_id, word, count, vector, doc_ids)
			word = meta[1]
			count = meta[2]
			if count < args.min_occurence:
				continue
			word_vector = np.asarray(meta[3], dtype=np.float32)
			dic[word] = np.inner(word_vector, doc_vector)
		# sorted() is ascending, so negate the score for descending order.
		dic = sorted(dic.items(), key=lambda x: -x[1])

		# Keep only the highest-scoring words.
		max_count = min(args.max_num_word, len(dic))
		dic = dict(dic[:max_count])

		wordcloud = WordCloud(
			background_color="white",
			font_path=args.font_path,
			width=args.width,
			height=args.height,
			max_words=max_count,
			max_font_size=args.max_font_size).generate_from_frequencies(dic)
		# args.color indexes the recoloring functions (0 keeps the default).
		color_funcs = [None, color_func_1, color_func_2, color_func_3, color_func_4]
		wordcloud.recolor(color_func=color_funcs[args.color])
		wordcloud.to_file("{}/cloud_f_{}.png".format(args.output_dir, doc_id))
Ejemplo n.º 5
0
def main():
    """Plot KDE and scatter figures for each adjacent pair of document-vector
    dimensions (i, i+1), saved under args.output_dir."""
    mkdir(args.output_dir)
    assert os.path.exists(args.model_filename)
    cstm = model.cstm(args.model_filename)
    ndim_d = cstm.get_ndim_d()

    # Fetch all document vectors.
    doc_vectors = np.asarray(cstm.get_doc_vectors(), dtype=np.float32)

    for i in xrange(ndim_d - 1):
        # BUG FIX: the slice used to be doc_vectors[:, i:], which passes every
        # remaining dimension even though the output filename names the single
        # pair (i, i+1). Pass exactly those two columns.
        pair = doc_vectors[:, i:i + 2]
        plot_kde(pair,
                 args.output_dir,
                 filename="doc_kde_{}-{}".format(i, i + 1))
        plot_scatter(pair,
                     args.output_dir,
                     filename="doc_scatter_{}-{}".format(i, i + 1))
Ejemplo n.º 6
0
def main():
	"""Plot f = <word vector, doc vector> over the most frequent words for
	the single document selected by args.doc_id."""
	mkdir(args.output_dir)
	assert os.path.exists(args.model_filename)
	cstm = model.cstm(args.model_filename)
	ndim = cstm.get_ndim_d()

	# Document vectors for the whole corpus; only args.doc_id's is plotted.
	# (The full word-vector matrix was previously fetched here too but never
	# used — the per-word vectors come from get_high_freq_words() below.)
	doc_vectors = np.asarray(cstm.get_doc_vectors(), dtype=np.float32)

	# Top 10,000 words by corpus-wide frequency.
	common_words = cstm.get_high_freq_words(10000)

	# Pair each word with its embedding; meta = (id, word, count, vector, doc_ids).
	word_vector_pair = []
	for meta in common_words:
		word = meta[1]
		word_vector = np.asarray(meta[3], dtype=np.float32)
		word_vector_pair.append((word, word_vector))
	plot_f(word_vector_pair, args.doc_id, doc_vectors[args.doc_id], args.output_dir)
Ejemplo n.º 7
0
def get_analogies(args):
    assert os.path.exists(args.model_filename)
    cstm = model.cstm(args.model_filename)
    ndim_d = cstm.get_ndim_d()

    king = np.asarray(cstm.get_word_vector_by_word(u"サーバルちゃん"),
                      dtype=np.float32)
    man = np.asarray(cstm.get_word_vector_by_word(u"けもフレ"), dtype=np.float32)
    woman = np.asarray(cstm.get_word_vector_by_word(u"ごちうさ"), dtype=np.float32)
    queen = (king - man + woman).tolist()

    # 単語情報を取得
    words = cstm.get_words_similar_to_vector(queen, 20)
    print "word_id	word		count	cosine"
    for meta in words:
        # 単語ID、単語、総出現回数、単語ベクトル、内積
        word_id, word, count, vector, cosine = meta
        vector = np.asarray(vector, dtype=np.float32)
        word = word.encode(sys.stdout.encoding) + " " * max(0, 8 - len(word))
        print "{}	{}	{}	{}".format(word_id, word, count, cosine)
Ejemplo n.º 8
0
def main():
    """Scatter-plot document vectors grouped by category.

    A document's category is inferred from its filename prefix.
    """
    mkdir(args.output_dir)
    assert os.path.exists(args.model_filename)
    trained = model.cstm(args.model_filename)
    num_dims = trained.get_ndim_d()

    # Fetch word and document embeddings.
    word_vectors = np.asarray(trained.get_word_vectors(), dtype=np.float32)
    doc_vectors = np.asarray(trained.get_doc_vectors(), dtype=np.float32)

    # Known category prefixes for the corpus filenames.
    categories = [
        "geforce",
        "gochiusa",
        "imas",
        "kemono",
        "macbook",
        "monst",
        "pad",
        "tekketsu",
        "win10",
    ]

    # Bucket document vectors by the category their filename starts with.
    doc_filenames = trained.get_doc_filenames()
    doc_vectors_for_category = []
    for prefix in categories:
        bucket = []
        for filename in doc_filenames:
            if not filename.startswith(prefix):
                continue
            doc_id = trained.get_doc_id_by_filename(filename)
            assert doc_id >= 0 and doc_id < len(doc_vectors)
            bucket.append(doc_vectors[doc_id])
        doc_vectors_for_category.append(bucket)
    doc_vectors_for_category = np.asanyarray(doc_vectors_for_category)
    plot_scatter_category(doc_vectors_for_category,
                          args.output_dir,
                          filename="category")