def _absolute_score(self, sentence_1, sentence_2):
    # Cosine similarity between SIF-weighted sentence vectors
    splitted_1 = preprocess(sentence_1)
    splitted_2 = preprocess(sentence_2)
    vecs = sif_feature_vectors(splitted_1, splitted_2, self.embedding)
    A = vecs[0]
    B = vecs[1]
    return A @ B / (np.linalg.norm(A) * np.linalg.norm(B))
def test_most_similar():
    """Print a similarity ranking for a query word."""
    text = 'You say goodbye and I say hello.'
    corpus, word_to_id, id_to_word = preprocess(text)
    vocab_size = len(word_to_id)
    C = create_co_matrix(corpus, vocab_size)
    most_similar('you', word_to_id, id_to_word, C, top=5)
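# For reference, a minimal sketch of what a most_similar()-style helper typically
# looks like (not necessarily the exact common.utils implementation): rank every
# vocabulary word by cosine similarity to the query vector and print the top hits.
# It assumes a cos_similarity(a, b) function like the one used above.
import numpy as np

def most_similar_sketch(query, word_to_id, id_to_word, word_matrix, top=5):
    if query not in word_to_id:
        print(f'{query} is not found')
        return
    query_vec = word_matrix[word_to_id[query]]
    similarity = np.array([cos_similarity(word_matrix[i], query_vec)
                           for i in range(len(id_to_word))])
    count = 0
    for i in (-similarity).argsort():  # indices sorted by descending similarity
        if id_to_word[i] == query:
            continue  # skip the query word itself
        print(f' {id_to_word[i]}: {similarity[i]}')
        count += 1
        if count >= top:
            return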
def test_corpus():
    """Run the preprocess function from utils.py."""
    import sys
    # Paths appended so that modules in the parent directory can be imported
    sys.path.append("/Users/inoueshinichi/Desktop/DeepLearning2_NLP")
    sys.path.append("/home/inoue/Desktop/DeepLearning2_NLP")
    from common.utils import preprocess

    text = 'You say goodbye and I say hello.'
    corpus, word_to_id, id_to_word = preprocess(text)
    print(f"corpus: {corpus}")
    print(f"word_to_id: {word_to_id}")
    print(f"id_to_word: {id_to_word}")
def test_similarity():
    """Compute cosine similarity between two word vectors."""
    text = 'You say goodbye and I say hello.'
    corpus, word_to_id, id_to_word = preprocess(text)
    print(f"corpus: {corpus}")
    print(f"word_to_id: {word_to_id}")
    print(f"id_to_word: {id_to_word}")

    vocab_size = len(word_to_id)
    C = create_co_matrix(corpus, vocab_size)
    c0 = C[word_to_id['you']]  # word vector for 'you'
    c1 = C[word_to_id['i']]    # word vector for 'i'
    print(f"cos_similarity: {cos_similarity(c0, c1)}")
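# A minimal sketch of the cosine-similarity helper assumed above. The eps term is
# a common guard against division by zero for all-zero vectors; the exact
# common.utils signature may differ.
import numpy as np

def cos_similarity_sketch(x, y, eps=1e-8):
    # Normalize both vectors, then take the dot product.
    nx = x / (np.sqrt(np.sum(x ** 2)) + eps)
    ny = y / (np.sqrt(np.sum(y ** 2)) + eps)
    return np.dot(nx, ny)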
def test_generate_contexts_and_targets():
    # Build the corpus
    from common.utils import preprocess
    text = 'You say goodbye and I say hello.'
    corpus, word_to_id, id_to_word = preprocess(text)
    print(f"corpus: {corpus}")
    print(f"id_to_word: {id_to_word}")

    # Create contexts and targets
    from common.utils import create_context_target
    contexts, targets = create_context_target(corpus)
    print(f"contexts: {contexts}")
    print(f"targets: {targets}")

    # Convert to one-hot vectors
    from common.utils import convert_one_hot
    vocab_size = len(word_to_id)
    targets = convert_one_hot(targets, vocab_size)
    contexts = convert_one_hot(contexts, vocab_size)
    print("one-hot targets: ", targets)
    print("one-hot contexts: ", contexts)
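# For reference, a minimal sketch of a create_context_target-style helper (the
# window_size default and the exact common.utils behavior are assumptions): for
# each position, the surrounding word IDs become the context and the center word
# ID becomes the target.
import numpy as np

def create_context_target_sketch(corpus, window_size=1):
    target = corpus[window_size:-window_size]
    contexts = []
    for idx in range(window_size, len(corpus) - window_size):
        cs = [corpus[idx + t]
              for t in range(-window_size, window_size + 1) if t != 0]
        contexts.append(cs)
    return np.array(contexts), np.array(target)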
def test_count_method_small():
    """Dimensionality reduction of the PPMI (positive PMI) matrix via singular value decomposition (SVD)."""
    text = 'You say goodbye and I say hello.'
    print(text)
    corpus, word_to_id, id_to_word = preprocess(text)
    print(f"corpus: {corpus}")
    print(f"word_to_id: {word_to_id}")
    print(f"id_to_word: {id_to_word}")

    vocab_size = len(word_to_id)
    C = create_co_matrix(corpus, vocab_size)

    # Positive pointwise mutual information
    W = ppmi(C)

    np.set_printoptions(precision=3)
    print("co-occurrence matrix")
    print(C)
    print('-' * 50)
    print("PPMI")
    print(W)

    # SVD
    U, S, V = np.linalg.svd(W)
    # Note: full SVD costs O(N^3) in the matrix size N, which is slow,
    # so truncated SVD (e.g. scikit-learn) is normally used instead.
    # U holds the dense word vectors produced by the SVD.
    print(f"C[0]: {C[0]}")
    print(f"W[0]: {W[0]}")
    print(f"U[0]: {U[0]}")
    print(f"U[0, :2]: {U[0, :2]}")

    # Plot the dimension-reduced vectors as a 2-D scatter plot
    for word, word_id in word_to_id.items():
        plt.annotate(word, (U[word_id, 0], U[word_id, 1]))
    plt.scatter(U[:, 0], U[:, 1], alpha=0.5)
    plt.show()
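# Since the note above recommends truncated SVD for larger matrices, here is a
# minimal sketch using scikit-learn's TruncatedSVD (assumes scikit-learn is
# installed; W stands for the PPMI matrix computed above, and n_components must
# be smaller than the vocabulary size).
from sklearn.decomposition import TruncatedSVD

def reduce_with_truncated_svd(W, n_components=2, random_state=0):
    # fit_transform returns the word vectors projected onto the top
    # n_components singular directions, which is much cheaper than the
    # full O(N^3) decomposition when n_components is small.
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    return svd.fit_transform(W)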
def test_ppmi():
    """Compute a co-occurrence matrix whose entries are positive pointwise mutual information values."""
    text = 'You say goodbye and I say hello.'
    print(text)
    corpus, word_to_id, id_to_word = preprocess(text)
    print(f"corpus: {corpus}")
    print(f"word_to_id: {word_to_id}")
    print(f"id_to_word: {id_to_word}")

    vocab_size = len(word_to_id)
    C = create_co_matrix(corpus, vocab_size)

    # Positive pointwise mutual information
    W = ppmi(C)

    np.set_printoptions(precision=3)
    print("co-occurrence matrix")
    print(C)
    print('-' * 50)
    print("PPMI")
    print(W)
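# A minimal sketch of a ppmi()-style function for reference (eps and the exact
# common.utils implementation are assumptions). It computes
# PPMI(i, j) = max(0, log2(P(i, j) / (P(i) * P(j)))) with the probabilities
# estimated from the co-occurrence counts in C.
import numpy as np

def ppmi_sketch(C, eps=1e-8):
    M = np.zeros_like(C, dtype=np.float32)
    N = np.sum(C)          # total number of co-occurrence counts
    S = np.sum(C, axis=0)  # marginal counts per word
    for i in range(C.shape[0]):
        for j in range(C.shape[1]):
            pmi = np.log2(C[i, j] * N / (S[i] * S[j]) + eps)
            M[i, j] = max(0, pmi)
    return M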
def test_train_word2vec_model():
    """Train a word2vec (CBOW) model."""
    window_size = 1
    hidden_size = 5  # dimensionality of the distributed word vectors
    batch_size = 3
    max_epoch = 1000

    text = 'You say goodbye and I say hello.'

    # Build the corpus
    corpus, word_to_id, id_to_word = preprocess(text)

    # Create contexts and targets
    vocab_size = len(word_to_id)
    contexts, target = create_context_target(corpus, window_size)
    target = convert_one_hot(target, vocab_size)
    contexts = convert_one_hot(contexts, vocab_size)
    print("one-hot target: ", target)
    print("one-hot contexts: ", contexts)

    # CBOW model
    model = SimpleCBOW(vocab_size, hidden_size)
    optimizer = Adam()

    # Trainer
    trainer = Trainer(model, optimizer)

    # Training
    trainer.fit(contexts, target, max_epoch=max_epoch, batch_size=batch_size)
    trainer.plot()

    # Retrieve the CBOW input-side weights (W_in) as the word vectors
    word_vecs = model.word_vecs
    for word_id, word in id_to_word.items():
        print(word, word_vecs[word_id])
def word_variance():
    """Build a co-occurrence matrix."""
    # -----------
    text = 'You say goodbye and I say hello.'
    corpus, word_to_id, id_to_word = preprocess(text)
    print(f"corpus: {corpus}")
    print(f"id_to_word: {id_to_word}")

    # Example co-occurrence matrix, written by hand to match the 7-word
    # vocabulary above with a window size of 1 (one row per word ID)
    C = np.array([
        [0, 1, 0, 0, 0, 0, 0],
        [1, 0, 1, 0, 1, 1, 0],
        [0, 1, 0, 1, 0, 0, 0],
        [0, 0, 1, 0, 1, 0, 0],
        [0, 1, 0, 1, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 1],
        [0, 0, 0, 0, 0, 1, 0]], dtype=np.int32)
    print(f"C: {C}")
    print(f"C[0]: {C[0]}")  # vector for word ID 0
    print(f"C[1]: {C[1]}")  # vector for word ID 1
    print(C[word_to_id['goodbye']])  # vector for 'goodbye'
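# For reference, a minimal sketch of a create_co_matrix-style helper (the
# window_size default and the exact common.utils behavior are assumptions):
# for each word ID in the corpus, count the word IDs that appear within the
# window on either side.
import numpy as np

def create_co_matrix_sketch(corpus, vocab_size, window_size=1):
    corpus_size = len(corpus)
    co_matrix = np.zeros((vocab_size, vocab_size), dtype=np.int32)
    for idx, word_id in enumerate(corpus):
        for i in range(1, window_size + 1):
            left_idx = idx - i
            right_idx = idx + i
            if left_idx >= 0:
                co_matrix[word_id, corpus[left_idx]] += 1
            if right_idx < corpus_size:
                co_matrix[word_id, corpus[right_idx]] += 1
    return co_matrix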
from typing import Optional
import sys
sys.path.append('/Users/umeco/projects/zero_DL2/src/')
from common.trainer import Trainer
from common.optimizer import Adam
from ch3.simple_cbow import SimpleCBOW
from common.utils import preprocess, create_context_target, convert_one_hot

window_size = 1
hidden_size = 5
batch_size = 3
max_epoch = 1000

text = 'You say goodbye and I say hello .'
corpus, word_to_id, id_to_word = preprocess(text)

vocab_size = len(word_to_id)
contexts, target = create_context_target(corpus, window_size)
target = convert_one_hot(target, vocab_size)
contexts = convert_one_hot(contexts, vocab_size)

model = SimpleCBOW(vocab_size, hidden_size)
optimizer = Adam()
trainer = Trainer(model, optimizer)

trainer.fit(contexts, target, max_epoch, batch_size)
trainer.plot()
def _absolute_score(self, sentence_1, sentence_2):
    # Jaccard similarity over the stemmed token sets
    stemmed_1 = preprocess(sentence_1)
    stemmed_2 = preprocess(sentence_2)
    intersection = stemmed_1.intersection(stemmed_2)
    union = stemmed_1.union(stemmed_2)
    return len(intersection) / len(union)
def load_data(self, file_name='data.txt', seed=1984):
    file_path = os.path.dirname(os.path.abspath(__file__)) + '/' + file_name
    if not os.path.exists(file_path):
        print('No file: %s' % file_name)
        return None

    questions, answers = [], []
    for line in open(file_path, 'r', encoding='utf-8'):
        idx = line.find(';')
        questions.append(line[:idx] + " _")
        answers.append(" _" + line[idx+1:])
    print("{} questions and {} answers found in text file.".format(len(questions), len(answers)))

    corpus_q, self.word_to_id_q, self.id_to_word_q = preprocess(" ".join(questions))
    corpus_a, self.word_to_id_a, self.id_to_word_a = preprocess_jp("".join(answers), skip_symbol=False)
    print("Corpus done. Len of corpus : {} questions and {} answers.".format(len(corpus_q), len(corpus_a)))

    # Register the space character as padding if it is not in the vocabulary yet
    if " " in self.word_to_id_q:
        empty_space_id_in_q = self.word_to_id_q[" "]
    else:
        empty_space_id_in_q = len(self.word_to_id_q)
        self.id_to_word_q[empty_space_id_in_q] = " "
        self.word_to_id_q[" "] = empty_space_id_in_q

    if " " in self.word_to_id_a:
        empty_space_id_in_a = self.word_to_id_a[" "]
    else:
        empty_space_id_in_a = len(self.word_to_id_a)
        self.id_to_word_a[empty_space_id_in_a] = " "
        self.word_to_id_a[" "] = empty_space_id_in_a

    corpus_questions = self.split_list_by_value(corpus_q, self.word_to_id_q["_"])
    corpus_answers = self.split_list_by_value(corpus_a, self.word_to_id_a["_"], True)

    q_max = max(len(x) for x in corpus_questions)
    a_max = max(len(x) for x in corpus_answers)

    # Create numpy arrays padded with the space ID
    x = np.zeros((len(corpus_questions), q_max), dtype=np.int32)
    x.fill(empty_space_id_in_q)
    t = np.zeros((len(corpus_answers), a_max), dtype=np.int32)
    t.fill(empty_space_id_in_a)
    for i, sentence in enumerate(corpus_questions):
        for j, c in enumerate(sentence):
            x[i][j] = c
    for i, sentence in enumerate(corpus_answers):
        for j, c in enumerate(sentence):
            t[i][j] = c
    print("x len, t len = {}, {}".format(len(x), len(t)))

    # Shuffle
    indices = np.arange(len(x))
    if seed is not None:
        np.random.seed(seed)
    np.random.shuffle(indices)
    x = x[indices]
    t = t[indices]

    # 10% for validation set
    split_at = len(x) - len(x) // 10
    (self.x_train, self.x_test) = x[:split_at], x[split_at:]
    (self.t_train, self.t_test) = t[:split_at], t[split_at:]

    self.save_corpus(file_name)

    return (self.x_train, self.t_train), (self.x_test, self.t_test)
def _absolute_score(self, sentence_1, sentence_2):
    # Word Mover's Distance between the two tokenized sentences
    splitted_1 = preprocess(sentence_1)
    splitted_2 = preprocess(sentence_2)
    return self.embedding.model.wmdistance(splitted_1, splitted_2)
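# A minimal, self-contained sketch of computing Word Mover's Distance with gensim
# directly (self.embedding above is assumed to wrap a gensim KeyedVectors-style
# model; the model path and token lists below are placeholders, and wmdistance
# needs gensim's optional earth-mover dependency installed).
from gensim.models import KeyedVectors

def wmd_example(model_path='GoogleNews-vectors-negative300.bin'):
    # Load a pretrained word-embedding file from disk.
    wv = KeyedVectors.load_word2vec_format(model_path, binary=True)
    tokens_1 = ['obama', 'speaks', 'media', 'illinois']
    tokens_2 = ['president', 'greets', 'press', 'chicago']
    # Lower values mean the two token sequences are closer in embedding space.
    return wv.wmdistance(tokens_1, tokens_2)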