Example 1
 def _absolute_score(self, sentence_1, sentence_2):
     # Tokenize both sentences and build their SIF (smooth inverse frequency) sentence vectors
     splitted_1 = preprocess(sentence_1)
     splitted_2 = preprocess(sentence_2)
     vecs = sif_feature_vectors(splitted_1, splitted_2, self.embedding)
     A = vecs[0]
     B = vecs[1]
     # Cosine similarity of the two sentence vectors
     return A @ B / (np.linalg.norm(A) * np.linalg.norm(B))
def test_most_similar():
    """クエリ単語に対する類似度のランキングを表示
    """
    text = 'You say goodbye and I say hello.'
    corpus, word_to_id, id_to_word = preprocess(text)
    vocab_size = len(word_to_id)
    C = create_co_matrix(corpus, vocab_size)

    most_similar('you', word_to_id, id_to_word, C, top=5)
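
The snippet above relies on create_co_matrix and most_similar from the book's common.utils, which are not shown on this page. As a rough sketch only (the _sketch name is illustrative, not the library's), create_co_matrix is assumed to count, for every position in the corpus, the word IDs that fall inside the window on either side:

# Rough sketch of the assumed behaviour of create_co_matrix
import numpy as np

def create_co_matrix_sketch(corpus, vocab_size, window_size=1):
    corpus_size = len(corpus)
    co_matrix = np.zeros((vocab_size, vocab_size), dtype=np.int32)
    for idx, word_id in enumerate(corpus):
        for i in range(1, window_size + 1):
            left_idx, right_idx = idx - i, idx + i
            if left_idx >= 0:
                co_matrix[word_id, corpus[left_idx]] += 1
            if right_idx < corpus_size:
                co_matrix[word_id, corpus[right_idx]] += 1
    return co_matrix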
Example 3
def test_corpus():
    """utils.pyのpreprocess関数を実行
    """
    import sys
    sys.path.append("/Users/inoueshinichi/Desktop/DeepLearning2_NLP") # 親ディレクトリのファイルをインポートするための設定
    sys.path.append("/home/inoue/Desktop/DeepLearning2_NLP")
    from common.utils import preprocess

    text = 'You say goodbye and I say hello.'
    corpus, word_to_id, id_to_word = preprocess(text)

    print(f"corpus: {corpus}")
    print(f"word_to_id: {word_to_id}")
    print(f"id_to_word: {id_to_word}")
Example 4
def test_similarity():
    """コサイン類似度の計算
    """

    text = 'You say goodbye and I say hello.'
    corpus, word_to_id, id_to_word = preprocess(text)
    print(f"corpus: {corpus}")
    print(f"word_to_id: {word_to_id}")
    print(f"id_to_word: {id_to_word}")
    vocab_size = len(word_to_id)
    C = create_co_matrix(corpus, vocab_size)

    c0 = C[word_to_id['you']]  # word vector for 'you'
    c1 = C[word_to_id['i']]  # word vector for 'i'
    print(f"cos_similarity: {cos_similarity(c0, c1)}")
def test_generate_contexts_and_targets():
    # Build the corpus
    from common.utils import preprocess
    text = 'You say goodbye and I say hello.'
    corpus, word_to_id, id_to_word = preprocess(text)
    print(f"corpus: {corpus}")
    print(f"id_to_word: {id_to_word}")

    # Build the contexts and targets
    from common.utils import create_context_target
    contexts, targets = create_context_target(corpus)
    print(f"contexts: {contexts}")
    print(f"targets: {targets}")

    # Convert to one-hot vectors
    from common.utils import convert_one_hot
    vocab_size = len(word_to_id)
    targets = convert_one_hot(targets, vocab_size)
    contexts = convert_one_hot(contexts, vocab_size)
    print("one-hot targets: ", targets)
    print("one-hot contexts: ", contexts)
def test_count_method_small():
    """特異値分解(SVD)によるPPMI(正の相互情報量行列)の次元削減
    """
    
    text = 'You say goodbye and I say hello.'
    print(text)
    corpus, word_to_id, id_to_word = preprocess(text)
    print(f"corpus: {corpus}")
    print(f"word_to_id: {word_to_id}")
    print(f"id_to_word: {id_to_word}")
    vocab_size = len(word_to_id)
    C = create_co_matrix(corpus, vocab_size)

    # Positive pointwise mutual information (PPMI)
    W = ppmi(C)

    np.set_printoptions(precision=3)
    print("covariance matrix")
    print(C)
    print('-'*50)
    print("PPMI")
    print(W)

    # SVD
    U, S, V = np.linalg.svd(W)  # Note: full SVD takes O(N^3) time in the matrix size N, which is slow; truncated SVD (e.g. scikit-learn) is normally used instead
    # U holds the dense word vectors (one row per word ID) produced by the SVD

    print(f"C[0]: {C[0]}") 
    print(f"W[0]: {W[0]}")
    print(f"U[0]: {U[0]}")

    print(f"U[0, :2]: {U[0, :2]}")


    # Plot the dimensionality-reduced vectors as a 2D scatter plot
    for word, word_id in word_to_id.items():
        plt.annotate(word, (U[word_id, 0], U[word_id, 1]))
    
    plt.scatter(U[:, 0], U[:, 1], alpha=0.5)
    plt.show()
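
As the note in the code says, full SVD takes O(N^3) time; for larger vocabularies a truncated SVD is the usual replacement. A sketch using scikit-learn's TruncatedSVD (an extra dependency, not used in the snippet above):

# Sketch: truncated SVD instead of np.linalg.svd, assuming scikit-learn is installed
from sklearn.decomposition import TruncatedSVD

def reduce_with_truncated_svd(W, n_components=2, seed=0):
    svd = TruncatedSVD(n_components=n_components, random_state=seed)
    return svd.fit_transform(W)  # shape: (vocab_size, n_components)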
Example 7
def test_ppmi():
    """正の相互情報量を値とする共起行列を計算
    """

    text = 'You say goodbye and I say hello.'
    print(text)
    corpus, word_to_id, id_to_word = preprocess(text)
    print(f"corpus: {corpus}")
    print(f"word_to_id: {word_to_id}")
    print(f"id_to_word: {id_to_word}")
    vocab_size = len(word_to_id)
    C = create_co_matrix(corpus, vocab_size)

    # Positive pointwise mutual information (PPMI)
    W = ppmi(C)

    np.set_printoptions(precision=3)
    print("covariance matrix")
    print(C)
    print('-' * 50)
    print("PPMI")
    print(W)
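
ppmi itself is not shown on this page. The sketch below states the assumed definition, PPMI(i, j) = max(0, log2(C[i, j] * N / (S[i] * S[j]))), where N is the total number of co-occurrences and S the per-word counts; eps keeps log2 away from zero:

# Sketch of the assumed ppmi helper
import numpy as np

def ppmi_sketch(C, eps=1e-8):
    M = np.zeros_like(C, dtype=np.float32)
    N = np.sum(C)          # total number of co-occurrences
    S = np.sum(C, axis=0)  # occurrence count of each word
    for i in range(C.shape[0]):
        for j in range(C.shape[1]):
            pmi = np.log2(C[i, j] * N / (S[i] * S[j]) + eps)
            M[i, j] = max(0, pmi)
    return M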
Example 8
def test_train_word2vec_model():
    """word2vecモデルの学習
    """

    window_size = 1
    hidden_size = 5  # dimensionality of the distributed word-vector representation
    batch_size = 3
    max_epoch = 1000

    text = 'You say goodbye and I say hello.'

    # Build the corpus
    corpus, word_to_id, id_to_word = preprocess(text)

    # Build the contexts and targets
    vocab_size = len(word_to_id)
    contexts, target = create_context_target(corpus, window_size)
    target = convert_one_hot(target, vocab_size)
    contexts = convert_one_hot(contexts, vocab_size)
    print("one-hot target: ", target)
    print("one-hot contexts: ", contexts)

    # CBOW model
    model = SimpleCBOW(vocab_size, hidden_size)
    optimizer = Adam()

    # trainer
    trainer = Trainer(model, optimizer)

    # Train the model
    trainer.fit(contexts, target, max_epoch=max_epoch, batch_size=batch_size)
    trainer.plot()

    # Retrieve the CBOW input-side weights (W_in)
    word_vecs = model.word_vecs
    for word_id, word in id_to_word.items():
        print(word, word_vecs[word_id])
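
Once training has finished, the learned vectors can be ranked just like the count-based ones, e.g. with most_similar from common.utils (assuming it is importable here):

most_similar('you', word_to_id, id_to_word, model.word_vecs, top=5)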
def word_variance():
    """共起行列の作成
    """
    # -----------
    text = 'You say goodbye and I say hello.'
    corpus, word_to_id, id_to_word = preprocess(text)

    print(f"corpus: {corpus}")
    print(f"id_to_word: {id_to_word}")

    # Example co-occurrence matrix for this corpus (7x7, one row and column per word ID)
    C = np.array([
        [0, 1, 0, 0, 0, 0, 0],
        [1, 0, 1, 0, 1, 1, 0],
        [0, 1, 0, 1, 0, 0, 0],
        [0, 0, 1, 0, 1, 0, 0],
        [0, 1, 0, 1, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 1],
        [0, 0, 0, 0, 0, 1, 0]],
        dtype=np.int32)
    print(f"C: {C}")

    print(f"C[0]: {C[0]}") # 単語ID[0]のベクトル
    print(f"C[1]: {C[1]}") # 単語ID[1]のベクトル
    print(C[word_to_id['goodbye']]) # [goodbye]のベクトル
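
With the full 7x7 matrix above (which is what create_co_matrix would produce for this sentence with a window size of 1), C[word_to_id['goodbye']] is the third row, [0, 1, 0, 1, 0, 0, 0].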
Example 10
from typing import Optional
import sys
sys.path.append('/Users/umeco/projects/zero_DL2/src/')
from common.trainer import Trainer
from common.optimizer import Adam
from ch3.simple_cbow import SimpleCBOW
from common.utils import preprocess, create_context_target, convert_one_hot

window_size = 1
hidden_size = 5
batch_size = 3
max_epoch = 1000

text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)

vocab_size = len(word_to_id)
contexts, target = create_context_target(corpus, window_size)
target = convert_one_hot(target, vocab_size)
contexts = convert_one_hot(contexts, vocab_size)

model = SimpleCBOW(vocab_size, hidden_size)
optimizer = Adam()
trainer = Trainer(model, optimizer)

trainer.fit(contexts, target, max_epoch, batch_size)
trainer.plot()
Example 11
 def _absolute_score(self, sentence_1, sentence_2):
     # Jaccard similarity: size of the intersection over the size of the union of the stemmed token sets
     stemmed_1 = preprocess(sentence_1)
     stemmed_2 = preprocess(sentence_2)
     intersection = stemmed_1.intersection(stemmed_2)
     union = stemmed_1.union(stemmed_2)
     return len(intersection) / len(union)
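
This variant is a plain Jaccard similarity over stemmed token sets (preprocess here is assumed to return a set, unlike the tokenizing preprocess of the earlier examples). For instance, the sets {'say', 'hello'} and {'say', 'goodbye'} share one of three distinct tokens, giving a score of 1/3.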
Example 12
  def load_data(self, file_name='data.txt', seed=1984):
    file_path = os.path.dirname(os.path.abspath(__file__)) + '/' + file_name

    if not os.path.exists(file_path):
      print('No file: %s' % file_name)
      return None

    questions, answers = [], []

    for line in open(file_path, 'r', encoding='utf-8'):
      idx = line.find(';')
      questions.append(line[:idx] + " _")
      answers.append(" _" + line[idx+1:])
    
    print("{} questions and {} answers found in text file.".format(len(questions), len(answers)))

    corpus_q, self.word_to_id_q, self.id_to_word_q = preprocess(" ".join(questions))
    corpus_a, self.word_to_id_a, self.id_to_word_a = preprocess_jp("".join(answers), skip_symbol=False)

    print("Corpus done. Len of corpus : {} questions and {} answers.".format(len(corpus_q), len(corpus_a)))

    if " " in self.word_to_id_q:
      empty_space_id_in_q = self.word_to_id_q[" "]
    else:
      empty_space_id_in_q = len(self.word_to_id_q)
      self.id_to_word_q[empty_space_id_in_q] = " "
      self.word_to_id_q[" "] = empty_space_id_in_q
    
    if " " in self.word_to_id_a:
      empty_space_id_in_a = self.word_to_id_a[" "]
    else:
      empty_space_id_in_a = len(self.word_to_id_a)
      self.id_to_word_a[empty_space_id_in_a] = " "
      self.word_to_id_a[" "] = empty_space_id_in_a

    corpus_questions = self.split_list_by_value(corpus_q, self.word_to_id_q["_"])
    corpus_answers = self.split_list_by_value(corpus_a, self.word_to_id_a["_"], True)

    q_max = max(len(x) for x in corpus_questions)
    a_max = max(len(x) for x in corpus_answers)

    # create numpy array
    x = np.zeros((len(corpus_questions), q_max), dtype=np.int32)
    x.fill(empty_space_id_in_q)
    t = np.zeros((len(corpus_answers), a_max), dtype=np.int32)
    t.fill(empty_space_id_in_a)

    for i, sentence in enumerate(corpus_questions):
      for j, c in enumerate(sentence):
        x[i][j] = c
    for i, sentence in enumerate(corpus_answers):
      for j, c in enumerate(sentence):
        t[i][j] = c
    
    print("x len, t len = {}, {}".format(len(x), len(t)))
    
    # shuffle
    indices = np.arange(len(x))
    if seed is not None:
      np.random.seed(seed)
    np.random.shuffle(indices)
    x = x[indices]
    t = t[indices]

    # 10% for validation set
    split_at = len(x) - len(x) // 10
    (self.x_train, self.x_test) = x[:split_at], x[split_at:]
    (self.t_train, self.t_test) = t[:split_at], t[split_at:]

    self.save_corpus(file_name)

    return (self.x_train, self.t_train), (self.x_test, self.t_test)
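
Questions and answers are padded to the length of the longest sequence with the ID of the space character, so every row of x and t has a fixed width before the data is shuffled and split 90/10 into training and validation sets.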
Example 13
 def _absolute_score(self, sentence_1, sentence_2):
     # Word Mover's Distance between the two tokenized sentences (smaller = more similar)
     splitted_1 = preprocess(sentence_1)
     splitted_2 = preprocess(sentence_2)
     return self.embedding.model.wmdistance(splitted_1, splitted_2)
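
wmdistance is gensim's Word Mover's Distance between the two token lists (self.embedding.model is assumed to be a gensim word-vector model); unlike the cosine and Jaccard scores above, it is a distance, so smaller values mean more similar sentences.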