コード例 #1
0
ファイル: evaluation.py プロジェクト: up276/NLP2K16
def wordLookup(input_word,
               train_path=os.path.join(FLAGS.data_path, 'ptb.train.txt')):
    """Look up ``input_word`` in the vocabulary built from ``train_path``.

    The vocabulary is rebuilt with ``reader._build_vocab`` on every call.

    Args:
        input_word: Key to search for in the vocabulary.
        train_path: File handed to ``reader._build_vocab``; defaults to
            ``ptb.train.txt`` under ``FLAGS.data_path``.

    Returns:
        The value stored for ``input_word`` in the vocabulary, or ``None``
        (after printing a message) when the word is not a key of it.
    """
    word_table = reader._build_vocab(train_path)
    if input_word not in word_table:
        print("Word is not present in vocab")
        return None
    return word_table[input_word]
コード例 #2
0
ファイル: PTB-LSTM.py プロジェクト: tobyma/tensorpack
def get_PennTreeBank(data_dir=None):
    """Load the three Penn Treebank text files as arrays of word ids.

    Args:
        data_dir: Directory holding the ``ptb.*.txt`` files. When ``None``,
            the directory returned by ``get_dataset_path('ptb_data')`` is used.

    Returns:
        A tuple ``(data3, word_to_id)``: ``data3`` is a list of three NumPy
        arrays (train, valid, test, in that order) produced from
        ``tfreader._file_to_word_ids``, and ``word_to_id`` is the vocabulary
        built by ``tfreader._build_vocab`` from the training file.
    """
    if data_dir is None:
        data_dir = get_dataset_path('ptb_data')

    train_file = os.path.join(data_dir, 'ptb.train.txt')
    # Only the training file is probed; if it is missing, all three files
    # are downloaded.
    if not os.path.isfile(train_file):
        for url in (TRAIN_URL, VALID_URL, TEST_URL):
            download(url, data_dir)

    word_to_id = tfreader._build_vocab(train_file)

    splits = []
    for fname in ('ptb.train.txt', 'ptb.valid.txt', 'ptb.test.txt'):
        ids = tfreader._file_to_word_ids(os.path.join(data_dir, fname), word_to_id)
        splits.append(np.asarray(ids))
    return splits, word_to_id
コード例 #3
0
ファイル: PTB-LSTM.py プロジェクト: zlczlcgithub/tensorpack
def get_PennTreeBank(data_dir=None):
    """Return the PTB train/valid/test splits as id arrays plus the vocabulary.

    Args:
        data_dir: Directory expected to contain the ``ptb.*.txt`` files.
            Falls back to ``get_dataset_path('ptb_data')`` when ``None``.

    Returns:
        ``(data3, word_to_id)`` where ``data3`` is a three-element list of
        NumPy arrays (train, valid, test) and ``word_to_id`` is the result of
        ``tfreader._build_vocab`` on the training file.
    """
    file_names = ['ptb.train.txt', 'ptb.valid.txt', 'ptb.test.txt']

    if data_dir is None:
        data_dir = get_dataset_path('ptb_data')

    def _path(name):
        # Location of one corpus file inside the data directory.
        return os.path.join(data_dir, name)

    # Presence of the training file decides whether all three are fetched.
    if not os.path.isfile(_path(file_names[0])):
        download(TRAIN_URL, data_dir)
        download(VALID_URL, data_dir)
        download(TEST_URL, data_dir)

    word_to_id = tfreader._build_vocab(_path(file_names[0]))
    data3 = []
    for name in file_names:
        data3.append(np.asarray(tfreader._file_to_word_ids(_path(name), word_to_id)))
    return data3, word_to_id
コード例 #4
0
ファイル: ptb_word_lm.py プロジェクト: ncaldwell17/Mercury
    def get_tokens(self, data_path, input_type):
        """Return the set of vocabulary ids for one family of number tokens.

        The vocabulary is built by ``reader._build_vocab`` from
        ``wiki.train.txt`` inside ``data_path``. Each number is looked up by
        its decimal string; a number that is not in the vocabulary
        contributes id ``0`` to the result.

        Args:
            data_path: Directory containing ``wiki.train.txt``.
            input_type: One of
                ``'rounds'`` (10, 20, ..., 90 and 100, 200, ..., 900),
                ``'years'`` (1000 through 2020 inclusive) or
                ``'days'`` (1 through 31 inclusive).

        Returns:
            A ``set`` of ids for the requested family, or ``None`` when
            ``input_type`` is not one of the three recognised values.
        """
        word_to_id = reader._build_vocab(
            os.path.join(data_path, "wiki.train.txt"))

        # Pick only the number range that was asked for, instead of building
        # all three id sets and discarding two of them.
        if input_type == 'rounds':
            numbers = list(range(10, 100, 10)) + list(range(100, 1000, 100))
        elif input_type == 'years':
            numbers = range(1000, 2021)
        elif input_type == 'days':
            numbers = range(1, 32)
        else:
            # Unknown category: callers have always received None here.
            return None

        return {word_to_id.get(str(number), 0) for number in numbers}
コード例 #5
0
ファイル: evaluation.py プロジェクト: up276/NLP2K16
def vizualizeTSNE(tsneEmbedding,
                  path_to_save,
                  path_to_train=os.path.join(FLAGS.data_path, 'ptb.train.txt'),
                  samples=400):
    """
    Plot a random sample of a t-SNE embedding, annotate it with words, save it.

    Args:
        tsneEmbedding: Array indexed as ``tsneEmbedding[axis][point]``; rows 0
            and 1 are used as the x and y coordinates of each point.
        path_to_save: Destination passed to ``plt.savefig``.
        path_to_train: File handed to ``reader._build_vocab`` to obtain the
            vocabulary used for the annotations.
        samples: Number of points drawn with ``np.random.choice`` (its
            default samples with replacement, so points can repeat).

    Returns:
        None. The figure is written to ``path_to_save``.
    """

    vocab = reader._build_vocab(path_to_train)
    # items() works on both Python 2 and 3; iteritems() is Python 2 only.
    reverse_vocab = {v: k for k, v in vocab.items()}
    random_ix = np.random.choice(tsneEmbedding.T.shape[0], samples)
    embedding_random = tsneEmbedding.T[random_ix].T
    # Materialise the keys first: on Python 3 keys() is a view that NumPy
    # would wrap as a 0-d object array, which cannot be indexed by random_ix.
    keys = np.array(list(reverse_vocab.keys()))[random_ix]
    plt.figure(figsize=(25, 25))
    plt.scatter(embedding_random[0], embedding_random[1])
    plt.title("TSNE Word Representation from random %s words " % samples)

    for i, key in enumerate(keys):
        plt.annotate(reverse_vocab[key],
                     (embedding_random[0][i], embedding_random[1][i]))

    # Save once, after every annotation is in place, rather than rewriting
    # the file on each loop iteration.
    plt.savefig(path_to_save)
コード例 #6
0
# Script fragment: restores a saved PTBModel checkpoint and reads out its
# embedding and softmax parameters. `eval_config`, `config`, `tf` and
# `PTBModel` are defined outside the visible part of the file.
eval_config.num_steps = 1
# `size` and `vocab_size` give the variable shapes requested further down;
# `num_layers` and `batch_size` are not referenced in the visible code.
size = 200
vocab_size = 10000
num_layers = 2
batch_size = 20
# Disabled code: reading the weight matrix and bias of each LSTM cell.
# W0 = tf.get_variable("RNN/MultiRNNCell/Cell0/BasicLSTMCell/Linear/Matrix", [400, 800])
# b0 = tf.get_variable("RNN/MultiRNNCell/Cell0/BasicLSTMCell/Linear/Bias", [800])
# W0 = session.run(W0)
# b0 = session.run(b0)
# W1 = tf.get_variable("RNN/MultiRNNCell/Cell1/BasicLSTMCell/Linear/Matrix", [400, 800])
# b1 = tf.get_variable("RNN/MultiRNNCell/Cell1/BasicLSTMCell/Linear/Bias", [800])
# W1 = session.run(W1)
# b1 = session.run(b1)

import reader
# `voc` is the mapping returned by reader._build_vocab for the PTB training
# file (hard-coded, machine-specific path); `cov` is the same mapping with
# keys and values swapped.
voc = reader._build_vocab(
    '/Users/marting/scratch/tensorflow/simple-examples/data/ptb.train.txt')
cov = {v: k for k, v in voc.items()}

# Build the model in a fresh graph, then load the checkpoint into it.
with tf.Graph().as_default(), tf.Session() as session:
    with tf.variable_scope("model", reuse=None, initializer=None):
        m = PTBModel(is_training=False, config=config)
    saver = tf.train.Saver()
    saver.restore(session, "/Users/marting/scratch/tensorflow/model.ckpt")
    # Re-enter the "model" scope with reuse=True so get_variable looks up the
    # variables that already exist under these names instead of creating new
    # ones.
    with tf.variable_scope("model", reuse=True):
        print("Model restored.")
        embedding = tf.get_variable("embedding", [vocab_size, size])
        softmax_w = tf.get_variable("softmax_w", [size, vocab_size])
        softmax_b = tf.get_variable("softmax_b", [vocab_size])
        # Rebind each name from the variable to the value session.run
        # returns for it.
        softmax_w = session.run(softmax_w)
        softmax_b = session.run(softmax_b)
        embedding = session.run(embedding)
コード例 #7
0
# Script fragment: restores a saved PTBModel checkpoint and reads out its
# embedding and softmax parameters. `eval_config`, `config`, `tf` and
# `PTBModel` are defined outside the visible part of the file.
eval_config.num_steps = 1
# `size` and `vocab_size` give the variable shapes requested further down;
# `num_layers` and `batch_size` are not referenced in the visible code.
size = 200
vocab_size = 10000
num_layers = 2
batch_size = 20
      # Disabled code: reading the weight matrix and bias of each LSTM cell.
      # W0 = tf.get_variable("RNN/MultiRNNCell/Cell0/BasicLSTMCell/Linear/Matrix", [400, 800])
      # b0 = tf.get_variable("RNN/MultiRNNCell/Cell0/BasicLSTMCell/Linear/Bias", [800])
      # W0 = session.run(W0)
      # b0 = session.run(b0)
      # W1 = tf.get_variable("RNN/MultiRNNCell/Cell1/BasicLSTMCell/Linear/Matrix", [400, 800])
      # b1 = tf.get_variable("RNN/MultiRNNCell/Cell1/BasicLSTMCell/Linear/Bias", [800])
      # W1 = session.run(W1)
      # b1 = session.run(b1)

import reader
# `voc` is the mapping returned by reader._build_vocab for the PTB training
# file (hard-coded, machine-specific path); `cov` is the same mapping with
# keys and values swapped.
voc = reader._build_vocab('/Users/marting/scratch/tensorflow/simple-examples/data/ptb.train.txt')
cov = {v:k for k,v in voc.items()}

# Build the model in a fresh graph, then load the checkpoint into it.
with tf.Graph().as_default(), tf.Session() as session:
  with tf.variable_scope("model", reuse=None, initializer=None):
     m = PTBModel(is_training=False, config=config)
  saver = tf.train.Saver()
  saver.restore(session, "/Users/marting/scratch/tensorflow/model.ckpt")
  # Re-enter the "model" scope with reuse=True so get_variable looks up the
  # variables that already exist under these names instead of creating new
  # ones.
  with tf.variable_scope("model", reuse=True):
      print("Model restored.")
      embedding = tf.get_variable("embedding", [vocab_size, size])
      softmax_w = tf.get_variable("softmax_w", [size, vocab_size])
      softmax_b = tf.get_variable("softmax_b", [vocab_size])
      # Rebind each name from the variable to the value session.run
      # returns for it.
      softmax_w = session.run(softmax_w)
      softmax_b = session.run(softmax_b)
      embedding = session.run(embedding)