Example 1
import shutil
from importlib import reload  # reload() is not a builtin on Python 3

import tensorflow as tf

import rnnlm  # local module defining the RNNLM class used throughout these examples


def make_tensorboard(tf_graphdir="/tmp/artificial_hotel_reviews/a4_graph", V=100, H=1024, num_layers=2):
    reload(rnnlm)
    TF_GRAPHDIR = tf_graphdir
    # Clear old log directory.
    shutil.rmtree(TF_GRAPHDIR, ignore_errors=True)
    
    lm = rnnlm.RNNLM(V=V, H=H, num_layers=num_layers)
    lm.BuildCoreGraph()
    lm.BuildTrainGraph()
    lm.BuildSamplerGraph()
    summary_writer = tf.summary.FileWriter(TF_GRAPHDIR, lm.graph)
    return summary_writer
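
A hypothetical usage sketch follows; the vocabulary size and layer sizes are placeholders, not values taken from these examples:

# Write the RNNLM graph so it can be inspected in TensorBoard (placeholder hyperparameters).
writer = make_tensorboard(tf_graphdir="/tmp/artificial_hotel_reviews/a4_graph",
                          V=5000, H=256, num_layers=1)
writer.flush()
# Then launch TensorBoard against the same directory:
#   tensorboard --logdir /tmp/artificial_hotel_reviews/a4_graph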
Example 2
    def setUp(self):
        sequence = ["a", "b", "c", "d"]
        self.vocab = vocabulary.Vocabulary(sequence)
        ids = self.vocab.words_to_ids(sequence)
        self.train_ids = np.array(ids * 10000, dtype=int)
        self.test_ids = np.array(ids * 100, dtype=int)

        model_params = dict(V=self.vocab.size, H=10,
                            softmax_ns=2, num_layers=1)
        self.lm = rnnlm.RNNLM(**model_params)
        self.lm.BuildCoreGraph()
        self.lm.BuildTrainGraph()
        self.lm.BuildSamplerGraph()
        # For toy model, ignore sampled softmax.
        self.lm.train_loss_ = self.lm.loss_
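
A companion test method is sketched below; it is an assumption about how this fixture might be exercised, and it only checks that the toy graph contains trainable variables, relying on nothing beyond the attributes shown above:

    def test_graph_has_variables(self):
        # Sketch: building the core/train/sampler graphs should have created variables.
        with self.lm.graph.as_default():
            self.assertGreater(len(tf.trainable_variables()), 0)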
Example 3
def generate_text(trained_filename, model_params):
    # Same as above, but as a batch
    #max_steps = 20
    max_steps = 50
    num_samples = 10
    random_seed = 42

    lm = rnnlm.RNNLM(**model_params)
    lm.BuildCoreGraph()
    lm.BuildSamplerGraph()

    with lm.graph.as_default():
        saver = tf.train.Saver()

    with tf.Session(graph=lm.graph) as session:
        # Seed RNG for repeatability
        tf.set_random_seed(random_seed)

        # Load the trained model
        saver.restore(session, trained_filename)

        # Make initial state for a batch with batch_size = num_samples
        #w = np.repeat([[vocab.START_ID]], num_samples, axis=0)
        w = np.repeat([[char_dict.get('<SOR>')]], num_samples, axis=0)
        h = session.run(lm.initial_h_, {lm.input_w_: w})
        # take one step for each sequence on each iteration
        for i in range(max_steps):
            h, y = sample_step(lm, session, w[:, -1:], h)
            w = np.hstack((w, y))

        # Print generated sentences
        for row in w:
            for i, word_id in enumerate(row):
                #print(vocab.id_to_word[word_id], end=" ")
                print(ids_to_words[word_id], end="")
                #if (i != 0) and (word_id == vocab.START_ID):
                if (i != 0) and (word_id == char_dict.get("<EOR>")):
                    break
            print("")
Example 4
def loadAndPreprocessData():
    '''
	Read all the words to create a vocabulary
	'''
    all_tokens = []
    indir = '../preprocess/'

    for root, dirs, filenames in os.walk(indir):
        for filename in filenames:
            if filename.startswith('canonicalized_words_'):
                with open(indir + filename, 'r') as f:
                    for line in f.readlines():
                        w = line.rstrip()
                        if w != '':
                            all_tokens.append(w)
    print 'Processed all tokens: ', len(all_tokens)

    tokens_dict = Counter()
    for w in all_tokens:
        if w.startswith('DG') and w.endswith('DG'):
            w = 'DG'
        tokens_dict[w] += 1
    '''
	Remove noisy tokens - see notebook for exploratory analysis
	The first ~2500 tokens, when sorted by key, are noise like "!!!!" or "* * * *" (e.g., chapter-end decorations)
	'''
    noisy_tokens = sorted(tokens_dict)[0:2507]
    print 'Identified noisy tokens - some examples: ', noisy_tokens[0:30]
    '''
	Clean up the tokens now that we know the noisy tokens and then generate the vocab
	'''
    noisy_tokens = set(noisy_tokens)
    words = [w for w in all_tokens if w not in noisy_tokens]
    # TODO: Should make V configurable
    V = 50000
    vocab = vocabulary.Vocabulary((word for word in words), size=V)
    print 'Vocabulary created with size: ', vocab.size
    '''
	Read in the sentences already parsed from the ~3000 books Gutenberg subset
	'''
    sents = []
    indir = '../preprocess/'
    books = []
    for root, dirs, filenames in os.walk(indir):
        for filename in filenames:
            if filename.startswith('parsed_sents_'):
                with open(indir + filename, 'r') as f:
                    for line in f.readlines():
                        sents.append(line.rstrip())
    print 'Parsed sentences loaded into memory: ', len(sents)
    print 'The 10,000th sentence is: ', sents[10000]
    '''
	Prepare training and test sentences
	'''
    split = 0.8
    shuffle = True

    sentences = np.array(sents, dtype=object)
    fmt = (len(sentences), sum(map(len, sentences)))
    print "Loaded %d sentences (%g tokens)" % fmt

    if shuffle:
        rng = np.random.RandomState(shuffle)
        rng.shuffle(sentences)  # in-place
    train_frac = 0.8
    split_idx = int(train_frac * len(sentences))
    train_sentences = sentences[:split_idx]
    test_sentences = sentences[split_idx:]

    fmt = (len(train_sentences), sum(map(len, train_sentences)))
    print "Training set: %d sentences (%d tokens)" % fmt
    fmt = (len(test_sentences), sum(map(len, test_sentences)))
    print "Test set: %d sentences (%d tokens)" % fmt
    '''
	Apply the vocab to the train and test sentences and convert words to ids to start training
	'''
    ## Preprocess sentences
    ## convert words to ids based on the vocab wordset created above
    ## Do this in batches to avoid crashes due to insufficient memory
    batch_size = 50000
    num_of_batches = int(round(len(train_sentences) / batch_size))
    print 'Preprocessing train sentences - number of batches: ', num_of_batches
    train_id_batches = []
    start = 0
    end = start + batch_size
    for i in range(num_of_batches):
        if i % 15 == 0:
            print 'Completed Batches: ', i
        train_id_batches.append(
            utils.preprocess_sentences(train_sentences[start:end], vocab))
        start = end
        end += batch_size
    # flatten the lists for 1D tensor
    temp = utils.flatten(train_id_batches)
    train_ids = utils.flatten(temp)
    print 'Train sentences converted to their IDs including start, end token and unknown word token'

    # repeat the same with test data
    batch_size = 50000
    num_of_batches = int(round(len(test_sentences) / batch_size))
    if num_of_batches > 10:
        num_of_batches = 10
    print 'Preprocessing test sentences - number of batches: ', num_of_batches
    test_id_batches = []
    start = 0
    end = start + batch_size
    for i in range(num_of_batches):
        print 'Batch: ', i
        test_id_batches.append(
            utils.preprocess_sentences(test_sentences[start:end], vocab))
        start = end
        end += batch_size
    test_ids = utils.flatten(utils.flatten(test_id_batches))

    print 'Test sentences converted to their IDs including start, end token and unknown word token'
    max_time = 40
    batch_size = 64
    learning_rate = 0.01
    num_epochs = 3

    # Model parameters
    model_params = dict(V=vocab.size, H=100, softmax_ns=200, num_layers=1)

    TF_SAVEDIR = "tf_saved"
    checkpoint_filename = os.path.join(TF_SAVEDIR, "rnnlm")
    trained_filename = os.path.join(TF_SAVEDIR, "rnnlm_trained")
    # Will print status every this many seconds
    print_interval = 120

    # Clear old log directory
    shutil.rmtree("tf_summaries", ignore_errors=True)

    lm = rnnlm.RNNLM(**model_params)
    lm.BuildCoreGraph()
    lm.BuildTrainGraph()

    # Explicitly add global initializer and variable saver to LM graph
    with lm.graph.as_default():
        initializer = tf.global_variables_initializer()
        saver = tf.train.Saver()

    # Clear old log directory
    shutil.rmtree(TF_SAVEDIR, ignore_errors=True)
    if not os.path.isdir(TF_SAVEDIR):
        os.makedirs(TF_SAVEDIR)

    with tf.Session(graph=lm.graph) as session:
        # Seed RNG for repeatability
        tf.set_random_seed(42)
        session.run(initializer)

        for epoch in xrange(1, num_epochs + 1):
            t0_epoch = time.time()
            bi = utils.batch_generator(train_ids, batch_size, max_time)
            print "[epoch %d] Starting epoch %d" % (epoch, epoch)
            #### YOUR CODE HERE ####
            # Run a training epoch.

            run_epoch(lm, session, bi, train=True, learning_rate=learning_rate)

            #### END(YOUR CODE) ####
            print "[epoch %d] Completed in %s" % (
                epoch, utils.pretty_timedelta(since=t0_epoch))

            # Save a checkpoint
            saver.save(session, checkpoint_filename, global_step=epoch)
            ##
            # score_dataset will run a forward pass over the entire dataset
            # and report perplexity scores. This can be slow (around 1/2 to
            # 1/4 as long as a full epoch), so you may want to comment it out
            # to speed up training on a slow machine. Be sure to run it at the
            # end to evaluate your score.
            print("[epoch %d]" % epoch), score_dataset(lm,
                                                       session,
                                                       train_ids,
                                                       name="Train set")
            print("[epoch %d]" % epoch), score_dataset(lm,
                                                       session,
                                                       test_ids,
                                                       name="Test set")
            print ""
        # Save final model
        saver.save(session, trained_filename)
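
utils.batch_generator is the author's own helper and its body is not shown here; the sketch below illustrates the conventional next-token batch generator for this call signature, under the assumption that it yields (input, target) windows of shape [batch_size, max_time]:

import numpy as np

def batch_generator_sketch(ids, batch_size, max_time):
    # Illustrative stand-in for utils.batch_generator (assumed behavior, not the author's code).
    ids = np.asarray(ids)
    # Clip so the id sequence reshapes evenly into batch_size rows.
    clip_len = ((len(ids) - 1) // batch_size) * batch_size
    input_w = ids[:clip_len].reshape([batch_size, -1])         # current tokens
    target_y = ids[1:clip_len + 1].reshape([batch_size, -1])   # next tokens
    # Yield consecutive windows of max_time steps.
    for i in range(0, input_w.shape[1], max_time):
        yield input_w[:, i:i + max_time], target_y[:, i:i + max_time]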
Example 5
    def setUp(self):
        model_params = dict(V=512, H=100, num_layers=1)
        self.lm = rnnlm.RNNLM(**model_params)
        self.lm.BuildCoreGraph()
        self.lm.BuildSamplerGraph()
Example 6
            lm.dropout_keep_prob_: 1.0,
            lm.learning_rate_: 0.1
        })

    #### END(YOUR CODE) ####
    # Note indexing here:
    #   [batch_size, max_time, 1] -> [batch_size, 1]
    return final_h, samples[:, -1, :]


# Same as above, but as a batch
max_steps = 20
num_samples = 10
random_seed = 42

lm = rnnlm.RNNLM(**model_params)
lm.BuildCoreGraph()
lm.BuildSamplerGraph()

with lm.graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=lm.graph) as session:
    # Seed RNG for repeatability
    tf.set_random_seed(random_seed)

    # Load the trained model
    saver.restore(session, trained_filename)

    # Make initial state for a batch with batch_size = num_samples
    w = np.repeat([[vocab.START_ID]], num_samples, axis=0)
Example 7
def run_training(train_ids,
                 test_ids,
                 model_params,
                 max_time=100,
                 batch_size=256,
                 learning_rate=0.002,
                 num_epochs=20,
                 tf_savedir="/tmp/artificial_hotel_reviews/a4_model"):
    #V = len(words_to_ids.keys())
    # Training parameters
    ## add parameter sets for each attack/defense configuration
    #max_time = 25
    #batch_size = 100
    #learning_rate = 0.01
    #num_epochs = 10

    # Model parameters
    #model_params = dict(V=vocab.size,
    #H=200,
    #softmax_ns=200,
    #num_layers=2)
    #model_params = dict(V=len(words_to_ids.keys()),
    #H=1024,
    #softmax_ns=len(words_to_ids.keys()),
    #num_layers=2)
    #model_params = dict(V=V, H=H, softmax_ns=softmax_ns, num_layers=num_layers)

    #TF_SAVEDIR = "/tmp/artificial_hotel_reviews/a4_model"
    TF_SAVEDIR = tf_savedir
    checkpoint_filename = os.path.join(TF_SAVEDIR, "rnnlm")
    trained_filename = os.path.join(TF_SAVEDIR, "rnnlm_trained")

    # Will print status every this many seconds
    #print_interval = 5
    print_interval = 30

    lm = rnnlm.RNNLM(**model_params)
    lm.BuildCoreGraph()
    lm.BuildTrainGraph()

    # Explicitly add global initializer and variable saver to LM graph
    with lm.graph.as_default():
        initializer = tf.global_variables_initializer()
        saver = tf.train.Saver()

    # Clear old log directory
    shutil.rmtree(TF_SAVEDIR, ignore_errors=True)
    if not os.path.isdir(TF_SAVEDIR):
        os.makedirs(TF_SAVEDIR)

    with tf.Session(graph=lm.graph) as session:
        # Seed RNG for repeatability
        tf.set_random_seed(42)

        session.run(initializer)

        #check trainable variables
        #variables_names = [v.name for v in tf.trainable_variables()]
        #values = session.run(variables_names)
        #for k, v in zip(variables_names, values):
        #print("Variable: ", k)
        #print("Shape: ", v.shape)
        #print(v)

        for epoch in range(1, num_epochs + 1):
            t0_epoch = time.time()
            bi = utils.rnnlm_batch_generator(train_ids, batch_size, max_time)
            print("[epoch {:d}] Starting epoch {:d}".format(epoch, epoch))
            # Run a training epoch.
            run_epoch(lm,
                      session,
                      batch_iterator=bi,
                      train=True,
                      verbose=True,
                      tick_s=10,
                      learning_rate=learning_rate)

            print("[epoch {:d}] Completed in {:s}".format(
                epoch, utils.pretty_timedelta(since=t0_epoch)))

            # Save a checkpoint
            saver.save(session, checkpoint_filename, global_step=epoch)

            ##
            # score_dataset will run a forward pass over the entire dataset
            # and report perplexity scores. This can be slow (around 1/2 to
            # 1/4 as long as a full epoch), so you may want to comment it out
            # to speed up training on a slow machine. Be sure to run it at the
            # end to evaluate your score.
            #print("[epoch {:d}]".format(epoch), end=" ")
            #score_dataset(lm, session, train_ids, name="Train set")
            print("[epoch {:d}]".format(epoch), end=" ")
            score_dataset(lm, session, test_ids, name="Test set")
            print("")

        # Save final model
        saver.save(session, trained_filename)
        return trained_filename
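
For context, one way these pieces could be wired together end to end is sketched below; train_ids, test_ids, and all hyperparameter values are placeholders, and generate_text refers to Example 3:

# Illustrative wiring of run_training (Example 7) with generate_text (Example 3).
model_params = dict(V=50000, H=1024, softmax_ns=200, num_layers=2)  # placeholder values
trained_filename = run_training(train_ids,
                                test_ids,
                                model_params,
                                max_time=50,
                                batch_size=128,
                                learning_rate=0.002,
                                num_epochs=5)
generate_text(trained_filename, model_params)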