Code example #1
    def train(self, data, labels):
        self.build_model()  # build the full model graph
        n_batches = int(ceil(data.shape[0] / self.batch_size))
        tf.global_variables_initializer().run()  # initialize all variables in the current session
        t_data, t_labels, v_data, v_labels = data_utils.generate_split(
            data, labels, self.val_split)  # split the data into training and validation sets
        for epoch in range(1, self.n_epochs + 1):
            train_cost = 0
            for batch in range(1, n_batches + 1):
                X, y = data_utils.generate_batch(t_data, t_labels,
                                                 self.batch_size)  # draw one batch of training examples
                # feed the batch into the model's placeholders
                f_dict = {
                    self.inp: X,
                    self.labels: y,
                    self.cur_drop_rate: self.dropout_rate
                }

                _, cost = self.session.run([self.train_op, self.loss],
                                           feed_dict=f_dict)  # run one training step and fetch the batch loss
                train_cost += cost  # accumulate the epoch loss
                sys.stdout.write(
                    'Epoch %d Cost  :   %f - Batch %d of %d     \r' %
                    (epoch, cost, batch, n_batches))
                sys.stdout.flush()
            print()

            self.test(v_data, v_labels)
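The train method above relies on data_utils.generate_split to produce the training/validation split. The helper itself is not shown in these excerpts; the following is a hypothetical, self-contained sketch (the function name, the shuffling strategy, and the return order are assumptions inferred from the call site, not taken from the original data_utils module):

import numpy as np

def generate_split_sketch(data, labels, val_split):
    """Shuffle the examples and hold out a val_split fraction for validation."""
    idx = np.random.permutation(data.shape[0])
    n_val = int(data.shape[0] * val_split)
    val_idx, train_idx = idx[:n_val], idx[n_val:]
    # return order mirrors the call site: train data/labels, then validation data/labels
    return data[train_idx], labels[train_idx], data[val_idx], labels[val_idx]

# Toy usage: 10 examples of 2 features each, 20% held out.
X = np.arange(20).reshape(10, 2)
y = np.arange(10)
t_data, t_labels, v_data, v_labels = generate_split_sketch(X, y, 0.2)
print(t_data.shape, v_data.shape)  # (8, 2) (2, 2)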
Code example #2
    def train(self, data, labels):
        self.build_model()
        n_batches = int(ceil(data.shape[0] / self.batch_size))
        tf.global_variables_initializer().run()  # initialize_all_variables() is deprecated; use global_variables_initializer()
        t_data, t_labels, v_data, v_labels = data_utils.generate_split(
            data, labels, self.val_split)
        for epoch in range(1, self.n_epochs + 1):
            train_cost = 0
            for batch in range(1, n_batches + 1):
                X, y = data_utils.generate_batch(t_data, t_labels,
                                                 self.batch_size)
                f_dict = {
                    self.inp: X,
                    self.labels: y,
                    self.cur_drop_rate: self.dropout_rate
                }

                _, cost = self.session.run([self.train_op, self.loss],
                                           feed_dict=f_dict)
                train_cost += cost
                sys.stdout.write(
                    'Epoch %d Cost  :   %f - Batch %d of %d     \r' %
                    (epoch, cost, batch, n_batches))
                sys.stdout.flush()

            print()

            self.test(v_data, v_labels)
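Both train methods above also call data_utils.generate_batch(t_data, t_labels, self.batch_size) once per step. Again the helper is not shown here; below is a hypothetical sketch of a batching helper with that signature (random sampling with replacement is an assumption; the real implementation may iterate sequentially instead):

import numpy as np

def generate_batch_sketch(data, labels, batch_size):
    """Draw batch_size examples uniformly at random (with replacement)."""
    idx = np.random.randint(0, data.shape[0], size=batch_size)
    return data[idx], labels[idx]

# Toy usage with the arrays from the previous sketch.
X = np.arange(20).reshape(10, 2)
y = np.arange(10)
batch_X, batch_y = generate_batch_sketch(X, y, batch_size=4)
print(batch_X.shape, batch_y.shape)  # (4, 2) (4,)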
Code example #3
    def train(self, data, labels):
        self.build_model()
        if configuration.config['dataset'] == 'CR':
            # CR-specific handling: drop the first 6792 rows, then set aside a
            # slice of the remainder that is folded back into the training split below
            data = data[6792:]
            labels = labels[6792:]
            train_data = data[0:6791]
            train_labels = labels[0:6791]
        n_batches = int(ceil(data.shape[0] / self.batch_size))
        tf.global_variables_initializer().run()
        t_data, t_labels, v_data, v_labels = data_utils.generate_split(data, labels, self.val_split)
        if configuration.config['dataset'] == 'CR':
            # fold the CR-specific slice back into the training portion only
            d1, l1, d2, l2 = data_utils.generate_split(train_data, train_labels, self.val_split)
            train_data = np.concatenate([d1, d2], 0)
            train_labels = np.concatenate([l1, l2], 0)
            t_data = np.concatenate([t_data, train_data], 0)
            t_labels = np.concatenate([t_labels, train_labels], 0)
        for epoch in range(1, self.n_epochs + 1):
            train_cost = 0
            for batch in range(1, n_batches + 1):
                X, y = data_utils.generate_batch(t_data, t_labels, self.batch_size)
                f_dict = {
                    self.inp: X,
                    self.labels: y,
                    self.cur_drop_rate: self.dropout_rate
                }

                _, cost = self.session.run([self.train_op, self.loss], feed_dict=f_dict)
                train_cost += cost
                sys.stdout.write('Epoch %d Cost  :   %f - Batch %d of %d     \r' % (epoch, cost, batch, n_batches))
                sys.stdout.flush()

            print(self.test(v_data, v_labels))
Code example #4
    def test(self, data, labels):
        n_batches = int(ceil(data.shape[0] / self.batch_size))
        test_cost = 0
        preds = []  # predicted class indices
        ys = []  # ground-truth labels
        for batch in range(1, n_batches + 1):
            X, Y = data_utils.generate_batch(data, labels, self.batch_size)
            f_dict = {self.inp: X, self.labels: Y, self.cur_drop_rate: 1.0}
            cost, y = self.session.run([self.loss, self.y], feed_dict=f_dict)
            test_cost += cost
            sys.stdout.write('Cost  :   %f - Batch %d of %d     \r' %
                             (cost, batch, n_batches))
            sys.stdout.flush()

            preds.extend(np.argmax(y, 1))
            ys.extend(Y)

        print()
        print("Accuracy", np.mean(
            np.asarray(np.equal(ys, preds), dtype='float32')) * 100)
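The closing lines compute accuracy by comparing the collected predictions against the true labels element-wise. A standalone illustration of the same statistic, extended with a simple confusion matrix (this helper is not part of the original class):

import numpy as np

def accuracy_and_confusion(ys, preds, n_classes):
    """Accuracy in percent plus a confusion matrix from two label sequences."""
    ys = np.asarray(ys)
    preds = np.asarray(preds)
    accuracy = np.mean(ys == preds) * 100
    confusion = np.zeros((n_classes, n_classes), dtype=np.int64)
    for true_label, pred_label in zip(ys, preds):
        confusion[true_label, pred_label] += 1
    return accuracy, confusion

# Example: two classes, four predictions, one mistake.
acc, conf = accuracy_and_confusion([0, 1, 1, 0], [0, 1, 0, 0], n_classes=2)
print(acc)   # 75.0
print(conf)  # [[2 0]
             #  [1 1]]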
Code example #5
# Step 1: Download the data (maybe).
filename = maybe_download('text8.zip', 31344016)

# Step 2: Read the data, build the dictionary, and replace rare words with the UNK token.
vocabulary = read_data(filename)
print('Data size', len(vocabulary))
vocabulary_size = 50000
data, count, dictionary, reverse_dictionary = build_dataset(vocabulary, vocabulary_size)
del vocabulary  # Hint to reduce memory.
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])
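# build_dataset (along with maybe_download and read_data) is defined elsewhere in
# the tutorial and is not shown in this excerpt. A hypothetical sketch of such a
# helper, renamed build_dataset_sketch so it does not shadow the real one: keep the
# n_words - 1 most frequent words, map every other word to the single 'UNK' id 0,
# and return the encoded corpus plus the two lookup tables.
import collections

def build_dataset_sketch(words, n_words):
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = {word: idx for idx, (word, _) in enumerate(count)}
    encoded, unk_count = [], 0
    for word in words:
        index = dictionary.get(word, 0)  # 0 is reserved for 'UNK'
        unk_count += int(index == 0)
        encoded.append(index)
    count[0][1] = unk_count  # record how many tokens were mapped to 'UNK'
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return encoded, count, dictionary, reverse_dictionary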


# Step 3: Generate a sample training batch and inspect it.
data_index = 0
sample_batch, sample_labels, data_index = generate_batch(data, data_index, batch_sz=8, n_skips=2, skip_wd=1)
for i in range(8):
    print(sample_batch[i], reverse_dictionary[sample_batch[i]], '->', sample_labels[i, 0],
          reverse_dictionary[sample_labels[i, 0]])
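# The generate_batch used above returns (batch, labels, new_data_index). A hypothetical
# sketch of the usual skip-gram batching logic with that interface, named
# skipgram_batch_sketch so it does not shadow the real helper: slide a window of
# 2 * skip_wd + 1 words over the corpus and, for each centre word, emit n_skips
# randomly chosen context words as labels.
import collections
import random

import numpy as np

def skipgram_batch_sketch(data, data_index, batch_sz, n_skips, skip_wd):
    assert batch_sz % n_skips == 0
    assert n_skips <= 2 * skip_wd
    batch = np.ndarray(shape=(batch_sz,), dtype=np.int32)
    labels = np.ndarray(shape=(batch_sz, 1), dtype=np.int32)
    span = 2 * skip_wd + 1  # centre word plus skip_wd words on each side
    window = collections.deque(maxlen=span)
    for _ in range(span):
        window.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    for i in range(batch_sz // n_skips):
        context_positions = [w for w in range(span) if w != skip_wd]
        for j, pos in enumerate(random.sample(context_positions, n_skips)):
            batch[i * n_skips + j] = window[skip_wd]  # centre word as the input
            labels[i * n_skips + j, 0] = window[pos]  # one of its context words as the target
        window.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels, data_index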


# Step 4: Build a skip-gram model.
batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1  # How many words to consider left and right.
num_skips = 2   # How many times to reuse an input to generate a label.
num_sampled = 64  # Number of negative examples to sample.
# We pick a random validation set to sample nearest neighbors. Here we limit the validation samples to the words that
# have a low numeric ID, which by construction are also the most frequent. These 3 variables are used only for
# displaying model accuracy; they don't affect the calculation.
valid_size = 16  # Random set of words to evaluate similarity on.
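# The comment above describes how the validation words are chosen: a small random
# sample drawn from the lowest (most frequent) word ids. An illustrative
# continuation follows; the variable names below are assumptions rather than
# lines confirmed by the rest of this excerpt.
import numpy as np
valid_window = 100  # restrict sampling to the 100 most frequent words
valid_examples = np.random.choice(valid_window, valid_size, replace=False)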