def train(self, data, labels):
    self.build_model()  # defines the whole model graph
    n_batches = int(ceil(data.shape[0] / self.batch_size))
    tf.global_variables_initializer().run()
    # split the data into training and validation sets
    t_data, t_labels, v_data, v_labels = data_utils.generate_split(
        data, labels, self.val_split)
    for epoch in range(1, self.n_epochs + 1):
        train_cost = 0
        for batch in range(1, n_batches + 1):
            # draw one batch at a time
            X, y = data_utils.generate_batch(t_data, t_labels, self.batch_size)
            # feed the data into the placeholders
            f_dict = {
                self.inp: X,
                self.labels: y,
                self.cur_drop_rate: self.dropout_rate
            }
            # loss for this batch
            _, cost = self.session.run([self.train_op, self.loss],
                                       feed_dict=f_dict)
            train_cost += cost  # accumulate the total loss
            sys.stdout.write('Epoch %d Cost : %f - Batch %d of %d \r' %
                             (epoch, cost, batch, n_batches))
            sys.stdout.flush()
        print(self.test(v_data, v_labels))
def train(self, data, labels):
    self.build_model()
    n_batches = int(ceil(data.shape[0] / self.batch_size))
    tf.global_variables_initializer().run()  # tf.initialize_all_variables() is deprecated
    t_data, t_labels, v_data, v_labels = data_utils.generate_split(
        data, labels, self.val_split)
    for epoch in range(1, self.n_epochs + 1):
        train_cost = 0
        for batch in range(1, n_batches + 1):
            X, y = data_utils.generate_batch(t_data, t_labels, self.batch_size)
            f_dict = {
                self.inp: X,
                self.labels: y,
                self.cur_drop_rate: self.dropout_rate
            }
            _, cost = self.session.run([self.train_op, self.loss],
                                       feed_dict=f_dict)
            train_cost += cost
            sys.stdout.write('Epoch %d Cost : %f - Batch %d of %d \r' %
                             (epoch, cost, batch, n_batches))
            sys.stdout.flush()
        print(self.test(v_data, v_labels))
def train(self, data, labels):
    self.build_model()
    if configuration.config['dataset'] == 'CR':
        data = data[6792::]
        labels = labels[6792::]
        train_data = data[0:6791]
        train_labels = labels[0:6791]
    n_batches = int(ceil(data.shape[0] / self.batch_size))
    tf.global_variables_initializer().run()
    t_data, t_labels, v_data, v_labels = data_utils.generate_split(
        data, labels, self.val_split)
    if configuration.config['dataset'] == 'CR':
        d1, l1, d2, l2 = data_utils.generate_split(train_data, train_labels,
                                                   self.val_split)
        train_data = np.concatenate([d1, d2], 0)
        train_labels = np.concatenate([l1, l2], 0)
        t_data = np.concatenate([t_data, train_data], 0)
        t_labels = np.concatenate([t_labels, train_labels], 0)
    for epoch in range(1, self.n_epochs + 1):
        train_cost = 0
        for batch in range(1, n_batches + 1):
            X, y = data_utils.generate_batch(t_data, t_labels, self.batch_size)
            f_dict = {
                self.inp: X,
                self.labels: y,
                self.cur_drop_rate: self.dropout_rate
            }
            _, cost = self.session.run([self.train_op, self.loss],
                                       feed_dict=f_dict)
            train_cost += cost
            sys.stdout.write('Epoch %d Cost : %f - Batch %d of %d \r' %
                             (epoch, cost, batch, n_batches))
            sys.stdout.flush()
        print(self.test(v_data, v_labels))
def test(self, data, labels):
    n_batches = int(ceil(data.shape[0] / self.batch_size))
    test_cost = 0
    preds = []
    ys = []
    for batch in range(1, n_batches + 1):
        X, Y = data_utils.generate_batch(data, labels, self.batch_size)
        f_dict = {self.inp: X, self.labels: Y, self.cur_drop_rate: 1.0}
        cost, y = self.session.run([self.loss, self.y], feed_dict=f_dict)
        test_cost += cost
        sys.stdout.write('Cost : %f - Batch %d of %d \r' %
                         (cost, batch, n_batches))
        sys.stdout.flush()
        preds.extend(np.argmax(y, 1))
        ys.extend(Y)
    print()
    print("Accuracy",
          np.mean(np.asarray(np.equal(ys, preds), dtype='float32')) * 100)
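The train() and test() methods above rely on two helpers from the project's data_utils module, generate_split and generate_batch, which are not reproduced here. The following is a minimal sketch inferred only from the call sites above (generate_split returns t_data, t_labels, v_data, v_labels given a val_split fraction; generate_batch returns one random batch of batch_size examples). The bodies are illustrative assumptions, not the project's actual implementation.

import numpy as np

def generate_split(data, labels, val_split):
    # Assumed behavior: shuffle, then hold out a val_split fraction for validation.
    idx = np.random.permutation(data.shape[0])
    data, labels = data[idx], labels[idx]
    n_val = int(data.shape[0] * val_split)
    # training portion first, validation portion last
    return data[n_val:], labels[n_val:], data[:n_val], labels[:n_val]

def generate_batch(data, labels, batch_size):
    # Assumed behavior: sample a random batch (with replacement) each call.
    idx = np.random.randint(0, data.shape[0], size=batch_size)
    return data[idx], labels[idx]

With helpers of this shape, the training loop samples a fresh random batch at every step rather than iterating the data in order, so n_batches only determines how many updates count as one epoch.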
# Step 1: Download the data (maybe).
filename = maybe_download('text8.zip', 31344016)

# Step 2: Read the data, build the dictionary, and replace rare words with the UNK token.
vocabulary = read_data(filename)
print('Data size', len(vocabulary))
vocabulary_size = 50000
data, count, dictionary, reverse_dictionary = build_dataset(vocabulary,
                                                            vocabulary_size)
del vocabulary  # Hint to reduce memory.
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])

# Step 3: Draw a sample batch from the batch generator.
data_index = 0
sample_batch, sample_labels, data_index = generate_batch(data, data_index,
                                                         batch_sz=8, n_skips=2,
                                                         skip_wd=1)
for i in range(8):
    print(sample_batch[i], reverse_dictionary[sample_batch[i]], '->',
          sample_labels[i, 0], reverse_dictionary[sample_labels[i, 0]])

# Step 4: Build a skip-gram model.
batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1       # How many words to consider left and right.
num_skips = 2         # How many times to reuse an input to generate a label.
num_sampled = 64      # Number of negative examples to sample.

# We pick a random validation set to sample nearest neighbors. Here we limit
# the validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent. These three variables are used only
# for displaying model accuracy; they don't affect the calculation.
valid_size = 16  # Random set of words to evaluate similarity on.
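Step 3 calls a generate_batch helper that is defined earlier in the script but not shown above. For reference, here is a sketch of a skip-gram batch generator with the same signature, (data, data_index, batch_sz, n_skips, skip_wd) returning (batch, labels, data_index). It follows the classic TensorFlow word2vec tutorial generator, adapted to pass data_index explicitly instead of using a global; the exact body used by the script is an assumption.

import collections
import numpy as np

def generate_batch(data, data_index, batch_sz, n_skips, skip_wd):
    # Each center word is reused n_skips times, each time paired with a
    # different context word drawn from a window of skip_wd words per side.
    assert batch_sz % n_skips == 0
    assert n_skips <= 2 * skip_wd
    batch = np.ndarray(shape=(batch_sz,), dtype=np.int32)
    labels = np.ndarray(shape=(batch_sz, 1), dtype=np.int32)
    span = 2 * skip_wd + 1  # [ skip_wd context ... center ... skip_wd context ]
    buffer = collections.deque(maxlen=span)
    if data_index + span > len(data):
        data_index = 0
    buffer.extend(data[data_index:data_index + span])
    data_index += span
    for i in range(batch_sz // n_skips):
        context_positions = [w for w in range(span) if w != skip_wd]
        chosen = np.random.choice(context_positions, n_skips, replace=False)
        for j, context_pos in enumerate(chosen):
            batch[i * n_skips + j] = buffer[skip_wd]          # center word
            labels[i * n_skips + j, 0] = buffer[context_pos]  # one context word
        if data_index == len(data):
            buffer.extend(data[0:span])  # wrap around to the start of the corpus
            data_index = span
        else:
            buffer.append(data[data_index])  # slide the window by one word
            data_index += 1
    # Back up a little so words at the end of a pass are not skipped.
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels, data_index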