def export_model(self, out_dir='default'):
    if out_dir == 'default':
        out_dir = self.output_dictionary

    utils.save_pkl(self.embedding, out_dir + config['TRAIN']['embedding_pkl'])
    utils.save_pkl(self.softmax_w, out_dir + config['TRAIN']['softmax_w_pkl'])
def train(self, learning_rate=0.01, print_step=1000, stop_threshold=0):
    losses = []
    aver_losses = []
    wa_scores = []

    if print_step == 0:
        print_step = self.n_batches

    for epoch in range(self.epochs):
        iteration = 0
        start = time.time()

        with open(self.filename, 'r') as f:
            reader = csv.reader(f)
            for row in reader:
                iteration += 1

                # Print step
                if iteration % print_step == 0:
                    end = time.time()
                    print("Epochs: {}".format(epoch),
                          "Iteration: {}".format(iteration),
                          "Avg. Training loss: {:.4f}".format(np.mean(losses)),
                          "{:.4f} sec/ {} sample".format((end - start), self.batch_size * print_step))
                    aver_losses.append(np.mean(losses))
                    losses = []
                    start = time.time()

                # Print word analogy and save a checkpoint
                if iteration % (print_step * 10) == 0:
                    embedding_eval = Embedding(np.array(self.E), self.int_to_vocab, self.vocab_to_int)
                    wa_score = self.word_analogy.evaluate(embedding_eval, high_level_category=False,
                                                          restrict_top_words=False)
                    wa_scores.append(wa_score['all'])
                    self.export_model(self.output_dictionary + 'step-{}/'.format(int(iteration)))

                loss = self._train_one_sample(int(row[0]), int(row[1]), learning_rate)
                losses.append(loss)

        # Word analogy score at the end of each epoch
        embedding_eval = Embedding(np.array(self.E), self.int_to_vocab, self.vocab_to_int)
        wa_score = self.word_analogy.evaluate(embedding_eval, high_level_category=False,
                                              restrict_top_words=False)
        wa_scores.append(wa_score['all'])
        print('Epochs: {}, WA score: {}'.format(epoch, wa_score['all']))

        # Save step
        if epoch % 5 == 0:
            self.export_model(self.output_dictionary + 'step-{}/'.format(int(epoch)))

    # Export losses and word-analogy scores
    utils.save_pkl(aver_losses, self.output_dictionary + config['TRAIN']['loss_file'])
    utils.save_pkl(wa_scores, self.output_dictionary + config['TRAIN']['acc_file'])
def _sample_contexts(self, from_file=True):
    if not from_file:
        samples = utils.sample_context(self.context_distribution, self.n_context_sample)
        return samples

    # Sample contexts until the cache covers the current scope
    if self.scope + 1 > len(self.contexts):
        for i in range(self.scope + 1 - len(self.contexts)):
            samples = utils.sample_context(self.context_distribution, self.n_context_sample)
            self.contexts.append(samples)

        # Save result back to pkl
        print('Uploading sample context file, scope: ', self.scope)
        utils.save_pkl(self.contexts, self.sample_contexts_file_name)

    return self.contexts[self.scope]
def save_dicts(self):
    # Make directories
    dict_path = self.output_path + config['PREPROCESS']['output_dict_path']
    if not os.path.exists(dict_path):
        os.makedirs(dict_path)

    # Save dictionaries
    utils.save_pkl(self.vocab_to_int, dict_path + config['PREPROCESS']['vocab_to_int'])
    utils.save_pkl(self.int_to_vocab, dict_path + config['PREPROCESS']['int_to_vocab'])
    utils.save_pkl(self.cont_to_int, dict_path + config['PREPROCESS']['cont_to_int'])
    utils.save_pkl(self.int_to_cont, dict_path + config['PREPROCESS']['int_to_cont'])
def export_model(self):
    utils.save_pkl(self.embedding, self.output_dictionary + config['TRAIN']['embedding_pkl'])
    utils.save_pkl(self.softmax_w, self.output_dictionary + config['TRAIN']['softmax_w_pkl'])
    utils.save_pkl(self.softmax_b, self.output_dictionary + config['TRAIN']['softmax_b_pkl'])
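# Usage sketch (assumptions: a trained `model` instance exposing output_dictionary, the same
# `config` dict as above, and utils.load_pkl mirroring utils.save_pkl as used elsewhere in
# this repo): reload the exported pickles and check their shapes.
embedding = utils.load_pkl(model.output_dictionary + config['TRAIN']['embedding_pkl'])
softmax_w = utils.load_pkl(model.output_dictionary + config['TRAIN']['softmax_w_pkl'])
softmax_b = utils.load_pkl(model.output_dictionary + config['TRAIN']['softmax_b_pkl'])
print(embedding.shape, softmax_w.shape, softmax_b.shape)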
for i in range(args.continue_from, args.scope):
    w = data[i][0]
    c = data[i][1]
    length = model.snml_length_sampling(w, c, epochs=args.epochs)
    snml_lengths.append(length)

    # Print process
    if (i + 1) % print_step == 0:
        end = time.time()
        print('Run {} step in: {:.4f} sec, snml length: {}'.format(i + 1, (end - start), sum(snml_lengths)))
        start = time.time()

    # Save steps
    if (i + 1) % 1000 == 0:
        step_path = args.model + '{}-step/'.format(i + 1)
        filename = step_path + 'scope-{}-snml_length.pkl'.format(args.scope)
        utils.save_pkl(snml_lengths, filename)

print('{} scope snml length: {}'.format(args.scope, sum(snml_lengths)))

# Save result to file
filename = args.model + 'scope-{}-snml_length.txt'.format(args.scope)
with open(filename, 'w') as output:
    for length in snml_lengths:
        output.write(str(length) + '\n')

# Upload to gcs
utils.upload_to_gcs(filename, force_update=True)
print_step = 50000
start = time.time()
for i in range(args.continue_from, args.scope):
    w = data[i][0]
    c = data[i][1]
    length = model.snml_length(w, c, epochs=args.epochs)
    snml_lengths.append(length)

    # Print process
    if (i + 1) % print_step == 0:
        end = time.time()
        print('Run {} step in: {:.4f} sec, snml length: {}'.format(i + 1, (end - start), sum(snml_lengths)))
        start = time.time()

    # Save steps
    # if (i + 1) % 500000 == 0:
    #     step_path = args.model + '{}-step/'.format(i + 1)
    #     filename = step_path + 'scope-{}-snml_length.pkl'.format(args.scope)
    #     utils.save_pkl(snml_lengths, filename)

print('{} scope snml length: {}'.format(args.scope, sum(snml_lengths)))

# Save result to file
filename = args.model + 'scope-{}-snml_length.pkl'.format(args.scope)
utils.save_pkl(snml_lengths, filename, local=True)

# Upload to gcs
# utils.upload_to_gcs(filename, force_update=True)
epochs = 16
dim = '200'

# Read snml train file
data = np.genfromtxt('../../../data/text8/scope.csv', delimiter=',').astype(int)

loss_list = []
n_sample = 2000
model = Model('../../../output/text8/momentum/snml/1/' + dim + 'dim/',
              '../../../data/text8/contexts/',
              n_context_sample=3000, learning_rate=0.0004)

for i in range(n_sample):
    w, c = int(data[i][0]), int(data[i][1])
    ps_a = -np.log(model.train_one_sample(w, c, epochs=epochs, update_weight=True))
    loss_list.append(ps_a)

    if i % 100 == 0:
        print('{} th loop'.format(i))

utils.save_pkl(loss_list, '../../../output/text8/momentum/test/4.pkl', local=True)
parser = argparse.ArgumentParser()
parser.add_argument('--data_path', default='../../data/processed data/split/', type=str)
args = parser.parse_args()

print('Reading file...')
contexts = []
iteration = 0
for file in os.listdir(args.data_path):
    iteration += 1
    if iteration % 1000 == 0:
        print('Importing ', file)

    data = np.genfromtxt(args.data_path + file, delimiter=',').astype(int)
    contexts.extend(data[:, 1])

context_counts = Counter(contexts)
n_context = len(context_counts)
n_data = len(contexts)

print('Making distribution...')
context_distribution = np.zeros(n_context)
for i in range(n_context):
    context_distribution[i] = context_counts[i] / n_data

print('Saving file...')
utils.save_pkl(context_distribution, 'context_distribution.pkl')

print('Finished!')
print('Saved: {} contexts / {} records'.format(n_context, n_data))
def train(self, n_sampled=200, epochs=1, batch_size=10000, print_step=1000):
    self.embedding_file = config['TRAIN']['embedding'].format(self.n_embedding, n_sampled, epochs, batch_size)

    # Computation graph
    train_graph = tf.Graph()
    with train_graph.as_default():
        # Training data
        dataset = tf.data.experimental.make_csv_dataset(self.data_path + config['TRAIN']['train_data'],
                                                        batch_size=batch_size,
                                                        column_names=['input', 'output'],
                                                        header=False,
                                                        num_epochs=epochs)
        datum = dataset.make_one_shot_iterator().get_next()
        inputs, labels = datum['input'], datum['output']

        # Embedding layer
        embedding = tf.Variable(tf.random_uniform((self.n_vocab, self.n_embedding), -1, 1))
        embed = tf.nn.embedding_lookup(embedding, inputs)

        # Softmax layer
        softmax_w = tf.Variable(tf.truncated_normal((self.n_context, self.n_embedding)))
        softmax_b = tf.Variable(tf.zeros(self.n_context))

        # Calculate the loss using negative sampling
        labels = tf.reshape(labels, [-1, 1])
        loss = tf.nn.sampled_softmax_loss(weights=softmax_w,
                                          biases=softmax_b,
                                          labels=labels,
                                          inputs=embed,
                                          num_sampled=n_sampled,
                                          num_classes=self.n_context)

        cost = tf.reduce_mean(loss)
        optimizer = tf.train.AdamOptimizer().minimize(cost)

    with tf.Session(graph=train_graph) as sess:
        iteration = 1
        loss = 0
        losses = []
        sess.run(tf.global_variables_initializer())

        try:
            start = time.time()
            while True:
                train_loss, _ = sess.run([cost, optimizer])
                loss += train_loss
                losses.append(train_loss)

                if iteration % print_step == 0:
                    end = time.time()
                    print("Iteration: {}".format(iteration),
                          "Avg. Training loss: {:.4f}".format(loss / print_step),
                          "{:.4f} sec/ {} sample".format((end - start), batch_size * print_step))
                    loss = 0
                    start = time.time()

                iteration += 1
        except tf.errors.OutOfRangeError:
            print("End of dataset")

        # Export embedding matrix
        self.embedding = embedding.eval()
        self.softmax_w = softmax_w.eval()
        self.softmax_b = softmax_b.eval()

    # Export losses
    utils.save_pkl(losses, self.output_dictionary + config['TRAIN']['loss_file'])
        loss_list.extend(m.get_loss_batch(w, c))

    return loss_list


if __name__ == "__main__":
    dims = [50, 100, 110, 120, 130, 140, 150, 160, 200, 300]

    # Read snml train file
    data = np.genfromtxt('../../../../data/wiki/scope.csv', delimiter=',').astype(int)
    n_sample = 3000000

    for dim in dims:
        print(dim)

        # Full data
        model = Model('../../../output/wiki/20200126/1/train2/{}dim/step-90/'.format(dim),
                      '../../../data/wiki/contexts/', n_context_sample=3000, learning_rate=0.1)
        loss_list = get_loss_list_batch(model, data[299999:n_sample + 299999])
        save_pkl(loss_list,
                 'C:\\Users/hungp/Downloads/information criteria on sg/wiki/20200126 snml/cv_{}_dim.pkl'.format(dim),
                 local=True)
        # save_pkl(loss_list, 'cv_lines/cv_{}_dim.pkl'.format(dim), local=True)
with open(args.data_path + config['TRAIN']['train_data']) as fp:
    # Skip the first line
    line = fp.readline()
    iteration = 0
    for line in fp:
        iteration += 1
        try:
            context = int(line.split(',')[1])
            contexts.append(context)
        except Exception:
            print('Failed {}th line: {}'.format(iteration, line))
        finally:
            if iteration % 10000000 == 0:
                print('Processed: {} lines'.format(iteration))

context_counts = Counter(contexts)
n_context = len(context_counts)
n_data = len(contexts)

print('Making distribution...')
context_distribution = np.zeros(n_context)
for i in range(n_context):
    context_distribution[i] = context_counts[i] / n_data

print('Saving file...')
utils.save_pkl(context_distribution, args.data_path + 'contexts/context_distribution.pkl')

print('Finished!')
print('Saved: {} contexts / {} records'.format(n_context, n_data))
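# Usage sketch (assumption: utils.sample_context(distribution, n_samples) draws context ids
# from the distribution, matching how it is called in _sample_contexts above): reload the
# saved distribution and draw one batch of sample contexts from it.
context_distribution = utils.load_pkl(args.data_path + 'contexts/context_distribution.pkl')
samples = utils.sample_context(context_distribution, 600)  # 600 is an arbitrary example size
print(len(samples))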
from multiprocessing import Pool
import multiprocessing

import utils.tools as utils


if __name__ == "__main__":
    words = [6581, 93, 4519, 506]
    contexts = [390, 1172, 1545, 22]
    model = ModelMomentum('../../../output/text8/momentum/snml/48epochs/1/100dim/',
                          '../../../data/text8/contexts/', n_context_sample=600)

    for i in range(len(words)):
        word = words[i]
        context = contexts[i]

        # Update all other contexts
        print('Start: ', word)

        # Implement pools
        job_args = [(word, c, 48, 3000) for c in range(model.V_dash)]
        p = Pool(multiprocessing.cpu_count())
        probs = p.map(model._train_job, job_args)
        p.close()
        p.join()

        # Save context's probs
        utils.save_pkl(probs, '../../../output/test/contexts_probs_{}.pkl'.format(word))
def train(self, print_step=1000, stop_threshold=0):
    iteration = 1
    loss = 0
    losses = []
    epoch_sum_loss = 0.
    last_epoch_loss = 999999.
    wa_scores = []

    if print_step == 0:
        print_step = self.n_batches

    try:
        start = time.time()
        while True:
            train_loss, _ = self.sess.run([self.full_cost, self.full_optimizer])
            # train_loss, _ = self.sess.run([self.cost, self.optimizer])
            loss += train_loss
            epoch_sum_loss += train_loss
            losses.append(train_loss)

            if iteration % print_step == 0:
                end = time.time()
                print("Iteration: {}".format(iteration),
                      "Avg. Training loss: {:.4f}".format(loss / print_step),
                      "{:.4f} sec/ {} sample".format((end - start), self.batch_size * print_step))
                loss = 0
                start = time.time()

            if iteration % self.n_batches == 0:
                epochs = iteration / self.n_batches
                epoch_loss = epoch_sum_loss / self.n_batches
                epoch_sum_loss = 0
                epoch_loss_diff = np.abs(epoch_loss - last_epoch_loss)
                print('Epochs {} loss: {}'.format(epochs, epoch_loss))

                # Word analogy score
                embedding = self.sess.run(self.embedding_g)
                embedding_eval = Embedding(embedding, self.int_to_vocab, self.vocab_to_int)
                wa_score = self.word_analogy.evaluate(embedding_eval, high_level_category=False,
                                                      restrict_top_words=False)
                wa_scores.append(wa_score['all'])

                # Stop criteria
                if epoch_loss_diff < stop_threshold:
                    self.epochs = iteration / self.n_batches

                    # Output file
                    self.embedding_file = config['TRAIN']['embedding'].format(
                        self.n_embedding, self.n_sampled, int(self.epochs), self.batch_size)
                    print('Loss diff: {}, stop training.'.format(epoch_loss_diff))
                    print(self.output_dictionary + self.embedding_file)
                    break

                # Save step
                if epochs % 10 == 0:
                    self.embedding = self.sess.run(self.embedding_g)
                    self.softmax_w = self.sess.run(self.softmax_w_g)
                    self.export_model(self.output_dictionary + 'step-{}/'.format(int(epochs)))

                last_epoch_loss = epoch_loss

            iteration += 1
    except tf.errors.OutOfRangeError:
        print("End of dataset")

    # Export embedding matrix
    self.embedding = self.sess.run(self.embedding_g)
    self.softmax_w = self.sess.run(self.softmax_w_g)

    # Export losses and word-analogy scores
    utils.save_pkl(losses, self.output_dictionary + config['TRAIN']['loss_file'])
    utils.save_pkl(wa_scores, self.output_dictionary + config['TRAIN']['acc_file'])
if __name__ == "__main__": # Data file raw_data_path = '../data/raw data/test.txt ' context_to_dict_path = 'data/text8/dict/cont_to_int.dict' output_path = 'data/text8/contexts/distribution_from_raw.pkl' int_to_cont = load_pkl('data/text8/dict/int_to_cont.dict', local=True) # Load data with open(raw_data_path, encoding='utf-8') as f: words = f.read().split() # Load dict context_to_dict = load_pkl(context_to_dict_path, local=True) # Convert vocab to int context = [] for word in words: if word in context_to_dict: context.append(context_to_dict[word]) context_counts = Counter(context) n_context = len(context_to_dict) n_data = sum(list(context_counts.values())) context_distribution = np.zeros(n_context) for c, count in context_counts.items(): context_distribution[c] = count / n_data context_distribution = np.array(context_distribution) save_pkl(context_distribution, output_path)
if __name__ == "__main__": context_path = '../notebooks/output/50-context-500000-data-18-questions/contexts/' n_context_sample = 50 scope = 5000 file_name = os.path.join(context_path, 'sample_contexts_{}.pkl'.format(n_context_sample)) context_distribution = utils.load_pkl(context_path + 'context_distribution.pkl') if os.path.exists(file_name): print('Load file') contexts = utils.load_pkl(file_name) else: contexts = [] print('Current contexts: ', len(contexts)) # Sample contexts if scope + 1 > len(contexts): for i in range(scope - len(contexts)): samples = utils.sample_context_uniform(len(context_distribution), n_context_sample) contexts.append(samples) # Save result back to pkl utils.save_pkl(contexts, file_name) print(len(contexts)) print(len(contexts[0]))