def main():
    """Render figure 1: classwise accuracy and ECE summaries for CIFAR-100.

    Updates a ``BetaBernoulli`` accuracy model and a ``ClasswiseEce`` model
    with the prepared predictions, draws ``num_samples`` posterior samples
    from each, summarises them per class by their 2.5% / 50% / 97.5%
    quantiles, and saves the resulting plot as ``FIGURE_DIR + 'figure1.pdf'``.
    """
    dataset = 'cifar100'
    num_samples = 1000
    quantile_levels = [0.025, 0.5, 0.975]

    def _quantile_summary(samples):
        # One row per category, one column per quantile level.
        return np.quantile(samples, quantile_levels, axis=1).T

    datafile = DATAFILE_LIST[dataset]
    num_classes = NUM_CLASSES_DICT[dataset]
    (categories, observations, confidences,
     _idx2category, _category2idx, _labels) = prepare_data(datafile, False)

    # accuracy model
    accuracy_model = BetaBernoulli(k=num_classes, prior=None)
    accuracy_model.update_batch(categories, observations)

    # ece model for each class
    ece_model = ClasswiseEce(num_classes, num_bins=10, pseudocount=2)
    ece_model.update_batch(categories, observations, confidences)

    # draw samples from the posteriors; both are (num_categories, num_samples)
    accuracy_samples = accuracy_model.sample(num_samples)
    ece_samples = ece_model.sample(num_samples)

    accuracy = _quantile_summary(accuracy_samples)
    ece = _quantile_summary(ece_samples)

    fig, axes = plot_figure_1(accuracy, ece, labels=CIFAR100_CLASSES,
                              limit=10, reverse=False)
    fig.tight_layout()
    fig.subplots_adjust(bottom=-0.2, wspace=0.35)
    fig.set_size_inches(COLUMN_WIDTH * 1.3, 2.0)
    fig.savefig(FIGURE_DIR + 'figure1.pdf', bbox_inches="tight",
                pad_inches=0.05)
def test_shaps_to_probs_with_data() -> None:
    """
    test whether the shaps_to_probs tensorflow function actually calculates
    the correct class probabilities given actual shap values

    Trains an xgboost classifier on the Costa Rica dataset, computes SHAP
    values for it and checks that ``shaps_to_probs`` reproduces the
    probabilities returned by ``predict_proba``.

    Raises:
        AssertionError: if the reconstructed probabilities are not close to
            the xgboost probabilities (previously the result was only printed,
            so the test could never fail).
    """
    from data_utils import load_costa_rica_dataset, prepare_data
    from xgboost_utils import fit_xgboost_classifier, calculate_shap_values

    # load data, train xgboost model, calculate shap values
    X, y = load_costa_rica_dataset()
    (_n_samples, _n_features, _n_classes, X_train, _X_valid, y_train,
     _y_valid, _y_train_onehot, _y_valid_onehot, _y_onehot,
     _class_weights) = prepare_data(X, y)
    xgb_model = fit_xgboost_classifier(X_train, y_train)
    shap_values, expected_logits = calculate_shap_values(xgb_model, X)
    xgb_probs = xgb_model.predict_proba(X)

    # test shaps_to_probs
    t_shaps = tf.placeholder(tf.float32)
    t_expected_logits = tf.placeholder(tf.float32)
    t_res = shaps_to_probs(t_shaps, t_expected_logits)
    # The context manager releases the session even if evaluation fails.
    with tf.Session() as sess:
        shap_probs = sess.run(t_res, feed_dict={
            t_shaps: shap_values,
            t_expected_logits: expected_logits
        })

    probs_match = np.allclose(shap_probs, xgb_probs)
    print(probs_match)
    print()
    assert probs_match, "shaps_to_probs does not reproduce xgboost probabilities"
def main():
    """Prepare the corpus, train the 'Bayes' model and pickle the result.

    The trained model is written to ``<model_path>model.pickle`` wrapped in a
    dict under the key ``'model'``.
    """
    # Segmentation mode; the alternatives noted by the original author are
    # 'character' and '2-gram'.
    cut_mode = 'jieba'

    # Prepare dataset.
    raw_file = './data/y_n_all'  # raw data file
    prepared_dir = './data/'  # dir of prepared data (cut sentences, numeric labels)
    vocab_dir = './data/Bayes_vocabulary'
    prepared_data, prepared_label = data_utils.prepare_data(
        raw_file, prepared_dir, cut_method=cut_mode, vocab_dir=vocab_dir)
    print('Get prepared dataset.')

    # Get training and test dataset.
    labelled_samples = list(zip(prepared_data, prepared_label))
    train, _test = data_utils.get_data(
        labelled_samples, './data/train/', './data/test/',
        ratio=0, cut_method=cut_mode)
    print('Get training and test dataset.')

    # Train.
    train_model = init_model('Bayes', './model/')
    print('Initialize the model.')
    training(train_model, train)
    print('Training finished.')

    # Store the trained model.
    payload = {'model': train_model}
    model_file = train_model.model_path + 'model.pickle'
    with open(model_file, 'wb') as handle:
        pickle.dump(payload, handle)
def evaluate_model(model_path, dataset_path='emnist/emnist-balanced-test.csv'):
    """Evaluate a saved model on a dataset and print its confusion matrix.

    Args:
        model_path: location passed to ``load_model`` and to
            ``data_utils.print_confusion_matrix``.
        dataset_path: dataset file passed to ``data_utils.load_dataset``.
    """
    raw_x, raw_y, class_map = data_utils.load_dataset(dataset_path)
    prepared = data_utils.prepare_data(raw_x, raw_y, class_map)
    test_x, test_y, _ = prepared

    model = load_model(model_path)
    metrics = model.evaluate(test_x, test_y)
    print(metrics)

    data_utils.print_confusion_matrix(test_x, test_y, model_path, class_map)
def get_prepared_data():
    """
    Figures out from the passed in flags what the training data should be,
    and prepares it via ``data_utils.prepare_data``.

    The dev data defaults to the training data unless both
    ``FLAGS.from_dev_data`` and ``FLAGS.to_dev_data`` are set.

    Returns:
        Tuple ``(from_train, to_train, from_dev, to_dev)`` taken from the
        first four values returned by ``data_utils.prepare_data``.

    Raises:
        SystemExit: with status 1 when ``FLAGS.from_train_data`` or
            ``FLAGS.to_train_data`` is missing.
    """
    if not (FLAGS.from_train_data and FLAGS.to_train_data):
        # The original dropped into pdb here; a missing flag is a usage error,
        # not something to debug interactively.
        print("Must specify from_train_data and to_train_data directories")
        raise SystemExit(1)

    from_train_data = FLAGS.from_train_data
    to_train_data = FLAGS.to_train_data

    if FLAGS.from_dev_data and FLAGS.to_dev_data:
        from_dev_data = FLAGS.from_dev_data
        to_dev_data = FLAGS.to_dev_data
    else:
        # No dedicated dev set: fall back to the training files.
        from_dev_data = from_train_data
        to_dev_data = to_train_data

    from_train, to_train, from_dev, to_dev, _, _ = data_utils.prepare_data(
        FLAGS.data_dir, from_train_data, to_train_data, from_dev_data,
        to_dev_data, FLAGS.from_vocab_size, FLAGS.to_vocab_size)
    return (from_train, to_train, from_dev, to_dev)
def example_evaluate_random_classifier() -> None:
    """
    compare a random classifier with an xgboost classifier

    Trains xgboost on the Costa Rica dataset, evaluates it on the validation
    split, then evaluates a random classifier driven by the expected logits
    returned by ``calculate_shap_values``.
    """
    from data_utils import load_costa_rica_dataset, prepare_data
    from xgboost_utils import fit_xgboost_classifier, calculate_shap_values, evaluate_xgboost_classifier

    # load data, train xgboost model, calculate shap values
    features, targets = load_costa_rica_dataset()
    (_n_samples, _n_features, _n_classes, X_train, X_valid, y_train, y_valid,
     _y_train_onehot, _y_valid_onehot, _y_onehot,
     _class_weights) = prepare_data(features, targets)
    booster = fit_xgboost_classifier(X_train, y_train)
    _shap_values, expected_logits = calculate_shap_values(booster, features)

    # evaluate xgboost classifier
    print("\n", "evaluate xgboost classifier:")
    evaluate_xgboost_classifier(booster, X_valid, y_valid)

    # evaluate random classifier
    print(
        "\n",
        "evaluate random classifier (based of xgboost expected probabilities):"
    )
    evaluate_random_classifier(expected_logits=expected_logits,
                               y_true=y_valid,
                               n_clones=1000)
def main(_):
    """Prepare the parallel data, build hparams and start NMT in interactive mode.

    The single positional argument is ignored. Prepared train/valid data are
    attached to the hparams object before ``train_nmt`` is called with
    ``train=False, interact=True``.
    """
    prepared = data_utils.prepare_data(
        FLAGS.data_dir,
        FLAGS.from_train_data,
        FLAGS.to_train_data,
        FLAGS.from_valid_data,
        FLAGS.to_valid_data,
        FLAGS.from_vocab_size,
        FLAGS.to_vocab_size,
        same_vocab=False)
    from_train, to_train, from_valid, to_valid, _, _ = prepared

    hparams = create_hparams(FLAGS)
    extra_hparams = (
        ("from_train", from_train),
        ("to_train", to_train),
        ("from_valid", from_valid),
        ("to_valid", to_valid),
    )
    for hparam_name, hparam_value in extra_hparams:
        hparams.add_hparam(name=hparam_name, value=hparam_value)

    # Computed but not used further in this function.
    from_vocab_path = os.path.join(hparams.data_dir,
                                   "vocab%d.from" % hparams.from_vocab_size)
    to_vocab_path = os.path.join(hparams.data_dir,
                                 "vocab%d.to" % hparams.to_vocab_size)

    # Alternative entry point kept from the original:
    # train_ae(hparams, train=True, interact=True)
    train_nmt(hparams, train=False, interact=True)
def load_data(self, debug=False):
    """Loads train/valid/test data and sentence encoding.

    Populates the vocabulary mappings, the padded train/valid splits with
    their maximum lengths, the vocabulary sizes and a uniformly initialised
    word-embedding matrix on ``self``. ``debug`` is accepted but not used.
    """
    (_en_train, _fr_train, _en_dev, _fr_dev,
     en_vocab_path, fr_vocab_path) = data_utils.prepare_data('tmp', 40000, 40000)

    source_maps = data_utils.initialize_vocabulary(en_vocab_path)
    self.source_vocab_to_id, self.source_id_to_vocab = source_maps
    target_maps = data_utils.initialize_vocabulary(fr_vocab_path)
    self.target_vocab_to_id, self.target_id_to_vocab = target_maps

    # Training mode reads the train ids, otherwise the test ids.
    if self.config.train_mode:
        source_path = './tmp/train.ids40000.questions'
        target_path = './tmp/train.ids40000.answers'
    else:
        source_path = './tmp/test.ids40000.questions'
        target_path = './tmp/test.ids40000.answers'
    sources, targets = data_utils.read_data(source_path, target_path)

    padded = data_utils.pad_length_bucket(sources, targets, self.config)
    (self.train, self.valid, self.max_t_len,
     self.max_input_len, self.max_sen_len) = padded

    self.source_vocab_size = data_utils.get_vocab_size(
        './tmp/vocab40000.questions')
    self.target_vocab_size = data_utils.get_vocab_size(
        './tmp/vocab40000.answers')

    init_range = self.config.embedding_init
    embedding_shape = (self.source_vocab_size, self.config.embed_size)
    self.word_embedding = np.random.uniform(-init_range, init_range,
                                            embedding_shape)
def train_epoch(self, epoch, dataloader, optimizer, image_dir, args, device='cpu'):
    """Run one training epoch and print the per-batch average losses.

    For every batch the four losses returned by ``self.forward`` are summed,
    back-propagated, the text-encoder gradients are clipped to
    ``args.damsm_rnn_grad_clip`` and the optimizer is stepped.

    Args:
        epoch: epoch number, used only in the printed summary.
        dataloader: iterable of batches accepted by ``prepare_data``.
        optimizer: optimizer stepped once per batch.
        image_dir: not used by this method.
        args: forwarded to ``self.forward``; also supplies the clip norm.
        device: device passed to ``prepare_data``.
    """
    self.train()
    num_batches = len(dataloader)
    s_sum0 = s_sum1 = 0
    w_sum0 = w_sum1 = 0

    for batch in tqdm(dataloader, total=num_batches):
        imgs, caps, caps_len, masks, class_ids = prepare_data(
            batch, device, is_damsm=True)

        # The BERT variant additionally needs the attention mask.
        forward_kwargs = {'class_ids': class_ids}
        if self.is_bert:
            forward_kwargs['bert_mask'] = masks
        w_loss0, w_loss1, s_loss0, s_loss1 = self.forward(
            imgs, caps, caps_len, args, **forward_kwargs)

        loss = s_loss0 + s_loss1 + w_loss0 + w_loss1

        w_sum0 += w_loss0.item()
        w_sum1 += w_loss1.item()
        s_sum0 += s_loss0.item()
        s_sum1 += s_loss1.item()

        self.text_encoder.zero_grad()
        self.image_encoder.zero_grad()
        loss.backward()

        # `clip_grad_norm` helps prevent
        # the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(self.text_encoder.parameters(),
                                       args.damsm_rnn_grad_clip)
        optimizer.step()

    s_sum0 /= num_batches
    s_sum1 /= num_batches
    w_sum0 /= num_batches
    w_sum1 /= num_batches
    total = s_sum0 + s_sum1 + w_sum0 + w_sum1

    summary = ('[TRAIN] Epoch {:3d} | '
               's_loss {:5.2f} {:5.2f} | '
               'w_loss {:5.2f} {:5.2f} | Sum {:5.2f}')
    print(summary.format(epoch, s_sum0, s_sum1, w_sum0, w_sum1, total))
def train():
    """Train the TextCNN model and checkpoint it under ``args.model_path``.

    The last ``args.dev_sample`` fraction of the training data is held out as
    the dev set. A fraction that rounds to zero samples now yields an empty
    dev set; the previous ``[:-n]`` / ``[-n:]`` slicing swapped the two splits
    in that case (empty train set, everything in dev).
    """
    print("Loading data...")
    # Read the data.
    (vocab_word, vocab_word_list, train_words, train_labels,
     test_words, test_labels) = data_utils.prepare_data(args.data_path)
    max_text_len = max(len(words) for words in train_words)
    vocab_len = len(vocab_word_list)
    dev_sample_size = int(args.dev_sample * float(len(train_words)))

    def _split_train_dev(items):
        # An explicit cut index avoids the `-0` slicing pitfall; clamping
        # keeps the old result when the dev size exceeds the data size.
        cut = max(len(items) - dev_sample_size, 0)
        return items[:cut], items[cut:]

    # Split into training and dev sets.
    x_train, x_dev = _split_train_dev(train_words)
    y_train, y_dev = _split_train_dev(train_labels)
    print("Vocabulary Size: {:d}".format(len(vocab_word_list)))
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))

    # Build the model.
    model = models.TextCNN(max_text_len, 2, vocab_len, args.embedding_size,
                           list(map(int, args.filter_sizes.split(","))),
                           args.num_filters, args.max_gradient_norm,
                           args.learning_rate, args.l2_reg_lambda)

    if not os.path.exists(args.model_path):
        # makedirs also creates missing parent directories.
        os.makedirs(args.model_path)

    # Train the model on the data.
    model.fit(x_train, y_train, x_dev, y_dev, args.epoch_size,
              args.batch_size, args.checkpoint_step, args.model_path)
def get_data_params(train_df, val_df, raw_df, is_have_death=False,
                    state="Texas", country='US'):
    """Run ``prepare_data`` on the train, validation and raw frames.

    Each frame contributes its ``'Confirmed'``, ``'Deaths'`` and
    ``'Recovered'`` columns together with ``population[country][state]`` and
    the ``is_have_death`` flag.

    Returns:
        Tuple ``(train_params, val_params, raw_params)``.
    """
    def _params_for(frame):
        return prepare_data(frame['Confirmed'].values,
                            frame['Deaths'].values,
                            frame['Recovered'].values,
                            population[country][state],
                            is_have_death)

    train_params = _params_for(train_df)
    val_params = _params_for(val_df)
    raw_params = _params_for(raw_df)
    return train_params, val_params, raw_params
def _get_score(classifier):
    """Return the average 4-fold cross-validation score, in percent.

    The score is averaged over ``_score_iter`` rounds, each using a freshly
    prepared dataset.
    """
    def _one_round():
        samples, labels = prepare_data()
        return cross_val_score(classifier, samples, labels, cv=4).mean() * 100

    total = sum(_one_round() for _ in tqdm(range(_score_iter)))
    return total / _score_iter
def test(test_dataset, source_vocab, target_vocab, source_vocab_list,
         target_vocab_list):
    """Build the model and evaluate it on ``test_dataset``.

    The model is created with the vocabulary sizes, both vocabulary lists, a
    fifth positional argument of ``0.0`` and the maximum source/target lengths
    from ``args``; the dataset is converted with ``data_utils.prepare_data``
    before evaluation.
    """
    source_size = len(source_vocab)
    target_size = len(target_vocab)
    model = create_model(source_size, target_size, source_vocab_list,
                         target_vocab_list, 0.0, args.max_source_len,
                         args.max_target_len)

    prepared_test_set = data_utils.prepare_data(test_dataset, source_vocab,
                                                target_vocab)
    evaluate(model, prepared_test_set, source_vocab, target_vocab,
             source_vocab_list, target_vocab_list)
def _get_score_with_optimal_features(classifier):
    """Return the average CV score (percent) after stochastic feature search.

    In each of ``_score_iter`` rounds a fresh dataset is prepared, a
    stochastic hill-climbing search (at most ``_search_iter`` iterations)
    selects the final samples, and the 4-fold cross-validation score of the
    result is accumulated. A per-feature histogram, updated from the dropped
    features of every round, is printed before returning.
    """
    first_samples, _ = prepare_data()
    num_of_features = len(first_samples[0])
    # Every feature starts at `_score_iter`; `_update_histogram` adjusts it
    # per round (presumably counting down for dropped features - verify).
    histogram = [_score_iter] * num_of_features

    score_sum = 0
    for _ in tqdm(range(_score_iter)):
        samples, labels = prepare_data()
        problem = FeatureSearchProblem(classifier=classifier,
                                       initial_state=(samples, labels, []))
        final_node = hill_climbing_stochastic(problem,
                                              iterations_limit=_search_iter)
        samples, labels, path = final_node.state

        dropped_features = _restore_path(path, num_of_features)
        _update_histogram(histogram, dropped_features)

        fold_scores = cross_val_score(classifier, samples, labels, cv=4)
        score_sum += fold_scores.mean() * 100

    print(histogram)
    return score_sum / _score_iter
def train():
    """Run SpeakEasy/server/python_model/scripts/run.sh to train model.

    Prepares the data, builds the model inside a TensorFlow session and then
    trains in an endless loop. Every ``FLAGS.steps_per_checkpoint`` steps it
    prints statistics, decays the learning rate when the loss exceeds the
    maximum of the previous three checkpoints, and saves a checkpoint.
    The loop has no exit condition.
    """
    # prepare movie subtitle data.
    print("Preparing data in %s" % FLAGS.data_dir)
    sys.stdout.flush()
    data_train, data_dev, _ = data_utils.prepare_data(FLAGS.data_dir,
                                                      FLAGS.vocab_size)

    with tf.Session() as sess:
        # create model.
        print("Creating %d layers of %d units." %
              (FLAGS.num_layers, FLAGS.size))
        sys.stdout.flush()
        model = create_model(sess, False)

        # set up event logging; the merged op is created but not used here.
        tf.merge_all_summaries()
        # writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph_def)

        print("Reading development and training data (limit: %d)." %
              FLAGS.max_train_data_size)
        sys.stdout.flush()
        # The dev set is read but not used further in this function.
        read_data(data_dev)
        train_set = read_data(data_train, FLAGS.max_train_data_size)

        # this is the training loop.
        step_time = 0.0
        loss = 0.0
        current_step = 0
        previous_losses = []
        while True:
            # get a batch and make a step.
            tick = time.time()
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                train_set)
            _summaries, _, step_loss, _ = model.step(
                sess, encoder_inputs, decoder_inputs, target_weights, False)
            step_time += (time.time() - tick) / FLAGS.steps_per_checkpoint
            loss += step_loss / FLAGS.steps_per_checkpoint
            current_step += 1

            # once in a while, we save checkpoint and print statistics.
            if current_step % FLAGS.steps_per_checkpoint != 0:
                continue

            # print statistics for the previous epoch.
            perplexity = math.exp(loss) if loss < 300 else float('inf')
            print("global step %d learning rate %.4f step-time %.2f perplexity "
                  "%.2f" % (model.global_step.eval(),
                            model.learning_rate.eval(), step_time,
                            perplexity))
            sys.stdout.flush()

            # decrease learning rate if no improvement was seen over last 3 times.
            if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
                sess.run([model.learning_rate_decay_op])
            previous_losses.append(loss)

            # save checkpoint and zero timer and loss.
            checkpoint_path = os.path.join(FLAGS.train_dir, "translate.ckpt")
            model.saver.save(sess, checkpoint_path,
                             global_step=model.global_step)
            step_time = 0.0
            loss = 0.0
def test(test_data, source_vocab, target_vocab, source_vocab_list,
         target_vocab_list, source_serialize, target_serialize):
    """Build the model and evaluate it on ``test_data``.

    The test data is converted with ``data_utils.prepare_data`` using the
    input/output formats from ``args`` and the given serializers, then passed
    to ``evaluate`` together with the vocabularies.
    """
    source_size = len(source_vocab)
    target_size = len(target_vocab)
    model = create_model(source_size, target_size, source_vocab_list,
                         target_vocab_list, 0.0, args.max_source_len,
                         args.max_target_len)

    prepared_test_set = data_utils.prepare_data(
        test_data, source_vocab, target_vocab, args.input_format,
        args.output_format, source_serialize, target_serialize)

    evaluate(model, prepared_test_set, source_vocab, target_vocab,
             source_vocab_list, target_vocab_list)
def plot_input_data(country, state):
    """Plot the raw input series for one region.

    Loads the data for ``country`` / ``state``, prepares the raw frame with
    deaths enabled and hands it to ``plot_single_set``; the output path is
    ``<cfg.data.save_path>/input/<country>_<state>.jpg``.
    """
    _train_df, _val_df, raw_df = load_data(country=country, state=state)

    confirmed = raw_df['Confirmed'].values
    deaths = raw_df['Deaths'].values
    recovered = raw_df['Recovered'].values
    raw_params = prepare_data(confirmed, deaths, recovered,
                              population[country][state],
                              is_have_death=True)

    input_dir = cfg.data.save_path+'/input'
    # Despite the name, this is the path of the image file itself.
    save_dir = os.path.join(input_dir, f'{country}_{state}.jpg')
    plot_single_set(raw_params, x_axis=raw_df['Day'].values,
                    save_dir=save_dir)
def hyperparam_search(FLAGS, override=False,
                      out_path='/home/qv/wikiqa-data/out.train_ldc.txt',
                      max_trials=100):
    """Random hyper-parameter search; appends each trial's result to a file.

    Each iteration samples a configuration into ``FLAGS``, skips it when its
    training directory already exists (unless ``override``), trains, and
    writes the configuration and best results to ``out_path``.

    Args:
        FLAGS: mutable flags object; the sampled values are written onto it.
        override: when true, re-run configurations whose training directory
            already exists.
        out_path: results file, opened in append mode.
        max_trials: number of trained configurations after which to stop.
            Skipped configurations do not count.
    """
    t1 = time.time()
    opt_names = [
        'initial_learning_rate', 'l2_reg_strength', 'keep_prob',
        'embedding_type', 'embedding_size', 'stopwords removed', 'num_filters'
    ]
    trials_done = 0
    # `with` guarantees the results file is closed even if a trial raises.
    with open(out_path, 'a') as ofile:
        while trials_done < max_trials:
            FLAGS.initial_learning_rate = np.random.choice([0.05, 0.005])
            FLAGS.l2_reg_strength = np.random.choice([0.005, 0.0005])
            FLAGS.keep_prob = np.random.choice([1, 0.75, 0.5])
            FLAGS.embedding_type = np.random.choice(
                ['enwiki-skipgram', 'GoogleNews'])
            FLAGS.remove_stopwords_from_s = np.random.choice([True, False])
            FLAGS.num_filters = np.random.choice([10, 100, 500])
            FLAGS.train_dir = train_dir_name(FLAGS)
            FLAGS.data_pkl_file = data_utils.processed_data_file_name(FLAGS)

            # NOTE(review): if every sampled configuration already has a
            # training directory this resamples indefinitely - confirm that
            # is acceptable.
            if os.path.exists(FLAGS.train_dir) and not override:
                continue

            data_dict = data_utils.prepare_data(FLAGS)
            FLAGS.max_q_sents = data_dict['max_q_sents']
            FLAGS.max_q_len = data_dict['max_q_len']

            # Small learning rates get twice as many epochs.
            epochs = 200 if FLAGS.initial_learning_rate <= 0.01 else 100
            # Same text as the former `print "Learning rate: ", value`.
            print("Learning rate:  %s" % (FLAGS.initial_learning_rate,))

            tf.reset_default_graph()
            best_results = train(data_dict, epochs)

            t = (FLAGS.initial_learning_rate, FLAGS.l2_reg_strength,
                 FLAGS.keep_prob, FLAGS.embedding_type, FLAGS.embedding_size,
                 FLAGS.remove_stopwords_from_s, FLAGS.num_filters)

            ofile.write(FLAGS.train_dir + '\n')
            ofile.write(FLAGS.data_pkl_file + '\n')
            ofile.write('Parameters: \n')
            for name, opt in zip(opt_names, t):
                ofile.write(' {}: {}\n'.format(name, opt))
            ofile.write('Best Results: ')
            ofile.write(" reg-loss %.4f loss %.4f tps/fps %d/%d tops %d\n"
                        " mrr %.2f map %.2f corr preds %d/%d\n" % best_results)
            ofile.write('\n')
            ofile.flush()

            trials_done += 1

    print(time.time() - t1)
def eval_test():
    """Evaluate the model on the test set and print loss, error counts and P/R/F.

    Error counts come from ``update_error_counts`` under the keys ``'R2W'``,
    ``'W2R'``, ``'W2W_C'`` and ``'W2W_NC'``. Precision, recall and F-measure
    are computed with true division and fall back to ``0.0`` when a
    denominator is zero; the reported loss is the mean of ``exp(eval_loss)``
    over all batches.
    """
    def _ratio(numerator, denominator):
        # float() forces true division on Python 2; a zero denominator
        # yields 0.0 instead of raising ZeroDivisionError.
        return float(numerator) / denominator if denominator else 0.0

    tf.reset_default_graph()
    test_out = os.path.join(FLAGS.data_dir, 'test_errors.out')
    deleteFiles([test_out])
    stats = {'R2W': 0, 'W2R': 0, 'W2W_C': 0, 'W2W_NC': 0}
    # change the reuse parameter if you want to build the data again
    _, _, _, _, en_test, fr_test, _, _ = data_utils.prepare_data(
        FLAGS.data_dir, reuse=FLAGS.reuse)

    with tf.Session(config=config_all) as sess:
        model = create_model(sess, True)
        test_set = read_data(en_test, fr_test)
        test_bucket_sizes = [len(test_set[b]) for b in range(len(_buckets))]
        print('Bucket Sizes : {}'.format(test_bucket_sizes))

        total_loss, num_batches = 0, 0
        for bucket_id in range(len(_buckets)):
            # Chunk the bucket into batches of at most FLAGS.batch_size;
            # the padding added by izip_longest is stripped again.
            all_batches = ([u for u in k if u is not None]
                           for k in itertools.izip_longest(*[
                               test_set[bucket_id][i::FLAGS.batch_size]
                               for i in range(FLAGS.batch_size)
                           ]))
            for batch in all_batches:
                encoder_inputs, decoder_inputs, target_weights = model.prepare_batch(
                    batch, bucket_id)
                # setting the model batch size in case it is smaller (would
                # be for the last batch in the bucket)
                model.batch_size = len(batch)
                _, eval_loss, logits = model.step(sess, encoder_inputs,
                                                  decoder_inputs,
                                                  target_weights, bucket_id,
                                                  True)
                outputs = np.argmax(logits, axis=2).transpose()
                # Outputs without an EOS token are dropped; the rest are
                # cut at their first EOS.
                outseq = [
                    out[:list(out).index(data_utils.EOS_ID)]
                    for out in outputs if data_utils.EOS_ID in out
                ]
                stat_updates = update_error_counts(batch, outseq)
                # Accumulate in place so counters absent from an update are
                # kept rather than dropped.
                for key, delta in stat_updates.items():
                    stats[key] += delta
                total_loss += math.exp(eval_loss)
                num_batches += 1
                # resetting the model batch size
                model.batch_size = FLAGS.batch_size

    print("Loss over the test set : {}".format(_ratio(total_loss,
                                                      num_batches)))
    print(stats)
    precision = _ratio(stats['W2R'],
                       sum([stats['W2R'], stats['R2W'], stats['W2W_C']]))
    recall = _ratio(stats['W2R'],
                    sum([stats['W2R'], stats['W2W_NC'], stats['W2W_C']]))
    f_m = _ratio(2 * precision * recall, precision + recall)
    print('P: {}\nR: {}\nF: {}'.format(precision, recall, f_m))
def re_run(model_params=MODEL_PARAMS,
           data=None,
           learning_rate=LEARNING_RATE,
           epochs=RERUN_EPOCHS):
    """Load a saved model, continue training it and save it after confirmation.

    Args:
        model_params: parameters passed to ``cc_model.Model``.
        data: ``(train_data, test_data)`` pair. When ``None`` (the default)
            it is obtained from ``utils.prepare_data()`` at call time; the
            previous default evaluated that call once, when the module was
            imported.
        learning_rate: learning rate passed to ``model.compile``.
        epochs: number of epochs passed to ``model.train``.

    Returns:
        The trained model.
    """
    if data is None:
        data = utils.prepare_data()
    train_data, test_data = data

    model = cc_model.Model(model_params)
    model.load()
    model.compile(learning_rate, sparse=SPARSE)
    model.train(train_data, epochs=epochs, test_data=test_data)
    model.verify(utils.gen_sample_data(size=100))
    model.save(ask=True)
    return model
def load_data(self, debug=False):
    """Loads train/valid/test data and sentence encoding.

    Populates the vocabulary mappings, the padded train/valid splits with
    their maximum lengths, the vocabulary sizes and a uniformly initialised
    word-embedding matrix on ``self``. ``debug`` is accepted but not used.
    The id and vocabulary files are read from hard-coded absolute paths.
    """
    (_en_train, _fr_train, _en_dev, _fr_dev,
     en_vocab_path, fr_vocab_path) = data_utils.prepare_data('tmp', 40000, 40000)

    source_maps = data_utils.initialize_vocabulary(en_vocab_path)
    self.source_vocab_to_id, self.source_id_to_vocab = source_maps
    target_maps = data_utils.initialize_vocabulary(fr_vocab_path)
    self.target_vocab_to_id, self.target_id_to_vocab = target_maps

    # Training mode reads the train ids, otherwise the test ids.
    if self.config.train_mode:
        source_path = '/Users/ethancaballero/Neural-Engineer_Candidates/dmn-tf-alter_working_decoder_d2c/tmp/train.ids40000.questions'
        target_path = '/Users/ethancaballero/Neural-Engineer_Candidates/dmn-tf-alter_working_decoder_d2c/tmp/train.ids40000.answers'
    else:
        source_path = '/Users/ethancaballero/Neural-Engineer_Candidates/dmn-tf-alter_working_decoder_d2c/tmp/test.ids40000.questions'
        target_path = '/Users/ethancaballero/Neural-Engineer_Candidates/dmn-tf-alter_working_decoder_d2c/tmp/test.ids40000.answers'
    sources, targets = data_utils.read_data(source_path, target_path)

    padded = data_utils.pad_length_bucket(sources, targets, self.config)
    (self.train, self.valid, self.max_t_len,
     self.max_input_len, self.max_sen_len) = padded

    source_vocab_path = '/Users/ethancaballero/Neural-Engineer_Candidates/dmn-tf-alter_working_decoder_d2c/tmp/vocab40000.questions'
    target_vocab_path = '/Users/ethancaballero/Neural-Engineer_Candidates/dmn-tf-alter_working_decoder_d2c/tmp/vocab40000.answers'
    self.source_vocab_size = data_utils.get_vocab_size(source_vocab_path)
    self.target_vocab_size = data_utils.get_vocab_size(target_vocab_path)

    init_range = self.config.embedding_init
    embedding_shape = (self.source_vocab_size, self.config.embed_size)
    self.word_embedding = np.random.uniform(-init_range, init_range,
                                            embedding_shape)
def main():
    """Train the encoder/decoder translation model and print sample outputs.

    Builds data loaders for the ``'train'`` and ``'val'`` phases, constructs
    the encoder and attention decoder, trains them and then prints five
    randomly chosen validation and five training examples together with the
    generated translation and its BLEU score.

    Returns:
        Tuple ``(config, data)``.
    """
    config = Config()
    data = prepare_data(config, debug=config.debug)
    data.data_loader = {
        phase: torch.utils.data.DataLoader(data.data[phase],
                                           shuffle=False,
                                           batch_size=config.batch_size,
                                           num_workers=2)
        for phase in ['train', 'val']
    }

    encoder_model = encoder_RNN(config.embedding_size,
                                data.vocab_size[config.source],
                                config.hidden_size,
                                n_layers=config.n_layers_encoding,
                                bidirectional=config.bidirectional,
                                dropout=config.dropout).to(config.device)
    decoder_model = decoderAttn(config.attn_model,
                                config.embedding_size,
                                data.vocab_size[config.target],
                                config.hidden_size,
                                n_layers=config.n_layers_decoding,
                                dropout=config.dropout).to(config.device)

    # `reduce=False` is deprecated; `reduction='none'` is its documented
    # replacement and likewise returns one loss value per element.
    loss_criterion = nn.CrossEntropyLoss(reduction='none')
    encoder_optimizer = torch.optim.Adam(encoder_model.parameters(),
                                         lr=config.lr)
    decoder_optimizer = torch.optim.Adam(decoder_model.parameters(),
                                         lr=config.lr)

    start = time.time()
    encoder_model, decoder_model, loss_curve = train(
        data, config, encoder_model, decoder_model, encoder_optimizer,
        decoder_optimizer, loss_criterion, n_epochs=config.n_epochs)
    print('Training Completed. Took {} seconds'.format(time.time() - start))

    def _report_samples(banner, phase, count=5):
        # Print `count` random examples of `phase`: source, reference,
        # generated sentence and BLEU score.
        print(banner)
        examples = data.data[phase]
        for i in np.random.randint(0, len(examples), count):
            source, reference = examples[i][0], examples[i][1]
            inp_seq = source + config.end_tok
            print(source)
            print(reference)
            gen_sen = generate_translation(inp_seq, config, encoder_model,
                                           decoder_model, data.vocab)
            print(gen_sen)
            print(BLEU_score(config, reference, gen_sen))
            print()

    ## Evaluate
    _report_samples("######## VALIDATION #########", 'val')
    _report_samples("######## TRAIN #########", 'train')
    return config, data
def first_run(model_params=MODEL_PARAMS,
              data=None,
              learning_rate=LEARNING_RATE,
              dry_run=False):
    """Build a new model, train it, verify it and (optionally) save it.

    Args:
        model_params: parameters passed to ``cc_model.Model``.
        data: ``(train_data, test_data)`` pair. When ``None`` (the default)
            it is obtained from ``utils.prepare_data()`` at call time; the
            previous default evaluated that call once, when the module was
            imported.
        learning_rate: learning rate passed to ``model.compile``.
        dry_run: when true the trained model is not saved.

    Returns:
        The trained model.
    """
    if data is None:
        data = utils.prepare_data()
    train_data, test_data = data

    model = cc_model.Model(model_params)
    model.build()
    model.compile(learning_rate, sparse=SPARSE)
    model.train(train_data, test_data=test_data)
    model.verify(utils.gen_sample_data(size=100))
    if not dry_run:
        model.save()
    return model
def evaluate(self, epoch, loader, image_dir, args, device='cpu'):
    """Compute the average validation losses for one epoch.

    Runs ``self.forward`` on every batch with gradients disabled, averages
    the four returned losses over the number of batches and prints them.

    Args:
        epoch: epoch number, used only in the printed summary.
        loader: iterable of batches accepted by ``prepare_data``.
        image_dir: not used by this method.
        args: forwarded to ``self.forward``.
        device: device passed to ``prepare_data``.

    Returns:
        Sum of the four averaged losses.
    """
    self.eval()
    s_sum0 = s_sum1 = 0
    w_sum0 = w_sum1 = 0

    with torch.no_grad():
        for batch in loader:
            imgs, caps, caps_len, masks, class_ids = prepare_data(
                batch, device, is_damsm=True)

            # The BERT variant additionally needs the attention mask.
            forward_kwargs = {'class_ids': class_ids}
            if self.is_bert:
                forward_kwargs['bert_mask'] = masks
            w_loss0, w_loss1, s_loss0, s_loss1 = self.forward(
                imgs, caps, caps_len, args, **forward_kwargs)

            w_sum0 += w_loss0.item()
            w_sum1 += w_loss1.item()
            s_sum0 += s_loss0.item()
            s_sum1 += s_loss1.item()

    num_batches = len(loader)
    s_cur_loss0 = s_sum0 / num_batches
    s_cur_loss1 = s_sum1 / num_batches
    w_cur_loss0 = w_sum0 / num_batches
    w_cur_loss1 = w_sum1 / num_batches
    sum_loss = s_cur_loss0 + s_cur_loss1 + w_cur_loss0 + w_cur_loss1

    summary = ('[VALID] Epoch {:3d} | s_loss {:5.2f} {:5.2f} | '
               'w_loss {:5.2f} {:5.2f} | Sum {:5.2f}')
    print(summary.format(epoch, s_cur_loss0, s_cur_loss1, w_cur_loss0,
                         w_cur_loss1, sum_loss))
    return sum_loss
def get_scores(self):
    """Generate images for the whole data loader and print Inception Score and FID.

    Images of the last generator output are saved under
    ``generated_images/gen_exp/<timestamp>/`` and scored from the
    ``images/iter/<resolution>`` sub-folder of that directory. FID is
    computed against ``self.test_imgs_paths``.

    Raises:
        RuntimeError: if the data loader yields no batches.
    """
    cur_time = datetime.datetime.now().strftime('%d:%m:%Y:%H-%M-%S')
    run_name = os.path.join('gen_exp', cur_time)
    save_dir = os.path.join('generated_images', run_name)

    self.model.generator.eval()
    resolution = None
    gen_iter = 0
    # Inference only: skip autograd bookkeeping while generating.
    with torch.no_grad():
        for data in tqdm.tqdm(self.data_loader, total=len(self.data_loader)):
            images, captions, cap_lens, masks, class_ids = prepare_data(
                data, self.device)
            noise = torch.FloatTensor(captions.size(0), self.model.z_dim).to(
                self.device).normal_(0, 1)
            gen_iter += 1
            gen_images, _, _, _, _ = self.model(captions, cap_lens, noise,
                                                masks)
            final_images = gen_images[-1]
            resolution = final_images.size(3)
            # The separator keeps names unique: plain concatenation maps
            # (iter 1, image 11) and (iter 11, image 1) to the same "111",
            # so later images overwrote earlier ones.
            filenames = [
                '{}_{}'.format(gen_iter, i)
                for i in range(final_images.size(0))
            ]
            save_images(final_images, filenames, save_dir, '', resolution)

    if resolution is None:
        raise RuntimeError('data loader produced no batches; nothing to score')

    # inception score calculation
    gen_save_folder = os.path.join(save_dir, 'images', 'iter',
                                   str(resolution))
    gen_img_iterator = GenImgData(gen_save_folder)
    mean_val, std_val = inception_score(gen_img_iterator,
                                        cuda=False,
                                        batch_size=32,
                                        resize=False,
                                        splits=4)
    print('Inception Score, mean: {0:.3f}, std: {1:.3f}'.format(
        mean_val, std_val))

    # fid calculation
    paths_to_fid = [gen_save_folder, self.test_imgs_paths]
    fid_val = calculate_fid_given_paths(paths_to_fid,
                                        batch_size=1,
                                        cuda=False,
                                        dims=2048)
    print("FID value: ", fid_val)
def initialize_params(experiment_dir_name,
                      experiment_dir_suffix,
                      stage_of_development,
                      params_initialization_for_training=None,
                      params_initialization_for_resume_training=None,
                      params_initialization_for_evaluation=None,
                      training_with_dev=False,
                      use_ranges=False,):
    """Create the default parameter dict and let the helper specialise it.

    When ``stage_of_development`` is ``"training"`` or ``"resume_training"``
    and ``training_with_dev`` is set, the dev data is additionally prepared
    from ``params['dev_path']``. ``use_ranges`` is accepted but not used.

    Returns:
        Tuple ``(params, filenames_of_images, labels_of_images,
        filenames_of_images_dev, pmValues_dev)``; the last two are ``None``
        unless the dev data was prepared.
    """
    defaults = {
        'experiment_dir': None,
        'resume_training': False,
        'evaluate_model': False,
        'num_epochs': None,
        'max_steps': None,
        'num_steps_before_checkpoint': 1,
        'data_dir': None,
        'logs_dir': None,
        'summary_dir': None,
        'results_dir': None,
        'training_path': None,
        'dev_path': None,
        'testing_path': None,
        'forward_only': False,
        'use_preprocessing': False,
        'model_path': None,
        'dict_of_filePath_to_num_of_examples_in_tfrecord': None,
        'type_of_optimizer': 'adam',
        'total_num_of_training_examples': None,
    }

    params, filenames_of_images, labels_of_images = initialize_params_helper(
        defaults,
        stage_of_development,
        experiment_dir_name,
        experiment_dir_suffix,
        params_initialization_for_training=params_initialization_for_training,
        params_initialization_for_resume_training=params_initialization_for_resume_training,
        params_initialization_for_evaluation=params_initialization_for_evaluation)

    filenames_of_images_dev = None
    pmValues_dev = None
    is_training_stage = stage_of_development in ("training", "resume_training")
    if is_training_stage and training_with_dev:
        (filenames_of_images_dev, _, _, _, _, _, _, _,
         pmValues_dev) = data_utils.prepare_data(params['dev_path'])

    return (params, filenames_of_images, labels_of_images,
            filenames_of_images_dev, pmValues_dev)
def main(argv):
    """Fit a logistic regression on an intermediate layer of the saved model.

    The data is prepared, augmented and windowed into ``LOOKBACK``-step
    sequences; the held-out split is scaled with the stored scaler, pushed
    through the saved Keras model up to ``model.layers[1]``, and the last 384
    columns of that output are used to fit and score an L1-regularised
    logistic regression. ``argv`` is not used.
    """
    features, labels = prepare_data(data_path=FLAGS.data_path,
                                    sheet_name=FLAGS.sheet_name,
                                    label_name=FLAGS.label_name)
    n_features = features.shape[1]
    features, labels = gradient_augment(features, labels)
    features, labels = temporalize(features, labels, LOOKBACK)

    _, X_test, _, y_test = train_test_split(np.array(features),
                                            np.array(labels),
                                            test_size=DATA_SPLIT_PCT,
                                            random_state=0)
    X_test = np.array(X_test)
    X_test = X_test.reshape(X_test.shape[0], LOOKBACK, n_features)

    ## Loads scaler fit on training data.
    with open(FLAGS.data_scaler_path, 'rb') as scaler_file:
        scaler = pickle.load(scaler_file)
    X_test_scaled = scale(X_test, scaler)

    model = keras.models.load_model(FLAGS.final_model_path)
    get_layer_output = tf.keras.backend.function([model.layers[0].input],
                                                 [model.layers[1].output])
    layer_output = get_layer_output([X_test_scaled])[0]
    print(layer_output.shape, layer_output)

    # Only the trailing 384 columns are used as classifier inputs.
    encoded = layer_output[:, -384:]
    classifier = LogisticRegression(class_weight='balanced',
                                    max_iter=200,
                                    penalty='l1',
                                    solver='saga',
                                    C=0.01,
                                    verbose=1)
    classifier.fit(encoded, y_test)
    y_hat_test = classifier.predict(encoded)

    print("Precision Recall F_score Support")
    test_res = precision_recall_fscore_support(y_test,
                                               y_hat_test,
                                               average='binary')
    print(test_res)
def test_basic_SIR(raw_df, attribute2fix: str, state='Texas', country='US'):
    '''
    Fit the basic SIR model to one attribute and visualise the result.

    attribute2fix: ['I', 'R']
    '''
    print(f"--- Basic SIR ---")

    confirmed = raw_df['Confirmed'].values
    deaths = raw_df['Deaths'].values
    recovered = raw_df['Recovered'].values
    raw_params = prepare_data(confirmed, deaths, recovered,
                              population[country][state])

    filename = f"{country}/{state}_basicSIR_{cfg.model.curvefit_sigma_rate}"
    save_dir = os.path.join(cfg.data.save_path, filename)

    sir_model = BasicSIR(cfg, raw_params)
    fit_result = sir_model.fit_single_attribute(attribute=attribute2fix,
                                                visualize=False)
    print("finish curve fitting")

    val_params = {'I': sir_model.I, 'R': sir_model.R}
    day_axis = raw_df['Day'].values
    visualize_basic_result(val_params, fit_result, day_axis, save_dir,
                           attribute2fix)
def test(): print("Loading data...") vocab_word, vocab_word_list, train_words, train_labels, test_words, test_labels \ = data_utils.prepare_data(args.data_path) max_text_len = max([len(words) for words in train_words]) vocab_len = len(vocab_word_list) # 模型构造 model = models.TextCNN(max_text_len, 2, vocab_len, args.embedding_size, list(map(int, args.filter_sizes.split(","))), args.num_filters, args.max_gradient_norm, args.learning_rate, args.l2_reg_lambda) # 给定数据,对模型进行测试 text = '''说实话没吃成, 但是对这家太不满意了, 所有都给差评!到那之后满屋子都是座, 服务员非得给安排在一个犄角旮旯, 黑咕隆咚的冷气还吹不到. 点了餐喊半天服务员都不来拿单子, 而且是眼看着服务员从身边经过, 喊着服务员服务员, 她们就只当没听见. 然后我自己换了个显眼的位置, 举着手喊服务员, 她们还是无视的从我旁边走过.我k, 你又不服务, 没事走来走去干什么?所以后来干脆不吃了. 请问, 您家是要做生意么?''' words_ids = data_utils.sentence_to_token_ids(text, vocab_word) predicts = model.predict(words_ids, args.model_path) print text print "预测结果为:", predicts if predicts == [0]: print "正面评论" else: print "负面评论" text = '''瘦了点,可能和季节有关吧吃完加点青菜做泡饭满嗲的~孔雀开屏 45.00很大一条鱼,摆盘很漂亮,肉质挺嫩,如果加点醋更好, 去腥更美味~~香菇菜心这个 我喜欢的呀~上面酱很嗲~ 香菇很入味,菜心很爽口~ 解油腻 总体来说这里感觉很实惠,虽然价格不贵,但是品质却不错, 摆盘很用心很漂亮。酒香不怕巷子深 用在这里真是非常合适~雨天滴滴答答,不是很舒服,但却并没影响到FB的心情~~~店开在 比较老式的弄堂里, 周围都是居民区,门面并不大,不过据说这里生意很好。性价比高么做的是绍兴菜,装修比较朴素,菜单也是很简单的A4纸 塑封一下总体来说这里感觉很实惠,虽然价格不贵,但是品质却不错,摆盘很用心很漂亮。酒香不怕巷子深 用在这里真是非常合适~''' words_ids = data_utils.sentence_to_token_ids(text, vocab_word) predicts = model.predict(words_ids, args.model_path) print text print "预测结果为:", predicts if predicts == [0]: print "正面评论" else: print "负面评论"
def prepare_data():
    """Prepare the training and dev data selected by the command-line flags.

    When both ``FLAGS.from_train_data`` and ``FLAGS.to_train_data`` are set,
    those files are prepared with the character tokenizer, and the dev files
    default to the training files unless both dev flags are set. Otherwise
    the WMT data is prepared in ``FLAGS.data_dir``.

    Returns:
        Tuple ``(from_train, to_train, from_dev, to_dev)``.
    """
    has_custom_train = FLAGS.from_train_data and FLAGS.to_train_data
    if not has_custom_train:
        # Prepare WMT data.
        print("Preparing WMT data in %s" % FLAGS.data_dir)
        from_train, to_train, from_dev, to_dev, _, _ = data_utils.prepare_wmt_data(
            FLAGS.data_dir, FLAGS.from_vocab_size, FLAGS.to_vocab_size)
        return from_train, to_train, from_dev, to_dev

    from_train_data = FLAGS.from_train_data
    to_train_data = FLAGS.to_train_data
    if FLAGS.from_dev_data and FLAGS.to_dev_data:
        from_dev_data = FLAGS.from_dev_data
        to_dev_data = FLAGS.to_dev_data
    else:
        # No dedicated dev set: reuse the training files.
        from_dev_data = from_train_data
        to_dev_data = to_train_data

    from_train, to_train, from_dev, to_dev, _, _ = data_utils.prepare_data(
        FLAGS.data_dir, from_train_data, to_train_data, from_dev_data,
        to_dev_data, FLAGS.from_vocab_size, FLAGS.to_vocab_size,
        data_utils.char_tokenizer)
    return from_train, to_train, from_dev, to_dev
def test_data(country, state):
    """Run the basic SIR and SIRD tests for one state of a country.

    States listed in ``SEIR_STATE`` are skipped. For every other state the
    data is loaded, ``prepare_data`` is called on the confirmed / deaths /
    recovered series with the state's population (its result is not used
    here), and both basic tests are run for the 'I' and 'R' attributes.
    """
    if state in SEIR_STATE:
        return

    # Only the raw frame is consumed below; the train/validation frames are
    # needed only by the disabled time-dependent tests.
    train_df, val_df, raw_df = load_data(country=country, state=state)
    confirmed, deaths, recovered = (
        raw_df[column].values for column in ('Confirmed', 'Deaths', 'Recovered')
    )
    prepare_data(confirmed, deaths, recovered, population[country][state])

    print()
    print(f'test on {state}')

    # TEST TIME-DEPENDENT MODELS (disabled)
    # test_time_SIR(train_df, val_df, raw_df, state, country)
    # test_time_SIRD(train_df, val_df, raw_df, state, country)

    # TEST BASIC MODELS
    # NOTE(review): both calls are assumed to sit inside the loop -- confirm.
    for att in ('I', 'R'):
        test_basic_SIR(raw_df, att, state, country)
        test_basic_SIRD(raw_df, att, state, country)
def process_data():
    """Print the data directory, then call ``data_utils.prepare_data(FLAGS)``."""
    data_dir = FLAGS.data_dir
    print("Preparing data in %s" % data_dir)
    data_utils.prepare_data(FLAGS)
def train():
    """Train the model indefinitely, logging perplexities at each checkpoint.

    Training and development data are prepared and batched through
    ``data_utils``. Every ``settings.steps_per_checkpoint`` steps the averaged
    training loss and the averaged development loss are converted to the
    logged perplexity figures, appended to ``log.txt`` (path built as
    ``settings.train_dir + 'log.txt'``) and a checkpoint is saved under
    ``settings.train_dir``. The training set is re-batched after each epoch.

    The loop has no exit condition; it runs until the process is stopped.
    """
    print("Preparing data in %s" % settings.data_dir)
    (sr_train_ids_path, tg_train_ids_path, sr_dev_ids_path, tg_dev_ids_path,
     sr_vocab_path, tg_vocab_path) = data_utils.prepare_data(settings.data_dir)

    print("Reading training data from %s" % settings.data_dir)
    train_set = data_utils.read_data(sr_train_ids_path, tg_train_ids_path,
                                     settings.max_train_num)
    train_batches, train_bucket_ids = data_utils.batchize(train_set)

    print("Reading development data from %s" % settings.data_dir)
    dev_set = data_utils.read_data(sr_dev_ids_path, tg_dev_ids_path)
    dev_batches, dev_bucket_ids = data_utils.batchize(dev_set, False)

    # The context manager closes the log file even when the endless training
    # loop is interrupted or raises.
    with open(settings.train_dir + 'log.txt', 'w') as log_file:
        log_file.write('epoch\tstep\ttime\ttrain-ppx\tdev-ppx\n')
        log_file.flush()

        with tf.Session() as sess:
            print("Creating %d layers of %d units." % (settings.num_layers, settings.size))
            model = create_model(sess, False)

            current_epoch, current_step, train_loss = 0, 0, 0.0
            start_time = time.time()
            while True:
                current_epoch += 1
                # Batches and bucket ids are parallel lists from batchize().
                for batch, bucket_id in zip(train_batches, train_bucket_ids):
                    current_step += 1
                    step_start_time = time.time()
                    encoder_inputs, decoder_inputs, target_weights = model.preprocess_batch(
                        batch, bucket_id)
                    _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                                 target_weights, bucket_id, False)
                    print("global-step %d\tstep-time %.2f\tstep-loss %.2f" % (
                        model.global_step.eval(), time.time() - step_start_time, step_loss))
                    train_loss += step_loss / settings.steps_per_checkpoint

                    if current_step % settings.steps_per_checkpoint != 0:
                        continue

                    # Evaluate on the training set; losses of 300 or more are
                    # reported as infinity instead of overflowing math.exp.
                    if train_loss < 300:
                        train_ppx = math.exp(train_loss) / model.batch_size
                    else:
                        train_ppx = float('inf')

                    # Evaluate on the development set (mean loss over batches).
                    dev_loss = 0.0
                    num_dev_batches = len(dev_batches)
                    for dev_batch, dev_bucket_id in zip(dev_batches, dev_bucket_ids):
                        encoder_inputs, decoder_inputs, target_weights = model.preprocess_batch(
                            dev_batch, dev_bucket_id)
                        _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                                     target_weights, dev_bucket_id, True)
                        dev_loss += step_loss / num_dev_batches
                    if dev_loss < 300:
                        dev_ppx = math.exp(dev_loss) / model.batch_size
                    else:
                        dev_ppx = float('inf')

                    log_file.write("%d\t%d\t%.2f\t%.2f\t%.2f\n" % (
                        current_epoch, model.global_step.eval(),
                        time.time() - start_time, train_ppx, dev_ppx))
                    log_file.flush()
                    sys.stdout.flush()

                    # Start a fresh loss average and save a checkpoint.
                    train_loss, dev_loss = 0.0, 0.0
                    checkpoint_path = os.path.join(settings.train_dir, "summary.ckpt")
                    model.saver.save(sess, checkpoint_path, global_step=model.global_step)

                # Re-batch the training data for the next epoch.
                train_batches, train_bucket_ids = data_utils.batchize(train_set)
def train(max_steps=12000):
    """Train a translation model using NMT data.

    The source and target language names are read from ``sys.argv[1]`` and
    ``sys.argv[2]``. Training stops on a step budget rather than on
    perplexity. Every ``FLAGS.steps_per_checkpoint`` steps the training
    perplexity is printed, the learning rate is decayed if the loss exceeds
    the maximum of the last three recorded losses, a checkpoint is saved and
    the development buckets are evaluated. The collected step numbers and
    perplexities are printed once training ends.

    Args:
        max_steps: The loop runs while ``current_step <= max_steps``; the
            default keeps the previously hard-coded limit.
    """
    source = sys.argv[1]
    target = sys.argv[2]

    # Prepare NMT data.
    print("Preparing NMT data in %s" % FLAGS.data_dir)
    print(" source langauge: %s" % source)
    print(" target language: %s" % target)

    # Generates the preprocessed train and test files and gives their paths.
    # These have tokenized ids.
    s_train, t_train, s_dev, t_dev, _, _ = data_utils.prepare_data(
        FLAGS.data_dir, FLAGS.s_vocab_size, FLAGS.t_vocab_size, source, target)
    print("Tokenized Inputs: ", s_train, t_train, s_dev, t_dev)

    with tf.Session() as sess:
        # Create model.
        print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, True, False)

        # Read data into buckets and compute their sizes.
        print("Reading development and training data (limit: %d)."
              % FLAGS.max_train_data_size)
        dev_set = read_data(s_dev, t_dev)
        train_set = read_data(s_train, t_train, FLAGS.max_train_data_size)
        num_buckets = len(_buckets)
        train_bucket_sizes = [len(train_set[b]) for b in xrange(num_buckets)]
        train_total_size = float(sum(train_bucket_sizes))

        # A bucket scale is a list of increasing numbers from 0 to 1 that we'll
        # use to select a bucket. Length of [scale[i], scale[i+1]] is
        # proportional to the size of the i-th training bucket.
        train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                               for i in xrange(len(train_bucket_sizes))]

        # This is the training loop.
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_losses = []
        train_steps, train_ppx = [], []
        # One perplexity history per bucket, sized from _buckets so that any
        # number of buckets works.
        bucket_ppx = {b: [] for b in xrange(num_buckets)}

        # Put a limit on the number of iterations it takes to train (instead
        # of the perplexity).
        while current_step <= max_steps:
            # Choose a bucket according to data distribution. We pick a random
            # number in [0, 1] and use the corresponding interval in
            # train_buckets_scale.
            random_number_01 = np.random.random_sample()
            bucket_id = min([i for i in xrange(len(train_buckets_scale))
                             if train_buckets_scale[i] > random_number_01])

            # Get a batch and make a step.
            start_time = time.time()
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                train_set, bucket_id)
            _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                         target_weights, bucket_id, False)
            step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
            loss += step_loss / FLAGS.steps_per_checkpoint
            current_step += 1

            # Once in a while, we save checkpoint, print statistics, and run
            # evals.
            if current_step % FLAGS.steps_per_checkpoint != 0:
                continue

            train_steps.append(current_step)
            # Print statistics for the previous epoch.
            perplexity = math.exp(float(loss)) if loss < 300 else float("inf")
            train_ppx.append(perplexity)
            print("global step %d learning rate %.4f step-time %.2f perplexity "
                  "%.2f" % (model.global_step.eval(), model.learning_rate.eval(),
                            step_time, perplexity))
            # Decrease learning rate if no improvement was seen over last 3
            # times.
            if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
                sess.run(model.learning_rate_decay_op)
            previous_losses.append(loss)
            # Save checkpoint and zero timer and loss.
            checkpoint_path = os.path.join(FLAGS.train_dir, "translate.ckpt")
            model.saver.save(sess, checkpoint_path, global_step=model.global_step)
            step_time, loss, eval_loss_tot = 0.0, 0.0, 0.0

            # Run evals on development set and print their perplexity.
            evaluated_buckets = 0
            for bucket_id in xrange(num_buckets):
                if len(dev_set[bucket_id]) == 0:
                    print(" eval: empty bucket %d" % (bucket_id))
                    continue
                encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                    dev_set, bucket_id)
                _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
                eval_ppx = math.exp(float(eval_loss)) if eval_loss < 300 else float("inf")
                bucket_ppx[bucket_id].append(eval_ppx)
                print(" eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx))
                eval_loss_tot += eval_loss
                evaluated_buckets += 1

            # Average only over buckets that were actually evaluated, and
            # guard the exponent with the averaged loss itself (not with the
            # last bucket's loss, which may not even exist).
            if evaluated_buckets:
                eval_loss_avg = eval_loss_tot / evaluated_buckets
                mean_ppx = (math.exp(float(eval_loss_avg))
                            if eval_loss_avg < 300 else float("inf"))
                print(" eval: mean perplexity %.2f" % mean_ppx)
            sys.stdout.flush()

        print(train_steps)
        print(train_ppx)
        print(bucket_ppx)
def train():
    """Train a nl -> machine-code translation model.

    One "checkpoint" is one pass worth of steps
    (``train_total_size / FLAGS.batch_size``); training runs for
    ``FLAGS.epoch`` such passes. At each checkpoint the training perplexity
    is printed, the learning rate is decayed once more than five checkpoints
    have passed, every development bucket is evaluated, and the model is
    saved only when the size-weighted development loss is the lowest seen so
    far.
    """
    # Prepare training & dev data.
    print("Preparing data in %s" % FLAGS.data_dir)
    (srce_train, trgt_train, trgt_train_pos, trgt_train_map,
     srce_dev, trgt_dev, trgt_dev_pos, trgt_dev_map,
     _, _, srce_vocab_size, trgt_vocab_size) = data_utils.prepare_data(
         FLAGS.data_dir, FLAGS.srce_vocab_min, FLAGS.trgt_vocab_min)

    with tf.Session() as sess:
        # Create model.
        print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, srce_vocab_size, trgt_vocab_size, False)

        # Read data into buckets and compute their sizes.
        print("Reading development and training data (limit: %d)."
              % FLAGS.max_train_data_size)
        dev_set = read_data(srce_dev, trgt_dev, trgt_dev_pos, trgt_dev_map)
        train_set = read_data(srce_train, trgt_train, trgt_train_pos, trgt_train_map,
                              max_size=FLAGS.max_train_data_size)
        train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
        train_total_size = float(sum(train_bucket_sizes))
        print("training set bucket: ", train_bucket_sizes)

        # A bucket scale is a list of increasing numbers from 0 to 1 that we'll
        # use to select a bucket. Length of [scale[i], scale[i+1]] is
        # proportional to the size of the i-th training bucket.
        train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                               for i in xrange(len(train_bucket_sizes))]

        # Size of dev set, and each bucket's share of it (used as weights).
        dev_bucket_sizes = [len(dev_set[b]) for b in xrange(len(_buckets))]
        dev_size = float(sum(dev_bucket_sizes))
        dev_bucket_proportion = [b / dev_size for b in dev_bucket_sizes]
        print("dev set bucket: ", dev_bucket_sizes)

        # This is the training loop.
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_losses = []
        dev_losses = []
        steps_per_checkpoint = int(train_total_size / FLAGS.batch_size)
        print("steps per checkpoint: ", steps_per_checkpoint)

        while current_step < (FLAGS.epoch * steps_per_checkpoint):
            # Choose a bucket according to data distribution. We pick a random
            # number in [0, 1] and use the corresponding interval in
            # train_buckets_scale.
            random_number_01 = np.random.random_sample()
            bucket_id = min([i for i in xrange(len(train_buckets_scale))
                             if train_buckets_scale[i] > random_number_01])

            # Check for empty bucket.
            if len(train_set[bucket_id]) == 0:
                continue

            # Get a batch and make a step.
            start_time = time.time()
            encoder_inputs, decoder_inputs, target_weights, pos, maps = model.get_batch(
                train_set, bucket_id)
            _, step_loss, _, _, _, _ = model.step(
                sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, False,
                decoder_inputs_positions=pos, decoder_inputs_maps=maps)
            step_time += (time.time() - start_time) / steps_per_checkpoint
            loss += step_loss / steps_per_checkpoint
            current_step += 1

            # Once in a while, we save checkpoint, print statistics, and run
            # evals.
            if current_step % steps_per_checkpoint != 0:
                continue

            # Print statistics for the previous epoch.
            perplexity = math.exp(loss) if loss < 300 else float('inf')
            print("global step %d learning rate %.4f step-time %.2f perplexity "
                  "%.2f" % (model.global_step.eval(), model.learning_rate.eval(),
                            step_time, perplexity))

            # Decay the learning rate at every checkpoint after the fifth
            # (the loss-based rule is disabled):
            # if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
            # current_step is an exact multiple here, so floor division is
            # exact under both Python 2 and Python 3.
            if current_step // steps_per_checkpoint > 5:
                sess.run(model.learning_rate_decay_op)
                current_learning_rate = model.learning_rate.eval()
                print("learning rate update to %.4f" % current_learning_rate)
                # Compare the evaluated value: comparing the graph variable
                # object itself with a float never matches, so the stop
                # condition could not fire.
                if current_learning_rate == float(0):
                    break
            previous_losses.append(loss)

            # Run evals on development set, print their perplexity and perform
            # early stopping.
            eval_loss_per_bucket = []  # eval_loss for the whole dev set
            for bucket_id in xrange(len(_buckets)):
                if len(dev_set[bucket_id]) == 0:
                    # Empty buckets contribute zero (their weight is zero too).
                    eval_loss_per_bucket.append(float(0))
                    continue
                encoder_inputs, decoder_inputs, target_weights, pos, maps = model.get_batch(
                    dev_set, bucket_id)
                _, eval_loss, _, _, _, _ = model.step(
                    sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True,
                    decoder_inputs_positions=pos, decoder_inputs_maps=maps)
                eval_loss_per_bucket.append(float(eval_loss))
                print(" eval: bucket %d perplexity %.2f" % (bucket_id, eval_loss))

            dev_loss = np.dot(np.asarray(eval_loss_per_bucket),
                              np.asarray(dev_bucket_proportion))
            dev_losses.append(dev_loss)
            print(" eval: dev set weighted perplexity %.2f" % dev_loss)

            # Save a checkpoint only when this is the best dev loss so far.
            if dev_loss <= min(dev_losses):
                checkpoint_path = os.path.join(FLAGS.data_dir, "checkpoint/ckpt")
                model.saver.save(sess, checkpoint_path, global_step=model.global_step)

            # Zero timer and loss at every checkpoint so the next averages
            # start fresh whether or not a checkpoint was written.
            step_time, loss = 0.0, 0.0
            sys.stdout.flush()
def train():
    """Run SpeakEasy/server/python_model/scripts/run.sh to train model.

    Trains indefinitely on the movie-subtitle data. Every
    ``FLAGS.steps_per_checkpoint`` steps the perplexity is printed and sent
    to Slack, the learning rate is decayed if the loss exceeds the maximum of
    the last three recorded losses, and a checkpoint is saved. With
    ``FLAGS.buckets`` set, training batches are drawn per bucket and all
    development buckets except the last are evaluated.
    """
    slack.connection.notify(
        text='Training SpeakEasy!',
    )
    # Prepare movie subtitle data.
    print("Preparing data in %s" % FLAGS.data_dir)
    sys.stdout.flush()
    data_train, data_dev, _ = data_utils.prepare_data(FLAGS.data_dir, FLAGS.vocab_size)

    with tf.Session() as sess:
        # Create model.
        print("Creating %s model with %d layers of %d units."
              % (FLAGS.model_type, FLAGS.num_layers, FLAGS.size))
        sys.stdout.flush()
        if FLAGS.buckets:
            print("Using bucketed model.")
            sys.stdout.flush()
        model = create_model(sess, False)

        # Set up event logging. NOTE: unfinished -- the merged op is created
        # but nothing consumes it until a summary writer is enabled.
        tf.merge_all_summaries()
        # writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph_def)

        # Read data into buckets and compute their sizes.
        print("Reading development and training data (limit: %d)."
              % FLAGS.max_train_data_size)
        sys.stdout.flush()
        dev_set = read_data(data_dev)
        train_set = read_data(data_train, FLAGS.max_train_data_size)

        if FLAGS.buckets:
            train_bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
            train_total_size = float(sum(train_bucket_sizes))
            # A bucket scale is a list of increasing numbers from 0 to 1 that
            # we'll use to select a bucket. Length of [scale[i], scale[i+1]]
            # is proportional to the size of the i-th training bucket.
            train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                                   for i in xrange(len(train_bucket_sizes))]

        # This is the training loop.
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_losses = []
        while True:
            # Get a batch and make a step.
            start_time = time.time()
            if FLAGS.buckets:
                # Choose a bucket according to data distribution. We pick a
                # random number in [0, 1] and use the corresponding interval
                # in train_buckets_scale.
                random_number_01 = np.random.random_sample()
                bucket_id = min([i for i in xrange(len(train_buckets_scale))
                                 if train_buckets_scale[i] > random_number_01])
            else:
                # The unbucketed model is driven with bucket_id=None.
                bucket_id = None
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                train_set, bucket_id=bucket_id)
            summaries, _, step_loss, _ = model.step(
                sess, encoder_inputs, decoder_inputs, target_weights, False,
                bucket_id=bucket_id)
            step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
            loss += step_loss / FLAGS.steps_per_checkpoint
            current_step += 1

            # Once in a while, we save checkpoint, print statistics, and run
            # evals.
            if current_step % FLAGS.steps_per_checkpoint != 0:
                continue

            # Saving summaries is disabled until the writer above is enabled:
            # result = sess.run(merged_summaries)
            # summary_str = result[0]
            # writer.add_summary(summary_str, current_step)

            # Print statistics for the previous epoch.
            perplexity = math.exp(loss) if loss < 300 else float('inf')
            log_line = ("global step %d learning rate %.4f step-time %.2f perplexity "
                        "%.2f" % (model.global_step.eval(), model.learning_rate.eval(),
                                  step_time, perplexity))
            print(log_line)
            slack.connection.notify(
                text=log_line,
            )
            sys.stdout.flush()

            # Decrease learning rate if no improvement was seen over last 3
            # times.
            if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
                sess.run([model.learning_rate_decay_op])
            previous_losses.append(loss)

            # Save checkpoint and zero timer and loss.
            checkpoint_path = os.path.join(
                FLAGS.train_dir, "speakEasy" + str(FLAGS.vocab_size) + ".ckpt")
            model.saver.save(sess, checkpoint_path, global_step=model.global_step)
            step_time, loss = 0.0, 0.0

            if FLAGS.buckets:
                # Run evals on development set and print their perplexity.
                # NOTE(review): the last bucket is skipped by the range below;
                # kept as found -- confirm this is intended.
                for bucket_id in xrange(len(_buckets) - 1):
                    # Skip empty buckets instead of asking the model for a
                    # batch from no data (presumably get_batch cannot sample
                    # from an empty bucket -- TODO confirm).
                    if len(dev_set[bucket_id]) == 0:
                        print(" eval: empty bucket %d" % bucket_id)
                        continue
                    encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                        dev_set, bucket_id)
                    _, _, eval_loss, _ = model.step(
                        sess, encoder_inputs, decoder_inputs, target_weights, True,
                        bucket_id=bucket_id)
                    eval_ppx = math.exp(eval_loss) if eval_loss < 300 else float('inf')
                    log_line = "eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx)
                    print(" %s" % log_line)
                    slack.connection.notify(
                        text=log_line,
                    )
                sys.stdout.flush()
def train():
    """Train a translation model on a parallel corpus, without a stop condition.

    The corpus named by the ``FLAGS`` train/dev names and source/target
    extensions is prepared with ``data_utils.whitespace_tokenizer``. Each
    step draws a bucket with probability proportional to its share of the
    training data. Every ``FLAGS.steps_per_checkpoint`` steps the perplexity
    is printed, the learning rate is decayed when the loss exceeds the
    maximum of the last three recorded losses, a checkpoint is written and
    every non-empty development bucket is evaluated.
    """
    # Prepare parallel corpus data.
    print("Preparing parallel corpus data in %s" % FLAGS.data_dir)
    prepared = data_utils.prepare_data(
        FLAGS.data_dir,
        FLAGS.source_vocab_size,
        FLAGS.target_vocab_size,
        FLAGS.train_name,
        FLAGS.dev_name,
        FLAGS.source_ext,
        FLAGS.target_ext,
        tokenizer=data_utils.whitespace_tokenizer)
    source_train, target_train, source_dev, target_dev, _, _ = prepared

    with tf.Session() as sess:
        # Create model.
        print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, False)

        # Read data into buckets and compute their sizes.
        print("Reading development and training data (limit: %d)."
              % FLAGS.max_train_data_size)
        dev_set = read_data(source_dev, target_dev)
        train_set = read_data(source_train, target_train, FLAGS.max_train_data_size)
        bucket_sizes = [len(train_set[b]) for b in xrange(len(_buckets))]
        total_size = float(sum(bucket_sizes))

        # Cumulative share of the training data up to and including each
        # bucket: an increasing list ending at 1 that maps a uniform random
        # number to a bucket.
        buckets_scale = []
        running_size = 0
        for size in bucket_sizes:
            running_size += size
            buckets_scale.append(running_size / total_size)

        # This is the training loop.
        print("Start training...")
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_losses = []
        while True:
            # Pick the first bucket whose cumulative share exceeds a random
            # number drawn from [0, 1).
            draw = np.random.random_sample()
            bucket_id = min(i for i, upper in enumerate(buckets_scale) if upper > draw)

            # Get a batch and make a step.
            start_time = time.time()
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                train_set, bucket_id)
            _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                         target_weights, bucket_id, False)
            step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
            loss += step_loss / FLAGS.steps_per_checkpoint
            current_step += 1

            # Only every steps_per_checkpoint-th step reports and saves.
            if current_step % FLAGS.steps_per_checkpoint != 0:
                continue

            # Print statistics for the previous epoch.
            if loss < 300:
                perplexity = math.exp(loss)
            else:
                perplexity = float('inf')
            print("global step %d learning rate %.4f step-time %.2f perplexity "
                  "%.2f" % (model.global_step.eval(), model.learning_rate.eval(),
                            step_time, perplexity))

            # Decrease learning rate if no improvement was seen over last 3
            # times.
            if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
                sess.run(model.learning_rate_decay_op)
            previous_losses.append(loss)

            # Save checkpoint and zero timer and loss.
            checkpoint_path = os.path.join(FLAGS.train_dir, "translate.ckpt")
            model.saver.save(sess, checkpoint_path, global_step=model.global_step)
            step_time, loss = 0.0, 0.0

            # Run evals on development set and print their perplexity.
            for bucket_id in xrange(len(_buckets)):
                if len(dev_set[bucket_id]) == 0:
                    print(" eval: empty bucket %d" % (bucket_id))
                    continue
                encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                    dev_set, bucket_id)
                _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                             target_weights, bucket_id, True)
                if eval_loss < 300:
                    eval_ppx = math.exp(eval_loss)
                else:
                    eval_ppx = float('inf')
                print(" eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx))
            sys.stdout.flush()
def __init__(self, alpha, batch_size, n_epochs, wordVecLen, flag_dropout, datapath, random_seed,
             dropoutRates, optimizer, dispFreq, beam_size, flag_random_lookup_table,
             flag_toy_data, size_hidden_layer, dataset, result_path, sentence_modeling,
             CNN_filter_length, LSTM_go_backwards):
    """Load the data, build the model and run the whole training loop.

    All constructor arguments are captured into ``model_options`` (via
    ``locals()``) and handed to ``build_model``. After every epoch the model
    is scored on the validation and test splits with ``self.eva`` and the
    results are written with ``self.save_result`` under ``result_path``.

    Arguments read directly by this method:
        alpha: value passed to ``f_update`` after every minibatch.
        batch_size: minibatch size for training, validation and test.
        n_epochs: number of passes over the training data.
        datapath: passed to ``load_data``.
        random_seed: seeds the ``np.random.RandomState`` stored as ``rng``.
        optimizer: 'sgd' or 'rmsprop'; any other value selects adadelta.
        dispFreq: progress is printed every ``dispFreq`` updates.
        flag_toy_data: ``True`` keeps the first 10 items of every split; any
            other value not equal to ``False`` is used as the fraction of
            each split to keep.
        dataset, result_path: used to build the result file names.
    The remaining arguments are only forwarded through ``model_options``.

    Training stops early, after printing 'NaN detected', if the cost becomes
    NaN or infinite.
    """
    # Must run before any other local is bound so that it captures exactly
    # the constructor arguments (plus ``self``).
    model_options = locals().copy()
    model_options['rng'] = np.random.RandomState(random_seed)

    print('Loading data')
    (src_train, src_valid, src_test, dic_w2idx, dic_idx2w, dic_w2embed,
     dic_idx2embed, embedding) = load_data(path=datapath)

    # The flag may be a bool or a numeric fraction, so the comparisons are
    # deliberately by value rather than by identity.
    if flag_toy_data == True:  # noqa: E712
        src_valid = src_valid[:10]
        src_test = src_test[:10]
        src_train = src_train[:10]
    elif flag_toy_data != False:  # noqa: E712
        valid_l = len(src_valid) * flag_toy_data
        test_l = len(src_test) * flag_toy_data
        train_l = len(src_train) * flag_toy_data
        src_valid = src_valid[:int(valid_l)]
        src_test = src_test[:int(test_l)]
        src_train = src_train[:int(train_l)]

    train, pairdict_train = prepare_data(src_train)
    valid, pairdict_valid = prepare_data(src_valid)
    test, pairdict_test = prepare_data(src_test)

    model_options['embedding'] = embedding
    (sentence1, sentence1_mask, sentence2, sentence2_mask, y, cost, f_pred,
     tparams, f_debug) = build_model(model_options)

    # Gradients of the cost passed through grad_clip with bounds [-2, 2].
    grads = tensor.grad(theano.gradient.grad_clip(cost, -2.0, 2.0), wrt=tparams)

    lr = tensor.scalar(name='lr')
    # Rebinds the ``optimizer`` argument from its name to the update builder.
    if model_options['optimizer'] == 'sgd':
        optimizer = sgd
    elif model_options['optimizer'] == 'rmsprop':
        optimizer = rmsprop
    else:
        optimizer = adadelta
    f_grad_shared, f_update = optimizer(lr, tparams, grads, sentence1, sentence1_mask,
                                        sentence2, sentence2_mask, y, cost)

    print('Optimization')
    kf_valid = get_minibatches_idx(len(valid), model_options['batch_size'])
    kf_test = get_minibatches_idx(len(test), model_options['batch_size'])
    print("%d train examples" % len(train))
    print("%d valid examples" % len(valid))
    print("%d test examples" % len(test))
    sys.stdout.flush()

    def report_progress(epoch, uidx, cost, samples_seen):
        # Space-joined items reproduce the former multi-item print statement
        # under both Python 2 and Python 3.
        items = ('Epoch ', epoch, 'Update ', uidx, 'Cost ', cost,
                 'Samples_seen ', samples_seen)
        print(' '.join(str(item) for item in items))
        sys.stdout.flush()

    uidx = 0  # the number of updates done
    for epoch in xrange(model_options['n_epochs']):
        print('Training on %d epoch' % epoch)
        sys.stdout.flush()
        kf = get_minibatches_idx(len(train), batch_size, shuffle=True)
        start_time = time.time()
        samples_seen = 0
        for _, train_index in kf:
            uidx += 1
            batch_samples = [train[t] for t in train_index]
            samples_seen += len(batch_samples)
            sentence1, sentence1_mask, sentence2, sentence2_mask, y = data_padding(batch_samples)
            cost = f_grad_shared(sentence1, sentence1_mask, sentence2, sentence2_mask, y)
            f_update(model_options['alpha'])
            if np.isnan(cost) or np.isinf(cost):
                print('NaN detected')
                # __init__ must return None; returning a tuple here raised
                # TypeError instead of simply aborting training.
                return
            if np.mod(uidx, dispFreq) == 0:
                report_progress(epoch, uidx, cost, samples_seen)
        report_progress(epoch, uidx, cost, samples_seen)

        print('Valid_score:')
        top1_res = self.eva(f_pred, src_valid, valid, pairdict_valid, kf_valid, model_options)
        self.save_result(model_options['result_path'] + 'dev.on.' + str(epoch)
                         + 'th_epoch_' + model_options['dataset'], top1_res)
        sys.stdout.flush()

        print('Test_score:')
        top1_res = self.eva(f_pred, src_test, test, pairdict_test, kf_test, model_options)
        self.save_result(model_options['result_path'] + 'test.on.' + str(epoch)
                         + 'th_epoch_' + model_options['dataset'], top1_res)
        sys.stdout.flush()

        print('%d epoch completed.' % epoch)
        sys.stdout.flush()

        end_time = time.time()
        minu = int((end_time - start_time) / 60)
        sec = (end_time - start_time) - 60 * minu
        print('Time: %d min %.2f sec' % (minu, sec))
        sys.stdout.flush()

    print('Training completed!')
    sys.stdout.flush()