def train(FLAGS):

    # Load the data
    en_token_ids, en_seq_lens, en_vocab_dict, en_rev_vocab_dict = \
        process_data('data/en.p', max_vocab_size=5000, target_lang=False)
    sp_token_ids, sp_seq_lens, sp_vocab_dict, sp_rev_vocab_dict = \
        process_data('data/sp.p', max_vocab_size=5000, target_lang=True)

    # Split into train and validation sets
    train_encoder_inputs, train_decoder_inputs, train_targets, \
        train_en_seq_lens, train_sp_seq_len, \
        valid_encoder_inputs, valid_decoder_inputs, valid_targets, \
        valid_en_seq_lens, valid_sp_seq_len = \
        split_data(en_token_ids, sp_token_ids, en_seq_lens, sp_seq_lens,
                   train_ratio=0.8)

    # Update parameters
    FLAGS.en_vocab_size = len(en_vocab_dict)
    FLAGS.sp_vocab_size = len(sp_vocab_dict)

    # Start session
    with tf.Session() as sess:

        # Create new model or load old one
        model = create_model(sess, FLAGS)

        # Training begins
        losses = []
        for epoch_num, epoch in enumerate(
                generate_epoch(train_encoder_inputs, train_decoder_inputs,
                               train_targets, train_en_seq_lens,
                               train_sp_seq_len, FLAGS.num_epochs,
                               FLAGS.batch_size)):

            print "EPOCH: %i" % (epoch_num)

            # Decay learning rate
            sess.run(tf.assign(model.lr, FLAGS.learning_rate *
                               (FLAGS.learning_rate_decay_factor ** epoch_num)))

            batch_loss = []
            for batch_num, (batch_encoder_inputs, batch_decoder_inputs,
                            batch_targets, batch_en_seq_lens,
                            batch_sp_seq_lens) in enumerate(epoch):
                loss, _ = model.step(sess, FLAGS, batch_encoder_inputs,
                                     batch_decoder_inputs, batch_targets,
                                     batch_en_seq_lens, batch_sp_seq_lens,
                                     FLAGS.dropout)
                batch_loss.append(loss)

            losses.append(np.mean(batch_loss))

    plt.plot(losses, label='loss')
    plt.legend()
    plt.show()
def sample(FLAGS):

    # Load the data needed to convert your sentence
    en_token_ids, en_seq_lens, en_vocab_dict, en_rev_vocab_dict = \
        process_data('data/en.p', max_vocab_size=5000, target_lang=False)
    sp_token_ids, sp_seq_lens, sp_vocab_dict, sp_rev_vocab_dict = \
        process_data('data/sp.p', max_vocab_size=5000, target_lang=True)

    # Change FLAGS parameters
    FLAGS.batch_size = 1
    FLAGS.en_vocab_size = len(en_vocab_dict)
    FLAGS.sp_vocab_size = len(sp_vocab_dict)
    FLAGS.sp_max_len = max(sp_seq_lens) + 1  # GO token

    # Process sample sentence
    inference_sentence = ["I like to play tennis and eat sandwiches."]

    # Split into tokens
    tokenized = []
    for i in xrange(len(inference_sentence)):
        tokenized.append(basic_tokenizer(inference_sentence[i]))

    # Convert data to token ids
    data_as_tokens, sample_en_seq_lens = data_to_token_ids(
        tokenized, en_vocab_dict, target_lang=False, normalize_digits=True)

    # Make dummy_sp_inputs
    dummy_sp_inputs = np.array([[GO_ID] * FLAGS.sp_max_len])
    sample_sp_seq_lens = np.array([len(dummy_sp_inputs)])

    print data_as_tokens
    print sample_en_seq_lens
    print dummy_sp_inputs
    print sample_sp_seq_lens

    with tf.Session() as sess:

        # Load trained model
        model = create_model(sess, FLAGS, forward_only=True)

        y_pred = model.step(sess, FLAGS,
                            batch_encoder_inputs=data_as_tokens,
                            batch_decoder_inputs=dummy_sp_inputs,
                            batch_targets=None,
                            batch_en_seq_lens=sample_en_seq_lens,
                            batch_sp_seq_lens=sample_sp_seq_lens,
                            dropout=0.0, forward_only=True, sampling=True)

        # Compose the predicted sp sentence
        sp_sentence = []
        for idx in y_pred[0]:
            sp_sentence.append(sp_rev_vocab_dict[idx])
        print " ".join([word for word in sp_sentence])
def __init__(self):
    self.df = load_oxford_data()
    self.min_date = self.df["Date"].unique().min()
    self.max_date = self.df["Date"].unique().max()
    self.num_date = self.df["Date"].nunique()

    # Merge with Hopkins data
    hopkins = Hopkins()
    hdf = hopkins.data
    hdf_max_date = hdf['Date'].max()
    self.df = self.df[self.df['Date'] <= hdf_max_date]
    self.df = self.df.merge(hdf, on=['Date', 'Country_Code'], how='left')
    print("- merged Oxford and Hopkins", self.df.shape)

    self.min_date = self.df["Date"].unique().min()
    self.max_date = self.df["Date"].unique().max()
    self.num_date = self.df["Date"].nunique()
    print("- dates from {} to {} total {} days".format(
        self.min_date, self.max_date, self.num_date))

    self.mdf = self.df[[
        "Country_Code", "CountryName", "Date", "DateTime",
        "ConfirmedCases", "ConfirmedDeaths", "Recovered", "StringencyIndex"
    ]].copy()
    self.mdf = process_data(self.mdf, self.df)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--mode", choices={"train", "chat"}, default="train",
                        help="mode. if not specified, it's in the train mode")
    args = parser.parse_args()

    if not os.path.exists(os.path.join(config.DATA_PATH, "test_ids.dec")):
        data_utils.process_data()
    print("Data ready!")

    # Create checkpoints folder if there isn't one already
    data_utils.make_dir(config.CPT_PATH)

    if args.mode == "train":
        train()
    elif args.mode == "chat":
        chat()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices={'train', 'chat'}, default='train',
                        help="mode. if not specified, it's in the train mode")
    args = parser.parse_args()

    if not os.path.isdir(config.PROCESSED_PATH):
        data_utils.prepare_raw_data()
        data_utils.process_data()
    print('Data ready!')

    data_utils.make_dir(config.CPT_PATH)

    if args.mode == 'train':
        train()
    elif args.mode == 'chat':
        chat()
def train(params):
    hindi_token_ids, hindi_seq_lens, hindi_vocab_dict, hindi_rev_vocab_dict = \
        process_data('../data/hindi_dump.p', max_vocab_size=100000, target_lang=False)
    bengali_token_ids, bengali_seq_lens, bengali_vocab_dict, bengali_rev_vocab_dict = \
        process_data('../data/bengali_dump.p', max_vocab_size=100000, target_lang=True)

    train_encoder_inputs, train_decoder_inputs, train_targets, \
        train_hindi_seq_lens, train_bengali_seq_len, \
        valid_encoder_inputs, valid_decoder_inputs, valid_targets, \
        valid_hindi_seq_lens, valid_bengali_seq_lens = \
        split_data(hindi_token_ids, bengali_token_ids,
                   hindi_seq_lens, bengali_seq_lens, train_ratio=0.8)

    params.hindi_vocab_size = len(hindi_vocab_dict)
    params.bengali_vocab_size = len(bengali_vocab_dict)
    print params.hindi_vocab_size, params.bengali_vocab_size

    with tf.Session() as sess:
        _model = model(params)
        sess.run(tf.global_variables_initializer())

        losses = []
        accs = []
        for epoch_num, epoch in enumerate(
                generate_epoch(train_encoder_inputs, train_decoder_inputs,
                               train_targets, train_hindi_seq_lens,
                               train_bengali_seq_len, params.num_epochs,
                               params.batch_size)):
            print "EPOCH : ", epoch_num

            sess.run(tf.assign(_model.lr, 0.01 * (0.99 ** epoch_num)))

            batch_loss = []
            batch_acc = []
            for batch_num, (batch_encoder_inputs, batch_decoder_inputs,
                            batch_targets, batch_hindi_seq_lens,
                            batch_bengali_seq_lens) in enumerate(epoch):
                loss, _, acc = _model.step(sess, params, batch_encoder_inputs,
                                           batch_decoder_inputs, batch_targets,
                                           batch_hindi_seq_lens,
                                           batch_bengali_seq_lens, params.dropout)
                batch_loss.append(loss)
                batch_acc.append(acc)

            losses.append(np.mean(batch_loss))
            accs.append(np.mean(batch_acc))
            print "Training Loss: ", losses[-1]
            print "Training Accuracy", accs[-1]

        plt.plot(losses, label='loss')
        plt.legend()
        # plt.show()
        plt.title('Plot for Training Error versus Epochs', fontsize='20', style='oblique')
        plt.xlabel('Epochs', fontsize='16', color='green')
        plt.ylabel('Training Error', fontsize='16', color='green')
        plt.savefig('../output/plot.png')
        plt.show()

        acc = _model.test(sess, params, valid_encoder_inputs, valid_decoder_inputs,
                          valid_targets, valid_hindi_seq_lens,
                          valid_bengali_seq_lens, params.dropout)
        print acc
def read_h5_file(h5_path):
    # Open read-only; newer h5py versions require an explicit mode.
    h5_file = h5py.File(h5_path, 'r')
    y = np.array(h5_file['annotations'])
    X = np.array(h5_file['features'])
    X, y = data_utils.process_data(X, y)
    h5_file.close()
    '''
    X = non_maximal_suppression(X, range(34, 50), supp_th=0.5)
    X = non_maximal_suppression(X, range(15, 17), supp_th=(0.01*180)/np.pi)
    X = non_maximal_suppression(X, range(56, 57), supp_th=1)  # 1 degree
    '''
    return X, y
def __init__(self, data_path, max_length, max_vocab_size, min_freq, eos_token,
             pad_token, unk_token, embed_dim, special_tokens, threshold,
             pre_trained=False):
    """
    Args:
        data_path (str): path to data file
        max_length (int): maximum length of each sentence, including <eos>
        max_vocab_size (int): maximum number of words allowed in vocabulary
        min_freq (int): minimum frequency to add word to vocabulary
        eos_token (str): end of sentence token (tells decoder to start or stop)
        pad_token (str): padding token
        unk_token (str): unknown word token
        embed_dim (int): dimension of embedding vectors
        special_tokens (list of str): other tokens to add to vocabulary
        threshold (int): count of unknown words required to prune sentence
        pre_trained (Vector): pre-trained word embeddings
    """
    special_tokens = [pad_token, unk_token, eos_token] + special_tokens
    # the value 0 will be regarded as padding
    assert special_tokens[0] == pad_token

    inputs, targets, counter, xlen = process_data(data_path, max_length,
                                                  eos_token, pad_token)
    self.vocab = vocab.Vocab(counter=counter, max_size=max_vocab_size,
                             min_freq=min_freq, specials=special_tokens)
    if pre_trained is not False:
        self.vocab.load_vectors(pre_trained)

    assert len(inputs) == len(targets) and len(inputs) == len(xlen)

    self.nwords = len(self.vocab)
    self.max_len = max_length
    self.eos_idx = self.vocab.stoi[eos_token]
    self.pad_idx = self.vocab.stoi[pad_token]
    self.unk_idx = self.vocab.stoi[unk_token]
    self.eos_token = eos_token
    self.pad_token = pad_token
    self.unk_token = unk_token
    self.embed_dim = embed_dim
    self.unk_count = 0      # number of unknown words in dataset
    self.total_tokens = 0   # number of tokens in dataset, not counting padding
    self.special_tokens = special_tokens
    self.x_lens = xlen

    self.x_data = np.zeros((len(inputs), max_length), dtype=np.int32)
    self.y_data = np.zeros((len(targets), max_length), dtype=np.int32)
    convert_to_index(inputs, self, self.x_data)
    convert_to_index(targets, self, self.y_data)

    self.x_data, self.y_data, self.x_lens = prune_data(
        self.x_data, self.y_data, self.x_lens, self, threshold)
    self.x_data = torch.from_numpy(self.x_data)
    self.y_data = torch.from_numpy(self.y_data)
def run(args):
    files = [INPUT_DIRECTORY + '/' + f for f in listdir(INPUT_DIRECTORY)
             if isfile(join(INPUT_DIRECTORY, f))]
    files.sort()

    if not os.path.exists(OUTPUT_DIRECTORY) or not os.path.isdir(OUTPUT_DIRECTORY):
        os.mkdir(OUTPUT_DIRECTORY)

    latex_output = ''
    for filename in files:
        latex_part = "\\paragraph{"
        latex_part += filename.split('/')[-1].split('.pdf.txt.txt')[0]
        latex_part += "}\n\\begin{enumerate}\n"

        f = open(filename, "r", encoding='utf-8', errors='ignore')
        sentences = f.readlines()
        sentences = [sentence.replace('\n', '') for sentence in sentences]

        for question in QUESTIONS:
            latex_part += "\\item " + question + "\\\\\n"
            latex_part += "$\\longrightarrow$ "
            reset_dict()
            testS, testQ, testA = process_data(sentences, question)
            answer, answer_probability, mem_probs = get_pred(testS, testQ)
            memory_probabilities = np.round(mem_probs, 4)

            best_sentence_index = 0
            best_sentence_score = 0
            # print(len(memory_probabilities.tolist()))
            for index, mem in enumerate(memory_probabilities.tolist()):
                if mem[2] > best_sentence_score:
                    best_sentence_index = index
                    best_sentence_score = mem[2]

            words_l = []
            for idw in testS[0][best_sentence_index]:
                if idw == 0:
                    break
                words_l.append(decode(idw))
            sentence = ' '.join(words_l)
            # str.replace returns a new string, so the result must be reassigned
            sentence = sentence.replace('%', '\\%')
            sentence = sentence.replace('_', '\\_')
            latex_part += sentence + "\n"

        latex_part += "\\end{enumerate}"
        latex_output += "\n" + latex_part

    f = open(join(OUTPUT_DIRECTORY, 'latex_out.txt'), 'w')
    f.write(latex_output)
def create_most_informative_hist(gest_seq_h5_filepath, openface_dir, cpm_dir,
                                 user, gesture):
    gest_seq_h5 = h5py.File(gest_seq_h5_filepath, 'r')
    per_user_var_stats, per_user_max_vel_stats = {}, {}
    final_feat_names = None

    for g in range(10):
        for target_group in ['train', 'test']:
            for target_user in gest_seq_h5[target_group].keys():
                if len(gest_seq_h5[target_group][target_user][str(g)].shape) <= 1:
                    continue
                gest_seq = np.array(gest_seq_h5[target_group][target_user][str(g)])

                target_openface_h5_path = os.path.join(openface_dir, target_user)
                target_cpm_h5_path = os.path.join(cpm_dir, target_user)
                X, y, cpm_X = data_utils.get_all_features(
                    target_openface_h5_path, target_cpm_h5_path)
                X, _ = data_utils.process_data(X, y, cpm_X)
                X_filt, feat_names = filter_features(X)
                if final_feat_names is None:
                    final_feat_names = feat_names

                var_stats, max_vel_stats = calculate_feat_variance(
                    X_filt, feat_names, gest_seq)

                if per_user_var_stats.get(str(g)) is None:
                    per_user_var_stats[str(g)] = {}
                    per_user_max_vel_stats[str(g)] = {}
                per_user_var_stats[str(g)][target_user] = var_stats
                per_user_max_vel_stats[str(g)][target_user] = max_vel_stats

    gest_seq_h5.close()

    create_feature_gesture_distribution(per_user_var_stats, final_feat_names)

    # Plot the histogram for a given gesture
    sorted_filenames = sorted(per_user_var_stats[gesture].keys())
    start_idx = 4 * PLOT_WIDTH * PLOT_HEIGHT
    plot_histograms(per_user_var_stats[gesture], feat_names,
                    sorted_filenames[start_idx:start_idx + PLOT_WIDTH * PLOT_HEIGHT])
#!/usr/bin/env python

from argparse import ArgumentParser

from data_utils import process_data
from keras.models import load_model

if __name__ == '__main__':
    arg_parser = ArgumentParser(description='load and evaluate model')
    arg_parser.add_argument('--train', help='HDF5 file with training data')
    arg_parser.add_argument('--test', help='HDF5 file with test data')
    arg_parser.add_argument('model_file', help='HDF5 file containing the model')
    arg_parser.add_argument('--verbose', type=int, default=1,
                            help='verbosity level of evaluation')
    options = arg_parser.parse_args()

    model = load_model(options.model_file)
    if options.train:
        x_train, y_train = process_data(options.train)
        loss, accuracy = model.evaluate(x_train, y_train,
                                        verbose=options.verbose)
        print(f'training: loss = {loss:.4f}, accuracy = {accuracy:.4f}')
    if options.test:
        x_test, y_test = process_data(options.test)
        loss, accuracy = model.evaluate(x_test, y_test,
                                        verbose=options.verbose)
        print(f'test: loss = {loss:.4f}, accuracy = {accuracy:.4f}')
""" Setup scripts for downloading AS data and pre-processing for analysis """ import data_utils as dutil if __name__ == '__main__': # print("Downloading data from AWS") # download_data() print('Processing data') dutil.process_data()
def train(create_data, log_file):
    """Trains an English to simple English translation model.

    Args:
        create_data: whether to load data from databases or not on startup.
        log_file: where to store training data outputs.
    """
    if os.path.isfile('./' + log_file):
        raise ValueError('log file already exists')

    if create_data:
        data_utils.process_data()

    with tf.Session() as sess, open(log_file, 'w+') as log:
        print 'Opening log file'
        fields = [
            'step', 'step-time', 'batch-loss', 'batch-perplexity',
            'learnrate', 'val-loss'
        ]
        log.write(','.join(fields) + '\n')

        print 'creating model'
        model = create_model(sess, False)

        print 'reading data'
        train, valid = read_data(dc.NORMAL_IDS_PATH, dc.SIMPLE_IDS_PATH)

        print 'entering training loop'
        step_time, loss = 0.0, 0.0
        current_step = 0
        prev_losses = []

        # Training loop
        while current_step < dc.NUM_STEPS or dc.IGNORE_STEPS:
            start_time = time.time()
            encoder_in, decoder_in, target_weights = model.get_batch(train)
            step_loss, _ = model.step(sess, encoder_in, decoder_in,
                                      target_weights, False)
            step_time += (time.time() - start_time) / dc.STEPS_PER_CHECKPOINT
            loss += step_loss / dc.STEPS_PER_CHECKPOINT
            current_step += 1

            if current_step < dc.STEPS_PER_CHECKPOINT:
                print "Step: %f" % current_step
                print "Loss: %f" % step_loss
                print "Learning: %f" % model.learning_rate.eval()

            # Every some amount of steps, output stats and check validation loss
            if current_step % dc.STEPS_PER_CHECKPOINT == 0:
                learnrate = model.learning_rate.eval()
                perplex = math.exp(float(loss)) if loss < 300 else float("inf")
                step = model.global_step.eval()
                print "step %d loss %f plex: %f learnrate %f step-time: %f" % (
                    step, loss, perplex, learnrate, step_time)

                if len(prev_losses) > 2 and loss > max(
                        prev_losses[-1 * dc.DECAY_POINT:]):
                    sess.run(model.learning_rate_decay_op)
                prev_losses.append(loss)

                checkpoint_path = os.path.join(dc.CKPT_PATH, 'simplify.ckpt')
                model.saver.save(sess, checkpoint_path,
                                 global_step=model.global_step)

                encoder_in, decoder_in, target_weights = model.get_batch(valid)
                val_loss, outputs = model.step(sess, encoder_in, decoder_in,
                                               target_weights, True)
                if dc.DEBUG:
                    print "ENCODER LEN"
                    print len(encoder_in[0])
                    print "OUTPUT LENs"
                    print len(outputs)
                    print len(outputs[0])
                    print len(outputs[0][0])
                outputs = [
                    int(np.argmax(logit, axis=1)[0]) for logit in outputs
                ]
                if dc.DEBUG:
                    print outputs
                print "validation loss: %f" % val_loss

                fields = [step, step_time, loss, perplex, learnrate, val_loss]
                log.write(','.join(map(str, fields)) + '\n')
                step_time, loss = 0.0, 0.0
                sys.stdout.flush()
def save(self, step):
    self.saver.save(self.sess, self.config.ckpt_path + '.ckpt',
                    global_step=step)

def restore(self):
    # get checkpoint state
    ckpt = tf.train.get_checkpoint_state(self.config.ckpt_path)
    # restore session
    if ckpt and ckpt.model_checkpoint_path:
        self.saver.restore(self.sess, ckpt.model_checkpoint_path)

if __name__ == '__main__':
    K.set_learning_phase(1)
    graph = tf.Graph()
    sess = tf.Session()
    config = Config()
    model = Model(config, sess, graph)

    train_data, validation_data, test_data = du.process_data()
    batches = du.generate_train_batches(train_data, config.batch_size)
    batch = du.get_next_batch(batches)
    batch_images, batch_labels = map(list, zip(*batch))
    batch_images = np.array(batch_images)
    batch_labels = np.array(batch_labels)
    batch_images = batch_images.reshape(-1, config.image_size,
                                        config.image_size, config.channels)

    pred, loss = model.predict(batch_images, batch_labels)
    print(loss)
def predict(data_path='data/predict/', model_path='pretrained/model.h5',
            use_prophet=False):
    """ Main function to run the prediction """
    print('Loading and processing data ...')
    # get the processed dataframe
    df = process_data(data_path)
    # predict using only the last 14 days of the time series data
    df = get_last_nday(df, args.num_day)
    print('done!')

    gh_list = list(df.geohash6.unique())
    chunk_size = len(gh_list) // (args.num_thread - 1)
    gh_chunks = [
        gh_list[chunk_size * i:chunk_size * (i + 1)]
        for i in range(args.num_thread)
    ]
    gh_chunks = [ch for ch in gh_chunks if ch]
    pool = Pool(args.num_thread)

    if use_prophet:
        # use facebook's prophet to predict
        p_prophet_predict = partial(prophet_predict, df)
        predictions = []
        preds = pool.map(p_prophet_predict, gh_chunks)
        [predictions.extend(pred) for pred in preds]
        pred_df = pd.concat(predictions, ignore_index=True)
        pred_df.loc[:, 'demand'] = pred_df['demand'].clip_lower(0).clip_upper(1)
        return pred_df
    else:
        print('Predict using wavenet model...')
        # load model
        model = load_model(model_path)

        # extract features for all locations
        print('Extracting features ...')
        p_extract_feature = partial(extract_feature, df)
        features = []
        fts = pool.map(p_extract_feature, gh_chunks)
        [features.extend(ft) for ft in fts]
        print('done!')

        # split features into batches
        feature_chunks = [
            features[args.batch_size * i:args.batch_size * (i + 1)]
            for i in range(int(len(features) / args.batch_size) + 1)
        ]
        feature_chunks = [ch for ch in feature_chunks if ch]

        # run prediction using keras model
        print('Predicting ...')
        predictions = []
        for each_chunk in feature_chunks:
            print(len(each_chunk))
            batch = np.concatenate(each_chunk, axis=0)
            pred = model.predict(batch)
            pred = np.reshape(pred, (pred.shape[0], pred.shape[1]))
            predictions.extend(list(pred))
        print('done!')
        print(len(predictions))

        # create result dataframe from prediction results
        print('Constructing result dataframe ...')
        pred_df = []
        for i, pred in enumerate(predictions):
            res = build_result_dataframe(gh_list[i], pred, df)
            pred_df.append(res)
        pred_df = pd.concat(pred_df, ignore_index=True)
        print('done!')

        pred_df.loc[:, 'demand'] = pred_df['demand'].clip_lower(0).clip_upper(1)
        return pred_df
def main(_):
    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)

    embed_path = FLAGS.embed_path or pjoin(
        "data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))

    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)

    logging.info(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    # ========= Load Dataset =========
    # You can change this code to load dataset in your own way
    dev_dirname = os.path.dirname(os.path.abspath(FLAGS.dev_path))
    dev_filename = os.path.basename(FLAGS.dev_path)
    context_data, question_data, context_data_chars, question_data_chars, question_uuid_data = \
        prepare_dev(dev_dirname, dev_filename, vocab)

    context_data, context_lengths = process_data(context_data, FLAGS.output_size)
    question_data, question_lengths = process_data(question_data,
                                                   FLAGS.max_question_length)

    # TODO: use process_data()
    for question_token in question_data_chars:
        question_token.extend(
            [[qa_data.PAD_ID] * FLAGS.max_word_length] *
            (FLAGS.max_question_length - len(question_token)))
    for context_token in context_data_chars:
        context_token.extend([[qa_data.PAD_ID] * FLAGS.max_word_length] *
                             (FLAGS.output_size - len(context_token)))

    dataset = (context_data, context_lengths, question_data, question_lengths,
               context_data_chars, question_data_chars, question_uuid_data)

    # ========= Model-specific =========
    # You must change the following code to adjust to your model
    config = Config(FLAGS)
    encoder = Encoder(config)
    if FLAGS.model in ('baseline', 'baseline-v2', 'baseline-v3',
                       'baseline-v4', 'baseline-v5'):
        decoder = Decoder(config)
    else:
        decoder = HMNDecoder(config)
    mixer = Mixer(config)
    qa = QASystem(encoder, decoder, mixer, embed_path, config, FLAGS.model)

    with tf.Session() as sess:
        train_dir = get_normalized_train_dir(FLAGS.train_dir)
        initialize_model(sess, qa, train_dir)
        answers = generate_answers(sess, qa, dataset, rev_vocab)

        # write to json file to root dir
        with io.open('dev-prediction.json', 'w', encoding='utf-8') as f:
            f.write(unicode(json.dumps(answers, ensure_ascii=False)))
def train(FLAGS):

    # Load the data
    en_token_ids, en_seq_lens, en_vocab_dict, en_rev_vocab_dict = \
        process_data('data/tst2013.en', max_vocab_size=30000, target_lang=False)
    sp_token_ids, sp_seq_lens, sp_vocab_dict, sp_rev_vocab_dict = \
        process_data('data/tst2013.tr', max_vocab_size=30000, target_lang=True)

    # Split into train and validation sets
    train_encoder_inputs, train_decoder_inputs, train_targets, \
        train_en_seq_lens, train_sp_seq_len, \
        valid_encoder_inputs, valid_decoder_inputs, valid_targets, \
        valid_en_seq_lens, valid_sp_seq_len = \
        split_data(en_token_ids, sp_token_ids, en_seq_lens, sp_seq_lens,
                   train_ratio=0.8)

    output = open('data/vocab_en.pkl', 'wb')
    pickle.dump(en_vocab_dict, output)
    output.close()

    output = open('data/vocab_sp.pkl', 'wb')
    pickle.dump(sp_vocab_dict, output)
    output.close()

    # Update parameters
    FLAGS.en_vocab_size = len(en_vocab_dict)
    FLAGS.sp_vocab_size = len(sp_vocab_dict)
    print 'len(en_vocab_dict)', len(en_vocab_dict)
    print 'len(sp_vocab_dict)', len(sp_vocab_dict)

    # Start session
    with tf.Session() as sess:

        model = None

        # Create new model or load old one
        f = checkpoint_path + ".index"
        print f
        if os.path.isfile(f):
            model = restore_model(sess)
        else:
            model = create_model(sess, FLAGS)

        # Training begins
        losses = []
        for epoch_num, epoch in enumerate(
                generate_epoch(train_encoder_inputs, train_decoder_inputs,
                               train_targets, train_en_seq_lens,
                               train_sp_seq_len, FLAGS.num_epochs,
                               FLAGS.batch_size)):

            print "EPOCH: %i" % (epoch_num)

            # Decay learning rate
            sess.run(tf.assign(model.lr, FLAGS.learning_rate *
                               (FLAGS.learning_rate_decay_factor ** epoch_num)))

            batch_loss = []
            for batch_num, (batch_encoder_inputs, batch_decoder_inputs,
                            batch_targets, batch_en_seq_lens,
                            batch_sp_seq_lens) in enumerate(epoch):
                loss, _ = model.step(sess, FLAGS, batch_encoder_inputs,
                                     batch_decoder_inputs, batch_targets,
                                     batch_en_seq_lens, batch_sp_seq_lens,
                                     FLAGS.dropout)
                print loss
                batch_loss.append(loss)
            print 'mean: ', np.mean(batch_loss)

            print "Saving the model."
            model.saver.save(sess, checkpoint_path)
def create_data_augmentation_2(
        fdir, gest_seq_h5, new_filepath,
        win_sizes=[16, 32, 64],
        labels=[6, 7, 8, 9, 10],
        aug_type=OpenfaceAugmentationType.LANDMARKS_ONLY):
    h5_f = h5py.File(gest_seq_h5, 'r')
    h5_train = h5_f['train']
    new_h5 = h5py.File(new_filepath, 'w')

    all_aug_map = {}
    count = 0
    for f in h5_train.keys():
        all_aug_map[f] = {}
        v = h5_train[f]
        fpath = fdir + '/' + f
        currf_h5 = h5py.File(fpath, 'r')

        num_augmentations = 16
        if aug_type.aug_type == OpenfaceAugmentationType.LANDMARKS_ONLY:
            X = data_utils.trim_extra_landmarks(currf_h5['features'])
        elif aug_type.aug_type == OpenfaceAugmentationType.LANDMARKS_AND_VELOCITY:
            X, _ = data_utils.process_data(currf_h5['features'],
                                           currf_h5['annotations'])
        elif aug_type.aug_type == OpenfaceAugmentationType.ALL_LANDMARKS_AND_POSE:
            X = np.array(currf_h5['features'])
            X = X[:, :148]
            num_augmentations = 16
        else:
            assert (False)

        for i in labels:
            str_i = str(i)
            gest_seq = v[str(i)]
            all_aug_map[f][str_i] = {}
            for seq in gest_seq:
                if type(seq) != type(np.array([])):
                    continue
                gest_len = seq[1] - seq[0]
                gest_start = seq[0] + (gest_len // 5)
                gest_end = seq[1] - (gest_len // 5)

                for t in range(gest_start, gest_end + 1, WIN_STEP):
                    str_t = str(t)
                    all_aug_map[f][str_i][str_t] = {}
                    for win_size in win_sizes:
                        seq_augmentation = get_all_seq_augmentations_4(
                            X, t, win_size,
                            num_augmentations=num_augmentations)
                        all_aug_map[f][str_i][str_t][str(win_size)] = seq_augmentation

                        count = count + 1
                        if count % 300 == 0:
                            print('Did get seq augmentation for file: {}, label: {}, ' \
                                  't: {}, win_size: {}'.format(f, i, t, win_size))

        print('Did process file {}'.format(f))
        data_utils.recursively_save_dict_contents_to_group(
            new_h5, str('/' + f + '/'), all_aug_map[f])
        print('Did write {} augmentations'.format(f))
        new_h5.flush()
        all_aug_map[f] = {}

    new_h5.flush()
    new_h5.close()
def train(FLAGS):

    # Load the data
    en_token_ids, en_seq_lens, en_vocab_dict, en_rev_vocab_dict = \
        process_data('data/en.p', max_vocab_size=5000, target_lang=False)
    sp_token_ids, sp_seq_lens, sp_vocab_dict, sp_rev_vocab_dict = \
        process_data('data/sp.p', max_vocab_size=5000, target_lang=True)

    # Split into train and validation sets
    train_encoder_inputs, train_decoder_inputs, train_targets, \
        train_en_seq_lens, train_sp_seq_len, \
        valid_encoder_inputs, valid_decoder_inputs, valid_targets, \
        valid_en_seq_lens, valid_sp_seq_len = \
        split_data(en_token_ids, sp_token_ids, en_seq_lens, sp_seq_lens,
                   train_ratio=0.8)

    # Update parameters
    FLAGS.en_vocab_size = len(en_vocab_dict)
    FLAGS.sp_vocab_size = len(sp_vocab_dict)
    FLAGS.sp_max_len = max(sp_seq_lens) + 1  # GO token

    # Start session
    with tf.Session() as sess:

        # Create new model or load old one
        model = create_model(sess, FLAGS, forward_only=False)

        # Training begins
        train_losses = []
        valid_losses = []
        for epoch_num, epoch in enumerate(
                generate_epoch(train_encoder_inputs, train_decoder_inputs,
                               train_targets, train_en_seq_lens,
                               train_sp_seq_len, FLAGS.num_epochs,
                               FLAGS.batch_size)):

            print "EPOCH: %i" % (epoch_num)

            # Decay learning rate
            sess.run(tf.assign(model.lr, FLAGS.learning_rate *
                               (FLAGS.learning_rate_decay_factor ** epoch_num)))

            batch_loss = []
            for batch_num, (batch_encoder_inputs, batch_decoder_inputs,
                            batch_targets, batch_en_seq_lens,
                            batch_sp_seq_lens) in enumerate(epoch):
                y_pred, loss, _ = model.step(sess, FLAGS, batch_encoder_inputs,
                                             batch_decoder_inputs, batch_targets,
                                             batch_en_seq_lens, batch_sp_seq_lens,
                                             FLAGS.dropout, forward_only=False)
                batch_loss.append(loss)
            train_losses.append(np.mean(batch_loss))

            for valid_epoch_num, valid_epoch in enumerate(
                    generate_epoch(valid_encoder_inputs, valid_decoder_inputs,
                                   valid_targets, valid_en_seq_lens,
                                   valid_sp_seq_len, num_epochs=1,
                                   batch_size=FLAGS.batch_size)):

                batch_loss = []
                for batch_num, (batch_encoder_inputs, batch_decoder_inputs,
                                batch_targets, batch_en_seq_lens,
                                batch_sp_seq_lens) in enumerate(valid_epoch):
                    loss = model.step(sess, FLAGS, batch_encoder_inputs,
                                      batch_decoder_inputs, batch_targets,
                                      batch_en_seq_lens, batch_sp_seq_lens,
                                      dropout=0.0, forward_only=True,
                                      sampling=False)
                    batch_loss.append(loss)
                valid_losses.append(np.mean(batch_loss))

            # Save checkpoint.
            if not os.path.isdir(FLAGS.ckpt_dir):
                os.makedirs(FLAGS.ckpt_dir)
            checkpoint_path = os.path.join(FLAGS.ckpt_dir, "model.ckpt")
            print "Saving the model."
            model.saver.save(sess, checkpoint_path, global_step=model.global_step)

    plt.plot(train_losses, label='train_loss')
    plt.plot(valid_losses, label='valid_loss')
    plt.legend()
    plt.show()
dir_results = "./results/" if not os.path.isdir(dir_trained): os.mkdir(dir_trained) if not os.path.isdir(dir_results): os.mkdir(dir_results) config = Config() config.set_params_parser() data, idxs = read_data(use_loaded=True) idxs_train, idxs_dev, idxs_test = idxs X, y, emb, tokenizer, label_encoder = preprocess_data(data=data, use_loaded=True) X_sup_train, y_sup_train = process_data(get_supplementation( data.iloc[idxs_train], 'train', use_loaded=True), tokenizer, label_encoder, max_len_seq=35) X_sup_dev, y_sup_dev = process_data(get_supplementation(data.iloc[idxs_dev], 'dev', use_loaded=True), tokenizer, label_encoder, max_len_seq=35) X_eec, y_eec, idxs_identity = read_eec(tokenizer, label_encoder) debias_weights = read_weights() data_official_test = read_official_test() X_official_test, _ = process_data(data_official_test, tokenizer, label_encoder) acc_dev_list, auc_dev_list, acc_test_list, auc_test_list = [], [], [], []
def _main(args):
    data_path = os.path.expanduser(args.data_path)
    classes_path = os.path.expanduser(args.classes_path)
    anchors_path = os.path.expanduser(args.anchors_path)
    result_path = os.path.expanduser(args.result_path)
    test_path = os.path.expanduser(args.test_path)
    model_prefix = os.path.expanduser(args.model_prefix)
    num_frozen = int(args.num_frozen)
    num_trials = int(args.num_trials)
    num_epochs = int(args.num_epochs)
    shuffle_input = bool(int(args.shuffle))

    class_names = get_classes(classes_path)

    # Custom data saved as a numpy file.
    # Has 2 arrays: an object array 'boxes' (variable length of boxes in each image)
    # and an array of images 'images'.
    data = np.load(data_path)

    anchors = get_anchors(anchors_path)
    anchors = YOLO_ANCHORS

    for trial in range(num_trials):
        # Reprocess data to populate image_data_gen. Sacrifice latency for memory.
        image_data_gen, boxes = data_utils.process_data(
            iter(data['images']), data['images'].shape[2],
            data['images'].shape[1], data['boxes'], dim=608)

        detectors_mask, matching_true_boxes = get_detector_mask(boxes, anchors)

        model_name = model_prefix + "-" + str(num_frozen) + "fr-trial" + str(trial)
        print "Training model:", model_name

        train(class_names, anchors, image_data_gen, boxes, detectors_mask,
              matching_true_boxes, model_name, num_frozen, num_epochs,
              shuffle_input=shuffle_input)

        if test_path != "" and result_path != "":
            mAP, precision, recalls = run_inference(
                model_name + ".h5",
                anchors,
                classes_path,
                test_path,
                None,   # output_path
                1,      # mode
                0.5,    # score_threshold
                0.5,    # iou_threshold
                0.5)    # mAP_iou_threshold

            with open(result_path, "a+") as f:
                line = "%d,%d,%.6g,%.6g,%.6g,%d,%s\n" % (
                    trial, num_frozen, mAP, np.average(precision),
                    np.average(recalls), num_epochs, model_name + ".h5")
                f.write(line)
# Import necessary libraries
import string
import math
from collections import Counter
# To find different characters between two sentences
from difflib import SequenceMatcher
import json

import data_utils as dt
import matplotlib.pyplot as plt
import nltk

# input: an essay with plain_text and markup
# output: a list of similar word_choice errors with the number of occurrences of each,
#         the related words for that error, and the indices (in markup) of that error

# Getting data
data = dt.process_data('Data/tai-documents-v3/tai-documents-v3.json')


def word_choice(input):
    output = []
    check = False
    for i in range(len(input['markup'])):
        error = input['markup'][i]
        check = False
        if error['type'] == 'word choice':
            # Check if we have seen this error before; update the error count
            for item in output:
                if ((error['old_text'] in item['words']) or
                        (error['new_text'] in item['words'])):
                    check = True  # Mark this error as already seen
                    item['index'].append(i)
                    item['words'].add(error['old_text'])
def train(FLAGS):

    # Load the data
    en_token_ids, en_seq_lens, en_vocab_dict, en_rev_vocab_dict = \
        process_data('data/my_en.txt', max_vocab_size=5000, target_lang=False)
    sp_token_ids, sp_seq_lens, sp_vocab_dict, sp_rev_vocab_dict = \
        process_data('data/my_sp.txt', max_vocab_size=5000, target_lang=True)

    # Split into train and validation sets
    train_encoder_inputs, train_decoder_inputs, train_targets, \
        train_en_seq_lens, train_sp_seq_len, \
        valid_encoder_inputs, valid_decoder_inputs, valid_targets, \
        valid_en_seq_lens, valid_sp_seq_len = \
        split_data(en_token_ids, sp_token_ids, en_seq_lens, sp_seq_lens,
                   train_ratio=0.8)

    output = open('data/vocab_en.pkl', 'wb')
    pickle.dump(en_vocab_dict, output)
    output.close()

    output = open('data/vocab_sp.pkl', 'wb')
    pickle.dump(sp_vocab_dict, output)
    output.close()

    # Update parameters
    FLAGS.en_vocab_size = len(en_vocab_dict)
    FLAGS.sp_vocab_size = len(sp_vocab_dict)
    print 'len(en_vocab_dict)', len(en_vocab_dict)
    print 'len(sp_vocab_dict)', len(sp_vocab_dict)

    # Start session
    with tf.Session() as sess:

        # Create new model or load old one
        model = create_model(sess, FLAGS)

        # Training begins
        losses = []
        for epoch_num, epoch in enumerate(
                generate_epoch(train_encoder_inputs, train_decoder_inputs,
                               train_targets, train_en_seq_lens,
                               train_sp_seq_len, FLAGS.num_epochs,
                               FLAGS.batch_size)):

            print "EPOCH: %i" % (epoch_num)

            # Decay learning rate
            sess.run(tf.assign(model.lr, FLAGS.learning_rate *
                               (FLAGS.learning_rate_decay_factor ** epoch_num)))

            batch_loss = []
            for batch_num, (batch_encoder_inputs, batch_decoder_inputs,
                            batch_targets, batch_en_seq_lens,
                            batch_sp_seq_lens) in enumerate(epoch):
                loss, _ = model.step(sess, FLAGS, batch_encoder_inputs,
                                     batch_decoder_inputs, batch_targets,
                                     batch_en_seq_lens, batch_sp_seq_lens,
                                     FLAGS.dropout)
                batch_loss.append(loss)
            losses.append(np.mean(batch_loss))

        checkpoint_path = "/tmp/model.ckpt"
        print "Saving the model."
        model.saver.save(sess, checkpoint_path)

    plt.plot(losses, label='loss')
    plt.legend()
    plt.savefig('seq_01.png')