def train_model():
    vocab, embeddings = data_helper.load_embeddings(config.get('data', 'embedding_file'))
    train_data = data_helper.load_data(os.path.join(config.get('data', 'treebank_dir'), 'train.txt'))
    numeric_train_samples = data_helper.convert_to_numeric_samples(train_data, vocab, num_classes=5)

    model = RNNModel(embeddings, num_classes=5, model_config=config['model'])

    dev_data = data_helper.load_data(os.path.join(config.get('data', 'treebank_dir'), 'dev.txt'))
    numeric_dev_samples = data_helper.convert_to_numeric_samples(dev_data, vocab, num_classes=5)
    eval_func = lambda: model.eval(numeric_dev_samples)

    model.train(numeric_train_samples, eval_func)
    model.save(config.get('data', 'model_dir'))
def main(_):
    FLAGS = tf.app.flags.FLAGS
    pp = pprint.PrettyPrinter()
    FLAGS._parse_flags()
    pp.pprint(FLAGS.__flags)

    # Load data
    X_train, Q_train, Y_train = data_helper.load_data('train')
    X_test, Q_test, Y_test = data_helper.load_data('valid')

    vocab_size = np.max(X_train) + 1
    print('[?] Vocabulary Size:', vocab_size)

    # Create directories
    if not os.path.exists(FLAGS.ckpt_dir):
        os.makedirs(FLAGS.ckpt_dir)

    timestamp = datetime.now().strftime('%c')
    FLAGS.log_dir = os.path.join(FLAGS.log_dir, timestamp)
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)

    # Train model
    with tf.Session(config=tf.ConfigProto(
            log_device_placement=False,
            allow_soft_placement=True)) as sess, tf.device('/gpu:0'):
        model = AlternatingAttention(FLAGS.batch_size, vocab_size, FLAGS.encoding_dim,
                                     FLAGS.embedding_dim, FLAGS.num_glimpses, session=sess)

        if FLAGS.trace:  # Trace model for debugging
            train.trace(FLAGS, sess, model, (X_train, Q_train, Y_train))
            return

        saver = tf.train.Saver()

        if FLAGS.restore_file is not None:
            print('[?] Loading variables from checkpoint %s' % FLAGS.restore_file)
            saver.restore(sess, FLAGS.restore_file)

        # Run evaluation
        if FLAGS.evaluate:
            if not FLAGS.restore_file:
                print('Need to specify a restore_file checkpoint to evaluate')
            else:
                test_data = data_helper.load_data('test')
                word2idx, _, _ = data_helper.build_vocab()
                test.run(FLAGS, sess, model, test_data, word2idx)
        else:
            train.run(FLAGS, sess, model,
                      (X_train, Q_train, Y_train),
                      (X_test, Q_test, Y_test),
                      saver)
def main():
    labelholder = tf.placeholder(tf.int64, shape=[None], name='labels')
    imageholder = tf.placeholder(tf.float32, shape=[None, 3072], name='images')
    layers = construct(imageholder, 3072, 10, 100)

    with tf.name_scope('En'):
        L = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=layers, labels=labelholder, name='cross_entropy'))
        tf.summary.scalar('en', L)

    currstep = tf.train.GradientDescentOptimizer(0.001).minimize(
        L, global_step=tf.Variable(0, name='global_step', trainable=False))

    # Define the accuracy op once so no new graph nodes are created inside the training loop.
    with tf.name_scope('a'):
        accuracy = tf.reduce_mean(
            tf.cast(tf.equal(tf.argmax(layers, 1), labelholder), tf.float32))
        tf.summary.scalar('ta', accuracy)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # Load the dataset once and reuse it for batching and the final evaluation.
        data_sets = data_helper.load_data()
        zdata = zip(data_sets['images_train'], data_sets['labels_train'])
        batches = data_helper.gen_batch(list(zdata), 400, totalstep)

        for i in range(totalstep):
            batch = next(batches)
            images_batch, labels_batch = zip(*batch)
            if i % 100 == 0:
                print('training accuracy for {:d} step is {:g}'.format(
                    i, sess.run(accuracy, feed_dict={
                        imageholder: images_batch,
                        labelholder: labels_batch
                    })))
            sess.run([currstep, L], feed_dict={
                imageholder: images_batch,
                labelholder: labels_batch
            })

        print('Accuracy is ' + format(
            sess.run(accuracy, feed_dict={
                imageholder: data_sets['images_test'],
                labelholder: data_sets['labels_test']
            })))
def preprocess():
    # Data Preparation
    # ==================================================

    # Load data
    print("Loading data...")
    x_text, y = data_helper.load_data(FLAGS.iseardataset)

    # Build vocabulary
    max_document_length = max([len(x.split(" ")) for x in x_text])
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    x = np.array(list(vocab_processor.fit_transform(x_text)))

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/test set
    # TODO: This is very crude, should use cross-validation
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]

    del x, y, x_shuffled, y_shuffled

    print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    return x_train, y_train, vocab_processor, x_dev, y_dev
def main():
    INPUTS_DIR = os.getenv('VH_INPUTS_DIR', '../data')
    OUTPUTS_DIR = os.getenv('VH_OUTPUTS_DIR', './models')

    data_name = 'sql_trainer_filtered_attempts.csv'
    data_path = os.path.join(INPUTS_DIR, 'filtered-data', data_name)
    X_train, X_val, y_train, y_val = load_data(data_path)

    classifiers = {
        # "linear SVM": svm.SVC(kernel='linear', gamma='scale'),
        "polynomialSVM": svm.SVC(kernel='poly', gamma='scale'),
        # "radialSVM": svm.SVC(kernel='rbf', gamma='scale'),
        # "sigmoidSVM": svm.SVC(kernel='sigmoid'),
        # "GaussianNB": GaussianNB(),
        # "MultinomialNB": MultinomialNB(),
        # "ComplementNB": ComplementNB(),
        # "BernoulliNB": BernoulliNB(),
        # "LDA": LinearDiscriminantAnalysis(),
        # "Randomforest": RandomForestClassifier(n_estimators=100, criterion="gini", bootstrap=True, oob_score=True),
        # "Linearregression": LinearRegression(normalize=True)
    }

    # Cross-validate each enabled classifier and keep the fitted estimators by name.
    clf_estimators = {
        name: _perform_cross_validation_and_save(clf, X_train, y_train, name, OUTPUTS_DIR)
        for name, clf in classifiers.items()
    }
def test_model():
    vocab, embeddings = data_helper.load_embeddings(config.get('data', 'embedding_file'))
    model = RNNModel(embeddings, num_classes=5)
    model.load(config.get('data', 'model_dir'))

    test_data = data_helper.load_data(os.path.join(config.get('data', 'treebank_dir'), 'test.txt'))
    numeric_test_samples = data_helper.convert_to_numeric_samples(test_data, vocab, num_classes=5)
    model.eval(numeric_test_samples)
def preprocess():
    # Data preparation
    # =============================

    # Load data
    print("Loading data...")
    sentence_A, sentence_B, y = data_helper.load_data('data/SICK_data.txt')

    # Load pre-trained word vectors and build the vocabulary.
    word_vector = data_helper.word_vector('data/glove.6B.100d.txt')
    max_document_length = max([len(x.split(' ')) for x in sentence_A + sentence_B])
    word_vector.vocab_processor.max_document_length = max_document_length
    sentence_A = np.array(list(word_vector.vocab_processor.transform(sentence_A)))
    sentence_B = np.array(list(word_vector.vocab_processor.transform(sentence_B)))

    # Randomly shuffle the data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    A_shuffled = sentence_A[shuffle_indices]
    B_shuffled = sentence_B[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/dev set
    dev_sample_index = -1 * int(0.2 * float(len(y)))
    A_train, A_dev = A_shuffled[:dev_sample_index], A_shuffled[dev_sample_index:]
    B_train, B_dev = B_shuffled[:dev_sample_index], B_shuffled[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]

    del A_shuffled, B_shuffled, y_shuffled

    print("Vocabulary size: {:d}".format(len(word_vector.vocab_processor.vocabulary_)))
    print("Train/dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    return A_train, B_train, A_dev, B_dev, y_train, y_dev, word_vector
def train(model, supervisor):
    test_sentences, test_labels = data_helper.load_data(FLAGS.test_data_path)
    test_sentences = utils.sentence2id(test_sentences, model.word2idx, FLAGS.seq_length)
    test_labels = utils.one_hot(test_labels, FLAGS.class_num)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with supervisor.managed_session(config=config) as sess:
        for epoch in range(FLAGS.epoch):
            print("Training for epoch %d/%d:" % (epoch + 1, FLAGS.epoch))
            if supervisor.should_stop():
                print('supervisor stopped!')
                break

            tqdm_iter = tqdm(range(model.train_num), total=model.train_num, leave=False)
            for step in tqdm_iter:
                _, loss, train_acc = sess.run(
                    [model.optimizer, model.total_loss, model.accuracy])
                tqdm_iter.set_description(
                    '--- loss: %.4f --- accuracy: %.4f ---' % (loss, train_acc))

            test_acc = sess.run(model.accuracy, {
                model.inputs: test_sentences,
                model.labels: test_labels
            })
            print('--- evaluate --- accuracy: %.4f ---' % test_acc)
def main(opt):
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu
    if opt.mode == "train":
        st = time.time()
        print('Loading data')
        x_train, y_train, x_valid, y_valid, vocabulary_size = load_data(
            "data", opt.debug)
        num_training_data = x_train.shape[0]
        sequence_length = x_train.shape[1]
        print(num_training_data)
        print('Vocab Size', vocabulary_size)

        model = build_model(opt.embedding_dim, opt.hidden_size, opt.drop,
                            sequence_length, vocabulary_size)

        print("Training Model...")
        history = model.fit(
            x_train, y_train,
            batch_size=opt.batch_size,
            epochs=opt.epochs,
            verbose=1,
            callbacks=[TestCallback((x_valid, y_valid), model=model)])
        model.save(opt.saved_model)
        print("Training cost time: ", time.time() - st)
    elif opt.mode == "ensemble":
        # Give each loaded model and its layers unique names so the two graphs
        # can coexist in one ensemble.
        model1 = load_model(opt.saved_model1)
        model1.name = 'model1'
        for layer in model1.layers:
            layer.name = layer.name + str("_1")

        model2 = load_model(opt.saved_model2)
        model2.name = 'model2'
        for layer in model2.layers:
            layer.name = layer.name + str("_2")

        models = [model1, model2]
        vocabulary = json.load(open(os.path.join("data", "vocab.json")))
        predict_dict = predict_final_word_models(models, vocabulary, opt.input)
        sub_file = make_submission(predict_dict, opt.student_id, opt.input)
        if opt.score:
            scoring(sub_file, os.path.join("data"), type="valid")

        # x_train, y_train, x_valid, y_valid, vocabulary_size = load_data(
        #     "data", opt.debug)
        # num_training_data = x_train.shape[0]
        # sequence_length = x_train.shape[1]
        # model_inputs = Input(shape=(sequence_length,), dtype='int32')
        # model = ensemble(models, model_inputs)
        # model.save(opt.model_to_be_saved)
    else:
        model = load_model(opt.saved_model)
        vocabulary = json.load(open(os.path.join("data", "vocab.json")))
        predict_dict = predict_final_word(model, vocabulary, opt.input)
        sub_file = make_submission(predict_dict, opt.student_id, opt.input)
        if opt.score:
            scoring(sub_file, os.path.join("data"), type="valid")
def cv_k_fold(model_info, k=10, verbose=0):
    # load the data, targets and the splits
    data, targets, splits = load_data(k)

    # used for storing all model scores
    scores = []

    # iterate through the splits for CV
    for train_index, test_index in splits:
        # split data and targets using the given indices, ready for training and testing
        (train_data, train_targets), (test_data, test_targets) = split_data_targets(
            data, targets, train_index, test_index, model_info.input_shape)

        # train the model and get its score
        score = train(model_info, train_data, train_targets, test_data, test_targets, verbose)

        # store the score of this model
        scores.append(score)

        # print the accuracy metric score
        print('Fold: ' + str(len(scores)) + ', Accuracy: ' + str(round(score * 100, 3)) + '%')

    # calculate the mean and standard deviation of the scores across all trained models
    cv_score = float(np.mean(scores) * 100)
    cv_std = float(np.std(scores) * 100)

    # print the CV accuracy score
    print('Final Accuracy:', str(round(cv_score, 3)) + '%', '(+/-', str(round(cv_std, 3)) + '%)')
def main(opt):
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu
    if opt.mode == "train":
        st = time.time()
        print('Loading data')
        x_train, y_train, x_valid, y_valid, vocabulary_size = load_data(
            "data", opt.debug)
        num_training_data = x_train.shape[0]
        sequence_length = x_train.shape[1]
        print(num_training_data)
        print('Vocab Size', vocabulary_size)

        model = build_model(opt.embedding_dim, opt.hidden_size, opt.drop,
                            sequence_length, vocabulary_size, opt.optimizer)

        print("Training Model...")
        history = model.fit(
            x_train, y_train,
            batch_size=opt.batch_size,
            epochs=opt.epochs,
            verbose=1,
            callbacks=[TestCallback((x_valid, y_valid), model=model)])
        model.save(opt.saved_model)
        print("Training cost time: ", time.time() - st)
    else:
        model = load_model(opt.saved_model)
        vocabulary = json.load(open(os.path.join("data", "vocab.json")))
        predict_dict = predict_final_word(model, vocabulary, opt.input)
        sub_file = make_submission(predict_dict, opt.student_id, opt.input)
        if opt.score:
            scoring(sub_file, os.path.join("data"), type="valid")
def train_step():
    print("loading the dataset...")
    config = Config()
    eval_config = Config()
    eval_config.keep_prob = 1.0

    train_data, valid_data, test_data = data_helper.load_data(FLAGS.max_len,
                                                              batch_size=config.batch_size)
    print("begin training")

    # gpu_config = tf.ConfigProto()
    # gpu_config.gpu_options.allow_growth = True
    with tf.Graph().as_default(), tf.Session() as session:
        initializer = tf.random_uniform_initializer(-1 * FLAGS.init_scale, 1 * FLAGS.init_scale)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            model = RNN_Model(config=config, is_training=True)

        with tf.variable_scope("model", reuse=True, initializer=initializer):
            valid_model = RNN_Model(config=eval_config, is_training=False)
            test_model = RNN_Model(config=eval_config, is_training=False)

        # add summary
        # train_summary_op = tf.merge_summary([model.loss_summary, model.accuracy])
        train_summary_dir = os.path.join(config.out_dir, "summaries", "train")
        train_summary_writer = tf.train.SummaryWriter(train_summary_dir, session.graph)

        # dev_summary_op = tf.merge_summary([valid_model.loss_summary, valid_model.accuracy])
        dev_summary_dir = os.path.join(eval_config.out_dir, "summaries", "dev")
        dev_summary_writer = tf.train.SummaryWriter(dev_summary_dir, session.graph)

        # add checkpoint
        checkpoint_dir = os.path.abspath(os.path.join(config.out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.all_variables())

        tf.initialize_all_variables().run()

        global_steps = 1
        begin_time = int(time.time())

        for i in range(config.num_epoch):
            print("the %d epoch training..." % (i + 1))
            lr_decay = config.lr_decay ** max(i - config.max_decay_epoch, 0.0)
            model.assign_new_lr(session, config.lr * lr_decay)
            global_steps = run_epoch(model, session, train_data, global_steps,
                                     valid_model, valid_data,
                                     train_summary_writer, dev_summary_writer)

            if i % config.checkpoint_every == 0:
                path = saver.save(session, checkpoint_prefix, global_steps)
                print("Saved model checkpoint to {}\n".format(path))

        print("the train is finished")
        end_time = int(time.time())
        print("training takes %d seconds already\n" % (end_time - begin_time))

        test_accuracy = evaluate(test_model, session, test_data)
        print("the test data accuracy is %f" % test_accuracy)
        print("program end!")
def find_average_year():
    # Load the data using the appropriate function from the data_helper module,
    # then search through it to find the average response year and return it.
    data = data_helper.load_data('list_of_responses.json')
    total_year = 0
    for response in data:
        total_year += response['year']
    return total_year / len(data)
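# Hedged illustration (an assumption, not part of the original data_helper module):
# find_average_year() above, and conduct_survey()/num_month_counts() further below,
# all expect load_data('list_of_responses.json') to return a list of response dicts
# keyed by at least 'year' and 'month', roughly like this sample:
sample_responses = [
    {'year': 1995, 'month': 'june'},
    {'year': 2001, 'month': 'december'},
]
# With that shape, find_average_year() would return (1995 + 2001) / 2 == 1998.0.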
def item():
    movie_data_file = 'ml-1m/ml-1m/movies.dat'
    item_text = data_helper.load_data(movie_data_file)

    # Movie ids are not contiguous, so insert placeholder rows for the gaps;
    # the ids listed below each follow a two-id gap.
    double_gap_ids = (1404, 1453, 1493, 1507, 1639, 1738, 1804)

    movie1 = []
    i = 1
    for line in item_text:
        if i == line[0]:
            movie1.append(line)
        else:
            a = [0, 0, 0]
            if line[0] in double_gap_ids:
                movie1.append(a)
                movie1.append(a)
                movie1.append(line)
            else:
                movie1.append(a)
                movie1.append(line)
        i = line[0] + 1

    item_text = np.array(movie1)
    item_text = item_text[:, 2]
    item_text = item_text.tolist()

    item_all = data_helper.type_id(item_text)
    item_all = np.array(item_all)
    return item_all
def pipeline():
    # load files
    train_df, y_train = data_helper.load_data(FLAGS.train_file)
    logging.debug("train shape:%s, dtypes:%s", train_df.shape, train_df.dtypes)

    # feature fit & transform
    categorical_features = [
        'mod',
        'mf',
        'aver',
        'sver',
        'vid',
        'prev',
    ]
    categorical_transformer = Pipeline(
        steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])
    preprocessor = ColumnTransformer(transformers=[
        ('cat', categorical_transformer, categorical_features),
    ])

    clf = lr(preprocessor, train_df, y_train)

    # model out
    try:
        dump(clf, FLAGS.model_out, compress=1)
    except Exception:
        traceback.print_exc()

    # test
    if FLAGS.test_file:
        test_df, y_test = data_helper.load_data(FLAGS.test_file)
        test(FLAGS.test_out, clf, test_df, y_test)

    # submission
    if FLAGS.submission_input_file:
        submission_df = data_helper.load_eval_data(FLAGS.submission_input_file)
        submission_df = pd.DataFrame(submission_df, columns=test_df.columns)
        submission_data(FLAGS.submission_out, clf, submission_df)
def get_batch_data(path, batch_size, seq_length, word2idx):
    sentences, labels = data_helper.load_data(path)
    sentences = sentence2id(sentences, word2idx, seq_length)
    train_num = (len(labels) // batch_size) + 1

    data_queues = tf.train.slice_input_producer([sentences, labels])
    inputs, labels = tf.train.shuffle_batch(data_queues,
                                            num_threads=8,
                                            batch_size=batch_size,
                                            capacity=batch_size * 64,
                                            min_after_dequeue=batch_size * 32,
                                            allow_smaller_final_batch=True)
    return inputs, labels, train_num
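# Hedged usage sketch (not from the original repo): the tensors returned by
# get_batch_data() above come from TF1 input queues, so a consumer has to start
# queue runners before calling sess.run. The path, sizes, and word2idx mapping
# below are illustrative assumptions.
inputs, labels, train_num = get_batch_data('data/train.txt', batch_size=64,
                                           seq_length=100, word2idx=word2idx)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    for _ in range(train_num):
        batch_inputs, batch_labels = sess.run([inputs, labels])  # one shuffled batch
    coord.request_stop()
    coord.join(threads)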
def main(_):
    # load data
    data, ix2word, word2ix = load_data()
    num_train = data.shape[0]
    vocab_size = len(ix2word)

    # variables for training
    X = tf.placeholder(tf.int32, [BATCH_SIZE, None])
    y = tf.placeholder(tf.int32, [BATCH_SIZE, None])
    rnn_model = RNN(model=model, batch_size=BATCH_SIZE, vocab_size=vocab_size,
                    embedding_dim=embedding_dim, n_neurons=n_neurons, n_layers=3,
                    lr=lr, keep_prob=keep_prob)
    loss, optimizer = rnn_model.train(X, y)

    # start training
    start_time = time.time()
    with tf.Session() as sess:
        # visualize the graph and write the loss into the logs
        merged = tf.summary.merge_all()
        writer = tf.summary.FileWriter('./logs/', sess.graph)
        tf.global_variables_initializer().run()

        print("=" * 15 + "start training" + "=" * 15)
        for epc in range(NUM_EPOCH):
            print("=" * 15, "epoch: %d" % epc, "=" * 15)
            for step in range(num_train // BATCH_SIZE):
                # get batch data
                idx_start = step * BATCH_SIZE
                idx_end = idx_start + BATCH_SIZE
                batch_data = data[idx_start:idx_end, ...]
                x_data = batch_data[:, :-1]
                y_data = batch_data[:, 1:]
                feed_dict = {X: x_data, y: y_data}
                sess.run(optimizer, feed_dict=feed_dict)

                # print evaluation results every eval_frequence steps
                if step % eval_frequence == 0:
                    l = sess.run(loss, feed_dict=feed_dict)
                    result = sess.run(merged, feed_dict=feed_dict)
                    writer.add_summary(result, (epc * num_train // BATCH_SIZE) + step)

                    input_seq = "湖光秋月两相和"
                    result_poem = ''.join(generate_poem(rnn_model=rnn_model, sess=sess,
                                                        input_seqs=input_seq,
                                                        ix2word=ix2word, word2ix=word2ix,
                                                        max_len=125, prefix_words=None))

                    run_time = time.time() - start_time
                    start_time = time.time()
                    print("step: %d, run time: %.1f ms" % (step, run_time * 1000 / eval_frequence))
                    print("minibatch loss: %f" % l)
                    print("generated poem length: %d, poem is: %s" % (len(result_poem), result_poem))
                    sys.stdout.flush()

        # save model
        if SAVE:
            saver = tf.train.Saver()
            saver.save(sess, CKPT_PATH + 'rnn_model.ckpt')
def main(opt):
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu
    if opt.mode == "train":
        st = time.time()
        print('Loading data')
        x_train, y_train, x_valid, y_valid, vocabulary_size = load_data(
            "data", opt.debug)
        num_training_data = x_train.shape[0]
        sequence_length = x_train.shape[1]
        print(num_training_data)
        print('Vocab Size', vocabulary_size)

        model = build_model(opt.embedding_dim, opt.hidden_size, opt.drop1,
                            opt.drop2, sequence_length, vocabulary_size)

        print("Training Model...")
        model.fit(x_train, y_train,
                  batch_size=opt.batch_size,
                  epochs=opt.epochs,
                  verbose=2,
                  callbacks=[TestCallback((x_valid, y_valid), model=model)])
        model.save(opt.saved_model)
        print("Training cost time: ", time.time() - st)
    elif opt.mode == "score_valid":
        model = load_model(opt.saved_model)
        vocabulary = json.load(open(os.path.join("data", "vocab.json")))
        predict_dict = predict_final_word([model], vocabulary, opt.input)
        sub_file = make_submission(predict_dict, opt.student_id, opt.input)
        scoring(sub_file, os.path.join("data"), type="valid")
    else:
        # Load the ten ensemble members saved as models/model0.h5 ... models/model9.h5.
        model_list = [load_model('models/model%d.h5' % i) for i in range(10)]
        vocabulary = json.load(open(os.path.join("data", "vocab.json")))
        predict_dict = predict_final_word(model_list, vocabulary, opt.input)
        sub_file = make_submission(predict_dict, opt.student_id, opt.input)
        scoring(sub_file, os.path.join("data"), type="valid")
def main_func(argv):
    in_file = ''
    out_dir = ''
    try:
        opts, args = getopt.getopt(argv, "h:i:o:", ["in_filepath=", "out_dir="])
    except getopt.GetoptError:
        print("python main.py -i <in_filepath> -o <out_dir>")
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print("python main.py -i <in_filepath> -o <out_dir>")
            sys.exit()
        elif opt in ("-i", "--in_filepath"):
            in_file = arg
        elif opt in ("-o", "--out_dir"):
            out_dir = arg

    if out_dir == '':
        out_dir = './trained_results/'
    if not out_dir.endswith('/'):
        out_dir += '/'
    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)
    os.makedirs(out_dir)

    # x_, y_ is the planning and booking system data set
    # new_x, new_y is the chatting system data set
    x_, y_, vocabulary, embedding_mat, labels, new_x, new_y = data_helper.load_data(
        in_file, out_dir)

    # split data
    x, x_test, y, y_test = train_test_split(x_, y_, test_size=0.1)
    x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.2)

    # add new_data to the train data
    x_train = np.r_[x_train, new_x]
    y_train = np.r_[y_train, new_y]

    logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(
        len(x_train), len(x_dev), len(x_test)))
    logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(
        len(y_train), len(y_dev), len(y_test)))

    train.train_cnn_rnn(embedding_mat, x_train, x_dev, y_train, y_dev,
                        labels, vocabulary, out_dir)
    predict_labels, accuracy = predict.predict_cnn_rnn(x_test, y_test, out_dir)
    print('accuracy', accuracy)
def main(opt):
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu
    if opt.mode == "train":
        st = time.time()
        print('Loading data')
        x_train, y_train, x_valid, y_valid, vocabulary_size = load_data(
            "data", opt.debug)
        num_training_data = x_train.shape[0]
        sequence_length = x_train.shape[1]
        print(num_training_data)
        print('Vocab Size', vocabulary_size)

        model = build_model(opt.embedding_dim, opt.hidden_size, opt.drop,
                            sequence_length, vocabulary_size)

        print("Training Model...")
        history = model.fit(
            x_train, y_train,
            batch_size=opt.batch_size,
            epochs=opt.epochs,
            verbose=1,
            callbacks=[TestCallback((x_valid, y_valid), model=model)])
        model.save(opt.saved_model)

        # Save the model architecture
        # with open('model_architecture.yaml', 'w') as f:
        #     f.write(model.to_json())

        print("Training cost time: ", time.time() - st)
    else:
        # Model reconstruction from JSON file
        # with open('model_architecture.yaml', 'r') as f:
        #     model = model_from_yaml(f.read())
        model = load_model(
            opt.saved_model,
            custom_objects={'LayerNormalization': LayerNormalization})
        vocabulary = json.load(open(os.path.join("data", "vocab.json")))
        predict_dict = predict_final_word(model, vocabulary, opt.input)
        sub_file = make_submission(predict_dict, opt.student_id, opt.input)
        if opt.score:
            scoring(sub_file, os.path.join("data"), type="valid")
def conduct_survey():
    '''
    Allows the user to give multiple survey responses, saving the responses
    in a list. Returns the list.
    '''
    responses = []
    # Start from the previously saved responses (extend keeps each saved entry as
    # its own item rather than nesting the whole list as a single element).
    responses.extend(data_helper.load_data('list_of_responses.json'))
    print(responses)

    yesA = ['yes', 'y', 'ok', 'o.k.', 'okay']
    noA = ['no', 'n']

    # Ask the user if they'd like to take the survey. If they say yes, use
    # get_survey_response() to collect a response, and keep collecting
    # responses until they say they are done.
    while True:
        start_survey = input("Would you like to start the survey?\n")
        start_survey = start_survey.lower()
        if start_survey in noA:
            break
        print("Okay, let's jump into it!")
        response_of_current_person = get_survey_response()
        responses.append(response_of_current_person)

    print('Thanks for your time!')
    return responses
def num_month_counts():
    month_counts = {
        'december': 0,
        'january': 0,
        'february': 0,
        'march': 0,
        'april': 0,
        'may': 0,
        'june': 0,
        'july': 0,
        'august': 0,
        'september': 0,
        'october': 0,
        'november': 0,
    }
    data = data_helper.load_data('list_of_responses.json')
    for response in data:
        # Tally each response under its month.
        month_counts[response['month']] += 1
    return month_counts
def save(model_info, model_filename):
    # load data and targets, ignoring the splits
    data, targets = load_data(k=1)

    # normalise data
    data = normalise(data, model_info.input_shape)

    # initialise the structure of the model
    model = Sequential(model_info.get_structure())

    # configure optimiser, loss and metrics
    model.compile(optimizer=model_info.optimizer,
                  loss=model_info.loss,
                  metrics=['accuracy'])

    # train the model by fitting the data and targets for the configured number of epochs
    model.fit(data, targets, epochs=model_info.epochs, batch_size=model_info.batch_size)

    # save the trained model as an HDF5 file '<model_filename>.h5' under '../models/'
    model.save('../models/' + model_filename + '.h5')
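# Hedged usage sketch: cv_k_fold() and save() above both read input_shape,
# get_structure(), optimizer, loss, epochs and batch_size from a model_info
# object. The ExampleModelInfo class and its values are illustrative assumptions,
# not the project's real definition.
from keras.layers import Dense, Flatten


class ExampleModelInfo:
    input_shape = (28, 28, 1)
    optimizer = 'adam'
    loss = 'sparse_categorical_crossentropy'
    epochs = 10
    batch_size = 32

    def get_structure(self):
        # Layer list consumed by Sequential() in save().
        return [Flatten(input_shape=self.input_shape),
                Dense(128, activation='relu'),
                Dense(10, activation='softmax')]

# cv_k_fold(ExampleModelInfo(), k=10)        # 10-fold cross-validation
# save(ExampleModelInfo(), 'example_model')  # writes ../models/example_model.h5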
def main():
    INPUTS_DIR = os.getenv('VH_INPUTS_DIR', '../data')
    OUTPUTS_DIR = os.getenv('VH_OUTPUTS_DIR', './models')

    data_name = 'sql_trainer_filtered_attempts.csv'
    data_path = os.path.join(INPUTS_DIR, 'filtered-data', data_name)
    X_train, X_val, y_train, y_val = load_data(data_path, 100)

    linear_scores = dict()
    split_amount = 5  # 5 is the default
    kf_splits = list(KFold(n_splits=split_amount).split(X_train))
    C_key = 1

    for i in range(len(kf_splits)):
        clf = SVC(kernel='linear', C=1)
        res = train_classifier_with_split(kf_splits, i, clf, X_train, y_train,
                                          'linearSVM', upload_tsvm_classifier,
                                          OUTPUTS_DIR)
        linear_scores.setdefault(C_key, []).append(res['score'])

    # Iterate over (key, value) pairs, not just the values.
    for key, val in linear_scores.items():
        print('%s: %s' % (key, val))
def train_step():
    print("loading the dataset...")
    config = Config()
    eval_config = Config()
    eval_config.keep_prob = 1.0

    train_data, valid_data, test_data = data_helper.load_data(FLAGS.max_len,
                                                              batch_size=config.batch_size,
                                                              usemydata=FLAGS.using_mydata)
    print('the len of train_data:', len(train_data[0]))
    print("begin training")

    # gpu_config = tf.ConfigProto()
    # gpu_config.gpu_options.allow_growth = True
    with tf.Graph().as_default(), tf.Session() as session:
        initializer = tf.random_uniform_initializer(-1 * FLAGS.init_scale, 1 * FLAGS.init_scale)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            model = RNN_Model(config=config, is_training=True,
                              using_api=False, usemydata=FLAGS.using_mydata)

        with tf.variable_scope("model", reuse=True, initializer=initializer):
            valid_model = RNN_Model(config=eval_config, is_training=False,
                                    using_api=False, usemydata=FLAGS.using_mydata)
            # test_model = RNN_Model(config=eval_config, is_training=False,
            #                        using_api=False, usemydata=FLAGS.using_mydata)

        # add summary
        # train_summary_op = tf.merge_summary([model.loss_summary, model.accuracy])
        train_summary_dir = os.path.join(config.out_dir, "summaries", "train")
        train_summary_writer = tf.summary.FileWriter(train_summary_dir, session.graph)

        # dev_summary_op = tf.merge_summary([valid_model.loss_summary, valid_model.accuracy])
        dev_summary_dir = os.path.join(eval_config.out_dir, "summaries", "dev")
        dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, session.graph)

        # add checkpoint
        checkpoint_dir = os.path.abspath(os.path.join(config.out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.all_variables())

        tf.global_variables_initializer().run()
        # tf.initialize_all_variables().run()

        global_steps = 1
        begin_time = int(time.time())

        # The embedding only needs to be initialized from the pretrained vectors once.
        print('embedding loading pretrain...')
        model.assign_pretrain_embedding(session, data_helper.embedding)
        # Shared module state is accessed and modified as data_helper.<name> after `import data_helper`.
        # Release the original embedding to keep the process from running out of memory.
        print('loaded! and del memory of embedding.')
        del data_helper.embedding
        import gc
        gc.collect()

        for i in range(config.num_epoch):
            print("the %d epoch training..." % (i + 1))
            lr_decay = config.lr_decay ** max(i - config.max_decay_epoch, 0.0)
            model.assign_new_lr(session, config.lr * lr_decay)  # decay the learning rate each epoch

            m = psutil.Process(os.getpid()).memory_full_info().uss / 1024. / 1024. / 1024.
            if m > 12:
                print('memory used:{:.2f} GB'.format(m))
                sys.exit()

            global_steps = run_epoch(model, session, train_data, global_steps,
                                     valid_model, valid_data,
                                     train_summary_writer, dev_summary_writer)

            if i % config.checkpoint_every == 0:
                path = saver.save(session, checkpoint_prefix, global_steps)
                print("Saved model checkpoint to {}\n".format(path))

        print("the train is finished")
        end_time = int(time.time())
        print("training takes %d seconds already\n" % (end_time - begin_time))
        # test_accuracy = evaluate(test_model, session, test_data)
        # print("the test data accuracy is %f" % test_accuracy)
        print("program end!")
    TimeDistributed, Multiply, Dot, Concatenate, Add
from keras.layers.core import Activation, Dense, Permute, Flatten, Dropout, Reshape, Layer, \
    ActivityRegularization, RepeatVector, Lambda
from keras.utils import plot_model
from keras.callbacks import History
from keras import backend as K
from keras.layers.wrappers import Bidirectional
from data_helper import load_data, load_image, generate_img, test_img
from result_calculator import *
import time
from keras.optimizers import RMSprop, Adamax
from keras.initializers import TruncatedNormal

if __name__ == '__main__':
    print('loading data...')
    train_dataset, valid_dataset, test_dataset, vocabulary, vocabulary_inv, \
        user_vocabulary, user_vocabulary_inv, seqlen, maxlen, maxmentioned = load_data()
    uid, tx, ti, ux, ui, meid, mx, mi, y = train_dataset

    vocab_size = len(vocabulary_inv) + 1
    user_vocab_size = len(user_vocabulary_inv) + 1

    print('-')
    print('Vocab size:', vocab_size, 'unique words')
    print('User size:', user_vocab_size, 'unique users')
    print('Max length:', maxlen, 'words')
    print('Max mentioned:', maxmentioned, 'users')
    print('-')
    print('Here\'s what a "mention" tuple looks like (tweet_x, user_x, mentioned_x, label):')
import json

from keras.layers import Input, Dense, Embedding, Conv2D, MaxPool2D
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras.models import Model
from sklearn.model_selection import train_test_split
from data_helper import load_data

print('Loading data')
x, y, vocabulary, vocabulary_inv = load_data()

# x.shape -> (10662, 56)
# y.shape -> (10662, 2)
# len(vocabulary) -> 18765
# len(vocabulary_inv) -> 18765

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# X_train.shape -> (8529, 56)
# y_train.shape -> (8529, 2)
# X_test.shape -> (2133, 56)
# y_test.shape -> (2133, 2)

sequence_length = x.shape[1]  # 56
vocabulary_size = len(vocabulary_inv)  # 18765
embedding_dim = 256
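# Hedged sketch (an assumption, not the original script's model): the imports and
# shapes above set up a Kim-style text CNN, and one minimal way to wire the
# declared variables into a model using only the already-imported layers is shown
# below. Filter sizes, filter count, dropout rate, and the commented fit() settings
# are illustrative choices.
inputs = Input(shape=(sequence_length,), dtype='int32')
embedding = Embedding(input_dim=vocabulary_size + 1,  # +1 as a safe bound for 0-based indices
                      output_dim=embedding_dim,
                      input_length=sequence_length)(inputs)
reshape = Reshape((sequence_length, embedding_dim, 1))(embedding)

# One convolution + max-pool branch per filter size, concatenated.
pooled = []
for filter_size in (3, 4, 5):
    conv = Conv2D(filters=100, kernel_size=(filter_size, embedding_dim),
                  activation='relu')(reshape)
    pooled.append(MaxPool2D(pool_size=(sequence_length - filter_size + 1, 1))(conv))
merged = Concatenate(axis=1)(pooled)
features = Dropout(0.5)(Flatten()(merged))
outputs = Dense(2, activation='softmax')(features)

model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
# model.fit(X_train, y_train, batch_size=64, epochs=10, validation_data=(X_test, y_test))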
elif FLAGS.clf == 'clstm':
    FLAGS.hidden_size = len(FLAGS.filter_sizes.split(",")) * FLAGS.num_filters

# Output files directory
timestamp = str(int(time.time()))
outdir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
if not os.path.exists(outdir):
    os.makedirs(outdir)

# Load and save data
# =============================================================================
data, labels, lengths, vocab_processor = data_helper.load_data(
    file_path=FLAGS.data_file,
    sw_path=FLAGS.stop_word_file,
    min_frequency=FLAGS.min_frequency,
    max_length=FLAGS.max_length,
    language=FLAGS.language,
    shuffle=True)

print(labels)
input()

# Save vocabulary processor
vocab_processor.save(os.path.join(outdir, 'vocab'))

FLAGS.vocab_size = len(vocab_processor.vocabulary_._mapping)
FLAGS.max_length = vocab_processor.max_document_length

params = FLAGS.flag_values_dict()

# Print parameters
def train_cnn_rnn():
    input_file = "logstashTemp.dat"
    output_file = "logstash.csv"

    # Convert the raw logstash JSON lines into a CSV file.
    dataList = []
    with open(input_file, 'r', encoding='utf8') as logFile:
        for row in logFile:
            dataList.append(json.loads(row))

    keyList = list(dataList[0].keys())
    csvList = [[keyItem for keyItem in keyList]]
    for row in dataList:
        if "severity" in list(row.keys()):
            tempRow = [
                row[keyItem] for keyItem in keyList
                if keyItem in list(row.keys())
            ]
            csvList.append(tempRow)

    with open(output_file, "w+", encoding="utf8") as csvFile:
        myWriter = csv.writer(csvFile)
        for row in csvList:
            myWriter.writerow(row)

    x_, y_, vocabulary, vocabulary_inv, df, labels = data_helper.load_data(
        output_file, 20000)

    training_config = "training_config.json"
    params = json.loads(open(training_config).read())

    # Assign a 300-dimension vector to each word
    word_embeddings = data_helper.load_embeddings(vocabulary)
    embedding_mat = [
        word_embeddings[word] for index, word in enumerate(vocabulary_inv)
    ]
    embedding_mat = np.array(embedding_mat, dtype=np.float32)

    # Split the original dataset into train set and test set
    x, x_test, y, y_test = train_test_split(x_, y_, test_size=0.1)

    # Split the train set into train set and dev set
    x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.1)

    logging.info('x_train: {}, x_dev: {}, x_test: {}'.format(
        len(x_train), len(x_dev), len(x_test)))
    logging.info('y_train: {}, y_dev: {}, y_test: {}'.format(
        len(y_train), len(y_dev), len(y_test)))

    # Create a directory; everything related to the training will be saved in it
    timestamp = str(int(time.time()))
    trained_dir = './trained_results_' + timestamp + '/'
    print(trained_dir)
    if os.path.exists(trained_dir):
        shutil.rmtree(trained_dir)
    os.makedirs(trained_dir)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn_rnn = TextCNNRNN(embedding_mat=embedding_mat,
                                 sequence_length=x_train.shape[1],
                                 num_classes=y_train.shape[1],
                                 non_static=params['non_static'],
                                 hidden_unit=params['hidden_unit'],
                                 max_pool_size=params['max_pool_size'],
                                 filter_sizes=map(int, params['filter_sizes'].split(",")),
                                 num_filters=params['num_filters'],
                                 embedding_size=params['embedding_dim'],
                                 l2_reg_lambda=params['l2_reg_lambda'])

            global_step = tf.Variable(0, name='global_step', trainable=False)
            optimizer = tf.train.RMSPropOptimizer(1e-3, decay=0.9)
            grads_and_vars = optimizer.compute_gradients(cnn_rnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            # Checkpoint files will be saved in this directory during training
            checkpoint_dir = './checkpoints_' + timestamp + '/'
            if os.path.exists(checkpoint_dir):
                shutil.rmtree(checkpoint_dir)
            os.makedirs(checkpoint_dir)
            checkpoint_prefix = os.path.join(checkpoint_dir, 'model')

            def real_len(batches):
                return [
                    np.ceil(np.argmin(batch + [0]) * 1.0 / params['max_pool_size'])
                    for batch in batches
                ]

            def train_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch,
                    cnn_rnn.dropout_keep_prob: params['dropout_keep_prob'],
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                _, step, loss, accuracy = sess.run(
                    [train_op, global_step, cnn_rnn.loss, cnn_rnn.accuracy],
                    feed_dict)

            def dev_step(x_batch, y_batch):
                feed_dict = {
                    cnn_rnn.input_x: x_batch,
                    cnn_rnn.input_y: y_batch,
                    cnn_rnn.dropout_keep_prob: 1.0,
                    cnn_rnn.batch_size: len(x_batch),
                    cnn_rnn.pad: np.zeros([len(x_batch), 1, params['embedding_dim'], 1]),
                    cnn_rnn.real_len: real_len(x_batch),
                }
                step, loss, accuracy, num_correct, predictions = sess.run([
                    global_step, cnn_rnn.loss, cnn_rnn.accuracy,
                    cnn_rnn.num_correct, cnn_rnn.predictions
                ], feed_dict)
                return accuracy, loss, num_correct, predictions

            saver = tf.train.Saver()
            sess.run(tf.global_variables_initializer())

            # Training starts here
            train_batches = data_helper.batch_iter(list(zip(x_train, y_train)),
                                                   params['batch_size'],
                                                   params['num_epochs'])
            best_accuracy, best_at_step = 0, 0

            # Train the model with x_train and y_train
            for train_batch in train_batches:
                x_train_batch, y_train_batch = zip(*train_batch)
                train_step(x_train_batch, y_train_batch)
                current_step = tf.train.global_step(sess, global_step)

                # Evaluate the model with x_dev and y_dev
                if current_step % params['evaluate_every'] == 0:
                    dev_batches = data_helper.batch_iter(list(zip(x_dev, y_dev)),
                                                         params['batch_size'], 1)
                    total_dev_correct = 0
                    for dev_batch in dev_batches:
                        x_dev_batch, y_dev_batch = zip(*dev_batch)
                        acc, loss, num_dev_correct, predictions = dev_step(
                            x_dev_batch, y_dev_batch)
                        total_dev_correct += num_dev_correct

                    accuracy = float(total_dev_correct) / len(y_dev)
                    logging.info('Accuracy on dev set: {}'.format(accuracy))

                    if accuracy >= best_accuracy:
                        print("Accuracy:", accuracy)
                        best_accuracy, best_at_step = accuracy, current_step
                        path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                        logging.critical('Saved model {} at step {}'.format(path, best_at_step))
                        logging.critical('Best accuracy {} at step {}'.format(
                            best_accuracy, best_at_step))

            logging.critical('Training is complete, testing the best model on x_test and y_test')

            # Save the model files to trained_dir. predict.py needs trained model files.
            saver.save(sess, trained_dir + "best_model.ckpt")

            # Evaluate x_test and y_test
            saver.restore(sess, checkpoint_prefix + '-' + str(best_at_step))
            test_batches = data_helper.batch_iter(list(zip(x_test, y_test)),
                                                  params['batch_size'], 1, shuffle=False)
            total_test_correct = 0
            for test_batch in test_batches:
                x_test_batch, y_test_batch = zip(*test_batch)
                acc, loss, num_test_correct, predictions = dev_step(x_test_batch, y_test_batch)
                total_test_correct += int(num_test_correct)

            logging.critical('Accuracy on test set: {}'.format(
                float(total_test_correct) / len(y_test)))

            # Save trained parameters and files since predict.py needs them
            with open(trained_dir + 'words_index.json', 'w') as outfile:
                json.dump(vocabulary, outfile, indent=4, ensure_ascii=False)
            with open(trained_dir + 'embeddings.pickle', 'wb') as outfile:
                pickle.dump(embedding_mat, outfile, pickle.HIGHEST_PROTOCOL)
            with open(trained_dir + 'labels.json', 'w') as outfile:
                json.dump(labels, outfile, indent=4, ensure_ascii=False)

            params['sequence_length'] = x_train.shape[1]
            with open(trained_dir + 'trained_parameters.json', 'w') as outfile:
                json.dump(params, outfile, indent=4, sort_keys=True, ensure_ascii=False)
import os
import pickle

import conversion
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

import data_helper

data_sets = data_helper.load_data()
classes = data_sets['classes']
train_data = data_sets['images_train']
train_label = data_sets['labels_train']
test_data = data_sets['images_test']
test_label = data_sets['labels_test']

# Reshape the flat image vectors into NHWC image tensors.
train_data = train_data.reshape(50000, 3, 32, 32).transpose(0, 2, 3, 1)
test_data = test_data.reshape(10000, 3, 32, 32).transpose(0, 2, 3, 1)

batch_size = 100
image_width = 32
image_height = 32
channels = 3

# Placeholders
images_placeholder = tf.placeholder(
    tf.float32, [None, image_width, image_height, channels])
labels_placeholder = tf.placeholder(tf.int32, [None])
one_hot = tf.one_hot(labels_placeholder, depth=10)
import data_helper
import tensorflow as tf


def lis(x):
    # Flatten a list of lists into a single list.
    re = []
    for a in x:
        for b in a:
            re.append(b)
    return re


def chan(x):
    return [[a] for a in x]


x_inp, y_input = data_helper.load_data()
x_input = [lis(a) for a in x_inp]

print(len(x_input))
print(len(y_input[0]))
print("# Network Parameters")

# Network Parameters
dropout = 0.5  # Dropout, probability to keep units
the_text_length = 56
n_input = 300 * the_text_length
n_class = 2
        timestr = datetime.datetime.now().isoformat()
        logging.info("%s, the %i step, train cost is: %f and the train accuracy is %6.7f"
                     % (timestr, global_steps, cost, accuracy))
        if global_steps % FLAGS.evaluate_every == 0:
            valid_accuracy = evaluate(valid_model, session, valid_data, global_steps)
            logging.info("%s, the valid accuracy is %f" % (timestr, valid_accuracy))
        global_steps += 1
    return global_steps
# ---------------------------- run epoch end -------------------------------------

# ------------------------------- load data --------------------------------------
word2idx, idx2word = build_vocab(FLAGS.word_file)
label2idx, idx2label = load_label(FLAGS.label_file)
train_x, train_y, train_mask = load_data(FLAGS.train_file, word2idx, label2idx, FLAGS.sequence_len)
logging.info("load train data finish")

train_data, valid_data = create_valid(zip(train_x, train_y, train_mask))
num_classes = len(label2idx)
embedding = load_embedding(FLAGS.embedding_size, filename=FLAGS.embedding_file)

test_x, test_y, test_mask = load_data(FLAGS.test_file, word2idx, label2idx, FLAGS.sequence_len)
logging.info("load test data finish")
# ----------------------------- load data end ------------------------------------

# ----------------------------- execute train ------------------------------------
with tf.Graph().as_default():
    with tf.device("/cpu:0"):
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=FLAGS.gpu_options)
        session_conf = tf.ConfigProto(allow_soft_placement=FLAGS.allow_soft_placement,
                                      log_device_placement=FLAGS.log_device_placement,
                                      gpu_options=gpu_options)
        with tf.Session(config=session_conf).as_default() as sess:
            initializer = tf.random_uniform_initializer(-1 * FLAGS.init_scale, 1 * FLAGS.init_scale)