def train(self):
    with tf.name_scope('dataset'):
        ds = load_data.DataSet('data/gene_dict_clean_lower.txt',
                               self.max_length, self.batch_size)
        train_dataset, test_dataset = ds.load_dict_data()
        iterator = train_dataset.make_initializable_iterator()

    with tf.name_scope('embedding'):
        word_vocab_list, embedding_matrix = (
            load_data.load_word_embedding_for_dict_file(
                'data/gene_dict_clean_lower.txt',
                'data/word_embedding.txt'))
        # Set up a TensorFlow lookup table from string word to unique integer id.
        index_table = tf.contrib.lookup.index_table_from_tensor(
            word_vocab_list, num_oov_buckets=1, default_value=-1)
        self.vocab_size = len(word_vocab_list)

    # Build the graph once from the iterator output.
    batch_data = index_table.lookup(iterator.get_next())
    # Decoder length: position of the end-of-sequence token (assumed id 2) plus one.
    decoder_length = tf.where(tf.equal(batch_data, 2))[:, -1] + 1
    embedded_batch_data = tf.nn.embedding_lookup(embedding_matrix, batch_data)
    one_hot_batch_data = tf.one_hot(batch_data, self.vocab_size)
    self.graph(batch_data, embedded_batch_data, one_hot_batch_data,
               decoder_length)

    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        sess.run(tf.global_variables_initializer())
        tf.summary.FileWriter('graphs/test', sess.graph)

        for i in range(NUM_EPOCHS):
            # Re-initialize the iterator at the start of each epoch, then
            # train until the dataset is exhausted.
            sess.run(iterator.initializer)
            while True:
                try:
                    _, loss_value = sess.run([self.update, self.loss])
                    print(loss_value)
                except tf.errors.OutOfRangeError:
                    break
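# A minimal sketch, not part of the original code, of what the decoder_length
# expression above computes, written with NumPy for clarity. It assumes token
# id 2 marks the end of a sequence and that every row contains exactly one
# such token.
import numpy as np

batch = np.array([[5, 7, 2, 0],
                  [4, 2, 0, 0]])          # two padded id sequences
eos_positions = np.argwhere(batch == 2)   # [[0, 2], [1, 1]]
lengths = eos_positions[:, -1] + 1        # [3, 2]
print(lengths)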
import numpy as np

import load_data as ld
from net import Network
from plots import plotLC

ALPHA = 1e-2
BATCH_SIZE = 20
NITER = 12000

# Load the test data.
test = np.load('test.npy')
test_labels = ld.one_hot(np.load('test_labels.npy'))

# Initialize the network.
n = Network(ALPHA)

# Load the training and validation datasets.
D = ld.DataSet()

train_accuracies = []
batch_accuracies = []
valid_batch_accuracies = []
valid_accuracies = []

# Train the network.
for i in range(NITER):
    batch = D.next_batch(BATCH_SIZE)
    batch_accuracies.append(n.getAccuracy(batch[0], batch[1]))
    # Once per epoch, record the mean training accuracy and evaluate on the
    # validation set one batch at a time.
    if i > 0 and i % (D.get_trainsize() / BATCH_SIZE) == 0:
        train_accuracies.append(sum(batch_accuracies) / len(batch_accuracies))
        for j in range(int(D.get_validsize() / BATCH_SIZE)):
            valid_batch = D.next_valid_batch(BATCH_SIZE)
            # The original fragment is truncated here; appending the batch
            # accuracy, as done for training batches above, is an assumption.
            valid_batch_accuracies.append(
                n.getAccuracy(valid_batch[0], valid_batch[1]))
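# The plotLC helper imported above is project-specific and not shown in this
# fragment. As a rough, assumed equivalent, the collected accuracy lists could
# be plotted with matplotlib like this (a sketch, not the project's code):
import matplotlib.pyplot as plt

def plot_learning_curves(train_acc, valid_acc):
    # One point per epoch for each curve.
    plt.plot(train_acc, label='train accuracy')
    plt.plot(valid_acc, label='validation accuracy')
    plt.xlabel('epoch')
    plt.ylabel('accuracy')
    plt.legend()
    plt.show()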
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold, train_test_split

import datasets
import load_data
import plot_utils


def main():
    # Get the Titanic train & test csv files as a DataFrame.
    ds = load_data.DataSet()
    ds.load_files("train.csv", "test.csv")
    full = ds.get_full()
    titanic = ds.get_titanic()

    '''
    # Get information on the data.
    print('Datasets:', 'full:', full.shape, 'titanic:', titanic.shape)
    print(titanic.describe())
    plot_utils.plot_correlation_map(titanic)
    plot_utils.plots(titanic)
    '''

    # Transform values into numerical types.
    # Transform Sex into binary values: 0 (female) and 1 (male).
    sex = pd.Series(np.where(full.Sex == 'male', 1, 0), name='Sex')
    # Create a new variable for every unique value of Embarked
    # (Embarked_C, Embarked_Q, Embarked_S).
    embarked = pd.get_dummies(full.Embarked, prefix='Embarked')
    # Create a new variable for every unique value of Pclass
    # (Pclass_1, Pclass_2, Pclass_3).
    pclass = pd.get_dummies(full.Pclass, prefix='Pclass')
    print(pclass.head())

    # Create the derived feature datasets.
    imputed = datasets.imputed(full)
    title = datasets.title(full)
    cabin = datasets.cabin(full)
    ticket = datasets.ticket(full)
    family = datasets.family(full)

    # Select which features/variables to include in the dataset from the list below:
    # imputed, embarked, pclass, sex, family, cabin, ticket
    full_X = pd.concat([imputed, embarked, cabin, sex], axis=1)

    # The first 891 rows of full_X are the labelled training data; the rest is
    # the unlabelled test set.
    train_valid_X = full_X[0:891]
    train_valid_y = titanic.Survived
    test_X = full_X[891:]
    train_X, valid_X, train_y, valid_y = train_test_split(
        train_valid_X, train_valid_y, train_size=.7)
    print(full_X.shape, train_X.shape, valid_X.shape,
          train_y.shape, valid_y.shape, test_X.shape)

    plot_utils.plot_variable_importance(train_X, train_y)

    model = RandomForestClassifier(n_estimators=100)
    # model = SVC()
    # model = GradientBoostingClassifier()
    # model = KNeighborsClassifier(n_neighbors=3)
    # model = GaussianNB()
    # model = LogisticRegression()
    model.fit(train_X, train_y)
    print(model.score(train_X, train_y), model.score(valid_X, valid_y))
    # plot_utils.plot_model_var_imp(model, train_X, train_y)

    # Recursive feature elimination with cross-validated feature selection.
    rfecv = RFECV(estimator=model, step=1,
                  cv=StratifiedKFold().split(train_X, train_y),
                  scoring='accuracy')
    rfecv.fit(train_X, train_y)

    # Predict on the test set and write the submission file.
    test_Y = model.predict(test_X)
    passenger_id = full[891:].PassengerId
    test = pd.DataFrame({'PassengerId': passenger_id, 'Survived': test_Y})
    test['Survived'] = test['Survived'].astype(int)
    test.to_csv('titanic_pred.csv', index=False)
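# The datasets.imputed helper used in main() is not shown here. A common
# approach for this Titanic dataset, offered only as an assumed sketch of what
# such a helper might do, is to fill the missing Age and Fare values with
# simple column statistics:
import pandas as pd

def imputed_sketch(full: pd.DataFrame) -> pd.DataFrame:
    out = pd.DataFrame()
    # Replace missing ages and fares with the column means.
    out['Age'] = full.Age.fillna(full.Age.mean())
    out['Fare'] = full.Fare.fillna(full.Fare.mean())
    return out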
def run_training(self):
    """Train MNIST for a number of steps."""
    # Get the sets of images and labels for training, validation, and
    # test on MNIST.
    inputfiles = os.listdir(self.train_dir)
    np.random.shuffle(inputfiles)
    trainfiles = inputfiles[:int(self.max_num_train * self.train_test_split)]
    testlist = inputfiles[int(self.max_num_train * self.train_test_split):
                          self.max_num_train]
    trainlist = trainfiles[:int(len(trainfiles) * self.train_val_ratio)]
    validationlist = trainfiles[int(len(trainfiles) * self.train_val_ratio):]

    trainset = load_data.DataSet(trainlist, self.train_dir)
    validationset = load_data.DataSet(validationlist, self.train_dir)
    testset = load_data.DataSet(testlist, self.train_dir)
    print('Train list: ', len(trainlist))
    print('Validation list: ', len(validationlist))
    print('Test list: ', len(testlist))

    with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(
            per_process_gpu_memory_fraction=0.7))) as sess:
        # Generate placeholders for the images and labels.
        images_placeholder, labels_placeholder = self.placeholder_inputs()

        # Build a Graph that computes predictions from the inference model.
        vgg = modvgg16.Vgg16()
        with tf.name_scope("content_vgg"):
            vgg.build(images_placeholder)
        logits = vgg.prob

        # Add to the Graph the Ops for loss calculation.
        print(logits.get_shape())
        print(labels_placeholder.get_shape())
        loss = self.loss(logits, labels_placeholder)

        # Add to the Graph the Ops that calculate and apply gradients.
        train_op = self.training(loss, self.learning_rate)

        # Add the Op to compare the logits to the labels during evaluation.
        eval_correct = self.evaluation(logits, labels_placeholder)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.summary.merge_all()

        # Add the variable initializer Op.
        init = tf.global_variables_initializer()

        # Create a saver for writing training checkpoints.
        saver = tf.train.Saver()

        # Instantiate a FileWriter to output summaries and the Graph.
        summary_writer = tf.summary.FileWriter('log', sess.graph)

        # And then after everything is built:
        # Run the Op to initialize the variables.
        sess.run(init)

        # Start the training loop.
        for step in range(self.n_epoch * self.batch_size):
            start_time = time.time()

            # Fill a feed dictionary with the actual set of images and labels
            # for this particular training step.
            feed_dict = self.fill_feed_dict(trainset, images_placeholder,
                                            labels_placeholder)

            # Run one step of the model.  The return values are the activations
            # from the `train_op` (which is discarded) and the `loss` Op.  To
            # inspect the values of your Ops or variables, you may include them
            # in the list passed to sess.run() and the value tensors will be
            # returned in the tuple from the call.
            _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)

            duration = time.time() - start_time

            # Write the summaries and print an overview fairly often.
            if step % self.batch_size == 0:
                # Print status to stdout.
                print('Step %d: loss = %.2f (%.3f sec)'
                      % (step, loss_value, duration))
                # Update the events file.
                summary_str = sess.run(summary_op, feed_dict=feed_dict)
                summary_writer.add_summary(summary_str, step)
                summary_writer.flush()

            # Save a checkpoint and evaluate the model periodically.
            if ((step + 1) % self.batch_size == 0
                    or (step + 1) == self.max_num_train):
                checkpoint_file = os.path.join('log', 'checkpoint')
                saver.save(sess, checkpoint_file, global_step=step)
                # Evaluate against the training set.
                print('Training Data Eval:')
                self.do_eval(sess, eval_correct, images_placeholder,
                             labels_placeholder, trainset)
                # Evaluate against the validation set.
                print('Validation Data Eval:')
                self.do_eval(sess, eval_correct, images_placeholder,
                             labels_placeholder, validationset)
                # Evaluate against the test set.
                print('Test Data Eval:')
                self.do_eval(sess, eval_correct, images_placeholder,
                             labels_placeholder, testset)
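# A small sketch, not part of the original code, showing how the checkpoints
# saved above could be restored later. It assumes the same graph has been
# rebuilt (so the Saver can match variable names) and that TensorFlow 1.x is
# imported as tf, as in the training code.
def restore_latest_checkpoint(saver, sess, log_dir='log'):
    # tf.train.latest_checkpoint returns the path of the newest checkpoint in
    # log_dir, or None if no checkpoint has been written yet.
    ckpt = tf.train.latest_checkpoint(log_dir)
    if ckpt is not None:
        saver.restore(sess, ckpt)
    return ckpt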