Example #1
    def train(self):
        with tf.name_scope('dataset'):
            ds = load_data.DataSet('data/gene_dict_clean_lower.txt',
                                   self.max_length, self.batch_size)
            train_dataset, test_dataset = ds.load_dict_data()
            iterator = train_dataset.make_initializable_iterator()

        with tf.name_scope('embedding'):
            word_vocab_list, embedding_matrix = (
                load_data.load_word_embedding_for_dict_file(
                    'data/gene_dict_clean_lower.txt',
                    'data/word_embedding.txt'))
            # Set up a TensorFlow lookup table from string word to unique
            # integer id.
            index_table = tf.contrib.lookup.index_table_from_tensor(
                word_vocab_list, num_oov_buckets=1, default_value=-1)
            self.vocab_size = len(word_vocab_list)

        # Build the graph once; get_next() is an op that yields a fresh batch
        # every time it is evaluated inside sess.run().
        batch_data = index_table.lookup(iterator.get_next())
        # Decoder length: last position where token id 2 appears in each row,
        # plus one.
        decoder_length = tf.where(tf.equal(batch_data, 2))[:, -1] + 1
        embedded_batch_data = tf.nn.embedding_lookup(embedding_matrix,
                                                     batch_data)
        one_hot_batch_data = tf.one_hot(batch_data, self.vocab_size)
        self.graph(batch_data, embedded_batch_data, one_hot_batch_data,
                   decoder_length)

        with tf.Session() as sess:
            sess.run(tf.tables_initializer())
            sess.run(tf.global_variables_initializer())
            tf.summary.FileWriter('graphs/test', sess.graph)

            for i in range(NUM_EPOCHS):
                sess.run(iterator.initializer)
                while True:
                    try:
                        # Evaluate the training op and the loss on the same
                        # batch.
                        a, b = sess.run([self.update, self.loss])
                        print(a)
                        print(b)
                    except tf.errors.OutOfRangeError:
                        break
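The snippet assumes a project-local `load_data` module that is not shown. A minimal sketch of what `load_word_embedding_for_dict_file` might return, a vocabulary list plus a NumPy embedding matrix with one extra row for the OOV bucket used above, is given below; the file formats and the helper body are assumptions, not part of the original code.

import numpy as np


def load_word_embedding_for_dict_file(dict_file, embedding_file):
    """Hypothetical helper: collect the vocabulary of dict_file (assumed to be
    whitespace-tokenized, one entry per line) and build an embedding matrix
    from a word2vec-style text file (word followed by float components)."""
    vocab, seen = [], set()
    with open(dict_file) as f:
        for line in f:
            for tok in line.split():
                if tok not in seen:
                    seen.add(tok)
                    vocab.append(tok)
    vectors = {}
    with open(embedding_file) as f:
        for line in f:
            parts = line.rstrip().split()
            vectors[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
    dim = len(next(iter(vectors.values())))
    # One extra zero row so the single OOV bucket has an embedding as well.
    matrix = np.zeros((len(vocab) + 1, dim), dtype=np.float32)
    for i, word in enumerate(vocab):
        if word in vectors:
            matrix[i] = vectors[word]
    return vocab, matrix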
Example #2
import numpy as np

import load_data as ld
from net import Network
from plots import plotLC

ALPHA = 1e-2
BATCH_SIZE = 20
NITER = 12000

# load test data
test = np.load('test.npy')
test_labels = ld.one_hot(np.load('test_labels.npy'))

# initialize network
n = Network(ALPHA)
# load training and validation datasets
D = ld.DataSet()

train_accuracies = []
batch_accuracies = []
valid_batch_accuracies = []
valid_accuracies = []

# train the network
for i in range(NITER):
    batch = D.next_batch(BATCH_SIZE)
    batch_accuracies.append(n.getAccuracy(batch[0], batch[1]))
    # Once per pass over the training set, record the mean training accuracy
    # and evaluate on the validation set.
    if i > 0 and i % (D.get_trainsize() // BATCH_SIZE) == 0:
        train_accuracies.append(sum(batch_accuracies) / len(batch_accuracies))
        for j in range(D.get_validsize() // BATCH_SIZE):
            valid_batch = D.next_valid_batch(BATCH_SIZE)
            valid_batch_accuracies.append(
                n.getAccuracy(valid_batch[0], valid_batch[1]))
        valid_accuracies.append(
            sum(valid_batch_accuracies) / len(valid_batch_accuracies))
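The `load_data` module used here is also not shown. As an illustration, `ld.one_hot` could be implemented roughly as below, assuming the labels are a 1-D array of integer class ids; the body is an assumption, only the function name appears in the example.

import numpy as np


def one_hot(labels, num_classes=None):
    # Hypothetical implementation: turn integer class labels into a
    # (num_samples, num_classes) one-hot matrix.
    labels = np.asarray(labels, dtype=np.int64)
    if num_classes is None:
        num_classes = int(labels.max()) + 1
    encoded = np.zeros((len(labels), num_classes), dtype=np.float32)
    encoded[np.arange(len(labels)), labels] = 1.0
    return encoded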
Example #3
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold, train_test_split

import datasets
import load_data
import plot_utils


def main():
    # get titanic & test csv files as a DataFrame
    ds = load_data.DataSet()
    ds.load_files("train.csv", "test.csv")
    full = ds.get_full()
    titanic = ds.get_titanic()
    '''
    #get information on data
    print('Datasets:', 'full:', full.shape, 'titanic:', titanic.shape)
    print(titanic.describe())
    plot_utils.plot_correlation_map(titanic)
    plot_utils.plots(titanic)
    '''

    # Transform values into numerical types

    # Transform Sex into binary values 0 (female) and 1 (male)
    sex = pd.Series(np.where(full.Sex == 'male', 1, 0), name='Sex')

    # Create a new variable for every unique value of Embarked (Embarked_C, Embarked_Q, Embarked_S)
    embarked = pd.get_dummies(full.Embarked, prefix='Embarked')

    # Create a new variable for every unique value of Pclass (Pclass_1, Pclass_2, Pclass_3)
    pclass = pd.get_dummies(full.Pclass, prefix='Pclass')
    print(pclass.head())

    # Create dataset
    imputed = datasets.imputed(full)
    title = datasets.title(full)
    title.head()

    cabin = datasets.cabin(full)
    cabin.head()

    ticket = datasets.ticket(full)
    ticket.head()

    family = datasets.family(full)
    family.head()

    # Select which features/variables to include in the dataset from the list below:
    # imputed , embarked , pclass , sex , family , cabin , ticket
    full_X = pd.concat([imputed, embarked, cabin, sex], axis=1)
    full_X.head()

    train_valid_X = full_X[0:891]
    train_valid_y = titanic.Survived
    test_X = full_X[891:]
    train_X, valid_X, train_y, valid_y = train_test_split(train_valid_X,
                                                          train_valid_y,
                                                          train_size=.7)

    print(full_X.shape, train_X.shape, valid_X.shape, train_y.shape,
          valid_y.shape, test_X.shape)

    plot_utils.plot_variable_importance(train_X, train_y)

    model = RandomForestClassifier(n_estimators=100)
    # model = SVC()
    # model = GradientBoostingClassifier()
    # model = KNeighborsClassifier(n_neighbors=3)
    # model = GaussianNB()
    # model = LogisticRegression()

    model.fit(train_X, train_y)

    print(model.score(train_X, train_y), model.score(valid_X, valid_y))
    # plot_utils.plot_model_var_imp(model, train_X, train_y)

    # Recursive feature elimination with cross-validation, to rank features.
    rfecv = RFECV(estimator=model,
                  step=1,
                  cv=StratifiedKFold().split(train_X, train_y),
                  scoring='accuracy')
    rfecv.fit(train_X, train_y)

    test_Y = model.predict(test_X)
    passenger_id = full[891:].PassengerId

    test = pd.DataFrame({'PassengerId': passenger_id, 'Survived': test_Y})
    test['Survived'] = test['Survived'].astype(int)

    # test.head()
    test.to_csv('titanic_pred.csv', index=False)
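The feature builders (`datasets.imputed`, `datasets.title`, `datasets.cabin`, ...) live in a project-local module that is not shown. As a sketch of the pattern, `datasets.imputed` might fill the missing numeric Titanic columns with medians, along these lines; the column choices and the body are assumptions.

import pandas as pd


def imputed(full):
    # Hypothetical version of datasets.imputed: median-fill the numeric
    # columns that have missing values in the combined Titanic frame.
    out = pd.DataFrame()
    out['Age'] = full.Age.fillna(full.Age.median())
    out['Fare'] = full.Fare.fillna(full.Fare.median())
    return out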
Example #4
    def run_training(self):
        """Train MNIST for a number of steps."""
        # Get the sets of images and labels for training, validation, and
        # test on MNIST.
        inputfiles = os.listdir(self.train_dir)
        np.random.shuffle(inputfiles)
        trainfiles = inputfiles[:int(self.max_num_train *
                                     self.train_test_split)]
        testlist = inputfiles[int(self.max_num_train *
                                  self.train_test_split):self.max_num_train]
        trainlist = trainfiles[:int(len(trainfiles) * self.train_val_ratio)]
        validationlist = trainfiles[
            int(len(trainfiles) * self.train_val_ratio):]
        trainset = load_data.DataSet(trainlist, self.train_dir)
        validationset = load_data.DataSet(validationlist, self.train_dir)
        testset = load_data.DataSet(testlist, self.train_dir)
        print('Train list: ', len(trainlist))
        print('Validation list: ', len(validationlist))
        print('Test list: ', len(testlist))

        with tf.Session(config=tf.ConfigProto(gpu_options=(tf.GPUOptions(
                per_process_gpu_memory_fraction=0.7)))) as sess:
            # Generate placeholders for the images and labels.
            images_placeholder, labels_placeholder = self.placeholder_inputs()
            # Build a Graph that computes predictions from the inference model.
            vgg = modvgg16.Vgg16()
            with tf.name_scope("content_vgg"):
                vgg.build(images_placeholder)
            #init_op = tf.initialize_all_variables()
            #sess.run(init_op)
            #logits = sess.run(vgg.prob, feed_dict=feed_dict)
            logits = vgg.prob
            # Add to the Graph the Ops for loss calculation.
            print(logits.get_shape())
            print(labels_placeholder.get_shape())
            loss = self.loss(logits, labels_placeholder)

            # Add to the Graph the Ops that calculate and apply gradients.
            train_op = self.training(loss, self.learning_rate)

            # Add the Op to compare the logits to the labels during evaluation.
            eval_correct = self.evaluation(logits, labels_placeholder)

            # Build the summary operation based on the TF collection of Summaries.
            summary_op = tf.summary.merge_all()

            # Add the variable initializer Op.
            init = tf.global_variables_initializer()

            # Create a saver for writing training checkpoints.
            saver = tf.train.Saver()

            # Instantiate a FileWriter to output summaries and the Graph.
            summary_writer = tf.summary.FileWriter('log', sess.graph)

            # And then after everything is built:

            # Run the Op to initialize the variables.
            sess.run(init)

            # Start the training loop.
            for step in range(self.n_epoch * self.batch_size):
                start_time = time.time()

                # Fill a feed dictionary with the actual set of images and labels
                # for this particular training step.
                feed_dict = self.fill_feed_dict(trainset, images_placeholder,
                                                labels_placeholder)

                # Run one step of the model.  The return values are the activations
                # from the `train_op` (which is discarded) and the `loss` Op.  To
                # inspect the values of your Ops or variables, you may include them
                # in the list passed to sess.run() and the value tensors will be
                # returned in the tuple from the call.
                _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)

                duration = time.time() - start_time

                # Write the summaries and print an overview fairly often.
                if step % self.batch_size == 0:
                    # Print status to stdout.
                    print('Step %d: loss = %.2f (%.3f sec)' %
                          (step, loss_value, duration))
                    # Update the events file.
                    summary_str = sess.run(summary_op, feed_dict=feed_dict)
                    summary_writer.add_summary(summary_str, step)
                    summary_writer.flush()

                # Save a checkpoint and evaluate the model periodically.
                if (step + 1) % self.batch_size == 0 or (
                        step + 1) == self.max_num_train:
                    checkpoint_file = os.path.join('log', 'checkpoint')
                    saver.save(sess, checkpoint_file, global_step=step)
                    # Evaluate against the training set.
                    print('Training Data Eval:')
                    self.do_eval(sess, eval_correct, images_placeholder,
                                 labels_placeholder, trainset)
                    # Evaluate against the validation set.
                    print('Validation Data Eval:')
                    self.do_eval(sess, eval_correct, images_placeholder,
                                 labels_placeholder, validationset)
                    # Evaluate against the test set.
                    print('Test Data Eval:')
                    self.do_eval(sess, eval_correct, images_placeholder,
                                 labels_placeholder, testset)
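`fill_feed_dict` and `do_eval` are referenced but not shown; they follow the classic TensorFlow MNIST-tutorial structure. A minimal sketch of `fill_feed_dict`, assuming the `load_data.DataSet` objects expose a `next_batch(batch_size)` method returning `(images, labels)`, could look like this; the method name on DataSet is an assumption.

    def fill_feed_dict(self, data_set, images_pl, labels_pl):
        # Pull the next batch from the (assumed) DataSet interface and map it
        # onto the placeholders created in run_training().
        images_feed, labels_feed = data_set.next_batch(self.batch_size)
        return {images_pl: images_feed, labels_pl: labels_feed}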