def weights(shape, name=None):
    # Weight variable, initialized from a standard normal distribution.
    return tf.Variable(tf.random_normal(shape), name=name)


def biases(shape, name=None):
    # Bias variable, also initialized from a standard normal distribution.
    return tf.Variable(tf.random_normal(shape), name=name)


TRAIN_FOLDER_PATH = "G:/DL/Iceberg Classifier Challenge/train/data/processed/train.json"
TEST_FOLDER_PATH = "G:/DL/Iceberg Classifier Challenge/test/data/processed/test.json"

dataset_train_features, dataset_train_labels = get_dataset_in_np(
    TRAIN_FOLDER_PATH, labels_available=True)
# dataset_test_features = get_dataset_in_np(TEST_FOLDER_PATH, labels_available=False)
dataset_train_features, dataset_test_features = normalize(
    dataset_train_features,
    dataset_train_features)  # TODO: change to dataset_test_features

print('dataset_train_features.shape:', dataset_train_features.shape,
      'dataset_train_labels.shape:', dataset_train_labels.shape)

DIMENSION = dataset_train_features.shape[1]
NUM_CHANNELS = dataset_train_features.shape[3]
NUM_CLASSES = dataset_train_labels.shape[1]
NUM_EXAMPLES = dataset_train_features.shape[0]
BATCH_SIZE = 16
NUM_EPOCHS = 100

x = tf.placeholder(tf.float32,
                   shape=[None, DIMENSION, DIMENSION, NUM_CHANNELS])
y = tf.placeholder(tf.float32, shape=[None, NUM_CLASSES])
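
# Hedged sketch (not part of the original snippet): one way the weights()/biases()
# helpers above could feed a first convolutional layer on the placeholders,
# assuming a 3x3 kernel and 32 output feature maps (standard TF 1.x ops).
W_conv1 = weights([3, 3, NUM_CHANNELS, 32], name="W_conv1")
b_conv1 = biases([32], name="b_conv1")
conv1 = tf.nn.relu(
    tf.nn.conv2d(x, W_conv1, strides=[1, 1, 1, 1], padding="SAME") + b_conv1)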
Example #2
            training_params.path_to_chinese_dataset)
        np.save(npy_name, chinese_dataset)
    if training_params.get_features:
        chinese_dataset = np.load(npy_name)
        data_extraction.extract_features(
            chinese_dataset, training_params.path_to_chinese_samples)
    X, Y = data_extraction.get_sample(training_params.path_to_chinese_samples)
'''Stratified k-fold cross-validation (here n_splits=10)'''
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=100)
for train, test in kfold.split(X, Y):
    train_x = X[train]
    train_y = Y[train]
    test_x = X[test]
    test_y = Y[test]

    train_x = data_preprocessing.normalize(train_x)
    test_x = data_preprocessing.normalize(test_x)

    timestep = 24
    train_x = data_preprocessing.pad_sequence(train_x, timestep)
    test_x = data_preprocessing.pad_sequence(test_x, timestep)
    train_y_cat = data_preprocessing.to_categorical(train_y)
    test_y_cat = data_preprocessing.to_categorical(test_y)

    model = training_models.train_lstm(train_x, train_y_cat, test_x,
                                       test_y_cat)
    scores = model.predict(test_x)
    prediction = np.array([
        training_params.available_emotions[np.argmax(t)] for t in scores
    ])  # np.argmax(t) returns the index of the maximum element
    print(prediction[prediction == test_y].shape[0] / test_y.shape[0])  # fraction of correct predictions
Example #3
def normalizing(df):
    """Thin wrapper around data_preprocessing.normalize."""
    return data_preprocessing.normalize(df)
Example #4
import greenhouse_clock
import data_sourcing
import data_splitting
import data_preprocessing
import feature_engineering
from modeling import model
import performance_monitoring

start_time = greenhouse_clock.get_time()

if __name__ == "__main__":

    # Run prefect flow
    df = data_sourcing.get()
    df = data_preprocessing.clean(df)
    df = data_preprocessing.normalize(df)

    train, valid, test = data_splitting.split(df)

    (
        train["x"],
        valid["x"],
        test["x"],
    ) = feature_engineering.numerical_missing_imputation(
        train=train,
        valid=valid,
        test=test,
        cols=["x"],
        imputation_method="median")

    m = model().fit(train=train, y_col="y", x_col="x")
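
    # (Hedged aside, not from the original: numerical_missing_imputation is
    #  project-specific, but a median imputation step like the call above
    #  typically computes the statistic on the training split only and then
    #  applies it to train/valid/test alike, e.g.
    #      median = train["x"].median()
    #      train["x"] = train["x"].fillna(median)
    #      valid["x"] = valid["x"].fillna(median)
    #      test["x"]  = test["x"].fillna(median)
    #  so that no information leaks from the validation or test data.)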
Example #5
def train_model(learn_model, num_epochs, batch_size, learning_rate):
	"""takes a some parameters, trains a specified model,
	   calculates accuracy during training and prints it out. Prints out
	   final accuracy on a test set as well"""

	#load data
	num_classes = 6
	tr_data = normalize(load_data('X_train.txt'))
	tr_labels = one_hot_encode_labels(load_data('y_train.txt'),num_classes)
	te_data = normalize(load_data('X_test.txt'))
	te_labels = one_hot_encode_labels(load_data('y_test.txt'),num_classes)

	#determine which learn method to run
	if learn_model =='logistic' or learn_model == '2-layer':

		if learn_model=='logistic':
			x,y_,model,train_op,accuracy = logistic_classifier(learning_rate)
		else:
			x,y_,model,train_op,accuracy = two_layer_net(learning_rate)

		with tf.Session() as sess:
			sess.run(model)

			count = 0
			#get sample length
			sample_length = tr_data.shape[0]
			#get number of batches
			num_batches = int(sample_length/batch_size)

			#make the containers to hold test and train accuracy
			train_accuracy = np.zeros([num_epochs*num_batches,1])
			test_accuracy = np.zeros([num_epochs,1])

			for epoch in range(num_epochs):
				
				#get shuffled index
				shuffled_indexes = np.arange(sample_length)
				np.random.shuffle(shuffled_indexes)
				#shuffle data
				shuffled_train_data = tr_data[shuffled_indexes]
				shuffled_train_labels = tr_labels[shuffled_indexes]

				for i in range(num_batches):
				
					#gather batches
					start = i*batch_size
					end = (i+1)*batch_size

					#clamp batch indices so we never read past the end of the array (the last batch may partially overlap the previous one when the dataset size is not a multiple of the batch size)
					if end > sample_length:
						end = sample_length
					if start > sample_length-batch_size:
						start = sample_length-batch_size

					batch_x,batch_y = shuffled_train_data[start:end][:],shuffled_train_labels[start:end][:]
					_,train_accuracy[count] = sess.run([train_op, accuracy], feed_dict={x:batch_x,y_:batch_y})

					#print the accuracy periodically (roughly 7 times per epoch)
					print_out = max(1, int(num_batches/7))
					if i%print_out==0:
						print("epoch%d, train_step %d, training accuracy %g"%(epoch, i, train_accuracy[count]))
				
					count +=1

				test_accuracy[epoch] = sess.run(accuracy, feed_dict={x: te_data, y_: te_labels})	
				print("epoch: %d, test accuracy: %g"%(epoch, test_accuracy[epoch]))
				print("--------------------------------------------------------------")

		#calculates average accuracy at five points
		
		#plots train accuracy
		train_line, = plt.plot(train_accuracy,'r.', label = 'train accuracy') 
		test_line, = plt.plot([e*num_batches for e in range(num_epochs)], test_accuracy,'b-', label = 'test accuracy')

		plt.legend(loc = 'lower right')
		plt.xlabel('Batch Number')
		plt.ylabel('Accuracy')
		plt.title('Prediction Accuracy vs Batch Number')
		#plt.legend(handles=[])
		plt.show()

	if learn_model =='knn':

		#get input from user
		raw_k_val = input("Please enter odd k value (recommend 7) or enter 0 to run a list of k values: ")

		if int(raw_k_val)==0:
			#ran multiple k's to decide which was best for this data set (Keep odd so that there are no ties)
			num_neighbours = [1,3,5,7,9,11,13,15]
			#num_neighbours = [1,3]
		else:
			num_neighbours = [int(raw_k_val)]

		#will contain the accuracy for each value of k
		train_accuracy = np.zeros((len(num_neighbours)))

		for index, k in enumerate(num_neighbours):
			#retrieve object for graph_constructor
			x,y_,xtest, ytest,accuracy, model,train_op = knn(k)
			with tf.Session() as sess:
				sess.run(model)
				# loop over test data
				for i in range(len(te_data)):
					# Get nearest neighbor to each row of test data which represents one multi dimensional data point
					train_accuracy[index] += sess.run(accuracy, feed_dict={x: tr_data, y_: tr_labels, xtest: te_data[i, :], ytest: te_labels[i]})

					if i%200==0:
						print(str(i) + ' out of ' + str(len(te_data)) + ' have been tested')

			print("k = {}, Accuracy: {} ".format(k, train_accuracy[index]/len(te_data)))

		#only plot if there is more than one k value
		if int(raw_k_val) == 0: 
			plt.plot(num_neighbours, train_accuracy/len(te_data), 'ro')
			plt.xlabel('K - value')
			plt.ylabel('Accuracy')
			plt.title('Prediction Accuracy vs K Value')
			plt.show()
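
# Hedged sketch, not the project's knn(): one plausible TF 1.x construction that
# matches the loop above -- the accuracy op scores a single test point as 1.0/0.0,
# so summing over the test set and dividing by its length yields overall accuracy.
# The feature/class counts are parameters here; the original presumably hard-codes its own.
def knn_sketch(k, num_features, num_classes=6):
    x = tf.placeholder(tf.float32, [None, num_features])   # all training points
    y_ = tf.placeholder(tf.float32, [None, num_classes])   # their one-hot labels
    xtest = tf.placeholder(tf.float32, [num_features])     # a single test point
    ytest = tf.placeholder(tf.float32, [num_classes])      # its one-hot label
    distance = tf.reduce_sum(tf.abs(x - xtest), axis=1)    # L1 distance to every training point
    _, idx = tf.nn.top_k(-distance, k=k)                   # indices of the k nearest neighbours
    votes = tf.reduce_sum(tf.gather(y_, idx), axis=0)      # sum the neighbours' one-hot labels
    prediction = tf.argmax(votes)                          # majority vote
    accuracy = tf.cast(tf.equal(prediction, tf.argmax(ytest)), tf.float32)
    model = tf.global_variables_initializer()              # nothing to train for k-NN
    train_op = tf.no_op()
    return x, y_, xtest, ytest, accuracy, model, train_op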
Example #6
X_test = X_test.transpose((3, 0, 1, 2))

# EXPECTED IMAGE SIZE
img_height = 32
img_width = 32

# PROCESSING THE DATA AND HAVING A LOOK AT AN EXAMPLE
rand_int = np.random.randint(X_train.shape[0])
rand_img = X_train[rand_int, :, :, :]
plt.imshow(rand_img)
plt.show()

method = 'grey_scale'
X_train = dp.prepocess(X_train, method=method)
X_test = dp.prepocess(X_test, method=method)
X_train, X_test = dp.normalize(X_train, X_test)
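# (Hedged aside, not from the original: a two-argument dp.normalize(train, test)
#  commonly fits its statistics on the training set only and applies them to both
#  splits, roughly
#      mean, std = X_train.mean(), X_train.std()
#      X_train, X_test = (X_train - mean) / std, (X_test - mean) / std
#  the real dp.normalize may differ.)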

if X_train.shape[3] == 3:
    rand_img = X_train[rand_int, :, :, :] - np.amin(X_train[rand_int, :, :, :])
    rand_img = rand_img / np.amax(rand_img)
else:
    rand_img = X_train[rand_int, :, :, 0]
plt.imshow(rand_img)
plt.show()

###############################################################################

# BUILDING TENSORFLOW MODEL
tf.reset_default_graph()
model = gc.CNN(img_height=img_height,
               img_width=img_width,
Example #7
def train_model(num_epochs, batch_size):
    """Trains model, outputs train and validation accuracy during training, has a early stopping
	   if validation error increases, outputs test accuracy on unseen dataset.

	   params:

	   :num_epochs: This is the number of cycles the model should be trained on

	   :batch_size: Batch training is being used to perform optimization. Therefore the size of the batch
	   must be specified. It cannot be too large else there will be an out of memory error. Generally 
	   20-50 is a good size for the current setup

	   return: nothing.

	   """

    #load data
    num_classes = 10
    images_train, labels_train = load_data('train_32x32.mat')
    tr_data = normalize(images_train)
    tr_labels = one_hot_encode_labels(labels_train, num_classes)

    images_test, labels_test = load_data('test_32x32.mat')
    te_data = normalize(images_test)
    te_labels = one_hot_encode_labels(labels_test, num_classes)

    #split the test set: the first num_feat_val_set examples form the validation set, the rest are held out for final testing
    num_feat_val_set = 500
    val_data, val_labels = te_data[:num_feat_val_set], te_labels[:num_feat_val_set]
    testing_data_and_labels = generate_data(te_data[num_feat_val_set:],
                                            te_labels[num_feat_val_set:])

    x, y_, model, train_op, accuracy, keep_prob = classifier()

    #saver for persisting the trained model
    saver = tf.train.Saver()

    #variable used to determine if to exit training phase due to overfitting
    exit = False

    #start interactive matplotlib session
    plt.ion()

    with tf.Session() as sess:
        sess.run(model)

        count = 0
        #training
        #get sample length
        sample_length = tr_data.shape[0]
        #get number of batches
        num_batches = int(sample_length / batch_size)

        #make the containers to hold test and train accuracy
        train_accuracy = np.zeros([num_epochs * num_batches, 1])
        val_accuracy = np.zeros([num_epochs * num_batches, 1])
        test_accuracy = 0

        for epoch in range(num_epochs):

            #get shuffled index
            shuffled_indexes = np.arange(sample_length)
            np.random.shuffle(shuffled_indexes)
            #shuffle data
            shuffled_train_data = tr_data[shuffled_indexes]
            shuffled_train_labels = tr_labels[shuffled_indexes]

            for i in range(num_batches):
                #gather batches
                start = i * batch_size
                end = (i + 1) * batch_size

                #clamp batch indices so we never read past the end of the array (the last batch may partially overlap the previous one when the dataset size is not a multiple of the batch size)
                if end > sample_length:
                    end = sample_length
                if start > sample_length - batch_size:
                    start = sample_length - batch_size

                batch_x, batch_y = (shuffled_train_data[start:end],
                                    shuffled_train_labels[start:end])

                _, train_accuracy[count] = sess.run([train_op, accuracy],
                                                    feed_dict={
                                                        x: batch_x,
                                                        y_: batch_y,
                                                        keep_prob: 0.5
                                                    })

                val_accuracy[count] = sess.run(accuracy,
                                               feed_dict={
                                                   x: val_data,
                                                   y_: val_labels,
                                                   keep_prob: 1.0
                                               })

                #print the accuracy periodically (roughly 7 times per epoch)
                print_out = max(1, int(num_batches / 7))
                if i % print_out == 0:
                    print("epoch: %d, train_step %d, training accuracy %g" %
                          (epoch, i, train_accuracy[count]))
                    print("epoch: %d, test accuracy: %g" %
                          (epoch, val_accuracy[count]))

                    #plotting
                    train_line, = plt.plot(count,
                                           train_accuracy[count],
                                           'bo',
                                           label='train_accuracy')
                    val_line, = plt.plot(count,
                                         val_accuracy[count],
                                         'ro',
                                         label='validation_accuracy')
                    plt.xlabel('Batch Number')
                    plt.ylabel('Accuracy')
                    plt.title('Validation & train accuracy vs Batch Number')
                    plt.pause(0.05)

                #stop training if the validation accuracy drops noticeably (early stopping)
                if val_accuracy[count] < (val_accuracy[count - 1] - 0.1):
                    exit = True
                    break

                count += 1

            #stop training if the validation accuracy dropped (early stopping)
            if exit:
                print("Overfitting occuring: exiting training phase")
                break
            print(
                "--------------------------------------------------------------"
            )

        #saves model
        saver_path = saver.save(sess, './trained_model.ckpt')
        print("Saved model: ", saver_path)

        #calculate test accuracy on unseen test set
        for _ in range(te_data.shape[0] - num_feat_val_set):
            batch = next(testing_data_and_labels)
            test_accuracy += sess.run(accuracy,
                                      feed_dict={
                                          x:
                                          np.reshape(batch[0], (1, 32, 32, 3)),
                                          y_: np.reshape(batch[1], (1, 10)),
                                          keep_prob: 1.0
                                      })

    print("Testing Accuracy: %g" %
          (float(test_accuracy) / float(te_data.shape[0] - num_feat_val_set)))
Example #8
def train_model(num_epochs, batch_size, learning_rate):
    """Trains a neural network for image classification from the SVHN
    dataset, and creates a plot giving training/testing accuracy as a
    function of batch number.

    Parameters:
    num_epochs -- Number of training epochs
    batch_size -- Number of examples in each training batch
    learning_rate -- Initial learning rate
    """
    # Get data:
    train_X_orig, train_y_orig, _, _ = data_preprocessing.load_data()
    train_X_norm = data_preprocessing.normalize(train_X_orig)
    train_X, valid_X, train_y, valid_y = data_preprocessing.split(
        train_X_norm, train_y_orig)
    # One-hot encode so they can be used for input/validation:
    train_y_cat = keras.utils.to_categorical(train_y, num_classes=10)
    valid_y_cat = keras.utils.to_categorical(valid_y, num_classes=10)
    
    # Build & compile model:
    model = graph_construction.get_keras_model()
    sgd = SGD(lr=learning_rate, decay=1e-5, momentum=0.9, nesterov=True)
    model.compile(loss='categorical_crossentropy',
                  optimizer=sgd,
                  metrics=['accuracy'])

    # Train:
    history = BatchHistory(valid_X, valid_y_cat)    
    model.fit(train_X,
              train_y_cat,
              epochs=num_epochs,
              batch_size=batch_size,
              callbacks=[history],
              validation_data=(valid_X, valid_y_cat))
    # The callback slows things down a bit, but I'm not sure of a good
    # way around it.  If I were testing only on specific batches of
    # validation data, it might be less of an issue.

    fname_base = "model_{0}_{1}_{2}_rgb".format(learning_rate, num_epochs,
                                            batch_size)
    model.save_weights("{0}.h5".format(fname_base))

    # Plot training & validation accuracy, and loss (not called for,
    # but useful):
    b = [i["batch"] for i in history.history]
    plt.plot(b, [i["acc"] for i in history.history])
    plt.plot(b, [i["val_acc"] for i in history.history])
    plt.ylabel('Accuracy')
    plt.xlabel('Batch')
    plt.legend(['Training', 'Validation'], loc='lower right')
    plt.savefig("{0}_accuracy.png".format(fname_base))
    plt.show()
    plt.close()
    
    plt.plot(b, [i["loss"] for i in history.history])
    plt.plot(b, [i["val_loss"] for i in history.history])
    plt.ylabel('Loss (categorical cross-entropy)')
    plt.xlabel('Batch')
    plt.legend(['Training', 'Validation'], loc='lower right')
    plt.savefig("{0}_loss.png".format(fname_base))
    plt.close()
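
# Hedged sketch, not part of the original: BatchHistory above is project-specific,
# but a callback of roughly this shape would produce the per-batch "history"
# entries plotted earlier, and would also explain the slowdown mentioned in the
# comment after model.fit, since it evaluates the full validation set after every
# batch. Assumes `keras` is already imported (it is used via keras.utils above).
class BatchHistory(keras.callbacks.Callback):
    def __init__(self, valid_X, valid_y_cat):
        super(BatchHistory, self).__init__()
        self.valid_X = valid_X
        self.valid_y_cat = valid_y_cat
        self.history = []

    def on_batch_end(self, batch, logs=None):
        logs = logs or {}
        # Evaluate the whole validation set after every batch (slow but simple).
        val_loss, val_acc = self.model.evaluate(self.valid_X, self.valid_y_cat,
                                                verbose=0)
        self.history.append({"batch": len(self.history),
                             "acc": logs.get("acc"),
                             "loss": logs.get("loss"),
                             "val_acc": val_acc,
                             "val_loss": val_loss})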