Example #1
        start_time = time.time()
        accuracy = sess.run(total_loss,
                            feed_dict={
                                model.inputs[0]: X_test,
                                targets: y_test
                            })
        print("Step: %d," % (step_value + 1), " Iteration: %2d," % step,
              " Cost: %.4f," % loss_value, " Accuracy: %.4f" % accuracy,
              " AvgTime: %3.2fms" % float(elapsed_time * 1000 / log_frequency))


if FLAGS.job_name == "ps":
    server.join()
elif FLAGS.job_name == "worker":

    X, y = load_data.load_datasets()

    # pre process
    X = pre_process(X)

    # one hot encode
    y, num_classes = one_hot_encode(y)

    # split dataset
    global X_train, X_test, y_train, y_test
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.20,
                                                        random_state=7)

    with tf.device("/job:worker/task:0"):
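The ps/worker branch above assumes that a tf.train.Server has already been created from a cluster description earlier in the script. A minimal sketch of that setup, using the TensorFlow 1.x API the example relies on (host addresses and flag defaults are assumptions):

import tensorflow as tf

# Hypothetical cluster layout; real host:port pairs come from the deployment.
cluster = tf.train.ClusterSpec({
    "ps": ["localhost:2222"],
    "worker": ["localhost:2223", "localhost:2224"],
})

tf.app.flags.DEFINE_string("job_name", "worker", "'ps' or 'worker'")
tf.app.flags.DEFINE_integer("task_index", 0, "index of the task within its job")
FLAGS = tf.app.flags.FLAGS

# One server per process: parameter servers block in server.join(),
# workers continue and place their graph on /job:worker/task:N as above.
server = tf.train.Server(cluster,
                         job_name=FLAGS.job_name,
                         task_index=FLAGS.task_index)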
Example #2
# fix random seed for reproducibility
seed = 2020
np.random.seed(seed)

if __name__ == "__main__":
    # pretrained
    model_base = model_define(modeltype=model_type, inputshape=inputshape)
    # print(model_base.summary())
    # fully connected layers for learning weights (fine-tune)
    modelUntrained = fine_tune(model_base, n_class)

    print(modelUntrained.summary())

    # load data
    X, y, num_classes = load_datasets(train_data)

    # split dataset
    X_train, X_val, y_train, y_val = train_test_split(X,
                                                      y,
                                                      test_size=0.30,
                                                      random_state=seed)

    print('Train Shape: {}'.format(X_train.shape))
    print('Valid Shape: {}'.format(X_val.shape))

    # Data Preprocessing and rescaling
    train_data_gen = ImageDataGenerator(
        rescale=1. / 255,
        featurewise_center=False,  # set input mean to 0 over the dataset
        samplewise_center=False,  # set each sample mean to 0
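`fine_tune` and `model_define` are project helpers that are not shown here; a minimal sketch of what a fine-tuning wrapper like this usually does with the Keras functional API, assuming model_base ends in a 4-D convolutional feature map (layer sizes and dropout rate are assumptions, not the project's values):

from tensorflow.keras.layers import Dense, Dropout, GlobalAveragePooling2D
from tensorflow.keras.models import Model


def fine_tune_sketch(model_base, n_class):
    # Freeze the pretrained convolutional base so only the new head is trained.
    for layer in model_base.layers:
        layer.trainable = False
    # Attach a small fully connected classification head.
    x = GlobalAveragePooling2D()(model_base.output)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.5)(x)
    outputs = Dense(n_class, activation='softmax')(x)
    return Model(inputs=model_base.input, outputs=outputs)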
Example #3
# important note before running this script: 
# 1) Put the correct pre-trained model name in utilities.py MODEL_NAME 
# 2) Put the correct TIMESTR to match the dictionaries generated for this model

from load_data import load_datasets
from utilities import preprocess_sentences, create_vocab, create_vocab_tags, prepare_input, prepare_tags, evaluate_on_model
import configparser
config = configparser.ConfigParser()
config.read('config.ini')

DATASET_INDEX = config['DEFAULT'].getint('DATASET_INDEX')  # this controls whether we want to load sentences from all datasets (set it to -1), or we want a specific dataset, identified by the index


# STEP 1: load test datasets
# ds_sentences[i] contains the list of sentences for dataset i, and ds_tags[i] the corresponding tags
ds_sentences, ds_tags = load_datasets(dataset_split='test', dataset_index=DATASET_INDEX)  # load the test data files; with DATASET_INDEX = -1 this loads the test split of every dataset

# STEP 2: pre-process the data
# replace digits, and lowercase the words across each dataset
for i, sentences in enumerate(ds_sentences): # iterate over each dataset
    sentences = sentences[:10000]  # only evaluate on the first 10k sentences, since training used 20k sentences
    ds_tags[i] = ds_tags[i][:10000]
    ds_sentences[i] = preprocess_sentences(sentences)

# STEP 5: For each dataset, convert each sentence from a list of words to list of indices,
# and pad it to be of max_len size, with each word being max_len_char size
print('Padding sentences and words. This will take some time...')

ds_X_word = [] 
ds_X_char = []
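The padding described in the last step (every sentence to max_len word indices, every word to max_len_char characters) is the kind of job pad_sequences handles; a small self-contained illustration, not the project's prepare_input implementation:

from tensorflow.keras.preprocessing.sequence import pad_sequences

# Illustrative word-index sequences for three short sentences.
sentences_as_indices = [[4, 9, 2], [7, 1], [3, 5, 8, 6]]
max_len = 5

# Pad (or truncate) every sentence to max_len, filling with index 0.
X_word = pad_sequences(sentences_as_indices, maxlen=max_len,
                       padding='post', value=0)
print(X_word.shape)  # (3, 5)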
Example #4
def training(repo, learning_rate, batch_size, filenames, option="qbc", record_repo=None, record_filename=None):

	print('LOAD DATA')
	(x_train, y_train), (x_valid, y_valid), (x_test, y_test) = load_datasets(repo, filenames)

	print('BUILD MODEL')
	train_f, valid_f, test_f, model, reinit = build_training(lr=learning_rate)

	n_train = len(y_train)//batch_size
	n_valid = len(y_valid)//batch_size
	n_test = len(y_test)//batch_size

	epochs = 2000
	best_valid = np.inf; best_train = np.inf; best_test=np.inf
	init_increment = 5 # 20 5 8 
	increment = 0
	n_train_batches=int (n_train*1./100.)
	state_of_train = {}
	state_of_train['TRAIN']=best_train; state_of_train['VALID']=best_valid; state_of_train['TEST']=best_test; 
	print('TRAINING IN PROGRESS')
	for epoch in range(epochs):

		try:
			for minibatch_index in range(n_train_batches):
				x_value = x_train[minibatch_index*batch_size:(minibatch_index+1)*batch_size]
				y_value = y_train[minibatch_index*batch_size:(minibatch_index+1)*batch_size].reshape((batch_size, 1))
				value = train_f(x_value, y_value)
				if np.isnan(value):
					import pdb
					pdb.set_trace()
			valid_cost=[]
			for minibatch_index in range(n_valid):
				x_value = x_valid[minibatch_index*batch_size:(minibatch_index+1)*batch_size].reshape((batch_size, 3, 32, 32))
				y_value = y_valid[minibatch_index*batch_size:(minibatch_index+1)*batch_size].reshape((batch_size, 1))
				value = test_f(x_value, y_value)
				valid_cost.append(value)

			# deciding when to stop the training on the sub-batch
			valid_result = np.mean(valid_cost)
			if valid_result <= best_valid*0.95:
				model.save_model()  # record the best architecture so that active learning can be applied to it (overfitting may appear within a few epochs)
				best_valid = valid_result
				# compute best_train and best_test
				train_cost=[]
				for minibatch_train in range(n_train_batches):
					x_value = x_train[minibatch_train*batch_size:(minibatch_train+1)*batch_size].reshape((batch_size, 3, 32, 32))
					y_value = y_train[minibatch_train*batch_size:(minibatch_train+1)*batch_size].reshape((batch_size, 1))
					train_cost.append(valid_f(x_value, y_value))
				test_cost=[]
				for minibatch_test in range(n_test):
					x_value = x_test[minibatch_test*batch_size:(minibatch_test+1)*batch_size].reshape((batch_size, 3, 32, 32))
					y_value = y_test[minibatch_test*batch_size:(minibatch_test+1)*batch_size].reshape((batch_size, 1))
					test_cost.append(test_f(x_value, y_value))
				best_train=np.mean(train_cost)
				best_test=np.mean(test_cost)
				increment = init_increment
			else:
				increment-=1

			if increment==0:
				# keep the best set of params found during training
				model.load_model()
				increment = init_increment
				record_state(n_train_batches, n_train, best_train, best_valid, best_test, record_repo, record_filename)
				# record in a file
				if state_of_train['VALID'] > best_valid :
					state_of_train['TRAIN']=best_train
					state_of_train['VALID']=best_valid
					state_of_train['TEST']=best_test;
				(x_train, y_train), n_train_batches = active_selection(model, x_train, y_train, n_train_batches, batch_size, valid_f, option)
				model.initialize()
				reinit()
				best_valid=np.inf; best_train=np.inf; best_test=np.inf

		except KeyboardInterrupt:
			# ask confirmation if you want to check state of training or really quit
			print('BEST STATE OF TRAINING ACHIEVED')
			print("RATIO :"+str(1.*n_train_batches/n_train*100))
			print("TRAIN : "+str(state_of_train['TRAIN']*100))
			print("VALID : "+str(state_of_train['VALID']*100))
			print("TEST : "+str(state_of_train['TEST']*100))
			import pdb
			pdb.set_trace()
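The increment counter above is a patience-style stopping rule: reset it whenever validation cost improves by at least 5%, otherwise count down, and at zero reload the best weights and hand the data over to the active-learning selection step. The same rule in isolation (function name and defaults are illustrative):

def patience_step(valid_cost, best_valid, patience, init_patience=5,
                  improvement=0.95):
    """One validation check: returns (new_best, new_patience, exhausted)."""
    if valid_cost <= best_valid * improvement:
        # Significant improvement: keep the new best and reset the counter.
        return valid_cost, init_patience, False
    patience -= 1
    # When patience reaches zero the caller reloads the best model,
    # records the state, and runs active_selection before restarting.
    return best_valid, patience, patience == 0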
Example #5
seed = 2020
np.random.seed(seed)
import pandas as pd
if __name__ == "__main__":

    # pretrained
    model_base = model_define(modeltype=model_type, inputshape=inputshape)
    # print(model_base.summary())

    # fully connected layers for learning weights (fine-tune)
    modelUntrained = fine_tune(model_base, n_class)
    # modelUntrained = model_base

    print(modelUntrained.summary())

    X, y, __ = load_datasets(train_data)

    # split dataset
    X_train, X_val, y_train, y_val = train_test_split(X,
                                                      y,
                                                      test_size=0.8,
                                                      random_state=seed)

    print('Train Shape: {}'.format(X_train.shape))
    print('Valid Shape: {}'.format(X_val.shape))

    # Data Preprocessing and rescaling
    train_data_gen = ImageDataGenerator(
        rescale=1. / 255,
        featurewise_center=False,  # set input mean to 0 over the dataset
        samplewise_center=False,  # set each sample mean to 0
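Both Keras examples break off inside the ImageDataGenerator call; the usual continuation is to compile the model and feed the generator to fit via flow. A sketch of that next step, reusing the names above (optimizer, loss, batch size, and epochs are assumptions):

from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Rescale-only generator for the validation split: no augmentation there.
val_data_gen = ImageDataGenerator(rescale=1. / 255)

batch_size = 32
modelUntrained.compile(optimizer='adam',
                       loss='categorical_crossentropy',
                       metrics=['accuracy'])
modelUntrained.fit(train_data_gen.flow(X_train, y_train, batch_size=batch_size),
                   validation_data=val_data_gen.flow(X_val, y_val,
                                                     batch_size=batch_size),
                   epochs=10)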
Example #6
 def test_all_datasets_train(self):
     ret_sentences, ret_labels = load_datasets()
     self.assertEqual(len(ret_sentences), len(DATASETS))
     self.assertEqual(len(ret_labels), len(DATASETS))
     self.assertTrue(len(ret_sentences[0]) > 0)
Example #7
 def test_single_dataset_test(self):
     ret_sentences, ret_labels = load_datasets(dataset_index=0,
                                               dataset_split='test')
     self.assertEqual(len(ret_sentences), 1)
     self.assertEqual(len(ret_labels), 1)
     self.assertTrue(len(ret_sentences[0]) > 0)
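The two test methods above are shown without their enclosing class; a self-contained version would look roughly as follows (importing DATASETS from load_data is an assumption based on the other examples):

import unittest

from load_data import load_datasets, DATASETS


class TestLoadDatasets(unittest.TestCase):

    def test_all_datasets_train(self):
        # Default call: training split of every dataset.
        ret_sentences, ret_labels = load_datasets()
        self.assertEqual(len(ret_sentences), len(DATASETS))
        self.assertEqual(len(ret_labels), len(DATASETS))
        self.assertTrue(len(ret_sentences[0]) > 0)

    def test_single_dataset_test(self):
        # Restrict to one dataset and the test split.
        ret_sentences, ret_labels = load_datasets(dataset_index=0,
                                                  dataset_split='test')
        self.assertEqual(len(ret_sentences), 1)
        self.assertEqual(len(ret_labels), 1)
        self.assertTrue(len(ret_sentences[0]) > 0)


if __name__ == '__main__':
    unittest.main()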
Example #8
# Set it to a specific index if you just want the model to be trained for that dataset.

from load_data import load_datasets
from utilities import preprocess_sentences, create_vocab, create_vocab_tags, prepare_input, prepare_tags, load_embedding_matrix, prepare_model, fit_model
import configparser
config = configparser.ConfigParser()
config.read('config.ini')

DATASET_INDEX = config['DEFAULT'].getint(
    'DATASET_INDEX'
)  # this controls whether we want to load sentences from all datasets (set it to -1), or we want a specific dataset, identified by the index

# STEP 1: load datasets
# ds_sentences[i] contains the list of sentences for dataset i, and ds_tags[i] the corresponding tags
ds_sentences, ds_tags = load_datasets(
    dataset_index=DATASET_INDEX
)  # by default, this will load the training datasets

# STEP 2: pre-process the data
# replace digits across each dataset
for i, sentences in enumerate(ds_sentences):  # iterate over each dataset
    sentences = sentences[:20000]
    ds_tags[i] = ds_tags[i][:20000]
    ds_sentences[i] = preprocess_sentences(sentences)

# STEP 3: create and save vocabulary dictionaries for words, and characters
# as well as the length of each sentence, and length of each word
consolidated_sen = []
for sentences in ds_sentences:
    consolidated_sen.extend(sentences)
create_vocab(
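The create_vocab call is cut off above; what such a helper typically produces is a word-to-index dictionary (with reserved indices for padding and unknown tokens) that is saved and reused at test time. An illustrative stand-in, not the project's implementation:

def build_word_vocab(sentences, pad_token='<PAD>', unk_token='<UNK>'):
    """Map every word seen in the tokenised sentences to a unique index."""
    word2idx = {pad_token: 0, unk_token: 1}
    for sentence in sentences:
        for word in sentence:
            if word not in word2idx:
                word2idx[word] = len(word2idx)
    return word2idx


# e.g. consolidated_sen would hold the tokenised sentences from all datasets
vocab = build_word_vocab([['the', 'cat', 'sat'], ['the', 'dog', 'ran']])
print(vocab['cat'])  # 2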
Example #9
def training(repo, learning_rate, batch_size, proportion, pickle_f="svhn.pkl"):
	#train, valid, test = build_datasets(repo)

	repo="/home/ducoffe/Documents/Code/datasets/svhn"
	#build_datasets(repo)
	filenames = {
		"x_train": "svhn_pickle_v3_x_train",
		"y_train": "svhn_pickle_v3_y_train",
		"x_test": "svhn_pickle_v3_x_test",
		"y_test": "svhn_pickle_v3_y_test",
	}
	train, valid, test = load_datasets(repo, filenames)
	x_train, y_train = train
	x_valid, y_valid = valid
	x_test, y_test = test

	import pdb
	pdb.set_trace()

	"""
	with closing(open( os.path.join(repo, pickle_f), 'r')) as f:
		(x_train, y_train), (x_valid, y_valid), (x_test, y_test) = pickle.load(f)

	mean = np.mean(x_train, axis=0)
	std = np.std(x_train, axis=0)
	x_train = (x_train - mean)/std
	x_valid = (x_valid - mean)/std
	x_test = (x_test - mean)/std

	# shuffle
	x_train,y_train = shuffle_data(x_train, y_train)
	"""

	"""
	n = len(y_train)
	m = (int) (n*proportion/100.)
	x_train = x_train[:m]; y_train= y_train[:m]
	"""
	#train_f, valid_f, test_f, model, reinit = build_training(lr=learning_rate)
	train_f, valid_f, test_f, model, obs = build_training(lr=learning_rate)
	n_train = len(y_train)//batch_size
	n_valid = len(y_valid)//batch_size
	n_test = len(y_test)//batch_size

	# shuffle data
	x = np.concatenate([x_train, x_valid], axis=0)
	y = np.concatenate([y_train, y_valid], axis=0)

	# shuffle
	x,y = shuffle_data(x, y)
	x_train, y_train = x[:n_train*batch_size], y[:n_train*batch_size]
	x_valid, y_valid = x[n_train*batch_size:], y[n_train*batch_size:]

	mean = np.mean(x_train, axis=0)
	std = np.std(x_train, axis=0)

	print(n_train, n_valid, n_test)
	print(n_train)

	epochs = 20
	best_cost = np.inf
	init_increment = 10
	increment = init_increment
	for epoch in range(epochs):
		for minibatch_index in range(n_train):
			x_value = x_train[minibatch_index*batch_size:(minibatch_index+1)*batch_size].reshape((batch_size, 3, 32, 32))
			y_value = y_train[minibatch_index*batch_size:(minibatch_index+1)*batch_size].reshape((batch_size, 1))
			value = train_f(x_value, y_value)
			if minibatch_index %50==0:
				valid_cost=[]
				for minibatch_valid in range(n_valid):
					x_value = x_valid[minibatch_valid*batch_size:(minibatch_valid+1)*batch_size].reshape((batch_size, 3, 32, 32))
					y_value = y_valid[minibatch_valid*batch_size:(minibatch_valid+1)*batch_size].reshape((batch_size, 1))
					valid_cost.append(test_f(x_value, y_value))
				valid_score = np.mean(valid_cost)*100
				#print 'ONGOIN :'+str(valid_score)
				if increment !=0:
					#print 'coco'
					if valid_score < best_cost*0.995:
						increment = init_increment
						best_cost = valid_score
						valid_error = []
						for minibatch_valid in range(n_valid):
							x_value = x_valid[minibatch_valid*batch_size:(minibatch_valid+1)*batch_size].reshape((batch_size, 3, 32, 32))
							y_value = y_valid[minibatch_valid*batch_size:(minibatch_valid+1)*batch_size].reshape((batch_size, 1))
							valid_error.append(test_f(x_value, y_value))
						test_error =[]
						for minibatch_valid in range(n_test):
							x_value = x_test[minibatch_valid*batch_size:(minibatch_valid+1)*batch_size].reshape((batch_size, 3, 32, 32))
							y_value = y_test[minibatch_valid*batch_size:(minibatch_valid+1)*batch_size].reshape((batch_size, 1))
							test_error.append(test_f(x_value, y_value))
						print "VALID : "+str(np.mean(valid_error)*100)
						print "TEST : "+str(np.mean(test_error)*100)
					else:
						increment -=1
				else:
					print('START AGAIN')
					train_f, valid_f, test_f,_, obs = build_training(lr=learning_rate*0.1, model=model)
					increment = init_increment
					break
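When the patience counter hits zero here, the script rebuilds the training functions with a ten-times smaller learning rate instead of stopping. The same reduce-on-plateau bookkeeping in a compact helper (the helper itself is illustrative; the thresholds follow the example):

def maybe_decay_learning_rate(valid_score, best_cost, increment, learning_rate,
                              init_increment=10, improvement=0.995, factor=0.1):
    """One validation check: track the best score, count down patience,
    and lower the learning rate once patience is exhausted."""
    if valid_score < best_cost * improvement:
        # Validation improved enough: reset patience, keep the current rate.
        return valid_score, init_increment, learning_rate, False
    if increment > 1:
        return best_cost, increment - 1, learning_rate, False
    # Patience exhausted: signal a restart with a smaller learning rate,
    # mirroring build_training(lr=learning_rate*0.1, model=model) above.
    return best_cost, init_increment, learning_rate * factor, True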