def createDataMapping(self, data): dataMapping = saveDatamapping(self.id, data) rawFilename = getRawFilename(self.id) allData, headers = toLookup(rawFilename) #Split data into train/validation/test trainData, validationData = featuremapping.splitData(allData, split=0.99) #Save the datasets experimentDir = getCreateExperimentDir(self.id) trainDataFilename = "{0}/traindata.pkl".format(experimentDir) save(trainDataFilename, trainData, validationData) #Build the mapper based on the training data. itemMapper = buildMapper(trainData, headers, dataMapping) itemMapperFilename = "{0}/mapper.pkl".format(experimentDir) save(itemMapperFilename, itemMapper, useCloud=True) result = {"mapping":dataMapping, "inputDimension":itemMapper.dimension, "outputDimension":itemMapper.range} return result
def runExperiment(experimentId, runArgs): type = runArgs["type"] if type == "create_datamapping": return saveDatamapping(experimentId, runArgs["data"]) elif type == "train": return train(experimentId, runArgs["data"]) # Load stored experiment data. dataMapping = loadDatamapping(experimentId) data, headers = getDataFile(experimentId) #Split data into train/validation/test trainData, testData = featuremapping.splitData(data, split=0.99) trainData, validationData = featuremapping.splitData(trainData, split=0.99) #Build the mapper based on the training data. itemMapper = buildMapper(trainData, headers, dataMapping) print("Beginning mapping of {0} samples".format(len(trainData))) mappedTrainX, mappedTrainY = itemMapper.map(trainData) mappedValidationX, mappedValidationY = itemMapper.map(validationData) #Normalize #mu = np.mean(mappedTrainX, axis=0) #sdev = np.std(mappedTrainX, axis=0) + 1e-5 #mappedTrainX = (mappedTrainX - mu) / sdev #mappedValidationX = (mappedValidationX - mu) / sdev # Create Theano shared data train_x = theano.shared(mappedTrainX, borrow=True) train_y = T.cast(theano.shared(mappedTrainY, borrow=True), 'int32') validation_x = theano.shared(mappedValidationX, borrow=True) validation_y = T.cast(theano.shared(mappedValidationY, borrow=True), 'int32') rng = np.random.RandomState(1234) # allocate symbolic variables for the data x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels # the cost we minimize during training is the negative log likelihood of # the model in symbolic format input_dimension = itemMapper.dimension output_dimension = itemMapper.range classifier = nnlayer.MLPReg(rng=rng, input=x, topology=[(input_dimension,), (100, nnlayer.ReluLayer), (output_dimension, nnlayer.LogisticRegressionLayer)]) cost = classifier.cost(y) + 0.0001*classifier.L2_sqr costParams = [] costParams.extend(classifier.params) costFunction = (costParams, cost) cum_dim = 0 for p in classifier.params: cum_dim += p.get_value(borrow=True).size print("Model dimension: {0}".format(cum_dim)) # Create validation function. valid_func = theano.function(inputs = [], outputs = [classifier.cost(y)], givens = {x:validation_x, y:validation_y}) # Create trainer tt = MLPBatchTrainer() variableAndData = (VariableAndData(x, train_x), VariableAndData(y, train_y, size=len(trainData))) epochFunction, stateMananger = tt.getEpochTrainer(costFunction, variableAndData, batch_size=64, rms = True) # Train with adaptive learning rate. stats = tt.trainALR(epochFunction, valid_func, initial_learning_rate=0.01, epochs=2, convergence_criteria=0.0001, max_runs=10, state_manager = stateMananger) validation_scores = [item["validation_score"] for item in stats] train_scorees = [item["training_costs"][-1] for item in stats] plt.plot(validation_scores, 'g') plt.plot(train_scorees, 'r') plt.show() mappedTestX, mappedTestY = itemMapper.map(testData) #Normalize #mappedTestX = (mappedTestX - mu)/sdev # Create Theano shared data test_x = theano.shared(mappedTestX, borrow=True) test_y = T.cast(theano.shared(mappedTestY, borrow=True), 'int32') # Setup test function batch_size=1 index = T.lscalar() # index to a [mini]batch test_model = theano.function(inputs=[index], outputs=(classifier.errors(y), classifier.y_pred), givens={ x: test_x[index * batch_size: (index + 1) * batch_size], y: test_y[index * batch_size: (index + 1) * batch_size]}) n_test_batch = int(test_x.get_value(borrow=True).shape[0] / batch_size) errorVector = [test_model(i) for i in range(n_test_batch)] #print("Avg. error {0}".format(np.average(errorVector))) errCount = 0 for i in range(len(errorVector)): if errorVector[i][0] > 0.0: errCount += 1 print("Error: {0}, Predicted:{1}".format(testData[i], itemMapper.labelMapper.inverseMap(int(errorVector[i][1])))) print("Avg: {0}".format(errCount / len(errorVector)))
def train(experimentId, trainingArgs): # Load stored experiment data. dataMapping = loadDatamapping(experimentId) data, headers = getDataFile(experimentId) #Split data into train/validation/test trainData, testData = featuremapping.splitData(data, split=0.99) trainData, validationData = featuremapping.splitData(trainData, split=0.99) #Build the mapper based on the training data. itemMapper = buildMapper(trainData, headers, dataMapping) print("Beginning mapping of {0} samples".format(len(trainData))) mappedTrainX, mappedTrainY = itemMapper.map(trainData) mappedValidationX, mappedValidationY = itemMapper.map(validationData) #Normalize #mu = np.mean(mappedTrainX, axis=0) #sdev = np.std(mappedTrainX, axis=0) + 1e-5 #mappedTrainX = (mappedTrainX - mu) / sdev #mappedValidationX = (mappedValidationX - mu) / sdev # Create Theano shared data train_x = theano.shared(mappedTrainX, borrow=True) train_y = T.cast(theano.shared(mappedTrainY, borrow=True), 'int32') validation_x = theano.shared(mappedValidationX, borrow=True) validation_y = T.cast(theano.shared(mappedValidationY, borrow=True), 'int32') rng = np.random.RandomState(1234) # allocate symbolic variables for the data x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels # the cost we minimize during training is the negative log likelihood of # the model in symbolic format input_dimension = itemMapper.dimension output_dimension = itemMapper.range classifier = nnlayer.MLPReg(rng=rng, input=x, topology=[(input_dimension,), (100, nnlayer.ReluLayer), (output_dimension, nnlayer.LogisticRegressionLayer)]) cost = classifier.cost(y) + 0.0001*classifier.L2_sqr costParams = [] costParams.extend(classifier.params) costFunction = (costParams, cost) cum_dim = 0 for p in classifier.params: cum_dim += p.get_value(borrow=True).size print("Model dimension: {0}".format(cum_dim)) # Create validation function. valid_func = theano.function(inputs = [], outputs = [classifier.cost(y)], givens = {x:validation_x, y:validation_y}) # Create trainer tt = MLPBatchTrainer() variableAndData = (VariableAndData(x, train_x), VariableAndData(y, train_y, size=len(trainData))) epochFunction, stateMananger = tt.getEpochTrainer(costFunction, variableAndData, batch_size=64, rms = True) # Train with adaptive learning rate. stats = tt.trainALR(epochFunction, valid_func, initial_learning_rate=0.01, epochs=2, convergence_criteria=0.0001, max_runs=10, state_manager = stateMananger) experimentDir = getCreateExperimentDir(experimentId) modelFilename = "{0}/model.pkl".format(experimentDir) # Save all relevant model data, include normalization if this is used. save(modelFilename, x, y, classifier, itemMapper)