def train(path, weightPath1, weightPath2, hiddenNodes, epochs, learningRate, proceed):
    """Train a one-hidden-layer neural network on HLA peptide binding data.

    Reads peptide sequences with measured binding affinities, log-transforms
    the targets, splits the data 70/20/10 into train/validation/evaluation,
    trains with per-sample (online) backpropagation, tracks the weights with
    the lowest validation error, saves them, and finally evaluates and plots
    predictions against targets.

    Parameters:
        path         -- path to the HLA data file read via fileUtils.readHLA
        weightPath1  -- .npy path for the input->hidden weight matrix (loaded if proceed, saved at end)
        weightPath2  -- .npy path for the hidden->output weight matrix (loaded if proceed, saved at end)
        hiddenNodes  -- number of hidden-layer units
        epochs       -- number of training epochs
        learningRate -- step size passed to backpropagation.updateWeight
        proceed      -- if truthy, resume from weights on disk instead of random init

    Side effects: writes 'validation.png', 'evaluationLog.png', 'evaluation.png',
    saves the best weight matrices to weightPath1/weightPath2, and prints progress.
    """
    # read sequences and their measured binding affinities
    allSequences, allTargets = fileUtils.readHLA(path)

    # log transform the data to fit between 0 and 1
    allTargets = logTransform.transform(allTargets)

    # divide the data into training set, validation set and evaluation set
    # (split is done on a shuffled index so the three sets are random samples)
    numOfSequences = len(allSequences)
    indexes = np.arange(numOfSequences)
    np.random.shuffle(indexes)

    numOfTrain = (int) (numOfSequences * 0.7) # 70 % is for training
    trainSequence = allSequences[indexes[0:numOfTrain]]
    trainTarget = allTargets[indexes[0:numOfTrain]]

    numOfVal = (int) (numOfSequences * 0.2) # 20 % is for validation
    valSequence = allSequences[indexes[numOfTrain:(numOfTrain + numOfVal)]]
    valTarget = allTargets[indexes[numOfTrain:(numOfTrain + numOfVal)]]

    # the remaining ~10 % is the held-out evaluation set
    evalSequence = allSequences[indexes[(numOfTrain + numOfVal):numOfSequences]]
    evalTarget = allTargets[indexes[(numOfTrain + numOfVal):numOfSequences]]
    evalPrediction = np.zeros(len(evalSequence))

    # per-epoch mean squared errors, filled in during the loop below
    trainError = np.zeros(epochs)
    valError = np.zeros(epochs)

    # length of the peptide chunks and the number of possible amino acids
    # (there are 20 standard ones)
    mer = 9
    numOfAminoAcids = 20

    # create weight matrix with random values or load the files
    if(proceed):
        weight1 = np.load(weightPath1)
        weight2 = np.load(weightPath2)
    else:
        weight1 = weight(hiddenNodes, numOfAminoAcids * mer + 1) # plus 1 for bias
        weight2 = weight(1, hiddenNodes + 1) # plus 1 for bias

    # NOTE(review): these are aliases, not copies. This is correct only because
    # the loop below rebinds weight1/weight2 to the return value of
    # backpropagation.updateWeight; if updateWeight mutates its argument in
    # place, "best" would silently track the latest weights — verify.
    bestWeight1 = weight1
    bestWeight2 = weight2
    bestError = 999 # just a large number so any validation will be better
    bestEpoch = 0

    print("Starting training and validation.")

    for epoch in range(epochs):
        # train on training set
        # make scrambled order of sequences (fresh shuffle every epoch)
        indexes = np.arange(numOfTrain)
        np.random.shuffle(indexes)
        error = np.zeros(numOfTrain)

        for index in indexes:
            # convert peptide sequence to quasi-binary (one-hot style) input
            inputLayer = sequenceUtils.createInputLayer(trainSequence[index])

            # run the forward function
            hiddenLayer, outputLayer = forward(inputLayer, weight1, weight2)

            # save the squared error for this sample (1/2 factor matches the
            # conventional MSE-derivative bookkeeping)
            error[index] = 1/2 * (outputLayer - trainTarget[index])**2
            errorDelta = outputLayer - trainTarget[index]

            # backpropagation
            outputDelta = backpropagation.backward(outputLayer, 1, errorDelta)
            weight2 = backpropagation.updateWeight(hiddenLayer, weight2, outputDelta, learningRate)
            # NOTE(review): hiddenDelta is computed from the *already updated*
            # weight2; textbook backprop uses the pre-update weights here.
            # Confirm this ordering is intentional.
            hiddenDelta = backpropagation.backward(hiddenLayer, weight2, outputDelta)

            # bias is not a part of calculating the weights for the input
            hiddenDelta = hiddenDelta[0,0:hiddenNodes]
            weight1 = backpropagation.updateWeight(inputLayer, weight1, hiddenDelta, learningRate)

        trainError[epoch] = error.mean()

        # validation (forward pass only — no weight updates)
        error = np.zeros(numOfVal)
        for index in range(numOfVal):
            # convert peptide sequence to quasi-binary
            inputLayer = sequenceUtils.createInputLayer(valSequence[index])

            # run the forward function
            hiddenLayer, outputLayer = forward(inputLayer, weight1, weight2)

            # save the error
            error[index] = 1/2 * (outputLayer - valTarget[index])**2

        valError[epoch] = error.mean()

        # find the best weight matrices so far (early-stopping bookkeeping)
        if(valError[epoch] < bestError):
            bestWeight1 = weight1
            bestWeight2 = weight2
            bestError = valError[epoch]
            bestEpoch = epoch

        # progress report every 10 epochs
        if(epoch % 10 == 0):
            percent = (int) (epoch/epochs*100)
            print("Training error: {:.8f}. Validation error: {:.8f}. {:2}% complete." .format(trainError[epoch], valError[epoch], percent))

    print("Training and validation complete.")

    # plot error curves for both sets over the epochs
    pyplot.plot(trainError, label = "Training set")
    pyplot.plot(valError, label = "Validation set")
    pyplot.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)
    pyplot.xlabel("epoch")
    pyplot.ylabel("error")
    pyplot.title("Validation")
    pyplot.savefig('validation.png', bbox_inches='tight')
    pyplot.show()

    # save the best weight matrices
    np.save(weightPath1, bestWeight1)
    np.save(weightPath2, bestWeight2)

    print("The minimum error of the validation set is at epoch {}. The validation error is {}." .format(bestEpoch, bestError))

    # evaluation: predict on the held-out set with the best weights
    print("Predicting on evaluation set.")

    for index in range(len(evalSequence)):
        # convert peptide sequence to quasi-binary
        inputLayer = sequenceUtils.createInputLayer(evalSequence[index])

        # run the forward function
        hiddenLayer, outputLayer = forward(inputLayer, bestWeight1, bestWeight2)
        evalPrediction[index] = outputLayer

    # plot comparison of prediction and target for evaluation set
    pyplot.plot(evalTarget, evalPrediction, '.')
    pyplot.xlabel("target")
    pyplot.ylabel("prediction")
    pyplot.title("Evaluation")
    pyplot.savefig('evaluationLog.png', bbox_inches='tight')
    pyplot.show()

    # how correlated is it? (Pearson correlation of prediction vs. target)
    corr = np.corrcoef(evalTarget, evalPrediction)[1,0]
    print("The Pearson correlation coefficient is {}.".format(corr))

    # plot comparison again, now inverse log transformed back but with a logarithmic scale
    evalPrediction = logTransform.invTransform(evalPrediction)
    evalTarget = logTransform.invTransform(evalTarget)
    pyplot.axes().set_xscale('log')
    pyplot.axes().set_yscale('log')
    pyplot.plot(evalTarget, evalPrediction, '.')
    pyplot.xlabel("target")
    pyplot.ylabel("prediction")
    pyplot.title("Evaluation")
    pyplot.savefig('evaluation.png', bbox_inches='tight')
    pyplot.show()
# importer hjælpefunktioner import fileUtils import logTransform import sequenceUtils from forward import forward # set seed to be able to reproduce results np.random.seed(1234) limit = 500 syfLimit = 21 names = np.array(["gag", "pol", "vif", "vpr", "tat", "rev", "vpu", "env", "nef"]) # mhc epitopes mhcSequences, mhcAffinities = fileUtils.readHLA("data/mhcSequences.txt") mhcEpitopes = mhcSequences[mhcAffinities <= limit] # complete hiv hivProteins = fileUtils.readFasta("data/hivCodingSequences.txt") # SMMPMBEC smm0 = fileUtils.readColumn("data/smmpmbec.csv", 0, True) smm1 = fileUtils.readColumn("data/smmpmbec.csv", 1, True) smm2 = fileUtils.readColumn("data/smmpmbec.csv", 2, True) smm3 = fileUtils.readColumn("data/smmpmbec.csv", 3, True) index = smm3 <= limit smm = [smm2[index], np.repeat(0, sum(index)), smm1[index]] # replace names wih numbers for name in names: