def predict(path, weightPath1, weightPath2, method): predictionPath = "predictions.txt" predictions = [] limit = 500 # load weight1 = np.load(weightPath1) weight2 = np.load(weightPath2) method = method.lower() if(method == 'fasta'): proteins = fileUtils.readFasta(path) for proteinId in range(len(proteins)): sequences = sequenceUtils.openReadingFrames(proteins[proteinId]) for pos in range(len(sequences)): # lav sekvens om til binær inputLayer = sequenceUtils.createInputLayer(sequences[pos]) # forward outputLayer = forward(inputLayer, weight1, weight2)[1] outputLayer = logTransform.invTransform(outputLayer) if(outputLayer <= limit): # plus one, since both are zero indexed predictions.append([proteinId + 1, pos + 1]) np.savetxt(predictionPath, np.array(predictions), fmt = '%d', delimiter = '\t') print("There is {} predicted epitopes.".format(len(predictions)))
def train(path, weightPath1, weightPath2, hiddenNodes, epochs, learningRate, proceed): # read sequences and their measured binding affinities allSequences, allTargets = fileUtils.readHLA(path) # log transform the data to fit between 0 and 1 allTargets = logTransform.transform(allTargets) # divide the data into training set, validation set and evaluation set numOfSequences = len(allSequences) indexes = np.arange(numOfSequences) np.random.shuffle(indexes) numOfTrain = (int) (numOfSequences * 0.7) # 70 % is for training trainSequence = allSequences[indexes[0:numOfTrain]] trainTarget = allTargets[indexes[0:numOfTrain]] numOfVal = (int) (numOfSequences * 0.2) # 20 % is for vaidation valSequence = allSequences[indexes[numOfTrain:(numOfTrain + numOfVal)]] valTarget = allTargets[indexes[numOfTrain:(numOfTrain + numOfVal)]] evalSequence = allSequences[indexes[(numOfTrain + numOfVal):numOfSequences]] evalTarget = allTargets[indexes[(numOfTrain + numOfVal):numOfSequences]] evalPrediction = np.zeros(len(evalSequence)) trainError = np.zeros(epochs) valError = np.zeros(epochs) # længden af sekvensbiderne og antallet er mulige aminosyrer. Der er 20 normale. mer = 9 numOfAminoAcids = 20 # create weight matrix with random values or load the files if(proceed): weight1 = np.load(weightPath1) weight2 = np.load(weightPath2) else: weight1 = weight(hiddenNodes, numOfAminoAcids * mer + 1) # plus 1 for bias weight2 = weight(1, hiddenNodes + 1) # plus 1 for bias bestWeight1 = weight1 bestWeight2 = weight2 bestError = 999 # just a large number so any validation will be better bestEpoch = 0 print("Starting training and validation.") for epoch in range(epochs): # train on training set # make scrampled order of sequences indexes = np.arange(numOfTrain) np.random.shuffle(indexes) error = np.zeros(numOfTrain) for index in indexes: # convert peptide sequence to quasi-binary inputLayer = sequenceUtils.createInputLayer(trainSequence[index]) # run the forward function hiddenLayer, outputLayer = forward(inputLayer, weight1, weight2) # save the error error[index] = 1/2 * (outputLayer - trainTarget[index])**2 errorDelta = outputLayer - trainTarget[index] # backpropagation outputDelta = backpropagation.backward(outputLayer, 1, errorDelta) weight2 = backpropagation.updateWeight(hiddenLayer, weight2, outputDelta, learningRate) hiddenDelta = backpropagation.backward(hiddenLayer, weight2, outputDelta) # bias is not a part of calculating the weights for the input hiddenDelta = hiddenDelta[0,0:hiddenNodes] weight1 = backpropagation.updateWeight(inputLayer, weight1, hiddenDelta, learningRate) trainError[epoch] = error.mean() # validation error = np.zeros(numOfVal) for index in range(numOfVal): # convert peptide sequence to quasi-binary inputLayer = sequenceUtils.createInputLayer(valSequence[index]) # run the forward function hiddenLayer, outputLayer = forward(inputLayer, weight1, weight2) # save the error error[index] = 1/2 * (outputLayer - valTarget[index])**2 valError[epoch] = error.mean() # find the best weight matrices so far if(valError[epoch] < bestError): bestWeight1 = weight1 bestWeight2 = weight2 bestError = valError[epoch] bestEpoch = epoch if(epoch % 10 == 0): percent = (int) (epoch/epochs*100) print("Training error: {:.8f}. Validation error: {:.8f}. {:2}% complete." .format(trainError[epoch], valError[epoch], percent)) print("Training and validation complete.") # plot error pyplot.plot(trainError, label = "Training set") pyplot.plot(valError, label = "Validation set") pyplot.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0) pyplot.xlabel("epoch") pyplot.ylabel("error") pyplot.title("Validation") pyplot.savefig('validation.png', bbox_inches='tight') pyplot.show() # save the best weight matrices np.save(weightPath1, bestWeight1) np.save(weightPath2, bestWeight2) print("The minimum error of the validation set is at epoch {}. The validation error is {}." .format(bestEpoch, bestError)) #evaluation print("Predicting on evaluation set.") for index in range(len(evalSequence)): # convert peptide sequence to quasi-binary inputLayer = sequenceUtils.createInputLayer(evalSequence[index]) # run the forward function hiddenLayer, outputLayer = forward(inputLayer, bestWeight1, bestWeight2) evalPrediction[index] = outputLayer # plot comparison of prediction and target for evaluation set pyplot.plot(evalTarget, evalPrediction, '.') pyplot.xlabel("target") pyplot.ylabel("prediction") pyplot.title("Evaluation") pyplot.savefig('evaluationLog.png', bbox_inches='tight') pyplot.show() # how correlated is it? corr = np.corrcoef(evalTarget, evalPrediction)[1,0] print("The Pearson correlation coefficient is {}.".format(corr)) # plot comparison again, now inverse log transfomed back but with a logarithmic scale evalPrediction = logTransform.invTransform(evalPrediction) evalTarget = logTransform.invTransform(evalTarget) pyplot.axes().set_xscale('log') pyplot.axes().set_yscale('log') pyplot.plot(evalTarget, evalPrediction, '.') pyplot.xlabel("target") pyplot.ylabel("prediction") pyplot.title("Evaluation") pyplot.savefig('evaluation.png', bbox_inches='tight') pyplot.show()
# predict allSequences = [[], [], [], [], [], [], [], [], []] for proteinId in range(len(hivProteins)): allSequences[proteinId] = sequenceUtils.openReadingFrames(hivProteins[proteinId]) for pos in range(len(allSequences[proteinId])): ## machine # save sequence machine0.append(allSequences[proteinId][pos]) # lav sekvens om til binær inputLayer = sequenceUtils.createInputLayer(allSequences[proteinId][pos]) # forward outputLayer = forward(inputLayer, weight1, weight2)[1] outputLayer = logTransform.invTransform(outputLayer) # save prediction machine1.append(outputLayer) # save epitope if(outputLayer <= limit): # plus one, since both are zero indexed machine[0].append(allSequences[proteinId][pos]) machine[1].append(proteinId + 1) machine[2].append(pos + 1)