Example #1
def train(path, weightPath1, weightPath2, hiddenNodes, epochs, learningRate, proceed):
    
    # read sequences and their measured binding affinities
    allSequences, allTargets = fileUtils.readHLA(path)  
    
    # log transform the data to fit between 0 and 1
    allTargets = logTransform.transform(allTargets)    
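    # (a common convention for MHC affinity data is 1 - log(IC50)/log(50000); the exact formula
    #  depends on the logTransform module, which is not shown here)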
    
    # divide the data into training set, validation set and evaluation set
    numOfSequences = len(allSequences)    
    indexes = np.arange(numOfSequences)
    np.random.shuffle(indexes)
    numOfTrain = int(numOfSequences * 0.7) # 70 % is for training
    trainSequence = allSequences[indexes[0:numOfTrain]]
    trainTarget = allTargets[indexes[0:numOfTrain]]    
    numOfVal = int(numOfSequences * 0.2) # 20 % is for validation
    valSequence = allSequences[indexes[numOfTrain:(numOfTrain + numOfVal)]]
    valTarget = allTargets[indexes[numOfTrain:(numOfTrain + numOfVal)]]
    evalSequence = allSequences[indexes[(numOfTrain + numOfVal):numOfSequences]]
    evalTarget = allTargets[indexes[(numOfTrain + numOfVal):numOfSequences]]
    evalPrediction = np.zeros(len(evalSequence))
    
    trainError = np.zeros(epochs)   
    valError = np.zeros(epochs)
    
    # length of the peptide fragments (9-mers) and the number of possible amino acids; there are 20 standard ones
    mer = 9
    numOfAminoAcids = 20
    
    # create weight matrix with random values or load the files
    if(proceed):
        weight1 = np.load(weightPath1)
        weight2 = np.load(weightPath2)
    else:
        weight1 = weight(hiddenNodes, numOfAminoAcids * mer + 1) # plus 1 for bias
        weight2 = weight(1, hiddenNodes + 1) # plus 1 for bias   
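    # resulting network shape: (20 * 9 + 1) inputs -> hiddenNodes hidden units (+ 1 bias) -> 1 output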
    
    bestWeight1 = weight1.copy() # copy so that later updates cannot silently change the stored best weights
    bestWeight2 = weight2.copy()
    bestError = np.inf # so the first validation error is always an improvement
    bestEpoch = 0
    
    print("Starting training and validation.")   
    
    for epoch in range(epochs):
        
        # train on training set
        
        # shuffle the order of the training sequences
        indexes = np.arange(numOfTrain)
        np.random.shuffle(indexes)

        error = np.zeros(numOfTrain)
        
        for index in indexes:
            
            # convert peptide sequence to quasi-binary
            inputLayer = sequenceUtils.createInputLayer(trainSequence[index])
            
            # run the forward function
            hiddenLayer, outputLayer = forward(inputLayer, weight1, weight2)

            # save the error
            error[index] = 1/2 * (outputLayer - trainTarget[index])**2
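            # the delta below is the derivative of the squared error with respect to the network output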
            errorDelta = outputLayer - trainTarget[index]
            
            # backpropagation
            outputDelta = backpropagation.backward(outputLayer, 1, errorDelta)
            
            weight2 = backpropagation.updateWeight(hiddenLayer, weight2, outputDelta, learningRate)
              
            hiddenDelta = backpropagation.backward(hiddenLayer, weight2, outputDelta)
            
            # the hidden bias node has no incoming weights, so drop its delta before updating weight1
            hiddenDelta = hiddenDelta[0,0:hiddenNodes]
            
            weight1 = backpropagation.updateWeight(inputLayer, weight1, hiddenDelta, learningRate)


        trainError[epoch] = error.mean()
        
        
        
        # validation
        
        error = np.zeros(numOfVal)
        
        for index in range(numOfVal):
            
            # convert peptide sequence to quasi-binary
            inputLayer = sequenceUtils.createInputLayer(valSequence[index])
            
            # run the forward function
            hiddenLayer, outputLayer = forward(inputLayer, weight1, weight2)

            # save the error
            error[index] = 1/2 * (outputLayer - valTarget[index])**2

            
        valError[epoch] = error.mean()
        

        # keep the best weight matrices seen so far
        if(valError[epoch] < bestError):
            bestWeight1 = weight1.copy()
            bestWeight2 = weight2.copy()
            bestError = valError[epoch]
            bestEpoch = epoch
        
        
        if(epoch % 10 == 0):           
            percent = int(epoch / epochs * 100)
            print("Training error: {:.8f}. Validation error: {:.8f}. {:2}% complete."
            .format(trainError[epoch], valError[epoch], percent))
        
        
    print("Training and validation complete.")
    
    
    # plot error
    pyplot.plot(trainError, label = "Training set")
    pyplot.plot(valError, label = "Validation set")
    pyplot.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)
    pyplot.xlabel("epoch")
    pyplot.ylabel("error")
    pyplot.title("Validation")
    pyplot.savefig('validation.png', bbox_inches='tight')    
    pyplot.show()
            
    # save the best weight matrices
    np.save(weightPath1, bestWeight1)
    np.save(weightPath2, bestWeight2)
    print("The minimum error of the validation set is at epoch {}. The validation error is {}."
    .format(bestEpoch, bestError))
    
    #evaluation   
    print("Predicting on evaluation set.")

    for index in range(len(evalSequence)):
        
        # convert peptide sequence to quasi-binary
        inputLayer = sequenceUtils.createInputLayer(evalSequence[index])
        
        # run the forward function
        hiddenLayer, outputLayer = forward(inputLayer, bestWeight1, bestWeight2)
        
        evalPrediction[index] = outputLayer


    # plot comparison of prediction and target for evaluation set
    pyplot.plot(evalTarget, evalPrediction, '.')
    pyplot.xlabel("target")
    pyplot.ylabel("prediction")
    pyplot.title("Evaluation")
    pyplot.savefig('evaluationLog.png', bbox_inches='tight')    
    pyplot.show()
    
    # how correlated is it?
    corr = np.corrcoef(evalTarget, evalPrediction)[1,0]
    print("The Pearson correlation coefficient is {}.".format(corr))
    
    # plot the comparison again, now inverse log transformed back to affinities, but on logarithmic axes
    evalPrediction = logTransform.invTransform(evalPrediction)
    evalTarget = logTransform.invTransform(evalTarget)
    pyplot.plot(evalTarget, evalPrediction, '.')
    pyplot.xscale('log')
    pyplot.yscale('log')
    pyplot.xlabel("target")
    pyplot.ylabel("prediction")
    pyplot.title("Evaluation")
    pyplot.savefig('evaluation.png', bbox_inches='tight')    
    pyplot.show()
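
# illustrative call (file names and hyperparameter values are placeholders, not taken from the original code):
# train("data/mhcSequences.txt", "weight1.npy", "weight2.npy", hiddenNodes=50, epochs=100, learningRate=0.01, proceed=False)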
# import helper functions
import numpy as np
from matplotlib import pyplot

import fileUtils
import logTransform
import sequenceUtils
import backpropagation
from forward import forward
from weight import weight # assumed module name for the weight() initialiser used in train()


# set seed to be able to reproduce results
np.random.seed(1234)

limit = 500 # binding affinity cutoff; 500 nM is the conventional threshold for calling a peptide a binder
syfLimit = 21 # presumably a SYFPEITHI score cutoff
names = np.array(["gag", "pol", "vif", "vpr", "tat", "rev", "vpu", "env", "nef"]) # the nine HIV-1 proteins

# mhc epitopes
mhcSequences, mhcAffinities = fileUtils.readHLA("data/mhcSequences.txt")
mhcEpitopes = mhcSequences[mhcAffinities <= limit]

# complete hiv
hivProteins = fileUtils.readFasta("data/hivCodingSequences.txt")


# SMMPMBEC
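# SMMPMBEC is an existing MHC binding affinity prediction method; the column layout below is
# assumed from usage (column 3 appears to hold predicted IC50 values, since rows are filtered
# with the same 500 nM limit used for the measured data)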
smm0 = fileUtils.readColumn("data/smmpmbec.csv", 0, True)
smm1 = fileUtils.readColumn("data/smmpmbec.csv", 1, True)
smm2 = fileUtils.readColumn("data/smmpmbec.csv", 2, True)
smm3 = fileUtils.readColumn("data/smmpmbec.csv", 3, True)
index = smm3 <= limit
smm = [smm2[index], np.repeat(0, sum(index)), smm1[index]]
# replace names with numbers
for name in names: