Python readHLA Examples

Programming Language: Python

Namespace/Package Name: fileUtils

Method/Function: readHLA

Examples at hotexamples.com: 2

Python readHLA - 2 examples found. These are the top rated real world Python examples of fileUtils.readHLA extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

File: machine.py Project: degnbol/fagprojekt-kode

def train(path, weightPath1, weightPath2, hiddenNodes, epochs, learningRate, proceed):
    """Train a one-hidden-layer network on peptide data, then evaluate it.

    The sequences and targets read from `path` are shuffled and split into
    a training set (70 %), a validation set (20 %) and an evaluation set
    (the remainder). The network is trained one sequence at a time for
    `epochs` epochs; after every epoch the validation error is measured and
    the weights with the lowest validation error are remembered. Those
    weights are saved, the error curves are plotted, and the evaluation set
    is predicted and compared with its targets.

    Parameters:
        path: file passed to fileUtils.readHLA.
        weightPath1: file the input-to-hidden weights are loaded from
            (when `proceed` is true) and saved to.
        weightPath2: file the hidden-to-output weights are loaded from
            (when `proceed` is true) and saved to.
        hiddenNodes: number of hidden nodes (excluding the bias node).
        epochs: number of passes over the training set.
        learningRate: step size passed to backpropagation.updateWeight.
        proceed: if true, continue from the weights stored in the two
            weight files instead of starting from new weight matrices.

    Returns:
        None. Side effects: writes the two weight files, saves
        'validation.png', 'evaluationLog.png' and 'evaluation.png' in the
        working directory, shows the plots and prints progress.
    """

    # read sequences and their measured binding affinities
    allSequences, allTargets = fileUtils.readHLA(path)

    # log transform the data to fit between 0 and 1
    allTargets = logTransform.transform(allTargets)

    # divide the data into training set, validation set and evaluation set
    numOfSequences = len(allSequences)
    indexes = np.arange(numOfSequences)
    np.random.shuffle(indexes)
    numOfTrain = int(numOfSequences * 0.7)  # 70 % is for training
    numOfVal = int(numOfSequences * 0.2)  # 20 % is for validation
    valEnd = numOfTrain + numOfVal  # whatever is left is for evaluation

    trainSequence = allSequences[indexes[0:numOfTrain]]
    trainTarget = allTargets[indexes[0:numOfTrain]]
    valSequence = allSequences[indexes[numOfTrain:valEnd]]
    valTarget = allTargets[indexes[numOfTrain:valEnd]]
    evalSequence = allSequences[indexes[valEnd:numOfSequences]]
    evalTarget = allTargets[indexes[valEnd:numOfSequences]]
    evalPrediction = np.zeros(len(evalSequence))

    trainError = np.zeros(epochs)
    valError = np.zeros(epochs)

    # the length of the sequence pieces and the number of possible amino
    # acids. There are 20 standard ones.
    mer = 9
    numOfAminoAcids = 20

    # create weight matrix with random values or load the files
    if proceed:
        weight1 = np.load(weightPath1)
        weight2 = np.load(weightPath2)
    else:
        weight1 = weight(hiddenNodes, numOfAminoAcids * mer + 1)  # plus 1 for bias
        weight2 = weight(1, hiddenNodes + 1)  # plus 1 for bias

    # Keep copies, not references: if updateWeight changes a matrix in place
    # the "best" weights must not change along with the current ones.
    bestWeight1 = weight1.copy()
    bestWeight2 = weight2.copy()
    bestError = np.inf  # any measured validation error will be better
    bestEpoch = 0

    print("Starting training and validation.")

    for epoch in range(epochs):

        # train on training set

        # make scrambled order of sequences
        indexes = np.arange(numOfTrain)
        np.random.shuffle(indexes)

        error = np.zeros(numOfTrain)

        for index in indexes:

            # convert peptide sequence to quasi-binary
            inputLayer = sequenceUtils.createInputLayer(trainSequence[index])

            # run the forward function
            hiddenLayer, outputLayer = forward(inputLayer, weight1, weight2)

            # save the error (0.5 rather than 1/2, which is 0 under
            # Python 2 integer division)
            errorDelta = outputLayer - trainTarget[index]
            error[index] = 0.5 * errorDelta**2

            # backpropagation: both deltas are computed from the weights
            # that were used in the forward pass, i.e. before any update
            outputDelta = backpropagation.backward(outputLayer, 1, errorDelta)
            hiddenDelta = backpropagation.backward(hiddenLayer, weight2, outputDelta)

            weight2 = backpropagation.updateWeight(hiddenLayer, weight2, outputDelta, learningRate)

            # bias is not a part of calculating the weights for the input
            hiddenDelta = hiddenDelta[0, 0:hiddenNodes]

            weight1 = backpropagation.updateWeight(inputLayer, weight1, hiddenDelta, learningRate)

        trainError[epoch] = error.mean()

        # validation

        error = np.zeros(numOfVal)

        for index, sequence in enumerate(valSequence):

            # convert peptide sequence to quasi-binary
            inputLayer = sequenceUtils.createInputLayer(sequence)

            # run the forward function
            hiddenLayer, outputLayer = forward(inputLayer, weight1, weight2)

            # save the error
            error[index] = 0.5 * (outputLayer - valTarget[index])**2

        valError[epoch] = error.mean()

        # find the best weight matrices so far
        if valError[epoch] < bestError:
            bestWeight1 = weight1.copy()
            bestWeight2 = weight2.copy()
            bestError = valError[epoch]
            bestEpoch = epoch

        if epoch % 10 == 0:
            # integer arithmetic avoids float round-off (29/100*100 -> 28)
            percent = 100 * epoch // epochs
            print("Training error: {:.8f}. Validation error: {:.8f}. {:2}% complete."
            .format(trainError[epoch], valError[epoch], percent))

    print("Training and validation complete.")

    # plot error (each plot gets its own figure so that plots never end up
    # on top of each other when show() does not close the previous figure)
    pyplot.figure()
    pyplot.plot(trainError, label = "Training set")
    pyplot.plot(valError, label = "Validation set")
    pyplot.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)
    pyplot.xlabel("epoch")
    pyplot.ylabel("error")
    pyplot.title("Validation")
    pyplot.savefig('validation.png', bbox_inches='tight')
    pyplot.show()

    # save the best weight matrices
    np.save(weightPath1, bestWeight1)
    np.save(weightPath2, bestWeight2)
    print("The minimum error of the validation set is at epoch {}. The validation error is {}."
    .format(bestEpoch, bestError))

    # evaluation
    print("Predicting on evaluation set.")

    for index, sequence in enumerate(evalSequence):

        # convert peptide sequence to quasi-binary
        inputLayer = sequenceUtils.createInputLayer(sequence)

        # run the forward function
        hiddenLayer, outputLayer = forward(inputLayer, bestWeight1, bestWeight2)

        evalPrediction[index] = outputLayer

    # plot comparison of prediction and target for evaluation set
    pyplot.figure()
    pyplot.plot(evalTarget, evalPrediction, '.')
    pyplot.xlabel("target")
    pyplot.ylabel("prediction")
    pyplot.title("Evaluation")
    pyplot.savefig('evaluationLog.png', bbox_inches='tight')
    pyplot.show()

    # how correlated is it?
    corr = np.corrcoef(evalTarget, evalPrediction)[1,0]
    print("The Pearson correlation coefficient is {}.".format(corr))

    # plot comparison again, now inverse log transformed back but with a logarithmic scale
    evalPrediction = logTransform.invTransform(evalPrediction)
    evalTarget = logTransform.invTransform(evalTarget)
    pyplot.figure()
    # gca() returns the one current axes; calling axes() twice would create
    # two separate axes in current Matplotlib and only one would be plotted on
    currentAxes = pyplot.gca()
    currentAxes.set_xscale('log')
    currentAxes.set_yscale('log')
    pyplot.plot(evalTarget, evalPrediction, '.')
    pyplot.xlabel("target")
    pyplot.ylabel("prediction")
    pyplot.title("Evaluation")
    pyplot.savefig('evaluation.png', bbox_inches='tight')
    pyplot.show()

Example #2

Show file

File: findingSpecifics.py Project: degnbol/fagprojekt-kode

# import helper functions
import fileUtils
import logTransform
import sequenceUtils
from forward import forward


# set seed to be able to reproduce results
np.random.seed(1234)

# upper bound used below to select entries: values <= limit are kept, both
# for mhcAffinities and for column 3 of the SMMPMBEC file
limit = 500
# NOTE(review): syfLimit is not used in the visible part of this script;
# presumably a cutoff for a later step — confirm
syfLimit = 21
# names iterated over by the loop that follows this section
# (presumably HIV protein names — confirm against the data files)
names = np.array(["gag", "pol", "vif", "vpr", "tat", "rev", "vpu", "env", "nef"])

# mhc epitopes: keep the sequences whose affinity value is at most `limit`
mhcSequences, mhcAffinities = fileUtils.readHLA("data/mhcSequences.txt")
mhcEpitopes = mhcSequences[mhcAffinities <= limit]

# complete hiv
hivProteins = fileUtils.readFasta("data/hivCodingSequences.txt")


# SMMPMBEC
# read columns 0 to 3 of the same CSV file, one call per column
# NOTE(review): the meaning of the third argument (True) is defined in
# fileUtils.readColumn, which is not visible here — confirm
smm0 = fileUtils.readColumn("data/smmpmbec.csv", 0, True)
smm1 = fileUtils.readColumn("data/smmpmbec.csv", 1, True)
smm2 = fileUtils.readColumn("data/smmpmbec.csv", 2, True)
smm3 = fileUtils.readColumn("data/smmpmbec.csv", 3, True)
# boolean mask of the rows whose column-3 value is at most `limit`
index = smm3 <= limit
# three parallel entries for the selected rows: the column-2 values, one
# zero per selected row, and the column-1 values
smm = [smm2[index], np.repeat(0, sum(index)), smm1[index]]
# replace names with numbers
for name in names: