def long(generations: int):
    """Run a long variable-ensemble-size cuckoo search on the microstate data
    and write the best frequencies/parameters to time-stamped output files.

    @param generations  int, maximum number of search iterations
    """
    data = "/netapp/home/tianjiao.zhang/data/microstates.dat"
    targetFreqs = "/netapp/home/tianjiao.zhang/data/ecDHFR_openseq_bacterial_representative_final_align_trim.fasta"

    MACROSTATES = enum("E-DHF-NADPH", "E-NADPH", "E-THF", "E-THF-NADPX", "TS")
    RESIDUES = enum('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y')

    # parameter ranges to search over
    ensembleSizes = numpy.array([16, 24, 32, 64, 128])
    backrubTemps = numpy.array([0.3, 0.6, 0.9, 1.2, 1.5, 1.8])
    boltzmannTemps = numpy.array([-1, 5])
    steepnessRange = numpy.array([1, 7])
    minWeights = numpy.array([0, 0, 0, 0, 0])
    maxWeights = numpy.array([1, 1, 1, 1, 1])

    optimizer = Optimizer(MACROSTATES, True)
    optimizer.readTargetFrequencies(targetFreqs)
    optimizer.readFormattedMicrostateData(data)

    search = CuckooSearch(optimizer.models, JensenShannonDistance(optimizer.targetFrequencies), True, 128, 1.25, 0.25)
    search.setMaxIterations(generations)
    search.setParamBounds(ensembleSizes, backrubTemps, boltzmannTemps, steepnessRange, minWeights, maxWeights)
    search.setAllSearchToTrue()
    search.suppressOutputs = True

    optimizer.useAlgorithm(search)
    optimizer.optimize()

    # both outputs share one timestamp taken after the run finishes
    now = datetime.now()
    stamp = "var ensembles " + now.strftime('%Y%m%d%H%M')
    optimizer.writeFrequenciesToFASTA(optimizer.getBestFrequencies(), stamp + ".fasta")
    optimizer.writeBestParamsToText(stamp)
def testRandUniformInput():
    """Sanity check: force every macrostate energy to the same value, then
    record the best-match score each similarity measure reports for the
    resulting (uniform) input.

    Writes one line per measure to "uniform energy similarities.txt".

    @return None
    """
    MACROSTATES = enum("E-DHF-NADPH", "E-NADPH", "E-OPEN", "E-THF", "E-THF-NADPX", "TS")
    RESIDUES = enum('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y')

    # only looking at MACROSTATE.TS
    # only optimizing backrub temperature and steepness
    ensembleSizes = numpy.array([50])
    backrubTemps = numpy.array([0.3, 0.6, 0.9, 1.2, 1.5, 1.8])
    boltzmannTemps = numpy.array([0, -1, 1, 5.0])
    steepnessRange = numpy.array([0.5, 5])
    minWeights = numpy.array([0, 0, 0, 0, 0, 0])
    maxWeights = numpy.array([1, 1, 0, 1, 1, 1])

    print("Initializing objects\n")
    targetFreqs = "C:\\Users\\Candy\\SkyDrive\\Documents\\rotation 2\\ecDHFR_openseq_bacterial_representative_final_align_trim.fasta"
    targetFreqsAlt = "C:\\Users\\Candy_000\\SkyDrive\\Documents\\rotation 2\\ecDHFR_openseq_bacterial_representative_final_align_trim.fasta"
    data = "C:\\Users\\Candy\\SkyDrive\\Documents\\rotation 2\\DHFR_MSD_M20loop\\DHFR_MSD_M20loop_repeat1.tsv"
    dataAlt = "C:\\Users\\Candy_000\\SkyDrive\\Documents\\rotation 2\\DHFR_MSD_M20loop\\DHFR_MSD_M20loop_repeat1.tsv"

    optimizer = Optimizer(MACROSTATES)
    # slightly different paths on my two computers. Was a bare "except:",
    # which would also have hidden real parse errors; narrowed to the
    # missing-file case (as smallTestBoltz already does).
    try:
        optimizer.readTargetFrequencies(targetFreqs)
        optimizer.readData(data)
    except FileNotFoundError:
        optimizer.readTargetFrequencies(targetFreqsAlt)
        optimizer.readData(dataAlt)

    # make energies uniform
    for model in optimizer.models:
        optimizer.models[model].macrostateResidueEnergies = numpy.ones_like(optimizer.models[model].macrostateResidueEnergies)

    search = CuckooSearch(optimizer.models, JensenShannonDistance(optimizer.targetFrequencies), False, 1, 1, 0.25)
    search.setMaxIterations(1)
    search.setParamBounds(ensembleSizes, backrubTemps, boltzmannTemps, steepnessRange, minWeights, maxWeights)
    search.setSearchParameters(False, False, False, False, numpy.array([False, False, False, False, False, False]))
    optimizer.useAlgorithm(search)

    # "with" guarantees the file is closed even if a measure raises mid-way
    with open("uniform energy similarities.txt", 'w') as outfile:
        optimizer.optimize()
        outfile.write("JSD: {:.4f}\n".format(optimizer.getBestParameters()['match']))

        search.setSimilarityMeasure(CosineSimilarity(optimizer.targetFrequencies))
        optimizer.optimize()
        outfile.write("Cosine similarity: {:.4f}\n".format(optimizer.getBestParameters()['match']))

        search.setSimilarityMeasure(KLDivergence(optimizer.targetFrequencies))
        optimizer.optimize()
        outfile.write("K-L divergence: {:.4f}\n".format(optimizer.getBestParameters()['match']))

        search.setSimilarityMeasure(EntropyWeightsMixedSimilarity(CosineSimilarity(), JensenShannonDistance(), optimizer.targetFrequencies))
        optimizer.optimize()
        outfile.write("Weighted mixed similarity: {:.4f}\n".format(optimizer.getBestParameters()['match']))
    return None
def smalltestPrevOptimalVals():
    """Re-evaluate one previously-found optimal parameter set (every search
    dimension frozen: min and max weight bounds are equal, single values for
    all other ranges) and write the resulting frequencies and parameters to
    time-stamped output files.

    @return None
    """
    MACROSTATES = enum("E-DHF-NADPH", "E-NADPH", "E-OPEN", "E-THF", "E-THF-NADPX", "TS")
    RESIDUES = enum('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y')

    # only looking at MACROSTATE.TS
    # all parameters pinned to the previously found optimum
    ensembleSizes = numpy.array([50])
    backrubTemps = numpy.array([1.8])
    boltzmannTemps = numpy.array([0.0])
    steepnessRange = numpy.array([3.0])
    minWeights = numpy.array([0.80, 0.55, 0, 0.90, 0.35, 1.00])
    maxWeights = numpy.array([0.80, 0.55, 0, 0.90, 0.35, 1.00])

    print("Initializing objects\n")
    targetFreqs = "C:\\Users\\Candy\\SkyDrive\\Documents\\rotation 2\\ecDHFR_openseq_bacterial_representative_final_align_trim.fasta"
    targetFreqsAlt = "C:\\Users\\Candy_000\\SkyDrive\\Documents\\rotation 2\\ecDHFR_openseq_bacterial_representative_final_align_trim.fasta"
    data = "C:\\Users\\Candy\\SkyDrive\\Documents\\rotation 2\\DHFR_MSD_M20loop\\DHFR_MSD_M20loop_repeat6.tsv"
    dataAlt = "C:\\Users\\Candy_000\\SkyDrive\\Documents\\rotation 2\\DHFR_MSD_M20loop\\DHFR_MSD_M20loop_repeat5.tsv"

    optimizer = Optimizer(MACROSTATES)
    # slightly different paths on my two computers. Was a bare "except:";
    # narrowed so that real read/parse errors are no longer swallowed.
    try:
        optimizer.readTargetFrequencies(targetFreqs)
        optimizer.readData(data)
    except FileNotFoundError:
        optimizer.readTargetFrequencies(targetFreqsAlt)
        optimizer.readData(dataAlt)
    print("Files read in")

    search = CuckooSearch(optimizer.models, JensenShannonDistance(optimizer.targetFrequencies), False, 1, 1, 0.25)
    search.setMaxIterations(1)
    search.setParamBounds(ensembleSizes, backrubTemps, boltzmannTemps, steepnessRange, minWeights, maxWeights)
    search.setSearchParameters(False, False, False, False, numpy.array([False, False, False, False, False, False]))
    optimizer.useAlgorithm(search)

    print("\nJS Dist")
    optimizer.optimize()
    now = datetime.now()
    optimizer.writeFrequenciesToFASTA(optimizer.getBestFrequencies(), "prev opt vals " + now.strftime('%Y%m%d%H%M') + ".fasta")
    optimizer.writeBestParamsToText("prev opt vals " + now.strftime('%Y%m%d%H%M'))
    print(optimizer.getBestParameters()['match'])
    return None
def experimentalAnalysis(nMax, algoNo, stepSize):
    """Time one of four algorithms on random integer arrays.

    Generates 10 random test arrays (values in [-10, 10]) for every input
    size n in range(stepSize, nMax + 1, stepSize), runs the selected
    algorithm on each, and prints "<len>  <seconds>" per run.

    Fixed: the original body used Python-2 ``print`` statements, which are a
    SyntaxError under Python 3 (the rest of this file is Python 3).

    @param nMax      int, largest input size to test
    @param algoNo    int, 1..4 selecting enum / better / divideConquer /
                     maxSubarrayLinear; anything else prints an error
    @param stepSize  int, increment between successive input sizes
    @return None
    """
    print("Generating array...")
    # generate array of test arrays: 10 arrays per input size n
    testArray = []
    for n in range(stepSize, nMax + 1, stepSize):
        for _ in range(10):
            testArray.append([randint(-10, 10) for _ in range(n)])

    # determine which algorithm we need to run; the timing loop is shared
    if algoNo == 1:
        print("Analysis for Algorithm 1")
        run = enum
    elif algoNo == 2:
        print("Analysis for Algorithm 2")
        run = better
    elif algoNo == 3:
        print("Analysis for Algorithm 3")
        run = lambda a: divideConquer(a, 0, len(a) - 1)
    elif algoNo == 4:
        print("Analysis for Algorithm 4")
        run = maxSubarrayLinear
    else:
        print("Error: Invalid algorithm number.")
        return

    for a in testArray:  # test each array in the algorithm
        t0 = time()
        run(a)
        t1 = time()
        print(len(a), " %f" % (t1 - t0))
def testChi2(iterations=64):
    """Run a cuckoo search using the chi-squared kernel as the similarity
    measure and write the best frequencies/parameters to time-stamped files.

    Fixed: the old try/except fallback referenced ``targetFreqsAlt`` and
    ``dataAlt``, which are not defined in this function, so any read failure
    turned into a NameError. The cluster paths are now read directly and a
    genuine failure propagates with its real cause.

    @param iterations  int, maximum number of search iterations (default 64)
    @return None
    """
    print("Hello!\n")
    MACROSTATES = enum("E-DHF-NADPH", "E-NADPH", "E-OPEN", "E-THF", "E-THF-NADPX", "TS")
    RESIDUES = enum('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y')

    # only looking at MACROSTATE.TS
    # only optimizing backrub temperature and steepness
    ensembleSizes = numpy.array([20, 50])
    backrubTemps = numpy.array([0.3, 0.6, 0.9, 1.2, 1.5, 1.8])
    boltzmannTemps = numpy.array([0, -1, 1, 5.0])
    steepnessRange = numpy.array([0.5, 5])
    minWeights = numpy.array([0, 0, 0, 0, 0, 0])
    maxWeights = numpy.array([1, 1, 0, 1, 1, 1])

    print("Initializing objects\n")
    data = "/netapp/home/tianjiao.zhang/data/DHFR_MSD_M20loop_repeat1.tsv"
    targetFreqs = "/netapp/home/tianjiao.zhang/data/ecDHFR_openseq_bacterial_representative_final_align_trim.fasta"

    optimizer = Optimizer(MACROSTATES)
    optimizer.readTargetFrequencies(targetFreqs)
    optimizer.readData(data)
    print("Files read in")

    search = CuckooSearch(optimizer.models, Chi2Kernel(optimizer.targetFrequencies), False, 64, 1, 0.25)
    search.setMaxIterations(iterations)
    search.setParamBounds(ensembleSizes, backrubTemps, boltzmannTemps, steepnessRange, minWeights, maxWeights)
    search.setSearchParameters(False, True, True, True, numpy.array([True, True, False, True, True, True]))
    optimizer.useAlgorithm(search)

    print("\nChi2 kernel")
    optimizer.optimize()
    now = datetime.now()
    optimizer.writeFrequenciesToFASTA(optimizer.getBestFrequencies(), "Chi2 test " + now.strftime('%Y%m%d%H%M%S') + ".fasta")
    optimizer.writeBestParamsToText("Chi2 test " + now.strftime('%Y%m%d%H%M%S'))
    print(optimizer.getBestParameters()['match'])
    return None
def DHFRcomparemeasures(similarity: int):
    """Run one long cuckoo search with the similarity measure selected by
    ``similarity`` and write the best result to files named after the
    measure.

    @param similarity  int, 0 = Jensen-Shannon distance, 1 = cosine,
                       2 = K-L divergence, 3 = entropy-weighted mix,
                       4 = entropy-weighted JSD, anything else = chi2 kernel
    """
    MACROSTATES = enum("E-DHF-NADPH", "E-NADPH", "E-OPEN", "E-THF", "E-THF-NADPX", "TS")
    RESIDUES = enum('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y')

    ensembleSizes = numpy.array([20, 50])
    backrubTemps = numpy.array([0.3, 0.6, 0.9, 1.2, 1.5, 1.8])
    boltzmannTemps = numpy.array([0, -1, 1, 5.0])
    steepnessRange = numpy.array([0.5, 5])
    minWeights = numpy.array([0, 0, 0, 0, 0, 0])
    maxWeights = numpy.array([1, 1, 0, 1, 1, 1])

    data = "/netapp/home/tianjiao.zhang/data/DHFR_MSD_M20loop_repeat1.tsv"
    targetFreqs = "/netapp/home/tianjiao.zhang/data/ecDHFR_openseq_bacterial_representative_final_align_trim.fasta"

    optimizer = Optimizer(MACROSTATES)
    optimizer.readTargetFrequencies(targetFreqs)
    optimizer.readData(data)

    # pick the similarity measure and its output-file label; every branch
    # builds the CuckooSearch with identical remaining arguments, so the
    # construction itself is shared below
    if similarity == 0:
        simMeasure = JensenShannonDistance(optimizer.targetFrequencies)
        measure = " JSD"
    elif similarity == 1:
        simMeasure = CosineSimilarity(optimizer.targetFrequencies)
        measure = " Cos"
    elif similarity == 2:
        simMeasure = KLDivergence(optimizer.targetFrequencies)
        measure = " KLD"
    elif similarity == 3:
        simMeasure = EntropyWeightsMixedSimilarity(CosineSimilarity(), JensenShannonDistance(), optimizer.targetFrequencies)
        measure = " Mix"
    elif similarity == 4:
        simMeasure = EntropyWeightedSimilarity(JensenShannonDistance(), optimizer.targetFrequencies)
        measure = "Weighted JSD"
    else:
        simMeasure = Chi2Kernel(optimizer.targetFrequencies)
        measure = "Chi2 kernel"

    search = CuckooSearch(optimizer.models, simMeasure, False, 64, 1, 0.25)
    search.setMaxIterations(2048)
    search.setParamBounds(ensembleSizes, backrubTemps, boltzmannTemps, steepnessRange, minWeights, maxWeights)
    search.setSearchParameters(True, True, True, True, numpy.array([True, True, False, True, True, True]))
    optimizer.useAlgorithm(search)
    optimizer.optimize()

    name = "DHFR compare measures " + measure + " " + datetime.now().strftime('%Y%m%d%H%M')
    optimizer.writeFrequenciesToFASTA(optimizer.getBestFrequencies(), name + ".fasta", 3)
    optimizer.writeBestParamsToText(name + ".txt")
def smallTestBoltz():
    """Run a 2048-iteration cuckoo search over the microstate data with a
    fixed ensemble size of 128, and write the best frequencies/parameters to
    time-stamped output files.

    Fixed: the FileNotFoundError fallback called ``readtargetfrequencies`` /
    ``readFormattedMicrostatedata`` on ``targetfreqsalt`` / ``datamicroalt``
    (wrong capitalization), so the alternate-machine path always crashed
    with AttributeError/NameError.

    @return None
    """
    print("Hello!\n")
    MACROSTATES = enum("E-DHF-NADPH", "E-NADPH", "E-THF", "E-THF-NADPX", "TS")
    RESIDUES = enum('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y')

    ensembleSizes = numpy.array([128])
    backrubTemps = numpy.array([0.3, 0.6, 0.9, 1.2, 1.5, 1.8])
    boltzmannTemps = numpy.array([-1, 5])
    steepnessRange = numpy.array([1, 7])
    minWeights = numpy.array([0, 0, 0, 0, 0])
    maxWeights = numpy.array([1, 1, 1, 1, 1])

    print("Initializing objects\n")
    targetFreqs = "C:\\Users\\Candy\\SkyDrive\\Documents\\rotation 2\\ecDHFR_openseq_bacterial_representative_final_align_trim.fasta"
    targetFreqsAlt = "C:\\Users\\Candy_000\\SkyDrive\\Documents\\rotation 2\\ecDHFR_openseq_bacterial_representative_final_align_trim.fasta"
    dataMicro = "C:\\Users\\Candy\\SkyDrive\\Documents\\rotation 2\\20160120_M20_enumeration_scores\\microstates.dat"
    dataMicroAlt = "C:\\Users\\Candy_000\\SkyDrive\\Documents\\rotation 2\\20160120_M20_enumeration_scores\\microstates.dat"

    optimizer = Optimizer(MACROSTATES, True)
    # slightly different paths on my two computers
    try:
        optimizer.readTargetFrequencies(targetFreqs)
        optimizer.readFormattedMicrostateData(dataMicro)
    except FileNotFoundError:
        optimizer.readTargetFrequencies(targetFreqsAlt)
        optimizer.readFormattedMicrostateData(dataMicroAlt)

    search = CuckooSearch(optimizer.models, JensenShannonDistance(optimizer.targetFrequencies), True, 64, 1, 0.25)
    search.setMaxIterations(2048)
    search.setParamBounds(ensembleSizes, backrubTemps, boltzmannTemps, steepnessRange, minWeights, maxWeights)
    # NOTE(review): six weight-search flags are passed although this enum has
    # only five macrostates — confirm against setSearchParameters' contract.
    search.setSearchParameters(False, True, True, True, numpy.array([True, True, True, True, True, True]))
    optimizer.useAlgorithm(search)
    optimizer.optimize()

    now = datetime.now()
    optimizer.writeFrequenciesToFASTA(optimizer.getBestFrequencies(), "var ensembles " + now.strftime('%Y%m%d%H%M') + ".fasta")
    optimizer.writeBestParamsToText("var ensembles " + now.strftime('%Y%m%d%H%M'))
    return None
def onlyMacro():
    """Run a 2048-iteration JSD cuckoo search on one macrostate-only data
    repeat and write the best frequencies/parameters to named output files.

    @return None
    """
    MACROSTATES = enum("E-DHF-NADPH", "E-NADPH", "E-OPEN", "E-THF", "E-THF-NADPX", "TS")
    RESIDUES = enum('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y')

    i = 0  # selects which data repeat is read and named in the outputs

    # only looking at MACROSTATE.TS
    # only optimizing backrub temperature and steepness
    ensembleSizes = numpy.array([20, 50])
    backrubTemps = numpy.array([0.3, 0.6, 0.9, 1.2, 1.5, 1.8])
    boltzmannTemps = numpy.array([0, -1, 1, 5.0])
    steepnessRange = numpy.array([0.5, 5])
    minWeights = numpy.array([0, 0, 0, 0, 0, 0])
    maxWeights = numpy.array([1, 1, 0, 1, 1, 1])

    data = "/netapp/home/tianjiao.zhang/data/DHFR_MSD_M20loop_repeat" + str(i + 1) + ".tsv"
    targetFreqs = "/netapp/home/tianjiao.zhang/data/ecDHFR_openseq_bacterial_representative_final_align_trim.fasta"

    optimizer = Optimizer(MACROSTATES)
    optimizer.readTargetFrequencies(targetFreqs)
    optimizer.readData(data)

    search = CuckooSearch(optimizer.models, JensenShannonDistance(optimizer.targetFrequencies), False, 64, 1, 0.25)
    measure = " JSD"
    search.setMaxIterations(2048)
    search.setParamBounds(ensembleSizes, backrubTemps, boltzmannTemps, steepnessRange, minWeights, maxWeights)
    search.setSearchParameters(True, True, True, True, numpy.array([True, True, False, True, True, True]))

    optimizer.useAlgorithm(search)
    optimizer.optimize()

    name = "Macrostates " + str(i + 1) + measure + datetime.now().strftime('%Y%m%d%H%M')
    optimizer.writeFrequenciesToFASTA(optimizer.getBestFrequencies(), name + ".fasta", 3)
    optimizer.writeBestParamsToText(name + ".txt")
    return None
from SimilarityMeasure import SimilarityMeasure from JensenShannonDistance import JensenShannonDistance from CuckooSearch import CuckooSearch from enumeration import enum from Optimizer import Optimizer import numpy MACROSTATES = enum("E-DHF-NADPH", "E-NADPH", "E-OPEN", "E-THF", "E-THF-NADPX", "TS") RESIDUES = enum('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y') # only looking at MACROSTATE.TS # only optimizing backrub temperature and steepness ensembleSizes = numpy.array([20, 50]) backrubTemps = numpy.array([0.3, 0.6, 0.9, 1.2, 1.5, 1.8]) boltzmannTemps = numpy.array([0, -1, 1, 5.0]) steepnessRange = numpy.array([0.5, 5]) minWeights = numpy.array([0, 0, 0, 0, 0, 0]) maxWeights = numpy.array([1, 1, 0, 1, 1, 1]) data = "/netapp/home/tianjiao.zhang/data/DHFR_MSD_M20loop_repeat1.tsv" targetFreqs = "/netapp/home/tianjiao.zhang/data/ecDHFR_openseq_bacterial_representative_final_align_trim.fasta" optimizer = Optimizer(MACROSTATES) # slightly different paths on my two computers try: optimizer.readTargetFrequencies(targetFreqs) optimizer.readData(data) except:
def simpleRepeatTest():
    """Repeatedly (64x) run a one-iteration, one-nest search with all search
    dimensions frozen, and check that the model reconstructed from the best
    member's parameters matches that member; mismatching parameter IDs are
    printed.

    Fixed: the path fallback used a bare "except:", which would also have
    swallowed real parse errors; narrowed to FileNotFoundError.

    @return None
    """
    print("Hello!\n")
    MACROSTATES = enum("E-DHF-NADPH", "E-NADPH", "E-OPEN", "E-THF", "E-THF-NADPX", "TS")
    RESIDUES = enum('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y')

    ensembleSizes = numpy.array([50])
    backrubTemps = numpy.array([0.3, 0.6, 0.9, 1.2, 1.5, 1.8])
    boltzmannTemps = numpy.array([0, -1, 1, 5.0])
    steepnessRange = numpy.array([0.5, 5])
    minWeights = numpy.array([0, 0, 0, 0, 0, 1])
    maxWeights = numpy.array([0, 0, 0, 0, 0, 1])

    print("Initializing objects\n")
    targetFreqs = "C:\\Users\\Candy\\SkyDrive\\Documents\\rotation 2\\ecDHFR_openseq_bacterial_representative_final_align_trim.fasta"
    targetFreqsAlt = "C:\\Users\\Candy_000\\SkyDrive\\Documents\\rotation 2\\ecDHFR_openseq_bacterial_representative_final_align_trim.fasta"
    data = "C:\\Users\\Candy\\SkyDrive\\Documents\\rotation 2\\DHFR_MSD_M20loop\\DHFR_MSD_M20loop_repeat1.tsv"
    dataAlt = "C:\\Users\\Candy_000\\SkyDrive\\Documents\\rotation 2\\DHFR_MSD_M20loop\\DHFR_MSD_M20loop_repeat1.tsv"
    dataMicro = "C:\\Users\\Candy\\SkyDrive\\Documents\\rotation 2\\20160120_M20_enumeration_scores\\20160120_M20_enumeration_scores.tsv"
    dataMicroAlt = "C:\\Users\\Candy_000\\SkyDrive\\Documents\\rotation 2\\20160120_M20_enumeration_scores\\20160120_M20_enumeration_scores.tsv"

    optimizer = Optimizer(MACROSTATES)
    # slightly different paths on my two computers
    try:
        optimizer.readTargetFrequencies(targetFreqs)
        optimizer.readData(data)
    except FileNotFoundError:
        optimizer.readTargetFrequencies(targetFreqsAlt)
        optimizer.readData(dataAlt)
    print("Files read in")

    search = CuckooSearch(optimizer.models, JensenShannonDistance(optimizer.targetFrequencies), False, 1, 1, 0.25)
    search.setMaxIterations(1)
    search.setParamBounds(ensembleSizes, backrubTemps, boltzmannTemps, steepnessRange, minWeights, maxWeights)
    search.setAllSearchToFalse()
    search.suppressOutputs = True
    optimizer.useAlgorithm(search)

    print("\nJS Dist")
    for i in range(64):
        optimizer.optimize()
        params = optimizer.getBestParameters()
        m = search.population[0]
        # TODO: getModelByParams doesn't always return the same object.
        m1 = Model.constructFromExisting(optimizer.getModelByParams(m.backrubTemp, m.ensembleSize, m.boltzmannTemp),
                                         m.ensembleSize, m.backrubTemp, m.boltzmannTemp, m.getWeights(), m.steepness)
        if not m.equalTo(m1):
            print("\t{:s}".format(Optimizer.calcParamsID(m.backrubTemp, m.ensembleSize, m.boltzmannTemp)))
            print("\t{:s}".format(Optimizer.calcParamsID(m1.backrubTemp, m1.ensembleSize, m1.boltzmannTemp)))
    return None
def repeatTest():
    """Repeatedly (32x) run short cuckoo searches with the JSD and cosine
    measures, printing each run's best match score and whether the reported
    parameters can be independently verified.

    Fixed: the path fallback used a bare "except:"; narrowed to
    FileNotFoundError so real read errors are no longer swallowed.

    @return None
    """
    print("Hello!\n")
    MACROSTATES = enum("E-DHF-NADPH", "E-NADPH", "E-OPEN", "E-THF", "E-THF-NADPX", "TS")
    RESIDUES = enum('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y')

    ensembleSizes = numpy.array([50])
    backrubTemps = numpy.array([0.3, 0.6, 0.9, 1.2, 1.5, 1.8])
    boltzmannTemps = numpy.array([0, -1, 1, 5.0])
    steepnessRange = numpy.array([0.5, 5])
    minWeights = numpy.array([0, 0, 0, 0, 0, 0])
    maxWeights = numpy.array([1, 1, 0, 1, 1, 1])

    print("Initializing objects\n")
    targetFreqs = "C:\\Users\\Candy\\SkyDrive\\Documents\\rotation 2\\ecDHFR_openseq_bacterial_representative_final_align_trim.fasta"
    targetFreqsAlt = "C:\\Users\\Candy_000\\SkyDrive\\Documents\\rotation 2\\ecDHFR_openseq_bacterial_representative_final_align_trim.fasta"
    data = "C:\\Users\\Candy\\SkyDrive\\Documents\\rotation 2\\DHFR_MSD_M20loop\\DHFR_MSD_M20loop_repeat1.tsv"
    dataAlt = "C:\\Users\\Candy_000\\SkyDrive\\Documents\\rotation 2\\DHFR_MSD_M20loop\\DHFR_MSD_M20loop_repeat1.tsv"
    dataMicro = "C:\\Users\\Candy\\SkyDrive\\Documents\\rotation 2\\20160120_M20_enumeration_scores\\20160120_M20_enumeration_scores.tsv"
    dataMicroAlt = "C:\\Users\\Candy_000\\SkyDrive\\Documents\\rotation 2\\20160120_M20_enumeration_scores\\20160120_M20_enumeration_scores.tsv"

    optimizer = Optimizer(MACROSTATES)
    # slightly different paths on my two computers
    try:
        optimizer.readTargetFrequencies(targetFreqs)
        optimizer.readData(data)
    except FileNotFoundError:
        optimizer.readTargetFrequencies(targetFreqsAlt)
        optimizer.readData(dataAlt)
    print("Files read in")

    # NOTE(review): the extent of this loop body was ambiguous in the
    # whitespace-mangled original; both measure runs are assumed to repeat
    # each of the 32 iterations (the alternative makes range(32) pointless
    # because of the trailing return) — confirm against version history.
    for i in range(32):
        search = CuckooSearch(optimizer.models, JensenShannonDistance(optimizer.targetFrequencies), False, 8, 1, 0.25)
        search.setMaxIterations(16)
        search.suppressOutputs = True
        search.setParamBounds(ensembleSizes, backrubTemps, boltzmannTemps, steepnessRange, minWeights, maxWeights)
        search.setSearchParameters(False, True, True, True, numpy.array([True, True, False, True, True, True]))
        optimizer.useAlgorithm(search)
        print("\nJS Dist")
        optimizer.optimize()
        params = optimizer.getBestParameters()
        print(params['match'])
        print(optimizer.verifyFoundParams(params['ensembleSize'], params['backrubTemp'], params['boltzmannTemp'], params['steepness'], params['weights']))

        search = CuckooSearch(optimizer.models, CosineSimilarity(optimizer.targetFrequencies), False, 8, 1, 0.25)
        search.setMaxIterations(16)
        search.suppressOutputs = True
        search.setParamBounds(ensembleSizes, backrubTemps, boltzmannTemps, steepnessRange, minWeights, maxWeights)
        search.setSearchParameters(False, True, True, True, numpy.array([True, True, False, True, True, True]))
        optimizer.useAlgorithm(search)
        print("\nCosine")
        optimizer.optimize()
        params = optimizer.getBestParameters()
        print(params['match'])
        print(optimizer.verifyFoundParams(params['ensembleSize'], params['backrubTemp'], params['boltzmannTemp'], params['steepness'], params['weights']))
    return None
data_filename = job_params[4] simMeas_id = job_params[5] iterations = int(job_params[6]) #iterations = 4 usedstates = numpy.array(job_params[7:]).astype(dtype=bool) #print(usedstates) if job_id == str(task_id): print('using options for job %s\n' % job_id) break #targetFreqs = "/Users/anatale/Documents/school/UCSF/Kortemme_lab/code/fitness-data-analysis/highscale_trim.fasta" targetFreqs = os.path.join(input_path, targetFreqs_filename) data = os.path.join(input_path, data_filename) #data = "/Users/anatale/Documents/school/UCSF/Kortemme_lab/code/fitness-data-analysis/testing_microstates.tsv" MACROSTATES = enum('1i2m', '1a2k', '1k5d', '3gj0', 'importin', 'composite') RESIDUES = enum('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y') # parameter reanges to optimize ensembleSizes = numpy.array([60, 70, 80, 90, 100]) backrubTemps = numpy.array([0.9]) #boltzmannTemps = numpy.array([-1.0]) # set below steepnessRange = numpy.array([0.5, 5]) minWeights = numpy.array([0, 0, 0, 0, 0, 0]) maxWeights = numpy.array([1, 1, 1, 1, 1, 1]) optimizer = Optimizer(MACROSTATES) optimizer.readTargetFrequencies(targetFreqs) print('pos before loading data: ', optimizer.nPositions) #optimizer.readData(data)
class Model: """ A multistate design model """ # NOTE: Java documentation style. mainly because idk what's a standard one for python and Java's is most readable # I need these statements for intellisense. # vars used by all instances # TODO: is the enum even useful? MACROSTATES = enum() # enum of the macrostates nMacrostates = 0 # int ensembleSize = 0 # int nPositions = 0 # int, number of positions on sequence examined contiguousPositions = True # are the positions contiguous? positionMap = {} # if the positions are not contiguous, a map from non-contiguous to [0, nPositions) positionOffset = 0 # offset for positions when the positions examined are contiguous backrubTemp = 0.0 # float boltzmannTemp = 0.0 # float weights = numpy.array(0) # float[] relative weights of the macrostates steepness = 0.0 # float steepness of siggmoid (s) fitnesses = numpy.array(0) # double[position][residue fitness] calculated fitnesses of residues at each location frequencies = numpy.array(0) # double[position][residue frequency] calculated frequencies of residues at each location macrostateResidueEnergies = numpy.array(0) # double[position][residue energy][macrostate] recovery = -1.0 # float assigned by the outside similiartyMeasure to how well this mode recovers the sequence # TODO: check if macrostatesUsed is actually useful macrostatesUsed = numpy.array(False) # bool[], macrostates examined during optimization # vars used when microstate data is involved useAltAveragingMethod = False # use accepted avg method or (the alt) xinjie's expression? isFrequenciesCalculated = False # prevent unnecessary calculations useMicrostateData = False # do we have data from individual microstates? areMicrostatesPicked = False # have microstates been selected to be used in the ensemble? 
microstateResidueEnergies = numpy.array(0) # double[position][residue energy][macrostate][microstate] selectedMicrostateEnergies = numpy.array(0) # double[position][residue energy][macrostate][microstate], subset of microstateREsidueEnergies microstateCounts = numpy.array(0) # double[position][macrostate] number of microstates microstatesUsed = numpy.array(0) # int[position][macrostate][microstate index], the microstates used to calculate the macrostates def __init__(self, macrostates: enum, ensembleSize: int, backrubTemp: float, boltzmannTemp: float, weights: numpy.array, steepness: float, positions: int, positionOffset: int, useMicrostateData: bool = False, posMap: dict = None, useAltAverageMethod: bool = False): """ Default constructor @param macrostates enumeration of the different macrostates in this model @param ensembleSize int of macrostate ensemble sizes @param backrubTemp float of the backrub temperature @param boltzmannTemp float, Boltzmann averaging temeprature @param weights float[], wights for the macrostates @param steepness float, steepness in fitness funciton @param positions int, number of positions examined in this midel @param positionOffset int, the index of the lowest index to be examined @param useMicrostateData bool, are we to actually average microstate data? @param posMap dict<int, int> a remapping of position values if the positions are not contiguous. ONLY pass an object if the positions are not contiguous @param useAltAveragingMethod bool, use the other Boltzmann averaging calculation method? 
""" self.MACROSTATES = macrostates self.nMacrostates = macrostates.size self.ensembleSize = ensembleSize self.backrubTemp = backrubTemp self.boltzmannTemp = boltzmannTemp self.weights = weights self.steepness = steepness self.nPositions = positions self.positionOffset = positionOffset self.isFrequenciesCalculated = False self.useMicrostateData = useMicrostateData self.useAltAveragingMethod = useAltAverageMethod self.macrostatesUsed = numpy.array([True] * self.nMacrostates) self.positionMap = deepcopy(posMap) if posMap is not None: self.contiguousPositions = False else: self.contiguousPositions = True # allocate arrays self.macrostateResidueEnergies = numpy.zeros( [self.nPositions, 20, self.nMacrostates], dtype=numpy.float64) self.fitnesses = numpy.ones([self.nPositions, 20], dtype=numpy.float64) self.frequencies = numpy.zeros([self.nPositions, 20], dtype=numpy.float64) # a little checker to prevent two identical entries, used in function addMacrostateData() for i in range(self.nMacrostates): for j in range(self.nPositions): self.macrostateResidueEnergies[j][0][i] = 65536.0 if self.useMicrostateData: self.areMicrostatesPicked = False self.microstatesUsed = numpy.zeros([0]) self.microstateCounts = numpy.zeros( [self.nPositions, self.nMacrostates], dtype=int) self.microstateResidueEnergies = numpy.zeros( [self.nPositions, 20, self.nMacrostates, 700], dtype=numpy.float64) # magic number 700 - max expected number of microstates def constructFromExisting(existing, ensembleSize: int, backrubTemp: float, boltzmannTemp: float, weights: numpy.array, steepness: float): """ "Overloaded" "constructor" that uses a pre-existing Model as a template @param existing pre-exisiting Model object @param ensembleSize int, new ensemble size @param backrubTemp float, new backrub temperate @param boltzmannTemp float, new Boltzmann temerature @param weights float[], new weights @return Model """ new = Model(existing.MACROSTATES, ensembleSize, backrubTemp, boltzmannTemp, weights, steepness, 
existing.nPositions, existing.positionOffset, existing.useMicrostateData) new.macrostatesUsed = existing.macrostatesUsed new.microstatesUsed = existing.microstatesUsed new.contiguousPositions = existing.contiguousPositions # TODO: is deepy copy necessary for all these values? #new.positionMap = deepcopy(existing.positionMap); # deep copy dict, keep instances completely separate #if (not existing.useMicrostateData): # not using microstate data # new.macrostateResidueEnergies = numpy.array(existing.macrostateResidueEnergies, dtype = numpy.float64); #elif ensembleSize == existing.ensembleSize: # using microstate data, already collapsed # new.selectedMicrostateEnergies = numpy.array(existing.selectedMicrostateEnergies); # new.areMicrostatesPicked = True; #else: # using microstate data, not collapsed # #print("!", end='') # new.microstateResidueEnergies = numpy.array(existing.microstateResidueEnergies); # new.microstateCounts = numpy.array(existing.microstateCounts); # SHALLOW COPY of raw data for faster since they should not need modification during any runs new.positionMap = existing.positionMap if (not existing.useMicrostateData): # not using microstate data new.macrostateResidueEnergies = existing.macrostateResidueEnergies else: if ensembleSize == existing.ensembleSize: # using microstate data, already collapsed new.selectedMicrostateEnergies = existing.selectedMicrostateEnergies new.areMicrostatesPicked = True #else: # using microstate data, not collapsed #print("!", end='') new.microstateResidueEnergies = existing.microstateResidueEnergies new.microstateCounts = existing.microstateCounts return new def setPositionMap(self, posMap: dict) -> None: """ Deprecated. 
There should not be a reason to EVER call this function If this model is using non-contiguous position numbers, sets the Map<int, int> used to convert them to contiguous numbers on [0, nPositions] @param posMap dict, Map<int, int> of how to change the numbers @return None """ warnings.warn( "Obsolete function. its function was taken care of during construction. Now results are not guaranteed", DeprecationWarning) self.positionMap = deepcopy(posMap) return None def addMacrostateData(self, macrostate: int, position: int, energies: "float[]") -> None: """ Inserts a macrostate_position set of fitness values to this model @param macrostate int, the macrostate this corresponds to @param position int, the position the energies corresponds to @param energies float[] of length-20 of the energies @return void """ # convert raw position to an internal index if self.contiguousPositions: pos = position - self.positionOffset else: pos = self.positionMap[position] if self.macrostateResidueEnergies[pos][0][macrostate] != 65536.0: raise Exception("Something something this entry already full") for i in range(20): self.macrostateResidueEnergies[pos][i][macrostate] = energies[i] def addMicrostateData(self, macrostate: int, position: int, energies: "float") -> None: """ Inserts a microstate_position fitness into this model @param macrostate int, the macrostate this microstate belongs to @param position int, the position the energy corresponds to @param energy float, the energy of this mutation @retun void """ if self.contiguousPositions: position -= self.positionOffset else: position = self.positionMap[position] # TODO: do I need a overwrite check as in adding macrostate data? 
for i in range(20): self.microstateResidueEnergies[position][i][macrostate][ self.microstateCounts[position][macrostate]] = energies[i] self.microstateCounts[position][macrostate] += 1 return None def useAltAverageMethod(self, yes: bool) -> None: """ Changes whether to use the other averaging method @param yes bool whether to use it or not @return void """ self.useAltAveragingMethod = yes self.isFrequenciesCalculated = False # change weights sets # TODO ascartain that this is actually necessary def setWeights(self, newWeights: numpy.array) -> None: warning.warn("Why are you changing the weights directly in a model?", UserWarning) self.weights = newWeights # PRIVATE # TODO: add flag to only compute once. Then we should be able to remove the deep copy def averageMicrostates(self) -> None: """ Boltzmann averages the microstates to calculate the energy for the macrostate @param void @return void """ #print(self.microstateResidueEnergies[0][0]); #print(); if not self.areMicrostatesPicked: # pick backbones to use for the ensemble. 
self.microstatesUsed = numpy.zeros( [self.nPositions, self.nMacrostates, self.ensembleSize], dtype=int) for i in range(self.nPositions): for j in range(self.nMacrostates): self.microstatesUsed[i][j] = numpy.random.randint( 0, self.microstateCounts[i][j], [self.ensembleSize]) # cherry-pick out the selected microstates self.selectedMicrostateEnergies = numpy.zeros( [self.nPositions, 20, self.nMacrostates, self.ensembleSize]) for i in range(self.nPositions): for j in range(20): for k in range(self.nMacrostates): for l in range(self.ensembleSize): self.selectedMicrostateEnergies[i][j][k][ l] = self.microstateResidueEnergies[i][j][k][ self.microstatesUsed[i][k][l]] self.areMicrostatesPicked = True #print(self.selectedMicrostateEnergies[0][0]); #print(); if not self.useAltAveragingMethod: if (self.boltzmannTemp == 0.0): self.macrostateResidueEnergies = numpy.amin( self.selectedMicrostateEnergies, axis=3) elif (self.boltzmannTemp == -1.0): self.macrostateResidueEnergies = numpy.mean( self.selectedMicrostateEnergies, axis=3) else: self.macrostateResidueEnergies = numpy.sum( self.selectedMicrostateEnergies * numpy.exp( self.selectedMicrostateEnergies / -self.boltzmannTemp), axis=3) / numpy.sum(numpy.exp( self.selectedMicrostateEnergies / -self.boltzmannTemp), axis=3) else: if (self.boltzmannTemp == 0.0): self.macrostateResidueEnergies = numpy.amin( self.selectedMicrostateEnergies, axis=3) elif (self.boltzmannTemp == -1.0): self.macrostateResidueEnergies = numpy.mean( self.selectedMicrostateEnergies, axis=3) else: self.macrostateResidueEnergies = -numpy.log( sum(numpy.exp( self.selectedMicrostateEnergies / -self.boltzmannTemp), axis=3)) #print(self.macrostateResidueEnergies[0]); # After averaging, delete the 4D array to save space and flip the microstate flag self.microstateResidueEnergies = numpy.array(0) return None # PRIVATE def calcFitness(self) -> None: """ Calculates the fitnesses of the each residue at each position. 
There is no need for this function to be externally called @param void @return void """ # collapse microstates into macrostates if self.useMicrostateData: self.averageMicrostates() minEnergies = numpy.amin(self.macrostateResidueEnergies, axis=1) # for each position and macrostate, which residue had min energy? offsets = minEnergies + numpy.divide(numpy.log(99), self.steepness) # calculate offset is double[position][macrostate] self.fitnesses = numpy.ones([self.nPositions, 20], dtype=numpy.float64) for i in range(self.nPositions): for j in range(20): for k in range(self.nMacrostates): f = 1.0 / (1.0 + numpy.exp( self.steepness * (self.macrostateResidueEnergies[i][j][k] - offsets[i][k]))) self.fitnesses[i][j] *= (1 - self.weights[k] + self.weights[k] * f) # PRIVATE def calcFrequencies(self) -> None: """ Calculates the fitnesses and the frequencies of residues at each location @param void @return void """ if not self.isFrequenciesCalculated: # only do it once! self.isFrequenciesCalculated = True self.calcFitness() self.frequencies = numpy.divide( self.fitnesses, numpy.subtract(1.0, self.fitnesses)) # non-normalized frequencies sums = numpy.sum(self.frequencies, axis=1) for i in range(self.nPositions): # normalize self.frequencies[i] = numpy.divide(self.frequencies[i], sums[i]) # get functions # member fields should not be directly accessed; use these get funtions instead def getEnsembleSize(self) -> int: """ Self-explanatory name @return int """ return self.ensembleSize def getBackrubTemp(self) -> float: """ Self-explanatory name @return double """ return self.backrubTemp def getBoltzmannTemp(self) -> float: """ Self-explanatory name @return double """ return self.boltzmannTemp def getWeights(self) -> numpy.array: """ Self-explanatory name. 
Return a deep copy of the array so it's safe to directly do math on the return value @return float[] """ return numpy.array(self.weights) def getSteepness(self) -> float: """ Self-explanatory name @return float """ return self.steepness def getFrequencies(self) -> numpy.array: """ Self-explanatory name. Returns a deep copy of the array so it's safe to directly do math on the return value @return float[][] """ # this is a special instance for storing data if self.ensembleSize == 0 and self.useMicrostateData: raise PermissionError( "This object is a raw data storage instance and this call should not have been made" ) return None if not self.isFrequenciesCalculated: self.calcFrequencies() return numpy.array(self.frequencies) def equalTo(self, other) -> bool: """ Are the data stored in this Model correct? i.e. does everything actually correspond to the input file's data? Used for debugging @param other Model object to compare this to @return bool if everything compared correctly """ retval = True if not isinstance(other, Model): print("is not same class") retval = False if self.ensembleSize != other.ensembleSize: print("ensemble sizes: {:d}, {:d}".format(self.ensembleSize, other.ensembleSize)) retval = False if self.boltzmannTemp != other.boltzmannTemp: print("boltzmann temps: {:.2f}, {:.2f}".format( self.boltzmannTemp, other.boltzmannTemp)) retval = False if self.backrubTemp != other.backrubTemp: print("backrub temps: {:.2f}, {:.2f}".format( self.backrubTemp, other.backrubTemp)) retval = False if numpy.sum( numpy.abs(self.macrostateResidueEnergies - other.macrostateResidueEnergies)) > 1e-9: print( numpy.sum( numpy.abs(self.macrostateResidueEnergies - other.macrostateResidueEnergies))) retval = False if not retval: print("{:d}\t{:.2f}\t{:.2f}".format(self.ensembleSize, self.boltzmannTemp, self.backrubTemp)) return retval # comparison operators based on similarity measure def __eq__(self, other): assert (isinstance(other, Model)) return self.recovery == other.recovery # 
see, this is where static typing comes in useful. it lets autocomplete see the fields of other def __le__(self, other): assert (isinstance(other, Model)) return self.recovery <= other.recovery def __lt__(self, other): assert (isinstance(other, Model)) return self.recovery < other.recovery def __ge__(self, other): assert (isinstance(other, Model)) return self.recovery >= other.recovery def __gt__(self, other): assert (isinstance(other, Model)) return self.recovery < other.recovery def __ne__(self, other): assert (isinstance(other, Model)) return self.recovery != other.recovery
import math as magic
import ast
import numpy
import warnings
from io import *
from enumeration import enum
from copy import *

# should the macrostates be hard-coded? probably not if this ends up being
# actually used for tuning other models...
#MACROSTATES_T = enum("E-DHF-NADPH", "E-NADPH", "E-OPEN", "E-THF", "E-THF-NADPX", "TS");

# the 20 canonical amino acids, one-letter codes in alphabetical order
RESIDUES = enum('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y')


class Model:
    """ A multistate design model """
    # NOTE: Java documentation style. mainly because idk what's a standard one for python and Java's is most readable

    # I need these statements for intellisense.
    # vars used by all instances — class-level defaults only; instances
    # re-assign these during construction.
    # TODO: is the enum even useful?
    MACROSTATES = enum()  # enum of the macrostates
    nMacrostates = 0  # int, number of macrostates in the enum
    ensembleSize = 0  # int, number of backbones in the ensemble
    nPositions = 0  # int, number of positions on sequence examined
class Optimizer(object):
    """ Optimizes the hyperparameters of a model """
    # class-level declarations for intellisense only; __init__ re-assigns all
    # TODO: make all parameters generic
    models = {}  # Map<hyperparams, Model> of the input data, since values are precalculated
    optimizationAlgorithm = SearchAlgorithm()  # the algorithm to be used
    #similarityMeasure = SimilarityMeasure();  # the similarity measure to be used
    nPositions = 0  # number of positions examined
    contiguousPositions = True  # are the set of positions contiguous
    positionMap = None  # maps non-contiguous positions to [0, nPositions]
    minPosition = 0  # position offset for indexing
    targetFrequencies = numpy.array(0)  # float[position][residue] internal representation of the target frequencies
    MACROSTATES = enum()  # enum of the macrostates
    nMacrostates = 0  # number of macrostates
    continuousBoltzmann = False  # are we using a continuous set of boltzmann temps
    targetFreqsRead = False  # guard: target freqs must be read before data

    def __init__(self, macrostates=None, continuousBoltzmann=False,
                 contiguousPositions=True):
        """
        Default constructor.

        @param macrostates          enum of the macrostates to be considered
        @param continuousBoltzmann  bool, are microstate data provided
        @param contiguousPositions  bool, are the positions aligned to a
                                    contiguous set of positions in the target?
        """
        self.MACROSTATES = macrostates
        # turn this enum into what we really need for indexing - a dictionary.
        # this is a kludgy solution, but it would take a while to dig all the
        # enum stuff out of this code
        self.macStateToIndex = {}
        for elem in self.MACROSTATES.__dict__:
            if elem != 'size':
                if type(self.MACROSTATES.__dict__[elem]) is int:
                    self.macStateToIndex[elem] = self.MACROSTATES.__dict__[elem]
        self.nMacrostates = self.MACROSTATES.size
        self.continuousBoltzmann = continuousBoltzmann
        self.models = {}
        self.nPositions = 0
        self.minPosition = 0
        self.targetFrequencies = numpy.array(0)
        self.contiguousPositions = contiguousPositions
        self.targetFreqsRead = False
        # only non-contiguous position sets need a remapping dict
        if not contiguousPositions:
            self.positionMap = {}
        else:
            self.positionMap = None

    # STATIC
    def copyFromExisting(existing):
        """
        Deep copies an existing Optimizer

        @param existing Optimizer to be copied
        @return Optimizer
        """
        newOptimizer = Optimizer(existing.MACROSTATES)
        newOptimizer.minPosition = existing.minPosition
        newOptimizer.nPositions = existing.nPositions
        newOptimizer.targetFrequencies = numpy.array(
            existing.targetFrequencies)
        newOptimizer.models = dict(existing.models)
        #newOptimizer.similarityMeasure = existing.similarityMeasure;
        newOptimizer.optimizationAlgorithm = existing.optimizationAlgorithm
        newOptimizer.contiguousPositions = existing.contiguousPositions
        newOptimizer.targetFreqsRead = existing.targetFreqsRead
        if not existing.contiguousPositions:
            newOptimizer.positionMap = deepcopy(existing.positionMap)
        return newOptimizer

    # TODO: change the file return type to file read return
    def readTargetFrequencies(self, source, posPicker=None):
        """
        Reads the target frequencies from a FASTA file. Call this before reading data

        Note: when optimizing against a set of positions that are not
        contiguous, this function *MUST* be called before calling a read*Data
        function. Doing otherwise will void all warranties and promises that
        calculations will be correct.

        @param source    string pointing to the location of the input FASTAs
        @param posPicker optional string naming an index file; when given,
                         superfluous positions are stripped from the result
        @return array of the target frequencies
        """
        # resToIndex converts a one-letter AA code to an index, alphabetical
        resToIndex = {
            'A': 0, 'C': 1, 'D': 2, 'E': 3, 'F': 4, 'G': 5, 'H': 6, 'I': 7,
            'K': 8, 'L': 9, 'M': 10, 'N': 11, 'P': 12, 'Q': 13, 'R': 14,
            'S': 15, 'T': 16, 'V': 17, 'W': 18, 'Y': 19
        }
        infile = open(source, 'r', encoding='utf-8')

        # figure out the number of positions from the first entry
        infile.readline()  # skip the first line (>NULL)
        entry = ""
        thisLine = infile.readline()
        while thisLine[0] != '>':
            entry += thisLine
            thisLine = infile.readline()
        self.nPositions = len(entry) - 1  # -1 for the trailing newline
        # case where the entries end with a dash
        # BUG FIX: the original compared a character against the int 0
        # (always False) and then decremented an unbound local `nPositions`.
        if entry[-2] == '-':
            self.nPositions -= 1

        # allocate space for a double[][]
        self.targetFrequencies = numpy.zeros([self.nPositions, 20],
                                             dtype=float)

        # read entries
        # 2/17 note: this has been modified to be ok with files that have
        # unaligned positions, i.e. '-' in sequence
        infile.seek(0)  # go back to the start
        nEntries = numpy.zeros([self.nPositions], dtype=int)
        thisEntry = ""
        for line in infile:
            if line[0] == '>':  # '>' indicates the start of a new entry
                if thisEntry == "":  # no entry to process yet
                    pass
                else:
                    # add the residues in this entry to the counts
                    for i in range(self.nPositions):
                        if thisEntry[i] != '-':  # only when a residue is aligned here
                            try:
                                self.targetFrequencies[i][resToIndex[
                                    thisEntry[i]]] += 1
                                nEntries[i] += 1
                            except KeyError:
                                # non-single residue code; skip it
                                continue
                    thisEntry = ""  # clear it to read the next entry
            else:  # middle of an entry, append this line
                thisEntry += line

        # counts to frequencies
        for i in range(self.nPositions):
            for j in range(20):
                self.targetFrequencies[i][j] /= nEntries[i]
        infile.close()

        # 2/17 added parts to allow for removal of superfluous positions
        if posPicker is not None:
            self.contiguousPositions = False
            indices = self.positionReindexerFASTA(posPicker)
            self.nPositions = len(indices)
            freqs = numpy.zeros([self.nPositions, 20])
            for i in range(len(indices)):
                freqs[i] = self.targetFrequencies[indices[i]]
            self.targetFrequencies = freqs

        self.targetFreqsRead = True
        return numpy.array(self.targetFrequencies)

    # read raw macrostate data
    # TODO: change the file return type to file read return
    def readData(self, source):
        """
        Reads in a tab-delimited file of ensembles encoding macrostate data

        @param source   a string pointing to the location of the tab-delimited file
        @return void
        """
        if not self.targetFreqsRead:
            warnings.warn("Hey, call the read target freqs functions first!",
                          UserWarning)

        # used to convert the input dict to an array
        indexToRes = {
            0: 'A', 1: 'C', 2: 'D', 3: 'E', 4: 'F', 5: 'G', 6: 'H', 7: 'I',
            8: 'K', 9: 'L', 10: 'M', 11: 'N', 12: 'P', 13: 'Q', 14: 'R',
            15: 'S', 16: 'T', 17: 'V', 18: 'W', 19: 'Y'
        }

        self.models.clear()
        infile = open(source, 'r')
        isFirstLine = True
        isFirstEntry = True
        self.minPosition = 65535  # used for offsetting indices in Macrostate
        placeHolderWeights = numpy.array([0, 0, 0, 0])
        placeHolderSteep = 1

        for line in infile:
            # ignore the first line since it's just column headers
            if isFirstLine:
                isFirstLine = False
                continue

            entries = line.split('\t')  # entries: a list of strings
            macrostate = entries[0]
            backrubT = entries[1]
            ensembleS = entries[2]
            boltzmanT = entries[3]
            position = entries[4]
            # ast.literal_eval converts a string to a dictionary
            energies = ast.literal_eval(entries[5])

            # now ints
            macrostate = self.macStateToIndex[macrostate]
            position = int(position)
            ensembleS = int(ensembleS)

            # record minPosition - assumes that the first entry is at the
            # first position
            # TODO: fix this to actually find the minimum position
            if isFirstEntry:
                self.minPosition = position
                isFirstEntry = False

            # skip superfluous positions
            if position < self.minPosition or \
                    position >= self.minPosition + self.nPositions:
                continue

            # now doubles
            backrubT = float(backrubT)
            if boltzmanT == "min":  # account for the possible text values
                boltzmanT = 0.0
            elif boltzmanT == "mean":
                # -1 stands in for the mean case since you can't really do
                # math with numpy.inf
                boltzmanT = -1.0
            else:
                boltzmanT = float(boltzmanT)

            # calc model ID from the strings because that's always unique
            ID = Optimizer.calcParamsID(backrubT, ensembleS, boltzmanT)

            # now an array
            temp = numpy.zeros([20])
            for i in range(20):
                temp[i] = energies[indexToRes[i]]
            energies = temp

            # put this read into the internal structure
            if ID in self.models:
                self.models[ID].addMacrostateData(macrostate, position,
                                                  energies)
            else:
                model = Model(self.MACROSTATES, ensembleS, backrubT,
                              boltzmanT, placeHolderWeights, placeHolderSteep,
                              self.nPositions, self.minPosition)
                model.addMacrostateData(macrostate, position, energies)
                self.models[ID] = model

        infile.close()
        return None

    # read raw microstate data
    def readMicrostateData(self, source: str, minPosition: int):
        """
        Reads in raw microstate data. Unlike readData(), this function does
        not assume anything about the min position and it must be supplied
        manually

        @param source       string of the input file
        @param minPosition  int of the lowest position number
        @return void
        """
        if not self.targetFreqsRead:
            warnings.warn("Hey, call the read target freqs functions first!",
                          UserWarning)

        self.models.clear()
        self.minPosition = minPosition
        maxPos = 0
        indexToRes = {
            0: 'A', 1: 'C', 2: 'D', 3: 'E', 4: 'F', 5: 'G', 6: 'H', 7: 'I',
            8: 'K', 9: 'L', 10: 'M', 11: 'N', 12: 'P', 13: 'Q', 14: 'R',
            15: 'S', 16: 'T', 17: 'V', 18: 'W', 19: 'Y'
        }
        infile = open(source, 'r')
        # microstate models get their ensemble/boltzmann/weights later
        placeHolderWeights = None
        placeHolderSteep = 0
        placeHolderBoltzmannT = 0
        placeHolderEnsemble = 0
        n = 0
        isFirstLine = True

        for line in infile:
            # ignore the first line since it's just column headers
            if isFirstLine:
                isFirstLine = False
                continue

            n += 1
            entries = line.split('\t')
            macrostate = entries[0]
            backrubT = entries[1]
            position = int(entries[2])
            #backbone = entries[3]
            energies = ast.literal_eval(entries[4])

            # skip superfluous positions
            if position < self.minPosition or \
                    position >= self.minPosition + self.nPositions:
                print('skipping %d' % position)
                continue

            # convert from string to useful data types
            backrubT = float(backrubT)
            macrostate = self.macStateToIndex[macrostate]
            position = int(position)
            temp = numpy.zeros([20])
            for i in range(20):
                temp[i] = energies[indexToRes[i]]
            energies = temp

            if position > maxPos:
                maxPos = position
                print(maxPos)  # debug trace of the growing max position

            ID = Optimizer.calcParamsID(backrubT, None, None)
            if ID in self.models:
                self.models[ID].addMicrostateData(macrostate, position,
                                                  energies)
            else:
                model = Model(self.MACROSTATES, placeHolderEnsemble, backrubT,
                              placeHolderBoltzmannT, placeHolderWeights,
                              placeHolderSteep, self.nPositions,
                              self.minPosition, True, self.positionMap)
                model.addMicrostateData(macrostate, position, energies)
                self.models[ID] = model

        if self.contiguousPositions:
            self.nPositions = maxPos - minPosition + 1
        infile.close()
        return None

    def positionReindexer(data: str):
        """
        Used to offset arbitrary positions to start with 0. Used when
        converting files.

        The file should have three columns. The first is the index in the
        alignment, the second I have no idea, the third is the index in the
        residue sequence.

        @param data string pointing to the indices file
        @return dict mapping each unaligned position (as a string) to its
                0-based index, plus an 'nPos' entry with the total count
        """
        infile = open(data, 'r')
        # BUG FIX: the original initialized `indices` as a list and then
        # assigned into it with string keys ('nPos', entries[2]) — a
        # TypeError at runtime; this must be a dict.
        indices = {}
        n = 0
        for line in infile:
            entries = line.split(' ')
            i = entries[2].strip('\n')
            indices[i] = n
            n += 1
        indices['nPos'] = n
        infile.close()
        return indices

    def positionReindexerFASTA(self, data: str):
        """
        Since we may be aligning to a FASTA sequence with more positions, we
        strip the superfluous positions from the read in FASTA. Used when
        reading files.

        The file should have three columns. The first is the index in the
        alignment, the second I have no idea, the third is the index in the
        residue sequence.

        @param data string pointing to the file
        @return int[] where the index is the corrected/reindexed position,
                and the value is the original index
        """
        infile = open(data, 'r')
        index = []
        i = 0
        for line in infile:
            entries = line.split(' ')
            index.append(int(entries[0]))
            # remember how to map the raw sequence position to its new index
            self.positionMap[int(entries[2].strip('\n'))] = i
            i += 1
        infile.close()  # FIX: the original leaked this file handle
        indices = numpy.array(index)
        return indices

    # TODO: change the return type to file write return val
    def writeFrequenciesToFASTA(self, frequencies: numpy.array,
                                outFileName: str, precision: int = 3):
        """
        Writes the 2D residue frequencies to a FASTA file

        @param frequencies  double[position][residue] of relative frequencies
        @param outFileName  string of output filename
        @param precision    int, optional, number of places behind the
                            decimal point, default is 3
        @return int 1 if failure, 0 if succeeds
        """
        if outFileName.split('.')[-1] != 'fasta':
            outFileName += ".fasta"
        try:
            # BUG FIX: mode 'w' never raises FileExistsError, so the guard
            # below was unreachable; 'x' (exclusive create) matches both the
            # handler and the error message.
            outfile = open(outFileName, 'x')
        except FileExistsError:
            print("Output file already exists\n")
            return 1

        nEntries = numpy.power(10, precision)
        # integer counts of how many output entries show each residue
        numbers = numpy.round(frequencies * nEntries)
        residueToWrite = numpy.zeros([self.nPositions], dtype=int)
        residues = "ACDEFGHIKLMNPQRSTVWY"
        for i in range(nEntries):
            outfile.write("> Null\n")
            for j in range(self.nPositions):
                # advance to the next residue with remaining counts
                while numbers[j][residueToWrite[j]] == 0 and \
                        residueToWrite[j] < 19:
                    residueToWrite[j] += 1
                numbers[j][residueToWrite[j]] -= 1
                outfile.write(residues[residueToWrite[j]])
            outfile.writelines("\n")
        outfile.close()
        return 0

    def writeBestParamsToText(self, out: str):
        """
        Writes the best parameters found to a human-readable text file.
        Overwrites without warning.

        @param out  string of name of output file
        @return void
        """
        if out.split('.')[-1] != 'txt':
            out += ".txt"
        outfile = open(out, 'w')
        bestVals = self.getBestParameters()
        outfile.write("Ensemble Size: {:d}\n".format(
            bestVals['ensembleSize']))
        outfile.write("Backrub temperature: {:.1f}\n".format(
            bestVals['backrubTemp']))
        bt = bestVals['boltzmannTemp']
        # BUG FIX: labels were swapped relative to readData's encoding
        # ("min" -> 0.0, "mean" -> -1.0) and Model's averaging (0 -> amin,
        # -1 -> mean); the original printed "mean" for 0 and "inf" for -1.
        if bt > 0:
            outfile.write("Boltzmann averaging temperature: {:.9f}\n".format(
                bestVals['boltzmannTemp']))
        elif bt == 0:
            outfile.write("Boltzmann averaging temperature: min\n")
        else:
            outfile.write("Boltzmann averaging temperature: mean\n")
        outfile.write("Steepness: {:.9f}\n".format(bestVals['steepness']))
        outfile.write("Weights: ")
        for i in range(self.MACROSTATES.size):
            outfile.write("{:.4f} ".format(bestVals['weights'][i]))
        outfile.write("\nMatch: {:.4f}\n".format(bestVals['match']))
        outfile.write("Algorithm: {:s}\n".format(
            self.optimizationAlgorithm.__str__()))
        outfile.write("Similarity measure: {:s}\n".format(
            self.optimizationAlgorithm.similarityMeasure.__str__()))
        outfile.write("Elapsed time: {:s}\n".format(
            str(self.optimizationAlgorithm.elapsedTime)))
        outfile.close()

    # generate a unique reproducible key for a combination of hyperparameters
    # STATIC
    def calcParamsID(param1, param2, param3):
        """
        Generates a unique and reproducible ID string for each combination of
        parameters by concatenating their string representations.
        Is a static method.

        @param param1   backrub temperature
        @param param2   ensemble size
        @param param3   Boltzmann averaging temperature
        @return a unique string
        """
        return str(param1) + " " + str(param2) + " " + str(param3)

    def getModelByParams(self, param1, param2, param3):
        """
        Gets a model by the specified pre-determined parameters. Return is a
        reference and the return object should not be directly modified. Use
        the Model.constructFromExisting() function on it to generate a copy.

        @param param1   backrub temperature
        @param param2   ensemble size
        @param param3   Boltzmann averaging temperature
        @return Model with specified params
        """
        return self.models[Optimizer.calcParamsID(param1, param2, param3)]

    def useAlgorithm(self, algorithm: SearchAlgorithm):
        """
        Changes the search algorithm used by the optimizer

        @param algorithm    new SearchAlgorithm
        @return void
        """
        self.optimizationAlgorithm = algorithm

    def optimize(self):
        """
        Starts the optimization process

        @param void
        @return void
        """
        self.optimizationAlgorithm.iterate()

    def verifyFoundParams(self, ensembleSize, backrubT, boltzmannT,
                          steepness, weights):
        """
        Run some found parameters against this model to see the match

        @param ensembleSize int of found size
        @param backrubT     float of found backrub temperature
        @param boltzmannT   float of found boltzmann averaging temperature
        @param steepness    float of found steepness
        @param weights      float[] of found weights
        @return float on [0, 1] of match to target
        """
        model = Model.constructFromExisting(
            self.getModelByParams(backrubT, ensembleSize, boltzmannT),
            ensembleSize, backrubT, boltzmannT, weights, steepness)
        return self.optimizationAlgorithm.similarityMeasure.getSimilarityMeasure(
            model.getFrequencies())

    def getFrequenciesByParams(self, ensembleSize, backrubT, boltzmannT,
                               steepness, weights):
        """
        Gets the frequencies corresponding to a particular set of hyperparams

        @param ensembleSize int of found size
        @param backrubT     float of found backrub temperature
        @param boltzmannT   float of found boltzmann averaging temperature
        @param steepness    float of found steepness
        @param weights      float[] of found weights
        @return float[][] of the relative frequencies
        """
        model = Model.constructFromExisting(
            self.getModelByParams(backrubT, ensembleSize, boltzmannT),
            ensembleSize, backrubT, boltzmannT, weights, steepness)
        return model.getFrequencies()

    def getBestParameters(self):
        """
        Returns a dictionary of the best parameters found. Keys:
        'ensembleSize', 'backrubTemp', 'boltzmannTemp', 'steepness',
        'weights', 'match'

        @param void
        @return Map<string, float>
        """
        return self.optimizationAlgorithm.getBestParameters()

    def getBestFrequencies(self):
        """
        Returns the best frequencies found

        @param void
        @return float[][] of frequencies
        """
        return self.optimizationAlgorithm.getBestFrequencies()