def long(generations: int):
	"""Run a cuckoo search over variable ensemble sizes on microstate data.

	Reads the cluster-path microstate data and target frequencies, searches
	for *generations* iterations, and writes the best frequencies and
	parameters to timestamped "var ensembles ..." output files.
	"""
	microstate_data = "/netapp/home/tianjiao.zhang/data/microstates.dat"
	target_freqs = "/netapp/home/tianjiao.zhang/data/ecDHFR_openseq_bacterial_representative_final_align_trim.fasta"

	MACROSTATES = enum("E-DHF-NADPH", "E-NADPH", "E-THF", "E-THF-NADPX", "TS")
	# kept for parity with the sibling routines, not referenced below
	RESIDUES = enum('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y')

	# search-space bounds
	ensembleSizes = numpy.array([16, 24, 32, 64, 128])
	backrubTemps = numpy.array([0.3, 0.6, 0.9, 1.2, 1.5, 1.8])
	boltzmannTemps = numpy.array([-1, 5])
	steepnessRange = numpy.array([1, 7])
	minWeights = numpy.array([0, 0, 0, 0, 0])
	maxWeights = numpy.array([1, 1, 1, 1, 1])

	opt = Optimizer(MACROSTATES, True)
	opt.readTargetFrequencies(target_freqs)
	opt.readFormattedMicrostateData(microstate_data)

	searcher = CuckooSearch(opt.models, JensenShannonDistance(opt.targetFrequencies), True, 128, 1.25, 0.25)
	searcher.setMaxIterations(generations)
	searcher.setParamBounds(ensembleSizes, backrubTemps, boltzmannTemps, steepnessRange, minWeights, maxWeights)
	searcher.setAllSearchToTrue()
	searcher.suppressOutputs = True

	opt.useAlgorithm(searcher)
	opt.optimize()

	# one timestamp for both outputs (the original formatted the same
	# datetime twice, which yields the same string)
	stamp = datetime.now().strftime('%Y%m%d%H%M')
	opt.writeFrequenciesToFASTA(opt.getBestFrequencies(), "var ensembles " + stamp + ".fasta")
	opt.writeBestParamsToText("var ensembles " + stamp)
def testRandUniformInput():
	"""Sanity-check the similarity measures on uniform (flat) energies.

	Overwrites every model's macrostate residue energies with ones so every
	residue is equally favorable, then records the score of the resulting
	frequencies against the target under four measures (JSD, cosine, K-L,
	entropy-weighted mix) in "uniform energy similarities.txt".

	Returns None.
	"""
	MACROSTATES = enum("E-DHF-NADPH", "E-NADPH", "E-OPEN", "E-THF", "E-THF-NADPX", "TS")
	RESIDUES = enum('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y')

	# only looking at MACROSTATE.TS
	# only optimizing backrub temperature and steepness
	ensembleSizes = numpy.array([50])
	backrubTemps = numpy.array([0.3, 0.6, 0.9, 1.2, 1.5, 1.8])
	boltzmannTemps = numpy.array([0, -1, 1, 5.0])
	steepnessRange = numpy.array([0.5, 5])
	minWeights = numpy.array([0, 0, 0, 0, 0, 0])
	maxWeights = numpy.array([1, 1, 0, 1, 1, 1])

	print("Initializing objects\n")

	targetFreqs = "C:\\Users\\Candy\\SkyDrive\\Documents\\rotation 2\\ecDHFR_openseq_bacterial_representative_final_align_trim.fasta"
	targetFreqsAlt = "C:\\Users\\Candy_000\\SkyDrive\\Documents\\rotation 2\\ecDHFR_openseq_bacterial_representative_final_align_trim.fasta"
	data = "C:\\Users\\Candy\\SkyDrive\\Documents\\rotation 2\\DHFR_MSD_M20loop\\DHFR_MSD_M20loop_repeat1.tsv"
	dataAlt = "C:\\Users\\Candy_000\\SkyDrive\\Documents\\rotation 2\\DHFR_MSD_M20loop\\DHFR_MSD_M20loop_repeat1.tsv"

	optimizer = Optimizer(MACROSTATES)

	# slightly different paths on my two computers; a bare except here would
	# also swallow unrelated errors, so only fall back on a missing file
	try:
		optimizer.readTargetFrequencies(targetFreqs)
		optimizer.readData(data)
	except FileNotFoundError:
		optimizer.readTargetFrequencies(targetFreqsAlt)
		optimizer.readData(dataAlt)

	# make energies uniform
	for model in optimizer.models:
		optimizer.models[model].macrostateResidueEnergies = numpy.ones_like(optimizer.models[model].macrostateResidueEnergies)

	search = CuckooSearch(optimizer.models, JensenShannonDistance(optimizer.targetFrequencies), False, 1, 1, 0.25)
	search.setMaxIterations(1)
	search.setParamBounds(ensembleSizes, backrubTemps, boltzmannTemps, steepnessRange, minWeights, maxWeights)
	search.setSearchParameters(False, False, False, False, numpy.array([False, False, False, False, False, False]))
	optimizer.useAlgorithm(search)

	# 'with' guarantees the results file is closed even if optimize() raises
	with open("uniform energy similarities.txt", 'w') as outfile:
		optimizer.optimize()
		outfile.write("JSD: {:.4f}\n".format(optimizer.getBestParameters()['match']))

		search.setSimilarityMeasure(CosineSimilarity(optimizer.targetFrequencies))
		optimizer.optimize()
		outfile.write("Cosine similarity: {:.4f}\n".format(optimizer.getBestParameters()['match']))

		search.setSimilarityMeasure(KLDivergence(optimizer.targetFrequencies))
		optimizer.optimize()
		outfile.write("K-L divergence: {:.4f}\n".format(optimizer.getBestParameters()['match']))

		search.setSimilarityMeasure(EntropyWeightsMixedSimilarity(CosineSimilarity(), JensenShannonDistance(), optimizer.targetFrequencies))
		optimizer.optimize()
		outfile.write("Weighted mixed similarity: {:.4f}\n".format(optimizer.getBestParameters()['match']))

	return None
def smalltestPrevOptimalVals():
	"""Re-evaluate one previously found optimal parameter set.

	Every parameter range is a single point (min and max weights equal), so
	the one-iteration search just scores that fixed parameter vector with
	the Jensen-Shannon distance and writes "prev opt vals ..." outputs.

	Returns None.
	"""
	MACROSTATES = enum("E-DHF-NADPH", "E-NADPH", "E-OPEN", "E-THF", "E-THF-NADPX", "TS")
	RESIDUES = enum('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y')

	# only looking at MACROSTATE.TS
	# only optimizing backrub temperature and steepness
	ensembleSizes = numpy.array([50])
	backrubTemps = numpy.array([1.8])
	boltzmannTemps = numpy.array([0.0])
	steepnessRange = numpy.array([3.0])
	minWeights = numpy.array([0.80, 0.55, 0, 0.90, 0.35, 1.00])
	maxWeights = numpy.array([0.80, 0.55, 0, 0.90, 0.35, 1.00])

	print("Initializing objects\n")

	targetFreqs = "C:\\Users\\Candy\\SkyDrive\\Documents\\rotation 2\\ecDHFR_openseq_bacterial_representative_final_align_trim.fasta"
	targetFreqsAlt = "C:\\Users\\Candy_000\\SkyDrive\\Documents\\rotation 2\\ecDHFR_openseq_bacterial_representative_final_align_trim.fasta"
	# NOTE(review): primary path uses repeat6 but the fallback uses repeat5 --
	# looks unintentional; confirm which repeat this test is meant to score
	data = "C:\\Users\\Candy\\SkyDrive\\Documents\\rotation 2\\DHFR_MSD_M20loop\\DHFR_MSD_M20loop_repeat6.tsv"
	dataAlt = "C:\\Users\\Candy_000\\SkyDrive\\Documents\\rotation 2\\DHFR_MSD_M20loop\\DHFR_MSD_M20loop_repeat5.tsv"

	optimizer = Optimizer(MACROSTATES)

	# slightly different paths on my two computers; only fall back when the
	# file is actually missing instead of on any exception
	try:
		optimizer.readTargetFrequencies(targetFreqs)
		optimizer.readData(data)
	except FileNotFoundError:
		optimizer.readTargetFrequencies(targetFreqsAlt)
		optimizer.readData(dataAlt)

	print("Files read in")

	search = CuckooSearch(optimizer.models, JensenShannonDistance(optimizer.targetFrequencies), False, 1, 1, 0.25)
	search.setMaxIterations(1)
	search.setParamBounds(ensembleSizes, backrubTemps, boltzmannTemps, steepnessRange, minWeights, maxWeights)
	search.setSearchParameters(False, False, False, False, numpy.array([False, False, False, False, False, False]))
	optimizer.useAlgorithm(search)

	print("\nJS Dist")
	optimizer.optimize()
	stamp = datetime.now().strftime('%Y%m%d%H%M')
	optimizer.writeFrequenciesToFASTA(optimizer.getBestFrequencies(), "prev opt vals " + stamp + ".fasta")
	optimizer.writeBestParamsToText("prev opt vals " + stamp)
	print(optimizer.getBestParameters()['match'])

	return None
# ---- Beispiel #4 ----
# (example-separator artifact from the source aggregation; commented out so the module parses)
def experimentalAnalysis(nMax, algoNo, stepSize):
	"""Time one max-subarray algorithm over random arrays of growing size.

	Builds 10 random integer arrays (values in [-10, 10]) for every input
	size n = stepSize, 2*stepSize, ..., nMax, runs the algorithm selected
	by algoNo (1=enum, 2=better, 3=divideConquer, 4=maxSubarrayLinear) on
	each, and prints "<len>  <seconds>" per run.  Prints an error message
	for an unknown algoNo.

	This block used Python 2 print statements, which are syntax errors
	under Python 3 (the dialect of the rest of this file); converted to
	the print() function with identical output.
	"""
	print("Generating array...")

	# generate array of test arrays: 10 random arrays per input size n
	testArray = []
	for n in range(stepSize, nMax + 1, stepSize):
		for _ in range(10):
			testArray.append([randint(-10, 10) for _ in range(n)])

	# dispatch table: algoNo -> (banner, callable); lambdas defer name
	# lookup so unknown algoNo never touches the algorithm functions
	algorithms = {
		1: ("Analysis for Algorithm 1", lambda a: enum(a)),
		2: ("Analysis for Algorithm 2", lambda a: better(a)),
		3: ("Analysis for Algorithm 3", lambda a: divideConquer(a, 0, len(a) - 1)),
		4: ("Analysis for Algorithm 4", lambda a: maxSubarrayLinear(a)),
	}

	if algoNo not in algorithms:
		print("Error: Invalid algorithm number.")
		return

	banner, run = algorithms[algoNo]
	print(banner)
	for a in testArray:						# test each array in the algorithm
		t0 = time()
		run(a)								# call algorithm
		t1 = time()
		print(len(a), " %f" % (t1 - t0))
def testChi2(iterations = 64):
	"""Cuckoo-search optimization scored with the chi-squared kernel.

	Reads the cluster-path repeat-1 data and target frequencies, searches
	for *iterations* iterations, writes "Chi2 test ..." outputs, and prints
	the best match score.

	iterations -- maximum number of search iterations (default 64)
	Returns None.
	"""
	print("Hello!\n")
	MACROSTATES = enum("E-DHF-NADPH", "E-NADPH", "E-OPEN", "E-THF", "E-THF-NADPX", "TS")
	RESIDUES = enum('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y')

	# only looking at MACROSTATE.TS
	# only optimizing backrub temperature and steepness
	ensembleSizes = numpy.array([20, 50])
	backrubTemps = numpy.array([0.3, 0.6, 0.9, 1.2, 1.5, 1.8])
	boltzmannTemps = numpy.array([0, -1, 1, 5.0])
	steepnessRange = numpy.array([0.5, 5])
	minWeights = numpy.array([0, 0, 0, 0, 0, 0])
	maxWeights = numpy.array([1, 1, 0, 1, 1, 1])

	print("Initializing objects\n")

	data = "/netapp/home/tianjiao.zhang/data/DHFR_MSD_M20loop_repeat1.tsv"
	targetFreqs = "/netapp/home/tianjiao.zhang/data/ecDHFR_openseq_bacterial_representative_final_align_trim.fasta"

	optimizer = Optimizer(MACROSTATES)

	# BUG FIX: the old try/except fallback referenced undefined
	# targetFreqsAlt/dataAlt and would raise NameError on any read failure.
	# This cluster-only routine has no alternate paths, so read directly and
	# let a missing file surface as its own error.
	optimizer.readTargetFrequencies(targetFreqs)
	optimizer.readData(data)

	print("Files read in")

	search = CuckooSearch(optimizer.models, Chi2Kernel(optimizer.targetFrequencies), False, 64, 1, 0.25)
	search.setMaxIterations(iterations)
	search.setParamBounds(ensembleSizes, backrubTemps, boltzmannTemps, steepnessRange, minWeights, maxWeights)
	search.setSearchParameters(False, True, True, True, numpy.array([True, True, False, True, True, True]))
	optimizer.useAlgorithm(search)

	print("\nChi2 kernel")
	optimizer.optimize()
	stamp = datetime.now().strftime('%Y%m%d%H%M%S')
	optimizer.writeFrequenciesToFASTA(optimizer.getBestFrequencies(), "Chi2 test " + stamp + ".fasta")
	optimizer.writeBestParamsToText("Chi2 test " + stamp)
	print(optimizer.getBestParameters()['match'])

	return None
def DHFRcomparemeasures(similarity: int):
	"""Run one full cuckoo search under the similarity measure chosen by
	*similarity* (0=JSD, 1=cosine, 2=KLD, 3=entropy-weighted mix,
	4=entropy-weighted JSD, anything else=chi2 kernel) and write the best
	frequencies and parameters to "DHFR compare measures ..." files.
	"""
	MACROSTATES = enum("E-DHF-NADPH", "E-NADPH", "E-OPEN", "E-THF", "E-THF-NADPX", "TS")
	# not referenced below; kept for parity with the sibling routines
	RESIDUES = enum('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y')

	# parameter search space
	ensembleSizes = numpy.array([20, 50])
	backrubTemps = numpy.array([0.3, 0.6, 0.9, 1.2, 1.5, 1.8])
	boltzmannTemps = numpy.array([0, -1, 1, 5.0])
	steepnessRange = numpy.array([0.5, 5])
	minWeights = numpy.array([0, 0, 0, 0, 0, 0])
	maxWeights = numpy.array([1, 1, 0, 1, 1, 1])

	data = "/netapp/home/tianjiao.zhang/data/DHFR_MSD_M20loop_repeat1.tsv"
	targetFreqs = "/netapp/home/tianjiao.zhang/data/ecDHFR_openseq_bacterial_representative_final_align_trim.fasta"
	optimizer = Optimizer(MACROSTATES)
	optimizer.readTargetFrequencies(targetFreqs)
	optimizer.readData(data)

	# pick the similarity measure and its output label; the CuckooSearch
	# construction itself is identical for every choice, so build it once
	if similarity == 0:
		sim_measure, measure = JensenShannonDistance(optimizer.targetFrequencies), " JSD"
	elif similarity == 1:
		sim_measure, measure = CosineSimilarity(optimizer.targetFrequencies), " Cos"
	elif similarity == 2:
		sim_measure, measure = KLDivergence(optimizer.targetFrequencies), " KLD"
	elif similarity == 3:
		sim_measure, measure = EntropyWeightsMixedSimilarity(CosineSimilarity(), JensenShannonDistance(), optimizer.targetFrequencies), " Mix"
	elif similarity == 4:
		sim_measure, measure = EntropyWeightedSimilarity(JensenShannonDistance(), optimizer.targetFrequencies), "Weighted JSD"
	else:
		sim_measure, measure = Chi2Kernel(optimizer.targetFrequencies), "Chi2 kernel"

	search = CuckooSearch(optimizer.models, sim_measure, False, 64, 1, 0.25)
	search.setMaxIterations(2048)
	search.setParamBounds(ensembleSizes, backrubTemps, boltzmannTemps, steepnessRange, minWeights, maxWeights)
	search.setSearchParameters(True, True, True, True, numpy.array([True, True, False, True, True, True]))
	optimizer.useAlgorithm(search)
	optimizer.optimize()

	name = "DHFR compare measures " + measure + " " + datetime.now().strftime('%Y%m%d%H%M')
	optimizer.writeFrequenciesToFASTA(optimizer.getBestFrequencies(), name + ".fasta", 3)
	optimizer.writeBestParamsToText(name + ".txt")
def smallTestBoltz():
	"""Cuckoo search over Boltzmann temperature on microstate data.

	Reads the local microstates.dat (with a second-machine path fallback)
	and runs a 2048-iteration search over a fixed 128-member ensemble,
	writing "var ensembles ..." outputs.

	Returns None.
	"""
	print("Hello!\n")
	MACROSTATES = enum("E-DHF-NADPH", "E-NADPH", "E-THF", "E-THF-NADPX", "TS")
	RESIDUES = enum('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y')

	ensembleSizes = numpy.array([128])
	backrubTemps = numpy.array([0.3, 0.6, 0.9, 1.2, 1.5, 1.8])
	boltzmannTemps = numpy.array([-1, 5])
	steepnessRange = numpy.array([1, 7])
	minWeights = numpy.array([0, 0, 0, 0, 0])
	maxWeights = numpy.array([1, 1, 1, 1, 1])

	print("Initializing objects\n")

	targetFreqs = "C:\\Users\\Candy\\SkyDrive\\Documents\\rotation 2\\ecDHFR_openseq_bacterial_representative_final_align_trim.fasta"
	targetFreqsAlt = "C:\\Users\\Candy_000\\SkyDrive\\Documents\\rotation 2\\ecDHFR_openseq_bacterial_representative_final_align_trim.fasta"
	dataMicro = "C:\\Users\\Candy\\SkyDrive\\Documents\\rotation 2\\20160120_M20_enumeration_scores\\microstates.dat"
	dataMicroAlt = "C:\\Users\\Candy_000\\SkyDrive\\Documents\\rotation 2\\20160120_M20_enumeration_scores\\microstates.dat"

	optimizer = Optimizer(MACROSTATES, True)

	try:
		optimizer.readTargetFrequencies(targetFreqs)
		optimizer.readFormattedMicrostateData(dataMicro)
	except FileNotFoundError:
		# BUG FIX: the fallback called optimizer.readtargetfrequencies /
		# readFormattedMicrostatedata on miscased variable names, which could
		# only raise AttributeError/NameError; use the real names and vars.
		optimizer.readTargetFrequencies(targetFreqsAlt)
		optimizer.readFormattedMicrostateData(dataMicroAlt)

	search = CuckooSearch(optimizer.models, JensenShannonDistance(optimizer.targetFrequencies), True, 64, 1, 0.25)
	search.setMaxIterations(2048)
	search.setParamBounds(ensembleSizes, backrubTemps, boltzmannTemps, steepnessRange, minWeights, maxWeights)
	search.setSearchParameters(False, True, True, True, numpy.array([True, True, True, True, True, True]))
	optimizer.useAlgorithm(search)
	optimizer.optimize()
	stamp = datetime.now().strftime('%Y%m%d%H%M')
	optimizer.writeFrequenciesToFASTA(optimizer.getBestFrequencies(), "var ensembles " + stamp + ".fasta")
	optimizer.writeBestParamsToText("var ensembles " + stamp)

	return None
def onlyMacro():
	"""Run a 2048-iteration JSD cuckoo search on the repeat-1 macrostate
	data (cluster paths) and write "Macrostates ..." output files.

	Returns None.
	"""
	MACROSTATES = enum("E-DHF-NADPH", "E-NADPH", "E-OPEN", "E-THF", "E-THF-NADPX", "TS")
	# kept for parity with the sibling routines, not referenced below
	RESIDUES = enum('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y')

	repeat_index = 0  # selects DHFR_MSD_M20loop_repeat<repeat_index+1>.tsv

	# search-space bounds
	ensembleSizes = numpy.array([20, 50])
	backrubTemps = numpy.array([0.3, 0.6, 0.9, 1.2, 1.5, 1.8])
	boltzmannTemps = numpy.array([0, -1, 1, 5.0])
	steepnessRange = numpy.array([0.5, 5])
	minWeights = numpy.array([0, 0, 0, 0, 0, 0])
	maxWeights = numpy.array([1, 1, 0, 1, 1, 1])

	data = "/netapp/home/tianjiao.zhang/data/DHFR_MSD_M20loop_repeat" + str(repeat_index + 1) + ".tsv"
	targetFreqs = "/netapp/home/tianjiao.zhang/data/ecDHFR_openseq_bacterial_representative_final_align_trim.fasta"

	opt = Optimizer(MACROSTATES)
	opt.readTargetFrequencies(targetFreqs)
	opt.readData(data)

	searcher = CuckooSearch(opt.models, JensenShannonDistance(opt.targetFrequencies), False, 64, 1, 0.25)
	label = " JSD"
	searcher.setMaxIterations(2048)
	searcher.setParamBounds(ensembleSizes, backrubTemps, boltzmannTemps, steepnessRange, minWeights, maxWeights)
	searcher.setSearchParameters(True, True, True, True, numpy.array([True, True, False, True, True, True]))
	opt.useAlgorithm(searcher)
	opt.optimize()

	name = "Macrostates " + str(repeat_index + 1) + label + datetime.now().strftime('%Y%m%d%H%M')
	opt.writeFrequenciesToFASTA(opt.getBestFrequencies(), name + ".fasta", 3)
	opt.writeBestParamsToText(name + ".txt")

	return None
# ---- Beispiel #9 ----
# (example-separator artifact from the source aggregation; commented out so the module parses)
from SimilarityMeasure import SimilarityMeasure
from JensenShannonDistance import JensenShannonDistance
from CuckooSearch import CuckooSearch
from enumeration import enum
from Optimizer import Optimizer
import numpy

MACROSTATES = enum("E-DHF-NADPH", "E-NADPH", "E-OPEN", "E-THF", "E-THF-NADPX",
                   "TS")
RESIDUES = enum('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N',
                'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y')

# only looking at MACROSTATE.TS
# only optimizing backrub temperature and steepness
ensembleSizes = numpy.array([20, 50])
backrubTemps = numpy.array([0.3, 0.6, 0.9, 1.2, 1.5, 1.8])
boltzmannTemps = numpy.array([0, -1, 1, 5.0])
steepnessRange = numpy.array([0.5, 5])
minWeights = numpy.array([0, 0, 0, 0, 0, 0])
maxWeights = numpy.array([1, 1, 0, 1, 1, 1])

data = "/netapp/home/tianjiao.zhang/data/DHFR_MSD_M20loop_repeat1.tsv"
targetFreqs = "/netapp/home/tianjiao.zhang/data/ecDHFR_openseq_bacterial_representative_final_align_trim.fasta"

optimizer = Optimizer(MACROSTATES)

# slightly different paths on my two computers
try:
    optimizer.readTargetFrequencies(targetFreqs)
    optimizer.readData(data)
except:
def simpleRepeatTest():
	"""Check that getModelByParams returns the model matching the best cuckoo.

	Runs 64 fixed (no-search, one-iteration) optimizations; after each, looks
	up the best cuckoo's parameters via optimizer.getModelByParams and prints
	both parameter IDs whenever the looked-up model is not equal to the
	cuckoo's own model (see the TODO below).

	Returns None.
	"""
	print("Hello!\n")
	MACROSTATES = enum("E-DHF-NADPH", "E-NADPH", "E-OPEN", "E-THF", "E-THF-NADPX", "TS")
	RESIDUES = enum('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y')

	# fixed parameter point: only the last macrostate carries weight
	ensembleSizes = numpy.array([50])
	backrubTemps = numpy.array([0.3, 0.6, 0.9, 1.2, 1.5, 1.8])
	boltzmannTemps = numpy.array([0, -1, 1, 5.0])
	steepnessRange = numpy.array([0.5, 5])
	minWeights = numpy.array([0, 0, 0, 0, 0, 1])
	maxWeights = numpy.array([0, 0, 0, 0, 0, 1])

	print("Initializing objects\n")

	targetFreqs = "C:\\Users\\Candy\\SkyDrive\\Documents\\rotation 2\\ecDHFR_openseq_bacterial_representative_final_align_trim.fasta"
	targetFreqsAlt = "C:\\Users\\Candy_000\\SkyDrive\\Documents\\rotation 2\\ecDHFR_openseq_bacterial_representative_final_align_trim.fasta"
	data = "C:\\Users\\Candy\\SkyDrive\\Documents\\rotation 2\\DHFR_MSD_M20loop\\DHFR_MSD_M20loop_repeat1.tsv"
	dataAlt = "C:\\Users\\Candy_000\\SkyDrive\\Documents\\rotation 2\\DHFR_MSD_M20loop\\DHFR_MSD_M20loop_repeat1.tsv"

	optimizer = Optimizer(MACROSTATES)

	# slightly different paths on my two computers; only fall back when the
	# file is actually missing instead of on any exception
	try:
		optimizer.readTargetFrequencies(targetFreqs)
		optimizer.readData(data)
	except FileNotFoundError:
		optimizer.readTargetFrequencies(targetFreqsAlt)
		optimizer.readData(dataAlt)

	print("Files read in")

	search = CuckooSearch(optimizer.models, JensenShannonDistance(optimizer.targetFrequencies), False, 1, 1, 0.25)
	search.setMaxIterations(1)
	search.setParamBounds(ensembleSizes, backrubTemps, boltzmannTemps, steepnessRange, minWeights, maxWeights)
	search.setAllSearchToFalse()
	search.suppressOutputs = True
	optimizer.useAlgorithm(search)

	print("\nJS Dist")

	for i in range(64):
		optimizer.optimize()
		m = search.population[0]
		# TODO: getModelByParams doesn't always return the same object.
		m1 = Model.constructFromExisting(optimizer.getModelByParams(m.backrubTemp, m.ensembleSize, m.boltzmannTemp), m.ensembleSize, m.backrubTemp, m.boltzmannTemp, m.getWeights(), m.steepness)
		if not m.equalTo(m1):
			# report the parameter IDs of the mismatched pair
			print("\t{:s}".format(Optimizer.calcParamsID(m.backrubTemp, m.ensembleSize, m.boltzmannTemp)))
			print("\t{:s}".format(Optimizer.calcParamsID(m1.backrubTemp, m1.ensembleSize, m1.boltzmannTemp)))

	return None
def repeatTest():
	"""Check result reproducibility of short cuckoo searches.

	Runs 32 rounds; each round performs a fresh 16-iteration search under
	the Jensen-Shannon distance and then under cosine similarity, printing
	each best match score and whether verifyFoundParams reproduces it from
	the reported parameters.

	Returns None.
	"""
	print("Hello!\n")
	MACROSTATES = enum("E-DHF-NADPH", "E-NADPH", "E-OPEN", "E-THF", "E-THF-NADPX", "TS")
	RESIDUES = enum('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y')

	ensembleSizes = numpy.array([50])
	backrubTemps = numpy.array([0.3, 0.6, 0.9, 1.2, 1.5, 1.8])
	boltzmannTemps = numpy.array([0, -1, 1, 5.0])
	steepnessRange = numpy.array([0.5, 5])
	minWeights = numpy.array([0, 0, 0, 0, 0, 0])
	maxWeights = numpy.array([1, 1, 0, 1, 1, 1])

	print("Initializing objects\n")

	targetFreqs = "C:\\Users\\Candy\\SkyDrive\\Documents\\rotation 2\\ecDHFR_openseq_bacterial_representative_final_align_trim.fasta"
	targetFreqsAlt = "C:\\Users\\Candy_000\\SkyDrive\\Documents\\rotation 2\\ecDHFR_openseq_bacterial_representative_final_align_trim.fasta"
	data = "C:\\Users\\Candy\\SkyDrive\\Documents\\rotation 2\\DHFR_MSD_M20loop\\DHFR_MSD_M20loop_repeat1.tsv"
	dataAlt = "C:\\Users\\Candy_000\\SkyDrive\\Documents\\rotation 2\\DHFR_MSD_M20loop\\DHFR_MSD_M20loop_repeat1.tsv"

	optimizer = Optimizer(MACROSTATES)

	# slightly different paths on my two computers; only fall back when the
	# file is actually missing instead of on any exception
	try:
		optimizer.readTargetFrequencies(targetFreqs)
		optimizer.readData(data)
	except FileNotFoundError:
		optimizer.readTargetFrequencies(targetFreqsAlt)
		optimizer.readData(dataAlt)

	print("Files read in")

	for i in range(32):
		# round A: Jensen-Shannon distance
		search = CuckooSearch(optimizer.models, JensenShannonDistance(optimizer.targetFrequencies), False, 8, 1, 0.25)
		search.setMaxIterations(16)
		search.suppressOutputs = True
		search.setParamBounds(ensembleSizes, backrubTemps, boltzmannTemps, steepnessRange, minWeights, maxWeights)
		search.setSearchParameters(False, True, True, True, numpy.array([True, True, False, True, True, True]))
		optimizer.useAlgorithm(search)

		print("\nJS Dist")
		optimizer.optimize()

		params = optimizer.getBestParameters()
		print(params['match'])
		print(optimizer.verifyFoundParams(params['ensembleSize'], params['backrubTemp'], params['boltzmannTemp'], params['steepness'], params['weights']))

		# round B: cosine similarity
		search = CuckooSearch(optimizer.models, CosineSimilarity(optimizer.targetFrequencies), False, 8, 1, 0.25)
		search.setMaxIterations(16)
		search.suppressOutputs = True
		search.setParamBounds(ensembleSizes, backrubTemps, boltzmannTemps, steepnessRange, minWeights, maxWeights)
		search.setSearchParameters(False, True, True, True, numpy.array([True, True, False, True, True, True]))
		optimizer.useAlgorithm(search)

		print("\nCosine")
		optimizer.optimize()
		params = optimizer.getBestParameters()
		print(params['match'])
		print(optimizer.verifyFoundParams(params['ensembleSize'], params['backrubTemp'], params['boltzmannTemp'], params['steepness'], params['weights']))

	return None
# ---- Beispiel #12 ----
# (example-separator artifact from the source aggregation; commented out so the module parses)
    data_filename = job_params[4]
    simMeas_id = job_params[5]
    iterations = int(job_params[6])
    #iterations = 4
    usedstates = numpy.array(job_params[7:]).astype(dtype=bool)
    #print(usedstates)
    if job_id == str(task_id):
        print('using options for job %s\n' % job_id)
        break

# Input paths: the relative filenames (targetFreqs_filename, data_filename)
# come from the job-parameter parsing above; join them onto the shared
# input directory for this job.
#targetFreqs = "/Users/anatale/Documents/school/UCSF/Kortemme_lab/code/fitness-data-analysis/highscale_trim.fasta"
targetFreqs = os.path.join(input_path, targetFreqs_filename)
data = os.path.join(input_path, data_filename)
#data = "/Users/anatale/Documents/school/UCSF/Kortemme_lab/code/fitness-data-analysis/testing_microstates.tsv"

# Macrostate labels (they look like PDB entry IDs plus 'importin' and a
# 'composite' state -- TODO confirm) and the 20 amino-acid residues.
MACROSTATES = enum('1i2m', '1a2k', '1k5d', '3gj0', 'importin', 'composite')
RESIDUES = enum('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N',
                'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y')

# parameter ranges to optimize
ensembleSizes = numpy.array([60, 70, 80, 90, 100])
backrubTemps = numpy.array([0.9])
#boltzmannTemps = numpy.array([-1.0]) # set below
steepnessRange = numpy.array([0.5, 5])
minWeights = numpy.array([0, 0, 0, 0, 0, 0])
maxWeights = numpy.array([1, 1, 1, 1, 1, 1])

optimizer = Optimizer(MACROSTATES)
optimizer.readTargetFrequencies(targetFreqs)
# debug print; nPositions presumably gets (re)set once data is loaded --
# TODO confirm against Optimizer.readTargetFrequencies
print('pos before loading data: ', optimizer.nPositions)
#optimizer.readData(data)
class Model:
    """
	A multistate design model.

	Stores per-position residue energies for each macrostate (optionally
	Boltzmann-averaged down from raw microstate data) and converts them into
	per-position residue fitnesses and frequencies via a sigmoid fitness
	function.
	"""
    # NOTE: Java documentation style. mainly because idk what's a standard one for python and Java's is most readable

    # Class-level declarations kept for intellisense; every instance
    # overwrites these in __init__.
    # TODO: is the enum even useful?
    MACROSTATES = enum()  # enum of the macrostates
    nMacrostates = 0  # int, number of macrostates
    ensembleSize = 0  # int
    nPositions = 0  # int, number of positions on sequence examined
    contiguousPositions = True  # are the positions contiguous?
    positionMap = {}  # if not contiguous, maps raw positions onto [0, nPositions)
    positionOffset = 0  # offset for positions when the positions examined are contiguous
    backrubTemp = 0.0  # float
    boltzmannTemp = 0.0  # float; 0.0 means "min", -1.0 means "mean" (see averageMicrostates)
    weights = numpy.array(0)  # float[], relative weights of the macrostates
    steepness = 0.0  # float, steepness of sigmoid (s)
    fitnesses = numpy.array(0)  # double[position][residue], calculated fitnesses
    frequencies = numpy.array(0)  # double[position][residue], calculated frequencies
    macrostateResidueEnergies = numpy.array(0)  # double[position][residue][macrostate]
    recovery = -1.0  # float, assigned by the outside similarity measure
    # TODO: check if macrostatesUsed is actually useful
    macrostatesUsed = numpy.array(False)  # bool[], macrostates examined during optimization

    # vars used when microstate data is involved
    useAltAveragingMethod = False  # use accepted avg method or (the alt) xinjie's expression?
    isFrequenciesCalculated = False  # prevent unnecessary recalculation
    useMicrostateData = False  # do we have data from individual microstates?
    areMicrostatesPicked = False  # have microstates been selected to be used in the ensemble?
    microstateResidueEnergies = numpy.array(0)  # double[position][residue][macrostate][microstate]
    selectedMicrostateEnergies = numpy.array(0)  # same shape, the subset actually averaged
    microstateCounts = numpy.array(0)  # int[position][macrostate], number of microstates read in
    microstatesUsed = numpy.array(0)  # int[position][macrostate][index], microstates picked per macrostate

    def __init__(self,
                 macrostates: enum,
                 ensembleSize: int,
                 backrubTemp: float,
                 boltzmannTemp: float,
                 weights: numpy.array,
                 steepness: float,
                 positions: int,
                 positionOffset: int,
                 useMicrostateData: bool = False,
                 posMap: dict = None,
                 useAltAverageMethod: bool = False):
        """
		Default constructor.

		@param macrostates				enumeration of the different macrostates in this model
		@param ensembleSize				int of macrostate ensemble sizes
		@param backrubTemp				float of the backrub temperature
		@param boltzmannTemp			float, Boltzmann averaging temperature
		@param weights					float[], weights for the macrostates
		@param steepness				float, steepness in fitness function
		@param positions				int, number of positions examined in this model
		@param positionOffset			int, the index of the lowest position examined
		@param useMicrostateData		bool, are we to actually average microstate data?
		@param posMap					dict<int, int> a remapping of position values if the positions are not contiguous. ONLY pass an object if the positions are not contiguous
		@param useAltAverageMethod		bool, use the other Boltzmann averaging calculation method?
		"""
        self.MACROSTATES = macrostates
        self.nMacrostates = macrostates.size
        self.ensembleSize = ensembleSize
        self.backrubTemp = backrubTemp
        self.boltzmannTemp = boltzmannTemp
        self.weights = weights
        self.steepness = steepness
        self.nPositions = positions
        self.positionOffset = positionOffset
        self.isFrequenciesCalculated = False
        self.useMicrostateData = useMicrostateData
        self.useAltAveragingMethod = useAltAverageMethod
        self.macrostatesUsed = numpy.array([True] * self.nMacrostates)
        # deep copy keeps this instance's map independent of the caller's dict
        self.positionMap = deepcopy(posMap)
        self.contiguousPositions = posMap is None

        # allocate arrays
        self.macrostateResidueEnergies = numpy.zeros(
            [self.nPositions, 20, self.nMacrostates], dtype=numpy.float64)
        self.fitnesses = numpy.ones([self.nPositions, 20], dtype=numpy.float64)
        self.frequencies = numpy.zeros([self.nPositions, 20],
                                       dtype=numpy.float64)

        # sentinel value used by addMacrostateData() to detect duplicate entries
        for i in range(self.nMacrostates):
            for j in range(self.nPositions):
                self.macrostateResidueEnergies[j][0][i] = 65536.0

        if self.useMicrostateData:
            self.areMicrostatesPicked = False
            self.microstatesUsed = numpy.zeros([0])
            self.microstateCounts = numpy.zeros(
                [self.nPositions, self.nMacrostates], dtype=int)
            # magic number 700 - max expected number of microstates
            self.microstateResidueEnergies = numpy.zeros(
                [self.nPositions, 20, self.nMacrostates, 700],
                dtype=numpy.float64)

    def constructFromExisting(existing, ensembleSize: int, backrubTemp: float,
                              boltzmannTemp: float, weights: numpy.array,
                              steepness: float):
        """
		"Overloaded" "constructor" that uses a pre-existing Model as a template.

		@param existing			pre-existing Model object
		@param ensembleSize		int, new ensemble size
		@param backrubTemp		float, new backrub temperature
		@param boltzmannTemp	float, new Boltzmann temperature
		@param weights			float[], new weights
		@param steepness		float, new sigmoid steepness
		@return Model
		"""
        new = Model(existing.MACROSTATES, ensembleSize, backrubTemp,
                    boltzmannTemp, weights, steepness, existing.nPositions,
                    existing.positionOffset, existing.useMicrostateData)
        new.macrostatesUsed = existing.macrostatesUsed
        new.microstatesUsed = existing.microstatesUsed
        new.contiguousPositions = existing.contiguousPositions

        # SHALLOW COPY of the raw data for speed, since it should not need
        # modification during any run.
        new.positionMap = existing.positionMap
        if not existing.useMicrostateData:  # not using microstate data
            new.macrostateResidueEnergies = existing.macrostateResidueEnergies
        elif ensembleSize == existing.ensembleSize:
            # using microstate data, already collapsed to an ensemble
            new.selectedMicrostateEnergies = existing.selectedMicrostateEnergies
            new.areMicrostatesPicked = True
        else:
            # using microstate data, not collapsed.
            # BUGFIX: the raw arrays were previously only copied in the
            # equal-ensemble-size branch, so a model constructed with a
            # different ensemble size had no microstate data to average.
            new.microstateResidueEnergies = existing.microstateResidueEnergies
            new.microstateCounts = existing.microstateCounts

        return new

    def setPositionMap(self, posMap: dict) -> None:
        """
		Deprecated. There should not be a reason to EVER call this function.

		If this model is using non-contiguous position numbers, sets the Map<int, int>
		used to convert them to contiguous numbers on [0, nPositions]

		@param posMap		dict, Map<int, int> of how to change the numbers
		@return None
		"""
        warnings.warn(
            "Obsolete function. its function was taken care of during construction. Now results are not guaranteed",
            DeprecationWarning)
        self.positionMap = deepcopy(posMap)
        return None

    def addMacrostateData(self, macrostate: int, position: int,
                          energies: "float[]") -> None:
        """
		Inserts a macrostate_position set of fitness values to this model.

		@param macrostate		int, the macrostate this corresponds to
		@param position			int, the position the energies corresponds to
		@param energies			float[] of length-20 of the energies
		@return void
		"""
        # convert raw position to an internal index
        if self.contiguousPositions:
            pos = position - self.positionOffset
        else:
            pos = self.positionMap[position]

        # 65536.0 is the sentinel planted in __init__; anything else means
        # this (position, macrostate) slot was already filled
        if self.macrostateResidueEnergies[pos][0][macrostate] != 65536.0:
            raise Exception("Something something this entry already full")

        for i in range(20):
            self.macrostateResidueEnergies[pos][i][macrostate] = energies[i]

    def addMicrostateData(self, macrostate: int, position: int,
                          energies: "float[]") -> None:
        """
		Inserts a microstate_position set of energies into this model.

		@param macrostate		int, the macrostate this microstate belongs to
		@param position			int, the position the energies correspond to
		@param energies			float[] of length-20 of the energies
		@return void
		"""
        if self.contiguousPositions:
            position -= self.positionOffset
        else:
            position = self.positionMap[position]

        # TODO: do I need a overwrite check as in adding macrostate data?
        for i in range(20):
            self.microstateResidueEnergies[position][i][macrostate][
                self.microstateCounts[position][macrostate]] = energies[i]

        self.microstateCounts[position][macrostate] += 1

        return None

    def useAltAverageMethod(self, yes: bool) -> None:
        """
		Changes whether to use the other averaging method.

		@param yes		bool whether to use it or not
		@return void
		"""
        self.useAltAveragingMethod = yes
        # averaged energies depend on the method, so force a recalculation
        self.isFrequenciesCalculated = False

    # change weights sets
    # TODO ascertain that this is actually necessary
    def setWeights(self, newWeights: numpy.array) -> None:
        """
		Directly overwrites the macrostate weights. Discouraged.

		@param newWeights	float[], the new weights
		@return void
		"""
        # BUGFIX: previously called the undefined name `warning` (NameError)
        warnings.warn("Why are you changing the weights directly in a model?",
                      UserWarning)
        self.weights = newWeights

    # PRIVATE
    # TODO: add flag to only compute once. Then we should be able to remove the deep copy
    def averageMicrostates(self) -> None:
        """
		Boltzmann-averages the microstates to calculate the energy for the
		macrostate. boltzmannTemp semantics: 0.0 -> minimum, -1.0 -> mean,
		anything else -> Boltzmann-weighted average at that temperature.

		@param void
		@return void
		"""
        if not self.areMicrostatesPicked:
            # pick backbones (with replacement) to use for the ensemble
            self.microstatesUsed = numpy.zeros(
                [self.nPositions, self.nMacrostates, self.ensembleSize],
                dtype=int)
            for i in range(self.nPositions):
                for j in range(self.nMacrostates):
                    self.microstatesUsed[i][j] = numpy.random.randint(
                        0, self.microstateCounts[i][j], [self.ensembleSize])
            # cherry-pick out the selected microstates
            self.selectedMicrostateEnergies = numpy.zeros(
                [self.nPositions, 20, self.nMacrostates, self.ensembleSize])
            for i in range(self.nPositions):
                for j in range(20):
                    for k in range(self.nMacrostates):
                        for l in range(self.ensembleSize):
                            self.selectedMicrostateEnergies[i][j][k][
                                l] = self.microstateResidueEnergies[i][j][k][
                                    self.microstatesUsed[i][k][l]]

            self.areMicrostatesPicked = True

        if not self.useAltAveragingMethod:
            if (self.boltzmannTemp == 0.0):
                self.macrostateResidueEnergies = numpy.amin(
                    self.selectedMicrostateEnergies, axis=3)
            elif (self.boltzmannTemp == -1.0):
                self.macrostateResidueEnergies = numpy.mean(
                    self.selectedMicrostateEnergies, axis=3)
            else:
                self.macrostateResidueEnergies = numpy.sum(
                    self.selectedMicrostateEnergies * numpy.exp(
                        self.selectedMicrostateEnergies / -self.boltzmannTemp),
                    axis=3) / numpy.sum(numpy.exp(
                        self.selectedMicrostateEnergies / -self.boltzmannTemp),
                                        axis=3)
        else:
            if (self.boltzmannTemp == 0.0):
                self.macrostateResidueEnergies = numpy.amin(
                    self.selectedMicrostateEnergies, axis=3)
            elif (self.boltzmannTemp == -1.0):
                self.macrostateResidueEnergies = numpy.mean(
                    self.selectedMicrostateEnergies, axis=3)
            else:
                # BUGFIX: this used the builtin sum(), which does not accept
                # an `axis` keyword (TypeError); numpy.sum is required here
                self.macrostateResidueEnergies = -numpy.log(
                    numpy.sum(numpy.exp(
                        self.selectedMicrostateEnergies / -self.boltzmannTemp),
                        axis=3))

        # After averaging, drop the large 4D raw array to save space
        self.microstateResidueEnergies = numpy.array(0)
        return None

    # PRIVATE
    def calcFitness(self) -> None:
        """
		Calculates the fitnesses of the each residue at each position.
		There is no need for this function to be externally called

		@param void
		@return void
		"""

        # collapse microstates into macrostates
        if self.useMicrostateData:
            self.averageMicrostates()

        # for each position and macrostate, which residue had min energy?
        minEnergies = numpy.amin(self.macrostateResidueEnergies, axis=1)
        # sigmoid offset, double[position][macrostate]; log(99)/steepness puts
        # the minimum-energy residue at fitness 0.99
        offsets = minEnergies + numpy.divide(numpy.log(99), self.steepness)
        self.fitnesses = numpy.ones([self.nPositions, 20], dtype=numpy.float64)
        for i in range(self.nPositions):
            for j in range(20):
                for k in range(self.nMacrostates):
                    f = 1.0 / (1.0 + numpy.exp(
                        self.steepness *
                        (self.macrostateResidueEnergies[i][j][k] -
                         offsets[i][k])))
                    self.fitnesses[i][j] *= (1 - self.weights[k] +
                                             self.weights[k] * f)

    # PRIVATE
    def calcFrequencies(self) -> None:
        """
		Calculates the fitnesses and the frequencies of residues at each location

		@param void
		@return void
		"""
        if not self.isFrequenciesCalculated:  # only do it once!
            self.isFrequenciesCalculated = True
            self.calcFitness()

            # non-normalized frequencies: odds ratio of the fitness
            self.frequencies = numpy.divide(
                self.fitnesses, numpy.subtract(1.0, self.fitnesses))
            sums = numpy.sum(self.frequencies, axis=1)
            for i in range(self.nPositions):  # normalize
                self.frequencies[i] = numpy.divide(self.frequencies[i],
                                                   sums[i])

    # get functions
    # member fields should not be directly accessed; use these get funtions instead
    def getEnsembleSize(self) -> int:
        """
		Self-explanatory name

		@return int
		"""
        return self.ensembleSize

    def getBackrubTemp(self) -> float:
        """
		Self-explanatory name

		@return double
		"""
        return self.backrubTemp

    def getBoltzmannTemp(self) -> float:
        """
		Self-explanatory name

		@return double
		"""
        return self.boltzmannTemp

    def getWeights(self) -> numpy.array:
        """
		Self-explanatory name. Return a deep copy of the array so it's
		safe to directly do math on the return value

		@return float[]
		"""
        return numpy.array(self.weights)

    def getSteepness(self) -> float:
        """
		Self-explanatory name

		@return float
		"""
        return self.steepness

    def getFrequencies(self) -> numpy.array:
        """
		Self-explanatory name. Returns a deep copy of the array so it's
		safe to directly do math on the return value

		@return float[][]
		"""

        # this is a special instance for storing data
        if self.ensembleSize == 0 and self.useMicrostateData:
            raise PermissionError(
                "This object is a raw data storage instance and this call should not have been made"
            )

        if not self.isFrequenciesCalculated:
            self.calcFrequencies()
        return numpy.array(self.frequencies)

    def equalTo(self, other) -> bool:
        """
		Are the data stored in this Model correct? i.e. does everything actually correspond
		to the input file's data? Used for debugging

		@param other		Model object to compare this to
		@return bool	if everything compared correctly
		"""
        retval = True
        if not isinstance(other, Model):
            print("is not same class")
            retval = False
        if self.ensembleSize != other.ensembleSize:
            print("ensemble sizes: {:d}, {:d}".format(self.ensembleSize,
                                                      other.ensembleSize))
            retval = False
        if self.boltzmannTemp != other.boltzmannTemp:
            print("boltzmann temps: {:.2f}, {:.2f}".format(
                self.boltzmannTemp, other.boltzmannTemp))
            retval = False
        if self.backrubTemp != other.backrubTemp:
            print("backrub temps: {:.2f}, {:.2f}".format(
                self.backrubTemp, other.backrubTemp))
            retval = False
        if numpy.sum(
                numpy.abs(self.macrostateResidueEnergies -
                          other.macrostateResidueEnergies)) > 1e-9:
            print(
                numpy.sum(
                    numpy.abs(self.macrostateResidueEnergies -
                              other.macrostateResidueEnergies)))
            retval = False
        if not retval:
            print("{:d}\t{:.2f}\t{:.2f}".format(self.ensembleSize,
                                                self.boltzmannTemp,
                                                self.backrubTemp))
        return retval

    # comparison operators based on similarity measure
    def __eq__(self, other):
        assert (isinstance(other, Model))
        return self.recovery == other.recovery

    def __le__(self, other):
        assert (isinstance(other, Model))
        return self.recovery <= other.recovery

    def __lt__(self, other):
        assert (isinstance(other, Model))
        return self.recovery < other.recovery

    def __ge__(self, other):
        assert (isinstance(other, Model))
        return self.recovery >= other.recovery

    def __gt__(self, other):
        assert (isinstance(other, Model))
        # BUGFIX: this previously returned `self.recovery < other.recovery`,
        # making > and < both behave as less-than
        return self.recovery > other.recovery

    def __ne__(self, other):
        assert (isinstance(other, Model))
        return self.recovery != other.recovery
import math as magic
import ast
import numpy
import warnings
from io import *
from enumeration import enum
from copy import *

# should the macrostates be hard-coded? probably not if this ends up being actually used for tuning other models...
#MACROSTATES_T = enum("E-DHF-NADPH", "E-NADPH", "E-OPEN", "E-THF", "E-THF-NADPX", "TS");
# Module-level enum of the 20 standard amino acids, in alphabetical
# one-letter-code order (matches the residue index order used by the
# resToIndex/indexToRes tables elsewhere in this file).
RESIDUES = enum('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N',
                'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y')


# NOTE(review): this looks like a truncated duplicate of the full Model class
# defined earlier in this file (likely a scrape/concatenation artifact) --
# confirm whether it should be removed entirely.
class Model:
    """
	A multistate design model
	"""
    # NOTE: Java documentation style. mainly because idk what's a standard one for python and Java's is most readable

    # I need these statements for intellisense.
    # vars used by all instances
    # TODO: is the enum even useful?
    MACROSTATES = enum()
    # enum of the macrostates
    nMacrostates = 0
    # int, number of macrostates
    ensembleSize = 0
    # int, size of the macrostate ensemble
    nPositions = 0
    # int, number of positions on sequence examined
Beispiel #15
0
class Optimizer(object):
    """
    Optimizes the hyperparameters of a model.

    Usage order (enforced by warnings in the read methods): construct with a
    macrostate enum, call readTargetFrequencies() first, then a read*Data()
    method to populate self.models.
    """

    # TODO: make all parameters generic

    # Class-level declarations kept for reference/intellisense; instances
    # overwrite these in __init__.
    models = {
    }  # Map<hyperparams, Model> of the input data, since values are precalculated
    optimizationAlgorithm = SearchAlgorithm()  # the algorithm to be used
    #similarityMeasure = SimilarityMeasure();     # the similarlity measure to be used
    nPositions = 0  # number of positions examined
    contiguousPositions = True  # are the set of positions contiguous
    positionMap = None  # used to map from non-contiguous that start on 0 positions to [0, nPositions]
    minPosition = 0  # position offset for indexing
    targetFrequencies = numpy.array(
        0
    )  # float[position][residue] internal representation of the target frequencies
    MACROSTATES = enum()  # enum of the macrostates
    nMacrostates = 0  # number of macrostates
    continuousBoltzmann = False  # are we using a continuous set of boltzmann temps
    targetFreqsRead = False  # guard flag: has readTargetFrequencies() been called?
    def __init__(self,
                 macrostates=None,
                 continuousBoltzmann=False,
                 contiguousPositions=True):
        """
        Default constructor.

        @param macrostates            enum of the macrostates to be considered
        @param continuousBoltzmann    bool, are microstate data provided
        @param contiguousPositions    bool, are the positions aligned to a contiguous set of positions in the target?
        """
        self.MACROSTATES = macrostates
        # Flatten the enum into a plain name -> index dict for fast lookups.
        # This is a kludgy solution, but digging the enum machinery out of the
        # rest of this code would take a while. The 'size' attribute and any
        # non-int attributes are metadata, not macrostate entries.
        self.macStateToIndex = {
            name: value
            for name, value in self.MACROSTATES.__dict__.items()
            if name != 'size' and type(value) is int
        }
        self.nMacrostates = self.MACROSTATES.size
        self.continuousBoltzmann = continuousBoltzmann
        self.models = {}
        self.nPositions = 0
        self.minPosition = 0
        self.targetFrequencies = numpy.array(0)
        self.contiguousPositions = contiguousPositions
        self.targetFreqsRead = False
        # only non-contiguous position sets need a remapping table
        self.positionMap = None if contiguousPositions else {}

    # STATIC
    def copyFromExisting(existing):
        """
        Deep copies an existing Optimizer.
        @param existing        Optimizer to be copied
        @return Optimizer
        """
        clone = Optimizer(existing.MACROSTATES)
        clone.minPosition = existing.minPosition
        clone.nPositions = existing.nPositions
        # copy the frequency array so the clone owns its own data
        clone.targetFrequencies = numpy.array(existing.targetFrequencies)
        clone.models = dict(existing.models)
        #clone.similarityMeasure = existing.similarityMeasure;
        clone.optimizationAlgorithm = existing.optimizationAlgorithm
        clone.contiguousPositions = existing.contiguousPositions
        clone.targetFreqsRead = existing.targetFreqsRead
        if not existing.contiguousPositions:
            # the position map is mutable, so clone it deeply too
            clone.positionMap = deepcopy(existing.positionMap)
        return clone

    # TODO: change the file return type to file read return
    def readTargetFrequencies(self, source, posPicker=None):
        """
        Reads the target frequencies from a FASTA file. Call this before reading data
        Note: when optimizing against a set of positions that are not contiguous, this function
        *MUST* be called before calling a read*Data function. Doing otherwise will void all warranties
        and promises that calculations will be correct.

        @param source            string pointing to the location of the input FASTAs
        @param posPicker         optional file used to drop superfluous positions and
                                 re-index the remainder (see positionReindexerFASTA)
        @return array of the target frequencies
        """
        # resToIndex is used to convert a one-letter AA code to an index, which is in alphabetical order
        resToIndex = {
            'A': 0,
            'C': 1,
            'D': 2,
            'E': 3,
            'F': 4,
            'G': 5,
            'H': 6,
            'I': 7,
            'K': 8,
            'L': 9,
            'M': 10,
            'N': 11,
            'P': 12,
            'Q': 13,
            'R': 14,
            'S': 15,
            'T': 16,
            'V': 17,
            'W': 18,
            'Y': 19
        }

        def countEntry(seq):
            # Accumulate residue counts for one complete FASTA entry.
            for i in range(self.nPositions):
                if seq[i] != '-':  # only when there is a residue aligned here
                    try:
                        self.targetFrequencies[i][resToIndex[seq[i]]] += 1
                        nEntries[i] += 1
                    except KeyError:  # non-single residue code. skip
                        continue

        with open(source, 'r', encoding='utf-8') as infile:
            # figure out the number of positions from the first entry
            infile.readline()  # skip the first line (>NULL)
            entry = ""
            thisLine = infile.readline()
            while thisLine[0] != '>':
                entry += thisLine
                thisLine = infile.readline()
            self.nPositions = len(entry) - 1  # -1 for the trailing newline
            # case where the entries end with a dash
            # BUGFIX: this previously compared a character against the int 0
            # (always false) and decremented an undefined local `nPositions`;
            # compare against '-' and adjust self.nPositions instead.
            if entry[-2] == '-':
                self.nPositions -= 1
            # allocate space for a double[][]
            self.targetFrequencies = numpy.zeros([self.nPositions, 20],
                                                 dtype=float)

            # read entries
            # 2/17 note: this has been modified to be ok with files that have unaligned positions, i.e. '-' in sequence
            infile.seek(0)  # go back to the start
            nEntries = numpy.zeros([self.nPositions], dtype=int)
            thisEntry = ""
            for line in infile:
                if line[0] == '>':  # start of a new entry
                    if thisEntry != "":  # count the entry just finished
                        countEntry(thisEntry)
                        thisEntry = ""
                else:  # middle of an entry, append this line
                    thisEntry += line
            # BUGFIX: the last entry in the file is not followed by another '>'
            # header and was previously never counted.
            if thisEntry != "":
                countEntry(thisEntry)

        # counts to frequencies
        # NOTE(review): a position aligned in no entry would divide by zero
        # here (numpy emits a warning and yields nan/inf) -- confirm inputs
        # always cover every position
        for i in range(self.nPositions):
            for j in range(20):
                self.targetFrequencies[i][j] /= nEntries[i]

        # 2/17 added parts to allow for removal of superfluous positions
        if posPicker is not None:
            self.contiguousPositions = False
            indices = self.positionReindexerFASTA(posPicker)
            self.nPositions = len(indices)
            freqs = numpy.zeros([self.nPositions, 20])
            for i in range(len(indices)):
                freqs[i] = self.targetFrequencies[indices[i]]
            self.targetFrequencies = freqs

        self.targetFreqsRead = True
        return numpy.array(self.targetFrequencies)

    # read raw macrostate data
    # TODO: change the file return type to file read return
    def readData(self, source):
        """
        Reads in a tab-delimited file of ensembles encoding macrostate data.

        Expected columns (from the parsing below): macrostate name, backrub
        temperature, ensemble size, Boltzmann temperature ("min", "mean", or a
        number), position, and a string-encoded {residue: energy} dict.
        Populates self.models, keyed by Optimizer.calcParamsID(...).

        @param source    a string pointing to the location of the tab-delimited file
        @return void
        """
        if not self.targetFreqsRead:
            warnings.warn("Hey, call the read target freqs functions first!",
                          UserWarning)

        # used to convert the input dict to an array
        indexToRes = {
            0: 'A',
            1: 'C',
            2: 'D',
            3: 'E',
            4: 'F',
            5: 'G',
            6: 'H',
            7: 'I',
            8: 'K',
            9: 'L',
            10: 'M',
            11: 'N',
            12: 'P',
            13: 'Q',
            14: 'R',
            15: 'S',
            16: 'T',
            17: 'V',
            18: 'W',
            19: 'Y'
        }

        # convert strings to manipulate-able values
        self.models.clear()

        infile = open(source, 'r')
        isFirstLine = True
        isFirstEntry = True
        self.minPosition = 65535  # used for offsetting indices in Macrostate

        # weights/steepness are unknown at read time; the optimizer assigns
        # real values later via Model.constructFromExisting
        placeHolderWeights = numpy.array([0, 0, 0, 0])
        placeHolderSteep = 1
        for line in infile:
            # ignore first line since they're just column headers
            if isFirstLine:
                isFirstLine = False
            else:
                entries = line.split('\t')  # entries: a list of strings
                macrostate = entries[0]  # string
                backrubT = entries[1]  # string
                ensembleS = entries[2]  # string
                boltzmanT = entries[3]  # string
                position = entries[4]  # string
                energies = ast.literal_eval(
                    entries[5]
                )  # ast.literal_eval converts a string to a dictionary

                # now ints
                macrostate = self.macStateToIndex[macrostate]
                position = int(position)
                ensembleS = int(ensembleS)

                # record minposition - assumes that the first entry is at first position
                # TODO: fix this to actually find minimum position
                if isFirstEntry:
                    self.minPosition = position
                    isFirstEntry = False

                # skip superfluous positions
                if position < self.minPosition or position >= self.minPosition + self.nPositions:
                    continue

                # now doubles
                backrubT = float(backrubT)
                if boltzmanT == "min":  # account for the possible text values
                    boltzmanT = 0.0
                elif boltzmanT == "mean":
                    boltzmanT = -1.0  # use -1 to represent inf - you can't really do math with numpy.inf
                else:
                    boltzmanT = float(boltzmanT)

                # calc model ID from the strings because that's always unique
                ID = Optimizer.calcParamsID(backrubT, ensembleS, boltzmanT)

                # reorder the {residue: energy} dict into a length-20 array
                # following the alphabetical residue index order
                temp = numpy.zeros([20])
                for i in range(20):
                    temp[i] = energies[indexToRes[i]]
                energies = temp

                # put this read into the internal small, colorful shells structure:
                # append to an existing model with these hyperparams, or start one
                if ID in self.models:
                    self.models[ID].addMacrostateData(macrostate, position,
                                                      energies)
                else:
                    model = Model(self.MACROSTATES, ensembleS, backrubT,
                                  boltzmanT, placeHolderWeights,
                                  placeHolderSteep, self.nPositions,
                                  self.minPosition)
                    model.addMacrostateData(macrostate, position, energies)
                    self.models[ID] = model

        infile.close()
        return None

    # read raw microstate data
    def readMicrostateData(self, source: str, minPosition: int):
        """
        Reads in raw microstate data. Unlike readData(), this function does not assume anything
        about the min position and it must be supplied manually.

        Expected input: tab-separated lines with one header row. Columns used are
        macrostate name [0], backrub temperature [1], position [2], and a dict
        literal of residue letter -> energy [4]; column 3 (backbone) is ignored.

        Clears self.models and fills it with one Model per backrub temperature;
        ensemble size, Boltzmann temperature, weights and steepness are placeholders
        since microstate data does not carry them.

        @param source        string of the input file
        @param minPosition    int of the lowest position number
        @return void
        """

        if not self.targetFreqsRead:
            warnings.warn("Hey, call the read target freqs functions first!",
                          UserWarning)

        self.models.clear()
        self.minPosition = minPosition
        maxPos = 0  # highest position seen; used to recompute nPositions at the end

        # index -> one-letter residue code, alphabetical order
        indexToRes = {
            0: 'A',
            1: 'C',
            2: 'D',
            3: 'E',
            4: 'F',
            5: 'G',
            6: 'H',
            7: 'I',
            8: 'K',
            9: 'L',
            10: 'M',
            11: 'N',
            12: 'P',
            13: 'Q',
            14: 'R',
            15: 'S',
            16: 'T',
            17: 'V',
            18: 'W',
            19: 'Y'
        }

        infile = open(source, 'r')

        # microstate files carry no ensemble/Boltzmann/weight/steepness info,
        # so Models are constructed with these placeholders
        placeHolderWeights = None
        placeHolderSteep = 0
        placeHolderBoltzmannT = 0
        placeHolderEnsemble = 0

        n = 0  # count of data lines processed (bookkeeping only)
        isFirstLine = True
        #line = infile.readline()
        #while true:
        for line in infile:
            # ignore first line since they're just column headers
            if isFirstLine:
                isFirstLine = False
            else:
                n += 1
                entries = line.split('\t')
                # print(entries)
                macrostate = entries[0]
                backrubT = entries[1]
                position = int(entries[2])
                #backbone = entries[3]
                # literal_eval safely parses the residue->energy dict literal
                energies = ast.literal_eval(entries[4])

                # skip superfluous positions
                # NOTE(review): this test uses self.nPositions as it was BEFORE
                # this call, even though nPositions may be recomputed below from
                # maxPos — confirm callers set nPositions appropriately first
                if position < self.minPosition or position >= self.minPosition + self.nPositions:
                    print('skipping %d' % position)
                    continue

                # convert from string to useful data types
                backrubT = float(backrubT)
                macrostate = self.macStateToIndex[macrostate]
                position = int(position)  # already an int; redundant but harmless
                # re-pack the energy dict into a fixed-order 20-element array
                temp = numpy.zeros([20])
                for i in range(20):
                    temp[i] = energies[indexToRes[i]]
                energies = temp

                if position > maxPos:
                    maxPos = position
                    print(maxPos)

                # microstate models are keyed by backrub temperature only;
                # ensemble size and Boltzmann temperature are unknown here
                ID = Optimizer.calcParamsID(backrubT, None, None)
                if ID in self.models:
                    self.models[ID].addMicrostateData(macrostate, position,
                                                      energies)
                else:
                    model = Model(self.MACROSTATES, placeHolderEnsemble,
                                  backrubT, placeHolderBoltzmannT,
                                  placeHolderWeights, placeHolderSteep,
                                  self.nPositions, self.minPosition, True,
                                  self.positionMap)
                    model.addMicrostateData(macrostate, position, energies)
                    self.models[ID] = model

            #line = infile.readline()
            #if not line: # EOF
            #    break

        if self.contiguousPositions:
            self.nPositions = maxPos - minPosition + 1
        infile.close()
        return None

    def positionReindexer(data: str):
        """
        Used to offset arbitrary positions to start with 0. Used when converting files.
        The file should have three columns. The first is the index in the alignment, the second I have no idea,
        the third is the index in the residue sequence.
        Is a static method.

        Fix: the original created a list and then assigned into it by (string)
        key, which raises TypeError on the first line; a dict is what the
        access pattern requires.

        @param data        string pointing to the indices file
        @return dict mapping each unaligned position (int, third column) to its
                0-based line index, plus key 'nPos' holding the total count
        """
        indices = {}
        n = 0
        # close the file deterministically even on parse errors
        with open(data, 'r') as infile:
            for line in infile:
                entries = line.split(' ')
                # third column: position in the residue sequence
                position = int(entries[2].strip('\n'))
                indices[position] = n
                n += 1
        indices['nPos'] = n
        return indices

    def positionReindexerFASTA(self, data: str):
        """
        Since we may be aligning to a FASTA sequence with more positions,
        we strip the superfluous positions from the read in FASTA. Used when reading files.
        The file should have three columns. The first is the index in the alignment, the second I have no idea,
        the third is the index in the residue sequence.

        @param data        string pointing to the file
        @return    int[] where the index is the corrected/reindexed position, and the value is the original index
        """

        infile = open(data, 'r')
        index = []
        i = 0
        for line in infile:
            entries = line.split(' ')
            index.append(int(entries[0]))
            self.positionMap[int(entries[2].strip('\n'))] = i
            i += 1
        indices = numpy.array(index)
        return indices

    # TODO: change the return type to file write return val
    def writeFrequenciesToFASTA(self,
                                frequencies: numpy.array,
                                outFileName: str,
                                precision: int = 3):
        """
        Writes the 2D residue frequencies to a FASTA file

        @param frequencies        double[positio][residue] of relative frequencies
        @param outfile            string of output filename
        @param precision        int, optional, number of places behind the decimal point, default is 3
        @return int                1 if failure, 0 if succeeds
        """
        if outFileName.split('.')[-1] != 'fasta':
            outFileName += ".fasta"

        try:
            outfile = open(outFileName, 'w')
        except FileExistsError:
            print("Output file already exists\n")
            return 1
        nEntries = numpy.power(10, precision)
        numbers = numpy.round(frequencies * nEntries)
        residueToWrite = numpy.zeros([self.nPositions], dtype=int)
        #print(numbers);
        residues = "ACDEFGHIKLMNPQRSTVWY"
        for i in range(nEntries):
            outfile.write("> Null\n")
            for j in range(self.nPositions):
                while numbers[j][
                        residueToWrite[j]] == 0 and residueToWrite[j] < 19:
                    residueToWrite[j] += 1
                numbers[j][residueToWrite[j]] -= 1
                outfile.write(residues[residueToWrite[j]])
            outfile.writelines("\n")
        outfile.close()
        return 0

    def writeBestParamsToText(self, out: str):
        """
        Writes the best parameters found to a human-readable text file.
        Overwrites without warning.

        @param out        string of name of output file
        @return void
        """
        if out.split('.')[-1] != 'txt':
            out += ".txt"
        outfile = open(out, 'w')
        bestVals = self.getBestParameters()
        outfile.write("Ensemble Size: {:d}\n".format(bestVals['ensembleSize']))
        outfile.write("Backrub temperature: {:.1f}\n".format(
            bestVals['backrubTemp']))
        bt = bestVals['boltzmannTemp']
        if bt > 0:
            outfile.write("Boltzmann averaging temperature: {:.9f}\n".format(
                bestVals['boltzmannTemp']))
        elif bt == 0:
            outfile.write("Boltzmann averaging temperature: mean\n")
        else:
            outfile.write("Boltzmann averaging temperature: inf\n")
        outfile.write("Steepness: {:.9f}\n".format(bestVals['steepness']))
        outfile.write("Weights: ")
        for i in range(self.MACROSTATES.size):
            outfile.write("{:.4f} ".format(bestVals['weights'][i]))
        outfile.write("\nMatch: {:.4f}\n".format(bestVals['match']))
        outfile.write("Algorithm: {:s}\n".format(
            self.optimizationAlgorithm.__str__()))
        outfile.write("Similarity measure: {:s}\n".format(
            self.optimizationAlgorithm.similarityMeasure.__str__()))
        outfile.write("Elapsed time: {:s}\n".format(
            str(self.optimizationAlgorithm.elapsedTime)))
        outfile.close()

    # generate a unique reproducible key for a combination of hyperparameters
    # hash or plaintext string?
    # STATIC
    def calcParamsID(param1, param2, param3):
        """
        Generates a unique and reproducable ID string for each combination of
        parameters by joining their string representations with spaces.
        Is a static method.

        @param param1            backrub temperature
        @param param2            ensemble size
        @param param3            Boltzmann averaging temperature
        @return                    a unique string
        """
        parts = (str(param1), str(param2), str(param3))
        return " ".join(parts)

    def getModelByParams(self, param1, param2, param3):
        """
        Gets a model by the specified pre-determined parameters.
        Return is a reference and the return object should not be
        directly modified. use the Model.createFromExisting() function on it
        to generate a copy.

        @param param1            backrub temperature
        @param param2            ensemble size
        @param param3            Boltzmann averaging temperature
        @return Model with specified params
        """
        return self.models[Optimizer.calcParamsID(param1, param2, param3)]

    def useAlgorithm(self, algorithm: SearchAlgorithm):
        """
        Changes the search algorithm used by the optimizer.
        Takes effect on the next call to optimize().

        @param algorithm        new SearchAlgorithm
        @return void
        """
        self.optimizationAlgorithm = algorithm

    def optimize(self):
        """
        Starts the optimization process

        @param void
        @return void
        """
        self.optimizationAlgorithm.iterate()

    def verifyFoundParams(self, ensembleSize, backrubT, boltzmannT, steepness,
                          weights):
        """
        Run some found parameters against this model to see the match

        @param ensembleSize            int of found size
        @param backrubT                float of found backrub temperature
        @param boltzmannT            float of found boltzmann averaging temperature
        @param steepness            float of found steepness
        @param weights                float[] of found weights
        @return    float on [0, 1] of match to target
        """
        # build a throwaway copy of the stored model with the found params
        base = self.getModelByParams(backrubT, ensembleSize, boltzmannT)
        candidate = Model.constructFromExisting(base, ensembleSize, backrubT,
                                                boltzmannT, weights, steepness)
        measure = self.optimizationAlgorithm.similarityMeasure
        return measure.getSimilarityMeasure(candidate.getFrequencies())

    def getFrequenciesByParams(self, ensembleSize, backrubT, boltzmannT,
                               steepness, weights):
        """
        Gets the frequencies corresponding to a particular set of hyperparams

        @param ensembleSize            int of found size
        @param backrubT                float of found backrub temperature
        @param boltzmannT            float of found boltzmann averaging temperature
        @param steepness            float of found steepness
        @param weights                float[] of found weights
        @return    float[][] of the relative frequencies
        """
        # build a throwaway copy of the stored model with the given params
        base = self.getModelByParams(backrubT, ensembleSize, boltzmannT)
        candidate = Model.constructFromExisting(base, ensembleSize, backrubT,
                                                boltzmannT, weights, steepness)
        return candidate.getFrequencies()

    def getBestParameters(self):
        """
        Returns a dictionary of the best parameters found.
        Keys:
            'ensembleSize'
            'backrubTemp'
            'boltzmannTemp'
            'steepness'
            'weights'
            'match'

        @param void
        @return Map<string, float>
        """
        return self.optimizationAlgorithm.getBestParameters()

    def getBestFrequencies(self):
        """
        Returns the best frequencies found

        @param void
        @return float[][] of frequencies
        """
        return self.optimizationAlgorithm.getBestFrequencies()