def main(argv):

    modelSpecs = config.InitializeModelSpecs()
    modelSpecs = ParseCommandLine.ParseArguments(argv, modelSpecs)

    ## load the datasets. Data is a list of proteins and each protein is represented as a dict()
    Data = DataProcessor.LoadDistanceLabelMatrices(modelSpecs['dataset'],
                                                   modelSpecs=modelSpecs)
    print '#proteins loaded from the dataset: ', len(Data)
    allProteins = [d['name'] for d in Data]

    print 'Preparing batch data for training...'
    groupSize = modelSpecs['minibatchSize']
    batches = DataProcessor.SplitData2Batches(data=Data,
                                              numDataPoints=groupSize,
                                              modelSpecs=modelSpecs)
    print "#batches:", len(batches)

    ## calculate the empirical reference state
    ## RefState is a dict: RefState[response] = (length-independent ref, length-dependent ref)
    ## the length-independent ref is a 1d array; the length-dependent ref is a list of (length, 1d array) tuples
    RefState = CalcRefState(batches=batches, modelSpecs=modelSpecs)
    RefState['dataset'] = modelSpecs['dataset']
    RefState['proteins'] = allProteins

    ## save RefState
    responseStr = '-'.join(modelSpecs['responses'])
    file4save = 'EmpRefState-' + responseStr + '-' + str(os.getpid()) + '.pkl'
    fh = open(file4save, 'wb')
    cPickle.dump(RefState, fh, protocol=cPickle.HIGHEST_PROTOCOL)
    fh.close()

    ## print the length-independent reference state
    for response in modelSpecs['responses']:
        print RefState[response][0]
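## Illustrative sketch (not part of the original code): how the RefState pickle written above
## could be read back and inspected. It reuses the module-level cPickle import assumed by the
## code above; InspectRefState is a hypothetical helper name, and the dict layout follows the
## comments in main().
def InspectRefState(refStateFile):
    with open(refStateFile, 'rb') as fh:
        RefState = cPickle.load(fh)

    print 'dataset: ', RefState['dataset'], ' #proteins: ', len(RefState['proteins'])
    for response, ref in RefState.iteritems():
        if response in ('dataset', 'proteins'):
            continue
        lengthIndRef, lengthDepRef = ref
        print response, ' length-independent ref: ', lengthIndRef
        for seqLen, refProb in lengthDepRef:
            print response, ' length-dependent ref at length ', seqLen, ': ', refProb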
def TrainDataLoader3(sharedQ, sharedLabelPool, sharedLabelWeightPool, stopTrainDataLoader, trainMetaData, modelSpecs, assembleData=True, UseSharedMemory=False):
	#print 'trainDataLoader has event: ', stopTrainDataLoader

	## here we use labelPool to cache the labels of all the training proteins
	## one protein may have multiple sets of input features due to MSA sampling or sequence-template alignment,
	## but it can only have one set of label matrices, so it is worthwhile to keep all label matrices in RAM.
	labelPool = dict()
	labelWeightPool = dict()

	## load the labels of all training proteins
	trainDataLocation = DataProcessor.SampleProteinInfo(trainMetaData)
	for loc in trainDataLocation:
		d = DataProcessor.LoadRealData(loc, modelSpecs, loadFeature=False, returnMode='list')
		name = d['name']
		labelPool[name] = d['atomLabelMatrix']
		labelWeightMatrix = LabelUtils.CalcLabelWeightMatrix(LabelMatrix=d['atomLabelMatrix'], modelSpecs=modelSpecs, floatType=np.float16)
		labelWeightPool[name] = labelWeightMatrix

	print 'TrainDataLoader with #PID ', os.getpid(), ' has loaded ', len(labelPool), ' label matrices  and ', len(labelWeightPool), ' label weight matrices'
	## update labelPool and labelWeightPool to the shared dict()
	sharedLabelPool.update(labelPool)
	sharedLabelWeightPool.update(labelWeightPool)
	print 'TrainDataLoader with #PID ', os.getpid(), ' has updated the shared labelPool and labelWeightPool'

	while True:
		if stopTrainDataLoader.is_set() or os.getppid()==1:
			print 'trainDataLoader receives the stop signal'
			break

		trainDataLocation = DataProcessor.SampleProteinInfo(trainMetaData)
		numOriginals = len(trainDataLocation)
		"""
		maxLen = 900
		trainDataLocation, numExcluded = DataProcessor.FilterByLength(trainDataLocation, maxLen)
		print 'Exclude ', numExcluded, ' train proteins longer than ', maxLen, ' AAs'
		"""
		trainSeqData = DataProcessor.SplitData2Batches(trainDataLocation, numDataPoints=modelSpecs['minibatchSize'], modelSpecs=modelSpecs)
		random.shuffle(trainSeqData)
		for batch in trainSeqData:
			if stopTrainDataLoader.is_set() or os.getppid()==1:
				print 'trainDataLoader receives the stop signal'
				break

			names = [ p['name'] for p in batch ]
			data = []
			for protein in batch:
				d = DataProcessor.LoadRealData(protein, modelSpecs, loadLabel=False, returnMode='list')
				data.append(d)

			FeatureUtils.CheckModelNDataConsistency(modelSpecs, data)
			if assembleData:
				data = PrepareInput4Train(data, modelSpecs, floatType=np.float16, UseSharedMemory=UseSharedMemory)
			#print 'putting data to trainDataLoader queue...'
			sharedQ.put( (data, names) )

	print 'TrainDataLoader has finished loading data'
	sharedQ.close()
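## Illustrative sketch (not part of the original code): a minimal consumer for the queue filled
## by TrainDataLoader3. It assumes the loader was started with assembleData=True, so each queue
## item is an (assembled input, names) tuple, and that labels and label weights are fetched from
## the shared pools by protein name. FetchOneTrainBatch is a hypothetical helper name.
def FetchOneTrainBatch(sharedQ, sharedLabelPool, sharedLabelWeightPool):
	data, names = sharedQ.get()
	labels = [ sharedLabelPool[name] for name in names ]
	labelWeights = [ sharedLabelWeightPool[name] for name in names ]
	return data, names, labels, labelWeights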
def TrainDataLoader(sharedQ, trainMetaData, modelSpecs, assembleData=True, UseSharedMemory=False):
	## here we use labelPool to cache the labels of all the training proteins
	## one protein may have multiple sets of input features due to MSA sampling or sequence-template alignment,
	## but it can only have one set of label matrices, so it is worthwhile to keep all label matrices in RAM.
	labelPool = dict()
	labelMatrixPool = dict()

	while True:
		trainDataLocation = DataProcessor.SampleProteinInfo(trainMetaData)
		numOriginals = len(trainDataLocation)
		trainSeqData = DataProcessor.SplitData2Batches(trainDataLocation, numDataPoints=modelSpecs['minibatchSize'], modelSpecs=modelSpecs)
		random.shuffle(trainSeqData)
		for batch in trainSeqData:
			data = []
			for protein in batch:
				name = protein['name']
				if labelPool.has_key(name):
					## label is already in the pool
					d = DataProcessor.LoadRealData(protein, modelSpecs, loadLabel=False, returnMode='list')
					d['atomLabelMatrix'] = labelPool[name]
				else:
					d = DataProcessor.LoadRealData(protein, modelSpecs, returnMode='list')
					assert d.has_key('atomLabelMatrix')
					labelPool[name] = d['atomLabelMatrix']

				if config.UseSampleWeight(modelSpecs):
					if not labelMatrixPool.has_key(name): 
						labelWeightMatrix = LabelUtils.CalcLabelWeightMatrix(LabelMatrix=d['atomLabelMatrix'], modelSpecs=modelSpecs, floatType=np.float16)
						labelMatrixPool[name] = labelWeightMatrix
						d['labelWeightMatrix'] = labelWeightMatrix
					else:
						d['labelWeightMatrix'] = labelMatrixPool[name]

				data.append(d)

			FeatureUtils.CheckModelNDataConsistency(modelSpecs, data)
			if assembleData:
				data = PrepareInput4Train(data, modelSpecs, floatType=np.float16, UseSharedMemory=UseSharedMemory)
			#print 'putting data to trainDataLoader queue...'
			sharedQ.put(data)
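## Illustrative sketch (not part of the original code): wiring TrainDataLoader up as a daemon
## process feeding a bounded queue, mirroring how the loaders are started elsewhere in this
## listing. StartTrainDataLoader is a hypothetical helper name; the argument order follows the
## TrainDataLoader signature above and config.QSize is the same queue-size helper used in main().
def StartTrainDataLoader(trainMetaData, modelSpecs):
	sharedQ = multiprocessing.Queue(config.QSize(modelSpecs))
	loader = multiprocessing.Process(name='TrainDataLoader for ' + str(os.getpid()), target=TrainDataLoader, args=(sharedQ, trainMetaData, modelSpecs, True, False))
	loader.daemon = True
	loader.start()
	return sharedQ, loader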
def PredictDistMatrix(modelFiles, predFiles, savefolder=None):
	## load all the models from the files. Each file contains the specification of one model.
	models = []
	for mFile in modelFiles:
		fh = open(mFile, 'rb')
		model = cPickle.load(fh)
		fh.close()
		models.append(model)

	## check consistency among models. All the models shall have the same labelType for the same atom pair type
	labelTypes = dict()
	for model in models:
		for response in model['responses']:
			labelName = Response2LabelName(response)
			labelType = Response2LabelType(response)
			if not labelTypes.has_key(labelName):
				labelTypes[labelName] = labelType
			elif labelTypes[labelName] != labelType:
				print 'ERROR: at least two models have different label types for the same atom pair type.'
				exit(-1)
					

	allsequences = dict()

	##allresults shall be a nested dictionary, e.g., allresults[proteinName][response] = sum of predicted_prob_matrices
	##We predict one prob_matrix from each model for each protein and each response
	## two different models may share some overlapping responses.

	allresults = dict()
	numModels = dict()
	for model, mfile in zip(models, modelFiles):
		if not model['network'] in config.allNetworks:

			print 'unsupported network architecture: ', model['network']
			exit(-1)

		distancePredictor, x, y, xmask, ymask, xem, labelList, weightList = Model4DistancePrediction.BuildModel(model, forTrain=False)

		inputVariables = [ x, y, xmask, ymask]
		if xem is not None:
			inputVariables.append(xem)

		pred_prob = distancePredictor.output_prob
		predict = theano.function(inputVariables, pred_prob, on_unused_input='warn')

		## set model parameter values
		if not Compatible(distancePredictor.params, model['paramValues']):
			print 'FATAL ERROR: the model type or network architecture is not compatible with the loaded parameter values in the model file: ', mfile
			exit(-1)

		[ p.set_value(v) for p, v in zip(distancePredictor.params, model['paramValues']) ]

		## We shall load these files for each model separately since each model may have different data requirements
		predData = DataProcessor.LoadDistanceFeatures(predFiles, modelSpecs = model, forTrainValidation=False)

		##make sure the input has the same number of features as the model. We do a random check here to speed things up
		rindex = np.random.randint(0, high=len(predData) )
		assert model['n_in_seq'] == predData[rindex]['seqFeatures'].shape[1]

		rindex = np.random.randint(0, high=len(predData) )
		assert model['n_in_matrix'] == predData[rindex]['matrixFeatures'].shape[2]

		if predData[0].has_key('embedFeatures'):
			rindex = np.random.randint(0, high=len(predData) )
			assert model['n_in_embed'] == predData[rindex]['embedFeatures'].shape[1]

		## check if all the proteins of the same name have exactly the same sequence
		for d in predData:
			if not allsequences.has_key(d['name']):
				allsequences[d['name']] = d['sequence']
			elif allsequences[d['name']] != d['sequence']:
				print 'Error: inconsistent primary sequence for the same protein in the protein feature files'
				exit(-1)
			
		## predSeqData and names are in exactly the same order, so we know which data is for which protein
		predSeqData, names = DataProcessor.SplitData2Batches(data=predData, numDataPoints=624, modelSpecs=model)
		print '#predData: ', len(predData), '#batches: ', len(predSeqData)

		for onebatch, names4onebatch in zip(predSeqData, names):
			input = onebatch[ : len(inputVariables) ]
			result = predict(*input)

			x1d, x2d, x1dmask, x2dmask = input[0:4]
			seqLens = x1d.shape[1] - x1dmask.shape[1] + np.sum(x1dmask, axis=1)
			maxSeqLen = x1d.shape[1]

			##result is a 4-d tensor. The last dimension is the concatenation of the predicted prob parameters for all responses in this model
			assert result.shape[3] == sum( [ config.responseProbDims[ Response2LabelType(res) ] for res in model['responses'] ] )

			## calculate the start and end positions of each response in the last dimension of result
			dims = [ config.responseProbDims[ Response2LabelType(res) ] for res in model['responses'] ]
			endPositions = np.cumsum(dims)
			startPositions = endPositions - dims

			for name in names4onebatch:
				if not allresults.has_key(name):
					allresults[name]=dict() 
					numModels[name] =dict()

			## batchres is a batch of result, its ndim=4
			for response, start, end in zip(model['responses'], startPositions, endPositions):

				## the 1st dimension of batchres is batchSize, the 2nd and 3rd dimensions are contact/distance matrix sizes and the 4th is for the predicted probability parameters
				batchres = result[:, :, :, start:end ]


				## remove masked positions
				revised_batchres = [ probMatrix[ maxSeqLen-seqLen:, maxSeqLen-seqLen:, : ] for probMatrix, seqLen in zip(batchres, seqLens) ]

				for res4one, name in zip(revised_batchres, names4onebatch):
					if not allresults[name].has_key(response):
						allresults[name][response] = res4one
						numModels[name][response] = np.int32(1)
					else:
						## here we save only the sum to reduce memory consumption, which could be huge when many deep models are used to predict a large set of proteins
						allresults[name][response] += res4one
						numModels[name][response] += np.int32(1)


		del predict
		del predData
		del predSeqData
		gc.collect()


	## calculate the final result, which is the average of all the predicted prob matrices for the same protein and response
	finalresults = dict()
	for name, results in allresults.iteritems():
		if not finalresults.has_key(name):
			finalresults[name] = dict()

		## each matrix in finalresults[name][response] has 3 dimensions
		for response in results.keys():
			#finalresults[name][response] = np.average(allresults[name][response], axis=0)
			finalresults[name][response] = allresults[name][response]/numModels[name][response]

			##make the predicted distance prob matrices symmetric for some responses. This also slightly improves accuracy.
			apt = Response2LabelName(response)
			if config.IsSymmetricAPT( apt ):
				finalresults[name][response] = (finalresults[name][response] + np.transpose(finalresults[name][response], (1, 0, 2) ) )/2.

	## collect the average label distributions and weight matrix. We collect all the matrices and then calculate their average.
	labelDistributions = dict()
	labelWeights = dict()
	for model in models:
		for response in model['responses']:
			apt = response
			if not labelDistributions.has_key(apt):
				labelDistributions[apt] = []
			if not labelWeights.has_key(apt):
				labelWeights[apt] = []

			labelDistributions[apt].append(model['labelRefProbs'][response])
			labelWeights[apt].append(model['weight4labels'][response])

	finalLabelDistributions = dict()
	finalLabelWeights = dict()

	for apt in labelDistributions.keys():
		finalLabelDistributions[apt] = np.average(labelDistributions[apt], axis=0)
	for apt in labelWeights.keys():
		finalLabelWeights[apt] = np.average(labelWeights[apt], axis=0)

	## convert the predicted distance probability matrix into a predicted contact matrix.
	## each predicted prob matrix has 3 dimensions while each predicted contact matrix has 2 dimensions
	predictedContactMatrices = dict()
	from scipy.stats import norm
	for name, results in finalresults.iteritems():
		predictedContactMatrices[name] = dict()
		for response in results.keys():
			apt = Response2LabelName(response)
			labelType = Response2LabelType(response)

			if apt in config.allAtomPairTypes:
				if labelType.startswith('Discrete'):
					subType = labelType[len('Discrete'): ]
					labelOf8 = DistanceUtils.LabelsOfOneDistance(config.ContactDefinition, config.distCutoffs[subType])
					predictedContactMatrices[name][apt] =  np.sum( finalresults[name][response][:, :, :labelOf8], axis=2)
				elif labelType.startswith('Normal'):
					assert labelType.startswith('Normal1d2')
					normDistribution =  norm( loc=finalresults[name][response][:, :, 0], scale=finalresults[name][response][:,:,1])
					predictedContactMatrices[name][apt] =  normDistribution.cdf(config.ContactDefinition)
				elif labelType.startswith('LogNormal'):
					assert labelType.startswith('LogNormal1d2')
					normDistribution =  norm( loc=finalresults[name][response][:, :, 0], scale=finalresults[name][response][:,:,1])
					predictedContactMatrices[name][apt] =  normDistribution.cdf(np.log(config.ContactDefinition) )
				else:
					print 'unsupported label type in response: ', response
					exit(-1)

			elif apt in ['HB', 'Beta']:
				predictedContactMatrices[name][apt] =  finalresults[name][response][:, :, 0]
			else:
				print 'unsupported atom type in response: ', response
				exit(-1)


	##write all the results here
	## for each protein, we write one output file storing a tuple (name, sequence, predicted distance prob matrices, predicted contact matrices, label weights, label distributions), matching the cPickle.dump call below
	for name, results in finalresults.iteritems():

		savefilename = name + '.predictedDistMatrix.pkl'
		if savefolder is not None:
			savefilename = os.path.join(savefolder, savefilename)

		fh = open(savefilename, 'wb')
		cPickle.dump( (name, allsequences[name], results, predictedContactMatrices[name], finalLabelWeights, finalLabelDistributions), fh, protocol=cPickle.HIGHEST_PROTOCOL)
		fh.close()

	return finalresults, predictedContactMatrices, allsequences
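## Illustrative sketch (not part of the original code): reading back one of the
## .predictedDistMatrix.pkl files written by PredictDistMatrix. The tuple layout mirrors the
## cPickle.dump call above; LoadPredictedDistMatrix is a hypothetical helper name.
def LoadPredictedDistMatrix(savefile):
	with open(savefile, 'rb') as fh:
		name, sequence, distProbMatrices, contactMatrices, labelWeights, labelDistributions = cPickle.load(fh)

	print 'loaded prediction for ', name, ' with sequence length ', len(sequence)
	for response, probMatrix in distProbMatrices.iteritems():
		print response, ' predicted prob matrix of shape ', probMatrix.shape
	return name, sequence, distProbMatrices, contactMatrices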
def PredictProperty(models, predictors, predFiles):

    allsequences = dict()

    ##allresults shall be a nested dictionary, e.g., allresults[proteinName][response] = predicted_property_list
    allresults4prob = dict()
    allresults = dict()

    for model, predictor in zip(models, predictors):

        predict, inputVariables = predictor

        ## We shall load these files for each model separately since each model may use a different set of features
        predData = DataProcessor.LoadPropertyFeatures(predFiles,
                                                      modelSpecs=model,
                                                      forTrainValidation=False)

        ##make sure the input has the same number of features as the model
        rindex = np.random.randint(0, high=len(predData))
        assert model['n_in_seq'] == predData[rindex]['seqFeatures'].shape[1]

        ## collecting sequences
        for d in predData:
            if not allsequences.has_key(d['name']):
                allsequences[d['name']] = d['sequence']
            elif allsequences[d['name']] != d['sequence']:
                print 'ERROR: inconsistent primary sequence for the same protein in the protein feature files'
                exit(1)

        predSeqData, names = DataProcessor.SplitData2Batches(
            data=predData,
            numDataPoints=30,
            modelSpecs=model,
            forTrainValidation=False)
        print '#predData: ', len(predData), '#batches: ', len(predSeqData)

        for onebatch, names4onebatch in zip(predSeqData, names):
            input = onebatch[:len(inputVariables)]
            result4prob, result = predict(*input)

            ## x1d has shape (batchSize, maxSeqLen, numFeatures) and x1dmask has shape (batchSize, #cols_to_be_masked)
            x1d, x1dmask = input[0:2]
            seqLens = x1d.shape[1] - x1dmask.shape[1] + np.sum(x1dmask, axis=1)
            maxSeqLen = x1d.shape[1]

            ##result4prob has shape (batchSize, maxSeqLen, sum( responseProbDims[res] for res in modelSpecs['responses'] ) )
            assert result4prob.shape[2] == sum([
                config.responseProbDims[Response2LabelType(res)]
                for res in model['responses']
            ])

            ##result has shape (batchSize, maxSeqLen, sum( responseValueDims[res] for res in modelSpecs['responses'] ) )
            assert result.shape[2] == sum([
                config.responseValueDims[Response2LabelType(res)]
                for res in model['responses']
            ])

            nameGenerator = (name for name in names4onebatch
                             if not allresults.has_key(name))
            for name in nameGenerator:
                allresults[name] = dict()
                allresults4prob[name] = dict()

            dims = [
                config.responseProbDims[Response2LabelType(res)]
                for res in model['responses']
            ]
            endPositions = np.cumsum(dims)
            startPositions = endPositions - dims

            for res, start, end in zip(model['responses'], startPositions,
                                       endPositions):
                nameGenerator = (name for name in names4onebatch
                                 if not allresults4prob[name].has_key(res))
                for name in nameGenerator:
                    allresults4prob[name][res] = []

                ## remove masked positions
                revised_batchres = [
                    tmp[maxSeqLen - seqLen:, :]
                    for tmp, seqLen in zip(result4prob[:, :,
                                                       start:end], seqLens)
                ]

                for res4one, name in zip(revised_batchres, names4onebatch):
                    allresults4prob[name][res].append(res4one)

            dims = [
                config.responseValueDims[Response2LabelType(res)]
                for res in model['responses']
            ]
            endPositions = np.cumsum(dims)
            startPositions = endPositions - dims

            for res, start, end in zip(model['responses'], startPositions,
                                       endPositions):
                nameGenerator = (name for name in names4onebatch
                                 if not allresults[name].has_key(res))
                for name in nameGenerator:
                    allresults[name][res] = []

                ## remove masked positions
                revised_batchres = [
                    tmp[maxSeqLen - seqLen:, :]
                    for tmp, seqLen in zip(result[:, :, start:end], seqLens)
                ]
                for res4one, name in zip(revised_batchres, names4onebatch):
                    allresults[name][res].append(res4one)

    ## calculate the final result, which is the average of all the predicted properties for the same protein and response name
    finalresults = dict()
    for name, results in allresults.iteritems():
        if not finalresults.has_key(name):
            finalresults[name] = dict()
        for response in results.keys():
            tmpresult = np.average(allresults[name][response], axis=0)

            ##convert coding of discrete labels to a more meaningful representation
            labelType = Response2LabelType(response)
            if not labelType.startswith('Discrete'):
                finalresults[name][response] = tmpresult

    finalresults4prob = dict()
    for name, results in allresults4prob.iteritems():
        if not finalresults4prob.has_key(name):
            finalresults4prob[name] = dict()
        for response in results.keys():
            finalresults4prob[name][response] = np.average(
                allresults4prob[name][response], axis=0)

            labelType = Response2LabelType(response)
            if labelType.startswith('Discrete'):
                tmpresult = np.argmax(finalresults4prob[name][response],
                                      axis=1)
                finalresults[name][response] = PropertyUtils.Coding2String(
                    tmpresult, response)
    """
	## collect the average label distributions and weight matrix. We collect all the matrices and then calculate their average.
	labelDistributions = dict()
	labelWeights = dict()
	for model in models:
		for apt in model['responseNames']:
			if not labelDistributions.has_key(apt):
				labelDistributions[apt] = []
			if not labelWeights.has_key(apt):
				labelWeights[apt] = []

			labelDistributions[apt].append(model['labelRefProbs'][apt])
			labelWeights[apt].append(model['weight4' + model['labelType'] ][apt])

	finalLabelDistributions = dict()
	finalLabelWeights = dict()

	for apt in labelDistributions.keys():
		finalLabelDistributions[apt] = np.average(labelDistributions[apt], axis=0)
	for apt in labelWeights.keys():
		finalLabelWeights[apt] = np.average(labelWeights[apt], axis=0)
	"""

    return finalresults4prob, finalresults, allsequences
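## Illustrative sketch (not part of the original code): the seqLens arithmetic used in the
## prediction functions above, on made-up shapes. It only demonstrates
## maxSeqLen - #maskColumns + rowSum(mask); the exact mask convention is defined by the
## feature-assembly code, not here. _SeqLenArithmeticDemo is a hypothetical helper name.
def _SeqLenArithmeticDemo():
    x1d = np.zeros((2, 300, 5), dtype=np.float32)     # (batchSize, maxSeqLen, #features)
    x1dmask = np.zeros((2, 100), dtype=np.float32)    # (batchSize, #cols_to_be_masked)
    x1dmask[0, 60:] = 1
    x1dmask[1, :] = 1
    seqLens = x1d.shape[1] - x1dmask.shape[1] + np.sum(x1dmask, axis=1)
    print seqLens                                     # prints [ 240.  300.]
    maxSeqLen = x1d.shape[1]
    ## padded positions are then dropped by keeping only the last seqLen entries of each result,
    ## e.g. tmp[maxSeqLen - seqLen:, :] for a per-residue property prediction
    return seqLens, maxSeqLen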
def main(argv):

    modelSpecs = InitializeModelSpecs()
    modelSpecs = ParseCommandLine.ParseArguments(argv, modelSpecs)

    startTime = datetime.datetime.now()

    trainMetaData = DataProcessor.LoadMetaData(modelSpecs['trainFile'])
    FeatureUtils.DetermineFeatureDimensionBySampling(trainMetaData, modelSpecs)
    ## calculate label distribution and weight at the very beginning
    print 'Calculating label distribution...'
    LabelUtils.CalcLabelDistributionNWeightBySampling(trainMetaData,
                                                      modelSpecs)

    if config.TrainByRefLoss(modelSpecs) or config.UseRefState(modelSpecs):
        print 'Calculating feature expectation by sampling...'
        FeatureUtils.CalcFeatureExpectBySampling(trainMetaData, modelSpecs)

    ## trainMetaData is a list of groups. Each group contains a set of related proteins (seq-template alignments) and files for their features
    trainDataLocation = DataProcessor.SampleProteinInfo(trainMetaData)
    trainSeqData = DataProcessor.SplitData2Batches(
        trainDataLocation,
        numDataPoints=modelSpecs['minibatchSize'],
        modelSpecs=modelSpecs)
    print 'approximate #batches for train data: ', len(trainSeqData)

    #global trainSharedQ, stopTrainDataLoader, trainDataLoaders, trainSharedLabelPool, trainSharedLabelWeightPool
    global trainSharedQ, stopTrainDataLoader, trainDataLoaders
    trainSharedQ = multiprocessing.Queue(config.QSize(modelSpecs))
    stopTrainDataLoader = multiprocessing.Event()
    #trainSharedLabelPool = multiprocessing.Manager().dict()
    #trainSharedLabelWeightPool = multiprocessing.Manager().dict()
    #print stopTrainDataLoader

    numTrainDataLoaders = config.NumTrainDataLoaders(modelSpecs)
    metaDatas = DataProcessor.SplitMetaData(trainMetaData, numTrainDataLoaders)

    trainDataLoaders = []
    for i, metaData in zip(xrange(numTrainDataLoaders), metaDatas):
        #trainDataLoader = multiprocessing.Process(name='TrainDataLoader ' + str(i) + ' for ' + str(os.getpid()), target=TrainUtils.TrainDataLoader, args=(trainSharedQ, metaData, modelSpecs, True, True))
        trainDataLoader = multiprocessing.Process(
            name='TrainDataLoader ' + str(i) + ' for ' + str(os.getpid()),
            target=TrainUtils.TrainDataLoader2,
            args=(trainSharedQ, stopTrainDataLoader, metaData, modelSpecs,
                  True, True))
        #trainDataLoader = multiprocessing.Process(name='TrainDataLoader ' + str(i) + ' for ' + str(os.getpid()), target=TrainUtils.TrainDataLoader3, args=(trainSharedQ, trainSharedLabelPool, trainSharedLabelWeightPool, stopTrainDataLoader, metaData, modelSpecs, True, True))
        trainDataLoader.daemon = True
        trainDataLoaders.append(trainDataLoader)

    print 'start the train data loaders...'
    for trainDataLoader in trainDataLoaders:
        trainDataLoader.start()

    validMetaData = DataProcessor.LoadMetaData(modelSpecs['validFile'])
    validDataLocation = DataProcessor.SampleProteinInfo(validMetaData)

    ## split data into batches, but do not load the real data from disk
    #validSeqData = DataProcessor.SplitData2Batches(validDataLocation, numDataPoints=modelSpecs['minibatchSize'], modelSpecs=modelSpecs)
    validSeqData = DataProcessor.SplitData2Batches(validDataLocation,
                                                   numDataPoints=500 * 500,
                                                   modelSpecs=modelSpecs)
    print '#batches for validation data: ', len(validSeqData)

    global validSharedQ, validDataLoader, stopValidDataLoader
    validSharedQ = multiprocessing.Queue(len(validSeqData))
    stopValidDataLoader = multiprocessing.Event()
    #print stopValidDataLoader
    ## shared memory is a limited resource, so avoid using it as much as possible
    ## here we do not use shared array for validation data since we only need to load it once
    #validDataLoader = multiprocessing.Process(name='ValidDataLoader for '+str(os.getpid()), target=TrainUtils.ValidDataLoader, args=(validSharedQ, validSeqData, modelSpecs, True, False))
    validDataLoader = multiprocessing.Process(
        name='ValidDataLoader for ' + str(os.getpid()),
        target=TrainUtils.ValidDataLoader2,
        args=(validSharedQ, stopValidDataLoader, validSeqData, modelSpecs,
              True, False))
    print 'start the validation data loader...'
    validDataLoader.start()
    """
	if modelSpecs.has_key('ScaleLoss4Cost') and (modelSpecs['ScaleLoss4Cost'] is True):
		##calculate the average weight per minibatch
		maxDeviation = DataProcessor.CalcAvgWeightPerBatch(trainSeqDataset, modelSpecs)
		print 'maxWeightDeviation=', maxDeviation
	"""

    beforeTrainTime = datetime.datetime.now()
    print 'time spent before training :', beforeTrainTime - startTime

    result = TrainModel(modelSpecs=modelSpecs,
                        trainValidData=(trainSeqData, validSeqData))

    ##merge ModelSpecs and result
    resultModel = modelSpecs.copy()
    resultModel.update(result)

    modelFile = TrainUtils.GenerateModelFileName(resultModel)
    print 'Writing the resultant model to ', modelFile
    cPickle.dump(resultModel, file(modelFile, 'wb'), cPickle.HIGHEST_PROTOCOL)

    afterTrainTime = datetime.datetime.now()
    print 'time spent on training:', afterTrainTime - beforeTrainTime

    ## clean up again
    print 'Cleaning up again...'
    Cleanup()
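## Illustrative sketch (not part of the original code): the stop-event pattern the data loaders
## above are written for. StopDataLoaders is a hypothetical helper, not the project's Cleanup();
## it only shows how the shared events, queues and loader processes set up in main() could be
## shut down.
def StopDataLoaders():
    stopTrainDataLoader.set()
    stopValidDataLoader.set()

    ## drain the queues so that loaders blocked on a full queue can return and see the stop event
    while not trainSharedQ.empty():
        trainSharedQ.get()
    while not validSharedQ.empty():
        validSharedQ.get()

    for loader in trainDataLoaders + [validDataLoader]:
        loader.join(timeout=5)
        if loader.is_alive():
            loader.terminate()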
def TrainModel(modelSpecs, trainValidData=None, predDataFile=None):
    if (not trainValidData):
        print 'Please provide train and validation data for model training'
        exit(1)

    if modelSpecs is None:
        print 'Please provide a model specification for training'
        exit(1)

    distancePredictor, variable4train, variable4validate, params, params4mean, params4var, paramL2, regularizer, topAcc, errors, labelList, weightList, trainByRefLoss = PrepareModel(
        modelSpecs)

    chkpoint, restart = InitializeChkpoint(params, modelSpecs)

    assert (len(modelSpecs['numEpochs']) > 0)
    numEpochs4stages = np.cumsum(modelSpecs['numEpochs'])
    ## train parameters not related to variance and correlation
    epoch = chkpoint['epoch']

    if epoch < numEpochs4stages[-1]:

        if weightList is not None and len(weightList) > 0:
            loss4train = distancePredictor.loss(labelList,
                                                useMeanOnly=True,
                                                weightList=weightList,
                                                trainByRefLoss=trainByRefLoss)
            loss4validate = distancePredictor.loss(labelList,
                                                   useMeanOnly=True,
                                                   weightList=weightList)
        else:
            loss4train = distancePredictor.loss(labelList,
                                                useMeanOnly=True,
                                                trainByRefLoss=trainByRefLoss)
            loss4validate = distancePredictor.loss(labelList, useMeanOnly=True)
        """
		## weightedLoss is only used for cost, i.e., gradient calculation
		if modelSpecs.has_key('ScaleLoss4Cost') and (modelSpecs['ScaleLoss4Cost'] is True):
			weightedLoss = ScaleLossByBatchWeight(loss, weightList, modelSpecs)
		else:
			weightedLoss = loss
		"""
        if modelSpecs['algorithm'] in set(['AdamW', 'AdamWAMS']):
            cost = T.sum(T.mul(loss4train,
                               modelSpecs['w4responses'])) / np.sum(
                                   modelSpecs['w4responses'])
        else:
            cost = T.sum(T.mul(loss4train, modelSpecs['w4responses'])
                         ) / np.sum(modelSpecs['w4responses']) + regularizer

        params4var_set = set(params4var)
        pgrads = [
            T.grad(cost,
                   p,
                   consider_constant=weightList,
                   disconnected_inputs='warn')
            if p not in params4var_set else T.zeros_like(p) for p in params
        ]
        pdecay = [
            p if p not in params4var_set else T.zeros_like(p) for p in params
        ]

    for stage, lr, epoch_end in zip(xrange(len(numEpochs4stages)),
                                    modelSpecs['lrs'], numEpochs4stages):
        if epoch >= epoch_end:
            continue

        print 'training for mean using a learning rate ', lr, ' ...'
        startFromBest = (stage > 0 and epoch == numEpochs4stages[stage - 1])
        epoch_start = epoch
        epoch = RunOneStage(epoch_start,
                            epoch_end,
                            trainValidData,
                            chkpoint,
                            loss4train,
                            loss4validate,
                            pgrads,
                            pdecay,
                            modelSpecs,
                            lr=lr,
                            startFromBest=(startFromBest, startFromBest))

    ## train parameters only specific to variance and correlation
    numEpochs4var = modelSpecs['numEpochs4var']
    lrs = modelSpecs['lrs4var']

    if len(params4var) > 0:
        assert (len(numEpochs4var) > 0)
        assert (len(lrs) > 0)

        previousEpochs4Stages = numEpochs4stages
        numEpochs4stages = np.cumsum(numEpochs4var) + numEpochs4stages[-1]

        if epoch < numEpochs4stages[-1]:
            print 'Training the parameters specific to correlation and variance ...'

            if weightList is not None and len(weightList) > 0:
                loss4train = distancePredictor.loss(
                    labelList,
                    weightList=weightList,
                    trainByRefLoss=trainByRefLoss)
                loss4validate = distancePredictor.loss(labelList,
                                                       weightList=weightList)
            else:
                loss4train = distancePredictor.loss(labelList)
                loss4validate = distancePredictor.loss(labelList)
            """
			## weightedLoss is only used for cost, i.e., gradient calculation
			if modelSpecs.has_key('ScaleLoss4Cost') and (modelSpecs['ScaleLoss4Cost'] is True):
				weightedLoss = ScaleLossByBatchWeight(loss, weightList, modelSpecs)
			else:
				weightedLoss = loss
			"""

            if modelSpecs['algorithm'] in set(['AdamW', 'AdamWAMS']):
                cost = T.sum(T.mul(loss4train,
                                   modelSpecs['w4responses'])) / np.sum(
                                       modelSpecs['w4responses'])
            else:
                cost = T.sum(T.mul(loss4train,
                                   modelSpecs['w4responses'])) / np.sum(
                                       modelSpecs['w4responses']) + regularizer

            params4var_set = set(params4var)
            pgrads = [
                T.grad(cost,
                       p,
                       consider_constant=weightList,
                       disconnected_inputs='raise')
                if p in params4var_set else T.zeros_like(p) for p in params
            ]
            pdecay = [
                p if p in params4var_set else T.zeros_like(p) for p in params
            ]

        for stage, lr, epoch_end in zip(xrange(len(lrs)), lrs,
                                        numEpochs4stages):
            if epoch >= epoch_end:
                continue

            print 'training for variance using a learning rate ', lr, ' ...'
            startFromBest = (
                (stage == 0 and epoch == previousEpochs4Stages[-1])
                or (stage > 0 and epoch == numEpochs4stages[stage - 1]))
            epoch_start = epoch
            epoch = RunOneStage(epoch_start,
                                epoch_end,
                                trainValidData,
                                chkpoint,
                                loss4train,
                                loss4validate,
                                pgrads,
                                pdecay,
                                modelSpecs,
                                lr=lr,
                                startFromBest=(startFromBest, startFromBest
                                               and (stage > 0)))

    resultModel = {}
    resultModel['dateTrained'] = datetime.datetime.now()
    #resultModel['validLoss'] = validLoss
    resultModel['validLoss'] = chkpoint['best_validation_loss']
    #resultModel['validErr'] = validErr
    if chkpoint.has_key('best_validation_err'):
        resultModel['validErr'] = chkpoint['best_validation_err']

    resultModel['trainLoss'] = chkpoint['train_loss4best_validation_loss']
    #resultModel['validAcc']= validAcc
    if chkpoint.has_key('best_validation_acc'):
        resultModel['validAcc'] = chkpoint['best_validation_acc']

    resultModel['paramValues'] = chkpoint['bestParamValues']

    bestParamL2norm = np.sum([(v**2).sum()
                              for v in chkpoint['bestParamValues']])
    resultModel['bestParamL2norm'] = bestParamL2norm

    bestParamL1norm = np.sum(
        [abs(v).sum() for v in chkpoint['bestParamValues']])
    resultModel['bestParamL1norm'] = bestParamL1norm

    print 'best param L1 norm: ', bestParamL1norm, 'L2 norm: ', bestParamL2norm

    Cleanup()

    #test on prediction data if it is given. The prediction data should be small (to save memory) and should contain ground truth.
    if modelSpecs['predFile'] is not None:
        predMetaData = DataProcessor.LoadMetaData(modelSpecs['predFile'])
        predDataLocation = DataProcessor.SampleProteinInfo(predMetaData)
        predBatches = DataProcessor.SplitData2Batches(predDataLocation,
                                                      numDataPoints=624,
                                                      modelSpecs=modelSpecs)
        print '\nLoading prediction data...'
        print "#predData minibatches:", len(predBatches)

        predData = []
        for batch in predBatches:
            data = DataProcessor.LoadRealData(batch,
                                              modelSpecs,
                                              returnMode='list')
            FeatureUtils.CheckModelNDataConsistency(modelSpecs, data)
            #input = TrainUtils.PrepareInput4Prediction(data, modelSpecs, floatType=np.float16)
            input = TrainUtils.PrepareInput4Prediction(
                data, modelSpecs, floatType=theano.config.floatX)
            predData.append(input)

        if weightList is not None and len(weightList) > 0:
            loss4validate = distancePredictor.loss(labelList,
                                                   weightList=weightList)
        else:
            loss4validate = distancePredictor.loss(labelList)

        fullValidate = theano.function(variable4validate,
                                       [loss4validate, errors, topAcc],
                                       on_unused_input='warn')
        if config.UseRefState(modelSpecs):
            quickValidate = theano.function(variable4validate,
                                            [loss4validate, errors],
                                            on_unused_input='warn')

        ## set model parameters for validation and possibly prediction
        for param, value in zip(params, chkpoint['bestParamValues']):
            param.set_value(value)

        predLoss, predErr, predAcc = ValidateAllData(predData, fullValidate,
                                                     modelSpecs)
        if config.UseRefState(modelSpecs):
            refLoss, refErr = ValidateAllData(predData,
                                              quickValidate,
                                              modelSpecs,
                                              forRefState=True)
            print 'pred loss: ', predLoss, 'pred err: ', predErr, 'ref loss: ', refLoss, 'ref err: ', refErr
        else:
            print 'pred loss: ', predLoss, 'pred err: ', predErr
        resultModel['predLoss'] = predLoss
        resultModel['predErr'] = predErr

        print "predAcc: ", [str_display(pAcc[:, 0]) for pAcc in predAcc
                            ], 'for top ', modelSpecs['topRatios']
        resultModel['predAcc'] = predAcc

        del predData[:]

    ## training is done, remove the checkpoint file since it has been copied at the end of each stage
    if modelSpecs.has_key('checkpointFile') and (modelSpecs['checkpointFile']
                                                 is not None):
        try:
            os.remove(modelSpecs['checkpointFile'])
        except OSError:
            print 'WARNING: error in deleting the check point file: ', modelSpecs[
                'checkpointFile']

    ## remove theano variables from modelSpecs
    keys4removal = [
        'variable4train', 'variable4validate', 'params', 'params4mean',
        'params4var', 'paramL2', 'regularizer', 'topAcc', 'errors',
        'labelList', 'weightList', 'trainByRefLoss'
    ]
    for k in keys4removal:
        if modelSpecs.has_key(k):
            del modelSpecs[k]

    return resultModel
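## Illustrative sketch (not part of the original code): how the cumulative epoch counters in
## TrainModel map per-stage epoch budgets to stage boundaries when resuming from a checkpoint.
## The numbers are made up; only the np.cumsum / continue logic mirrors the code above.
## _EpochStagingDemo is a hypothetical helper name.
def _EpochStagingDemo():
    numEpochs = [10, 5, 3]                     # e.g. modelSpecs['numEpochs']
    numEpochs4stages = np.cumsum(numEpochs)    # array([10, 15, 18])
    epoch = 12                                 # epoch restored from a checkpoint
    for stage, epoch_end in zip(xrange(len(numEpochs4stages)), numEpochs4stages):
        if epoch >= epoch_end:
            continue                           # stage 0 (epochs 0-9) has already finished
        print 'stage ', stage, ' would run epochs ', epoch, ' through ', epoch_end - 1
        epoch = epoch_end                      # stage 1 covers epochs 12-14, stage 2 covers 15-17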
def TrainDataLoader2(sharedQ, stopTrainDataLoader, trainMetaData, modelSpecs, assembleData=True, UseSharedMemory=False):
	#print 'trainDataLoader has event: ', stopTrainDataLoader

	bUseCCMFnorm, bUseCCMsum, bUseCCMraw, bUseFullMI, bUseFullCov = config.ParseExtraCCMmode(modelSpecs)
	if any([bUseCCMraw, bUseFullMI, bUseFullCov]):
		## when full coevolution matrices are used, we shall use float16 to save memory
		floatType = np.float16
	else:
		floatType = theano.config.floatX

	## here we use labelPool to cache the labels of all the training proteins
	## one protein may have multiple sets of input features due to MSA sampling or sequence-template alignment,
	## but it can only have one set of label matrices, so it is worthwhile to keep all label matrices in RAM.
	labelPool = dict()
	labelWeightPool = dict()

	while True:
		if stopTrainDataLoader.is_set() or os.getppid()==1:
			#print 'trainDataLoader receives the stop signal'
			break

		trainDataLocation = DataProcessor.SampleProteinInfo(trainMetaData)
		numOriginals = len(trainDataLocation)
		trainSeqData = DataProcessor.SplitData2Batches(trainDataLocation, numDataPoints=modelSpecs['minibatchSize'], modelSpecs=modelSpecs)
		random.shuffle(trainSeqData)

		#i = 0
		for batch in trainSeqData:
			if stopTrainDataLoader.is_set() or os.getppid()==1:
				#print 'trainDataLoader receives the stop signal'
				break

			data = []
			for protein in batch:
				name = protein['name']
				if labelPool.has_key(name):
					## label is already in the pool
					d = DataProcessor.LoadRealData(protein, modelSpecs, loadLabel=False, returnMode='list')
					d['atomLabelMatrix'] = labelPool[name]
				else:
					d = DataProcessor.LoadRealData(protein, modelSpecs, returnMode='list')
					assert d.has_key('atomLabelMatrix')
					labelPool[name] = d['atomLabelMatrix']

				if config.UseSampleWeight(modelSpecs):
					if not labelWeightPool.has_key(name): 
						labelWeightMatrix = LabelUtils.CalcLabelWeightMatrix(LabelMatrix=d['atomLabelMatrix'], modelSpecs=modelSpecs, floatType=np.float16)
						labelWeightPool[name] = labelWeightMatrix
						d['labelWeightMatrix'] = labelWeightMatrix
					else:
						d['labelWeightMatrix'] = labelWeightPool[name]

				data.append(d)

			FeatureUtils.CheckModelNDataConsistency(modelSpecs, data)
			if assembleData:
				data = PrepareInput4Train(data, modelSpecs, floatType=floatType, UseSharedMemory=UseSharedMemory)
			#print 'putting data to trainDataLoader queue...'
			sharedQ.put(data)

			"""
			i += 1
			if i%100 == 0:
				print '#batches of train data loaded: ', i
			"""

		#print 'TrainDataLoader with #PID ', os.getpid(), ' currently has ', len(labelPool), ' label matrices  and ', len(labelMatrixPool), ' label weight matrices'
	print 'TrainDataLoader has finished loading data'
	sharedQ.close()
def main(argv):

    #modelSpecs = config.InitializeModelSpecs()
    modelSpecs = InitializeModelSpecs()
    modelSpecs = ParseCommandLine.ParseArguments(argv, modelSpecs)

    startTime = datetime.datetime.now()

    ##trainData and validData are a list. Each element corresponds to one protein, which is a dict()
    trainData = DataProcessor.LoadPropertyFeatures(modelSpecs['trainFile'],
                                                   modelSpecs=modelSpecs)
    validData = DataProcessor.LoadPropertyFeatures(modelSpecs['validFile'],
                                                   modelSpecs=modelSpecs)
    print '#trainData: ', len(trainData), '#validData: ', len(validData)

    ## where should we add code to assign a weight to each residue? We need to handle residues without 3D coordinates for angle and SS prediction
    ##a, b = DataProcessor.CalcLabelDistributionAndWeight(trainData, modelSpecs)

    modelSpecs['numOfTrainProteins'] = len(trainData)

    beforeBatchTime = datetime.datetime.now()
    print 'time spent on data loading: ', beforeBatchTime - startTime

    print 'Preparing batch data for training...'
    groupSize = modelSpecs['minibatchSize']
    trainSeqDataset, _ = DataProcessor.SplitData2Batches(
        data=trainData, numDataPoints=groupSize, modelSpecs=modelSpecs)
    validSeqDataset, _ = DataProcessor.SplitData2Batches(
        data=validData, numDataPoints=groupSize, modelSpecs=modelSpecs)
    #validSeqDataset = DataProcessor.SplitData2Batches(data=validData, numDataPoints=20000, modelSpecs=modelSpecs)
    print "#trainData minibatches:", len(
        trainSeqDataset), "#validData minibatches:", len(validSeqDataset)

    predSeqDataset = None
    if modelSpecs['predFile'] is not None:
        predData = DataProcessor.LoadPropertyFeatures(modelSpecs['predFile'],
                                                      modelSpecs=modelSpecs,
                                                      forTrainValidation=False)
        print '#predData: ', len(predData)
        predSeqDataset, _ = DataProcessor.SplitData2Batches(
            data=predData, numDataPoints=40, modelSpecs=modelSpecs)
        print "#predData minibatches:", len(predSeqDataset)

    ## each protein in trainData contains three or four components, including seqFeatures and label
    modelSpecs['n_in_seq'] = trainData[0]['seqFeatures'].shape[1]

    beforeTrainTime = datetime.datetime.now()

    print 'time spent on generating batch data:', beforeTrainTime - beforeBatchTime

    result = TrainModel(modelSpecs=modelSpecs,
                        trainSeqData=trainSeqDataset,
                        validSeqData=validSeqDataset,
                        predSeqData=predSeqDataset)

    ##merge ModelSpecs and result
    resultModel = modelSpecs.copy()
    resultModel.update(result)

    modelFile = GenerateModelFileName(resultModel)
    print 'Writing the resultant model to ', modelFile
    cPickle.dump(resultModel, file(modelFile, 'wb'), cPickle.HIGHEST_PROTOCOL)
def PredictMatrixLabels(models,
                        predictors,
                        names,
                        inputFolders,
                        aliFolders=None,
                        tplFolder=None,
                        aliFile=None,
                        tplFile=None,
                        saveFolder=None):

    if not isinstance(names, (list, tuple)):
        targetName = names
    else:
        targetName = None

    ##allresults is a nested dictionary, i.e., allresults[proteinName][response] = sum of predicted_prob_matrices
    ##We predict one prob_matrix by each model for each protein and each response and then average them per protein and response to get the final results
    ##two different models may share common responses

    allsequences = dict()
    allresults = dict()  ## the results predicted from the real input
    numModels = dict(
    )  ## count the number of models that may predict each response

    for model, predictor in zip(models, predictors):
        #predict, inputVariables = BuildPredictor(model)
        predict, inputVariables = predictor

        ## load data for each model separately since each model may have a different specification
        if targetName is None:
            rawData = LoadProteinData4OneModel(model, names, inputFolders,
                                               aliFolders, tplFolder)

        elif aliFile is not None and tplFile is not None:
            rawData = LoadOneAlignment4OneModel(model, targetName,
                                                inputFolders, aliFile, tplFile)
        else:
            rawData = LoadOneProteinData4OneModel(model, targetName,
                                                  inputFolders, aliFolders,
                                                  tplFolder)

        predData = DataProcessor.ExtractFeaturesNLabels(
            rawData,
            modelSpecs=model,
            forTrainValidation=False,
            returnMode='list')

        ##make sure the input has the same number of features as the model
        FeatureUtils.CheckModelNDataConsistency(model, predData)

        ## check sequence consistency
        for d in predData:
            name = d['name']
            if not allresults.has_key(name):
                allresults[name] = dict()
                numModels[name] = dict()

            if not allsequences.has_key(name):
                allsequences[name] = d['sequence']
            elif allsequences[name] != d['sequence']:
                print 'ERROR: inconsistent primary sequence for the same protein in the protein feature files'
                exit(1)

        predSeqData = DataProcessor.SplitData2Batches(data=predData,
                                                      numDataPoints=624,
                                                      modelSpecs=model)
        print '#predData: ', len(predData), '#batches: ', len(predSeqData)

        ##for onebatch, names4onebatch in zip(predSeqData, names):
        for minibatch in predSeqData:
            onebatch, names4onebatch = DataProcessor.AssembleOneBatch(
                minibatch, model)
            input = onebatch[:len(inputVariables)]
            result = predict(*input)
            ##result is a 4-d tensor. The last dimension is the concatenation of the predicted prob parameters for all responses in this model
            assert result.shape[3] == sum([
                GetResponseProbDims(response)
                for response in model['responses']
            ])

            ## calculate the start and end positions of each response in the last dimension of result
            dims = [
                GetResponseProbDims(response)
                for response in model['responses']
            ]
            endPositions = np.cumsum(dims)
            startPositions = endPositions - dims

            x1d, x2d, x1dmask, x2dmask = input[0:4]
            seqLens = x1d.shape[1] - x1dmask.shape[1] + np.sum(x1dmask, axis=1)
            maxSeqLen = x1d.shape[1]

            for response, start, end in zip(model['responses'], startPositions,
                                            endPositions):

                ## batchres is a batch of result, its ndim=4
                ## the 1st dimension of batchres is batchSize, the 2nd and 3rd dimensions are distance/orientation matrix sizes and the 4th is for the predicted probability parameters
                batchres = result[:, :, :, start:end]
                ## remove masked positions
                revised_batchres = [
                    probMatrix[maxSeqLen - seqLen:, maxSeqLen - seqLen:, :]
                    for probMatrix, seqLen in zip(batchres, seqLens)
                ]

                for res4one, name in zip(revised_batchres, names4onebatch):
                    if not allresults[name].has_key(response):
                        allresults[name][response] = res4one
                        numModels[name][response] = np.int32(1)
                    else:
                        ## here we save sum to reduce memory consumption, which could be huge when many deep models are used to predict a large set of proteins
                        allresults[name][response] += res4one
                        numModels[name][response] += np.int32(1)

    ## calculate the final result, which is the average of the prob matrices predicted by all models for the same protein and the same response
    finalresults = dict()
    for name, results in allresults.iteritems():
        if not finalresults.has_key(name):
            finalresults[name] = dict()

        ## each matrix in finalresults[name][response] has 3 dimensions
        for response in results.keys():
            finalresults[name][response] = (allresults[name][response] /
                                            numModels[name][response]).astype(
                                                np.float32)

            ##make the predicted distance prob matrices symmetric for some responses. This also slightly improves accuracy.
            labelName = Response2LabelName(response)
            if config.IsSymmetricLabel(labelName):
                finalresults[name][response] = (
                    finalresults[name][response] +
                    np.transpose(finalresults[name][response], (1, 0, 2))) / 2.

    ## convert predicted distance probability matrix into contact matrix
    predictedContactMatrices = DeriveContactMatrix(finalresults)

    ## collect the average label distributions and weight matrix
    finalLabelWeights, finalLabelDistributions = CollectLabelWeightNDistribution(
        models)

    ##write all the results here
    ## for each protein, we have an output file saving a tuple (name, sequence, predicted distance matrix, predicted contact matrix, labelWeight, labelDistribution)
    for name, results in finalresults.iteritems():

        savefilename = name + '.predictedDistMatrix.pkl'
        if saveFolder is not None:
            savefilename = os.path.join(saveFolder, savefilename)

        if targetName is not None:
            originalName = targetName
        else:
            for n in names:
                if name.startswith(n):
                    originalName = n
                    break

        with open(savefilename, 'wb') as fh:
            #cPickle.dump( (name, allsequences[name], results, predictedContactMatrices[name], finalLabelWeights, finalLabelDistributions), fh, protocol=cPickle.HIGHEST_PROTOCOL)
            cPickle.dump((originalName, allsequences[name], results,
                          predictedContactMatrices[name], finalLabelWeights,
                          finalLabelDistributions),
                         fh,
                         protocol=cPickle.HIGHEST_PROTOCOL)

    return (predictedContactMatrices, allsequences)
    """