def EvaluateAccuracy(pred_prob, truth, pad_len):
            """Dispatch to the top-accuracy routine matching the current response.

            The first pad_len rows and columns of both pred_prob and truth are
            dropped before evaluation, and the prediction is cast to
            theano.config.floatX.

            NOTE: currentResponse is not a parameter; it is read from the
            enclosing scope, so this function only works where that name is
            visible.

            The process exits with status 1 when the label type of the response
            is not supported.
            """
            ## remove the padded region
            croppedPred = T.cast(pred_prob[pad_len:, pad_len:],
                                 dtype=theano.config.floatX)
            croppedTruth = truth[pad_len:, pad_len:]

            labelName, labelType, subType = ParseResponse(currentResponse)
            symmetric = config.IsSymmetricLabel(labelName)

            ## orientation responses are handled separately from distance-like ones
            if labelName in config.allOrientationNames:
                if not config.IsDiscreteLabel(labelType):
                    print 'ERROR: unsupported label type for orientation matrix prediction: ', currentResponse
                    exit(1)

                numLabels = GetResponseProbDims(currentResponse)
                ## Plus/Minus subtypes give up one more label at the tail of the label set
                hasExtraLabel = subType.endswith('Plus') or subType.endswith(
                    'Minus')
                largestValidLabel = numLabels - (2 if hasExtraLabel else 1)

                return TopAccuracyOrientation(
                    pred=croppedPred,
                    truth=croppedTruth,
                    largestValidLabel=largestValidLabel,
                    symmetric=symmetric)

            ## 'LogNormal' has to be tested ahead of 'Normal'
            if labelType.startswith('LogNormal'):
                return TopAccuracyLogNormal(pred=croppedPred,
                                            truth=croppedTruth,
                                            symmetric=symmetric)

            if labelType.startswith('Normal'):
                return TopAccuracyNormal(pred=croppedPred,
                                         truth=croppedTruth,
                                         symmetric=symmetric)

            if labelType.startswith('Discrete'):
                if subType.startswith('2C'):
                    return TopAccuracy2C(pred=croppedPred,
                                         truth=croppedTruth,
                                         symmetric=symmetric)

                return TopAccuracyMultiC(pred=croppedPred,
                                         truth=croppedTruth,
                                         subType=subType,
                                         symmetric=symmetric)

            print 'ERROR: unsupported label type in EvaluateAccuracy: ', labelType
            exit(1)
def DeriveOriContactMatrix(predOriMatrix, response):
    """Derive a contact matrix from a predicted discrete orientation matrix.

    The result is the sum of predOriMatrix over its third axis, restricted to
    the labels with index below largestValidLabel. That bound is the number
    of labels of the response minus 2 when the subtype ends with 'Plus' or
    'Minus', and minus 1 otherwise.

    The process exits with status 1 when the response is not a discrete
    orientation response.
    """
    labelName, labelType, subType = config.ParseResponse(response)

    ## the result is not used below; the call is kept as in the original code
    symmetric = config.IsSymmetricLabel(labelName)

    isOrientation = labelName in config.allOrientationNames
    if not isOrientation:
        print 'ERROR: unsupported orientation label name in', response
        exit(1)

    isDiscrete = config.IsDiscreteLabel(labelType)
    if not isDiscrete:
        print 'ERROR: unsupported orientation label type in', response
        exit(1)

    numLabels = config.GetResponseProbDims(response)
    hasExtraLabel = subType.endswith('Plus') or subType.endswith('Minus')
    largestValidLabel = numLabels - (2 if hasExtraLabel else 1)

    ## accumulate the probability mass of the labels kept
    validPart = predOriMatrix[:, :, :largestValidLabel]
    return np.sum(validPart, axis=2)
def AddLabel2OneBatch(names,
                      batch,
                      modelSpecs,
                      sharedLabelPool,
                      sharedLabelWeightPool,
                      floatType=theano.config.floatX):

    numSeqs = len(names)
    for name in names:
        if (not sharedLabelPool.has_key(name)) or (
                not sharedLabelWeightPool.has_key(name)):
            print 'the label or label weight matrix does not exist for protein ', name
            exit(1)

    seqLens = [sharedLabelWeightPool[name].shape[0] for name in names]

    ## get the boundingbox for this batch
    if not config.TrainByRefLoss(modelSpecs):
        box = batch[-1]
    else:
        box = batch[-2]

    top, left, bottom, right = box
    assert bottom - top == right - left
    boxsize = bottom - top

    if boxsize < max(seqLens) and numSeqs > 1:
        ## make sure that there is only one protein in this batch
        print 'ERROR: when one batch has a large protein, it can only have one protein'
        exit(1)

    ## we crop pairwise labels at this step to save memory and computational time
    maxMatrixSize = min(boxsize, max(seqLens))

    ## Y shall be a list of 2D or 3D matrices, each for one response
    Y = []
    for response in modelSpecs['responses']:
        labelName, labelType, _ = ParseResponse(response)
        dataType = np.int16
        if not config.IsDiscreteLabel(labelType):
            dataType = floatType
        rValDims = GetResponseValueDims(response)
        if rValDims == 1:
            y = np.zeros(shape=(numSeqs, maxMatrixSize, maxMatrixSize),
                         dtype=dataType)
            Y.append(y)

        else:
            y = np.zeros(shape=(numSeqs, maxMatrixSize, maxMatrixSize,
                                rValDims),
                         dtype=dataType)
            Y.append(y)

    ## when Y is empty, weight is useless. So When Y is empty, weight shall also be empty
    weightMatrix = []
    if bool(Y) and config.UseSampleWeight(modelSpecs):
        weightMatrix = [
            np.zeros(shape=(numSeqs, maxMatrixSize, maxMatrixSize),
                     dtype=floatType)
        ] * len(modelSpecs['responses'])

    for j, name, seqLen in zip(range(len(names)), names, seqLens):

        ## we align all matrices in the bottom/right corner
        ## posInX and posInY are the starting position of one protein in the final output tensor
        ## here X and Y refer to x-axis and y-axis
        posInX = -min(boxsize, seqLen)
        posInY = -min(boxsize, seqLen)

        for y, response in zip(Y, modelSpecs['responses']):

            if boxsize < seqLen:
                tmp = sharedLabelPool[name][response][top:bottom, left:right]
            else:
                tmp = sharedLabelPool[name][response]
            if len(y.shape) == 3:
                y[j, posInX:, posInY:] = tmp
            else:
                y[j, posInX:, posInY:, ] = tmp

        labelWeightMatrix = sharedLabelWeightPool[name]
        for w, response in zip(weightMatrix, modelSpecs['responses']):
            if boxsize < seqLen:
                w[j, posInX:,
                  posInY:] = labelWeightMatrix[response][top:bottom,
                                                         left:right]
            else:
                w[j, posInX:, posInY:] = labelWeightMatrix[response]

    ## the input batch contains bounding box
    tail = 1

    ## check to see if the input batch contains one flag for RefState
    if config.TrainByRefLoss(modelSpecs):
        tail += 1

    newbatch = batch[:-tail]
    newbatch.extend(Y)
    newbatch.extend(weightMatrix)
    newbatch.extend(batch[-tail:])

    return newbatch
def AssembleOneBatch(data,
                     modelSpecs,
                     forRefState=False,
                     bounds=None,
                     floatType=theano.config.floatX,
                     bUseSharedMemory=False):
    if not data:
        print 'WARNING: the list of data is empty'
        return None

    numSeqs = len(data)
    seqLens = [d['seqLen'] for d in data]
    names = [d['name'] for d in data]

    ## use maxSeqLen and minSeqLen for sequential features
    ## we do not crop sequential features at this step since the theano deep model will do so after 1D convolution operation
    maxSeqLen = max(seqLens)
    minSeqLen = min(seqLens)
    #print 'maxSeqLen= ', maxSeqLen, 'minSeqLen= ', minSeqLen

    numSeqFeatures = FeatureUtils.DetermineNumSeqFeatures(
        data[0]['seqFeatures'])
    X1d = np.zeros(shape=(numSeqs, maxSeqLen, numSeqFeatures), dtype=floatType)

    numMatrixFeatures = FeatureUtils.DetermineNumMatrixFeatures(
        data[0]['matrixFeatures']) + FeatureUtils.DetermineNumMatrixFeatures(
            data[0]['matrixFeatures_nomean'])
    ## we use maxMatrixSize and minMatrixSize for pairwise features
    ## we crop pairwise features at this step to save memory and computational time
    minMatrixSize, maxMatrixSize = CalcMinMaxMatrixSize(bounds, seqLens)

    if bUseSharedMemory:
        shmX2d = SharedNDArray(
            (numSeqs, maxMatrixSize, maxMatrixSize, numMatrixFeatures),
            dtype=floatType,
            name='/RaptorX-' + str(os.getppid()) + '-X2d-' + randomString(6))
        X2d = shmX2d.array
        X2d[:] = 0
    else:
        X2d = np.zeros(shape=(numSeqs, maxMatrixSize, maxMatrixSize,
                              numMatrixFeatures),
                       dtype=floatType)

    X1dem = None
    if data[0].has_key('embedFeatures'):
        numEmbedFeatures = data[0]['embedFeatures'].shape[1]
        X1dem = np.zeros(shape=(numSeqs, maxSeqLen, numEmbedFeatures),
                         dtype=floatType)

    ## Y shall be a list of 2D or 3D matrices, each for one response
    Y = []
    if data[0].has_key('atomLabelMatrix'):
        for response in modelSpecs['responses']:
            labelName, labelType, _ = ParseResponse(response)
            dataType = np.int16
            if not config.IsDiscreteLabel(labelType):
                dataType = floatType
            rValDims = GetResponseValueDims(response)
            if rValDims == 1:
                y = np.zeros(shape=(numSeqs, maxMatrixSize, maxMatrixSize),
                             dtype=dataType)
                Y.append(y)

            else:
                y = np.zeros(shape=(numSeqs, maxMatrixSize, maxMatrixSize,
                                    rValDims),
                             dtype=dataType)
                Y.append(y)

    ## when Y is empty, weight is useless. So When Y is None, weight shall also be None
    weightMatrix = []
    if bool(Y) and config.UseSampleWeight(modelSpecs):
        weightMatrix = [
            np.zeros(shape=(numSeqs, maxMatrixSize, maxMatrixSize),
                     dtype=floatType)
        ] * len(modelSpecs['responses'])

    ## for mask. we do not used shared ndarray for them since they are small
    M1d = np.zeros(shape=(numSeqs, maxSeqLen - minSeqLen), dtype=np.int8)
    M2d = np.zeros(shape=(numSeqs, maxMatrixSize - minMatrixSize,
                          maxMatrixSize),
                   dtype=np.int8)

    if bounds is not None:
        boxes = bounds
    else:
        boxes = [None] * len(data)

    for j, d, box in zip(range(len(data)), data, boxes):
        seqLen = d['seqLen']

        ## posInSeq, posInX and posInY are the starting position of one protein in the final output tensor
        posInSeq = -seqLen

        ## here X and Y refer to x-axis and y-axis
        if box is not None:
            top, left, bottom, right = box
            posInX = -(bottom - top)
            posInY = -(right - left)
        else:
            posInX = -seqLen
            posInY = -seqLen

        if forRefState:
            ## this code needs reexamination, it may not be correct when d['seqFeatures']/d['matrixFeatures'] is represented as a list of arrays instead of a single array
            X1d[j, posInSeq:, :] = np.array(
                [modelSpecs['seqFeatures_expected']] * seqLen).reshape(
                    (seqLen, -1))

            tmp = [modelSpecs['matrixFeatures_expected']] * (seqLen * seqLen)
            tmp2 = np.array(tmp).reshape((seqLen, seqLen, -1))
            tmp3 = np.concatenate((tmp2, d['matrixFeatures_nomean']), axis=2)
            if box is not None:
                X2d[j, posInX:, posInY:, :] = tmp3[top:bottom, left:right, ]
            else:
                X2d[j, posInX:, posInY:, :] = tmp3
        else:
            if isinstance(d['seqFeatures'], np.ndarray):
                X1d[j, posInSeq:, :] = d['seqFeatures']
            else:
                startPos = 0
                for f in d['seqFeatures']:
                    if len(f.shape) == 1:
                        X1d[j, posInSeq:,
                            startPos:startPos + 1] = f[:, np.newaxis]
                        startPos += 1
                    elif len(f.shape) == 2:
                        X1d[j, posInSeq:, startPos:startPos + f.shape[1]] = f
                        startPos = startPos + f.shape[1]
                    else:
                        print 'wrong shape in sequential feature: ', f.shape
                        exit(1)

            # add 2D features in matrixFeatures to holder staring from the start position
            # holder is a 3D array and start is the starting position in the 3rd dimension
            def Add2DFeatures(matrixFeatures, holder, start):
                if isinstance(matrixFeatures, np.ndarray):
                    features = [matrixFeatures]
                else:
                    features = matrixFeatures

                startPos = start
                #for f in matrixFeatures:
                for f in features:
                    if len(f.shape) == 2:
                        endPos = startPos + 1
                        if box is None:
                            holder[:, :, startPos:endPos] = f[:, :, np.newaxis]
                        else:
                            holder[:, :,
                                   startPos:endPos] = f[top:bottom, left:right,
                                                        np.newaxis]
                    elif len(f.shape) == 3:
                        endPos = startPos + f.shape[2]
                        if box is None:
                            holder[:, :, startPos:endPos] = f
                        else:
                            holder[:, :, startPos:endPos] = f[top:bottom,
                                                              left:right, :]
                    else:
                        print 'wrong shape in matrixFeatures: ', f.shape
                        exit(1)
                    startPos = endPos

                return endPos

            end = Add2DFeatures(d['matrixFeatures'], X2d[j, posInX:,
                                                         posInY:, :], 0)
            Add2DFeatures(d['matrixFeatures_nomean'], X2d[j, posInX:,
                                                          posInY:, :], end)

        M1d[j, posInSeq:].fill(1)
        M2d[j, posInX:, posInY:].fill(1)

        if X1dem is not None:
            ## embed feature is always represented as a single array, so the code shall be correct
            if forRefState:
                X1dem[j, posInSeq:, :] = np.array(
                    [modelSpecs['embedFeatures_expected']] * seqLen).reshape(
                        (seqLen, -1))
            else:
                X1dem[j, posInSeq:, :] = d['embedFeatures']

        for y, response in zip(Y, modelSpecs['responses']):
            if box is not None:
                tmp = d['atomLabelMatrix'][response][top:bottom, left:right]
            else:
                tmp = d['atomLabelMatrix'][response]
            if len(y.shape) == 3:
                y[j, posInX:, posInY:] = tmp
            else:
                y[j, posInX:, posInY:, ] = tmp

        if bool(weightMatrix):
            if d.has_key('labelWeightMatrix'):
                labelWeightMatrix = d['labelWeightMatrix']
            else:
                labelWeightMatrix = LabelUtils.CalcLabelWeightMatrix(
                    d['atomLabelMatrix'], modelSpecs, floatType=floatType)

        for w, response in zip(weightMatrix, modelSpecs['responses']):
            if box is not None:
                w[j, posInX:,
                  posInY:] = labelWeightMatrix[response][top:bottom,
                                                         left:right]
            else:
                w[j, posInX:, posInY:] = labelWeightMatrix[response]

    if bUseSharedMemory:
        onebatch = [X1d, shmX2d, M1d, M2d]
    else:
        onebatch = [X1d, X2d, M1d, M2d]

    if X1dem is not None:
        onebatch.append(X1dem)

    onebatch.extend(Y)
    onebatch.extend(weightMatrix)

    return onebatch, names
# Example #5
# 0
        elif bPrintOtherAtomPairs:
                contactFileName = filename + '.' + apt + '.CM.txt'
                contactCASPFileName = filename + '.' + apt + '.CASP.rr'
	else:
		continue

	contactFile = os.path.join(savefolder, contactFileName)
        np.savetxt(contactFile, m, fmt='%1.6f', delimiter=' ')

	contactCASPFile = os.path.join(savefolder, contactCASPFileName)
	if contactOnly:
        	ContactUtils.SaveContactMatrixInCASPFormat(targetName, sequence, m, contactCASPFile, distMatrix=None, probScaleFactor=1)
		continue

	responses = FindStringsStartWith(distProbMatrix.keys(), apt)
	if len(responses) != 1:
		## right now for one apt, only one response is allowed
		print 'ERROR: incorrect distance information for', apt, 'in', predFile
		exit(1)

	response = responses[0]
	labelName, labelType, subType = config.ParseResponse(response)

	if not config.IsDiscreteLabel(labelType):
		print 'ERROR: right now only discrete distance matrix is supported'
		exit(1)

	## convert distance matrix to what's needed by CASP
	distMatrix = DistanceUtils.MergeDistanceBinsBySum(distProbMatrix[response], config.distCutoffs[subType], config.distCutoffs['10C'])
        ContactUtils.SaveContactMatrixInCASPFormat(targetName, sequence, m, contactCASPFile, distMatrix=distMatrix, probScaleFactor=1)
def main(argv):
    newName = None
    savefolder = os.getcwd()

    try:
        opts, args = getopt.getopt(argv, "s:n:", ["savefolder=", "name="])
        #print opts, args
    except getopt.GetoptError:
        Usage()
        exit(1)

    if len(args) < 2:
        Usage()
        exit(1)

    baseMatrixFile = args[0]
    subMatrixFiles = args[1:]

    for opt, arg in opts:
        if opt in ("-s", "--savefolder"):
            savefolder = arg
        elif opt in ("-n", "--name"):
            newName = arg
        else:
            Usage()
            exit(1)

    baseMatrix = DistanceUtils.LoadRawDistProbFile(baseMatrixFile)
    sequence = baseMatrix[1]
    targetName = baseMatrix[0]

    ## baseMatrix and subMatrix are a tuple of 6 items
    subMatrices = []
    for subMatrixFile in subMatrixFiles:
        subMatrix = DistanceUtils.LoadRawDistProbFile(subMatrixFile)

        ## make sure that both matrix files are of the same type, although they may not equal
        if baseMatrix[4] is None:
            assert (subMatrix[4] is None)
        if baseMatrix[4] is not None:
            assert (subMatrix[4] is not None)

        subMatrices.append(subMatrix)

    ## new distance and contact matrices with response as the keys
    newDistMatrices = {}
    counterMatrices = {}

    ## initialize
    for response, m in baseMatrix[2].iteritems():
        newDistMatrices[response] = deepcopy(m)
        counterMatrices[response] = np.ones(m.shape[:2], dtype=np.int32)

    ## add submatrices onto newDistMatrices
    for subMatrix, smfile in zip(subMatrices, subMatrixFiles):
        print 'Adding submatrix in ', smfile

        subSequence = subMatrix[1]

        ## try to find its position in the original sequence by assumming that this domain has only one seq segment
        index = sequence.find(subSequence)
        if index >= 0:
            for response, m in subMatrix[2].iteritems():
                if not newDistMatrices.has_key(response):
                    print 'WARNING: the original matrix does not have response', response, ' in subMatrixFile:', smfile
                    continue

                AddSubMatrix(newDistMatrices[response], m, index)
                AddSubMatrix(counterMatrices[response],
                             np.ones(m.shape[:2], dtype=np.int32), index)
        else:
            ## try to find its positions in the original sequence by assuming that this domain has two seq segments
            res = FindIndexBySegments(sequence, subSequence)
            if res is None:
                print 'ERROR: cannot map domain sequence to the whole chain sequence!'
                print '    domain Seq= ', subSequence
                print '    chain  Seq= ', sequence
                exit(1)

            for response, m in subMatrix[2].iteritems():
                if not newDistMatrices.has_key(response):
                    print 'WARNING: the original matrix does not have response', response, ' in subMatrixFile:', smfile
                    continue
                AddSubMatrixBySegments(newDistMatrices[response],
                                       m,
                                       starts=res[0],
                                       sizes=res[1])
                AddSubMatrix(counterMatrices[response],
                             np.ones(m.shape[:2], dtype=np.int32),
                             starts=res[0],
                             sizes=res[1])

    ## final processing
    for response, m in newDistMatrices.iteritems():
        newDistMatrices[response] = np.divide(
            newDistMatrices[response], counterMatrices[response][:, :,
                                                                 np.newaxis])

    ## convert distMatrix to contactMatrix
    newContMatrices = {}

    for response, m in newDistMatrices.iteritems():

        ## derive contact matrix from distance matrix
        labelName, labelType, subType = config.ParseResponse(response)
        if not config.IsDiscreteLabel(labelType):
            print 'ERROR: unsupported labelType by ReplaceSubDistMatrix.py: ', labelType
            exit(1)

        if labelName in config.allAtomPairNames:
            labelOf8 = DistanceUtils.LabelsOfOneDistance(
                config.ContactDefinition, config.distCutoffs[subType])
            newContMatrices[labelName] = ContactUtils.Distance2Contact(
                m, labelOf8)

        elif labelName in config.allOrientationNames:
            newContMatrices[
                labelName] = OrientationUtils.DeriveOriContactMatrix(
                    m, response)
        else:
            print 'ERROR: unsupported labelName in replaceSubDistMatrix(): ', labelName
            exit(1)

    content4save = (targetName, sequence, newDistMatrices, newContMatrices,
                    baseMatrix[4], baseMatrix[5])

    ## save the new result
    if newName is None:
        fileName = os.path.basename(baseMatrixFile).split('.')[0] + '-mixed'
    else:
        fileName = newName
    savefile = os.path.join(savefolder, fileName + '.predictedDistMatrix.pkl')
    with open(savefile, 'wb') as fh:
        cPickle.dump(content4save, fh, protocol=cPickle.HIGHEST_PROTOCOL)
def CalcLabelWeight(modelSpecs):
    print 'Calculating label weight ...'

    numRanges = RangeNWeight.GetNumRanges(modelSpecs)

    RangeNWeight.SetWeight4Range(modelSpecs)
    #print 'weight for range: ', modelSpecs['weight4range']

    RangeNWeight.SetWeight43C2C(modelSpecs)
    #print 'LRbias= ', modelSpecs['LRbias']
    #print 'weight43C= ', modelSpecs['weight4Discrete3C']

    allRefProbs = modelSpecs['labelDistributions']
    ##for discrete labels, we calculate their weights by inferring from the weight intialized to 3 bins: 0-8, 8-15 and >15 or -1, which makes inference easier
    modelSpecs['weight4labels'] = dict()

    for response in modelSpecs['responses']:
        labelName, labelType, subType = config.ParseResponse(response)
        numLabels = GetResponseProbDims(response)

        if config.IsContinuousLabel(labelType):
            ## just need to assign range weight for continuous response
            modelSpecs['weight4labels'][response] = modelSpecs[
                'weight4continuous']
            continue

        if not config.IsDiscreteLabel(labelType):
            print 'ERROR: unsupported response in CalcLabelWeight: ', response
            exit(1)

        if labelName in config.allOrientationNames or config.NoWeight4Label(
                modelSpecs):
            modelSpecs['weight4labels'][response] = np.multiply(
                np.ones((numRanges, numLabels), dtype=np.float32),
                modelSpecs['weight4range'])

        elif labelName in ['HB', 'Beta']:
            ## if the response is for HB and Beta-Pairing
            if subType.startswith('2C'):
                modelSpecs['weight4labels'][response] = modelSpecs['weight4' +
                                                                   response]
            else:
                print 'ERROR: unsupported label subtype in CalcLabelWeight: ', response
                exit(1)

        elif labelName in config.allAtomPairNames:
            ## calculate label weight for atom pairs Cb-Cb, Ca-Ca, Cg-Cg, CaCg, and NO
            if subType.startswith('2C'):
                print 'ERROR: 2C is not supported for contact/distance prediction any more'
                exit(1)
            elif subType.startswith('3C'):
                ## if 3C is used for the response
                modelSpecs['weight4labels'][response] = modelSpecs[
                    'weight4Discrete3C']
            else:
                modelSpecs['weight4labels'][
                    response] = DistanceUtils.CalcLabelWeight(
                        modelSpecs['weight4Discrete3C'], allRefProbs[response],
                        config.distCutoffs[subType])

        else:
            print 'ERROR: unsupported label name in CalcLabelWeight: ', response
            exit(1)

        ## set the weight of the label for the invalid entry (distance or orientation) to 0
        if subType.endswith('Minus'):
            modelSpecs['weight4labels'][response][:, -1] = 0
    """
	## for log
	for response in modelSpecs['responses']:
		print 'weight4labels for response: ', response
		print modelSpecs['weight4labels'][response]
	"""

    return modelSpecs['weight4labels']