예제 #1
0
def plotHistogramOverall():
    """Plot residue-pair count maps per secondary structure type and overall.

    The three secondary structure types (' ', 'S' and 'H') are handled
    first, followed by the overall set (ssType None). For each of them an
    n x n (n = 20) count matrix is filled: element [r, c] is the sum of the
    histogram stored in hPlot for resType common20AAList[c] combined with
    resTypePrev common20AAList[r]; pairs without a histogram stay zero.

    Per ssType two figures are saved with savefig:
        plotHistogram_<ssType>_d1d2.png   the masked, colour mapped matrix
        plotHistOfHist_<ssType>_d1d2.png  a 20 bin histogram of the counts

    Returns the count matrix of the last set processed (the overall one).
    """
    graphicsFormat = "png"
    alpha = 0.8  # looks awful with alpha = 1
    n = 20
    extent = (0, n) + (0, n)
    cmapList = [green_inv, blue_inv, yellow_inv]
    colorList = ['green', 'blue', 'yellow']
    i = 1  # decides on color picked.
    offset = 0.5  # shifts the ticks from the cell edge into the cell.

    # The axis labels do not depend on ssType; look them up only once.
    tickList = [
        NTdb.getResidueDefByName(resType).shortName
        for resType in common20AAList
    ]

    # First the individual ssTypes, then the overall set.
    for doOverall in [False, True]:
        if doOverall:
            ssTypeList = [None]
        else:
            ssTypeList = [' ', 'S', 'H']

        for ssType in ssTypeList:
            m = zeros((n, n), dtype=int)
            for r, resTypePrev in enumerate(common20AAList):
                for c, resType in enumerate(common20AAList):
                    if doOverall:
                        hist1 = getDeepByKeys(hPlot.histd1ByResTypes, resType,
                                              resTypePrev)
                    else:
                        hist1 = getDeepByKeys(hPlot.histd1BySs0AndResTypes,
                                              ssType, resType, resTypePrev)
                    # Identity test on purpose: '== None' on an array-like
                    # histogram may be evaluated element by element.
                    if hist1 is None:
                        nTdebug('skipping for hist1 is empty for [%s] [%s]' %
                                (resType, resTypePrev))
                        continue
                    m[r, c] = sum(hist1)

            clf()

            xlabel('resType')
            ylabel('resTypePrev')
            xlim((0, n))
            ylim((0, n))
            xticks(arange(offset, n), tickList)
            yticks(arange(offset, n), tickList)
            grid(True)
            strTitle = "ssType: [%s]" % ssType
            title(strTitle)
            plot([0, n], [0, n], 'b-', linewidth=1)  # the diagonal.

            # Colour scale limits; a single ssType gets a third of the
            # limits used for the overall set.
            minCount = 300.
            maxCount = 1000.
            if ssType:
                minCount /= 3.
                maxCount /= 3.
            maxHist = amax(m)
            minHist = amin(m)
            sumHist = sum(m)
            nTmessage('ssType: %s' % ssType)
            nTmessage('maxHist: %s' % maxHist)
            nTmessage('minHist: %s' % minHist)
            nTmessage('sumHist: %s' % sumHist)

            # Cells at or below minCount are masked and drawn in the 'bad'
            # colour of the palette.
            his = masked_where(m <= minCount, m, copy=1)

            palette = cmapList[i]
            palette.set_under(color='red', alpha=1.0)
            # Important to make it a hard alpha; last plotted will rule.
            palette.set_over(color=colorList[i], alpha=1.0)
            palette.set_bad(color='red', alpha=1.0)

            norm = Normalize(vmin=minCount, vmax=maxCount, clip=True)
            imshow(
                his,
                interpolation='nearest',
                origin='lower',
                extent=extent,
                alpha=alpha,
                cmap=palette,
                norm=norm)

            fn = "plotHistogram_%s_d1d2.%s" % (ssType, graphicsFormat)
            savefig(fn)

            # Second figure: distribution of the pair counts themselves.
            clf()
            a = m.reshape(n * n)
            hist(a, 20)
            xlabel('pair count')
            ylabel('number of occurrences')
            title(strTitle)
            fn = "plotHistOfHist_%s_d1d2.%s" % (ssType, graphicsFormat)
            savefig(fn)
        # end loop over ssType
    # end loop over doOverall
    return m
예제 #2
0
파일: pdb.py 프로젝트: jakesyl/cing
    def matchResidue2Cing(self, res):
        """
        Match res to the CING database using the previously defined convention.

        Residue names that are ambiguous between PDB file formats (ARG, ASP,
        GLU, HIS and LYS) are resolved by examining crucial atom names; for
        these the CYANA (==DIANA) names are used for the conversion to
        INTERNAL (i.e. these names will not likely change).

        res is a NTtree object with attribute resName and one attribute for
        every atom it includes (HA2, CD1, ...). This routine sets on res:
            db      the matching NTdb resDef object or None
            skip    True when the residue is to be ignored

        A residue that could not be matched is, when
        self.allowNonStandardResidue is set, looked up by its INTERNAL name
        and otherwise appended to NTdb as a new residue definition.

        Return the NTdb resDef object or None when the residue is skipped or
        on error.
        """

        res.db = None
        res.skip = False

        # Residue names that are ambiguously defined by different PDB file
        # formats; pick the CYANA name from the protons that are present.
        resName3 = res.resName[0:3]
        cyanaName = None
        if resName3 == 'ARG':
            # '1HH' is the second naming set for CYANA 1.x, AMBER.
            if 'HH1' in res or '1HH' in res:
                cyanaName = 'ARG'
            else:
                # Default protonated; this also assures most common for X-ray without protons
                cyanaName = 'ARG+'
        elif resName3 == 'ASP':
            if 'HD2' in res:
                cyanaName = 'ASP'
            else:
                # Default deprot; this also assures most common for X-ray without protons
                cyanaName = 'ASP-'
        elif resName3 == 'GLU':
            if 'HE2' in res:
                cyanaName = 'GLU'
            else:
                # Default deprot; this also assures most common for X-ray without protons
                cyanaName = 'GLU-'
        elif resName3 == 'HIS':
            if 'HE2' not in res:
                # Default HD1
                cyanaName = 'HIS'
            elif 'HD1' in res:
                cyanaName = 'HIS+'
            else:
                cyanaName = 'HIST'
        elif resName3 == 'LYS':
            # '1HZ'/'3HZ' is the second naming set for CYANA 1.x
            if ('HZ1' in res and 'HZ3' not in res) or ('1HZ' in res and '3HZ' not in res):
                cyanaName = 'LYS'
            else:
                # Default prot; this also assures most common for X-ray without protons
                cyanaName = 'LYS+'
        #end if

        if cyanaName is not None:
            res.db = NTdb.getResidueDefByName(cyanaName, convention = CYANA)
        elif res.resName in CYANA_NON_RESIDUES:
            res.skip = True
        elif res.resName == 'HOH' and self.skipWaters:
            res.skip = True
        else:
            res.db = NTdb.getResidueDefByName(res.resName, convention = self.convention)
        #end if

        if res.db:
            return res.db

        # A residue flagged to be skipped must not be searched for any further
        # nor be appended to NTdb as a new residue definition below.
        if res.skip:
            return res.db

        # Only continue the search if not found and non-standard residues are allowed.
        if not self.allowNonStandardResidue:
            res.skip = True
            return res.db

        # Try to match the residue using INTERNAL convention.
        res.db = NTdb.getResidueDefByName(res.resName)
        if res.db:
            return res.db

        # Insert new residue.
        res.db = NTdb.appendResidueDef(name = res.resName, shortName = '_', comment='From parsing PDB file')
        if not res.db:
            nTcodeerror("Adding a non-standard residue should have been possible.")
            return None
        res.db.nameDict[self.convention] = res.resName

        # Just a check, disable for speed.
        _x = NTdb.getResidueDefByName(res.resName)
        if not _x:
            nTcodeerror("Added residue but failed to find it again in pdbParser#_matchResidue2Cing")

        return res.db
예제 #3
0
파일: d1d2plot.py 프로젝트: VuisterLab/cing
def plotHistogramOverall():
    """Plot residue-pair count maps per secondary structure type and overall.

    The three secondary structure types (' ', 'S' and 'H') are handled
    first, followed by the overall set (ssType None). For each of them an
    n x n (n = 20) count matrix is filled: element [r, c] is the sum of the
    histogram stored in hPlot for resType common20AAList[c] combined with
    resTypePrev common20AAList[r]; pairs without a histogram stay zero.

    Per ssType two figures are saved with savefig:
        plotHistogram_<ssType>_d1d2.png   the masked, colour mapped matrix
        plotHistOfHist_<ssType>_d1d2.png  a 20 bin histogram of the counts

    Returns the count matrix of the last set processed (the overall one).
    """
    graphicsFormat = "png"
    alpha = 0.8 # looks awful with alpha = 1
    n = 20
    extent = (0, n) + (0, n)
    cmapList = [ green_inv, blue_inv, yellow_inv ]
    colorList = [ 'green', 'blue', 'yellow' ]
    i = 1 # decides on color picked.
    offset = 0.5 # shifts the ticks from the cell edge into the cell.

    # The axis labels do not depend on ssType; look them up only once.
    tickList = [ NTdb.getResidueDefByName(resType).shortName for resType in common20AAList ]

    # First the individual ssTypes, then the overall set.
    for doOverall in [ False, True ]:
        if doOverall:
            ssTypeList = [ None ]
        else:
            ssTypeList = [' ', 'S', 'H']

        for ssType in ssTypeList:
            m = zeros((n, n), dtype=int)
            for r, resTypePrev in enumerate(common20AAList):
                for c, resType in enumerate(common20AAList):
                    if doOverall:
                        hist1 = getDeepByKeys(hPlot.histd1ByResTypes, resType, resTypePrev)
                    else:
                        hist1 = getDeepByKeys(hPlot.histd1BySs0AndResTypes, ssType, resType, resTypePrev)
                    # Identity test on purpose: '== None' on an array-like
                    # histogram may be evaluated element by element.
                    if hist1 is None:
                        nTdebug('skipping for hist1 is empty for [%s] [%s]' % (resType, resTypePrev))
                        continue
                    m[r, c] = sum(hist1)

            clf()

            xlabel('resType')
            ylabel('resTypePrev')
            xlim((0, n))
            ylim((0, n))
            xticks(arange(offset, n), tickList)
            yticks(arange(offset, n), tickList)
            grid(True)
            strTitle = "ssType: [%s]" % ssType
            title(strTitle)
            plot([0, n], [0, n], 'b-', linewidth=1) # the diagonal.

            # Colour scale limits; a single ssType gets a third of the
            # limits used for the overall set.
            minCount = 300.
            maxCount = 1000.
            if ssType:
                minCount /= 3.
                maxCount /= 3.
            maxHist = amax(m)
            minHist = amin(m)
            sumHist = sum(m)
            nTmessage('ssType: %s' % ssType)
            nTmessage('maxHist: %s' % maxHist)
            nTmessage('minHist: %s' % minHist)
            nTmessage('sumHist: %s' % sumHist)

            # Cells at or below minCount are masked and drawn in the 'bad'
            # colour of the palette.
            his = masked_where(m <= minCount, m, copy=1)

            palette = cmapList[i]
            palette.set_under(color='red', alpha=1.0)
            # Important to make it a hard alpha; last plotted will rule.
            palette.set_over(color=colorList[i], alpha=1.0)
            palette.set_bad(color='red', alpha=1.0)

            norm = Normalize(vmin=minCount, vmax=maxCount, clip=True)
            imshow(his,
                    interpolation='nearest',
                    origin='lower',
                    extent=extent,
                    alpha=alpha,
                    cmap=palette,
                    norm=norm)

            fn = "plotHistogram_%s_d1d2.%s" % (ssType, graphicsFormat)
            savefig(fn)

            # Second figure: distribution of the pair counts themselves.
            clf()
            a = m.reshape(n * n)
            hist(a, 20)
            xlabel('pair count')
            ylabel('number of occurrences')
            title(strTitle)
            fn = "plotHistOfHist_%s_d1d2.%s" % (ssType, graphicsFormat)
            savefig(fn)
        # end loop over ssType
    # end loop over doOverall
    return m
예제 #4
0
def main():
    """Convert the gzipped CSV dump into a pickled database of d1 histograms.

    Steps:
      1. Gunzip the CSV file and read at most lineCountMax rows, grouped by
         the entry id in the first column.
      2. Per entry, drop the rows with a b-factor above DEFAULT_MAX_BFACTOR
         and the DEFAULT_BFACTOR_PERCENTAGE_FILTER percent of rows that come
         last when sorted on the b-factor column.
      3. Collect the d1 values of residues that are siblings (isSibling) of
         the previously accepted residue, keyed by secondary structure type
         of residue i (Ss0) or i-1 (Ss1) and by residue types.
      4. Histogram every collection with hist(bins=binCount, range=xRange).
      5. For every ssType/resType/resTypePrev/resTypeNext combine the Ss0 and
         Ss1 histograms with multiply() and store the tuple returned by
         getEnsembleAverageAndSigmaHis, extended with the keys as a string.
      6. Pickle all histograms to dbase_file_abs_name (protocol 2).

    Returns None. Returns early, after reporting with nTerror, when a residue
    type is unknown to NTdb or a d1 value is out of range; the uncompressed
    CSV file is not removed in that case.
    """
    cvs_file_abs_name_gz = os.path.join(cingDirData, 'PluginCode', 'Whatif', cvs_file_abs_name + '.gz')
    gunzip(cvs_file_abs_name_gz)

    valueBySs0AndResTypes = {} # keys are SSi,   RTi, RTi-1
    valueBySs1AndResTypes = {} # keys are SSi-1, RTi, RTi-1
    valueByResTypes = {}
    valueBySs0 = {} # keys are SSi
    valueBySs1 = {} # keys are SSi-1
    histd1CtupleBySsAndResTypes = {}
    value = [] # NB is an array without being keyed.

    histd1BySs0AndResTypes = {} # keys are SSi,   RTi, RTi-1
    histd1BySs1AndResTypes = {} # keys are SSi-1, RTi, RTi-1
    histd1ByResTypes = {}
    histd1BySs0 = {}
    histd1BySs1 = {}

    # Read the rows grouped by entry; close the file before it is unlinked.
    linesByEntry = {}
    csvFile = open(cvs_file_abs_name, "rb")
    try:
        reader = csv.reader(csvFile, quoting=csv.QUOTE_NONE)
        lineCount = 0
        for row in reader:
            lineCount += 1
            if lineCount > lineCountMax:
                break
            linesByEntry.setdefault(row[0], []).append(row)
    finally:
        csvFile.close()

    skippedResTypes = []
    entryIdList = sorted(linesByEntry.keys())

    # Do some pre filtering.
    for entryId2 in entryIdList:
        lineList = linesByEntry[ entryId2 ]
        # Remember the original position of every row in an extra column.
        for idx, line in enumerate(lineList):
            line.append(idx)
        # NOTE(review): the rows hold the CSV strings; confirm that NTsort
        # compares the b-factor column numerically.
        lineListSorted = NTsort(lineList, BFACTOR_COLUMN, inplace=False)
        # Now throw away the worst 10 % of residues.
        n = len(lineListSorted)
        bad_count = int(round((n * DEFAULT_BFACTOR_PERCENTAGE_FILTER) / 100.))
        keep_count = n - bad_count
        # A set gives constant time membership tests in the loop below.
        badIdxSet = set([lineItem[IDX_COLUMN] for lineItem in lineListSorted[keep_count:n]])
        # Walk from the last row to the first so that a deletion never shifts
        # a row that still has to be visited; i stays the original position.
        for i in range(n - 1, -1, -1):
            max_bfactor = float(lineList[i][BFACTOR_COLUMN])
            if max_bfactor > DEFAULT_MAX_BFACTOR or i in badIdxSet:
                del lineList[i]
        removed_count = n - len(lineList)
        if removed_count < bad_count:
            nTwarning("Failed to remove at least %d residues" % bad_count)

    for entryId2 in entryIdList:
        prevChainId = None
        prevResType = None
        prevResNum = None
        prevSsType = None
        for row in linesByEntry[ entryId2 ]:
    #1zzk,A,GLN ,  17,E, 205.2, 193.6
    #1zzk,A,VAL ,  18,E, 193.6, 223.2
    #1zzk,A,THR ,  19,E, 223.2, 190.1
            (_entryId, chainId, resType, resNum, ssType, d1, _d2, _max_bfactor, _idx) = row
            resNum = int(resNum)
            ssType = to3StateDssp(ssType)[0]
            resType = resType.strip()
            db = NTdb.getResidueDefByName( resType )
            if not db:
                nTerror("resType not in db: %s" % resType)
                return
            resType = db.nameDict['IUPAC']
            d1 = floatParse(d1.strip())
            if isNaN(d1):
                continue
            if not inRange(d1):
                nTerror("d1 not in range for row: %s" % str(row))
                return

            if resType not in common20AAList:
                if resType not in skippedResTypes:
                    skippedResTypes.append( resType )
                continue

            if isSibling(chainId, resNum, prevChainId, prevResNum):
                appendDeepByKeys(valueBySs0AndResTypes, d1, ssType,     resType, prevResType)
                appendDeepByKeys(valueBySs1AndResTypes, d1, prevSsType, resType, prevResType)
                appendDeepByKeys(valueByResTypes, d1, resType, prevResType)
                appendDeepByKeys(valueBySs0, d1, ssType)
                appendDeepByKeys(valueBySs1, d1, prevSsType)
                value.append( d1 )
            prevResType = resType
            prevResNum = resNum
            prevChainId = chainId
            prevSsType = ssType

    os.unlink(cvs_file_abs_name)
    nTmessage("Skipped skippedResTypes: %r" % skippedResTypes )
    nTmessage("Got count of values: %r" % len(value) )
    # fill FOUR types of hist.
    # TODO: filter differently for pro/gly
    for isI in (True, False):
        if isI:
            valueBySs = valueBySs0
            valueBySsAndResTypes = valueBySs0AndResTypes
            histd1BySs = histd1BySs0
            histd1BySsAndResTypes = histd1BySs0AndResTypes
        else:
            valueBySs = valueBySs1
            valueBySsAndResTypes = valueBySs1AndResTypes
            histd1BySs = histd1BySs1
            histd1BySsAndResTypes = histd1BySs1AndResTypes
        # Each collection is walked by its own keys so that a ssType missing
        # from one of them (small data sets) can not cause a KeyError.
        for ssType in sorted(valueBySsAndResTypes.keys()):
            d1List = valueBySs[ssType]
            if not d1List:
                nTerror("Expected d1List from valueBySs[%s]" % (ssType))
                continue
            hist1d, _bins, _patches = hist(d1List, bins=binCount, range=xRange)
            nTmessage("Count %6d in valueBySs[%s]" % (sum(hist1d), ssType))
            setDeepByKeys(histd1BySs, hist1d, ssType)

            valueByResTypesOfSs = valueBySsAndResTypes[ssType]
            for resType in sorted(valueByResTypesOfSs.keys()):
                for prevResType in sorted(valueByResTypesOfSs[resType].keys()):
                    d1List = valueByResTypesOfSs[resType][prevResType]
                    if not d1List:
                        nTerror("Expected d1List from valueBySsAndResTypes[%s][%s][%s]" % (ssType, resType, prevResType))
                        continue
                    hist1d, _bins, _patches = hist(d1List, bins=binCount, range=xRange)
                    setDeepByKeys(histd1BySsAndResTypes, hist1d, ssType, resType, prevResType)
    # end for isI

    # Now that they are all in, combine the histograms per residue triplet.
    for ssType in sorted(valueBySs0AndResTypes.keys()):
        valueByResTypesOfSs = valueBySs0AndResTypes[ssType]
        # Use the residue types present for this very ssType instead of a key
        # list left over from the previous loop.
        for resType in sorted(valueByResTypesOfSs.keys()):
            keyListSorted3 = sorted(valueByResTypesOfSs[resType].keys())
            for resTypePrev in keyListSorted3:
                for resTypeNext in keyListSorted3:
                    hist1 = getDeepByKeys(histd1BySs0AndResTypes, ssType, resType, resTypePrev) # x-axis
                    # This was bug! It needs to be hashed on the ssType of resType -not- on resTypeNext
                    hist2 = getDeepByKeys(histd1BySs1AndResTypes, ssType, resTypeNext, resType)
                    # Identity tests on purpose: '== None' on an array-like
                    # histogram may be evaluated element by element.
                    if hist1 is None:
                        nTdebug('skipping for hist1 is empty for [%s] [%s] [%s]' % (ssType, resTypePrev, resType))
                        continue
                    if hist2 is None:
                        nTdebug('skipping for hist2 is empty for [%s] [%s] [%s]' % (ssType, resType, resTypeNext))
                        continue
                    m1 = mat(hist1,dtype='float')
                    m2 = mat(hist2,dtype='float')
                    m2 = m2.transpose() # pylint: disable=E1101
                    hist2d = multiply(m1,m2)

                    cTuple = getEnsembleAverageAndSigmaHis( hist2d )
                    (_c_av, c_sd, _hisMin, _hisMax) = cTuple #@UnusedVariable
                    cTuple += (str([ssType, resType, resTypePrev, resTypeNext]),) # append the hash keys as a way of id.
                    if c_sd is None:
                        nTdebug('Failed to get c_sd when testing not all residues are present in smaller sets.')
                        continue
                    if c_sd == 0.:
                        nTdebug('Got zero c_sd, ignoring histogram. This should only occur in smaller sets. Not setting values.')
                        continue
                    setDeepByKeys( histd1CtupleBySsAndResTypes, cTuple, ssType, resType, resTypePrev, resTypeNext)

    for resType in sorted(valueByResTypes.keys()):
        for prevResType in sorted(valueByResTypes[resType].keys()):
            d1List = valueByResTypes[resType][prevResType]
            if not d1List:
                nTerror("Expected d1List from valueByResTypes[%s][%s]" % (resType, prevResType))
                continue
            hist1d, _bins, _patches = hist(d1List, bins=binCount, range=xRange)
            setDeepByKeys(histd1ByResTypes, hist1d, resType, prevResType)

    histd1, _bins, _patches = hist(value, bins=binCount, range=xRange)
    nTmessage("Count %6d in value" % sum(histd1))

    dbase = {}
    dbase[ 'histd1BySs0AndResTypes' ] = histd1BySs0AndResTypes # 92 kb uncompressed in the case of ~1000 lines only
    dbase[ 'histd1BySs1AndResTypes' ] = histd1BySs1AndResTypes
    dbase[ 'histd1CtupleBySsAndResTypes' ] = histd1CtupleBySsAndResTypes
    dbase[ 'histd1ByResTypes' ] = histd1ByResTypes # 56 kb
    dbase[ 'histd1BySs0' ] = histd1BySs0 # 4 kb
    dbase[ 'histd1BySs1' ] = histd1BySs1
    dbase[ 'histd1' ] = histd1 #  4 kb

    if os.path.exists(dbase_file_abs_name):
        os.unlink(dbase_file_abs_name)
    output = open(dbase_file_abs_name, 'wb')
    try:
        cPickle.dump(dbase, output, 2)
    finally:
        # Close the file even when pickling fails.
        output.close()
예제 #5
0
def main():
    'See above.'
    cvs_file_abs_name_gz = os.path.join(cingDirData, 'PluginCode', 'Whatif',
                                        cvs_file_abs_name + '.gz')
    gunzip(cvs_file_abs_name_gz)
    reader = csv.reader(open(cvs_file_abs_name, "rb"), quoting=csv.QUOTE_NONE)
    valueBySs0AndResTypes = {}  # keys are SSi,   RTi, RTi-1
    valueBySs1AndResTypes = {}  # keys are SSi-1, RTi, RTi-1
    valueByResTypes = {}
    valueBySs0 = {}  # keys are SSi
    valueBySs1 = {}  # keys are SSi-1
    histd1CtupleBySsAndResTypes = {}
    value = []  # NB is an array without being keyed.

    histd1BySs0AndResTypes = {}  # keys are SSi,   RTi, RTi-1
    histd1BySs1AndResTypes = {}  # keys are SSi-1, RTi, RTi-1
    histd1ByResTypes = {}
    histd1BySs0 = {}
    histd1BySs1 = {}

    linesByEntry = {}
    lineCount = 0
    for row in reader:
        lineCount += 1
        if lineCount > lineCountMax:
            break
        entryId = row[0]
        if not linesByEntry.has_key(entryId):
            linesByEntry[entryId] = []
        linesByEntry[entryId].append(row)

    skippedResTypes = []
    entryIdList = linesByEntry.keys()
    entryIdList.sort()

    # Do some pre filtering.
    for entryId2 in entryIdList:
        lineList = linesByEntry[entryId2]
        for idx, line in enumerate(lineList):
            line.append(idx)
        lineListSorted = NTsort(lineList, BFACTOR_COLUMN, inplace=False)
        # Now throw away the worst 10 % of residues.
        n = len(lineListSorted)
        bad_count = int(round((n * DEFAULT_BFACTOR_PERCENTAGE_FILTER) / 100.))
        to_remove_count = n - bad_count
        #        nTmessage("Removing at least %d from %d residues" % (bad_count,n))
        badIdxList = [
            lineItem[IDX_COLUMN]
            for lineItem in lineListSorted[to_remove_count:n]
        ]
        iList = range(n)
        iList.reverse()
        for i in iList:
            lineItem = lineList[i]
            max_bfactor = float(lineItem[BFACTOR_COLUMN])
            if max_bfactor > DEFAULT_MAX_BFACTOR:
                #                nTdebug('Skipping because max bfactor in dihedral %.3f is above %.3f %s' % (max_bfactor, DEFAULT_MAX_BFACTOR, lineItem))
                del lineList[
                    i]  # TODO: check if indexing is still right or we shoot in the foot.
                continue
            if i in badIdxList:
                #                nTdebug('Skipping because bfactor worst %.3f %s' % (max_bfactor, lineItem))
                del lineList[i]
                continue
        removed_count = n - len(lineList)
        #        nTdebug("Reduced list by %d" % removed_count)
        if removed_count < bad_count:
            nTwarning("Failed to remove at least %d residues" % bad_count)

    for entryId2 in entryIdList:
        prevChainId = None
        prevResType = None
        prevResNum = None
        prevSsType = None
        for _r, row in enumerate(linesByEntry[entryId2]):
            #1zzk,A,GLN ,  17,E, 205.2, 193.6
            #1zzk,A,VAL ,  18,E, 193.6, 223.2
            #1zzk,A,THR ,  19,E, 223.2, 190.1
            (entryId, chainId, resType, resNum, ssType, d1, _d2, _max_bfactor,
             _idx) = row
            resNum = int(resNum)
            ssType = to3StateDssp(ssType)[0]
            resType = resType.strip()
            db = NTdb.getResidueDefByName(resType)
            if not db:
                nTerror("resType not in db: %s" % resType)
                return
            resType = db.nameDict['IUPAC']
            d1 = d1.strip()
            d1 = floatParse(d1)
            if isNaN(d1):
                #                nTdebug("d1 %s is a NaN on row: %s" % (d1,row))
                continue
            if not inRange(d1):
                nTerror("d1 not in range for row: %s" % str(row))
                return

            if not (resType in common20AAList):
                #            nTmessage("Skipping uncommon residue: %s" % resType)
                if not (resType in skippedResTypes):
                    skippedResTypes.append(resType)
                continue

            if isSibling(chainId, resNum, prevChainId, prevResNum):
                appendDeepByKeys(valueBySs0AndResTypes, d1, ssType, resType,
                                 prevResType)
                appendDeepByKeys(valueBySs1AndResTypes, d1, prevSsType,
                                 resType, prevResType)
                appendDeepByKeys(valueByResTypes, d1, resType, prevResType)
                appendDeepByKeys(valueBySs0, d1, ssType)
                appendDeepByKeys(valueBySs1, d1, prevSsType)
                value.append(d1)
            prevResType = resType
            prevResNum = resNum
            prevChainId = chainId
            prevSsType = ssType

    os.unlink(cvs_file_abs_name)
    nTmessage("Skipped skippedResTypes: %r" % skippedResTypes)
    nTmessage("Got count of values: %r" % len(value))
    # fill FOUR types of hist.
    # TODO: filter differently for pro/gly
    keyListSorted1 = valueBySs0AndResTypes.keys()
    keyListSorted1.sort()
    for isI in (True, False):
        if isI:
            valueBySs = valueBySs0
            valueBySsAndResTypes = valueBySs0AndResTypes
            histd1BySs = histd1BySs0
            histd1BySsAndResTypes = histd1BySs0AndResTypes
        else:
            valueBySs = valueBySs1
            valueBySsAndResTypes = valueBySs1AndResTypes
            histd1BySs = histd1BySs1
            histd1BySsAndResTypes = histd1BySs1AndResTypes
        for ssType in keyListSorted1:
            #            keyListSorted1b = deepcopy(keyListSorted1)
            #        for ssTypePrev in keyListSorted1b:
            d1List = valueBySs[ssType]
            if not d1List:
                nTerror("Expected d1List from valueBySs[%s]" % (ssType))
                continue
            hist1d, _bins, _patches = hist(d1List, bins=binCount, range=xRange)
            nTmessage("Count %6d in valueBySs[%s]" % (sum(hist1d), ssType))
            setDeepByKeys(histd1BySs, hist1d, ssType)

            keyListSorted2 = valueBySsAndResTypes[ssType].keys()
            keyListSorted2.sort()
            for resType in keyListSorted2:
                #            nTmessage("Working on valueBySsAndResTypes for [%s][%s]" % (ssType, resType)) # nice for balancing output verbosity.
                keyListSorted3 = valueBySsAndResTypes[ssType][resType].keys()
                keyListSorted3.sort()
                for prevResType in keyListSorted3:
                    #                nTmessage("Working on valueBySsAndResTypes[%s][%s][%s]" % (ssType, resType, prevResType))
                    d1List = valueBySsAndResTypes[ssType][resType][prevResType]
                    if not d1List:
                        nTerror(
                            "Expected d1List from valueBySsAndResTypes[%s][%s][%s]"
                            % (ssType, resType, prevResType))
                        continue
                    hist1d, _bins, _patches = hist(d1List,
                                                   bins=binCount,
                                                   range=xRange)
                    #                nTmessage("Count %6d in valueBySsAndResTypes[%s][%s][%s]" % (sum(hist1d), ssType, resType, prevResType))
                    setDeepByKeys(histd1BySsAndResTypes, hist1d, ssType,
                                  resType, prevResType)
            # Now that they are all in we can redo this.
    # Delete the reference -not- the object.
    valueBySs = None
    valueBySsAndResTypes = None
    histd1BySs = None
    histd1BySsAndResTypes = None

    for ssType in keyListSorted1:
        for resType in keyListSorted2:
            #            nTmessage("Working on valueBySsAndResTypes for [%s][%s]" % (ssType, resType)) # nice for balancing output verbosity.
            keyListSorted3 = valueBySs0AndResTypes[ssType][resType].keys()
            keyListSorted3.sort()
            for resTypePrev in keyListSorted3:
                keyListSorted4 = keyListSorted3[:]  # take a copy
                for resTypeNext in keyListSorted4:
                    hist1 = getDeepByKeys(histd1BySs0AndResTypes, ssType,
                                          resType, resTypePrev)  # x-axis
                    # This was bug! It needs to be hashed on the ssType of resType -not- on resTypeNext
                    hist2 = getDeepByKeys(histd1BySs1AndResTypes, ssType,
                                          resTypeNext, resType)
                    if hist1 == None:
                        nTdebug(
                            'skipping for hist1 is empty for [%s] [%s] [%s]' %
                            (ssType, resTypePrev, resType))
                        continue
                    if hist2 == None:
                        nTdebug(
                            'skipping for hist2 is empty for [%s] [%s] [%s]' %
                            (ssType, resType, resTypeNext))
                        continue
                    m1 = mat(hist1, dtype='float')
                    m2 = mat(hist2, dtype='float')
                    m2 = m2.transpose()  # pylint: disable=E1101
                    hist2d = multiply(m1, m2)

                    cTuple = getEnsembleAverageAndSigmaHis(hist2d)
                    (_c_av, c_sd, _hisMin, _hisMax) = cTuple  #@UnusedVariable
                    cTuple += tuple([
                        str([ssType, resType, resTypePrev, resTypeNext])
                    ])  # append the hash keys as a way of id.
                    #                    nTdebug("For ssType %s residue types %s %s %s found (av/sd/min/max) %8.0f %8.0f %8.0f %8.0f" % (
                    #                        ssType, resType, resTypePrev, resTypeNext, c_av, c_sd, hisMin, hisMax))
                    if c_sd == None:
                        nTdebug(
                            'Failed to get c_sd when testing not all residues are present in smaller sets.'
                        )
                        continue
                    if c_sd == 0.:
                        nTdebug(
                            'Got zero c_sd, ignoring histogram. This should only occur in smaller sets. Not setting values.'
                        )
                        continue
                    setDeepByKeys(histd1CtupleBySsAndResTypes, cTuple, ssType,
                                  resType, resTypePrev, resTypeNext)
    # end for isI

    keyListSorted1 = valueByResTypes.keys()
    keyListSorted1.sort()
    for resType in keyListSorted1:
        keyListSorted2 = valueByResTypes[resType].keys()
        keyListSorted2.sort()
        for prevResType in keyListSorted2:
            d1List = valueByResTypes[resType][prevResType]
            if not d1List:
                nTerror("Expected d1List from valueByResTypes[%s][%s]" %
                        (resType, prevResType))
                continue
            hist1d, _bins, _patches = hist(d1List, bins=binCount, range=xRange)
            #            nTmessage("Count %6d in valueByResTypes[%s][%s]" % (sum(hist1d), resType, prevResType))
            setDeepByKeys(histd1ByResTypes, hist1d, resType, prevResType)

    histd1, _bins, _patches = hist(value, bins=binCount, range=xRange)
    nTmessage("Count %6d in value" % sum(histd1))
    #    setDeepByKeys(histd1, hist1d, resType, prevResType)

    if os.path.exists(dbase_file_abs_name):
        os.unlink(dbase_file_abs_name)
    output = open(dbase_file_abs_name, 'wb')
    dbase = {}
    dbase[
        'histd1BySs0AndResTypes'] = histd1BySs0AndResTypes  # 92 kb uncompressed in the case of ~1000 lines only
    dbase['histd1BySs1AndResTypes'] = histd1BySs1AndResTypes
    dbase['histd1CtupleBySsAndResTypes'] = histd1CtupleBySsAndResTypes
    dbase['histd1ByResTypes'] = histd1ByResTypes  # 56 kb
    dbase['histd1BySs0'] = histd1BySs0  # 4 kb
    dbase['histd1BySs1'] = histd1BySs1
    dbase['histd1'] = histd1  #  4 kb

    cPickle.dump(dbase, output, 2)
    output.close()
예제 #6
0
    def matchResidue2Cing(self, res):
        """
        Look up the CING residue definition for res and store it on res.db.

        Residues whose name starts with ARG, ASP, GLU, HIS or LYS are resolved
        by checking which proton names are present in res; those lookups always
        use the CYANA (==DIANA) convention. Any other residue name is looked up
        under self.convention, unless it is listed in CYANA_NON_RESIDUES or it
        is 'HOH' while self.skipWaters is set; in those two cases the residue
        is only flagged to be skipped.

        When no definition was found and self.allowNonStandardResidue is set,
        the name is looked up once more without an explicit convention and, as
        a last resort, a new residue definition is appended to NTdb.

        res is a NTtree object. This routine reads res.resName, tests atom
        names (HA2, CD1, ...) with 'in', and sets:
            res.db      the residue definition found; falsy if there is none
            res.skip    True for CYANA non-residues, skipped waters and for
                        unmatched residues when non-standard residues are not
                        allowed

        Return res.db, or None when appending a new definition failed.
        """

        res.db = None
        res.skip = False

        # First decide which CYANA residue name applies (if any); the actual
        # database lookup is done once, below.
        resName3 = res.resName[0:3]
        cyanaName = None
        if resName3 == 'ARG':
            # The second spelling is the CYANA 1.x / AMBER one.
            if 'HH1' in res or '1HH' in res:
                cyanaName = 'ARG'
            else:
                # Protonated by default; also the usual case for X-ray
                # structures that carry no protons.
                cyanaName = 'ARG+'
        elif resName3 == 'ASP':
            # Deprotonated unless the HD2 proton is present.
            if 'HD2' in res:
                cyanaName = 'ASP'
            else:
                cyanaName = 'ASP-'
        elif resName3 == 'GLU':
            # Deprotonated unless the HE2 proton is present.
            if 'HE2' in res:
                cyanaName = 'GLU'
            else:
                cyanaName = 'GLU-'
        elif resName3 == 'HIS':
            if 'HE2' not in res:
                # Default form (HD1).
                cyanaName = 'HIS'
            elif 'HD1' in res:
                # Both ring protons present.
                cyanaName = 'HIS+'
            else:
                # Only HE2 present.
                cyanaName = 'HIST'
        elif resName3 == 'LYS':
            # The second spelling is the CYANA 1.x one.
            if (('HZ1' in res and 'HZ3' not in res)
                    or ('1HZ' in res and '3HZ' not in res)):
                cyanaName = 'LYS'
            else:
                # Protonated by default; also the usual case for X-ray
                # structures that carry no protons.
                cyanaName = 'LYS+'

        if cyanaName is not None:
            res.db = NTdb.getResidueDefByName(cyanaName, convention=CYANA)
        elif res.resName in CYANA_NON_RESIDUES:
            res.skip = True
        elif res.resName == 'HOH' and self.skipWaters:
            res.skip = True
        else:
            res.db = NTdb.getResidueDefByName(res.resName,
                                              convention=self.convention)

        # Done when a definition was found.
        if res.db:
            return res.db

        # Without permission for non-standard residues the search stops here.
        if not self.allowNonStandardResidue:
            res.skip = True
            return res.db

        # Retry the lookup without an explicit convention (INTERNAL).
        res.db = NTdb.getResidueDefByName(res.resName)
        if res.db:
            return res.db

        # Still unknown: register the residue as a new definition.
        res.db = NTdb.appendResidueDef(name=res.resName,
                                       shortName='_',
                                       comment='From parsing PDB file')
        if not res.db:
            nTcodeerror(
                "Adding a non-standard residue should have been possible.")
            return None
        res.db.nameDict[self.convention] = res.resName

        # Sanity check only; may be disabled for speed.
        if not NTdb.getResidueDefByName(res.resName):
            nTcodeerror(
                "Added residue but failed to find it again in pdbParser#_matchResidue2Cing"
            )

        return res.db