Example No. 1
def main():
    cvs_file_abs_name_gz = os.path.join(cingDirData, 'PluginCode', 'Whatif',
                                        cvs_file_abs_name + '.gz')
    gunzip(cvs_file_abs_name_gz)
    reader = csv.reader(open(cvs_file_abs_name, "rb"), quoting=csv.QUOTE_NONE)
    valuesBySsAndResType = {}
    histJaninBySsAndResType = {}
    histJaninBySsAndCombinedResType = {}
    #    histByCombinedSsAndResType = {}
    histJaninCtupleBySsAndResType = {}
    valuesByEntrySsAndResType = {}
    hrange = (xRange, yRange)

    #    rowCount = 0
    for row in reader:
        #        rowCount += 1
        #        7a3h,A,VAL ,   5,H, -62.8, -52.8
        #        7a3h,A,VAL ,   6,H, -71.2, -33.6
        #        7a3h,A,GLU ,   7,H, -63.5, -41.6
        (entryId, _chainId, resType, _resNum, ssType, chi1, chi2,
         _max_bfactor) = row
        ssType = to3StateDssp(ssType)[0]
        resType = resType.strip()
        chi1 = chi1.strip()
        chi2 = chi2.strip()
        chi1 = floatParse(chi1)
        chi2 = floatParse(chi2)
        if isNaN(chi1) or isNaN(chi2):
            continue
        if not inRange(chi1):
            nTerror("chi1 not in range for row: %s" % repr(row))
            return
        if not inRange(chi2):
            nTerror("chi2 not in range for row: %s" % repr(row))
            return
        if not common20AADict.has_key(resType):
            nTdebug("Residue not in common 20 for row: %s" % repr(row))
            #            rowCount -= 1
            continue

        appendDeepByKeys(valuesBySsAndResType, chi1, ssType, resType, 'chi1')
        appendDeepByKeys(valuesByEntrySsAndResType, chi1, entryId, ssType,
                         resType, 'chi1')
        appendDeepByKeys(valuesBySsAndResType, chi2, ssType, resType, 'chi2')
        appendDeepByKeys(valuesByEntrySsAndResType, chi2, entryId, ssType,
                         resType, 'chi2')
#        nTdebug('resType,ssType,chi1: %4s %1s %s' % (resType,ssType,floatFormat(chi1, "%6.1f")))
#        nTdebug('resType,ssType,chi2: %4s %1s %s' % (resType,ssType,floatFormat(chi2, "%6.1f")))
    del (reader)  # closes the file handles
    os.unlink(cvs_file_abs_name)

    for ssType in valuesBySsAndResType.keys():
        for resType in valuesBySsAndResType[ssType].keys():
            chi1 = valuesBySsAndResType[ssType][resType]['chi1']
            chi2 = valuesBySsAndResType[ssType][resType]['chi2']
            if chi1 and chi2:
                hist2d, _xedges, _yedges = histogram2d(chi2,
                                                       chi1,
                                                       bins=binCount,
                                                       range=hrange)
                setDeepByKeys(histJaninBySsAndResType, hist2d, ssType, resType)
                cTuple = getEnsembleAverageAndSigmaHis(hist2d)
                (c_av, c_sd, hisMin, hisMax) = cTuple
                cTuple += tuple([str([ssType, resType])])  # append the hash keys as a way of id.
                nTdebug(
                    "For ssType %s residue type %s found (av/sd/min/max) %8.0f %8.0f %8.0f %8.0f"
                    % (ssType, resType, c_av, c_sd, hisMin, hisMax))
                if c_sd == None:
                    nTdebug(
                        'Failed to get c_sd when testing not all residues are present in smaller sets.'
                    )
                    continue
                if c_sd == 0.:
                    nTdebug(
                        'Got zero c_sd, ignoring histogram. This should only occur in smaller sets. Not setting values.'
                    )
                    continue
                setDeepByKeys(histJaninCtupleBySsAndResType, cTuple, ssType,
                              resType)

    for ssType in valuesBySsAndResType.keys():
        chi1 = []
        chi2 = []
        for resType in valuesBySsAndResType[ssType].keys():
            if resType == 'PRO' or resType == 'GLY':
                continue
            chi1 += valuesBySsAndResType[ssType][resType]['chi1']
            chi2 += valuesBySsAndResType[ssType][resType]['chi2']
        if chi1 and chi2:
            hist2d, _xedges, _yedges = histogram2d(
                chi2,  # chi2 is deliberately passed as x; passing chi1 first would flip both the rendered image
                chi1,  # and the [row][column] indexing.
                bins=binCount,
                range=hrange)
            #        hist2d = zscaleHist( hist2d, Cav, Csd )
            setDeepByKeys(histJaninBySsAndCombinedResType, hist2d, ssType)

    # Throws a verbose error message on python 2.6.3 as per issue http://code.google.com/p/cing/issues/detail?id=211
    # Using Pickle instead


#    dbase = shelve.open( dbase_file_abs_name )
#    dbase.close()

    if os.path.exists(dbase_file_abs_name):
        os.unlink(dbase_file_abs_name)
    output = open(dbase_file_abs_name, 'wb')
    dbase = {}
    dbase['histJaninBySsAndCombinedResType'] = histJaninBySsAndCombinedResType
    dbase['histJaninBySsAndResType'] = histJaninBySsAndResType
    dbase['histJaninCtupleBySsAndResType'] = histJaninCtupleBySsAndResType
    #    histJaninCtupleBySsAndResType
    cPickle.dump(dbase, output, 2)
    output.close()
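
The deep-key helpers used throughout these examples (appendDeepByKeys, setDeepByKeys, getDeepByKeys) are CING's own utilities; judging from how they are called here, they walk or create the nested dict levels named by the keys and then append, set, or fetch the leaf value. A rough stand-in sketch of that presumed behavior (not the actual CING implementation):

def appendDeepByKeys(nested, value, *keys):
    # Presumed behavior: create intermediate dicts as needed, then append
    # the value to the list stored under the last key.
    for key in keys[:-1]:
        nested = nested.setdefault(key, {})
    nested.setdefault(keys[-1], []).append(value)

valuesBySsAndResType = {}
appendDeepByKeys(valuesBySsAndResType, -62.8, 'H', 'VAL', 'chi1')
appendDeepByKeys(valuesBySsAndResType, -71.2, 'H', 'VAL', 'chi1')
print valuesBySsAndResType  # {'H': {'VAL': {'chi1': [-62.8, -71.2]}}}
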
Example No. 2
def main():
    cvs_file_abs_name_gz = os.path.join(cingDirData, 'PluginCode', 'Whatif', cvs_file_abs_name + '.gz')
    gunzip(cvs_file_abs_name_gz)
    reader = csv.reader(open(cvs_file_abs_name, "rb"), quoting=csv.QUOTE_NONE)
    valuesBySsAndResType = {}
    histJaninBySsAndResType = {}
    histJaninBySsAndCombinedResType = {}
#    histByCombinedSsAndResType = {}
    histJaninCtupleBySsAndResType = {}
    valuesByEntrySsAndResType = {}
    hrange = (xRange, yRange)

#    rowCount = 0
    for row in reader:
#        rowCount += 1
#        7a3h,A,VAL ,   5,H, -62.8, -52.8
#        7a3h,A,VAL ,   6,H, -71.2, -33.6
#        7a3h,A,GLU ,   7,H, -63.5, -41.6
        (entryId, _chainId, resType, _resNum, ssType, chi1, chi2, _max_bfactor) = row
        ssType = to3StateDssp(ssType)[0]
        resType = resType.strip()
        chi1 = chi1.strip()
        chi2 = chi2.strip()
        chi1 = floatParse(chi1)
        chi2 = floatParse(chi2)
        if isNaN(chi1) or isNaN(chi2):
            continue
        if not inRange(chi1):
            nTerror("chi1 not in range for row: %s" % repr(row))
            return
        if not inRange(chi2):
            nTerror("chi2 not in range for row: %s" % repr(row))
            return
        if not common20AADict.has_key(resType):
            nTdebug("Residue not in common 20 for row: %s" % repr(row))
#            rowCount -= 1
            continue

        appendDeepByKeys(valuesBySsAndResType, chi1, ssType, resType, 'chi1')
        appendDeepByKeys(valuesByEntrySsAndResType, chi1, entryId, ssType, resType, 'chi1')
        appendDeepByKeys(valuesBySsAndResType, chi2, ssType, resType, 'chi2')
        appendDeepByKeys(valuesByEntrySsAndResType, chi2, entryId, ssType, resType, 'chi2')
#        nTdebug('resType,ssType,chi1: %4s %1s %s' % (resType,ssType,floatFormat(chi1, "%6.1f")))
#        nTdebug('resType,ssType,chi2: %4s %1s %s' % (resType,ssType,floatFormat(chi2, "%6.1f")))
    del(reader) # closes the file handles
    os.unlink(cvs_file_abs_name)

    for ssType in valuesBySsAndResType.keys():
        for resType in valuesBySsAndResType[ssType].keys():
            chi1 = valuesBySsAndResType[ssType][resType]['chi1']
            chi2 = valuesBySsAndResType[ssType][resType]['chi2']
            if chi1 and chi2:
                hist2d, _xedges, _yedges = histogram2d(
                    chi2, chi1,
                    bins=binCount,
                    range=hrange)
                setDeepByKeys(histJaninBySsAndResType, hist2d, ssType, resType)
                cTuple = getEnsembleAverageAndSigmaHis(hist2d)
                (c_av, c_sd, hisMin, hisMax) = cTuple
                cTuple += tuple([str([ssType, resType])]) # append the hash keys as a way of id.
                nTdebug("For ssType %s residue type %s found (av/sd/min/max) %8.0f %8.0f %8.0f %8.0f" % (
                    ssType, resType, c_av, c_sd, hisMin, hisMax))
                if c_sd == None:
                    nTdebug('Failed to get c_sd when testing not all residues are present in smaller sets.')
                    continue
                if c_sd == 0.:
                    nTdebug('Got zero c_sd, ignoring histogram. This should only occur in smaller sets. Not setting values.')
                    continue
                setDeepByKeys(histJaninCtupleBySsAndResType, cTuple, ssType, resType)

    for ssType in valuesBySsAndResType.keys():
        chi1 = []
        chi2 = []
        for resType in valuesBySsAndResType[ssType].keys():
            if resType == 'PRO' or resType == 'GLY':
                continue
            chi1 += valuesBySsAndResType[ssType][resType]['chi1']
            chi2 += valuesBySsAndResType[ssType][resType]['chi2']
        if chi1 and chi2:
            hist2d, _xedges, _yedges = histogram2d(
                chi2, # chi2 is deliberately passed as x; passing chi1 first would flip both the rendered image
                chi1, # and the [row][column] indexing.
                bins=binCount,
                range=hrange)
    #        hist2d = zscaleHist( hist2d, Cav, Csd )
            setDeepByKeys(histJaninBySsAndCombinedResType, hist2d, ssType)

    # Throws a verbose error message on python 2.6.3 as per issue https://github.com/VuisterLab/cing/issues/211
    # Using Pickle instead
#    dbase = shelve.open( dbase_file_abs_name )
#    dbase.close()

    if os.path.exists(dbase_file_abs_name):
        os.unlink(dbase_file_abs_name)
    output = open(dbase_file_abs_name, 'wb')
    dbase = {}
    dbase[ 'histJaninBySsAndCombinedResType' ] = histJaninBySsAndCombinedResType
    dbase[ 'histJaninBySsAndResType' ] = histJaninBySsAndResType
    dbase[ 'histJaninCtupleBySsAndResType' ] = histJaninCtupleBySsAndResType
#    histJaninCtupleBySsAndResType
    cPickle.dump(dbase, output, 2)
    output.close()
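
Both variants above persist the histograms with cPickle protocol 2, so the saved database can simply be loaded back for lookup. A minimal read-back sketch, assuming the same Python 2 environment and that dbase_file_abs_name still points at the file written by main(); the 'H'/'VAL' keys are only illustrative:

import cPickle

with open(dbase_file_abs_name, 'rb') as handle:
    dbase = cPickle.load(handle)
# Nested dict keyed by secondary-structure type, then residue type.
hist2d = dbase['histJaninBySsAndResType']['H']['VAL']  # illustrative keys
print hist2d.shape  # (binCount, binCount) counts over (chi2, chi1) bins
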
Example No. 3
def main():
    cvs_file_abs_name_gz = cvs_file_abs_name + '.gz'
    gunzip(cvs_file_abs_name_gz)
    reader = csv.reader(open(cvs_file_abs_name, "rb"), quoting=csv.QUOTE_NONE)
    valuesBySsAndResType = {}
    histRamaBySsAndResType = {}
    histRamaBySsAndCombinedResType = {}
    #    histByCombinedSsAndResType = {}
    histRamaCtupleBySsAndResType = {}
    valuesByEntrySsAndResType = {}
    hrange = (xRange, yRange)

    rowCount = 0
    for row in reader:
        rowCount += 1
        #        7a3h,A,VAL ,   5,H, -62.8, -52.8
        #        7a3h,A,VAL ,   6,H, -71.2, -33.6
        #        7a3h,A,GLU ,   7,H, -63.5, -41.6
        (entryId, _chainId, resType, _resNum, ssType, phi, psi,
         _max_bfactor) = row
        ssType = to3StateDssp(ssType)[0]
        resType = resType.strip()
        phi = float(phi)
        psi = float(psi)
        if not (inRange(phi, isRange360=isRange360)
                and inRange(psi, isRange360=isRange360)):
            nTerror("phi and/or psi not in range for row: %s" % repr(row))
            return
        if not common20AADict.has_key(resType):
            nTdebug("Residue not in common 20 for row: %s" % repr(row))
            rowCount -= 1
            continue

        appendDeepByKeys(valuesBySsAndResType, phi, ssType, resType, 'phi')
        appendDeepByKeys(valuesBySsAndResType, psi, ssType, resType, 'psi')
        #        nTdebug('resType,ssType,phi,psi: %4s %1s %8.3f %8.3f' % (resType,ssType,phi,psi))
        appendDeepByKeys(valuesByEntrySsAndResType, phi, entryId, ssType,
                         resType, 'phi')
        appendDeepByKeys(valuesByEntrySsAndResType, psi, entryId, ssType,
                         resType, 'psi')
    del (reader)  # closes the file handles
    os.unlink(cvs_file_abs_name)
    nTdebug('Total number of included residues including PRO/GLY: %d' %
            rowCount)
    #    nTdebug('valuesByEntrySsAndResType:\n%s'%valuesByEntrySsAndResType)
    #    (cAv, cSd, _Cn) = getRescaling(valuesByEntrySsAndResType)
    (cAv, cSd) = (1.0, 1.0)
    nTdebug("Overall found av,sd: %r %r" % (cAv, cSd))

    for ssType in valuesBySsAndResType.keys():
        for resType in valuesBySsAndResType[ssType].keys():
            hist2d, _xedges, _yedges = histogram2d(
                valuesBySsAndResType[ssType][resType]['psi'],
                valuesBySsAndResType[ssType][resType]['phi'],
                bins=binCount,
                range=hrange)
            #            hist2d = zscaleHist( hist2d, cAv, cSd )
            setDeepByKeys(histRamaBySsAndResType, hist2d, ssType, resType)
            #            nTdebug('hist2d ssType, resType: %s %s\n%s' % (ssType, resType, hist2d))
            cTuple = getEnsembleAverageAndSigmaHis(hist2d)
            (c_av, c_sd, hisMin, hisMax) = cTuple
            cTuple += tuple([str([ssType, resType])])  # append the hash keys as a way of id.
            nTdebug(
                "For ssType %s residue type %s found (av/sd/min/max) %8.0f %8.0f %8.0f %8.0f"
                % (ssType, resType, c_av, c_sd, hisMin, hisMax))
            #            nTdebug("xedges %s" % repr(xedges))
            #            sys.exit(1)
            if c_sd == None:
                nTdebug(
                    'Failed to get c_sd when testing not all residues are present in smaller sets.'
                )
                continue
            if c_sd == 0.:
                nTdebug(
                    'Got zero c_sd, ignoring histogram. This should only occur in smaller sets. Not setting values.'
                )
                continue
            setDeepByKeys(histRamaCtupleBySsAndResType, cTuple, ssType,
                          resType)

    for ssType in valuesBySsAndResType.keys():
        phi = []
        psi = []
        for resType in valuesBySsAndResType[ssType].keys():
            if resType == 'PRO' or resType == 'GLY':
                continue
            phi += valuesBySsAndResType[ssType][resType]['phi']
            psi += valuesBySsAndResType[ssType][resType]['psi']
        hist2d, _xedges, _yedges = histogram2d(
            psi,  # psi is deliberately passed as x; passing phi first would flip both the rendered image
            phi,  # and the [row][column] indexing.
            bins=binCount,
            range=hrange)
        #        hist2d = zscaleHist( hist2d, cAv, cSd )
        setDeepByKeys(histRamaBySsAndCombinedResType, hist2d, ssType)

    phi = []
    psi = []
    for ssType in valuesBySsAndResType.keys():
        for resType in valuesBySsAndResType[ssType].keys():
            if resType == 'PRO' or resType == 'GLY':
                continue
            phi += valuesBySsAndResType[ssType][resType]['phi']
            psi += valuesBySsAndResType[ssType][resType]['psi']

    nTdebug('Total number of residues without PRO/GLY: %d' % len(psi))
    hist2d, _xedges, _yedges = histogram2d(
        psi,  # psi is deliberately passed as x; passing phi first would flip both the rendered image
        phi,  # and the [row][column] indexing.
        bins=binCount,
        range=hrange)
    #    sumHistCombined = sum( hist2d )
    #    sumsumHistCombined = sum( sumHistCombined )
    nTdebug('hist2d         : \n%s' % hist2d)
    #    nTdebug('sumHistCombined   : %s' % repr(sumHistCombined))
    #    nTdebug('sumsumHistCombined: %.0f' % sumsumHistCombined)
    #    hist2d = zscaleHist( hist2d, cAv, cSd )
    #    nTdebug('hist2d scaled  : \n%s' % hist2d)

    if os.path.exists(dbase_file_abs_name):
        os.unlink(dbase_file_abs_name)
#    dbase = shelve.open( dbase_file_abs_name )
    output = open(dbase_file_abs_name, 'wb')
    #    dbase = {'bar':'milky'}
    dbase = {}
    # Pickle the list using the highest protocol available.
    dbase['histRamaCombined'] = hist2d
    dbase['histRamaBySsAndCombinedResType'] = histRamaBySsAndCombinedResType
    dbase['histRamaBySsAndResType'] = histRamaBySsAndResType
    dbase['histRamaCtupleBySsAndResType'] = histRamaCtupleBySsAndResType
    #    pickle.dump(dbase, output, -1)
    #    pickle.dump(dbase, output)
    cPickle.dump(dbase, output, 2)  # Was -1 for the most recent version but this caused issue 239
    # NB in Python 2, protocol 2 is also the highest pickle protocol (cPickle.HIGHEST_PROTOCOL).
    # If the protocol parameter is omitted, protocol 0 is used.
    # If protocol is specified as a negative value or HIGHEST_PROTOCOL, the highest protocol version will be used.

    output.close()
Example No. 4
def main():
    cvs_file_abs_name_gz = cvs_file_abs_name + '.gz'
    gunzip(cvs_file_abs_name_gz)
    reader = csv.reader(open(cvs_file_abs_name, "rb"), quoting=csv.QUOTE_NONE)
    valuesBySsAndResType = {}
    histRamaBySsAndResType = {}
    histRamaBySsAndCombinedResType = {}
#    histByCombinedSsAndResType = {}
    histRamaCtupleBySsAndResType = {}
    valuesByEntrySsAndResType = {}
    hrange = (xRange, yRange)

    rowCount = 0
    for row in reader:
        rowCount += 1
#        7a3h,A,VAL ,   5,H, -62.8, -52.8
#        7a3h,A,VAL ,   6,H, -71.2, -33.6
#        7a3h,A,GLU ,   7,H, -63.5, -41.6
        (entryId, _chainId, resType, _resNum, ssType, phi, psi, _max_bfactor) = row
        ssType = to3StateDssp(ssType)[0]
        resType = resType.strip()
        phi = float(phi)
        psi = float(psi)
        if not (inRange(phi, isRange360=isRange360) and inRange(psi, isRange360=isRange360)):
            nTerror("phi and/or psi not in range for row: %s" % repr(row))
            return
        if not common20AADict.has_key(resType):
            nTdebug("Residue not in common 20 for row: %s" % repr(row))
            rowCount -= 1
            continue

        appendDeepByKeys(valuesBySsAndResType, phi, ssType, resType, 'phi')
        appendDeepByKeys(valuesBySsAndResType, psi, ssType, resType, 'psi')
#        nTdebug('resType,ssType,phi,psi: %4s %1s %8.3f %8.3f' % (resType,ssType,phi,psi))
        appendDeepByKeys(valuesByEntrySsAndResType, phi, entryId, ssType, resType, 'phi')
        appendDeepByKeys(valuesByEntrySsAndResType, psi, entryId, ssType, resType, 'psi')
    del(reader) # closes the file handles
    os.unlink(cvs_file_abs_name)
    nTdebug('Total number of included residues including PRO/GLY: %d' % rowCount)
#    nTdebug('valuesByEntrySsAndResType:\n%s'%valuesByEntrySsAndResType)
#    (cAv, cSd, _Cn) = getRescaling(valuesByEntrySsAndResType)
    (cAv, cSd) = (1.0, 1.0)
    nTdebug("Overall found av,sd: %r %r" % (cAv, cSd))

    for ssType in valuesBySsAndResType.keys():
        for resType in valuesBySsAndResType[ssType].keys():
            hist2d, _xedges, _yedges = histogram2d(
                valuesBySsAndResType[ssType][resType]['psi'],
                valuesBySsAndResType[ssType][resType]['phi'],
                bins=binCount,
                range=hrange)
#            hist2d = zscaleHist( hist2d, cAv, cSd )
            setDeepByKeys(histRamaBySsAndResType, hist2d, ssType, resType)
#            nTdebug('hist2d ssType, resType: %s %s\n%s' % (ssType, resType, hist2d))
            cTuple = getEnsembleAverageAndSigmaHis(hist2d)
            (c_av, c_sd, hisMin, hisMax) = cTuple
            cTuple += tuple([str([ssType, resType])]) # append the hash keys as a way of id.
            nTdebug("For ssType %s residue type %s found (av/sd/min/max) %8.0f %8.0f %8.0f %8.0f" % (
                ssType, resType, c_av, c_sd, hisMin, hisMax))
#            nTdebug("xedges %s" % repr(xedges))
#            sys.exit(1)
            if c_sd == None:
                nTdebug('Failed to get c_sd when testing not all residues are present in smaller sets.')
                continue
            if c_sd == 0.:
                nTdebug('Got zero c_sd, ignoring histogram. This should only occur in smaller sets. Not setting values.')
                continue
            setDeepByKeys(histRamaCtupleBySsAndResType, cTuple, ssType, resType)

    for ssType in valuesBySsAndResType.keys():
        phi = []
        psi = []
        for resType in valuesBySsAndResType[ssType].keys():
            if resType == 'PRO' or resType == 'GLY':
                continue
            phi += valuesBySsAndResType[ssType][resType]['phi']
            psi += valuesBySsAndResType[ssType][resType]['psi']
        hist2d, _xedges, _yedges = histogram2d(
            psi, # psi is deliberately passed as x; passing phi first would flip both the rendered image
            phi, # and the [row][column] indexing.
            bins=binCount,
            range=hrange)
#        hist2d = zscaleHist( hist2d, cAv, cSd )
        setDeepByKeys(histRamaBySsAndCombinedResType, hist2d, ssType)

    phi = []
    psi = []
    for ssType in valuesBySsAndResType.keys():
        for resType in valuesBySsAndResType[ssType].keys():
            if resType == 'PRO' or resType == 'GLY':
                continue
            phi += valuesBySsAndResType[ssType][resType]['phi']
            psi += valuesBySsAndResType[ssType][resType]['psi']

    nTdebug('Total number of residues without PRO/GLY: %d' % len(psi))
    hist2d, _xedges, _yedges = histogram2d(
        psi, # psi is deliberately passed as x; passing phi first would flip both the rendered image
        phi, # and the [row][column] indexing.
        bins=binCount,
        range=hrange)
#    sumHistCombined = sum( hist2d )
#    sumsumHistCombined = sum( sumHistCombined )
    nTdebug('hist2d         : \n%s' % hist2d)
#    nTdebug('sumHistCombined   : %s' % repr(sumHistCombined))
#    nTdebug('sumsumHistCombined: %.0f' % sumsumHistCombined)
#    hist2d = zscaleHist( hist2d, cAv, cSd )
#    nTdebug('hist2d scaled  : \n%s' % hist2d)

    if os.path.exists(dbase_file_abs_name):
        os.unlink(dbase_file_abs_name)
#    dbase = shelve.open( dbase_file_abs_name )
    output = open(dbase_file_abs_name, 'wb')
#    dbase = {'bar':'milky'}
    dbase = {}
    # Pickle the list using the highest protocol available.
    dbase[ 'histRamaCombined' ] = hist2d
    dbase[ 'histRamaBySsAndCombinedResType' ] = histRamaBySsAndCombinedResType
    dbase[ 'histRamaBySsAndResType' ] = histRamaBySsAndResType
    dbase[ 'histRamaCtupleBySsAndResType' ] = histRamaCtupleBySsAndResType
#    pickle.dump(dbase, output, -1)
#    pickle.dump(dbase, output)
    cPickle.dump(dbase, output, 2) # Was -1 for the most recent version but this caused issue 239
    # NB in Python 2, protocol 2 is also the highest pickle protocol (cPickle.HIGHEST_PROTOCOL).
    # If the protocol parameter is omitted, protocol 0 is used.
    # If protocol is specified as a negative value or HIGHEST_PROTOCOL, the highest protocol version will be used.

    output.close()
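
In all the histogram2d calls above the argument order matters: with numpy's histogram2d (which these examples appear to use, directly or via a pylab re-export) the first sequence is binned along the first axis (rows) and the second along the columns, which is why psi (or chi2) is passed before phi (or chi1). A standalone sketch of that convention with made-up angles:

import numpy

phi = [-60.0, -65.0, 120.0]  # made-up phi angles
psi = [-45.0, -40.0, 135.0]  # made-up psi angles
hrange = ((-180.0, 180.0), (-180.0, 180.0))
# The first argument fills the rows and the second the columns, so
# H[psi_bin][phi_bin] matches the [row][column] convention used above.
H, psi_edges, phi_edges = numpy.histogram2d(psi, phi, bins=36, range=hrange)
print H.shape  # (36, 36)
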
Example No. 5
def main():
    'See above.'
    cvs_file_abs_name_gz = os.path.join(cingDirData, 'PluginCode', 'Whatif', cvs_file_abs_name + '.gz')
    gunzip(cvs_file_abs_name_gz)
    reader = csv.reader(open(cvs_file_abs_name, "rb"), quoting=csv.QUOTE_NONE)
    valueBySs0AndResTypes = {} # keys are SSi,   RTi, RTi-1
    valueBySs1AndResTypes = {} # keys are SSi-1, RTi, RTi-1
    valueByResTypes = {}
    valueBySs0 = {} # keys are SSi
    valueBySs1 = {} # keys are SSi-1
    histd1CtupleBySsAndResTypes = {}
    value = [] # NB: a plain list, not keyed like the dicts above.

    histd1BySs0AndResTypes = {} # keys are SSi,   RTi, RTi-1
    histd1BySs1AndResTypes = {} # keys are SSi-1, RTi, RTi-1
    histd1ByResTypes = {}
    histd1BySs0 = {}
    histd1BySs1 = {}


    linesByEntry = {}
    lineCount = 0
    for row in reader:
        lineCount += 1
        if lineCount > lineCountMax:
            break
        entryId = row[0]
        if not linesByEntry.has_key(entryId):
            linesByEntry[ entryId ] = []
        linesByEntry[ entryId ].append( row )

    skippedResTypes = []
    entryIdList = linesByEntry.keys()
    entryIdList.sort()

    # Do some pre filtering.
    for entryId2 in entryIdList:
        lineList = linesByEntry[ entryId2 ]
        for idx,line in enumerate(lineList):
            line.append(idx)
        lineListSorted = NTsort(lineList,BFACTOR_COLUMN,inplace=False)
        # Now throw away the worst 10 % of residues.
        n = len(lineListSorted)
        bad_count = int(round((n * DEFAULT_BFACTOR_PERCENTAGE_FILTER) / 100.))
        to_remove_count = n-bad_count
#        nTmessage("Removing at least %d from %d residues" % (bad_count,n))
        badIdxList = [lineItem[IDX_COLUMN] for lineItem in lineListSorted[to_remove_count:n]]
        iList = range(n)
        iList.reverse()
        for i in iList:
            lineItem = lineList[i]
            max_bfactor = float(lineItem[BFACTOR_COLUMN])
            if max_bfactor > DEFAULT_MAX_BFACTOR:
#                nTdebug('Skipping because max bfactor in dihedral %.3f is above %.3f %s' % (max_bfactor, DEFAULT_MAX_BFACTOR, lineItem))
                del lineList[i] # TODO: check that the indexing stays valid here or we shoot ourselves in the foot.
                continue
            if i in badIdxList:
#                nTdebug('Skipping because bfactor worst %.3f %s' % (max_bfactor, lineItem))
                del lineList[i]
                continue
        removed_count = n - len(lineList)
#        nTdebug("Reduced list by %d" % removed_count)
        if removed_count < bad_count:
            nTwarning("Failed to remove at least %d residues" % bad_count)

    for entryId2 in entryIdList:
        prevChainId = None
        prevResType = None
        prevResNum = None
        prevSsType = None
        for _r, row in enumerate(linesByEntry[ entryId2 ]):
    #1zzk,A,GLN ,  17,E, 205.2, 193.6
    #1zzk,A,VAL ,  18,E, 193.6, 223.2
    #1zzk,A,THR ,  19,E, 223.2, 190.1
            (entryId, chainId, resType, resNum, ssType, d1, _d2, _max_bfactor, _idx) = row
            resNum = int(resNum)
            ssType = to3StateDssp(ssType)[0]
            resType = resType.strip()
            db = NTdb.getResidueDefByName( resType )
            if not db:
                nTerror("resType not in db: %s" % resType)
                return
            resType = db.nameDict['IUPAC']
            d1 = d1.strip()
            d1 = floatParse(d1)
            if isNaN(d1):
#                nTdebug("d1 %s is a NaN on row: %s" % (d1,row))
                continue
            if not inRange(d1):
                nTerror("d1 not in range for row: %s" % str(row))
                return

            if not (resType in common20AAList):
    #            nTmessage("Skipping uncommon residue: %s" % resType)
                if not ( resType in skippedResTypes):
                    skippedResTypes.append( resType )
                continue

            if isSibling(chainId, resNum, prevChainId, prevResNum):
                appendDeepByKeys(valueBySs0AndResTypes, d1, ssType,     resType, prevResType)
                appendDeepByKeys(valueBySs1AndResTypes, d1, prevSsType, resType, prevResType)
                appendDeepByKeys(valueByResTypes, d1, resType, prevResType)
                appendDeepByKeys(valueBySs0, d1, ssType)
                appendDeepByKeys(valueBySs1, d1, prevSsType)
                value.append( d1 )
            prevResType = resType
            prevResNum = resNum
            prevChainId = chainId
            prevSsType = ssType

    os.unlink(cvs_file_abs_name)
    nTmessage("Skipped skippedResTypes: %r" % skippedResTypes )
    nTmessage("Got count of values: %r" % len(value) )
    # fill FOUR types of hist.
    # TODO: filter differently for pro/gly
    keyListSorted1 = valueBySs0AndResTypes.keys()
    keyListSorted1.sort()
    for isI in (True, False):
        if isI:
            valueBySs = valueBySs0
            valueBySsAndResTypes = valueBySs0AndResTypes
            histd1BySs = histd1BySs0
            histd1BySsAndResTypes = histd1BySs0AndResTypes
        else:
            valueBySs = valueBySs1
            valueBySsAndResTypes = valueBySs1AndResTypes
            histd1BySs = histd1BySs1
            histd1BySsAndResTypes = histd1BySs1AndResTypes
        for ssType in keyListSorted1:
#            keyListSorted1b = deepcopy(keyListSorted1)
    #        for ssTypePrev in keyListSorted1b:
            d1List = valueBySs[ssType]
            if not d1List:
                nTerror("Expected d1List from valueBySs[%s]" % (ssType))
                continue
            hist1d, _bins, _patches = hist(d1List, bins=binCount, range=xRange)
            nTmessage("Count %6d in valueBySs[%s]" % (sum(hist1d), ssType))
            setDeepByKeys(histd1BySs, hist1d, ssType)

            keyListSorted2 = valueBySsAndResTypes[ssType].keys()
            keyListSorted2.sort()
            for resType in keyListSorted2:
    #            nTmessage("Working on valueBySsAndResTypes for [%s][%s]" % (ssType, resType)) # nice for balancing output verbosity.
                keyListSorted3 = valueBySsAndResTypes[ssType][resType].keys()
                keyListSorted3.sort()
                for prevResType in keyListSorted3:
    #                nTmessage("Working on valueBySsAndResTypes[%s][%s][%s]" % (ssType, resType, prevResType))
                    d1List = valueBySsAndResTypes[ssType][resType][prevResType]
                    if not d1List:
                        nTerror("Expected d1List from valueBySsAndResTypes[%s][%s][%s]" % (ssType, resType, prevResType))
                        continue
                    hist1d, _bins, _patches = hist(d1List, bins=binCount, range=xRange)
    #                nTmessage("Count %6d in valueBySsAndResTypes[%s][%s][%s]" % (sum(hist1d), ssType, resType, prevResType))
                    setDeepByKeys(histd1BySsAndResTypes, hist1d, ssType, resType, prevResType)
            # Now that they are all in we can redo this.
    # Delete the reference -not- the object.
    valueBySs = None
    valueBySsAndResTypes = None
    histd1BySs = None
    histd1BySsAndResTypes = None

    for ssType in keyListSorted1:
        for resType in keyListSorted2:
#            nTmessage("Working on valueBySsAndResTypes for [%s][%s]" % (ssType, resType)) # nice for balancing output verbosity.
            keyListSorted3 = valueBySs0AndResTypes[ssType][resType].keys()
            keyListSorted3.sort()
            for resTypePrev in keyListSorted3:
                keyListSorted4 = keyListSorted3[:] # take a copy
                for resTypeNext in keyListSorted4:
                    hist1 = getDeepByKeys(histd1BySs0AndResTypes, ssType, resType, resTypePrev) # x-axis
                    # This was a bug: it needs to be keyed on the ssType of resType, -not- on resTypeNext.
                    hist2 = getDeepByKeys(histd1BySs1AndResTypes, ssType, resTypeNext, resType)
                    if hist1 == None:
                        nTdebug('skipping for hist1 is empty for [%s] [%s] [%s]' % (ssType, resTypePrev, resType))
                        continue
                    if hist2 == None:
                        nTdebug('skipping for hist2 is empty for [%s] [%s] [%s]' % (ssType, resType, resTypeNext))
                        continue
                    m1 = mat(hist1,dtype='float')
                    m2 = mat(hist2,dtype='float')
                    m2 = m2.transpose() # pylint: disable=E1101
                    hist2d = multiply(m1,m2)

                    cTuple = getEnsembleAverageAndSigmaHis( hist2d )
                    (_c_av, c_sd, _hisMin, _hisMax) = cTuple #@UnusedVariable
                    cTuple += tuple([str([ssType, resType, resTypePrev, resTypeNext])]) # append the hash keys as a way of id.
#                    nTdebug("For ssType %s residue types %s %s %s found (av/sd/min/max) %8.0f %8.0f %8.0f %8.0f" % (
#                        ssType, resType, resTypePrev, resTypeNext, c_av, c_sd, hisMin, hisMax))
                    if c_sd == None:
                        nTdebug('Failed to get c_sd when testing not all residues are present in smaller sets.')
                        continue
                    if c_sd == 0.:
                        nTdebug('Got zero c_sd, ignoring histogram. This should only occur in smaller sets. Not setting values.')
                        continue
                    setDeepByKeys( histd1CtupleBySsAndResTypes, cTuple, ssType, resType, resTypePrev, resTypeNext)
    # end for isI

    keyListSorted1 = valueByResTypes.keys()
    keyListSorted1.sort()
    for resType in keyListSorted1:
        keyListSorted2 = valueByResTypes[resType].keys()
        keyListSorted2.sort()
        for prevResType in keyListSorted2:
            d1List = valueByResTypes[resType][prevResType]
            if not d1List:
                nTerror("Expected d1List from valueByResTypes[%s][%s]" % (resType, prevResType))
                continue
            hist1d, _bins, _patches = hist(d1List, bins=binCount, range=xRange)
#            nTmessage("Count %6d in valueByResTypes[%s][%s]" % (sum(hist1d), resType, prevResType))
            setDeepByKeys(histd1ByResTypes, hist1d, resType, prevResType)

    histd1, _bins, _patches = hist(value, bins=binCount, range=xRange)
    nTmessage("Count %6d in value" % sum(histd1))
#    setDeepByKeys(histd1, hist1d, resType, prevResType)

    if os.path.exists(dbase_file_abs_name):
        os.unlink(dbase_file_abs_name)
    output = open(dbase_file_abs_name, 'wb')
    dbase = {}
    dbase[ 'histd1BySs0AndResTypes' ] = histd1BySs0AndResTypes # 92 kb uncompressed in the case of ~1000 lines only
    dbase[ 'histd1BySs1AndResTypes' ] = histd1BySs1AndResTypes
    dbase[ 'histd1CtupleBySsAndResTypes' ] = histd1CtupleBySsAndResTypes
    dbase[ 'histd1ByResTypes' ] = histd1ByResTypes # 56 kb
    dbase[ 'histd1BySs0' ] = histd1BySs0 # 4 kb
    dbase[ 'histd1BySs1' ] = histd1BySs1
    dbase[ 'histd1' ] = histd1 #  4 kb

    cPickle.dump(dbase, output, 2)
    output.close()
Example No. 6
def main():
    'See above.'
    cvs_file_abs_name_gz = os.path.join(cingDirData, 'PluginCode', 'Whatif',
                                        cvs_file_abs_name + '.gz')
    gunzip(cvs_file_abs_name_gz)
    reader = csv.reader(open(cvs_file_abs_name, "rb"), quoting=csv.QUOTE_NONE)
    valueBySs0AndResTypes = {}  # keys are SSi,   RTi, RTi-1
    valueBySs1AndResTypes = {}  # keys are SSi-1, RTi, RTi-1
    valueByResTypes = {}
    valueBySs0 = {}  # keys are SSi
    valueBySs1 = {}  # keys are SSi-1
    histd1CtupleBySsAndResTypes = {}
    value = []  # NB: a plain list, not keyed like the dicts above.

    histd1BySs0AndResTypes = {}  # keys are SSi,   RTi, RTi-1
    histd1BySs1AndResTypes = {}  # keys are SSi-1, RTi, RTi-1
    histd1ByResTypes = {}
    histd1BySs0 = {}
    histd1BySs1 = {}

    linesByEntry = {}
    lineCount = 0
    for row in reader:
        lineCount += 1
        if lineCount > lineCountMax:
            break
        entryId = row[0]
        if not linesByEntry.has_key(entryId):
            linesByEntry[entryId] = []
        linesByEntry[entryId].append(row)

    skippedResTypes = []
    entryIdList = linesByEntry.keys()
    entryIdList.sort()

    # Do some pre filtering.
    for entryId2 in entryIdList:
        lineList = linesByEntry[entryId2]
        for idx, line in enumerate(lineList):
            line.append(idx)
        lineListSorted = NTsort(lineList, BFACTOR_COLUMN, inplace=False)
        # Now throw away the worst 10 % of residues.
        n = len(lineListSorted)
        bad_count = int(round((n * DEFAULT_BFACTOR_PERCENTAGE_FILTER) / 100.))
        to_remove_count = n - bad_count
        #        nTmessage("Removing at least %d from %d residues" % (bad_count,n))
        badIdxList = [
            lineItem[IDX_COLUMN]
            for lineItem in lineListSorted[to_remove_count:n]
        ]
        iList = range(n)
        iList.reverse()
        for i in iList:
            lineItem = lineList[i]
            max_bfactor = float(lineItem[BFACTOR_COLUMN])
            if max_bfactor > DEFAULT_MAX_BFACTOR:
                #                nTdebug('Skipping because max bfactor in dihedral %.3f is above %.3f %s' % (max_bfactor, DEFAULT_MAX_BFACTOR, lineItem))
                del lineList[i]  # TODO: check that the indexing stays valid here or we shoot ourselves in the foot.
                continue
            if i in badIdxList:
                #                nTdebug('Skipping because bfactor worst %.3f %s' % (max_bfactor, lineItem))
                del lineList[i]
                continue
        removed_count = n - len(lineList)
        #        nTdebug("Reduced list by %d" % removed_count)
        if removed_count < bad_count:
            nTwarning("Failed to remove at least %d residues" % bad_count)

    for entryId2 in entryIdList:
        prevChainId = None
        prevResType = None
        prevResNum = None
        prevSsType = None
        for _r, row in enumerate(linesByEntry[entryId2]):
            #1zzk,A,GLN ,  17,E, 205.2, 193.6
            #1zzk,A,VAL ,  18,E, 193.6, 223.2
            #1zzk,A,THR ,  19,E, 223.2, 190.1
            (entryId, chainId, resType, resNum, ssType, d1, _d2, _max_bfactor,
             _idx) = row
            resNum = int(resNum)
            ssType = to3StateDssp(ssType)[0]
            resType = resType.strip()
            db = NTdb.getResidueDefByName(resType)
            if not db:
                nTerror("resType not in db: %s" % resType)
                return
            resType = db.nameDict['IUPAC']
            d1 = d1.strip()
            d1 = floatParse(d1)
            if isNaN(d1):
                #                nTdebug("d1 %s is a NaN on row: %s" % (d1,row))
                continue
            if not inRange(d1):
                nTerror("d1 not in range for row: %s" % str(row))
                return

            if not (resType in common20AAList):
                #            nTmessage("Skipping uncommon residue: %s" % resType)
                if not (resType in skippedResTypes):
                    skippedResTypes.append(resType)
                continue

            if isSibling(chainId, resNum, prevChainId, prevResNum):
                appendDeepByKeys(valueBySs0AndResTypes, d1, ssType, resType,
                                 prevResType)
                appendDeepByKeys(valueBySs1AndResTypes, d1, prevSsType,
                                 resType, prevResType)
                appendDeepByKeys(valueByResTypes, d1, resType, prevResType)
                appendDeepByKeys(valueBySs0, d1, ssType)
                appendDeepByKeys(valueBySs1, d1, prevSsType)
                value.append(d1)
            prevResType = resType
            prevResNum = resNum
            prevChainId = chainId
            prevSsType = ssType

    os.unlink(cvs_file_abs_name)
    nTmessage("Skipped skippedResTypes: %r" % skippedResTypes)
    nTmessage("Got count of values: %r" % len(value))
    # fill FOUR types of hist.
    # TODO: filter differently for pro/gly
    keyListSorted1 = valueBySs0AndResTypes.keys()
    keyListSorted1.sort()
    for isI in (True, False):
        if isI:
            valueBySs = valueBySs0
            valueBySsAndResTypes = valueBySs0AndResTypes
            histd1BySs = histd1BySs0
            histd1BySsAndResTypes = histd1BySs0AndResTypes
        else:
            valueBySs = valueBySs1
            valueBySsAndResTypes = valueBySs1AndResTypes
            histd1BySs = histd1BySs1
            histd1BySsAndResTypes = histd1BySs1AndResTypes
        for ssType in keyListSorted1:
            #            keyListSorted1b = deepcopy(keyListSorted1)
            #        for ssTypePrev in keyListSorted1b:
            d1List = valueBySs[ssType]
            if not d1List:
                nTerror("Expected d1List from valueBySs[%s]" % (ssType))
                continue
            hist1d, _bins, _patches = hist(d1List, bins=binCount, range=xRange)
            nTmessage("Count %6d in valueBySs[%s]" % (sum(hist1d), ssType))
            setDeepByKeys(histd1BySs, hist1d, ssType)

            keyListSorted2 = valueBySsAndResTypes[ssType].keys()
            keyListSorted2.sort()
            for resType in keyListSorted2:
                #            nTmessage("Working on valueBySsAndResTypes for [%s][%s]" % (ssType, resType)) # nice for balancing output verbosity.
                keyListSorted3 = valueBySsAndResTypes[ssType][resType].keys()
                keyListSorted3.sort()
                for prevResType in keyListSorted3:
                    #                nTmessage("Working on valueBySsAndResTypes[%s][%s][%s]" % (ssType, resType, prevResType))
                    d1List = valueBySsAndResTypes[ssType][resType][prevResType]
                    if not d1List:
                        nTerror(
                            "Expected d1List from valueBySsAndResTypes[%s][%s][%s]"
                            % (ssType, resType, prevResType))
                        continue
                    hist1d, _bins, _patches = hist(d1List,
                                                   bins=binCount,
                                                   range=xRange)
                    #                nTmessage("Count %6d in valueBySsAndResTypes[%s][%s][%s]" % (sum(hist1d), ssType, resType, prevResType))
                    setDeepByKeys(histd1BySsAndResTypes, hist1d, ssType,
                                  resType, prevResType)
            # Now that they are all in we can redo this.
    # Delete the reference -not- the object.
    valueBySs = None
    valueBySsAndResTypes = None
    histd1BySs = None
    histd1BySsAndResTypes = None

    for ssType in keyListSorted1:
        for resType in keyListSorted2:
            #            nTmessage("Working on valueBySsAndResTypes for [%s][%s]" % (ssType, resType)) # nice for balancing output verbosity.
            keyListSorted3 = valueBySs0AndResTypes[ssType][resType].keys()
            keyListSorted3.sort()
            for resTypePrev in keyListSorted3:
                keyListSorted4 = keyListSorted3[:]  # take a copy
                for resTypeNext in keyListSorted4:
                    hist1 = getDeepByKeys(histd1BySs0AndResTypes, ssType,
                                          resType, resTypePrev)  # x-axis
                    # This was a bug: it needs to be keyed on the ssType of resType, -not- on resTypeNext.
                    hist2 = getDeepByKeys(histd1BySs1AndResTypes, ssType,
                                          resTypeNext, resType)
                    if hist1 == None:
                        nTdebug(
                            'skipping for hist1 is empty for [%s] [%s] [%s]' %
                            (ssType, resTypePrev, resType))
                        continue
                    if hist2 == None:
                        nTdebug(
                            'skipping for hist2 is empty for [%s] [%s] [%s]' %
                            (ssType, resType, resTypeNext))
                        continue
                    m1 = mat(hist1, dtype='float')
                    m2 = mat(hist2, dtype='float')
                    m2 = m2.transpose()  # pylint: disable=E1101
                    hist2d = multiply(m1, m2)

                    cTuple = getEnsembleAverageAndSigmaHis(hist2d)
                    (_c_av, c_sd, _hisMin, _hisMax) = cTuple  #@UnusedVariable
                    cTuple += tuple([str([ssType, resType, resTypePrev, resTypeNext])])  # append the hash keys as a way of id.
                    #                    nTdebug("For ssType %s residue types %s %s %s found (av/sd/min/max) %8.0f %8.0f %8.0f %8.0f" % (
                    #                        ssType, resType, resTypePrev, resTypeNext, c_av, c_sd, hisMin, hisMax))
                    if c_sd == None:
                        nTdebug(
                            'Failed to get c_sd when testing not all residues are present in smaller sets.'
                        )
                        continue
                    if c_sd == 0.:
                        nTdebug(
                            'Got zero c_sd, ignoring histogram. This should only occur in smaller sets. Not setting values.'
                        )
                        continue
                    setDeepByKeys(histd1CtupleBySsAndResTypes, cTuple, ssType,
                                  resType, resTypePrev, resTypeNext)
    # end for isI

    keyListSorted1 = valueByResTypes.keys()
    keyListSorted1.sort()
    for resType in keyListSorted1:
        keyListSorted2 = valueByResTypes[resType].keys()
        keyListSorted2.sort()
        for prevResType in keyListSorted2:
            d1List = valueByResTypes[resType][prevResType]
            if not d1List:
                nTerror("Expected d1List from valueByResTypes[%s][%s]" %
                        (resType, prevResType))
                continue
            hist1d, _bins, _patches = hist(d1List, bins=binCount, range=xRange)
            #            nTmessage("Count %6d in valueByResTypes[%s][%s]" % (sum(hist1d), resType, prevResType))
            setDeepByKeys(histd1ByResTypes, hist1d, resType, prevResType)

    histd1, _bins, _patches = hist(value, bins=binCount, range=xRange)
    nTmessage("Count %6d in value" % sum(histd1))
    #    setDeepByKeys(histd1, hist1d, resType, prevResType)

    if os.path.exists(dbase_file_abs_name):
        os.unlink(dbase_file_abs_name)
    output = open(dbase_file_abs_name, 'wb')
    dbase = {}
    dbase['histd1BySs0AndResTypes'] = histd1BySs0AndResTypes  # 92 kb uncompressed in the case of ~1000 lines only
    dbase['histd1BySs1AndResTypes'] = histd1BySs1AndResTypes
    dbase['histd1CtupleBySsAndResTypes'] = histd1CtupleBySsAndResTypes
    dbase['histd1ByResTypes'] = histd1ByResTypes  # 56 kb
    dbase['histd1BySs0'] = histd1BySs0  # 4 kb
    dbase['histd1BySs1'] = histd1BySs1
    dbase['histd1'] = histd1  #  4 kb

    cPickle.dump(dbase, output, 2)
    output.close()
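
The mat/transpose/multiply combination in examples 5 and 6 builds each joint histogram as an outer product of two 1D histograms: every cell is hist2[i] * hist1[j], with hist1 along the columns (the x-axis, as the inline comment notes) and hist2 along the rows. A minimal numpy sketch of the same construction, with made-up counts:

import numpy

hist1 = numpy.array([1., 4., 2.])  # toy 1D histogram; ends up along the columns (x-axis)
hist2 = numpy.array([3., 0., 5.])  # toy 1D histogram; ends up along the rows
m1 = numpy.mat(hist1, dtype='float')               # shape (1, 3)
m2 = numpy.mat(hist2, dtype='float').transpose()   # shape (3, 1)
hist2d = numpy.multiply(m1, m2)  # broadcasts to (3, 3): hist2d[i, j] == hist2[i] * hist1[j]
assert numpy.allclose(hist2d, numpy.outer(hist2, hist1))
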