def plotHistogramOverall():
    graphicsFormat = "png"
    alpha = 0.8  # was 0.8; looks awful with alpha = 1
    n = 20
#    d = 3  # number of ss types.
    extent = (0, n) + (0, n)
    cmapList = [green_inv, blue_inv, yellow_inv]
    colorList = ['green', 'blue', 'yellow']
    i = 1  # decides on color picked.
    # If set it will do a single ssType otherwise the overall.
    for doOverall in [False, True]:
#    for doOverall in [ True ]:
        if doOverall:
            ssTypeList = [None]
        else:
            ssTypeList = [' ', 'S', 'H']
        for ssType in ssTypeList:
            m = zeros((n * n), dtype=int).reshape(n, n)
#            mBySs = zeros((n,n,d), dtype=int).reshape(n,n,d)
            tickList = [NTdb.getResidueDefByName(resType).shortName for resType in common20AAList]
#            tickListRev = tickList[:]
#            tickListRev.reverse()
            for r, resTypePrev in enumerate(common20AAList):
                for c, resType in enumerate(common20AAList):
                    if doOverall:
                        hist1 = getDeepByKeys(hPlot.histd1ByResTypes, resType, resTypePrev)
                    else:
                        hist1 = getDeepByKeys(hPlot.histd1BySs0AndResTypes, ssType, resType, resTypePrev)
                    if hist1 == None:
                        nTdebug('skipping for hist1 is empty for [%s] [%s]' % (resType, resTypePrev))
                        continue
                    m[r, c] = sum(hist1)

            clf()
#            axes([.1, .1, .8, .8 ])
            xlabel('resType')
            ylabel('resTypePrev')
            xlim((0, n))
            ylim((0, n))
            offset = 0.5
            xticks(arange(offset, n), tickList)
            yticks(arange(offset, n), tickList)
#            print 'just before call to set_ticks_position'
#            axis.xaxis.set_ticks_position('top')
#            axis.xaxis.set_label_position('top')
#            axis.yaxis.set_ticks_position('both')
#            axis.yaxis.set_label_position('left')
            grid(True)
            strTitle = "ssType: [%s]" % ssType
            title(strTitle)
            plot([0, n], [0, n], 'b-', linewidth=1)

            minCount = 300.
            maxCount = 1000.
            if False:
                minCount = 0.
                maxCount = 1.
            if ssType:
                minCount /= 3.
                maxCount /= 3.
            maxHist = amax(m)
            minHist = amin(m)
            sumHist = sum(m)
            nTmessage('ssType: %s' % ssType)
            nTmessage('maxHist: %s' % maxHist)  # 9165 of total of ~ 1 M.
            nTmessage('minHist: %s' % minHist)  # 210
            nTmessage('sumHist: %s' % sumHist)
#            nTmessage('tickList: %s' % tickList)

#            his *= 100./maxHist
            his = masked_where(m <= minCount, m, copy=1)
            palette = cmapList[i]
            palette.set_under(color='red', alpha=1.0)  # alpha is 0.0
            palette.set_over(color=colorList[i], alpha=1.0)  # alpha is 1.0. Important to make it a hard alpha; last plotted will rule.
            palette.set_bad(color='red', alpha=1.0)
            norm = Normalize(vmin=minCount, vmax=maxCount, clip=True)  # clip is False
            imshow(his,
                   interpolation='nearest',
#                   interpolation='bicubic',
                   origin='lower',
                   extent=extent,
                   alpha=alpha,
                   cmap=palette,
                   norm=norm)
#            mr = m[::-1]  # reverses the rows, nice!
#            nTmessage('mr: %s' % mr)
            fn = "plotHistogram_%s_d1d2.%s" % (ssType, graphicsFormat)
            savefig(fn)

            clf()
            a = m.reshape(n * n)
            hist(a, 20)
            xlabel('pair count')
            ylabel('number of occurrences')
            title(strTitle)
            fn = "plotHistOfHist_%s_d1d2.%s" % (ssType, graphicsFormat)
            savefig(fn)
        # end loop over ssType
    # end over ssType overall
    return m
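# The plotting above hinges on a matplotlib idiom: mask the low-count cells, then let the
# colormap's under/over/bad colors flag everything outside the interesting range. Below is a
# minimal, standalone sketch of that idiom. It is an illustration only, not part of CING; it
# assumes just numpy and matplotlib, and builds its own colormap with
# LinearSegmentedColormap.from_list instead of the CING-specific green_inv et al.
def _demoMaskedImshow():
    import numpy as np
    import matplotlib.pyplot as plt
    from matplotlib.colors import LinearSegmentedColormap, Normalize
    from numpy.ma import masked_where

    n = 20
    minCount, maxCount = 300., 1000.
    counts = np.random.randint(0, 1500, size=(n, n))  # stand-in for the pair-count matrix m

    his = masked_where(counts <= minCount, counts)     # hide cells at or below the threshold
    palette = LinearSegmentedColormap.from_list('demo', ['white', 'green'])
    palette.set_over(color='green', alpha=1.0)         # counts above vmax get the hard color
    palette.set_under(color='red', alpha=1.0)          # counts below vmin (here already masked)
    palette.set_bad(color='red', alpha=1.0)            # masked cells

    # NB with clip=True (as in plotHistogramOverall) the under/over colors are bypassed,
    # because values are clipped into [vmin, vmax] first; clip=False lets them show.
    norm = Normalize(vmin=minCount, vmax=maxCount, clip=False)
    plt.imshow(his, interpolation='nearest', origin='lower',
               extent=(0, n, 0, n), cmap=palette, norm=norm)
    plt.savefig("masked_imshow_demo.png")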
def matchResidue2Cing(self, res):
    """
    Match res to the CING database using the previously defined convention;
    account for 'ill-defined' residues by examining crucial atom names.
    Uses CYANA (== DIANA) naming for the conversion to INTERNAL
    (i.e. these names are not likely to change).

    Returns the NTdb resDef object, or None on error.

    res is an NTtree object with the following attributes set after this routine:
        db
        skip
        resName
    and attributes for every atom it includes: HA2, CD1, ...
    """
#    nTdebug("Now in _matchResidue2Cing: %s" % res)
    res.db = None
    res.skip = False

    # Residue names that are ambiguously defined by different PDB file formats
    if res.resName[0:3] == 'ARG':
        if 'HH1' in res:
            res.db = NTdb.getResidueDefByName('ARG', convention=CYANA)
        elif '1HH' in res:  # Second set for CYANA 1.x, AMBER
            res.db = NTdb.getResidueDefByName('ARG', convention=CYANA)
        else:  # Default protonated; this also assures most common for X-ray without protons
            res.db = NTdb.getResidueDefByName('ARG+', convention=CYANA)
        #end if
    elif res.resName[0:3] == 'ASP':
        if 'HD2' in res:
            #print 'ASPH'
            res.db = NTdb.getResidueDefByName('ASP', convention=CYANA)
        else:  # Default deprotonated; this also assures most common for X-ray without protons
            #print 'ASP'
            res.db = NTdb.getResidueDefByName('ASP-', convention=CYANA)
        #end if
    elif res.resName[0:3] == 'GLU':
        if 'HE2' in res:
            #print 'GLUH'
            res.db = NTdb.getResidueDefByName('GLU', convention=CYANA)
        else:  # Default deprotonated; this also assures most common for X-ray without protons
            #print 'GLU'
            res.db = NTdb.getResidueDefByName('GLU-', convention=CYANA)
        #end if
    elif res.resName[0:3] == 'HIS':
        if 'HD1' in res and 'HE2' in res:
            #print 'HISH'
            res.db = NTdb.getResidueDefByName('HIS+', convention=CYANA)
        elif not 'HD1' in res and 'HE2' in res:
            #print 'HISE'
            res.db = NTdb.getResidueDefByName('HIST', convention=CYANA)
        else:  # Default HD1
            #print 'HIS'
            res.db = NTdb.getResidueDefByName('HIS', convention=CYANA)
        #end if
    elif res.resName[0:3] == 'LYS':
        if 'HZ1' in res and not 'HZ3' in res:
            res.db = NTdb.getResidueDefByName('LYS', convention=CYANA)
        elif '1HZ' in res and not '3HZ' in res:  # Second set for CYANA 1.x
            res.db = NTdb.getResidueDefByName('LYS', convention=CYANA)
        else:  # Default protonated; this also assures most common for X-ray without protons
            res.db = NTdb.getResidueDefByName('LYS+', convention=CYANA)
        #end if
    elif res.resName in CYANA_NON_RESIDUES:
        res.skip = True
    elif res.resName == 'HOH' and self.skipWaters:
        res.skip = True
    else:
        res.db = NTdb.getResidueDefByName(res.resName, convention=self.convention)
    #end if

    # Only continue the search if not found and non-standard residues are allowed.
    if res.db:
        return res.db
    if not self.allowNonStandardResidue:
        res.skip = True
        return res.db

    # Try to match the residue using the INTERNAL convention.
    res.db = NTdb.getResidueDefByName(res.resName)
    if res.db:
        return res.db

    # Insert a new residue definition.
    res.db = NTdb.appendResidueDef(name=res.resName, shortName='_', comment='From parsing PDB file')
    if not res.db:
        nTcodeerror("Adding a non-standard residue should have been possible.")
        return None
    res.db.nameDict[self.convention] = res.resName

    # Just a check, disable for speed.
    _x = NTdb.getResidueDefByName(res.resName)
    if not _x:
        nTcodeerror("Added residue but failed to find it again in pdbParser#_matchResidue2Cing")
    return res.db
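# The branchy disambiguation above is easiest to read as a small pure function: given the set
# of atom names present in a residue, pick the protonation-state-specific residue name. The
# helper below is a hypothetical illustration only (it is not part of pdbParser and ignores
# the NTdb/convention machinery); it restates just the histidine rule from the code above.
def _guessHisVariant(atomNames):
    'Return the CING residue name for a histidine, judged from which ring protons are present.'
    if 'HD1' in atomNames and 'HE2' in atomNames:
        return 'HIS+'  # both ring protons seen: doubly protonated
    if 'HD1' not in atomNames and 'HE2' in atomNames:
        return 'HIST'  # only HE2 seen: the HISE tautomer
    return 'HIS'       # default: the HD1 tautomer (also covers X-ray files without protons)

# For example:
#   _guessHisVariant(set(['HD1', 'HE2', 'CA'])) -> 'HIS+'
#   _guessHisVariant(set(['HE2', 'CA']))        -> 'HIST'
#   _guessHisVariant(set(['CA']))               -> 'HIS'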
def main():
    'See above.'
    cvs_file_abs_name_gz = os.path.join(cingDirData, 'PluginCode', 'Whatif', cvs_file_abs_name + '.gz')
    gunzip(cvs_file_abs_name_gz)
    reader = csv.reader(open(cvs_file_abs_name, "rb"), quoting=csv.QUOTE_NONE)

    valueBySs0AndResTypes = {}  # keys are SSi, RTi, RTi-1
    valueBySs1AndResTypes = {}  # keys are SSi-1, RTi, RTi-1
    valueByResTypes = {}
    valueBySs0 = {}  # keys are SSi
    valueBySs1 = {}  # keys are SSi-1
    histd1CtupleBySsAndResTypes = {}
    value = []  # NB this is a plain list, not keyed.
    histd1BySs0AndResTypes = {}  # keys are SSi, RTi, RTi-1
    histd1BySs1AndResTypes = {}  # keys are SSi-1, RTi, RTi-1
    histd1ByResTypes = {}
    histd1BySs0 = {}
    histd1BySs1 = {}

    linesByEntry = {}
    lineCount = 0
    for row in reader:
        lineCount += 1
        if lineCount > lineCountMax:
            break
        entryId = row[0]
        if not linesByEntry.has_key(entryId):
            linesByEntry[entryId] = []
        linesByEntry[entryId].append(row)

    skippedResTypes = []
    entryIdList = linesByEntry.keys()
    entryIdList.sort()

    # Do some pre-filtering.
    for entryId2 in entryIdList:
        lineList = linesByEntry[entryId2]
        for idx, line in enumerate(lineList):
            line.append(idx)
        lineListSorted = NTsort(lineList, BFACTOR_COLUMN, inplace=False)
        # Now throw away the worst 10 % of residues.
        n = len(lineListSorted)
        bad_count = int(round((n * DEFAULT_BFACTOR_PERCENTAGE_FILTER) / 100.))
        to_remove_count = n - bad_count
#        nTmessage("Removing at least %d from %d residues" % (bad_count, n))
        badIdxList = [lineItem[IDX_COLUMN] for lineItem in lineListSorted[to_remove_count:n]]
        iList = range(n)
        iList.reverse()
        for i in iList:
            lineItem = lineList[i]
            max_bfactor = float(lineItem[BFACTOR_COLUMN])
            if max_bfactor > DEFAULT_MAX_BFACTOR:
#                nTdebug('Skipping because max bfactor in dihedral %.3f is above %.3f %s' % (max_bfactor, DEFAULT_MAX_BFACTOR, lineItem))
                del lineList[i]  # TODO: check if the indexing is still right or we shoot ourselves in the foot.
                continue
            if i in badIdxList:
#                nTdebug('Skipping because bfactor worst %.3f %s' % (max_bfactor, lineItem))
                del lineList[i]
                continue
        removed_count = n - len(lineList)
#        nTdebug("Reduced list by %d" % removed_count)
        if removed_count < bad_count:
            nTwarning("Failed to remove at least %d residues" % bad_count)

    for entryId2 in entryIdList:
        prevChainId = None
        prevResType = None
        prevResNum = None
        prevSsType = None
        for _r, row in enumerate(linesByEntry[entryId2]):
            # 1zzk,A,GLN , 17,E, 205.2, 193.6
            # 1zzk,A,VAL , 18,E, 193.6, 223.2
            # 1zzk,A,THR , 19,E, 223.2, 190.1
            (entryId, chainId, resType, resNum, ssType, d1, _d2, _max_bfactor, _idx) = row
            resNum = int(resNum)
            ssType = to3StateDssp(ssType)[0]
            resType = resType.strip()
            db = NTdb.getResidueDefByName(resType)
            if not db:
                nTerror("resType not in db: %s" % resType)
                return
            resType = db.nameDict['IUPAC']
            d1 = d1.strip()
            d1 = floatParse(d1)
            if isNaN(d1):
#                nTdebug("d1 %s is a NaN on row: %s" % (d1, row))
                continue
            if not inRange(d1):
                nTerror("d1 not in range for row: %s" % str(row))
                return
            if not (resType in common20AAList):
#                nTmessage("Skipping uncommon residue: %s" % resType)
                if not (resType in skippedResTypes):
                    skippedResTypes.append(resType)
                continue
            if isSibling(chainId, resNum, prevChainId, prevResNum):
                appendDeepByKeys(valueBySs0AndResTypes, d1, ssType, resType, prevResType)
                appendDeepByKeys(valueBySs1AndResTypes, d1, prevSsType, resType, prevResType)
                appendDeepByKeys(valueByResTypes, d1, resType, prevResType)
                appendDeepByKeys(valueBySs0, d1, ssType)
                appendDeepByKeys(valueBySs1, d1, prevSsType)
                value.append(d1)
            prevResType = resType
            prevResNum = resNum
            prevChainId = chainId
            prevSsType = ssType

    os.unlink(cvs_file_abs_name)
    nTmessage("Skipped skippedResTypes: %r" % skippedResTypes)
    nTmessage("Got count of values: %r" % len(value))

    # Fill FOUR types of histograms.
    # TODO: filter differently for pro/gly
    keyListSorted1 = valueBySs0AndResTypes.keys()
    keyListSorted1.sort()
    for isI in (True, False):
        if isI:
            valueBySs = valueBySs0
            valueBySsAndResTypes = valueBySs0AndResTypes
            histd1BySs = histd1BySs0
            histd1BySsAndResTypes = histd1BySs0AndResTypes
        else:
            valueBySs = valueBySs1
            valueBySsAndResTypes = valueBySs1AndResTypes
            histd1BySs = histd1BySs1
            histd1BySsAndResTypes = histd1BySs1AndResTypes

        for ssType in keyListSorted1:
#            keyListSorted1b = deepcopy(keyListSorted1)
#            for ssTypePrev in keyListSorted1b:
            d1List = valueBySs[ssType]
            if not d1List:
                nTerror("Expected d1List from valueBySs[%s]" % ssType)
                continue
            hist1d, _bins, _patches = hist(d1List, bins=binCount, range=xRange)
            nTmessage("Count %6d in valueBySs[%s]" % (sum(hist1d), ssType))
            setDeepByKeys(histd1BySs, hist1d, ssType)
            keyListSorted2 = valueBySsAndResTypes[ssType].keys()
            keyListSorted2.sort()
            for resType in keyListSorted2:
#                nTmessage("Working on valueBySsAndResTypes for [%s][%s]" % (ssType, resType)) # nice for balancing output verbosity.
                keyListSorted3 = valueBySsAndResTypes[ssType][resType].keys()
                keyListSorted3.sort()
                for prevResType in keyListSorted3:
#                    nTmessage("Working on valueBySsAndResTypes[%s][%s][%s]" % (ssType, resType, prevResType))
                    d1List = valueBySsAndResTypes[ssType][resType][prevResType]
                    if not d1List:
                        nTerror("Expected d1List from valueBySsAndResTypes[%s][%s][%s]" % (ssType, resType, prevResType))
                        continue
                    hist1d, _bins, _patches = hist(d1List, bins=binCount, range=xRange)
#                    nTmessage("Count %6d in valueBySsAndResTypes[%s][%s][%s]" % (sum(hist1d), ssType, resType, prevResType))
                    setDeepByKeys(histd1BySsAndResTypes, hist1d, ssType, resType, prevResType)

        # Now that they are all in we can redo this.
        # Delete the reference -not- the object.
        valueBySs = None
        valueBySsAndResTypes = None
        histd1BySs = None
        histd1BySsAndResTypes = None

        for ssType in keyListSorted1:
            for resType in keyListSorted2:
#                nTmessage("Working on valueBySsAndResTypes for [%s][%s]" % (ssType, resType)) # nice for balancing output verbosity.
                keyListSorted3 = valueBySs0AndResTypes[ssType][resType].keys()
                keyListSorted3.sort()
                for resTypePrev in keyListSorted3:
                    keyListSorted4 = keyListSorted3[:]  # take a copy
                    for resTypeNext in keyListSorted4:
                        hist1 = getDeepByKeys(histd1BySs0AndResTypes, ssType, resType, resTypePrev)  # x-axis
                        # This was a bug! It needs to be hashed on the ssType of resType -not- on resTypeNext.
                        hist2 = getDeepByKeys(histd1BySs1AndResTypes, ssType, resTypeNext, resType)
                        if hist1 == None:
                            nTdebug('skipping for hist1 is empty for [%s] [%s] [%s]' % (ssType, resTypePrev, resType))
                            continue
                        if hist2 == None:
                            nTdebug('skipping for hist2 is empty for [%s] [%s] [%s]' % (ssType, resType, resTypeNext))
                            continue
                        m1 = mat(hist1, dtype='float')
                        m2 = mat(hist2, dtype='float')
                        m2 = m2.transpose()  # pylint: disable=E1101
                        hist2d = multiply(m1, m2)
                        cTuple = getEnsembleAverageAndSigmaHis(hist2d)
                        (_c_av, c_sd, _hisMin, _hisMax) = cTuple #@UnusedVariable
                        cTuple += tuple([str([ssType, resType, resTypePrev, resTypeNext])])  # append the hash keys as a way of id.
#                        nTdebug("For ssType %s residue types %s %s %s found (av/sd/min/max) %8.0f %8.0f %8.0f %8.0f" % (
#                            ssType, resType, resTypePrev, resTypeNext, c_av, c_sd, hisMin, hisMax))
                        if c_sd == None:
                            nTdebug('Failed to get c_sd when testing; not all residues are present in smaller sets.')
                            continue
                        if c_sd == 0.:
                            nTdebug('Got zero c_sd, ignoring histogram. This should only occur in smaller sets. Not setting values.')
                            continue
                        setDeepByKeys(histd1CtupleBySsAndResTypes, cTuple, ssType, resType, resTypePrev, resTypeNext)
    # end for isI

    keyListSorted1 = valueByResTypes.keys()
    keyListSorted1.sort()
    for resType in keyListSorted1:
        keyListSorted2 = valueByResTypes[resType].keys()
        keyListSorted2.sort()
        for prevResType in keyListSorted2:
            d1List = valueByResTypes[resType][prevResType]
            if not d1List:
                nTerror("Expected d1List from valueByResTypes[%s][%s]" % (resType, prevResType))
                continue
            hist1d, _bins, _patches = hist(d1List, bins=binCount, range=xRange)
#            nTmessage("Count %6d in valueByResTypes[%s][%s]" % (sum(hist1d), resType, prevResType))
            setDeepByKeys(histd1ByResTypes, hist1d, resType, prevResType)

    histd1, _bins, _patches = hist(value, bins=binCount, range=xRange)
    nTmessage("Count %6d in value" % sum(histd1))
#    setDeepByKeys(histd1, hist1d, resType, prevResType)

    if os.path.exists(dbase_file_abs_name):
        os.unlink(dbase_file_abs_name)
    output = open(dbase_file_abs_name, 'wb')
    dbase = {}
    dbase['histd1BySs0AndResTypes'] = histd1BySs0AndResTypes  # 92 kb uncompressed in the case of ~1000 lines only
    dbase['histd1BySs1AndResTypes'] = histd1BySs1AndResTypes
    dbase['histd1CtupleBySsAndResTypes'] = histd1CtupleBySsAndResTypes
    dbase['histd1ByResTypes'] = histd1ByResTypes  # 56 kb
    dbase['histd1BySs0'] = histd1BySs0  # 4 kb
    dbase['histd1BySs1'] = histd1BySs1
    dbase['histd1'] = histd1  # 4 kb
    cPickle.dump(dbase, output, 2)
    output.close()
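# In main() above, two 1-D histograms are combined into a 2-D count surface: hist1 holds the
# d1 distribution of the (i-1, i) residue pair and hist2 that of the (i, i+1) pair, and the
# broadcast multiply of the row vector m1 with the column vector m2 is an outer product of
# the two vectors. The sketch below restates that step with plain numpy; the unweighted
# mean/std is only a stand-in for getEnsembleAverageAndSigmaHis, whose exact weighting is not
# shown here, so treat this as an illustration of the shape of the computation.
def _demoCombineHistograms():
    import numpy as np

    binCount = 10
    hist1 = np.random.randint(0, 50, size=binCount).astype(float)  # as histd1BySs0AndResTypes[ss][rt][rtPrev]
    hist2 = np.random.randint(0, 50, size=binCount).astype(float)  # as histd1BySs1AndResTypes[ss][rtNext][rt]

    hist2d = np.outer(hist2, hist1)  # element [i, j] = hist2[i] * hist1[j], as the multiply(m1, m2) above
    c_av = hist2d.mean()
    c_sd = hist2d.std()
    print("av %8.1f sd %8.1f min %8.0f max %8.0f" % (c_av, c_sd, hist2d.min(), hist2d.max()))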