def getCasdNmrProjectInfo(casdNmrRefFile=None):
    """
    Code to get list of CASD-NMR projects info

    The page text is handed to casdUtil.parseHtmlTable, which is expected to
    return a table (sequence of rows) and a matching sequence of per-row
    links. The first table row supplies the column tags; every later row is
    turned into one dictionary.

    casdNmrRefFile: path of the reference copy of the data page. When empty,
                    'dataPage.html' inside allDataDir is used.

    Returns a list with one dictionary per data row. Each dictionary maps the
    column tags to that row's values and has an extra 'DataLink' key holding
    the first link recorded for the row.

    NOTE(review): a second function with this same name is defined further
    down in this file; at import time the later definition rebinds the name.
    Confirm which variant callers are meant to get.
    """

    # This file is customisable!
    if not casdNmrRefFile:
        casdNmrRefFile = os.path.join(allDataDir, 'dataPage.html')

    # Get the web page...
    pageLines = getReferenceTextFileFromHttp(casdNmrDataUrl, casdNmrRefFile,
                                             refText="CASD-NMR data",
                                             isGzipped=False)
    text = ''.join(pageLines)

    table, links = casdUtil.parseHtmlTable(text)
    tags = table[0]

    result = []
    # Row 0 holds the tags, so data rows start at index 1. The index is kept
    # (rather than zipping) because links is addressed with the same index.
    for rowIndex in range(1, len(table)):
        rowInfo = {}
        for colIndex, val in enumerate(table[rowIndex]):
            rowInfo[tags[colIndex]] = val
        rowInfo['DataLink'] = links[rowIndex][0]
        result.append(rowInfo)

    return result
def getBmrbPdbMappingInfo(dataFilePath=None):
    """
    Read the BMRB ID to PDB code mapping.

    NOTE: this gives UNIQUE mappings between PDB and BMRB ID!
    Use getBmrbDatabaseMatches for list of all mappings.

    Every non-blank line must consist of exactly two comma-separated fields,
    'bmrbId,pdbCode'; any other shape raises ValueError (from the unpacking
    or from int()).

    dataFilePath: path handed to getReferenceTextFileFromHttp; defaults to
                  bmrbPdbMappingFilePath when empty.

    Returns a dictionary mapping integer BMRB ID to lower-cased PDB code.
    Only the first PDB code seen for a BMRB ID is kept; later ones are
    reported on stdout and ignored.
    """

    if not dataFilePath:
        dataFilePath = bmrbPdbMappingFilePath

    dataLines = getReferenceTextFileFromHttp(bmrbPdbMappingUrl, dataFilePath,
                                             refText="BMRB to PDB mappings")

    bmrbPdbMappingDict = {}

    for dataLine in dataLines:
        # Strip BEFORE testing for emptiness: a line holding only a newline
        # is truthy but has no fields, and would break the unpacking below.
        dataLine = (dataLine or '').strip()
        if not dataLine:
            continue

        (bmrbId, pdbCode) = dataLine.split(',')
        bmrbId = int(bmrbId)

        if bmrbId not in bmrbPdbMappingDict:
            bmrbPdbMappingDict[bmrbId] = pdbCode.lower()
        else:
            # This should never happen... raise it?
            # Single-argument print(...) behaves the same as a statement and
            # as a function call.
            print("MULTIPLE MATCH FOR %d!!" % bmrbId)

    return bmrbPdbMappingDict
def getPdbChainInfo():
    """
    Read info on chain information of PDB entries

    The data is read as whitespace-separated columns. A line whose first
    column starts with '>' is a chain header:
      - column 0, minus the '>', is split on '_' into PDB code and chain code;
      - column 1 is not stored;
      - column 2, minus its first 7 characters, is passed to returnInt to
        give the chain length (presumably a 'length:' prefix - confirm
        against the data file);
      - the remaining columns, joined by single spaces, form the molecule
        name.
    For any other non-blank line the first column is stored as the third
    element of the most recent header's record.

    Returns a dictionary
        pdbCode -> chainCode -> [chainLength, molName, thirdElement]
    where thirdElement is None until a non-header line follows the header.
    """

    pdbChainInfoDict = {}

    dataLines = getReferenceTextFileFromHttp(pdbChainInfoUrl,
                                             pdbChainInfoFilePath,
                                             refText = "PDB entry chain information",
                                             isGzipped = True)

    # Record created by the most recent header line; None until one is seen.
    currentChain = None

    for dataLine in dataLines:

        cols = dataLine.split()

        # Blank lines carry no information (and have no cols[0] to look at).
        if not cols:
            continue

        if cols[0][0] == '>':
            (pdbCode, chainCode) = cols[0][1:].split("_")
            chainLength = returnInt(cols[2][7:])
            molName = ' '.join(cols[3:])

            currentChain = [chainLength, molName, None]
            pdbChainInfoDict.setdefault(pdbCode, {})[chainCode] = currentChain

        elif currentChain is not None:
            # currentChain is the very list stored in the dictionary, so
            # this updates the entry of the preceding header in place.
            currentChain[2] = cols[0]

    return pdbChainInfoDict
def getPdbEntryType(dataFilePath=None, updateFile=False):
    """
    Read info on experiment type of PDB entries

    dataFilePath: file holding the information; defaults to
                  pdbEntryTypeFilePath when empty.
    updateFile:   when true, or when dataFilePath does not exist, the lines
                  are obtained through getReferenceTextFileFromHttp instead
                  of being read from the existing file.

    Only lines with exactly three whitespace-separated columns are used;
    everything else is ignored.

    Returns a dictionary mapping pdbCode to a (molType, expType) tuple.
    """

    if not dataFilePath:
        dataFilePath = pdbEntryTypeFilePath

    if updateFile or not os.path.exists(dataFilePath):
        dataLines = getReferenceTextFileFromHttp(pdbEntryTypeUrl, dataFilePath,
                                                 refText = "PDB entry experiment type information")
    else:
        fin = open(dataFilePath)
        try:
            dataLines = fin.readlines()
        finally:
            # Close the handle even when reading fails.
            fin.close()

    pdbEntryTypeDict = {}

    for dataLine in dataLines:
        cols = dataLine.split()

        if len(cols) == 3:
            (pdbCode, molType, expType) = cols
            # molType is prot, nuc or prot-nuc
            # expType is diffraction, NMR, or EM
            pdbEntryTypeDict[pdbCode] = (molType, expType)

    return pdbEntryTypeDict
def getCasdNmrProjectInfo(casdNmrRefFile=None):
    """
    Code to get list of CASD-NMR projects info

    Scans the lines of the CASD-NMR data page. Only lines containing 'href'
    are looked at, and of those:
      - lines containing 'assignment-software' are ignored;
      - lines containing 'rutgers', 'Data for' or '/wenmr/files/files/'
        start a new project entry;
      - other lines containing 'structureId=' contribute the four-character
        alphanumeric link text to the most recent project entry.

    casdNmrRefFile: path of the reference copy of the data page. When empty,
                    'reference/dataPage.html' inside casdNmrDataDir is used.

    Returns a list of [linkText, href, codes] lists, where linkText is the
    text of the link (with '/wenmr' replaced by eNmrUrl), href is the value
    of the href attribute and codes is the list of collected codes.

    Raises CasdNmrError when no project entry was found at all.
    """

    # Raw strings, without the superfluous backslashes of the earlier
    # patterns; they match exactly the same text.
    hrefPatt = re.compile(r'<a href="([^"]+)"')
    hrefNamePatt = re.compile(r'[^ ]">([^>]+)</a')
    pdbCodePatt = re.compile(r'>\s*([A-Za-z0-9]{4})\s*</a')

    # This file is customisable!
    if not casdNmrRefFile:
        casdNmrRefFile = os.path.join(casdNmrDataDir, 'reference', 'dataPage.html')

    # Get the web page...
    dataLines = getReferenceTextFileFromHttp(casdNmrDataUrl, casdNmrRefFile,
                                             refText="CASD-NMR data",
                                             isGzipped=False)

    # Now get the info out...
    projectInfo = []

    for dataLine in dataLines:

        if 'href' not in dataLine or "assignment-software" in dataLine:
            continue

        # Some custom hacking here - content of href lines with data not dependable enough...
        if ("rutgers" in dataLine or 'Data for' in dataLine
                or '/wenmr/files/files/' in dataLine):

            hrefSearch = hrefPatt.search(dataLine)

            # The link text is looked up after dropping <span> wrappers.
            cleanedLine = dataLine.replace("<span>", "").replace("</span>", "")
            projectNameSearch = hrefNamePatt.search(cleanedLine)

            # Both pieces are needed; a line where either pattern fails is
            # skipped instead of dereferencing a failed (None) match.
            if not (hrefSearch and projectNameSearch):
                continue

            # NOTE(review): despite its name, urlName holds the link TEXT,
            # and the '/wenmr' substitution is applied to it rather than to
            # the href - confirm this is intended.
            urlName = projectNameSearch.group(1)
            if "/wenmr" in urlName:
                urlName = urlName.replace("/wenmr", eNmrUrl)

            projectInfo.append([urlName, hrefSearch.group(1), []])

        elif "structureId=" in dataLine:

            pdbCodeSearch = pdbCodePatt.search(dataLine)

            # A code can only be attached once a project entry exists.
            if pdbCodeSearch and projectInfo:
                projectInfo[-1][-1].append(pdbCodeSearch.group(1))

    if not projectInfo:
        raise CasdNmrError("No files found, probably web page change!")

    return projectInfo
def getBmrbDatabaseMatches(dataFilePath=None):
    """
    Read info on relation between BMRB and other database entries...

    Each usable line has exactly 11 comma-separated fields; only the first
    three (BMRB ID, database name, database code) are used, with surrounding
    double quotes removed. Lines with a different number of fields, or with
    an empty BMRB ID, are ignored. A non-numeric BMRB ID raises ValueError.

    dataFilePath: path handed to getReferenceTextFileFromHttp; defaults to
                  bmrbDatabaseMatchFilePath when empty.

    Returns a dictionary
        bmrbId (int) -> dbName -> [dbCode, ...]
    with the codes in the order in which they were read.
    """

    if not dataFilePath:
        dataFilePath = bmrbDatabaseMatchFilePath

    dataLines = getReferenceTextFileFromHttp(bmrbDatabaseMatchUrl, dataFilePath,
                                             refText="BMRB database matches")

    bmrbDatabaseDict = {}

    for dataLine in dataLines:

        if not dataLine:
            continue

        values = dataLine.strip().split(',')

        # Ignore lines with inconsistent length
        if len(values) != 11:
            continue

        # Fields 3-10 (match data, match score, method, resolution and four
        # unidentified columns) are not used.
        bmrbId, dbName, dbCode = [value.strip('"') for value in values[:3]]

        if not bmrbId:
            continue

        bmrbId = int(bmrbId)

        bmrbDatabaseDict.setdefault(bmrbId, {}).setdefault(dbName, []).append(dbCode)

    return bmrbDatabaseDict
def getBmrbNmrGridDict(dataFilePath=None, skipCodes=None):
    """
    Read info on DOCR/FRED from BMRB...

    Every non-blank line must hold five whitespace-separated columns
    (grid ID, BMRB ID, status code, PDB code, date), for example:

      1047 1017 classified 1a51 2004-12-02
      26561 1064 parsed 1ajw 2004-12-03
      39888 1263 filtered 1c54 2004-12-03
      39889 1266 converted 1c7v 2004-12-03

    Where classified means there is a constraint file, but it's not been
    parsed; rest is obvious: converted DOCR, filtered FRED.
    A non-blank line with another number of columns raises ValueError.

    dataFilePath: path handed to getReferenceTextFileFromHttp; defaults to
                  nmrGridInfoFile inside nmrGridReferenceDir when empty.
    skipCodes:    optional container of PDB codes to leave out.

    Returns a dictionary
        pdbCode -> status -> web link
    where status is statusConvert[statusCode]. Lines whose status code is
    not in statusConvert are ignored, and only the first link found for a
    given PDB code and status is kept.
    """

    if not dataFilePath:
        dataFilePath = os.path.join(nmrGridReferenceDir, nmrGridInfoFile)

    dataLines = getReferenceTextFileFromHttp(nmrGridInfoUrl, dataFilePath,
                                             refText="NMR GRID information")

    bmrbNmrGridDict = {}

    for dataLine in dataLines:

        if not dataLine:
            continue

        cols = dataLine.split()

        # A line with only whitespace (e.g. a bare newline) is truthy but
        # has no columns; skip it rather than fail the unpacking below.
        if not cols:
            continue

        # gridId, bmrbId and date are not used, but unpacking all five
        # keeps the check on the number of columns.
        (gridId, bmrbId, statusCode, pdbCode, date) = cols

        if skipCodes and pdbCode in skipCodes:
            continue

        if statusCode not in statusConvert:
            continue

        statusLinks = bmrbNmrGridDict.setdefault(pdbCode, {})
        status = statusConvert[statusCode]

        if status not in statusLinks:
            statusLinks[status] = '%s?pdb_id=%s&min_items=0&block_text_type=%s' % (
                nmrGridServletUrl, pdbCode, statusCode)

    return bmrbNmrGridDict
def getXrayResolution():
    """
    Read info on X-ray resolution PDB entries

    Only lines made up of exactly three whitespace-separated fields, with
    ';' as the middle one, are used: the first field is the PDB code and
    the third the resolution. A third field that is not a number raises
    ValueError.

    Returns a dictionary mapping lower-cased PDB code to the resolution as
    a float. Entries whose resolution equals -1.0 are left out (presumably
    a 'not available' marker - confirm against the data file).
    """

    dataLines = getReferenceTextFileFromHttp(xrayResolutionUrl,
                                             xrayResolutionFilePath,
                                             refText = "PDB x-ray resolution information")

    resolutionDict = {}

    for dataLine in dataLines:

        fields = dataLine.split()
        if len(fields) != 3:
            continue

        entryCode, separator, resolutionText = fields
        if separator != ';':
            continue

        resolution = float(resolutionText)
        if resolution == -1.0:
            continue

        resolutionDict[entryCode.lower()] = resolution

    return resolutionDict
def getBmrbInfo(dataType=None):
    """
    Read per-entry information of one data type from a BMRB query grid page.

    dataType: one of 'rdc', 'coupling', 't1', 't2', 'hetNuclNoe', 'shift',
              'orderParam', 'hExchange' or 'hProtection'.

    An entry starts at a line containing 'data_library/generate_summary'.
    From that line on, one value is read per line (the first text found
    between '>' and '<'), one for each value name of the data type, in
    order. A value of ' ' becomes False, 'X' becomes True, and values whose
    name starts with 'num' are converted with int(). The line directly
    after the last value of an entry is skipped without being examined.

    An entry for which a line has no value (that line is printed) or which
    is cut short by the end of the data is left out.

    Returns None when dataType is empty or unknown (the latter is reported
    on stdout); otherwise a dictionary
        bmrbId (int) -> {valueName: value}
    in which the 'bmrbId' and 'name' values are kept as text unless they
    are ' ' or 'X', and the inner dictionary has no 'bmrbId' key.
    """

    # dataType -> (query page, names of the count columns). Every page
    # starts with 'bmrbId' and 'name' and ends with the three has* flags.
    queryInfo = {
        'rdc': ("query_1_67.html", ('numRdcs',)),
        'coupling': ("query_1_29.html", ('numCouplings',)),
        't1': ("query_1_33.html", ('numT1s',)),
        't2': ("query_1_38.html", ('numT2s',)),
        'hetNuclNoe': ("query_1_41.html", ('numHetNuclNoes',)),
        'shift': ("query_1_5_4.html", ('numShifts_1H', 'numShifts_13C',
                                        'numShifts_15N', 'numShifts_31P')),
        'orderParam': ("query_1_45.html", ('numOrderParams',)),
        'hExchange': ("query_1_49.html", ('numHExchangeValues',)),
        'hProtection': ("query_1_53.html", ('numHProtectionValues',)),
    }

    if not dataType:
        return None

    if dataType not in queryInfo:
        # Single-argument print(...) behaves the same as a statement and as
        # a function call.
        print("Unknown data type %s, aborting..." % dataType)
        return None

    bmrbQueryFile, countNames = queryInfo[dataType]
    valueNames = ('bmrbId', 'name') + countNames + ('hasProtein', 'hasDNA', 'hasRNA')

    bmrbQueryUrl = "%s/search/query_grid/%s" % (bmrbUrl, bmrbQueryFile)
    bmrbQueryFilePath = os.path.join(bmrbReferenceDir, bmrbQueryFile)

    entryLinePatt = re.compile("data_library/generate_summary")
    valuePatt = re.compile(r'>([^<>]+)<')

    bmrbValuesDict = {}

    dataLines = getReferenceTextFileFromHttp(bmrbQueryUrl, bmrbQueryFilePath,
                                             refText="BMRB %s information" % dataType)

    lineIndex = 0
    numLines = len(dataLines)

    while lineIndex < numLines:

        if entryLinePatt.search(dataLines[lineIndex]):

            infoDict = {}

            for valueName in valueNames:

                # Data ended in the middle of an entry.
                if lineIndex >= numLines:
                    break

                dataLine = dataLines[lineIndex]
                valueSearch = valuePatt.search(dataLine)

                if not valueSearch:
                    # Show the offending line and give up on this entry.
                    print(dataLine)
                    break

                value = valueSearch.group(1)

                # NOTE(review): the two literals below look identical in
                # this view of the file; one may originally have been a
                # non-breaking space or '&nbsp;' - confirm against the
                # served HTML.
                if value in (' ', ' '):
                    value = False
                elif value == 'X':
                    value = True
                elif valueName[:3] == 'num':
                    value = int(value)

                infoDict[valueName] = value
                lineIndex += 1

            # Only complete entries are recorded.
            if len(infoDict) == len(valueNames):
                bmrbId = int(infoDict.pop('bmrbId'))
                bmrbValuesDict[bmrbId] = infoDict

        # Also steps over the line that follows a complete entry.
        lineIndex += 1

    return bmrbValuesDict