def get_pdb_object(self, pdb_id):
    '''Returns the PDB object stored for pdb_id, creating and storing it first if necessary.

       The lookup is logged using the identifier exactly as passed in; every cache access then uses the
       upper-cased identifier. If no PDB contents are stored for the identifier either, they are fetched
       with download_pdb when self.cache_dir is set and with retrieve_pdb otherwise.
    '''
    self.log_lookup('pdb object {0}'.format(pdb_id))
    key = pdb_id.upper()

    # Fast path: an object has already been built for this identifier
    if self.pdb_objects.get(key):
        return self.pdb_objects[key]

    if not self.pdb_contents.get(key):
        if self.cache_dir:
            fetched = download_pdb(key, self.cache_dir, silent = True)
        else:
            fetched = retrieve_pdb(key, silent = True)
        self.add_pdb_contents(key, fetched)

    self.add_pdb_object(key, PDB(self.pdb_contents[key]))
    return self.pdb_objects[key]
def retrieve(pdb_id, cache_dir = None, acceptable_sequence_percentage_match = 70.0, require_uniprot_residue_mapping = True, bio_cache = None):
    '''Builds a SIFTS handler for pdb_id and parses the record's SIFTS XML with it.

       The PDB file and the SIFTS XML are looked up in this order: bio_cache (if given), then files in
       cache_dir (if given), then a fresh retrieval. Freshly retrieved contents are written into cache_dir
       when it is set.
       bio_cache should be a klab.bio.cache.py::BioCache object and is used to avoid reading/downloading
       cached files repeatedly.

       Raises Exception if pdb_id is not four alphanumeric characters and MissingSIFTSRecord if the XML
       retrieval raises FTPException550.
    '''

    pdb_contents = xml_contents = None
    pdb_id = pdb_id.upper()
    l_pdb_id = pdb_id.lower()

    if not (len(pdb_id) == 4 and pdb_id.isalnum()):
        raise Exception("Bad PDB identifier '%s'." % pdb_id)

    # Cached file names: the PDB file uses the upper-case id, the SIFTS file the lower-case id
    pdb_name = "%s.pdb" % pdb_id
    sifts_name = "%s.sifts.xml.gz" % l_pdb_id

    # First choice: the in-memory cache
    if bio_cache:
        pdb_contents = bio_cache.get_pdb_contents(pdb_id)
        xml_contents = bio_cache.get_sifts_xml_contents(pdb_id)

    # Second choice: copies stored on disk
    if cache_dir:
        if not pdb_contents:
            pdb_path = os.path.join(cache_dir, pdb_name)
            if os.path.exists(pdb_path):
                pdb_contents = read_file(pdb_path)

        if not xml_contents:
            sifts_path = os.path.join(cache_dir, sifts_name)
            if os.path.exists(sifts_path):
                xml_contents = read_file(sifts_path)

    # Last resort: retrieve whatever is still missing, storing a copy on disk if appropriate
    if not pdb_contents:
        pdb_contents = rcsb.retrieve_pdb(pdb_id)
        if cache_dir:
            write_file(os.path.join(cache_dir, pdb_name), pdb_contents)

    if not xml_contents:
        try:
            xml_contents = retrieve_xml(pdb_id, silent = False)
            if cache_dir:
                write_file(os.path.join(cache_dir, sifts_name), xml_contents)
        except FTPException550:
            raise MissingSIFTSRecord('The file "%s.sifts.xml.gz" could not be found on the EBI FTP server.' % l_pdb_id)

    # The SIFTS object doubles as the SAX content handler for its own XML
    sifts = SIFTS(
        xml_contents,
        pdb_contents,
        acceptable_sequence_percentage_match = acceptable_sequence_percentage_match,
        cache_dir = cache_dir,
        require_uniprot_residue_mapping = require_uniprot_residue_mapping,
        bio_cache = bio_cache,
        pdb_id = pdb_id,
    )
    xml.sax.parseString(xml_contents, sifts)
    return sifts
def static_get_pdb_object(pdb_id, bio_cache = None, cache_dir = None):
    '''Returns a PDB object for pdb_id without requiring a BioCache.

       The identifier is upper-cased first. If bio_cache is given, the request is delegated to its
       get_pdb_object method. Otherwise a "<PDB ID>.pdb" file in cache_dir is used when it exists; failing
       that, the contents are fetched with retrieve_pdb and, if cache_dir is set, written there.
    '''
    pdb_id = pdb_id.upper()

    if bio_cache:
        return bio_cache.get_pdb_object(pdb_id)

    if not cache_dir:
        # No directory to read a cached copy from or to write one to
        return PDB(retrieve_pdb(pdb_id))

    # Prefer a previously stored copy of the PDB file
    cached_path = os.path.join(cache_dir, '{0}.pdb'.format(pdb_id))
    if os.path.exists(cached_path):
        return PDB.from_filepath(cached_path)

    # Fetch the file and keep a copy for next time
    fetched = retrieve_pdb(pdb_id)
    write_file(os.path.join(cache_dir, "%s.pdb" % pdb_id), fetched)
    return PDB(fetched)
def from_RCSB(cls, pdb_id, cut_off = 0.25, acc_array = 'Miller', tmp_dir = '/tmp', read_only = False):
    '''Alternate constructor: fetches pdb_id with retrieve_pdb, wraps the contents in a PDB object and
       passes that object to cls together with the remaining options, which are forwarded unchanged.
    '''
    pdb_contents = retrieve_pdb(pdb_id)
    structure = PDB(pdb_contents)
    options = dict(cut_off = cut_off, acc_array = acc_array, tmp_dir = tmp_dir, read_only = read_only)
    return cls(structure, **options)
def check_existing_complexes_by_name():
    '''Check whether any of the complexes exist in the database.

       Exploratory routine. It prints the names stored for every complex matching a list of protein names,
       pretty-prints the SEQRES sequences of 1IBR and 2BKU, and prints the positions at which two hard-coded
       sequences differ. It takes no arguments and has no return statement. The long comments record the
       output seen when this was originally run.
    '''

    # Ran is short for "RAs-related Nuclear protein" and is also known as "GTP-binding nuclear protein Ran"
    ppi_api = get_ppi_api()
    ids = ppi_api.get_complex_ids_matching_protein_name('gsp')
    ids.extend(ppi_api.get_complex_ids_matching_protein_name('ran'))
    ids.extend(ppi_api.get_complex_ids_matching_protein_name('ras'))

    # This gives us these complexes, amongst others:
    #
    # 77
    # Ran GTPase-GDP, Ran GTPase-GDP, Ran GTPase-GDP
    # Importin beta-1 subunit, Importin β1, Importin β1
    #
    # 119
    # Ran GTPase, Ran GTPase, Ran GTPase
    # Ran GAP, Ran GAP, Ran GAP
    #
    # 176
    # Ran GTPase-GDP, Ran GTPase-GDP, Ran GTPase-GDP
    # Regulator of chromosome condensation, RCC1, RCC1
    #
    # 202
    # Ran GTPase-GDP, Ran GTPase-GDP, Ran GTPase-GDP
    # Nuclear transport factor 2, NTF2, NTF2
    #
    # 29
    # Ras GTPase.GDP, Ras GTPase.GDP, Ras GTPase.GDP
    # Ras GAP, Ras GAP, Ras GAP
    #
    # 65
    # Ras GTPase.GTP, H-Ras, H-Ras
    # Son of sevenless-1, Sos, Sos
    #
    # 201
    # Ras GTPase, Ras GTPase, Ras GTPase
    # Phosphoinositide 3-kinase, PI3K, PI3K
    #
    # 280
    # Ras.GNP, Ras.GNP, Ras.GNP
    # RalGDS Ras-interacting domain, RalGDS RID, RalGDS RID

    # NOTE(review): the ids gathered above are discarded here; only the names queried below are reported.
    ids = []
    ids.extend(ppi_api.get_complex_ids_matching_protein_name('importin'))
    ids.extend(ppi_api.get_complex_ids_matching_protein_name('KARYOPHERIN'))
    ids.extend(ppi_api.get_complex_ids_matching_protein_name('TRANSPORTIN'))
    ids.extend(ppi_api.get_complex_ids_matching_protein_name('NTF2'))
    ids.extend(ppi_api.get_complex_ids_matching_protein_name('YRB1P'))
    ids.extend(ppi_api.get_complex_ids_matching_protein_name('RANBP1'))
    ids.extend(ppi_api.get_complex_ids_matching_protein_name('EXP5'))
    ids.extend(ppi_api.get_complex_ids_matching_protein_name('CSE1'))
    ids.extend(ppi_api.get_complex_ids_matching_protein_name('RANGAP'))
    ids.extend(ppi_api.get_complex_ids_matching_protein_name('RANBP2'))
    ids.extend(ppi_api.get_complex_ids_matching_protein_name('RCC1'))

    # Print the complex id followed by one line of names for each partner (L* fields, then R* fields).
    # NOTE(review): the loop variable `id` shadows the builtin of the same name inside this function.
    for id in ids:
        d = ppi_api.get_complex_details(id)
        colortext.warning(id)
        print('{0}, {1}, {2}'.format(d['LName'].encode('utf-8').strip(), d['LShortName'].encode('utf-8').strip(), d['LHTMLName'].encode('utf-8').strip()))
        print('{0}, {1}, {2}'.format(d['RName'].encode('utf-8').strip(), d['RShortName'].encode('utf-8').strip(), d['RHTMLName'].encode('utf-8').strip()))

    # This gives us these complexes:
    #
    # 77
    # Ran GTPase-GDP, Ran GTPase-GDP, Ran GTPase-GDP
    # Importin beta-1 subunit, Importin β1, Importin β1
    #
    # 202
    # Ran GTPase-GDP, Ran GTPase-GDP, Ran GTPase-GDP
    # Nuclear transport factor 2, NTF2, NTF2
    #
    # 176
    # Ran GTPase-GDP, Ran GTPase-GDP, Ran GTPase-GDP
    # Regulator of chromosome condensation, RCC1, RCC1
    #
    # SELECT DISTINCT `PDBFileID` FROM `PPIPDBPartnerChain` WHERE `PPComplexID` IN (77, 202, 176)
    # returns
    # 1F59, 1IBR, 1QG4, 1A12, 1OUN and 1I2M, 1A2K
    #
    # Some of these are unbound. Get the complexes:
    #
    # SELECT DISTINCT `PDBFileID` FROM `PPIPDBPartnerChain`
    # INNER JOIN PPIPDBSet ON PPIPDBPartnerChain.PPComplexID=PPIPDBSet.PPComplexID AND PPIPDBPartnerChain.SetNumber=PPIPDBSet.SetNumber
    # WHERE PPIPDBPartnerChain.PPComplexID IN (77, 202, 176) AND IsComplex=1
    #
    # returns only three hits:
    # complex #77 -> 1IBR (A|B);
    # complex #176 -> 1I2M (A|B) where Tina uses A|B (chains may be renamed); and
    # complex #202 -> 1A2K (C|AB) where Tina uses A|B (chains may be renamed).
    #
    # We also have:
    # complex #119 -> 1K5D (AB|C) where Tina uses A|B
    #
    # 1IBR -> Ran (human)|Importin β1 (human)
    # Tina has:
    # 2BKU -> RAN (dog)|Importin β1 (yeast)
    # 3EA5 -> RAN (human)|Importin β1 (yeast)
    # 3EA5 and 1IBR do not match on chains B at all and have one mutation in chain A
    # Similarly for 2BKU and 1IBR.
    #
    # However what came out of this is that 3EA5 and 2BKU are related i.e. that RAN is almost the same sequence in both.
    # The only difference is one mutation in chain A: index 40, A->P and that 3EA5 has a longer sequence for chain A
    #

    colortext.message('\n\n1IBR')
    p1 = PDB(retrieve_pdb('1IBR'))
    pprint.pprint(p1.seqres_sequences)
    colortext.message('\n\n2BKU')
    p2 = PDB(retrieve_pdb('2BKU'))
    pprint.pprint(p2.seqres_sequences)

    a1 = str(p1.seqres_sequences['A'])
    a2 = str(p2.seqres_sequences['A'])

    # NOTE(review): each assignment below overwrites the previous value, so the comparison that follows
    # runs on the last literal assigned to a1 and to a2, not on the sequences read from p1 and p2.

    #3EA5
    a1 = 'MAAQGEPQVQFKLVLVGDGGTGKTTFVKRHLTGEFEKKYVATLGVEVHPLVFHTNRGPIKFNVWDTAGQEKFGGLRDGYYIQAQCAIIMFDVTSRVTYKNVPNWHRDLVRVCENIPIVLCGNKVDIKDRKVKAKSIVFHRKKNLQYYDISAKSNYNFEKPFLWLARKLIGDPNLEFVAMPCLAPPEVVMDPALAAQYEHDLEVAQTTALPDEDDDL'
    a1 = 'MSTAEFAQLLENSILSPDQNIRLTSETQLKKLSNDNFLQFAGLSSQVLIDENTKLEGRILAALTLKNELVSKDSVKTQQFAQRWITQVSPEAKNQIKTNALTALVSIEPRIANAAAQLIAAIADIELPHGAWPELMKIMVDNTGAEQPENVKRASLLALGYMCESADPQSQALVSSSNNILIAIVQGAQSTETSKAVRLAALNALADSLIFIKNNMEREGERNYLMQVVCEATQAEDIEVQAAAFGCLCKIMSKYYTFMKPYMEQALYALTIATMKSPNDKVASMTVEFWSTICEEEIDIAYELAQFPQSPLQSYNFALSSIKDVVPNLLNLLTRQNEDPEDDDWNVSMSAGACLQLFAQNCGNHILEPVLEFVEQNITADNWRNREAAVMAFGSIMDGPDKVQRTYYVHQALPSILNLMNDQSLQVKETTAWCIGRIADSVAESIDPQQHLPGVVQACLIGLQDHPKVATNCSWTIINLVEQLAEATPSPIYNFYPALVDGLIGAANRIDNEFNARASAFSALTTMVEYATDTVAETSASISTFVMDKLGQTMSVDENQLTLEDAQSLQELQSNILTVLAAVIRKSPSSVEPVADMLMGLFFRLLEKKDSAFIEDDVFYAISALAASLGKGFEKYLETFSPYLLKALNQVDSPVSITAVGFIADISNSLEEDFRRYSDAMMNVLAQMISNPNARRELKPAVLSVFGDIASNIGADFIPYLNDIMALCVAAQNTKPENGTLEALDYQIKVLEAVLDAYVGIVAGLHDKPEALFPYVGTIFQFIAQVAEDPQLYSEDATSRAAVGLIGDIAAMFPDGSIKQFYGQDWVIDYIKRTRSGQLFSQATKDTARWAREQQKRQLSL'

    #2BKU
    a2 = 'MAAQGEPQVQFKLVLVGDGGTGKTTFVKRHLTGEFEKKYVPTLGVEVHPLVFHTNRGPIKFNVWDTAGQEKFGGLRDGYYIQAQCAIIMFDVTSRVTYKNVPNWHRDLVRVCENIPIVLCGNKVDIKDRKVKAKSIVFHRKKNLQYYDISAKSNYNFEKPFLWLARKLIGDPNLEFV'
    a2 = 'MSTAEFAQLLENSILSPDQNIRLTSETQLKKLSNDNFLQFAGLSSQVLIDENTKLEGRILAALTLKNELVSKDSVKTQQFAQRWITQVSPEAKNQIKTNALTALVSIEPRIANAAAQLIAAIADIELPHGAWPELMKIMVDNTGAEQPENVKRASLLALGYMCESADPQSQALVSSSNNILIAIVQGAQSTETSKAVRLAALNALADSLIFIKNNMEREGERNYLMQVVCEATQAEDIEVQAAAFGCLCKIMSKYYTFMKPYMEQALYALTIATMKSPNDKVASMTVEFWSTICEEEIDIAYELAQFPQSPLQSYNFALSSIKDVVPNLLNLLTRQNEDPEDDDWNVSMSAGACLQLFAQNCGNHILEPVLEFVEQNITADNWRNREAAVMAFGSIMDGPDKVQRTYYVHQALPSILNLMNDQSLQVKETTAWCIGRIADSVAESIDPQQHLPGVVQACLIGLQDHPKVATNCSWTIINLVEQLAEATPSPIYNFYPALVDGLIGAANRIDNEFNARASAFSALTTMVEYATDTVAETSASISTFVMDKLGQTMSVDENQLTLEDAQSLQELQSNILTVLAAVIRKSPSSVEPVADMLMGLFFRLLEKKDSAFIEDDVFYAISALAASLGKGFEKYLETFSPYLLKALNQVDSPVSITAVGFIADISNSLEEDFRRYSDAMMNVLAQMISNPNARRELKPAVLSVFGDIASNIGADFIPYLNDIMALCVAAQNTKPENGTLEALDYQIKVLEAVLDAYVGIVAGLHDKPEALFPYVGTIFQFIAQVAEDPQLYSEDATSRAAVGLIGDIAAMFPDGSIKQFYGQDWVIDYIKRTRSGQLFSQATKDTARWAREQQKRQLSL'

    print(a1 == a2)
    if not a1 == a2:
        # horribly inefficient (casting to str each time) but not worth rewriting
        # The position-wise report below requires sequences of equal length
        assert(len(a1) == len(a2))
        for x in range(len(a1)):
            if str(a1)[x] != str(a2)[x]:
                print(x, str(a1)[x], str(a2)[x])

        # one mutation A->C near the end of the sequence: VAMPALAP -> VAMPCLAP

    # Within each structure, chain A must have the same SEQRES sequence as chain C, and chain B as chain D
    assert(str(p1.seqres_sequences['A']) == str(p1.seqres_sequences['C']))
    assert(str(p1.seqres_sequences['B']) == str(p1.seqres_sequences['D']))
    assert(str(p2.seqres_sequences['A']) == str(p2.seqres_sequences['C']))
    assert(str(p2.seqres_sequences['B']) == str(p2.seqres_sequences['D']))
    print('')
errors.append('No input files were specified.') else: for batch_file_selector in args: if '*' in batch_file_selector or '?' in batch_file_selector: batch_files += map(os.path.abspath, glob.glob(batch_file_selector)) elif os.path.isdir(batch_file_selector): for input_file_wildcard in input_file_wildcards: batch_files += map(os.path.abspath, glob.glob(os.path.join(batch_file_selector, input_file_wildcard))) elif not os.path.exists(batch_file_selector): if len(batch_file_selector) == 4 and batch_file_selector.isalnum(): batch_file_selector = batch_file_selector.lower() # the files are named in lowercase on the cluster if not os.path.exists('/netapp/database'): # This script is not being run on the cluster - try to retrieve the file from the RCSB colortext.message('No file %s exists - assuming that this is a PDB ID and trying to retrieve the associated file from the RCSB.' % batch_file_selector) try: fname = write_temp_file('/tmp', retrieve_pdb(batch_file_selector), suffix = '.pdb', prefix = batch_file_selector) batch_files.append(os.path.abspath(fname)) temp_files.append(os.path.abspath(fname)) except: errors.append('An error occurred retrieving the PDB file "%s".' % batch_file_selector) else: # We are on the cluster so try to retrieve the stored file colortext.message('No file %s exists - assuming that this is a PDB ID and trying to retrieve the associated file from the cluster mirror of the PDB database.' % batch_file_selector) if os.path.exists('/netapp/database/pdb/remediated/uncompressed_files/pdb%s.ent' % batch_file_selector): batch_files.append('/netapp/database/pdb/remediated/uncompressed_files/pdb%s.ent' % batch_file_selector) elif os.path.exists('/netapp/database/pdb/pre-remediated/uncompressed_files/pdb%s.ent' % batch_file_selector): batch_files.append('/netapp/database/pdb/pre-remediated/uncompressed_files/pdb%s.ent' % batch_file_selector) else: errors.append('Could not find a PDB file for argument "%s".' 
% batch_file_selector) missing_files.append(batch_file_selector) else: