def download_pdb(config, pdb_code: str) -> Path:
    """
    Download a PDB structure from the PDB and store it as ``<pdb_code>.pdb``.

    :param config: Configuration object. Its ``pdb_dir`` attribute is the
        download directory; it is set to ``/tmp/`` when falsy.
    :param pdb_code: 4 character PDB accession code.
    :type pdb_code: str
    :raises FileNotFoundError: if no structure file is present after the
        download attempt.
    :return: returns filepath to downloaded structure.
    :rtype: Path
    """
    if not config.pdb_dir:
        config.pdb_dir = Path("/tmp/")
    pdb_dir = Path(config.pdb_dir)

    # Initialise class and download pdb file
    pdbl = PDBList()
    pdbl.retrieve_pdb_file(
        pdb_code, pdir=pdb_dir, overwrite=True, file_format="pdb"
    )

    # Biopython names the file after the lower-cased accession code, so the
    # lookup must not depend on the case the caller used.
    downloaded = pdb_dir / f"pdb{pdb_code.lower()}.ent"
    target = pdb_dir / f"{pdb_code}.pdb"

    # Explicit check instead of ``assert`` (asserts vanish under ``-O``).
    if not downloaded.is_file():
        raise FileNotFoundError(
            f"PDB file for {pdb_code} was not downloaded to {pdb_dir}"
        )

    # Rename file to .pdb from .ent, replacing any previous copy.
    os.replace(downloaded, target)

    log.info(f"Downloaded PDB file for: {pdb_code}")
    return target
def get_pdb(pdb_list):
    """Download the PDB-format files for the given identifiers.

    Only the first four characters of each identifier are used as the PDB
    code. Files go to the ``PDB_benchmark_structures`` directory.

    :param pdb_list: iterable of identifiers (strings).
    :return: list of the paths returned by ``PDBList.retrieve_pdb_file``.
    """
    import os
    from Bio.PDB import PDBList

    # join(..., "") appends the platform's own separator instead of a
    # hard-coded Windows backslash.
    out_dir = os.path.join("PDB_benchmark_structures", "")
    filename = []
    not_found = []
    pdbl = PDBList()  # one downloader is enough for the whole batch

    print("Downloading in %s:\n" % out_dir)
    for pdbid in pdb_list:
        print('%s' % pdbid[:4])
        try:
            # Skip identifiers for which a "<id>.pdb" file was already placed
            # in the output directory.
            if not os.path.exists("{}{}.pdb".format(out_dir, pdbid)):
                x = pdbl.retrieve_pdb_file(pdbid[:4], file_format='pdb',
                                           pdir=out_dir)
                filename.append(x)
        except FileNotFoundError:
            not_found.append(pdbid)
            print("(NOTE) {} not found.".format(pdbid))
    return filename
def retrieve_cif_list():
    """Download, as mmCIF, every PDB code listed in ``input_files/protlist.txt``.

    The list file is split on whitespace; each token is treated as one code.
    Files are written to ``input_files/cif`` and existing files are
    overwritten.
    """
    server = PDBList(server='ftp://ftp.wwpdb.org', pdb='input_files',
                     obsolete_pdb=None, verbose=True)
    # ``with`` guarantees the list file is closed even if reading fails.
    with open('input_files/protlist.txt', 'r') as pdb_list:
        content = pdb_list.read().split()
    server.download_pdb_files(content, pdir="input_files/cif",
                              file_format='mmCif', overwrite=True,
                              obsolete=False)
def get_unique(input_df):
    """Download a PDB-format file for each distinct value of ``input_df.CPX``.

    Files are written to the ``PDB`` directory.
    """
    from Bio.PDB import PDBList

    codes = input_df.CPX.unique()
    downloader = PDBList()
    for code in codes:
        downloader.retrieve_pdb_file(code, pdir='PDB', file_format="pdb")
def download_pdbs(base_dir, protein_codes):
    """
    Downloads the PDB database (or a part of it). Every protein is stored in
    its own directory (named after the upper-cased PDB code) under base_dir.

    :param base_dir: where to download all the proteins.
    :param protein_codes: the PDB codes of the proteins that should be
        downloaded; either an iterable of codes or a dict whose values are
        lists of codes. Duplicates are removed.
    """
    if isinstance(protein_codes, dict):
        prot_codes = []
        for codes in protein_codes.values():
            prot_codes += codes
    else:
        prot_codes = protein_codes
    prot_codes = list(set(prot_codes))

    from Bio.PDB import PDBList

    failed = 0
    attempted = len(prot_codes)
    for code in prot_codes:
        try:
            pl = PDBList(pdb=os.path.join(base_dir, code.upper()))
            pl.flat_tree = 1
            # NOTE(review): no file_format is passed, so the file type is
            # whatever the installed Biopython defaults to - confirm it
            # matches what downstream code expects.
            path = pl.retrieve_pdb_file(pdb_code=code)
        except IOError:
            path = None
        # Biopython reports a failed download on stdout instead of raising,
        # so success is decided by the file actually being on disk.
        if not path or not os.path.exists(path):
            log.warning("Failed to download protein {}".format(code))
            failed += 1
    log.info("Downloaded {0}/{1} molecules".format(attempted - failed,
                                                   attempted))
def get_structure(pdb_data_folder, structure_PDB_ID):
    """
    Function to retrieve information about structure of specified molecule

    Looks for ``<ID>.pdb`` and ``<ID>.cif`` in the folder. If neither is a
    non-empty file, a download is attempted first. When both exist, the
    mmCIF result takes precedence.

    :param pdb_data_folder: path to folder containing all 3D structures
        (must support the ``/`` operator, e.g. ``pathlib.Path``)
    :param structure_PDB_ID: structures PDB ID
    :raises FileNotFoundError: if no structure file could be found or parsed
    :return: the structure object
    """
    pdb_path = pdb_data_folder / (structure_PDB_ID + ".pdb")
    cif_path = pdb_data_folder / (structure_PDB_ID + ".cif")

    if (not is_non_zero_file(pdb_path)) and (not is_non_zero_file(cif_path)):
        # NOTE(review): Biopython stores downloads under the lower-cased
        # code, so an upper-case ID may not be found by the checks below on
        # a case-sensitive file system - confirm how callers spell IDs.
        PDBList().retrieve_pdb_file(structure_PDB_ID, pdir=pdb_data_folder)

    structure = None
    if is_non_zero_file(pdb_path):
        structure = PDBParser().get_structure(structure_PDB_ID, pdb_path)
    if is_non_zero_file(cif_path):
        try:
            structure = MMCIFParser(QUIET=True).get_structure(
                structure_PDB_ID, cif_path)
        except Exception:
            # mmCIF parsing failed: fall back to the PDB-format file and
            # parse the path Biopython actually wrote ("pdb<id>.ent"), not a
            # guessed "<ID>.pdb".
            fallback = PDBList().retrieve_pdb_file(
                structure_PDB_ID, pdir=pdb_data_folder, file_format='pdb')
            structure = PDBParser().get_structure(structure_PDB_ID, fallback)

    if structure is None:
        raise FileNotFoundError(
            "No structure file available for " + str(structure_PDB_ID))
    return structure
def from_id(cls, pdb_id):
    """
    Initialize structure by PDB ID (fetches structure from RCSB servers)

    Parameters
    ----------
    pdb_id : str
        PDB identifier (e.g. 1hzx)

    Returns
    -------
    PDB
        initialized PDB structure

    Raises
    ------
    ResourceError
        If the structure file could not be fetched
    """
    import os
    from urllib.error import URLError
    from Bio.PDB import PDBList

    pdblist = PDBList()

    try:
        # download PDB file to temporary directory; the format is requested
        # explicitly so it always matches the parser selected below
        pdb_file = pdblist.retrieve_pdb_file(
            pdb_id, pdir=tempdir(), file_format="pdb"
        )
        # Biopython reports failed downloads on stdout rather than raising,
        # so verify that the file is really there
        if not pdb_file or not os.path.exists(pdb_file):
            raise ResourceError(
                "Could not fetch PDB data for {}".format(pdb_id)
            )
        return cls.from_file(pdb_file, file_format="pdb")
    except URLError as e:
        raise ResourceError(
            "Could not fetch PDB data for {}".format(pdb_id)
        ) from e
def build_xtalstable(dbpathstr, sourcedbpathstrs, pdbformat='cif'):
    """Download structures referenced by source databases and log them in XTALS.

    For every source database, the ``pdbids`` column of ``CAZYSEQDATA`` is
    scanned for 4-character ids followed by ``[``. Those ids are downloaded
    into an ``Xtals`` folder next to the target database, and one XTALS row
    per id is inserted, or updated if an earlier attempt had failed.

    :param dbpathstr: path of the database that holds the XTALS table.
    :param sourcedbpathstrs: iterable of paths to the source databases.
    :param pdbformat: value stored in the ``pdbformat`` column.
        NOTE(review): it is not passed to the downloader and the file suffix
        checked is always ``.cif`` - confirm this is intended.
    """
    dbpath = Path(dbpathstr)
    todaystr = datetime.date.today().isoformat()
    conn = seqdbutils.gracefuldbopen(dbpath)
    # by default the folder containing xtals is named 'Xtals' and sits in the
    # same directory as the xtals db
    xtaldirpath = dbpath.parent / 'Xtals'
    if not xtaldirpath.exists():
        os.mkdir(xtaldirpath)
    c = conn.cursor()
    c.execute(
        '''CREATE TABLE IF NOT EXISTS XTALS (pdbid text, acc text, srcdb text,dldate text, relpath text, pdbformat text, dlsuccess int, obsolete int)''')
    pdbl = PDBList()
    # raw string: "\[" is not a valid escape in a normal string literal
    cxRE = re.compile(r'([A-Za-z0-9]{4})\[')

    for srcdbpathstr in sourcedbpathstrs:
        srcdbpath = Path(srcdbpathstr)
        srcdbstr = srcdbpath.name
        src_conn = seqdbutils.gracefuldbopen(srcdbpath)
        try:
            seqdbutils.check_tables_exist(src_conn, ['CAZYSEQDATA'])
            src_c = src_conn.cursor()
            src_c.execute(
                'SELECT acc,pdbids FROM CAZYSEQDATA WHERE pdbids NOT NULL')
            pdbrows = src_c.fetchall()

            # parallel lists: dbaccs[i] is the accession that cites dbpdbs[i]
            dbpdbs = []
            dbaccs = []
            for pdbrow in pdbrows:
                pdbs = cxRE.findall(pdbrow['pdbids'])
                dbaccs.extend([pdbrow['acc']] * len(pdbs))
                dbpdbs.extend(pdbs)

            pdbl.download_pdb_files(dbpdbs, pdir=xtaldirpath)

            for acc, pdb in zip(dbaccs, dbpdbs):
                rel_pdbpath = xtaldirpath / f'{pdb}.cif'
                if not rel_pdbpath.exists():
                    # Biopython writes files under the lower-cased code;
                    # accept that spelling on case-sensitive file systems.
                    lower_path = xtaldirpath / f'{pdb.lower()}.cif'
                    if lower_path.exists():
                        rel_pdbpath = lower_path
                download_success = os.path.exists(rel_pdbpath)
                str_relpath = str(rel_pdbpath)

                c.execute(
                    '''SELECT COUNT(*) FROM XTALS WHERE pdbid=(?) AND dlsuccess=(?)''',
                    (pdb, 1))
                already_downloaded = c.fetchone()[0]
                if already_downloaded:
                    continue

                c.execute(
                    '''SELECT COUNT(*) FROM XTALS WHERE pdbid=(?) AND dlsuccess=(?)''',
                    (pdb, 0))
                previously_failed = c.fetchone()[0]
                if previously_failed:
                    if download_success:
                        print(f'new download of previously failed {pdb}')
                        c.execute(
                            '''UPDATE XTALS SET dldate = (?), dlsuccess = (?)
                            WHERE pdbid=(?)''', (todaystr, 1, pdb))
                    # a row for this id already exists; never insert a second
                    continue

                c.execute(
                    '''INSERT INTO XTALS VALUES (?,?,?,?,?,?,?,?)''',
                    (pdb, acc, srcdbstr, todaystr, str_relpath, pdbformat,
                     download_success, None))
            conn.commit()
        finally:
            # release the source database even if this iteration failed
            src_conn.close()
    conn.close()
def generate_seq_file(score_file, save_file):
    """Write the mutated chain sequence for every mutation in a score file.

    The first column of ``./dataFile/<score_file>`` (tab separated) holds
    names of the form ``<pdb>_<chain>_<wt3><digits><mut3>``. For each
    distinct name, the chain sequence is rebuilt from the PDB file, the
    wild-type residue at the given position is checked, the mutation is
    applied, and ``name<TAB>sequence`` is written to ``save_file``.
    """
    score_file = './dataFile/' + score_file
    sf = pd.read_csv(score_file, sep='\t')

    # Group the distinct mutation names by PDB id.
    mut_dict = dict()
    seen_names = set()
    for name in sf.iloc[:, 0]:
        info = name.split('_')
        pdb_id = info[0]
        chain_id = info[1]
        wt_aa = info[2][0:3]
        mu_aa = info[2][-3:]
        mu_pos = int(''.join(ch for ch in info[2] if ch.isdigit()))
        if name in seen_names:
            continue
        seen_names.add(name)
        mut_dict.setdefault(pdb_id, []).append(
            {'chain_id': chain_id, 'wt_aa': wt_aa, 'mu_aa': mu_aa,
             'mu_pos': mu_pos, 'name': name})
    del seen_names

    parser = PDBParser()
    seq_builder = PPBuilder()
    pdb_dl_handle = PDBList()
    PDB_DIR = './dataFile/PDB_dl'

    mut_collect = dict()
    for pdb_id, mutations in mut_dict.items():
        pdb_file = PDB_DIR + '/pdb' + pdb_id.lower() + '.ent'
        # download only when the file is not already cached locally
        if not os.path.exists(pdb_file):
            pdb_dl_handle.retrieve_pdb_file(pdb_code=pdb_id,
                                            file_format='pdb',
                                            overwrite=False, pdir=PDB_DIR)
        model = parser.get_structure(pdb_id, pdb_file)[0]
        for mutation in mutations:
            protein_chain = model[mutation['chain_id']]
            pieces = [str(pp.get_sequence())
                      for pp in seq_builder.build_peptides(protein_chain)]
            sequence = "".join(pieces).replace('\n', '').replace(' ', '')
            index = mutation['mu_pos'] - 1
            assert sequence[index] == three_to_one(mutation['wt_aa']), 'Wt amino acid failed to match'
            residues = list(sequence)
            residues[index] = three_to_one(mutation['mu_aa'])
            mut_collect[mutation['name']] = ''.join(residues)

    with open(save_file, 'w') as output_hl:
        for name, mutated in mut_collect.items():
            output_hl.write(name + '\t' + mutated + '\n')
def download(filelist: list, q: Queue, lock: Lock, cursor: sqlite3.Cursor,
             conn: sqlite3.Connection, dir_name: str):
    """
    Download PDB entries, extract summary data and store it in ``Structures``.

    Each id is downloaded in PDB format into ``<dir_name>/<id>/``. Header
    fields, compound/chain counts, residue count and summed molecular weight
    are inserted as one row. Progress is appended to ``status_tmp.txt``,
    which is removed at the end.

    :param filelist: PDB ids to process; repeated ids are processed once.
    :param q: queue that receives each stored id, then ``None`` when done.
    :param lock: lock guarding database access.
    :param cursor: cursor used for the INSERT.
    :param conn: connection committed after each successful INSERT.
    :param dir_name: base directory for the downloaded files.
    """
    status_path = 'status_tmp.txt'
    with open(status_path, 'w') as f:
        f.write('')

    # In-memory record of finished ids. Re-reading the status file compared
    # ids against lines that still carried their newline, so nothing matched.
    done = set()
    pdbl = PDBList()
    for file in filelist:
        if file in done:
            continue
        # Use the path reported by Biopython (it lower-cases the id).
        pdb_path = pdbl.retrieve_pdb_file(
            file, pdir=os.path.join(dir_name, file), file_format='pdb')
        if not pdb_path or not os.path.exists(pdb_path):
            print("File with ID PDB: {:s} not found!".format(file))
            continue

        parser = PDBParser()
        structure = parser.get_structure(file, pdb_path)
        header = parser.header
        name = header.get('name', '')
        head = header.get('head', '')
        method = header.get('structure_method', '')
        res = header.get('resolution', '')

        ncomp = 0
        nchain = 0
        eclist = []
        for values in header['compound'].values():
            ncomp += 1
            nchain += len(values['chain'].split(','))
            eclist.append(values.get('ec', '') or values.get('ec_number', ''))
        ec = ", ".join(eclist)

        nres = 0
        mmass = 0
        for pp in PPBuilder().build_peptides(structure):
            seq = pp.get_sequence()
            nres += len(seq)
            mmass += int(ProteinAnalysis(str(seq)).molecular_weight())

        # A missing resolution is stored as NULL instead of breaking the
        # number formatting.
        try:
            resolution = round(float(res), 2)
        except (TypeError, ValueError):
            resolution = None

        with lock:
            try:
                # Placeholders keep quotes in header text from breaking (or
                # injecting into) the statement.
                cursor.execute(
                    """INSERT INTO Structures
                       (IDPDB, NAME, HEAD, METHOD, RESOLUTION, NCOMP, NCHAIN,
                        NRES, MMASS, EC)
                       VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
                    (file, name, head, method, resolution, ncomp, nchain,
                     nres, mmass, ec))
            except sqlite3.DatabaseError as err:
                print("Error: ", err)
                continue
            print("Download Done for ID PDB: {:s}".format(file))
            conn.commit()
            q.put(file)

        done.add(file)
        with open(status_path, 'at') as f:
            f.write((file + '\n'))

    os.remove(status_path)
    q.put(None)
def fetchPDB(name, path):
    """Fetch a pdb and save to path

    The file is requested in PDB format and renamed to ``<path>/<name>.pdb``.
    Without an explicit format, current Biopython releases download mmCIF,
    which would then be mislabelled as ``.pdb``.
    """
    from Bio.PDB import PDBList

    pdbname = os.path.join(path, name + '.pdb')
    pdbl = PDBList()
    filename = pdbl.retrieve_pdb_file(name, pdir=path, file_format='pdb')
    os.rename(filename, pdbname)
    return
def struct_retrieve(self):
    """Store the argparse-supplied ID on the instance and download its structure.

    ``self.args.id_input`` is converted to ``str`` and kept as
    ``self.pdb_id``. The PDB-format file is written to the current directory.
    """
    self.pdb_id = str(self.args.id_input)
    PDBList().retrieve_pdb_file(self.pdb_id, file_format='pdb', pdir=".")
def downloadPdb(pdb_list):
    """Download missing structures into ``original_pdbs/<code>.pdb``.

    The code is the lower-cased first four characters of each identifier.
    Identifiers whose target file already exists are skipped.

    :param pdb_list: iterable of identifier strings.
    """
    # Plain os calls replace the "mkdir -p" / "mv" shell commands, so the
    # function neither depends on a POSIX shell nor interpolates file names
    # into a command line.
    os.makedirs("original_pdbs", exist_ok=True)
    pdbl = None
    for pdb_id in pdb_list:
        pdb = pdb_id.lower()[:4]
        target = os.path.join("original_pdbs", pdb + ".pdb")
        if os.path.isfile(target):
            continue
        if pdbl is None:
            # created on first use; nothing to do when all files are present
            pdbl = PDBList()
        name = pdbl.retrieve_pdb_file(pdb, pdir='.', file_format='pdb')
        # only move the file if the download actually produced one
        if name and os.path.isfile(name):
            os.replace(name, target)
def struct_retrieve(self):
    """Download the PDB-format file for ``self.pdb_id`` into ``self.out_dir``."""
    PDBList().retrieve_pdb_file(
        self.pdb_id,
        file_format='pdb',
        pdir=f"{self.out_dir}/",
    )
def download_structure_file(pdb_id: str) -> None:
    """Fetch the structure file for *pdb_id* using BioPython's ``PDBList``.

    The download directory and file format are left to the library defaults.

    :param pdb_id: the protein id in protein data bank
    :type pdb_id: str
    """
    PDBList().retrieve_pdb_file(pdb_id)
def generate_random_PDB(a, b, c):
    """Append ``b`` randomly chosen codes from ``a`` to ``c`` and download them.

    :param a: non-empty sequence of PDB codes to draw from (with replacement).
    :param b: number of codes to draw.
    :param c: list that receives the drawn codes; every entry of ``c``
        (including pre-existing ones) is downloaded into ``PDB``.
    :return: the same list ``c``.
    """
    import random

    for _ in range(b):
        # random.choice always stays in range; randint(0, len(a)) can yield
        # len(a) when randint is inclusive, which indexes past the end.
        c.append(random.choice(a))
    print(c)
    if c:
        pdb1 = PDBList()
        for i in c:
            pdb1.retrieve_pdb_file(i, pdir='PDB')
    return c
def load_data(experiment: str, in_file: str, out_dir: str) -> None:
    """Download, in PDB format, every molecule listed in ``in_file``.

    Each line contributes the text before its first ``_`` as the PDB code.
    ``experiment`` is accepted but not used by this function.
    """
    print(in_file)
    downloader = PDBList(server='http://ftp.wwpdb.org', verbose=False)

    with open(in_file, 'r') as handle:
        entries = handle.readlines()

    for entry in tqdm(entries):
        code = entry.strip('\n').split('_')[0]
        downloader.retrieve_pdb_file(code, pdir=out_dir, file_format='pdb')
def getPDB(self, ID=None):
    '''
    Retrieve a PDB-format file into the current directory.

    Uses ``ID`` when it is given, otherwise ``self.id``. Returns the value
    returned by ``PDBList.retrieve_pdb_file``, or ``None`` when neither
    identifier is set.
    '''
    from Bio.PDB import PDBList

    code = ID if ID is not None else self.id
    if code is None:
        return None
    return PDBList().retrieve_pdb_file(code, pdir='.', file_format='pdb')
def retrieve_cif(prot_id):
    """Download the mmCIF file of *prot_id* into ``input_files/cif``.

    Any existing file is overwritten.
    """
    downloader = PDBList(
        server='ftp://ftp.wwpdb.org',
        pdb='input_files',
        obsolete_pdb=None,
        verbose=True,
    )
    downloader.retrieve_pdb_file(
        prot_id,
        pdir="input_files/cif",
        file_format='mmCif',
        overwrite=True,
        obsolete=False,
    )
def descargarPDB(pdb):
    """Download *pdb* in PDB format and re-save the parsed structure.

    The download goes to ``./Script/PDB``. The parsed structure is written
    to ``./Script/PDBStructure/<pdb>.pdb``.
    """
    PDBList().retrieve_pdb_file(pdb, pdir='./Script/PDB', file_format='pdb')

    ent_file = './Script/PDB/pdb' + pdb.lower() + '.ent'
    structure = PDBParser().get_structure(pdb, ent_file)

    writer = PDBIO()
    writer.set_structure(structure)
    writer.save('./Script/PDBStructure/' + pdb + '.pdb')
def download_PDB(pdb_ids, pdb_dir='.'):
    """Download the PDB-format file of every id in *pdb_ids* into *pdb_dir*.

    Each id is logged at debug level before it is fetched.
    """
    fetcher = PDBList()
    for code in pdb_ids:
        logging.debug('PDB file which will be downloaded')
        logging.debug(code)
        fetcher.retrieve_pdb_file(code, pdir=pdb_dir, file_format='pdb')
def obtian_seq_wo_seq_file(score_file):
    """Build the sequence of every chain named in a score file.

    The first column of ``./dataFile/<score_file>`` (tab separated) is read.
    Characters 0-3 of an entry are the PDB id, characters 0-5 the chain name,
    whose last character is the chain id. Missing PDB files are downloaded
    to ``./dataFile/PDB_dl``.

    Returns a dict mapping chain name to its sequence string.
    """
    score_file = './dataFile/' + score_file
    sf = pd.read_csv(score_file, sep='\t')

    # PDB id -> set of chain names that belong to it
    pdb = dict()
    for entry in sf.iloc[:, 0]:
        chain_name = entry[0:6]
        pdb.setdefault(entry[0:4], set()).add(chain_name)

    # local cache directory for the downloaded structures
    PDB_DIR = './dataFile/PDB_dl'
    if not os.path.exists(PDB_DIR):
        os.mkdir(PDB_DIR)

    pdb_dl_handle = PDBList()
    for item in pdb:
        if os.path.exists(PDB_DIR + '/pdb' + item.lower() + '.ent'):
            continue
        pdb_dl_handle.retrieve_pdb_file(pdb_code=item, file_format='pdb',
                                        overwrite=False, pdir=PDB_DIR)

    seq_dict = dict()
    parser = PDBParser()
    seq_builder = PPBuilder()
    for pdb_id, chain_names in pdb.items():
        pdb_file = PDB_DIR + '/pdb' + pdb_id.lower() + '.ent'
        model = parser.get_structure(pdb_id, pdb_file)[0]
        for chain in chain_names:
            # the last letter of the chain name is the chain id
            protein_chain = model[chain[-1]]
            pieces = [str(pp.get_sequence())
                      for pp in seq_builder.build_peptides(protein_chain)]
            # strip newlines and spaces from the joined sequence
            seq_dict[chain] = "".join(pieces).replace('\n', '').replace(' ', '')
    return seq_dict
def download_structure_file(self, outdir, file_type=None, load_header_metadata=True, force_rerun=False):
    """Download a structure file from the PDB, specifying an output directory and a file type. Optionally download
    the mmCIF header file and parse data from it to store within this object.

    Args:
        outdir (str): Path to output directory
        file_type (str): ``pdb``, ``mmCif``, ``xml``, ``mmtf`` - file type for files downloaded from the PDB
        load_header_metadata (bool): If header metadata should be loaded into this object, fastest with mmtf files
        force_rerun (bool): If structure file should be downloaded even if it already exists

    Raises:
        URLError: If the requested file is not present after the download attempt

    """
    ssbio.utils.double_check_attribute(
        object=self, setter=file_type, backup_attribute='file_type',
        custom_error_text='Please set file type to be downloaded from the PDB: '
                          'pdb, mmCif, xml, or mmtf')

    # XTODO: check if outfile exists using ssbio.utils.force_rerun, pdblist seems to take long if it exists
    # I know why - it's because we're renaming the ent to pdb. need to have mapping from file type to final extension
    # Then check if file exists, if not then download again
    p = PDBList()
    with ssbio.utils.suppress_stdout():
        structure_file = p.retrieve_pdb_file(pdb_code=self.id, pdir=outdir, file_format=file_type,
                                             overwrite=force_rerun)

    if not op.exists(structure_file):
        log.debug('{}: {} file not available'.format(self.id, file_type))
        raise URLError('{}.{}: file not available to download'.format(self.id, file_type))
    else:
        log.debug('{}: {} file saved'.format(self.id, file_type))

    # Rename .ent files to .pdb
    if file_type == 'pdb':
        # Only touch the file name's "pdb" prefix and ".ent" suffix. A blanket
        # str.replace over the whole path also rewrote directory names and
        # PDB codes that happen to contain "pdb" or "ent".
        directory, base = op.split(structure_file)
        stem, ext = op.splitext(base)
        if stem.startswith('pdb'):
            stem = stem[len('pdb'):]
        if ext == '.ent':
            ext = '.pdb'
        new_name = op.join(directory, stem + ext)
        os.rename(structure_file, new_name)
        structure_file = new_name

    self.load_structure_path(structure_file, file_type)
    if load_header_metadata and file_type == 'mmtf':
        self.update(parse_mmtf_header(structure_file))
    if load_header_metadata and file_type != 'mmtf':
        self.update(
            parse_mmcif_header(
                download_mmcif_header(pdb_id=self.id, outdir=outdir,
                                      force_rerun=force_rerun)))
def download_pdb_structure(pdb_code, pdb_file_name, file_path='.'):
    """Download a structure in PDB format and rename it to *pdb_file_name*.

    The file is fetched into *file_path*, overwriting any previous download.
    Raises ``Exception`` when the reported file does not exist afterwards.
    """
    downloader = PDBList()
    downloaded = downloader.retrieve_pdb_file(
        pdb_code, file_format='pdb', pdir=file_path, overwrite=True
    )
    if not os.path.exists(downloaded):
        raise Exception("Can not download structure: {0}".format(pdb_code))
    os.rename(downloaded, pdb_file_name)
def download_pdb(pdb_id, pdbs_path="pdbs"):
    """Downloads a pdb file

    Parameters
    ----------
    pdb_id : str or iterable of str
        pdb id, or several pdb ids
    pdbs_path: str, optional
        path of folder containing the pdbs (default is "pdbs")
    """
    # download_pdb_files iterates over its first argument, so a bare string
    # would be handled one character at a time. Wrap it in a list.
    pdb_ids = [pdb_id] if isinstance(pdb_id, str) else list(pdb_id)
    pdbl = PDBList(obsolete_pdb=True)
    pdbl.download_pdb_files(pdb_ids, file_format="pdb", pdir=pdbs_path)
def get_pdb(pdb_list):
    """Download the PDB-format files for the given identifiers.

    Only the first four characters of each identifier are used as the PDB
    code. Files go to the ``PDB_benchmark_structures`` directory.

    :param pdb_list: iterable of identifiers (strings).
    """
    import os
    from Bio.PDB import PDBList

    # join(..., "") appends the platform's own separator instead of a
    # hard-coded Windows backslash.
    out_dir = os.path.join("PDB_benchmark_structures", "")
    pdbl = PDBList()  # one downloader is enough for the whole batch

    print("Downloading in %s:\n" % out_dir)
    for ids in pdb_list:
        print('%s' % ids[:4])
        pdbl.retrieve_pdb_file(ids[:4], file_format='pdb', pdir=out_dir)
def readFromPDBDatabase(self, pdbID, dir=None, type='mmCif'):
    """
    Retrieve a structure file from PDB.

    :param pdbID: identifier of the structure to fetch
    :param dir: directory to save the structure in; the current working
        directory when None
    :param type: mmCif or pdb
    :return: absolute path of the downloaded file
    """
    target_dir = os.getcwd() if dir is None else dir
    downloader = PDBList(pdb=target_dir)
    downloaded = downloader.retrieve_pdb_file(pdbID, pdir=target_dir,
                                              file_format=type)
    return os.path.abspath(downloaded)
def struct_retrieve(self):
    """Download the PDB-format file for ``self.pdb_id`` into ``self.out_dir``.

    Reads ``self.pdb_id`` and ``self.out_dir`` and returns nothing.
    """
    PDBList().retrieve_pdb_file(
        self.pdb_id,
        file_format='pdb',
        pdir=self.out_dir,
    )
def get_pdb(ids, dir):
    """Download, in PDB format, every structure id listed in the file *ids*.

    The file holds one id per line. Each id is printed before it is fetched
    into *dir*.
    """
    # Read the ids, dropping the line terminator of each line
    with open(ids, 'r') as file:
        ids_list = [line.split('\n')[0] for line in file.readlines()]

    # Fetch the structures from PDB
    downloader = PDBList()
    for code in ids_list:
        print(code)
        downloader.retrieve_pdb_file(code, pdir=dir, file_format='pdb')
def download_pdb():
    # Download the PDB entry for the BLAST hit the user picks.
    #
    # Steps visible in this function:
    #   1. read "my_blast.xml" from project_dir and build a text summary for
    #      each HSP; the first five are offered as choices,
    #   2. show the wx frame and run its main loop,
    #   3. use the module-level ``row`` as index of the chosen hit
    #      (presumably set by the frame - verify against MyFrame2),
    #   4. append the chosen title to "info.txt",
    #   5. download the entry named by title[17:21] and copy the ".ent"
    #      file to "tmppdb.pdb" in project_dir.
    # NOTE(review): Python 2 syntax (print statements).
    # NOTE(review): result_handle is never closed.
    result_handle = open(project_dir + spacer + "my_blast.xml")
    blast_record = NCBIXML.read(result_handle)
    global choose_pdbChoices
    i = 0
    list_blast_results = []
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            title = alignment.title
            e_value = hsp.expect
            print e_value
            length = alignment.length
            identities = hsp.identities
            positives = hsp.positives
            gaps = hsp.gaps
            # human-readable summary of this hit shown in the chooser
            choice = ("title: " + str(title[0:100]) + " ..." + "\n" + "score: " + str(e_value) + " " + "length: " + str(length) + "\n" + "identities: " + str(identities) + " " + "positives: " + str(positives) + " " + "gaps: " + str(gaps) + "\n" )
            # only the first five hits are offered; the two lists stay
            # index-aligned so ``row`` can address the title below
            if i <5:
                choose_pdbChoices.append(choice)
                list_blast_results.append(title)
            choice = ""
            i += 1
    print choose_pdbChoices
    app = wx.App(0)
    MainApp =MyFrame2(None)
    MainApp.Show()
    MainApp.Maximize()
    app.MainLoop()
    print "bla"
    print row
    title = list_blast_results[int(row)]
    info_file= open(project_dir + spacer +"info.txt", "a")
    info_file.write("\n you chose this blastresult: \n" + str(list_blast_results[int(row)]))
    info_file.close()
    # characters 17-20 of the title are used as the PDB code
    print title[17:21]
    global pdbfile
    pdb_dir = project_dir + spacer + project_name +"_blast_pdb"
    pdbfile = project_dir + spacer + "tmppdb.pdb"
    pdbl = PDBList()
    pdbl.retrieve_pdb_file(title[17:21], obsolete = False, pdir = pdb_dir)
    title_lower = title[17:21].lower()
    shutil.copyfile(pdb_dir + spacer + "pdb" + title_lower + ".ent", project_dir + spacer + "tmppdb.pdb")
def online_input(structure_name, file_format=None):
    """
    Download a tertiary structure from PDB through BioPython's PDBList.

    :param structure_name: PDB correct name of RNA structure
    :param file_format: File format to pull from database, currently only
        'pdb' is supported; a falsy value selects 'pdb'
    :return: the value returned by ``PDBList.retrieve_pdb_file``
    """
    chosen_format = file_format if file_format else 'pdb'
    return PDBList().retrieve_pdb_file(
        pdb_code=structure_name,
        file_format=chosen_format,
        pdir=ROOT_DIR + '/downloadedStructures/',
    )
def DownloadTemplate(template):
    """
    Fetch the requested template from the pdb database, in PDB format,
    into the current directory.

    Arguments:

    template: pdb code of the template to download.
    """
    PDBList().retrieve_pdb_file(
        template,
        obsolete=False,
        pdir="./",
        file_format="pdb",
    )
def pdb_download(code, path=None):
    """Downloads the structure of the pdb on a file.

    code is the pdb code of the structure
    path is the location where it will be downloaded; None leaves the
    location to PDBList

    Returns the file name where it is stored"""
    logging.info("Downloading pdb %s.", code)
    logging.captureWarnings(True)
    try:
        pdbl = PDBList(obsolete_pdb=os.getcwd())
        # pdir=None is the library default, so one call covers both cases
        pdb_file = pdbl.retrieve_pdb_file(code, pdir=path)
    finally:
        # always restore normal warning handling, even if the download fails
        logging.captureWarnings(False)
    return pdb_file
def download_and_get_chains():
    """Download each PDB entry and save the requested chains as separate files.

    ``read_rostdb_entries()`` supplies a mapping of PDB id to chain ids.
    Every matching chain is written to ``<structure id>_<chain id>.pdb`` in
    the current directory. ``(pdb id, chain id)`` pairs that raised an error
    are collected and printed at the end.
    """
    from Bio.PDB import PDBParser, PDBIO

    failed = []
    pdbs_dict = read_rostdb_entries()
    io = PDBIO()
    pdbl = PDBList()
    for pdb_e, chains in pdbs_dict.items():
        # Download and parse once per entry, not once per chain. The PDB
        # format is requested explicitly because PDBParser cannot read mmCIF.
        try:
            pdb_file = pdbl.retrieve_pdb_file(pdb_e, pdir='./',
                                              file_format='pdb')
            pdb = PDBParser().get_structure(pdb_e, pdb_file)
        except Exception:
            failed.extend((pdb_e, chain_e) for chain_e in chains)
            continue

        for chain_e in chains:
            try:
                for chain in pdb.get_chains():
                    if chain.get_id() == chain_e:
                        io.set_structure(chain)
                        io.save(pdb.get_id() + '_' + chain.get_id() + '.pdb')
            except Exception:
                failed.append((pdb_e, chain_e))
    print("failures:", failed)
def uniblast(sequence, db, evalue, tgts):
    """Run psiblast for one sequence, annotate hits with taxonomy, filter them.

    The psiblast tabular output is piped through ``taxonkit lineage`` and
    ``taxonkit reformat``. Hits are sorted, filtered to ``pident > 30`` and
    ``qcovs > 70``, and reduced to the first row per ``saccver``. The PDB
    entry of each remaining hit is downloaded, parsed and removed again.

    :param sequence: 2-tuple used to build the FASTA record (header, residues).
    :param db: BLAST database; also exported as ``BLASTDB``.
    :param evalue: e-value threshold passed to psiblast.
    :param tgts: value for ``-max_target_seqs``.
    :return: the filtered ``pandas.DataFrame``.
    """
    os.environ['BLASTDB'] = db
    seq = '>%s\n%s' % sequence
    blast_line = ['psiblast', '-db', db, '-evalue', str(evalue),
                  '-num_iterations', '0', '-max_target_seqs', str(tgts),
                  '-outfmt', '6 qaccver saccver pident evalue qcovs length '
                  'staxid']
    taxon_line = ['taxonkit', 'lineage', '-i', '7']
    taxon_line2 = ['taxonkit', 'reformat', '-i', '8', '-f', '{s}']

    def _pipe(cmd, text):
        # Run cmd with text on stdin and return its stdout. os.getcwd() is
        # used because os.getcwdu() does not exist on Python 3.
        proc = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE,
                     env=os.environ.copy(), encoding='utf8',
                     cwd=os.getcwd())
        out, _err = proc.communicate(text)
        return out

    o = _pipe(blast_line, seq)
    # Keep only the text before the first blank line. partition() returns the
    # whole output when there is none; slicing with find() == -1 would drop
    # the last character.
    o1 = _pipe(taxon_line, o.partition('\n\n')[0])
    o2 = _pipe(taxon_line2, o1)

    df = pd.read_table(StringIO(o2), sep='\t', header=None,
                       names='qaccver saccver pident evalue qcovs length '
                             'staxid lineage species'.split(),
                       quoting=csv.QUOTE_NONE, encoding='utf-8')
    df = df.sort_values(by=['pident', 'evalue', 'qcovs', 'length'],
                        ascending=[False, True, False, False])
    # Single combined mask: duplicates are judged on the sorted, unfiltered
    # table, as before, without chained boolean indexing.
    keep = (df.pident > 30) & (df.qcovs > 70) & ~df.saccver.duplicated()
    df = df[keep]

    pdbl = PDBList()
    pdb_codes = df.saccver.unique()
    for i in pdb_codes:
        pdb = i[:4]
        # Download pdb
        file_path = pdbl.retrieve_pdb_file(pdb, pdir='PDBs',
                                           file_format='pdb')
        parser = PDBParser()
        st = parser.get_structure(pdb, file_path)
        # NOTE(review): the structure is handed to PDBIO but never saved -
        # confirm whether a save step is missing.
        ou = PDBIO()
        ou.set_structure(st)
        # remove the file that was actually downloaded (previously a
        # non-existent "PDBs/<code>" path was targeted)
        os.remove(file_path)
    return df
def fetch_pdb(self, pdb_code, pdb_dir):
    """Download the structure file of *pdb_code* into *pdb_dir*.

    The file format is left to the ``PDBList`` default.
    """
    PDBList().retrieve_pdb_file(pdb_code, pdir=pdb_dir)
f.write("\nPosicao: %s\n" %ref.positions) f.write("\nComentarios: %s\n" %ref.comments) f.write("\nReferencias: %s\n" %ref.references) f.write("\nAutores: %s\n" %ref.authors) f.write("\nTitulo: %s\n" %ref.title) f.write("\nLocalizacao: %s\n\n" %ref.location) break except Exception: break f.close() #análise da estrutura das proteínas relevantes com base nos ficheiros PDB encontrados (código baseado no desenvolvido pelo grupo 10) parser = PDBParser() ficheiro= open("analise_pdb.txt", "w") structure = parser.get_structure('4F67', '4F67.pdb') pdbl = PDBList() pdbl.retrieve_pdb_file('4F67') ficheiro.write("****Analise do ficheiro 4F67.pdb****\n") ficheiro.write("\nPalavras Chave: %s\n" %structure.header['keywords']) ficheiro.write("\nNome do Organismo: %s\n" %structure.header['name']) ficheiro.write("\nCabecalho: %s" %structure.header['head']) ficheiro.write("\nData da deposicao: %s\n" %structure.header['deposition_date']) ficheiro.write("\nData da publicacaos: %s\n" %structure.header['release_date']) ficheiro.write("\nMetodo usado: %s\n" %structure.header['structure_method']) ficheiro.write("\nResolucao: %s\n" %structure.header['resolution']) ficheiro.write("\nReferencia da estrutura: %s\n" %structure.header['structure_reference']) ficheiro.write("\nReferencia de artigo: %s\n" %structure.header['journal_reference']) ficheiro.write("\nAutor: %s\n" %structure.header['author']) ficheiro.write("\nComposto: %s" %structure.header['compound']) ficheiro.close()
def get_pdb_files(self):
    """Download the structure file of every id in ``self.pdb_ids``.

    Files are placed in the directory named by ``self.directory``.
    """
    downloader = PDBList()
    for code in self.pdb_ids:
        downloader.retrieve_pdb_file(code, pdir=self.directory)
#!/usr/bin/env python
from Bio.PDB import PDBList, PDBParser, Selection
import networkx as nx
import matplotlib.pyplot as plt
from pprint import pprint as pp
import numpy as np

distanceThreshold = 1  # sys.argv[2]
pdbList = PDBList()
pdbParser = PDBParser()
proteinName = "1MBN"
# PDBParser only reads the legacy PDB format, so that format is requested
# explicitly instead of relying on the downloader's default.
structure = pdbParser.get_structure(
    proteinName, pdbList.retrieve_pdb_file(proteinName, file_format="pdb"))
# flat list of all residues in the structure
resList = Selection.unfold_entities(structure, "R")
distanceMatrix = np.zeros([len(resList), len(resList)])


def genDistanceMatrix(dMatrix, rList):
    """Print the residue-number -> C-alpha atom map of *rList* and its size.

    Residues without a "CA" atom are left out. ``dMatrix`` is accepted but
    not read or modified here.
    """
    caMap = {res.id[1]: res["CA"] for res in rList if "CA" in res}
    pp(caMap)
    pp(len(caMap))


genDistanceMatrix(distanceMatrix, resList)
#!/usr/bin/env python from Bio.PDB import PDBList p = PDBList() atp227 = [ '1A0I', '1A49', '1A82', '1ATP', '1AYL', '1B0U', '1B76', '1B8A', '1BCP', '1BCP', '1CSN', '1D9Z', '1DY3', '1E2Q', '1E4G', '1E8X', '1EE1', '1ESQ', '1F2U', '1F2U', '1F9A', '1FMW', '1G21', '1G3I', '1G5T',
def fetch_pdb(self, pdb_code):
    """Download *pdb_code* into ``self.pdb_dir``, then rename the file.

    The renaming is delegated to ``self._rename_pdb_file``.
    """
    PDBList().retrieve_pdb_file(pdb_code, pdir=self.pdb_dir)
    self._rename_pdb_file(pdb_code)
serialize(entries, pdbredo_seq_folder, entries_json_file) except IOError: raise('Missing some file') else: entries = retrieve(pdbredo_seq_folder, entries_json_file) if new_subset: #grab some subset of the pdb, download files and parse for uniprot xreferences os.mkdir(pdb_folder) os.mkdir(uniprot_folder) #select a pseudo_random subset of entries by slicing the entries dict #(which is in pseudo-random) order PDB_subset = list(entries.keys())[:n_entries] PDB_subset_nochain = [x.split('_')[0] for x in PDB_subset] #fetch and save all the pdb files pdbl = PDBList() for entry in PDB_subset_nochain: pdbl.retrieve_pdb_file(entry, pdir=pdb_folder) #serialize the PDB list for future use serialize(PDB_subset, pdb_folder, pdb_list) # parse the pdb headers for DBREF to uniprot pdb_to_uniprot = find_uniprot_in_pdb(PDB_subset_nochain, pdb_folder) if recompute_clean: PDB_subset = retrieve(pdb_folder, pdb_list) PDB_subset_nochain = [x.split('_')[0] for x in PDB_subset] # parse the pdb headers for DBREF to uniprot pdb_to_uniprot = find_uniprot_in_pdb(PDB_subset_nochain, pdb_folder) #determine the uniprot references to fetch to_fetch = [] for entry in pdb_to_uniprot.keys():
"""
Module Description:
This file downloads the PDB entries listed in "pfam_list.txt" (one code per
line) from the network and stores them in the downloader's default location.
"""
from Bio.PDB import PDBList

PDBListTemp = PDBList()
# Iterate over the file line by line. The previous loop read a single line
# and never advanced, so it requested the same entry forever.
with open("pfam_list.txt", "r") as FILE:
    for IN in FILE:
        IN = IN.strip()  # drop the line terminator before using the code
        if not IN:
            continue
        print(IN)
        PDBListTemp.retrieve_pdb_file(IN)
def download_pdb(args):
    """Download the structure named by ``args['name']`` into ``args['path']``.

    :param args: mapping with the keys ``'name'`` (PDB code) and ``'path'``
        (target directory).
    """
    # Only PDBList is needed; the unused PDBParser/PDBIO imports and the
    # unused PDBIO instance were dropped.
    from Bio.PDB import PDBList

    pdbl = PDBList()
    pdbl.retrieve_pdb_file(args['name'], pdir=args['path'])
def get_pdb_file(self, directory):
    """Download the structure file for ``self.pdb_id`` into *directory*."""
    PDBList().retrieve_pdb_file(self.pdb_id, pdir=directory)