def retrieve_cif_list():
    """Download the mmCIF file of every PDB code listed in ``input_files/protlist.txt``.

    The list file is read as whitespace-separated PDB codes. The files are
    fetched through ``PDBList.download_pdb_files`` into ``input_files/cif``
    with ``overwrite=True`` and ``obsolete=False``.
    """
    server = PDBList(server='ftp://ftp.wwpdb.org', pdb='input_files',
                     obsolete_pdb=None, verbose=True)
    # The context manager closes the handle even if reading fails.
    with open('input_files/protlist.txt', 'r') as pdb_list:
        content = pdb_list.read().split()
    server.download_pdb_files(content, pdir="input_files/cif",
                              file_format='mmCif', overwrite=True,
                              obsolete=False)
def build_xtalstable(dbpathstr, sourcedbpathstrs, pdbformat='cif'):
    """Download structures referenced by source databases and log them in XTALS.

    For every source database, the ``acc`` and ``pdbids`` columns of its
    ``CAZYSEQDATA`` table are read, four-character ids followed by ``[`` are
    extracted from ``pdbids``, the structures are downloaded into an
    ``Xtals`` folder next to the target database, and one row per structure
    is recorded in the ``XTALS`` table of the target database.

    Parameters
    ----------
    dbpathstr : str
        Path of the database that holds (or will hold) the XTALS table.
    sourcedbpathstrs : iterable of str
        Paths of the source databases containing a CAZYSEQDATA table.
    pdbformat : str, optional
        Value stored in the ``pdbformat`` column (default ``'cif'``). It does
        not influence the download itself or the expected file suffix.
    """
    dbpath = Path(dbpathstr)
    todaystr = datetime.date.today().isoformat()
    conn = seqdbutils.gracefuldbopen(dbpath)
    try:
        # By default the folder containing the structures is named 'Xtals'
        # and lives in the same directory as the XTALS database.
        xtaldirpath = dbpath.parent / 'Xtals'
        xtaldirpath.mkdir(exist_ok=True)

        c = conn.cursor()
        c.execute(
            '''CREATE TABLE IF NOT EXISTS XTALS (pdbid text, acc text, srcdb text,dldate text, relpath text, pdbformat text, dlsuccess int, obsolete int)''')
        pdbl = PDBList()
        # Raw string: '\[' is not a valid escape in a normal string literal.
        cxRE = re.compile(r'([A-Za-z0-9]{4})\[')

        for srcdbpathstr in sourcedbpathstrs:
            srcdbpath = Path(srcdbpathstr)
            srcdbstr = srcdbpath.name
            src_conn = seqdbutils.gracefuldbopen(srcdbpath)
            try:
                seqdbutils.check_tables_exist(src_conn, ['CAZYSEQDATA'])
                src_c = src_conn.cursor()
                src_c.execute(
                    'SELECT acc,pdbids FROM CAZYSEQDATA WHERE pdbids NOT NULL')

                # Parallel lists: dbaccs[i] is the accession that referenced dbpdbs[i].
                dbpdbs = []
                dbaccs = []
                for pdbrow in src_c.fetchall():
                    pdbs = cxRE.findall(pdbrow['pdbids'])
                    dbaccs.extend([pdbrow['acc']] * len(pdbs))
                    dbpdbs.extend(pdbs)

                pdbl.download_pdb_files(dbpdbs, pdir=xtaldirpath)

                for acc, pdb in zip(dbaccs, dbpdbs):
                    # NOTE(review): the suffix is hard-coded and the id is used
                    # verbatim; Biopython appears to lower-case the code when
                    # naming the file, so confirm this matches on
                    # case-sensitive filesystems.
                    rel_pdbpath = xtaldirpath / f'{pdb}.cif'
                    download_success = os.path.exists(rel_pdbpath)
                    str_relpath = str(rel_pdbpath)

                    c.execute(
                        '''SELECT COUNT(*) FROM XTALS WHERE pdbid=(?) AND dlsuccess=(?)''',
                        (pdb, 1))
                    already_downloaded = c.fetchone()[0]
                    if already_downloaded:
                        continue

                    c.execute(
                        '''SELECT COUNT(*) FROM XTALS WHERE pdbid=(?) AND dlsuccess=(?)''',
                        (pdb, 0))
                    previously_failed = c.fetchone()[0]
                    if previously_failed:
                        # A row already exists: refresh it on success, but
                        # never insert a duplicate.
                        if download_success:
                            print(f'new download of previously failed {pdb}')
                            c.execute(
                                '''UPDATE XTALS SET dldate = (?), dlsuccess = (?)
                                   WHERE pdbid=(?)''',
                                (todaystr, 1, pdb))
                        continue

                    c.execute(
                        '''INSERT INTO XTALS VALUES (?,?,?,?,?,?,?,?)''',
                        (pdb, acc, srcdbstr, todaystr, str_relpath, pdbformat,
                         download_success, None))
                conn.commit()
            finally:
                # Release the source connection even if a step above raised.
                src_conn.close()
    finally:
        conn.close()
def download_pdb(pdb_id, pdbs_path="pdbs"):
    """Downloads a pdb file

    Parameters
    ----------
    pdb_id : str
        pdb id (an iterable of ids is also accepted and passed through)
    pdbs_path: str, optional
        path of folder containing the pdbs (default is "pdbs")
    """
    # download_pdb_files loops over its first argument, so a bare string
    # would be treated as a sequence of one-character codes.
    pdb_codes = [pdb_id] if isinstance(pdb_id, str) else list(pdb_id)
    # NOTE(review): obsolete_pdb is documented as a path; True is kept as-is
    # from the original call - confirm it is intended.
    pdbl = PDBList(obsolete_pdb=True)
    pdbl.download_pdb_files(pdb_codes, file_format="pdb", pdir=pdbs_path)
def download_pdb_files(FASTA_FILE, PDB_OUTPUT):
    """Fetch the structure of every distinct id found in a fasta file.

    Each record id has its final character removed before use, and
    duplicate ids are collapsed so every structure is requested once.

    Parameters
    ----------
    FASTA_FILE : str
        The file location of the .fasta
    PDB_OUTPUT : str
        The directory for pdb outputs
    """
    # Collect the unique ids (record id minus its last character).
    fasta_entries = SeqIO.parse(FASTA_FILE, "fasta")
    unique_ids = {entry.id[:-1] for entry in fasta_entries}

    # Hand the whole batch to Biopython.
    fetcher = PDBList()
    fetcher.download_pdb_files(unique_ids, pdir=PDB_OUTPUT)
def obtain_pdb(dataset):
    """Download the PDB file and the FASTA file of every id in ``dataset['pdb']``.

    PDB files are written to ``./PDB`` through ``PDBList.download_pdb_files``;
    FASTA files are fetched with ``wget`` into ``./FASTA/<id>.fasta`` unless a
    file of that name already exists.

    Parameters
    ----------
    dataset : mapping
        Object whose ``'pdb'`` entry is an iterable of PDB ids.

    Raises
    ------
    AssertionError
        If a freshly downloaded FASTA file is empty. The empty file is
        removed first so that a later run retries the download.
    """
    import subprocess

    # download the pdb set
    pdb_set = list(set(dataset['pdb']))
    os.makedirs('./PDB', exist_ok=True)
    PDB_dl_handle = PDBList(verbose=False)
    PDB_dl_handle.download_pdb_files(pdb_codes=pdb_set, file_format='pdb', pdir='./PDB')

    # download the fasta set
    os.makedirs('./FASTA', exist_ok=True)
    for item in pdb_set:
        fasta_path = "./FASTA/{}.fasta".format(item)
        if os.path.exists(fasta_path):
            continue
        url = ("https://www.rcsb.org/pdb/download/viewFastaFiles.do"
               "?structureIdList={}&compressionType=uncompressed").format(item)
        # Argument list instead of a shell string: ids are never interpreted
        # by a shell. stdout is redirected to the target file, as before.
        with open(fasta_path, 'wb') as fasta_file:
            try:
                subprocess.run(['wget', '-q', '-O', '-', url],
                               stdout=fasta_file, check=False)
            except OSError:
                # wget could not be launched; the file stays empty and the
                # size check below reports the failure, as the shell
                # pipeline did.
                pass
        time.sleep(0.1)
        # Explicit raise instead of assert so the check survives `python -O`.
        if os.stat(fasta_path).st_size == 0:
            os.remove(fasta_path)
            raise AssertionError('Download Failed, Empty File generated.')
if i != ',': outputlist.append(pdbID + str(i)) #string functions together and generate output in folders organized by taxID for taxID in taxidlist: os.mkdir(str(taxID)) os.chdir(str(taxID)) outfilename = str(taxID) + '.txt' pdbids = [] query(taxID) initial_num_entries = len(pdbids) pdbl = PDBList() cwd = os.getcwd() pdbl.download_pdb_files(pdbids, obsolete=False, pdir=cwd, file_format='mmCif') for pdbID in pdbids: try: filter_bad_strc(pdbID) except: print "No pdb entries for protein: " + str(pdbID) mainlist = [] for pdbID in pdbids: fetch(pdbID) filteredpdbIDs = [] filter_redundant_strc(mainlist) outputlist = [] for pdbID in filteredpdbIDs: try: get_chains(pdbID)
pdbs_path = os.path.join(current_path, 'pdb') if pdbs_path not in glob(os.path.join(current_path, '*')): os.mkdir(pdbs_path) with open(os.path.join(current_path, 'pdbs.txt'), 'r') as f: pdbs = [pdb[:-1] for pdb in f] if len(pdbs) == 1: pdb_id = pdbs[0] else: pdb_id = current_path.rsplit('/', 1)[1] pdbl = PDBList(obsolete_pdb=True) if not glob(os.path.join(pdbs_path, '*')): pdbl.download_pdb_files(pdbs, file_format='pdb', pdir=pdbs_path) #initialize databases to report database_1 = [] database_2 = [] pdbs = glob(os.path.join(pdbs_path, '*')) for pdb in pdbs: mol = bg.Pmolecule(pdb) net = mol.network() # take only selected positions: if selected_positions: for node in list(net.nodes): pos = int(node[1::])
def retrieve_pdb_files(pdb_codes: List[str], pdb_data_dir: Union[Path, str], file_format: str = "mmCif") -> List[str]:
    """Download the given PDB entries and return what was stored for each.

    Parameters
    ----------
    pdb_codes : list of str
        PDB identifiers to fetch.
    pdb_data_dir : Path or str
        Local PDB directory handed to ``PDBList`` as its ``pdb`` argument.
    file_format : str, optional
        Format passed to Biopython (default ``"mmCif"``).

    Returns
    -------
    list
        One entry per code, in input order: the value returned by
        ``PDBList.retrieve_pdb_file`` for that code (the local file name
        according to the Biopython documentation).
    """
    pdbl = PDBList(server="ftp://ftp.wwpdb.org", pdb=pdb_data_dir)
    # download_pdb_files() has no return value, so fetch entry by entry and
    # keep each per-file result.
    filenames = [
        pdbl.retrieve_pdb_file(pdb_code, file_format=file_format)
        for pdb_code in pdb_codes
    ]
    return filenames
import os
import sys

from Bio.PDB import PDBList

## This script downloads PDB files for a list of PDB codes.
## Usage: python DownloadPDBs.py pdbcodeListFile format [ResDir]

if len(sys.argv) < 3:
    # Single-argument print() calls behave the same on Python 2 and 3.
    print('python DownloadPDBs.py pdbcodeListFile format [ResDir]')
    print('\tpdbcodeListFile: a file of PDB IDs, in upper or lower case')
    print('\tformat: pdb or mmCif. This option specifies which PDB file format needed')
    sys.exit(1)

listFile = sys.argv[1]

# The list file may hold several whitespace-separated codes per line.
with open(listFile, 'r') as fh:
    pdbcodes = fh.read().split()

file_format = sys.argv[2]

# Results go to the current directory unless a third argument is given.
ResDir = os.getcwd()
if len(sys.argv) > 3:
    ResDir = sys.argv[3]

if not os.path.isdir(ResDir):
    # makedirs also creates missing parent directories.
    os.makedirs(ResDir)

pdblist = PDBList()
pdblist.download_pdb_files(pdbcodes, file_format=file_format, pdir=ResDir)
#!/Applications/Anaconda/anaconda/bin/python3
from Bio.PDB import PDBList

# Batch-download a fixed set of structures in legacy PDB format.
# Call shape used below:
#   download_pdb_files(pdb_codes, obsolete=False, pdir=None,
#                      file_format='pdb', overwrite=False)
# (a single entry could instead be fetched with retrieve_pdb_file).
structure_ids = [
    '6eqd',
    '6ane',
    '5yfe',
    '5xjh',
    '5xg0',
    '5xfy',
    '5xfz',
    '5xh3',
    '5yns',
]
output_dir = "/Users/astuart/Ramapo/Research/PlasticDegradingEnzymes/Structures/"

pdb_fetcher = PDBList()
pdb_fetcher.download_pdb_files(
    structure_ids,
    file_format='pdb',
    pdir=output_dir,
    obsolete=False,
)
'CudaPrecision': 'mixed', 'CudaDeviceIndex': (CUDA_DEV_IDX if CUDA_DEV_IDX else 0)} """ Helper function to choose mutant residues """ RESIDUES = ['ACE', 'ALA', 'ALAD', 'ARG', 'ARGN', 'ASF', 'ASH', 'ASN', 'ASN1', 'ASP', 'ASPH', 'CALA', 'CARG', 'CASF', 'CASN', 'CASP', 'CCYS', 'CCYX', 'CGLN', 'CGLU', 'CGLY', 'CHID', 'CHIE', 'CHIP', 'CILE', 'CLEU', 'CLYS', 'CME', 'CMET', 'CPHE', 'CPRO', 'CSER', 'CTHR', 'CTRP', 'CTYR', 'CVAL', 'CYM', 'CYS', 'CYS1', 'CYS2', 'CYSH', 'CYX', 'DAB', 'GLH', 'GLN', 'GLU', 'GLUH', 'GLY', 'HID', 'HIE', 'HIP', 'HIS', 'HIS1', 'HIS2', 'HISA', 'HISB', 'HISD', 'HISE', 'HISH', 'HSD', 'HSE', 'HSP', 'HYP', 'ILE', 'LEU', 'LYN', 'LYS', 'LYSH', 'MET', 'MSE', 'NALA', 'NARG', 'NASN', 'NASP', 'NCYS', 'NCYX', 'NGLN', 'NGLU', 'NGLY', 'NHID', 'NHIE', 'NHIP', 'NILE', 'NLEU', 'NLYS', 'NME', 'NMET', 'NPHE', 'NPRO', 'NSER', 'NTHR', 'NTRP', 'NTYR', 'NVAL', 'ORN', 'PGLU', 'PHE', 'PRO', 'QLN', 'SER', 'THR', 'TRP', 'TYR', 'VAL'] STANDARDS = ['ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HIS','ILE', 'LEU', 'LYN', 'LYS', 'PHE', 'PRO', 'SER', 'THR', 'TRP', 'TYR', 'VAL'] def randres(standard=False): pool = STANDARDS if standard else RESIDUES return pool[rd.randint(0,len(pool)-1)] ### # Seed ### """ Download seed scFv (RCSB PDB ID: 1MFA) """ pdbl = PDBList(obsolete_pdb='/dev/null') pdbl.download_pdb_files([SEED_PDB], pdir=PDB_DIR, file_format='pdb') """ Extract 1MFA components """ uni_1MFA = mda.Universe(f'{PDB_DIR}/pdb{SEED_PDB}.ent') assert hasattr(uni_1MFA, 'trajectory') lig_1MFA = uni_1MFA.select_atoms('not protein and not resname HOH') lig_1MFA.write(f'{PDB_DIR}/{SEED_PDB}.lig.pdb') fab_1MFA = uni_1MFA.select_atoms('protein') fab_1MFA.write(f'{PDB_DIR}/{SEED_PDB}.fab.pdb') # light_1MFA = uni_1MFA.select_atoms('segid L') # heavy_1MFA = uni_1MFA.select_atoms('segid H') """ Fix/clean the FAb apo protein and save it """ fixer = PDBFixer(PDB_DIR + '/' + SEED_PDB + '.fab.pdb')