def single_chain(pdb_dir):
    '''
    Iterates through a directory and uses Biopython to select and write
    the first chain from each pdb.
    Called by: AccessPDB.py:main() (Option 6)
    '''
    single_chain_dir = pdb_dir + "/SingleChains/"
    os.makedirs(single_chain_dir)
    print('\n\nExtracting first chain of each PDB file...\n')
    for pdb in pg(os.listdir(pdb_dir), widgets=widgets):
        if pdb.endswith(".ent") or pdb.endswith(".pdb") or pdb.endswith(".ent.gz"):
            pdb_name = pdb.split('.')[0].split("/")[-1]
            if pdb.endswith(".ent.gz"):
                pdb_file = gzip.open(pdb_dir + '/' + pdb, 'rt')
            else:
                pdb_file = pdb_dir + '/' + pdb
            try:
                structure = p.get_structure(pdb_name, pdb_file)
            except:
                print("Structure " + pdb_name + " could not be strictly parsed.")
                continue
            chains = structure.get_chains()
            for chain in chains:
                chain_id = chain.id
                break
            single_chain_name = single_chain_dir + pdb_name + chain_id + ".pdb"
            io.set_structure(structure)
            io.save(single_chain_name, SelectChain(chain_id))
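# single_chain() above relies on a SelectChain helper that is not shown in this
# file. The sketch below is an assumption of what such a helper could look like,
# based on Biopython's Bio.PDB.PDBIO Select interface; the actual implementation
# in this project may differ.
from Bio.PDB.PDBIO import Select


class SelectChain(Select):
    '''Keep only atoms belonging to the chain with the given chain id.'''

    def __init__(self, chain_id):
        self.chain_id = chain_id

    def accept_chain(self, chain):
        # PDBIO calls this for every chain; return 1 to keep it, 0 to drop it.
        return 1 if chain.id == self.chain_id else 0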
def match_parallel(list1, list2, threshold=0.75):
    '''
    Compute the similarity score between strings from list1 and strings from
    list2 based on the Levenshtein distance. Unlike match(), this function
    uses multiprocessing to compute scores. For this function to be efficient,
    list1 should be shorter than list2.

    Parameters
    list1, list2: lists of strings
    threshold: minimum score needed to consider two strings as matched
               (between 0 and 1, where 0 is no match and 1 is a perfect match)

    Returns
    matched: list of dict
    '''
    if len(list1) > len(list2):
        print('\n[ ! ] WARNING: parameter list1 should contain the shorter',
              'list. Otherwise computation time might be extended.\n')
        if str(input('Are you sure you want to keep going with no changes? [y/n]: ')) != 'y':
            exit()

    matched = []

    # Prepare data for parallelization
    n_cores = mp.cpu_count()
    list2_chunks = chunks(list2, n_cores)

    for a, _ in zip(list1, pg(range(len(list1)), widgets=widgets)):
        # Define job parameters
        job_parameters = []
        for c in list2_chunks:
            job_parameters.append((a, c, threshold))

        # Parallelize score computation
        with mp.Pool(processes=n_cores) as p:
            results = p.starmap(match_job, job_parameters)

        # Collect results
        for r in results:
            matched += r

    return matched
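# A minimal usage sketch for match_parallel() with hypothetical data. It assumes
# the module-level helpers referenced above (chunks, match_job, pg, widgets) are
# available; list1 should be the shorter list and list2 the larger reference list.
def _example_match_parallel():
    queries = ['hemoglobin alpha', 'myoglobin']                       # hypothetical
    references = ['Hemoglobin subunit alpha', 'Myoglobin', 'Lysozyme C']
    # Each returned dict has the keys 'list1', 'list2' and 'score'.
    for hit in match_parallel(queries, references, threshold=0.6):
        print(hit['list1'], '->', hit['list2'], hit['score'])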
def pdb_to_fasta(pdb_dir):
    '''
    Simply iterates over the PDB files in the current directory and creates
    a FASTA file containing entries for each chain of all PDBs.
    Called by: AccessPDB.py:main() (Option 9)
    '''
    fasta_file = pdb_dir.split("/")[-1] + '.fasta'
    for pdb in pg(os.listdir(pdb_dir), widgets=widgets):
        try:
            pdb_name, pdb_file, structure, contents = parse_pdb(pdb_dir, pdb)
        except:
            continue
        nchains, seqs, chain_ids = extract_seqs(structure, 0)
        with open(fasta_file, 'a') as f:
            for seq, chain_id in zip(seqs, chain_ids):
                wrapped_seq = "\n".join(tw.wrap(seq))
                fasta_entry = ('>' + pdb_name + ':' + str(chain_id) + '\n'
                               + wrapped_seq + '\n\n')
                f.write(fasta_entry)
def min_chain_length(pdb_dir, length):
    '''
    Iterates over PDB files in directory, checks the chain length for each
    chain and copies the ones in which at least one chain has at least the
    desired length.
    Called by: AccessPDB.py:main() (Option 8)
    '''
    filtered_dir = 'Over' + length
    # Create the output directory inside pdb_dir, where the files are copied to.
    os.mkdir(os.path.join(pdb_dir, filtered_dir))
    for pdb in pg(os.listdir(pdb_dir), widgets=widgets):
        try:
            pdb_name, pdb_file, structure, contents = parse_pdb(pdb_dir, pdb)
        except:
            continue
        nchains, seqs, chain_ids = extract_seqs(structure, 0)
        if all(len(seq) < int(length) for seq in seqs):
            continue
        shutil.copyfile(
            pdb_dir + '/' + pdb_name + '.pdb',
            pdb_dir + '/' + filtered_dir + '/' + pdb_name + '.pdb')
def match(list1, list2, threshold=0.75):
    '''
    Compute the similarity score between strings from list1 and strings from
    list2 based on the Levenshtein distance.

    Parameters
    list1, list2: lists of strings
    threshold: minimum score needed to consider two strings as matched
               (between 0 and 1, where 0 is no match and 1 is a perfect match)

    Returns
    matched: list of dict
    '''
    matched = []
    for a, _ in zip(list1, pg(range(len(list1)), widgets=widgets)):
        for b in list2:
            a_ = alpha_sort(a)
            b_ = alpha_sort(b)
            d = 1 - dist.eval(a_, b_) / max(len(a_), len(b_))
            if d > threshold:
                matched.append({'list1': a, 'list2': b, 'score': round(d, 3)})
    return matched
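# A small worked example of the similarity used by match(), assuming dist.eval()
# returns a plain Levenshtein edit distance (as e.g. the editdistance package
# does) and that alpha_sort() merely reorders a string so that token order is
# ignored. For 'kinase' vs 'kinases' the edit distance is 1 and the longer
# string has length 7, so the score is 1 - 1/7 ≈ 0.857, which clears the
# default threshold of 0.75.
def _example_match_score(a='kinase', b='kinases'):
    # score = 1 - edit_distance / length of the longer string
    return round(1 - dist.eval(a, b) / max(len(a), len(b)), 3)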
def clean_pdb_files(pdb_dir):
    '''
    Iterates over files in given directory and uses the clean_pdb function
    to write PDB files containing only amino acids.
    Called by: AccessPDB.py:main() (Option 5)
    '''
    clean_dir = "clean"
    os.mkdir(clean_dir)
    print('\n\nCleaning PDB files...\n')
    for pdb in pg(os.listdir(pdb_dir), widgets=widgets):
        if pdb.endswith(".ent") or pdb.endswith(".pdb") or pdb.endswith(".ent.gz"):
            pdb_name = pdb.split('.')[0].split("/")[-1]
            if pdb.endswith(".ent.gz"):
                pdb_file = gzip.open(pdb_dir + '/' + pdb, 'rt')
            else:
                pdb_file = pdb_dir + '/' + pdb
            try:
                structure = p.get_structure(pdb_name, pdb_file)
            except:
                print("Structure " + pdb_name + " could not be strictly parsed.")
                continue
            clean_pdb(structure, pdb_name, clean_dir)
def collect_fasta(verbosity):
    '''
    Fetches fasta files in the pdb_homo_archive and creates a single fasta
    file within a "sequences" folder. For that, it checks the identity among
    the chains in the original fasta and only keeps track of the unique
    chains, i.e. those with less than 99% identity to the other chains.
    These files are later used to make the BLAST databases.
    Called by: update_databases()
    '''
    fastafiles = [
        os.path.join(dp, f)
        for dp, dn, filenames in os.walk(pdb_homo_archive)
        for f in filenames if f.endswith(".fasta")
    ]
    seqdir = os.path.join(pdb_homo_archive, 'sequences')
    if not os.path.isdir(seqdir):
        os.mkdir(seqdir)
    # One collected fasta file per assembly category; each is truncated first.
    collected_fastas = {
        'largepdb': os.path.join(seqdir, 'largepdb_collected.fastas'),
        'homo': os.path.join(seqdir, 'homo_collected.fastas'),
        'mono': os.path.join(seqdir, 'mono_collected.fastas'),
        'hetero': os.path.join(seqdir, 'hetero_collected.fastas'),
    }
    for collected_fasta in collected_fastas.values():
        with open(collected_fasta, 'w+'):
            pass
    for fasta in pg(fastafiles, widgets=widgets):
        pctools.printv('Assessing ' + clrs['y'] + fasta + clrs['n'] + '...',
                       verbosity)
        with open(fasta, 'r') as f:
            contents = f.read()
        contentlines = contents.split('>')
        nchains = str(len(re.findall('>', contents)))
        pctools.printv('With ' + clrs['y'] + nchains + clrs['n'] +
                       ' chains to be assessed\n', verbosity)
        # Keep only chains sharing no more than 99% identity with the chains
        # already collected from this fasta file.
        uniques = []
        for entry in contentlines:
            if entry:
                splitentry = entry.split('\n', 1)
                pdbch = splitentry[0]
                seq = splitentry[1].replace('\n', '')
                if uniques:
                    percent_ids = []
                    for unique in uniques:
                        alignment = parasail.sg_stats_striped_16(
                            seq, unique[1], 10, 1, parasail.blosum62)
                        if alignment.length == 0:
                            percent_ids.append(0)
                        else:
                            percent_ids.append(
                                alignment.matches / alignment.length * 100)
                    if all(percent_id <= 99 for percent_id in percent_ids):
                        uniques.append([pdbch, seq])
                else:
                    uniques.append([pdbch, seq])
        # Append the unique chains to the collected fasta of the right category
        if '/largepdb_sequences/' in fasta:
            outfile = collected_fastas['largepdb']
        elif '/mono_sequences/' in fasta:
            outfile = collected_fastas['mono']
        elif '/hetero_sequences/' in fasta:
            outfile = collected_fastas['hetero']
        elif '/homo_sequences/' in fasta:
            outfile = collected_fastas['homo']
        else:
            continue
        with open(outfile, 'a') as f:
            for unique in uniques:
                wrapped_seq = "\n".join(tw.wrap(unique[1]))
                fasta_entry = '>' + unique[0] + '\n' + wrapped_seq + '\n\n'
                f.write(fasta_entry)
    # Build one BLAST database per collected fasta file.
    for dbname, collected_fasta in [('largedb', collected_fastas['largepdb']),
                                    ('monodb', collected_fastas['mono']),
                                    ('heterodb', collected_fastas['hetero']),
                                    ('homodb', collected_fastas['homo'])]:
        subprocess.run([
            makeblastdb_exe, '-in', collected_fasta, '-dbtype', 'prot',
            '-out', os.path.join(seqdir, dbname)
        ])
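# Standalone sketch of the redundancy test applied in collect_fasta() above:
# two chains are treated as redundant when a semi-global alignment computed
# with parasail shows more than 99% identical positions over the alignment
# length. The gap open/extend penalties (10/1) and the BLOSUM62 matrix mirror
# the call above; this wrapper is only illustrative.
def _example_percent_identity(seq_a, seq_b):
    alignment = parasail.sg_stats_striped_16(seq_a, seq_b, 10, 1,
                                             parasail.blosum62)
    if alignment.length == 0:
        return 0
    return alignment.matches / alignment.length * 100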
def curate_homoDB(verbosity):
    '''
    Creates the homo-oligomeric database from a local pdb repository.
    The divided scheme adopted by RCSB, in which the subdirectories are the
    two middle characters in the PDB code, is assumed.
    Each database contains three key files: dat, log and fasta.
    * homodb.dat contains only the pdb codes contained in the database.
    * homodb.log contains summarized relevant information about each entry.
    * homodb.fasta contains the sequences of every chain in the database.
    Called by: update_databases()
    '''
    # Create stats folder if it does not exist
    stats_dir = os.path.join(pdb_homo_archive, 'stats')
    if not os.path.isdir(stats_dir):
        os.mkdir(stats_dir)
    # Compare latest assession log with new files
    assession_log = read_latest_assession(stats_dir)
    new_files = list_new_files(pdb1_archive, assession_log, verbosity)
    print(clrs['g'] + str(len(new_files)) + clrs['n'] +
          ' new structure files were found and will be processed')
    now = str(time.strftime("%d-%m-%Y@%H.%M.%S"))
    dat_file = os.path.join(stats_dir, now + '-choirdb.dat')
    log_file = os.path.join(stats_dir, now + '-choirdb.log')
    err_file = os.path.join(stats_dir, now + '-choirdb.err')
    if not os.path.isfile(dat_file):
        with open(dat_file, 'w+'):
            pass
    # Write files not to be updated to new dat file
    with open(dat_file, 'a') as f:
        for i in assession_log:
            if i not in new_files:
                f.write(i + " " + assession_log[i] + "\n")
    # Create log file
    if not os.path.isfile(log_file):
        with open(log_file, 'w+') as f:
            f.write('Code, Chains, Author, Software, Date\n')
    # Read chain correspondences
    chain_correspondences_file = os.path.join(stats_dir,
                                              'chain_correspondences.pickle')
    if os.path.isfile(chain_correspondences_file):
        with open(chain_correspondences_file, 'rb') as p:
            chain_correspondences = pickle.load(p)
    else:
        chain_correspondences = {}
    # Main loop that will populate the ProtCHOIR database
    for pdb in pg(new_files, widgets=widgets):
        filename = pdb.split('/')[-1]
        subfolder = pdb.split('/')[-2]
        # Record assessment in dat file
        with open(dat_file, 'a') as f:
            f.write(filename + " " + str(time.time()) + '\n')
        # Start assessment
        pctools.printv('\nAssessing ' + pdb + '...', verbosity)
        # Reject files larger than 2 Mb
        file_size = os.stat(pdb).st_size / 1048576
        pctools.printv('File size: ' + clrs['c'] +
                       '{0:.1g}'.format(file_size) + ' Mb' + clrs['n'],
                       verbosity)
        if file_size > 2:
            pctools.printv(clrs['r'] + "File size too large!" + clrs['n'],
                           verbosity)
            pctools.printv(clrs['y'] +
                           "Will try to fetch sequences from asymmetric unit." +
                           clrs['n'], verbosity)
            try:
                alternative_pdb = os.path.join(
                    pdb_archive, subfolder,
                    'pdb' + filename.split('.')[0] + '.ent.gz')
                pdb_code, structure, nchains = pctools.parse_pdb_structure(
                    alternative_pdb)
                structure, chain_correspondences[pdb_code] = pctools.split_states(
                    structure)
                nchainspostsplit, seqs, chain_ids = pctools.extract_seqs(
                    structure, 0)
                # Write in fasta file
                pctools.printv(clrs['y'] + "Recording large-pdb sequence" +
                               clrs['n'], verbosity)
                record_fasta(pdb_code, seqs, chain_ids, subfolder,
                             type='largepdb')
            except:
                pctools.printv(clrs['r'] + "Failed to fetch sequence!" +
                               clrs['n'], verbosity)
            continue
        try:
            pdb_code, structure, nchains = pctools.parse_pdb_structure(pdb)
            pctools.printv('Number of chains in structure ' + clrs['y'] +
                           pdb_code + clrs['n'] + ': ' + str(nchains),
                           verbosity)
            # Reject structures with more than 60 chains
            if int(nchains) > 60:
                pctools.printv("Number of chains (" + clrs['y'] + str(nchains) +
                               clrs['n'] + ") larger than 60! " + clrs['r'] +
                               "Too many chains!" + clrs['n'], verbosity)
                pctools.printv(clrs['y'] + "Will try to fetch sequences anyway." +
                               clrs['n'], verbosity)
                try:
                    pdb_code, structure, nchains = pctools.parse_pdb_structure(
                        pdb)
                    structure, chain_correspondences[pdb_code] = pctools.split_states(
                        structure)
                    nchainspostsplit, seqs, chain_ids = pctools.extract_seqs(
                        structure, 0)
                    pctools.printv(clrs['y'] + "Recording large-pdb sequence" +
                                   clrs['n'], verbosity)
                    # Write in fasta file
                    record_fasta(pdb_code, seqs, chain_ids, subfolder,
                                 type='largepdb')
                except:
                    pctools.printv(clrs['r'] + "Failed to fetch sequence!" +
                                   clrs['n'], verbosity)
                continue
            structure, chain_correspondences[pdb_code] = pctools.split_states(
                structure)
            nchainspostsplit, seqs, chain_ids = pctools.extract_seqs(
                structure, 0)
            pctools.printv('Number of chains (' + clrs['c'] + str(nchains) +
                           clrs['n'] + ') and file size (' + clrs['c'] +
                           str(file_size) + clrs['n'] + ') OK.' + clrs['g'] +
                           ' Proceeding.' + clrs['n'] + '\n', verbosity)
            # Try to get info from the canonical pdb header (homonymous to pdb1)
            canonpdb = "pdb" + pdb_code + ".ent.gz"
            try:
                contents = pctools.parse_pdb_contents(
                    os.path.join(pdb_archive, subfolder, canonpdb))[1]
            except:
                pctools.printv(clrs['r'] +
                               '\n\n Mismatch between pdb and biounit entries...' +
                               clrs['n'], verbosity)
            author, software = pctools.get_annotated_states(contents)
            pctools.printv('Author determined biological unit = ' + str(author),
                           verbosity)
            pctools.printv('Software determined quaternary structure= ' +
                           str(software), verbosity)
            # Start assessing sequences and structures (from 2 up to 60 chains)
            if 1 < int(nchains) < 61:
                ids, proteinpair = pctools.get_pairwise_ids(seqs, nchains)
                for id in ids:
                    if id[0] >= 90:
                        color = clrs['g']
                    else:
                        color = clrs['r']
                    pctools.printv('Identity between chains ' + clrs['y'] +
                                   str(id[1]) + clrs['n'] + ' and ' + clrs['y'] +
                                   str(id[2]) + clrs['n'] + ' is ' + color +
                                   str(id[0]) + "%" + clrs['n'] + ".",
                                   verbosity)
                # Save records for pure homo-oligomers
                if all(id[0] > 90 for id in ids) and proteinpair is True:
                    pctools.printv("All identities over 90%. Likely " +
                                   clrs['b'] + "homo-oligomeric" + clrs['n'] +
                                   ".", verbosity)
                    pctools.printv(clrs['y'] + "FETCHING" + clrs['n'] + ".\n",
                                   verbosity)
                    # Write file to database
                    newfile = os.path.join(pdb_homo_archive, subfolder,
                                           pdb_code + ".pdb")
                    if not os.path.isdir(os.path.join(pdb_homo_archive,
                                                      subfolder)):
                        os.mkdir(os.path.join(pdb_homo_archive, subfolder))
                    io.set_structure(structure)
                    io.save(newfile)
                    pctools.gzip_pdb(newfile)
                    # Write to log file
                    with open(log_file, 'a') as f:
                        f.write(str(pdb_code) + "," + str(nchains) + "," +
                                '/'.join(author) + "," + '/'.join(software) +
                                "," + str(os.path.getctime(newfile + '.gz')) +
                                '\n')
                    # Write in fasta file
                    pctools.printv(clrs['y'] +
                                   "Recording homo-oligomer sequence." +
                                   clrs['n'], verbosity)
                    record_fasta(pdb_code, seqs, chain_ids, subfolder,
                                 type='homo')
                # Investigate partial homo-oligomers
                elif any(id[0] > 90 for id in ids) and proteinpair is True:
                    at_least_one_interface = False
                    for id in ids:
                        if id[0] > 90:
                            # Check if similar chains share interfaces
                            if pctools.check_interfaces(structure, id[1], id[2]):
                                at_least_one_interface = True
                                pctools.printv('Contacts found between chains ' +
                                               clrs['g'] + str(id[1]) + clrs['n'] +
                                               ' and ' + clrs['g'] + str(id[2]) +
                                               clrs['n'] + ' sharing ' + clrs['g'] +
                                               str(id[0]) + clrs['n'] +
                                               " % identity.", verbosity)
                                pctools.printv("At least one putative " +
                                               clrs['b'] + "homo-oligomeric " +
                                               clrs['n'] + "interface found.",
                                               verbosity)
                                pctools.printv(clrs['y'] + "FETCHING" +
                                               clrs['n'] + ".\n", verbosity)
                                # Write file to database
                                newfile = os.path.join(pdb_homo_archive,
                                                       subfolder,
                                                       pdb_code + ".pdb")
                                if not os.path.isdir(
                                        os.path.join(pdb_homo_archive,
                                                     subfolder)):
                                    os.mkdir(os.path.join(pdb_homo_archive,
                                                          subfolder))
                                io.set_structure(structure)
                                io.save(newfile)
                                pctools.gzip_pdb(newfile)
                                # Write to log file
                                with open(log_file, 'a') as f:
                                    f.write(str(pdb_code) + "," + str(nchains) +
                                            "," + '/'.join(author) + "," +
                                            '/'.join(software) + "," +
                                            str(os.path.getctime(newfile + '.gz')) +
                                            '\n')
                                # Write in fasta file
                                pctools.printv(clrs['y'] +
                                               "Recording homo-oligomer sequence." +
                                               clrs['n'], verbosity)
                                record_fasta(pdb_code, seqs, chain_ids,
                                             subfolder, type='homo')
                                break
                    if at_least_one_interface is False:
                        pctools.printv("No homo-oligomeric interface found. Likely " +
                                       clrs['r'] + "hetero-oligomeric" +
                                       clrs['n'] + ".", verbosity)
                        pctools.printv(clrs['y'] +
                                       "Recording hetero-oligomer sequence" +
                                       clrs['n'], verbosity)
                        # Write in fasta file
                        record_fasta(pdb_code, seqs, chain_ids, subfolder,
                                     type='hetero')
                elif proteinpair is False:
                    pctools.printv(clrs['r'] + "No proteic chain pairs found" +
                                   clrs['n'] + ".", verbosity)
                    if any([set(seq[1]) != {'X'} for seq in seqs]):
                        pctools.printv(clrs['y'] +
                                       "Protein sequences found though" +
                                       clrs['n'], verbosity)
                        pctools.printv(clrs['y'] +
                                       "Recording hetero-oligomer sequence" +
                                       clrs['n'], verbosity)
                        # Write in fasta file
                        record_fasta(pdb_code, seqs, chain_ids, subfolder,
                                     type='hetero')
                    else:
                        pctools.printv(clrs['r'] +
                                       "Not even a single protein chain. Disregarding." +
                                       clrs['n'], verbosity)
                else:
                    pctools.printv("No similar chains found. Likely " +
                                   clrs['r'] + "hetero-oligomeric" + clrs['n'] +
                                   ".", verbosity)
                    pctools.printv(clrs['y'] +
                                   "Recording hetero-oligomer sequence" +
                                   clrs['n'], verbosity)
                    record_fasta(pdb_code, seqs, chain_ids, subfolder,
                                 type='hetero')
            elif int(nchains) == 1:
                pctools.printv("Only one chain found. Likely " + clrs['r'] +
                               "monomeric" + clrs['n'] + ".", verbosity)
                pctools.printv(clrs['y'] + "Recording monomer sequence." +
                               clrs['n'], verbosity)
                structure, chain_correspondences[pdb_code] = pctools.split_states(
                    structure)
                nchains, seqs, chain_ids = pctools.extract_seqs(structure, 0)
                record_fasta(pdb_code, seqs, chain_ids, subfolder, type='mono')
        except:
            errtype, errvalue, errtraceback = sys.exc_info()
            errtypeshort = str(errtype).split('\'')[1]
            pctools.printv(clrs['r'] + '*' + str(errtypeshort) + ': ' +
                           str(errvalue) + ' l.' + str(errtraceback.tb_lineno) +
                           '*' + clrs['n'], verbosity)
            traceback.print_exception(*sys.exc_info())
            if errtypeshort == 'KeyboardInterrupt':
                quit()
            # pctools.printv(clrs['r']+"UNKNOWN FAULT"+clrs['n']+".", verbosity)
            if not os.path.isfile(err_file):
                with open(err_file, 'w+') as f:
                    pass
            with open(err_file, 'a') as f:
                f.write(filename + '\n')
            continue
    with open(chain_correspondences_file, 'wb') as p:
        pickle.dump(chain_correspondences, p,
                    protocol=pickle.HIGHEST_PROTOCOL)
    if not os.path.isfile(err_file):
        with open(err_file, 'w+') as f:
            f.write('\nNo errors. Assessment terminated successfully.\n')
def clean_and_sort(pdb_dir):
    '''
    Make clean directory and homo-multimer subdirectories.
    Called by: AccessPDB.py:main() (Option 4)
    '''
    clean_dir = "clean/"
    try:
        os.mkdir(clean_dir)
        for i in range(1, 7):
            os.mkdir(clean_dir + '/' + str(i) + 'mers')
    except:
        pass
    # Loop through pdb files, detect homo-oligomers and get the oligomeric status
    for pdb in pg(os.listdir(pdb_dir), widgets=widgets):
        if pdb.endswith(".ent") or pdb.endswith(".pdb") or pdb.endswith(".ent.gz"):
            pdb_name = pdb.split('.')[0].split("/")[-1]
            if pdb.endswith(".ent.gz"):
                pdb_file = gzip.open(pdb_dir + '/' + pdb, 'rt')
                contents = gzip.open(pdb_dir + '/' + pdb, 'rt').read()
            else:
                pdb_file = open(pdb_dir + '/' + pdb)
                contents = open(pdb_dir + '/' + pdb, 'rt').read()
            try:
                structure = p.get_structure(pdb_name, pdb_file)
            except:
                print("Structure " + pdb_name + " could not be strictly parsed.")
                continue
            nchains, seqs, chid = extract_seqs(structure, 0)
            del chid
            print("\n\nAssessing " + pdb_name + ". This PDB has got " +
                  str(nchains) + " chain(s).")
            if 2 <= nchains <= 6:
                if author_agrees(oligo_dict, contents, nchains):
                    print("Author agrees that " + pdb_name + " is " +
                          oligo_dict[nchains] + " and IDs will be checked.")
                    ids = get_pairwise_ids(seqs, nchains)
                    if all(id > 90 for id in ids):
                        print("All identities over 90%. Likely homo-oligomer. "
                              "Cleaning and sorting.\n\n")
                        if clean_pdb(structure, pdb_name, clean_dir):
                            os.rename(
                                clean_dir + pdb_name + '.clean.pdb',
                                clean_dir + str(nchains) + 'mers/' + pdb_name +
                                '.clean.pdb')
                        else:
                            print("Oops! Polypeptide chain too short or "
                                  "nonexistent. Skipping.\n\n")
                    else:
                        print("Identity under 90%. Likely not a homo-oligomer. "
                              "Skipping.\n\n")
                else:
                    print("Author disagrees. Although PDB has " + str(nchains) +
                          " chains, likely not " + oligo_dict[nchains] + ".\n\n")
            elif nchains == 1:
                if author_agrees(oligo_dict, contents, nchains):
                    print("Author agrees that " + pdb_name + " is " +
                          oligo_dict[nchains] + ". Cleaning and sorting.\n\n")
                    if clean_pdb(structure, pdb_name, clean_dir):
                        os.rename(
                            clean_dir + pdb_name + '.clean.pdb',
                            clean_dir + str(nchains) + 'mers/' + pdb_name +
                            '.clean.pdb')
                    else:
                        print("Oops! Polypeptide chain too short or "
                              "nonexistent. Skipping.\n\n")
                else:
                    print("Author disagrees. Although PDB has " + str(nchains) +
                          " chains, likely not " + oligo_dict[nchains] + ".\n\n")
            elif nchains > 6:
                print("Too many chains. Skipping\n\n")
def main():
    assert not os.path.isfile('OligoSum.csv'), \
        '\033[1;31;40m \n\n File OligoSum.csv exists. Get rid of it.\n'
    results = open('OligoSum.csv', 'a')
    results.write('PDB ID,Chain length,Was Available,No of templates,Template,'
                  'Is Same,ID,Gesamt Rank,Model Chains, Orig Chains, RMSD,'
                  ' Aligned (%), TM-Score\n')
    # Collect per-job solution flags (one sub-list per assessed job).
    solutions_list = []
    for job in pg(os.listdir(workdir), widgets=widgets):
        if job.endswith('homo.oligo'):
            solution_list = []
            tm_list = []
            ntemplates = 0
            template = 'NA'
            is_same = 'NA'
            id = 'NA'
            gesamt_rank = 'NA'
            RMSD = 'NA'
            alignedp = 'NA'
            nchains_model = 0
            job_id = job.split('_')[0]
            pdb_id = ''.join(list(job)[3:7])
            original_pdb_file = clean_dir + 'pdb' + pdb_id + '.clean.pdb'
            original_structure = p.get_structure('original', original_pdb_file)
            nchains_orig, nres = strtools.count_chains(original_structure)
            if pdb_id in available_models:
                was_available = 'YES'
            else:
                was_available = 'NO'
            model_list = []
            for model in os.listdir(workdir + job):
                if model.startswith('oligo_model'):
                    model_list.append(model)
            if not model_list:
                tmscore = 'No templates found'
                tm_list.append(tmscore)
                results.write(pdb_id + ',' + str(nres) + ',' + was_available +
                              ',' + str(ntemplates) + ',' + template + ',' +
                              is_same + ',' + str(id) + ',' + str(gesamt_rank) +
                              ',' + str(nchains_model) + ',' + str(nchains_orig) +
                              ',' + str(RMSD) + ',' + str(alignedp) + ',' +
                              str(tmscore) + '\n')
            else:
                for model in model_list:
                    template = model.split('_')[2]
                    if template == pdb_id:
                        is_same = 'YES'
                    else:
                        is_same = 'NO'
                    ntemplates = len(model_list)
                    gesamt_results_file = (workdir + job + '/' + job_id +
                                           '.pdb_first_ges.res')
                    with open(gesamt_results_file, 'r') as f:
                        for line in f.readlines():
                            if re.search(template, line):
                                gesamt_rank = line.split()[0]
                                id = line.split()[4]
                                break
                    model_pdb_file = (workdir + job + '/' + model + '/' +
                                      job_id + '.B99990001.pdb')
                    try:
                        model_structure = p.get_structure('modeled',
                                                          model_pdb_file)
                        nchains_model = strtools.count_chains(model_structure)[0]
                    except FileNotFoundError:
                        nchains_model = 0
                    if nchains_model == nchains_orig:
                        merged_model_file = strtools.merge_chains(model_pdb_file)
                        merged_original_file = strtools.merge_chains(
                            original_pdb_file)
                        aligned, RMSD, tmscore = strtools.run_tmalign(
                            merged_model_file, merged_original_file)
                        alignedp = (aligned * 100) / (nchains_model * nres)
                        tm_list.append(tmscore)
                    elif nchains_model == 0:
                        tmscore = 'Templates found but models not built'
                        RMSD = 'NA'
                        alignedp = 'NA'
                        tm_list.append(tmscore)
                    else:
                        tmscore = 'Wrong number of chains'
                        RMSD = 'NA'
                        alignedp = 'NA'
                        tm_list.append(tmscore)
                    results.write(pdb_id + ',' + str(nres) + ',' + was_available +
                                  ',' + str(ntemplates) + ',' + template + ',' +
                                  is_same + ',' + str(id) + ',' +
                                  str(gesamt_rank) + ',' + str(nchains_model) +
                                  ',' + str(nchains_orig) + ',' + str(RMSD) +
                                  ',' + str(alignedp) + ',' + str(tmscore) +
                                  '\n')
            for i in tm_list:
                if type(i) == str or i != max(
                        [x for x in tm_list if type(x) != str]) or i < 0.58:
                    solution_list.append('NO')
                else:
                    solution_list.append('YES')
            solutions_list.append(solution_list)
    results.close()
    merged_solutions_list = sum(solutions_list, [])
    results = pd.read_csv('OligoSum.csv', na_filter=False)
    results.insert(13, 'Solution', merged_solutions_list)
    results.to_csv('OligoSum.csv')