def list_new_files(pdb1_archive, assession_log, verbosity):
    '''
    Lists the biounit files in the local PDB1 archive that still have to be
    assessed by curate_homoDB.

    Every file ending in ".pdb1.gz" found under pdb1_archive is compared
    against assession_log, a dictionary whose keys are file names and whose
    values are the time of the last assession. A file is selected when its
    name is absent from the log or when os.path.getctime() reports a time
    later than the logged one.

    Returns a list with the full paths of the selected files.

    Called by: curate_homoDB()
    '''
    pctools.printv('Assessing files in PDB1 archive...', verbosity)
    assert os.path.isdir(pdb1_archive), clrs['r'] + '\n\n Not able to find PDB archive.\n\n Does "' + pdb1_archive + '" exist?' + clrs['n']

    # First pass: collect every biounit file in the archive tree
    candidates = []
    for dirpath, _subdirs, names in os.walk(pdb1_archive):
        for name in names:
            if name.endswith(".pdb1.gz"):
                candidates.append(os.path.join(dirpath, name))

    # Second pass: keep only the files that are unknown or newer than the log
    selected = []
    for path in candidates:
        key = path.split('/')[-1]
        changed_at = os.path.getctime(path)
        if key in assession_log and not (changed_at > float(assession_log[key])):
            continue
        pctools.printv(clrs['y'] + path + ' should be assessed' + clrs['n'] + '...\n', verbosity)
        selected.append(path)
    return selected
def make_local_template(best_oligo_template):
    '''
    Writes a cleaned copy of the selected template to the working directory.

    The template file is looked up in the local PDB mirror when
    g_args.allow_monomers is set, otherwise in the oligomer database; in both
    cases the sub-directory is given by characters 1 and 2 of the template
    code. The parsed structure is saved through pctools.SelectIfCA().

    Returns the path of the "<code>_CHOIR_CleanTemplate.pdb" file.

    Raises RuntimeError when monomers are allowed and the selected template
    is flagged as an NMR structure by pctools.is_nmr().
    '''
    middle_letters_best = best_oligo_template[1:3]
    if g_args.allow_monomers:
        best_template_file = os.path.join(
            pdb_archive, middle_letters_best,
            'pdb' + best_oligo_template + ".ent.gz")
        pdb_name, contents = pctools.parse_pdb_contents(best_template_file)
        if pctools.is_nmr(contents):
            print(clrs['r'] + '\n\n Selected template ' + best_oligo_template + ' is an NMR structure \n Will try a a different candidate.\n\n' + clrs['n'])
            # A bare "raise" here had no active exception to re-raise and
            # only failed through the interpreter's RuntimeError; raise the
            # same exception type explicitly, with a useful message.
            raise RuntimeError('Selected template ' + best_oligo_template +
                               ' is an NMR structure')
    else:
        best_template_file = os.path.join(pdb_homo_archive,
                                          middle_letters_best,
                                          best_oligo_template + ".pdb.gz")
    clean_template_file = os.path.join(
        workdir, best_oligo_template + "_CHOIR_CleanTemplate.pdb")
    pdb_name, structure, nchains = pctools.parse_any_structure(
        best_template_file)
    io.set_structure(structure)
    io.save(clean_template_file, pctools.SelectIfCA())
    return clean_template_file
def analyse_oligomers(input_file,
                      template_hitchain,
                      oligomers_list,
                      interfaces_dict,
                      tmdata,
                      report,
                      args,
                      entropies=None,
                      z_entropies=None,
                      minx=None,
                      maxx=None):
    '''
    Runs analyse_model() on every built oligomer and collects the reports.

    The arguments are first published as module-level globals (g_*), together
    with the template name and the name of its relevant-chains file, so that
    analyse_model() can read them. When 'M' is part of args.assessment the
    template itself is run through pctools.run_molprobity() first.

    Models are analysed through a multiprocessing Pool when args.multiprocess
    is True and sequentially otherwise. input_file is accepted but not used
    inside this function.

    Returns the list of per-model reports, in the order of oligomers_list.
    '''
    global g_template_hitchain
    global g_interfaces_dict
    global g_tmdata
    global g_report
    global g_args
    global g_entropies
    global g_z_entropies
    global g_minx
    global g_maxx
    global template
    global template_file
    global template_molprobity
    g_template_hitchain = template_hitchain
    g_interfaces_dict = interfaces_dict
    g_tmdata = tmdata
    g_report = report
    g_args = args
    g_entropies = entropies
    g_z_entropies = z_entropies
    g_minx = minx
    g_maxx = maxx
    pctools.print_section(3, 'OLIGOMER ANALYSIS')
    # Define template for comparisons
    template = template_hitchain.split(':')[0]
    template_file = template + '_CHOIR_RelevantChains.pdb'
    reports = []
    if 'M' in args.assessment:
        template_molprobity, molprobity_output = pctools.run_molprobity(
            template_file, args)
        print(molprobity_output)
    # Run the analysis for all models in parallel
    if args.multiprocess is True:
        p = Pool()
        try:
            for model_report, output in p.map_async(analyse_model,
                                                    oligomers_list).get():
                print(output)
                reports.append(model_report)
        finally:
            # Release the worker processes even if a model analysis raised
            p.close()
            p.join()
    else:
        for oligomer in oligomers_list:
            model_report, output = analyse_model(oligomer)
            print(output)
            reports.append(model_report)
    return reports
def extract_relevant_chains(pdb_file, relevant_chains):
    '''
    Saves only the requested chains of pdb_file to a new file.

    The output is named after the part of the input file name that precedes
    "_CHOIR_" and is written to the working directory through
    pctools.SelectChains(relevant_chains).

    Returns the path of the "<name>_CHOIR_RelevantChains.pdb" file.
    '''
    template_name = os.path.basename(pdb_file).split('_CHOIR_')[0]
    pname, structure, nchains = pctools.parse_any_structure(pdb_file)
    relevant_chains_file = os.path.join(
        workdir, template_name + "_CHOIR_RelevantChains.pdb")
    # The chain list that used to be unfolded here was never read, so the
    # structure goes straight to the writer.
    io.set_structure(structure)
    io.save(relevant_chains_file, pctools.SelectChains(relevant_chains))
    return relevant_chains_file
def restore_chain_identifiers(pdb_file, chains_dict, full_residue_mapping):
    '''
    Rebuilds the structure in pdb_file with the original chain identifiers
    and renumbered residues, and saves it to the working directory.

    chains_dict maps each original chain id to the id currently used in
    pdb_file. full_residue_mapping is indexed by the current chain id and is
    handled in two forms: a collections.OrderedDict, used as a lookup from
    the current residue number to the new one, or an int, added as an offset
    to the current residue number.

    Returns the path of the "<name>_CHOIR_CorrectedChains.pdb" file.
    '''
    pname, structure, nchains = pctools.parse_any_structure(pdb_file)
    restored_chains_file = os.path.join(workdir,
                                        pname + "_CHOIR_CorrectedChains.pdb")
    chains = bpp.Selection.unfold_entities(structure, 'C')
    str_id = structure.id
    # Build a fresh structure holding a single model (id 0)
    new_structure = bpp.Structure.Structure(str_id)
    new_model = bpp.Model.Model(0)
    for original, current in chains_dict.items():
        for chain in chains:
            if chain.id == current:
                # Copy the chain under its original identifier
                new_chain = bpp.Chain.Chain(current)
                new_chain.id = original
                for residue in chain:
                    new_residue = bpp.Residue.Residue(residue.id,
                                                      residue.get_resname(),
                                                      residue.get_segid())
                    # Lookup-table mapping: new number is read from the dict
                    if type(full_residue_mapping[current]
                            ) is collections.OrderedDict:
                        for atom in residue:
                            new_residue.add(atom)
                        new_residue.id = (
                            ' ',
                            full_residue_mapping[current][residue.id[1]],
                            ' ')
                    # Offset mapping: new number is the old one plus the int
                    if type(full_residue_mapping[current]) is int:
                        for atom in residue:
                            new_residue.add(atom)
                        new_residue.id = (' ', full_residue_mapping[current] +
                                          residue.id[1], ' ')
                    # NOTE(review): a mapping of any other type leaves the
                    # new residue without atoms and with its old id.
                    new_chain.add(new_residue)
                new_model.add(new_chain)
    new_structure.add(new_model)
    io.set_structure(new_structure)
    io.save(restored_chains_file)
    return restored_chains_file
def update_seqres(verbosity):
    '''
    Runs wget to update the local seqres database, decompresses it and runs
    makeblastdb.

    The download is attempted up to three times. If every attempt fails a
    message is printed and the function returns without touching the local
    files. Otherwise the archive is decompressed and the blast database is
    rebuilt unless wget reported that the remote file was not retrieved and
    the corresponding local file already exists.

    Called by: update_databases()
    '''
    seqres_dir = os.path.join(choirdb, 'seqres')
    if not os.path.isdir(seqres_dir):
        os.mkdir(seqres_dir)
    seqres_txt = os.path.join(seqres_dir, 'pdb_seqres.txt')
    seqres_fasta = os.path.join(seqres_dir, 'seqres.fasta')
    pctools.printv('Fetching pdb_seqres.txt...', verbosity)
    wgetout = None
    attempt = 0
    while attempt < 3:
        try:
            wgetout = subprocess.check_output([
                'wget', '-m', '-r', '-nH', '--cut-dirs=3', '--user=anonymous',
                seqres_ftp, '-P', seqres_dir
            ], stderr=subprocess.STDOUT)
            break
        # Non-zero wget exit status or wget not being runnable at all
        except (subprocess.CalledProcessError, OSError):
            attempt += 1
            if attempt < 3:
                print('Attempt ' + str(attempt) + ' failed, trying again.')
            if attempt == 3:
                print(
                    'Failed to download seqres in 3 attempts. Try again later.'
                )
    # Without wget output there is nothing to decide on; stop here instead of
    # failing later on an unbound variable.
    if wgetout is None:
        return
    wget_log = wgetout.decode('UTF-8')
    no_wget = 'seqres.txt.gz’ -- not retrieving'
    # NOTE(review): this tests for pdb_seqres.txt, but the decompressed data
    # is written to seqres.fasta - confirm which file was meant.
    if no_wget not in wget_log or not os.path.isfile(seqres_txt):
        pctools.printv('Decompressing pdb_seqres.txt...', verbosity)
        with gzip.open(seqres_txt + '.gz', 'rb') as fin, open(seqres_fasta,
                                                              'wb') as fout:
            shutil.copyfileobj(fin, fout)
    if no_wget not in wget_log or not os.path.isfile(seqres_fasta + '.pal'):
        subprocess.run([
            makeblastdb_exe, '-in', seqres_fasta, '-parse_seqids', '-dbtype',
            'prot', '-blastdb_version', '5', '-out', seqres
        ])
def update_uniref(verbosity):
    '''
    Runs wget to update the local uniref50 database, decompresses it and runs
    makeblastdb.

    The download is attempted up to three times. If every attempt fails a
    message is printed and the function returns without touching the local
    files. Otherwise the archive is decompressed and the blast database is
    rebuilt unless wget reported that the remote file was not retrieved and
    the corresponding local file already exists.

    Called by: update_databases()
    '''
    uniref50_fasta = os.path.join(choirdb, 'uniref50/uniref50.fasta')
    pctools.printv('Fetching uniref50.fasta...', verbosity)
    wgetout = None
    attempt = 0
    while attempt < 3:
        try:
            wgetout = subprocess.check_output([
                'wget', '-m', '-r', '-nH', '--cut-dirs=4', '--user=anonymous',
                uniref50_ftp, '-P', choirdb
            ], stderr=subprocess.STDOUT)
            break
        # Non-zero wget exit status or wget not being runnable at all
        except (subprocess.CalledProcessError, OSError):
            attempt += 1
            if attempt < 3:
                print('Attempt ' + str(attempt) + ' failed, trying again.')
            if attempt == 3:
                print(
                    'Failed to download UniRef50 in 3 attempts. Try again later.'
                )
    # Without wget output there is nothing to decide on; stop here instead of
    # failing later on an unbound variable.
    if wgetout is None:
        return
    wget_log = wgetout.decode('UTF-8')
    no_wget = 'uniref50.fasta.gz’ -- not retrieving'
    if no_wget not in wget_log or not os.path.isfile(uniref50_fasta):
        pctools.printv('Decompressing uniref50.fasta...', verbosity)
        with gzip.open(uniref50_fasta + '.gz',
                       'rb') as fin, open(uniref50_fasta, 'wb') as fout:
            shutil.copyfileobj(fin, fout)
    if no_wget not in wget_log or not os.path.isfile(uniref50_fasta + '.pal'):
        subprocess.run([
            makeblastdb_exe, '-in', uniref50_fasta, '-parse_seqids', '-dbtype',
            'prot', '-out', uniref50
        ])
def score_pairwise(seq1, seq2, matrix, gap_s, gap_e):
    '''
    Scores two aligned sequences position by position, overall and per
    30-residue segment.

    For every position the pair (seq1[i], seq2[i]) contributes:
      * score_match(pair, matrix) when neither residue is a gap;
      * 4 when both are gaps and no gap is currently open;
      * gap_s when a gap opens (exactly one '-' and no gap open);
      * gap_e while a gap is open and the pair still contains '-'.
    The open-gap state is carried across segment boundaries.

    seq2 is indexed with the positions of seq1, so it must be at least as
    long as seq1.

    Returns a tuple (score, wscores) with the total score and the list of
    segment scores.
    '''
    score = 0
    gap = False
    nwindows = -(-len(seq1) // 30)
    pctools.printv('Number of 30-residue segments: ' + str(nwindows),
                   g_args.verbosity)
    wscores = []
    # Stepping by 30 yields exactly nwindows segment starts
    for ipos in range(0, len(seq1), 30):
        fpos = min(ipos + 30, len(seq1))
        wscore = 0
        pctools.printv(
            str(ipos + 1) + ' ' + seq1[ipos:fpos] + ' ' + str(fpos),
            g_args.verbosity)
        pctools.printv(
            str(ipos + 1) + ' ' + seq2[ipos:fpos] + ' ' + str(fpos),
            g_args.verbosity)
        for i in range(ipos, fpos):
            pair = (seq1[i], seq2[i])
            if '-' not in pair:
                # Aligned residues always close any open gap; the matrix is
                # looked up once and reused for both totals.
                gap = False
                delta = score_match(pair, matrix)
            elif gap:
                delta = gap_e
            elif pair == ('-', '-'):
                delta = 4
            else:
                gap = True
                delta = gap_s
            score += delta
            wscore += delta
        pctools.printv('Segment score: ' + str(wscore), g_args.verbosity)
        wscores.append(wscore)
    return score, wscores
def record_fasta(pdb_code, seqs, chain_ids, subfolder, type=None):
    '''
    Writes the valid sequences of an entry to
    <pdb_homo_archive>/<subfolder>/<type>_sequences/<pdb_code>.fasta.

    seqs and chain_ids are iterated in parallel; the sequence string is taken
    from index 1 of each seqs item and is only written when
    pctools.is_valid_sequence() accepts it. Each record gets the header
    ">pdb_code:chain_id" and is followed by a blank line.

    type is required despite its None default (kept so existing calls keep
    working); a TypeError is raised when it is missing.
    '''
    if type is None:
        raise TypeError("record_fasta() requires the 'type' argument")
    type_folder = os.path.join(pdb_homo_archive, subfolder,
                               type + '_sequences')
    # Creates the sub-folder and the type folder in one step and tolerates
    # them already existing.
    os.makedirs(type_folder, exist_ok=True)
    fasta_file = os.path.join(type_folder, pdb_code + ".fasta")
    with open(fasta_file, 'w') as f:
        for seq, chain_id in zip(seqs, chain_ids):
            if pctools.is_valid_sequence(seq[1]):
                wrapped_seq = "\n".join(tw.wrap(seq[1]))
                fasta_entry = '>' + pdb_code + ':' + str(
                    chain_id) + '\n' + wrapped_seq + '\n\n'
                f.write(fasta_entry)
def rename_relevant_chains(pdb_file):
    '''
    Renames the chains of pdb_file following the numalpha table and saves the
    result to the working directory.

    The n-th chain (counting from 1) receives the identifier numalpha[str(n)].

    Returns a tuple with the path of the
    "<name>_CHOIR_RenamedChainsTemplate.pdb" file and a dictionary mapping
    each original chain id to its new id.
    '''
    pname, structure, nchains = pctools.parse_any_structure(pdb_file)
    base_name = os.path.basename(pdb_file).split('_CHOIR_')[0]
    renamed_chains_file = os.path.join(
        workdir, base_name + "_CHOIR_RenamedChainsTemplate.pdb")
    all_chains = bpp.Selection.unfold_entities(structure, 'C')
    chains_dict = {}
    # First pass: assign the new ids behind a temporary 'X' prefix
    # (presumably so they cannot clash with ids that are still in use).
    for position, chain in enumerate(all_chains, start=1):
        previous_id = chain.id
        replacement = numalpha[str(position)]
        chain.id = 'X' + replacement
        chains_dict[previous_id] = replacement
    # Second pass: drop the temporary prefix
    for chain in all_chains:
        chain.id = chain.id[1]
    io.set_structure(structure)
    io.save(renamed_chains_file)
    return renamed_chains_file, chains_dict
def analyse_largest_complexes(item):
    '''
    Runs GESAMT for every candidate chain of a template and averages the
    resulting Q-scores.

    item is a pair (hitchain, chains), where hitchain has the form
    "template:chain". The template file is read from the oligomer database,
    in the sub-directory named after characters 1 and 2 of the template code.

    Returns a tuple (hitchain, average_qscore, text), where text joins the
    GESAMT outputs and a closing summary with newlines.
    '''
    hitchain, chains = item
    template, hit_chain = hitchain.split(':')
    template_file = os.path.join(pdb_homo_archive, template[1:3],
                                 template + ".pdb.gz")
    collected = []
    qscore_total = 0
    n_chains = 0
    for n_chains, chain in enumerate(chains, start=1):
        qscore, rmsd, fasta_out, gesamt_output = pctools.run_gesamt(
            template, template_file, input_name, g_input_file, chain, g_args)
        qscore_total += float(qscore)
        collected.append(gesamt_output)
    average_qscore = qscore_total / n_chains
    summary = ('--\n\nAverage Q-Score for all candidate chains is ' +
               clrs['c'] + str(average_qscore) + clrs['n'] + '\n')
    separator = '-------------------------------------------------------------------\n'
    collected.extend([summary, separator])
    return hitchain, average_qscore, '\n'.join(collected)
def curate_homoDB(verbosity):
    '''
    Creates the homo-oligomeric database from a local pdb repository.

    The divided scheme adopted by RCSB, in which the subdirectories are the
    two middle characters in the PDB code, is assumed.

    Each run writes three time-stamped files to the "stats" folder of the
    database:
    * <timestamp>-choirdb.dat lists every assessed file with its assession
      time.
    * <timestamp>-choirdb.log contains summarized relevant information about
      each entry stored in the database.
    * <timestamp>-choirdb.err lists the files whose assessment raised an
      exception.
    Sequences are recorded as fasta files through record_fasta() and the
    chain correspondences returned by pctools.split_states() are pickled to
    chain_correspondences.pickle.

    Called by: update_databases()
    '''
    # Create stats folder if does not exist
    stats_dir = os.path.join(pdb_homo_archive, 'stats')
    if not os.path.isdir(stats_dir):
        os.mkdir(stats_dir)
    # Compare latest assession with new files
    assession_log = read_latest_assession(stats_dir)
    new_files = list_new_files(pdb1_archive, assession_log, verbosity)
    print(clrs['g'] + str(len(new_files)) + clrs['n'] +
          ' new structure files were found and will be processed')
    now = str(time.strftime("%d-%m-%Y@%H.%M.%S"))
    dat_file = os.path.join(stats_dir, now + '-choirdb.dat')
    log_file = os.path.join(stats_dir, now + '-choirdb.log')
    err_file = os.path.join(stats_dir, now + '-choirdb.err')
    if not os.path.isfile(dat_file):
        with open(dat_file, 'w+'):
            pass
    # Write files not to be updated to new dat file
    # NOTE(review): list_new_files() looks the log up by bare file name but
    # returns full paths, so this membership test looks like it can never
    # exclude an entry - confirm against read_latest_assession().
    with open(dat_file, 'a') as f:
        for i in assession_log:
            if i not in new_files:
                f.write(i + " " + assession_log[i] + "\n")
    # Create log file
    if not os.path.isfile(log_file):
        with open(log_file, 'w+') as f:
            f.write('Code, Chains, Author, Software, Date\n')
    # Read Chain correspondences
    # NOTE(review): pickle.load runs on a file from the stats folder; it must
    # only ever be a file written by this program.
    chain_correspondences_file = os.path.join(stats_dir,
                                              'chain_correspondences.pickle')
    if os.path.isfile(chain_correspondences_file):
        with open(chain_correspondences_file, 'rb') as p:
            chain_correspondences = pickle.load(p)
    else:
        chain_correspondences = {}
    # Main loop that will populate the ProtCHOIR database
    for pdb in pg(new_files, widgets=widgets):
        filename = pdb.split('/')[-1]
        subfolder = pdb.split('/')[-2]
        # Record assessment in dat file
        with open(dat_file, 'a') as f:
            f.write(filename + " " + str(time.time()) + '\n')
        # Start assession
        pctools.printv('\nAssessing ' + pdb + '...', verbosity)
        # Files larger than 2 (in units of 1048576 bytes) are not assessed
        # structurally; only their sequences are recorded
        file_size = os.stat(pdb).st_size / 1048576
        pctools.printv(
            'File size: ' + clrs['c'] + '{0:.1g}'.format(file_size) + ' Mb' +
            clrs['n'], verbosity)
        if file_size > 2:
            pctools.printv(clrs['r'] + "File size too large!" + clrs['n'],
                           verbosity)
            pctools.printv(
                clrs['y'] +
                "Will try to fetch sequences from asymmetric unit." +
                clrs['n'], verbosity)
            try:
                alternative_pdb = os.path.join(
                    pdb_archive, subfolder,
                    'pdb' + filename.split('.')[0] + '.ent.gz')
                pdb_code, structure, nchains = pctools.parse_pdb_structure(
                    alternative_pdb)
                structure, chain_correspondences[
                    pdb_code] = pctools.split_states(structure)
                nchainspostsplit, seqs, chain_ids = pctools.extract_seqs(
                    structure, 0)
                # Write in fasta file
                pctools.printv(
                    clrs['y'] + "Recording large-pdb sequence" + clrs['n'],
                    verbosity)
                record_fasta(pdb_code,
                             seqs,
                             chain_ids,
                             subfolder,
                             type='largepdb')
            except:
                pctools.printv(
                    clrs['r'] + "Failed to fetch sequence!" + clrs['n'],
                    verbosity)
            continue
        try:
            pdb_code, structure, nchains = pctools.parse_pdb_structure(pdb)
            pctools.printv(
                'Number of chains in structure ' + clrs['y'] + pdb_code +
                clrs['n'] + ': ' + str(nchains), verbosity)
            # Reject structures with more than 60 chains
            if int(nchains) > 60:
                pctools.printv(
                    "Number of chains (" + clrs['y'] + str(nchains) +
                    clrs['n'] + ") larger than 60! " + clrs['r'] +
                    "Too many chains!" + clrs['n'], verbosity)
                pctools.printv(
                    clrs['y'] + "Will try to fetch sequences anyway." +
                    clrs['n'], verbosity)
                try:
                    pdb_code, structure, nchains = pctools.parse_pdb_structure(
                        pdb)
                    structure, chain_correspondences[
                        pdb_code] = pctools.split_states(structure)
                    nchainspostsplit, seqs, chain_ids = pctools.extract_seqs(
                        structure, 0)
                    pctools.printv(
                        clrs['y'] + "Recording large-pdb sequence" + clrs['n'],
                        verbosity)
                    # Write in fasta file
                    record_fasta(pdb_code,
                                 seqs,
                                 chain_ids,
                                 subfolder,
                                 type='largepdb')
                except:
                    pctools.printv(
                        clrs['r'] + "Failed to fetch sequence!" + clrs['n'],
                        verbosity)
                continue
            structure, chain_correspondences[pdb_code] = pctools.split_states(
                structure)
            nchainspostsplit, seqs, chain_ids = pctools.extract_seqs(
                structure, 0)
            pctools.printv(
                'Number of chains (' + clrs['c'] + str(nchains) + clrs['n'] +
                ') and file size (' + clrs['c'] + str(file_size) + clrs['n'] +
                ') OK.' + clrs['g'] + ' Proceeding.' + clrs['n'] + '\n',
                verbosity)
            # Try to get info from the canonic pdb header (homonymous to pdb1)
            canonpdb = "pdb" + pdb_code + ".ent.gz"
            try:
                contents = pctools.parse_pdb_contents(
                    os.path.join(pdb_archive, subfolder, canonpdb))[1]
            except:
                pctools.printv(
                    clrs['r'] +
                    '\n\n Mismatch between pdb and biounit entries...' +
                    clrs['n'], verbosity)
            # NOTE(review): when the parse above fails, "contents" is unbound
            # (or left over from a previous file); a NameError raised here is
            # caught by the outer except and the file goes to the err file.
            author, software = pctools.get_annotated_states(contents)
            pctools.printv(
                'Author determined biological unit = ' + str(author),
                verbosity)
            pctools.printv(
                'Software determined quaternary structure= ' + str(software),
                verbosity)
            # Start assessing sequences and structures (from 2 up to 60 chains)
            if 1 < int(nchains) < 61:
                ids, proteinpair = pctools.get_pairwise_ids(seqs, nchains)
                # Each item of ids is indexed as (identity, chain, chain)
                for id in ids:
                    if id[0] >= 90:
                        color = clrs['g']
                    else:
                        color = clrs['r']
                    pctools.printv(
                        'Identity between chains ' + clrs['y'] + str(id[1]) +
                        clrs['n'] + ' and ' + clrs['y'] + str(id[2]) +
                        clrs['n'] + ' is ' + color + str(id[0]) + "%" +
                        clrs['n'] + ".", verbosity)
                # Save records for pure homo-oligomers
                if all(id[0] > 90 for id in ids) and proteinpair is True:
                    pctools.printv(
                        "All identities over 90%. Likely " + clrs['b'] +
                        "h**o-oligomeric" + clrs['n'] + ".", verbosity)
                    pctools.printv(clrs['y'] + "FETCHING" + clrs['n'] + ".\n",
                                   verbosity)
                    # Write file to database
                    newfile = os.path.join(pdb_homo_archive, subfolder,
                                           pdb_code + ".pdb")
                    if not os.path.isdir(
                            os.path.join(pdb_homo_archive, subfolder)):
                        os.mkdir(os.path.join(pdb_homo_archive, subfolder))
                    io.set_structure(structure)
                    io.save(newfile)
                    pctools.gzip_pdb(newfile)
                    # Write to log file
                    with open(log_file, 'a') as f:
                        f.write(
                            str(pdb_code) + "," + str(nchains) + "," +
                            '/'.join(author) + "," + '/'.join(software) + "," +
                            str(os.path.getctime(newfile + '.gz')) + '\n')
                    # Write in fasta file
                    pctools.printv(
                        clrs['y'] + "Recording h**o-oligomer sequence." +
                        clrs['n'], verbosity)
                    record_fasta(pdb_code,
                                 seqs,
                                 chain_ids,
                                 subfolder,
                                 type='h**o')
                # Investigate partial homo-oligomers
                elif any(id[0] > 90 for id in ids) and proteinpair is True:
                    at_least_one_interface = False
                    for id in ids:
                        if id[0] > 90:
                            # Check if similar chains share interfaces
                            if pctools.check_interfaces(
                                    structure, id[1], id[2]):
                                at_least_one_interface = True
                                pctools.printv(
                                    'Contacts found between chains ' +
                                    clrs['g'] + str(id[1]) + clrs['n'] +
                                    ' and ' + clrs['g'] + str(id[2]) +
                                    clrs['n'] + ' sharing ' + clrs['g'] +
                                    str(id[0]) + clrs['n'] + " % identity.",
                                    verbosity)
                                pctools.printv(
                                    "At least one putative " + clrs['b'] +
                                    "h**o-oligomeric " + clrs['n'] +
                                    "interface found.", verbosity)
                                pctools.printv(
                                    clrs['y'] + "FETCHING" + clrs['n'] + ".\n",
                                    verbosity)
                                # Write file to database
                                newfile = os.path.join(pdb_homo_archive,
                                                       subfolder,
                                                       pdb_code + ".pdb")
                                if not os.path.isdir(
                                        os.path.join(pdb_homo_archive,
                                                     subfolder)):
                                    os.mkdir(
                                        os.path.join(pdb_homo_archive,
                                                     subfolder))
                                io.set_structure(structure)
                                io.save(newfile)
                                pctools.gzip_pdb(newfile)
                                # Write to log file
                                with open(log_file, 'a') as f:
                                    f.write(
                                        str(pdb_code) + "," + str(nchains) +
                                        "," + '/'.join(author) + "," +
                                        '/'.join(software) + "," +
                                        str(os.path.getctime(newfile +
                                                             '.gz')) + '\n')
                                # Write in fasta file
                                pctools.printv(
                                    clrs['y'] +
                                    "Recording h**o-oligomer sequence." +
                                    clrs['n'], verbosity)
                                record_fasta(pdb_code,
                                             seqs,
                                             chain_ids,
                                             subfolder,
                                             type='h**o')
                                # One interface is enough to store the entry
                                break
                    if at_least_one_interface is False:
                        pctools.printv(
                            "No h**o-oligomeric interface found. Likely " +
                            clrs['r'] + "hetero-oligomeric" + clrs['n'] + ".",
                            verbosity)
                        pctools.printv(
                            clrs['y'] + "Recording hetero-oligomer sequence" +
                            clrs['n'], verbosity)
                        # Write in fasta file
                        record_fasta(pdb_code,
                                     seqs,
                                     chain_ids,
                                     subfolder,
                                     type='hetero')
                elif proteinpair is False:
                    pctools.printv(
                        clrs['r'] + "No proteic chain pairs found" +
                        clrs['n'] + ".", verbosity)
                    # A sequence made only of 'X' is not counted as protein
                    if any([set(seq[1]) != {'X'} for seq in seqs]):
                        pctools.printv(
                            clrs['y'] + "Protein sequences found though" +
                            clrs['n'], verbosity)
                        pctools.printv(
                            clrs['y'] + "Recording hetero-oligomer sequence" +
                            clrs['n'], verbosity)
                        # Write in fasta file
                        record_fasta(pdb_code,
                                     seqs,
                                     chain_ids,
                                     subfolder,
                                     type='hetero')
                    else:
                        pctools.printv(
                            clrs['r'] +
                            "Not even a single protein chain. Disregarding." +
                            clrs['n'], verbosity)
                else:
                    pctools.printv(
                        "No similar chains found. Likely " + clrs['r'] +
                        "hetero-oligomeric" + clrs['n'] + ".", verbosity)
                    pctools.printv(
                        clrs['y'] + "Recording hetero-oligomer sequence" +
                        clrs['n'], verbosity)
                    record_fasta(pdb_code,
                                 seqs,
                                 chain_ids,
                                 subfolder,
                                 type='hetero')
            elif int(nchains) == 1:
                pctools.printv(
                    "Only one chain found. Likely " + clrs['r'] + "monomeric" +
                    clrs['n'] + ".", verbosity)
                pctools.printv(
                    clrs['y'] + "Recording monomer sequence." + clrs['n'],
                    verbosity)
                structure, chain_correspondences[
                    pdb_code] = pctools.split_states(structure)
                nchains, seqs, chain_ids = pctools.extract_seqs(structure, 0)
                record_fasta(pdb_code, seqs, chain_ids, subfolder, type='mono')
        except:
            # Any failure is reported, written to the err file and the loop
            # moves on; only a keyboard interrupt stops the whole run.
            errtype, errvalue, errtraceback = sys.exc_info()
            errtypeshort = str(errtype).split('\'')[1]
            pctools.printv(
                clrs['r'] + '*' + str(errtypeshort) + ': ' + str(errvalue) +
                ' l.' + str(errtraceback.tb_lineno) + '*' + clrs['n'],
                verbosity)
            traceback.print_exception(*sys.exc_info())
            if errtypeshort == 'KeyboardInterrupt':
                quit()
            #pctools.printv(clrs['r']+"UNKNOWN FAULT"+clrs['n']+".", verbosity)
            if not os.path.isfile(err_file):
                with open(err_file, 'w+') as f:
                    pass
            with open(err_file, 'a') as f:
                f.write(filename + '\n')
            continue
    # Persist the chain correspondences gathered during this run
    with open(chain_correspondences_file, 'wb') as p:
        pickle.dump(chain_correspondences,
                    p,
                    protocol=pickle.HIGHEST_PROTOCOL)
    # An err file that was never created means no assessment failed
    if not os.path.isfile(err_file):
        with open(err_file, 'w+') as f:
            f.write('\nNo errors. Assessment terminated succesfully.\n')
def main():
    '''
    Entry point of the program: either updates the local databases or runs
    the oligomerization protocol on the input file.

    The runtime arguments are read from the module-level initial_args. With
    args.update set, the database paths are shown and the databases are
    updated after interactive confirmation. Otherwise the arguments are
    validated, the input file is normalised, a placeholder summary is
    written, and the protocol runs in three stages: analyze_protomer(),
    make_oligomer() and analyse_oligomers(), followed by finalize().

    start_time and start_timestamp are not defined in this function and are
    presumably module-level values set at start-up.
    '''
    args = initial_args
    # Define multiprocessing options
    args.available_cores = cpu_count()
    if args.force_single_core is True:
        args.multiprocess = False
        args.psiblast_threads = 1
        args.modeller_threads = 1
    else:
        if args.psiblast_threads is None:
            args.psiblast_threads = args.available_cores
        if args.modeller_threads is None:
            args.modeller_threads = min([args.available_cores, args.models])
    if args.update is True:
        print(
            tw.dedent(""" !WARNING! You have chosen to updtate the local databases. ** The root directory for the database files is: """ +
                      clrs['y'] + choirdb + clrs['n'] +
                      """ ** The path to local pdb mirror is: """ +
                      clrs['y'] + pdb_archive + clrs['n'] +
                      """ ** The path to local pdb biounit mirror is: """ +
                      clrs['y'] + pdb1_archive + clrs['n'] +
                      """ ** The path to local gesamt archive is: """ +
                      clrs['y'] + ges_homo_archive + clrs['n'] +
                      """ ** The path to local UniRef50 blast database is: """ +
                      clrs['y'] + uniref50 + clrs['n'] +
                      """ This could take a long time. <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< """))
        option = input('Do you confirm the information above? (y/n)')
        if option == 'y' or option == 'Y' or option == 'YES' or option == 'yes' or option == 'Yes':
            update_databases(args.verbosity)
            print('\n\nDone updating all databases. Exiting.\n')
        else:
            print('\n\nNo positive confirmation, will not update databases.\n')
            exit()
    # Actually run oligomerization protocol
    else:
        outdir = os.getcwd()
        input_file = args.input_file
        # Validate the environment and the runtime arguments
        assert os.path.isdir(pdb_archive), clrs[
            'r'] + '\n\n Not able to find PDB directory.\n\n Does "' + pdb_archive + '" exist?' + clrs[
                'n']
        assert os.path.isdir(pdb1_archive), clrs[
            'r'] + '\n\n Not able to find PDB1 assemblies directory.\n\n Does "' + pdb1_archive + '" exist?' + clrs[
                'n']
        assert os.path.isdir(pdb_homo_archive), clrs[
            'r'] + '\n\n Not able to find ProtCHOIR database directory.\n\n Does "' + pdb_homo_archive + '" exist?' + clrs[
                'n']
        assert os.path.isdir(ges_homo_archive), clrs[
            'r'] + '\n\n Not able to find GESAMT archive directory.\n\n Does "' + ges_homo_archive + '" exist?' + clrs[
                'n']
        assert args.refine_level in [0, 1, 2, 3, 4], clrs[
            'r'] + '\n\n Refinement level must be an integer number from 0 to 4.\n Run ProtCHOIR -h for more information\n\n' + clrs[
                'n']
        assert args.psiblast_params in psiblast_params, clrs[
            'r'] + '\n\n PSI-BLAST parameters invalid.\n Run ProtCHOIR -h for more information\n\n' + clrs[
                'n']
        assert input_file is not None, clrs[
            'r'] + '\n\n Please inform the input file name.\n Run ProtCHOIR -h for more information.\n\n' + clrs[
                'n']
        assert os.path.isfile(input_file), clrs[
            'r'] + '\n\n Not able to find input file.\n\n Does "' + input_file + '" exist?\n' + clrs[
                'n']
        assert args.zip_output in [0, 1, 2], clrs[
            'r'] + '\n\n Compression level must be an integer number between 0 and 2.\n Run ProtCHOIR -h for more information\n\n' + clrs[
                'n']
        assert all([
            i in set('MIG') for i in set(args.assessment)
        ]) or args.assessment == 'N', clrs[
            'r'] + '\n\n Oligomer assessment type do not comply.\n Choose any combination of [G]Gesamt, [M]Molprobity, [I]Interfaces or choose [N] for None\n\n' + clrs[
                'n']
        # Force generation of topologies and all assessments if final report is requested
        if args.generate_report is True:
            args.assessment = 'MIG'
            args.plot_topologies = True
        # Deal with dots and dashes in the input file and remove dots
        if input_file.lower().endswith('.pdb'):
            input_basename = os.path.basename(input_file).split('.pdb')[0]
            input_basename = input_basename.replace(".", "_")
            input_basename = input_basename.replace("-", "_")
            new_input_file = input_basename + '.pdb'
            # Only copy when the sanitised name differs from the original
            if os.path.basename(input_file) == os.path.basename(
                    new_input_file):
                pass
            else:
                shutil.copy(input_file, new_input_file)
        # Also process filename to fasta header if input file is fasta
        elif input_file.lower().endswith('.fasta'):
            input_basename = os.path.basename(input_file).split('.fasta')[0]
            input_basename = input_basename.replace(".", "_")
            input_basename = input_basename.replace("-", "_")
            new_input_file = os.path.join(
                outdir, input_basename + '_CHOIR_MonomerSequence.fasta')
            with open(input_file, 'r') as infile, open(new_input_file,
                                                       'w') as outfile:
                outfile.write('>' + input_basename + '\n')
                # Copy only the first record: stop at the second header line
                n = 0
                for line in infile.readlines():
                    if not line.startswith('>'):
                        outfile.write(line)
                    else:
                        n += 1
                        if n == 2:
                            break
            args.sequence_mode = True
        else:
            raise pctools.FileFormatError(
                clrs['r'] +
                '\n\n Input format must be either pdb or fasta\n Run ./ProtCHOIR -h for more information\n\n'
                + clrs['n'])
        if args.allow_monomers:
            assert args.sequence_mode is True, clrs[
                'r'] + '\n\n To allow building monomers you must use sequence mode. \n Run ProtCHOIR -h for more information\n\n' + clrs[
                    'n']
        # Start recording job progress
        with open('CHOIR_Progress.out', 'w') as f:
            f.write("Starting new ProtCHOIR run\n")
        # Pickle Runtime arguments
        # NOTE(review): the file object passed here is never closed explicitly
        pickle.dump(args, open('CHOIR_Args.pickle', 'wb'))
        # Show arguments used and create CHOIR.args
        pctools.print_section(0, "Runtime Arguments")
        runtime_arguments = {}
        choir_args = os.path.join(outdir, "CHOIR.args")
        with open(choir_args, 'w') as f:
            for name, value in vars(args).items():
                runtime_arguments[name] = value
                print(name + "=" + str(value))
                f.write(name + "=" + str(value) + "\n")
        print('\nRuntime parameters written to: ' + clrs['g'] +
              os.path.basename(choir_args) + clrs['n'] + '\n')
        # Initialize report
        report = {}
        report['runtime_arguments'] = runtime_arguments
        report['input_filename'] = os.path.basename(new_input_file)
        # Write errorprof placeholder summary
        # (every field that is still unknown is written as 'NA')
        placeholder_report = report.copy()
        report_data = [
            'input_filename', 'sequence_mode', 'templatedmodel',
            'protomer_residues', 'tmspans', 'highest_scoring_state',
            'homo_oligomeric_over_other_score', 'best_template',
            'best_nchains', 'best_id', 'best_cov', 'best_qscore',
            'model_oligomer_name', 'model_molprobity', 'gesamt_rmsd',
            'protchoir_score', 'surface_score', 'interfaces_score',
            'quality_score', 'total_runtime', 'exit'
        ]
        for data in report_data:
            if data not in placeholder_report:
                placeholder_report[data] = 'NA'
        with open(input_basename + '_CHOIR_Summary.tsv', 'w') as f:
            f.write(
                'Input\tSeq.Mode\tTemplated\tLength\tTMSpans\tLikelyState\tH3OScore\tTemplate\tChains\tIdentity\tCoverage\tAv.QScore\tBestModel\tMolprobity\tRMSD\tProtCHOIR\tSurface\tInterfaces\tQuality\tRuntime\tExit\n'
            )
            f.write('\t'.join(
                [str(placeholder_report[data])
                 for data in report_data]) + '\n')
        # Start analysis of protomer
        analyse_protomer_results, report, args = analyze_protomer(
            new_input_file, report, args)
        # If no suitable homo-oligomeric template was found, exit nicely.
        if analyse_protomer_results is None:
            finalize(report, input_basename, start_time, start_timestamp, args)
            pctools.print_sorry()
            sys.exit(0)
        # Else, proceed conditionally on runtime arguments
        # (the length of the result tuple depends on sequence_mode and
        # skip_conservation)
        elif analyse_protomer_results is not None and args.sequence_mode is True:
            residue_index_mapping = None
            minx = None
            maxx = None
            if args.skip_conservation:
                entropies = None
                z_entropies = None
                pdb_name, clean_input_file, largest_oligo_complexes, interfaces_dict, tmdata = analyse_protomer_results
            elif not args.skip_conservation:
                pdb_name, clean_input_file, largest_oligo_complexes, interfaces_dict, entropies, z_entropies, tmdata = analyse_protomer_results
                # No conservation data came back: treat it as skipped
                if entropies == z_entropies == minx == maxx == None:
                    args.skip_conservation = True
        elif analyse_protomer_results is not None and args.sequence_mode is False:
            if args.skip_conservation:
                minx = None
                maxx = None
                entropies = None
                z_entropies = None
                pdb_name, clean_input_file, largest_oligo_complexes, interfaces_dict, residue_index_mapping, tmdata = analyse_protomer_results
            elif not args.skip_conservation:
                pdb_name, clean_input_file, largest_oligo_complexes, interfaces_dict, entropies, z_entropies, residue_index_mapping, minx, maxx, tmdata = analyse_protomer_results
                # No conservation data came back: treat it as skipped
                if entropies == z_entropies == minx == maxx == None:
                    args.skip_conservation = True
        report['runtime_arguments'][
            'skip_conservation'] = args.skip_conservation
        new_input_file = clean_input_file
        # Use information of complexes to build oligomers
        best_oligo_template, built_oligomers, report = make_oligomer(
            new_input_file,
            largest_oligo_complexes,
            report,
            args,
            residue_index_mapping=residue_index_mapping)
        # If no models were built, exit nicely.
        if built_oligomers is None:
            finalize(report, input_basename, start_time, start_timestamp, args)
            pctools.print_sorry()
            sys.exit(0)
        # Analyse built models
        reports = analyse_oligomers(new_input_file,
                                    best_oligo_template,
                                    built_oligomers,
                                    interfaces_dict,
                                    tmdata,
                                    report,
                                    args,
                                    entropies=entropies,
                                    z_entropies=z_entropies,
                                    minx=minx,
                                    maxx=maxx)
        finalize(reports, input_basename, start_time, start_timestamp, args)
def finalize(reports, input_basename, start_time, start_timestamp, args):
    '''
    Writes the final summary of a run, records the exit status in the
    progress file and optionally compresses the output.

    reports is either a list of per-model reports (successful run), from
    which the one with the highest 'protchoir_score' is summarised, or a
    single report dictionary (run that stopped early), whose missing summary
    fields are filled with 'NA'. The chosen report is updated in place with
    'total_runtime' and, when absent, 'exit' = '0'.

    start_time is the datetime at which the run started and start_timestamp
    the matching POSIX timestamp; files in the current directory whose
    os.path.getctime() lies between start_timestamp and the end of this
    function are the ones compressed when args.zip_output > 0. Compressed
    entries are then deleted unless they are listed as needed (summary,
    models and report files); with args.zip_output == 2 only the summary is
    kept.

    Raises TypeError when reports is neither a list nor a dict.
    '''
    report_data = [
        'input_filename', 'sequence_mode', 'templatedmodel',
        'protomer_residues', 'tmspans', 'highest_scoring_state',
        'homo_oligomeric_over_other_score', 'best_template', 'best_nchains',
        'best_id', 'best_cov', 'best_qscore', 'model_oligomer_name',
        'model_molprobity', 'gesamt_rmsd', 'quality_score', 'surface_score',
        'interfaces_score', 'protchoir_score', 'total_runtime', 'exit'
    ]
    # Report entries whose value is a single file that must survive clean-up
    keep_keys = [
        'html_report', 'molprobity_radar', 'comparison_plots',
        'protomer_figure', 'protomer_plot', 'template_figure',
        'topology_figure', 'assemblied_protomer_plot', 'input_filename'
    ]
    if isinstance(reports, list):
        if args.zip_output == 2:
            # Don't prevent compression of anything
            nozip = []
            for report in reports:
                if args.generate_report is True:
                    report['html_report'] = pctools.html_report(report, args)
        else:
            # Prevent compression of files needed for the report and the models
            nozip = [
                os.path.basename(report['model_filename'])
                for report in reports
            ]
            for report in reports:
                if args.generate_report is True:
                    report['html_report'] = pctools.html_report(report, args)
                for key, value in report.items():
                    if key in keep_keys:
                        nozip.append(os.path.basename(value))
                    if key == 'model_figures':
                        for figure in value:
                            nozip.append(os.path.basename(figure))
        best_report = sorted(reports,
                             key=operator.itemgetter('protchoir_score'))[-1]
    elif isinstance(reports, dict):
        nozip = []
        best_report = reports
        for data in report_data:
            if data not in best_report:
                best_report[data] = 'NA'
    else:
        raise TypeError('reports must be a list of reports or a single '
                        'report dictionary')
    # Generate summary tsv file for the best report
    end_time = datetime.now()
    runtime = end_time - start_time
    # timedelta.seconds drops whole days; total_seconds() keeps them
    runtime_seconds = str(int(runtime.total_seconds()))
    best_report['total_runtime'] = runtime_seconds
    summary_file = input_basename + '_CHOIR_Summary.tsv'
    nozip.append(summary_file)
    exit_messages = {
        '1': ": ERROR! Indicated template not found in oligomers database...",
        '2': ": ERROR! Failed to find suitable homologues...",
        '3': ": ERROR! Failed to find suitable h**o-oligomeri interfaces...",
        '4': ": ERROR! No template had an average Q-score above cut-off...",
        '5': ": ERROR! Failed to find templates in local databases...",
        '6': ": ERROR! Sub-optimal alignment between template and target sequences...",
    }
    if 'exit' not in best_report:
        best_report['exit'] = '0'
        progress_message = ": Finished running ProtCHOIR!"
    else:
        # Any other exit value leaves the progress file untouched
        progress_message = exit_messages.get(best_report['exit'])
    if progress_message is not None:
        with open('CHOIR_Progress.out', 'a') as f:
            f.write(datetime.now().strftime("%H:%M:%S") + progress_message)
    with open(summary_file, 'w') as f:
        f.write(
            'Input\tSeq.Mode\tTemplated\tLength\tTMSpans\tLikelyState\tH3OScore\tTemplate\tChains\tIdentity\tCoverage\tAv.QScore\tBestModel\tMolprobity\tRMSD\tQuality\tSurface\tInterfaces\tProtCHOIR\tRuntime\tExit\n'
        )
        f.write('\t'.join([str(best_report[data])
                           for data in report_data]) + '\n')
    # Finalise
    final_end_time = datetime.timestamp(datetime.now())
    time.sleep(1)
    # Compress output
    if args.zip_output > 0:
        try:
            # zlib is only imported to find out whether deflate is available
            import zlib
            compression = zipfile.ZIP_DEFLATED
        except (ImportError, AttributeError):
            compression = zipfile.ZIP_STORED
        zip_name = input_basename + '_ProtCHOIR_OUT.zip'
        with zipfile.ZipFile(zip_name, 'w', compression=compression) as zipf:
            for f in os.listdir(os.getcwd()):
                if f == zip_name:
                    continue
                if start_timestamp < os.path.getctime(f) < final_end_time:
                    print('Compressing... ' + f)
                    zipf.write(f)
                    if f not in nozip:
                        if os.path.isdir(f):
                            shutil.rmtree(f)
                        elif os.path.isfile(f):
                            os.remove(f)
    print('FINISHED AT: ' + datetime.now().strftime("%d-%m-%Y %H:%M"))
    print('TOTAL RUNTIME: ' + runtime_seconds + ' s')
def _count_edge_gaps(seq):
    '''
    Return the number of '-' characters at the start and at the end of seq.
    A sequence made only of gaps reports its full length for both counts.
    '''
    leading = len(seq) - len(seq.lstrip('-'))
    trailing = len(seq) - len(seq.rstrip('-'))
    return leading, trailing


def generate_ali(alignments, best_oligo_template, residue_index_mapping, args):
    '''
    Merges the per-chain fasta alignments into a single PIR alignment file
    (<input_name>_<template>_CHOIR_Alignment.ali in workdir).

    For every file in `alignments`, the sequence line following each '>'
    header is recorded (upper-cased). The leading and trailing gaps of the
    template entry (<template>_CHOIR_RenamedChainsTemplate) are counted and
    the same positions are blanked with '-' in every entry of that file.
    If args.symmetry is set, the largest template edge gaps among all chains
    are applied again to every chain.

    alignments: iterable of paths to per-chain fasta alignment files
    best_oligo_template: template code used to build entry and file names
    residue_index_mapping: dict residue -> index, or None
    args: options object; only args.symmetry is read here

    Returns (final_alignment, full_residue_mapping). The mapping is keyed by
    chain; each value is either an OrderedDict of residue -> index shifted by
    that chain's leading gaps, or, when residue_index_mapping is None, just
    the number of leading gaps.

    Relies on the module globals workdir, input_name and verbosity.
    '''
    best_oligo_template_file = best_oligo_template + "_CHOIR_RenamedChainsTemplate"
    final_alignment = os.path.join(
        workdir,
        input_name + '_' + best_oligo_template + '_CHOIR_Alignment.ali')
    alignment_dict = {}
    full_residue_mapping = {}

    # Parse individual GESAMT alignments and organize in a per-chain dictionary
    for fasta_alignment in alignments:
        getseq = False
        template = False
        chain = None
        entryseq_dict = {}
        # The context manager guarantees the handle is closed; the original
        # open(...).readlines() left that to the garbage collector.
        with open(fasta_alignment, 'r') as alignment_handle:
            for line in alignment_handle:
                # Only record sequence if line above starts with >
                if getseq is True:
                    getseq = False
                    seq = line.replace('\n', '')
                    # If this is the template, count leading and trailing gaps
                    if template is True:
                        template = False
                        leading_gaps, trailing_gaps = _count_edge_gaps(seq)
                    assert seq != '', 'Sequence is empty'
                    entryseq_dict[entry] = seq.upper()
                # If it is an entry line, get details and expect sequence
                if line.startswith('>'):
                    entry = line.split('>')[1].split('(')[0].split(
                        '.pdb')[0].replace('\n', '')
                    # If entry is template, use chain as reference
                    if entry == best_oligo_template_file:
                        chain = line.split('(')[1].split(')')[0]
                        template = True
                    getseq = True

        # Remove leading and trailing gaps from the alignment for both
        # template and query
        if trailing_gaps == 0:
            for entry, seq in entryseq_dict.items():
                entryseq_dict[entry] = leading_gaps * '-' + seq[leading_gaps:]
        else:
            for entry, seq in entryseq_dict.items():
                entryseq_dict[entry] = (
                    leading_gaps * '-'
                    + seq[leading_gaps:-trailing_gaps]
                    + trailing_gaps * '-')
        if residue_index_mapping is not None:
            full_residue_mapping[chain] = collections.OrderedDict()
            for res, i in residue_index_mapping.items():
                full_residue_mapping[chain][res] = i + leading_gaps
        else:
            full_residue_mapping[chain] = leading_gaps
        alignment_dict[chain] = entryseq_dict
        pctools.printv(
            'Removed ' + clrs['c'] + str(leading_gaps) + clrs['n']
            + ' leading gaps and ' + clrs['c'] + str(trailing_gaps)
            + clrs['n'] + ' trailing gaps from chain ' + clrs['c'] + chain
            + clrs['n'] + ' alignment.\n', verbosity)

    # If symmetry is desired, reduce all chains to match the size of the
    # smallest
    if args.symmetry:
        max_leading_gaps = 0
        max_trailing_gaps = 0
        for chain, seqs in alignment_dict.items():
            for entry, seq in seqs.items():
                if entry == best_oligo_template_file:
                    leading_gaps, trailing_gaps = _count_edge_gaps(seq)
                    max_leading_gaps = max(max_leading_gaps, leading_gaps)
                    max_trailing_gaps = max(max_trailing_gaps, trailing_gaps)
        pctools.printv(
            'To cope with symmetry restraints, the modelled sequence will contain '
            + clrs['c'] + str(max_leading_gaps) + clrs['n']
            + ' leading gaps and ' + clrs['c'] + str(max_trailing_gaps)
            + clrs['n'] + ' trailing gaps' + clrs['n'] + '.\n', verbosity)
        print(max_trailing_gaps)
        # NOTE(review): `entry` is NOT rebound in this loop; it still holds
        # the last key visited by the loop above, so only that single entry
        # is re-gapped in each chain. The message above suggests the target
        # is the modelled (query) sequence - confirm and name it explicitly.
        for chain, seqs in alignment_dict.items():
            if max_trailing_gaps == 0:
                seqs[entry] = (max_leading_gaps * '-'
                               + seqs[entry][max_leading_gaps:])
            else:
                seqs[entry] = (
                    max_leading_gaps * '-'
                    + seqs[entry][max_leading_gaps:-max_trailing_gaps]
                    + max_trailing_gaps * '-')

    # Find out first and last chains
    ordered_chains = sorted(alignment_dict)
    first_chain = ordered_chains[0]
    last_chain = ordered_chains[-1]

    # Create strings to write in alignment file
    alignment_string_dict = {}
    for entry in [input_name, best_oligo_template_file]:
        if entry == input_name:
            alignment_string_dict[entry] = (
                ">P1;" + input_name + "\nsequence:" + input_name + ":FIRST:"
                + first_chain + ":LAST:" + last_chain + "::::\n")
        elif entry == best_oligo_template_file:
            alignment_string_dict[entry] = (
                ">P1;" + best_oligo_template_file + ".pdb\nstructureX:"
                + best_oligo_template_file + ".pdb:FIRST:" + first_chain
                + ":LAST:" + last_chain + "::::\n")
        # Chains are separated by '/' and the last one is closed with '*'
        for chain in ordered_chains:
            terminator = '*\n' if chain == last_chain else '/\n'
            alignment_string_dict[entry] += alignment_dict[chain][entry] + terminator

    # Write alignment file
    with open(final_alignment, 'w') as f:
        for entry, entrystring in alignment_string_dict.items():
            pctools.printv(entrystring, verbosity)
            f.write(entrystring)
    print('Modeller Alignment written to ' + clrs['g']
          + os.path.basename(final_alignment) + clrs['n'] + '\n')
    return final_alignment, full_residue_mapping
def make_oligomer(input_file,
                  largest_oligo_complexes,
                  report,
                  args,
                  residue_index_mapping=None):
    '''
    Drives section 2 of the pipeline: picks a template, builds the alignment,
    checks its score and runs the model building.

    In structure mode (args.sequence_mode is False) every candidate in
    largest_oligo_complexes is scored with analyse_largest_complexes and the
    one with the highest average Q-score is used, provided it reaches
    args.qscore_cutoff. In sequence mode the candidates are tried in order
    until make_local_template succeeds for one of them.

    input_file: path of the input (.pdb, or *_CHOIR_MonomerSequence.fasta in
        sequence mode)
    largest_oligo_complexes: dict candidate hit -> template chains
    report: dict that is updated in place and also returned
    args: options object (verbosity, sequence_mode, multiprocess,
        qscore_cutoff, generate_report, similarity_cutoff, bad_streches, ...)
    residue_index_mapping: forwarded to generate_ali

    Returns (best_oligo_template, built_oligomers, report). On failure the
    first two items are None and report['exit'] is set to '4' (no template
    above the Q-score cut-off), '5' (no usable template in sequence mode) or
    '6' (alignment score below threshold).

    Side effects: sets the module globals workdir, input_name, verbosity,
    g_input_file, g_args, best_oligo_template_code and renamed_chains_file.
    '''
    global workdir
    global input_name
    global verbosity
    global g_input_file
    global g_args
    global best_oligo_template_code
    global renamed_chains_file
    g_input_file = input_file
    g_args = args
    verbosity = args.verbosity
    workdir = os.getcwd()

    # Subsection 2[a] #######################################################################
    if args.sequence_mode is False:
        input_name = os.path.basename(input_file).split(".pdb")[0].replace(
            '.', '_')
        candidate_qscores = {}
        # Select structurally best oligomeric template using GESAMT
        pctools.print_section(2, 'OLIGOMER ASSEMBLING')
        pctools.print_subsection('2[a]', 'Structural template selection')
        if args.multiprocess is True:
            p = Pool()
            try:
                for hitchain, average_qscore, output in p.map_async(
                        analyse_largest_complexes,
                        largest_oligo_complexes.items()).get():
                    candidate_qscores[hitchain] = average_qscore
                    report['hits'][hitchain]['qscore'] = round(
                        average_qscore, 3)
                    print(output)
            finally:
                # Release the workers even if a task or the loop body failed
                p.close()
                p.join()
        else:
            for item in largest_oligo_complexes.items():
                hitchain, average_qscore, output = analyse_largest_complexes(
                    item)
                candidate_qscores[hitchain] = average_qscore
                report['hits'][hitchain]['qscore'] = round(average_qscore, 3)
                print(output)
        best_oligo_template = max(candidate_qscores.keys(),
                                  key=(lambda x: candidate_qscores[x]))
        if candidate_qscores[best_oligo_template] >= args.qscore_cutoff:
            print('Structurally, the best template is: ' + clrs['y']
                  + best_oligo_template + clrs['n'] + '. Using that!\n')
            best_hit = report['hits'][best_oligo_template]
            report['best_template'] = best_oligo_template.split(':')[0]
            report['best_id'] = best_hit['id']
            report['best_cov'] = best_hit['coverage']
            report['best_qscore'] = best_hit['qscore']
            report['best_nchains'] = best_hit['final_homo_chains']
        else:
            print('No template had an average Q-score above cut-off of '
                  + clrs['c'] + str(args.qscore_cutoff) + clrs['n']
                  + '\nTry lowering the cutoff or running in sequence mode.\n')
            report['exit'] = '4'
            return None, None, report
        report['topology_figure'] = './' + best_oligo_template.replace(
            ':', '_') + '_CHOIR_Topology.png'
        template_chains = largest_oligo_complexes[best_oligo_template]
        best_oligo_template_code = best_oligo_template.split(':')[0]
        clean_template_file = make_local_template(best_oligo_template_code)

    elif args.sequence_mode is True:
        if input_file.endswith('.pdb'):
            input_name = os.path.basename(input_file).split(
                ".pdb")[0].replace('.', '_')
            input_file = os.path.join(
                workdir, input_name + '_CHOIR_MonomerSequence.fasta')
            g_input_file = input_file
        elif input_file.endswith('_CHOIR_MonomerSequence.fasta'):
            input_name = os.path.basename(input_file).split(
                "_CHOIR_MonomerSequence.fasta")[0]
        pctools.print_section(2, 'OLIGOMER ASSEMBLING - SEQUENCE MODE')
        print(clrs['y']
              + "Skipping section 2[a] - Structural template selection"
              + clrs['n'] + "\n")
        candidates = list(largest_oligo_complexes)
        attempt = 0
        while attempt < len(candidates):
            try:
                best_oligo_template = candidates[attempt]
                best_hit = report['hits'][best_oligo_template]
                report['best_template'] = best_oligo_template.split(':')[0]
                report['best_id'] = best_hit['id']
                report['best_cov'] = best_hit['coverage']
                report['best_qscore'] = 'NA'
                report['best_nchains'] = best_hit['final_homo_chains']
                report['topology_figure'] = (
                    './' + best_oligo_template.replace(':', '_')
                    + '_CHOIR_Topology.png')
                template_chains = largest_oligo_complexes[best_oligo_template]
                best_oligo_template_code = best_oligo_template.split(':')[0]
                clean_template_file = make_local_template(
                    best_oligo_template_code)
                break
            except Exception:
                # Any failure for this candidate means "try the next one".
                # Unlike a bare except, this lets KeyboardInterrupt and
                # SystemExit propagate.
                attempt += 1
                if attempt < len(candidates):
                    print('Attempt ' + str(attempt)
                          + ' failed, trying a differente template candidate.')
        if attempt == len(candidates):
            print('Failed to find templates in local databases.')
            report['exit'] = '5'
            return None, None, report

    relevant_chains_file = extract_relevant_chains(clean_template_file,
                                                   template_chains)
    if args.generate_report is True:
        report['template_figure'], pymol_output = pctools.pymol_screenshot(
            relevant_chains_file, args)
        print(pymol_output)
    renamed_chains_file, chains_dict = rename_relevant_chains(
        relevant_chains_file)
    relevant_chains = [
        chains_dict[template_chain] for template_chain in template_chains
    ]

    # Subsection 2[b] #######################################################################
    pctools.print_subsection('2[b]', 'Generating alignment')
    # Generate per chain alignment files
    alignment_files = []
    if args.sequence_mode is False:
        if args.multiprocess is True:
            p = Pool()
            try:
                for qscore, rmsd, fasta_out, gesamt_output in p.map_async(
                        run_gesamt_parallel, chains_dict.values()).get():
                    alignment_files.append(fasta_out)
                    print(gesamt_output)
            finally:
                p.close()
                p.join()
        else:
            for chain in chains_dict.values():
                qscore, rmsd, fasta_out, gesamt_output = run_gesamt_parallel(
                    chain)
                alignment_files.append(fasta_out)
                print(gesamt_output)
    elif args.sequence_mode is True:
        if args.multiprocess is True:
            p = Pool()
            try:
                for fasta_out, output in p.map_async(
                        alignment_from_sequence, chains_dict.values()).get():
                    alignment_files.append(fasta_out)
                    print(output)
            finally:
                # This pool was previously never closed nor joined
                p.close()
                p.join()
        else:
            for current_chain in chains_dict.values():
                fasta_out, output = alignment_from_sequence(current_chain)
                alignment_files.append(fasta_out)
                print(output)
    print('Alignment files:\n' + clrs['g']
          + ('\n').join([os.path.basename(i) for i in alignment_files])
          + clrs['n'])

    # Generate final alignment which will be the input for Modeller
    final_alignment, full_residue_mapping = generate_ali(
        alignment_files, best_oligo_template_code, residue_index_mapping, args)

    # Score said alignment and enforce threshold
    (report['relative_alignment_score'], relative_wscores,
     nchains) = score_alignment(final_alignment)
    print('\nFinal average relative score for alignment: '
          + str(round(report['relative_alignment_score'], 2)) + '%')
    bad_streches = sum(1 for wscore in relative_wscores
                       if wscore < args.similarity_cutoff)
    if bad_streches >= args.bad_streches * nchains:
        if args.sequence_mode is True:
            print(
                '\nThe alignment score was unacceptable for ' + clrs['r']
                + str(bad_streches) + clrs['n']
                + ' 30-res segments of the protein complex.\nTry running the default (structure) mode.\n'
            )
        else:
            print(
                '\nThe alignment score was unacceptable for ' + clrs['r']
                + str(bad_streches) + clrs['n']
                + ' 30-res segments of the protein complex.\nTry increasing the number of candidate templates or tweaking the similarity cut-offs.\n'
            )
        report['exit'] = '6'
        return None, None, report

    # Subsection 2[c] #######################################################################
    pctools.print_subsection('2[c]', 'Generating models')
    genmodel_file, expected_models = create_genmodel(final_alignment,
                                                     best_oligo_template_code,
                                                     relevant_chains, args)
    run_modeller(genmodel_file)

    # Record list of oligomers built
    built_oligomers = [
        restore_chain_identifiers(model, chains_dict, full_residue_mapping)
        for model in expected_models
    ]
    nmodels = len(built_oligomers)
    print(clrs['b'] + 'ProtCHOIR' + clrs['n'] + ' built ' + clrs['c']
          + str(nmodels) + clrs['n'] + ' model oligomers:')
    for model in built_oligomers:
        print(clrs['g'] + model + clrs['n'])
    return best_oligo_template, built_oligomers, report
def collect_fasta(verbosity):
    '''
    Fetches the fasta files under pdb_homo_archive and collects their unique
    chains into one fasta file per category inside a "sequences" folder.
    A chain is kept when its identity to every chain already kept from the
    same file is at most 99%. The collected files are then used to build
    the blast databases (largedb, monodb, heterodb, homodb) with makeblastdb.

    The category of a fasta file is decided by the first of these markers
    found in its path: /largepdb_sequences/, /mono_sequences/,
    /hetero_sequences/, /homo_sequences/. Files matching none are assessed
    but not written anywhere.

    verbosity: forwarded to pctools.printv
    Called by: update_databases()
    '''
    fastafiles = [
        os.path.join(dp, f)
        for dp, dn, filenames in os.walk(pdb_homo_archive)
        for f in filenames if f.endswith(".fasta")
    ]
    seqdir = os.path.join(pdb_homo_archive, 'sequences')
    if not os.path.isdir(seqdir):
        os.mkdir(seqdir)

    # (path marker, collected fasta file, blast database name); the order is
    # the order in which markers are tested and databases are built.
    targets = [
        ('/largepdb_sequences/',
         os.path.join(seqdir, 'largepdb_collected.fastas'), 'largedb'),
        ('/mono_sequences/',
         os.path.join(seqdir, 'mono_collected.fastas'), 'monodb'),
        ('/hetero_sequences/',
         os.path.join(seqdir, 'hetero_collected.fastas'), 'heterodb'),
        ('/homo_sequences/',
         os.path.join(seqdir, 'homo_collected.fastas'), 'homodb'),
    ]

    # Start from empty collected files; entries are appended below
    for _, collected_fasta, _ in targets:
        with open(collected_fasta, 'w+'):
            pass

    def percent_identity(seq_a, seq_b):
        # Identity (%) from a semi-global alignment; 0 for empty alignments
        alignment = parasail.sg_stats_striped_16(seq_a, seq_b, 10, 1,
                                                 parasail.blosum62)
        if alignment.length == 0:
            return 0
        return (alignment.matches) / alignment.length * 100

    for fasta in pg(fastafiles, widgets=widgets):
        pctools.printv('Assessing ' + clrs['y'] + fasta + clrs['n'] + '...',
                       verbosity)
        # Read through a context manager so the handle is always closed
        with open(fasta, 'r') as fasta_handle:
            contents = fasta_handle.read()
        nchains = str(contents.count('>'))
        pctools.printv(
            'With ' + clrs['y'] + nchains + clrs['n']
            + ' chains to be assessed\n', verbosity)
        uniques = []
        for entry in contents.split('>'):
            if not entry:
                continue
            splitentry = entry.split('\n', 1)
            pdbch = splitentry[0]
            seq = splitentry[1].replace('\n', '')
            # all() on no uniques is True, so the first chain is always kept;
            # the generator stops at the first near-identical chain.
            if all(percent_identity(seq, kept_seq) <= 99
                   for _, kept_seq in uniques):
                uniques.append((pdbch, seq))

        for marker, collected_fasta, _ in targets:
            if marker in fasta:
                with open(collected_fasta, 'a') as f:
                    for pdbch, seq in uniques:
                        wrapped_seq = "\n".join(tw.wrap(seq))
                        f.write('>' + pdbch + '\n' + wrapped_seq + '\n\n')
                break

    for _, collected_fasta, dbname in targets:
        subprocess.run([
            makeblastdb_exe, '-in', collected_fasta, '-dbtype', 'prot',
            '-out',
            os.path.join(seqdir, dbname)
        ])
def score_alignment(alignment_file):
    '''
    Scores the query/template alignment stored in a PIR file.

    The first record is taken as the query and the second as the template;
    both are split into chains on '/'. For each chain pair, the columns
    covered by the leading and trailing gaps of the query are dropped from
    both sequences. The pair is then scored with score_pairwise (blosum62)
    and expressed as a percentage of the template self-score.

    Returns (relative_score, relative_wscores, number_of_query_chains):
    the mean of the per-chain relative scores, the flat list of relative
    window scores of all chains (100 where the window self-score is 0) and
    the chain count. Reads the module global g_args.
    '''
    print(clrs['b'] + 'SCORING ALIGNMENT' + clrs['n'] + ' in ' + clrs['y']
          + os.path.basename(alignment_file) + clrs['n'] + '\n')
    records = list(SeqIO.parse(alignment_file, "pir"))
    query_chains = str(records[0].seq).split('/')
    template_chains = str(records[1].seq).split('/')

    relative_scores = []
    relative_wscores = []
    for query_chain, template_chain in zip(query_chains, template_chains):
        # Window of the alignment not covered by the query's edge gaps
        start = len(query_chain) - len(query_chain.lstrip('-'))
        n_trailing = len(query_chain) - len(query_chain.rstrip('-'))
        stop = -n_trailing if n_trailing else None
        q_chain = query_chain[start:stop]
        t_chain = template_chain[start:stop]

        pctools.printv(
            '\nCalculating ' + clrs['y'] + 'maximum scores' + clrs['n']
            + ' for chain segments:', g_args.verbosity)
        max_score, max_wscores = score_pairwise(t_chain, t_chain,
                                                MatrixInfo.blosum62, 0, 0)
        pctools.printv(
            '\nCalculating ' + clrs['y'] + 'actual scores' + clrs['n']
            + ' for chain segments:', g_args.verbosity)
        score, wscores = score_pairwise(q_chain, t_chain,
                                        MatrixInfo.blosum62, 0, 0)
        relative_scores.append(round(score * 100 / max_score, 2))
        relative_wscores.extend(
            round(wscore * 100 / max_wscore, 2) if max_wscore != 0 else 100
            for max_wscore, wscore in zip(max_wscores, wscores))

    relative_score = sum(relative_scores) / len(relative_scores)

    # Green when strictly above the similarity cut-off, red otherwise
    colored_segments = [
        clrs['g' if segment_score > g_args.similarity_cutoff else 'r']
        + str(segment_score) + clrs['n']
        for segment_score in relative_wscores
    ]
    print('\nRelative score per 30-res segment: '
          + ' ~ '.join(colored_segments) + clrs['n'])
    return relative_score, relative_wscores, len(query_chains)
def _percent_change(new_value, reference):
    '''Return the integer-truncated percent change of new_value vs reference.'''
    return int(100 * (float(new_value) - float(reference)) / float(reference))


def _surface_term(area_reduction, exposed_area_reduction):
    '''Return the 0-10 surface term for one area type (0 if area grew).'''
    if area_reduction < 0:
        term = 10 * (area_reduction / exposed_area_reduction) / 3
    else:
        term = 0
    if term > 10:
        term = 10
    return term


def analyse_model(oligomer):
    '''
    Assesses one built oligomer model and returns its report.

    Depending on the letters present in g_args.assessment:
      'I' (skipped when g_args.allow_monomers is set): surface score from
          the change in exposed/hydrophobic/conserved areas, and interfaces
          score from comparing the model's interfaces with the template's.
      'G': GESAMT comparison against the template (Q-score and RMSD).
      'M': Molprobity comparison and clashscore-based quality score.
    Results that are not computed are stored as 'NA'.

    oligomer: path of the model file to analyse

    Returns (model_report, text), where model_report is a copy of g_report
    extended with the results and text is the collected log joined by
    newlines.

    Side effects: writes <model_oligomer_name>_CHOIR_model_report.pickle in
    the current directory. Reads the module globals g_report, g_args,
    g_entropies, g_z_entropies, g_tmdata, g_minx, g_maxx, g_interfaces_dict,
    g_template_hitchain, template, template_file and template_molprobity.
    '''
    output = []
    model_report = g_report.copy()
    model_report['model_filename'] = oligomer
    model_oligomer_name = os.path.basename(oligomer).split(
        "_CHOIR_")[0].replace('.', '_')
    output.append(pctools.subsection('3', model_oligomer_name))
    output.append('Analysing oligomer file: ' + clrs['y'] + oligomer
                  + clrs['n'] + '\n')
    model_report['model_oligomer_name'] = model_oligomer_name
    if g_args.generate_report is True:
        model_report['model_figures'], pymol_output = pctools.pymol_screenshot(
            oligomer, g_args, putty=True)
        output.append(pymol_output)
    pdb_name, structure, nchains = pctools.parse_any_structure(oligomer)
    nchains, seqs, chain_ids = pctools.extract_seqs(structure, 0)
    relevant_chains = [seq[0] for seq in seqs]
    pisa_output, pisa_error, protomer_data = pctools.run_pisa(
        oligomer,
        '',
        g_args.verbosity,
        gen_monomer_data=True,
        gen_oligomer_data=True)
    protomer_surface_residues = pctools.get_areas(protomer_data)
    (model_report['assemblied_protomer_plot'],
     model_report['assemblied_protomer_exposed_area'],
     model_report['assemblied_protomer_hydrophobic_area'],
     model_report['assemblied_protomer_conserved_area'], minx, maxx,
     analysis_output) = pctools.plot_analysis(pdb_name,
                                              protomer_surface_residues,
                                              g_entropies,
                                              g_z_entropies,
                                              g_tmdata,
                                              g_args,
                                              minx=g_minx,
                                              maxx=g_maxx)
    output.append(analysis_output)

    if 'I' in g_args.assessment and not g_args.allow_monomers:
        output.append(
            pctools.subsection('3' + '[I]',
                               'Interfaces Comparison: ' + model_oligomer_name))
        # --- Surface score -------------------------------------------------
        if g_args.sequence_mode is False and g_args.skip_conservation is False:
            model_report['exposed_area_reduction'] = _percent_change(
                model_report['assemblied_protomer_exposed_area'],
                model_report['protomer_exposed_area'])
            model_report['hydrophobic_area_reduction'] = _percent_change(
                model_report['assemblied_protomer_hydrophobic_area'],
                model_report['protomer_hydrophobic_area'])
            model_report['conserved_area_reduction'] = _percent_change(
                model_report['assemblied_protomer_conserved_area'],
                model_report['protomer_conserved_area'])
            if model_report['exposed_area_reduction'] < -5:
                hydophobic_surface_score = _surface_term(
                    model_report['hydrophobic_area_reduction'],
                    model_report['exposed_area_reduction'])
                output.append('Hydrophobic surface score: '
                              + str(hydophobic_surface_score))
                conserved_surface_score = _surface_term(
                    model_report['conserved_area_reduction'],
                    model_report['exposed_area_reduction'])
                output.append('Conserved surface score: '
                              + str(conserved_surface_score))
                model_report['surface_score'] = round(
                    (hydophobic_surface_score + conserved_surface_score) / 2,
                    2)
            else:
                output.append(clrs['r'] + 'Exposed area reduction too small.'
                              + clrs['n'])
                model_report['surface_score'] = 0
            output.append('Final surface score: '
                          + str(model_report['surface_score']))
        else:
            model_report['surface_score'] = 'NA'

        # --- Interfaces score ----------------------------------------------
        model_oligomer = oligomer.split('_CHOIR_CorrectedChains')[0]
        xml_out = model_oligomer + '_CHOIR_PisaInterfaces.xml'
        model_interfaces_list, interfaces_output = pctools.parse_interfaces(
            xml_out, relevant_chains, g_args.verbosity)
        template_interfaces_list = g_interfaces_dict[g_template_hitchain]
        if model_interfaces_list and template_interfaces_list:
            if g_args.verbosity > 0:
                output.append(clrs['y'] + 'MODEL INTERFACES' + clrs['n'])
                for model_interface in model_interfaces_list:
                    output.append(clrs['y']
                                  + ' <> '.join(model_interface['chains'])
                                  + clrs['n'])
                    output.append(clrs['y'] + 'Interface Area: ' + clrs['n']
                                  + str(model_interface['interface area'])
                                  + ' A^2')
                    output.append(
                        clrs['y'] + 'Interface Solvation Energy: ' + clrs['n']
                        + str(model_interface['interface solvation energy'])
                        + ' kcal/mol')
                    output.append(clrs['y'] + 'Hydrogen Bonds: ' + clrs['n']
                                  + str(model_interface['hydrogen bonds']))
                    output.append(clrs['y'] + 'Salt Bridges: ' + clrs['n']
                                  + str(model_interface['salt bridges']))
                    output.append(clrs['y'] + 'Disulphide Bridges: '
                                  + clrs['n']
                                  + str(model_interface['disulphide bridges'])
                                  + "\n\n")

            interfaces_comparison = {}
            for template_interface in template_interfaces_list:
                for model_interface in model_interfaces_list:
                    # Only interfaces between the same pair of chains compare
                    if set(model_interface['chains']) != set(
                            template_interface['chains']):
                        continue
                    comparison_data = {}
                    # Sum of the weights below (1 + 2 + 2 + 3 + 4); a weight
                    # is removed when neither structure has that bond type.
                    denominator = 12

                    model_area = model_interface['interface area']
                    template_area = template_interface['interface area']
                    delta_area = round(model_area - template_area, 2)
                    comparison_data['model area'] = model_area
                    comparison_data['template area'] = template_area
                    comparison_data['delta area'] = delta_area

                    model_energy = model_interface[
                        'interface solvation energy']
                    template_energy = template_interface[
                        'interface solvation energy']
                    delta_energy = round(model_energy - template_energy, 2)
                    comparison_data['model energy'] = model_energy
                    comparison_data['template energy'] = template_energy
                    comparison_data['delta energy'] = delta_energy

                    model_hb = model_interface['hydrogen bonds']
                    template_hb = template_interface['hydrogen bonds']
                    delta_hb = round(model_hb - template_hb, 2)
                    comparison_data['model hb'] = model_hb
                    comparison_data['template hb'] = template_hb
                    comparison_data['delta hb'] = delta_hb

                    model_sb = model_interface['salt bridges']
                    template_sb = template_interface['salt bridges']
                    delta_sb = round(model_sb - template_sb, 2)
                    comparison_data['model sb'] = model_sb
                    comparison_data['template sb'] = template_sb
                    comparison_data['delta sb'] = delta_sb

                    model_ss = model_interface['disulphide bridges']
                    template_ss = template_interface['disulphide bridges']
                    delta_ss = round(model_ss - template_ss, 2)
                    comparison_data['model ss'] = model_ss
                    comparison_data['template ss'] = template_ss
                    comparison_data['delta ss'] = delta_ss

                    output.append(clrs['y'] + 'INTERFACES COMPARISON'
                                  + clrs['n'])
                    output.append(' <> '.join(model_interface['chains']))

                    # Interface area: larger or equal is as good as template
                    if delta_area >= 0:
                        emphasis_color = clrs['g']
                        relative_area = 100
                    else:
                        emphasis_color = clrs['r']
                        relative_area = round(
                            model_area * 100 / template_area, 2)
                    output.append('Delta Interface Area: ' + emphasis_color
                                  + str(delta_area) + clrs['n'] + ' A^2 ('
                                  + str(relative_area) + '%)')

                    # Solvation energy: lower or equal is as good as template
                    if delta_energy <= 0:
                        emphasis_color = clrs['g']
                        relative_energy = 100
                    else:
                        emphasis_color = clrs['r']
                        if model_energy < 0 and template_energy < 0:
                            relative_energy = round(
                                model_energy * 100 / template_energy, 2)
                        else:
                            # Model energy is higher than the template's and
                            # at least one of them is not negative. This also
                            # covers energies of exactly 0, which previously
                            # left relative_energy unset or stale.
                            relative_energy = 0
                    output.append('Delta Interface Solvation Energy: '
                                  + emphasis_color + str(delta_energy)
                                  + clrs['n'] + ' kcal/mol ('
                                  + str(relative_energy) + '%)')

                    # Hydrogen bonds (weight 2)
                    if model_hb == template_hb == 0:
                        relative_hb = 0
                        emphasis_color = clrs['r']
                        denominator -= 2
                    elif delta_hb >= 0:
                        relative_hb = 100
                        emphasis_color = clrs['g']
                    else:
                        emphasis_color = clrs['r']
                        relative_hb = round(model_hb * 100 / template_hb, 2)
                    output.append('Delta Hydrogen Bonds: ' + emphasis_color
                                  + str(delta_hb) + clrs['n'] + ' ('
                                  + str(relative_hb) + '%)')

                    # Salt bridges (weight 3)
                    if model_sb == template_sb == 0:
                        relative_sb = 0
                        emphasis_color = clrs['r']
                        denominator -= 3
                    elif delta_sb >= 0:
                        relative_sb = 100
                        emphasis_color = clrs['g']
                    else:
                        relative_sb = round(model_sb * 100 / template_sb, 2)
                        emphasis_color = clrs['r']
                    output.append('Delta Salt Bridges: ' + emphasis_color
                                  + str(delta_sb) + clrs['n'] + ' ('
                                  + str(relative_sb) + '%)')

                    # Disulphide bridges (weight 4)
                    if model_ss == template_ss == 0:
                        relative_ss = 0
                        emphasis_color = clrs['r']
                        denominator -= 4
                    elif delta_ss >= 0:
                        relative_ss = 100
                        emphasis_color = clrs['g']
                    else:
                        relative_ss = round(model_ss * 100 / template_ss, 2)
                        emphasis_color = clrs['r']
                    output.append('Delta Disulphide Bridges: '
                                  + emphasis_color + str(delta_ss) + clrs['n']
                                  + ' (' + str(relative_ss) + '%)\n')

                    if denominator == 0:
                        comparison_data['score'] = 0
                    else:
                        comparison_data['score'] = round(
                            (relative_area + 2 * relative_energy
                             + 2 * relative_hb + 3 * relative_sb
                             + 4 * relative_ss) / denominator, 2)
                    output.append('Interface score: '
                                  + str(comparison_data['score']))
                    interfaces_comparison[''.join(
                        sorted(model_interface['chains']))] = comparison_data

            comparison_plots, interfaces_output = plot_deltas(
                model_oligomer_name, template, interfaces_comparison, g_args)
            model_report['comparison_plots'] = os.path.basename(
                comparison_plots)
            output.append(interfaces_output)
            summed_score = sum(data['score']
                               for data in interfaces_comparison.values())
            if interfaces_comparison:
                model_report['interfaces_score'] = round(
                    summed_score / (10 * len(interfaces_comparison)), 2)
            else:
                # No model interface shares its chains with a template one;
                # avoid dividing by zero.
                model_report['interfaces_score'] = 0
            output.append('Final interfaces score: '
                          + str(model_report['interfaces_score']))
        else:
            if 'surface_score' not in model_report:
                model_report['surface_score'] = 0
            model_report['interfaces_score'] = 0
    else:
        model_report['surface_score'] = 'NA'
        model_report['interfaces_score'] = 'NA'
        model_report['comparison_plots'] = 'NA'
        model_report['assemblied_protomer_exposed_area'] = 'NA'
        model_report['assemblied_protomer_hydrophobic_area'] = 'NA'
        model_report['assemblied_protomer_conserved_area'] = 'NA'

    if 'G' in g_args.assessment:
        output.append(pctools.subsection('3' + '[G]', 'GESAMT Comparison'))
        qscore, rmsd, fasta_out, gesamt_output = pctools.run_gesamt(
            template, template_file, model_oligomer_name, oligomer, None,
            g_args)
        output.append(gesamt_output)
        model_report['gesamt_qscore'] = str(qscore)
        model_report['gesamt_rmsd'] = str(rmsd)
    else:
        model_report['gesamt_qscore'] = 'NA'
        model_report['gesamt_rmsd'] = 'NA'

    if 'M' in g_args.assessment:
        output.append(pctools.subsection('3' + '[M]', 'Molprobity Comparison'))
        model_molprobity, molprobity_output = pctools.run_molprobity(
            oligomer, g_args)
        output.append(molprobity_output)
        model_report['model_clashscore'] = str(model_molprobity['clashscore'])
        model_report['model_molprobity'] = str(
            model_molprobity['molprobity_score'])
        output.append(clrs['y'] + 'MOLPROBITY COMPARISON' + clrs['n'])
        output.append('Criterion\tTempl.\tModel')
        for label, criterion in (('Rama. Fav.\t', 'rama_fav'),
                                 ('Rama. Out.\t', 'rama_out'),
                                 ('Rot. Out.\t', 'rot_out'),
                                 ('CBeta Dev.\t', 'cb_dev'),
                                 ('Clashscore\t', 'clashscore'),
                                 ('Molprob. Score\t', 'molprobity_score')):
            output.append(label + str(template_molprobity[criterion]) + '\t'
                          + str(model_molprobity[criterion]))
        molprobity_radar, radar_output = plot_molprobity(
            model_oligomer_name, model_molprobity, template,
            template_molprobity)
        output.append(radar_output)
        model_report['molprobity_radar'] = molprobity_radar
        delta_clashscore = (model_molprobity['clashscore']
                            - template_molprobity['clashscore']) / 10
        output.append('Delta clashscore: ' + str(delta_clashscore))
        if delta_clashscore >= 1:
            model_report['quality_score'] = round(
                10 - math.log(delta_clashscore**5, 10), 2)
        else:
            model_report['quality_score'] = 10
        output.append('Final quality score: '
                      + str(model_report['quality_score']))
    else:
        model_report['model_clashscore'] = 'NA'
        model_report['model_molprobity'] = 'NA'
        model_report['quality_score'] = 'NA'

    # --- Overall ProtCHOIR score: mean of the scores that were computed ----
    uses_surface = (g_args.sequence_mode is False
                    and g_args.skip_conservation is False)
    interfaces_assessed = ('I' in g_args.assessment
                           and not g_args.allow_monomers)
    if 'M' in g_args.assessment and interfaces_assessed:
        if uses_surface:
            model_report['protchoir_score'] = round(
                sum([
                    model_report['interfaces_score'],
                    model_report['surface_score'],
                    model_report['quality_score']
                ]) / 3, 2)
        else:
            model_report['protchoir_score'] = round(
                sum([
                    model_report['interfaces_score'],
                    model_report['quality_score']
                ]) / 2, 2)
    elif 'M' in g_args.assessment:
        model_report['protchoir_score'] = model_report['quality_score']
    elif interfaces_assessed:
        # With allow_monomers the interface scores are 'NA' strings; summing
        # them used to raise TypeError, so that case now falls to 'NA' below.
        if uses_surface:
            model_report['protchoir_score'] = round(
                sum([
                    model_report['interfaces_score'],
                    model_report['surface_score']
                ]) / 2, 2)
        else:
            model_report['protchoir_score'] = model_report['interfaces_score']
    else:
        model_report['protchoir_score'] = 'NA'

    if str(model_report['protchoir_score']) == 'NA':
        model_report['score_color'] = 'grey'
    elif model_report['protchoir_score'] <= 5:
        model_report['score_color'] = 'red'
    elif 5 < model_report['protchoir_score'] <= 7:
        model_report['score_color'] = 'orange'
    elif model_report['protchoir_score'] > 7:
        model_report['score_color'] = 'green'

    # Persist the report; the context manager closes the file handle
    with open(model_oligomer_name + '_CHOIR_model_report.pickle',
              'wb') as report_handle:
        pickle.dump(model_report, report_handle)
    return model_report, '\n'.join(output)