def list_new_files(pdb1_archive, assession_log, verbosity):
    '''
    Taps into the local pdb1 repository and lists the files that should be
    assessed by the curate_homoDB function, i.e. files in the pdb1 database
    that are newer than the ones last assessed (as registered in the dat
    file). It therefore takes the previous assession log as input, in the
    form of a dictionary whose keys are filenames and whose values are the
    corresponding last-assessment times.
    Called by: curate_homoDB()
    '''
    new_files = []
    pctools.printv('Assessing files in PDB1 archive...', verbosity)
    assert os.path.isdir(pdb1_archive), clrs['r'] + '\n\n Not able to find PDB archive.\n\n Does "' + pdb1_archive + '" exist?' + clrs['n']
    pdbfiles = [os.path.join(dp, f) for dp, dn, filenames in os.walk(pdb1_archive)
                for f in filenames if f.endswith(".pdb1.gz")]
    for f in pdbfiles:
        filename = f.split('/')[-1]
        mod_date = os.path.getctime(f)
        if filename not in assession_log or mod_date > float(assession_log[filename]):
            pctools.printv(clrs['y'] + f + ' should be assessed' + clrs['n'] + '...\n', verbosity)
            new_files.append(f)
    return new_files
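# Illustrative sketch only (not part of the pipeline): a minimal, hypothetical
# call to list_new_files. The assession log maps bare filenames to the epoch
# time (stored as a string) of their last assessment; the archive path below
# is made up for the example.
def _example_list_new_files():
    assession_log = {'1abc.pdb1.gz': '1614711000.0'}
    return list_new_files('/data/pdb1', assession_log, verbosity=1)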
def update_seqres(verbosity):
    '''
    Runs wget to update the local seqres database, decompresses it and runs
    makeblastdb.
    Called by: update_databases()
    '''
    seqres_dir = os.path.join(choirdb, 'seqres')
    if not os.path.isdir(seqres_dir):
        os.mkdir(seqres_dir)
    seqres_txt = os.path.join(seqres_dir, 'pdb_seqres.txt')
    seqres_fasta = os.path.join(seqres_dir, 'seqres.fasta')
    pctools.printv('Fetching pdb_seqres.txt...', verbosity)
    attempt = 0
    while attempt < 3:
        try:
            wgetout = subprocess.check_output([
                'wget', '-m', '-r', '-nH', '--cut-dirs=3', '--user=anonymous',
                seqres_ftp, '-P', seqres_dir], stderr=subprocess.STDOUT)
            break
        except subprocess.CalledProcessError:
            attempt += 1
            if attempt < 3:
                print('Attempt ' + str(attempt) + ' failed, trying again.')
    if attempt == 3:
        # Bail out here, otherwise wgetout would be undefined below.
        print('Failed to download seqres in 3 attempts. Try again later.')
        return
    # wget quotes the filename with typographic quotes in its output.
    no_wget = 'seqres.txt.gz’ -- not retrieving'
    if no_wget not in wgetout.decode('UTF-8') or not os.path.isfile(seqres_fasta):
        pctools.printv('Decompressing pdb_seqres.txt...', verbosity)
        with gzip.open(seqres_txt + '.gz', 'rb') as fin, open(seqres_fasta, 'wb') as fout:
            shutil.copyfileobj(fin, fout)
    if no_wget not in wgetout.decode('UTF-8') or not os.path.isfile(seqres_fasta + '.pal'):
        subprocess.run([makeblastdb_exe, '-in', seqres_fasta, '-parse_seqids',
                        '-dbtype', 'prot', '-blastdb_version', '5', '-out', seqres])
def update_uniref(verbosity):
    '''
    Runs wget to update the local uniref50 database, decompresses it and runs
    makeblastdb.
    Called by: update_databases()
    '''
    uniref50_fasta = os.path.join(choirdb, 'uniref50/uniref50.fasta')
    pctools.printv('Fetching uniref50.fasta...', verbosity)
    attempt = 0
    while attempt < 3:
        try:
            wgetout = subprocess.check_output([
                'wget', '-m', '-r', '-nH', '--cut-dirs=4', '--user=anonymous',
                uniref50_ftp, '-P', choirdb], stderr=subprocess.STDOUT)
            break
        except subprocess.CalledProcessError:
            attempt += 1
            if attempt < 3:
                print('Attempt ' + str(attempt) + ' failed, trying again.')
    if attempt == 3:
        # Bail out here, otherwise wgetout would be undefined below.
        print('Failed to download UniRef50 in 3 attempts. Try again later.')
        return
    no_wget = 'uniref50.fasta.gz’ -- not retrieving'
    if no_wget not in wgetout.decode('UTF-8') or not os.path.isfile(uniref50_fasta):
        pctools.printv('Decompressing uniref50.fasta...', verbosity)
        with gzip.open(uniref50_fasta + '.gz', 'rb') as fin, open(uniref50_fasta, 'wb') as fout:
            shutil.copyfileobj(fin, fout)
    if no_wget not in wgetout.decode('UTF-8') or not os.path.isfile(uniref50_fasta + '.pal'):
        subprocess.run([makeblastdb_exe, '-in', uniref50_fasta, '-parse_seqids',
                        '-dbtype', 'prot', '-out', uniref50])
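# update_seqres and update_uniref share the same three-attempt wget loop. A
# shared helper along these lines (a sketch for illustration only; the
# functions above keep their loops inline) could factor that pattern out:
def _fetch_with_retries(url, destdir, cut_dirs, attempts=3):
    for attempt in range(1, attempts + 1):
        try:
            # Mirror the remote file into destdir, flattening the directory tree
            return subprocess.check_output(
                ['wget', '-m', '-r', '-nH', '--cut-dirs=' + str(cut_dirs),
                 '--user=anonymous', url, '-P', destdir],
                stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError:
            if attempt < attempts:
                print('Attempt ' + str(attempt) + ' failed, trying again.')
    return None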
def score_pairwise(seq1, seq2, matrix, gap_s, gap_e):
    '''
    Scores an aligned sequence pair with the given substitution matrix and
    affine gap penalties, returning both the global score and the score of
    each 30-residue segment.
    '''
    score = 0
    gap = False
    ipos = 0
    fpos = 30
    nwindows = -(-len(seq1) // 30)  # ceiling division
    pctools.printv('Number of 30-residue segments: ' + str(nwindows), g_args.verbosity)
    wscores = []
    for window in range(nwindows):
        wscore = 0
        if fpos > len(seq1):
            fpos = len(seq1)
        pctools.printv(str(ipos + 1) + ' ' + seq1[ipos:fpos] + ' ' + str(fpos), g_args.verbosity)
        pctools.printv(str(ipos + 1) + ' ' + seq2[ipos:fpos] + ' ' + str(fpos), g_args.verbosity)
        for i in range(ipos, fpos):
            pair = (seq1[i], seq2[i])
            if not gap:
                if pair == ('-', '-'):
                    score += 4
                    wscore += 4
                elif '-' in pair:
                    gap = True
                    score += gap_s
                    wscore += gap_s
                else:
                    score += score_match(pair, matrix)
                    wscore += score_match(pair, matrix)
            else:
                if '-' not in pair:
                    gap = False
                    score += score_match(pair, matrix)
                    wscore += score_match(pair, matrix)
                else:
                    score += gap_e
                    wscore += gap_e
        ipos += 30
        fpos += 30
        pctools.printv('Segment score: ' + str(wscore), g_args.verbosity)
        wscores.append(wscore)
    return score, wscores
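# score_pairwise relies on score_match, which is defined elsewhere in the
# module. A minimal sketch of the assumed lookup: Biopython's MatrixInfo
# matrices store each residue pair under a single ordering, so the reversed
# tuple is tried when the direct lookup fails.
def _example_score_match(pair, matrix):
    if pair in matrix:
        return matrix[pair]
    return matrix[(pair[1], pair[0])]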
def collect_fasta(verbosity):
    '''
    Fetches the fasta files in the pdb_homo_archive and collects them into a
    single fasta file per category within a "sequences" folder. While doing
    so, it checks the identity among the chains in each original fasta and
    only keeps track of the unique chains, i.e. those with less than 99%
    identity to the other chains. These files are later used to make the
    blast databases.
    Called by: update_databases()
    '''
    fastafiles = [os.path.join(dp, f) for dp, dn, filenames in os.walk(pdb_homo_archive)
                  for f in filenames if f.endswith(".fasta")]
    seqdir = os.path.join(pdb_homo_archive, 'sequences')
    if not os.path.isdir(seqdir):
        os.mkdir(seqdir)
    largepdb_collected_fasta = os.path.join(seqdir, 'largepdb_collected.fastas')
    homo_collected_fasta = os.path.join(seqdir, 'homo_collected.fastas')
    mono_collected_fasta = os.path.join(seqdir, 'mono_collected.fastas')
    hetero_collected_fasta = os.path.join(seqdir, 'hetero_collected.fastas')
    # Truncate the collected files before appending to them below
    for collected_fasta in (largepdb_collected_fasta, homo_collected_fasta,
                            mono_collected_fasta, hetero_collected_fasta):
        with open(collected_fasta, 'w+'):
            pass
    for fasta in pg(fastafiles, widgets=widgets):
        pctools.printv('Assessing ' + clrs['y'] + fasta + clrs['n'] + '...', verbosity)
        with open(fasta, 'r') as f:
            contents = f.read()
        contentlines = contents.split('>')
        nchains = str(len(re.findall('>', contents)))
        pctools.printv('With ' + clrs['y'] + nchains + clrs['n'] + ' chains to be assessed\n', verbosity)
        uniques = []
        for entry in contentlines:
            if entry:
                splitentry = entry.split('\n', 1)
                pdbch = splitentry[0]
                seq = splitentry[1].replace('\n', '')
                if uniques:
                    percent_ids = []
                    for unique in uniques:
                        alignment = parasail.sg_stats_striped_16(seq, unique[1], 10, 1, parasail.blosum62)
                        if alignment.length == 0:
                            percent_ids.append(0)
                        else:
                            percent_ids.append(alignment.matches / alignment.length * 100)
                    if all(percent_id <= 99 for percent_id in percent_ids):
                        uniques.append([pdbch, seq])
                else:
                    uniques.append([pdbch, seq])
        # Append the unique chains to the collected fasta of the right category
        if '/largepdb_sequences/' in fasta:
            collected_fasta = largepdb_collected_fasta
        elif '/mono_sequences/' in fasta:
            collected_fasta = mono_collected_fasta
        elif '/hetero_sequences/' in fasta:
            collected_fasta = hetero_collected_fasta
        elif '/homo_sequences/' in fasta:
            collected_fasta = homo_collected_fasta
        else:
            continue
        with open(collected_fasta, 'a') as f:
            for unique in uniques:
                wrapped_seq = "\n".join(tw.wrap(unique[1]))
                fasta_entry = '>' + unique[0] + '\n' + wrapped_seq + '\n\n'
                f.write(fasta_entry)
    for collected_fasta, dbname in ((largepdb_collected_fasta, 'largedb'),
                                    (mono_collected_fasta, 'monodb'),
                                    (hetero_collected_fasta, 'heterodb'),
                                    (homo_collected_fasta, 'homodb')):
        subprocess.run([makeblastdb_exe, '-in', collected_fasta,
                        '-dbtype', 'prot', '-out', os.path.join(seqdir, dbname)])
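# Illustrative helper mirroring the identity check used in collect_fasta: a
# semi-global statistics alignment with parasail (gap open 10, extension 1,
# BLOSUM62) and percent identity taken as matches over alignment length.
def _example_percent_identity(seq_a, seq_b):
    aln = parasail.sg_stats_striped_16(seq_a, seq_b, 10, 1, parasail.blosum62)
    if aln.length == 0:
        return 0.0
    return aln.matches / aln.length * 100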
def curate_homoDB(verbosity):
    '''
    Creates the homo-oligomeric database from a local pdb repository.
    The divided scheme adopted by RCSB, in which the subdirectories are the
    two middle characters in the PDB code, is assumed.
    Each database contains three key files: dat, log and fasta.
    * homodb.dat contains only the pdb codes contained in the database.
    * homodb.log contains summarized relevant information about each entry.
    * homodb.fasta contains the sequences of every chain in the database.
    Called by: update_databases()
    '''
    # Create stats folder if it does not exist
    stats_dir = os.path.join(pdb_homo_archive, 'stats')
    if not os.path.isdir(stats_dir):
        os.mkdir(stats_dir)
    # Compare latest assession with new files
    assession_log = read_latest_assession(stats_dir)
    new_files = list_new_files(pdb1_archive, assession_log, verbosity)
    print(clrs['g'] + str(len(new_files)) + clrs['n'] + ' new structure files were found and will be processed')
    now = str(time.strftime("%d-%m-%Y@%H.%M.%S"))
    dat_file = os.path.join(stats_dir, now + '-choirdb.dat')
    log_file = os.path.join(stats_dir, now + '-choirdb.log')
    err_file = os.path.join(stats_dir, now + '-choirdb.err')
    if not os.path.isfile(dat_file):
        with open(dat_file, 'w+'):
            pass
    # Write files not to be updated to new dat file
    with open(dat_file, 'a') as f:
        for i in assession_log:
            if i not in new_files:
                f.write(i + " " + assession_log[i] + "\n")
    # Create log file
    if not os.path.isfile(log_file):
        with open(log_file, 'w+') as f:
            f.write('Code, Chains, Author, Software, Date\n')
    # Read chain correspondences
    chain_correspondences_file = os.path.join(stats_dir, 'chain_correspondences.pickle')
    if os.path.isfile(chain_correspondences_file):
        with open(chain_correspondences_file, 'rb') as p:
            chain_correspondences = pickle.load(p)
    else:
        chain_correspondences = {}
    # Main loop that will populate the ProtCHOIR database
    for pdb in pg(new_files, widgets=widgets):
        filename = pdb.split('/')[-1]
        subfolder = pdb.split('/')[-2]
        # Record assessment in dat file
        with open(dat_file, 'a') as f:
            f.write(filename + " " + str(time.time()) + '\n')
        # Start assessment
        pctools.printv('\nAssessing ' + pdb + '...', verbosity)
        # Reject files larger than 2 Mb
        file_size = os.stat(pdb).st_size / 1048576
        pctools.printv('File size: ' + clrs['c'] + '{0:.1g}'.format(file_size) + ' Mb' + clrs['n'], verbosity)
        if file_size > 2:
            pctools.printv(clrs['r'] + "File size too large!" + clrs['n'], verbosity)
            pctools.printv(clrs['y'] + "Will try to fetch sequences from asymmetric unit." + clrs['n'], verbosity)
            try:
                alternative_pdb = os.path.join(pdb_archive, subfolder, 'pdb' + filename.split('.')[0] + '.ent.gz')
                pdb_code, structure, nchains = pctools.parse_pdb_structure(alternative_pdb)
                structure, chain_correspondences[pdb_code] = pctools.split_states(structure)
                nchainspostsplit, seqs, chain_ids = pctools.extract_seqs(structure, 0)
                # Write in fasta file
                pctools.printv(clrs['y'] + "Recording large-pdb sequence" + clrs['n'], verbosity)
                record_fasta(pdb_code, seqs, chain_ids, subfolder, type='largepdb')
            except:
                pctools.printv(clrs['r'] + "Failed to fetch sequence!" + clrs['n'], verbosity)
            continue
        try:
            pdb_code, structure, nchains = pctools.parse_pdb_structure(pdb)
            pctools.printv('Number of chains in structure ' + clrs['y'] + pdb_code + clrs['n'] + ': ' + str(nchains), verbosity)
            # Reject structures with more than 60 chains
            if int(nchains) > 60:
                pctools.printv("Number of chains (" + clrs['y'] + str(nchains) + clrs['n'] + ") larger than 60! " + clrs['r'] + "Too many chains!" + clrs['n'], verbosity)
                pctools.printv(clrs['y'] + "Will try to fetch sequences anyway." + clrs['n'], verbosity)
                try:
                    pdb_code, structure, nchains = pctools.parse_pdb_structure(pdb)
                    structure, chain_correspondences[pdb_code] = pctools.split_states(structure)
                    nchainspostsplit, seqs, chain_ids = pctools.extract_seqs(structure, 0)
                    pctools.printv(clrs['y'] + "Recording large-pdb sequence" + clrs['n'], verbosity)
                    # Write in fasta file
                    record_fasta(pdb_code, seqs, chain_ids, subfolder, type='largepdb')
                except:
                    pctools.printv(clrs['r'] + "Failed to fetch sequence!" + clrs['n'], verbosity)
                continue
            structure, chain_correspondences[pdb_code] = pctools.split_states(structure)
            nchainspostsplit, seqs, chain_ids = pctools.extract_seqs(structure, 0)
            pctools.printv('Number of chains (' + clrs['c'] + str(nchains) + clrs['n'] + ') and file size (' + clrs['c'] + str(file_size) + clrs['n'] + ') OK.' + clrs['g'] + ' Proceeding.' + clrs['n'] + '\n', verbosity)
            # Try to get info from the canonical pdb header (homonymous to pdb1)
            canonpdb = "pdb" + pdb_code + ".ent.gz"
            try:
                contents = pctools.parse_pdb_contents(os.path.join(pdb_archive, subfolder, canonpdb))[1]
            except:
                pctools.printv(clrs['r'] + '\n\n Mismatch between pdb and biounit entries...' + clrs['n'], verbosity)
            author, software = pctools.get_annotated_states(contents)
            pctools.printv('Author determined biological unit = ' + str(author), verbosity)
            pctools.printv('Software determined quaternary structure = ' + str(software), verbosity)
            # Start assessing sequences and structures (from 2 up to 60 chains)
            if 1 < int(nchains) < 61:
                ids, proteinpair = pctools.get_pairwise_ids(seqs, nchains)
                for id in ids:
                    if id[0] >= 90:
                        color = clrs['g']
                    else:
                        color = clrs['r']
                    pctools.printv('Identity between chains ' + clrs['y'] + str(id[1]) + clrs['n'] + ' and ' + clrs['y'] + str(id[2]) + clrs['n'] + ' is ' + color + str(id[0]) + "%" + clrs['n'] + ".", verbosity)
                # Save records for pure homo-oligomers
                if all(id[0] > 90 for id in ids) and proteinpair is True:
                    pctools.printv("All identities over 90%. Likely " + clrs['b'] + "homo-oligomeric" + clrs['n'] + ".", verbosity)
                    pctools.printv(clrs['y'] + "FETCHING" + clrs['n'] + ".\n", verbosity)
                    # Write file to database
                    newfile = os.path.join(pdb_homo_archive, subfolder, pdb_code + ".pdb")
                    if not os.path.isdir(os.path.join(pdb_homo_archive, subfolder)):
                        os.mkdir(os.path.join(pdb_homo_archive, subfolder))
                    io.set_structure(structure)
                    io.save(newfile)
                    pctools.gzip_pdb(newfile)
                    # Write to log file
                    with open(log_file, 'a') as f:
                        f.write(str(pdb_code) + "," + str(nchains) + "," + '/'.join(author) + "," + '/'.join(software) + "," + str(os.path.getctime(newfile + '.gz')) + '\n')
                    # Write in fasta file
                    pctools.printv(clrs['y'] + "Recording homo-oligomer sequence." + clrs['n'], verbosity)
                    record_fasta(pdb_code, seqs, chain_ids, subfolder, type='homo')
                # Investigate partial homo-oligomers
                elif any(id[0] > 90 for id in ids) and proteinpair is True:
                    at_least_one_interface = False
                    for id in ids:
                        if id[0] > 90:
                            # Check if similar chains share interfaces
                            if pctools.check_interfaces(structure, id[1], id[2]):
                                at_least_one_interface = True
                                pctools.printv('Contacts found between chains ' + clrs['g'] + str(id[1]) + clrs['n'] + ' and ' + clrs['g'] + str(id[2]) + clrs['n'] + ' sharing ' + clrs['g'] + str(id[0]) + clrs['n'] + " % identity.", verbosity)
                                pctools.printv("At least one putative " + clrs['b'] + "homo-oligomeric " + clrs['n'] + "interface found.", verbosity)
                                pctools.printv(clrs['y'] + "FETCHING" + clrs['n'] + ".\n", verbosity)
                                # Write file to database
                                newfile = os.path.join(pdb_homo_archive, subfolder, pdb_code + ".pdb")
                                if not os.path.isdir(os.path.join(pdb_homo_archive, subfolder)):
                                    os.mkdir(os.path.join(pdb_homo_archive, subfolder))
                                io.set_structure(structure)
                                io.save(newfile)
                                pctools.gzip_pdb(newfile)
                                # Write to log file
                                with open(log_file, 'a') as f:
                                    f.write(str(pdb_code) + "," + str(nchains) + "," + '/'.join(author) + "," + '/'.join(software) + "," + str(os.path.getctime(newfile + '.gz')) + '\n')
                                # Write in fasta file
                                pctools.printv(clrs['y'] + "Recording homo-oligomer sequence." + clrs['n'], verbosity)
                                record_fasta(pdb_code, seqs, chain_ids, subfolder, type='homo')
                                break
                    if at_least_one_interface is False:
                        pctools.printv("No homo-oligomeric interface found. Likely " + clrs['r'] + "hetero-oligomeric" + clrs['n'] + ".", verbosity)
                        pctools.printv(clrs['y'] + "Recording hetero-oligomer sequence" + clrs['n'], verbosity)
                        # Write in fasta file
                        record_fasta(pdb_code, seqs, chain_ids, subfolder, type='hetero')
                elif proteinpair is False:
                    pctools.printv(clrs['r'] + "No proteic chain pairs found" + clrs['n'] + ".", verbosity)
                    if any([set(seq[1]) != {'X'} for seq in seqs]):
                        pctools.printv(clrs['y'] + "Protein sequences found though" + clrs['n'], verbosity)
                        pctools.printv(clrs['y'] + "Recording hetero-oligomer sequence" + clrs['n'], verbosity)
                        # Write in fasta file
                        record_fasta(pdb_code, seqs, chain_ids, subfolder, type='hetero')
                    else:
                        pctools.printv(clrs['r'] + "Not even a single protein chain. Disregarding." + clrs['n'], verbosity)
                else:
                    pctools.printv("No similar chains found. Likely " + clrs['r'] + "hetero-oligomeric" + clrs['n'] + ".", verbosity)
                    pctools.printv(clrs['y'] + "Recording hetero-oligomer sequence" + clrs['n'], verbosity)
                    record_fasta(pdb_code, seqs, chain_ids, subfolder, type='hetero')
            elif int(nchains) == 1:
                pctools.printv("Only one chain found. Likely " + clrs['r'] + "monomeric" + clrs['n'] + ".", verbosity)
                pctools.printv(clrs['y'] + "Recording monomer sequence." + clrs['n'], verbosity)
                structure, chain_correspondences[pdb_code] = pctools.split_states(structure)
                nchains, seqs, chain_ids = pctools.extract_seqs(structure, 0)
                record_fasta(pdb_code, seqs, chain_ids, subfolder, type='mono')
        except:
            errtype, errvalue, errtraceback = sys.exc_info()
            errtypeshort = str(errtype).split('\'')[1]
            pctools.printv(clrs['r'] + '*' + str(errtypeshort) + ': ' + str(errvalue) + ' l.' + str(errtraceback.tb_lineno) + '*' + clrs['n'], verbosity)
            traceback.print_exception(*sys.exc_info())
            if errtypeshort == 'KeyboardInterrupt':
                quit()
            # pctools.printv(clrs['r'] + "UNKNOWN FAULT" + clrs['n'] + ".", verbosity)
            if not os.path.isfile(err_file):
                with open(err_file, 'w+') as f:
                    pass
            with open(err_file, 'a') as f:
                f.write(filename + '\n')
            continue
    with open(chain_correspondences_file, 'wb') as p:
        pickle.dump(chain_correspondences, p, protocol=pickle.HIGHEST_PROTOCOL)
    if not os.path.isfile(err_file):
        with open(err_file, 'w+') as f:
            f.write('\nNo errors. Assessment terminated successfully.\n')
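# The dat files written by curate_homoDB are line-oriented records of the form
# "<filename> <epoch time>". A minimal parser for that format (the actual
# read_latest_assession helper lives elsewhere in the module) would be:
def _example_parse_dat(dat_file):
    assession_log = {}
    with open(dat_file) as f:
        for line in f:
            fields = line.split()
            if len(fields) == 2:
                assession_log[fields[0]] = fields[1]
    return assession_log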
def score_alignment(alignment_file):
    print(clrs['b'] + 'SCORING ALIGNMENT' + clrs['n'] + ' in ' + clrs['y'] + os.path.basename(alignment_file) + clrs['n'] + '\n')
    sequences = list(SeqIO.parse(alignment_file, "pir"))
    query_chains = str(sequences[0].seq).split('/')
    template_chains = str(sequences[1].seq).split('/')
    trimmed_query_chains = []
    trimmed_template_chains = []
    for query_chain, template_chain in zip(query_chains, template_chains):
        leading_gaps = 0
        for r in query_chain:
            if r == '-':
                leading_gaps += 1
            else:
                break
        trailing_gaps = 0
        for r in query_chain[::-1]:
            if r == '-':
                trailing_gaps += 1
            else:
                break
        if trailing_gaps == 0:
            trimmed_query_chains.append(query_chain[leading_gaps:])
            trimmed_template_chains.append(template_chain[leading_gaps:])
        else:
            trimmed_query_chains.append(query_chain[leading_gaps:-trailing_gaps])
            trimmed_template_chains.append(template_chain[leading_gaps:-trailing_gaps])
    relative_wscores = []
    relative_scores = []
    for q_chain, t_chain in zip(trimmed_query_chains, trimmed_template_chains):
        pctools.printv('\nCalculating ' + clrs['y'] + 'maximum scores' + clrs['n'] + ' for chain segments:', g_args.verbosity)
        max_score, max_wscores = score_pairwise(t_chain, t_chain, MatrixInfo.blosum62, 0, 0)
        pctools.printv('\nCalculating ' + clrs['y'] + 'actual scores' + clrs['n'] + ' for chain segments:', g_args.verbosity)
        score, wscores = score_pairwise(q_chain, t_chain, MatrixInfo.blosum62, 0, 0)
        relative_scores.append(round(score * 100 / max_score, 2))
        for max_wscore, wscore in zip(max_wscores, wscores):
            if max_wscore != 0:
                relative_wscore = round(wscore * 100 / max_wscore, 2)
            else:
                relative_wscore = 100
            relative_wscores.append(relative_wscore)
    relative_score = sum(relative_scores) / len(relative_scores)
    string = ''
    for relative_wscore in relative_wscores:
        if relative_wscore > g_args.similarity_cutoff:
            color = 'g'
        else:
            color = 'r'
        if string == '':
            string += clrs[color] + str(relative_wscore) + clrs['n']
        else:
            string += ' ~ ' + clrs[color] + str(relative_wscore) + clrs['n']
    print('\nRelative score per 30-res segment: ' + string + clrs['n'])
    return relative_score, relative_wscores, len(query_chains)
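# The leading/trailing-gap counting above also appears in generate_ali below.
# A shared helper (sketched for illustration only; both functions keep their
# loops inline) could compute both counts at once:
def _count_edge_gaps(seq):
    leading = len(seq) - len(seq.lstrip('-'))
    trailing = len(seq) - len(seq.rstrip('-'))
    return leading, trailing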
def generate_ali(alignments, best_oligo_template, residue_index_mapping, args):
    best_oligo_template_file = best_oligo_template + "_CHOIR_RenamedChainsTemplate"
    final_alignment = os.path.join(workdir, input_name + '_' + best_oligo_template + '_CHOIR_Alignment.ali')
    getseq = False
    alignment_dict = {}
    full_residue_mapping = {}
    # Parse individual GESAMT alignments and organize them in a per-chain dictionary
    for fasta_alignment in alignments:
        getseq = False
        template = False
        chain = None
        entryseq_dict = {}
        with open(fasta_alignment, 'r') as fh:
            for line in fh:
                # Only record the sequence if the line above starts with >
                if getseq is True:
                    getseq = False
                    seq = line.replace('\n', '')
                    # If this is the template, count leading and trailing gaps
                    if template is True:
                        template = False
                        leading_gaps = 0
                        for r in seq:
                            if r == '-':
                                leading_gaps += 1
                            else:
                                break
                        trailing_gaps = 0
                        for r in seq[::-1]:
                            if r == '-':
                                trailing_gaps += 1
                            else:
                                break
                    assert seq is not None, 'Sequence is None'
                    assert seq != '', 'Sequence is empty'
                    entryseq_dict[entry] = seq.upper()
                    del seq
                # If it is an entry line, get details and expect a sequence
                if line.startswith('>'):
                    entry = line.split('>')[1].split('(')[0].split('.pdb')[0].replace('\n', '')
                    # If the entry is the template, use its chain as reference
                    if entry == best_oligo_template_file:
                        chain = line.split('(')[1].split(')')[0]
                        template = True
                    getseq = True
        # Remove leading and trailing gaps from the alignment for both template and query
        if trailing_gaps == 0:
            for entry, seq in entryseq_dict.items():
                entryseq_dict[entry] = leading_gaps * '-' + seq[leading_gaps:]
        else:
            for entry, seq in entryseq_dict.items():
                entryseq_dict[entry] = leading_gaps * '-' + seq[leading_gaps:-trailing_gaps] + trailing_gaps * '-'
        if residue_index_mapping is not None:
            full_residue_mapping[chain] = collections.OrderedDict()
            for res, i in residue_index_mapping.items():
                full_residue_mapping[chain][res] = i + leading_gaps
        else:
            full_residue_mapping[chain] = leading_gaps
        alignment_dict[chain] = entryseq_dict
        pctools.printv('Removed ' + clrs['c'] + str(leading_gaps) + clrs['n'] + ' leading gaps and ' + clrs['c'] + str(trailing_gaps) + clrs['n'] + ' trailing gaps from chain ' + clrs['c'] + chain + clrs['n'] + ' alignment.\n', verbosity)
    # If symmetry is desired, reduce all chains to match the size of the smallest
    if args.symmetry:
        max_leading_gaps = 0
        max_trailing_gaps = 0
        for chain, seqs in alignment_dict.items():
            for entry, seq in seqs.items():
                if entry == best_oligo_template_file:
                    leading_gaps = 0
                    for r in seq:
                        if r == '-':
                            leading_gaps += 1
                        else:
                            break
                    if leading_gaps > max_leading_gaps:
                        max_leading_gaps = leading_gaps
                    trailing_gaps = 0
                    for r in seq[::-1]:
                        if r == '-':
                            trailing_gaps += 1
                        else:
                            break
                    if trailing_gaps > max_trailing_gaps:
                        max_trailing_gaps = trailing_gaps
        pctools.printv('To cope with symmetry restraints, the modelled sequence will contain ' + clrs['c'] + str(max_leading_gaps) + clrs['n'] + ' leading gaps and ' + clrs['c'] + str(max_trailing_gaps) + clrs['n'] + ' trailing gaps' + clrs['n'] + '.\n', verbosity)
        for chain, seqs in alignment_dict.items():
            # Trim the query (modelled) sequence so every chain matches the smallest
            if max_trailing_gaps == 0:
                seqs[input_name] = max_leading_gaps * '-' + seqs[input_name][max_leading_gaps:]
            else:
                seqs[input_name] = max_leading_gaps * '-' + seqs[input_name][max_leading_gaps:-max_trailing_gaps] + max_trailing_gaps * '-'
    # Find out first and last chains
    first_chain = sorted(alignment_dict)[0]
    last_chain = sorted(alignment_dict)[-1]
    # Create the strings to write in the alignment file
    alignment_string_dict = {}
    for entry in [input_name, best_oligo_template_file]:
        if entry == input_name:
            alignment_string_dict[entry] = ">P1;" + input_name + "\nsequence:" + input_name + ":FIRST:" + first_chain + ":LAST:" + last_chain + "::::\n"
        elif entry == best_oligo_template_file:
            alignment_string_dict[entry] = ">P1;" + best_oligo_template_file + ".pdb\nstructureX:" + best_oligo_template_file + ".pdb:FIRST:" + first_chain + ":LAST:" + last_chain + "::::\n"
        for chain, entryseq in sorted(alignment_dict.items()):
            if chain == last_chain:
                alignment_string_dict[entry] += entryseq[entry] + '*\n'
            else:
                alignment_string_dict[entry] += entryseq[entry] + '/\n'
    # Write the alignment file
    with open(final_alignment, 'w') as f:
        for entry, entrystring in alignment_string_dict.items():
            pctools.printv(entrystring, verbosity)
            f.write(entrystring)
    print('Modeller Alignment written to ' + clrs['g'] + os.path.basename(final_alignment) + clrs['n'] + '\n')
    return final_alignment, full_residue_mapping
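# For reference, the file written by generate_ali follows Modeller's PIR
# alignment format: chains separated by '/', sequences terminated by '*'.
# A two-chain example with hypothetical entry names:
#
#   >P1;query
#   sequence:query:FIRST:A:LAST:B::::
#   MKTAYIAKQR/
#   MKTAYIAKQR*
#
#   >P1;1xyz_CHOIR_RenamedChainsTemplate.pdb
#   structureX:1xyz_CHOIR_RenamedChainsTemplate.pdb:FIRST:A:LAST:B::::
#   MKTAYIAKQR/
#   MKTAYIAKQR*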