def save(session, alignment, stream):
    """Write *alignment* to *stream* in CLUSTAL W ALN format.

    The output is a header line followed by blocks of at most LINELEN
    columns.  Each block holds one line per sequence (the name with spaces
    turned into underscores, the block's characters and, when the block
    contains at least one non-gap character for that sequence, the number of
    ungapped residues up to the end of the block), then a conservation line
    ('*' identical, ':' strong group, '.' weak group, ' ' otherwise) and a
    blank line.
    """
    def emit(*fields):
        print(*fields, file=stream)

    emit("CLUSTAL W ALN saved from UCSF ChimeraX")
    emit("")
    widest_name = max([len(s.name) for s in alignment.seqs])
    name_format = "%%-%ds" % (widest_name + 5)
    from chimerax.atomic import Sequence
    total_columns = len(alignment.seqs[0])
    for start in range(0, total_columns, LINELEN):
        end = min(total_columns, start + LINELEN)

        # one line per sequence
        for seq in alignment.seqs:
            label = name_format % seq.name.replace(' ', '_')
            block_only = Sequence()
            block_only.extend(seq[start:end])
            if len(block_only.ungapped()) == 0:
                # nothing but gaps in this block: no residue count
                emit(label, seq[start:end])
                continue
            through_end = Sequence()
            through_end.extend(seq[:end])
            emit(label, seq[start:end], len(through_end.ungapped()))

        # conservation line
        from .. import clustal_strong_groups, clustal_weak_groups
        tiers = ((clustal_strong_groups, ':'), (clustal_weak_groups, '.'))
        marks = []
        for pos in range(start, end):
            column = [s[pos].upper() for s in alignment.seqs]
            lead = column[0]
            # completely conserved (and an actual letter, not a gap)?
            if lead.isupper() and all(c == lead for c in column[1:]):
                marks.append('*')
                continue
            # "strongly"/"weakly" conserved?  otherwise blank
            mark = ' '
            for groups, character in tiers:
                if any(all(c in group for c in column) for group in groups):
                    mark = character
                    break
            marks.append(mark)
        emit(name_format % " ", "".join(marks))
        emit("")
def read(session, f):
    """Parse a PIR (NBRF) formatted stream into a list of Sequences.

    Each entry is a '>XX;name' line (XX is the two-letter PIR type code),
    one description line, and sequence lines terminated by '*'.

    Parameters
    ----------
    session : unused here; kept for the common reader signature
    f : readable text stream; it is closed before returning or raising

    Returns
    -------
    (sequences, {}, {}) -- the parsed sequences plus empty file attribute
    and file markup dictionaries.

    Raises
    ------
    FormatSyntaxError if the last entry has no terminating '*'.
    """
    want = 'init'
    sequences = []
    try:
        for line in f.readlines():
            line = line.strip()
            if want == 'init':
                # skip anything that is not a '>XX;name' entry header
                if len(line) < 4:
                    continue
                if line[0] != '>' or line[3] != ';':
                    continue
                seq = Sequence(name=make_readable(line[4:]))
                sequences.append(seq)
                pir_type = line[1:3]
                # P1 (complete protein) and F1 (protein fragment) are the
                # protein type codes; every other code is treated as a
                # nucleic acid
                seq.nucleic = pir_type not in ("P1", "F1")
                seq.pir_type = pir_type
                want = 'description'
            elif want == 'description':
                sequences[-1].description = line
                sequences[-1].pir_description = line
                want = 'sequence'
            elif want == 'sequence':
                if not line:
                    continue
                if line[-1] == '*':
                    # '*' terminates the entry
                    want = 'init'
                    line = line[:-1]
                sequences[-1].extend("".join([c for c in line if not c.isspace()]))
    finally:
        f.close()
    if want != 'init':
        raise FormatSyntaxError("Could not find end of sequence '%s'" % sequences[-1].name)
    return sequences, {}, {}
def show_mav(self, ids):
    """Open a new alignment made of entry 0 plus the entries listed in *ids*.

    Index 0 is inserted at the front of *ids* (the list is modified in
    place).  Columns that contain no letter in any of the chosen sequences
    are dropped before the alignment is created through
    session.alignments.new_alignment().
    """
    # All sequences should have the same length because they include
    # gaps generated from the BLAST alignment.
    ids.insert(0, 0)
    names = []
    rows = []
    for sid in ids:
        entry_name, entry_seq = self._sequences[sid]
        names.append(entry_name)
        rows.append(entry_seq)

    # Columns that are gaps in every sequence carry no information.
    gap_only = {
        col for col in range(len(rows[0]))
        if not any(row[col].isalpha() for row in rows)
    }
    if gap_only:
        rows = [
            ''.join([ch for col, ch in enumerate(row) if col not in gap_only])
            for row in rows
        ]

    # Ask the sequence viewer to display the alignment.
    from chimerax.atomic import Sequence
    aligned = [
        Sequence(name=entry_name, characters=row)
        for entry_name, row in zip(names, rows)
    ]
    title = "%s [%d]" % (self._instance_name, self._viewer_index)
    self.session.alignments.new_alignment(aligned, title)
def read(session, f):
    """Read 'seq-name seq-letters' lines from *f* after skipping the header.

    While in the header, lines whose first field is two characters long or
    starts with '#=' are skipped; the first other non-blank line ends the
    header and is parsed as a sequence line.  Blank lines are ignored
    throughout.  The stream is closed once all lines have been parsed.

    Returns (sequences, {}, {}).  Raises FormatSyntaxError for a sequence
    line that does not have exactly two whitespace-separated fields.
    """
    sequences = []
    past_header = False
    for line_num, raw_line in enumerate(f.readlines(), start=1):
        fields = raw_line.split()
        if not fields:
            continue
        if not past_header:
            lead = fields[0]
            # two-letter header tags, and the undocumented #=RF header
            # that some Pfam seed alignments have
            if len(lead) == 2 or lead.startswith('#='):
                continue
            past_header = True
        if len(fields) != 2:
            raise FormatSyntaxError(
                "Sequence line %d not of form 'seq-name seq-letters'" % line_num)
        seq_name, letters = fields
        seq = Sequence(name=make_readable(seq_name))
        seq.extend(letters)
        sequences.append(seq)
    f.close()
    return sequences, {}, {}
def nw_assoc(session, align_seq, struct_seq):
    '''Associate struct_seq with align_seq via Needleman-Wunsch.

    Wraps the Needleman-Wunsch matcher so that it returns the same kind of
    values that try_assoc returns: a SeqMatchMap from structure residues to
    positions in the ungapped alignment sequence, plus an error count made
    of mismatches and unmatched structure positions.'''

    from chimerax.atomic import Sequence, SeqMatchMap
    ungapped_seq = Sequence(name=align_seq.name, characters=align_seq.ungapped())
    ungapped_seq.circular = align_seq.circular
    from chimerax.alignment_algs.NeedlemanWunsch import nw
    score, match_list = nw(struct_seq, ungapped_seq)

    struct_len = len(struct_seq)
    # matches are in reverse order, so the first entry is the last match
    try:
        final_index = match_list[0][0]
    except IndexError:
        final_index = -1
    # trailing unmatched structure positions
    errors = max(0, struct_len - final_index - 1)

    match_map = SeqMatchMap(align_seq, struct_seq)
    previous = final_index + 1
    for s_index, a_index in match_list:
        if struct_seq[s_index] != ungapped_seq[a_index]:
            errors += 1
        # structure positions skipped between consecutive matches
        errors += max(0, previous - s_index - 1)
        residue = struct_seq.residues[s_index]
        if residue:
            match_map.match(residue, a_index)
        previous = s_index
    # leading unmatched structure positions
    errors += max(0, previous)

    # unmatched residues forced, reduce errors by that amount...
    errors -= max(0, struct_len - len(ungapped_seq))

    return match_map, errors
def seqalign_chain(session, chains):
    '''
    Show chain sequence(s)

    A single chain is shown directly.  Several chains must all have the
    same sequence; one Sequence is created for them and every chain is
    associated with it.

    Parameters
    ----------
    chains : list of Chain
        Chains to show

    Raises
    ------
    UserError
        If several chains are given and their sequences differ.
    '''
    if len(chains) == 1:
        chain = chains[0]
        ident = ".".join([str(part) for part in chain.structure.id]) + "/" + chain.chain_id
        alignment = session.alignments.new_alignment([chain], ident, seq_viewer="sv",
            auto_associate=None, intrinsic=True)
    else:
        # all chains have to have the same sequence, and they will all be associated with
        # that sequence
        sequences = {chain.characters for chain in chains}
        if len(sequences) != 1:
            raise UserError("Chains must have same sequence")
        chars = sequences.pop()
        chain_ids = {chain.chain_id for chain in chains}
        if len(chain_ids) < len(chains) or len(chain_ids) > 10:
            # duplicate IDs or too many to list
            name = "%d chains" % len(chains)
        else:
            name = "chains %s" % ",".join(sorted(chain_ids))
        from chimerax.atomic import Sequence
        seq = Sequence(name=name, characters=chars)

        def get_numbering_start(chain):
            # number that position 0 would have, judged from the first
            # existing residue; None if the chain has no such residue
            for i, r in enumerate(chain.residues):
                if r is None or r.deleted:
                    continue
                return r.number - i
            return None

        starts = {get_numbering_start(chain) for chain in chains}
        starts.discard(None)
        if len(starts) == 1:
            # only adopt a numbering start that all chains agree on
            seq.numbering_start = starts.pop()
        alignment = session.alignments.new_alignment([seq], None, seq_viewer="sv",
            auto_associate=False, name=chains[0].description, intrinsic=True)
        alignment.suspend_notify_observers()
        try:
            for chain in chains:
                alignment.associate(chain, keep_intrinsic=True)
        finally:
            # always resume, so a failed association cannot leave the
            # alignment's observers suspended
            alignment.resume_notify_observers()
def fetch_uniprot(session, ident, ignore_cache=False):
    """Fetch a UniProt entry and open its sequence as a new alignment.

    *ident* is mapped to an accession, the accession's data is fetched
    (honouring *ignore_cache*) and a one-sequence alignment named *ident* is
    created.  Returns ([], status message) on success and None if the fetch
    was cancelled.  Raises UserError for an invalid accession.
    """
    from chimerax.core.errors import UserError, CancelOperation
    try:
        accession = map_uniprot_ident(ident)
        seq_string, _full_name, _features = fetch_uniprot_accession_info(
            session, accession, ignore_cache=ignore_cache)
    except InvalidAccessionError as e:
        raise UserError(str(e))
    except CancelOperation:
        session.logger.status("Fetch of %s cancelled" % ident)
        return
    else:
        from chimerax.atomic import Sequence
        uniprot_seq = Sequence(name=ident)
        uniprot_seq.extend(seq_string)
        session.logger.status("Opening UniProt %s" % ident)
        session.alignments.new_alignment([uniprot_seq], ident)
        return [], "Opened UniProt %s" % ident
def read(session, f):
    """Parse a CLUSTAL-format alignment from *f*.

    The first non-blank line must start with 'CLUSTAL'.  Sequence lines are
    'name block' optionally followed by a residue count; lines that are
    empty or start with whitespace separate blocks.  The first block defines
    the sequences, and lines of later blocks are appended to them in order.
    The stream is closed once all lines have been parsed.

    Returns (sequences, {}, {}).  Raises FormatSyntaxError on a bad header,
    a malformed sequence line, or more lines in a block than there were
    sequences in the first block.
    """
    sequences = []
    seen_header = False
    for line_num, line in enumerate(f.readlines(), start=1):
        if not seen_header:
            if line.startswith("CLUSTAL"):
                seen_header = True
                first_block = True
            elif line.strip() != "":
                raise FormatSyntaxError(
                    "First non-blank line does not start with 'CLUSTAL'")
            continue

        if not line or line[0].isspace():
            # block separator (blank or conservation line)
            if sequences:
                first_block = False
                expect = 0
            continue

        fields = line.split()
        if len(fields) not in (2, 3):
            raise FormatSyntaxError(
                "Line %d is not sequence name followed by sequence "
                "contents and optional ungapped length" % line_num)
        seq_name, seq_block = fields[0], fields[1]

        if first_block:
            target = Sequence(name=make_readable(seq_name))
            sequences.append(target)
        else:
            try:
                target = sequences[expect]
            except IndexError:
                raise FormatSyntaxError(
                    "Sequence on line %d not in initial sequence block" % line_num)
            expect += 1
        target.append(seq_block)
    f.close()
    return sequences, {}, {}
def _read_sequences(self, f):
    """Read the header's sequence entries up to the '//' separator line.

    Every line matched by MSF._Sum produces a Sequence, stored in
    self.sequence_list, whose attrs hold the matched 'MSF length',
    'MSF check' and 'MSF weight' strings.

    Raises FormatSyntaxError if the stream ends before a '//' line or if no
    sequence entries were found.
    """
    from chimerax.atomic import Sequence
    self.sequence_list = []
    while True:
        line = f.readline()
        if not line:
            raise FormatSyntaxError('no alignment separator')
        if line in ('//\n', '//\r\n'):
            break
        found = MSF._Sum.match(line)
        if found is None:
            continue
        name, length, check, weight = found.group(1, 2, 3, 4)
        entry = Sequence(name=make_readable(name))
        self.sequence_list.append(entry)
        entry.attrs = {
            'MSF length': length,
            'MSF check': check,
            'MSF weight': weight,
        }
    if not self.sequence_list:
        raise FormatSyntaxError('No sequences found in header')
def read(session, f):
    """Parse FASTA-formatted text from *f* into a list of Sequences.

    A line starting with '>' begins a new sequence named by the rest of the
    line.  Following lines are appended to that sequence until a blank line
    or the next '>' line.  Lines outside any sequence are ignored.

    Returns (sequences, {}, {}).

    Raises FormatSyntaxError if a header has no sequence data, including
    the last header in the file.
    """
    from chimerax.atomic import Sequence
    from ..parse import FormatSyntaxError, make_readable

    def check_not_empty():
        # a header must be followed by at least one sequence character
        if sequences and len(sequences[-1]) == 0:
            raise FormatSyntaxError("No sequence found for %s" % sequences[-1].name)

    in_sequence = False
    sequences = []
    for line in f.readlines():
        if line.startswith('>'):
            # a header both ends the previous sequence and starts a new one
            check_not_empty()
            in_sequence = True
            sequences.append(Sequence(name=make_readable(line[1:])))
        elif in_sequence:
            if not line or line.isspace():
                # a blank line terminates the current sequence
                in_sequence = False
            else:
                sequences[-1].extend(line.strip())
    # previously only checked when another header followed, so a trailing
    # header without any sequence slipped through
    check_not_empty()
    return sequences, {}, {}
def read(session, f):
    """Parse a Stockholm-format alignment from *f*.

    The first line must start with '# STOCKHOLM'.  '#=GF' and '#=GS' lines
    become file and per-sequence attributes, '#=GC' and '#=GR' lines become
    column and per-residue markups, other '#' lines are collected under the
    'comments' file attribute, '//' lines are skipped, and every remaining
    non-blank line is 'name sequence-block'.  Blocks for the same name are
    concatenated.

    Per-sequence annotations given under a name that lacks the trailing
    '/start-end' of the real sequence name are merged into that sequence's
    annotations.  Per-residue markups whose length differs from their
    sequence are dropped with a warning.

    Parameters
    ----------
    session : provides the logger used for warnings and info messages
    f : readable text stream; it is closed before returning or raising

    Returns
    -------
    (sequences, file_attrs, file_markups), sequences being in order of
    first appearance.

    Raises
    ------
    FormatSyntaxError for a missing header, malformed markup or sequence
    lines, annotations for a non-existent sequence, or a column annotation
    of the wrong length.
    """
    file_attrs = {}
    file_markups = {}
    seq_attrs = {}
    seq_markups = {}
    sequences = {}
    seq_sequence = []

    def try_split(markup, markup_type, num_split, line_num):
        # split markup text into num_split + 1 fields; the last may be empty
        fields = markup.split(None, num_split)
        if len(fields) == num_split:
            # value is empty
            fields.append("")
        if len(fields) != num_split + 1:
            raise FormatSyntaxError(
                "Not enough arguments after #=%s markup on line %d"
                % (markup_type, line_num))
        return fields

    try:
        for line_num, line in enumerate(f.readlines(), start=1):
            line = line.rstrip()  # drop trailing newline/whitespace
            if line_num == 1:
                if line.startswith("# STOCKHOLM"):
                    continue
                raise FormatSyntaxError("File does not start with '# STOCKHOLM'")
            if not line:
                continue
            if line.startswith('#='):
                markup_type = line[2:4]
                markup = line[5:].strip()
                if markup_type == "GF":
                    tag, val = try_split(markup, markup_type, 1, line_num)
                    tag = tag.replace("_", " ")
                    tag = generic_file_attrs.get(tag, "Stockholm " + tag)
                    if tag in file_attrs:
                        file_attrs[tag] += '\n' + val
                    else:
                        file_attrs[tag] = val
                elif markup_type == "GS":
                    seq_name, tag, val = try_split(markup, markup_type, 2, line_num)
                    tag = tag.replace("_", " ")
                    attrs = seq_attrs.setdefault(seq_name, {})
                    tag = generic_seq_attrs.get(tag, "Stockholm " + tag)
                    if tag in attrs:
                        attrs[tag] += '\n' + val
                    else:
                        attrs[tag] = val
                elif markup_type == "GC":
                    tag, val = try_split(markup, markup_type, 1, line_num)
                    tag = tag.replace("_", " ")
                    file_markups[tag] = file_markups.get(tag, "") + val
                elif markup_type == "GR":
                    seq_name, tag, val = try_split(markup, markup_type, 2, line_num)
                    tag = tag.replace("_", " ")
                    markups = seq_markups.setdefault(seq_name, {})
                    markups[tag] = markups.get(tag, "") + val
                # ignore other types
                continue
            if line.startswith('#'):
                # unstructured comment
                if 'comments' in file_attrs:
                    file_attrs['comments'] += "\n" + line[1:]
                else:
                    file_attrs['comments'] = line[1:]
                continue
            if line.strip() == "//":
                # end of sequence alignment blocks, but comments may follow
                # this, so keep going...
                continue
            # sequence info...
            try:
                seq_name, block = line.split(None, 1)
            except ValueError:
                raise FormatSyntaxError(
                    "Sequence info not in name/contents format on line %d" % line_num)
            if seq_name not in sequences:
                sequences[seq_name] = Sequence(name=make_readable(seq_name))
                seq_sequence.append(seq_name)
            sequences[seq_name].extend(block)
    finally:
        f.close()

    # Re-home annotations given under a shortened name before attaching
    # anything to the sequences, so the merged values are both attached and
    # length-checked below.
    for seq_info, label in [(seq_attrs, "sequence"), (seq_markups, "residue")]:
        # iterate over a snapshot: entries are removed/added inside the loop
        for seq_name in list(seq_info):
            if seq_name in sequences:
                continue
            # might be sequence name if trailing '/start-end' is removed...
            name_len = len(seq_name)
            for full_name in sequences:
                if full_name.startswith(seq_name) \
                and full_name[name_len:name_len + 1] == '/' \
                and '/' not in full_name[name_len + 1:]:
                    break
            else:
                raise FormatSyntaxError(
                    "%s annotations provided for non-existent sequence %s"
                    % (label.capitalize(), seq_name))
            session.logger.info(
                "Updating %s %s annotations with %s annotations"
                % (full_name, label, seq_name))
            # the full name may not have any annotations of its own yet
            seq_info.setdefault(full_name, {}).update(seq_info.pop(seq_name))

    for seq_name, seq in sequences.items():
        if seq_name in seq_attrs:
            seq.attrs = seq_attrs[seq_name]
        if seq_name in seq_markups:
            seq.markups = seq_markups[seq_name]
            # snapshot the items: bad markups are deleted while looping
            for tag, markup in list(seq.markups.items()):
                if len(markup) != len(seq):
                    session.logger.warning(
                        "Markup %s for sequence %s is wrong length; ignoring"
                        % (tag, seq_name))
                    del seq.markups[tag]

    # column annotations must span the whole alignment (length 0 if there
    # are no sequences at all)
    aln_len = len(sequences[seq_sequence[0]]) if seq_sequence else 0
    for tag, markup in file_markups.items():
        if len(markup) != aln_len:
            raise FormatSyntaxError("Column annotation %s is wrong length" % tag)
    return [sequences[name] for name in seq_sequence], file_attrs, file_markups
def read(session, f):
    """Parse a GCG RSF (rich sequence format) stream.

    The first line must start with '!!RICH_SEQUENCE'.  Lines up to a '..'
    line are collected under the 'comments' file attribute.  Each sequence
    is a '{' ... '}' record holding attribute lines (stored under keys
    prefixed with 'RSF '), optional 'feature' sections, and a 'sequence'
    line followed by the sequence text.  Shorter sequences are padded with
    '.' so that all sequences have the same length.

    Parameters
    ----------
    session : provides the logger used for the missing-offset warning
    f : readable text stream; it is closed before returning or raising

    Returns
    -------
    (sequences, file_attrs, {})

    Raises
    ------
    FormatSyntaxError for a bad first line, unexpected text between
    records, a record without a name, bad continuation-line indentation, or
    a file that ends inside the header or inside a record.
    """
    IN_HEADER, START_ATTRS, IN_ATTRS, IN_FEATURES, IN_SEQ = range(5)
    state = IN_HEADER
    sequences = []
    has_offset = False
    longest = None
    file_attrs = {}
    try:
        for line_num, line in enumerate(f.readlines(), start=1):
            line = line.rstrip()  # remove trailing whitespace/newline
            if line_num == 1:
                if line.startswith("!!RICH_SEQUENCE"):
                    continue
                raise FormatSyntaxError("First line does not start with !!RICH_SEQUENCE")
            if state == IN_HEADER:
                if line.strip() == "..":
                    state = START_ATTRS
                    continue
                if "comments" in file_attrs:
                    file_attrs["comments"] += "\n" + line
                else:
                    file_attrs["comments"] = line
                continue
            if not line.strip():
                continue
            if state == START_ATTRS:
                if line.strip() == "{":
                    state = IN_ATTRS
                    cur_attr = None
                    attrs = {}
                else:
                    raise FormatSyntaxError(
                        "Unexpected text before start of sequence on line %d" % line_num)
                continue
            if state == IN_ATTRS or state == IN_FEATURES:
                if line.strip() == "sequence" and line[0] == "s":
                    # unindented 'sequence' keyword: sequence text follows
                    if "RSF name" not in attrs:
                        raise FormatSyntaxError(
                            "Sequence on line %d has no name" % line_num)
                    state = IN_SEQ
                    seq = Sequence(name=make_readable(attrs["RSF name"]))
                    del attrs["RSF name"]
                    seq.attrs = attrs
                    if "RSF descrip" in attrs:
                        attrs["description"] = attrs["RSF descrip"]
                        del attrs["RSF descrip"]
                    sequences.append(seq)
                    if "RSF offset" in attrs:
                        # an offset is expressed as leading gap characters
                        seq.extend("." * int(attrs["RSF offset"]))
                        has_offset = True
                        del attrs["RSF offset"]
                    continue
                if line.startswith("feature"):
                    if state == IN_ATTRS:
                        attrs["RSF features"] = [[line[8:]]]
                    else:
                        attrs["RSF features"].append([line[8:]])
                    state = IN_FEATURES
                    continue
            if state == IN_ATTRS:
                if line[0].isspace():
                    # continuation
                    if not cur_attr:
                        raise FormatSyntaxError("Bogus indentation at line %d" % line_num)
                    if attrs[cur_attr]:
                        attrs[cur_attr] += "\n" + line
                    else:
                        attrs[cur_attr] = line
                    continue
                if " " in line.strip():
                    cur_attr, val = line.split(None, 1)
                    # same underscore handling as the value-less form below
                    cur_attr = "RSF " + cur_attr.replace("_", " ")
                    attrs[cur_attr] = val.strip()
                else:
                    cur_attr = "RSF " + line.strip().replace("_", " ")
                    attrs[cur_attr] = ""
                continue
            if state == IN_FEATURES:
                attrs["RSF features"][-1].append(line)
                continue
            # state is IN_SEQ from here on
            if line.strip() == "}":
                state = START_ATTRS
                if not longest:
                    longest = len(seq)
                elif len(seq) < longest:
                    seq.extend("." * (longest - len(seq)))
                elif len(seq) > longest:
                    # new longest sequence: pad all the earlier ones
                    longest = len(seq)
                    for s in sequences[:-1]:
                        s.extend("." * (longest - len(s)))
                continue
            seq.extend(line.strip())
            if not seq[0].isalpha():
                has_offset = True
    finally:
        f.close()
    if state == IN_HEADER:
        raise FormatSyntaxError("No end to header (i.e. '..' line) found")
    if state == IN_ATTRS or state == IN_FEATURES:
        # the record may have ended before any name line was seen
        raise FormatSyntaxError("No sequence data found for sequence %s"
            % attrs.get("RSF name", "(unnamed)"))
    if state == IN_SEQ:
        # "RSF name" has been removed from attrs by now; use the Sequence
        raise FormatSyntaxError("No terminating brace for sequence %s" % seq.name)
    if not has_offset:
        session.logger.warning("No offset fields in RSF file; assuming zero offset")
    return sequences, file_attrs, {}
def read(session, f):
    """Parse the PROTEINS and ALIGNMENTS sections of an HSSP file.

    The first line must start with 'hssp' (any case).  '## <SECTION>' lines
    switch sections.  In PROTEINS, each line starting with a digit defines a
    sequence named by its third whitespace-separated field.  In ALIGNMENTS,
    the header '## ALIGNMENTS begin - end' says which sequences the block
    covers, the 'SeqNo' line fixes the starting column (position of its
    first '.'), and every following line contributes one character to each
    covered sequence (short lines are padded with spaces).

    Parameters
    ----------
    session : unused here; kept for the common reader signature
    f : readable text stream; it is closed before returning or raising

    Returns
    -------
    (sequences, {}, {})

    Raises
    ------
    FormatSyntaxError on a missing header, an empty PROTEINS section, or
    malformed PROTEINS/ALIGNMENTS lines.
    """
    doing = None
    sequences = []
    header_ok = False
    align_start_index = None
    try:
        for line_num, line in enumerate(f.readlines(), start=1):
            if doing == 'alignments':
                # don't strip() alignment section since it has significant
                # leading spaces
                line = line.rstrip()
            else:
                line = line.strip()
            if not header_ok:
                if line.lower().startswith("hssp"):
                    header_ok = True
                    continue
                raise FormatSyntaxError("No initial HSSP header line")
            if line.startswith('##'):
                if doing == 'proteins' and not sequences:
                    raise FormatSyntaxError("No entries in PROTEINS section")
                try:
                    doing = line.split()[1].lower()
                except IndexError:
                    doing = None
                if doing == 'alignments':
                    try:
                        hashes, alignments, begin, dash, end = line.strip().split()
                        begin = int(begin)
                        end = int(end)
                    except ValueError:
                        raise FormatSyntaxError(
                            "ALIGNMENTS line (line #%d) not of the form: "
                            "## ALIGNMENTS (number) - (number)" % line_num)
                continue
            if doing == 'proteins':
                # blank lines and column headings are not entries
                if not line or not line[0].isdigit():
                    continue
                try:
                    seq_name = line.split()[2]
                except IndexError:
                    raise FormatSyntaxError(
                        "Line %d in PROTEINS section does not start with "
                        "[integer] : [sequence name]" % line_num)
                sequences.append(Sequence(name=make_readable(seq_name)))
            elif doing == 'alignments':
                if line.lstrip().lower().startswith('seqno'):
                    try:
                        align_start_index = line.index('.')
                    except ValueError:
                        raise FormatSyntaxError(
                            "No indication of alignment starting column "
                            "('.' character) in SeqNo line in ALIGNMENTS section")
                    continue
                if align_start_index is None:
                    raise FormatSyntaxError("No initial SeqNo line in ALIGNMENTS section")
                block = line[align_start_index:]
                if not block:
                    raise FormatSyntaxError(
                        "No alignment block given on line %d" % line_num)
                block_len = end - begin + 1
                if len(block) > block_len:
                    raise FormatSyntaxError(
                        "Too many characters (%d, only %d sequences) in "
                        "alignment block given on line %d"
                        % (len(block), block_len, line_num))
                # trailing spaces were stripped; restore them so every
                # covered sequence gets a character
                block = block + ' ' * (block_len - len(block))
                for seq, c in zip(sequences[begin - 1:end], block):
                    seq.append(c)
    finally:
        f.close()
    return sequences, {}, {}
def align(session, ref, match, matrix_name, algorithm, gap_open, gap_extend, dssp_cache,
          ss_matrix=defaults["ss_scores"], ss_fraction=defaults["ss_mixture"],
          gap_open_helix=defaults["helix_open"], gap_open_strand=defaults["strand_open"],
          gap_open_other=defaults["other_open"], compute_ss=defaults["compute_ss"]):
    """Sequence-align two chains, optionally using secondary structure.

    Parameters
    ----------
    session : current session (logger, DSSP computation)
    ref, match : the reference and match chain sequences
    matrix_name : name of the similarity matrix to look up
    algorithm : "nw" (Needleman-Wunsch) or "sw" (Smith-Waterman)
    gap_open, gap_extend : gap penalties, given as positive values
    dssp_cache : dict, structure -> (ss_ids, ss_types).  Before secondary
        structure is recomputed for a structure, its current assignments
        are stored here.
    ss_matrix, ss_fraction : secondary-structure scoring matrix and its
        weight; an ss_fraction of None/False disables secondary-structure
        computation (and scoring, for "sw")
    gap_open_helix, gap_open_strand, gap_open_other : gap-opening penalties
        passed to the alignment routines
    compute_ss : whether to (re)compute secondary structure first

    Returns
    -------
    (score, gapped_ref, gapped_match)

    Raises
    ------
    ValueError for an unknown algorithm, or if the Smith-Waterman result is
    not a subsequence of the original sequence.
    """
    from chimerax import sim_matrices
    similarity_matrix = sim_matrices.matrix(matrix_name, session.logger)
    ssf = ss_fraction
    ssm = ss_matrix
    if ssf is not None and ssf is not False and compute_ss:
        need_compute = []
        for chain in (ref, match):
            if chain.structure in dssp_cache:
                continue
            for r in chain.residues:
                if r and len(r.atoms) > 1:
                    # not CA only
                    need_compute.append(chain.structure)
                    # store the current assignments before recomputing
                    dssp_cache[chain.structure] = (
                        chain.structure.residues.ss_ids,
                        chain.structure.residues.ss_types)
                    break
        if need_compute:
            # TODO
            # from chimera.initprefs import ksdsspPrefs, \
            #         KSDSSP_ENERGY, KSDSSP_HELIX_LENGTH, \
            #         KSDSSP_STRAND_LENGTH
            from chimerax.std_commands import dssp
            dssp.compute_ss(session, need_compute)

    if algorithm == "nw":
        from chimerax.alignment_algs import NeedlemanWunsch
        score, seqs = NeedlemanWunsch.nw(ref, match,
            score_gap=-gap_extend, score_gap_open=0 - gap_open,
            similarity_matrix=similarity_matrix, return_seqs=True,
            ss_matrix=ss_matrix, ss_fraction=ss_fraction,
            gap_open_helix=-gap_open_helix,
            gap_open_strand=-gap_open_strand,
            gap_open_other=-gap_open_other)
        gapped_ref, gapped_match = seqs
    elif algorithm == "sw":
        def ss_let(r):
            # one-letter secondary-structure code; blank = no residue
            if not r:
                return ' '
            if r.is_helix:
                return 'H'
            elif r.is_strand:
                return 'S'
            return 'O'
        if ssf is False or ssf is None:
            ssf = 0.0
            ssm = None
        if ssm:
            # account for missing structure (blank SS letter)
            ssm = ssm.copy()
            for let in "HSO ":
                ssm[(let, ' ')] = 0.0
                ssm[(' ', let)] = 0.0
        from chimerax.alignment_algs import SmithWaterman
        score, alignment = SmithWaterman.align(ref.characters, match.characters,
            similarity_matrix, float(gap_open), float(gap_extend),
            gap_char=".", ss_matrix=ssm, ss_fraction=ssf,
            gap_open_helix=float(gap_open_helix),
            gap_open_strand=float(gap_open_strand),
            gap_open_other=float(gap_open_other),
            ss1="".join([ss_let(r) for r in ref.residues]),
            ss2="".join([ss_let(r) for r in match.residues]))
        from chimerax.atomic import StructureSeq, Sequence
        gapped_ref = StructureSeq(structure=ref.structure, chain_id=ref.chain_id)
        gapped_ref.name = ref.structure.name
        gapped_match = StructureSeq(structure=match.structure, chain_id=match.chain_id)
        gapped_match.name = match.structure.name
        # Smith-Waterman may not be entirety of sequences...
        for orig, gapped, sw in [
                (ref, gapped_ref, Sequence(characters=alignment[0])),
                (match, gapped_match, Sequence(characters=alignment[1]))]:
            ungapped = sw.ungapped()
            # locate the aligned stretch within the original sequence
            for i in range(len(orig) - len(ungapped) + 1):
                if ungapped == orig[i:i + len(ungapped)]:
                    break
            else:
                raise ValueError("Smith-Waterman result not"
                    " a subsequence of original sequence")
            gapped.bulk_set(orig.residues[i:i + len(ungapped)], sw.characters)
    else:
        raise ValueError("Unknown sequence alignment algorithm: %s" % algorithm)

    # If the structures are disjoint snippets of the same longer SEQRES,
    # they may be able to be structurally aligned but the SEQRES records
    # will keep them apart.  Try to detect this situation and work around
    # by snipping off sequence ends.
    sr_disjoint = False
    if ref.from_seqres and match.from_seqres:
        # count aligned columns where both chains have an actual residue
        struct_match = 0
        for i in range(len(gapped_ref)):
            uri = gapped_ref.gapped_to_ungapped(i)
            if uri is None:
                continue
            umi = gapped_match.gapped_to_ungapped(i)
            if umi is None:
                continue
            if gapped_ref.residues[uri] and gapped_match.residues[umi]:
                struct_match += 1
                if struct_match >= 3:
                    break
        if struct_match < 3:
            # count aligned columns where both sequences have a letter
            seq_match = 0
            for s1, s2 in zip(gapped_ref[:], gapped_match[:]):
                if s1.isalpha() and s2.isalpha():
                    seq_match += 1
                    if seq_match > 3:
                        break
            if seq_match > 3:
                need = 3 - struct_match
                # Both chains must lack structure for 'need' residues at one
                # end.  The slices hold at most 'need' entries, so the count
                # has to be compared to 'need' (comparing to 3 could only
                # succeed when need == 3).
                if (ref.residues[:need].count(None) == need
                or ref.residues[-need:].count(None) == need) \
                and (match.residues[:need].count(None) == need
                or match.residues[-need:].count(None) == need):
                    sr_disjoint = True
    if sr_disjoint:
        from copy import copy
        clipped_ref = copy(ref)
        clipped_match = copy(match)
        for seq in (clipped_ref, clipped_match):
            # trim leading positions that have no residue
            num_none = 0
            for r in seq.residues:
                if r:
                    break
                num_none += 1
            if num_none:
                seq.bulk_set(seq.residues[num_none:], seq[num_none:])

            # trim trailing positions that have no residue
            num_none = 0
            for r in reversed(seq.residues):
                if r:
                    break
                num_none += 1
            if num_none:
                seq.bulk_set(seq.residues[:-num_none], seq[:-num_none])
        # secondary structure has already been handled above
        return align(session, clipped_ref, clipped_match, matrix_name, algorithm,
            gap_open, gap_extend, dssp_cache, ss_matrix=ss_matrix,
            ss_fraction=ss_fraction, gap_open_helix=gap_open_helix,
            gap_open_strand=gap_open_strand, gap_open_other=gap_open_other,
            compute_ss=False)
    for orig, aligned in [(ref, gapped_ref), (match, gapped_match)]:
        if hasattr(orig, '_dm_rebuild_info'):
            aligned._dm_rebuild_info = orig._dm_rebuild_info
            _dm_cleanup.append(aligned)
    return score, gapped_ref, gapped_match
def model(session, targets, *, block=True, multichain=True, custom_script=None,
    dist_restraints=None, executable_location=None, fast=False, het_preserve=False,
    hydrogens=False, license_key=None, num_models=5, show_gui=True, temp_path=None,
    thorough_opt=False, water_preserve=False):
    """
    Generate comparative models for the target sequences.

    Builds the PIR alignment, name list, configuration and template PDB
    files that Modeller needs in a temporary folder and hands them to the
    Modeller web service job (local execution is not implemented yet and
    raises LimitationError).

    Arguments:
    session
        current session
    targets
        list of (alignment, sequence) tuples.  Each sequence will be modelled.
    block
        If True, wait for modelling job to finish before returning and return list of
        (opened) models.  Otherwise return immediately.  Also see 'show_gui' option.
    multichain
        If True, the associated chains of each structure are used individually to generate
        chains in the resulting models (i.e. the models will be multimers).  If False, all
        associated chains are used together as templates to generate a single-chain model
        for the target sequence.
    custom_script
        If provided, the location of a custom Modeller script to use instead of the
        one we would otherwise generate.  Only used when executing locally.
    dist_restraints
        If provided, the location of a file containing additional distance restraints.
        Only supported when executing locally.
    executable_location
        If provided, the path to the locally installed Modeller executable.  If not
        provided, use the web service.
    fast
        Whether to use fast but crude generation of models
    het_preserve
        Whether to preserve HET atoms in generated models
    hydrogens
        Whether to generate models with hydrogen atoms
    license_key
        Modeller license key.  If not provided, try to use settings to find one.
    num_models
        Number of models to generate for each template sequence
    show_gui
        If True, show user interface for Modeller results (if ChimeraX is in gui mode).
    temp_path
        If provided, folder to use for temporary files
    thorough_opt
        Whether to perform thorough optimization
    water_preserve
        Whether to preserve water in generated models

    Returns whatever the job runner's run(block=block) returns.

    Raises UserError if an alignment has no associated chains (multichain
    case), and LimitationError for multi-character chain IDs, multiple
    targets without 'multichain', local-only options used with the web
    service, or a request for local execution.
    """

    from chimerax.core.errors import LimitationError, UserError
    from .common import modeller_copy
    if multichain:
        # So, first find structure with most associated chains and least non-associated chains.
        # That structure is used as the multimer template.  Chains from other structures are used
        # as "standalone" templates -- each such chain will be on its own line.  Need to allow
        # space on the left and right of the target sequence so that the largest chains can be
        # accommodated.

        # Find the structure we will use as the multimer template
        by_structure = {}
        chain_info = {}
        for alignment, orig_target in targets:
            # Copy the target sequence, changing name to conform to Modeller limitations
            target = modeller_copy(orig_target)
            if not alignment.associations:
                raise UserError("Alignment %s has no associated chains" % alignment.ident)
            for chain, aseq in alignment.associations.items():
                if len(chain.chain_id) > 1:
                    raise LimitationError(
                        "Modeller cannot handle templates with multi-character chain IDs"
                    )
                by_structure.setdefault(chain.structure, []).append(chain)
                chain_info[chain] = (aseq, target)
        # most associated chains wins; ties go to the structure with fewer
        # unassociated chains
        max_matched = min_unmatched = None
        for s, match_info in by_structure.items():
            matched = len(match_info)
            unmatched = s.num_chains - len(match_info)
            if max_matched is None or matched > max_matched or (
                    matched == max_matched and (unmatched < min_unmatched)):
                multimer_template = s
                max_matched = matched
                min_unmatched = unmatched
        # one entry per chain of the multimer template; the target is None
        # for chains that have no association
        mm_targets = []
        mm_chains = []
        match_chains = []
        for chain in multimer_template.chains:
            mm_chains.append(chain)
            try:
                aseq, target = chain_info[chain]
            except KeyError:
                mm_targets.append(None)
            else:
                mm_targets.append(target)
                match_chains.append(chain)
        # okay, now form single-chain lines for the other structure associations, that eventually will
        # be handled column by column in exactly the same way as the non-multichain method.
        single_template_lines = []
        for chain, info in chain_info.items():
            if chain.structure == multimer_template:
                continue
            aseq, target = info
            for i, mm_target in enumerate(mm_targets):
                if mm_target != target:
                    continue
                # the chain goes in the column(s) of its target; every
                # other column stays empty
                template_line = [None] * len(mm_targets)
                template_line[i] = chain
                single_template_lines.append(template_line)
        # AFAIK, the multimer template chain sequences need to have complete PDB sequence, so may need
        # to prefix and suffix the corresponding alignment sequence with characters for residues
        # outside of the alignment sequence.  For other templates/targets, affix a corresponding number
        # of '-' characters
        prefixes, suffixes = find_affixes(mm_chains, chain_info)
        target_strings = []
        for prefix, suffix, mm_target in zip(prefixes, suffixes, mm_targets):
            if mm_target is None:
                target_strings.append('-')
                continue
            target_strings.append('-' * len(prefix) + mm_target.characters +
                                  '-' * len(suffix))
        templates_strings = []
        templates_info = []
        mm_template_strings = []
        for prefix, suffix, chain in zip(prefixes, suffixes, mm_chains):
            try:
                aseq, target = chain_info[chain]
            except KeyError:
                mm_template_strings.append('-')
                continue
            mm_template_strings.append(
                prefix + regularized_seq(aseq, chain).characters + suffix)
        # a templates_info entry of None marks the multimer template line
        templates_strings.append(mm_template_strings)
        templates_info.append(None)
        for template_line in single_template_lines:
            template_strings = []
            for prefix, suffix, chain, target in zip(prefixes, suffixes, template_line,
                                                     mm_targets):
                if target is None:
                    template_strings.append('-')
                elif chain is None:
                    template_strings.append(
                        '-' * (len(prefix) + len(target) + len(suffix)))
                else:
                    aseq, target = chain_info[chain]
                    template_strings.append(
                        '-' * len(prefix) + regularized_seq(aseq, chain).characters +
                        '-' * len(suffix))
                    templates_info.append((chain, aseq.match_maps[chain]))
            templates_strings.append(template_strings)
        target_name = "target" if len(targets) > 1 else target.name
    else:
        if len(targets) > 1:
            raise LimitationError(
                "Cannot have multiple targets(/alignments) unless creating multimeric model"
            )
        alignment, orig_target = targets[0]
        # Copy the target sequence, changing name to conform to Modeller limitations
        target = modeller_copy(orig_target)
        target_strings = [target.characters]
        templates_strings = []
        templates_info = []
        match_chains = []
        for chain, aseq in alignment.associations.items():
            if len(chain.chain_id) > 1:
                raise LimitationError(
                    "Modeller cannot handle templates with multi-character chain IDs"
                )
            templates_strings.append([regularized_seq(aseq, chain).characters])
            templates_info.append((chain, aseq.match_maps[chain]))
            # only the first associated chain is kept as a match chain
            if not match_chains:
                match_chains.append(chain)
        target_name = target.name

    from .common import write_modeller_scripts, get_license_key
    script_path, config_path, temp_dir = write_modeller_scripts(
        get_license_key(session, license_key), num_models, het_preserve, water_preserve,
        hydrogens, fast, None, custom_script, temp_path, thorough_opt, dist_restraints)

    # (name for the job, file type, local path) for each input file
    input_file_map = []

    # form the sequences to be written out as a PIR
    from chimerax.atomic import Sequence
    pir_target = Sequence(name=target_name)
    pir_target.description = "sequence:%s:.:.:.:.::::" % pir_target.name
    # chains are separated by '/' in the PIR sequence
    pir_target.characters = '/'.join(target_strings)
    pir_seqs = [pir_target]

    structures_to_save = set()
    for strings, info in zip(templates_strings, templates_info):
        if info is None:
            # multimer template
            pir_template = Sequence(
                name=structure_save_name(multimer_template))
            pir_template.description = "structure:%s:FIRST:%s::::::" % (
                pir_template.name, multimer_template.chains[0].chain_id)
            structures_to_save.add(multimer_template)
        else:
            # single-chain template
            chain, match_map = info
            # find the first alignment position that has an associated residue
            first_assoc_pos = 0
            while first_assoc_pos not in match_map:
                first_assoc_pos += 1
            first_assoc_res = match_map[first_assoc_pos]
            pir_template = Sequence(name=chain_save_name(chain))
            pir_template.description = "structure:%s:%d%s:%s:+%d:%s::::" % (
                structure_save_name(chain.structure), first_assoc_res.number,
                first_assoc_res.insertion_code, chain.chain_id, len(match_map),
                chain.chain_id)
            structures_to_save.add(chain.structure)
        pir_template.characters = '/'.join(strings)
        pir_seqs.append(pir_template)
    import os.path
    pir_file = os.path.join(temp_dir.name, "alignment.ali")
    # the alignment exists only long enough to be written out in PIR format
    aln = session.alignments.new_alignment(pir_seqs, False, auto_associate=False,
                                           create_headers=False)
    aln.save(pir_file, format_name="pir")
    session.alignments.destroy_alignment(aln)
    input_file_map.append(("alignment.ali", "text_file", pir_file))

    # write the namelist.dat file, target seq name on first line, templates on remaining lines
    name_file = os.path.join(temp_dir.name, "namelist.dat")
    input_file_map.append(("namelist.dat", "text_file", name_file))
    with open(name_file, 'w') as f:
        for template_seq in pir_seqs:
            print(template_seq.name, file=f)

    config_name = os.path.basename(config_path)
    input_file_map.append((config_name, "text_file", config_path))

    # save structure files
    import os
    struct_dir = os.path.join(temp_dir.name, "template_struc")
    if not os.path.exists(struct_dir):
        try:
            os.mkdir(struct_dir, mode=0o755)
        except FileExistsError:
            pass
    from chimerax.pdb import save_pdb, standard_polymeric_res_names as std_res_names
    for structure in structures_to_save:
        base_name = structure_save_name(structure) + '.pdb'
        pdb_file_name = os.path.join(struct_dir, base_name)
        input_file_map.append((base_name, "text_file", pdb_file_name))
        # NOTE(review): 'in_seq_hets' is not set anywhere in this function;
        # presumably regularized_seq() attaches it to the structure -- confirm
        ATOM_res_names = structure.in_seq_hets
        ATOM_res_names.update(std_res_names)
        save_pdb(session, pdb_file_name, models=[structure],
                 polymeric_res_names=ATOM_res_names)
        delattr(structure, 'in_seq_hets')

    from chimerax.atomic import Chains
    match_chains = Chains(match_chains)
    if executable_location is None:
        # web service: reject (or warn about) the local-only options
        if custom_script is not None:
            raise LimitationError(
                "Custom Modeller scripts only supported when executing locally"
            )
        if dist_restraints is not None:
            raise LimitationError(
                "Distance restraints only supported when executing locally")
        if thorough_opt:
            session.logger.warning(
                "Thorough optimization only supported when executing locally")
        job_runner = ModellerWebService(session, match_chains, num_models,
                                        pir_target.name, input_file_map, config_name,
                                        targets, show_gui)
    else:
        #TODO: job_runner = ModellerLocal(...)
        from chimerax.core.errors import LimitationError
        raise LimitationError("Local Modeller execution not yet implemented")
    # a custom script [only used when executing locally] needs to be copied into the tmp dir...
    if os.path.exists(script_path) \
    and os.path.normpath(temp_dir.name) != os.path.normpath(os.path.dirname(script_path)):
        import shutil
        shutil.copy(script_path, temp_dir.name)

    return job_runner.run(block=block)