def save(session, alignment, stream):
    print("CLUSTAL W ALN saved from UCSF ChimeraX", file=stream)
    print("", file=stream)
    max_name = max([len(seq.name) for seq in alignment.seqs])
    name_format = "%%-%ds" % (max_name+5)
    from chimerax.atomic import Sequence
    aln_len = len(alignment.seqs[0])
    for start in range(0, aln_len, LINELEN):
        end = min(aln_len, start + LINELEN)
        for seq in alignment.seqs:
            name = seq.name.replace(' ', '_')
            temp_seq = Sequence()
            temp_seq.extend(seq[start:end])
            if len(temp_seq.ungapped()) == 0:
                print(name_format % name, seq[start:end], file=stream)
            else:
                temp_seq = Sequence()
                temp_seq.extend(seq[:end])
                print(name_format % name, seq[start:end], len(temp_seq.ungapped()), file=stream)
        from .. import clustal_strong_groups, clustal_weak_groups
        conservation = []
        for pos in range(start, end):
            # completely conserved?
            first = alignment.seqs[0][pos].upper()
            if first.isupper():
                for seq in alignment.seqs[1:]:
                    if seq[pos].upper() != first:
                        break
                else:
                    # conserved
                    conservation.append('*')
                    continue
            # "strongly"/"weakly" conserved?
            conserved = False
            for groups, character in [(clustal_strong_groups, ':'), (clustal_weak_groups, '.')]:
                for group in groups:
                    for seq in alignment.seqs:
                        if seq[pos].upper() not in group:
                            break
                    else:
                        # conserved
                        conserved = True
                        break
                if conserved:
                    conservation.append(character)
                    break
            if not conserved:
                # remainder
                conservation.append(' ')
        print(name_format % " ", "".join(conservation), file=stream)
        print("", file=stream)
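# For context, a minimal hedged sketch of how this saver is normally reached -- through an
# alignment's own save() method (an existing alignment object is assumed; the "aln" format
# name is an assumption, while the format_name keyword itself is the same one used for PIR
# output later in this code):
alignment.save("out.aln", format_name="aln")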
def regularized_seq(aseq, chain):
    mmap = aseq.match_maps[chain]
    from .common import modeller_copy
    rseq = modeller_copy(aseq)
    rseq.description = "structure:" + chain_save_name(chain)
    seq_chars = list(rseq.characters)
    from chimerax.atomic import Sequence
    from chimerax.pdb import standard_polymeric_res_names as std_res_names
    in_seq_hets = []
    num_res = 0
    for ungapped in range(len(aseq.ungapped())):
        gapped = aseq.ungapped_to_gapped(ungapped)
        if ungapped not in mmap:
            seq_chars[gapped] = '-'
        else:
            r = mmap[ungapped]
            num_res += 1
            if r.name not in std_res_names:
                in_seq_hets.append(r.name)
                seq_chars[gapped] = '.'
            else:
                seq_chars[gapped] = Sequence.rname3to1(mmap[ungapped].name)
    s = chain.structure
    het_set = getattr(s, 'in_seq_hets', set())
    # may want to preserve all-HET chains, so don't auto-exclude them
    if num_res != len(in_seq_hets):
        het_set.update(in_seq_hets)
    s.in_seq_hets = het_set
    rseq.characters = "".join(seq_chars)
    return rseq
def show_mav(self, ids):
    # Collect names and sequences of selected matches.
    # All sequences should have the same length because
    # they include gaps generated from BLAST alignment.
    ids.insert(0, 0)
    names = []
    seqs = []
    for sid in ids:
        name, seq = self._sequences[sid]
        names.append(name)
        seqs.append(seq)
    # Find columns that are gaps in all sequences and remove them.
    all_gaps = set()
    for i in range(len(seqs[0])):
        for seq in seqs:
            if seq[i].isalpha():
                break
        else:
            all_gaps.add(i)
    if all_gaps:
        for i in range(len(seqs)):
            seq = seqs[i]
            new_seq = ''.join([seq[n] for n in range(len(seq)) if n not in all_gaps])
            seqs[i] = new_seq
    # Generate multiple sequence alignment file
    # Ask sequence viewer to display alignment
    from chimerax.atomic import Sequence
    seqs = [Sequence(name=name, characters=seqs[i]) for i, name in enumerate(names)]
    name = "%s [%d]" % (self._instance_name, self._viewer_index)
    self.session.alignments.new_alignment(seqs, name)
def read(session, f):
    from chimerax.atomic import Sequence
    from ..parse import FormatSyntaxError, make_readable
    # skip header lines
    in_header = True
    line_num = 0
    sequences = []
    for line in f.readlines():
        line = line.strip()
        line_num += 1
        if not line:
            continue
        fields = line.split()
        if in_header:
            if len(fields[0]) == 2:
                continue
            if fields[0].startswith('#='):
                # some Pfam seed alignments have undocumented #=RF header
                continue
            in_header = False
        if len(fields) != 2:
            raise FormatSyntaxError(
                "Sequence line %d not of form 'seq-name seq-letters'" % line_num)
        seq = Sequence(name=make_readable(fields[0]))
        seq.extend(fields[1])
        sequences.append(seq)
    f.close()
    return sequences, {}, {}
def read(session, f):
    from chimerax.atomic import Sequence
    from ..parse import FormatSyntaxError, make_readable
    want = 'init'
    sequences = []
    for line in f.readlines():
        line = line.strip()
        if want == 'init':
            if len(line) < 4:
                continue
            if line[0] != '>' or line[3] != ';':
                continue
            sequences.append(Sequence(name=make_readable(line[4:])))
            pir_type = line[1:3]
            # P1/F1 are protein types; other PIR type codes are treated as nucleic
            if pir_type in ("P1", "F1"):
                sequences[-1].nucleic = False
            else:
                sequences[-1].nucleic = True
            sequences[-1].pir_type = pir_type
            want = 'description'
        elif want == 'description':
            sequences[-1].description = line
            sequences[-1].pir_description = line
            want = 'sequence'
        elif want == 'sequence':
            if not line:
                continue
            if line[-1] == '*':
                want = 'init'
                line = line[:-1]
            sequences[-1].extend("".join([c for c in line if not c.isspace()]))
    f.close()
    if want != 'init':
        raise FormatSyntaxError("Could not find end of sequence '%s'" % sequences[-1].name)
    return sequences, {}, {}
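# Illustrative PIR record matching the parser states above ('init' -> 'description' ->
# 'sequence'); the name, description and residues are made up:
sample_pir = """\
>P1;example_seq
sequence:example_seq:.:.:.:.::::
MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ*
"""
# A '>XX;' line starts a new entry (the two-letter type code "P1" marks a protein), the next
# line is kept as the description, and sequence characters accumulate until the closing '*'.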
def find_affixes(chains, chain_info):
    from chimerax.pdb import standard_polymeric_res_names as std_res_names
    in_seq_hets = []
    prefixes = []
    suffixes = []
    from chimerax.atomic import Sequence
    for chain in chains:
        try:
            aseq, target = chain_info[chain]
        except KeyError:
            prefixes.append('')
            suffixes.append('')
            continue
        match_map = aseq.match_maps[chain]
        prefix = ''
        for r in chain.existing_residues:
            if r in match_map:
                break
            if r.name not in std_res_names:
                in_seq_hets.append(r.name)
                prefix += '.'
            else:
                prefix += Sequence.rname3to1(r.name)
        prefixes.append(prefix)
        suffix = ''
        for r in reversed(chain.existing_residues):
            if r in match_map:
                break
            if r.name not in std_res_names:
                in_seq_hets.append(r.name)
                suffix = '.' + suffix
            else:
                suffix = Sequence.rname3to1(r.name) + suffix
        suffixes.append(suffix)
        s = chain.structure
        het_set = getattr(s, 'in_seq_hets', set())
        het_set.update(in_seq_hets)
        s.in_seq_hets = het_set
    return prefixes, suffixes
def seqalign_chain(session, chains):
    '''
    Show chain sequence(s)

    Parameters
    ----------
    chains : list of Chain
        Chains to show
    '''
    from chimerax.core.errors import UserError
    if len(chains) == 1:
        chain = chains[0]
        ident = ".".join([str(part) for part in chain.structure.id]) + "/" + chain.chain_id
        alignment = session.alignments.new_alignment([chain], ident, seq_viewer="sv",
            auto_associate=None, intrinsic=True)
    else:
        # all chains have to have the same sequence, and they will all be associated with
        # that sequence
        sequences = set([chain.characters for chain in chains])
        if len(sequences) != 1:
            raise UserError("Chains must have same sequence")
        chars = sequences.pop()
        chain_ids = set([chain.chain_id for chain in chains])
        if len(chain_ids) < len(chains) or len(chain_ids) > 10:
            name = "%d chains" % len(chains)
        else:
            name = "chains %s" % ",".join(sorted(list(chain_ids)))
        from chimerax.atomic import Sequence
        seq = Sequence(name=name, characters=chars)

        def get_numbering_start(chain):
            for i, r in enumerate(chain.residues):
                if r is None or r.deleted:
                    continue
                return r.number - i
            return None

        starts = set([get_numbering_start(chain) for chain in chains])
        starts.discard(None)
        if len(starts) == 1:
            seq.numbering_start = starts.pop()
        alignment = session.alignments.new_alignment([seq], None, seq_viewer="sv",
            auto_associate=False, name=chains[0].description, intrinsic=True)
        alignment.suspend_notify_observers()
        for chain in chains:
            alignment.associate(chain, keep_intrinsic=True)
        alignment.resume_notify_observers()
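# A minimal sketch of driving this from Python inside a running ChimeraX session (the session
# object and at least one open structure are assumed; the equivalent command form is
# "sequence chain #1/A"):
from chimerax.atomic import all_atomic_structures
chain = all_atomic_structures(session)[0].chains[0]
seqalign_chain(session, [chain])    # single-chain case: creates an intrinsic "sv" alignment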
def nw_assoc(session, align_seq, struct_seq):
    '''Wrapper around Needleman-Wunsch matching, to make it return the same kinds of values
       that try_assoc returns'''

    from chimerax.atomic import Sequence, SeqMatchMap
    sseq = struct_seq
    aseq = Sequence(name=align_seq.name, characters=align_seq.ungapped())
    aseq.circular = align_seq.circular
    from chimerax.alignment_algs.NeedlemanWunsch import nw
    score, match_list = nw(sseq, aseq)

    errors = 0
    # matched are in reverse order...
    try:
        m_end = match_list[0][0]
    except IndexError:
        m_end = -1
    if m_end < len(sseq) - 1:
        # trailing unmatched
        errors += len(sseq) - m_end - 1
    match_map = SeqMatchMap(align_seq, struct_seq)
    last_match = m_end + 1
    for s_index, a_index in match_list:
        if sseq[s_index] != aseq[a_index]:
            errors += 1
        if s_index < last_match - 1:
            # gap in structure sequence
            errors += last_match - s_index - 1
        res = sseq.residues[s_index]
        if res:
            match_map.match(res, a_index)
        last_match = s_index
    if last_match > 0:
        # beginning unmatched
        errors += last_match

    if len(sseq) > len(aseq):
        # unmatched residues forced, reduce errors by that amount...
        errors -= len(sseq) - len(aseq)

    return match_map, errors
def fetch_uniprot(session, ident, ignore_cache=False):
    'Fetch UniProt data'
    from chimerax.core.errors import UserError, CancelOperation
    try:
        accession = map_uniprot_ident(ident)
        seq_string, full_name, features = fetch_uniprot_accession_info(session, accession,
            ignore_cache=ignore_cache)
    except InvalidAccessionError as e:
        raise UserError(str(e))
    except CancelOperation:
        session.logger.status("Fetch of %s cancelled" % ident)
        return
    from chimerax.atomic import Sequence
    seq = Sequence(name=ident)
    seq.extend(seq_string)
    session.logger.status("Opening UniProt %s" % ident)
    session.alignments.new_alignment([seq], ident)
    return [], "Opened UniProt %s" % ident
def read(session, f):
    from chimerax.atomic import Sequence
    from ..parse import FormatSyntaxError, make_readable
    in_header = True
    sequences = []
    line_num = 0
    for line in f.readlines():
        line_num += 1
        if in_header:
            if line.startswith("CLUSTAL"):
                in_header = False
                first_block = True
            else:
                if line.strip() != "":
                    raise FormatSyntaxError(
                        "First non-blank line does not start with 'CLUSTAL'")
            continue
        if not line or line[0].isspace():
            if sequences:
                first_block = False
                expect = 0
            continue
        try:
            seq_name, seq_block, num_residues = line.split()
        except ValueError:
            try:
                seq_name, seq_block = line.strip().split()
            except ValueError:
                raise FormatSyntaxError("Line %d is not sequence name followed by sequence"
                    " contents and optional ungapped length" % line_num)
        if first_block:
            sequences.append(Sequence(name=make_readable(seq_name)))
            sequences[-1].append(seq_block)
            continue
        try:
            seq = sequences[expect]
        except IndexError:
            raise FormatSyntaxError(
                "Sequence on line %d not in initial sequence block" % line_num)
        expect += 1
        seq.append(seq_block)
    f.close()
    return sequences, {}, {}
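# Illustrative input accepted by this reader (names and residues are made up); the first
# non-blank line must start with "CLUSTAL", and later blocks would repeat the sequences in the
# same order.  The session argument is unused by this particular reader, so a sketch of calling
# it directly (inside this module, where its imports are available) could look like:
sample_aln = """\
CLUSTAL W ALN saved from UCSF ChimeraX

seq1     MKTAYIAKQR 10
seq2     MKTA-IAKQR 9
"""
from io import StringIO
seqs, file_attrs, file_markups = read(None, StringIO(sample_aln))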
def _read_sequences(self, f):
    from chimerax.atomic import Sequence
    self.sequence_list = []
    while 1:
        line = f.readline()
        if not line:
            raise FormatSyntaxError('no alignment separator')
        if line == '//\n' or line == '//\r\n':
            break
        m = MSF._Sum.match(line)
        if m is not None:
            name = m.group(1)
            length = m.group(2)
            check = m.group(3)
            weight = m.group(4)
            s = Sequence(name=make_readable(name))
            self.sequence_list.append(s)
            s.attrs = {}
            s.attrs['MSF length'] = length
            s.attrs['MSF check'] = check
            s.attrs['MSF weight'] = weight
    if not self.sequence_list:
        raise FormatSyntaxError('No sequences found in header')
def read(session, f):
    from chimerax.atomic import Sequence
    from ..parse import FormatSyntaxError, make_readable
    in_sequence = False
    sequences = []
    for line in f.readlines():
        if in_sequence:
            if not line or line.isspace():
                in_sequence = False
                continue
            if line[0] == '>':
                in_sequence = False
                # fall through
            else:
                sequences[-1].extend(line.strip())
        if not in_sequence:
            if line[0] == '>':
                if sequences and len(sequences[-1]) == 0:
                    raise FormatSyntaxError("No sequence found for %s" % sequences[-1].name)
                in_sequence = True
                sequences.append(Sequence(name=make_readable(line[1:])))
    return sequences, {}, {}
def read(session, f):
    from chimerax.atomic import Sequence
    from ..parse import FormatSyntaxError, make_readable
    line_num = 0
    file_attrs = {}
    file_markups = {}
    seq_attrs = {}
    seq_markups = {}
    sequences = {}
    seq_sequence = []
    for line in f.readlines():
        line = line.rstrip()    # drop trailing newline/whitespace
        line_num += 1
        if line_num == 1:
            if line.startswith("# STOCKHOLM"):
                continue
            raise FormatSyntaxError("File does not start with '# STOCKHOLM'")
        if not line:
            continue
        if line.startswith('#='):
            markup_type = line[2:4]
            markup = line[5:].strip()

            def try_split(num_split):
                fields = markup.split(None, num_split)
                if len(fields) == num_split:
                    # value is empty
                    fields.append("")
                if len(fields) != num_split + 1:
                    raise FormatSyntaxError("Not enough arguments after #=%s markup on line %d"
                        % (markup_type, line_num))
                return fields

            if markup_type == "GF":
                tag, val = try_split(1)
                tag = tag.replace("_", " ")
                tag = generic_file_attrs.get(tag, "Stockholm " + tag)
                if tag in file_attrs:
                    file_attrs[tag] += '\n' + val
                else:
                    file_attrs[tag] = val
            elif markup_type == "GS":
                seq_name, tag, val = try_split(2)
                tag = tag.replace("_", " ")
                attrs = seq_attrs.setdefault(seq_name, {})
                tag = generic_seq_attrs.get(tag, "Stockholm " + tag)
                if tag in attrs:
                    attrs[tag] += '\n' + val
                else:
                    attrs[tag] = val
            elif markup_type == "GC":
                tag, val = try_split(1)
                tag = tag.replace("_", " ")
                file_markups[tag] = file_markups.get(tag, "") + val
            elif markup_type == "GR":
                seq_name, tag, val = try_split(2)
                tag = tag.replace("_", " ")
                seq_markups.setdefault(seq_name, {}).setdefault(tag, "")
                seq_markups[seq_name][tag] += val
            # ignore other types
            continue
        elif line.startswith('#'):
            # unstructured comment
            if 'comments' in file_attrs:
                file_attrs['comments'] += "\n" + line[1:]
            else:
                file_attrs['comments'] = line[1:]
            continue
        elif line.strip() == "//":
            # end of sequence alignment blocks, but comments may follow this, so keep going...
            continue
        # sequence info...
        try:
            seq_name, block = line.split(None, 1)
        except ValueError:
            raise FormatSyntaxError(
                "Sequence info not in name/contents format on line %d" % line_num)
        if seq_name not in sequences:
            sequences[seq_name] = Sequence(name=make_readable(seq_name))
            seq_sequence.append(seq_name)
        sequences[seq_name].extend(block)
    f.close()

    for seq_name, seq in sequences.items():
        if seq_name in seq_attrs:
            seq.attrs = seq_attrs[seq_name]
        if seq_name in seq_markups:
            seq.markups = seq_markups[seq_name]
            # iterate over a copy since wrong-length markups are deleted as we go
            for tag, markup in list(seq.markups.items()):
                if len(markup) != len(seq):
                    session.logger.warning("Markup %s for sequence %s is wrong length; ignoring"
                        % (tag, seq_name))
                    del seq.markups[tag]
    for seq_info, label in [(seq_attrs, "sequence"), (seq_markups, "residue")]:
        for seq_name in list(seq_info.keys()):
            if seq_name in sequences:
                continue
            # might be sequence name if trailing '/start-end' is removed...
            for full_name in sequences.keys():
                if full_name.startswith(seq_name) \
                and full_name[len(seq_name)] == '/' \
                and '/' not in full_name[len(seq_name)+1:]:
                    break
            else:
                raise FormatSyntaxError("%s annotations provided for non-existent sequence %s"
                    % (label.capitalize(), seq_name))
            session.logger.info("Updating %s %s annotations with %s annotations"
                % (full_name, label, seq_name))
            seq_info[full_name].update(seq_info[seq_name])
            del seq_info[seq_name]
    for tag, markup in file_markups.items():
        if len(markup) != len(sequences[seq_sequence[0]]):
            raise FormatSyntaxError("Column annotation %s is wrong length" % tag)

    return [sequences[name] for name in seq_sequence], file_attrs, file_markups
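# Illustrative Stockholm snippet (contents made up).  "#=GF" lines land in file_attrs,
# "#=GS" lines in per-sequence seq_attrs, "#=GC" lines in file_markups, and "#=GR" lines in
# per-sequence seq_markups; unrecognized tags are stored under a "Stockholm <tag>" key with
# underscores turned into spaces:
sample_sto = """\
# STOCKHOLM 1.0
#=GF ID EXAMPLE
#=GS seq1 DE a made-up sequence
seq1 MKTAYIAKQR
#=GR seq1 SS HHHHHHHHHH
#=GC SS_cons HHHHHHHHHH
//
"""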
def read(session, f):
    from chimerax.atomic import Sequence
    from ..parse import FormatSyntaxError, make_readable
    IN_HEADER, START_ATTRS, IN_ATTRS, IN_FEATURES, IN_SEQ = range(5)

    state = IN_HEADER
    sequences = []
    line_num = 0
    has_offset = False
    longest = None
    file_attrs = {}
    for line in f.readlines():
        line = line.rstrip()    # remove trailing whitespace/newline
        line_num += 1
        if line_num == 1:
            if line.startswith("!!RICH_SEQUENCE"):
                continue
            raise FormatSyntaxError("First line does not start with !!RICH_SEQUENCE")
        if state == IN_HEADER:
            if line.strip() == "..":
                state = START_ATTRS
                continue
            if "comments" in file_attrs:
                file_attrs["comments"] += "\n" + line
            else:
                file_attrs["comments"] = line
            continue
        if not line.strip():
            continue
        if state == START_ATTRS:
            if line.strip() == "{":
                state = IN_ATTRS
                cur_attr = None
                attrs = {}
            elif line:
                raise FormatSyntaxError(
                    "Unexpected text before start of sequence on line %d" % line_num)
            continue
        if state == IN_ATTRS or state == IN_FEATURES:
            if line.strip() == "sequence" and line[0] == "s":
                if "RSF name" not in attrs:
                    raise FormatSyntaxError("Sequence on line %d has no name" % line_num)
                state = IN_SEQ
                seq = Sequence(name=make_readable(attrs["RSF name"]))
                del attrs["RSF name"]
                seq.attrs = attrs
                if "RSF descrip" in attrs:
                    attrs["description"] = attrs["RSF descrip"]
                    del attrs["RSF descrip"]
                sequences.append(seq)
                if "RSF offset" in attrs:
                    seq.extend("." * int(attrs["RSF offset"]))
                    has_offset = True
                    del attrs["RSF offset"]
                continue
            if line.startswith("feature"):
                if state == IN_ATTRS:
                    attrs["RSF features"] = [[line[8:]]]
                else:
                    attrs["RSF features"].append([line[8:]])
                state = IN_FEATURES
                continue
        if state == IN_ATTRS:
            if line[0].isspace():
                # continuation
                if not cur_attr:
                    raise FormatSyntaxError("Bogus indentation at line %d" % line_num)
                if attrs[cur_attr]:
                    attrs[cur_attr] += "\n" + line
                else:
                    attrs[cur_attr] = line
                continue
            if " " in line.strip():
                cur_attr, val = line.split(None, 1)
                cur_attr = "RSF " + cur_attr.replace("_", " ")
                attrs[cur_attr] = val.strip()
            else:
                cur_attr = "RSF " + line.strip().replace("_", " ")
                attrs[cur_attr] = ""
            continue
        if state == IN_FEATURES:
            attrs["RSF features"][-1].append(line)
            continue
        if line.strip() == "}":
            state = START_ATTRS
            if not longest:
                longest = len(seq)
            else:
                if len(seq) < longest:
                    seq.extend("." * (longest - len(seq)))
                elif len(seq) > longest:
                    longest = len(seq)
                    for s in sequences[:-1]:
                        s.extend("." * (longest - len(s)))
            continue
        seq.extend(line.strip())
        if not seq[0].isalpha():
            has_offset = True
    f.close()
    if state == IN_HEADER:
        raise FormatSyntaxError("No end to header (i.e. '..' line) found")
    if state == IN_ATTRS or state == IN_FEATURES:
        raise FormatSyntaxError("No sequence data found for sequence %s" % attrs["RSF name"])
    if state == IN_SEQ:
        raise FormatSyntaxError("No terminating brace for sequence %s" % attrs["RSF name"])
    if not has_offset:
        session.logger.warning("No offset fields in RSF file; assuming zero offset")
    return sequences, file_attrs, {}
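# Illustrative RSF record (contents made up) in the shape this parser expects: header text up
# to a ".." line, then one "{ ... }" block per sequence containing attribute lines, an optional
# "feature" section, and the residues after a line that begins with "sequence" in column 0:
sample_rsf = """\
!!RICH_SEQUENCE 1.0
example header comment
..
{
name  seq1
descrip  a made-up sequence
sequence
  MKTAYIAKQR
}
"""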
def _update_errors_gaps(self, aseq):
    if not self.settings.error_region_shown and not self.settings.gap_region_shown:
        return
    a_ref_seq = getattr(aseq, 'residue_sequence', aseq.ungapped())
    errors = [0] * len(a_ref_seq)
    gaps = [0] * len(a_ref_seq)
    from chimerax.atomic import Sequence
    for chain, match_map in aseq.match_maps.items():
        for i, char in enumerate(a_ref_seq):
            try:
                res = match_map[i]
            except KeyError:
                gaps[i] += 1
            else:
                if Sequence.rname3to1(res.name) != char.upper():
                    errors[i] += 1
    partial_error_blocks, full_error_blocks = [], []
    partial_gap_blocks, full_gap_blocks = [], []
    num_assocs = len(aseq.match_maps)
    if num_assocs > 0:
        for partial, full, check in [
                (partial_error_blocks, full_error_blocks, errors),
                (partial_gap_blocks, full_gap_blocks, gaps)]:
            cur_partial_block = cur_full_block = None
            for i, check_num in enumerate(check):
                gapped_i = aseq.ungapped_to_gapped(i)
                if check_num == num_assocs:
                    if cur_full_block:
                        cur_full_block[-1] = gapped_i
                    else:
                        cur_full_block = [aseq, aseq, gapped_i, gapped_i]
                        full.append(cur_full_block)
                    if cur_partial_block:
                        cur_partial_block = None
                else:
                    if cur_full_block:
                        cur_full_block = None
                    if check_num > 0:
                        if cur_partial_block:
                            cur_partial_block[-1] = gapped_i
                        else:
                            cur_partial_block = [aseq, aseq, gapped_i, gapped_i]
                            partial.append(cur_partial_block)
                    elif cur_partial_block:
                        cur_partial_block = None

    for shown, region_name_part, partial_blocks, full_blocks, fills, outlines in [
            (self.settings.error_region_shown, self.ERROR_REGION_STRING, partial_error_blocks,
                full_error_blocks, self.settings.error_region_interiors,
                self.settings.error_region_borders),
            (self.settings.gap_region_shown, self.GAP_REGION_STRING, partial_gap_blocks,
                full_gap_blocks, self.settings.gap_region_interiors,
                self.settings.gap_region_borders)]:
        if not shown:
            continue
        full_fill, partial_fill = fills
        full_outline, partial_outline = outlines
        for region_name_start, blocks, fill, outline in [
                (region_name_part, full_blocks, full_fill, full_outline),
                ("partial " + region_name_part, partial_blocks, partial_fill, partial_outline)]:
            region_name = "%s of %s" % (region_name_start, aseq.name)
            old_reg = self.region_browser.get_region(region_name, create=False)
            if old_reg:
                self.region_browser.delete_region(old_reg)
            if blocks:
                self.region_browser.new_region(region_name, blocks=blocks, fill=fill,
                    outline=outline, sequence=aseq, cover_gaps=False)
def model(session, targets, *, block=True, multichain=True, custom_script=None,
        dist_restraints=None, executable_location=None, fast=False, het_preserve=False,
        hydrogens=False, license_key=None, num_models=5, show_gui=True, temp_path=None,
        thorough_opt=False, water_preserve=False):
    """
    Generate comparative models for the target sequences.

    Arguments:
    session
        current session
    targets
        list of (alignment, sequence) tuples.  Each sequence will be modelled.
    block
        If True, wait for modelling job to finish before returning and return list of
        (opened) models.  Otherwise return immediately.  Also see 'show_gui' option.
    multichain
        If True, the associated chains of each structure are used individually to generate
        chains in the resulting models (i.e. the models will be multimers).  If False, all
        associated chains are used together as templates to generate a single-chain model
        for the target sequence.
    custom_script
        If provided, the location of a custom Modeller script to use instead of the one we
        would otherwise generate.  Only used when executing locally.
    dist_restraints
        If provided, the location of a file containing additional distance restraints
    executable_location
        If provided, the path to the locally installed Modeller executable.  If not provided,
        use the web service.
    fast
        Whether to use fast but crude generation of models
    het_preserve
        Whether to preserve HET atoms in generated models
    hydrogens
        Whether to generate models with hydrogen atoms
    license_key
        Modeller license key.  If not provided, try to use settings to find one.
    num_models
        Number of models to generate for each template sequence
    show_gui
        If True, show user interface for Modeller results (if ChimeraX is in gui mode).
    temp_path
        If provided, folder to use for temporary files
    thorough_opt
        Whether to perform thorough optimization
    water_preserve
        Whether to preserve water in generated models
    """
    from chimerax.core.errors import LimitationError, UserError
    from .common import modeller_copy
    if multichain:
        # So, first find the structure with the most associated chains and fewest
        # non-associated chains.  That structure is used as the multimer template.  Chains
        # from other structures are used as "standalone" templates -- each such chain will be
        # on its own line.  Need to allow space on the left and right of the target sequence
        # so that the largest chains can be accommodated.

        # Find the structure we will use as the multimer template
        by_structure = {}
        chain_info = {}
        for alignment, orig_target in targets:
            # Copy the target sequence, changing name to conform to Modeller limitations
            target = modeller_copy(orig_target)
            if not alignment.associations:
                raise UserError("Alignment %s has no associated chains" % alignment.ident)
            for chain, aseq in alignment.associations.items():
                if len(chain.chain_id) > 1:
                    raise LimitationError(
                        "Modeller cannot handle templates with multi-character chain IDs")
                by_structure.setdefault(chain.structure, []).append(chain)
                chain_info[chain] = (aseq, target)

        max_matched = min_unmatched = None
        for s, match_info in by_structure.items():
            matched = len(match_info)
            unmatched = s.num_chains - len(match_info)
            if max_matched is None or matched > max_matched or (matched == max_matched
                    and (unmatched < min_unmatched)):
                multimer_template = s
                max_matched = matched
                min_unmatched = unmatched

        mm_targets = []
        mm_chains = []
        match_chains = []
        for chain in multimer_template.chains:
            mm_chains.append(chain)
            try:
                aseq, target = chain_info[chain]
            except KeyError:
                mm_targets.append(None)
            else:
                mm_targets.append(target)
                match_chains.append(chain)

        # okay, now form single-chain lines for the other structure associations, that
        # eventually will be handled column by column in exactly the same way as the
        # non-multichain method
        single_template_lines = []
        for chain, info in chain_info.items():
            if chain.structure == multimer_template:
                continue
            aseq, target = info
            for i, mm_target in enumerate(mm_targets):
                if mm_target != target:
                    continue
                template_line = [None] * len(mm_targets)
                template_line[i] = chain
                single_template_lines.append(template_line)

        # AFAIK, the multimer template chain sequences need to have the complete PDB sequence,
        # so may need to prefix and suffix the corresponding alignment sequence with characters
        # for residues outside of the alignment sequence.  For other templates/targets, affix
        # a corresponding number of '-' characters.
        prefixes, suffixes = find_affixes(mm_chains, chain_info)
        target_strings = []
        for prefix, suffix, mm_target in zip(prefixes, suffixes, mm_targets):
            if mm_target is None:
                target_strings.append('-')
                continue
            target_strings.append('-' * len(prefix) + mm_target.characters + '-' * len(suffix))

        templates_strings = []
        templates_info = []
        mm_template_strings = []
        for prefix, suffix, chain in zip(prefixes, suffixes, mm_chains):
            try:
                aseq, target = chain_info[chain]
            except KeyError:
                mm_template_strings.append('-')
                continue
            mm_template_strings.append(prefix + regularized_seq(aseq, chain).characters + suffix)
        templates_strings.append(mm_template_strings)
        templates_info.append(None)

        for template_line in single_template_lines:
            template_strings = []
            for prefix, suffix, chain, target in zip(prefixes, suffixes, template_line,
                    mm_targets):
                if target is None:
                    template_strings.append('-')
                elif chain is None:
                    template_strings.append('-' * (len(prefix) + len(target) + len(suffix)))
                else:
                    aseq, target = chain_info[chain]
                    template_strings.append('-' * len(prefix)
                        + regularized_seq(aseq, chain).characters + '-' * len(suffix))
                    templates_info.append((chain, aseq.match_maps[chain]))
            templates_strings.append(template_strings)

        target_name = "target" if len(targets) > 1 else target.name
    else:
        if len(targets) > 1:
            raise LimitationError(
                "Cannot have multiple targets(/alignments) unless creating multimeric model")
        alignment, orig_target = targets[0]
        # Copy the target sequence, changing name to conform to Modeller limitations
        target = modeller_copy(orig_target)
        target_strings = [target.characters]

        templates_strings = []
        templates_info = []
        match_chains = []
        for chain, aseq in alignment.associations.items():
            if len(chain.chain_id) > 1:
                raise LimitationError(
                    "Modeller cannot handle templates with multi-character chain IDs")
            templates_strings.append([regularized_seq(aseq, chain).characters])
            templates_info.append((chain, aseq.match_maps[chain]))
            if not match_chains:
                match_chains.append(chain)
        target_name = target.name

    from .common import write_modeller_scripts, get_license_key
    script_path, config_path, temp_dir = write_modeller_scripts(
        get_license_key(session, license_key), num_models, het_preserve, water_preserve,
        hydrogens, fast, None, custom_script, temp_path, thorough_opt, dist_restraints)

    input_file_map = []

    # form the sequences to be written out as a PIR
    from chimerax.atomic import Sequence
    pir_target = Sequence(name=target_name)
    pir_target.description = "sequence:%s:.:.:.:.::::" % pir_target.name
    pir_target.characters = '/'.join(target_strings)
    pir_seqs = [pir_target]

    structures_to_save = set()
    for strings, info in zip(templates_strings, templates_info):
        if info is None:
            # multimer template
            pir_template = Sequence(name=structure_save_name(multimer_template))
            pir_template.description = "structure:%s:FIRST:%s::::::" % (pir_template.name,
                multimer_template.chains[0].chain_id)
            structures_to_save.add(multimer_template)
        else:
            # single-chain template
            chain, match_map = info
            first_assoc_pos = 0
            while first_assoc_pos not in match_map:
                first_assoc_pos += 1
            first_assoc_res = match_map[first_assoc_pos]
            pir_template = Sequence(name=chain_save_name(chain))
            pir_template.description = "structure:%s:%d%s:%s:+%d:%s::::" % (
                structure_save_name(chain.structure), first_assoc_res.number,
                first_assoc_res.insertion_code, chain.chain_id, len(match_map), chain.chain_id)
            structures_to_save.add(chain.structure)
        pir_template.characters = '/'.join(strings)
        pir_seqs.append(pir_template)

    import os.path
    pir_file = os.path.join(temp_dir.name, "alignment.ali")
    aln = session.alignments.new_alignment(pir_seqs, False, auto_associate=False,
        create_headers=False)
    aln.save(pir_file, format_name="pir")
    session.alignments.destroy_alignment(aln)
    input_file_map.append(("alignment.ali", "text_file", pir_file))

    # write the namelist.dat file, target seq name on first line, templates on remaining lines
    name_file = os.path.join(temp_dir.name, "namelist.dat")
    input_file_map.append(("namelist.dat", "text_file", name_file))
    with open(name_file, 'w') as f:
        for template_seq in pir_seqs:
            print(template_seq.name, file=f)

    config_name = os.path.basename(config_path)
    input_file_map.append((config_name, "text_file", config_path))

    # save structure files
    import os
    struct_dir = os.path.join(temp_dir.name, "template_struc")
    if not os.path.exists(struct_dir):
        try:
            os.mkdir(struct_dir, mode=0o755)
        except FileExistsError:
            pass
    from chimerax.pdb import save_pdb, standard_polymeric_res_names as std_res_names
    for structure in structures_to_save:
        base_name = structure_save_name(structure) + '.pdb'
        pdb_file_name = os.path.join(struct_dir, base_name)
        input_file_map.append((base_name, "text_file", pdb_file_name))
        ATOM_res_names = structure.in_seq_hets
        ATOM_res_names.update(std_res_names)
        save_pdb(session, pdb_file_name, models=[structure], polymeric_res_names=ATOM_res_names)
        delattr(structure, 'in_seq_hets')

    from chimerax.atomic import Chains
    match_chains = Chains(match_chains)
    if executable_location is None:
        if custom_script is not None:
            raise LimitationError(
                "Custom Modeller scripts only supported when executing locally")
        if dist_restraints is not None:
            raise LimitationError("Distance restraints only supported when executing locally")
        if thorough_opt:
            session.logger.warning("Thorough optimization only supported when executing locally")
        job_runner = ModellerWebService(session, match_chains, num_models,
            pir_target.name, input_file_map, config_name, targets, show_gui)
    else:
        #TODO: job_runner = ModellerLocal(...)
        from chimerax.core.errors import LimitationError
        raise LimitationError("Local Modeller execution not yet implemented")

    # a custom script [only used when executing locally] needs to be copied into the tmp dir...
    if os.path.exists(script_path) \
    and os.path.normpath(temp_dir.name) != os.path.normpath(os.path.dirname(script_path)):
        import shutil
        shutil.copy(script_path, temp_dir.name)

    return job_runner.run(block=block)
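# Roughly what the generated alignment.ali contains for one target and one single-chain
# template (names, numbers and residues are made up; the description strings follow the
# "sequence:..." / "structure:..." patterns built above, with one '/'-separated segment per
# chain column and '-' padding where a template has no coverage):
example_pir = """\
>P1;mytarget
sequence:mytarget:.:.:.:.::::
MKTAYIAKQR/MKTAYIAKQR*

>P1;1abc_A
structure:1abc:1:A:+10:A::::
MKTAYIAKQR/----------*
"""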
def _prep_add(session, structures, unknowns_info, template, need_all=False, **prot_schemes): global _serial _serial = None atoms = [] type_info_for_atom = {} naming_schemas = {} idatm_type = {} # need this later; don't want a recomp hydrogen_totals = {} # add missing OXTs of "real" C termini; # delete hydrogens of "fake" N termini after protonation # and add a single "HN" back on, using same dihedral as preceding residue; # delete extra hydrogen of "fake" C termini after protonation logger = session.logger real_N, real_C, fake_N, fake_C = determine_termini(session, structures) logger.info("Chain-initial residues that are actual N" " termini: %s" % ", ".join([str(r) for r in real_N])) logger.info("Chain-initial residues that are not actual N" " termini: %s" % ", ".join([str(r) for r in fake_N])) logger.info("Chain-final residues that are actual C" " termini: %s" % ", ".join([str(r) for r in real_C])) logger.info("Chain-final residues that are not actual C" " termini: %s" % ", ".join([str(r) for r in fake_C])) for rc in real_C: complete_terminal_carboxylate(session, rc) # ensure that N termini are protonated as N3+ (since Npl will fail) from chimerax.atomic import Sequence for nter in real_N + fake_N: n = nter.find_atom("N") if not n: continue # if residue wasn't templated, leave atom typing alone if Sequence.protein3to1(n.residue.name) == 'X': continue if not (n.residue.name == "PRO" and n.num_bonds >= 2): n.idatm_type = "N3+" coordinations = {} for struct in structures: pbg = struct.pseudobond_group(struct.PBG_METAL_COORDINATION, create_type=None) if not pbg: continue for pb in pbg.pseudobonds: for a in pb.atoms: if not need_all and a.structure not in structures: continue if not a.element.is_metal: coordinations.setdefault(a, []).append(pb.other_atom(a)) remaining_unknowns = {} type_info_class = type_info['H'].__class__ from chimerax.atomic import Residue for struct in structures: for atom in struct.atoms: if atom.element.number == 0: res = atom.residue struct.delete_atom(atom) idatm_lookup = {} if template: template_lookup = {} from chimerax.atomic import TmplResidue get_template = TmplResidue.get_template for res in struct.residues: if get_template(res.name): continue try: exemplar = template_lookup[res.name] except KeyError: from chimerax.mmcif import find_template_residue tmpl = find_template_residue(session, res.name) if not tmpl: continue from chimerax.atomic import AtomicStructure s = AtomicStructure(session) r = exemplar = template_lookup[res.name] = s.new_residue( res.name, 'A', 1) atom_map = {} for ta in tmpl.atoms: if ta.element.number > 1: a = s.new_atom(ta.name, ta.element) a.coord = ta.coord r.add_atom(a) atom_map[ta] = a for tnb in ta.neighbors: if tnb in atom_map: s.new_bond(a, atom_map[tnb]) for a in res.atoms: ea = exemplar.find_atom(a.name) if ea: a.idatm_type = ea.idatm_type for r in template_lookup.values(): r.structure.delete() template_lookup.clear() for atom in struct.atoms: atom_type = atom.idatm_type idatm_type[atom] = atom_type if atom_type in type_info: # don't want to ask for idatm_type in middle # of hydrogen-adding loop (since that will # force a recomp), so remember here type_info_for_atom[atom] = type_info[atom_type] # if atom is in standard residue but has missing bonds to # heavy atoms, skip it instead of incorrectly protonating # (or possibly throwing an error if e.g. 
it's planar) # also # UNK/N residues will be missing some or all of their side-chain atoms, so # skip atoms that would otherwise be incorrectly protonated due to their # missing neighbors truncated = \ atom.is_missing_heavy_template_neighbors(no_template_okay=True) \ or \ (atom.residue.name in ["UNK", "N"] and atom.residue.polymer_type != Residue.PT_NONE and unk_atom_truncated(atom)) \ or \ (atom.residue.polymer_type == Residue.PT_NUCLEIC and atom.name == "P" and atom.num_explicit_bonds < 4) if truncated: session.logger.warning( "Not adding hydrogens to %s because it is missing heavy-atom" " bond partners" % atom) type_info_for_atom[atom] = type_info_class( 4, atom.num_bonds, atom.name) else: atoms.append(atom) # sulfonamide nitrogens coordinating a metal # get an additional hydrogen stripped if coordinations.get(atom, []) and atom.element.name == "N": if "Son" in [nb.idatm_type for nb in atom.neighbors]: orig_ti = type_info[atom_type] type_info_for_atom[atom] = orig_ti.__class__( orig_ti.geometry, orig_ti.substituents - 1, orig_ti.description) continue if atom in unknowns_info: type_info_for_atom[atom] = unknowns_info[atom] atoms.append(atom) continue remaining_unknowns.setdefault(atom.residue.name, set()).add(atom.name) # leave remaining unknown atoms alone type_info_for_atom[atom] = type_info_class(4, atom.num_bonds, atom.name) for rname, atom_names in remaining_unknowns.items(): names_text = ", ".join([nm for nm in atom_names]) atom_text, obj_text = ("atoms", "them") if len(atom_names) > 1 else ("atom", "it") logger.warning( "Unknown hybridization for %s (%s) of residue type %s;" " not adding hydrogens to %s" % (atom_text, names_text, rname, obj_text)) naming_schemas.update( determine_naming_schemas(struct, type_info_for_atom)) if need_all: from chimerax.atomic import AtomicStructure for struct in [ m for m in session.models if isinstance(m, AtomicStructure) ]: if struct in structures: continue for atom in struct.atoms: idatm_type[atom] = atom.idatm_type if atom.idatm_type in type_info: type_info_for_atom[atom] = type_info[atom.idatm_type] for atom in atoms: if atom not in type_info_for_atom: continue bonding_info = type_info_for_atom[atom] total_hydrogens = bonding_info.substituents - atom.num_bonds for bonded in atom.neighbors: if bonded.element.number == 1: total_hydrogens += 1 hydrogen_totals[atom] = total_hydrogens schemes = {} # HIS and CYS treated as 'unspecified'; use built-in typing for scheme_type, res_names, res_check, typed_atoms in [ ('his', ["HID", "HIE", "HIP"], None, []), ('asp', asp_res_names, _asp_check, asp_prot_names), ('glu', glu_res_names, _glu_check, glu_prot_names), ('lys', ["LYS", "LYN"], _lys_check, ["NZ"]), ('cys', ["CYM"], _cys_check, ["SG"]) ]: scheme = prot_schemes.get(scheme_type + '_scheme', None) if scheme is None: by_name = True scheme = {} else: by_name = False if not scheme: for s in structures: for r in s.residues: if r.name in res_names and res_check and res_check(r): if by_name: scheme[r] = r.name elif scheme_type != 'his': scheme[r] = res_names[0] # unset any explicit typing... 
for ta in typed_atoms: a = r.find_atom(ta) if a: a.idatm_type = None else: for r in scheme.keys(): if res_check and not res_check(r, scheme[r]): del scheme[r] schemes[scheme_type] = scheme # create dictionary keyed on histidine residue with value of another # dictionary keyed on the nitrogen atoms with boolean values: True # equals should be protonated his_Ns = {} for r, protonation in schemes["his"].items(): delta = r.find_atom("ND1") epsilon = r.find_atom("NE2") if delta is None or epsilon is None: # find the ring, etc. rings = r.structure.rings() for ring in rings: if r in rings.atoms.residues: break else: continue # find CG by locating CB-CG bond ring_bonds = ring.bonds for ra in ring.atoms: if ra.element.name != "C": continue for ba, b in zip(ra.neighbors, ra.bonds): if ba.element.name == "C" and b not in ring_bonds: break else: continue break else: continue nitrogens = [a for a in ring.atoms if a.element.name == "N"] if len(nitrogens) != 2: continue if ra in nitrogens[0].neighbors: delta, epsilon = nitrogens else: epsilon, delta = nitrogens if protonation == "HID": his_Ns.update({delta: True, epsilon: False}) elif protonation == "HIE": his_Ns.update({delta: False, epsilon: True}) elif protonation == "HIP": his_Ns.update({delta: True, epsilon: True}) else: continue for n, do_prot in his_Ns.items(): if do_prot: type_info_for_atom[n] = type_info["Npl"] n.idatm_type = idatm_type[n] = "Npl" else: type_info_for_atom[n] = type_info["N2"] n.idatm_type = idatm_type[n] = "N2" for r, protonation in schemes["asp"].items(): _handle_acid_protonation_scheme_item(r, protonation, asp_res_names, asp_prot_names, type_info, type_info_for_atom) for r, protonation in schemes["glu"].items(): _handle_acid_protonation_scheme_item(r, protonation, glu_res_names, glu_prot_names, type_info, type_info_for_atom) for r, protonation in schemes["lys"].items(): nz = r.find_atom("NZ") if protonation == "LYS": it = 'N3+' else: it = 'N3' ti = type_info[it] if nz is not None: type_info_for_atom[nz] = ti # avoid explicitly setting type if possible if nz.idatm_type != it: nz.idatm_type = it for r, protonation in schemes["cys"].items(): sg = r.find_atom("SG") if protonation == "CYS": it = 'S3' else: it = 'S3-' ti = type_info[it] if sg is not None: type_info_for_atom[sg] = ti # avoid explicitly setting type if possible if sg.idatm_type != it: sg.idatm_type = it return atoms, type_info_for_atom, naming_schemas, idatm_type, \ hydrogen_totals, his_Ns, coordinations, fake_N, fake_C
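# Hedged sketch of how a caller unpacks the values returned above (the structures,
# unknowns_info and template variables are assumed to come from the surrounding addh code):
(atoms, type_info_for_atom, naming_schemas, idatm_type, hydrogen_totals,
    his_Ns, coordinations, fake_N, fake_C) = _prep_add(session, structures,
        unknowns_info, template)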
def align(session, ref, match, matrix_name, algorithm, gap_open, gap_extend, dssp_cache,
        ss_matrix=defaults["ss_scores"], ss_fraction=defaults["ss_mixture"],
        gap_open_helix=defaults["helix_open"], gap_open_strand=defaults["strand_open"],
        gap_open_other=defaults["other_open"], compute_ss=defaults["compute_ss"]):
    from chimerax import sim_matrices
    similarity_matrix = sim_matrices.matrix(matrix_name, session.logger)
    ssf = ss_fraction
    ssm = ss_matrix
    if ssf is not None and ssf is not False and compute_ss:
        need_compute = []
        if ref.structure not in dssp_cache:
            for r in ref.residues:
                if r and len(r.atoms) > 1:
                    # not CA only
                    need_compute.append(ref.structure)
                    dssp_cache[ref.structure] = (ref.structure.residues.ss_ids,
                        ref.structure.residues.ss_types)
                    break
        if match.structure not in dssp_cache:
            for r in match.residues:
                if r and len(r.atoms) > 1:
                    # not CA only
                    need_compute.append(match.structure)
                    dssp_cache[match.structure] = (match.structure.residues.ss_ids,
                        match.structure.residues.ss_types)
                    break
        if need_compute:
            """TODO
            from chimera.initprefs import ksdsspPrefs, \
                KSDSSP_ENERGY, KSDSSP_HELIX_LENGTH, \
                KSDSSP_STRAND_LENGTH
            """
            from chimerax.std_commands import dssp
            dssp.compute_ss(session, need_compute)
    if algorithm == "nw":
        from chimerax.alignment_algs import NeedlemanWunsch
        score, seqs = NeedlemanWunsch.nw(ref, match, score_gap=-gap_extend,
            score_gap_open=0 - gap_open, similarity_matrix=similarity_matrix,
            return_seqs=True, ss_matrix=ss_matrix, ss_fraction=ss_fraction,
            gap_open_helix=-gap_open_helix, gap_open_strand=-gap_open_strand,
            gap_open_other=-gap_open_other)
        gapped_ref, gapped_match = seqs
    elif algorithm == "sw":
        def ss_let(r):
            if not r:
                return ' '
            if r.is_helix:
                return 'H'
            elif r.is_strand:
                return 'S'
            return 'O'
        if ssf is False or ssf is None:
            ssf = 0.0
            ssm = None
        if ssm:
            # account for missing structure (blank SS letter)
            ssm = ssm.copy()
            for let in "HSO ":
                ssm[(let, ' ')] = 0.0
                ssm[(' ', let)] = 0.0
        from chimerax.alignment_algs import SmithWaterman
        score, alignment = SmithWaterman.align(ref.characters, match.characters,
            similarity_matrix, float(gap_open), float(gap_extend), gap_char=".",
            ss_matrix=ssm, ss_fraction=ssf, gap_open_helix=float(gap_open_helix),
            gap_open_strand=float(gap_open_strand), gap_open_other=float(gap_open_other),
            ss1="".join([ss_let(r) for r in ref.residues]),
            ss2="".join([ss_let(r) for r in match.residues]))
        from chimerax.atomic import StructureSeq, Sequence
        gapped_ref = StructureSeq(structure=ref.structure, chain_id=ref.chain_id)
        gapped_ref.name = ref.structure.name
        gapped_match = StructureSeq(structure=match.structure, chain_id=match.chain_id)
        gapped_match.name = match.structure.name
        # Smith-Waterman may not be entirety of sequences...
        for orig, gapped, sw in [(ref, gapped_ref, Sequence(characters=alignment[0])),
                (match, gapped_match, Sequence(characters=alignment[1]))]:
            ungapped = sw.ungapped()
            for i in range(len(orig) - len(ungapped) + 1):
                if ungapped == orig[i:i + len(ungapped)]:
                    break
            else:
                raise ValueError("Smith-Waterman result not a subsequence of original sequence")
            gapped.bulk_set(orig.residues[i:i + len(ungapped)], sw.characters)
    else:
        raise ValueError("Unknown sequence alignment algorithm: %s" % algorithm)

    # If the structures are disjoint snippets of the same longer SEQRES, they may be able to
    # be structurally aligned but the SEQRES records will keep them apart.  Try to detect this
    # situation and work around by snipping off sequence ends.
    sr_disjoint = False
    if ref.from_seqres and match.from_seqres:
        struct_match = 0
        for i in range(len(gapped_ref)):
            uri = gapped_ref.gapped_to_ungapped(i)
            if uri is None:
                continue
            umi = gapped_match.gapped_to_ungapped(i)
            if umi is None:
                continue
            if gapped_ref.residues[uri] and gapped_match.residues[umi]:
                struct_match += 1
                if struct_match >= 3:
                    break
        if struct_match < 3:
            seq_match = 0
            for s1, s2 in zip(gapped_ref[:], gapped_match[:]):
                if s1.isalpha() and s2.isalpha():
                    seq_match += 1
                    if seq_match > 3:
                        break
            if seq_match > 3:
                need = 3 - struct_match
                if (ref.residues[:need].count(None) == 3
                        or ref.residues[-need:].count(None) == 3) \
                and (match.residues[:need].count(None) == 3
                        or match.residues[-need:].count(None) == 3):
                    sr_disjoint = True
    if sr_disjoint:
        from copy import copy
        clipped_ref = copy(ref)
        clipped_match = copy(match)
        for seq in (clipped_ref, clipped_match):
            num_none = 0
            for r in seq.residues:
                if r:
                    break
                num_none += 1
            if num_none:
                seq.bulk_set(seq.residues[num_none:], seq[num_none:])

            num_none = 0
            for r in reversed(seq.residues):
                if r:
                    break
                num_none += 1
            if num_none:
                seq.bulk_set(seq.residues[:-num_none], seq[:-num_none])
        return align(session, clipped_ref, clipped_match, matrix_name, algorithm,
            gap_open, gap_extend, dssp_cache, ss_matrix=ss_matrix, ss_fraction=ss_fraction,
            gap_open_helix=gap_open_helix, gap_open_strand=gap_open_strand,
            gap_open_other=gap_open_other, compute_ss=False)
    for orig, aligned in [(ref, gapped_ref), (match, gapped_match)]:
        if hasattr(orig, '_dm_rebuild_info'):
            aligned._dm_rebuild_info = orig._dm_rebuild_info
            _dm_cleanup.append(aligned)
    return score, gapped_ref, gapped_match
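# Hypothetical call inside a running ChimeraX session, with ref_chain/match_chain assumed to
# be two open chains; "BLOSUM-62", 12 and 1 are assumptions mirroring typical matchmaker
# defaults rather than values taken from this code:
score, gapped_ref, gapped_match = align(session, ref_chain, match_chain, "BLOSUM-62", "nw",
    12, 1, {})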
def write_mol2(session, file_name, *, models=None, atoms=None, status=None, anchor=None, rel_model=None, sybyl_hyd_naming=True, combine_models=False, skip_atoms=None, res_num=False, gaff_type=False, gaff_fail_error=None): """Write a Mol2 file. Parameters ---------- file_name : str, or file object open for writing Output file. models : a list/tuple/set of models (:py:class:`~chimerax.atomic.Structure`s) or a single :py:class:`~chimerax.atomic.Structure` The structure(s) to write out. If None (and 'atoms' is also None) then write out all structures. atoms : an :py:class:`~chimerax.atomic.Atoms` collection or None. If not None, then 'models' must be None. status : function or None If not None, a function that takes a string -- used to report the progress of the write. anchor : :py:class:`~chimerax.atomic.Atoms` collection Atoms (and their implied internal bonds) that should be written out to the @SET section of the file as the rigid framework for flexible ligand docking. rel_model : Model whose coordinate system the coordinates should be written out reletive to, i.e. take the output atoms' coordinates and apply the inverse of the rel_model's transform. sybyl_hyd_naming : bool Controls whether hydrogen names should be "Sybyl-like" or "PDB-like" -- e.g. HG21 vs. 1HG2. combine_models : bool Controls whether multiple structures will be combined into a single @MOLECULE section (value: True) or each given its own section (value: False). skip_atoms : list/set of :py:class:`~chimerax.atomic.Atom`s or an :py:class:`~chimerax.atomic.Atoms` collection or None Atoms to not output res_num : bool Controls whether residue sequence numbers are included in the substructure name. Since Sybyl Mol2 files include them, this defaults to True. gaff_type : bool If 'gaff_type' is True, outout GAFF atom types instead of Sybyl atom types. `gaff_fail_error`, if specified, is the type of error to throw (e.g. UserError) if there is no gaff_type attribute for an atom, otherwise throw the standard AttributeError. """ if status: status("Writing Mol2 file %s" % file_name) from chimerax import io f = io.open_output(file_name, "utf-8") sort_key_func = serial_sort_key = lambda a, ri={}: write_mol2_sort_key( a, res_indices=ri) from chimerax.atomic import Structure, Atoms, Residue class JPBGroup: def __init__(self, atoms): atom_set = set(atoms) pbs = [] for s in atoms.unique_structures: pbg = s.pbg_map.get(s.PBG_METAL_COORDINATION, None) if not pbg: continue for pb in pbg.pseudobonds: if pb.atoms[0] in atom_set and pb.atoms[1] in atom_set: pbs.append(pb) self._pbs = pbs @property def pseudobonds(self): return self._pbs if models is None: if atoms is None: structures = session.models.list(type=Structure) else: structures = atoms else: if atoms is None: if isinstance(models, Structure): structures = [models] else: structures = [m for m in models if isinstance(m, Structure)] else: raise ValueError( "Cannot specify both 'models' and 'atoms' keywords") if isinstance(structures, Atoms): class Jumbo: def __init__(self, atoms): self.atoms = atoms self.residues = atoms.unique_residues self.bonds = atoms.intra_bonds self.name = "(selection)" self.pbg_map = { Structure.PBG_METAL_COORDINATION: JPBGroup(atoms) } structures = [Jumbo(structures)] sort_key_func = lambda a: (a.structure.id, ) + serial_sort_key(a) combine_models = False # transform... 
if rel_model is None: from chimerax.geometry import identity xform = identity() else: xform = rel_model.scene_position.inverse() # need to find amide moieties since Sybyl has an explicit amide type if status: status("Finding amides") from chimerax.chem_group import find_group amides = find_group("amide", structures) amide_Ns = set([amide[2] for amide in amides]) amide_CNs = set([amide[0] for amide in amides]) amide_CNs.update(amide_Ns) amide_Os = set([amide[1] for amide in amides]) substructure_names = None if combine_models and len(structures) > 1: # create a fictitious jumbo model class Jumbo: def __init__(self, structures): self.name = structures[0].name + " (combined)" from chimerax.atomic import concatenate self.atoms = concatenate([s.atoms for s in structures]) self.bonds = concatenate([s.bonds for s in structures]) self.residues = concatenate([s.residues for s in structures]) self.pbg_map = { Structure.PBG_METAL_COORDINATION: JPBGroup(self.atoms) } # if combining single-residue structures, # can be more informative to use model name # instead of residue type for substructure if len(structures) == len(self.residues): rnames = self.residues.names if len(set(rnames)) < len(rnames): snames = [s.name for s in structures] if len(set(snames)) == len(snames): self.substructure_names = dict( zip(self.residues, snames)) structures = [Jumbo(structures)] if hasattr(structures[-1], 'substructure_names'): substructure_names = structures[-1].substructure_names delattr(structures[-1], 'substructure_names') sort_key_func = lambda a: (a.structure.id, ) + serial_sort(a) # write out structures for struct in structures: if hasattr(struct, 'mol2_comments'): for m2c in struct.mol2_comments: print(m2c, file=f) if hasattr(struct, 'solvent_info'): print(struct.solvent_info, file=f) # molecule section header print("%s" % MOLECULE_HEADER, file=f) # molecule name print("%s" % struct.name, file=f) atoms = list(struct.atoms) bonds = list(struct.bonds) # add metal-coordination bonds coord_grp = struct.pbg_map.get(Structure.PBG_METAL_COORDINATION, None) if coord_grp: bonds.extend(list(coord_grp.pseudobonds)) if skip_atoms: skip_atoms = set(skip_atoms) atoms = [a for a in atoms if a not in skip_atoms] bonds = [ b for b in bonds if b.atoms[0] not in skip_atoms and b.atoms[1] not in skip_atoms ] residues = struct.residues # Put the atoms in the order we want for output if status: status("Putting atoms in input order") atoms.sort(key=sort_key_func) # if anchor is not None, then there will be two entries in # the @SET section of the file... if anchor: sets = 2 else: sets = 0 # number of entries for various sections... 
print("%d %d %d 0 %d" % (len(atoms), len(bonds), len(residues), sets), file=f) # type of molecule if hasattr(struct, "mol2_type"): mtype = struct.mol2_type else: mtype = "SMALL" from chimerax.atomic import Sequence for r in struct.residues: if Sequence.protein3to1(r.name) != 'X': mtype = "PROTEIN" break if Sequence.nucleic3to1(r.name) != 'X': mtype = "NUCLEIC_ACID" break print(mtype, file=f) # indicate type of charge information if hasattr(struct, 'charge_model'): print(struct.charge_model, file=f) else: print("NO_CHARGES", file=f) if hasattr(struct, 'mol2_comment'): print("\n%s" % struct.mol2_comment, file=f) else: print("\n", file=f) if status: status("writing atoms") # atom section header print("%s" % ATOM_HEADER, file=f) # make a dictionary of residue indices so that we can do quick look ups res_indices = {} for i, r in enumerate(residues): res_indices[r] = i + 1 for i, atom in enumerate(atoms): # atom ID, starting from 1 print("%7d" % (i + 1), end=" ", file=f) # atom name, possibly rearranged if it's a hydrogen if sybyl_hyd_naming and not atom.name[0].isalpha(): atom_name = atom.name[1:] + atom.name[0] else: atom_name = atom.name print("%-8s" % atom_name, end=" ", file=f) # use correct relative coordinate position coord = xform * atom.scene_coord print("%9.4f %9.4f %9.4f" % tuple(coord), end=" ", file=f) # atom type if gaff_type: try: atom_type = atom.gaff_type except AttributeError: if not gaff_fail_error: raise raise gaff_fail_error( "%s has no Amber/GAFF type assigned.\n" "Use the AddCharge tool to assign Amber/GAFF types." % atom) elif hasattr(atom, 'mol2_type'): atom_type = atom.mol2_type elif atom in amide_Ns: atom_type = "N.am" elif atom.structure_category == "solvent" \ and atom.residue.name in Residue.water_res_names: if atom.element.name == "O": atom_type = "O.t3p" else: atom_type = "H.t3p" elif atom.element.name == "N" and len( [r for r in atom.rings() if r.aromatic]) > 0: atom_type = "N.ar" elif atom.idatm_type == "C2" and len( [nb for nb in atom.neighbors if nb.idatm_type == "Ng+"]) > 2: atom_type = "C.cat" elif sulfur_oxygen(atom): atom_type = "O.2" else: try: atom_type = chimera_to_sybyl[atom.idatm_type] except KeyError: session.logger.warning( "Atom whose IDATM type has no equivalent" " Sybyl type: %s (type: %s)" % (atom, atom.idatm_type)) atom_type = str(atom.element) print("%-5s" % atom_type, end=" ", file=f) # residue-related info res = atom.residue # residue index print("%5d" % res_indices[res], end=" ", file=f) # substructure identifier and charge if hasattr(atom, 'charge') and atom.charge is not None: charge = atom.charge else: charge = 0.0 if substructure_names: rname = substructure_names[res] elif res_num: rname = "%3s%-5d" % (res.name, res.number) else: rname = "%3s" % res.name print("%s %9.4f" % (rname, charge), file=f) if status: status("writing bonds") # bond section header print("%s" % BOND_HEADER, file=f) # make an atom-index dictionary to speed lookups atom_indices = {} for i, a in enumerate(atoms): atom_indices[a] = i + 1 for i, bond in enumerate(bonds): a1, a2 = bond.atoms # ID print("%6d" % (i + 1), end=" ", file=f) # atom IDs print("%4d %4d" % (atom_indices[a1], atom_indices[a2]), end=" ", file=f) # bond order; give it our best shot... 
if hasattr(bond, 'mol2_type'): print(bond.mol2_type, file=f) continue amide_A1 = a1 in amide_CNs amide_A2 = a2 in amide_CNs if amide_A1 and amide_A2: print("am", file=f) continue if amide_A1 or amide_A2: if a1 in amide_Os or a2 in amide_Os: print("2", file=f) else: print("1", file=f) continue aromatic = False # 'bond' might be a metal-coordination bond so do a test for rings if hasattr(bond, 'rings'): for ring in bond.rings(): if ring.aromatic: aromatic = True break if aromatic: print("ar", file=f) continue try: geom1 = idatm_info[a1.idatm_type].geometry except KeyError: print("1", file=f) continue try: geom2 = idatm_info[a2.idatm_type].geometry except KeyError: print("1", file=f) continue # sulfone/sulfoxide is classically depicted as double- # bonded despite the high dipolar character of the # bond making it have single-bond character. For # output, use the classical values. if sulfur_oxygen(a1) or sulfur_oxygen(a2): print("2", file=f) continue if geom1 not in [2, 3] or geom2 not in [2, 3]: print("1", file=f) continue # if either endpoint atom is in an aromatic ring and # the bond isn't, it's a single bond... for endp in [a1, a2]: aromatic = False for ring in endp.rings(): if ring.aromatic: aromatic = True break if aromatic: break else: # neither endpoint in aromatic ring if geom1 == 2 and geom2 == 2: print("3", file=f) else: print("2", file=f) continue print("1", file=f) if status: status("writing residues") # residue section header print("%s" % SUBSTR_HEADER, file=f) for i, res in enumerate(residues): # residue id field print("%6d" % (i + 1), end=" ", file=f) # residue name field if substructure_names: rname = substructure_names[res] elif res_num: rname = "%3s%-4d" % (res.name, res.number) else: rname = "%3s" % res.name print(rname, end=" ", file=f) # ID of the root atom of the residue chain_atom = res.principal_atom if chain_atom is None: # if writing out a selection, not all residue atoms # might be in atom_indices... for chain_atom in res.atoms: if chain_atom in atom_indices: break print("%5d" % atom_indices[chain_atom], end=" ", file=f) print("RESIDUE 4", end=" ", file=f) # Sybyl seems to use chain 'A' when chain ID is blank, # so run with that chain_id = res.chain_id if not chain_id.strip(): chain_id = 'A' print("%-4s %3s" % (chain_id, res.name), end=" ", file=f) # number of out-of-substructure bonds cross_res_bonds = 0 for a in res.atoms: for nb in a.neighbors: if nb.residue != res: cross_res_bonds += 1 print("%5d" % cross_res_bonds, end="", file=f) # print "ROOT" if first or only residue of a chain if not res.chain or res.chain.existing_residues[0] == res: print(" ROOT", file=f) else: print(file=f) # write flexible ligand docking info if anchor: if status: status("writing anchor info") print("%s" % SET_HEADER, file=f) atom_indices = {} for i, a in enumerate(atoms): atom_indices[a] = i + 1 bond_indices = {} for i, b in enumerate(bonds): bond_indices[b] = i + 1 print( "ANCHOR STATIC ATOMS <user> **** Anchor Atom Set", file=f) print(len(anchor), end=" ", file=f) for a in anchor: if a in atom_indices: print(atom_indices[a], end=" ", file=f) print(file=f) print( "RIGID STATIC BONDS <user> **** Rigid Bond Set", file=f) bonds = anchor.intra_bonds print(len(bonds), end=" ", file=f) for b in bonds: if b in bond_indices: print(bond_indices[b], end=" ", file=f) print(file=f) if file_name != f: f.close() if status: status("Wrote Mol2 file %s" % file_name)
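# A minimal sketch of calling the writer inside a running ChimeraX session (the session and an
# open structure are assumed; only commonly used arguments are shown):
from chimerax.atomic import all_atomic_structures
structures = all_atomic_structures(session)
write_mol2(session, "output.mol2", models=[structures[0]])
# combine_models=True would merge several structures into one @<TRIPOS>MOLECULE section;
# gaff_type=True writes GAFF atom types instead of Sybyl types (see the docstring above).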
def read(session, f):
    from chimerax.atomic import Sequence
    from ..parse import FormatSyntaxError, make_readable
    doing = None
    sequences = []
    header_ok = False
    line_num = 0
    align_start_index = None
    for line in f.readlines():
        if doing == 'alignments':
            # don't strip() alignment section since it has significant leading spaces
            line = line.rstrip()
        else:
            line = line.strip()
        line_num += 1
        if not header_ok:
            if line.lower().startswith("hssp"):
                header_ok = True
                continue
            raise FormatSyntaxError("No initial HSSP header line")
        if line.startswith('##'):
            if doing == 'proteins' and not sequences:
                raise FormatSyntaxError("No entries in PROTEINS section")
            try:
                doing = line.split()[1].lower()
            except IndexError:
                doing = None
            if doing == 'alignments':
                try:
                    hashes, alignments, begin, dash, end = line.strip().split()
                    begin = int(begin)
                    end = int(end)
                except ValueError:
                    raise FormatSyntaxError("ALIGNMENTS line (line #%d) not of the form:"
                        " ## ALIGNMENTS (number) - (number)" % line_num)
            continue
        if doing == 'proteins':
            if not line[0].isdigit():
                continue
            try:
                seq_name = line.split()[2]
            except IndexError:
                raise FormatSyntaxError("Line %d in PROTEINS section does not start with"
                    " [integer] : [sequence name]" % line_num)
            sequences.append(Sequence(name=make_readable(seq_name)))
        elif doing == 'alignments':
            if line.lstrip().lower().startswith('seqno'):
                try:
                    align_start_index = line.index('.')
                except Exception:
                    raise FormatSyntaxError("No indication of alignment starting column"
                        " ('.' character) in SeqNo line in ALIGNMENTS section")
                continue
            if align_start_index is None:
                raise FormatSyntaxError("No initial SeqNo line in ALIGNMENTS section")
            block = line[align_start_index:]
            if not block:
                raise FormatSyntaxError("No alignment block given on line %d" % line_num)
            block_len = end - begin + 1
            if len(block) > block_len:
                raise FormatSyntaxError("Too many characters (%d, only %d sequences) in"
                    " alignment block given on line %d" % (len(block), block_len, line_num))
            block = block + ' ' * (block_len - len(block))
            for seq, c in zip(sequences[begin-1:end], block):
                seq.append(c)
    f.close()
    return sequences, {}, {}
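# Trimmed-down illustration (values made up) of the two HSSP sections this reader walks.  In
# the ALIGNMENTS block the first '.' of the ruler on the "SeqNo" line marks the column where
# the aligned letters start, and "## ALIGNMENTS 1 - 2" gives the sequence range covered by
# each block; the letters in the data rows must sit directly under that ruler:
sample_hssp = """\
HSSP  HOMOLOGY DERIVED SECONDARY STRUCTURE OF PROTEINS
## PROTEINS : identifier and alignments
  NR.    ID         STRID
    1 :  seq_one    1ABC
    2 :  seq_two    2XYZ
## ALIGNMENTS    1 -    2
 SeqNo  PDBNo AA STRUCTURE  ....:....1
     1      1  M            MM
     2      2  K            KK
"""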