Python Sequence Examples, chimerax.atomic.Sequence Python Examples

Example #1

0

Show file

def save(session, alignment, stream):
    """Write *alignment* to *stream* as CLUSTAL W (ALN) text.

    The alignment is emitted in chunks of LINELEN columns.  Each chunk holds
    one line per sequence (name with spaces replaced by underscores, the
    chunk's characters and, if the chunk contains any residues, the number of
    ungapped residues up to the end of the chunk), followed by a conservation
    line and a blank line.

    session is not used here.
    """
    def emit(*fields):
        print(*fields, file=stream)

    emit("CLUSTAL W ALN saved from UCSF ChimeraX")
    emit("")

    seqs = alignment.seqs
    # names are left-justified in a column 5 wider than the longest name
    label_format = "%%-%ds" % (max(len(seq.name) for seq in seqs) + 5)

    from chimerax.atomic import Sequence
    from .. import clustal_strong_groups, clustal_weak_groups

    def residue_count(chars):
        # number of non-gap characters, as judged by Sequence.ungapped()
        scratch = Sequence()
        scratch.extend(chars)
        return len(scratch.ungapped())

    def column_symbol(pos):
        # '*' = identical letter in every sequence, ':' / '.' = every residue
        # falls in a single strong / weak group, ' ' = anything else
        residues = [seq[pos].upper() for seq in seqs]
        first = residues[0]
        if first.isupper() and all(res == first for res in residues[1:]):
            return '*'
        for groups, symbol in ((clustal_strong_groups, ':'), (clustal_weak_groups, '.')):
            if any(all(res in group for res in residues) for group in groups):
                return symbol
        return ' '

    total_columns = len(seqs[0])
    for start in range(0, total_columns, LINELEN):
        end = min(total_columns, start + LINELEN)

        for seq in seqs:
            label = label_format % seq.name.replace(' ', '_')
            chunk = seq[start:end]
            if residue_count(chunk) == 0:
                emit(label, chunk)
            else:
                emit(label, chunk, residue_count(seq[:end]))

        emit(label_format % " ",
            "".join(column_symbol(pos) for pos in range(start, end)))
        emit("")

Example #2

0

Show file

File: comparative.py Project: Yongcheng123/ChimeraX

def regularized_seq(aseq, chain):
    """Return a copy of *aseq* whose characters reflect the residues of *chain*.

    For every ungapped position of aseq: positions with no entry in the
    chain's match map become '-', positions matched to a residue whose name is
    not a standard polymeric residue name become '.', and all other matched
    positions get the one-letter code of the matched residue.

    Side effect: the non-standard residue names are added to the
    'in_seq_hets' set stored on chain.structure, unless every matched residue
    was non-standard.
    """
    match_map = aseq.match_maps[chain]
    from .common import modeller_copy
    result = modeller_copy(aseq)
    result.description = "structure:" + chain_save_name(chain)
    chars = list(result.characters)
    from chimerax.atomic import Sequence
    from chimerax.pdb import standard_polymeric_res_names as std_res_names
    het_names = []
    matched = 0
    for index in range(len(aseq.ungapped())):
        column = aseq.ungapped_to_gapped(index)
        if index not in match_map:
            chars[column] = '-'
            continue
        residue = match_map[index]
        matched += 1
        if residue.name in std_res_names:
            chars[column] = Sequence.rname3to1(residue.name)
        else:
            het_names.append(residue.name)
            chars[column] = '.'
    structure = chain.structure
    known_hets = getattr(structure, 'in_seq_hets', set())
    # may want to preserve all-HET chains, so don't auto-exclude them
    if matched != len(het_names):
        known_hets.update(het_names)
    structure.in_seq_hets = known_hets
    result.characters = "".join(chars)
    return result

Example #3

0

Show file

File: tool.py Project: Yongcheng123/ChimeraX

 def show_mav(self, ids):
     """Open a new alignment from the stored sequences selected by *ids*.

     ids is an iterable of keys into self._sequences; entry 0 is always
     placed first (presumably the query sequence -- confirm against the
     code that fills self._sequences).  Columns that contain no alphabetic
     character in any selected sequence are dropped before the alignment
     is handed to session.alignments.new_alignment().
     """
     # Build a new list rather than ids.insert(0, 0) so the caller's list
     # is not modified as a side effect of showing the alignment.
     row_ids = [0, *ids]
     # Collect names and sequences of selected matches.
     # All sequences should have the same length because
     # they include gaps generated from BLAST alignment.
     names = []
     seqs = []
     for sid in row_ids:
         name, seq = self._sequences[sid]
         names.append(name)
         seqs.append(seq)
     # Find columns that are gaps in all sequences and remove them.
     all_gaps = {
         i for i in range(len(seqs[0]))
         if not any(seq[i].isalpha() for seq in seqs)
     }
     if all_gaps:
         seqs = [
             ''.join(c for n, c in enumerate(seq) if n not in all_gaps)
             for seq in seqs
         ]
     # Ask sequence viewer to display alignment
     from chimerax.atomic import Sequence
     seqs = [
         Sequence(name=name, characters=chars)
         for name, chars in zip(names, seqs)
     ]
     name = "%s [%d]" % (self._instance_name, self._viewer_index)
     self.session.alignments.new_alignment(seqs, name)

Example #4

0

Show file

File: readPFAM.py Project: Yongcheng123/ChimeraX

def read(session, f):
    """Read 'seq-name seq-letters' lines from the open text stream *f*.

    Leading header lines (first field exactly two characters long, or
    starting with '#=') and blank lines are skipped.  Every remaining line
    must consist of exactly two whitespace-separated fields.

    Returns (sequences, {}, {}) where sequences is a list of Sequence objects.
    Raises FormatSyntaxError for a sequence line that does not have two fields.
    The stream is closed before returning, including when an error is raised.
    session is not used here.
    """
    in_header = True
    sequences = []
    try:
        for line_num, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                continue
            if in_header:
                # header lines start with a two-letter tag
                if len(fields[0]) == 2:
                    continue
                if fields[0].startswith('#='):
                    # some Pfam seed alignments have undocumented #=RF header
                    continue
                in_header = False
            if len(fields) != 2:
                raise FormatSyntaxError(
                    "Sequence line %d not of form 'seq-name seq-letters'" % line_num)
            seq = Sequence(name=make_readable(fields[0]))
            seq.extend(fields[1])
            sequences.append(seq)
    finally:
        # close even on a syntax error so the stream is not leaked
        f.close()
    return sequences, {}, {}

Example #5

0

Show file

def read(session, f):
    """Read PIR (NBRF) formatted sequences from the open text stream *f*.

    Each record is a '>XX;name' line (XX is the two-character PIR type), a
    description line, and sequence lines ending with '*'.  Lines that do not
    look like a record start are ignored while a record start is expected.

    Each Sequence gets 'nucleic', 'pir_type', 'description' and
    'pir_description' attributes.

    Returns (sequences, {}, {}).  Raises FormatSyntaxError if the last record
    has no terminating '*'.  The stream is closed before returning or raising.
    session is not used here.
    """
    # PIR type codes for nucleic acids (DNA linear/circular, RNA
    # linear/circular, other functional RNA, tRNA).  'P1' and 'F1' are the
    # protein codes, so they must NOT be flagged as nucleic.
    nucleic_types = ("DL", "DC", "RL", "RC", "N1", "N3")
    want = 'init'
    sequences = []
    try:
        for line in f:
            line = line.strip()
            if want == 'init':
                if len(line) < 4 or line[0] != '>' or line[3] != ';':
                    continue
                seq = Sequence(name=make_readable(line[4:]))
                sequences.append(seq)
                pir_type = line[1:3]
                seq.nucleic = pir_type in nucleic_types
                seq.pir_type = pir_type
                want = 'description'
            elif want == 'description':
                sequences[-1].description = line
                sequences[-1].pir_description = line
                want = 'sequence'
            elif want == 'sequence':
                if not line:
                    continue
                if line[-1] == '*':
                    # '*' terminates the record
                    want = 'init'
                    line = line[:-1]
                sequences[-1].extend("".join([c for c in line if not c.isspace()]))
    finally:
        f.close()
    if want != 'init':
        raise FormatSyntaxError("Could not find end of sequence '%s'" %
                                sequences[-1].name)
    return sequences, {}, {}

Example #6

0

Show file

File: comparative.py Project: Yongcheng123/ChimeraX

def find_affixes(chains, chain_info):
    """Return (prefixes, suffixes) for the unmatched ends of each chain.

    chain_info maps a chain to an (aligned sequence, target) pair.  For each
    chain present in chain_info, the prefix/suffix is built from the existing
    residues before the first / after the last residue that is in the chain's
    match map: one-letter codes for standard polymeric residues and '.' for
    anything else.  Chains missing from chain_info get empty strings.

    Side effect: the non-standard residue names found are added to the
    'in_seq_hets' set on the structure of the LAST chain in *chains*.
    NOTE(review): if chains can span several structures, only that one
    structure is updated -- confirm that this is intended.

    An empty *chains* returns ([], []) and touches nothing (previously this
    failed because the loop variable was never bound).
    """
    chains = list(chains)
    if not chains:
        return [], []

    from chimerax.pdb import standard_polymeric_res_names as std_res_names
    from chimerax.atomic import Sequence
    in_seq_hets = []
    prefixes = []
    suffixes = []

    def affix_char(r):
        # '.' marks a non-standard residue, whose name is also recorded
        if r.name not in std_res_names:
            in_seq_hets.append(r.name)
            return '.'
        return Sequence.rname3to1(r.name)

    for chain in chains:
        try:
            aseq, target = chain_info[chain]
        except KeyError:
            prefixes.append('')
            suffixes.append('')
            continue
        match_map = aseq.match_maps[chain]

        leading = []
        for r in chain.existing_residues:
            if r in match_map:
                break
            leading.append(affix_char(r))
        prefixes.append(''.join(leading))

        # walk backwards from the end, then restore sequence order
        trailing = []
        for r in reversed(chain.existing_residues):
            if r in match_map:
                break
            trailing.append(affix_char(r))
        suffixes.append(''.join(reversed(trailing)))

    s = chains[-1].structure
    het_set = getattr(s, 'in_seq_hets', set())
    het_set.update(in_seq_hets)
    s.in_seq_hets = het_set
    return prefixes, suffixes

Example #7

0

Show file

File: cmd.py Project: Yongcheng123/ChimeraX

def seqalign_chain(session, chains):
    '''
    Show chain sequence(s)

    A single chain is shown directly, in an alignment named
    "<structure id>/<chain id>".  Several chains must all have the same
    sequence; they are shown as one sequence with every chain associated.

    Parameters
    ----------
    chains : list of Chain
        Chains to show
    '''

    if len(chains) == 1:
        only = chains[0]
        structure_id = ".".join(str(part) for part in only.structure.id)
        session.alignments.new_alignment([only], structure_id + "/" + only.chain_id,
            seq_viewer="sv", auto_associate=None, intrinsic=True)
        return

    # all chains have to have the same sequence, and they will all be associated with
    # that sequence
    distinct_seqs = {chain.characters for chain in chains}
    if len(distinct_seqs) != 1:
        raise UserError("Chains must have same sequence")
    chars = distinct_seqs.pop()

    distinct_ids = {chain.chain_id for chain in chains}
    if len(distinct_ids) == len(chains) and len(distinct_ids) <= 10:
        seq_name = "chains %s" % ",".join(sorted(distinct_ids))
    else:
        # duplicate chain IDs, or too many to list individually
        seq_name = "%d chains" % len(chains)

    from chimerax.atomic import Sequence
    seq = Sequence(name=seq_name, characters=chars)

    def numbering_offset(chain):
        # number of the first usable residue minus its position in the chain
        for index, residue in enumerate(chain.residues):
            if residue is not None and not residue.deleted:
                return residue.number - index
        return None

    offsets = {numbering_offset(chain) for chain in chains}
    offsets.discard(None)
    # only adopt a numbering start when all chains agree on it
    if len(offsets) == 1:
        seq.numbering_start = offsets.pop()

    alignment = session.alignments.new_alignment([seq], None, seq_viewer="sv",
        auto_associate=False, name=chains[0].description, intrinsic=True)
    alignment.suspend_notify_observers()
    for chain in chains:
        alignment.associate(chain, keep_intrinsic=True)
    alignment.resume_notify_observers()

Example #8

0

Show file

def nw_assoc(session, align_seq, struct_seq):
    '''Wrapper around Needleman-Wunsch matching, to make it return the same kinds of values
       that try_assoc returns

       The structure sequence is matched against the ungapped alignment sequence.
       Returns (match_map, errors): a SeqMatchMap pairing residues of struct_seq
       with ungapped positions of align_seq, and a count of mismatched or
       unmatched structure positions.  session is not used here.'''

    from chimerax.atomic import Sequence, SeqMatchMap
    from chimerax.alignment_algs.NeedlemanWunsch import nw

    ungapped_seq = Sequence(name=align_seq.name, characters=align_seq.ungapped())
    ungapped_seq.circular = align_seq.circular
    _score, pairs = nw(struct_seq, ungapped_seq)

    # pairs are in reverse order, so the first pair holds the highest matched index
    try:
        highest_matched = pairs[0][0]
    except IndexError:
        highest_matched = -1

    struct_len = len(struct_seq)
    # trailing unmatched structure positions
    error_count = max(0, struct_len - highest_matched - 1)

    match_map = SeqMatchMap(align_seq, struct_seq)
    previous = highest_matched + 1
    for struct_index, align_index in pairs:
        if struct_seq[struct_index] != ungapped_seq[align_index]:
            error_count += 1

        # structure positions skipped between consecutive matches
        error_count += max(0, previous - struct_index - 1)

        residue = struct_seq.residues[struct_index]
        if residue:
            match_map.match(residue, align_index)

        previous = struct_index

    # leading unmatched structure positions
    error_count += max(0, previous)

    # unmatched residues forced when the structure sequence is longer,
    # reduce errors by that amount...
    error_count -= max(0, struct_len - len(ungapped_seq))

    return match_map, error_count

Example #9

0

Show file

def fetch_uniprot(session, ident, ignore_cache=False):
	"""Fetch the UniProt entry *ident* and open it as a one-sequence alignment.

	Returns ([], status message) on success and None if the fetch was
	cancelled.  An invalid accession is reported as a UserError.
	"""
	from chimerax.core.errors import UserError, CancelOperation
	log = session.logger
	try:
		accession = map_uniprot_ident(ident)
		info = fetch_uniprot_accession_info(session, accession,
			ignore_cache=ignore_cache)
	except InvalidAccessionError as err:
		raise UserError(str(err))
	except CancelOperation:
		log.status("Fetch of %s cancelled" % ident)
		return
	# full name and features are fetched but not used here
	seq_string, full_name, features = info

	from chimerax.atomic import Sequence
	uniprot_seq = Sequence(name=ident)
	uniprot_seq.extend(seq_string)
	log.status("Opening UniProt %s" % ident)
	session.alignments.new_alignment([uniprot_seq], ident)
	return [], "Opened UniProt %s" % ident

Example #10

0

Show file

File: readALN.py Project: Yongcheng123/ChimeraX

def read(session, f):
    """Read a CLUSTAL (ALN) alignment from the open text stream *f*.

    The first non-blank line must start with 'CLUSTAL'.  After that, lines
    that start with whitespace separate blocks; every other line is
    'name sequence [ungapped-length]'.  The first block defines the
    sequences, and later blocks extend them in the same order.

    Returns (sequences, {}, {}).  Raises FormatSyntaxError on a bad header,
    a malformed sequence line, or a later block with more lines than the
    first.  The stream is closed before returning, including on error.
    session is not used here.
    """
    in_header = True
    sequences = []
    first_block = True
    expect = 0  # index of the sequence the next line of a later block extends
    try:
        for line_num, line in enumerate(f, start=1):
            if in_header:
                if line.startswith("CLUSTAL"):
                    in_header = False
                elif line.strip() != "":
                    raise FormatSyntaxError(
                        "First non-blank line does not start with 'CLUSTAL'")
                continue
            if not line or line[0].isspace():
                # blank or conservation line: ends the current block
                if sequences:
                    first_block = False
                    expect = 0
                continue
            fields = line.split()
            if len(fields) not in (2, 3):
                raise FormatSyntaxError(
                    "Line %d is not sequence name followed by sequence "
                    "contents and optional ungapped length" % line_num)
            # an optional third field (ungapped length) is ignored
            seq_name, seq_block = fields[:2]
            if first_block:
                sequences.append(Sequence(name=make_readable(seq_name)))
                sequences[-1].append(seq_block)
                continue
            try:
                seq = sequences[expect]
            except IndexError:
                raise FormatSyntaxError(
                    "Sequence on line %d not in initial sequence block" % line_num)
            expect += 1
            seq.append(seq_block)
    finally:
        # close even on a syntax error so the stream is not leaked
        f.close()
    return sequences, {}, {}

Example #11

0

Show file

 def _read_sequences(self, f):
     """Fill self.sequence_list from the header lines of the stream *f*.

     Lines are consumed up to and including the '//' separator.  Every
     line matched by MSF._Sum yields a Sequence carrying the matched
     length, check and weight strings in its attrs dictionary.

     Raises FormatSyntaxError if the stream ends before the separator
     or if no header line matched.
     """
     from chimerax.atomic import Sequence
     self.sequence_list = []
     separators = ('//\n', '//\r\n')
     while True:
         line = f.readline()
         if not line:
             # ran off the end of the stream
             raise FormatSyntaxError('no alignment separator')
         if line in separators:
             break
         found = MSF._Sum.match(line)
         if found is None:
             continue
         name, length, check, weight = found.group(1, 2, 3, 4)
         entry = Sequence(name=make_readable(name))
         self.sequence_list.append(entry)
         entry.attrs = {
             'MSF length': length,
             'MSF check': check,
             'MSF weight': weight,
         }
     if not self.sequence_list:
         raise FormatSyntaxError('No sequences found in header')

Example #12

0

Show file

File: readFASTA.py Project: Yongcheng123/ChimeraX

def read(session, f):
    """Read FASTA formatted sequences from the open text stream *f*.

    A line starting with '>' begins a new sequence; following lines are
    appended to it until a blank line or the next '>' line.  Anything
    between a blank line and the next '>' line is ignored.

    Returns (sequences, {}, {}).  Raises FormatSyntaxError if a header has
    no sequence data, including the last header in the file (previously
    only headers followed by another header were checked).
    The stream is not closed here.  session is not used here.
    """
    from chimerax.atomic import Sequence
    from ..parse import FormatSyntaxError, make_readable

    sequences = []

    def require_residues():
        # the most recently started sequence must have received some data
        if sequences and len(sequences[-1]) == 0:
            raise FormatSyntaxError("No sequence found for %s"
                % sequences[-1].name)

    in_sequence = False
    for line in f:
        if line.startswith('>'):
            require_residues()
            in_sequence = True
            sequences.append(Sequence(name=make_readable(line[1:])))
        elif not in_sequence:
            # text outside of any record
            continue
        elif line.isspace():
            # a blank line ends the current record
            in_sequence = False
        else:
            sequences[-1].extend(line.strip())
    require_residues()
    return sequences, {}, {}

Example #13

0

Show file

File: readSTOCKHOLM.py Project: Yongcheng123/ChimeraX

def read(session, f):
    """Read a Stockholm alignment from the open text stream *f*.

    Handles '#=GF' (file attributes), '#=GS' (per-sequence attributes),
    '#=GC' (per-column markup) and '#=GR' (per-residue markup) lines, plain
    '#' comments (collected under the 'comments' file attribute), '//' lines
    and 'name contents' sequence lines.  Repeated sequence names extend the
    same sequence.

    #=GS / #=GR annotations given for a name that is not a sequence name are
    merged into the sequence named '<name>/<something>' when there is one.
    Per-residue markup whose length differs from its sequence is dropped
    with a warning.

    Returns (sequences in file order, file_attrs, file_markups).
    Raises FormatSyntaxError for a missing '# STOCKHOLM' first line, malformed
    markup or sequence lines, annotations for unknown sequences, or column
    markup of the wrong length.  The stream is closed before returning,
    including when an error is raised.
    """
    file_attrs = {}
    file_markups = {}
    seq_attrs = {}
    seq_markups = {}
    sequences = {}
    seq_sequence = []  # sequence names in order of first appearance
    try:
        for line_num, line in enumerate(f, start=1):
            line = line.rstrip()  # drop trailing newline/whitespace
            if line_num == 1:
                if line.startswith("# STOCKHOLM"):
                    continue
                raise FormatSyntaxError("File does not start with '# STOCKHOLM'")
            if not line:
                continue
            if line.startswith('#='):
                markup_type = line[2:4]
                markup = line[5:].strip()

                def try_split(num_split):
                    fields = markup.split(None, num_split)
                    if len(fields) == num_split:
                        # value is empty
                        fields.append("")
                    if len(fields) != num_split + 1:
                        raise FormatSyntaxError(
                            "Not enough arguments after #=%s markup on line %d" %
                            (markup_type, line_num))
                    return fields

                if markup_type == "GF":
                    tag, val = try_split(1)
                    tag = tag.replace("_", " ")
                    tag = generic_file_attrs.get(tag, "Stockholm " + tag)
                    if tag in file_attrs:
                        file_attrs[tag] += '\n' + val
                    else:
                        file_attrs[tag] = val
                elif markup_type == "GS":
                    seq_name, tag, val = try_split(2)
                    tag = tag.replace("_", " ")
                    attrs = seq_attrs.setdefault(seq_name, {})
                    tag = generic_seq_attrs.get(tag, "Stockholm " + tag)
                    if tag in attrs:
                        attrs[tag] += '\n' + val
                    else:
                        attrs[tag] = val
                elif markup_type == "GC":
                    tag, val = try_split(1)
                    tag = tag.replace("_", " ")
                    file_markups[tag] = file_markups.get(tag, "") + val
                elif markup_type == "GR":
                    seq_name, tag, val = try_split(2)
                    tag = tag.replace("_", " ")
                    seq_markups.setdefault(seq_name, {}).setdefault(tag, "")
                    seq_markups[seq_name][tag] += val
                # ignore other types
                continue
            if line.startswith('#'):
                # unstructured comment
                if 'comments' in file_attrs:
                    file_attrs['comments'] += "\n" + line[1:]
                else:
                    file_attrs['comments'] = line[1:]
                continue
            if line.strip() == "//":
                # end of sequence alignment blocks, but comments may follow this, so keep going...
                continue
            # sequence info...
            try:
                seq_name, block = line.split(None, 1)
            except ValueError:
                raise FormatSyntaxError(
                    "Sequence info not in name/contents format on line %d" %
                    line_num)
            if seq_name not in sequences:
                sequences[seq_name] = Sequence(name=make_readable(seq_name))
                seq_sequence.append(seq_name)
            sequences[seq_name].extend(block)
    finally:
        # close even on a syntax error so the stream is not leaked
        f.close()

    # Re-home annotations keyed by a name lacking the trailing '/start-end'.
    # This runs before annotations are attached to the sequences so that the
    # merged entries are attached (and length-checked) too.
    for seq_info, label in [(seq_attrs, "sequence"), (seq_markups, "residue")]:
        # iterate over a snapshot: entries are removed/added inside the loop
        for seq_name in list(seq_info):
            if seq_name in sequences:
                continue
            # might be sequence name if trailing '/start-end' is removed...
            prefix = seq_name + '/'
            for full_name in sequences:
                if full_name.startswith(prefix) \
                and '/' not in full_name[len(prefix):]:
                    break
            else:
                raise FormatSyntaxError(
                    "%s annotations provided for non-existent sequence %s" %
                    (label.capitalize(), seq_name))
            session.logger.info(
                "Updating %s %s annotations with %s annotations" %
                (full_name, label, seq_name))
            # the full name may have no annotations of its own yet
            seq_info.setdefault(full_name, {}).update(seq_info.pop(seq_name))

    for seq_name, seq in sequences.items():
        if seq_name in seq_attrs:
            seq.attrs = seq_attrs[seq_name]
        if seq_name in seq_markups:
            seq.markups = seq_markups[seq_name]
            # snapshot the items: wrong-length markups are deleted in the loop
            for tag, markup in list(seq.markups.items()):
                if len(markup) != len(seq):
                    session.logger.warning(
                        "Markup %s for sequence %s is wrong length; ignoring" %
                        (tag, seq_name))
                    del seq.markups[tag]

    # with no sequences at all, only empty column markup has the right length
    alignment_len = len(sequences[seq_sequence[0]]) if seq_sequence else 0
    for tag, markup in file_markups.items():
        if len(markup) != alignment_len:
            raise FormatSyntaxError("Column annotation %s is wrong length" %
                                    tag)

    return [sequences[name] for name in seq_sequence], file_attrs, file_markups

Example #14

0

Show file

File: readRSF.py Project: Yongcheng123/ChimeraX

def read(session, f):
    """Read an RSF (rich sequence format) alignment from the text stream *f*.

    The first line must start with '!!RICH_SEQUENCE'.  Header lines up to a
    '..' line are collected under the 'comments' file attribute.  Each
    sequence record is a '{' ... '}' block holding attribute lines (stored
    under 'RSF <attribute>' keys, underscores replaced by spaces), optional
    'feature' sections, and a 'sequence' line followed by the sequence
    data.  An 'offset' attribute is applied as leading '.' padding, and all
    sequences are padded with '.' to the length of the longest one.

    Returns (sequences, file_attrs, {}).  Raises FormatSyntaxError for
    malformed input.  Logs a warning through session.logger if no sequence
    had an offset.  The stream is closed before returning, including when
    an error is raised.
    """
    IN_HEADER, START_ATTRS, IN_ATTRS, IN_FEATURES, IN_SEQ = range(5)

    state = IN_HEADER

    sequences = []
    has_offset = False
    longest = None
    file_attrs = {}
    try:
        for line_num, line in enumerate(f, start=1):
            line = line.rstrip()  # remove trailing whitespace/newline
            if line_num == 1:
                if line.startswith("!!RICH_SEQUENCE"):
                    continue
                raise FormatSyntaxError("First line does not start with !!RICH_SEQUENCE")

            if state == IN_HEADER:
                if line.strip() == "..":
                    state = START_ATTRS
                    continue
                if "comments" in file_attrs:
                    file_attrs["comments"] += "\n" + line
                else:
                    file_attrs["comments"] = line
                continue
            if not line.strip():
                continue

            if state == START_ATTRS:
                if line.strip() == "{":
                    state = IN_ATTRS
                    cur_attr = None
                    attrs = {}
                else:
                    # was '"..." & line_num', which fails instead of formatting
                    raise FormatSyntaxError(
                        "Unexpected text before start of sequence on line %d" % line_num)
                continue

            if state == IN_ATTRS or state == IN_FEATURES:
                if line.strip() == "sequence" and line[0] == "s":
                    if "RSF name" not in attrs:
                        raise FormatSyntaxError("Sequence on line %d has no name" % line_num)
                    state = IN_SEQ
                    seq = Sequence(name=make_readable(attrs["RSF name"]))
                    del attrs["RSF name"]
                    seq.attrs = attrs
                    if "RSF descrip" in attrs:
                        attrs["description"] = attrs["RSF descrip"]
                        del attrs["RSF descrip"]
                    sequences.append(seq)
                    if "RSF offset" in attrs:
                        seq.extend("." * int(attrs["RSF offset"]))
                        has_offset = True
                        del attrs["RSF offset"]
                    continue
                if line.startswith("feature"):
                    if state == IN_ATTRS:
                        attrs["RSF features"] = [[line[8:]]]
                    else:
                        attrs["RSF features"].append([line[8:]])
                    state = IN_FEATURES
                    continue

            if state == IN_ATTRS:
                if line[0].isspace():
                    # continuation
                    if not cur_attr:
                        raise FormatSyntaxError("Bogus indentation at line %d" % line_num)
                    if attrs[cur_attr]:
                        attrs[cur_attr] += "\n" + line
                    else:
                        attrs[cur_attr] = line
                    continue
                if " " in line.strip():
                    cur_attr, val = line.split(None, 1)
                    # str.replace returns a new string; the result used to be
                    # discarded, unlike in the value-less branch below
                    cur_attr = "RSF " + cur_attr.replace("_", " ")
                    attrs[cur_attr] = val.strip()
                else:
                    cur_attr = "RSF " + line.strip().replace("_", " ")
                    attrs[cur_attr] = ""
                continue

            if state == IN_FEATURES:
                attrs["RSF features"][-1].append(line)
                continue
            if line.strip() == "}":
                state = START_ATTRS
                if not longest:
                    longest = len(seq)
                else:
                    if len(seq) < longest:
                        seq.extend("." * (longest - len(seq)))
                    elif len(seq) > longest:
                        # new longest sequence: pad all the earlier ones
                        longest = len(seq)
                        for s in sequences[:-1]:
                            s.extend("." * (longest - len(s)))
                continue
            seq.extend(line.strip())
            if not seq[0].isalpha():
                has_offset = True
    finally:
        # close even on a syntax error so the stream is not leaked
        f.close()

    if state == IN_HEADER:
        raise FormatSyntaxError("No end to header (i.e. '..' line) found")
    if state == IN_ATTRS or state == IN_FEATURES:
        # the record may have ended before a name attribute was seen
        raise FormatSyntaxError("No sequence data found for sequence %s"
            % attrs.get("RSF name", "(unnamed)"))
    if state == IN_SEQ:
        # "RSF name" was removed from attrs when the Sequence was created
        raise FormatSyntaxError("No terminating brace for sequence %s" % seq.name)
    if not has_offset:
        session.logger.warning("No offset fields in RSF file; assuming zero offset")
    return sequences, file_attrs, {}

Example #15

0

Show file

File: tool.py Project: Yongcheng123/ChimeraX

    def _update_errors_gaps(self, aseq):
        """Rebuild the error and gap regions for the aligned sequence *aseq*.

        For every ungapped position of the reference sequence, counts how many
        associated chains have a mismatching residue there ("errors") and how
        many have no matched residue there ("gaps").  Runs of positions where
        all chains are affected become "full" blocks; runs where only some are
        affected become "partial" blocks.  Existing regions with the same
        names are deleted, and new ones are created for the block lists that
        are not empty.

        Does nothing if neither the error nor the gap region is shown.
        """
        if not self.settings.error_region_shown and not self.settings.gap_region_shown:
            return
        # use aseq.residue_sequence when the attribute exists, else the ungapped sequence
        a_ref_seq = getattr(aseq, 'residue_sequence', aseq.ungapped())
        # per-position counts of chains with a mismatch / with no match
        errors = [0] * len(a_ref_seq)
        gaps = [0] * len(a_ref_seq)
        from chimerax.atomic import Sequence
        for chain, match_map in aseq.match_maps.items():
            for i, char in enumerate(a_ref_seq):
                try:
                    res = match_map[i]
                except KeyError:
                    # no residue matched to this position
                    gaps[i] += 1
                else:
                    if Sequence.rname3to1(res.name) != char.upper():
                        errors[i] += 1
        partial_error_blocks, full_error_blocks = [], []
        partial_gap_blocks, full_gap_blocks = [], []
        num_assocs = len(aseq.match_maps)
        if num_assocs > 0:
            for partial, full, check in [
                (partial_error_blocks, full_error_blocks, errors),
                (partial_gap_blocks, full_gap_blocks, gaps)
            ]:
                # Blocks are [aseq, aseq, first gapped index, last gapped index]
                # (presumably start/end sequence and start/end column -- confirm
                # against the region browser).  An open block is extended in
                # place by updating its last element.
                cur_partial_block = cur_full_block = None
                for i, check_num in enumerate(check):
                    gapped_i = aseq.ungapped_to_gapped(i)
                    if check_num == num_assocs:
                        # every associated chain is affected at this position
                        if cur_full_block:
                            cur_full_block[-1] = gapped_i
                        else:
                            cur_full_block = [aseq, aseq, gapped_i, gapped_i]
                            full.append(cur_full_block)
                        if cur_partial_block:
                            cur_partial_block = None
                    else:
                        if cur_full_block:
                            cur_full_block = None
                        if check_num > 0:
                            # some, but not all, chains are affected
                            if cur_partial_block:
                                cur_partial_block[-1] = gapped_i
                            else:
                                cur_partial_block = [
                                    aseq, aseq, gapped_i, gapped_i
                                ]
                                partial.append(cur_partial_block)
                        elif cur_partial_block:
                            cur_partial_block = None

        # fills/outlines are each unpacked below as a (full, partial) pair
        for shown, region_name_part, partial_blocks, full_blocks, fills, outlines in [
            (self.settings.error_region_shown, self.ERROR_REGION_STRING,
             partial_error_blocks, full_error_blocks,
             self.settings.error_region_interiors,
             self.settings.error_region_borders),
            (self.settings.gap_region_shown, self.GAP_REGION_STRING,
             partial_gap_blocks, full_gap_blocks,
             self.settings.gap_region_interiors,
             self.settings.gap_region_borders)
        ]:
            if not shown:
                continue
            full_fill, partial_fill = fills
            full_outline, partial_outline = outlines
            for region_name_start, blocks, fill, outline in [
                (region_name_part, full_blocks, full_fill, full_outline),
                ("partial " + region_name_part, partial_blocks, partial_fill,
                 partial_outline)
            ]:
                region_name = "%s of %s" % (region_name_start, aseq.name)
                # replace any previously created region of the same name
                old_reg = self.region_browser.get_region(region_name,
                                                         create=False)
                if old_reg:
                    self.region_browser.delete_region(old_reg)
                if blocks:
                    self.region_browser.new_region(region_name,
                                                   blocks=blocks,
                                                   fill=fill,
                                                   outline=outline,
                                                   sequence=aseq,
                                                   cover_gaps=False)

Example #16

0

Show file

File: comparative.py Project: Yongcheng123/ChimeraX

def model(session,
          targets,
          *,
          block=True,
          multichain=True,
          custom_script=None,
          dist_restraints=None,
          executable_location=None,
          fast=False,
          het_preserve=False,
          hydrogens=False,
          license_key=None,
          num_models=5,
          show_gui=True,
          temp_path=None,
          thorough_opt=False,
          water_preserve=False):
    """
    Generate comparative models for the target sequences.

    Arguments:
    session
        current session
    targets
        list of (alignment, sequence) tuples.  Each sequence will be modelled.
    block
        If True, wait for modelling job to finish before returning and return list of
        (opened) models.  Otherwise return immediately.  Also see 'show_gui' option.
    multichain
        If True, the associated chains of each structure are used individually to generate
        chains in the resulting models (i.e. the models will be multimers).  If False, all
        associated chains are used together as templates to generate a single-chain model
        for the target sequence.
    custom_script
        If provided, the location of a custom Modeller script to use instead of the
        one we would otherwise generate.  Only used when executing locally.
    dist_restraints
        If provided, the location of a file containing additional distance restraints
    executable_location
        If provided, the path to the locally installed Modeller executable.  If not
        provided, use the web service.
    fast
        Whether to use fast but crude generation of models
    het_preserve
        Whether to preserve HET atoms in generated models
    hydrogens
        Whether to generate models with hydrogen atoms
    license_key
        Modeller license key.  If not provided, try to use settings to find one.
    num_models
        Number of models to generate for each template sequence
    show_gui
        If True, show user interface for Modeller results (if ChimeraX is in gui mode).
    temp_path
        If provided, folder to use for temporary files
    thorough_opt
        Whether to perform thorough optimization
    water_preserve
        Whether to preserve water in generated models

    Returns:
        Whatever the job runner's run(block=block) call returns.

    Raises:
    UserError
        If (in multichain mode) an alignment has no associated chains.
    LimitationError
        If a template chain has a multi-character chain ID; if multiple targets are
        given when 'multichain' is False; if 'custom_script' or 'dist_restraints' is
        given without 'executable_location'; or if 'executable_location' is given
        (local execution is not yet implemented).
    """

    from chimerax.core.errors import LimitationError, UserError
    from .common import modeller_copy
    if multichain:
        # So, first find structure with most associated chains and least non-associated chains.
        # That structure is used as the multimer template.  Chains from other structures are used
        # as "standalone" templates -- each such chain will be on its own line.  Need to allow
        # space on the left and right of the target sequence so that the largest chains can be
        # accommodated.

        # Find the structure we will use as the multimer template
        by_structure = {}
        chain_info = {}
        for alignment, orig_target in targets:
            # Copy the target sequence, changing name to conform to Modeller limitations
            target = modeller_copy(orig_target)
            if not alignment.associations:
                raise UserError("Alignment %s has no associated chains" %
                                alignment.ident)
            for chain, aseq in alignment.associations.items():
                if len(chain.chain_id) > 1:
                    raise LimitationError(
                        "Modeller cannot handle templates with multi-character chain IDs"
                    )
                by_structure.setdefault(chain.structure, []).append(chain)
                chain_info[chain] = (aseq, target)
        max_matched = min_unmatched = None
        for s, match_info in by_structure.items():
            matched = len(match_info)
            unmatched = s.num_chains - len(match_info)
            if max_matched is None or matched > max_matched or (
                    matched == max_matched and (unmatched < min_unmatched)):
                multimer_template = s
                max_matched = matched
                min_unmatched = unmatched
        mm_targets = []
        mm_chains = []
        match_chains = []
        # NOTE: the loops below deliberately avoid rebinding 'target', since it is
        # needed afterward to name the PIR target when there is only one target.
        for chain in multimer_template.chains:
            mm_chains.append(chain)
            try:
                aseq, chain_target = chain_info[chain]
            except KeyError:
                mm_targets.append(None)
            else:
                mm_targets.append(chain_target)
                match_chains.append(chain)
        # okay, now form single-chain lines for the other structure associations, that eventually will
        # be handled column by column in exactly the same way as the non-multichain method.
        single_template_lines = []
        for chain, info in chain_info.items():
            if chain.structure == multimer_template:
                continue
            aseq, chain_target = info
            for i, mm_target in enumerate(mm_targets):
                if mm_target != chain_target:
                    continue
                template_line = [None] * len(mm_targets)
                template_line[i] = chain
                single_template_lines.append(template_line)
        # AFAIK, the multimer template chain sequences need to have complete PDB sequence, so may need
        # to prefix and suffix the corresponding alignment sequence with characters for residues
        # outside of the alignment sequence.  For other templates/targets, affix a corresponding number
        # of '-' characters
        prefixes, suffixes = find_affixes(mm_chains, chain_info)
        target_strings = []
        for prefix, suffix, mm_target in zip(prefixes, suffixes, mm_targets):
            if mm_target is None:
                target_strings.append('-')
                continue
            target_strings.append('-' * len(prefix) + mm_target.characters +
                                  '-' * len(suffix))
        templates_strings = []
        templates_info = []
        mm_template_strings = []
        for prefix, suffix, chain in zip(prefixes, suffixes, mm_chains):
            try:
                aseq = chain_info[chain][0]
            except KeyError:
                mm_template_strings.append('-')
                continue
            mm_template_strings.append(
                prefix + regularized_seq(aseq, chain).characters + suffix)
        templates_strings.append(mm_template_strings)
        # an info of None marks the multimer template line
        templates_info.append(None)
        for template_line in single_template_lines:
            template_strings = []
            for prefix, suffix, chain, mm_target in zip(prefixes, suffixes,
                                                        template_line,
                                                        mm_targets):
                if mm_target is None:
                    template_strings.append('-')
                elif chain is None:
                    template_strings.append(
                        '-' * (len(prefix) + len(mm_target) + len(suffix)))
                else:
                    aseq = chain_info[chain][0]
                    template_strings.append(
                        '-' * len(prefix) +
                        regularized_seq(aseq, chain).characters +
                        '-' * len(suffix))
                    templates_info.append((chain, aseq.match_maps[chain]))
            templates_strings.append(template_strings)
        target_name = "target" if len(targets) > 1 else target.name
    else:
        if len(targets) > 1:
            raise LimitationError(
                "Cannot have multiple targets(/alignments) unless creating multimeric model"
            )
        alignment, orig_target = targets[0]
        # Copy the target sequence, changing name to conform to Modeller limitations
        target = modeller_copy(orig_target)
        target_strings = [target.characters]

        templates_strings = []
        templates_info = []
        match_chains = []
        for chain, aseq in alignment.associations.items():
            if len(chain.chain_id) > 1:
                raise LimitationError(
                    "Modeller cannot handle templates with multi-character chain IDs"
                )
            templates_strings.append([regularized_seq(aseq, chain).characters])
            templates_info.append((chain, aseq.match_maps[chain]))
            # only the first associated chain is recorded as a match chain
            if not match_chains:
                match_chains.append(chain)

        target_name = target.name

    from .common import write_modeller_scripts, get_license_key
    script_path, config_path, temp_dir = write_modeller_scripts(
        get_license_key(session, license_key), num_models, het_preserve,
        water_preserve, hydrogens, fast, None, custom_script, temp_path,
        thorough_opt, dist_restraints)

    input_file_map = []

    # form the sequences to be written out as a PIR
    from chimerax.atomic import Sequence
    pir_target = Sequence(name=target_name)
    pir_target.description = "sequence:%s:.:.:.:.::::" % pir_target.name
    pir_target.characters = '/'.join(target_strings)
    pir_seqs = [pir_target]

    structures_to_save = set()
    for strings, info in zip(templates_strings, templates_info):
        if info is None:
            # multimer template
            pir_template = Sequence(
                name=structure_save_name(multimer_template))
            pir_template.description = "structure:%s:FIRST:%s::::::" % (
                pir_template.name, multimer_template.chains[0].chain_id)
            structures_to_save.add(multimer_template)
        else:
            # single-chain template
            chain, match_map = info
            # find the first sequence position that has an associated residue
            first_assoc_pos = 0
            while first_assoc_pos not in match_map:
                first_assoc_pos += 1
            first_assoc_res = match_map[first_assoc_pos]
            pir_template = Sequence(name=chain_save_name(chain))
            pir_template.description = "structure:%s:%d%s:%s:+%d:%s::::" % (
                structure_save_name(chain.structure), first_assoc_res.number,
                first_assoc_res.insertion_code, chain.chain_id, len(match_map),
                chain.chain_id)
            structures_to_save.add(chain.structure)
        pir_template.characters = '/'.join(strings)
        pir_seqs.append(pir_template)
    import os
    pir_file = os.path.join(temp_dir.name, "alignment.ali")
    # create a temporary alignment solely to get the sequences saved in PIR format
    aln = session.alignments.new_alignment(pir_seqs,
                                           False,
                                           auto_associate=False,
                                           create_headers=False)
    aln.save(pir_file, format_name="pir")
    session.alignments.destroy_alignment(aln)
    input_file_map.append(("alignment.ali", "text_file", pir_file))

    # write the namelist.dat file, target seq name on first line, templates on remaining lines
    name_file = os.path.join(temp_dir.name, "namelist.dat")
    input_file_map.append(("namelist.dat", "text_file", name_file))
    with open(name_file, 'w') as f:
        for template_seq in pir_seqs:
            print(template_seq.name, file=f)

    config_name = os.path.basename(config_path)
    input_file_map.append((config_name, "text_file", config_path))

    # save structure files
    struct_dir = os.path.join(temp_dir.name, "template_struc")
    if not os.path.exists(struct_dir):
        try:
            os.mkdir(struct_dir, mode=0o755)
        except FileExistsError:
            pass
    from chimerax.pdb import save_pdb, standard_polymeric_res_names as std_res_names
    for structure in structures_to_save:
        base_name = structure_save_name(structure) + '.pdb'
        pdb_file_name = os.path.join(struct_dir, base_name)
        input_file_map.append((base_name, "text_file", pdb_file_name))
        # NOTE(review): 'in_seq_hets' is not set in this function; presumably
        # regularized_seq() attaches it to the structure -- confirm.  It is updated
        # in place here and then removed from the structure once the file is saved.
        ATOM_res_names = structure.in_seq_hets
        ATOM_res_names.update(std_res_names)
        save_pdb(session,
                 pdb_file_name,
                 models=[structure],
                 polymeric_res_names=ATOM_res_names)
        delattr(structure, 'in_seq_hets')

    from chimerax.atomic import Chains
    match_chains = Chains(match_chains)
    if executable_location is None:
        if custom_script is not None:
            raise LimitationError(
                "Custom Modeller scripts only supported when executing locally"
            )
        if dist_restraints is not None:
            raise LimitationError(
                "Distance restraints only supported when executing locally")
        if thorough_opt:
            session.logger.warning(
                "Thorough optimization only supported when executing locally")
        job_runner = ModellerWebService(session, match_chains, num_models,
                                        pir_target.name, input_file_map,
                                        config_name, targets, show_gui)
    else:
        # TODO: job_runner = ModellerLocal(...)
        # When local execution is implemented, a custom script (only used when executing
        # locally) will need to be copied into temp_dir.name if script_path exists and
        # is not already located in that folder.
        raise LimitationError("Local Modeller execution not yet implemented")

    return job_runner.run(block=block)

Example #17

0

Show file

def _prep_add(session,
              structures,
              unknowns_info,
              template,
              need_all=False,
              **prot_schemes):
    """Gather the per-atom information needed before hydrogens are added.

    Arguments:
    session
        current session; its logger receives the informational/warning messages
    structures
        the structures that will be protonated
    unknowns_info
        mapping from atom to the type info to use for atoms whose IDATM type is
        not a key of 'type_info'
    template
        If true, residues lacking a built-in template get their atoms' IDATM types
        copied from a temporary residue built from the mmCIF template (if found).
    need_all
        If True, also record type info for atoms of all other AtomicStructure
        models in the session.
    prot_schemes
        optional 'his_scheme', 'asp_scheme', 'glu_scheme', 'lys_scheme' and
        'cys_scheme' keywords, each a mapping from residue to residue name.
        Entries rejected by the corresponding check function are removed from the
        provided mapping.  If a scheme is missing (or empty) it is derived from
        residue names instead.

    Returns:
        Tuple of (atoms, type_info_for_atom, naming_schemas, idatm_type,
        hydrogen_totals, his_Ns, coordinations, fake_N, fake_C).
    """
    global _serial
    _serial = None
    atoms = []
    type_info_for_atom = {}
    naming_schemas = {}
    idatm_type = {}  # need this later; don't want a recomp
    hydrogen_totals = {}

    # add missing OXTs of "real" C termini;
    # delete hydrogens of "fake" N termini after protonation
    # and add a single "HN" back on, using same dihedral as preceding residue;
    # delete extra hydrogen of "fake" C termini after protonation
    logger = session.logger
    real_N, real_C, fake_N, fake_C = determine_termini(session, structures)
    logger.info("Chain-initial residues that are actual N"
                " termini: %s" % ", ".join([str(r) for r in real_N]))
    logger.info("Chain-initial residues that are not actual N"
                " termini: %s" % ", ".join([str(r) for r in fake_N]))
    logger.info("Chain-final residues that are actual C"
                " termini: %s" % ", ".join([str(r) for r in real_C]))
    logger.info("Chain-final residues that are not actual C"
                " termini: %s" % ", ".join([str(r) for r in fake_C]))
    for rc in real_C:
        complete_terminal_carboxylate(session, rc)

    # ensure that N termini are protonated as N3+ (since Npl will fail)
    from chimerax.atomic import Sequence
    for nter in real_N + fake_N:
        n = nter.find_atom("N")
        if not n:
            continue
        # if residue wasn't templated, leave atom typing alone
        if Sequence.protein3to1(n.residue.name) == 'X':
            continue
        if not (n.residue.name == "PRO" and n.num_bonds >= 2):
            n.idatm_type = "N3+"

    # map each non-metal atom in a metal-coordination pseudobond to the list of
    # atoms at the other end of those pseudobonds
    coordinations = {}
    for struct in structures:
        pbg = struct.pseudobond_group(struct.PBG_METAL_COORDINATION,
                                      create_type=None)
        if not pbg:
            continue
        for pb in pbg.pseudobonds:
            for a in pb.atoms:
                if not need_all and a.structure not in structures:
                    continue
                if not a.element.is_metal:
                    coordinations.setdefault(a, []).append(pb.other_atom(a))

    remaining_unknowns = {}
    type_info_class = type_info['H'].__class__
    from chimerax.atomic import Residue
    for struct in structures:
        # remove atoms whose element number is zero
        for atom in struct.atoms:
            if atom.element.number == 0:
                struct.delete_atom(atom)
        if template:
            template_lookup = {}
            from chimerax.atomic import TmplResidue
            get_template = TmplResidue.get_template
            for res in struct.residues:
                if get_template(res.name):
                    continue
                try:
                    exemplar = template_lookup[res.name]
                except KeyError:
                    from chimerax.mmcif import find_template_residue
                    tmpl = find_template_residue(session, res.name)
                    if not tmpl:
                        continue
                    # build a throwaway structure holding the template's heavy atoms
                    # so that IDATM types can be obtained for them
                    from chimerax.atomic import AtomicStructure
                    s = AtomicStructure(session)
                    r = exemplar = template_lookup[res.name] = s.new_residue(
                        res.name, 'A', 1)
                    atom_map = {}
                    for ta in tmpl.atoms:
                        if ta.element.number > 1:
                            a = s.new_atom(ta.name, ta.element)
                            a.coord = ta.coord
                            r.add_atom(a)
                            atom_map[ta] = a
                            for tnb in ta.neighbors:
                                if tnb in atom_map:
                                    s.new_bond(a, atom_map[tnb])
                for a in res.atoms:
                    ea = exemplar.find_atom(a.name)
                    if ea:
                        a.idatm_type = ea.idatm_type
            for r in template_lookup.values():
                r.structure.delete()
            template_lookup.clear()

        for atom in struct.atoms:
            atom_type = atom.idatm_type
            idatm_type[atom] = atom_type
            if atom_type in type_info:
                # don't want to ask for idatm_type in middle
                # of hydrogen-adding loop (since that will
                # force a recomp), so remember here
                type_info_for_atom[atom] = type_info[atom_type]
                # if atom is in standard residue but has missing bonds to
                # heavy atoms, skip it instead of incorrectly protonating
                # (or possibly throwing an error if e.g. it's planar)
                # also
                # UNK/N residues will be missing some or all of their side-chain atoms, so
                # skip atoms that would otherwise be incorrectly protonated due to their
                # missing neighbors
                truncated = (
                    atom.is_missing_heavy_template_neighbors(no_template_okay=True)
                    or (atom.residue.name in ["UNK", "N"]
                        and atom.residue.polymer_type != Residue.PT_NONE
                        and unk_atom_truncated(atom))
                    or (atom.residue.polymer_type == Residue.PT_NUCLEIC
                        and atom.name == "P"
                        and atom.num_explicit_bonds < 4))

                if truncated:
                    session.logger.warning(
                        "Not adding hydrogens to %s because it is missing heavy-atom"
                        " bond partners" % atom)
                    # substituent count equal to current bond count => nothing to add
                    type_info_for_atom[atom] = type_info_class(
                        4, atom.num_bonds, atom.name)
                else:
                    atoms.append(atom)
                # sulfonamide nitrogens coordinating a metal
                # get an additional hydrogen stripped
                if coordinations.get(atom, []) and atom.element.name == "N":
                    if "Son" in [nb.idatm_type for nb in atom.neighbors]:
                        orig_ti = type_info[atom_type]
                        type_info_for_atom[atom] = orig_ti.__class__(
                            orig_ti.geometry, orig_ti.substituents - 1,
                            orig_ti.description)
                continue
            if atom in unknowns_info:
                type_info_for_atom[atom] = unknowns_info[atom]
                atoms.append(atom)
                continue
            remaining_unknowns.setdefault(atom.residue.name,
                                          set()).add(atom.name)
            # leave remaining unknown atoms alone
            type_info_for_atom[atom] = type_info_class(4, atom.num_bonds,
                                                       atom.name)

        for rname, atom_names in remaining_unknowns.items():
            names_text = ", ".join(atom_names)
            if len(atom_names) > 1:
                atom_text, obj_text = "atoms", "them"
            else:
                atom_text, obj_text = "atom", "it"
            logger.warning(
                "Unknown hybridization for %s (%s) of residue type %s;"
                " not adding hydrogens to %s" %
                (atom_text, names_text, rname, obj_text))
        naming_schemas.update(
            determine_naming_schemas(struct, type_info_for_atom))

    if need_all:
        from chimerax.atomic import AtomicStructure
        for struct in [
                m for m in session.models if isinstance(m, AtomicStructure)
        ]:
            if struct in structures:
                continue
            for atom in struct.atoms:
                idatm_type[atom] = atom.idatm_type
                if atom.idatm_type in type_info:
                    type_info_for_atom[atom] = type_info[atom.idatm_type]

    for atom in atoms:
        if atom not in type_info_for_atom:
            continue
        bonding_info = type_info_for_atom[atom]
        # open substituent positions, plus hydrogens that are already bonded
        total_hydrogens = bonding_info.substituents - atom.num_bonds
        for bonded in atom.neighbors:
            if bonded.element.number == 1:
                total_hydrogens += 1
        hydrogen_totals[atom] = total_hydrogens

    schemes = {}
    # HIS and CYS treated as 'unspecified'; use built-in typing
    for scheme_type, res_names, res_check, typed_atoms in [
        ('his', ["HID", "HIE", "HIP"], None, []),
        ('asp', asp_res_names, _asp_check, asp_prot_names),
        ('glu', glu_res_names, _glu_check, glu_prot_names),
        ('lys', ["LYS", "LYN"], _lys_check, ["NZ"]),
        ('cys', ["CYM"], _cys_check, ["SG"])
    ]:
        scheme = prot_schemes.get(scheme_type + '_scheme', None)
        if scheme is None:
            by_name = True
            scheme = {}
        else:
            by_name = False
        if not scheme:
            for s in structures:
                for r in s.residues:
                    if r.name in res_names and res_check and res_check(r):
                        if by_name:
                            scheme[r] = r.name
                        elif scheme_type != 'his':
                            scheme[r] = res_names[0]
                        # unset any explicit typing...
                        for ta in typed_atoms:
                            a = r.find_atom(ta)
                            if a:
                                a.idatm_type = None
        else:
            # iterate over a snapshot of the keys, since deleting from a dictionary
            # while iterating over it raises RuntimeError
            for r in list(scheme):
                if res_check and not res_check(r, scheme[r]):
                    del scheme[r]
        schemes[scheme_type] = scheme
    # create dictionary keyed on histidine residue with value of another
    # dictionary keyed on the nitrogen atoms with boolean values: True
    # equals should be protonated
    his_Ns = {}
    for r, protonation in schemes["his"].items():
        delta = r.find_atom("ND1")
        epsilon = r.find_atom("NE2")
        if delta is None or epsilon is None:
            # find the ring, etc.
            rings = r.structure.rings()
            for ring in rings:
                # test this particular ring (not the whole collection), so that
                # the ring chosen is one that actually involves the residue
                if r in ring.atoms.residues:
                    break
            else:
                continue
            # find CG by locating CB-CG bond
            ring_bonds = ring.bonds
            for ra in ring.atoms:
                if ra.element.name != "C":
                    continue
                for ba, b in zip(ra.neighbors, ra.bonds):
                    if ba.element.name == "C" and b not in ring_bonds:
                        break
                else:
                    continue
                break
            else:
                continue
            nitrogens = [a for a in ring.atoms if a.element.name == "N"]
            if len(nitrogens) != 2:
                continue
            # the nitrogen bonded to CG is the delta nitrogen
            if ra in nitrogens[0].neighbors:
                delta, epsilon = nitrogens
            else:
                epsilon, delta = nitrogens
        if protonation == "HID":
            his_Ns.update({delta: True, epsilon: False})
        elif protonation == "HIE":
            his_Ns.update({delta: False, epsilon: True})
        elif protonation == "HIP":
            his_Ns.update({delta: True, epsilon: True})
    for n, do_prot in his_Ns.items():
        if do_prot:
            type_info_for_atom[n] = type_info["Npl"]
            n.idatm_type = idatm_type[n] = "Npl"
        else:
            type_info_for_atom[n] = type_info["N2"]
            n.idatm_type = idatm_type[n] = "N2"

    for r, protonation in schemes["asp"].items():
        _handle_acid_protonation_scheme_item(r, protonation, asp_res_names,
                                             asp_prot_names, type_info,
                                             type_info_for_atom)

    for r, protonation in schemes["glu"].items():
        _handle_acid_protonation_scheme_item(r, protonation, glu_res_names,
                                             glu_prot_names, type_info,
                                             type_info_for_atom)

    for r, protonation in schemes["lys"].items():
        nz = r.find_atom("NZ")
        if protonation == "LYS":
            it = 'N3+'
        else:
            it = 'N3'
        ti = type_info[it]
        if nz is not None:
            type_info_for_atom[nz] = ti
            # avoid explicitly setting type if possible
            if nz.idatm_type != it:
                nz.idatm_type = it

    for r, protonation in schemes["cys"].items():
        sg = r.find_atom("SG")
        if protonation == "CYS":
            it = 'S3'
        else:
            it = 'S3-'
        ti = type_info[it]
        if sg is not None:
            type_info_for_atom[sg] = ti
            # avoid explicitly setting type if possible
            if sg.idatm_type != it:
                sg.idatm_type = it

    return atoms, type_info_for_atom, naming_schemas, idatm_type, \
            hydrogen_totals, his_Ns, coordinations, fake_N, fake_C

Example #18

0

Show file

File: match.py Project: Yongcheng123/ChimeraX

def align(session,
          ref,
          match,
          matrix_name,
          algorithm,
          gap_open,
          gap_extend,
          dssp_cache,
          ss_matrix=defaults["ss_scores"],
          ss_fraction=defaults["ss_mixture"],
          gap_open_helix=defaults["helix_open"],
          gap_open_strand=defaults["strand_open"],
          gap_open_other=defaults["other_open"],
          compute_ss=defaults["compute_ss"]):
    """Align the sequences of two chains, optionally using secondary structure.

    Arguments:
    session
        current session
    ref, match
        the reference and match structure sequences to align
    matrix_name
        name of the similarity matrix, looked up via chimerax.sim_matrices
    algorithm
        "nw" (Needleman-Wunsch) or "sw" (Smith-Waterman)
    gap_open, gap_extend
        gap penalties, given as positive values
    dssp_cache
        dictionary keyed on structure.  Before secondary structure is recomputed
        for a structure, that structure's current (ss_ids, ss_types) are stored here.
    ss_matrix, ss_fraction
        secondary-structure scoring matrix and mixture fraction.  A fraction of
        None or False means secondary structure is not computed here and, for
        "sw", is not used in scoring.
    gap_open_helix, gap_open_strand, gap_open_other
        secondary-structure-specific gap-opening penalties
    compute_ss
        whether to (re)compute secondary structure before aligning

    Returns:
        (score, gapped_ref, gapped_match)

    Raises:
    ValueError
        If 'algorithm' is unknown, or if the Smith-Waterman result cannot be
        located within the original sequence.
    """
    from chimerax import sim_matrices
    similarity_matrix = sim_matrices.matrix(matrix_name, session.logger)
    ssf = ss_fraction
    ssm = ss_matrix
    if ssf is not None and ssf is not False and compute_ss:
        need_compute = []
        if ref.structure not in dssp_cache:
            for r in ref.residues:
                if r and len(r.atoms) > 1:
                    # not CA only
                    need_compute.append(ref.structure)
                    dssp_cache[ref.structure] = (
                        ref.structure.residues.ss_ids,
                        ref.structure.residues.ss_types)
                    break
        if match.structure not in dssp_cache:
            for r in match.residues:
                if r and len(r.atoms) > 1:
                    # not CA only
                    need_compute.append(match.structure)
                    dssp_cache[match.structure] = (
                        match.structure.residues.ss_ids,
                        match.structure.residues.ss_types)
                    break
        if need_compute:
            # TODO:
            # from chimera.initprefs import ksdsspPrefs, \
            #         KSDSSP_ENERGY, KSDSSP_HELIX_LENGTH, \
            #         KSDSSP_STRAND_LENGTH
            from chimerax.std_commands import dssp
            dssp.compute_ss(session, need_compute)
    if algorithm == "nw":
        from chimerax.alignment_algs import NeedlemanWunsch
        # penalties are supplied as positive numbers, but passed on negated
        score, seqs = NeedlemanWunsch.nw(ref,
                                         match,
                                         score_gap=-gap_extend,
                                         score_gap_open=0 - gap_open,
                                         similarity_matrix=similarity_matrix,
                                         return_seqs=True,
                                         ss_matrix=ss_matrix,
                                         ss_fraction=ss_fraction,
                                         gap_open_helix=-gap_open_helix,
                                         gap_open_strand=-gap_open_strand,
                                         gap_open_other=-gap_open_other)
        gapped_ref, gapped_match = seqs
    elif algorithm == "sw":

        def ss_let(r):
            # secondary-structure letter for a residue; blank if no residue
            if not r:
                return ' '
            if r.is_helix:
                return 'H'
            elif r.is_strand:
                return 'S'
            return 'O'

        if ssf is False or ssf is None:
            ssf = 0.0
            ssm = None
        if ssm:
            # account for missing structure (blank SS letter)
            # (work on a copy, so that the caller's/default matrix isn't modified)
            ssm = ssm.copy()
            for let in "HSO ":
                ssm[(let, ' ')] = 0.0
                ssm[(' ', let)] = 0.0
        from chimerax.alignment_algs import SmithWaterman
        score, alignment = SmithWaterman.align(
            ref.characters,
            match.characters,
            similarity_matrix,
            float(gap_open),
            float(gap_extend),
            gap_char=".",
            ss_matrix=ssm,
            ss_fraction=ssf,
            gap_open_helix=float(gap_open_helix),
            gap_open_strand=float(gap_open_strand),
            gap_open_other=float(gap_open_other),
            ss1="".join([ss_let(r) for r in ref.residues]),
            ss2="".join([ss_let(r) for r in match.residues]))
        from chimerax.atomic import StructureSeq, Sequence
        gapped_ref = StructureSeq(structure=ref.structure,
                                  chain_id=ref.chain_id)
        gapped_ref.name = ref.structure.name
        gapped_match = StructureSeq(structure=match.structure,
                                    chain_id=match.chain_id)
        gapped_match.name = match.structure.name
        # Smith-Waterman may not be entirety of sequences...
        for orig, gapped, sw in [
            (ref, gapped_ref, Sequence(characters=alignment[0])),
            (match, gapped_match, Sequence(characters=alignment[1]))
        ]:
            ungapped = sw.ungapped()
            # locate the aligned segment within the original sequence
            for i in range(len(orig) - len(ungapped) + 1):
                if ungapped == orig[i:i + len(ungapped)]:
                    break
            else:
                raise ValueError("Smith-Waterman result not"
                                 " a subsequence of original sequence")
            gapped.bulk_set(orig.residues[i:i + len(ungapped)], sw.characters)
    else:
        raise ValueError("Unknown sequence alignment algorithm: %s" %
                         algorithm)

    # If the structures are disjoint snippets of the same longer SEQRES,
    # they may be able to be structurally aligned but the SEQRES records
    # will keep them apart.  Try to detect this situation and work around
    # by snipping off sequence ends.
    sr_disjoint = False
    if ref.from_seqres and match.from_seqres:
        # count aligned columns where both sequences have actual residues
        struct_match = 0
        for i in range(len(gapped_ref)):
            uri = gapped_ref.gapped_to_ungapped(i)
            if uri is None:
                continue
            umi = gapped_match.gapped_to_ungapped(i)
            if umi is None:
                continue
            if gapped_ref.residues[uri] and gapped_match.residues[umi]:
                struct_match += 1
                if struct_match >= 3:
                    break
        if struct_match < 3:
            seq_match = 0
            for s1, s2 in zip(gapped_ref[:], gapped_match[:]):
                if s1.isalpha() and s2.isalpha():
                    seq_match += 1
                    if seq_match > 3:
                        break
            if seq_match > 3:
                need = 3 - struct_match
                # Each slice below holds at most 'need' residues, so the count of
                # missing residues has to be compared against 'need' (comparing
                # against a fixed 3 could only succeed when need == 3).
                ref_end_missing = (
                    ref.residues[:need].count(None) == need
                    or ref.residues[-need:].count(None) == need)
                match_end_missing = (
                    match.residues[:need].count(None) == need
                    or match.residues[-need:].count(None) == need)
                if ref_end_missing and match_end_missing:
                    sr_disjoint = True
    if sr_disjoint:
        from copy import copy
        clipped_ref = copy(ref)
        clipped_match = copy(match)
        for seq in (clipped_ref, clipped_match):
            # snip off leading positions that have no residue
            num_none = 0
            for r in seq.residues:
                if r:
                    break
                num_none += 1
            if num_none:
                seq.bulk_set(seq.residues[num_none:], seq[num_none:])

            # snip off trailing positions that have no residue
            num_none = 0
            for r in reversed(seq.residues):
                if r:
                    break
                num_none += 1
            if num_none:
                seq.bulk_set(seq.residues[:-num_none], seq[:-num_none])
        # redo the alignment with the clipped sequences; secondary structure
        # does not need to be computed again
        return align(session,
                     clipped_ref,
                     clipped_match,
                     matrix_name,
                     algorithm,
                     gap_open,
                     gap_extend,
                     dssp_cache,
                     ss_matrix=ss_matrix,
                     ss_fraction=ss_fraction,
                     gap_open_helix=gap_open_helix,
                     gap_open_strand=gap_open_strand,
                     gap_open_other=gap_open_other,
                     compute_ss=False)
    for orig, aligned in [(ref, gapped_ref), (match, gapped_match)]:
        if hasattr(orig, '_dm_rebuild_info'):
            aligned._dm_rebuild_info = orig._dm_rebuild_info
            _dm_cleanup.append(aligned)
    return score, gapped_ref, gapped_match

Example #19

0

Show file

def write_mol2(session,
               file_name,
               *,
               models=None,
               atoms=None,
               status=None,
               anchor=None,
               rel_model=None,
               sybyl_hyd_naming=True,
               combine_models=False,
               skip_atoms=None,
               res_num=False,
               gaff_type=False,
               gaff_fail_error=None):
    """Write a Mol2 file.

    Parameters
    ----------

    file_name : str, or file object open for writing
        Output file.

    models : a list/tuple/set of models (:py:class:`~chimerax.atomic.Structure`s)
        or a single :py:class:`~chimerax.atomic.Structure`
        The structure(s) to write out. If None (and 'atoms' is also None) then
        write out all structures.

    atoms : an :py:class:`~chimerax.atomic.Atoms` collection or None.  If not None,
        then 'models' must be None.

    status : function or None
        If not None, a function that takes a string -- used to report the progress of the write.

    anchor : :py:class:`~chimerax.atomic.Atoms` collection
        Atoms (and their implied internal bonds) that should be written out to the
        @SET section of the file as the rigid framework for flexible ligand docking.

    rel_model : Model whose coordinate system the coordinates should be written out relative to,
        i.e. take the output atoms' coordinates and apply the inverse of the rel_model's transform.

    sybyl_hyd_naming : bool
        Controls whether hydrogen names should be "Sybyl-like" or "PDB-like" -- e.g.  HG21 vs. 1HG2.

    combine_models : bool
        Controls whether multiple structures will be combined into a single @MOLECULE
        section (value: True) or each given its own section (value: False).

    skip_atoms : list/set of :py:class:`~chimerax.atomic.Atom`s or an :py:class:`~chimerax.atomic.Atoms` collection or None
       Atoms to not output

    res_num : bool
        Controls whether residue sequence numbers are included in the substructure name.
        Sybyl Mol2 files include them, but note that this function's default is False.

    gaff_type : bool
       If 'gaff_type' is True, output GAFF atom types instead of Sybyl atom types.
       `gaff_fail_error`, if specified, is the type of error to throw (e.g. UserError)
       if there is no gaff_type attribute for an atom, otherwise throw the standard AttributeError.

    Raises
    ------
    ValueError
        If both 'models' and 'atoms' are given.
    """

    if status:
        status("Writing Mol2 file %s" % file_name)

    from chimerax import io
    f = io.open_output(file_name, "utf-8")

    # The mutable default 'ri' is deliberate: the same dict is handed to
    # write_mol2_sort_key on every call (presumably used there as a cache
    # of residue indices -- confirm against write_mol2_sort_key).
    sort_key_func = serial_sort_key = lambda a, ri={}: write_mol2_sort_key(
        a, res_indices=ri)

    from chimerax.atomic import Structure, Atoms, Residue

    class JPBGroup:
        """Stand-in pseudobond group holding the metal-coordination
        pseudobonds whose two end atoms are both in 'atoms'."""

        def __init__(self, atoms):
            atom_set = set(atoms)
            pbs = []
            for s in atoms.unique_structures:
                pbg = s.pbg_map.get(s.PBG_METAL_COORDINATION, None)
                if not pbg:
                    continue
                for pb in pbg.pseudobonds:
                    if pb.atoms[0] in atom_set and pb.atoms[1] in atom_set:
                        pbs.append(pb)
            self._pbs = pbs

        @property
        def pseudobonds(self):
            return self._pbs

    if models is None:
        if atoms is None:
            structures = session.models.list(type=Structure)
        else:
            structures = atoms
    else:
        if atoms is None:
            if isinstance(models, Structure):
                structures = [models]
            else:
                structures = [m for m in models if isinstance(m, Structure)]
        else:
            raise ValueError(
                "Cannot specify both 'models' and 'atoms' keywords")

    if isinstance(structures, Atoms):

        # wrap the atom collection in an object offering the structure
        # attributes that the output loop below reads
        class Jumbo:
            def __init__(self, atoms):
                self.atoms = atoms
                self.residues = atoms.unique_residues
                self.bonds = atoms.intra_bonds
                self.name = "(selection)"
                self.pbg_map = {
                    Structure.PBG_METAL_COORDINATION: JPBGroup(atoms)
                }

        structures = [Jumbo(structures)]
        sort_key_func = lambda a: (a.structure.id, ) + serial_sort_key(a)
        combine_models = False

    # transform...
    if rel_model is None:
        from chimerax.geometry import identity
        xform = identity()
    else:
        xform = rel_model.scene_position.inverse()

    # need to find amide moieties since Sybyl has an explicit amide type
    if status:
        status("Finding amides")
    from chimerax.chem_group import find_group
    amides = find_group("amide", structures)
    amide_Ns = set([amide[2] for amide in amides])
    amide_CNs = set([amide[0] for amide in amides])
    amide_CNs.update(amide_Ns)
    amide_Os = set([amide[1] for amide in amides])

    substructure_names = None
    if combine_models and len(structures) > 1:
        # create a fictitious jumbo model
        class Jumbo:
            def __init__(self, structures):
                self.name = structures[0].name + " (combined)"
                from chimerax.atomic import concatenate
                self.atoms = concatenate([s.atoms for s in structures])
                self.bonds = concatenate([s.bonds for s in structures])
                self.residues = concatenate([s.residues for s in structures])
                self.pbg_map = {
                    Structure.PBG_METAL_COORDINATION: JPBGroup(self.atoms)
                }
                # if combining single-residue structures,
                # can be more informative to use model name
                # instead of residue type for substructure
                if len(structures) == len(self.residues):
                    rnames = self.residues.names
                    if len(set(rnames)) < len(rnames):
                        snames = [s.name for s in structures]
                        if len(set(snames)) == len(snames):
                            self.substructure_names = dict(
                                zip(self.residues, snames))

        structures = [Jumbo(structures)]
        if hasattr(structures[-1], 'substructure_names'):
            substructure_names = structures[-1].substructure_names
            delattr(structures[-1], 'substructure_names')
        # order by owning structure first, then by the per-atom serial key
        # (same key shape as the atom-selection case above)
        sort_key_func = lambda a: (a.structure.id, ) + serial_sort_key(a)

    # write out structures
    for struct in structures:
        if hasattr(struct, 'mol2_comments'):
            for m2c in struct.mol2_comments:
                print(m2c, file=f)
        if hasattr(struct, 'solvent_info'):
            print(struct.solvent_info, file=f)

        # molecule section header
        print("%s" % MOLECULE_HEADER, file=f)

        # molecule name
        print("%s" % struct.name, file=f)

        atoms = list(struct.atoms)
        bonds = list(struct.bonds)
        # add metal-coordination bonds
        coord_grp = struct.pbg_map.get(Structure.PBG_METAL_COORDINATION, None)
        if coord_grp:
            bonds.extend(list(coord_grp.pseudobonds))
        if skip_atoms:
            skip_atoms = set(skip_atoms)
            atoms = [a for a in atoms if a not in skip_atoms]
            bonds = [
                b for b in bonds if b.atoms[0] not in skip_atoms
                and b.atoms[1] not in skip_atoms
            ]
        residues = struct.residues

        # Put the atoms in the order we want for output
        if status:
            status("Putting atoms in input order")
        atoms.sort(key=sort_key_func)

        # if anchor is not None, then there will be two entries in
        # the @SET section of the file...
        if anchor:
            sets = 2
        else:
            sets = 0
        # number of entries for various sections...
        print("%d %d %d 0 %d" % (len(atoms), len(bonds), len(residues), sets),
              file=f)

        # type of molecule
        if hasattr(struct, "mol2_type"):
            mtype = struct.mol2_type
        else:
            mtype = "SMALL"
            from chimerax.atomic import Sequence
            for r in struct.residues:
                if Sequence.protein3to1(r.name) != 'X':
                    mtype = "PROTEIN"
                    break
                if Sequence.nucleic3to1(r.name) != 'X':
                    mtype = "NUCLEIC_ACID"
                    break
        print(mtype, file=f)

        # indicate type of charge information
        if hasattr(struct, 'charge_model'):
            print(struct.charge_model, file=f)
        else:
            print("NO_CHARGES", file=f)

        if hasattr(struct, 'mol2_comment'):
            print("\n%s" % struct.mol2_comment, file=f)
        else:
            print("\n", file=f)

        if status:
            status("writing atoms")
        # atom section header
        print("%s" % ATOM_HEADER, file=f)

        # make a dictionary of residue indices so that we can do quick look ups
        res_indices = {}
        for i, r in enumerate(residues):
            res_indices[r] = i + 1
        for i, atom in enumerate(atoms):
            # atom ID, starting from 1
            print("%7d" % (i + 1), end=" ", file=f)

            # atom name, possibly rearranged if it's a hydrogen
            if sybyl_hyd_naming and not atom.name[0].isalpha():
                atom_name = atom.name[1:] + atom.name[0]
            else:
                atom_name = atom.name
            print("%-8s" % atom_name, end=" ", file=f)

            # use correct relative coordinate position
            coord = xform * atom.scene_coord
            print("%9.4f %9.4f %9.4f" % tuple(coord), end=" ", file=f)

            # atom type
            if gaff_type:
                try:
                    atom_type = atom.gaff_type
                except AttributeError:
                    if not gaff_fail_error:
                        raise
                    raise gaff_fail_error(
                        "%s has no Amber/GAFF type assigned.\n"
                        "Use the AddCharge tool to assign Amber/GAFF types." %
                        atom)
            elif hasattr(atom, 'mol2_type'):
                atom_type = atom.mol2_type
            elif atom in amide_Ns:
                atom_type = "N.am"
            elif atom.structure_category == "solvent" \
            and atom.residue.name in Residue.water_res_names:
                if atom.element.name == "O":
                    atom_type = "O.t3p"
                else:
                    atom_type = "H.t3p"
            elif atom.element.name == "N" and len(
                [r for r in atom.rings() if r.aromatic]) > 0:
                atom_type = "N.ar"
            elif atom.idatm_type == "C2" and len(
                [nb for nb in atom.neighbors if nb.idatm_type == "Ng+"]) > 2:
                atom_type = "C.cat"
            elif sulfur_oxygen(atom):
                atom_type = "O.2"
            else:
                try:
                    atom_type = chimera_to_sybyl[atom.idatm_type]
                except KeyError:
                    session.logger.warning(
                        "Atom whose IDATM type has no equivalent"
                        " Sybyl type: %s (type: %s)" % (atom, atom.idatm_type))
                    atom_type = str(atom.element)
            print("%-5s" % atom_type, end=" ", file=f)

            # residue-related info
            res = atom.residue

            # residue index
            print("%5d" % res_indices[res], end=" ", file=f)

            # substructure identifier and charge
            if hasattr(atom, 'charge') and atom.charge is not None:
                charge = atom.charge
            else:
                charge = 0.0
            if substructure_names:
                rname = substructure_names[res]
            elif res_num:
                rname = "%3s%-5d" % (res.name, res.number)
            else:
                rname = "%3s" % res.name
            print("%s %9.4f" % (rname, charge), file=f)

        if status:
            status("writing bonds")
        # bond section header
        print("%s" % BOND_HEADER, file=f)

        # make an atom-index dictionary to speed lookups
        atom_indices = {}
        for i, a in enumerate(atoms):
            atom_indices[a] = i + 1
        for i, bond in enumerate(bonds):
            a1, a2 = bond.atoms

            # ID
            print("%6d" % (i + 1), end=" ", file=f)

            # atom IDs
            print("%4d %4d" % (atom_indices[a1], atom_indices[a2]),
                  end=" ",
                  file=f)

            # bond order; give it our best shot...
            if hasattr(bond, 'mol2_type'):
                print(bond.mol2_type, file=f)
                continue
            amide_A1 = a1 in amide_CNs
            amide_A2 = a2 in amide_CNs
            if amide_A1 and amide_A2:
                print("am", file=f)
                continue
            if amide_A1 or amide_A2:
                if a1 in amide_Os or a2 in amide_Os:
                    print("2", file=f)
                else:
                    print("1", file=f)
                continue

            aromatic = False
            # 'bond' might be a metal-coordination bond so do a test for rings
            if hasattr(bond, 'rings'):
                for ring in bond.rings():
                    if ring.aromatic:
                        aromatic = True
                        break
            if aromatic:
                print("ar", file=f)
                continue

            try:
                geom1 = idatm_info[a1.idatm_type].geometry
            except KeyError:
                print("1", file=f)
                continue
            try:
                geom2 = idatm_info[a2.idatm_type].geometry
            except KeyError:
                print("1", file=f)
                continue
            # sulfone/sulfoxide is classically depicted as double-
            # bonded despite the high dipolar character of the
            # bond making it have single-bond character.  For
            # output, use the classical values.
            if sulfur_oxygen(a1) or sulfur_oxygen(a2):
                print("2", file=f)
                continue
            if geom1 not in [2, 3] or geom2 not in [2, 3]:
                print("1", file=f)
                continue
            # if either endpoint atom is in an aromatic ring and
            # the bond isn't, it's a single bond...
            for endp in [a1, a2]:
                aromatic = False
                for ring in endp.rings():
                    if ring.aromatic:
                        aromatic = True
                        break
                if aromatic:
                    break
            else:
                # neither endpoint in aromatic ring
                if geom1 == 2 and geom2 == 2:
                    print("3", file=f)
                else:
                    print("2", file=f)
                continue
            print("1", file=f)

        if status:
            status("writing residues")
        # residue section header
        print("%s" % SUBSTR_HEADER, file=f)

        for i, res in enumerate(residues):
            # residue id field
            print("%6d" % (i + 1), end=" ", file=f)

            # residue name field
            if substructure_names:
                rname = substructure_names[res]
            elif res_num:
                rname = "%3s%-4d" % (res.name, res.number)
            else:
                rname = "%3s" % res.name
            print(rname, end=" ", file=f)

            # ID of the root atom of the residue
            chain_atom = res.principal_atom
            if chain_atom is None:
                # if writing out a selection, not all residue atoms
                # might be in atom_indices...
                for chain_atom in res.atoms:
                    if chain_atom in atom_indices:
                        break
            print("%5d" % atom_indices[chain_atom], end=" ", file=f)

            print("RESIDUE           4", end=" ", file=f)

            # Sybyl seems to use chain 'A' when chain ID is blank,
            # so run with that
            chain_id = res.chain_id
            if not chain_id.strip():
                chain_id = 'A'
            print("%-4s  %3s" % (chain_id, res.name), end=" ", file=f)

            # number of out-of-substructure bonds
            cross_res_bonds = 0
            for a in res.atoms:
                for nb in a.neighbors:
                    if nb.residue != res:
                        cross_res_bonds += 1
            print("%5d" % cross_res_bonds, end="", file=f)
            # print "ROOT" if first or only residue of a chain
            if not res.chain or res.chain.existing_residues[0] == res:
                print(" ROOT", file=f)
            else:
                print(file=f)

        # write flexible ligand docking info
        if anchor:
            if status:
                status("writing anchor info")
            print("%s" % SET_HEADER, file=f)
            atom_indices = {}
            for i, a in enumerate(atoms):
                atom_indices[a] = i + 1
            bond_indices = {}
            for i, b in enumerate(bonds):
                bond_indices[b] = i + 1
            print(
                "ANCHOR          STATIC     ATOMS    <user>   **** Anchor Atom Set",
                file=f)
            print(len(anchor), end=" ", file=f)
            for a in anchor:
                if a in atom_indices:
                    print(atom_indices[a], end=" ", file=f)
            print(file=f)

            print(
                "RIGID           STATIC     BONDS    <user>   **** Rigid Bond Set",
                file=f)
            bonds = anchor.intra_bonds
            print(len(bonds), end=" ", file=f)
            for b in bonds:
                if b in bond_indices:
                    print(bond_indices[b], end=" ", file=f)
            print(file=f)

    # Only close the stream if open_output gave back something other than
    # the object the caller passed in.
    # NOTE(review): 'f' is not closed if an exception is raised above.
    if file_name != f:
        f.close()

    if status:
        status("Wrote Mol2 file %s" % file_name)

Example #20

0

Show file

File: readHSSP.py Project: Yongcheng123/ChimeraX

def read(session, f):
    """Read an HSSP-format alignment from the open text stream 'f'.

    The first line must start with "HSSP" (case-insensitive).  Sections are
    introduced by lines starting with '##'; only the PROTEINS section (one
    sequence name per entry) and the ALIGNMENTS sections (one alignment
    column per line) are interpreted.  'session' is not used here.

    Returns a 3-tuple: the list of sequences built from the file, followed
    by two empty dictionaries.  'f' is closed before a normal return.

    Raises FormatSyntaxError if the header line is missing, the PROTEINS
    section has no entries, or an ALIGNMENTS section is malformed.
    """
    doing = None
    sequences = []
    header_ok = False
    align_start_index = None
    for line_num, line in enumerate(f.readlines(), start=1):
        if doing == 'alignments':
            # don't strip() alignment section since it has significant leading spaces
            line = line.rstrip()
        else:
            line = line.strip()
        if not header_ok:
            if line.lower().startswith("hssp"):
                header_ok = True
                continue
            raise FormatSyntaxError("No initial HSSP header line")
        if line.startswith('##'):
            if doing == 'proteins' and not sequences:
                raise FormatSyntaxError("No entries in PROTEINS section")
            try:
                doing = line.split()[1].lower()
            except IndexError:
                doing = None
            if doing == 'alignments':
                # header gives the 1-based range of sequences that the
                # following alignment lines cover
                try:
                    hashes, alignments, begin, dash, end = line.strip().split()
                    begin = int(begin)
                    end = int(end)
                except ValueError:
                    raise FormatSyntaxError("ALIGNMENTS line (line #%d) not of the form: "
                        "## ALIGNMENTS (number) - (number)" % line_num)
            continue
        if doing == 'proteins':
            # skip blank lines and anything that isn't a numbered entry
            if not line or not line[0].isdigit():
                continue
            try:
                seq_name = line.split()[2]
            except IndexError:
                raise FormatSyntaxError("Line %d in PROTEINS section does not start with "
                    "[integer] : [sequence name]" % line_num)
            sequences.append(Sequence(name=make_readable(seq_name)))
        elif doing == 'alignments':
            if line.lstrip().lower().startswith('seqno'):
                # the first '.' in the SeqNo line marks the column where
                # the alignment characters begin
                try:
                    align_start_index = line.index('.')
                except ValueError:
                    raise FormatSyntaxError("No indication of alignment starting column "
                        "('.' character) in SeqNo line in ALIGNMENTS section")
                continue
            if align_start_index is None:
                raise FormatSyntaxError("No initial SeqNo line in ALIGNMENTS section")
            block = line[align_start_index:]
            if not block:
                raise FormatSyntaxError("No alignment block given on line %d" % line_num)
            block_len = end - begin + 1
            if len(block) > block_len:
                raise FormatSyntaxError("Too many characters (%d, only %d sequences) in "
                    "alignment block given on line %d" % (len(block), block_len, line_num))
            # trailing spaces were removed by rstrip(); pad so that every
            # sequence in the range receives a character for this column
            block = block + ' ' * (block_len - len(block))
            for seq, c in zip(sequences[begin-1:end], block):
                seq.append(c)
    f.close()
    return sequences, {}, {}