Example #1
LINELEN = 60   # alignment columns written per output block

def save(session, alignment, stream):
    print("CLUSTAL W ALN saved from UCSF ChimeraX", file=stream)
    print("", file=stream)

    max_name = max([len(seq.name) for seq in alignment.seqs])
    name_format = "%%-%ds" % (max_name+5)

    from chimerax.atomic import Sequence
    aln_len = len(alignment.seqs[0])
    for start in range(0, aln_len, LINELEN):
        end = min(aln_len, start + LINELEN)

        for seq in alignment.seqs:
            name = seq.name.replace(' ', '_')
            temp_seq = Sequence()
            temp_seq.extend(seq[start:end])
            if len(temp_seq.ungapped()) == 0:
                print(name_format % name, seq[start:end], file=stream)
            else:
                temp_seq = Sequence()
                temp_seq.extend(seq[:end])
                print(name_format % name, seq[start:end], len(temp_seq.ungapped()), file=stream)
        from .. import clustal_strong_groups, clustal_weak_groups
        conservation = []
        for pos in range(start, end):
            # completely conserved?
            first = alignment.seqs[0][pos].upper()
            if first.isupper():
                for seq in alignment.seqs[1:]:
                    if seq[pos].upper() != first:
                        break
                else:
                    # conserved
                    conservation.append('*')
                    continue

            # "strongly"/"weakly" conserved?
            conserved = False
            for groups, character in [(clustal_strong_groups, ':'), (clustal_weak_groups, '.')]:
                for group in groups:
                    for seq in alignment.seqs:
                        if seq[pos].upper() not in group:
                            break
                    else:
                        # conserved
                        conserved = True
                        break
                if conserved:
                    conservation.append(character)
                    break

            if not conserved:
                # remainder
                conservation.append(' ')
        print(name_format % " ", "".join(conservation), file=stream)
        print("", file=stream)
Example #2
def read(session, f):
    want = 'init'
    sequences = []
    for line in f.readlines():
        line = line.strip()
        if want == 'init':
            if len(line) < 4:
                continue
            if line[0] != '>' or line[3] != ';':
                continue
            sequences.append(Sequence(name=make_readable(line[4:])))
            pir_type = line[1:3]
            if pir_type in ("P1", "F1"):
                sequences[-1].nucleic = True
            else:
                sequences[-1].nucleic = False
            sequences[-1].pir_type = pir_type
            want = 'description'
        elif want == 'description':
            sequences[-1].description = line
            sequences[-1].pir_description = line
            want = 'sequence'
        elif want == 'sequence':
            if not line:
                continue
            if line[-1] == '*':
                want = 'init'
                line = line[:-1]
            sequences[-1].extend("".join([c for c in line if not c.isspace()]))
    f.close()
    if want != 'init':
        raise FormatSyntaxError("Could not find end of sequence '%s'" %
                                sequences[-1].name)
    return sequences, {}, {}
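
The reader above is a three-state machine (init → description → sequence). A self-contained sketch of the same flow on an in-memory record, using plain dicts instead of ChimeraX Sequence objects (the sample record and field names are illustrative):

SAMPLE_PIR = """\
>P1;example_protein
an example description line
MKV LST
AR*
"""

def parse_pir(text):
    want = 'init'
    records = []
    for line in text.splitlines():
        line = line.strip()
        if want == 'init':
            if len(line) < 4 or line[0] != '>' or line[3] != ';':
                continue
            records.append({"name": line[4:], "type": line[1:3], "sequence": ""})
            want = 'description'
        elif want == 'description':
            records[-1]["description"] = line
            want = 'sequence'
        elif want == 'sequence':
            if not line:
                continue
            if line.endswith('*'):      # '*' terminates the record
                want = 'init'
                line = line[:-1]
            records[-1]["sequence"] += "".join(c for c in line if not c.isspace())
    if want != 'init':
        raise ValueError("unterminated sequence %r" % records[-1]["name"])
    return records

print(parse_pir(SAMPLE_PIR))   # one record: name 'example_protein', type 'P1', sequence 'MKVLSTAR'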
Example #3
 def show_mav(self, ids):
     # Collect names and sequences of selected matches.
     # All sequences should have the same length because
     # they include gaps generated from BLAST alignment.
     ids.insert(0, 0)
     names = []
     seqs = []
     for sid in ids:
         name, seq = self._sequences[sid]
         names.append(name)
         seqs.append(seq)
     # Find columns that are gaps in all sequences and remove them.
     all_gaps = set()
     for i in range(len(seqs[0])):
         for seq in seqs:
             if seq[i].isalpha():
                 break
         else:
             all_gaps.add(i)
     if all_gaps:
         for i in range(len(seqs)):
             seq = seqs[i]
             new_seq = ''.join(
                 [seq[n] for n in range(len(seq)) if n not in all_gaps])
             seqs[i] = new_seq
     # Generate multiple sequence alignment file
     # Ask sequence viewer to display alignment
     from chimerax.atomic import Sequence
     seqs = [
         Sequence(name=name, characters=seqs[i])
         for i, name in enumerate(names)
     ]
     name = "%s [%d]" % (self._instance_name, self._viewer_index)
     self.session.alignments.new_alignment(seqs, name)
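
The all-gap-column pruning in show_mav can be isolated into a small helper; this sketch works on plain strings and keeps only columns where at least one sequence has a residue letter:

def drop_all_gap_columns(seqs):
    """Remove columns in which every aligned sequence has a gap character."""
    keep = [i for i in range(len(seqs[0]))
            if any(seq[i].isalpha() for seq in seqs)]
    return ["".join(seq[i] for i in keep) for seq in seqs]

aligned = ["MK--VL-", "MR--IL-", "MK--LLQ"]
print(drop_all_gap_columns(aligned))   # ['MKVL-', 'MRIL-', 'MKLLQ']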
Example #4
def read(session, f):
    # skip header crap
    in_header = True
    line_num = 0
    sequences = []
    for line in f.readlines():
        line = line.strip()
        line_num += 1
        if not line:
            continue
        fields = line.split()
        if in_header:
            if len(fields[0]) == 2:
                continue
            if fields[0].startswith('#='):
                # some Pfam seed alignments have undocumented #=RF header
                continue
            in_header = False
        if len(fields) != 2:
            raise FormatSyntaxError(
                "Sequence line %d not of form 'seq-name seq-letters'" % line_num)
        seq = Sequence(name=make_readable(fields[0]))
        seq.extend(fields[1])
        sequences.append(seq)
    f.close()
    return sequences, {}, {}
Example #5
def nw_assoc(session, align_seq, struct_seq):
    '''Wrapper around Needleman-Wunsch matching, to make it return the same kinds of values
       that try_assoc returns'''

    from chimerax.atomic import Sequence, SeqMatchMap
    sseq = struct_seq
    aseq = Sequence(name=align_seq.name, characters=align_seq.ungapped())
    aseq.circular = align_seq.circular
    from chimerax.alignment_algs.NeedlemanWunsch import nw
    score, match_list = nw(sseq, aseq)

    errors = 0
    # match pairs are returned in reverse order...
    try:
        m_end = match_list[0][0]
    except IndexError:
        m_end = -1
    if m_end < len(sseq) - 1:
        # trailing unmatched
        errors += len(sseq) - m_end - 1

    match_map = SeqMatchMap(align_seq, struct_seq)
    last_match = m_end + 1
    for s_index, a_index in match_list:
        if sseq[s_index] != aseq[a_index]:
            errors += 1

        if s_index < last_match - 1:
            # gap in structure sequence
            errors += last_match - s_index - 1

        res = sseq.residues[s_index]
        if res:
            match_map.match(res, a_index)

        last_match = s_index
    if last_match > 0:
        # beginning unmatched
        errors += last_match

    if len(sseq) > len(aseq):
        # unmatched residues forced, reduce errors by that amount...
        errors -= len(sseq) - len(aseq)

    return match_map, errors
Example #6
def seqalign_chain(session, chains):
    '''
    Show chain sequence(s)

    Parameters
    ----------
    chains : list of Chain
        Chains to show
    '''

    from chimerax.core.errors import UserError
    if len(chains) == 1:
        chain = chains[0]
        ident = ".".join([str(part) for part in chain.structure.id]) + "/" + chain.chain_id
        alignment = session.alignments.new_alignment([chain], ident, seq_viewer="sv",
            auto_associate=None, intrinsic=True)
    else:
        # all chains have to have the same sequence, and they will all be associated with
        # that sequence
        sequences = set([chain.characters for chain in chains])
        if len(sequences) != 1:
            raise UserError("Chains must have same sequence")
        chars = sequences.pop()
        chain_ids = set([chain.chain_id for chain in chains])
        if len(chain_ids) < len(chains) or len(chain_ids) > 10:
            name = "%d chains" % len(chains)
        else:
            name = "chains %s" % ",".join(sorted(list(chain_ids)))
        from chimerax.atomic import Sequence
        seq = Sequence(name=name, characters=chars)
        def get_numbering_start(chain):
            for i, r in enumerate(chain.residues):
                if r is None or r.deleted:
                    continue
                return r.number - i
            return None
        starts = set([get_numbering_start(chain) for chain in chains])
        starts.discard(None)
        if len(starts) == 1:
            seq.numbering_start = starts.pop()
        alignment = session.alignments.new_alignment([seq], None, seq_viewer="sv",
            auto_associate=False, name=chains[0].description, intrinsic=True)
        alignment.suspend_notify_observers()
        for chain in chains:
            alignment.associate(chain, keep_intrinsic=True)
        alignment.resume_notify_observers()
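
get_numbering_start infers the number the first alignment position would carry by subtracting a residue's index from its residue number. A stand-alone sketch with residue numbers given as ints, or None where no structure exists (values are illustrative):

def numbering_start(residue_numbers):
    """Infer the number the first position would have, given per-position
    residue numbers (None where no structure exists)."""
    for i, number in enumerate(residue_numbers):
        if number is None:
            continue
        return number - i
    return None

# Positions 0 and 1 have no structure; position 2 is residue 12,
# so position 0 corresponds to residue number 10.
print(numbering_start([None, None, 12, 13, 14]))   # 10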
Example #7
def fetch_uniprot(session, ident, ignore_cache=False):
	'Fetch UniProt data'

	from chimerax.core.errors import UserError, CancelOperation
	try:
		accession = map_uniprot_ident(ident)
		seq_string, full_name, features = fetch_uniprot_accession_info(session, accession,
			ignore_cache=ignore_cache)
	except InvalidAccessionError as e:
		raise UserError(str(e))
	except CancelOperation:
		session.logger.status("Fetch of %s cancelled" % ident)
		return
	from chimerax.atomic import Sequence
	seq = Sequence(name=ident)
	seq.extend(seq_string)
	session.logger.status("Opening UniProt %s" % ident)
	session.alignments.new_alignment([seq], ident)
	return [], "Opened UniProt %s" % ident
Example #8
def read(session, f):
    in_header = True
    sequences = []
    line_num = 0
    for line in f.readlines():
        line_num += 1
        if in_header:
            if line.startswith("CLUSTAL"):
                in_header = False
                first_block = True
            else:
                if line.strip() != "":
                    raise FormatSyntaxError(
                        "First non-blank line does not start with 'CLUSTAL'")
            continue
        if not line or line[0].isspace():
            if sequences:
                first_block = False
                expect = 0
            continue
        try:
            seq_name, seq_block, num_residues = line.split()
        except ValueError:
            try:
                seq_name, seq_block = line.strip().split()
            except ValueError:
                raise FormatSyntaxError(
                    "Line %d is not sequence name followed by sequence "
                    "contents and optional ungapped length" % line_num)
        if first_block:
            sequences.append(Sequence(name=make_readable(seq_name)))
            sequences[-1].append(seq_block)
            continue
        try:
            seq = sequences[expect]
        except IndexError:
            raise FormatSyntaxError(
                "Sequence on line %d not in initial sequence block" % line_num)
        expect += 1
        seq.append(seq_block)
    f.close()
    return sequences, {}, {}
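
A condensed sketch of the block-stitching above on a small in-memory CLUSTAL fragment, with plain [name, characters] lists instead of Sequence objects (the sample alignment is illustrative):

SAMPLE_ALN = """\
CLUSTAL W (1.83) multiple sequence alignment

seqA   MKVL 4
seqB   MRIL 4

seqA   STQ 7
seqB   STA 7
"""

def parse_clustal(text):
    sequences = []        # list of [name, characters]
    first_block = True
    expect = 0
    for line in text.splitlines()[1:]:   # skip the CLUSTAL header line
        if not line or line[0].isspace():
            if sequences:
                first_block = False
                expect = 0
            continue
        name, block = line.split()[:2]   # trailing residue count is optional
        if first_block:
            sequences.append([name, block])
        else:
            sequences[expect][1] += block
            expect += 1
    return sequences

print(parse_clustal(SAMPLE_ALN))   # [['seqA', 'MKVLSTQ'], ['seqB', 'MRILSTA']]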
Example #9
 def _read_sequences(self, f):
     from chimerax.atomic import Sequence
     self.sequence_list = []
     while 1:
         line = f.readline()
         if not line:
             raise FormatSyntaxError('no alignment separator')
         if line == '//\n' or line == '//\r\n':
             break
         m = MSF._Sum.match(line)
         if m is not None:
             name = m.group(1)
             length = m.group(2)
             check = m.group(3)
             weight = m.group(4)
             s = Sequence(name=make_readable(name))
             self.sequence_list.append(s)
             s.attrs = {}
             s.attrs['MSF length'] = length
             s.attrs['MSF check'] = check
             s.attrs['MSF weight'] = weight
     if not self.sequence_list:
         raise FormatSyntaxError('No sequences found in header')
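
MSF._Sum itself is not shown above. The pattern below is a hypothetical stand-in (an assumption, not the actual ChimeraX regex) that captures name, length, check and weight from a typical "Name: ... Len: ... Check: ... Weight: ..." summary line:

import re

# Hypothetical stand-in for MSF._Sum (the real pattern lives in the MSF reader).
SUM_PATTERN = re.compile(
    r"\s*Name:\s+(\S+)\s+Len:\s+(\d+)\s+Check:\s+(\d+)\s+Weight:\s+([\d.]+)")

line = " Name: seqA  Len: 120  Check: 4567  Weight: 1.00"
m = SUM_PATTERN.match(line)
print(m.groups())   # ('seqA', '120', '4567', '1.00')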
Example #10
def read(session, f):
    from chimerax.atomic import Sequence
    from ..parse import FormatSyntaxError, make_readable
    in_sequence = False
    sequences = []
    for line in f.readlines():
        if in_sequence:
            if not line or line.isspace():
                in_sequence = False
                continue
            if line[0] == '>':
                in_sequence = False
                # fall through
            else:
                sequences[-1].extend(line.strip())
        if not in_sequence:
            if line[0] == '>':
                if sequences and len(sequences[-1]) == 0:
                    raise FormatSyntaxError("No sequence found for %s"
                        % sequences[-1].name)
                in_sequence = True
                sequences.append(Sequence(name=make_readable(line[1:])))
    return sequences, {}, {}
Example #11
def read(session, f):
    line_num = 0
    file_attrs = {}
    file_markups = {}
    seq_attrs = {}
    seq_markups = {}
    sequences = {}
    seq_sequence = []
    for line in f.readlines():
        line = line.rstrip()  # drop trailing newline/whitespace
        line_num += 1
        if line_num == 1:
            if line.startswith("# STOCKHOLM"):
                continue
            raise FormatSyntaxError("File does not start with '# STOCKHOLM'")
        if not line:
            continue
        if line.startswith('#='):
            markup_type = line[2:4]
            markup = line[5:].strip()

            def try_split(num_split):
                fields = markup.split(None, num_split)
                if len(fields) == num_split:
                    # value is empty
                    fields.append("")
                if len(fields) != num_split + 1:
                    raise FormatSyntaxError(
                        "Not enough arguments after #=%s markup on line %d" %
                        (markup_type, line_num))
                return fields

            if markup_type == "GF":
                tag, val = try_split(1)
                tag = tag.replace("_", " ")
                tag = generic_file_attrs.get(tag, "Stockholm " + tag)
                if tag in file_attrs:
                    file_attrs[tag] += '\n' + val
                else:
                    file_attrs[tag] = val
            elif markup_type == "GS":
                seq_name, tag, val = try_split(2)
                tag = tag.replace("_", " ")
                attrs = seq_attrs.setdefault(seq_name, {})
                tag = generic_seq_attrs.get(tag, "Stockholm " + tag)
                if tag in attrs:
                    attrs[tag] += '\n' + val
                else:
                    attrs[tag] = val
            elif markup_type == "GC":
                tag, val = try_split(1)
                tag = tag.replace("_", " ")
                file_markups[tag] = file_markups.get(tag, "") + val
            elif markup_type == "GR":
                seq_name, tag, val = try_split(2)
                tag = tag.replace("_", " ")
                seq_markups.setdefault(seq_name, {}).setdefault(tag, "")
                seq_markups[seq_name][tag] += val
            # ignore other types
            continue
        elif line.startswith('#'):
            # unstructured comment
            if 'comments' in file_attrs:
                file_attrs['comments'] += "\n" + line[1:]
            else:
                file_attrs['comments'] = line[1:]
            continue
        elif line.strip() == "//":
            # end of sequence alignment blocks, but comments may follow this, so keep going...
            continue
        # sequence info...
        try:
            seq_name, block = line.split(None, 1)
        except ValueError:
            raise FormatSyntaxError(
                "Sequence info not in name/contents format on line %d" %
                line_num)
        if seq_name not in sequences:
            sequences[seq_name] = Sequence(name=make_readable(seq_name))
            seq_sequence.append(seq_name)
        sequences[seq_name].extend(block)
    f.close()
    for seq_name, seq in sequences.items():
        if seq_name in seq_attrs:
            seq.attrs = seq_attrs[seq_name]
        if seq_name in seq_markups:
            seq.markups = seq_markups[seq_name]
            # iterate over a copy, since wrong-length markups are deleted below
            for tag, markup in list(seq.markups.items()):
                if len(markup) != len(seq):
                    session.logger.warning(
                        "Markup %s for sequence %s is wrong length; ignoring" %
                        (tag, seq_name))
                    del seq.markups[tag]
    for seq_info, label in [(seq_attrs, "sequence"), (seq_markups, "residue")]:
        # iterate over a copy, since entries may be re-keyed/deleted below
        for seq_name in list(seq_info.keys()):
            if seq_name in sequences:
                continue
            # might be sequence name if trailing '/start-end' is removed...
            for full_name in sequences.keys():
                if full_name.startswith(seq_name) \
                and full_name[len(seq_name)] == '/' \
                and '/' not in full_name[len(seq_name)+1:]:
                    break
            else:
                raise FormatSyntaxError(
                    "%s annotations provided for non-existent sequence %s" %
                    (label.capitalize(), seq_name))
            session.logger.info(
                "Updating %s %s annotations with %s annotations" %
                (full_name, label, seq_name))
            seq_info[full_name].update(seq_info[seq_name])
            del seq_info[seq_name]
    for tag, markup in file_markups.items():
        if len(markup) != len(sequences[seq_sequence[0]]):
            raise FormatSyntaxError("Column annotation %s is wrong length" %
                                    tag)

    return [sequences[name] for name in seq_sequence], file_attrs, file_markups
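
The nested try_split helper above tolerates markup lines whose value part is empty. A stand-alone version of that splitting, with illustrative tag/value payloads:

def split_markup(markup, num_split):
    """Split a Stockholm markup payload into num_split leading fields plus a
    value, allowing the value to be empty (mirrors try_split above)."""
    fields = markup.split(None, num_split)
    if len(fields) == num_split:
        fields.append("")            # value is empty
    if len(fields) != num_split + 1:
        raise ValueError("not enough arguments in markup: %r" % markup)
    return fields

print(split_markup("AC PF00042", 1))      # ['AC', 'PF00042']       (GF tag + value)
print(split_markup("seq1/1-50 DE", 2))    # ['seq1/1-50', 'DE', ''] (GS with empty value)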
Example #12
def read(session, f):
    IN_HEADER, START_ATTRS, IN_ATTRS, IN_FEATURES, IN_SEQ = range(5)

    state = IN_HEADER

    sequences = []
    line_num = 0
    has_offset = False
    longest = None
    file_attrs = {}
    for line in f.readlines():
        line = line.rstrip() # remove trailing whitespace/newline
        line_num += 1
        if line_num == 1:
            if line.startswith("!!RICH_SEQUENCE"):
                continue
            raise FormatSyntaxError("First line does not start with !!RICH_SEQUENCE")

        if state == IN_HEADER:
            if line.strip() == "..":
                state = START_ATTRS
                continue
            if "comments" in file_attrs:
                file_attrs["comments"] += "\n" + line
            else:
                file_attrs["comments"] = line
            continue
        if not line.strip():
            continue

        if state == START_ATTRS:
            if line.strip() == "{":
                state = IN_ATTRS
                cur_attr = None
                attrs = {}
            elif line:
                raise FormatSyntaxError(
                    "Unexpected text before start of sequence on line %d" &line_num)
            continue

        if state == IN_ATTRS or state == IN_FEATURES:
            if line.strip() == "sequence" and line[0] == "s":
                if "RSF name" not in attrs:
                    raise FormatSyntaxError("Sequence on line %d has no name" & line_num)
                state = IN_SEQ
                seq = Sequence(name=make_readable(attrs["RSF name"]))
                del attrs["RSF name"]
                seq.attrs = attrs
                if "RSF descrip" in attrs:
                    attrs["description"] = attrs["RSF descrip"]
                    del attrs["RSF descrip"]
                sequences.append(seq)
                if "RSF offset" in attrs:
                    seq.extend("." * int(attrs["RSF offset"]))
                    has_offset = True
                    del attrs["RSF offset"]
                continue
            if line.startswith("feature"):
                if state == IN_ATTRS:
                    attrs["RSF features"] = [[line[8:]]]
                else:
                    attrs["RSF features"].append([line[8:]])
                state = IN_FEATURES
                continue

        if state == IN_ATTRS:
            if line[0].isspace():
                # continuation
                if not cur_attr:
                    raise FormatSyntaxError("Bogus indentation at line %d" % line_num)
                if attrs[cur_attr]:
                    attrs[cur_attr] += "\n" + line
                else:
                    attrs[cur_attr] = line
                continue
            if " " in line.strip():
                cur_attr, val = line.split(None, 1)
                cur_attr = cur_attr.replace("_", " ")
                cur_attr = "RSF " + cur_attr
                attrs[cur_attr] = val.strip()
            else:
                cur_attr = "RSF " + line.strip().replace("_", " ")
                attrs[cur_attr] = ""
            continue

        if state == IN_FEATURES:
            attrs["RSF features"][-1].append(line)
            continue
        if line.strip() == "}":
            state = START_ATTRS
            if not longest:
                longest = len(seq)
            else:
                if len(seq) < longest:
                    seq.extend("." * (longest - len(seq)))
                elif len(seq) > longest:
                    longest = len(seq)
                    for s in sequences[:-1]:
                        s.extend("." * (longest - len(s)))
            continue
        seq.extend(line.strip())
        if not seq[0].isalpha():
            has_offset = True

    f.close()
    if state == IN_HEADER:
        raise FormatSyntaxError("No end to header (i.e. '..' line) found")
    if state == IN_ATTRS or state == IN_FEATURES:
        raise FormatSyntaxError("No sequence data found for sequence %s" % attrs["RSF name"])
    if state == IN_SEQ:
        raise FormatSyntaxError("No terminating brace for sequence %s" % attrs["RSF name"])
    if not has_offset:
        session.logger.warning("No offset fields in RSF file; assuming zero offset")
    return sequences, file_attrs, {}
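
The end-of-record handling above pads shorter sequences with '.' so all sequences end up the same length; a stand-alone sketch of that padding:

def pad_to_longest(sequences, gap_char='.'):
    """Pad all sequence strings to the length of the longest one, mirroring
    the end-of-record padding in the RSF reader above."""
    longest = max(len(s) for s in sequences)
    return [s + gap_char * (longest - len(s)) for s in sequences]

print(pad_to_longest(["MKVLS", "MKV", "MKVLSTA"]))
# ['MKVLS..', 'MKV....', 'MKVLSTA']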
Example #13
def read(session, f):
    doing = None
    sequences = []
    header_ok = False
    line_num = 0
    align_start_index = None
    for line in f.readlines():
        if doing == 'alignments':
            # don't strip() alignment section since it has significant leading spaces
            line = line.rstrip()
        else:
            line = line.strip()
        line_num += 1
        if not header_ok:
            if line.lower().startswith("hssp"):
                header_ok = True
                continue
            raise FormatSyntaxError("No initial HSSP header line")
        if line.startswith('##'):
            if doing == 'proteins' and not sequences:
                raise FormatSyntaxError("No entries in PROTEINS section")
            try:
                doing = line.split()[1].lower()
            except IndexError:
                doing = None
            if doing == 'alignments':
                try:
                    hashes, alignments, begin, dash, end = line.strip().split()
                    begin = int(begin)
                    end = int(end)
                except ValueError:
                    raise FormatSyntaxError("ALIGNMENTS line (line #%d) not of the form: "
                        "## ALIGNMENTS (number) - (number)" % line_num)
            continue
        if doing == 'proteins':
            if not line[0].isdigit():
                continue
            try:
                seq_name = line.split()[2]
            except IndexError:
                raise FormatSyntaxError("Line %d in PROTEINS section does not start with "
                    "[integer] : [sequence name]" % line_num)
            sequences.append(Sequence(name=make_readable(seq_name)))
        elif doing == 'alignments':
            if line.lstrip().lower().startswith('seqno'):
                try:
                    align_start_index = line.index('.')
                except Exception:
                    raise FormatSyntaxError("No indication of alignment starting column "
                        "('.' character) in SeqNo line in ALIGNMENTS section")
                continue
            if align_start_index is None:
                raise FormatSyntaxError("No initial SeqNo line in ALIGNMENTS section")
            block = line[align_start_index:]
            if not block:
                raise FormatSyntaxError("No alignment block given on line %d" % line_num)
            block_len = end - begin + 1
            if len(block) > block_len:
                raise FormatSyntaxError("Too many characters (%d, only %d sequences) in "
                    "alignment block given on line %d" % (len(block), block_len, line_num))
            block = block + ' ' * (block_len - len(block))
            for seq, c in zip(sequences[begin-1:end], block):
                seq.append(c)
    f.close()
    return sequences, {}, {}
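
In the ALIGNMENTS section each line contributes one column of characters, and the character at offset j belongs to sequence begin-1+j; this sketch shows that transposition with character lists in place of Sequence objects (sample data is illustrative):

def distribute_block(sequences, block, begin, end):
    """Append one ALIGNMENTS-line block to sequences begin..end (1-based,
    inclusive), padding short blocks with spaces as the reader above does."""
    block_len = end - begin + 1
    block = block + ' ' * (block_len - len(block))
    for seq, ch in zip(sequences[begin - 1:end], block):
        seq.append(ch)

seqs = [[], [], []]                        # three aligned sequences
for line_block in ["MKV", "MRI", "MKL"]:   # one block per residue position
    distribute_block(seqs, line_block, 1, 3)
print(["".join(s) for s in seqs])          # ['MMM', 'KRK', 'VIL']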
Example #14
def align(session,
          ref,
          match,
          matrix_name,
          algorithm,
          gap_open,
          gap_extend,
          dssp_cache,
          ss_matrix=defaults["ss_scores"],
          ss_fraction=defaults["ss_mixture"],
          gap_open_helix=defaults["helix_open"],
          gap_open_strand=defaults["strand_open"],
          gap_open_other=defaults["other_open"],
          compute_ss=defaults["compute_ss"]):
    from chimerax import sim_matrices
    similarity_matrix = sim_matrices.matrix(matrix_name, session.logger)
    ssf = ss_fraction
    ssm = ss_matrix
    if ssf is not None and ssf is not False and compute_ss:
        need_compute = []
        if ref.structure not in dssp_cache:
            for r in ref.residues:
                if r and len(r.atoms) > 1:
                    # not CA only
                    need_compute.append(ref.structure)
                    dssp_cache[ref.structure] = (
                        ref.structure.residues.ss_ids,
                        ref.structure.residues.ss_types)
                    break
        if match.structure not in dssp_cache:
            for r in match.residues:
                if r and len(r.atoms) > 1:
                    # not CA only
                    need_compute.append(match.structure)
                    dssp_cache[match.structure] = (
                        match.structure.residues.ss_ids,
                        match.structure.residues.ss_types)
                    break
        if need_compute:
            """TODO
            from chimera.initprefs import ksdsspPrefs, \
                    KSDSSP_ENERGY, KSDSSP_HELIX_LENGTH, \
                    KSDSSP_STRAND_LENGTH
            """
            from chimerax.std_commands import dssp
            dssp.compute_ss(session, need_compute)
    if algorithm == "nw":
        from chimerax.alignment_algs import NeedlemanWunsch
        score, seqs = NeedlemanWunsch.nw(ref,
                                         match,
                                         score_gap=-gap_extend,
                                         score_gap_open=0 - gap_open,
                                         similarity_matrix=similarity_matrix,
                                         return_seqs=True,
                                         ss_matrix=ss_matrix,
                                         ss_fraction=ss_fraction,
                                         gap_open_helix=-gap_open_helix,
                                         gap_open_strand=-gap_open_strand,
                                         gap_open_other=-gap_open_other)
        gapped_ref, gapped_match = seqs
    elif algorithm == "sw":

        def ss_let(r):
            if not r:
                return ' '
            if r.is_helix:
                return 'H'
            elif r.is_strand:
                return 'S'
            return 'O'

        if ssf is False or ssf is None:
            ssf = 0.0
            ssm = None
        if ssm:
            # account for missing structure (blank SS letter)
            ssm = ssm.copy()
            for let in "HSO ":
                ssm[(let, ' ')] = 0.0
                ssm[(' ', let)] = 0.0
        from chimerax.alignment_algs import SmithWaterman
        score, alignment = SmithWaterman.align(
            ref.characters,
            match.characters,
            similarity_matrix,
            float(gap_open),
            float(gap_extend),
            gap_char=".",
            ss_matrix=ssm,
            ss_fraction=ssf,
            gap_open_helix=float(gap_open_helix),
            gap_open_strand=float(gap_open_strand),
            gap_open_other=float(gap_open_other),
            ss1="".join([ss_let(r) for r in ref.residues]),
            ss2="".join([ss_let(r) for r in match.residues]))
        from chimerax.atomic import StructureSeq, Sequence
        gapped_ref = StructureSeq(structure=ref.structure,
                                  chain_id=ref.chain_id)
        gapped_ref.name = ref.structure.name
        gapped_match = StructureSeq(structure=match.structure,
                                    chain_id=match.chain_id)
        gapped_match.name = match.structure.name
        # the Smith-Waterman result may not span the entirety of the sequences...
        for orig, gapped, sw in [
            (ref, gapped_ref, Sequence(characters=alignment[0])),
            (match, gapped_match, Sequence(characters=alignment[1]))
        ]:
            ungapped = sw.ungapped()
            for i in range(len(orig) - len(ungapped) + 1):
                if ungapped == orig[i:i + len(ungapped)]:
                    break
            else:
                raise ValueError("Smith-Waterman result not"
                                 " a subsequence of original sequence")
            gapped.bulk_set(orig.residues[i:i + len(ungapped)], sw.characters)
    else:
        raise ValueError("Unknown sequence alignment algorithm: %s" %
                         algorithm)

    # If the structures are disjoint snippets of the same longer SEQRES,
    # they may be able to be structurally aligned but the SEQRES records
    # will keep them apart.  Try to detect this situation and work around
    # by snipping off sequence ends.
    sr_disjoint = False
    if ref.from_seqres and match.from_seqres:
        struct_match = 0
        for i in range(len(gapped_ref)):
            uri = gapped_ref.gapped_to_ungapped(i)
            if uri is None:
                continue
            umi = gapped_match.gapped_to_ungapped(i)
            if umi is None:
                continue
            if gapped_ref.residues[uri] and gapped_match.residues[umi]:
                struct_match += 1
                if struct_match >= 3:
                    break
        if struct_match < 3:
            seq_match = 0
            for s1, s2 in zip(gapped_ref[:], gapped_match[:]):
                if s1.isalpha() and s2.isalpha():
                    seq_match += 1
                    if seq_match > 3:
                        break
            if seq_match > 3:
                need = 3 - struct_match
                if (ref.residues[:need].count(None) == 3
                or ref.residues[-need:].count(None) == 3) \
                and (match.residues[:need].count(None) == 3
                or match.residues[-need:].count(None) == 3):
                    sr_disjoint = True
    if sr_disjoint:
        from copy import copy
        clipped_ref = copy(ref)
        clipped_match = copy(match)
        for seq in (clipped_ref, clipped_match):
            num_none = 0
            for r in seq.residues:
                if r:
                    break
                num_none += 1
            if num_none:
                seq.bulk_set(seq.residues[num_none:], seq[num_none:])

            num_none = 0
            for r in reversed(seq.residues):
                if r:
                    break
                num_none += 1
            if num_none:
                seq.bulk_set(seq.residues[:-num_none], seq[:-num_none])
        return align(session,
                     clipped_ref,
                     clipped_match,
                     matrix_name,
                     algorithm,
                     gap_open,
                     gap_extend,
                     dssp_cache,
                     ss_matrix=ss_matrix,
                     ss_fraction=ss_fraction,
                     gap_open_helix=gap_open_helix,
                     gap_open_strand=gap_open_strand,
                     gap_open_other=gap_open_other,
                     compute_ss=False)
    for orig, aligned in [(ref, gapped_ref), (match, gapped_match)]:
        if hasattr(orig, '_dm_rebuild_info'):
            aligned._dm_rebuild_info = orig._dm_rebuild_info
            _dm_cleanup.append(aligned)
    return score, gapped_ref, gapped_match
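
The loop that anchors the Smith-Waterman result back onto the original sequence can be shown in isolation; this sketch strips the gap characters and slides the ungapped result along the original string (example strings are illustrative):

def locate_ungapped(original, gapped_result, gap_char='.'):
    """Return the index in 'original' where the ungapped alignment result
    starts (mirrors the subsequence search in align() above)."""
    ungapped = gapped_result.replace(gap_char, '')
    for i in range(len(original) - len(ungapped) + 1):
        if original[i:i + len(ungapped)] == ungapped:
            return i
    raise ValueError("result is not a subsequence of the original sequence")

# The local alignment covered only part of the chain and inserted one gap:
print(locate_ungapped("GSHMKVLSTAR", "KVL.STA"))   # 4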
Example #15
def model(session,
          targets,
          *,
          block=True,
          multichain=True,
          custom_script=None,
          dist_restraints=None,
          executable_location=None,
          fast=False,
          het_preserve=False,
          hydrogens=False,
          license_key=None,
          num_models=5,
          show_gui=True,
          temp_path=None,
          thorough_opt=False,
          water_preserve=False):
    """
    Generate comparative models for the target sequences.

    Arguments:
    session
        current session
    targets
        list of (alignment, sequence) tuples.  Each sequence will be modelled.
    block
        If True, wait for modelling job to finish before returning and return list of
        (opened) models.  Otherwise return immediately.  Also see 'show_gui' option.
    multichain
        If True, the associated chains of each structure are used individually to generate
        chains in the resulting models (i.e. the models will be multimers).  If False, all
        associated chains are used together as templates to generate a single-chain model
        for the target sequence.
    custom_script
        If provided, the location of a custom Modeller script to use instead of the
        one we would otherwise generate.  Only used when executing locally.
    dist_restraints
        If provided, the location of a file containing additional distance restraints
    executable_location
        If provided, the path to the locally installed Modeller executable.  If not
        provided, use the web service.
    fast
        Whether to use fast but crude generation of models
    het_preserve
        Whether to preserve HET atoms in generated models
    hydrogens
        Whether to generate models with hydrogen atoms
    license_key
        Modeller license key.  If not provided, try to use settings to find one.
    num_models
        Number of models to generate for each template sequence
    show_gui
        If True, show user interface for Modeller results (if ChimeraX is in gui mode).
    temp_path
        If provided, folder to use for temporary files
    thorough_opt
        Whether to perform thorough optimization
    water_preserve
        Whether to preserve water in generated models
    """

    from chimerax.core.errors import LimitationError, UserError
    from .common import modeller_copy
    if multichain:
        # So, first find structure with most associated chains and least non-associated chains.
        # That structure is used as the multimer template.  Chains from other structures are used
        # as "standalone" templates -- each such chain will be on its own line.  Need to allow
        # space on the left and right of the target sequence so that the largest chains can be
        # accommodated.

        # Find the structure we will use as the multimer template
        by_structure = {}
        chain_info = {}
        for alignment, orig_target in targets:
            # Copy the target sequence, changing name to conform to Modeller limitations
            target = modeller_copy(orig_target)
            if not alignment.associations:
                raise UserError("Alignment %s has no associated chains" %
                                alignment.ident)
            for chain, aseq in alignment.associations.items():
                if len(chain.chain_id) > 1:
                    raise LimitationError(
                        "Modeller cannot handle templates with multi-character chain IDs"
                    )
                by_structure.setdefault(chain.structure, []).append(chain)
                chain_info[chain] = (aseq, target)
        max_matched = min_unmatched = None
        for s, match_info in by_structure.items():
            matched = len(match_info)
            unmatched = s.num_chains - len(match_info)
            if max_matched is None or matched > max_matched or (
                    matched == max_matched and (unmatched < min_unmatched)):
                multimer_template = s
                max_matched = matched
                min_unmatched = unmatched
        mm_targets = []
        mm_chains = []
        match_chains = []
        for chain in multimer_template.chains:
            mm_chains.append(chain)
            try:
                aseq, target = chain_info[chain]
            except KeyError:
                mm_targets.append(None)
            else:
                mm_targets.append(target)
                match_chains.append(chain)
        # okay, now form single-chain lines for the other structure associations, that eventually will
        # be handled column by column in exactly the same way as the non-multichain method.
        single_template_lines = []
        for chain, info in chain_info.items():
            if chain.structure == multimer_template:
                continue
            aseq, target = info
            for i, mm_target in enumerate(mm_targets):
                if mm_target != target:
                    continue
                template_line = [None] * len(mm_targets)
                template_line[i] = chain
                single_template_lines.append(template_line)
        # AFAIK, the multimer template chain sequences need to have complete PDB sequence, so may need
        # to prefix and suffix the corresponding alignment sequence with characters for residues
        # outside of the alignment sequence.  For other templates/targets, affix a corresponding number
        # of '-' characters
        prefixes, suffixes = find_affixes(mm_chains, chain_info)
        target_strings = []
        for prefix, suffix, mm_target in zip(prefixes, suffixes, mm_targets):
            if mm_target is None:
                target_strings.append('-')
                continue
            target_strings.append('-' * len(prefix) + mm_target.characters +
                                  '-' * len(suffix))
        templates_strings = []
        templates_info = []
        mm_template_strings = []
        for prefix, suffix, chain in zip(prefixes, suffixes, mm_chains):
            try:
                aseq, target = chain_info[chain]
            except KeyError:
                mm_template_strings.append('-')
                continue
            mm_template_strings.append(
                prefix + regularized_seq(aseq, chain).characters + suffix)
        templates_strings.append(mm_template_strings)
        templates_info.append(None)
        for template_line in single_template_lines:
            template_strings = []
            for prefix, suffix, chain, target in zip(prefixes, suffixes,
                                                     template_line,
                                                     mm_targets):
                if target is None:
                    template_strings.append('-')
                elif chain is None:
                    template_strings.append(
                        '-' * (len(prefix) + len(target) + len(suffix)))
                else:
                    aseq, target = chain_info[chain]
                    template_strings.append(
                        '-' * len(prefix) +
                        regularized_seq(aseq, chain).characters +
                        '-' * len(suffix))
                    templates_info.append((chain, aseq.match_maps[chain]))
            templates_strings.append(template_strings)
        target_name = "target" if len(targets) > 1 else target.name
    else:
        if len(targets) > 1:
            raise LimitationError(
                "Cannot have multiple targets(/alignments) unless creating multimeric model"
            )
        alignment, orig_target = targets[0]
        # Copy the target sequence, changing name to conform to Modeller limitations
        target = modeller_copy(orig_target)
        target_strings = [target.characters]

        templates_strings = []
        templates_info = []
        match_chains = []
        for chain, aseq in alignment.associations.items():
            if len(chain.chain_id) > 1:
                raise LimitationError(
                    "Modeller cannot handle templates with multi-character chain IDs"
                )
            templates_strings.append([regularized_seq(aseq, chain).characters])
            templates_info.append((chain, aseq.match_maps[chain]))
            if not match_chains:
                match_chains.append(chain)

        target_name = target.name

    from .common import write_modeller_scripts, get_license_key
    script_path, config_path, temp_dir = write_modeller_scripts(
        get_license_key(session, license_key), num_models, het_preserve,
        water_preserve, hydrogens, fast, None, custom_script, temp_path,
        thorough_opt, dist_restraints)

    input_file_map = []

    # form the sequences to be written out as a PIR
    from chimerax.atomic import Sequence
    pir_target = Sequence(name=target_name)
    pir_target.description = "sequence:%s:.:.:.:.::::" % pir_target.name
    pir_target.characters = '/'.join(target_strings)
    pir_seqs = [pir_target]

    structures_to_save = set()
    for strings, info in zip(templates_strings, templates_info):
        if info is None:
            # multimer template
            pir_template = Sequence(
                name=structure_save_name(multimer_template))
            pir_template.description = "structure:%s:FIRST:%s::::::" % (
                pir_template.name, multimer_template.chains[0].chain_id)
            structures_to_save.add(multimer_template)
        else:
            # single-chain template
            chain, match_map = info
            first_assoc_pos = 0
            while first_assoc_pos not in match_map:
                first_assoc_pos += 1
            first_assoc_res = match_map[first_assoc_pos]
            pir_template = Sequence(name=chain_save_name(chain))
            pir_template.description = "structure:%s:%d%s:%s:+%d:%s::::" % (
                structure_save_name(chain.structure), first_assoc_res.number,
                first_assoc_res.insertion_code, chain.chain_id, len(match_map),
                chain.chain_id)
            structures_to_save.add(chain.structure)
        pir_template.characters = '/'.join(strings)
        pir_seqs.append(pir_template)
    import os.path
    pir_file = os.path.join(temp_dir.name, "alignment.ali")
    aln = session.alignments.new_alignment(pir_seqs,
                                           False,
                                           auto_associate=False,
                                           create_headers=False)
    aln.save(pir_file, format_name="pir")
    session.alignments.destroy_alignment(aln)
    input_file_map.append(("alignment.ali", "text_file", pir_file))

    # write the namelist.dat file, target seq name on first line, templates on remaining lines
    name_file = os.path.join(temp_dir.name, "namelist.dat")
    input_file_map.append(("namelist.dat", "text_file", name_file))
    with open(name_file, 'w') as f:
        for template_seq in pir_seqs:
            print(template_seq.name, file=f)

    config_name = os.path.basename(config_path)
    input_file_map.append((config_name, "text_file", config_path))

    # save structure files
    import os
    struct_dir = os.path.join(temp_dir.name, "template_struc")
    if not os.path.exists(struct_dir):
        try:
            os.mkdir(struct_dir, mode=0o755)
        except FileExistsError:
            pass
    from chimerax.pdb import save_pdb, standard_polymeric_res_names as std_res_names
    for structure in structures_to_save:
        base_name = structure_save_name(structure) + '.pdb'
        pdb_file_name = os.path.join(struct_dir, base_name)
        input_file_map.append((base_name, "text_file", pdb_file_name))
        ATOM_res_names = structure.in_seq_hets
        ATOM_res_names.update(std_res_names)
        save_pdb(session,
                 pdb_file_name,
                 models=[structure],
                 polymeric_res_names=ATOM_res_names)
        delattr(structure, 'in_seq_hets')

    from chimerax.atomic import Chains
    match_chains = Chains(match_chains)
    if executable_location is None:
        if custom_script is not None:
            raise LimitationError(
                "Custom Modeller scripts only supported when executing locally"
            )
        if dist_restraints is not None:
            raise LimitationError(
                "Distance restraints only supported when executing locally")
        if thorough_opt:
            session.logger.warning(
                "Thorough optimization only supported when executing locally")
        job_runner = ModellerWebService(session, match_chains, num_models,
                                        pir_target.name, input_file_map,
                                        config_name, targets, show_gui)
    else:
        #TODO: job_runner = ModellerLocal(...)
        from chimerax.core.errors import LimitationError
        raise LimitationError("Local Modeller execution not yet implemented")
        # a custom script [only used when executing locally] needs to be copied into the tmp dir...
        if os.path.exists(script_path) \
        and os.path.normpath(temp_dir.name) != os.path.normpath(os.path.dirname(script_path)):
            import shutil
            shutil.copy(script_path, temp_dir.name)

    return job_runner.run(block=block)