Example #1
0
    def test_get_moltype(self):
        """get_moltype resolves names in either case and rejects unknown ones"""
        known_labels = ("dna", "rna", "protein", "protein_with_stop")
        for expected in known_labels:
            # the lower case and the upper case name give the same moltype
            for query in (expected, expected.upper()):
                self.assertEqual(get_moltype(query).label, expected)

        # the DNA object itself can be passed instead of a name
        self.assertEqual(get_moltype(DNA).label, "dna")

        # a name that is not known is an error
        with self.assertRaises(ValueError):
            get_moltype("blah")
Example #2
0
    def test_strand_symmetry(self):
        """strand_symmetry gives the expected counts for DNA, RNA and Aligned"""
        from cogent3 import get_moltype
        from cogent3.core.alignment import Aligned

        raw = "ACGGCTGAAGCGCTCCGGGTTTAAAACG"
        expected_observed = [[7, 5], [7, 9]]

        dna = DnaSequence(raw)
        result = dna.strand_symmetry(motif_length=1)
        assert_allclose(result.observed.array, expected_observed)
        assert_allclose(result.expected.array, [[6, 6], [8, 8]])

        # RNA too
        rna = dna.to_rna()
        result = rna.strand_symmetry(motif_length=1)
        assert_allclose(result.observed.array, expected_observed)

        # Aligned, built from the gap-parsed sequence
        gap_map, ungapped = DnaSequence(raw).parse_out_gaps()
        aligned = Aligned(gap_map, ungapped)
        result = aligned.strand_symmetry(motif_length=1)
        assert_allclose(result.observed.array, expected_observed)

        # a sequence of the "text" moltype is expected to fail
        with self.assertRaises(TypeError):
            text = get_moltype("text")
            _, text_seq = text.make_seq(raw).parse_out_gaps()
            text_seq.strand_symmetry(motif_length=1)

        # with motif_length=2
        dinucleotides = "AC GG CT GA AG CG CT CC GG GT TT AA AA CG"
        dna = DnaSequence(dinucleotides.replace(" ", ""))
        result = dna.strand_symmetry(motif_length=2)
        self.assertLessEqual(len(result.observed.keys()), 8)
        assert_allclose(result.observed["AA"].to_array(), [2, 1])
        assert_allclose(result.observed["CC"].to_array(), [1, 2])
Example #3
0
def read(filepath):
    """returns MotifFreqsArray matrix

    Parameters
    ----------
    filepath
        path of the file to read or, if open() rejects the value with a
        TypeError, an iterable of lines that is used directly

    Notes
    -----
    Every line is split on whitespace and the resulting table is transposed.
    The first transposed row is skipped, each following one provides a state
    (its first item) and the values for that state (converted to float).
    Four states are treated as "rna" if "U" is one of them, otherwise as
    "dna". Any other number of states is treated as "protein". The values
    are arranged in the order obtained by iterating over that moltype.
    """
    try:
        # the context manager closes the file even if reading it fails
        with open(filepath) as infile:
            data = infile.readlines()
    except TypeError:
        # not something open() accepts, use the value as the lines
        data = filepath

    columns = list(zip(*(line.split() for line in data)))
    states = [column[0] for column in columns[1:]]
    values = [[float(v) for v in column[1:]] for column in columns[1:]]
    by_state = dict(zip(states, values))

    if len(states) == 4:
        name = "rna" if "U" in states else "dna"
    else:
        name = "protein"

    states = list(get_moltype(name))
    matrix = array([by_state[s] for s in states], dtype=float)

    return MotifFreqsArray(matrix.T, states)
Example #4
0
    def __init__(self, allowed_frac=0.99, motif_length=1, moltype=None):
        """
        Parameters
        ----------
        allowed_frac : float
            upper limit on the fraction of gap characters in a column,
            columns exceeding it are excluded
        motif_length : int
            size of the non-overlapping tuples sequences are split into
        moltype : str
            molecular type, if provided it must be either DNA or RNA
        """
        super(omit_gap_pos, self).__init__(
            input_types=self._input_types,
            output_types=self._output_types,
            data_types=self._data_types,
        )
        self._formatted_params()

        self._allowed_frac = allowed_frac
        self._motif_length = motif_length
        self.func = self.omit

        # a falsy moltype is stored as given, anything else must be nucleic
        if moltype:
            moltype = get_moltype(moltype)
            assert moltype.label.lower() in ("dna", "rna"), "Invalid moltype"
        self.moltype = moltype
Example #5
0
    def __init__(self, moltype="dna", gc=1, allow_rc=False, trim_terminal_stop=True):
        """generates aa sequences

        Parameters
        ----------
        moltype : str
            molecular type, must be either DNA or RNA
        gc
            identifier for a genetic code or a genetic code instance
        allow_rc : bool
            accepted, but not used by this constructor
        trim_terminal_stop : bool
            exclude terminal stop codon from seqs

        Returns
        -------
        A sequence collection. Sequences that could not be translated
        are excluded.
        """
        super(translate_seqs, self).__init__(
            input_types=self._input_types,
            output_types=self._output_types,
            data_types=self._data_types,
        )
        self._formatted_params()

        nucleic = get_moltype(moltype)
        assert nucleic.label.lower() in ("dna", "rna"), "Invalid moltype"
        self._moltype = nucleic

        self._gc = get_code(gc)
        self._trim_terminal_stop = trim_terminal_stop
        self.func = self.get_translated
Example #6
0
def read(filepath):
    """returns matrixid and MotifCountsArray matrix

    A line starting with ">" provides the identifier, as the list of
    whitespace separated fields following the ">". Every other non-empty
    line provides a state followed by its integer counts, after whatever
    the module level _brackets pattern matches has been removed.
    """
    counts = []
    states = []
    with open(filepath) as infile:
        for raw in infile:
            raw = raw.strip()
            if not raw:
                continue

            if raw.startswith(">"):
                identifier = raw[1:].split()
                continue

            fields = _brackets.sub("", raw).split()
            states.append(fields[0].upper())
            counts.append([int(value) for value in fields[1:]])

    by_state = dict(zip(states, counts))
    if len(states) != 4:
        name = "protein"
    elif "U" in states:
        name = "rna"
    else:
        name = "dna"

    # rows follow the order obtained by iterating over the moltype
    states = list(get_moltype(name))
    ordered = array([by_state[s] for s in states], dtype=int)

    return identifier, MotifCountsArray(ordered.T, states)
Example #7
0
 def __init__(self, length, motif_length=1, subtract_degen=True, moltype=None):
     """
     Parameters
     ----------
     length : int
         only alignments with this length returned, False otherwise
     motif_length : int
         when greater than 1, length is floor divided by this value
     subtract_degen : bool
         degenerate characters subtracted from sequence length calculation
     moltype
         molecular type, can be string or instance
     """
     super(min_length, self).__init__(
         input_types=self._input_types,
         output_types=self._output_types,
         data_types=self._data_types,
     )
     self._formatted_params()

     self._subtract_degen = subtract_degen
     self._motif_length = motif_length
     # the minimum is stored in units of motifs
     self._min_length = length // motif_length if motif_length > 1 else length
     # a falsy moltype is stored as given
     self._moltype = get_moltype(moltype) if moltype else moltype
     self.func = self.if_long_enough
Example #8
0
 def test_roundtrip_alphabet(self):
     """an alphabet is recovered from its to_json() output"""
     original = moltype.get_moltype("dna").alphabet
     restored = deserialise_object(original.to_json())
     # same class, and the same members in the same order
     self.assertEqual(type(restored), type(original))
     self.assertEqual(list(restored), list(original))
Example #9
0
def make_unaligned_seqs(
    data, moltype=None, label_to_name=None, info=None, source=None, **kw
):
    """Create a SequenceCollection from unaligned sequences.

    Parameters
    ----------
    data
        sequences
    moltype
        the moltype, eg DNA, PROTEIN, 'dna', 'protein'. If None, it is
        passed on without being resolved.
    label_to_name
        function for converting original name into another name.
    info
        a dict from which to make an info object, its "source" entry is set
        by this function
    source
        origins of this data, defaults to 'unknown'
    **kw
        other keyword arguments passed to SequenceCollection. The contents
        of "constructor_kw" and "kw" entries, if present, are merged into
        these.
    """
    moltype = None if moltype is None else get_moltype(moltype)

    # nested keyword dicts are lifted into the top level keywords
    for nested_key in ("constructor_kw", "kw"):
        nested = kw.pop(nested_key, None)
        if nested:
            kw.update(nested)

    info = info or {}
    assert isinstance(info, dict), "info must be a dict"
    info["source"] = source or "unknown"

    return SequenceCollection(
        data=data, moltype=moltype, label_to_name=label_to_name, info=info, **kw
    )
Example #10
0
    def __init__(self, quantile=None, gap_fraction=1, moltype="dna"):
        """Returns an alignment without the sequences responsible for
        exceeding the gap limits set by quantile and gap_fraction.

        Parameters
        ----------
        quantile : float or None
            The number of gaps uniquely introduced by a sequence are counted.
            The value corresponding to quantile is determined and all sequences
            whose unique gap count is larger than this cutoff are excluded.
            If None, this condition is not applied.
        gap_fraction
            sequences whose proportion of gaps is >= this value are excluded, the
            default excludes sequences that are just gaps.
        moltype
            molecular type, can be string or instance. Its label must be one
            of dna, rna, protein or protein_with_stop. A falsy value is not
            resolved and fails at the label check.
        """
        super(omit_bad_seqs, self).__init__(
            input_types=self._input_types,
            output_types=self._output_types,
            data_types=self._data_types,
        )
        self._formatted_params()
        if moltype:
            moltype = get_moltype(moltype)
        # the label has to match exactly, testing for a substring of one
        # long string would also accept fragments such as "na"
        assert moltype.label.lower() in (
            "dna",
            "rna",
            "protein",
            "protein_with_stop",
        ), "moltype must be one of DNA, RNA or PROTEIN"
        self._quantile = quantile
        self._gap_fraction = gap_fraction
        self._moltype = moltype
        self.func = self.drop_bad_seqs
Example #11
0
def translate_frames(seq, moltype=None, gc=1, allow_rc=False):
    """translates a nucleic acid sequence

    Parameters
    ----------
    seq
        the sequence to translate, converted with moltype.make_seq when a
        moltype is provided
    moltype
        molecular type, must be either DNA or RNA
    gc
        identifier for a genetic code or a genetic code instance
    allow_rc : bool
        includes frames sequence reverse complement

    Returns
    -------
    [(frame, translation), ..]
    Reverse complement frame numbers are negative
    """
    code = get_code(gc)
    if moltype:
        seq = get_moltype(moltype).make_seq(seq)

    frames = code.sixframes(seq)
    # only the first three entries are kept unless rc frames were requested
    return frames if allow_rc else frames[:3]
Example #12
0
    def __init__(self, moltype=None, gap_is_degen=True, motif_length=1):
        """excludes degenerate characters from alignment

        Parameters
        ----------
        moltype : str
            molecular type, if provided it must be either DNA or RNA
        gap_is_degen : bool
            include gap character in degenerate character set
        motif_length : int
            sequences split into non-overlapping tuples of this size. If a
            tuple contains a degen character at any position the entire tuple
            is excluded
        """
        super(omit_degenerates, self).__init__(
            input_types=self._input_types,
            output_types=self._output_types,
            data_types=self._data_types,
        )
        self._formatted_params()

        self._motif_length = motif_length
        # gaps are allowed only when they do not count as degenerate
        self._allow_gap = not gap_is_degen
        # NOTE(review): this stores the object named omit_degenerates, not a
        # flag -- confirm that is intended
        self._no_degen = omit_degenerates
        self.func = self.filter_degenerates

        # a falsy moltype is stored as given, anything else must be nucleic
        if moltype:
            moltype = get_moltype(moltype)
            assert moltype.label.lower() in ("dna", "rna"), "Invalid moltype"
        self.moltype = moltype
Example #13
0
 def __init__(self, moltype=None, format="fasta"):
     """
     Parameters
     ----------
     moltype
         molecular type, string or instance. A falsy value is stored as
         given.
     format : str
         sequence file format, used (lower cased) to select the parser
     """
     accepted_data = (
         "DataStoreMember",
         "str",
         "Path",
         "ArrayAlignment",
         "Alignment",
         "SequenceCollection",
     )
     super(ComposableSeq, self).__init__(
         input_types=None,
         output_types=("sequences", "serialisable"),
         data_types=accepted_data,
     )
     # the loader base class is initialised explicitly as well
     _seq_loader.__init__(self)
     self._formatted_params()
     self.moltype = get_moltype(moltype) if moltype else moltype
     self._parser = PARSERS[format.lower()]
Example #14
0
    def __init__(
        self,
        *positions,
        fourfold_degenerate=False,
        gc="Standard Nuclear",
        moltype="dna",
    ):
        """selects the indicated codon positions from an alignment

        Parameters
        ----------
        positions
            codon position numbers, each of which must be 1, 2 or 3. They
            are stored zero based: a single position as an int, several
            positions as a sorted tuple.
        fourfold_degenerate : bool
            if True, returns third positions from four-fold degenerate codons.
            Overrides positions.
        gc
            identifier for a genetic code or a genetic code instance
        moltype : str
            molecular type, must be either DNA or RNA
        """
        super(take_codon_positions, self).__init__(
            input_types=self._input_types,
            output_types=self._output_types,
            data_types=self._data_types,
        )
        self._formatted_params()
        assert moltype is not None
        moltype = get_moltype(moltype)
        assert moltype.label.lower() in ("dna", "rna"), "Invalid moltype"

        self._moltype = moltype
        self._four_fold_degen = fourfold_degenerate
        self._fourfold_degen_sets = None

        if fourfold_degenerate:
            # positions play no role in this mode
            self._fourfold_degen_sets = get_fourfold_degenerate_sets(
                get_code(gc), alphabet=moltype.alphabet, as_indices=True
            )
            self.func = self.take_fourfold_positions
            return

        assert (
            1 <= min(positions) <= 3 and 1 <= max(positions) <= 3
        ), "Invalid codon positions"

        if len(positions) == 1:
            # a single position is kept as a plain index
            self._positions = positions[0] - 1
            self.func = self.take_codon_position
        else:
            self._positions = tuple(p - 1 for p in sorted(positions))
            self.func = self.take_codon_positions
Example #15
0
 def test_count_ab(self):
     """counts on an "ab" array seq, without and with the gap character"""
     ab_type = get_moltype("ab")
     gapped_alphabet = ab_type.alphabet.with_gap_motif()
     seq = ab_type.make_array_seq("aaba-", alphabet=gapped_alphabet)

     # the gap is left out by default
     without_gap = seq.counts()
     self.assertEqual(without_gap.to_dict(), {"a": 3, "b": 1})

     with_gap = seq.counts(allow_gap=True)
     self.assertEqual(with_gap.to_dict(), {"a": 3, "b": 1, "-": 1})
Example #16
0
    def __init__(
        self,
        length,
        start=0,
        random=False,
        seed=None,
        motif_length=1,
        moltype=None,
    ):
        """
        Parameters
        ----------
        length : int
            only alignments with this length returned, False otherwise
        start
            integer starting position for truncation, or 'random' in which case
            a random start is chosen (within the possible range returning an
            alignment of the specified length). Overrides  `random`.
        random : bool
            random positions for the corresponding tuple are chosen.
        seed : int
            random number seed. Any value other than None is applied,
            including 0.
        motif_length : int
            length of sequence units to consider. If not 1, length and start are
            converted (reduced) if necessary to be modulo motif_length
        moltype
            molecular type, can be string or instance
        """
        super(fixed_length, self).__init__(
            input_types=self._input_types,
            output_types=self._output_types,
            data_types=self._data_types,
        )
        self._formatted_params()
        # reduce length to a whole number of motifs
        length -= length % motif_length
        assert length % motif_length == 0

        self._length = length
        self._motif_length = motif_length
        if moltype:
            moltype = get_moltype(moltype)
        self._moltype = moltype
        if isinstance(start, str):
            assert start.lower().startswith("rand")
            # a random start takes precedence over sampling random positions
            random = False
        else:
            # exact type match, so bool and numpy integers are rejected
            assert type(start) == int
            assert start >= 0
            # reduce start to a motif boundary
            start -= start % motif_length

        self._start = _GetStart(start)
        # compare with None, a seed of 0 is valid and must not be skipped
        if seed is not None:
            np_random.seed(seed)

        # only a value equal to False selects truncation, every other value
        # (None included) selects sampling of positions
        self.func = {False: self.truncated}.get(random, self.sample_positions)
Example #17
0
 def test_roundtrip_seq(self):
     """a sequence is recovered from its to_json() output"""
     for label in ("dna", "protein"):
         mol = moltype.get_moltype(label)
         original = mol.make_seq("ACGGTCGG", "label", info={"something": 3})
         restored = deserialise_object(original.to_json())
         # info, name, moltype and the sequence itself all survive
         self.assertEqual(restored.info.something, 3)
         self.assertEqual(restored.name, "label")
         self.assertEqual(restored.moltype, original.moltype)
         self.assertEqual(str(restored), str(original))
Example #18
0
def deserialise_moltype(data):
    """returns a cogent3 MolType instance, or a CodonAlphabet

    data is modified in place: the "version", "type" and (for a codon
    alphabet) "genetic_code" entries are removed and the "moltype" entry is
    replaced by the resolved moltype.
    """
    data.pop("version", None)
    resolved = get_moltype(data["moltype"])
    data["moltype"] = resolved

    if _get_class(data.pop("type")) == _CodonAlphabet:
        # the remaining entries are the constructor arguments
        code = get_code(data.pop("genetic_code"))
        alphabet = _CodonAlphabet(**data)
        alphabet._gc = code
        return alphabet

    return resolved
Example #19
0
def deserialise_alphabet(data):
    """returns a cogent3 Alphabet instance

    Codon alphabets are handed to deserialise_moltype. For the others, the
    motifs come from the "data" entry if there is one, else from "motifset",
    and the remaining entries are passed to the class as keyword arguments.
    data is modified in place.
    """
    data.pop("version", None)
    if _get_class(data.get("type")) == _CodonAlphabet:
        return deserialise_moltype(data)

    data["moltype"] = get_moltype(data["moltype"])
    motifs = data.pop("data" if "data" in data else "motifset")
    alphabet_class = _get_class(data.pop("type"))
    return alphabet_class(motifs, **data)
Example #20
0
 def test_jaspar(self):
     """the sample jaspar file gives the expected identifier and counts"""
     identifier, counts = jaspar.read("data/sample.jaspar")
     assert identifier == ["PSSMid", "HGNCsymbol"], "ID line wrong"
     # note state indices are ordered by moltype
     base_order = list(get_moltype("dna"))
     # one row per state here, the counts matrix is the transpose
     per_state = array(
         [
             [35, 374, 30, 121, 6, 121, 33],
             [0, 10, 0, 0, 3, 2, 44],
             [352, 3, 354, 268, 360, 222, 155],
             [2, 2, 5, 0, 10, 44, 157],
         ]
     )
     assert_array_equal(counts.array, per_state.T)
     self.assertEqual(counts[0, "A"], 352)
     self.assertEqual(counts[3, "T"], 121)
Example #21
0
    def __init__(
        self,
        ref_seq="longest",
        score_matrix=None,
        insertion_penalty=20,
        extension_penalty=2,
        moltype="dna",
    ):
        """
        Parameters
        ----------
        ref_seq : str
            either a name to be found in the data, or 'longest' (in any
            case). If latter, the longest sequence will be chosen as the
            reference
        score_matrix
            scoring dict. If not provided, `make_dna_scoring_dict(10, -1, -8)`
            is used for the dna moltype and
            `make_generic_scoring_dict(10, moltype)` for any other
        insertion_penalty
            penalty for gap insertion
        extension_penalty
            penalty for gap extension
        moltype : str
            molecular type, required
        """
        super(align_to_ref, self).__init__(
            input_types=self._input_types,
            output_types=self._output_types,
            data_types=self._data_types,
        )
        self._formatted_params()
        assert moltype
        self._moltype = get_moltype(moltype)

        if score_matrix:
            scores = score_matrix
        elif self._moltype.label == "dna":
            scores = make_dna_scoring_dict(10, -1, -8)
        else:
            scores = make_generic_scoring_dict(10, self._moltype)

        self._kwargs = {
            "S": scores,
            "d": insertion_penalty,
            "e": extension_penalty,
            "return_score": False,
        }

        if ref_seq.lower() == "longest":
            self.func = self.align_to_longest
        else:
            self.func = self.align_to_named_seq
            self._ref_name = ref_seq

        self._gap_state = None  # can be character or int, depends on aligner
Example #22
0
def Sequence(moltype=None, seq=None, name=None, filename=None, format=None):
    """returns a sequence object made from seq, or read from filename

    Parameters
    ----------
    moltype
        if not None, it is resolved with get_moltype and used to make the
        sequence. Otherwise, ASCII is used unless seq already has a moltype
        attribute.
    seq
        the sequence. If None, it is read using FromFilenameParser.
    name
        name assigned to the sequence. If None and the sequence is read
        from file, the name found in the file is used.
    filename, format
        passed to FromFilenameParser when seq is None

    Raises
    ------
    ValueError if the file provides more than one sequence
    """
    if seq is None:
        for parsed_name, parsed_seq in FromFilenameParser(filename, format):
            if seq is not None:
                raise ValueError("Multiple sequences in '%s'" % filename)

            seq = parsed_seq
            if name is None:
                name = parsed_name

    if moltype is not None:
        seq = get_moltype(moltype).make_seq(seq)
    elif not hasattr(seq, "moltype"):
        seq = ASCII.make_seq(seq)

    if name is not None:
        seq.name = name
    return seq
Example #23
0
    def __init__(self, moltype="dna", gc=1, allow_rc=False, trim_terminal_stop=True):
        """selects translatable sequences

        Sequences are truncated to modulo 3. seqs.info has a translation_errors
        entry.

        Parameters
        ----------
        moltype : str
            molecular type, must be either DNA or RNA
        gc
            identifier for a genetic code or a genetic code instance
        allow_rc : bool
            If False, forward strand considered only. If True, and
              best frame on rc, it will be negative
        trim_terminal_stop : bool
            exclude terminal stop codon from seqs

        Returns
        -------
        A sequence collection. Sequences that could not be translated
        are excluded.
        """
        super(select_translatable, self).__init__(
            input_types=self._input_types,
            output_types=self._output_types,
            data_types=self._data_types,
        )
        self._formatted_params()

        nucleic = get_moltype(moltype)
        assert nucleic.label.lower() in ("dna", "rna"), "Invalid moltype"
        self._moltype = nucleic

        self._gc = get_code(gc)
        self._allow_rc = allow_rc
        self._trim_terminal_stop = trim_terminal_stop
        self.func = self.get_translatable
Example #24
0
 def __init__(self, moltype=None, format="fasta"):
     """
     Parameters
     ----------
     moltype
         molecular type, string or instance. A falsy value is stored as
         given.
     format : str
         sequence file format, used (lower cased) to select the parser
     """
     type_settings = {
         "input_types": self._input_types,
         "output_types": self._output_types,
         "data_types": self._data_types,
     }
     super(ComposableSeq, self).__init__(**type_settings)
     # the loader base class is initialised explicitly as well
     _seq_loader.__init__(self)
     self._formatted_params()
     self.moltype = get_moltype(moltype) if moltype else moltype
     self._parser = PARSERS[format.lower()]
Example #25
0
File: io.py Project: jbw900/cogent3
 def __init__(self, moltype=None, format="fasta"):
     """
     Parameters
     ----------
     moltype
         molecular type, string or instance. A falsy value is stored as
         given.
     format : str
         sequence file format, used (lower cased) to select the parser
     """
     produced = (SEQUENCE_TYPE, SERIALISABLE_TYPE)
     accepted_data = ("DataStoreMember", "str", "Path")
     super(ComposableSeq, self).__init__(
         input_types=None, output_types=produced, data_types=accepted_data
     )
     # the loader base class is initialised explicitly as well
     _seq_loader.__init__(self)
     self._formatted_params()
     self.moltype = get_moltype(moltype) if moltype else moltype
     self._parser = PARSERS[format.lower()]
Example #26
0
    def __init__(self, mask_degen=False, choose="longest", seed=None, moltype=None):
        """Returns unique sequences, adds 'dropped' key to seqs.info

        Parameters
        ----------
        mask_degen
            if True, degenerate characters are ignored
        choose
            choose a representative from sets of duplicated sequences.
            Valid values are None (all members of a duplicated set are excluded),
            'longest', 'random'. Any other truthy value fails the assertion.
        seed : int
            set random number seed. Only applied if choose=='random'. Any
            value other than None is applied, including 0.
        moltype
            molecular type, can be string or instance
        """
        super(omit_duplicated, self).__init__(
            input_types=self._input_types,
            output_types=self._output_types,
            data_types=self._data_types,
        )

        # exact match required, a substring test against "longestrandom"
        # would let values such as "long" through to the take_unique branch
        assert not choose or choose in ("longest", "random")
        self._formatted_params()
        if moltype:
            moltype = get_moltype(moltype)
        self._moltype = moltype
        # compare with None, a seed of 0 is valid and must not be skipped
        if choose == "random" and seed is not None:
            np_random.seed(seed)

        self._mask_degen = mask_degen
        if choose == "longest":
            self.func = self.choose_longest
        elif choose == "random":
            self.func = self.choose_random
        else:
            self.func = self.take_unique
Example #27
0
def make_seq(seq, name=None, moltype=None):
    """
    Parameters
    ----------
    seq : str
        raw string to be converted to sequence object
    name : str
        sequence name, only assigned if it is not None
    moltype
        name of a moltype or moltype instance, "text" is used if this is
        falsy

    Returns
    -------
    returns a sequence object
    """
    made = get_moltype(moltype or "text").make_seq(seq)
    if name is not None:
        made.name = name
    return made
Example #28
0
def deserialise_seq(data, aligned=False):
    """deserialises sequence and any annotations

    Parameters
    ----------
    data : dict
        a result of json.loads of a to_rich_dict(). It is modified in place.
    aligned
        whether sequence type is for an Alignment, in which case an Aligned
        instance will be returned. This is also the case whenever the
        sequence contains "-".

    Returns
    -------
    the sequence made by the moltype, wrapped in an Aligned instance (with
    gaps parsed out) if aligned
    """
    from cogent3.core.moltype import get_moltype

    data.pop("version", None)
    moltype = get_moltype(data.pop("moltype"))
    annotations = data.pop("annotations", None)
    make_seq = moltype.make_seq
    # the class is looked up but not used, the sequence is made by the moltype
    _get_class(data.pop("type"))

    has_gaps = "-" in data["seq"]
    as_aligned = aligned or has_gaps

    # whatever remains in data is passed to make_seq
    seq = make_seq(**data)
    if as_aligned:
        gap_map, seq = seq.parse_out_gaps()

    # annotations are applied to the sequence before any wrapping
    if annotations:
        deserialise_annotation(annotations, seq)

    return Aligned(gap_map, seq) if as_aligned else seq
Example #29
0
def deserialise_seq_collections(data):
    """returns a cogent3 sequence/collection/alignment instance

    data is modified in place. Each member of its "seqs" entry is given the
    resolved moltype and deserialised with deserialise_seq. Entries left in
    data are passed to the class as keyword arguments.
    """
    # We first try to load moltype/alphabet using get_moltype
    from cogent3.core.moltype import get_moltype

    data.pop("version", None)
    moltype = get_moltype(data.pop("moltype"))
    data["moltype"] = moltype
    annotations = data.pop("annotations", None)

    type_ = data.pop("type")
    collection_class = _get_class(type_)
    assert "alignment" in type_.lower(), "not alignment type"
    # members are treated as aligned unless the type is a SequenceCollection
    aligned = not type_.endswith("SequenceCollection")

    members = []
    for record in data.pop("seqs").values():
        record["moltype"] = moltype
        members.append(deserialise_seq(record, aligned=aligned))

    collection = collection_class(members, **data)
    if annotations:
        deserialise_annotation(annotations, collection)

    return collection
Example #30
0
    def __init__(
        self,
        seq1,
        seq2,
        moltype="text",
        window=20,
        threshold=None,
        min_gap=0,
        rc=False,
        xtitle=None,
        ytitle=None,
        title=None,
        width=500,
        show_progress=False,
    ):
        """
        Parameters
        ----------
        seq1, seq2 : string or sequence object
        moltype : str or MolType instance
            if seq1, seq2 are strings, moltype is used to convert to sequence
            objects. If seq1 has a moltype attribute, that is used instead.
        window : int
            k-mer size for comparison between sequences
        threshold : int
            windows where the sequences are identical >= threshold are a match.
            If None, a value is obtained from suitable_threshold.
        min_gap : int
            permitted gap for joining adjacent line segments, default is no gap
            joining
        rc : bool or None
            include dotplot of reverse complement also. Only applies to Nucleic
            acids moltypes
        xtitle, ytitle
            name of the seq1, seq2. None if included as part of a
            AnnotatedDrawable
        title : str
            title for the plot
        width : int
            figure width, the height is width * len(seq2) / len(seq1)
        show_progress : bool
            displays progress bar
        """
        from cogent3.core.alignment import Aligned

        # we ensure sequences have gaps parsed and the calculate aspect ratio
        moltype = seq1.moltype if hasattr(seq1, "moltype") else get_moltype(moltype)

        # checked before conversion replaces seq1 and seq2
        is_aligned = isinstance(seq1, Aligned) and isinstance(seq2, Aligned)
        map1, seq1 = _convert_input(seq1, moltype)
        map2, seq2 = _convert_input(seq2, moltype)
        len1 = len(seq1)
        len2 = len(seq2)
        height = width * len2 / len1

        super(Dotplot, self).__init__(
            visible_axes=True, showlegend=True, width=width, height=height
        )

        self.seq1 = seq1
        self.seq2 = seq2
        self._aligned_coords = get_align_coords(map1, map2, aligned=is_aligned)

        self.xtitle = xtitle
        self.ytitle = ytitle
        self.title = title
        self._window = window
        self._min_gap = min_gap

        if threshold is None:
            # derive a threshold from the sequence lengths and the window
            universe = (len1 - window) * (len2 - window)
            acceptable_noise = min(len1, len2) / window
            threshold = suitable_threshold(window, acceptable_noise / universe)
        self._threshold = threshold

        self._fwd, self._rev = get_dotplot_coords(
            self.seq1,
            self.seq2,
            window=window,
            threshold=threshold,
            min_gap=min_gap,
            rc=rc,
            show_progress=show_progress,
        )