def test_multiple_seqs():
    """Check k_mer_frequencies when the input is a list of sequences."""
    # (sequences, k, include_missing, expected mapping)
    dict_cases = [
        (["A", "A"], 1, False, {1: {"A": 1.0}}),
        (["A", "T"], 1, False, {1: {"A": 0.5, "T": 0.5}}),
        (["AA", "TT"], 2, False, {2: {"AA": 0.5, "TT": 0.5}}),
        (["A", "T"], 1, True, {1: {"A": 0.5, "T": 0.5, "G": 0.0, "C": 0.0}}),
    ]
    for seqs, k, include_missing, expected in dict_cases:
        observed = k_mer_frequencies(seqs, k, include_missing=include_missing)
        assert observed == expected

    # the vector form of the last case
    as_vector = k_mer_frequencies(["A", "T"],
                                  1,
                                  include_missing=True,
                                  vector=True)
    assert np.array_equal(as_vector, np.array([0.5, 0.0, 0.0, 0.5]))
Ejemplo n.º 2
0
def test_vectorization():
    """Check the vector output for single bases and for a two-base sequence."""
    # check that the ordering is alphabetical (positions: A, C, G, T)
    expected_by_seq = [
        ("A", [1.0, 0.0, 0.0, 0.0]),
        ("T", [0.0, 0.0, 0.0, 1.0]),
        ("G", [0.0, 0.0, 1.0, 0.0]),
        ("C", [0.0, 1.0, 0.0, 0.0]),
        ("AT", [0.5, 0.0, 0.0, 0.5]),
    ]
    for seq, expected in expected_by_seq:
        vector = k_mer_frequencies(seq, 1, include_missing=True, vector=True)
        assert np.array_equal(vector, np.array(expected))
def test_multiple_k():
    """Check k_mer_frequencies when k is given as a list of values."""
    # expected vector for "AA": 4 entries for k = 1 followed by 16 for k = 2
    one_mers = [1.0, 0.0, 0.0, 0.0]
    two_mers = [1.0] + [0.0] * 15
    expected_vector = np.array(one_mers + two_mers)

    # the same vector is expected whichever order the k values are passed in
    for ks in ([1, 2], [2, 1]):
        vector = k_mer_frequencies("AA", ks, include_missing=True, vector=True)
        assert np.array_equal(vector, expected_vector)

    # without vector=True the result is keyed by k
    nested = k_mer_frequencies("AA", [1, 2], include_missing=False)
    assert nested == {1: {"A": 1.0}, 2: {"AA": 1.0}}
Ejemplo n.º 4
0
def aa(filepath, mode, trans_table, length, stop_codon, output, verbose):
    """Print a translated sequence or generate a new amino acid sequence.

    Args:
        filepath: path to a FASTA file.
        mode: "seq" translates the single record in the file and prints it;
            any other value uses the frequency mode.
        trans_table: translation table passed to ``Seq.translate``.
        length: length passed to ``amino_acid_seq`` in frequency mode.
        stop_codon: if truthy, "*" is appended to the generated sequence.
        output: if truthy, path of a FASTA file to write the result to.
        verbose: if truthy, the generated sequence is printed.
    """
    # translate the DNA seq, if using exact AA seq
    if mode == "seq":
        print(SeqIO.read(filepath, "fasta").seq.translate(table=trans_table))
        return

    # if using frequency mode
    seqs = []
    with open(filepath, "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            try:
                aa_seq = str(record.seq.translate(table=trans_table))
            except Bio.Data.CodonTable.TranslationError:
                # could not be translated, so use the record as it is
                aa_seq = str(record.seq)
            # str.replace returns a new string; keep the result so that stop
            # symbols do not take part in the frequency calculation
            seqs.append(aa_seq.replace("*", ""))

    # NOTE(review): the whole return value of k_mer_frequencies is passed on;
    # confirm that this is the mapping amino_acid_seq expects
    aa_seq = amino_acid_seq(length, k_mer_frequencies("".join(seqs), 1))
    if stop_codon:
        aa_seq += "*"
    if output:
        with open(output, "w+") as output_handle:
            SeqIO.write(
                SeqRecord(Seq(aa_seq),
                          id="Optimized by Freqgen",
                          description=""), output_handle, "fasta")
    if verbose:
        print(aa_seq)
Ejemplo n.º 5
0
def test_amino_acid():
    """Check the 1-mer frequencies of an amino acid sequence."""
    # each of the six residues occurs once in "INQTEL"
    one_sixth = 0.16666666666666666
    expected = {1: {residue: one_sixth for residue in "EILNQT"}}
    observed = k_mer_frequencies("INQTEL", 1, include_missing=False)
    assert observed == expected
Ejemplo n.º 6
0
def featurize(filepath, k, codon_usage, output):
    """Calculate k-mer and/or codon frequencies of a FASTA file as YAML.

    Args:
        filepath: path to the FASTA file holding the sequences.
        k: value(s) of k passed to ``k_mer_frequencies``; if falsy, no k-mer
            frequencies are calculated.
        codon_usage: if truthy, codon frequencies of the concatenated
            sequences are stored under the "codons" key.
        output: if truthy, path of the file the YAML is written to; otherwise
            the YAML is printed.

    Raises:
        ValueError: if codon usage is requested and the length of any
            sequence is not divisible by 3.
    """
    # get the sequences as strs
    with open(filepath, "r") as handle:
        seqs = [str(record.seq) for record in SeqIO.parse(handle, "fasta")]

    result = k_mer_frequencies(seqs, k, include_missing=True) if k else {}

    # get the codon usage frequencies
    if codon_usage:
        if any(len(seq) % 3 != 0 for seq in seqs):
            raise ValueError(
                "Cannot calculate codons for sequence whose length is not divisible by 3"
            )
        result["codons"] = codon_frequencies("".join(seqs))

    if output:
        # use a context manager so the output file is flushed and closed
        with open(output, "w+") as output_handle:
            yaml.dump(result, output_handle, default_flow_style=False)
        return
    print(yaml.dump(result, default_flow_style=False))
Ejemplo n.º 7
0
def test_dna():
    """Check the 3-mer frequencies of a short DNA sequence."""
    observed = k_mer_frequencies("GATGATGGC", 3, include_missing=False)

    # the sequence has 7 overlapping 3-mers: ATG and GAT occur twice, the
    # remaining three occur once
    twice = 0.2857142857142857
    once = 0.14285714285714285
    expected = {
        'ATG': twice,
        'GAT': twice,
        'GGC': once,
        'TGA': once,
        'TGG': once,
    }
    assert observed == expected
Ejemplo n.º 8
0
def aa(filepath, mode, genetic_code, length, stop_codon, output, verbose):
    """Produce an amino acid sequence from a FASTA file.

    Args:
        filepath: path to the FASTA file.
        mode: "seq" to translate the single record in the file, or "freq" to
            generate a new sequence from the 1-mer frequencies of all records.
        genetic_code: translation table passed to ``Seq.translate``.
        length: length of the generated sequence; required in "freq" mode.
        stop_codon: if truthy, "*" is appended to the result.
        output: if truthy, path of a FASTA file to write the result to.
        verbose: if truthy, the result is printed even when ``output`` is set.

    Raises:
        ValueError: if ``mode`` is neither "seq" nor "freq".
    """
    # translate the DNA seq, if using exact AA seq
    if mode == "seq":
        try:
            aa_seq = SeqIO.read(filepath,
                                "fasta").seq.translate(table=genetic_code)
        except Bio.Data.CodonTable.TranslationError:
            print(
                "Sequence is not able to be translated! Is it already an amino acid sequence?"
            )
            return
        aa_seq = str(aa_seq).replace("*", "")

    elif mode == "freq":
        # ensure we know how long the new sequence should be
        if not length:
            print("Must provide length parameter using -l INTEGER")
            return

        seqs = []
        # extract the sequences from the reference set
        with open(filepath, "r") as handle:
            for record in SeqIO.parse(handle, "fasta"):
                try:
                    # for DNA sequences, translate them
                    aa_seq = str(record.seq.translate(table=genetic_code))
                except Bio.Data.CodonTable.TranslationError:
                    # for amino acid sequences, just get the string
                    aa_seq = str(record.seq)
                seqs.append(aa_seq)

        # make them into one big sequence without stop symbols
        seqs = "".join(seqs).replace("*", "")

        # generate a new sequence of the right length
        aa_seq = amino_acid_seq(length, k_mer_frequencies(seqs, 1)[1])

    else:
        # without this branch aa_seq would be unbound below and the function
        # would fail with an UnboundLocalError
        raise ValueError(
            'mode must be "seq" or "freq", got {!r}'.format(mode))

    # add a stop codon, if requested
    if stop_codon:
        aa_seq += "*"

    # output to the file
    if output:
        with open(output, "w+") as output_handle:
            if not isinstance(aa_seq, Seq):
                aa_seq = Seq(aa_seq)
            SeqIO.write(
                SeqRecord(aa_seq,
                          id="Generated by Freqgen from " + str(filepath),
                          description=""), output_handle, "fasta")

    # always show the result when it is not written to a file
    if verbose or not output:
        print(aa_seq)
Ejemplo n.º 9
0
def test_include_missing():
    """Check that include_missing=True lists every 2-mer, absent ones at 0."""
    # start with all 16 possible 2-mers at zero ...
    expected_2_mers = {
        first + second: 0
        for first in "ACGT" for second in "ACGT"
    }
    # ... then fill in the ones expected for the sequence
    expected_2_mers.update({
        'AT': 0.25,
        'GA': 0.25,
        'TG': 0.25,
        'GC': 0.125,
        'GG': 0.125,
    })

    observed = k_mer_frequencies("GATGATGGC", 2, include_missing=True)
    assert observed == {2: expected_2_mers}
Ejemplo n.º 10
0
def test_k_values(s):
    """Check result sizes for k = 1..5 and that invalid k values are rejected."""
    assume(len(s) > 6)

    # make sure that the lengths are right
    for k in range(1, 6):
        n_k_mers = 4**k
        as_dict = k_mer_frequencies(s, k, include_missing=True)[k]
        assert len(as_dict) == n_k_mers
        as_vector = k_mer_frequencies(s, k, include_missing=True, vector=True)
        assert len(as_vector) == n_k_mers

    # ensure invalid values of k raise an error
    for invalid_k in (0, -1):
        with pytest.raises(ValueError):
            k_mer_frequencies(s, invalid_k)
Ejemplo n.º 11
0
def featurize(filepath, k, codon_usage, trans_table):
    """Print k-mer and, optionally, codon frequencies of a FASTA file as YAML.

    Args:
        filepath: path to the FASTA file holding the sequences.
        k: iterable of k values; one YAML mapping is printed per value.
        codon_usage: if truthy, codon frequencies of the concatenated
            sequences are printed under the "codons" key.
        trans_table: passed as the second argument to ``codon_frequencies``.
    """
    # read every record's sequence as a plain string
    with open(filepath, "r") as fasta:
        sequences = [str(record.seq) for record in SeqIO.parse(fasta, "fasta")]

    # print the k-mer frequencies, one mapping per requested k
    for size in k:
        freqs = {size: k_mer_frequencies(sequences, size, include_missing=True)}
        print(yaml.dump(freqs, default_flow_style=False), end="")

    if not codon_usage:
        return

    # print the codon usage frequencies
    codons = codon_frequencies("".join(sequences), trans_table)
    print(yaml.dump({"codons": codons}, default_flow_style=False), end="")
Ejemplo n.º 12
0
def test_k_values():
    """Check result sizes for k = 1..5 and that invalid k values are rejected."""
    dna = "GATGATGGC"

    # make sure that the lengths are right
    for k in range(1, 6):
        n_k_mers = 4**k
        as_dict = k_mer_frequencies(dna, k, include_missing=True)
        assert len(as_dict) == n_k_mers
        as_vector = k_mer_frequencies(dna,
                                      k,
                                      include_missing=True,
                                      vector=True)
        assert len(as_vector) == n_k_mers

    # ensure invalid values of k raise an error
    for invalid_k in (0, -1):
        with pytest.raises(ValueError):
            k_mer_frequencies("GATTACA", invalid_k)
Ejemplo n.º 13
0
def test_invalid_args():
    """Check that invalid argument combinations raise ValueError."""
    # (positional args, keyword args) of each call expected to be rejected
    invalid_calls = [
        # vector output requested without including the missing k-mers
        (("A", 1), {"include_missing": False, "vector": True}),
        # empty sequence
        (("", 1), {}),
    ]
    for args, kwargs in invalid_calls:
        with pytest.raises(ValueError):
            k_mer_frequencies(*args, **kwargs)
Ejemplo n.º 14
0
def test_codon_frequencies(s):
    """Check the "codons" entry of k_mer_frequencies against codon_frequencies."""
    result = k_mer_frequencies(s, 1, codons=True)
    assert result["codons"] == codon_frequencies(s)
Ejemplo n.º 15
0
def visualize(original, target, optimized, title, width, height, output, show,
              genetic_code):
    """Plot target, optimized and (optionally) original frequencies.

    Args:
        original: path to a FASTA file with the original sequence; if falsy,
            it is passed on to ``_visualize`` unchanged.
        target: path to the YAML file with the target frequencies. Non-string
            keys are treated as values of k; a "codons" key holds codon
            frequencies.
        optimized: path to a FASTA file with the optimized sequence.
        title: plot title.
        width: plot width.
        height: plot height.
        output: file path passed on to ``_visualize``.
        show: passed on to ``_visualize``.
        genetic_code: not used by this function.
            NOTE(review): ``codon_frequencies`` is called without it here;
            confirm whether it should be passed on.
    """

    def sorted_values(freqs):
        # the values of a mapping, ordered by their keys
        return [freqs[key] for key in sorted(freqs)]

    # use a context manager so the target file is closed after loading
    with open(target) as target_handle:
        target = yaml.safe_load(target_handle)

    has_codons = "codons" in target

    # create a list of the k_mers
    k = sorted(_k for _k in target if not isinstance(_k, str))
    k_mers = [
        "".join(k_mer) for _k in k for k_mer in product("ACGT", repeat=_k)
    ]
    if has_codons:
        # codon labels get a "*" suffix
        k_mers.extend([codon + "*" for codon in sorted(target["codons"])])

    # generate the target vector
    target_vector = []
    for _k in k:
        target_vector.extend(sorted_values(target[_k]))
    if has_codons:
        target_vector.extend(sorted_values(target["codons"]))

    # calculate the frequencies of the optimized sequence
    seq = SeqIO.read(optimized, "fasta").seq
    optimized = list(k_mer_frequencies(seq, k, vector=True)) if k else []
    if has_codons:
        optimized.extend(sorted_values(codon_frequencies(seq)))

    assert len(optimized) == len(target_vector) == len(k_mers)  # sanity check

    # if the original sequence is given, calculate its k_mer_frequencies
    if original:
        original_seq = SeqIO.read(original, "fasta").seq
        if k:
            original = list(k_mer_frequencies(original_seq, k, vector=True))
        else:
            original = []
        if has_codons:
            original.extend(sorted_values(codon_frequencies(original_seq)))

    if max(k, default=0) >= 3 or has_codons:
        click.secho(
            "Displaying a large number of k-mers and/or codons. To view the results of each k-mer, use the zoom tool in the top right of the graph to zoom in or set the width of the graph manually using --width. Suggested width: "
            + str(35 * len(k_mers)),
            fg="yellow",
        )
        click.pause()

    _visualize(
        k_mers,
        target_vector,
        optimized,
        original_freqs=original,
        title=title,
        plot_height=height,
        plot_width=width,
        filepath=output,
        codons=has_codons,
        show=show,
    )