Exemple #1
0
def recodesite(
    input,
    output,
    site,
    clip_left,
    clip_right,
    codon_table,
    codon_usage,
    sampler,
    codon_freq_threshold,
    amber_only,
):
    """Recode a DNA sequence to remove a particular site (e.g., restriction site)

    The site needs to be recognized by Biopython, or it will be treated as a DNA
    sequence. The clipping options should determine the boundaries of the coding
    sequence, which will correspond to the part of the sequence that is
    "recodable".

    INPUT and OUTPUT are paths to fasta files or "-" to specify STDIN/STDOUT.

    """
    # NOTE(review): the INPUT/OUTPUT wording suggests this docstring doubles as
    # CLI help text, so developer-facing parameter notes are kept in comments.
    #
    # input                -- handed to SeqIO.parse(..., "fasta")
    # output               -- handed to print_fasta for every recoded record
    # site                 -- iterable of site names/sequences; each one is
    #                         converted with site2dna
    # clip_left/clip_right -- number of bases at each end excluded from the
    #                         recodable region
    # codon_table,
    # codon_usage          -- accepted but NOT read here: the weighted sampler
    #                         always uses ecoli_codon_usage / standard_dna_table
    #                         (see the TODO below)
    # sampler              -- "weighted" or "uniform"; anything else raises
    #                         ValueError
    # codon_freq_threshold -- if not None, passed to zero_low_freq_codons
    # amber_only           -- if truthy, zero_non_amber_stops is applied
    if sampler == "weighted":
        usage = ecoli_codon_usage
        if codon_freq_threshold is not None:
            # TODO: this is hardcoded in and there's a leaky abstraction here
            table = standard_dna_table
            usage = zero_low_freq_codons(usage, table, codon_freq_threshold)
        if amber_only:
            usage = zero_non_amber_stops(usage)
        codon_sampler = FreqWeightedCodonSampler(usage=usage)
    elif sampler == "uniform":
        codon_sampler = UniformCodonSampler()
    else:
        # Fail here with a clear message instead of letting `codon_sampler`
        # stay unbound and blow up later inside the record loop.
        raise ValueError(
            f"unknown sampler {sampler!r}; expected 'weighted' or 'uniform'"
        )

    sites = [site2dna(s) for s in site]
    # sites is now a list[Bio.Seq.Seq]

    for seqrecord in SeqIO.parse(input, "fasta"):
        id_ = seqrecord.id
        # The recodable region is whatever remains after clipping both ends.
        cds_start = clip_left
        cds_end = len(seqrecord) - clip_right
        seq = recode_sites_from_cds(
            seqrecord.seq, sites, codon_sampler, cds_start, cds_end
        )
        print_fasta(SeqRecord(seq, id_, description=""), output)
Exemple #2
0
def findsite(input, site, clip_left, clip_right):
    """Find locations of a site in DNA sequences

    If a sequence matches the specified site, write out its name and location.
    Used as a diagnostic to confirm that a particular DNA site (e.g.,
    restriction enzyme) is absent from a set of sequences. Because there may be
    adaptor sequences that contain such a site by design, the clipping option
    allows the search to be restricted. Note that a site is searched if it
    overlaps with the valid region even by one base (i.e., a site can match if
    it is mostly outside the clipped region, as long as it overlaps the target
    search region).

    INPUT is a path to fasta file or "-" to specify STDIN.

    """
    # Developer notes:
    # * Output is one "name|site|position" line per matching sequence, where
    #   position is the 0-based index of the first reported match in the full
    #   (unclipped) sequence.
    # * Only the first match per sequence is reported.
    query = str(site2dna(site))
    # A match may start up to len(query) - 1 bases before the region, or run
    # that many bases past its end, and still overlap it by one base.
    overhang = max(len(query) - 1, 0)
    for name, seq, _qual in readfq(input):
        # Clamp the region to the sequence so that over-large clip values give
        # an empty region instead of a negative index that wraps around.
        region_start = max(clip_left, 0)
        region_end = min(len(seq) - clip_right, len(seq))
        if region_end <= region_start:
            continue
        lo = max(region_start - overhang, 0)
        hi = min(region_end + overhang, len(seq))
        # str.find with bounds returns an index into the full sequence.
        idx = seq.find(query, lo, hi)
        if idx >= 0:
            print(f"{name}|{site}|{idx}")
Exemple #3
0
 def test_manual_seq(self):
     """A literal DNA string given to site2dna compares equal to itself."""
     raw_site = "AGGCG"
     converted = site2dna(raw_site)
     assert converted == raw_site
Exemple #4
0
 def test_bad_site(self):
     """site2dna is expected to raise ValueError for the input "foo"."""
     bogus_site = "foo"
     with raises(ValueError):
         site2dna(bogus_site)
Exemple #5
0
 def test_site2dna_enzyme(self):
     """The enzyme name "EcoRI" converts to the sequence "GAATTC"."""
     enzyme_name = "EcoRI"
     expected_site = "GAATTC"
     converted = site2dna(enzyme_name)
     assert converted == expected_site