Exemple #1
0
    def fetch_restriction_sites(self, enzymes="Common"):
        """
            Search the reference and primary-variant sequences with a set
            of restriction enzymes and record the enzymes whose cut sites
            differ between the two.

            ``enzymes`` selects the candidate pool: "ALL", "Common" and
            "HF" map to predefined groups; any other value is split on
            commas and used as a list of enzyme names.

            Sets ``self.ref_sites``, ``self.primary_variant_sites`` and
            ``self.rflps`` (enzyme -> (reference cuts, variant cuts)).
        """
        if enzymes == "ALL":
            candidates = AllEnzymes
        elif enzymes == "Common":
            candidates = CommOnly
        elif enzymes == "HF":
            candidates = high_fidelity
        else:
            candidates = RestrictionBatch(enzymes.split(","))

        # Only enzymes whose is_ambiguous() is exactly False are searched.
        unambiguous = []
        for enz in candidates:
            if enz.is_ambiguous() is False:
                unambiguous.append(enz)
        batch = RestrictionBatch(unambiguous)

        ref_hits = dict(batch.search(self.ref_seq).items())
        self.ref_sites = ref_hits
        alt_hits = dict(batch.search(self.primary_variant_seq).items())
        self.primary_variant_sites = alt_hits

        # Keep enzymes that cut the reference one to three times and whose
        # cut positions are not the same in the variant.
        informative = {}
        for enz, ref_cuts in ref_hits.items():
            if not 0 < len(ref_cuts) <= 3:
                continue
            alt_cuts = alt_hits[enz]
            if ref_cuts != alt_cuts:
                informative[enz] = (ref_cuts, alt_cuts)
        self.rflps = informative
Exemple #2
0
def digest_genome(genome_fp, restriction_enzyme, output_dir, linear=False):
    """Digest each sequence in a genome file and write the fragments as BED rows.

    The output file is ``<output_dir>/<genome basename without its last
    extension>.<restriction_enzyme>.fragments.bed``.  If it already exists
    the user is asked interactively whether to overwrite it; any answer
    other than ``y`` aborts without touching the file.

    Args:
        genome_fp: Path to the genome.  Parsed as FASTA when one of the
            dot-separated suffixes of its file name is ``fa``, ``fasta``,
            ``fas`` or ``fna`` (case-insensitive), otherwise as GenBank.
        restriction_enzyme: Enzyme name accepted by ``RestrictionBatch``.
        output_dir: Directory that receives the BED file.
        linear: Forwarded unchanged to ``RestrictionBatch.search``.

    Returns:
        None.  Tab-separated ``chr, start, end, frag_id`` rows (no header)
        are appended to the BED file, one batch per sequence; ``frag_id``
        restarts at 0 for every sequence.
    """
    base_fp = os.path.basename(genome_fp)
    if '.' in base_fp:
        base_fp = '{}.{}.fragments.bed'.format(base_fp[:base_fp.rfind('.')],
                                               restriction_enzyme)
    else:
        base_fp = '{}.{}.fragments.bed'.format(base_fp, restriction_enzyme)
    base_fp = os.path.join(output_dir, base_fp)
    if os.path.isfile(base_fp):
        overwrite = input(
            'WARNING: Overwriting existing fragment BED {}. Continue? [y/N]'.
            format(base_fp))
        if not overwrite.lower() == 'y':
            print("Did not overwrite existing fragment BED.")
            return
        # Rows are appended below, so a stale file must be removed first.
        os.remove(base_fp)
    print("Digesting")

    # Decide the format from the file-name suffixes only.  The previous
    # substring test ('"fa" in genome_fp') matched any path that merely
    # contained the letters "fa" (e.g. "fake_genome.gb").
    fasta_suffixes = ('fa', 'fasta', 'fas', 'fna')
    suffixes = os.path.basename(genome_fp).lower().split('.')[1:]
    if any(suffix in fasta_suffixes for suffix in suffixes):
        seq_format = 'fasta'
    else:
        seq_format = 'genbank'

    # The batch does not depend on the chromosome, so build it once.
    batch = RestrictionBatch([restriction_enzyme])

    # Plain "r" mode: the "U" flag is rejected by Python 3.11+.  The context
    # manager closes the handle once every record has been consumed.
    with open(genome_fp, "r") as handle:
        for chromosome in SeqIO.parse(handle, format=seq_format):
            print('{}\t{}'.format(chromosome.id, len(chromosome.seq)))

            # Drop a trailing ".<version>" from the record id and make sure
            # the name carries a "chr" prefix.  rsplit keeps ids containing
            # several dots from raising on tuple unpacking.
            accession = chromosome.id.rsplit(".", 1)[0]
            if not accession.startswith("chr"):
                accession = "chr" + accession

            # Digest the sequence data and return the cut points
            hits = batch.search(chromosome.seq, linear=linear)
            for _enzyme, cutpoints in hits.items():
                if not cutpoints:
                    print('No restriction sites found for {}'.format(
                        chromosome.id))
                    continue
                df = pd.DataFrame(cutpoints, columns=['cutpoint'])
                # Each fragment ends just before its cut point and starts
                # where the previous fragment ended; the first starts at 0.
                df['end'] = df.cutpoint - 1
                df['start'] = df.end - (df.cutpoint.diff())
                df.loc[0, 'start'] = 0
                df['start'] = df['start'].astype('Int64')
                if len(df) > 1:
                    # Trailing fragment from the last cut to the sequence end.
                    last_fragment = pd.DataFrame({
                        'start': [df.loc[len(df) - 1, 'end']],
                        'end': [len(chromosome.seq)],
                        'cutpoint': [-1]
                    })
                    # DataFrame.append was removed in pandas 2.0.
                    df = pd.concat([df, last_fragment], ignore_index=True)
                else:
                    # A single cut site yields one fragment spanning the
                    # whole sequence (behaviour kept from the original).
                    df.loc[len(df) - 1, 'end'] = len(chromosome.seq)
                df['frag_id'] = df.index
                df['chr'] = accession
                df[['chr', 'start', 'end', 'frag_id']].to_csv(base_fp,
                                                              index=False,
                                                              sep='\t',
                                                              mode='a',
                                                              header=None)
Exemple #3
0
def REsearch(goi='', goiFile='', mcs='', mcsFile=''):
    """List restriction sites that occur in the MCS but not in the GOI.

    Each sequence is taken from the string argument when it is non-empty,
    otherwise it is loaded with ``read_seq`` from the matching file argument.

    Returns a list of ``(enzyme name, site, end description, suppliers)``
    tuples sorted by ``site``; the end description is ``"blunt"`` for blunt
    cutters and ``enzyme.elucidate()`` otherwise.

    Raises ``Exception`` when either sequence ends up empty.
    """
    rb = RestrictionBatch(suppliers=list("CBEIKJMONQSRVYX"))

    if goi:
        goi = Seq(goi, IUPACUnambiguousDNA())
    else:
        goi = read_seq(goiFile)
    if not goi:
        raise Exception('Please provide a GOI sequence!')

    if mcs:
        mcs = Seq(mcs, IUPACUnambiguousDNA())
    else:
        mcs = read_seq(mcsFile)
    if not mcs:
        raise Exception('Please provide a MCS sequence!')

    mcs_hits = rb.search(mcs)
    goi_hits = rb.search(goi)

    # Enzymes with at least one site in the MCS and none in the GOI.
    cuts_mcs = {enz for enz in mcs_hits.keys() if mcs_hits[enz]}
    cuts_goi = {enz for enz in goi_hits.keys() if goi_hits[enz]}
    usable = cuts_mcs - cuts_goi

    rows = [
        (str(enz), site,
         "blunt" if enz.is_blunt() else enz.elucidate(),
         ' '.join(enz.suppl))
        for enz in usable
        for site in mcs_hits[enz]
    ]
    return sorted(rows, key=lambda row: row[1])
def has_restriction_site(seq):
    """Return True when any enzyme in ``restriction_sites`` has a hit in ``seq``.

    ``restriction_sites`` is read from the enclosing module scope and is
    handed to ``RestrictionBatch`` as-is.
    """
    from Bio.Seq import Seq
    from Bio.Restriction import RestrictionBatch

    cut_positions = RestrictionBatch(restriction_sites).search(Seq(seq))
    for positions in cut_positions.values():
        if positions:
            return True
    return False
    def test_batch_analysis(self):
        """Sequence analysis with a restriction batch."""
        # Two recognition sites separated and flanked by AAAA spacers.
        spacer = "AAAA"
        seq = Seq(spacer + EcoRV.site + spacer + EcoRI.site + spacer)

        hits = RestrictionBatch([EcoRV, EcoRI]).search(seq)

        # Expected cut position for each enzyme, checked in this order.
        for enzyme, expected_cuts in ((EcoRV, [8]), (EcoRI, [16])):
            self.assertEqual(hits[enzyme], expected_cuts)
    def test_batch_analysis(self):
        """Sequence analysis with a restriction batch."""
        # Template: AAAA + EcoRV site + AAAA + EcoRI site + AAAA.
        template = "AAAA{0}AAAA{1}AAAA".format(EcoRV.site, EcoRI.site)
        seq = Seq(template, IUPACAmbiguousDNA())

        found = RestrictionBatch([EcoRV, EcoRI]).search(seq)

        self.assertEqual(found[EcoRV], [8])
        self.assertEqual(found[EcoRI], [16])
Exemple #7
0
    def re_sites(self, sequence):
        """Return ``(position, enzyme)`` pairs for ``sequence``, sorted by position.

        Every hit reported by the batch built from ``self.enzyme_set`` is
        shifted to ``cut + enzyme.fst3 - 1``.  When several hits map to the
        same position, the one visited last is kept.
        """
        seq = Seq(sequence, IUPACAmbiguousDNA)
        hits = RestrictionBatch(self.enzyme_set).search(seq)

        # position -> enzyme; later hits overwrite earlier ones.
        by_position = {
            cut + enzyme.fst3 - 1: enzyme
            for enzyme, cutsites in hits.items()
            for cut in cutsites
        }
        return sorted(by_position.items())
Exemple #8
0
def restriction_sites_present(spacer: str, rsb: RestrictionBatch) -> bool:
    """Determine whether any of a set of restriction sites is present in a
    sequence\f

    Parameters
    ----------
    spacer : `str`
        Spacer sequence to examine for restriction sites.
    rsb : `RestrictionBatch`
        Batch of enzymes whose sites are searched for in ``spacer``.

    Returns
    -------
    `bool`
        ``True`` when at least one enzyme in ``rsb`` reports a hit in
        ``spacer``, ``False`` otherwise.  Hit positions are not returned.
    """
    # search() maps each enzyme to its list of hits; any non-empty list
    # means a site is present.
    return any(rsb.search(Seq(spacer)).values())
Exemple #9
0
def calc_digest_products(seq, enzymes, *, is_circular):
    """Return the pieces of ``seq`` produced by cutting with ``enzymes``.

    Enzyme names may carry a trailing ``-HF`` or ``-HFv2`` suffix, which is
    removed before the names are looked up.

    For a linear template the fragments between consecutive cut positions
    (including both ends of ``seq``) are returned in order.  For a circular
    template the fragments between consecutive cuts are followed by the
    piece that wraps around the origin.

    Raises ``UsageError`` when no enzymes are given, and ``ConfigError``
    when an enzyme name is rejected or when nothing cuts the template.
    """
    from more_itertools import pairwise, flatten
    from Bio.Restriction import RestrictionBatch
    from Bio.Seq import Seq

    if not enzymes:
        raise UsageError("no enzymes specified", enzymes=enzymes)

    hf_suffix = re.compile('-HF(v2)?$')
    enzymes = [hf_suffix.sub('', name) for name in enzymes]

    try:
        batch = RestrictionBatch(enzymes)
    except ValueError:
        raise ConfigError(
            lambda e: f"unknown enzyme(s): {','.join(map(repr, e.enzymes))}",
            enzymes=enzymes,
        ) from None

    # Reported positions are shifted down by one to act as slice indices.
    cut_map = batch.search(Seq(seq))
    sites = []
    for position in flatten(cut_map.values()):
        sites.append(position - 1)

    if not sites:
        raise ConfigError(
            lambda e:
            f"{','.join(map(repr, e.enzymes))} {plural(enzymes):/does/do} not cut template.",
            enzymes=enzymes,
            seq=seq,
        )

    if not is_circular:
        # A linear template also ends at its two termini.
        sites.extend((0, len(seq)))
    sites.sort()

    seqs = [seq[start:stop] for start, stop in pairwise(sites)]

    if is_circular:
        # Join the tail after the last cut to the head before the first.
        seqs.append(seq[sites[-1]:] + seq[:sites[0]])

    return seqs
def find_spacers(target=None, outfile=None, refgenome=None, restriction_sites=[], largeIndex=False, cutoff=0,
                 offtargetcutoff=0, trim=False, logging=False, nuclease='Cas9', return_limit=9, reject=False):
    """Find, filter, off-target-score and export candidate spacer sequences.

    Pipeline, as implemented below:

    1. Read every record of the FASTA file ``target`` (with
       ``m_FastaIterator`` when ``trim`` is true, else ``SeqIO.parse``).
    2. Scan both strands of each record with a nuclease-specific regular
       expression and discard candidates whose ``ps[4:24]`` slice contains
       "TTTT", is cut by an enzyme in ``restriction_sites``, has a GC value
       <= 20 or >= 80, or whose ``calc_score`` is below ``cutoff``.
    3. Write the survivors to ``temp.fa`` and run ``bowtie`` against
       ``refgenome``; bowtie writes ``offtargets.fa``.  Both files are
       created in the current working directory.
    4. Group the bowtie alignments by read name, score each group with
       ``sumofftargets`` and keep spacers whose off-target score is
       >= ``offtargetcutoff``.
    5. Write the result to ``outfile`` as CSV (``excel`` dialect).

    Parameters:
        target: path of the input FASTA file.
        outfile: path of the CSV file to write.
        refgenome: bowtie index name passed on the command line.
        restriction_sites: enzymes handed to ``RestrictionBatch``.
        largeIndex: when true, ``--large-index`` is added to the bowtie call.
        cutoff: minimum on-target score.
        offtargetcutoff: minimum off-target score.
        trim: selects ``m_FastaIterator`` instead of ``SeqIO.parse``.
        logging: not referenced anywhere in this function body.
        nuclease: 'Cas9' or 'Cpf1'; selects the search pattern.
        return_limit: 'all' or the maximum number of spacers per GeneID.
        reject: when truthy, passed to bowtie as the ``-k`` value and used
            as an upper bound on the number of mismatching alignments.

    Returns:
        0 when no spacer passes the filters or when the user declines to
        retry a failed write; otherwise None (no explicit return).
    """

    # NOTE(review): the 'U' mode flag was removed in Python 3.11; open() raises ValueError there.
    with open(target, 'rU') as infile:
        # Use our modified FastaIterator instead of, perhaps the more proper, SeqIO.parse() method
        # allows us to trim the UTR sequences off
        if trim:
        # If the 'trim' option is enabled, the header for each entry must be
        # GENEID | TRANSCRIPTID | EXON RANK | CONSTITUTIVE EXON | 5' UTR END | 3' UTR STOP | EXON START | EXON END
            try:
                itemiter = m_FastaIterator(infile)
            except IOError as e:
                # NOTE(review): after this handler 'itemiter' is unbound, so the list
                # comprehension below raises NameError.
                print("I/O error({0}): {1}".format(e.errno, e.strerror))
        else:
            try:
                itemiter = SeqIO.parse(infile, 'fasta')
            except IOError as e:
                # NOTE(review): same unbound-'itemiter' problem as in the branch above.
                print("I/O error({0}): {1}".format(e.errno, e.strerror))
        # Materialise all records while the file is still open.
        itemlist = [temp for temp in itemiter]

    spacerlist = []

    # This will find our 20N-NGG target sequence plus the -4->-3 and +1->+3 nucleotides for scoring
    # NOTE(review): any other 'nuclease' value leaves PAM unbound; regex.compile(PAM) below
    # then raises NameError.
    if nuclease == 'Cas9':
        PAM = r'(?i)[ACGT]{25}[G]{2}[ACGT]{3}'
    elif nuclease == 'Cpf1':
        PAM = r'(?i)[T]{2,}[A-Z]{25}'

    rsb = RestrictionBatch(restriction_sites)

    #spacerlist = map(lambda item: find_each_spacer(item, rsb, cutoff, PAM), itemlist)

    # Spacer sequences already accepted; used to drop duplicates (list, so membership is a linear scan).
    seen = []
    print("{} sequences to search for spacers.".format(len(itemlist)))
    widgets = ['Examining sequence: ', progressbar.Counter(), ' ', progressbar.Percentage(), ' ',
               progressbar.Bar(), progressbar.Timer()]
    progress = progressbar.ProgressBar(widgets=widgets, maxval=len(itemlist)).start()

    spacer_re = regex.compile(PAM)
    #pol3stop = regex.compile(r'(?i)[T]{4,}')

    for item in itemlist:
        # Find all of the potential protospacer sequences, i.e. any 21 nucleotide sequence that precedes a double g

        # NOTE(review): list.index() rescans itemlist on every iteration and returns the first
        # equal element; enumerate() would give the position directly.
        progress.update(itemlist.index(item)+1)

        # Overlapping matches on the forward strand followed by those on the reverse complement.
        spacerMatch = (spacer_re.findall(str(item.seq), overlapped=True) +
                       spacer_re.findall(str(item.reverse_complement().seq), overlapped=True))

        for ps in spacerMatch:
            # Note that ps[4:24] is the actual protospacer.  I need the rest of the sequence for scoring
            ps_seq = Seq(ps[4:24], IUPAC.unambiguous_dna)
            rs = rsb.search(ps_seq)
            # on_target_score_calculator only works if the sequence is in uppercase
            # otherwise, it returns a value of 0.193313360829 for some reason
            score = calc_score(ps.upper())

            # Get rid of anything with T(4+) as those act as RNAPIII terminators
            #if bool(pol3stop.findall(ps[4:24])):
            if "TTTT" in ps[4:24]:
                # TODO Should this also eliminate anything with G(4)?
                pass
            # Get rid of anything that has the verboten restriction sites
            elif bool([y for y in rs.values() if y != []]):
                pass
            # Eliminate potentials with a GC content <=20 or >=80%
            elif GC(ps_seq) <= 20 or GC(ps_seq) >= 80:
                pass
            elif float(score) < cutoff:
                # explicitly converting score to a float because otherwise it 'sometimes' fails
                # (especially if you set cutoff to 0.5 on the commandline)
                pass
            # keep everything else
            else:
                if ps[4:24] not in seen:
                    # NOTE(review): find() searches the forward strand only, so it returns -1 for
                    # matches that came from the reverse complement.  The split("|")[7] lookup
                    # also requires the pipe-delimited header even when 'trim' is False.
                    position = int(str(item.seq).find(ps)) + int(item.description.split("|")[7])
                    # NOTE(review): six keys but five values -- zip() stops at the shorter list,
                    # so no 'name' entry is ever created.
                    keys = ['description','position','score','spacer','offtargetscore','name']
                    values = [item.description, int(position), score, ps[4:24], 100]
                # because of duplicated sequences or whatever, spacelist can end up with duplicate entries
                # so let's take care of them
                # For future!
                #spacer = ProtoSpacer(description=item.description, position=int(position), score=score, spacerseq=ps[4:24], offtargetscore=100)
                #if ps[4:24] not in seen:
                #    spacerset.update(spacer)
                #    seen.append(ps[4:24])
                    spacerlist.append(dict(zip(keys,values)))
                    seen.append(ps[4:24])
    progress.finish()

    if len(spacerlist) == 0:
        print("Sorry, no spacers matching that criteria were found")
        return 0
    else:
        print("Finished finding spacers.  {} spacers found.  Begining Bowtie alignment...".format(len(spacerlist)))

    # write out a file to pass off to Bowtie to use for off-target analysis
    # The FASTA header "<description> <position> <score>" is the read name matched again further down.
    with open('temp.fa', 'w') as tempfile:
        for entry in spacerlist:
            tempfile.writelines(">%s %s %s\n%s\n" % (entry['description'], entry['position'],
                                                     entry['score'], entry['spacer']))

    # delete lists we are not going to use in the future to save on some memory
    del(itemlist)
    del(seen)

    # Use Bowtie to find if this particular sequence has any potential off targets (i. e. two or fewer mismatches)
    # As current, Bowtie is set to return all everything that a particular spacer matches within the reference genome
    # with two or fewer mismatches.
    # TODO switch to Bowtie2 so we can account for gaps in mismatches
    # TODO can we modify this setup to account for NAG PAMs or do we even need to? (i. e. are we already?)
    program = 'bowtie'
    cpus = "-p" + str(cpu_count())
    # maybe tell bowtie to stop looking after a set number of matches

    # NOTE(review): 'reject' is placed in the argument list unconverted, yet it is compared
    # numerically with len(mmlist) further down -- one of the two uses needs a conversion;
    # confirm the type callers pass.
    if reject:
        if largeIndex:
            subprocess.check_output([program, '-a', "--suppress", "3,5,6,7", cpus, '-k', reject, '--large-index',
                                     refgenome, '-f', 'temp.fa', 'offtargets.fa'])
        else:
            subprocess.check_output([program, '-a', "--suppress", "3,5,6,7", cpus, '-k', reject, refgenome, '-f',
                                     'temp.fa', 'offtargets.fa'])
    else:
        if largeIndex:
            subprocess.check_output([program, '-a', "--suppress", "3,5,6,7", cpus, '--large-index',
                                     refgenome, '-f', 'temp.fa', 'offtargets.fa'])
        else:
            subprocess.check_output([program, '-a', "--suppress", "3,5,6,7", cpus, refgenome, '-f',
                                     'temp.fa', 'offtargets.fa'])

    print("Bowtie finished.  Begining offtarget analysis...")
    bowtie_results_file = 'offtargets.fa'

    oftcount = 1
    # This count is slow and probably unnecessary
    # NOTE(review): the file object opened here is never closed explicitly.
    total_lines = sum(1 for line in open(bowtie_results_file))
    print("Total alignments from Bowtie: {}".format(total_lines))
    new_widgets = ['Scoring for off-targets. Examining: ', progressbar.Counter(), ' ', progressbar.Percentage(),
                   ' ', progressbar.Bar(), progressbar.Timer(), progressbar.ETA()]
    new_progress = progressbar.ProgressBar(widgets=new_widgets, maxval=total_lines).start()

    prunedlist = [] # List to hold spacers whose off-target score is above the minimum cutoff

    # One or more digits: pulls the mismatch offsets out of bowtie's mismatch column.
    mmpos = '[0-9]{1,}'
    mmpos_re = regex.compile(mmpos)

    with open(bowtie_results_file) as offtargetsfile: # parse all the bowtie results into something we can use
        # Read in the first line and parse.
        keys = ['readname', 'strand', 'position', 'mmpositions']
        values = offtargetsfile.readline().strip('\n').split('\t')
        previous_entry = dict(zip(keys,values))
        # NOTE(review): the first alignment becomes previous_entry but is not added to
        # current_set_of_offtargets, so it is left out of the first group's tally.
        current_set_of_offtargets = []
        # Read in the next line in the list.
        for line in offtargetsfile:
            line_values = line.strip('\n').split('\t')
            next_entry = dict(zip(keys,line_values))
            new_progress.update(oftcount)
            oftcount += 1
            if next_entry['readname'] == previous_entry['readname']:# Check to see if it is for the same spacer
                current_set_of_offtargets.append(next_entry) # group them together.
            else: # If it isn't, it is time to score that set and move to the next
                # Extract the off-target positions from each entry
                mmlist = []
                badcount = 0
                for entry in current_set_of_offtargets:
                     pos = mmpos_re.findall(entry['mmpositions'])
                     if len(pos) == 0: # Bowtie returns a match for the spacer itself in the genome
                        # if we have two such matches, we should indicate that there is a perfect off-target
                        badcount += 1
                     # Minus-strand offsets are mirrored (19 - w) before scoring.
                     elif entry['strand'] == '+': mmlist.append([int(w) for w in pos])
                     elif entry['strand'] == '-': mmlist.append([19-int(w) for w in pos])
                # Find the spacer in spacerlist to which these off-targets belong and set that spacer's offtarget score
                # NOTE(review): next(..., None) yields None when no header matches; the
                # subscript assignments below would then raise TypeError.
                matching_spacer = next((spacer for spacer in spacerlist if (previous_entry['readname'] ==
                    '{} {} {}'.format(spacer['description'], spacer['position'],str(spacer['score'])))), None)
                # Tally up the offtarget score for the set of off targets and set the spacer's off-target score
                # NOTE(review): the comment above speaks of "two such matches" but the test is
                # 'badcount > 2' -- confirm the intended threshold.
                if badcount > 2:
                    matching_spacer['offtargetscore'] = 0
                # Speed this up by rejecting things with over a certain set of matching off-targets
                elif reject and len(mmlist) > reject:
                    matching_spacer['offtargetscore'] = 0
                else:
                    matching_spacer['offtargetscore'] = sumofftargets(mmlist)

                if float(matching_spacer['offtargetscore']) >= float(offtargetcutoff):
                    prunedlist.append(matching_spacer)
                # Dump the previous set.
                current_set_of_offtargets = []
                # Start a new set of off-targets using this new line
                previous_entry = next_entry
                current_set_of_offtargets.append(next_entry)
        # NOTE(review): scoring only happens when the read name changes, so the final group
        # in the file is never scored and its spacer never reaches prunedlist.

    new_progress.finish()

    print("\nFinished scoring off-targets")

    finallist = []
    if len(prunedlist) == 0:
        print("No spacers were found that correspond to the parameters you set.")
    else:
        if len(prunedlist[0]['description'].split('|')) == 9: # need to make sure the header format is correct
            for entry in prunedlist:
                finallist.append(FormattedResult(entry))
            # Create a set of all the GeneIDs in our list
            geneset = set([y.GeneID for y in finallist])
            toplist = []
            for z in geneset:
                all_spacers_for_gene = [a for a in finallist if a.GeneID == z]
                # Best first: sorted by (score, offtargetscore), descending.
                ranked_spacers = sorted(all_spacers_for_gene, key=attrgetter('score','offtargetscore'), reverse=True)
                if return_limit == 'all':
                    for w in ranked_spacers: toplist.append(w)
                elif len(ranked_spacers) > int(return_limit):
                    for w in range(0,int(return_limit)): toplist.append(ranked_spacers[w])
                else:
                    # NOTE(review): range(0, len - 1) leaves out the last ranked spacer; a gene
                    # with a single spacer contributes nothing.  Looks like an off-by-one.
                    for w in range(0,len(ranked_spacers)-1): toplist.append(ranked_spacers[w])
            # NOTE(review): on Python 3 map() is a one-shot iterator; the retry branch below
            # would write only whatever rows were not consumed by the first attempt.
            olist = map(lambda entry: [entry.GeneID, entry.GeneName, entry.seq, entry.GC, entry.position,
                                     entry.score, entry.offtargetscore], toplist)
            headerlist = ['GeneID', 'GeneName', 'seq','%GC','position', 'score', 'off-target score']
        else: # if it isn't, just dump all the spacers we found into a file
            # NOTE(review): spacerlist holds plain dicts, but the lambda below uses attribute
            # access (entry.id, entry.seq, ...), which raises AttributeError on a dict.
            finallist = spacerlist
            olist = map(lambda entry: [entry.id.split(' ')[0], entry.seq, GC(entry.seq), entry.position.split(' ')[1],
                                           entry.score.split(' ')[2]], finallist)
            headerlist = ['ID', 'seq', '%GC','position', 'score']

        print("Writing file.")
        try:
            with open(outfile, 'w') as ofile:
                output = csv.writer(ofile, dialect='excel')
                output.writerows([headerlist])
                output.writerows(olist)
        except IOError:
            print("There is trouble writing to the file.  Perhaps it is open in another application?")
            choice = input("Would you like to try again? [y/n]")
            if choice == 'y' or choice == 'Y':
                try:
                    with open(outfile, 'w') as ofile:
                        output = csv.writer(ofile, dialect='excel')
                        output.writerows([headerlist])
                        output.writerows(olist)
                # NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit.
                except:
                    print("Sorry, still was unable to write to the file")
            else:
                return 0

        print("Finished.")
Exemple #11
0
class DigestedSequence:
    """Restriction-site search result for one sequence record.

    The constructor runs the search immediately: ``RestrictionBatch(enzymes)``
    is applied to ``sequence.seq`` and the resulting enzyme -> positions
    mapping is kept in ``site_dict``.
    """

    def __init__(self, enzymes, sequence, is_linear=True):
        """
        :param enzymes: enzymes handed to RestrictionBatch
        :param sequence: record whose ``.seq`` attribute is searched
        :param is_linear: forwarded positionally to RestrictionBatch.search
        """
        self.enzymes = enzymes
        self.sequence = sequence
        self.res_batch = RestrictionBatch(enzymes)
        self.is_linear = is_linear
        self.site_dict = self.res_batch.search(self.sequence.seq, is_linear)

    def get_sites(self):
        """
        Return the set of sites for a given contig, ordered by increasing position.
        :return: list of CutSites
        """
        # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
        cut_sites = [
            CutSite(e_name, loc)
            for e_name, ctg_locs in self.site_dict.items()
            for loc in ctg_locs
        ]
        # Ordering is whatever CutSite's comparison defines.
        return sorted(cut_sites)

    def get_fragments(self):
        """
        Return the genomic fragments resulting from the digestion.

        Only the pieces between consecutive cut sites are produced; the
        stretches before the first and after the last site are not.
        :return: list of SeqRecords, or the unmodified input record itself
                 (not wrapped in a list) when there are no sites.
        """
        sites = self.get_sites()
        seq = self.sequence

        if not sites:
            return seq

        frags = []
        # Walk consecutive (left, right) site pairs.
        for a, b in zip(sites, sites[1:]):
            # assumes CutSite objects are usable as slice bounds -- TODO confirm
            frg = seq[a:b]
            frg.id = "{0}:{1}:{2}".format(frg.id, a, b)
            frg.name = frg.id
            frg.description = "restriction digest fragment from {0} to {1}".format(a, b)
            frags.append(frg)

        return frags

    @staticmethod
    def digestion_sites(seq_list, enzyme_names=None, min_sites=1):
        """
        Return a list of sites per sequence, preserving the input list order.

        Entries whose "excluded" flag is set are skipped.  Sequences with
        fewer than ``min_sites`` sites are reported on stdout but are still
        included in the result.
        :param seq_list: list of dicts with "excluded" and "record" keys
        :param enzyme_names: enzymes used in digestion (defaults to none)
        :param min_sites: site count below which a sequence is reported
        :return: list of {"name": record id, "pos": sites} dicts
        """
        # Fresh list per call instead of a shared mutable default.
        if enzyme_names is None:
            enzyme_names = []

        sites = []
        for seq in seq_list:
            if seq["excluded"]:
                continue

            ds = DigestedSequence(enzyme_names, seq["record"])
            seq_sites = ds.get_sites()

            if len(seq_sites) < min_sites:
                # Single-argument print() behaves the same on Python 2 and 3.
                print("\tExcluded {0} (length {1}) with only {2} sites".format(
                    seq["record"].id, len(seq["record"]), len(seq_sites)
                ))
                # continue

            sites.append({"name": seq["record"].id, "pos": seq_sites})
        return sites
#
# Created:     11/04/2013
#-------------------------------------------------------------------------------

from Bio import Entrez
from Bio import SeqIO
from Bio.Restriction import Restriction
from Bio.Restriction import RestrictionBatch

# Contact address assigned to Entrez.email before the request is made.
email = "*****@*****.**"
Entrez.email = email

# Download one nucleotide record as FASTA text and parse it; the handle is
# closed even when parsing fails.
fetch_seq = Entrez.efetch(db="nucleotide", rettype="fasta", retmode="text", id="294489415")
try:
    seq_record = SeqIO.read(fetch_seq, "fasta")
finally:
    fetch_seq.close()

# To see how a specific restriction enzyme (Sau3AI) would digest your sequence:
print("Restriction site is", Restriction.Sau3AI.site)
digest = Restriction.Sau3AI.catalyse(seq_record.seq)
print("Number of fragments is", len(digest))
print("------\nLengths of each fragment\n------")
for fragment in digest:
    print(len(fragment))

# Run every restriction enzyme in the New England Biolabs database against the sequence
rb_supp = RestrictionBatch(first=[], suppliers=['N'])
# Search once and reuse the mapping; the previous loop repeated the whole
# batch search several times for every enzyme.
cut_sites = rb_supp.search(seq_record.seq)
for enzyme, positions in cut_sites.items():
    # To show only enzymes producing between 10 and 40 cuts, use instead:
    #     if 10 < len(positions) < 40:
    # This will show every restriction enzyme that was able to digest the sequence in some way.
    if len(positions) > 0:
        print(enzyme, ":", positions)
Exemple #13
0
class DigestedSequence:
    """Restriction-site search result for one sequence record.

    On construction ``RestrictionBatch(enzymes)`` is run against
    ``sequence.seq`` and the enzyme -> positions mapping it returns is
    stored in ``site_dict``.
    """

    def __init__(self, enzymes, sequence, is_linear=True):
        """
        :param enzymes: enzymes handed to RestrictionBatch
        :param sequence: record whose ``.seq`` attribute is searched
        :param is_linear: forwarded positionally to RestrictionBatch.search
        """
        self.enzymes = enzymes
        self.sequence = sequence
        self.res_batch = RestrictionBatch(enzymes)
        self.is_linear = is_linear
        self.site_dict = self.res_batch.search(self.sequence.seq, is_linear)

    def get_sites(self):
        """
        Return the set of sites for a given contig, ordered by increasing position.
        :return: list of CutSites
        """
        cut_sites = []
        # dict.items() exists on Python 2 and 3; iteritems() only on Python 2.
        for e_name, ctg_locs in self.site_dict.items():
            cut_sites.extend(CutSite(e_name, loc) for loc in ctg_locs)

        # Ordering is whatever CutSite's comparison defines.
        return sorted(cut_sites)

    def get_fragments(self):
        """
        Return the genomic fragments resulting from the digestion.

        Fragments are cut between consecutive sites only, so the sequence
        before the first site and after the last site is not returned.
        :return: list of SeqRecords, or the input record itself (not a list)
                 when no site was found.
        """
        sites = self.get_sites()
        seq = self.sequence

        if not sites:
            return seq

        frags = []
        # range() replaces the Python 2-only xrange().
        for idx in range(1, len(sites)):
            a = sites[idx - 1]
            b = sites[idx]
            # assumes CutSite objects are usable as slice bounds -- TODO confirm
            frg = seq[a:b]
            frg.id = '{0}:{1}:{2}'.format(frg.id, a, b)
            frg.name = frg.id
            frg.description = 'restriction digest fragment from {0} to {1}'.format(
                a, b)
            frags.append(frg)

        return frags

    @staticmethod
    def digestion_sites(seq_list, enzyme_names=None, min_sites=1):
        """
        Return a list of sites per sequence, preserving the input list order.

        Entries flagged 'excluded' are skipped.  A sequence with fewer than
        ``min_sites`` sites triggers a message on stdout but is still part
        of the returned list.
        :param seq_list: list of dicts with 'excluded' and 'record' keys
        :param enzyme_names: enzymes used in digestion (defaults to none)
        :param min_sites: site count below which a sequence is reported
        :return: list of {'name': record id, 'pos': sites} dicts
        """
        # Avoid sharing one default list object between calls and instances.
        if enzyme_names is None:
            enzyme_names = []

        sites = []
        for seq in seq_list:
            if seq['excluded']:
                continue

            ds = DigestedSequence(enzyme_names, seq['record'])
            seq_sites = ds.get_sites()

            if len(seq_sites) < min_sites:
                # print() with one argument is valid on Python 2 and 3.
                print('\tExcluded {0} (length {1}) with only {2} sites'.format(
                    seq['record'].id, len(seq['record']), len(seq_sites)))
                #continue

            sites.append({'name': seq['record'].id, 'pos': seq_sites})
        return sites
Exemple #14
0
def _catalyze(record: SeqRecord,
              enzymes: List[RestrictionType],
              linear=True) -> List[Tuple[str, SeqRecord, str]]:
    """Catalyze a SeqRecord and return all post-digest SeqRecords with overhangs.

    Overhangs are returned as the overhang plus the position of the cut
    in the 5' end (^) and 3' end (_). So a 5' overhang may be:
    ^AAAA_. But a 3' overhang may be: _AAAA^.

    Only fragments lying between two consecutive cut positions are
    produced. For a linear record the stretch before the first cut and the
    stretch after the last cut are not returned. For a circular record the
    last cut is paired with the first one, wrapping around the origin.
    Every fragment is emitted twice: once as-is and once reverse-complemented.

    Args:
        record: The SeqRecord to digest with enzymes
        enzymes: List of enzymes to digest the input records with

    Keyword Args:
        linear: Whether the record to catalyze is linear or circular

    Returns:
        A list of tuples, each: (left overhang, cut fragment, right overhang).
        Empty when no enzyme cuts the record.
    """

    # Work on an upper-cased copy of the record.
    record = record.upper()
    batch = RestrictionBatch(enzymes)
    batch_sites = batch.search(record.seq, linear=linear)

    # order all cuts with enzymes based on index
    # A position reported by more than one enzyme is kept once, for the
    # enzyme that is visited first.
    cuts_seen: Set[int] = set()
    enzyme_cuts: List[Tuple[RestrictionType, int]] = []
    for enzyme, cuts in batch_sites.items():
        for cut in cuts:
            if cut in cuts_seen:
                continue
            cuts_seen.add(cut)
            enzyme_cuts.append((enzyme, cut - 1))  # revert to 0-based
    enzyme_cuts = sorted(enzyme_cuts, key=lambda x: x[1])

    # list of left/right overhangs for each fragment
    frag_w_overhangs: List[Tuple[str, SeqRecord, str]] = []
    for i, (enzyme, cut) in enumerate(enzyme_cuts):
        # A linear record has no fragment running from the last cut back to
        # the first one.
        if i == len(enzyme_cuts) - 1 and linear:
            continue

        # The modulo pairs the last cut with the first one (circular case).
        # NOTE(review): with exactly one cut on a circular record this pairs
        # the cut with itself; `next_cut < cut` below is then False and
        # record[cut:next_cut] is an empty slice -- confirm that is intended.
        next_enzyme, next_cut = enzyme_cuts[(i + 1) % len(enzyme_cuts)]

        # Overhang lengths of the enzymes bounding this fragment.
        enzyme_len = len(enzyme.ovhgseq)
        next_enzyme_len = len(next_enzyme.ovhgseq)

        # shift cuts left for 3overhang enzymes
        if enzyme.is_3overhang():
            cut -= enzyme_len
        if next_enzyme.is_3overhang():
            next_cut -= next_enzyme_len

        # Slice bounds used for the reverse-complement fragment; computed
        # while the cuts are in their shifted state.
        cut_rc = cut if enzyme.is_3overhang() else cut + enzyme_len
        next_cut_rc = (next_cut if next_enzyme.is_3overhang() else next_cut +
                       next_enzyme_len)

        # find the cutsite sequences
        left = record[cut:cut + enzyme_len]
        right = record[next_cut:next_cut + next_enzyme_len]
        # The reverse-complement fragment swaps sides: its left overhang is
        # the reverse complement of this fragment's right overhang.
        left_rc = right.reverse_complement()
        right_rc = left.reverse_complement()

        left = str(left.seq)
        right = str(right.seq)
        left_rc = str(left_rc.seq)
        right_rc = str(right_rc.seq)

        # Place the "^" marker: appended for 3' overhang enzymes, prepended
        # otherwise.
        if enzyme.is_3overhang():
            left += "^"
            right_rc += "^"
        else:
            left = "^" + left
            right_rc = "^" + right_rc

        if next_enzyme.is_3overhang():
            right += "^"
            left_rc += "^"
        else:
            right = "^" + right
            left_rc = "^" + left_rc

        # shift cuts right again for 3overhang enzymes
        if enzyme.is_3overhang():
            cut += enzyme_len
        if next_enzyme.is_3overhang():
            next_cut += next_enzyme_len

        frag = record[cut:next_cut]
        frag_rc = record[cut_rc:next_cut_rc].reverse_complement()
        frag_rc.id = record.id

        if next_cut < cut:  # wraps around the zero-index
            # Slice a doubled record so the fragment can span the origin; the
            # id is re-assigned on the results.
            frag = (record + record)[cut:next_cut + len(record)]
            frag.id = record.id
            frag_rc = (record + record)[cut_rc:next_cut_rc +
                                        len(record)].reverse_complement()
            frag_rc.id = record.id

        frag_w_overhangs.append((left, frag, right))
        frag_w_overhangs.append((left_rc, frag_rc, right_rc))

    return frag_w_overhangs
Exemple #15
0
def redigest_code():
    """Digest the sequences of the selected input file with one enzyme.

    All settings are read from the module-level entry widgets:

    * ``entry1_input`` -- path of the input sequence file
    * ``entry2_input`` -- output path (``redigest.<TIME>.out`` when empty)
    * ``entry3_input`` -- restriction enzyme name
    * ``entry4_input`` -- tagged primer orientation ("Forward"/"Reverse")
    * ``entry5_input`` / ``entry6_input`` -- optional forward/reverse primers
    * ``entry7_input`` -- "Multifasta gene file" or "Single genome sequence"
    * ``entry8_input`` / ``entry9_input`` -- input/output format
      ("Fasta"/"Genbank")

    In gene mode the terminal fragment of every record is written to the
    output file and summarised in ``<outfile>.csv``.  In genome mode every
    restriction fragment is written to the output file and summarised in
    ``<outfile>_RF.csv``; the lengths measured from the first nucleotide to
    each cut go to ``<outfile>_TRF.csv``.

    Raises:
        ValueError: if a mode, format or orientation selection is not one of
            the supported values.
    """
    # presumably validates the widget contents; defined elsewhere in the file
    argscheck()

    outfile = entry2_input.get() or 'redigest.' + TIME + '.out'

    # Resolve every selection before any file is opened so that an
    # unsupported value cannot leave a truncated output file behind.
    formats = {"Fasta": "fasta", "Genbank": "genbank"}
    mode = _redigest_choice("input type", entry7_input.get(),
                            {"Multifasta gene file": "genes",
                             "Single genome sequence": "genome"})
    informat = _redigest_choice("input format", entry8_input.get(), formats)
    outformat = _redigest_choice("output format", entry9_input.get(), formats)
    # The enzyme is the same for every record, so build the batch only once.
    enzyme_RE = RestrictionBatch([entry3_input.get()])

    with open(outfile, 'wt+') as out_file, \
            open(entry1_input.get(), 'r') as infile:
        records = SeqIO.parse(infile, informat)
        if mode == "genes":
            _redigest_genes(records, informat, enzyme_RE, out_file,
                            outformat, outfile + '.csv')
        else:
            _redigest_genome(records, informat, enzyme_RE, out_file,
                             outformat, outfile)


def _redigest_choice(label, value, options):
    """Map a widget selection to its internal code or raise ValueError."""
    try:
        return options[value]
    except KeyError:
        raise ValueError(
            "Unsupported {} selection: {!r} (expected one of: {})".format(
                label, value, ", ".join(sorted(options)))) from None


def _redigest_description(record, informat):
    """Return the comma-joined taxonomy of a GenBank record, else ''."""
    if informat != 'genbank':
        return ''
    return ', '.join(record.annotations.get("taxonomy", []))


def _redigest_genes(records, informat, enzyme_RE, out_file, outformat,
                    report_file):
    """Write the terminal restriction fragment of every gene record."""
    forward = entry5_input.get()
    # The reverse primer has to be reverse-complemented before it is appended
    # to the forward strand.  It is computed once here and must not be
    # replaced by the raw widget text inside the loop.
    reverse = str(Seq(entry6_input.get()).reverse_complement())
    tagged = _redigest_choice("orientation", entry4_input.get(),
                              {"Forward": "F", "Reverse": "R"})
    SubFeat = "TRF_RevComp" if tagged == 'R' else "TRF"
    # Gene mode writes FASTA with the whole sequence on a single line.
    write_format = "genbank" if outformat == 'genbank' else "fasta-2line"

    with open(report_file, 'wt+') as RF:
        # count provides the running number used in the locus NAME
        for count, record in enumerate(records, start=1):
            NAME = 'RED' + TIME + str(count)
            desc = _redigest_description(record, informat)

            # amplicon = forward primer + gene + reverse-complemented primer
            amplicon = (forward + str(record.seq)).strip('\n') + reverse
            if tagged == 'R':
                amplicon = str(Seq(amplicon).reverse_complement())

            sites = enzyme_RE.search(Seq(amplicon))
            index = list(sites.values())[0]
            # An empty list means the enzyme does not cut: keep everything.
            # NOTE(review): the first cut position is used directly as the
            # fragment length; Biopython reports 1-based positions, so this
            # may be one base too long -- confirm before changing.
            fragment = index[0] if index else len(amplicon)

            FastaHeader = NAME + "|" + str(fragment) + "_bp" + "|" + record.id
            print(" ", FastaHeader, '\t', sites)

            FastaSequence = SeqRecord(
                Seq(amplicon[:fragment], IUPAC.IUPACAmbiguousDNA()),
                FastaHeader, description=desc, name=NAME)
            FastaSequence.features.append(
                SeqFeature(FeatureLocation(start=0, end=fragment),
                           type="REDigest", ref=SubFeat))
            SeqIO.write(FastaSequence, out_file, write_format)
            # mirror the progress line in the report file
            print(FastaHeader, '\t', sites, file=RF)


def _redigest_genome(records, informat, enzyme_RE, out_file, outformat,
                     outfile):
    """Write every restriction fragment of each genome record."""
    report_file = outfile + '_RF.csv'
    report_file2 = outfile + '_TRF.csv'
    # Both reports are opened once, outside the record loop, so a multi-record
    # input no longer truncates the lines written for earlier records.
    with open(report_file, 'wt+') as RF, open(report_file2, 'wt+') as TRF2:
        print("Individual restriction fragments", file=RF)
        print("[WRITING:] Individual restriction fragments to file:",
              report_file)
        print("Terminal restriction fragments: from nucleotide 1 to "
              "respective cuts", file=TRF2)

        for record in records:
            Gen_header = record.id
            Gen_array = str(record.seq)
            desc = _redigest_description(record, informat)
            cuts = list(enzyme_RE.search(Seq(Gen_array)).values())[0]

            if cuts:
                # start -> first cut, each consecutive pair of cuts,
                # last cut -> end
                spans = [(0, min(cuts))]
                spans.extend(zip(cuts, cuts[1:]))
                spans.append((max(cuts), len(Gen_array)))
            else:
                # The enzyme does not cut: the record is a single fragment.
                spans = [(0, len(Gen_array))]

            for start, end in spans:
                GenFastaSeq = Gen_array[start:end]
                GenFastaHeader = (Gen_header + "|" + str(len(GenFastaSeq))
                                  + "_bp|" + Gen_header)
                print(" ", GenFastaHeader)
                GenSeqRec = SeqRecord(
                    Seq(GenFastaSeq, IUPAC.IUPACAmbiguousDNA()),
                    GenFastaHeader, description=desc)
                SeqIO.write(GenSeqRec, out_file, outformat)
                print(GenFastaHeader, file=RF)

            # lengths from the first nucleotide to each cut (report only)
            print("[WRITING:] Terminal restriction fragments, from nucleotide "
                  "1 to respective cuts to file:", report_file2)
            for cut in cuts:
                GenFastaHeader = (Gen_header + "|" + str(cut) + "_bp|"
                                  + Gen_header)
                print(" ", GenFastaHeader)
                print(GenFastaHeader, file=TRF2)