Beispiel #1
0
    def recursive_extension(prev_seq, prev_cnt):  # No defaults
        """Yield every permitted length-``k`` sequence that starts with ``prev_seq``.

        The bases that may not follow ``prev_seq`` are collected first; the
        remaining bases are then either appended and yielded (when one more
        base reaches length ``k``) or explored recursively.

        ``prev_cnt`` holds the per-base counts of ``prev_seq``: entries 0 and 3
        are summed against ``AT_max`` and entries 1 and 2 against ``GC_max``
        (presumably ordered A, C, G, T -- confirm against ``bases_and_incs``).
        ``seqs_so_far``, ``bases``, ``bases_and_incs``, ``k``, ``AT_max``,
        ``GC_max`` and ``dna_rev_comp`` are taken from the enclosing scope.
        """
        next_pos = len(prev_seq)

        # Exclude whatever the earlier sequences carry at this position
        forbidden = ''.join([earlier[next_pos] for earlier in seqs_so_far])

        last_base = prev_seq[-1]
        if prev_seq[-2] == last_base:  # Don't allow triplets
            forbidden += last_base
            if last_base == 'G':  # Illumina has higher errors with GGC
                forbidden += 'C'

        # Enforce AT/GC content within bounds
        at_total = prev_cnt[0] + prev_cnt[3]
        if at_total == AT_max:
            forbidden += 'AT'
        elif prev_cnt[1] + prev_cnt[2] == GC_max:
            forbidden += 'CG'

        # Don't allow rev-comp seqs of 3+ bp: when the rev-comp of an earlier
        # pair equals the current last two bases, exclude the base that would
        # extend that match to three.
        tail = prev_seq[-2:]
        for start in range(next_pos - 4):
            if dna_rev_comp(prev_seq[start + 1:start + 3]) == tail:
                forbidden += dna_rev_comp(prev_seq[start])

        # The rev-comp of the last four bases plus the candidate must not
        # occur inside any earlier sequence.
        if next_pos >= 4:
            window = prev_seq[-4:]
            still_open = [b for b in bases if b not in forbidden]
            for base in still_open:
                rc_seq = dna_rev_comp(window + base)
                if any(rc_seq in ssf for ssf in seqs_so_far):
                    forbidden += base

        if next_pos + 1 == k:
            # One more base completes the sequence
            for base in bases:
                if base not in forbidden:
                    yield prev_seq + base
            return

        for base, inc in bases_and_incs:
            if base not in forbidden:
                for seq in recursive_extension(prev_seq + base,
                                               prev_cnt + inc):
                    yield seq
Beispiel #2
0
def no_hex_complementarity(s1, good_prefixes):
    """Return True if no 6-base window of s1's reverse complement is in good_prefixes.

    Each consecutive 6-base slice of the reverse complement of ``s1`` is
    checked for containment in every element of ``good_prefixes``; the first
    hit returns False.
    """
    revcomp = seqtools.dna_rev_comp(s1)
    window_count = len(s1) - 5
    for start in range(window_count):
        hexamer = revcomp[start:start + 6]
        if any(hexamer in prefix for prefix in good_prefixes):
            return False
    return True
Beispiel #3
0
def get_cut_prefixes(complete_sequences, cannonical_cut_sites, fudge_factor):
    """Collect the cut fragments of every oligo around each canonical cut site.

    For each site, every coordinate from ``site - fudge_factor`` through
    ``site + fudge_factor + 1`` (inclusive) is visited. At each coordinate
    the reverse complement of the oligo's prefix and the oligo's suffix are
    both added to the result.

    Returns:
        A set of the collected sequences.
    """
    collected = set()
    for site in cannonical_cut_sites:
        first_coord = site - fudge_factor
        stop_coord = site + fudge_factor + 1 + 1
        for coord in range(first_coord, stop_coord):
            for oligo in complete_sequences:
                prefix = oligo.prefix_to_pamtarg_coord(coord)
                collected.add(seqtools.dna_rev_comp(prefix))
                collected.add(oligo.suffix_to_pamtarg_coord(coord))
    return collected
Beispiel #4
0
def multiple_barcodes_generator(bc_lists, r):
    """Yield lists of ``r`` barcodes, one from each of the first ``r`` lists.

    With ``r == 1`` every barcode of ``bc_lists[0]`` is yielded as a
    one-element list. For larger ``r`` each barcode of ``bc_lists[r - 1]`` is
    appended to every combination produced for ``r - 1`` that
    ``go_together`` accepts.

    Raises:
        ValueError: If ``r`` is neither 1 nor greater than 1 (raised when the
            generator is first advanced).
    """
    if r == 1:
        for single in bc_lists[0]:
            yield [single]
        return

    if not r > 1:
        raise ValueError('r < 1 encountered: {}'.format(r))

    for candidate in bc_lists[r - 1]:
        # NOTE(review): range(len - 3) leaves out the final triplet of the
        # barcode -- confirm this is what go_together expects.
        rc_triplets = [
            dna_rev_comp(candidate[pos:pos + 3])
            for pos in range(len(candidate) - 3)
        ]
        # The shorter combinations are regenerated for every candidate
        for partial in multiple_barcodes_generator(bc_lists, r=r - 1):
            if go_together(candidate, partial, rc_triplets):
                yield partial + [candidate]
Beispiel #5
0
 def is_good_seq(seq):
     """Return True when ``seq`` passes every sequence filter below.

     ``bases``, ``GC_min`` and ``GC_max`` come from the enclosing scope.
     """
     # Don't allow triplets
     if any(b * 3 in seq for b in bases):
         return False
     # Don't allow GGC
     if 'GGC' in seq:
         return False
     # Enforce GC content within bounds
     gc_total = seq.count('G') + seq.count('C')
     gc_in_bounds = GC_min <= gc_total <= GC_max
     if not gc_in_bounds:
         return False
     # Don't allow rev-comps of 3+ bp: each triplet is looked up in the
     # reverse complement with its last start + 3 characters removed.
     seq_rc = seqtools.dna_rev_comp(seq)
     has_rc_triplet = any(
         seq[start:start + 3] in seq_rc[:-(start + 3)]
         for start in range(len(seq) - 3))
     return not has_rc_triplet
Beispiel #6
0
    def __init__(self, exploded_fpath, perfect_target,
                 perfect_pamtarg_one_target_pos):
        """Load the oligos in ``exploded_fpath`` and build the derived lookups.

        Args:
            exploded_fpath: Path to a text file with one oligo per line; the
                tab-separated fields of each stripped line are passed
                positionally to ``NucleaSeqOligo``.
            perfect_target: Value compared against each oligo's ``_target`` to
                select the perfect-target oligos.
            perfect_pamtarg_one_target_pos: Forwarded, together with
                ``perfect_target``, to ``add_pamtarg_coord_one_pos`` on every
                target oligo.

        Raises:
            IndexError: If no oligo has a left buffer, so there is no most
                common buffer to pick.

        ``assert_len_1_and_return_element`` is applied to the sets of left
        primers, right primers and barcode lengths; presumably it fails when
        such a set does not hold exactly one element -- confirm.
        """

        # Load oligos (the file is closed as soon as every line is parsed)
        with open(exploded_fpath) as handle:
            self.oligos = [
                NucleaSeqOligo(*line.strip().split('\t')) for line in handle
            ]
        self.oligos_set = set(self.oligos)
        self.perfect_target = perfect_target
        log.info('Loaded {:,d} oligos ({:,d} unique)'.format(
            len(self.oligos), len(self.oligos_set)))
        log.info('Oligo lengths: {}'.format(Counter(map(len, self.oligos))))

        # Find useful subsets; an oligo counts as a target oligo when its
        # left buffer is truthy
        self.perfect_target_oligos = [
            oligo for oligo in self.oligos
            if oligo._target == self.perfect_target
        ]
        self.target_oligos = [
            oligo for oligo in self.oligos if oligo._buffer_left
        ]
        self.non_target_oligos = [
            oligo for oligo in self.oligos if not oligo._buffer_left
        ]
        log.info('{:,d} Perfect target oligos'.format(
            len(self.perfect_target_oligos)))
        log.info('{:,d} Target oligos'.format(len(self.target_oligos)))
        log.info('{:,d} Non-target oligos'.format(len(self.non_target_oligos)))

        # Find primers; cr_right is the reverse complement of the single
        # _cr_right value shared by the oligos
        self.cr_left = assert_len_1_and_return_element(
            {oligo._cr_left for oligo in self.oligos})
        self.cr_right_rc = assert_len_1_and_return_element(
            {oligo._cr_right for oligo in self.oligos})
        self.cr_right = dna_rev_comp(self.cr_right_rc)
        log.info('Left/Right Primer Seqs: {} / {}'.format(
            self.cr_left, self.cr_right))
        log.info('Left/Right Primer Edit Distance: {}'.format(
            editdistance.eval(self.cr_left, self.cr_right)))

        # Check buffers seqs
        self.buffer_lefts = Counter(oligo._buffer_left
                                    for oligo in self.target_oligos)
        self.buffer_rights = Counter(oligo._buffer_right
                                     for oligo in self.target_oligos)
        log.info('Target oligo left buffers: {}'.format(self.buffer_lefts))
        log.info('Target oligo right buffers: {}'.format(self.buffer_rights))

        # Find seq subsets by buffer; among equally common buffers the one
        # seen first wins
        most_common_buffer_left = self.buffer_lefts.most_common(1)[0][0]
        most_common_buffer_right = self.buffer_rights.most_common(1)[0][0]
        self.perfect_target_and_buffer_oligos = [
            oligo for oligo in self.perfect_target_oligos
            if oligo._buffer_left == most_common_buffer_left
            and oligo._buffer_right == most_common_buffer_right
        ]
        self.alt_buffer_oligos = [
            oligo for oligo in self.target_oligos
            if oligo._buffer_left != most_common_buffer_left
            or oligo._buffer_right != most_common_buffer_right
        ]
        log.info('Most common left/right buffers: {} / {}'.format(
            most_common_buffer_left, most_common_buffer_right))
        log.info('{:,d} Perfect target and most common buffer oligos'.format(
            len(self.perfect_target_and_buffer_oligos)))
        log.info('{:,d} Alternate buffer oligos'.format(
            len(self.alt_buffer_oligos)))

        # Check right_buffer_buffers
        log.info('Target right buffer buffers: {}'.format(
            Counter(oligo._right_buffer_buffer
                    for oligo in self.target_oligos)))

        # Process barcodes; right barcodes are reverse-complemented before
        # being compared with the left ones
        self.left_barcodes = {oligo._barcode_left for oligo in self.oligos}
        self.right_barcodes = {
            dna_rev_comp(oligo._barcode_right) for oligo in self.oligos
        }
        self.barcodes = self.left_barcodes | self.right_barcodes
        report_str = 'Barcodes not present in other side\'s barcode list: Left {} / Right {}'.format(
            len(self.left_barcodes - self.right_barcodes),
            len(self.right_barcodes - self.left_barcodes),
        )
        # Warn once more than two barcodes appear on one side only
        if len(self.left_barcodes ^ self.right_barcodes) > 2:
            log.warning(report_str)
        else:
            log.info(report_str)

        # Get bc_len
        self.bc_len = assert_len_1_and_return_element(
            {len(bc) for bc in self.barcodes})
        log.info('Barcode length: {}'.format(self.bc_len))

        # Make relevant dicts; when several oligos share a key the last one
        # in self.oligos is kept
        self.oligo_given_left_barcode = {
            oligo._barcode_left: oligo
            for oligo in self.oligos
        }
        self.oligo_given_right_barcode = {
            dna_rev_comp(oligo._barcode_right): oligo
            for oligo in self.oligos
        }
        self.oligo_given_barcode_given_side = {
            'left': self.oligo_given_left_barcode,
            'right': self.oligo_given_right_barcode
        }
        self.oligo_given_seq = {oligo.sequence: oligo for oligo in self.oligos}
        self.target_given_seq = {
            oligo.sequence: oligo._target
            for oligo in self.oligos
        }
        log.info('Made barcode and seq dicts.')

        # Update pamtarg coords
        for oligo in self.target_oligos:
            oligo.add_pamtarg_coord_one_pos(perfect_target,
                                            perfect_pamtarg_one_target_pos)

        log.info('Pam-target-one target positions: {}'.format(
            Counter(oligo.pamtarg_one_target_pos
                    for oligo in self.target_oligos)))