def trim_reads(self):
    ''' Trim every read and write the results to the gzipped 'trimmed' fastq file.

    A random length barcode at the beginning of each read is removed by
    searching the first 20 nts of the read for the 6 nts the amplicon is
    expected to start with (taken from the target's sequencing_start feature,
    reverse complemented unless the sequencing direction is '+'). If those
    6 nts are not found, nothing is removed from the beginning of the read.
    The end of the retained portion is whatever
    adapters.trim_by_local_alignment reports for adapters.truseq_R2_rc.
    '''
    target = self.target_info

    if target.sequencing_direction == '+':
        first = target.sequencing_start.start
        expected_prefix = target.target_sequence[first:first + 6]
    else:
        last = target.sequencing_start.end
        # The 6 nts ending at (and including) index 'last', read off the other strand.
        window = target.target_sequence[last - 5:last + 1]
        expected_prefix = utilities.reverse_complement(window)

    expected_prefix = expected_prefix.upper()

    output_fn = self.fns_by_read_type['fastq']['trimmed']

    with gzip.open(output_fn, 'wt', compresslevel=1) as output_fh:
        for read in self.progress(self.reads, desc='Trimming reads'):
            try:
                keep_from = read.seq.index(expected_prefix, 0, 20)
            except ValueError:
                # Expected prefix not near the start of the read: keep the read start.
                keep_from = 0

            keep_until = adapters.trim_by_local_alignment(
                adapters.truseq_R2_rc,
                read.seq,
            )

            output_fh.write(str(read[keep_from:keep_until]))
Ejemplo n.º 2
0
def at_least_n_Bs(pool, n, B):
    ''' Collect the 'mismatches' outcomes of pool with at least n basecalls
    equal to the reverse complement of B.

    Entries of pool.outcome_counts().index are three-element tuples; an entry
    is kept when its first element is 'mismatches' and the MismatchOutcome
    parsed from its third element has at least n SNV basecalls equal to the
    reverse complement of B. Kept entries are returned as a list of tuples in
    index order.
    '''
    wanted_basecall = utilities.reverse_complement(B)

    def has_enough(details):
        mismatch = knock_knock.outcome.MismatchOutcome.from_string(details)
        basecall_counts = Counter(mismatch.snvs.basecalls)
        return basecall_counts[wanted_basecall] >= n

    return [
        (category, subcategory, details)
        for category, subcategory, details in pool.outcome_counts().index.values
        if category == 'mismatches' and has_enough(details)
    ]
Ejemplo n.º 3
0
def convert_insertion(ins, source_target_info, dest_target_info):
    ''' Re-express the insertion ins, defined relative to source_target_info,
    relative to dest_target_info.

    Insertions are defined by starts_afters and seqs. When switching between
    anchor/+ and sgRNA/sgRNA strand coordinates, a starts_after may become a
    starts_before, and seqs may be reverse complemented. Both positions
    flanking each insertion are therefore converted, and the smaller converted
    position is used as the new starts_after.
    '''
    # First pass: convert both flanking positions of every insertion site
    # into sgRNA coordinates.
    flanks_in_sgRNA_coords = []
    for start_after in ins.starts_afters:
        right = convert_to_sgRNA_coords(source_target_info, start_after + 1)
        left = convert_to_sgRNA_coords(source_target_info, start_after)
        flanks_in_sgRNA_coords.append((left, right))

    # Second pass: convert into the destination's anchor coordinates and keep
    # whichever flanking position ends up first.
    starts_afters = []
    for left, right in flanks_in_sgRNA_coords:
        first = convert_to_anchor_coords(dest_target_info, left)
        second = convert_to_anchor_coords(dest_target_info, right)
        starts_afters.append(min(first, second))

    starts_afters.sort()

    source_strand = source_target_info.sgRNA_feature.strand
    dest_strand = dest_target_info.sgRNA_feature.strand

    if source_strand == dest_strand:
        seqs = ins.seqs
    else:
        # Opposite strands: flip each sequence and reverse their order.
        seqs = [utilities.reverse_complement(seq) for seq in ins.seqs]
        seqs.reverse()

    return knock_knock.target_info.DegenerateInsertion(starts_afters, seqs)
Ejemplo n.º 4
0
def build_guide_index(guides_fn, index_dir):
    ''' Build alignment and bustools reference files for a guide library.

    guides_fn is a tab-separated table whose first column is the guide name
    and which has a 'protospacer' column. For every guide, the expected R2
    sequence (fixed upstream sequence + reverse complement of the protospacer
    + fixed downstream sequence) is written to expected_R2s.fasta in
    index_dir, so index entries are in the same orientation as R2. The fasta
    is then indexed with pysam.faidx and mapping_tools.build_STAR_index, and
    matrix.ec, transcripts_to_genes.txt and transcripts.txt are written to
    the bustools_annotations subdirectory.
    '''
    index_dir = Path(index_dir)
    index_dir.mkdir(exist_ok=True)

    guides_df = pd.read_csv(guides_fn, sep='\t', index_col=0)

    upstream = 'AGTACCAAGTTGATAACGGACTAGCCTTATTTAAACTTGCTATGCTGTTTCCAGCTTAGCTCTTAAAC'
    # Note: Cs here are from untemplated addition and are not deterministically 3.
    downstream = 'CCCATATAAGAAA'

    fasta_fn = index_dir / 'expected_R2s.fasta'

    with fasta_fn.open('w') as fasta_fh:
        for guide_name, protospacer in guides_df['protospacer'].items():
            protospacer_rc = utilities.reverse_complement(protospacer)
            record = fasta.Record(guide_name,
                                  upstream + protospacer_rc + downstream)
            fasta_fh.write(str(record))

    pysam.faidx(str(fasta_fn))

    mapping_tools.build_STAR_index([fasta_fn], index_dir)

    bustools_dir = index_dir / 'bustools_annotations'
    bustools_dir.mkdir(exist_ok=True)

    guide_names = guides_df.index

    # One line per guide: the guide's position in the table, twice.
    with (bustools_dir / 'matrix.ec').open('w') as matrix_fh:
        matrix_fh.writelines(f'{i}\t{i}\n' for i in range(len(guide_names)))

    # One line per guide: the guide's name, three times.
    with (bustools_dir / 'transcripts_to_genes.txt').open('w') as t2g_fh:
        t2g_fh.writelines(f'{name}\t{name}\t{name}\n' for name in guide_names)

    with (bustools_dir / 'transcripts.txt').open('w') as transcripts_fh:
        transcripts_fh.writelines(f'{name}\n' for name in guide_names)
Ejemplo n.º 5
0
def get_resolvers(base_dir, group):
    ''' Build the lookups used to resolve index reads and guide sequences for
    the pools in group.

    Returns a tuple (resolvers, expected_seqs, guide_barcode_slice,
    after_guide_barcode_slice):
        resolvers: dict with keys 'I7', 'I5', 'fixed_guide_barcode' and
            'variable_guide' whose values are callables. All are the .get of
            a utilities.get_one_mismatch_resolver result, except that the
            'fixed_guide_barcode' entry always returns {'none'} when the
            sample sheet has no 'fixed_guide_library'.
        expected_seqs: dict with the same keys whose values are sets of the
            expected sequences.
        guide_barcode_slice: slice selecting the fixed guide barcode
            (everything if there is no fixed guide library).
        after_guide_barcode_slice: slice selecting what follows the fixed
            guide barcode (everything if there is no fixed guide library).

    Raises ValueError if any guide has more than one distinct expected
    sequence, or if the fixed guide barcode sequences differ in length.
    '''
    sample_sheet = load_sample_sheet(base_dir, group)
    pool_details = sample_sheet['pool_details']

    index_keys = [('I7', 'I7_index'), ('I5', 'I5_index')]

    indices = {}
    for kind, key in index_keys:
        indices[kind] = {
            name: details[key]
            for name, details in pool_details.items()
        }

    expected_seqs = {}
    for kind, _ in index_keys:
        expected = set()
        for seqs in indices[kind].values():
            # A pool may list a single index sequence or a list of them.
            expected.update(seqs if isinstance(seqs, list) else [seqs])
        expected_seqs[kind] = expected

    resolvers = {}
    for kind, _ in index_keys:
        resolvers[kind] = utilities.get_one_mismatch_resolver(indices[kind]).get

    variable_guide_library = repair_seq.guide_library.GuideLibrary(
        base_dir,
        sample_sheet['variable_guide_library'],
    )

    ti_prefix = sample_sheet['target_info_prefix']

    guide_seqs = {'variable_guide': defaultdict(set)}

    has_fixed_barcode = 'fixed_guide_library' in sample_sheet

    if has_fixed_barcode:
        fixed_guide_library = repair_seq.guide_library.GuideLibrary(
            base_dir,
            sample_sheet['fixed_guide_library'],
        )
        guide_seqs['fixed_guide_barcode'] = defaultdict(set)
        guide_pairs = list(
            itertools.product(fixed_guide_library.guides,
                              variable_guide_library.guides))
    else:
        guide_pairs = [('none', vg) for vg in variable_guide_library.guides]

    for fg, vg in guide_pairs:
        is_paired = fg != 'none'

        if is_paired:
            ti_name = f'{ti_prefix}-{fg}-{vg}'
        else:
            ti_name = f'{ti_prefix}_{variable_guide_library.name}_{vg}'

        ti = knock_knock.target_info.TargetInfo(base_dir, ti_name)

        R1_primer = ti.features[ti.target, sample_sheet['R1_primer']]
        R2_primer = ti.features[ti.target, sample_sheet['R2_primer']]

        target_seq = ti.reference_sequences[ti.target]

        R1_end = R1_primer.start + sample_sheet['R1_read_length']
        guide_seqs['variable_guide'][vg].add(target_seq[R1_primer.start:R1_end])

        if is_paired:
            barcode_feature = ti.features[ti.target, 'fixed_guide_barcode']

            R2_region = target_seq[barcode_feature.start:R2_primer.end + 1]
            guide_seqs['fixed_guide_barcode'][fg].add(
                utilities.reverse_complement(R2_region))

    # Each guide must correspond to exactly one expected sequence; collapse
    # the single-element sets down to that sequence.
    for which in ['fixed_guide_barcode', 'variable_guide']:
        if which not in guide_seqs:
            continue

        by_guide = guide_seqs[which]

        for guide in sorted(by_guide):
            candidates = by_guide[guide]
            if len(candidates) != 1:
                raise ValueError(which, guide, candidates)
            by_guide[guide] = candidates.pop()

        # convert from defaultdict to dict
        guide_seqs[which] = dict(by_guide)

    if not has_fixed_barcode:
        # If there weren't multiple fixed guide pools present, keep everything
        # to allow possibility of outcomes that don't include the intended NotI site.
        def fixed_guide_barcode_resolver(*args):
            return {'none'}

        resolvers['fixed_guide_barcode'] = fixed_guide_barcode_resolver
        expected_seqs['fixed_guide_barcode'] = set()

        guide_barcode_slice = slice(None)
        after_guide_barcode_slice = idx[:]
    else:
        barcode_seqs = guide_seqs['fixed_guide_barcode']

        fixed_lengths = {len(seq) for seq in barcode_seqs.values()}
        if len(fixed_lengths) != 1:
            raise ValueError(fixed_lengths)

        fixed_length = fixed_lengths.pop()
        guide_barcode_slice = idx[:fixed_length]
        # If a guide barcode is present, remove it from R2 before passing along
        # to simplify analysis of common sequences in pool.
        after_guide_barcode_slice = idx[fixed_length:]

        resolvers['fixed_guide_barcode'] = utilities.get_one_mismatch_resolver(
            barcode_seqs).get
        expected_seqs['fixed_guide_barcode'] = set(barcode_seqs.values())

    variable_seqs = guide_seqs['variable_guide']
    resolvers['variable_guide'] = utilities.get_one_mismatch_resolver(
        variable_seqs).get
    expected_seqs['variable_guide'] = set(variable_seqs.values())

    return resolvers, expected_seqs, guide_barcode_slice, after_guide_barcode_slice
Ejemplo n.º 6
0
    def evaluate_candidate(al):
        ''' Try to build annotated genbank records around the alignment al.

        This is a closure: besides al, it reads region_fetcher,
        protospacer_seq, protospacer_name, other_protospacers, info,
        donor_type, target_info, offtargets, primers, has_donor,
        defer_HA_identification, donor_seq, donor_name, target_name,
        has_nh_donor, nh_donor_seq and nh_donor_name from the enclosing scope.

        Returns a dict that always has a 'location' key. If the candidate is
        rejected (a protospacer or primer can't be found nearby, a PAM
        doesn't match, primers are in the wrong order, or homology arms can't
        be identified), the dict has a 'failed' key holding the reason.
        Otherwise it has a 'gb_Records' key holding a dict of SeqRecords keyed
        by 'target' and, when applicable, 'donor' and 'nh_donor'.
        '''
        results = {
            'location':
            f'{al.reference_name} {al.reference_start:,} {sam.get_strand(al)}',
        }

        full_window_around = 5000

        full_around = region_fetcher(
            al.reference_name, al.reference_start - full_window_around,
            al.reference_end + full_window_around).upper()

        if sam.get_strand(al) == '+':
            ps_seq = protospacer_seq
            ps_strand = 1
        else:
            ps_seq = utilities.reverse_complement(protospacer_seq)
            ps_strand = -1

        ps_start = full_around.index(ps_seq)

        # Tuples of (name, sequence as it appears in full_around, start in
        # full_around, strand), with the primary protospacer first.
        protospacer_locations = [(protospacer_name, ps_seq, ps_start,
                                  ps_strand)]

        for other_protospacer_name, other_protospacer_seq in other_protospacers:

            # Initial G may not match genome.
            if other_protospacer_seq.startswith('G'):
                other_protospacer_seq = other_protospacer_seq[1:]

            if other_protospacer_seq in full_around:
                ps_seq = other_protospacer_seq
                ps_strand = 1
            else:
                ps_seq = utilities.reverse_complement(other_protospacer_seq)
                if ps_seq not in full_around:
                    results[
                        'failed'] = f'protospacer {other_protospacer_seq} not present near protospacer {protospacer_seq}'
                    return results
                ps_strand = -1

            ps_start = full_around.index(ps_seq)
            protospacer_locations.append(
                (other_protospacer_name, ps_seq, ps_start, ps_strand))

        if 'effector' in info:
            effector_type = info['effector']
        else:
            if donor_type == 'pegRNA':
                effector_type = 'SpCas9H840A'
            else:
                effector_type = 'SpCas9'

        effector = target_info.effectors[effector_type]

        for ps_name, ps_seq, ps_start, ps_strand in protospacer_locations:
            PAM_pattern = effector.PAM_pattern

            # The PAM is to the right of the protospacer in full_around when
            # it is a 3' PAM of a + strand protospacer or a 5' PAM of a
            # - strand protospacer; otherwise it is to the left and has to be
            # reverse complemented before comparing to the pattern.
            if (ps_strand == 1 and effector.PAM_side
                    == 3) or (ps_strand == -1 and effector.PAM_side == 5):
                PAM_offset = len(ps_seq)
                PAM_transform = utilities.identity
            else:
                PAM_offset = -len(PAM_pattern)
                PAM_transform = utilities.reverse_complement

            PAM_start = ps_start + PAM_offset
            PAM = PAM_transform(full_around[PAM_start:PAM_start +
                                            len(PAM_pattern)])
            _, *matches = Bio.SeqUtils.nt_search(PAM, PAM_pattern)

            if 0 not in matches and not offtargets:
                # Note: this could incorrectly fail if there are multiple exact matches for an other_protospacer
                # in full_around.
                results[
                    'failed'] = f'bad PAM: {PAM} next to {ps_seq} (strand {ps_strand})'
                return results

        if primers[0] in full_around:
            leftmost_primer = primers[0]
            rightmost_primer = utilities.reverse_complement(primers[1])
            if rightmost_primer not in full_around:
                results[
                    'failed'] = f'primer {primers[1]} not present near protospacer'
                return results

            leftmost_primer_name = 'forward_primer'
            rightmost_primer_name = 'reverse_primer'

        else:
            leftmost_primer = primers[1]
            rightmost_primer = utilities.reverse_complement(primers[0])

            if leftmost_primer not in full_around:
                results[
                    'failed'] = f'primer {primers[1]} not present near protospacer'
                return results

            if rightmost_primer not in full_around:
                results[
                    'failed'] = f'primer {primers[0]} not present near protospacer'
                return results

            leftmost_primer_name = 'reverse_primer'
            rightmost_primer_name = 'forward_primer'

        leftmost_start = full_around.index(leftmost_primer)
        rightmost_start = full_around.index(rightmost_primer)

        if leftmost_start >= rightmost_start:
            results['failed'] = 'primers don\'t flank protospacer'
            return results

        # Now that primers have been located, redefine the target sequence to include a fixed
        # window on either side of the primers.

        final_window_around = 500

        # Clamp at 0: a negative slice start would be interpreted relative to
        # the end of full_around, producing a wrong (typically empty) target
        # sequence when the leftmost primer is close to the start of the
        # fetched region.
        final_start = max(0, leftmost_start - final_window_around)
        final_end = rightmost_start + len(
            rightmost_primer) + final_window_around

        # Amount to subtract from coordinates in full_around to get
        # coordinates in target_seq.
        offset = final_start

        target_seq = full_around[final_start:final_end]

        leftmost_location = FeatureLocation(leftmost_start - offset,
                                            leftmost_start - offset +
                                            len(leftmost_primer),
                                            strand=1)
        rightmost_location = FeatureLocation(rightmost_start - offset,
                                             rightmost_start - offset +
                                             len(rightmost_primer),
                                             strand=-1)

        colors = {
            'HA_1': '#c7b0e3',
            'HA_RT': '#c7b0e3',
            'HA_2': '#85dae9',
            'HA_PBS': '#85dae9',
            'forward_primer': '#75C6A9',
            'reverse_primer': '#9eafd2',
            'sgRNA': '#c6c9d1',
            'donor_specific': '#b1ff67',
            'PCR_adapter_1': '#F8D3A9',
            'PCR_adapter_2': '#D59687',
            'protospacer': '#ff9ccd',
            'scaffold': '#b7e6d7',
        }

        target_features = [
            SeqFeature(
                location=leftmost_location,
                id=leftmost_primer_name,
                type='misc_feature',
                qualifiers={
                    'label': leftmost_primer_name,
                    'ApEinfo_fwdcolor': colors[leftmost_primer_name],
                },
            ),
            SeqFeature(
                location=rightmost_location,
                id=rightmost_primer_name,
                type='misc_feature',
                qualifiers={
                    'label': rightmost_primer_name,
                    'ApEinfo_fwdcolor': colors[rightmost_primer_name],
                },
            ),
        ]

        # sequencing_start and anchor both cover the 5 nts at the outer end
        # of the forward primer, on the forward primer's strand.
        if leftmost_primer_name == 'forward_primer':
            start = leftmost_start - offset
            start_location = FeatureLocation(start, start + 5, strand=1)
        else:
            start = rightmost_start - offset + len(rightmost_primer) - 5
            start_location = FeatureLocation(start, start + 5, strand=-1)

        target_features.extend([
            SeqFeature(
                location=start_location,
                id='sequencing_start',
                type='misc_feature',
                qualifiers={
                    'label': 'sequencing_start',
                },
            ),
            SeqFeature(
                location=start_location,
                id='anchor',
                type='misc_feature',
                qualifiers={
                    'label': 'anchor',
                },
            ),
        ])

        sgRNA_features = []
        for ps_name, ps_seq, ps_start, ps_strand in protospacer_locations:
            sgRNA_feature = SeqFeature(
                location=FeatureLocation(ps_start - offset,
                                         ps_start - offset + len(ps_seq),
                                         strand=ps_strand),
                id=f'sgRNA_{ps_name}',
                type=f'sgRNA_{effector.name}',
                qualifiers={
                    'label': f'sgRNA_{ps_name}',
                    'ApEinfo_fwdcolor': colors['sgRNA'],
                },
            )
            target_features.append(sgRNA_feature)
            sgRNA_features.append(sgRNA_feature)

        results['gb_Records'] = {}

        if has_donor:
            if not defer_HA_identification:
                # If multiple sgRNAs are given, the edited one must be listed first.
                sgRNA_feature = sgRNA_features[0]

                # Use the first cut offset the effector actually defines.
                cut_after_offset = [
                    candidate for candidate in effector.cut_after_offset
                    if candidate is not None
                ][0]

                if sgRNA_feature.strand == 1:
                    # sgRNA_feature.end is the first nt of the PAM
                    cut_after = sgRNA_feature.location.end + cut_after_offset
                else:
                    # sgRNA_feature.start - 1 is the first nt of the PAM
                    cut_after = sgRNA_feature.location.start - 1 - cut_after_offset - 1

                if donor_type == 'pegRNA':
                    HA_info = identify_pegRNA_homology_arms(
                        donor_seq, target_seq, cut_after, protospacer_seq,
                        colors)
                else:
                    HA_info = identify_homology_arms(donor_seq, donor_type,
                                                     target_seq, cut_after,
                                                     colors)

                if 'failed' in HA_info:
                    results['failed'] = HA_info['failed']
                    return results

                donor_Seq = Seq(HA_info['possibly_flipped_donor_seq'])
                donor_features = HA_info['donor_features']
                target_features.extend(HA_info['target_features'])

            else:
                donor_Seq = Seq(donor_seq)
                donor_features = []

            donor_Record = SeqRecord(donor_Seq,
                                     name=donor_name,
                                     features=donor_features,
                                     annotations={'molecule_type': 'DNA'})
            results['gb_Records']['donor'] = donor_Record

        target_Seq = Seq(target_seq)
        target_Record = SeqRecord(target_Seq,
                                  name=target_name,
                                  features=target_features,
                                  annotations={'molecule_type': 'DNA'})
        results['gb_Records']['target'] = target_Record

        if has_nh_donor:
            nh_donor_Seq = Seq(nh_donor_seq)
            nh_donor_Record = SeqRecord(nh_donor_Seq,
                                        name=nh_donor_name,
                                        annotations={'molecule_type': 'DNA'})
            results['gb_Records']['nh_donor'] = nh_donor_Record

        return results
Ejemplo n.º 7
0
def identify_homology_arms(donor_seq,
                           donor_type,
                           target_seq,
                           cut_after,
                           colors,
                           required_match_length=15):
    ''' Locate the two homology arms shared by donor_seq and target_seq
    around the cut site and annotate them.

    Args:
        donor_seq: donor sequence (str).
        donor_type: if 'PCR', donor sequence outside of the homology arms is
            annotated as PCR_adapter_1 / PCR_adapter_2.
        target_seq: target sequence (str).
        cut_after: index in target_seq around which to look for the arms.
        colors: dict mapping feature names to the color stored in each
            feature's 'ApEinfo_fwdcolor' qualifier.
        required_match_length: length of the exact target matches used to
            seed alignments to the donor and to anchor the arm boundaries.

    Returns:
        On success, a dict with keys 'possibly_flipped_donor_seq' (donor_seq,
        reverse complemented if the arms were found on its other strand),
        'donor_features' and 'target_features' (lists of SeqFeatures).
        On failure, a dict with the single key 'failed' holding a description
        of the problem.
    '''
    header = pysam.AlignmentHeader.from_references(
        ['donor', 'target'], [len(donor_seq), len(target_seq)])
    mapper = sw.SeedAndExtender(donor_seq.encode(), 8, header, 'donor')

    target_bytes = target_seq.encode()

    alignments = {
        'before_cut': [],
        'after_cut': [],
    }

    # Seeds are tried starting next to the cut and moving away from it.
    seed_starts = {
        'before_cut': range(cut_after - required_match_length, 0, -1),
        'after_cut': range(cut_after,
                           len(target_seq) - required_match_length),
    }

    for side in ['before_cut', 'after_cut']:
        for seed_start in seed_starts[side]:
            alignments[side] = mapper.seed_and_extend(
                target_bytes, seed_start, seed_start + required_match_length,
                'target')
            if alignments[side]:
                break

        else:
            # No seed on this side produced any alignment to the donor.
            return {'failed': f'cannot locate homology arm on {side}'}

    # Tuples of (donor sequence in the orientation matching the target,
    # start of the arm-containing window in it, end of that window).
    possible_HA_boundaries = []

    for before_al in alignments['before_cut']:
        for after_al in alignments['after_cut']:
            if sam.get_strand(before_al) == sam.get_strand(after_al):
                strand = sam.get_strand(before_al)
                if strand == '+':
                    if before_al.reference_end < after_al.reference_start:
                        possible_HA_boundaries.append(
                            (donor_seq, before_al.reference_start,
                             after_al.reference_end))
                elif strand == '-':
                    if before_al.reference_start > after_al.reference_end:
                        # Convert the boundaries into coordinates on the
                        # reverse complemented donor.
                        flipped_seq = utilities.reverse_complement(donor_seq)
                        start = len(donor_seq) - 1 - (before_al.reference_end -
                                                      1)
                        end = len(donor_seq) - 1 - after_al.reference_start + 1
                        possible_HA_boundaries.append(
                            (flipped_seq, start, end))

    possible_HAs = []
    for possibly_flipped_donor_seq, HA_start, HA_end in possible_HA_boundaries:
        donor_window = possibly_flipped_donor_seq[HA_start:HA_end]

        donor_prefix = donor_window[:required_match_length]

        donor_suffix = donor_window[-required_match_length:]

        # Try to be resilient against multiple occurrence of HA substrings in the target
        # by prioritizing matches closest to the cut site.
        target_HA_start = target_seq.rfind(donor_prefix, 0,
                                           cut_after + required_match_length)

        # A negative start would be interpreted relative to the end of
        # target_seq, so clamp it at 0.
        suffix_search_start = max(0, cut_after - required_match_length)
        target_suffix_start = target_seq.find(donor_suffix,
                                              suffix_search_start)

        # Check for the not-found sentinel before adding the suffix length;
        # afterwards it would no longer be -1.
        if target_HA_start == -1 or target_suffix_start == -1:
            return {'failed': 'cannot locate homology arms in target'}

        target_HA_end = target_suffix_start + len(donor_suffix)

        if target_HA_start >= target_HA_end:
            return {'failed': 'cannot locate homology arms in target'}

        relevant_target_seq = target_seq[target_HA_start:target_HA_end]

        total_HA_length = target_HA_end - target_HA_start

        # For every possible last index of HA_1, count the mismatches between
        # target and donor if the donor window is aligned to the target from
        # the left up to and including that index ...
        mismatches_before_deletion = np.cumsum(
            [t != d for t, d in zip(relevant_target_seq, donor_window)])

        # ... and aligned from the right for everything after that index.
        flipped_target = relevant_target_seq[::-1]
        flipped_donor = donor_window[::-1]
        mismatches_after_deletion = np.cumsum(
            [0] + [t != d
                   for t, d in zip(flipped_target, flipped_donor)][:-1])[::-1]

        total_mismatches = mismatches_before_deletion + mismatches_after_deletion

        # The split between the arms is placed where total mismatches are
        # minimized (first such position on ties).
        last_index_in_HA_1 = int(np.argmin(total_mismatches))
        min_mismatches = total_mismatches[last_index_in_HA_1]

        lengths = {}
        lengths['HA_1'] = last_index_in_HA_1 + 1
        lengths['HA_2'] = total_HA_length - lengths['HA_1']
        lengths['donor_specific'] = len(donor_seq) - total_HA_length

        info = {
            'min_mismatches': min_mismatches,
            'possibly_flipped_donor_seq': possibly_flipped_donor_seq,
            'donor_HA_start': HA_start,
            'donor_HA_end': HA_end,
            'target_HA_start': target_HA_start,
            'target_HA_end': target_HA_end,
            'lengths': lengths,
        }
        possible_HAs.append(info)

    if not possible_HAs:
        # Return here: there is no candidate to annotate below.
        return {'failed': 'cannot locate homology arms'}

    def priority(info):
        # Fewest mismatches first, then the longest shorter arm.
        return info['min_mismatches'], -min(info['lengths']['HA_1'],
                                            info['lengths']['HA_2'])

    best = min(possible_HAs, key=priority)

    lengths = best['lengths']

    donor_starts = {
        'HA_1': best['donor_HA_start'],
        'donor_specific': best['donor_HA_start'] + lengths['HA_1'],
        'HA_2': best['donor_HA_end'] - lengths['HA_2'],
    }
    donor_ends = {
        'HA_1': donor_starts['HA_1'] + lengths['HA_1'],
        'donor_specific': donor_starts['HA_2'],
        'HA_2': donor_starts['HA_2'] + lengths['HA_2'],
    }

    if donor_type == 'PCR':
        # Anything outside of the homology arms of a PCR donor is annotated
        # as an adapter.
        if donor_starts['HA_1'] != 0:
            donor_starts['PCR_adapter_1'] = 0
            donor_ends['PCR_adapter_1'] = donor_starts['HA_1']

        if donor_ends['HA_2'] != len(donor_seq):
            donor_starts['PCR_adapter_2'] = donor_ends['HA_2']
            donor_ends['PCR_adapter_2'] = len(donor_seq)

    target_starts = {
        'HA_1': best['target_HA_start'],
        'HA_2': best['target_HA_end'] - lengths['HA_2'],
    }
    target_ends = {
        key: target_starts[key] + lengths[key]
        for key in target_starts
    }

    donor_strand = 1
    target_strand = 1

    donor_features = [
        SeqFeature(
            location=FeatureLocation(donor_starts[feature_name],
                                     donor_ends[feature_name],
                                     strand=donor_strand),
            id=feature_name,
            type='misc_feature',
            qualifiers={
                'label': feature_name,
                'ApEinfo_fwdcolor': colors[feature_name],
            },
        ) for feature_name in donor_starts
    ]

    target_features = [
        SeqFeature(
            location=FeatureLocation(target_starts[feature_name],
                                     target_ends[feature_name],
                                     strand=target_strand),
            id=feature_name,
            type='misc_feature',
            qualifiers={
                'label': feature_name,
                'ApEinfo_fwdcolor': colors[feature_name],
            },
        ) for feature_name in target_starts
    ]

    HA_info = {
        'possibly_flipped_donor_seq': best['possibly_flipped_donor_seq'],
        'donor_features': donor_features,
        'target_features': target_features,
    }

    return HA_info
Ejemplo n.º 8
0
    def evaluate_candidate(al):
        ''' Check the protospacers around the alignment al and extract the
        surrounding target sequence.

        This is a closure: besides al, it reads region_fetcher, protospacer
        and other_protospacers from the enclosing scope.

        Returns a dict with keys 'location', 'ref_name' and 'cut_afters'.
        If one of other_protospacers can't be found nearby, or a protospacer
        isn't next to an NGG PAM, the dict also has a 'failed' key holding
        the reason. Otherwise it also has 'min_cut_after', 'max_cut_after'
        and 'target_seq'.
        '''
        results = {
            'location':
            f'{al.reference_name} {al.reference_start:,} {sam.get_strand(al)}',
            'ref_name': al.reference_name,
            'cut_afters': [],
        }

        full_window_around = 5000

        full_around = region_fetcher(
            al.reference_name, al.reference_start - full_window_around,
            al.reference_end + full_window_around).upper()

        if sam.get_strand(al) == '+':
            ps_seq = protospacer
            ps_strand = 1
        else:
            ps_seq = utilities.reverse_complement(protospacer)
            ps_strand = -1

        ps_start = full_around.index(ps_seq)

        # Tuples of (sequence as it appears in full_around, start in
        # full_around, strand), with the primary protospacer first.
        protospacer_locations = [(ps_seq, ps_start, ps_strand)]

        for other_protospacer in other_protospacers:
            if other_protospacer in full_around:
                ps_seq = other_protospacer
                ps_strand = 1
            else:
                ps_seq = utilities.reverse_complement(other_protospacer)
                if ps_seq not in full_around:
                    results[
                        'failed'] = f'protospacer {other_protospacer} not present near protospacer {protospacer}'
                    return results
                ps_strand = -1

            ps_start = full_around.index(ps_seq)
            protospacer_locations.append((ps_seq, ps_start, ps_strand))

        for ps_seq, ps_start, ps_strand in protospacer_locations:
            if ps_strand == 1:
                # The PAM immediately follows this protospacer, so the offset
                # is this protospacer's own length (which may differ from the
                # primary protospacer's).
                PAM_offset = len(ps_seq)
                PAM_transform = utilities.identity
                cut_after = al.reference_start - full_window_around + ps_start + PAM_offset - 3
            else:
                # The PAM is the 3 nts immediately before the protospacer and
                # is read off the other strand.
                PAM_offset = -3
                PAM_transform = utilities.reverse_complement
                cut_after = al.reference_start - full_window_around + ps_start + 2

            results['cut_afters'].append(cut_after)

            PAM_start = ps_start + PAM_offset
            PAM = PAM_transform(full_around[PAM_start:PAM_start + 3])
            _, *matches = Bio.SeqUtils.nt_search(PAM, 'NGG')

            if 0 not in matches:
                # Note: this could incorrectly fail if there are multiple exact matches for an other_protospacer
                # in full_around.
                results[
                    'failed'] = f'bad PAM: {PAM} next to {ps_seq} (strand {ps_strand})'
                return results

        all_starts = [ps_start for _, ps_start, _ in protospacer_locations]
        min_start = min(all_starts)
        max_start = max(all_starts)

        results['min_cut_after'] = min(results['cut_afters'])
        results['max_cut_after'] = max(results['cut_afters'])

        final_window_around = 500

        # Clamp at 0: a negative slice start would be interpreted relative to
        # the end of full_around and produce a wrong (typically empty) target
        # sequence.
        final_start = max(0, min_start - final_window_around)
        final_end = max_start + final_window_around

        target_seq = full_around[final_start:final_end]
        results['target_seq'] = target_seq

        return results
Ejemplo n.º 9
0
def build_doubles_guide_specific_target(
    original_target,
    fixed_guide_library,
    variable_guide_library,
    fixed_guide,
    variable_guide,
    tasks_queue=None,
):
    ''' Create a copy of original_target specialized to one pair of guides.

    The new target lives in a sibling directory of original_target.dir named
    '{original_target.name}-{fixed_guide}-{variable_guide}'. Its
    doubles_vector.gb is the original's with the 'fixed_protospacer',
    'variable_protospacer' and 'fixed_guide_barcode' features replaced by the
    sequences of the given guides; the original's other genbank sources and
    manifest.yaml are symlinked. References and degenerate indels are then
    generated for the new target.

    All warnings are silenced via warnings.simplefilter('ignore'). If
    tasks_queue is given, (fixed_guide, variable_guide) is put on it once
    everything is done.
    '''
    warnings.simplefilter('ignore')

    def splice(seq, start, end, replacement):
        # Replace seq[start] through seq[end] (end is included) with replacement.
        return seq[:start] + replacement + seq[end + 1:]

    new_name = f'{original_target.name}-{fixed_guide}-{variable_guide}'

    new_dir = original_target.dir.parent / new_name
    new_dir.mkdir(exist_ok=True)

    original_genbank_name = 'doubles_vector'
    gb_fn = original_target.dir / f'{original_genbank_name}.gb'
    gb = Bio.SeqIO.read(str(gb_fn), 'genbank')

    features = original_target.features
    target_name = original_target.name

    fixed_ps = features[target_name, 'fixed_protospacer']
    fixed_ps_seq = fixed_guide_library.guides_df.loc[fixed_guide,
                                                     'protospacer']
    gb.seq = splice(gb.seq, fixed_ps.start, fixed_ps.end, fixed_ps_seq)

    variable_ps = features[target_name, 'variable_protospacer']
    variable_ps_seq = variable_guide_library.guides_df.loc[variable_guide,
                                                           'protospacer']
    gb.seq = splice(gb.seq, variable_ps.start, variable_ps.end,
                    variable_ps_seq)

    guide_bc = features[target_name, 'fixed_guide_barcode']
    guide_bc_start = guide_bc.start
    guide_bc_end = guide_bc.end

    # guide barcode sequence in library df is on the reverse strand
    fixed_bc_seq_rc = fixed_guide_library.guides_df.loc[fixed_guide,
                                                        'guide_barcode']
    fixed_bc_seq = utilities.reverse_complement(fixed_bc_seq_rc)
    gb.seq = splice(gb.seq, guide_bc_start, guide_bc_end, fixed_bc_seq)

    # Replace any previously written genbank file for this pair.
    new_gb_fn = new_dir / f'{original_genbank_name}.gb'
    if new_gb_fn.exists():
        new_gb_fn.unlink()
    Bio.SeqIO.write(gb, str(new_gb_fn), 'genbank')

    # Everything else is shared with the original target through relative symlinks.
    fns_to_copy = []
    for source in original_target.sources:
        if source != original_genbank_name:
            fns_to_copy.append(f'{source}.gb')
    fns_to_copy.append('manifest.yaml')

    relative_original_dir = Path(os.path.relpath(original_target.dir, new_dir))

    for fn in fns_to_copy:
        link_fn = new_dir / fn

        # is_symlink catches dangling links, for which exists is False.
        if link_fn.exists() or link_fn.is_symlink():
            link_fn.unlink()

        link_fn.symlink_to(relative_original_dir / fn)

    new_ti = target_info.TargetInfo(original_target.base_dir, new_name)

    new_ti.make_references()
    new_ti.identify_degenerate_indels()

    if tasks_queue is not None:
        tasks_queue.put((fixed_guide, variable_guide))