Esempio n. 1
0
def build_manual_target(base_dir, target_name):
    ''' Build a target from a manually supplied genbank file.

    The directory base_dir / 'targets' / target_name must contain exactly
    one '*.gb' file, and that file must contain exactly one record. A
    manifest.yaml naming that file as the only source and the record's id
    as the target is written next to it, and then the TargetInfo for the
    directory is asked to make its references and to identify degenerate
    indels.

    Args:
        base_dir: path object (must support the '/' operator) of the base
            directory that contains 'targets'.
        target_name: name of the target's subdirectory.

    Raises:
        ValueError: if the target directory doesn't contain exactly one
            '*.gb' file, or that file doesn't contain exactly one record.
    '''
    target_dir = base_dir / 'targets' / target_name

    gb_fns = sorted(target_dir.glob('*.gb'))

    if len(gb_fns) != 1:
        raise ValueError(
            f'expected exactly one .gb file in {target_dir}, found {len(gb_fns)}'
        )

    gb_fn = gb_fns[0]

    records = list(Bio.SeqIO.parse(str(gb_fn), 'genbank'))

    if len(records) != 1:
        raise ValueError(
            f'expected exactly one record in {gb_fn}, found {len(records)}'
        )

    record = records[0]

    manifest = {
        'sources': [gb_fn.stem],
        'target': record.id,
    }

    manifest_fn = target_dir / 'manifest.yaml'

    with manifest_fn.open('w') as fh:
        fh.write(yaml.dump(manifest, default_flow_style=False))

    ti = target_info.TargetInfo(base_dir, target_name)
    ti.make_references()
    ti.identify_degenerate_indels()
Esempio n. 2
0
def build_guide_specific_target(
    original_target,
    original_genbank_name,
    guide_library,
    guide,
    tasks_queue=None,
):
    ''' Make a variant of original_target whose protospacer is replaced by
    the protospacer of one guide.

    A sibling directory of original_target.dir is created (or reused). It
    receives a rewritten copy of the genbank file named
    original_genbank_name and relative symlinks to the original target's
    other source genbank files and its manifest.yaml. The new target then
    makes its references and identifies degenerate indels.

    If tasks_queue is given, guide is put on it once everything is done so
    that a parent process can track progress.
    '''
    warnings.simplefilter('ignore')

    variant_name = f'{original_target.name}_{guide_library.name}_{guide}'

    variant_dir = original_target.dir.parent / variant_name
    variant_dir.mkdir(exist_ok=True)

    genbank_basename = f'{original_genbank_name}.gb'

    record = Bio.SeqIO.read(str(original_target.dir / genbank_basename),
                            'genbank')

    # Swap the annotated protospacer for this guide's protospacer. The
    # 'end + 1' treats the feature's end coordinate as inclusive.
    placeholder = original_target.features[original_target.target,
                                           'protospacer']
    replacement = guide_library.guides_df.loc[guide, 'protospacer']
    upstream = record.seq[:placeholder.start]
    downstream = record.seq[placeholder.end + 1:]
    record.seq = upstream + replacement + downstream

    variant_gb_fn = variant_dir / genbank_basename
    if variant_gb_fn.exists():
        variant_gb_fn.unlink()
    Bio.SeqIO.write(record, str(variant_gb_fn), 'genbank')

    # Everything except the rewritten genbank file is shared with the
    # original target through symlinks.
    shared_fns = []
    for source in original_target.sources:
        if source != original_genbank_name:
            shared_fns.append(f'{source}.gb')
    shared_fns.append('manifest.yaml')

    # Link through a relative path from the new directory to the original.
    link_target_dir = Path(os.path.relpath(original_target.dir, variant_dir))

    for shared_fn in shared_fns:
        link = variant_dir / shared_fn

        # is_symlink also catches dangling links, for which exists() is False.
        if link.is_symlink() or link.exists():
            link.unlink()

        link.symlink_to(link_target_dir / shared_fn)

    variant_ti = target_info.TargetInfo(original_target.base_dir, variant_name)

    variant_ti.make_references()
    variant_ti.identify_degenerate_indels()

    if tasks_queue is not None:
        tasks_queue.put(guide)
Esempio n. 3
0
def build_all_doubles(
    base_dir,
    fixed_guide_library_name,
    variable_guide_library_name,
    test=False,
    num_processes=18,
):
    ''' Build a guide-specific 'doubles_vector' target for every pair of
    (fixed guide, variable guide) from the two named guide libraries.

    Args:
        base_dir: base directory handed to TargetInfo and GuideLibrary.
        fixed_guide_library_name: name of the library supplying fixed guides.
        variable_guide_library_name: name of the library supplying variable
            guides.
        test: if True, only the first 10 pairs are built, serially in this
            process.
        num_processes: size of the worker pool used when test is False.

    Raises:
        Any exception raised by a worker is re-raised here once the pool
        has finished, instead of being silently dropped.
    '''
    # Local import so that this function doesn't depend on a module-level
    # import that may not exist.
    import queue

    warnings.simplefilter('ignore')

    original_target = target_info.TargetInfo(base_dir, 'doubles_vector')

    original_target.make_references()
    original_target.identify_degenerate_indels()

    fixed_guide_library = GuideLibrary(base_dir, fixed_guide_library_name)
    variable_guide_library = GuideLibrary(base_dir,
                                          variable_guide_library_name)

    # Workers report each finished pair on this queue to drive the progress bar.
    manager = multiprocessing.Manager()
    tasks_done_queue = manager.Queue()

    args_list = [
        (original_target, fixed_guide_library, variable_guide_library,
         fixed_guide, variable_guide, tasks_done_queue)
        for fixed_guide in fixed_guide_library.guides
        for variable_guide in variable_guide_library.guides
    ]

    if test:
        args_list = args_list[:10]
        for args in tqdm.tqdm(args_list):
            build_doubles_guide_specific_target(*args)
    else:
        total = len(args_list)

        with multiprocessing.Pool(processes=num_processes) as pool:
            async_result = pool.starmap_async(
                build_doubles_guide_specific_target, args_list)

            with tqdm.tqdm(desc='Making doubles_vector targets',
                           total=total) as progress:
                while progress.n < total:
                    try:
                        tasks_done_queue.get(timeout=1)
                    except queue.Empty:
                        if async_result.ready():
                            # The pool is done but fewer completions than
                            # expected were seen. If a worker failed, get()
                            # re-raises its exception here rather than
                            # letting this loop wait forever.
                            async_result.get()
                            progress.update(total - progress.n)
                        continue

                    progress.update()

            # Surface any worker exception and make sure all tasks have
            # fully returned before the pool is torn down.
            async_result.get()
Esempio n. 4
0
def build_all_singles(
    base_dir,
    original_target_name,
    original_genbank_name,
    guide_library_names,
    test=False,
    num_processes=18,
):
    ''' Build a guide-specific version of a target for every guide in each
    of the named guide libraries.

    Args:
        base_dir: base directory handed to TargetInfo and GuideLibrary.
        original_target_name: name of the target to derive variants from.
        original_genbank_name: name (without '.gb') of the genbank file in
            the original target whose protospacer gets replaced.
        guide_library_names: iterable of guide library names.
        test: if True, only the first 10 guides are built, serially in this
            process.
        num_processes: size of the worker pool used when test is False.

    Raises:
        Any exception raised by a worker is re-raised here once the pool
        has finished, instead of being silently dropped.
    '''
    # Local import so that this function doesn't depend on a module-level
    # import that may not exist.
    import queue

    warnings.simplefilter('ignore')

    original_target = target_info.TargetInfo(base_dir, original_target_name)

    original_target.make_references()
    original_target.identify_degenerate_indels()

    # Workers report each finished guide on this queue to drive the progress bar.
    manager = multiprocessing.Manager()
    tasks_done_queue = manager.Queue()

    args_list = []

    for guide_library_name in guide_library_names:
        guide_library = GuideLibrary(base_dir, guide_library_name)
        for guide in guide_library.guides:
            args_list.append((original_target, original_genbank_name,
                              guide_library, guide, tasks_done_queue))

    if test:
        args_list = args_list[:10]
        for args in tqdm.tqdm(args_list):
            build_guide_specific_target(*args)
    else:
        total = len(args_list)

        with multiprocessing.Pool(processes=num_processes) as pool:
            async_result = pool.starmap_async(build_guide_specific_target,
                                              args_list)

            with tqdm.tqdm(desc='Making targets', total=total) as progress:
                while progress.n < total:
                    try:
                        tasks_done_queue.get(timeout=1)
                    except queue.Empty:
                        if async_result.ready():
                            # The pool is done but fewer completions than
                            # expected were seen. If a worker failed, get()
                            # re-raises its exception here rather than
                            # letting this loop wait forever.
                            async_result.get()
                            progress.update(total - progress.n)
                        continue

                    progress.update()

            # Surface any worker exception and make sure all tasks have
            # fully returned before the pool is torn down.
            async_result.get()
Esempio n. 5
0
def build_target_info(
    base_dir,
    info,
    all_index_locations,
    defer_HA_identification=False,
    offtargets=False,
):
    ''' Locate an sgRNA and its amplicon primers in a genome, then write the
    resulting target (genbank files and manifest.yaml) to
    base_dir / 'targets' / info['name'].

    info should have keys:
            name
            genome
            sgRNA_sequence: sequence of (name, sequence) pairs. The first
                pair is aligned to the genome; the others are searched for
                in the window around each alignment.
            amplicon_primers: (name, 'primer;primer') pair. If name is
                None, info['name'] is used as the target name.
        optional keys:
            donor_sequence: (name, sequence) pair
            donor_type: pair whose second element is the donor type
            nonhomologous_donor_sequence: (name, sequence) pair
            extra_sequences: iterable of (name, sequence) pairs
            effector: key into target_info.effectors

    all_index_locations maps genome names to a mapping with (at least) the
    keys 'STAR' and 'fasta'.

    If defer_HA_identification is True, homology arms are not identified
    and the donor record is written without features. If offtargets is
    True, candidates are not rejected for lacking a valid PAM.

    Nothing is returned. If info['genome'] has no entry in
    all_index_locations, an error is printed and the process exits with
    status 1. If no valid genomic location is found, an error is printed
    and the function returns without writing a manifest.
    '''
    genome = info['genome']
    if genome not in all_index_locations:
        print(f'Error: can\'t locate indices for {genome}')
        # This is a failure, so don't report success to the calling shell.
        sys.exit(1)
    else:
        index_locations = all_index_locations[genome]

    base_dir = Path(base_dir)

    name = info['name']

    donor_info = info.get('donor_sequence')
    if donor_info is None:
        donor_name = None
        donor_seq = None
    else:
        donor_name, donor_seq = donor_info
        if donor_name is None:
            donor_name = f'{name}_donor'

    has_donor = donor_seq is not None

    # donor_type is optional, just like the donor itself.
    donor_type_info = info.get('donor_type')
    if donor_type_info is None:
        donor_type = None
    else:
        _, donor_type = donor_type_info

    nh_donor_info = info.get('nonhomologous_donor_sequence')
    if nh_donor_info is None:
        nh_donor_name = None
        nh_donor_seq = None
    else:
        nh_donor_name, nh_donor_seq = nh_donor_info
        if nh_donor_name is None:
            nh_donor_name = f'{name}_NH_donor'

    has_nh_donor = nh_donor_seq is not None

    target_dir = base_dir / 'targets' / name
    target_dir.mkdir(parents=True, exist_ok=True)

    protospacer, *other_protospacers = info['sgRNA_sequence']
    primers_name, primers = info['amplicon_primers']
    primers = primers.split(';')

    if primers_name is None:
        target_name = name
    else:
        target_name = primers_name

    # Scratch directory for the protospacer alignment; removed at the end
    # of a successful build.
    protospacer_dir = target_dir / 'protospacer_alignment'
    protospacer_dir.mkdir(exist_ok=True)
    fastq_fn = protospacer_dir / 'protospacer.fastq'
    STAR_prefix = protospacer_dir / 'protospacer_'
    bam_fn = protospacer_dir / 'protospacer.bam'

    STAR_index = index_locations['STAR']

    gb_fns = {
        'target': target_dir / f'{target_name}.gb',
        'donor': target_dir / f'{donor_name}.gb',
        'nh_donor': target_dir / f'{nh_donor_name}.gb',
    }

    # Make a fastq file with a single read containing the protospacer sequence.
    protospacer_name, protospacer_seq = protospacer

    with fastq_fn.open('w') as fh:
        quals = fastq.encode_sanger([40] * len(protospacer_seq))
        read = fastq.Read('protospacer', protospacer_seq, quals)
        fh.write(str(read))

    # Align the protospacer to the reference genome.
    mapping_tools.map_STAR(fastq_fn,
                           STAR_index,
                           STAR_prefix,
                           mode='guide_alignment',
                           bam_fn=bam_fn,
                           sort=False)

    with pysam.AlignmentFile(bam_fn) as bam_fh:
        # Iterating over the file consumes it, so collect the mapped
        # alignments in a single pass and derive the perfect ones from that
        # list rather than iterating over the file a second time.
        mapped_als = [al for al in bam_fh if not al.is_unmapped]
        perfect_als = [
            al for al in mapped_als if sam.total_edit_distance(al) == 0
        ]

    # Only reported when there are no perfect alignments, in which case
    # every mapped alignment is an imperfect one.
    imperfect_als = mapped_als

    region_fetcher = genomes.build_region_fetcher(index_locations['fasta'])

    def evaluate_candidate(al):
        ''' Try to build genbank records around one alignment of the
        protospacer. Returns a dict with key 'location' and either
        'failed' (a reason) or 'gb_Records'.
        '''
        results = {
            'location':
            f'{al.reference_name} {al.reference_start:,} {sam.get_strand(al)}',
        }

        full_window_around = 5000

        full_around = region_fetcher(
            al.reference_name, al.reference_start - full_window_around,
            al.reference_end + full_window_around).upper()

        if sam.get_strand(al) == '+':
            ps_seq = protospacer_seq
            ps_strand = 1
        else:
            ps_seq = utilities.reverse_complement(protospacer_seq)
            ps_strand = -1

        ps_start = full_around.index(ps_seq)

        protospacer_locations = [(protospacer_name, ps_seq, ps_start,
                                  ps_strand)]

        for other_protospacer_name, other_protospacer_seq in other_protospacers:

            # Initial G may not match genome.
            if other_protospacer_seq.startswith('G'):
                other_protospacer_seq = other_protospacer_seq[1:]

            if other_protospacer_seq in full_around:
                ps_seq = other_protospacer_seq
                ps_strand = 1
            else:
                ps_seq = utilities.reverse_complement(other_protospacer_seq)
                if ps_seq not in full_around:
                    results[
                        'failed'] = f'protospacer {other_protospacer_seq} not present near protospacer {protospacer_seq}'
                    return results
                ps_strand = -1

            ps_start = full_around.index(ps_seq)
            protospacer_locations.append(
                (other_protospacer_name, ps_seq, ps_start, ps_strand))

        if 'effector' in info:
            effector_type = info['effector']
        else:
            if donor_type == 'pegRNA':
                effector_type = 'SpCas9H840A'
            else:
                effector_type = 'SpCas9'

        effector = target_info.effectors[effector_type]

        for ps_name, ps_seq, ps_start, ps_strand in protospacer_locations:
            PAM_pattern = effector.PAM_pattern

            # The PAM lies to the right of the protospacer in full_around
            # coordinates for a forward protospacer with a 3' PAM or a
            # reverse protospacer with a 5' PAM; otherwise it is to the left
            # and is read on the opposite strand.
            PAM_is_to_the_right = (
                (ps_strand == 1 and effector.PAM_side == 3)
                or (ps_strand == -1 and effector.PAM_side == 5)
            )

            if PAM_is_to_the_right:
                PAM_offset = len(ps_seq)
                PAM_transform = utilities.identity
            else:
                PAM_offset = -len(PAM_pattern)
                PAM_transform = utilities.reverse_complement

            PAM_start = ps_start + PAM_offset
            PAM = PAM_transform(full_around[PAM_start:PAM_start +
                                            len(PAM_pattern)])
            pattern, *matches = Bio.SeqUtils.nt_search(PAM, PAM_pattern)

            if 0 not in matches and not offtargets:
                # Note: this could incorrectly fail if there are multiple exact matches for an other_protospacer
                # in full_around.
                results[
                    'failed'] = f'bad PAM: {PAM} next to {ps_seq} (strand {ps_strand})'
                return results

        if primers[0] in full_around:
            leftmost_primer = primers[0]
            rightmost_primer = utilities.reverse_complement(primers[1])
            if rightmost_primer not in full_around:
                results[
                    'failed'] = f'primer {primers[1]} not present near protospacer'
                return results

            leftmost_primer_name = 'forward_primer'
            rightmost_primer_name = 'reverse_primer'

        else:
            leftmost_primer = primers[1]
            rightmost_primer = utilities.reverse_complement(primers[0])

            if leftmost_primer not in full_around:
                results[
                    'failed'] = f'primer {primers[1]} not present near protospacer'
                return results

            if rightmost_primer not in full_around:
                results[
                    'failed'] = f'primer {primers[0]} not present near protospacer'
                return results

            leftmost_primer_name = 'reverse_primer'
            rightmost_primer_name = 'forward_primer'

        leftmost_start = full_around.index(leftmost_primer)
        rightmost_start = full_around.index(rightmost_primer)

        if leftmost_start >= rightmost_start:
            results['failed'] = 'primers don\'t flank protospacer'
            return results

        # Now that primers have been located, redefine the target sequence to include a fixed
        # window on either side of the primers.

        final_window_around = 500

        # Clamp at 0: if the leftmost primer is closer than
        # final_window_around to the start of full_around, a negative slice
        # start would wrap around to the end of the string and corrupt
        # target_seq. All feature coordinates below are relative to offset.
        offset = max(0, leftmost_start - final_window_around)

        final_start = offset
        final_end = rightmost_start + len(
            rightmost_primer) + final_window_around

        target_seq = full_around[final_start:final_end]

        leftmost_location = FeatureLocation(leftmost_start - offset,
                                            leftmost_start - offset +
                                            len(leftmost_primer),
                                            strand=1)
        rightmost_location = FeatureLocation(rightmost_start - offset,
                                             rightmost_start - offset +
                                             len(rightmost_primer),
                                             strand=-1)

        colors = {
            'HA_1': '#c7b0e3',
            'HA_RT': '#c7b0e3',
            'HA_2': '#85dae9',
            'HA_PBS': '#85dae9',
            'forward_primer': '#75C6A9',
            'reverse_primer': '#9eafd2',
            'sgRNA': '#c6c9d1',
            'donor_specific': '#b1ff67',
            'PCR_adapter_1': '#F8D3A9',
            'PCR_adapter_2': '#D59687',
            'protospacer': '#ff9ccd',
            'scaffold': '#b7e6d7',
        }

        target_features = [
            SeqFeature(
                location=leftmost_location,
                id=leftmost_primer_name,
                type='misc_feature',
                qualifiers={
                    'label': leftmost_primer_name,
                    'ApEinfo_fwdcolor': colors[leftmost_primer_name],
                },
            ),
            SeqFeature(
                location=rightmost_location,
                id=rightmost_primer_name,
                type='misc_feature',
                qualifiers={
                    'label': rightmost_primer_name,
                    'ApEinfo_fwdcolor': colors[rightmost_primer_name],
                },
            ),
        ]

        # 'sequencing_start' and 'anchor' both mark the first 5 nt of the
        # forward primer, on whichever strand the forward primer is on.
        if leftmost_primer_name == 'forward_primer':
            start = leftmost_start - offset
            start_location = FeatureLocation(start, start + 5, strand=1)
        else:
            start = rightmost_start - offset + len(rightmost_primer) - 5
            start_location = FeatureLocation(start, start + 5, strand=-1)

        target_features.extend([
            SeqFeature(
                location=start_location,
                id='sequencing_start',
                type='misc_feature',
                qualifiers={
                    'label': 'sequencing_start',
                },
            ),
            SeqFeature(
                location=start_location,
                id='anchor',
                type='misc_feature',
                qualifiers={
                    'label': 'anchor',
                },
            ),
        ])

        sgRNA_features = []
        for ps_name, ps_seq, ps_start, ps_strand in protospacer_locations:
            sgRNA_feature = SeqFeature(
                location=FeatureLocation(ps_start - offset,
                                         ps_start - offset + len(ps_seq),
                                         strand=ps_strand),
                id=f'sgRNA_{ps_name}',
                type=f'sgRNA_{effector.name}',
                qualifiers={
                    'label': f'sgRNA_{ps_name}',
                    'ApEinfo_fwdcolor': colors['sgRNA'],
                },
            )
            target_features.append(sgRNA_feature)
            sgRNA_features.append(sgRNA_feature)

        results['gb_Records'] = {}

        if has_donor:
            if not defer_HA_identification:
                # If multiple sgRNAs are given, the edited one must be listed first.
                sgRNA_feature = sgRNA_features[0]

                # Use the first cut offset that the effector defines.
                cut_after_offset = [
                    effector_offset
                    for effector_offset in effector.cut_after_offset
                    if effector_offset is not None
                ][0]

                # Read the strand from the location, which works across
                # Biopython versions.
                if sgRNA_feature.location.strand == 1:
                    # sgRNA_feature.end is the first nt of the PAM
                    cut_after = sgRNA_feature.location.end + cut_after_offset
                else:
                    # sgRNA_feature.start - 1 is the first nt of the PAM
                    cut_after = sgRNA_feature.location.start - 1 - cut_after_offset - 1

                if donor_type == 'pegRNA':
                    HA_info = identify_pegRNA_homology_arms(
                        donor_seq, target_seq, cut_after, protospacer_seq,
                        colors)
                else:
                    HA_info = identify_homology_arms(donor_seq, donor_type,
                                                     target_seq, cut_after,
                                                     colors)

                if 'failed' in HA_info:
                    results['failed'] = HA_info['failed']
                    return results

                donor_Seq = Seq(HA_info['possibly_flipped_donor_seq'])
                donor_features = HA_info['donor_features']
                target_features.extend(HA_info['target_features'])

            else:
                donor_Seq = Seq(donor_seq)
                donor_features = []

            donor_Record = SeqRecord(donor_Seq,
                                     name=donor_name,
                                     features=donor_features,
                                     annotations={'molecule_type': 'DNA'})
            results['gb_Records']['donor'] = donor_Record

        target_Seq = Seq(target_seq)
        target_Record = SeqRecord(target_Seq,
                                  name=target_name,
                                  features=target_features,
                                  annotations={'molecule_type': 'DNA'})
        results['gb_Records']['target'] = target_Record

        if has_nh_donor:
            nh_donor_Seq = Seq(nh_donor_seq)
            nh_donor_Record = SeqRecord(nh_donor_Seq,
                                        name=nh_donor_name,
                                        annotations={'molecule_type': 'DNA'})
            results['gb_Records']['nh_donor'] = nh_donor_Record

        return results

    good_candidates = []
    bad_candidates = []

    for al in perfect_als:
        results = evaluate_candidate(al)
        if 'failed' in results:
            bad_candidates.append(results)
        else:
            good_candidates.append(results)

    if len(good_candidates) == 0:
        if len(bad_candidates) == 0:
            print(
                f'Error building {name}: no perfect matches to sgRNA {protospacer} found in {genome}'
            )
            print(imperfect_als)
            return

        else:
            print(
                f'Error building {name}: no valid genomic locations for {name}'
            )

            for results in bad_candidates:
                print(f'\t{results["location"]}: {results["failed"]}')

            return

    elif len(good_candidates) > 1:
        print(f'Warning: multiple valid genomic locations for {name}:')
        for results in good_candidates:
            print(f'\t{results["location"]}')
        best_candidate = good_candidates[0]
        print(f'Arbitrarily choosing {best_candidate["location"]}')
    else:
        best_candidate = good_candidates[0]

    truncated_name_i = 0
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=BiopythonWarning)

        for which_seq, Record in best_candidate['gb_Records'].items():
            try:
                Bio.SeqIO.write(Record, gb_fns[which_seq], 'genbank')
            except ValueError:
                # locus line too long, can't write genbank file with BioPython
                old_name = Record.name

                truncated_name = f'{Record.name[:11]}_{truncated_name_i}'
                Record.name = truncated_name
                Bio.SeqIO.write(Record, gb_fns[which_seq], 'genbank')

                # The in-memory record keeps its full name; only the file
                # on disk gets the truncated one.
                Record.name = old_name

                truncated_name_i += 1

    manifest_fn = target_dir / 'manifest.yaml'

    sources = [target_name]
    if has_donor:
        sources.append(donor_name)

    extra_Records = []
    if info.get('extra_sequences') is not None:
        for extra_seq_name, extra_seq in info['extra_sequences']:
            sources.append(extra_seq_name)

            # annotations belongs to the SeqRecord, not to list.append
            # (which accepts no keyword arguments).
            extra_Records.append(
                SeqRecord(extra_seq,
                          name=extra_seq_name,
                          annotations={'molecule_type': 'DNA'}))

    manifest = {
        'sources': sources,
        'target': target_name,
    }
    if has_donor:
        manifest['donor'] = donor_name
        manifest['donor_specific'] = 'donor_specific'
        if donor_type is not None:
            manifest['donor_type'] = donor_type

    if has_nh_donor:
        manifest['nonhomologous_donor'] = nh_donor_name

    manifest['features_to_show'] = [
        [target_name, 'forward_primer'],
        [target_name, 'reverse_primer'],
    ]

    if has_donor:
        if donor_type == 'pegRNA':
            manifest['features_to_show'].extend([
                [donor_name, 'scaffold'],
                [donor_name, 'protospacer'],
                [donor_name, 'HA_RT'],
                [donor_name, 'HA_PBS'],
                [target_name, 'HA_RT'],
                [target_name, 'HA_PBS'],
            ])
        else:
            manifest['features_to_show'].extend([
                [donor_name, 'HA_1'],
                [donor_name, 'HA_2'],
                [donor_name, 'donor_specific'],
                [donor_name, 'PCR_adapter_1'],
                [donor_name, 'PCR_adapter_2'],
                [target_name, 'HA_1'],
                [target_name, 'HA_2'],
            ])

    manifest['genome_source'] = genome

    manifest_fn.write_text(yaml.dump(manifest, default_flow_style=False))

    gb_records = list(best_candidate['gb_Records'].values()) + extra_Records
    ti = target_info.TargetInfo(base_dir, name, gb_records=gb_records)
    ti.make_references()
    ti.make_protospacer_fastas()
    ti.map_protospacers(genome)
    ti.identify_degenerate_indels()

    shutil.rmtree(protospacer_dir)
Esempio n. 6
0
def build_doubles_guide_specific_target(
    original_target,
    fixed_guide_library,
    variable_guide_library,
    fixed_guide,
    variable_guide,
    tasks_queue=None,
):
    ''' Make a variant of the doubles vector target for one
    (fixed guide, variable guide) pair.

    The 'fixed_protospacer', 'variable_protospacer' and
    'fixed_guide_barcode' features of 'doubles_vector.gb' are overwritten,
    in that order, with the corresponding sequences from the guide
    libraries. The result is written to a sibling directory of
    original_target.dir, which also gets relative symlinks to the original
    target's other source genbank files and its manifest.yaml. The new
    target then makes its references and identifies degenerate indels.

    If tasks_queue is given, (fixed_guide, variable_guide) is put on it
    once everything is done so that a parent process can track progress.
    '''
    warnings.simplefilter('ignore')

    def lookup_feature(feature_name):
        return original_target.features[original_target.name, feature_name]

    def overwrite(seq, feature, replacement):
        # 'end + 1' treats the feature's end coordinate as inclusive.
        return seq[:feature.start] + replacement + seq[feature.end + 1:]

    variant_name = f'{original_target.name}-{fixed_guide}-{variable_guide}'

    variant_dir = original_target.dir.parent / variant_name
    variant_dir.mkdir(exist_ok=True)

    original_genbank_name = 'doubles_vector'
    genbank_basename = f'{original_genbank_name}.gb'

    record = Bio.SeqIO.read(str(original_target.dir / genbank_basename),
                            'genbank')

    fixed_feature = lookup_feature('fixed_protospacer')
    fixed_replacement = fixed_guide_library.guides_df.loc[fixed_guide,
                                                          'protospacer']
    record.seq = overwrite(record.seq, fixed_feature, fixed_replacement)

    variable_feature = lookup_feature('variable_protospacer')
    variable_replacement = variable_guide_library.guides_df.loc[
        variable_guide, 'protospacer']
    record.seq = overwrite(record.seq, variable_feature, variable_replacement)

    barcode_feature = lookup_feature('fixed_guide_barcode')

    # guide barcode sequence in library df is on the reverse strand
    barcode_rc = fixed_guide_library.guides_df.loc[fixed_guide,
                                                   'guide_barcode']
    barcode = utilities.reverse_complement(barcode_rc)
    record.seq = overwrite(record.seq, barcode_feature, barcode)

    variant_gb_fn = variant_dir / genbank_basename
    if variant_gb_fn.exists():
        variant_gb_fn.unlink()
    Bio.SeqIO.write(record, str(variant_gb_fn), 'genbank')

    # Everything except the rewritten genbank file is shared with the
    # original target through symlinks.
    shared_fns = []
    for source in original_target.sources:
        if source != original_genbank_name:
            shared_fns.append(f'{source}.gb')
    shared_fns.append('manifest.yaml')

    # Link through a relative path from the new directory to the original.
    link_target_dir = Path(os.path.relpath(original_target.dir, variant_dir))

    for shared_fn in shared_fns:
        link = variant_dir / shared_fn

        # is_symlink also catches dangling links, for which exists() is False.
        if link.is_symlink() or link.exists():
            link.unlink()

        link.symlink_to(link_target_dir / shared_fn)

    variant_ti = target_info.TargetInfo(original_target.base_dir, variant_name)

    variant_ti.make_references()
    variant_ti.identify_degenerate_indels()

    if tasks_queue is not None:
        tasks_queue.put((fixed_guide, variable_guide))