Example 1
def batch_callable_bed(bam_files, output_bed_file, work_dir, genome_fasta_file, min_depth,
                       parall_view=None):
    """ Picking random 3 samples and getting a callable for them.
        Trade off between looping through all samples in a huge batch,
        and hitting an sample with outstanding coverage.
    """
    if can_reuse(output_bed_file, bam_files):
        return output_bed_file

    work_dir = safe_mkdir(join(work_dir, 'callable_work'))
    # random.seed(1234)  # seeding random for reproducibility
    # bam_files = random.sample(bam_files, min(len(bam_files), 3))

    if parall_view:
        callable_beds = parall_view.run(_calculate, [
            [bf, work_dir, genome_fasta_file, min_depth]
            for bf in bam_files])
    else:
        with parallel_view(len(bam_files), ParallelCfg(threads=len(bam_files)), work_dir) as parall_view:
            callable_beds = parall_view.run(_calculate, [
                [bf, work_dir, genome_fasta_file, min_depth]
                for bf in bam_files])

    good_overlap_sample_fraction = 0.8  # pick regions that are covered in at least 80% of samples
    good_overlap_count = max(1, good_overlap_sample_fraction * len(callable_beds))
    info(f'Intersecting callable regions and picking good overlaps with >={good_overlap_count} '
         f'samples ({100 * good_overlap_sample_fraction}% of {len(callable_beds)})')
    with file_transaction(work_dir, output_bed_file) as tx:
        pybedtools.set_tempdir(safe_mkdir(join(work_dir, 'pybedtools_tmp')))
        intersection = pybedtools.BedTool() \
            .multi_intersect(i=callable_beds) \
            .filter(lambda r: len(r[4].split(',')) >= good_overlap_count)
        intersection.saveas(tx)
    info(f'Saved to {output_bed_file}')
    return output_bed_file
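A minimal usage sketch, with placeholder BAM paths and reference FASTA (helpers like can_reuse, safe_mkdir and parallel_view come from the surrounding project):

# Hypothetical invocation; every path below is a placeholder.
callable_bed = batch_callable_bed(
    bam_files=['/data/s1.bam', '/data/s2.bam', '/data/s3.bam'],
    output_bed_file='/data/callable_regions.bed',
    work_dir='/data/work',
    genome_fasta_file='/refs/hg19.fa',
    min_depth=10,  # minimum depth for a base to count as callable
)

When no parall_view is passed, the function opens its own pool sized to the number of BAM files.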
Example 2
def main(bcbio_dir, bed, depth, threads=None, isdebug=True):
    snp_file = verify_file(bed)
    depth_cutoff = depth

    log.init(isdebug)

    try:
        import az
    except ImportError:
        parallel_cfg = ParallelCfg(threads=threads)
    else:
        sys_cfg = az.init_sys_cfg()
        parallel_cfg = ParallelCfg(
            scheduler=sys_cfg.get('scheduler'),
            queue=sys_cfg.get('queue'),
            resources=sys_cfg.get('resources'),
            threads=threads or sys_cfg.get('threads'),
            tag='clearup')

    log.info('Loading bcbio project from ' + bcbio_dir)
    log.info('-' * 70)
    proj = BcbioProject()
    proj.load_from_bcbio_dir(bcbio_dir, proc_name='clearup')
    log.info('Loaded ' + proj.final_dir)
    log_dir = safe_mkdir(join(proj.log_dir, 'clearup'))
    work_dir = safe_mkdir(join(proj.work_dir, 'clearup'))
    out_dir = safe_mkdir(join(proj.date_dir, 'clearup'))
    with parallel_view(len(proj.samples), parallel_cfg, log_dir) as parall_view:
        genotype(proj.samples, snp_file, parall_view, work_dir, out_dir, proj.genome_build)
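A hedged invocation sketch; the bcbio project directory and BED path below are placeholders:

# Hypothetical call. `az` is an optional site-specific package: without it,
# the function falls back to a plain thread-based ParallelCfg.
main(
    bcbio_dir='/projects/my_bcbio_run',
    bed='/refs/snp_panel.bed',
    depth=10,
    threads=4,
    isdebug=False,
)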
Example 3
def _add_project(bam_by_sample,
                 project_name,
                 bed_file=None,
                 use_callable=False,
                 data_dir='',
                 genome='hg19',
                 min_depth=DEPTH_CUTOFF,
                 depth_by_sample=None,
                 reuse_files=False):
    fp_proj = Project.query.filter(Project.name == project_name).first()
    if fp_proj:
        fp_proj.delete(reuse_files=reuse_files)

    fp_proj = Project(
        name=project_name,
        data_dir=data_dir,
        genome=genome,
        bed_fpath=bed_file,
        min_depth=min_depth,
        used_callable=use_callable,
    )
    db.session.add(fp_proj)
    db_samples = []
    for sname, bam_file in bam_by_sample.items():
        db_samples.append(Sample(sname, fp_proj, bam_file))
    db.session.add_all(db_samples)

    work_dir = safe_mkdir(fp_proj.get_work_dir())

    do_ngb = False
    do_sex = False
    do_create_run = False
    if do_ngb or do_sex or do_create_run or use_callable:
        with parallel_view(len(bam_by_sample), parallel_cfg,
                           work_dir) as p_view:
            if use_callable:
                log.info(f'Calculating callable regions for {project_name}.')
                genome_fasta_file = get_ref_fasta(genome)
                fp_proj.bed_fpath = batch_callable_bed(
                    bam_by_sample.values(),
                    join(work_dir, 'callable_regions.bed'),
                    work_dir,
                    genome_fasta_file,
                    min_depth,
                    parall_view=p_view)
                log.debug(f'Set bed file {fp_proj.bed_fpath}')

            if do_create_run:
                get_or_create_run([fp_proj], parall_view=p_view)

            if do_ngb:
                log.info('Exposing to NGB')
                _add_to_ngb(work_dir, project_name, bam_by_sample, genome,
                            bed_file, p_view)

            if do_sex:
                log.info('Genotyping sex')
                sex_work_dir = safe_mkdir(join(work_dir, 'sex'))
                sexes = p_view.run(_sex_from_bam, [[
                    db_s.name, bam_by_sample[db_s.name], bed_file,
                    sex_work_dir, genome,
                    depth_by_sample.get(db_s.name) if depth_by_sample else None,
                    [snp.depth for snp in db_s.snps.all()],
                ] for db_s in db_samples])
                for s, sex in zip(db_samples, sexes):
                    s.sex = sex

    db.session.commit()

    log.info()
    log.info('Done.')
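A hypothetical call, with placeholder sample names and BAM paths:

# Only the callable-regions branch can execute here, since do_ngb,
# do_sex and do_create_run are hard-coded to False above.
_add_project(
    bam_by_sample={'sampleA': '/data/a.bam', 'sampleB': '/data/b.bam'},
    project_name='demo_project',
    use_callable=True,  # derive the project BED from callable regions
    genome='hg19',
)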
Example 4
def main(ctx, subdirs, output_dir, threads=1, fp_size=Params.L, isdebug=False):
    """ Generates a PNG image with a relatedness heatmap.
    """
    if not subdirs:
        ctx.fail('Provide at least one input directory.')

    datasets = _load_datasets(subdirs)

    title = ', '.join(d.name for d in datasets) + '\nL=' + str(fp_size)
    if not Params.NORMALIZE_VAR: title += ', not norm by var'
    if not Params.NORMALIZE_DIST: title += ', not norm by dist'
    if Params.SKIP_DAMAGE: title += ', skipped damage'
    if Params.SKIP_REJECT: title += ', skipped REJECT'
    if Params.SKIP_NOCALL: title += ', skipped num called = 0'
    if Params.MIN_AF: title += ', min AF=' + str(Params.MIN_AF)
    if Params.MIN_DIST: title += ', min dist=' + str(Params.MIN_DIST)
    if Params.INTERREGION_PAIRS: title += ', used SNP pairs between regions'
    else: title += ', skipped SNP pairs between regions'

    run_id = '__'.join(d.name for d in datasets)

    run_dir = safe_mkdir(join((output_dir or join(code_dir, 'runs')), run_id))
    log.init(isdebug, join(run_dir, 'log.txt'), save_previous=True)
    work_dir = safe_mkdir(join(run_dir, 'work'))

    all_vcf_by_label = dict()
    bed_files_by_genome = defaultdict(set)
    for d in datasets:
        all_vcf_by_label.update(d.vcf_by_label)
        if d.bed_file:
            bed_files_by_genome[d.genome].add(
                d.bed_file)  # d.bed_file=None for WGS

    genome_by_label = dict()
    for d in datasets:
        for label in d.vcf_by_label:
            genome_by_label[label] = d.genome

    parallel_cfg = ParallelCfg(threads=threads)
    log.info(f'Starting using {parallel_cfg.threads} threads')

    with parallel_view(len(all_vcf_by_label), parallel_cfg,
                       work_dir) as parall_view:
        overlap_bed_file_by_genome = dict()
        if bed_files_by_genome:
            overlap_bed_file_by_genome = _prep_bed(work_dir,
                                                   bed_files_by_genome,
                                                   overlap_bed_file_by_genome)
            log.info('Slicing VCFs to regions in BED files')
            out = parall_view.run(_slice_vcf_fn, [[
                work_dir, label, vcf,
                overlap_bed_file_by_genome.get(genome_by_label[label])
            ] for label, vcf in all_vcf_by_label.items()])
            all_vcf_by_label = dict(out)
            log.info()

        log.info('Calculating fingerprints for individual samples')
        out = parall_view.run(make_fingerprint, [[
            vcf, work_dir, label, fp_size,
            overlap_bed_file_by_genome.get(genome_by_label[label])
        ] for label, vcf in all_vcf_by_label.items()])
        print_label_pairs = dict(out)
        log.info()

    log.info('Comparing fingerprints pairwise')
    pairwise_dict = defaultdict(dict)
    for (label1, print1), (label2, print2) \
            in it.combinations_with_replacement(print_label_pairs.items(), 2):
        dist, pvalue = compare(print1, print2)
        if dist:
            log.info(
                f'   {label1} VS {label2}: {dist:.2f}, Pvalue={pvalue:.2f}')
        else:
            log.info(f'   {label1} VS {label2}: failed to calculate')
            dist = float('NaN')
        pairwise_dict[label1][label2] = dist
        pairwise_dict[label2][label1] = dist

    log.info('Plotting comparison heatmap')
    plot_heatmap(pairwise_dict, run_dir, title)
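A sketch of driving this command programmatically; ctx is assumed to be a click.Context (the function calls ctx.fail), and the directories are placeholders:

# Hypothetical invocation with placeholder input/output directories.
main(ctx,
     subdirs=['/runs/batch1', '/runs/batch2'],
     output_dir='/runs/heatmaps',
     threads=4,
     isdebug=True)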
Example 5
    def create(projects, parall_view=None):
        run = Run()
        db.session.add(run)
        for p in projects:
            run.projects.append(p)
        db.session.commit()

        genomes = [p.genome for p in projects]
        if len(set(genomes)) > 1:
            log.critical('Error: multiple genomes in projects: ' +
                         str(genomes))
        genome_build = genomes[0]

        snps_dir = safe_mkdir(join(run.work_dir_path(), 'snps'))
        run.snps_file = build_snps_panel(
            bed_files=[p.bed_fpath for p in projects if p.bed_fpath],
            output_dir=snps_dir,
            genome=genome_build)
        locations = extract_locations_from_file(run.snps_file)
        for loc in locations:
            db.session.add(loc)
        db.session.commit()

        log.info()
        log.info('Genotyping')
        samples = [s for p in projects for s in p.samples]
        snps_left_to_call_file = _get_snps_not_called(run.snps_file, samples)
        vcf_dir = safe_mkdir(join(run.work_dir_path(), 'vcf'))
        work_dir = safe_mkdir(join(vcf_dir, 'work'))
        bs = [BaseSample(s.long_name(), bam=s.bam) for s in samples]

        if parall_view:
            vcf_by_sample = genotype(bs,
                                     snps_left_to_call_file,
                                     parall_view,
                                     work_dir=work_dir,
                                     output_dir=vcf_dir,
                                     genome_build=genome_build)
        else:
            n_threads = parallel_cfg.threads
            if len(samples) < n_threads:  # vardict is running in 1 thread
                parallel_cfg.threads = len(samples)
            with parallel_view(len(samples), parallel_cfg,
                               safe_mkdir(join(run.work_dir_path(),
                                               'log'))) as view:
                vcf_by_sample = genotype(bs,
                                         snps_left_to_call_file,
                                         view,
                                         work_dir=work_dir,
                                         output_dir=vcf_dir,
                                         genome_build=genome_build)
            parallel_cfg.threads = n_threads

        # TODO: speed this up
        log.info('Loading called SNPs into the DB')
        for s in samples:
            recs = [r for r in VCF(vcf_by_sample[s.long_name()])]
            recs_by_rsid = defaultdict(list)
            for r in recs:
                recs_by_rsid[r.ID].append(r)
            for loc in locations:
                assert loc
                snp = s.snps.filter(SNP.rsid == loc.rsid).first()
                if not snp:
                    snp = SNP(loc)
                    build_snp_from_records(snp, recs_by_rsid[loc.rsid],
                                           s.project.min_depth)
                    s.snps.append(snp)
                    db.session.add(snp)

        log.info('Adding locations into the DB')
        run.locations.delete()
        for l in locations:
            run.locations.append(l)
        db.session.add(run)
        db.session.commit()
        log.info('Saved locations in the DB')

        log.info()
        log.info('Building tree')
        build_tree(run)

        log.info()
        log.info('Loading BAMs sliced to fingerprints')
        if parall_view:
            parall_view.run(load_bam_file, [[
                s.bam,
                safe_mkdir(join(run.work_dir_path(), 'bams')), run.snps_file,
                s.long_name()
            ] for s in samples])
        else:
            with parallel_view(len(samples), parallel_cfg,
                               safe_mkdir(join(run.work_dir_path(),
                                               'log'))) as view:
                view.run(load_bam_file, [[
                    s.bam,
                    safe_mkdir(join(run.work_dir_path(), 'bams')),
                    run.snps_file,
                    s.long_name()
                ] for s in samples])

        return run
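A minimal sketch of calling create; judging by the indentation it is defined on a class (likely Run as a staticmethod), and the project names below are placeholders:

# Hypothetical: build a combined run from two existing projects,
# assuming create is a staticmethod on the Run model.
projects = Project.query.filter(Project.name.in_(['proj_a', 'proj_b'])).all()
run = Run.create(projects)  # with no parall_view, it opens its own pool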
def main(subdirs, output_dir, threads=1, fp_size=Params.L, isdebug=False):
    """ Generates a PNG image with a relatedness heatmap.
    """
    datasets = _load_datasets(subdirs)

    title = ', '.join(d.name for d in datasets) + '\nL=' + str(fp_size)
    if not Params.NORMALIZE_VAR: title += ', not norm by var'
    if not Params.NORMALIZE_DIST: title += ', not norm by dist'
    if Params.SKIP_DAMAGE: title += ', skipped damage'
    if Params.SKIP_REJECT: title += ', skipped REJECT'
    if Params.SKIP_NOCALL: title += ', skipped num called = 0'
    if Params.MIN_AF: title += ', min AF=' + str(Params.MIN_AF)
    if Params.MIN_DIST: title += ', min dist=' + str(Params.MIN_DIST)
    if Params.INTERREGION_PAIRS: title += ', used SNP pairs between regions'
    else: title += ', skipped SNP pairs between regions'

    run_id = '__'.join(d.name for d in datasets)

    run_dir = safe_mkdir(join((output_dir or join(basedir, 'runs')), run_id))
    log.init(isdebug, join(run_dir, 'log.txt'), save_previous=True)
    work_dir = safe_mkdir(join(run_dir, 'work'))

    all_vcf_by_label = dict()
    bed_files_by_genome = defaultdict(set)
    for d in datasets:
        all_vcf_by_label.update(d.vcf_by_label)
        bed_files_by_genome[d.genome].add(
            d.bed_file)  # d.bed_file=None for WGS

    genome_by_label = dict()
    for d in datasets:
        for label in d.vcf_by_label:
            genome_by_label[label] = d.genome

    parallel_cfg = ParallelCfg(threads=threads)
    log.info(f'Starting using {parallel_cfg.threads} threads')

    overlap_bed_file_by_genome = dict()
    with parallel_view(len(all_vcf_by_label), parallel_cfg,
                       work_dir) as parall_view:
        if bed_files_by_genome:
            log.info(f'Found BED files: {bed_files_by_genome}')
            for genome, bed_files in bed_files_by_genome.items():
                bed_files = [b for b in bed_files if b]
                log.info(f'Overlapping BED files for genome {genome}')
                overlap_bed_file_by_genome[genome] = _overlap_bed_files(bed_files, work_dir, genome) \
                    if bed_files else None

            primary_genome = sorted(bed_files_by_genome.items(),
                                    key=lambda kv: len(kv[1]))[-1][0]
            lifted_bed_files = []
            for genome, overlap_bed_file in overlap_bed_file_by_genome.items():
                if overlap_bed_file and genome != primary_genome:
                    lifted_bed_file = lift_over(overlap_bed_file, genome,
                                                primary_genome)
                    lifted_bed_files.append(lifted_bed_file)
            if lifted_bed_files:
                primary_bed_files = [
                    b for b in lifted_bed_files +
                    [overlap_bed_file_by_genome[primary_genome]] if b
                ]
                overlap_bed_file_by_genome[
                    primary_genome] = _overlap_bed_files(
                        primary_bed_files, work_dir, primary_genome)

            log.info('Lifting BED files back')
            for genome in overlap_bed_file_by_genome:
                if genome != primary_genome:
                    overlap_bed_file_by_genome[genome] = lift_over(
                        overlap_bed_file_by_genome[primary_genome],
                        primary_genome, genome)
            log.info()

            log.info('Sorting, bgzipping and tabixing BED files')
            for g, bed in overlap_bed_file_by_genome.items():
                overlap_bed_file_by_genome[g] = bgzip_and_tabix(
                    sort_bed(bed, genome=g))
            log.info()

            log.info('Slicing VCFs to regions in BED files')
            out = parall_view.run(_slice_vcf_fn, [[
                work_dir, label, vcf,
                overlap_bed_file_by_genome.get(genome_by_label[label])
            ] for label, vcf in all_vcf_by_label.items()])
            all_vcf_by_label = dict(out)
            log.info()

        log.info('Calculating fingerprints for individual samples')
        out = parall_view.run(make_fingerprint, [[
            vcf, work_dir, label, fp_size,
            overlap_bed_file_by_genome[genome_by_label[label]]
        ] for label, vcf in all_vcf_by_label.items()])
        print_label_pairs = dict(out)
        log.info()

    log.info('Comparing fingerprints pairwise')
    pairwise_dict = defaultdict(dict)
    for (label1, print1), (label2, print2) \
            in it.combinations_with_replacement(print_label_pairs.items(), 2):
        dist, pvalue = compare(print1, print2)
        if dist:
            log.info(
                f'   {label1} VS {label2}: {dist:.2f}, Pvalue={pvalue:.2f}')
        else:
            log.info(f'   {label1} VS {label2}: failed to calculate')
            dist = float('NaN')
        pairwise_dict[label1][label2] = dist
        pairwise_dict[label2][label1] = dist

    log.info('Plotting comparison heatmap')
    plot_heatmap(pairwise_dict, run_dir, title)
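A hedged usage sketch for this variant of main, which takes no click context; all directories are placeholders:

# Hypothetical invocation; dataset subdirectories and output dir are placeholders.
main(subdirs=['/runs/batch1', '/runs/batch2'],
     output_dir='/runs/heatmaps',
     threads=8,
     fp_size=Params.L)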