Ejemplo n.º 1
0
def main():
    """Command-line entry point: pick the target BED and run Seq2C on it.

    Configuration, samples, BED path and output directory come from
    ``proc_args(sys.argv)``. Unless ``cnf.prep_bed`` is ``False``:

    * a missing BED is replaced by the genome CDS BED;
    * a BED with fewer than 4 columns is passed through ``prepare_beds``;
    * the resulting BED is copied into ``output_dir`` under the base name of
      the BED chosen before preparation (a failed copy is logged, not fatal).

    The BED is then verified as critical and handed to ``run_seq2c``.
    """
    cnf, samples, bed_fpath, output_dir = proc_args(sys.argv)
    info('Processing ' + str(len(samples)) + ' samples')

    prep_requested = cnf.prep_bed is not False
    if prep_requested:
        if not bed_fpath:
            info('No input BED is specified, using CDS instead from ' + str(cnf.genome.cds))
            bed_fpath = verify_bed(cnf.genome.cds, 'CDS bed file for ' + cnf.genome.name)

        # Capture the file name now: prepare_beds() below may replace bed_fpath.
        seq2c_bed_fname = basename(bed_fpath)

        if count_bed_cols(bed_fpath) < 4:
            check_genome_resources(cnf)
            prepared = prepare_beds(cnf, None, None, bed_fpath)
            _, _, _, bed_fpath = prepared

        saved_bed_fpath = join(output_dir, seq2c_bed_fname)
        try:
            copyfile(bed_fpath, saved_bed_fpath)
        except OSError:
            # Best effort: report the traceback and carry on with the original path.
            err(format_exc())
            info()
        else:
            info('Seq2C bed file is saved in ' + saved_bed_fpath)

    bed_fpath = verify_bed(bed_fpath, is_critical=True, description='Input BED file')
    info('Using target ' + bed_fpath)

    run_seq2c(cnf, output_dir, samples, bed_fpath, cnf.is_wgs)
Ejemplo n.º 2
0
def process_one(cnf, output_dir, bam_fpath, features_bed,
                features_no_genes_bed):
    """Run the TargQC steps for one sample and return its reports.

    Args:
        cnf: run configuration; this function reads ``sample``, ``bed``,
            ``bam``, ``l_fpath``, ``r_fpath`` and ``prep_bed`` from it.
        output_dir: directory handed to the sample object and report makers.
        bam_fpath: must be non-empty. Note that the alignment actually
            verified and processed is ``sample.bam`` (built from ``cnf.bam``).
        features_bed: features BED passed on to the BED preparation and
            gene-extraction helpers.
        features_no_genes_bed: companion BED passed to the same helpers.

    Returns:
        The ``reports`` value produced by ``make_targqc_reports``.
    """
    sample = TargQC_Sample(cnf.sample, output_dir, bed=cnf.bed, bam=cnf.bam)
    sample.l_fpath = cnf.l_fpath
    sample.r_fpath = cnf.r_fpath

    # NOTE: building the BAM from the FASTQ pair (proc_fastq) is disabled here,
    # so an alignment has to be supplied up front.

    # Validate before logging: concatenating a missing sample.bam into the log
    # message would raise a bare TypeError instead of this explicit error.
    # critical() is presumably fatal -- confirm against its definition.
    if not bam_fpath or not sample.bam:
        critical(sample.name + ': BAM file is required.')

    info('Using alignment ' + sample.bam)

    target_bed = verify_file(cnf.bed, is_critical=True) if cnf.bed else None
    bam_fpath = verify_file(sample.bam, is_critical=True)
    index_bam(cnf, bam_fpath)

    if cnf.prep_bed is not False:
        info('Preparing the BED file.')
        features_bed, features_no_genes_bed, target_bed, _ = prepare_beds(
            cnf, features_bed, target_bed)

        # Keep the filtered BEDs returned alongside the gene list.
        _, gene_keys_list, target_bed, features_bed, features_no_genes_bed = \
            extract_gene_names_and_filter_exons(cnf, target_bed, features_bed, features_no_genes_bed)
    else:
        info('The BED file is ready, skipping preparing.')
        # Only the gene list is taken; the BEDs are used as given.
        _, gene_keys_list, _, _, _ = \
            extract_gene_names_and_filter_exons(cnf, target_bed, features_bed, features_no_genes_bed)

    picard_ins_size_hist(cnf, sample, bam_fpath, output_dir)

    _, _, reports = make_targqc_reports(
        cnf, output_dir, sample, bam_fpath, features_bed,
        features_no_genes_bed, target_bed, gene_keys_list)

    # NOTE: the flagged regions report (generate_flagged_regions_report) is
    # currently disabled.

    return reports
Ejemplo n.º 3
0
def run_seq2c_bcbio_structure(cnf, bcbio_structure):
    """Run Seq2C over all samples of a bcbio project.

    The Seq2C BED is chosen as follows:

    * ``cnf.prep_bed is False``: ``cnf.bed`` is verified and used as is;
    * otherwise, when ``cnf.bed`` or ``bcbio_structure.bed`` is set, the BED
      comes from ``prepare_beds``;
    * otherwise the genome CDS BED is verified and used.

    Returns:
        A one-element list holding the value returned by ``run_seq2c``.
    """
    step_greetings('Coverage statistics for each gene for all samples')

    if cnf.prep_bed is False:
        seq2c_bed = verify_bed(cnf.bed)
    else:
        info('Preparing BED files')
        features_bed_fpath = cnf.features or cnf.genome.features  # only for annotation
        if not (cnf.bed or bcbio_structure.bed):
            seq2c_bed = verify_bed(cnf.genome.cds)
        else:
            prepared = prepare_beds(
                cnf,
                features_bed=features_bed_fpath,
                target_bed=bcbio_structure.bed,
                seq2c_bed=bcbio_structure.sv_bed)
            _, _, _, seq2c_bed = prepared

    info('Calculating normalized coverages for CNV...')
    cnv_dirpath = join(bcbio_structure.date_dirpath, BCBioStructure.cnv_dir)
    cnv_report_fpath = run_seq2c(
        cnf, cnv_dirpath, bcbio_structure.samples, seq2c_bed, is_wgs=cnf.is_wgs)

    # NOTE: per-sample Seq2C plotting (draw_seq2c_plot) is currently disabled.

    info()
    info('*' * 70)
    if cnv_report_fpath:
        info('Seq2C:')
        info('   ' + cnv_report_fpath)

    return [cnv_report_fpath]
Ejemplo n.º 4
0
def summarize_targqc(cnf, summary_threads, output_dir, samples,
                     bed_fpath=None, features_fpath=None, tag_by_sample=None):
    """Build the cross-sample TargQC summary and log where it was saved.

    Args:
        cnf: run configuration; ``coverage_reports.depth_thresholds`` and
            ``extended`` are read here.
        summary_threads: accepted for interface compatibility; not used in
            this function body.
        output_dir: directory passed to the report builders.
        samples: iterable of sample objects exposing
            ``targetcov_detailed_tsv``.
        bed_fpath: optional target BED path.
        features_fpath: optional features BED path.
        tag_by_sample: forwarded to the HTML report builder.

    Returns:
        The HTML report path returned by ``_make_tarqc_html_report``.
    """
    step_greetings('TargQC coverage statistics for all samples')

    # All samples are accepted: the per-sample completeness checks
    # (targetcov / ngscat / qualimap) are currently disabled.
    samples = list(samples)

    txt_fpath, tsv_fpath, html_fpath = _make_tarqc_html_report(
        cnf, output_dir, samples, bed_fpath, tag_by_sample=tag_by_sample)

    # The per-gene "best" report needs at least one detailed TSV on disk.
    has_detailed_tsv = any(
        verify_file(s.targetcov_detailed_tsv, silent=True) for s in samples)
    if has_detailed_tsv:
        best_for_regions_fpath = _save_best_details_for_each_gene(
            cnf.coverage_reports.depth_thresholds, samples, output_dir)
    else:
        best_for_regions_fpath = None
    ''' 1. best_regions = get_best_regions()
        2. best_for_regions_fpath = save_per_region_report()
        3. calc median coverage across best regions
        4. flagged_regions_report_fpath = _generate_flagged_regions_report(
             output_dir, 'Best', average_coverage, genes, depth_threshs)
    '''

    if cnf.extended:
        if features_fpath and bed_fpath:
            # The returned paths are not used further in this function.
            _, _, _, _ = prepare_beds(cnf, features_fpath, bed_fpath)
        else:
            err('For the extended analysis, capture and features BED files are required!')

    info()
    info('*' * 70)
    if html_fpath or txt_fpath:
        info('TargQC summary saved in: ')
        for report_fpath in (txt_fpath, html_fpath):
            if report_fpath:
                info('  ' + report_fpath)
    else:
        info('TargQC summary was not generated, because there were no reports generated for individual samples.')

    if best_for_regions_fpath:
        info()
        info('Best stats for regions saved in:')
        info('  ' + best_for_regions_fpath)

    return html_fpath
Ejemplo n.º 5
0
def run_targqc(cnf, output_dir, samples, target_bed, features_bed,
               genes_fpath=None):
    """Submit one TargQC job per sample, wait for them, then summarize.

    Unless ``cnf.only_summary`` is set, the BED files are prepared, a gene
    list is written to ``<cnf.work_dir>/genes.txt`` when ``genes_fpath`` is
    not given, and a job is submitted for every sample.

    Args:
        cnf: run configuration; ``threads``, ``only_summary``, ``work_dir``
            and ``downsampled`` are read here.
        output_dir: directory forwarded to ``summarize_targqc``.
        samples: sized collection of sample objects.
        target_bed: target BED path (replaced by the prepared one when the
            per-sample stage runs).
        features_bed: features BED path (likewise replaced when prepared).
        genes_fpath: optional path to an existing gene list.

    Returns:
        Whatever ``summarize_targqc`` returns.
    """
    def _job_input_params(smp):
        # Command-line fragment describing the sample's inputs: the BAM when
        # present, otherwise the FASTQ pair, plus the optional downsampling flags.
        if smp.bam:
            params = ' --bam ' + smp.bam
        elif smp.l_fpath and smp.r_fpath:
            params = ' -1 ' + smp.l_fpath + ' -2 ' + smp.r_fpath
        else:
            params = ''
        if cnf.downsampled and smp.fastqc_dirpath:
            params += ' --downsampled --fastqc-dirpath ' + smp.fastqc_dirpath
        return params

    max_threads = cnf.threads
    threads_per_sample = 1  # max(max_threads / len(samples), 1)
    summary_threads = min(len(samples), max_threads)
    info('Number of threads to run summary: ' + str(summary_threads))

    jobs_to_wait = []
    if not cnf.only_summary:
        original_target_bed = target_bed
        features_bed, features_no_genes_bed, target_bed, _ = prepare_beds(
            cnf, features_bed, target_bed)
        _, gene_keys_list, target_bed, features_bed, features_no_genes_bed = \
            extract_gene_names_and_filter_exons(cnf, target_bed, features_bed, features_no_genes_bed)

        if not genes_fpath:
            genes_fpath = join(cnf.work_dir, 'genes.txt')
            gene_lines = [g + '\t' + c for g, c in gene_keys_list]
            with open(genes_fpath, 'w') as out:
                out.write('\n'.join(gene_lines))

        info('*' * 70)
        info()

        step = _prep_steps(cnf, threads_per_sample, summary_threads, samples,
                           target_bed, original_target_bed, features_bed,
                           features_no_genes_bed, genes_fpath)

        # Collected per sample but not read anywhere in this function.
        summary_wait_for_steps = []

        for sample in samples:
            sample_label = basename(sample.name)
            info('Processing ' + sample_label)

            job = _submit_job(
                cnf, step, sample.name,
                threads=threads_per_sample,
                input_params=_job_input_params(sample),
                targqc_dirpath=sample.targqc_dirpath)
            jobs_to_wait.append(job)
            summary_wait_for_steps.append(step.job_name(sample.name))

            info('Done ' + basename(sample.name))
            info()

    wait_for_jobs(cnf, jobs_to_wait)

    info('Making targqc summary')
    return summarize_targqc(
        cnf, summary_threads, output_dir, samples,
        bed_fpath=target_bed, features_fpath=features_bed)