Ejemplo n.º 1
0
def main():
    """Entry point: prepare the Seq2C target BED and launch Seq2C.

    Reads the configuration, samples, optional BED path and output directory
    from the command line via ``proc_args``. Unless BED preparation is disabled
    (``cnf.prep_bed`` is False), it:

    * falls back to the genome CDS BED when no BED was given;
    * runs ``prepare_beds`` on BED files that have fewer than 4 columns;
    * saves a copy of the resulting BED into the output directory under the
      original file name (best effort: a failed copy is logged, not fatal).

    Finally the BED is verified (critically) and passed to ``run_seq2c``.
    """
    cnf, samples, bed_fpath, output_dir = proc_args(sys.argv)
    info('Processing ' + str(len(samples)) + ' samples')

    if cnf.prep_bed is not False:
        if not bed_fpath:
            info('No input BED is specified, using CDS instead from ' + str(cnf.genome.cds))
            bed_fpath = verify_bed(cnf.genome.cds, 'CDS bed file for ' + cnf.genome.name)

        # Remember the original file name before prepare_beds() may replace the path.
        seq2c_bed_fname = basename(bed_fpath)
        saved_bed_fpath = join(output_dir, seq2c_bed_fname)

        if count_bed_cols(bed_fpath) < 4:
            check_genome_resources(cnf)
            _, _, _, bed_fpath = prepare_beds(cnf, None, None, bed_fpath)

        try:
            copyfile(bed_fpath, saved_bed_fpath)
        except EnvironmentError:
            # EnvironmentError covers OSError, IOError and shutil.Error on
            # Python 2 and is an alias of OSError on Python 3, so a failed copy
            # is always logged here instead of aborting the run.
            err(format_exc())
            info()
        else:
            info('Seq2C bed file is saved in ' + saved_bed_fpath)

    bed_fpath = verify_bed(bed_fpath, is_critical=True, description='Input BED file')
    info('Using target ' + bed_fpath)

    run_seq2c(cnf, output_dir, samples, bed_fpath, cnf.is_wgs)
Ejemplo n.º 2
0
def proc_args(argv):
    """Parse the Seq2C command line and build the run configuration.

    :param argv: full argument vector including the program name at index 0
        (the visible caller passes ``sys.argv``).
    :return: tuple ``(cnf, samples, target_bed, output_dir)`` where ``samples``
        is a sorted list of ``source.TargQC_Sample`` objects and ``target_bed``
        is the verified ``--bed`` path or ``None`` when no BED was given.

    Exits with status 1 after printing the usage when no positional arguments
    are supplied.
    """
    info(' '.join(argv))
    info()

    description = 'This script generates target QC reports for each BAM provided as an input. ' \
                  'Usage: ' + basename(__file__) + ' sample2bam.tsv --bed target.bed --controls sample1:sample2 -o results_dir'
    parser = OptionParser(description=description, usage=description)
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser)
    parser.add_option('-o', dest='output_dir', metavar='DIR', default=join(os.getcwd(), 'seq2c'))
    parser.add_option('--bed', dest='bed', help='BED file to run Seq2C analysis')
    parser.add_option('-c', '--controls', dest='controls', help='Optional control sample names for Seq2C. For multiple controls, separate them using :')
    parser.add_option('--seq2c-opts', dest='seq2c_opts', help='Options for the final lr2gene.pl script.')
    parser.add_option('--no-prep-bed', dest='prep_bed', help=SUPPRESS_HELP, action='store_false', default=True)

    # Parse the vector we were handed rather than silently re-reading sys.argv;
    # argv[0] is the program name and is skipped, as optparse does by default.
    (opts, args) = parser.parse_args(argv[1:])
    logger.is_debug = opts.debug

    if not args:
        parser.print_usage()
        sys.exit(1)

    if len(args) == 1 and not args[0].endswith('.bam'):
        # A single non-BAM argument is read as a sample-to-BAM table.
        sample_names, bam_fpaths = read_samples(
            verify_file(args[0], is_critical=True, description='Input sample2bam.tsv'))
        bam_by_sample = OrderedDict(zip(sample_names, bam_fpaths))
    else:
        bam_by_sample = find_bams(args)

    # No --bed option means the run is configured as WGS.
    run_cnf = determine_run_cnf(opts, is_wgs=not opts.__dict__.get('bed'))
    cnf = Config(opts.__dict__, determine_sys_cnf(opts), run_cnf)
    check_genome_resources(cnf)

    cnf.output_dir = adjust_path(cnf.output_dir)
    verify_dir(dirname(cnf.output_dir), is_critical=True)
    safe_mkdir(cnf.output_dir)

    if not cnf.project_name:
        cnf.project_name = basename(cnf.output_dir)
    info('Project name: ' + cnf.project_name)

    cnf.proc_name = 'Seq2C'
    set_up_dirs(cnf)

    samples = [
        source.TargQC_Sample(name=s_name, dirpath=join(cnf.output_dir, s_name), bam=bam_fpath)
        for s_name, bam_fpath in bam_by_sample.items()
    ]
    info('Samples: ')
    for s in samples:
        info('  ' + s.name)
    samples.sort(key=lambda _s: _s.key_to_sort())

    target_bed = verify_bed(cnf.bed, is_critical=True) if cnf.bed else None

    if not cnf.only_summary:
        cnf.qsub_runner = adjust_system_path(cnf.qsub_runner)
        if not cnf.qsub_runner:
            critical('Error: qsub-runner is not provided in sys-config.')
        verify_file(cnf.qsub_runner, is_critical=True)

    return cnf, samples, target_bed, cnf.output_dir
 def _verify_input_file(_key):
     """Normalise the path stored under ``cnf[_key]`` and validate it.

     The path is rewritten in ``cnf`` via ``adjust_path`` and checked with
     ``verify_file``; keys containing 'bam' are additionally checked with
     ``verify_bam`` and keys containing 'bed' with ``verify_bed``.

     Returns True when every applicable check passes, False otherwise.
     """
     cnf[_key] = adjust_path(cnf[_key])

     if not verify_file(cnf[_key], _key):
         return False

     looks_like_bam = 'bam' in _key
     if looks_like_bam and not verify_bam(cnf[_key]):
         return False

     looks_like_bed = 'bed' in _key
     if looks_like_bed and not verify_bed(cnf[_key]):
         return False

     return True
Ejemplo n.º 4
0
def run_seq2c_bcbio_structure(cnf, bcbio_structure):
    """Run Seq2C for all samples of a bcbio project.

    Chooses the BED file for Seq2C (prepared from the project BEDs, the genome
    CDS BED, or ``cnf.bed`` as is when BED preparation is disabled), runs
    ``run_seq2c`` into the project's CNV directory and logs the report path.

    :return: single-element list holding the value returned by ``run_seq2c``.
    """
    step_greetings('Coverage statistics for each gene for all samples')

    if cnf.prep_bed is False:
        # BED preparation disabled: use the configured BED untouched.
        seq2c_bed = verify_bed(cnf.bed)
    else:
        info('Preparing BED files')
        features_bed_fpath = cnf.features or cnf.genome.features  # only for annotation
        if not (cnf.bed or bcbio_structure.bed):
            seq2c_bed = verify_bed(cnf.genome.cds)
        else:
            _, _, _, seq2c_bed = prepare_beds(
                cnf,
                features_bed=features_bed_fpath,
                target_bed=bcbio_structure.bed,
                seq2c_bed=bcbio_structure.sv_bed)

    info('Calculating normalized coverages for CNV...')
    cnv_dirpath = join(bcbio_structure.date_dirpath, BCBioStructure.cnv_dir)
    cnv_report_fpath = run_seq2c(
        cnf, cnv_dirpath, bcbio_structure.samples, seq2c_bed, is_wgs=cnf.is_wgs)

    # NOTE: per-sample plotting of the Seq2C report (draw_seq2c_plot, gated on
    # matplotlib being available) is currently disabled.

    info()
    info('*' * 70)
    if cnv_report_fpath:
        info('Seq2C:')
        info('   ' + cnv_report_fpath)

    return [cnv_report_fpath]
Ejemplo n.º 5
0
def get_bed_targqc_inputs(cnf, bed_fpath=None):
    """Resolve the BED, features and custom-genes inputs for TargQC.

    :param cnf: run configuration; ``features``, ``genome.features`` and
        ``genes`` are read from it.
    :param bed_fpath: optional target BED path; verified critically when given.
    :return: tuple ``(bed_fpath, features_bed_fpath, genes_fpath)``;
        ``genes_fpath`` is ``None`` when ``cnf.genes`` is not set.
    """
    if bed_fpath:
        bed_fpath = verify_bed(
            bed_fpath, description='Input BED file', is_critical=True)
        info('Using amplicons/capture panel ' + bed_fpath)

    # An explicitly configured features file wins over the genome default.
    features_source = cnf.features or cnf.genome.features
    features_bed_fpath = adjust_path(features_source)
    if features_bed_fpath:
        info('Features: ' + features_bed_fpath)

    custom_genes = cnf.genes
    if not custom_genes:
        return bed_fpath, features_bed_fpath, None

    genes_fpath = adjust_path(custom_genes)
    info('Custom genes list: ' + genes_fpath)
    return bed_fpath, features_bed_fpath, genes_fpath
Ejemplo n.º 6
0
def main():
    """Command-line entry point: sort a BED file.

    Expects one positional argument (the input BED) plus ``-o`` for the output
    path and ``-g`` for the genome. Prints help to stderr and exits with
    status 1 when the input BED is missing; reports the usage through
    ``critical`` when no output path was given.
    """
    usage_text = ('Usage: ' + basename(__file__) +
                  ' -o Output_BED_file -g hg19 Input_BED_file')
    parser = OptionParser(usage=usage_text)
    parser.add_option('-o', '--output-bed', dest='output_fpath')
    parser.add_option('-g', '--genome', dest='genome')
    options, positional = parser.parse_args(sys.argv[1:])

    if not positional:
        parser.print_help(file=sys.stderr)
        sys.exit(1)

    cnf = Config(options.__dict__, determine_sys_cnf(options), {})
    check_genome_resources(cnf)

    if not cnf.output_fpath:
        critical(parser.usage)

    input_bed_fpath = verify_bed(positional[0], is_critical=True)
    output_bed_fpath = adjust_path(cnf.output_fpath)
    sort_bed(cnf, input_bed_fpath, output_bed_fpath)
Ejemplo n.º 7
0
def main():
    """Command-line entry point: generate TargQC reports for the given BAMs.

    Positional arguments are BAM file paths. The function parses options,
    validates every BAM, builds the run configuration, resolves the target
    BED, the features BED and the optional custom genes list, then calls
    ``run_targqc`` and e-mails the resulting report path when one is returned.
    """
    info(' '.join(sys.argv))
    info()

    description = 'This script generates target QC reports for each BAM provided as an input.'
    parser = OptionParser(description=description)
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser, threads=1)
    parser.add_option('--work-dir', dest='work_dir', metavar='DIR')
    parser.add_option('--log-dir', dest='log_dir')
    parser.add_option('--only-summary',
                      dest='only_summary',
                      action='store_true')
    parser.add_option('-o',
                      dest='output_dir',
                      metavar='DIR',
                      default=join(os.getcwd(), 'targetqc'))
    parser.add_option('--reannotate',
                      dest='reannotate',
                      action='store_true',
                      default=False,
                      help='re-annotate BED file with gene names')
    parser.add_option('--dedup',
                      dest='dedup',
                      action='store_true',
                      default=False,
                      help='count duplicates in coverage metrics')
    parser.add_option('--bed',
                      dest='bed',
                      help='BED file to run targetSeq and Seq2C analysis on.')
    parser.add_option(
        '--exons',
        '--exome',
        '--features',
        dest='features',
        help=
        'Annotated CDS/Exon/Gene/Transcripts BED file to make targetSeq exon/amplicon regions reports.'
    )

    (opts, args) = parser.parse_args()
    logger.is_debug = opts.debug

    if not args:
        critical('No BAMs provided to input.')

    # Drop duplicate paths but keep the order given on the command line, so
    # that runs are reproducible (a plain set() would scramble the order).
    seen_fpaths = set()
    bam_fpaths = []
    for arg in args:
        fpath = abspath(arg)
        if fpath not in seen_fpaths:
            seen_fpaths.add(fpath)
            bam_fpaths.append(fpath)

    bad_bam_fpaths = [fpath for fpath in bam_fpaths if not verify_bam(fpath)]
    if bad_bam_fpaths:
        critical('BAM files cannot be found, empty or not BAMs: ' +
                 ', '.join(bad_bam_fpaths))

    # No --bed option means the run is configured as WGS.
    run_cnf = determine_run_cnf(opts, is_wgs=not opts.__dict__.get('bed'))
    cnf = Config(opts.__dict__, determine_sys_cnf(opts), run_cnf)

    if not cnf.project_name:
        cnf.project_name = basename(cnf.output_dir)
    info('Project name: ' + cnf.project_name)

    cnf.proc_name = 'TargQC'
    set_up_dirs(cnf)

    check_genome_resources(cnf)

    # NOTE(review): the BED is verified critically here although the run
    # config above allows for a missing --bed (WGS) -- confirm the intent.
    verify_bed(cnf.bed, is_critical=True)
    bed_fpath = adjust_path(cnf.bed)
    info('Using amplicons/capture panel ' + bed_fpath)

    # An explicitly configured features file wins over the genome default.
    features_bed_fpath = adjust_path(cnf.features or cnf.genome.features)
    if features_bed_fpath:
        # Guarded so that a missing features path does not raise on logging.
        info('Features: ' + features_bed_fpath)

    genes_fpath = None
    if cnf.genes:
        genes_fpath = adjust_path(cnf.genes)
        info('Custom genes list: ' + genes_fpath)

    if not cnf.only_summary:
        cnf.qsub_runner = adjust_system_path(cnf.qsub_runner)
        if not cnf.qsub_runner:
            critical('Error: qsub-runner is not provided in sys-config.')
        verify_file(cnf.qsub_runner, is_critical=True)

    info('*' * 70)
    info()

    targqc_html_fpath = run_targqc(cnf, cnf.output_dir, bam_fpaths, bed_fpath,
                                   features_bed_fpath, genes_fpath)
    if targqc_html_fpath:
        send_email(
            cnf, 'TargQC report for ' + cnf.project_name + ':\n  ' +
            targqc_html_fpath)
Ejemplo n.º 8
0
def main():
    """Command-line entry point: annotate a BED file with reference features.

    The first command-line argument is the input BED. Its regions are
    annotated against the reference features in priority order (CDS, Exon,
    Transcript, Gene): each pass only processes the regions the previous pass
    left unannotated. Regions still unannotated after the last pass are
    appended as they are, and everything is written, sorted by
    ``region.get_key()``, to ``cnf.output_file``.
    """
    # The input BED is a required positional argument; check the argument
    # count before touching sys.argv[1].
    if len(sys.argv) < 2:
        critical('Usage: ' + __file__ +
                 ' Input_BED_file -g hg19 -o Annotated_BED_file')
    input_bed_fpath = verify_bed(sys.argv[1],
                                 is_critical=True,
                                 description='Input BED file for ' + __file__)

    cnf = read_opts_and_cnfs(
        description=
        'Annotating BED file based on reference features annotations.',
        extra_opts=[
            (['--reference'], dict(dest='reference')),
        ],
        required_keys=['output_file'],
        file_keys=['reference'],
        key_for_sample_name=None,
        fpath_for_sample_name=input_bed_fpath,
        main_output_is_file=True)
    check_system_resources(cnf)
    check_genome_resources(cnf)

    chr_order = get_chrom_order(cnf)

    features_fpath = adjust_path(cnf.genome.bed_annotation_features)
    if not verify_bed(features_fpath, 'Annotated reference BED file'):
        critical('Annotated reference is required')

    # Only the coordinates (first three columns) of the input are used.
    bed = BedTool(input_bed_fpath).cut([0, 1, 2])

    info()

    annotated = None
    off_targets = None

    for feature in ['CDS', 'Exon', 'Transcript', 'Gene']:
        if not bed:
            # Nothing to annotate (empty input).
            break

        info('Extracting ' + feature + ' features from ' + features_fpath)
        features_bed = BedTool(features_fpath).filter(
            lambda x: x[6] == feature)

        info('Annotating based on ' + feature)
        new_annotated, off_targets = _annotate(cnf, bed, features_bed,
                                               chr_order)
        if not annotated:
            annotated = new_annotated
            for a in annotated:
                a.feature = feature
        else:
            annotated.extend(new_annotated)

        if not off_targets:
            # Every remaining region was annotated in this pass. `bed` still
            # holds those regions, so continuing would annotate them again
            # against lower-priority features and duplicate them in the output.
            break

        # Only the regions that missed this feature level go to the next one.
        bed = BedTool([(r.chrom, r.start, r.end) for r in off_targets])
        info()

    if annotated is not None and off_targets is not None:
        annotated.extend(off_targets)

    info()
    info('Saving annotated regions to ' + str(cnf.output_file))
    with open(cnf.output_file, 'w') as out:
        # `annotated` is still None when the input BED had no regions.
        for region in sorted(annotated or [], key=lambda r: r.get_key()):
            out.write(str(region))

    info('Done.')