Example #1
0
def main():
    cnf, samples, bed_fpath, output_dir = proc_args(sys.argv)
    info('Processing ' + str(len(samples)) + ' samples')

    if cnf.prep_bed is not False:
        if not bed_fpath:
            info('No input BED is specified, using CDS instead from ' + str(cnf.genome.cds))
            bed_fpath = verify_bed(cnf.genome.cds, 'CDS bed file for ' + cnf.genome.name)

        seq2c_bed_fname = basename(bed_fpath)

        bed_cols = count_bed_cols(bed_fpath)
        if bed_cols < 4:
            check_genome_resources(cnf)
            _, _, _, bed_fpath = prepare_beds(cnf, None, None, bed_fpath)

        try:
            copyfile(bed_fpath, join(output_dir, seq2c_bed_fname))
        except OSError:
            err(format_exc())
            info()
        else:
            info('Seq2C bed file is saved in ' + join(output_dir, seq2c_bed_fname))

    bed_fpath = verify_bed(bed_fpath, is_critical=True, description='Input BED file')
    info('Using target ' + bed_fpath)

    run_seq2c(cnf, output_dir, samples, bed_fpath, cnf.is_wgs)
def main():
    cnf = read_opts_and_cnfs(
        description='Plotting Seq2C results.',
        extra_opts=[
            (['--seq2c-results'], dict(
                dest='seq2c_tsv_fpath')
             ),
            (['--key-genes'], dict(
                dest='key_genes_fpath')
             ),
        ],
        required_keys=['seq2c_tsv_fpath', 'output_dir'],
        file_keys=['seq2c_tsv_fpath', 'key_genes'],
        key_for_sample_name=None,
    )
    check_system_resources(cnf)
    check_genome_resources(cnf)

    key_gene_names = None
    if cnf.key_genes_fpath:
        with open(cnf.key_genes_fpath) as f:
            key_gene_names = set([l.strip() for l in f.readlines() if l.strip() != ''])

    plot_fpath = draw_seq2c_plot(cnf, cnf.seq2c_tsv_fpath, cnf.sample, cnf.output_dir, key_gene_names)
    if plot_fpath:
        info('Saved plot to ' + plot_fpath)
Example #3
0
def proc_args(argv):
    info(' '.join(sys.argv))
    info()

    description = 'This script generates target QC reports for each BAM provided as an input. ' \
                  'Usage: ' + basename(__file__) + ' sample2bam.tsv --bed target.bed --contols sample1:sample2 -o results_dir'
    parser = OptionParser(description=description, usage=description)
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser)
    parser.add_option('-o', dest='output_dir', metavar='DIR', default=join(os.getcwd(), 'seq2c'))
    parser.add_option('--bed', dest='bed', help='BED file to run Seq2C analysis')
    parser.add_option('-c', '--controls', dest='controls', help='Optional control sample names for Seq2C. For multiple controls, separate them using :')
    parser.add_option('--seq2c-opts', dest='seq2c_opts', help='Options for the final lr2gene.pl script.')
    parser.add_option('--no-prep-bed', dest='prep_bed', help=SUPPRESS_HELP, action='store_false', default=True)

    (opts, args) = parser.parse_args()
    logger.is_debug = opts.debug

    if len(args) == 0:
        parser.print_usage()
        sys.exit(1)
    if len(args) == 1 and not args[0].endswith('.bam'):
        sample_names, bam_fpaths = read_samples(verify_file(args[0], is_critical=True, description='Input sample2bam.tsv'))
        bam_by_sample = OrderedDict()
        for s, b in zip(sample_names, bam_fpaths):
            bam_by_sample[s] = b
    else:
        bam_by_sample = find_bams(args)

    run_cnf = determine_run_cnf(opts, is_wgs=not opts.__dict__.get('bed'))
    cnf = Config(opts.__dict__, determine_sys_cnf(opts), run_cnf)
    check_genome_resources(cnf)

    cnf.output_dir = adjust_path(cnf.output_dir)
    verify_dir(dirname(cnf.output_dir), is_critical=True)
    safe_mkdir(cnf.output_dir)

    if not cnf.project_name:
        cnf.project_name = basename(cnf.output_dir)
    info('Project name: ' + cnf.project_name)

    cnf.proc_name = 'Seq2C'
    set_up_dirs(cnf)

    samples = [
        source.TargQC_Sample(name=s_name, dirpath=join(cnf.output_dir, s_name), bam=bam_fpath)
            for s_name, bam_fpath in bam_by_sample.items()]
    info('Samples: ')
    for s in samples:
        info('  ' + s.name)
    samples.sort(key=lambda _s: _s.key_to_sort())

    target_bed = verify_bed(cnf.bed, is_critical=True) if cnf.bed else None

    if not cnf.only_summary:
        cnf.qsub_runner = adjust_system_path(cnf.qsub_runner)
        if not cnf.qsub_runner: critical('Error: qsub-runner is not provided is sys-config.')
        verify_file(cnf.qsub_runner, is_critical=True)

    return cnf, samples, target_bed, cnf.output_dir
Example #4
0
def main(args):
    cnf = read_opts_and_cnfs(
        extra_opts=[
            (['--vcf', '--var'], dict(
                dest='vcf',
                help='variants to annotate')
             ),
            (['--bam'], dict(
                dest='bam',
                help='(outdated) used to generate some annotations by GATK')
             ),
            (['--match-normal-sample-name'], dict(
                dest='match_normal_normal_name')
             ),
            (['--clinical-reporting'], dict(
                dest='clinical_reporting',
                help='used to generate some annotations by GATK',
                action='store_true',
                default=None)
             ),
            (['--qc'], dict(
                dest='qc',
                action='store_true',
                default=True,
                help=SUPPRESS_HELP)
             ),
            (['--no-qc'], dict(
                dest='qc',
                action='store_false',
                help=SUPPRESS_HELP)
             ),
        ],
        required_keys=['vcf'],
        file_keys=['bam', 'vcf'],
        key_for_sample_name='vcf',
        proc_name=source.varannotate_name)

    check_system_resources(cnf,
        required=['java', 'perl', 'snpeff'],
        optional=['transcripts_fpath'])

    check_genome_resources(cnf)

    # info('Using variants ' + cnf['vcf'])
    # info('Using alignement ' + cnf['bam'])

    run_one(cnf, process_one, finalize_one)

    if not cnf['keep_intermediate']:
        shutil.rmtree(cnf['work_dir'])
Example #5
0
def main():
    cnf = read_opts_and_cnfs(
        extra_opts=[
            (['--targqc-dir'], dict(dest='targqc_dirpath', )),
            (['--mutations'], dict(dest='mutations_fpath', )),
            (['--sv'], dict(dest='sv_fpath', )),
            (['--sv-vcf'], dict(dest='sv_vcf_fpath', )),
            (['--varqc'], dict(dest='varqc_json_fpath', )),
            (['--varqc-after'], dict(dest='varqc_after_json_fpath', )),
            (['--target-type'], dict(
                dest='target_type',
                default='panel',
            )),
            (['--bed'], dict(dest='bed_fpath', )),
            (['--seq2c'], dict(dest='seq2c_tsv_fpath', )),
            (['--project-level-report'], dict(dest='project_report_path', )),
            (['--targqc-html'], dict(dest='targqc_report_path', )),
            (['--match'], dict(dest='match_sample_name', )),
            (['--jira'], dict(dest='jira_url', )),
        ],
        key_for_sample_name=None,
        required_keys=[],
        file_keys=[
            'mutations_fpath',
            'varqc_json_fpath',
            'varqc_after_json_fpath',
            'bed_fpath',
            'seq2c_tsv_fpath',
            'sv_fpath',
            'sv_vcf_fpath',
            #'project_report_path',  # DO NOT UNCOMMENT! Project level report might not yet exist
        ],
        # do not check mutations_fpath! could be either of:
        #   vardict.PASS.txt,
        #   vardict-java.PASS.txt,
        #   vardict.single.PASS.txt,
        #   vardict.paired.PASS.txt,
        dir_keys=['targqc_dirpath'],
    )

    check_genome_resources(cnf)
    check_system_resources(cnf, required=['bedtools'], optional=[])

    clin_info = clinical_sample_info_from_cnf(cnf)
    html_fpath = make_clinical_report(cnf, clin_info,
                                      clin_info.sample.clinical_html)
    info('Clinical report: ' + html_fpath)

    if not cnf['keep_intermediate']:
        shutil.rmtree(cnf['work_dir'])
Example #6
0
def proc_args(argv):
    cnf = read_opts_and_cnfs(
        extra_opts=[
            (['--bam'], dict(dest='bam', )),
        ],
        required_keys=['bam'],
        file_keys=['bam'],
    )

    check_genome_resources(cnf)

    if not cnf.bam:
        critical('No bam file provided to input')
    if not cnf.genome:
        critical('Please, specify the --genome option (e.g. --genome hg19)')

    return cnf
def main():
    parser = OptionParser(usage='Usage: ' + basename(__file__) +
                          ' -o Output_BED_file -g hg19 Input_BED_file')
    parser.add_option('-o', '--output-bed', dest='output_fpath')
    parser.add_option('-g', '--genome', dest='genome')
    (opts, args) = parser.parse_args(sys.argv[1:])

    if len(args) < 1:
        parser.print_help(file=sys.stderr)
        sys.exit(1)
    cnf = Config(opts.__dict__, determine_sys_cnf(opts), {})

    check_genome_resources(cnf)

    if not cnf.output_fpath:
        critical(parser.usage)

    sort_bed(cnf, verify_bed(args[0], is_critical=True),
             adjust_path(cnf.output_fpath))
Example #8
0
def main(args):
    cnf = read_opts_and_cnfs(
        extra_opts=[
            (['--var', '--vcf'], dict(
                dest='vcf',
                help='variants to evaluate')
             ),
        ],
        required_keys=['vcf'],
        file_keys=['vcf'],
        key_for_sample_name='vcf',
        proc_name=source.varqc_name,
    )

    check_system_resources(cnf)

    check_genome_resources(cnf)

    info('Using variants ' + cnf['vcf'])

    run_one(cnf, process_one, finalize_one)

    if not cnf['keep_intermediate']:
        shutil.rmtree(cnf['work_dir'])
def create_oncoprints_link(cnf, bcbio_structure, project_name=None):
    if is_us(): loc = exposing.us
    # elif is_uk(): loc = exposing.uk
    else:
        loc = exposing.local
        return None

    if not bcbio_structure.variant_callers:
        info('No varianting calling performed, not generating Oncoprints')
        return None
    clinical_report_caller = \
        bcbio_structure.variant_callers.get('vardict') or \
        bcbio_structure.variant_callers.get('vardict-java')
    if not clinical_report_caller:
        err('Warning: vardict is not in the variant callers list, this not generating Oncoprints')
        return None

    step_greetings('Creating Oncoprints link')
    zhongwu_data_query_dirpath = '/home/kdld047/public_html/cgi-bin/TS'
    if not isdir(zhongwu_data_query_dirpath):
        warn('Data Query directory ' + zhongwu_data_query_dirpath + ' does not exists.')
        return None

    vardict_txt_fname = variant_filtering.mut_fname_template.format(caller_name=clinical_report_caller.name)
    vardict_txt_fpath = join(bcbio_structure.var_dirpath, vardict_txt_fname)
    cnf.mutations_fpath = add_suffix(vardict_txt_fpath, variant_filtering.mut_pass_suffix)

    cnf.seq2c_tsv_fpath = bcbio_structure.seq2c_fpath

    samples = sorted(bcbio_structure.samples)
    cnf.project_name = project_name or bcbio_structure.project_name or basename(cnf.output_dir)
    study_name = re.sub('[\.\-:&]', '_', cnf.project_name)

    check_genome_resources(cnf)

    data_query_dirpath = join(loc.dirpath, 'DataQueryTool')

    data_fpath = join(zhongwu_data_query_dirpath, study_name + '.data.txt')
    info_fpath = join(zhongwu_data_query_dirpath, study_name + '.info.txt')
    altered_genes = print_data_txt(cnf, cnf.mutations_fpath, cnf.seq2c_tsv_fpath, samples, data_fpath)
    if not altered_genes:
        err('No altered genes in ' + cnf.mutations_fpath + ' or ' + cnf.seq2c_tsv_fpath + ', not generating Oncoptints.')
        return None

    print_info_txt(cnf, samples, info_fpath)

    data_ext_fpath = data_fpath.replace('/home/', '/users/')
    info_ext_fpath = info_fpath.replace('/home/', '/users/')

    # optional:
    data_symlink = join(data_query_dirpath, study_name + '.data.txt')
    info_symlink = join(data_query_dirpath, study_name + '.info.txt')
    (symlink_to_ngs if is_us() else local_symlink)(data_ext_fpath, data_symlink)
    (symlink_to_ngs if is_us() else local_symlink)(info_ext_fpath, info_symlink)

    properties_fpath = join(zhongwu_data_query_dirpath, 'DataQuery.properties')
    add_data_query_properties(cnf, study_name, properties_fpath, data_ext_fpath, info_ext_fpath)

    genes = '%0D%0A'.join(altered_genes)
    data_query_url = join(loc.website_url_base, 'DataQueryTool', 'DataQuery.pl?'
        'analysis=oncoprint&'
        'study={study_name}&'
        'gene={genes}&'
        'order=on&'
        'freq=50&'
        'nocheckgenes=true&'
        'submit=Submit'
        .format(**locals()))

    info()
    info('Information about study was added in Data Query Tool, URL is ' + data_query_url)
    return data_query_url
Example #10
0
def proc_opts():
    parser = OptionParser()
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser)
    parser.add_option('--expose-only',
                      dest='expose_to_ngs_server_only',
                      action='store_true',
                      default=False,
                      help='Only add project to the webserver')
    parser.add_option('--no-expose',
                      dest='expose',
                      action='store_false',
                      default=True,
                      help='Do not expose the reports')
    parser.add_option('-o', dest='output_dir')
    parser.add_option('--bed',
                      dest='bed',
                      help='BED file to run targetSeq and Seq2C analysis on.')
    parser.add_option('--downsample-to', dest='downsample_to', type='int')

    (opts, args) = parser.parse_args()
    logger.is_debug = opts.debug

    if len(args) < 1:
        critical('Usage: ' + __file__ + ' *.fq.gz -o output_dir')
    # if len(args) < 2:
    #     info('No dataset path specified, assuming it is the current working directory')
    #     dataset_dirpath = adjust_path(os.getcwd())
    #     jira_url = args[0]

    fastq_fpaths = [verify_file(fpath) for fpath in args]
    fastq_fpaths = [fpath for fpath in fastq_fpaths if fpath]
    info(str(len(fastq_fpaths)) + ' fastq files')

    run_cnf = determine_run_cnf(opts)
    cnf = Config(opts.__dict__, determine_sys_cnf(opts), run_cnf)

    cnf.output_dir = adjust_path(cnf.output_dir)
    info('Writing to ' + str(cnf.output_dir))

    cnf.project_name = cnf.project_name or 'preproc'

    if cnf.work_dir:
        cnf.debug = True
    else:
        all_work_dir = join(cnf.output_dir, 'work')
        safe_mkdir(all_work_dir)

        latest_fpath = join(all_work_dir, 'latest')

        if cnf.reuse_intermediate:
            cnf.work_dir = latest_fpath
        else:
            cnf.work_dir = join(
                all_work_dir,
                datetime.datetime.now().strftime("%Y-%b-%d_%H-%M"))
            if islink(latest_fpath):
                os.remove(latest_fpath)
            if isdir(latest_fpath):
                shutil.rmtree(latest_fpath)
            if not exists(latest_fpath):
                os.symlink(basename(cnf.work_dir), latest_fpath)

    cnf.work_dir = adjust_path(cnf.work_dir)
    safe_mkdir(cnf.work_dir)
    cnf.log_dir = join(cnf.work_dir, 'log')
    safe_mkdir(cnf.log_dir)
    set_up_log(cnf)
    try:
        subprocess.call(['chmod', '-R', 'g+w', cnf.work_dir])
    except OSError:
        err(traceback.format_exc())
        pass

    if cnf.samplesheet:
        cnf.samplesheet = verify_file(cnf.samplesheet, is_critical=True)

    info(' '.join(sys.argv))
    info()
    info('Created a temporary working directory: ' + cnf.work_dir)

    if cnf.project_name:
        info('Project name: ' + cnf.project_name)

    if cnf.samplesheet:
        info('Using custom sample sheet ' + cnf.samplesheet)

    check_genome_resources(cnf)
    check_system_resources(cnf, optional=['fastq'])

    return cnf, cnf.output_dir, fastq_fpaths
Example #11
0
def main(args):
    cnf = read_opts_and_cnfs(extra_opts=[
        (['--bam'], dict(dest='bam', help='a path to the BAM file to study')),
        (['-1'], dict(dest='l_fpath')), (['-2'], dict(dest='r_fpath')),
        (['--bed', '--capture', '--amplicons'],
         dict(dest='bed', help='a BED file for capture panel or amplicons')),
        (['--exons', '--exome', '--features'],
         dict(
             dest='features',
             help=
             'a BED file with real CDS/Exon/Gene/Transcript regions with annotations (default "features" is in system_config)'
         )),
        (['--exons-no-genes', '--features-no-genes'],
         dict(
             dest='features_no_genes',
             help=
             'a BED file with real CDS/Exon regions with annotations, w/o Gene/Transcript records (default "features" is in system_config)'
         )),
        (['--original-bed'],
         dict(dest='original_target_bed', help=SUPPRESS_HELP)),
        (['--original-exons', '--original-features'],
         dict(
             dest='original_features_bed',
             help='original features genes bed file path (just for reporting)')
         ),
        (['--reannotate'],
         dict(dest='reannotate',
              help='re-annotate BED file with gene names',
              action='store_true',
              default=False)),
        (['--no-prep-bed'],
         dict(dest='prep_bed',
              help='do not fix input beds and exons',
              action='store_false',
              default=True)),
        (['-e', '--extended'],
         dict(dest='extended',
              help='extended - flagged regions and missed variants',
              action='store_true',
              default=False)),
        (['--genes'], dict(dest='genes', help='custom list of genes')),
        (['--padding'],
         dict(
             dest='padding',
             help=
             'integer indicating the number of bases to extend each target region up and down-stream. '
             'Default is ' + str(defaults['coverage_reports']['padding']),
             type='int')),
        (['--no-dedup'],
         dict(dest='no_dedup', action='store_true', help=SUPPRESS_HELP)),
        (['--downsample-to'],
         dict(dest='downsample_to', type='int', help=SUPPRESS_HELP)),
        (['--downsampled'],
         dict(dest='downsampled', action='store_true', help=SUPPRESS_HELP)),
        (['--fastqc-dirpath'], dict(dest='fastqc_dirpath', help=SUPPRESS_HELP))
    ],
                             file_keys=['bam', 'l_fpath', 'r_fpath', 'bed'],
                             key_for_sample_name='bam')

    if cnf.padding:
        cnf.coverage_reports.padding = cnf.padding

    check_system_resources(cnf, required=['bedtools'], optional=[])

    check_genome_resources(cnf)

    features_bed = adjust_path(cnf.features) if cnf.features else adjust_path(
        cnf.genome.features)
    if features_bed:
        info('Features: ' + features_bed)
        features_bed = verify_file(features_bed)
    else:
        info('No features BED found')

    if cnf.bed:
        cnf.bed = verify_file(cnf.bed, is_critical=True)
        info('Using amplicons/capture panel ' + cnf.bed)
    elif features_bed:
        info('WGS, taking CDS as target')

    cnf.bam = verify_bam(cnf.bam, is_critical=True)

    reports = process_one(cnf,
                          cnf.output_dir,
                          cnf.bam,
                          features_bed=features_bed,
                          features_no_genes_bed=cnf.features_no_genes)
    summary_report, gene_report = reports[:2]

    info('')
    info('*' * 70)
    if summary_report.txt_fpath:
        info('Summary report: ' + summary_report.txt_fpath)
    if gene_report:
        if gene_report.txt_fpath:
            info('All regions: ' + gene_report.txt_fpath + ' (' +
                 str(len(gene_report.rows)) + ' regions)')

    if len(reports) > 2:
        selected_regions_report = reports[2]
        if selected_regions_report.txt_fpath:
            info('Flagged regions: ' + selected_regions_report.txt_fpath +
                 ' (' + str(len(selected_regions_report.rows)) + ' regions)')

    for fpaths in reports:
        if fpaths:
            ok = True
            info('Checking expected results...')
            if not isinstance(fpaths, list):
                fpaths = [fpaths]
            for fpath in fpaths:
                if isinstance(fpath, basestring):
                    if not verify_file(fpath):
                        ok = False
            if ok:
                info('The results are good.')

    if not cnf['keep_intermediate']:
        shutil.rmtree(cnf['work_dir'])
Example #12
0
def get_args():
    info(' '.join(sys.argv))
    info()
    description = (
        'The program will filter the VarDict output after vcf2txt.pl to '
        'candidate interpretable mutations, somatic or germline.')
    parser = OptionParser(description=description)
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser, threads=1)

    parser.add_option('-o', dest='output_file')
    parser.add_option('--o-all-transcripts',
                      dest='all_transcripts_output_file')
    parser.add_option('--o-fm', dest='fm_output_file')
    parser.add_option('--o-reject', dest='rejected_output_file')

    parser.add_option('--cohort-freqs', dest='cohort_freqs_fpath')
    parser.add_option('--transcripts', dest='transcripts_fpath')

    parser.add_option('-D',
                      '--min-depth',
                      dest='filt_depth',
                      type='int',
                      help='The minimum total depth')
    parser.add_option('-V',
                      '--min-vd',
                      dest='min_vd',
                      type='int',
                      help='The minimum reads supporting variant')
    parser.add_option(
        '--gmaf',
        dest='min_gmaf',
        type='float',
        help=
        'When the GMAF is greater than specified, it\'s considered common SNP and filtered out.'
    )
    parser.add_option(
        '-f',
        '--min-freq',
        dest='min_freq',
        type='float',
        help='The minimum allele frequency for regular variants.')
    parser.add_option(
        '-F',
        '--min-freq-hs',
        '--act-min-freq',
        dest='act_min_freq',
        type='float',
        help=
        'The minimum allele frequency hotspot somatic mutations, typically lower then -f. '
        'Default: 0.01 or half -f, whichever is less')
    parser.add_option(
        '-N',
        '--keep-utr-intronic',
        dest='keep_utr_intronic',
        action='store_true',
        help=
        'Keep all intronic and UTR in the output, but will be set as "unknown".'
    )

    parser.add_option(
        '-p',
        '--platform',
        dest='platform',
        help=
        'The platform, such as WXS, WGS, RNA-Seq, VALIDATION, etc. No Default. '
        'Used for output in FM\'s format')

    parser.set_usage('Usage: ' + __file__ +
                     ' vcf2txt_res_fpath [opts] -o output_fpath')

    (opts, args) = parser.parse_args()
    if len(args) < 1:
        critical('Provide the first argument - output from vcf2txt.pl')
    logger.is_debug = opts.debug

    vcf2txt_res_fpath = verify_file(args[0], is_critical=True)

    run_cnf = determine_run_cnf(opts)
    cnf = Config(opts.__dict__, determine_sys_cnf(opts), run_cnf)
    if not cnf.genome:
        critical('Please, specify the --genome option (e.g. --genome hg19)')

    check_genome_resources(cnf)

    if not cnf.output_file:
        critical('Please, specify the output fpath with -o')

    info()

    return cnf, vcf2txt_res_fpath
Example #13
0
def main():
    info(' '.join(sys.argv))
    info()

    description = 'This script generates target QC reports for each BAM provided as an input.'
    parser = OptionParser(description=description)
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser, threads=1)
    parser.add_option('--work-dir', dest='work_dir', metavar='DIR')
    parser.add_option('--log-dir', dest='log_dir')
    parser.add_option('--only-summary',
                      dest='only_summary',
                      action='store_true')
    parser.add_option('-o',
                      dest='output_dir',
                      metavar='DIR',
                      default=join(os.getcwd(), 'targetqc'))
    parser.add_option('--reannotate',
                      dest='reannotate',
                      action='store_true',
                      default=False,
                      help='re-annotate BED file with gene names')
    parser.add_option('--dedup',
                      dest='dedup',
                      action='store_true',
                      default=False,
                      help='count duplicates in coverage metrics')
    parser.add_option('--bed',
                      dest='bed',
                      help='BED file to run targetSeq and Seq2C analysis on.')
    parser.add_option(
        '--exons',
        '--exome',
        '--features',
        dest='features',
        help=
        'Annotated CDS/Exon/Gene/Transcripts BED file to make targetSeq exon/amplicon regions reports.'
    )

    (opts, args) = parser.parse_args()
    logger.is_debug = opts.debug

    if len(args) == 0:
        critical('No BAMs provided to input.')
    bam_fpaths = list(set([abspath(a) for a in args]))

    bad_bam_fpaths = []
    for fpath in bam_fpaths:
        if not verify_bam(fpath):
            bad_bam_fpaths.append(fpath)
    if bad_bam_fpaths:
        critical('BAM files cannot be found, empty or not BAMs:' +
                 ', '.join(bad_bam_fpaths))

    run_cnf = determine_run_cnf(opts, is_wgs=not opts.__dict__.get('bed'))
    cnf = Config(opts.__dict__, determine_sys_cnf(opts), run_cnf)

    if not cnf.project_name:
        cnf.project_name = basename(cnf.output_dir)
    info('Project name: ' + cnf.project_name)

    cnf.proc_name = 'TargQC'
    set_up_dirs(cnf)
    # cnf.name = 'TargQC_' + cnf.project_name

    check_genome_resources(cnf)

    verify_bed(cnf.bed, is_critical=True)
    bed_fpath = adjust_path(cnf.bed)
    info('Using amplicons/capture panel ' + bed_fpath)

    features_bed_fpath = adjust_path(
        cnf.features) if cnf.features else adjust_path(cnf.genome.features)
    info('Features: ' + features_bed_fpath)

    genes_fpath = None
    if cnf.genes:
        genes_fpath = adjust_path(cnf.genes)
        info('Custom genes list: ' + genes_fpath)

    if not cnf.only_summary:
        cnf.qsub_runner = adjust_system_path(cnf.qsub_runner)
        if not cnf.qsub_runner:
            critical('Error: qsub-runner is not provided is sys-config.')
        verify_file(cnf.qsub_runner, is_critical=True)

    info('*' * 70)
    info()

    targqc_html_fpath = run_targqc(cnf, cnf.output_dir, bam_fpaths, bed_fpath,
                                   features_bed_fpath, genes_fpath)
    if targqc_html_fpath:
        send_email(
            cnf, 'TargQC report for ' + cnf.project_name + ':\n  ' +
            targqc_html_fpath)
Example #14
0
def main():
    if len(sys.argv[1]) < 0:
        critical('Usage: ' + __file__ +
                 ' Input_BED_file -g hg19 -o Annotated_BED_file')
    input_bed_fpath = verify_bed(sys.argv[1],
                                 is_critical=True,
                                 description='Input BED file for ' + __file__)

    cnf = read_opts_and_cnfs(
        description=
        'Annotating BED file based on reference features annotations.',
        extra_opts=[
            (['--reference'], dict(dest='reference')),
        ],
        required_keys=['output_file'],
        file_keys=['reference'],
        key_for_sample_name=None,
        fpath_for_sample_name=input_bed_fpath,
        main_output_is_file=True)
    check_system_resources(cnf)
    check_genome_resources(cnf)

    chr_order = get_chrom_order(cnf)

    features_fpath = adjust_path(cnf.genome.bed_annotation_features)
    if not verify_bed(features_fpath, 'Annotated reference BED file'):
        critical('Annotated reference is required')

    # features_and_beds = _split_reference_by_priority(cnf, features_fpath)

    bed = BedTool(input_bed_fpath).cut([0, 1, 2])

    info()

    annotated = None
    off_targets = None

    for feature in ['CDS', 'Exon', 'Transcript', 'Gene']:
        if bed:
            info('Extracting ' + feature + ' features from ' + features_fpath)
            features_bed = BedTool(features_fpath).filter(
                lambda x: x[6] == feature)

            info('Annotating based on ' + feature)
            new_annotated, off_targets = _annotate(cnf, bed, features_bed,
                                                   chr_order)
            if not annotated:
                annotated = new_annotated
                for a in annotated:
                    a.feature = feature
            else:
                annotated.extend(new_annotated)

            if off_targets:
                bed = BedTool([(r.chrom, r.start, r.end) for r in off_targets])

                # off_target_fpath = _save_regions(off_targets, join(work_dirpath, 'off_target_1.bed'))
                # log('Saved off target1 to ' + str(off_target_fpath))
                info()

    if annotated is not None and off_targets is not None:
        annotated.extend(off_targets)

    info()
    info('Saving annotated regions to ' + str(cnf.output_file))
    with open(cnf.output_file, 'w') as out:
        for region in sorted(annotated, key=lambda r: r.get_key()):
            out.write(str(region))

        # for r, overlap_size in overlaps:
        #     sys.stdout.write('\t' + '\t'.join([
        #         r.chrom, '{:,}'.format(r.start), '{:,}'.format(r.end), r.gene, r.exon, str(r.strand), r.feature, r.biotype,
        #         str(overlap_size),
        #         '{:.2f}%'.format(100.0 * overlap_size / (r.end - r.start))
        #     ]))
        # sys.stdout.write('\n')
    info('Done.')
def main(args):
    cnf = read_opts_and_cnfs(
        extra_opts=[
            (['--vcf', '--var'], dict(
                dest='vcf',
                help='variants to filter')
             ),
            (['--vcf2txt'], dict(
                dest='vcf2txt',
                help='variants in vcf2txt to filter')
             ),
            (['--cohort-freqs'], dict(
                dest='cohort_freqs_fpath',
                help='frequencies of variants in a cohort')
             ),
            (['--qc'], dict(
                dest='qc',
                action='store_true',
                default=True,
                help=SUPPRESS_HELP)
             ),
            (['--no-qc'], dict(
                dest='qc',
                action='store_false',
                help=SUPPRESS_HELP)
             ),
            (['--no-tsv'], dict(
                dest='tsv',
                action='store_false',
                default=True,
                help=SUPPRESS_HELP)
             ),
        ],
        required_keys=['vcf'],
        file_keys=['vcf'],
        key_for_sample_name='vcf',
        proc_name=source.varfilter_name + '_post')

    check_system_resources(cnf, required=['perl'])
    check_genome_resources(cnf)

    if not cnf.output_file:
        cnf.output_file = join(cnf.output_dir, (cnf.caller or 'variants') + '.txt')

    safe_mkdir(dirname(cnf.output_file))
    safe_mkdir(cnf.output_dir)

    if cnf.vcf.endswith('.vcf.gz') or cnf.vcf.endswith('.vcf'):
        verify_vcf(cnf.vcf, is_critical=True)

    if not cnf.vcf2txt:
        vcf2txt_res_fpath = run_vcf2txt(cnf, {cnf.sample: cnf.vcf}, cnf.output_file)
        if not vcf2txt_res_fpath:
            critical('vcf2txt run returned non-0')
        info('Saved vcf2txt output to ' + vcf2txt_res_fpath)
    else:
        cnf.vcf2txt = verify_file(cnf.vcf2txt, is_critical=True)
        info('Input is vcf2txt output, grepping by sample name ' + cnf.sample)
        vcf2txt_res_fpath = cnf.output_file
        with file_transaction(cnf.work_dir, vcf2txt_res_fpath) as tx:
            with open(cnf.vcf2txt) as f, open(tx, 'w') as out:
                for i, l in enumerate(f):
                    if l.strip():
                        if i == 0:
                            out.write(l)
                        else:
                            if l.split('\t')[0] == cnf.sample:
                                out.write(l)
        info('Using vcf2txt from ' + vcf2txt_res_fpath)

    # if is_local():
    #     vardict2mut_pl = get_script_cmdline(cnf, 'perl', join('VarDict', 'vardict2mut.pl'))
    #     info('Running vardict2mut perl')
    #     res = run_vardict2mut(cnf, vcf2txt_res_fpath,
    #         add_suffix(vcf2txt_res_fpath, source.mut_pass_suffix + '_perl'),
    #         vardict2mut_executable=vardict2mut_pl)
    #     if not res:
    #         critical('vardict2mut.pl run returned non-0')

    mut_fpath = run_vardict2mut(cnf, vcf2txt_res_fpath, add_suffix(vcf2txt_res_fpath, variant_filtering.mut_pass_suffix))
    if not mut_fpath:
        err('vardict2mut failed')
    else:
        info('Saved passed mutations to ' + mut_fpath)

        var_s = source.VarSample(cnf.sample, cnf.output_dir)
        var_s.anno_vcf_fpath = cnf.vcf
        var_s.varfilter_dirpath = var_s.dirpath

        ungz_anno_vcf_fpath = var_s.anno_vcf_fpath if not var_s.anno_vcf_fpath.endswith('.gz') else splitext(var_s.anno_vcf_fpath)[0]
        ungz_filt_vcf_fpath = join(cnf.output_dir, add_suffix(basename(ungz_anno_vcf_fpath), 'filt'))
        var_s.filt_vcf_fpath = ungz_filt_vcf_fpath + '.gz'

        var_s.variants_fpath = vcf2txt_res_fpath
        var_s.variants_pass_fpath = add_suffix(vcf2txt_res_fpath, source.mut_pass_suffix)

        ungz_pass_filt_vcf_fpath = add_suffix(ungz_filt_vcf_fpath, 'pass')
        var_s.pass_filt_vcf_fpath = add_suffix(var_s.filt_vcf_fpath, 'pass')

        filt_vcf = write_vcf(cnf, var_s, cnf.output_dir, cnf.caller, vcf2txt_res_fpath, mut_fpath)
        index_vcf(cnf, var_s.name, filt_vcf, cnf.caller)
        index_vcf(cnf, var_s.name, ungz_pass_filt_vcf_fpath, cnf.caller)

        if cnf.qc:
            report = qc.make_report(cnf, var_s.pass_filt_vcf_fpath, var_s)
            qc_dirpath = join(cnf.output_dir, 'qc')
            safe_mkdir(qc_dirpath)
            qc.save_report(cnf, report, var_s, cnf.caller, qc_dirpath, source.varqc_after_name)
            info('Saved QC to ' + qc_dirpath + ' (' + report.html_fpath + ')')
            info('-' * 70)
            info()

        if not cnf['keep_intermediate']:
            shutil.rmtree(cnf['work_dir'])

        info()
        info('*' * 70)
        info('Done filtering ' + var_s.name)
Example #16
0
def proc_args(argv):
    info(' '.join(sys.argv))
    info()

    description = 'This script generates target QC reports for each BAM provided as an input.'
    parser = OptionParser(description=description)
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser)
    parser.add_option('--log-dir', dest='log_dir')
    parser.add_option('--is-wgs',
                      dest='is_wgs',
                      action='store_true',
                      default=False,
                      help='whole genome sequencing')
    parser.add_option('--is-deep-seq',
                      dest='is_deep_seq',
                      action='store_true',
                      default=False,
                      help='deep targeted sequencing')
    parser.add_option('--only-summary',
                      dest='only_summary',
                      action='store_true')
    parser.add_option('-o',
                      dest='output_dir',
                      metavar='DIR',
                      default=join(os.getcwd(), 'targetqc'))
    parser.add_option('-c', '--caller', dest='caller')
    parser.add_option('--qc', dest='qc', action='store_true', default=False)
    parser.add_option('--no-qc',
                      dest='qc',
                      action='store_false',
                      default=False)
    parser.add_option('--qc-caption', dest='qc_caption', help=SUPPRESS_HELP)
    parser.add_option('--no-tsv',
                      dest='tsv',
                      action='store_false',
                      default=True,
                      help=SUPPRESS_HELP)

    (opts, args) = parser.parse_args()
    logger.is_debug = opts.debug

    if len(args) == 0:
        critical('No vcf files provided to input.')

    run_cnf = determine_run_cnf(opts,
                                is_targetseq=opts.is_deep_seq,
                                is_wgs=opts.is_wgs)
    cnf = Config(opts.__dict__, determine_sys_cnf(opts), run_cnf)

    vcf_fpath_by_sample = read_samples(args, cnf.caller)
    info()

    if not cnf.project_name:
        cnf.project_name = basename(cnf.output_dir)
    info('Project name: ' + cnf.project_name)

    cnf.proc_name = 'Variants'
    set_up_dirs(cnf)
    # cnf.name = 'TargQC_' + cnf.project_name
    info(' '.join(sys.argv))

    samples = [
        source.VarSample(s_name, join(cnf.output_dir, s_name), vcf=vcf_fpath)
        for s_name, vcf_fpath in vcf_fpath_by_sample.items()
    ]
    samples.sort(key=lambda _s: _s.key_to_sort())

    check_genome_resources(cnf)

    if not cnf.only_summary:
        cnf.qsub_runner = adjust_system_path(cnf.qsub_runner)
        if not cnf.qsub_runner:
            critical('Error: qsub-runner is not provided is sys-config.')
        verify_file(cnf.qsub_runner, is_critical=True)

    return cnf, samples