Beispiel #1
0
def main():
    """Parse the command line and compute region coverage for the given BAMs.

    Builds a ``Config`` from the parsed options, pairs every sample name from
    ``--samples`` with the BAM at the same position in ``--bams``, creates the
    output and work directories and calls ``get_regions_coverage``.

    The required options are validated *before* the comma-separated lists are
    split, so a missing ``-o``, ``--bams`` or ``--samples`` ends with the usage
    message instead of an ``AttributeError`` on ``None.split``.
    """
    info(' '.join(sys.argv))
    info()
    parser = OptionParser(
        usage='Usage: ' + basename(__file__) +
        ' --bed BED_file --bam BAM_file -g hg19 -o Output_BEDGRAPH_file '
        '--work-dir work_directory --chr chromosome')
    parser.add_option('-o', dest='output_dir')
    parser.add_option('--samples', dest='sample_names')
    parser.add_option('--bams', dest='bams')
    parser.add_option('--vcf', dest='vcf_fpath')
    parser.add_option('--chr', dest='chrom')
    parser.add_option('--bed', dest='bed', help='BED file.')
    parser.add_option('-g',
                      '--genome',
                      dest='chr_len_fpath',
                      help='File with chromosomes lengths.')
    parser.add_option('--work-dir', dest='work_dir', help='Work directory.')
    (opts, _) = parser.parse_args(sys.argv[1:])

    cnf = Config(opts.__dict__, determine_sys_cnf(opts), {})

    # Validate first: both lists are split below and would fail on None.
    if not cnf.output_dir or not cnf.bams or not cnf.sample_names:
        critical(parser.usage)

    sample_names = cnf.sample_names.split(',')
    bam_fpaths = cnf.bams.split(',')
    samples = [
        BaseSample(sample_name, None, bam=bam)
        for sample_name, bam in zip(sample_names, bam_fpaths)
    ]

    safe_mkdir(cnf.output_dir)
    safe_mkdir(cnf.work_dir)
    get_regions_coverage(cnf, samples)
    info('Done.')
def main():
    """Parse the command line and split the sample BAMs using a VCF file.

    Builds a ``Config`` from the parsed options, resolves the features BED
    (command line value first, ``cnf.genome.features`` otherwise), pairs every
    name from ``--samples`` with the BAM at the same position in ``--bams`` and
    hands the samples to ``split_bams``.

    ``-o``, ``--vcf``, ``--chr``, ``--samples`` and ``--bams`` are all required:
    the two lists are split on commas, so leaving either out used to fail with
    an ``AttributeError`` rather than the usage message.
    """
    info(' '.join(sys.argv))
    info()

    parser = OptionParser(usage='Usage: ' + basename(__file__) + ' --chr chr --vcf VCF_file --samples Sample1,Sample2 '
                                                                 '--bams BAM_file1,BAM_file2 -o Output_directory '
                                                                 '--features BED_file')
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser)
    parser.add_option('-o', dest='output_dir')
    parser.add_option('--samples', dest='sample_names')
    parser.add_option('--bams', dest='bams')
    parser.add_option('--vcf', dest='vcf_fpath')
    parser.add_option('--chr', dest='chrom')
    parser.add_option('--features', dest='features', help='BED file with real CDS/Exon/Gene/Transcript regions with '
                                                          'annotations (default "features" is in system_config)')
    (opts, _) = parser.parse_args(sys.argv[1:])

    cnf = Config(opts.__dict__, determine_sys_cnf(opts), {})
    cnf.verbose = False

    required_values = (cnf.output_dir, cnf.vcf_fpath, cnf.chrom,
                       cnf.sample_names, cnf.bams)
    if not all(required_values):
        critical(parser.usage)

    cnf.features = cnf.features or cnf.genome.features

    sample_names = cnf.sample_names.split(',')
    bam_fpaths = cnf.bams.split(',')
    samples = [BaseSample(sample_name, None, bam=bam)
               for sample_name, bam in zip(sample_names, bam_fpaths)]
    split_bams(cnf, samples, cnf.vcf_fpath)
    info('Done.')
Beispiel #3
0
def proc_args(argv):
    """Parse the Seq2C command line and prepare the run configuration.

    The positional arguments are either a single file that does not end in
    ``.bam`` (read with ``read_samples`` as a sample-to-BAM table) or a list of
    inputs passed to ``find_bams``.

    Args:
        argv: Kept for interface compatibility. The options are actually read
            by ``parser.parse_args()`` with no arguments, i.e. from
            ``sys.argv[1:]``.

    Returns:
        A tuple ``(cnf, samples, target_bed, cnf.output_dir)`` where
        ``target_bed`` is ``None`` when no ``--bed`` was given.
    """
    info(' '.join(sys.argv))
    info()

    # The usage line used to advertise "--contols", which is not an option.
    description = 'This script generates target QC reports for each BAM provided as an input. ' \
                  'Usage: ' + basename(__file__) + ' sample2bam.tsv --bed target.bed --controls sample1:sample2 -o results_dir'
    parser = OptionParser(description=description, usage=description)
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser)
    parser.add_option('-o', dest='output_dir', metavar='DIR', default=join(os.getcwd(), 'seq2c'))
    parser.add_option('--bed', dest='bed', help='BED file to run Seq2C analysis')
    parser.add_option('-c', '--controls', dest='controls', help='Optional control sample names for Seq2C. For multiple controls, separate them using :')
    parser.add_option('--seq2c-opts', dest='seq2c_opts', help='Options for the final lr2gene.pl script.')
    parser.add_option('--no-prep-bed', dest='prep_bed', help=SUPPRESS_HELP, action='store_false', default=True)

    (opts, args) = parser.parse_args()
    logger.is_debug = opts.debug

    if not args:
        parser.print_usage()
        sys.exit(1)
    if len(args) == 1 and not args[0].endswith('.bam'):
        # A single non-BAM argument is a table listing samples and their BAMs.
        sample_names, bam_fpaths = read_samples(verify_file(args[0], is_critical=True, description='Input sample2bam.tsv'))
        bam_by_sample = OrderedDict()
        for s, b in zip(sample_names, bam_fpaths):
            bam_by_sample[s] = b
    else:
        bam_by_sample = find_bams(args)

    # No BED file means the run is configured as whole-genome.
    run_cnf = determine_run_cnf(opts, is_wgs=not opts.__dict__.get('bed'))
    cnf = Config(opts.__dict__, determine_sys_cnf(opts), run_cnf)
    check_genome_resources(cnf)

    cnf.output_dir = adjust_path(cnf.output_dir)
    verify_dir(dirname(cnf.output_dir), is_critical=True)
    safe_mkdir(cnf.output_dir)

    if not cnf.project_name:
        cnf.project_name = basename(cnf.output_dir)
    info('Project name: ' + cnf.project_name)

    cnf.proc_name = 'Seq2C'
    set_up_dirs(cnf)

    samples = [
        source.TargQC_Sample(name=s_name, dirpath=join(cnf.output_dir, s_name), bam=bam_fpath)
        for s_name, bam_fpath in bam_by_sample.items()]
    info('Samples: ')
    for s in samples:
        info('  ' + s.name)
    samples.sort(key=lambda _s: _s.key_to_sort())

    target_bed = verify_bed(cnf.bed, is_critical=True) if cnf.bed else None

    if not cnf.only_summary:
        cnf.qsub_runner = adjust_system_path(cnf.qsub_runner)
        if not cnf.qsub_runner:
            critical('Error: qsub-runner is not provided is sys-config.')
        verify_file(cnf.qsub_runner, is_critical=True)

    return cnf, samples, target_bed, cnf.output_dir
Beispiel #4
0
def main():
    """Run read preprocessing for one pair of reads files.

    Parses the command line, verifies both reads files and the parent of the
    output directory, calls ``run_fastq`` inside the ``workdir`` context and
    finally logs where the results were written.
    """
    info(' '.join(sys.argv))
    info()

    parser = OptionParser(description='This script runs preprocessing.')
    parser.add_option('-1', dest='left_reads_fpath', help='Left reads fpath')
    parser.add_option('-2', dest='right_reads_fpath', help='Right reads fpath')
    parser.add_option('--sample', dest='sample_name', help='Sample name')
    parser.add_option('-o', dest='output_dir', help='Output directory path')
    downsample_help = (
        'Downsample reads to avoid excessive processing times with large files. '
        'Default is 1 million. Set to 0 to turn off downsampling.')
    parser.add_option('--downsample-to', dest='downsample_to', default=None,
                      type='int', help=downsample_help)
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser, threads=1)
    opts, _ = parser.parse_args()
    logger.is_debug = opts.debug

    sys_cnf = determine_sys_cnf(opts)
    run_cnf = determine_run_cnf(opts)
    cnf = Config(opts.__dict__, sys_cnf, run_cnf)

    left_reads_fpath = verify_file(opts.left_reads_fpath, is_critical=True)
    right_reads_fpath = verify_file(opts.right_reads_fpath, is_critical=True)

    if opts.output_dir:
        output_dirpath = adjust_path(opts.output_dir)
    else:
        output_dirpath = critical('Please, specify output directory with -o')
    verify_dir(dirname(output_dirpath), description='output_dir', is_critical=True)

    with workdir(cnf):
        # Fall back to a name derived from the two reads files.
        sample_name = cnf.sample_name or _get_sample_name(left_reads_fpath, right_reads_fpath)
        results_dirpath = run_fastq(
            cnf, sample_name, left_reads_fpath, right_reads_fpath,
            output_dirpath, downsample_to=cnf.downsample_to)

    verify_dir(results_dirpath, is_critical=True)
    for line in (None, '*' * 70, 'Fastqc results:', '  ' + results_dirpath):
        if line is None:
            info()
        else:
            info(line)
Beispiel #5
0
def get_args():
    """Parse the Circos plotting command line and return the run ``Config``.

    The shared options are registered first through
    ``add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug``; the
    script-specific ones follow in the order listed below.
    """
    parser = OptionParser(description=(
        'Plots a Circos plot given vardict variant file (with all dbSNP SNPs, not the PASS one), '
        'Seq2C CNV calls and Manta SVs.'))
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser, threads=1)

    # (flags, keyword arguments) for every script-specific option.
    option_specs = (
        (('--bed',), dict(dest='bed_fpath', help='Path to BED file')),
        (('-v', '--mutations'), dict(dest='mutations_fpath', help='Path to VarDict.txt file')),
        (('-c', '--seq2c'), dict(dest='seq2c_tsv_fpath', help='Path to seq2c copy number file')),
        (('--sv',), dict(dest='sv_fpath', help='Path to Manta SV call vcf.gz file')),
        (('-s', '--sample'), dict(dest='sample', help='Identifier of sample in VarDict and Seq2c files')),
        (('-o', '--output-dir'), dict(dest='output_dir', default="./",
                                      help='Output directory. Defaults to ./')),
    )
    for flags, params in option_specs:
        parser.add_option(*flags, **params)

    opts, _ = parser.parse_args()
    run_cnf = determine_run_cnf(opts)
    return Config(opts.__dict__, determine_sys_cnf(opts), run_cnf)
def main():
    """Sort the BED file given on the command line into the ``-o`` output file.

    Prints the help text to stderr and exits with status 1 when no input BED
    is given; reports the usage through ``critical`` when ``-o`` is missing.
    """
    usage = ('Usage: ' + basename(__file__) +
             ' -o Output_BED_file -g hg19 Input_BED_file')
    parser = OptionParser(usage=usage)
    parser.add_option('-o', '--output-bed', dest='output_fpath')
    parser.add_option('-g', '--genome', dest='genome')
    opts, positional = parser.parse_args(sys.argv[1:])

    if not positional:
        parser.print_help(file=sys.stderr)
        sys.exit(1)

    cnf = Config(opts.__dict__, determine_sys_cnf(opts), {})
    check_genome_resources(cnf)

    if not cnf.output_fpath:
        critical(parser.usage)

    input_bed_fpath = verify_bed(positional[0], is_critical=True)
    output_bed_fpath = adjust_path(cnf.output_fpath)
    sort_bed(cnf, input_bed_fpath, output_bed_fpath)
def _read_args(args_list):
    """Parse the BED standardization command line.

    Args:
        args_list: Argument strings to parse (without the program name).

    Returns:
        A tuple ``(input_bed_fpath, output_bed_fpath, work_dirpath, cnf)``.
        ``work_dirpath`` is a fresh temporary directory created with
        ``tempfile.mkdtemp``.

    Exits through ``parser.error`` (status 2) when ``--output-hg`` and
    ``--output-grch`` are combined or when ``-o`` is missing, and with status 1
    after printing the help when the number of positional arguments is not
    exactly one.
    """
    options = [
        (['-o', '--output-bed'], dict(dest='output_fpath')),
        (['--debug'],
         dict(
             dest='debug',
             help=
             'run in a debug more (verbose output, keeping of temporary files)',
             default=False,
             action='store_true')),
        (['--output-hg'],
         dict(
             dest='output_hg',
             help=
             'output chromosome names in hg-style (chrM, chr1, .., chr22, chrX, chrY)',
             default=False,
             action='store_true')),
        (['--output-grch'],
         dict(
             dest='output_grch',
             help='output chromosome names in GRCh-style (1, .., 22, X, Y, MT)',
             default=False,
             action='store_true')),
        (['-g', '--genome'], dict(dest='genome', default='hg19')),
    ]

    parser = OptionParser(
        usage='usage: %prog [options] Input_BED_file -o Standardized_BED_file',
        description='Scripts outputs a standardized version of input BED file. '
        'Standardized BED: 1) has 4 or 8 fields (for BEDs with primer info);'
        ' 2) has HGNC approved symbol in forth column if annotation is '
        'possible and not_a_gene_X otherwise;'
        ' 3) is sorted based on chromosome name -> start -> end;'
        ' 4) has no duplicated regions (regions with the same chromosome, start and end), '
        'the only exception is _CONTROL_ regions.')
    for flags, kwargs in options:
        parser.add_option(*flags, **kwargs)
    (opts, args) = parser.parse_args(args_list)

    if len(args) != 1:
        parser.print_help(file=sys.stderr)
        sys.exit(1)

    cnf = Config(opts.__dict__, determine_sys_cnf(opts), {})

    # Reject unusable option combinations before any directory is created.
    # Previously the conflict was only logged and processing carried on.
    if cnf.output_grch and cnf.output_hg:
        parser.error(
            'you cannot specify --output-hg and --output-grch simultaneously!')
    if not cnf.output_fpath:
        parser.error('please specify the output BED file with -o')

    # mkdtemp creates the directory itself, so no extra mkdir is needed.
    work_dirpath = tempfile.mkdtemp()
    info('Creating a temporary working directory ' + work_dirpath)

    input_bed_fpath = abspath(args[0])
    info('Input: ' + input_bed_fpath)

    output_bed_fpath = adjust_path(cnf.output_fpath)
    info('Writing to: ' + output_bed_fpath)
    info()

    return input_bed_fpath, output_bed_fpath, work_dirpath, cnf
Beispiel #8
0
def proc_opts():
    """Parse the preprocessing command line and set up the work and log dirs.

    Positional arguments are fastq paths; those for which ``verify_file``
    returns a false value are dropped. When no ``--work-dir`` is configured a
    work directory is chosen under ``<output_dir>/work``: the ``latest`` entry
    when intermediate files are reused, otherwise a new timestamped directory
    that ``latest`` is then symlinked to.

    Returns:
        A tuple ``(cnf, cnf.output_dir, fastq_fpaths)``.
    """
    parser = OptionParser()
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser)
    parser.add_option('--expose-only', dest='expose_to_ngs_server_only',
                      action='store_true', default=False,
                      help='Only add project to the webserver')
    parser.add_option('--no-expose', dest='expose',
                      action='store_false', default=True,
                      help='Do not expose the reports')
    parser.add_option('-o', dest='output_dir')
    parser.add_option('--bed', dest='bed',
                      help='BED file to run targetSeq and Seq2C analysis on.')
    parser.add_option('--downsample-to', dest='downsample_to', type='int')

    opts, inputs = parser.parse_args()
    logger.is_debug = opts.debug

    if not inputs:
        critical('Usage: ' + __file__ + ' *.fq.gz -o output_dir')

    verified = [verify_file(path) for path in inputs]
    fastq_fpaths = [path for path in verified if path]
    info(str(len(fastq_fpaths)) + ' fastq files')

    run_cnf = determine_run_cnf(opts)
    cnf = Config(opts.__dict__, determine_sys_cnf(opts), run_cnf)

    cnf.output_dir = adjust_path(cnf.output_dir)
    info('Writing to ' + str(cnf.output_dir))

    cnf.project_name = cnf.project_name or 'preproc'

    if cnf.work_dir:
        # An explicitly given work directory switches debug on.
        cnf.debug = True
    else:
        work_root = join(cnf.output_dir, 'work')
        safe_mkdir(work_root)
        latest_link = join(work_root, 'latest')

        if cnf.reuse_intermediate:
            cnf.work_dir = latest_link
        else:
            timestamp = datetime.datetime.now().strftime("%Y-%b-%d_%H-%M")
            cnf.work_dir = join(work_root, timestamp)
            # Clear whatever currently sits at "latest", then re-point it.
            if islink(latest_link):
                os.remove(latest_link)
            if isdir(latest_link):
                shutil.rmtree(latest_link)
            if not exists(latest_link):
                os.symlink(basename(cnf.work_dir), latest_link)

    cnf.work_dir = adjust_path(cnf.work_dir)
    safe_mkdir(cnf.work_dir)
    cnf.log_dir = join(cnf.work_dir, 'log')
    safe_mkdir(cnf.log_dir)
    set_up_log(cnf)

    # Best effort: a failure to launch chmod is logged, not fatal.
    try:
        subprocess.call(['chmod', '-R', 'g+w', cnf.work_dir])
    except OSError:
        err(traceback.format_exc())

    if cnf.samplesheet:
        cnf.samplesheet = verify_file(cnf.samplesheet, is_critical=True)

    info(' '.join(sys.argv))
    info()
    info('Created a temporary working directory: ' + cnf.work_dir)

    if cnf.project_name:
        info('Project name: ' + cnf.project_name)

    if cnf.samplesheet:
        info('Using custom sample sheet ' + cnf.samplesheet)

    check_genome_resources(cnf)
    check_system_resources(cnf, optional=['fastq'])

    return cnf, cnf.output_dir, fastq_fpaths
Beispiel #9
0
def _read_args(args_list):
    """Parse the BED standardization command line.

    Args:
        args_list: Argument strings to parse (without the program name).

    Returns:
        A tuple ``(input_bed_fpath, output_bed_fpath, work_dirpath, cnf)``.
        ``work_dirpath`` is a fresh temporary directory created with
        ``tempfile.mkdtemp``.

    Exits through ``parser.error`` (status 2) when ``--output-hg`` and
    ``--output-grch`` are combined or when ``-o`` is missing, and with status 1
    after printing the help when the number of positional arguments is not
    exactly one.
    """
    options = [
        (['-o', '--output-bed'], dict(
            dest='output_fpath')
         ),
        (['--debug'], dict(
            dest='debug',
            help='run in a debug more (verbose output, keeping of temporary files)',
            default=False,
            action='store_true')
         ),
        (['--output-hg'], dict(
            dest='output_hg',
            help='output chromosome names in hg-style (chrM, chr1, .., chr22, chrX, chrY)',
            default=False,
            action='store_true')
         ),
        (['--output-grch'], dict(
            dest='output_grch',
            help='output chromosome names in GRCh-style (1, .., 22, X, Y, MT)',
            default=False,
            action='store_true')
         ),
        (['-g', '--genome'], dict(
            dest='genome',
            default='hg19')
         ),
    ]

    parser = OptionParser(usage='usage: %prog [options] Input_BED_file -o Standardized_BED_file',
                          description='Scripts outputs a standardized version of input BED file. '
                                      'Standardized BED: 1) has 4 or 8 fields (for BEDs with primer info);'
                                      ' 2) has HGNC approved symbol in forth column if annotation is '
                                      'possible and not_a_gene_X otherwise;'
                                      ' 3) is sorted based on chromosome name -> start -> end;'
                                      ' 4) has no duplicated regions (regions with the same chromosome, start and end), '
                                      'the only exception is _CONTROL_ regions.')
    for flags, kwargs in options:
        parser.add_option(*flags, **kwargs)
    (opts, args) = parser.parse_args(args_list)

    if len(args) != 1:
        parser.print_help(file=sys.stderr)
        sys.exit(1)

    cnf = Config(opts.__dict__, determine_sys_cnf(opts), {})

    # Fail fast on bad option combinations, before a temporary directory is
    # left behind. The conflict used to be logged without stopping the run.
    if cnf.output_grch and cnf.output_hg:
        parser.error('you cannot specify --output-hg and --output-grch simultaneously!')
    if not cnf.output_fpath:
        parser.error('please specify the output BED file with -o')

    # mkdtemp already creates the directory; no existence check is required.
    work_dirpath = tempfile.mkdtemp()
    info('Creating a temporary working directory ' + work_dirpath)

    input_bed_fpath = abspath(args[0])
    info('Input: ' + input_bed_fpath)

    output_bed_fpath = adjust_path(cnf.output_fpath)
    info('Writing to: ' + output_bed_fpath)
    info()

    return input_bed_fpath, output_bed_fpath, work_dirpath, cnf
Beispiel #10
0
def get_args():
    """Parse the variant filtering command line.

    The first positional argument is the vcf2txt.pl output to filter; the
    genome and the ``-o`` output file are mandatory and reported through
    ``critical`` when absent.

    Returns:
        A tuple ``(cnf, vcf2txt_res_fpath)``.
    """
    info(' '.join(sys.argv))
    info()
    parser = OptionParser(description=(
        'The program will filter the VarDict output after vcf2txt.pl to '
        'candidate interpretable mutations, somatic or germline.'))
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser, threads=1)

    # Plain path options: one flag, one destination, no help text.
    path_options = (
        ('-o', 'output_file'),
        ('--o-all-transcripts', 'all_transcripts_output_file'),
        ('--o-fm', 'fm_output_file'),
        ('--o-reject', 'rejected_output_file'),
        ('--cohort-freqs', 'cohort_freqs_fpath'),
        ('--transcripts', 'transcripts_fpath'),
    )
    for flag, dest in path_options:
        parser.add_option(flag, dest=dest)

    # Filtering thresholds.
    parser.add_option('-D', '--min-depth', dest='filt_depth', type='int',
                      help='The minimum total depth')
    parser.add_option('-V', '--min-vd', dest='min_vd', type='int',
                      help='The minimum reads supporting variant')
    parser.add_option('--gmaf', dest='min_gmaf', type='float',
                      help="When the GMAF is greater than specified, it's considered common SNP and filtered out.")
    parser.add_option('-f', '--min-freq', dest='min_freq', type='float',
                      help='The minimum allele frequency for regular variants.')
    parser.add_option('-F', '--min-freq-hs', '--act-min-freq', dest='act_min_freq', type='float',
                      help='The minimum allele frequency hotspot somatic mutations, typically lower then -f. '
                           'Default: 0.01 or half -f, whichever is less')
    parser.add_option('-N', '--keep-utr-intronic', dest='keep_utr_intronic', action='store_true',
                      help='Keep all intronic and UTR in the output, but will be set as "unknown".')
    parser.add_option('-p', '--platform', dest='platform',
                      help='The platform, such as WXS, WGS, RNA-Seq, VALIDATION, etc. No Default. '
                           "Used for output in FM's format")

    parser.set_usage('Usage: ' + __file__ +
                     ' vcf2txt_res_fpath [opts] -o output_fpath')

    opts, positional = parser.parse_args()
    if not positional:
        critical('Provide the first argument - output from vcf2txt.pl')
    logger.is_debug = opts.debug

    vcf2txt_res_fpath = verify_file(positional[0], is_critical=True)

    run_cnf = determine_run_cnf(opts)
    cnf = Config(opts.__dict__, determine_sys_cnf(opts), run_cnf)
    if not cnf.genome:
        critical('Please, specify the --genome option (e.g. --genome hg19)')

    check_genome_resources(cnf)

    if not cnf.output_file:
        critical('Please, specify the output fpath with -o')

    info()

    return cnf, vcf2txt_res_fpath
Beispiel #11
0
def main():
    """Parse the TargQC command line, validate the inputs and run TargQC.

    Positional arguments are BAM paths (made absolute and de-duplicated). Every
    BAM is checked with ``verify_bam`` and the run stops if any check fails.
    When ``run_targqc`` returns a report path, it is sent out with
    ``send_email``.
    """
    info(' '.join(sys.argv))
    info()

    parser = OptionParser(
        description='This script generates target QC reports for each BAM provided as an input.')
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser, threads=1)
    parser.add_option('--work-dir', dest='work_dir', metavar='DIR')
    parser.add_option('--log-dir', dest='log_dir')
    parser.add_option('--only-summary', dest='only_summary', action='store_true')
    parser.add_option('-o', dest='output_dir', metavar='DIR',
                      default=join(os.getcwd(), 'targetqc'))
    parser.add_option('--reannotate', dest='reannotate', action='store_true', default=False,
                      help='re-annotate BED file with gene names')
    parser.add_option('--dedup', dest='dedup', action='store_true', default=False,
                      help='count duplicates in coverage metrics')
    parser.add_option('--bed', dest='bed',
                      help='BED file to run targetSeq and Seq2C analysis on.')
    parser.add_option('--exons', '--exome', '--features', dest='features',
                      help='Annotated CDS/Exon/Gene/Transcripts BED file to make targetSeq exon/amplicon regions reports.')

    opts, positional = parser.parse_args()
    logger.is_debug = opts.debug

    if not positional:
        critical('No BAMs provided to input.')
    bam_fpaths = list(set(map(abspath, positional)))

    # Collect every failing BAM so that all of them are reported at once.
    bad_bam_fpaths = [fpath for fpath in bam_fpaths if not verify_bam(fpath)]
    if bad_bam_fpaths:
        critical('BAM files cannot be found, empty or not BAMs:' +
                 ', '.join(bad_bam_fpaths))

    run_cnf = determine_run_cnf(opts, is_wgs=not opts.__dict__.get('bed'))
    cnf = Config(opts.__dict__, determine_sys_cnf(opts), run_cnf)

    if not cnf.project_name:
        cnf.project_name = basename(cnf.output_dir)
    info('Project name: ' + cnf.project_name)

    cnf.proc_name = 'TargQC'
    set_up_dirs(cnf)

    check_genome_resources(cnf)

    verify_bed(cnf.bed, is_critical=True)
    bed_fpath = adjust_path(cnf.bed)
    info('Using amplicons/capture panel ' + bed_fpath)

    # Command line features take precedence over the genome default.
    features_bed_fpath = adjust_path(cnf.features or cnf.genome.features)
    info('Features: ' + features_bed_fpath)

    genes_fpath = None
    if cnf.genes:
        genes_fpath = adjust_path(cnf.genes)
        info('Custom genes list: ' + genes_fpath)

    if not cnf.only_summary:
        cnf.qsub_runner = adjust_system_path(cnf.qsub_runner)
        if not cnf.qsub_runner:
            critical('Error: qsub-runner is not provided is sys-config.')
        verify_file(cnf.qsub_runner, is_critical=True)

    info('*' * 70)
    info()

    targqc_html_fpath = run_targqc(cnf, cnf.output_dir, bam_fpaths, bed_fpath,
                                   features_bed_fpath, genes_fpath)
    if targqc_html_fpath:
        report_msg = ('TargQC report for ' + cnf.project_name + ':\n  ' +
                      targqc_html_fpath)
        send_email(cnf, report_msg)
0
def read_opts_and_cnfs(extra_opts,
                       key_for_sample_name=None,
                       required_keys=list(),
                       file_keys=list(),
                       dir_keys=list(),
                       description='',
                       extra_msg=None,
                       proc_name=None,
                       fpath_for_sample_name=None,
                       main_output_is_file=False,
                       main_output_is_dir=True):
    """Parse the command line shared by the processing scripts and build ``cnf``.

    Args:
        extra_opts: Script-specific options as ``(flags, kwargs)`` pairs for
            ``OptionParser.add_option``. The list is copied, never modified.
        key_for_sample_name: Config key holding the path the sample name is
            derived from when neither ``--sample`` nor
            ``fpath_for_sample_name`` is given.
        required_keys: Option destinations that must be present; they are also
            listed under "Required options" in the usage text.
        file_keys: Destinations to check as files (only the required ones).
        dir_keys: Destinations to check as directories (only the required
            ones).
        description: Description passed to ``OptionParser``.
        extra_msg: Not used by this function.
        proc_name: Fallback for ``cnf.proc_name`` when ``--proc-name`` is not
            given.
        fpath_for_sample_name: Path the sample name is derived from when
            ``--sample`` is not given.
        main_output_is_file: When true, ``-o`` means ``--output-file``.
        main_output_is_dir: When true (and ``main_output_is_file`` is false),
            ``-o`` means ``--output-dir``.

    Returns:
        The populated ``Config`` object.
    """
    # Work on a copy: extending the argument in place would leak the common
    # options into the caller's list.
    options = list(extra_opts or [])
    if main_output_is_file:
        options += [(['-o', '--output-file'],
                     dict(dest='output_file',
                          metavar='FILE',
                          help='Output file'))]
        options += [(
            ['--output-dir'],
            dict(
                dest='output_dir',
                metavar='DIR',
                help=
                'Output directory (or directory name in case of bcbio final dir)',
                default=os.getcwd()))]
    elif main_output_is_dir:
        options += [(
            ['-o', '--output-dir'],
            dict(
                dest='output_dir',
                metavar='DIR',
                help=
                'Output directory (or directory name in case of bcbio final dir)',
                default=os.getcwd()))]
        options += [(['--output-file'],
                     dict(dest='output_file',
                          metavar='FILE',
                          help='Output file'))]

    options += [
        (['-s', '--sample', '--name'],
         dict(
             dest='sample',
             metavar='NAME',
             help=
             'Sample name (default is part of name of the first parameter prior to the first - or .'
         )),
        (['-c', '--caller'],
         dict(
             dest='caller',
             metavar='CALLER_NAME',
             help=
             'Variant caller name (default is part of name of the first parameter between the first - and following .'
         )),
        (['-t', '--nt', '--threads'],
         dict(dest='threads', type='int', help='Number of threads')),
        (
            ['--clean'],
            dict(  # do not keep work directory
                dest='keep_intermediate',
                action='store_false',
                help=SUPPRESS_HELP)),
        (['--debug'],
         dict(dest='debug',
              action='store_true',
              default=False,
              help=SUPPRESS_HELP)),
        (['--reuse'],
         dict(
             dest='reuse_intermediate',
             help=
             'reuse intermediate non-empty files in the work dir from previous run',
             action='store_true')),
        (['--sys-cnf'],
         dict(
             dest='sys_cnf',
             metavar='SYS_CNF.yaml',
             help=
             'System configuration file with paths to external tools and genome resources. The default is  '
             '(see default one %s)' % defaults['sys_cnf'])),
        (['--run-cnf'],
         dict(
             dest='run_cnf',
             metavar='RUN_CNF.yaml',
             default=defaults['run_cnf_exome_seq'],
             help=
             'Customised run details: list of annotations/QC metrics/databases/filtering criteria. '
             'The default is %s' % defaults['run_cnf_exome_seq'])),
        (['--transcripts'], dict(dest='transcripts_fpath')),
        (['--work-dir'],
         dict(dest='work_dir', metavar='DIR', help=SUPPRESS_HELP)),
        (['--log-dir'], dict(dest='log_dir', metavar='DIR',
                             help=SUPPRESS_HELP)),
        (['--proc-name'], dict(dest='proc_name', help=SUPPRESS_HELP)),
        (['--project-name'], dict(dest='project_name')),
        (['--no-check'],
         dict(dest='no_check', action='store_true', help=SUPPRESS_HELP)),
        (['-g', '--genome'], dict(dest='genome')),
        (['--email'], dict(dest='email', help=SUPPRESS_HELP)),
        (['--done-marker'], dict(dest='done_marker', help=SUPPRESS_HELP)),
    ]

    parser = OptionParser(description=description)
    for flags, kwargs in options:
        parser.add_option(*flags, **kwargs)

    # List the flags of every required option below the generated usage line.
    req_keys_usage = ''
    if required_keys:
        req_keys_usage = '\nRequired options:'
    for flags, kwargs in options:
        # Options without an explicit dest are legal; they are just skipped.
        dest = kwargs.get('dest')
        if dest is not None and dest in required_keys:
            req_keys_usage += '\n  ' + '/'.join(flags)
    parser.set_usage(parser.get_usage() + req_keys_usage)

    (opts, _) = parser.parse_args()
    logger.is_debug = opts.debug

    # No BED file means the run is configured as whole-genome.
    run_cnf = determine_run_cnf(opts, is_wgs=not opts.__dict__.get('bed'))
    cnf = Config(opts.__dict__, determine_sys_cnf(opts), run_cnf)

    errors = check_keys_presence(cnf, required_keys)
    if errors:
        parser.print_help()
        critical(errors)
    file_keys = [k for k in file_keys if k in required_keys]
    dir_keys = [k for k in dir_keys if k in required_keys]
    errors = check_dirs_and_files(cnf, file_keys, dir_keys)
    if errors:
        critical(errors)

    if cnf.sample:
        cnf.sample = remove_quotes(cnf.sample)
    else:
        if not fpath_for_sample_name:
            if not key_for_sample_name:
                critical('Error: --sample must be provided in options.')

            fpath_for_sample_name = cnf[key_for_sample_name]
            if not fpath_for_sample_name:
                critical('Error: --sample or ' + (str(key_for_sample_name)) +
                         ' must be provided in options.')

        # Derive the name from whichever path is available. An explicitly
        # passed fpath_for_sample_name used to be ignored, leaving cnf.sample
        # unset.
        cnf.sample = basename(fpath_for_sample_name).split('.')[0]

    cnf.caller = remove_quotes(cnf.caller) if cnf.caller else None

    cnf.proc_name = cnf.proc_name or proc_name
    set_up_dirs(cnf)
    info(' '.join(sys.argv))
    info()

    return cnf
Beispiel #13
0
def proc_args(argv):
    """Parse the command line and build the run config and the sample list.

    Positional arguments are handed to ``read_samples`` together with the
    caller name and yield a sample-name -> VCF-path mapping; one
    ``source.VarSample`` is created per entry, with its directory placed
    under the output directory.

    ``argv`` is accepted but not read here: ``parse_args()`` is called with
    no arguments, so optparse takes the options from ``sys.argv[1:]``.

    Returns a ``(cnf, samples)`` tuple, with ``samples`` ordered by each
    sample's ``key_to_sort()``.  Calls ``critical`` when no positional
    arguments are given or, unless ``--only-summary`` is set, when no
    qsub runner is configured.
    """
    info(' '.join(sys.argv))
    info()

    # NOTE(review): the description talks about BAM/target QC while the body
    # below works on VCF files -- looks copy-pasted; confirm intended wording.
    parser = OptionParser(
        description='This script generates target QC reports for each BAM provided as an input.')
    add_cnf_t_reuse_prjname_donemarker_workdir_genome_debug(parser)

    # (flags, keyword settings) pairs, registered in this exact order so the
    # generated help output keeps the same option ordering.
    option_specs = (
        (('--log-dir',), {'dest': 'log_dir'}),
        (('--is-wgs',), {'dest': 'is_wgs',
                         'action': 'store_true',
                         'default': False,
                         'help': 'whole genome sequencing'}),
        (('--is-deep-seq',), {'dest': 'is_deep_seq',
                              'action': 'store_true',
                              'default': False,
                              'help': 'deep targeted sequencing'}),
        (('--only-summary',), {'dest': 'only_summary',
                               'action': 'store_true'}),
        (('-o',), {'dest': 'output_dir',
                   'metavar': 'DIR',
                   'default': join(os.getcwd(), 'targetqc')}),
        (('-c', '--caller'), {'dest': 'caller'}),
        # --qc and --no-qc write to the same destination; both default to False.
        (('--qc',), {'dest': 'qc', 'action': 'store_true', 'default': False}),
        (('--no-qc',), {'dest': 'qc', 'action': 'store_false', 'default': False}),
        (('--qc-caption',), {'dest': 'qc_caption', 'help': SUPPRESS_HELP}),
        (('--no-tsv',), {'dest': 'tsv',
                         'action': 'store_false',
                         'default': True,
                         'help': SUPPRESS_HELP}),
    )
    for flags, settings in option_specs:
        parser.add_option(*flags, **settings)

    options, positional = parser.parse_args()
    logger.is_debug = options.debug

    if not positional:
        critical('No vcf files provided to input.')

    cnf = Config(
        options.__dict__,
        determine_sys_cnf(options),
        determine_run_cnf(options,
                          is_targetseq=options.is_deep_seq,
                          is_wgs=options.is_wgs))

    vcf_fpath_by_sample = read_samples(positional, cnf.caller)
    info()

    # Fall back to the output directory's base name as the project name.
    if not cnf.project_name:
        cnf.project_name = basename(cnf.output_dir)
    info('Project name: ' + cnf.project_name)

    cnf.proc_name = 'Variants'
    set_up_dirs(cnf)
    # The command line is logged a second time after set_up_dirs --
    # presumably so it also reaches whatever logging that call sets up
    # (TODO confirm).
    info(' '.join(sys.argv))

    samples = []
    for sample_name, vcf_path in vcf_fpath_by_sample.items():
        sample_dir = join(cnf.output_dir, sample_name)
        samples.append(source.VarSample(sample_name, sample_dir, vcf=vcf_path))
    samples = sorted(samples, key=lambda sample: sample.key_to_sort())

    check_genome_resources(cnf)

    # The qsub runner is only needed when more than the summary is requested.
    if cnf.only_summary:
        return cnf, samples

    cnf.qsub_runner = adjust_system_path(cnf.qsub_runner)
    if not cnf.qsub_runner:
        # NOTE(review): message text kept verbatim ("is sys-config" is likely
        # a typo for "in sys-config").
        critical('Error: qsub-runner is not provided is sys-config.')
    verify_file(cnf.qsub_runner, is_critical=True)

    return cnf, samples