Example #1
0
def _tracks(cnf, track_fpath, input_fpath):
    if not verify_file(track_fpath):
        return None

    field_name = splitext_plus(basename(track_fpath))[0]

    step_greetings('Intersecting with ' + field_name)

    output_fpath = intermediate_fname(cnf, input_fpath, field_name)
    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath

    toolpath = get_system_path(cnf, 'vcfannotate')
    if not toolpath:
        err('WARNING: Skipping annotation with tracks: vcfannotate '
            'executable not found, you probably need to specify path in system_config, or '
            'run load bcbio:  . /group/ngs/bin/bcbio-prod.sh"')
        return None

    # self.all_fields.append(field_name)

    cmdline = '{toolpath} -b {track_fpath} -k {field_name} {input_fpath}'.format(
        **locals())

    assert input_fpath
    output_fpath = call_subprocess(cnf,
                                   cmdline,
                                   input_fpath,
                                   output_fpath,
                                   stdout_to_outputfile=True,
                                   overwrite=True)
    if not verify_vcf(output_fpath):
        err('Error: tracks resulted ' + str(output_fpath) + ' for ' +
            track_fpath)
        return output_fpath

    # Set TRUE or FALSE for tracks
    def proc_line(line, i):
        if field_name in line:
            if not line.startswith('#'):
                fields = line.split('\t')
                info_line = fields[7]
                info_pairs = [attr.split('=') for attr in info_line.split(';')]
                info_pairs = [[pair[0], ('TRUE' if pair[1] else 'FALSE')] if
                              pair[0] == field_name and len(pair) > 1 else pair
                              for pair in info_pairs]
                info_line = ';'.join(
                    '='.join(pair) if len(pair) == 2 else pair[0]
                    for pair in info_pairs)
                fields = fields[:7] + [info_line] + fields[8:]
                return '\t'.join(fields)
        return line

    assert output_fpath
    output_fpath = iterate_file(cnf, output_fpath, proc_line, suffix='trk')
    return verify_vcf(output_fpath, is_critical=True)
Example #2
0
def _snpsift_db_nsfp(cnf, input_fpath):
    if 'dbnsfp' not in cnf.annotation or 'dbnsfp' not in cnf.genome:
        return None

    step_greetings('DB SNFP')

    output_fpath = intermediate_fname(cnf, input_fpath, 'db_nsfp')
    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath

    executable = get_java_tool_cmdline(cnf, 'snpsift')

    db_path = cnf['genome']['dbnsfp']
    if not verify_file(db_path, 'DB NSFP file'):
        err('DB NSFP file is incorrect. Skipping.')
        return None

    annotations = cnf.annotation['dbnsfp'].get('annotations') or []

    # all_fields.extend(['dbNSFP_' + ann for ann in annotations])

    ann_line = ('-f ' + ','.join(annotations)) if annotations else ''

    cmdline = '{executable} dbnsfp {ann_line} -v -db {db_path} ' \
              '{input_fpath}'.format(**locals())
    if call_subprocess(cnf,
                       cmdline,
                       input_fpath,
                       output_fpath,
                       stdout_to_outputfile=True,
                       exit_on_error=False,
                       overwrite=True):
        return verify_vcf(output_fpath, is_critical=True)
    else:
        return None
def combine_vcfs(cnf,
                 vcf_fpath_by_sname,
                 combined_vcf_fpath,
                 additional_parameters=''):
    gatk = get_java_tool_cmdline(cnf, 'gatk')
    if not gatk:
        info('GATK is not found, skipping merging VCFs')
        return None

    cmdl = '{gatk} -T CombineVariants -R {cnf.genome.seq} {additional_parameters}'.format(
        **locals())
    for s_name, vcf_fpath in vcf_fpath_by_sname.items():
        if vcf_fpath:
            cmdl += ' --variant:' + s_name + ' ' + vcf_fpath
    if ' --variant:' not in cmdl:
        err('No VCFs to combine')
        return None

    if cnf.reuse_intermediate and isfile(
            combined_vcf_fpath + '.gz') and verify_vcf(combined_vcf_fpath +
                                                       '.gz'):
        info(combined_vcf_fpath + '.gz exists, reusing')
        return combined_vcf_fpath + '.gz'

    cmdl += ' -o ' + combined_vcf_fpath
    res = call(cnf,
               cmdl,
               output_fpath=combined_vcf_fpath,
               stdout_to_outputfile=False,
               exit_on_error=False)
    if res:
        info('Joined VCFs, saved into ' + combined_vcf_fpath)
        if isfile(combined_vcf_fpath + '.tx.idx'):
            try:
                os.remove(combined_vcf_fpath + '.tx.idx')
            except OSError:
                err(traceback.format_exc())
                info()
        return bgzip_and_tabix(cnf, combined_vcf_fpath)
    else:
        warn('Could not join VCFs')
        return None
Example #4
0
def add_annotation(cnf, input_fpath, key, value, number, type_, description):
    step_greetings('Adding annotation...')

    def proc_rec(rec):
        rec.INFO[key] = value
        return rec

    output_fpath = iterate_vcf(cnf, input_fpath, proc_rec)

    info('Adding header meta info...')

    def _add_format_header(l, i):
        if l.startswith('#CHROM'):
            ext_l = ''
            ext_l += '##INFO=<ID={key},Number={number},Type={type_},Description="{desc}">\n'.format(
                key=key, number=number, type_=type_, desc=description)
            return ext_l + l
        return l

    output_fpath = iterate_file(cnf, output_fpath, _add_format_header)
    return verify_vcf(output_fpath, is_critical=True)
def _annotate(cnf, samples):
    varannotate_cmdl = (get_script_cmdline(
        cnf, 'python', join('scripts', 'post', 'varannotate.py')) +
                        ' --sys-cnf ' + cnf.sys_cnf + ' --run-cnf ' +
                        cnf.run_cnf + ' --project-name ' + cnf.project_name +
                        (' --reuse ' if cnf.reuse_intermediate else '') +
                        ' --log-dir -' + ' --genome ' + cnf.genome.name +
                        (' --no-check ' if cnf.no_check else '') +
                        (' --qc' if cnf.qc else ' --no-qc') +
                        ((' --caller ' + cnf.caller) if cnf.caller else ''))

    total_reused = 0
    total_processed = 0
    total_success = 0
    total_failed = 0

    not_submitted_samples = samples
    while not_submitted_samples:
        jobs_to_wait = []
        submitted_samples = []
        reused_samples = []
        for sample in not_submitted_samples:
            if not sample.varannotate_dirpath:
                sample.varannotate_dirpath = join(sample.dirpath,
                                                  source.varannotate_name)
            if not sample.anno_vcf_fpath:
                sample.anno_vcf_fpath = join(
                    sample.varannotate_dirpath,
                    add_suffix(basename(sample.vcf), 'anno'))
            output_fpath = sample.anno_vcf_fpath
            if not output_fpath.endswith('.gz'):
                output_fpath += '.gz'
            debug('Checking ' + output_fpath)
            if cnf.reuse_intermediate and isfile(output_fpath) and verify_vcf(
                    output_fpath):
                info('Annotated results ' + output_fpath + ' exist, reusing.')
                reused_samples.append(sample)
                info()
                continue

            work_dir = join(cnf.work_dir,
                            source.varannotate_name + '_' + sample.name)
            j = submit_job(
                cnf,
                cmdline=varannotate_cmdl + ' --vcf ' + sample.vcf + ' -o ' +
                sample.varannotate_dirpath + ' -s ' + sample.name +
                ' --work-dir ' + work_dir + ' --output-file ' + output_fpath,
                job_name='VA_' + cnf.project_name + '_' + sample.name,
                output_fpath=output_fpath,
                stdout_to_outputfile=False,
                work_dir=work_dir)
            if not j.is_done:
                jobs_to_wait.append(j)
            submitted_samples.append(sample)
            if len(jobs_to_wait) >= cnf.threads:
                not_submitted_samples = [
                    s for s in not_submitted_samples
                    if s not in submitted_samples and s not in reused_samples
                ]

                if not_submitted_samples:
                    info('Submitted ' + str(len(jobs_to_wait)) +
                         ' jobs, waiting them to finish before '
                         'submitting more ' + str(len(not_submitted_samples)))
                else:
                    info('Submitted ' + str(len(jobs_to_wait)) + ' last jobs.')
                info()
                break
            info()

        info()
        info('-' * 70)
        if jobs_to_wait:
            info('Submitted ' + str(len(jobs_to_wait)) + ' jobs, waiting...')
            jobs_to_wait = wait_for_jobs(cnf, jobs_to_wait)
        else:
            info('No annotation jobs to submit.')
        info('')
        info('-' * 70)
        info('Finihsed annotating ' + str(len(jobs_to_wait)) + ' jobs')
        for j in jobs_to_wait:
            if j.is_done and not j.is_failed and not verify_vcf(
                    j.output_fpath):
                j.is_failed = True
            if j.is_done and not j.is_failed:
                if isdir(j.work_dir):
                    os.system('rm -rf ' + j.work_dir)
                else:
                    err('Job was done, but j.work_dir ' + j.work_dir +
                        ' does not exist')

        processed = sum(1 for j in jobs_to_wait if j.is_done)
        failed = sum(1 for j in jobs_to_wait if j.is_failed)
        success = sum(1 for j in jobs_to_wait if j.is_done and not j.is_failed)
        total_failed += failed
        total_reused += len(reused_samples)
        total_processed += processed
        total_success += success
        info('Reused: ' + str(len(reused_samples)))
        info('Processed: ' + str(processed))
        info('Success: ' + str(success))
        info('Failed: ' + str(failed))
        info()

        not_submitted_samples = [
            s for s in not_submitted_samples
            if s not in submitted_samples and s not in reused_samples
        ]

    info('-' * 70)
    info('Done with all ' + str(len(samples)) + ' samples.')
    info('Total reused: ' + str(total_reused))
    info('Total processed: ' + str(total_processed))
    info('Total success: ' + str(total_success))
    info('Total failed: ' + str(total_failed))
    info()
def main(args):
    cnf = read_opts_and_cnfs(
        extra_opts=[
            (['--vcf', '--var'], dict(
                dest='vcf',
                help='variants to filter')
             ),
            (['--vcf2txt'], dict(
                dest='vcf2txt',
                help='variants in vcf2txt to filter')
             ),
            (['--cohort-freqs'], dict(
                dest='cohort_freqs_fpath',
                help='frequencies of variants in a cohort')
             ),
            (['--qc'], dict(
                dest='qc',
                action='store_true',
                default=True,
                help=SUPPRESS_HELP)
             ),
            (['--no-qc'], dict(
                dest='qc',
                action='store_false',
                help=SUPPRESS_HELP)
             ),
            (['--no-tsv'], dict(
                dest='tsv',
                action='store_false',
                default=True,
                help=SUPPRESS_HELP)
             ),
        ],
        required_keys=['vcf'],
        file_keys=['vcf'],
        key_for_sample_name='vcf',
        proc_name=source.varfilter_name + '_post')

    check_system_resources(cnf, required=['perl'])
    check_genome_resources(cnf)

    if not cnf.output_file:
        cnf.output_file = join(cnf.output_dir, (cnf.caller or 'variants') + '.txt')

    safe_mkdir(dirname(cnf.output_file))
    safe_mkdir(cnf.output_dir)

    if cnf.vcf.endswith('.vcf.gz') or cnf.vcf.endswith('.vcf'):
        verify_vcf(cnf.vcf, is_critical=True)

    if not cnf.vcf2txt:
        vcf2txt_res_fpath = run_vcf2txt(cnf, {cnf.sample: cnf.vcf}, cnf.output_file)
        if not vcf2txt_res_fpath:
            critical('vcf2txt run returned non-0')
        info('Saved vcf2txt output to ' + vcf2txt_res_fpath)
    else:
        cnf.vcf2txt = verify_file(cnf.vcf2txt, is_critical=True)
        info('Input is vcf2txt output, grepping by sample name ' + cnf.sample)
        vcf2txt_res_fpath = cnf.output_file
        with file_transaction(cnf.work_dir, vcf2txt_res_fpath) as tx:
            with open(cnf.vcf2txt) as f, open(tx, 'w') as out:
                for i, l in enumerate(f):
                    if l.strip():
                        if i == 0:
                            out.write(l)
                        else:
                            if l.split('\t')[0] == cnf.sample:
                                out.write(l)
        info('Using vcf2txt from ' + vcf2txt_res_fpath)

    # if is_local():
    #     vardict2mut_pl = get_script_cmdline(cnf, 'perl', join('VarDict', 'vardict2mut.pl'))
    #     info('Running vardict2mut perl')
    #     res = run_vardict2mut(cnf, vcf2txt_res_fpath,
    #         add_suffix(vcf2txt_res_fpath, source.mut_pass_suffix + '_perl'),
    #         vardict2mut_executable=vardict2mut_pl)
    #     if not res:
    #         critical('vardict2mut.pl run returned non-0')

    mut_fpath = run_vardict2mut(cnf, vcf2txt_res_fpath, add_suffix(vcf2txt_res_fpath, variant_filtering.mut_pass_suffix))
    if not mut_fpath:
        err('vardict2mut failed')
    else:
        info('Saved passed mutations to ' + mut_fpath)

        var_s = source.VarSample(cnf.sample, cnf.output_dir)
        var_s.anno_vcf_fpath = cnf.vcf
        var_s.varfilter_dirpath = var_s.dirpath

        ungz_anno_vcf_fpath = var_s.anno_vcf_fpath if not var_s.anno_vcf_fpath.endswith('.gz') else splitext(var_s.anno_vcf_fpath)[0]
        ungz_filt_vcf_fpath = join(cnf.output_dir, add_suffix(basename(ungz_anno_vcf_fpath), 'filt'))
        var_s.filt_vcf_fpath = ungz_filt_vcf_fpath + '.gz'

        var_s.variants_fpath = vcf2txt_res_fpath
        var_s.variants_pass_fpath = add_suffix(vcf2txt_res_fpath, source.mut_pass_suffix)

        ungz_pass_filt_vcf_fpath = add_suffix(ungz_filt_vcf_fpath, 'pass')
        var_s.pass_filt_vcf_fpath = add_suffix(var_s.filt_vcf_fpath, 'pass')

        filt_vcf = write_vcf(cnf, var_s, cnf.output_dir, cnf.caller, vcf2txt_res_fpath, mut_fpath)
        index_vcf(cnf, var_s.name, filt_vcf, cnf.caller)
        index_vcf(cnf, var_s.name, ungz_pass_filt_vcf_fpath, cnf.caller)

        if cnf.qc:
            report = qc.make_report(cnf, var_s.pass_filt_vcf_fpath, var_s)
            qc_dirpath = join(cnf.output_dir, 'qc')
            safe_mkdir(qc_dirpath)
            qc.save_report(cnf, report, var_s, cnf.caller, qc_dirpath, source.varqc_after_name)
            info('Saved QC to ' + qc_dirpath + ' (' + report.html_fpath + ')')
            info('-' * 70)
            info()

        if not cnf['keep_intermediate']:
            shutil.rmtree(cnf['work_dir'])

        info()
        info('*' * 70)
        info('Done filtering ' + var_s.name)
def convert_vardict_txts_to_bcbio_vcfs(cnf,
                                       bs,
                                       sample,
                                       output_dir=None,
                                       pass_only=False):
    info('')
    info('Preparing data for ' + sample.name)
    anno_filt_vcf_fpath = sample.find_filt_vcf_by_callername(cnf.caller_name)
    if not anno_filt_vcf_fpath:
        return None, None

    if not output_dir:
        output_dir = cnf.output_dir or os.path.dirname(anno_filt_vcf_fpath)
    output_vcf_fpath = join(
        output_dir, sample.name + '-' + cnf.caller_name + filt_vcf_ending)
    pass_output_vcf_fpath = add_suffix(output_vcf_fpath, 'pass')
    if cnf.reuse_intermediate and verify_vcf(
            output_vcf_fpath + '.gz') and verify_vcf(pass_output_vcf_fpath +
                                                     '.gz'):
        info(output_vcf_fpath + '.gz and ' + pass_output_vcf_fpath +
             '.gz exists, reusing')
        return output_vcf_fpath + '.gz', pass_output_vcf_fpath + '.gz'

    info('Parsing PASS and REJECT mutations...')
    pass_mut_dict, reject_mut_dict, filter_values = get_mutation_dicts(
        cnf, bs, sample, pass_only=pass_only)
    sorted_mut_dict = combine_mutations(pass_mut_dict, reject_mut_dict)

    info('')
    info('Writing VCFs')
    vcf_reader = vcf.Reader(open_gzipsafe(anno_filt_vcf_fpath, 'r'))
    vcf_reader = add_keys_to_header(vcf_reader, filter_values)
    with file_transaction(cnf.work_dir, output_vcf_fpath) as filt_tx, \
        file_transaction(cnf.work_dir, pass_output_vcf_fpath) as pass_tx:
        vcf_writer = None
        if not pass_only:
            vcf_writer = vcf.Writer(open(filt_tx, 'w'), template=vcf_reader)
        vcf_pass_writer = vcf.Writer(open(pass_tx, 'w'), template=vcf_reader)
        for key, mut in sorted_mut_dict.items():
            record = get_record_from_vcf(vcf_reader, mut)
            if record:
                if key in pass_mut_dict:
                    record.FILTER = ['PASS']
                    if mut.reason:
                        record.INFO['Reason'] = mut.reason.replace(' ', '_')
                elif pass_only:
                    continue
                elif key in reject_mut_dict:
                    if not mut.reason:
                        continue
                    reject_reason_ids = [
                        filter_descriptions_dict[reason]
                        if reason in filter_descriptions_dict else reason
                        for reason in mut.reason.split(' and ')
                    ]
                    record.FILTER = [';'.join(reject_reason_ids)]
                if mut.signif:
                    record.INFO['Signif'] = mut.signif
                if mut.status:
                    record.INFO['Status'] = mut.status
                if vcf_writer:
                    vcf_writer.write_record(record)
                if key in pass_mut_dict:
                    vcf_pass_writer.write_record(record)
            else:
                warn('No record was found in ' + anno_filt_vcf_fpath +
                     ' for mutation ' + str(mut))

    output_gzipped_vcf_fpath = None
    if vcf_writer:
        vcf_writer.close()
        output_gzipped_vcf_fpath = bgzip_and_tabix(cnf, output_vcf_fpath)
        info('VCF file for vardict.txt is saved to ' +
             output_gzipped_vcf_fpath)
    vcf_pass_writer.close()
    output_gzipped_pass_vcf_fpath = bgzip_and_tabix(cnf, pass_output_vcf_fpath)
    info('VCF file for vardict.PASS.txt is saved to ' +
         output_gzipped_pass_vcf_fpath)
    return output_gzipped_vcf_fpath, output_gzipped_pass_vcf_fpath
Example #8
0
def read_samples(args, caller_name=None):
    vcf_fpath_by_sample = OrderedDict()
    bad_vcf_fpaths = []

    info('Reading samples...')

    if len(args) == 1:
        first_fpath = args[0]
        if not first_fpath.endswith('.vcf') and not first_fpath.endswith(
                '.vcf.gz'):  # TODO: check ##fileformat=VCF ?
            info(
                'First argument file name does not look like VCF, assuming TSV with files names'
            )

            with open(first_fpath) as f:
                for i, l in enumerate(f):
                    fs = l.strip().split('\t')
                    if len(fs) != 2:
                        critical('Line ' + str(i) + ' has only ' +
                                 str(len(fs)) +
                                 ' fields. Expecting 2 (sample and vcf_fpath)')
                    sn, vcf_fpath = fs
                    if not verify_file(vcf_fpath):
                        bad_vcf_fpaths.append(vcf_fpath)
                    vcf_fpath_by_sample[sn] = adjust_path(vcf_fpath)

            if bad_vcf_fpaths:
                critical('VCF files cannot be found, empty or not VCFs:' +
                         ', '.join(bad_vcf_fpaths))
            info('Done reading ' + str(len(vcf_fpath_by_sample)) + ' samples')
            return vcf_fpath_by_sample

    for arg in args or [os.getcwd()]:
        vcf_fpath = verify_vcf(arg.split(',')[0])
        if not verify_file(vcf_fpath):
            bad_vcf_fpaths.append(vcf_fpath)
        if len(arg.split(',')) > 1:
            sn = arg.split(',')[1]
        else:
            sn = basename(splitext_plus(vcf_fpath)[0])
            if caller_name and sn.endswith('-' + caller_name):
                sn = sn[:-len(caller_name) - 1]
            info('  ' + sn)
        if sn in vcf_fpath_by_sample:
            if vcf_fpath_by_sample[sn] != vcf_fpath:
                warn('Duplicated record ' + sn +
                     ', VCF file is different (existing: ' +
                     vcf_fpath_by_sample[sn] + ', new: ' + vcf_fpath + ')')
            else:
                warn('Duplicated record ' + sn + ', VCF file is the same: ' +
                     vcf_fpath)
        else:
            vcf_fpath_by_sample[sn] = vcf_fpath
    if bad_vcf_fpaths:
        critical('VCF files cannot be found, empty or not VCFs:' +
                 ', '.join(bad_vcf_fpaths))
    info('Done reading ' + str(len(vcf_fpath_by_sample)) + ' samples')

    # TODO: read sample names from VCF
    # def get_main_sample(self, main_sample_index=None):
    #     if len(self._sample_indexes) == 0:
    #         return None
    #     if main_sample_index is not None:
    #         return self.samples[main_sample_index]
    #     try:
    #         sample_index = [sname.lower() for sname in self._sample_indexes] \
    #                         .index(self.sample_name_from_file.lower())
    #     except ValueError:
    #         return self.samples[0]
    #     else:
    #         return self.samples[sample_index]

    return vcf_fpath_by_sample
Example #9
0
def _snpeff(cnf, input_fpath):
    if 'snpeff' not in cnf.annotation or 'snpeff' not in cnf.genome:
        return None, None, None

    step_greetings('SnpEff')

    output_fpath = intermediate_fname(cnf, input_fpath, 'snpEff')
    stats_fpath = join(
        cnf.work_dir, cnf.sample + (('-' + cnf.caller) if cnf.caller else '') +
        '.snpEff_summary.csv')

    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath, stats_fpath, splitext(
            stats_fpath)[0] + '.genes.txt'

    snpeff = get_java_tool_cmdline(cnf, 'snpeff')

    ref_name = cnf.genome.snpeff.reference or cnf.genome.name
    if ref_name.startswith('hg19') or ref_name.startswith('GRCh37'):
        ref_name = 'GRCh37.75'
    if ref_name.startswith('hg38'): ref_name = 'GRCh38.82'

    opts = ''
    if cnf.annotation.snpeff.cancer: opts += ' -cancer'

    assert cnf.transcripts_fpath, 'Transcript for annotation must be specified!'
    verify_file(cnf.transcripts_fpath,
                'Transcripts for snpEff -onlyTr',
                is_critical=True)
    opts += ' -onlyTr ' + cnf.transcripts_fpath + ' '

    db_path = adjust_system_path(cnf.genome.snpeff.data)
    if db_path:
        opts += ' -dataDir ' + db_path
    elif cnf.resources.snpeff.config:
        conf = get_system_path(cnf, cnf.resources.snpeff.config)
        if conf:
            opts += ' -c ' + conf + ' '
        else:
            err('Cannot find snpEff config file ' +
                str(cnf.resources.snpeff.config))

    if cnf.annotation.snpeff.extra_options:
        opts += ''

    if not cnf.no_check:
        info('Removing previous snpEff annotations...')
        res = remove_prev_eff_annotation(cnf, input_fpath)
        if not res:
            err('Could not remove preivous snpEff annotations')
            return None, None, None
        input_fpath = res

    snpeff_type = get_snpeff_type(snpeff)
    if snpeff_type == "old":
        opts += ' -stats ' + stats_fpath + ' -csvStats'
    else:
        opts += ' -csvStats ' + stats_fpath

    cmdline = '{snpeff} eff {opts} -noLog -i vcf -o vcf {ref_name} {input_fpath}'.format(
        **locals())

    for i in range(1, 20):
        try:
            res = call_subprocess(cnf,
                                  cmdline,
                                  input_fpath,
                                  output_fpath,
                                  exit_on_error=False,
                                  stdout_to_outputfile=True,
                                  overwrite=True)
        except OSError:
            import traceback, time
            err(traceback.format_exc())
            warn()
            info('Waiting 1 minute')
            time.sleep(60)
            info('Rerunning ' + str(i))
        else:
            break

    output_fpath = verify_vcf(output_fpath, is_critical=True)

    snpeff_summary_html_fpath = 'snpEff_summary.html'
    if isfile(snpeff_summary_html_fpath):
        info('SnpEff created ' + snpeff_summary_html_fpath +
             ' in the cwd, removing it...')
        try:
            os.remove(snpeff_summary_html_fpath)
        except OSError:
            pass

    if res:
        return output_fpath, stats_fpath, splitext(
            stats_fpath)[0] + '.genes.txt'
    else:
        return None, None, None
Example #10
0
def _snpsift_annotate(cnf, vcf_conf, dbname, input_fpath):
    if not vcf_conf:
        err('No database for ' + dbname + ', skipping.')
        return None

    step_greetings('Annotating with ' + dbname)

    output_fpath = intermediate_fname(cnf, input_fpath, dbname)
    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath

    executable = get_java_tool_cmdline(cnf, 'snpsift')
    java = get_system_path(cnf, 'java')
    info('Java version:')
    call(cnf, java + ' -version')
    info()

    db_path = cnf['genome'].get(dbname)
    if not db_path:
        db_path = vcf_conf.get('path')
        if not db_path:
            err('Please, provide a path to ' + dbname +
                ' in the "genomes" section in the system config. The config is: '
                + str(cnf['genome']))
            return
        verify_file(db_path, is_critical=True)

    annotations = vcf_conf.get('annotations')

    if not cnf.no_check:
        info('Removing previous annotations...')

        def delete_annos(rec):
            for anno in annotations:
                if anno in rec.INFO:
                    del rec.INFO[anno]
            return rec

        if annotations:
            input_fpath = iterate_vcf(cnf,
                                      input_fpath,
                                      delete_annos,
                                      suffix='d')

    anno_line = ''
    if annotations:
        anno_line = '-info ' + ','.join(annotations)

    cmdline = '{executable} annotate -v {anno_line} {db_path} {input_fpath}'.format(
        **locals())
    output_fpath = call_subprocess(cnf,
                                   cmdline,
                                   input_fpath,
                                   output_fpath,
                                   stdout_to_outputfile=True,
                                   exit_on_error=False,
                                   overwrite=True)
    if not output_fpath:
        err('Error: snpsift resulted ' + str(output_fpath) + ' for ' + dbname)
        return output_fpath
    verify_vcf(output_fpath, is_critical=True)
    # f = open(output_fpath)
    # l = f.readline()
    # if 'Cannot allocate memory' in l:
    #     f.close()
    #     f = open(output_fpath)
    #     contents = f.read()
    #     critical('SnpSift failed with memory issue:\n' + contents)
    #     f.close()
    #     return None

    if not cnf.no_check:
        info_pattern = re.compile(
            r'''\#\#INFO=<
            ID=(?P<id>[^,]+),\s*
            Number=(?P<number>-?\d+|\.|[AG]),\s*
            Type=(?P<type>Integer|Float|Flag|Character|String),\s*
            Description="(?P<desc>[^"]*)"
            >''', re.VERBOSE)

        def _fix_after_snpsift(line, i, ctx):
            if not line.startswith('#'):
                if not ctx['met_CHROM']:
                    return None
                line = line.replace(' ', '_')
                assert ' ' not in line

            # elif line.startswith('##INFO=<ID=om'):
            #     line = line.replace(' ', '')

            elif not ctx['met_CHROM'] and line.startswith('#CHROM'):
                ctx['met_CHROM'] = True

            elif line.startswith('##INFO'):
                m = info_pattern.match(line)
                if m:
                    line = '##INFO=<ID={0},Number={1},Type={2},Description="{3}">'.format(
                        m.group('id'), m.group('number'), m.group('type'),
                        m.group('desc'))
            return line

        output_fpath = iterate_file(cnf,
                                    output_fpath,
                                    _fix_after_snpsift,
                                    suffix='fx',
                                    ctx=dict(met_CHROM=False))

    return verify_vcf(output_fpath, is_critical=True)
Example #11
0
def run_annotators(cnf, vcf_fpath, bam_fpath):
    original_vcf = cnf.vcf

    db_section_by_name = OrderedDict(
        (dbname, cnf.annotation[dbname])
        for dbname in ['dbsnp', 'clinvar', 'cosmic', 'oncomine']
        if dbname in cnf.annotation
        and not cnf.annotation[dbname].get('skip-annotation'))

    # if not cnf.no_check:
    #     to_delete_id_ref = []
    #     if 'dbsnp' in db_section_by_name.keys():
    #         info('Removing IDs from dbsnp as rs*')
    #         to_delete_id_ref.append('rs')
    #     if 'cosmic' in db_section_by_name.keys():
    #         info('Removing IDs from dbsnp as COS*')
    #         to_delete_id_ref.append('COS')
    #
    #     def delete_ids(rec):  # deleting existing dbsnp and cosmic ID annotations
    #         if rec.ID:
    #             if isinstance(rec.ID, basestring):
    #                 if any(rec.ID.startswith(pref) for pref in to_delete_id_ref):
    #                     rec.ID = None
    #             else:
    #                 rec.ID = [id_ for id_ in rec.ID if not any(id_.startswith(pref) for pref in to_delete_id_ref)]
    #
    #         if not rec.FILTER:
    #             rec.FILTER = 'PASS'
    #
    #         return rec
    #
    #     info('Removing previous rs* and COS* IDs')
    #     vcf_fpath = iterate_vcf(cnf, vcf_fpath, delete_ids, suffix='delID')

    bcftools = get_system_path(cnf, 'bcftools')

    if not vcf_fpath.endswith('.gz') or not file_exists(vcf_fpath + '.tbi'):
        vcf_fpath = bgzip_and_tabix(cnf, vcf_fpath)

    cmdl = '{bcftools} annotate --remove ID {vcf_fpath}'
    res = call(cnf,
               cmdl.format(**locals()),
               output_fpath=add_suffix(rm_gz_ext(vcf_fpath), 'rmid'))
    if res:
        vcf_fpath = res
        vcf_fpath = bgzip_and_tabix(cnf, vcf_fpath)

    for dbname, dbconf in db_section_by_name.items() + cnf.annotation.get(
            'custom_vcfs', dict()).items():
        step_greetings('Annotating using ' + dbname)
        annotations = ','.join('INFO/' + a for a in dbconf.get('annotations'))
        if dbname in ('cosmic', 'dbsnp'):
            annotations += ',=ID'
        db_fpath = get_db_path(cnf, dbconf, dbname)
        if db_fpath:
            cmdl = '{bcftools} annotate -a ' + db_fpath + ' -c ' + annotations + ' {vcf_fpath}'
            res = call(cnf,
                       cmdl.format(**locals()),
                       output_fpath=add_suffix(rm_gz_ext(vcf_fpath), dbname))
            if res:
                vcf_fpath = res
                vcf_fpath = bgzip_and_tabix(cnf, vcf_fpath)

    verify_vcf(vcf_fpath, is_critical=True)

    if 'dbnsfp' in cnf.annotation:
        res = _snpsift_db_nsfp(cnf, vcf_fpath)
        if res:
            vcf_fpath = res

    if 'snpeff' in cnf.annotation:
        res, summary_fpath, genes_fpath = _snpeff(cnf, vcf_fpath)
        if res:
            vcf_fpath = res
            verify_vcf(vcf_fpath, is_critical=True)
            final_summary_fpath = join(cnf.output_dir, basename(summary_fpath))
            final_genes_fpath = join(cnf.output_dir, basename(genes_fpath))
            if isfile(final_summary_fpath): os.remove(final_summary_fpath)
            if isfile(final_genes_fpath): os.remove(final_genes_fpath)
            if file_exists(summary_fpath):
                shutil.move(summary_fpath, final_summary_fpath)
            if file_exists(genes_fpath):
                shutil.move(genes_fpath, final_genes_fpath)

    if 'tracks' in cnf.annotation and cnf.annotation[
            'tracks'] and cnf.annotation['tracks']:
        track_fapths = []
        for track_name in cnf.annotation['tracks']:
            if isfile(track_name) and verify_file(track_name):
                track_fapths.append(track_name)
            else:
                if 'tracks' in cnf['genome'] and cnf['genome'][
                        'tracks'] and track_name in cnf['genome']['tracks']:
                    track_fpath = cnf['genome']['tracks'][track_name]
                    if verify_file(track_fpath):
                        track_fapths.append(track_fpath)
        for track_fapth in track_fapths:
            res = _tracks(cnf, track_fapth, vcf_fpath)
            if res:
                vcf_fpath = res

    step_greetings('Intersection with database VCFs...')
    if 'intersect_with' in cnf.annotation:
        for key, db_fpath in cnf.annotation['intersect_with'].items():
            res = intersect_vcf(cnf,
                                input_fpath=vcf_fpath,
                                db_fpath=db_fpath,
                                key=key)
            if res:
                vcf_fpath = res

    if 'mongo' in cnf.annotation:
        res = _mongo(cnf, vcf_fpath)
        if res:
            vcf_fpath = res

    return vcf_fpath