Code example #1
def bgzip_and_tabix(cnf, vcf_fpath, tabix_parameters='', **kwargs):
    gzipped_fpath = vcf_fpath + '.gz'
    tbi_fpath = gzipped_fpath + '.tbi'

    if cnf.reuse_intermediate and \
           file_exists(gzipped_fpath) and \
           file_exists(tbi_fpath) and getctime(tbi_fpath) >= getctime(gzipped_fpath):
        info('Actual compressed VCF and index exist, reusing')
        return gzipped_fpath

    info('Compressing and tabixing VCF file, writing ' + gzipped_fpath + '(.tbi)')
    bgzip = get_system_path(cnf, 'bgzip')
    tabix = get_system_path(cnf, 'tabix')
    if not bgzip:
        err('Cannot index VCF because bgzip is not found in PATH or ' + cnf.sys_cnf)
    if not tabix:
        err('Cannot index VCF because tabix is not found in PATH or ' + cnf.sys_cnf)
    if not bgzip or not tabix:
        return vcf_fpath

    retrying = False
    while True:
        if isfile(tbi_fpath): os.remove(tbi_fpath)
        if isfile(vcf_fpath):
            if isfile(gzipped_fpath):
                os.remove(gzipped_fpath)
            info('BGzipping VCF')
            cmdline = '{bgzip} {vcf_fpath}'.format(**locals())
            call(cnf, cmdline, None, **kwargs)
        else:
            if not verify_file(gzipped_fpath):
                err('Neither uncompressed ' + vcf_fpath + ' nor ' + gzipped_fpath + ' exist')
                return None

        info('Tabixing VCF')
        cmdline = '{tabix} {tabix_parameters} {gzipped_fpath}'.format(**locals())

        # Fail hard only on the second attempt
        kwargs['exit_on_error'] = retrying
        call(cnf, cmdline, **kwargs)
        if isfile(tbi_fpath):
            break
        if retrying:
            critical('Cannot tabix ' + vcf_fpath)
        if not isfile(vcf_fpath):
            call(cnf, 'gunzip ' + gzipped_fpath, None)
        retrying = True

    return gzipped_fpath
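
For comparison, the same compress-and-index flow can be written as a minimal, standard-library-only sketch. It assumes the htslib bgzip and tabix binaries are on PATH and omits the reuse and retry logic above.

import subprocess

def bgzip_and_tabix_simple(vcf_fpath):
    # bgzip -f compresses in place: sample.vcf becomes sample.vcf.gz
    subprocess.check_call(['bgzip', '-f', vcf_fpath])
    gzipped_fpath = vcf_fpath + '.gz'
    # tabix -f -p vcf writes the index next to the input: sample.vcf.gz.tbi
    subprocess.check_call(['tabix', '-f', '-p', 'vcf', gzipped_fpath])
    return gzipped_fpath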
Code example #2
def proc_args(argv):
    group1_name = 'Resistant'
    group2_name = 'Sensitive'

    description = 'This script finds genes with mutations present in (almost) all samples of one group ' \
                  'and (almost) absent in the other group ' \
                  '(default group names: Resistant vs Sensitive). Input is PASS.txt files from bcbio-postproc.'
    parser = OptionParser(description=description)
    parser.add_option(
        '-n',
        '--num-samples-limit',
        dest='ns',
        default=1,
        type=int,
        help=
        'For each reported gene: max number of samples WITHOUT the gene in group1, '
        'max number of samples WITH the gene in group2')

    (opts, args) = parser.parse_args(argv)

    if len(args) == 0:
        critical('No PASS.txt files provided to input.')

    variants_fpaths = [fpath for fpath in args if file_exists(fpath)]
    return opts, [group1_name, group2_name], variants_fpaths
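
A hypothetical invocation (the PASS.txt paths are placeholders); note that nonexistent input paths are silently dropped:

opts, (group1, group2), variants_fpaths = proc_args(
    ['-n', '2', 'resistant_sample1.PASS.txt', 'sensitive_sample1.PASS.txt'])
# opts.ns == 2; group1 == 'Resistant'; group2 == 'Sensitive'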
Code example #3
def set_up_log(cnf,
               proc_name=None,
               project_name=None,
               project_fpath=None,
               output_dir=None):
    logger.proc_name = proc_name
    logger.project_name = project_name
    logger.project_fpath = project_fpath or output_dir
    logger.cnf_address = remove_quotes(cnf.email) if cnf.email else ''
    logger.smtp_host = cnf.smtp_host

    if cnf.log_dir:
        log_fname = (proc_name + '_' if proc_name else '') + \
                    (cnf.sample + '_' if cnf.sample else '') + 'log.txt'
        log_fpath = join(cnf.log_dir, log_fname)

        if file_exists(log_fpath):
            timestamp = datetime.datetime.fromtimestamp(os.stat(log_fpath).st_mtime)
            mv_log_fpath = log_fpath + '.' + timestamp.strftime('%Y-%m-%d_%H-%M-%S')
            try:
                if isfile(mv_log_fpath):
                    os.remove(mv_log_fpath)
                os.rename(log_fpath, mv_log_fpath)
            except OSError:
                pass
        info('log_fpath: ' + log_fpath)
        info()
        logger.log_fpath = cnf.log = log_fpath
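
The rotation step in isolation, as a standard-library-only sketch (the function name is illustrative): an existing log is renamed with its own mtime as a suffix before a new log is started.

import datetime
import os

def rotate_log(log_fpath):
    if not os.path.isfile(log_fpath):
        return
    mtime = datetime.datetime.fromtimestamp(os.stat(log_fpath).st_mtime)
    rotated_fpath = log_fpath + '.' + mtime.strftime('%Y-%m-%d_%H-%M-%S')
    try:
        if os.path.isfile(rotated_fpath):
            os.remove(rotated_fpath)
        os.rename(log_fpath, rotated_fpath)
    except OSError:
        pass  # rotation is best-effort, as above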
Code example #4
def check_system_resources(cnf, required=(), optional=()):
    to_exit = False

    for program in required:
        if not which(program):
            if cnf.resources is None:
                critical('No "resources" section in system config.')

            data = cnf.resources.get(program)
            if data is None:
                err(program + ' is required. Specify path in system config or in your environment.')
                to_exit = True
            else:
                if 'module' in data:
                    os.system('module load ' + data['module'])
                    # if 'path' not in data:
                    #     data['path'] = program
                elif 'path' in data:
                    data['path'] = adjust_system_path(data['path'])
                    if not isdir(data['path']) and not file_exists(data['path']):
                        err(data['path'] + ' does not exist.')
                        to_exit = True

    for program in optional:
        resources = cnf.get('resources')
        if not resources:
            break

        data = resources.get(program)
        if data is None or 'path' not in data:
            continue

        data['path'] = adjust_system_path(data['path'])
        if not isdir(data['path']) and not file_exists(data['path']):
            err(data['path'] + ' does not exist.')
            to_exit = True

    if to_exit:
        exit()
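
The same required/optional pattern without the project's cnf object, sketched with shutil.which (Python 3); the tool names in the call are examples.

import shutil
import sys

def check_tools(required=(), optional=()):
    # Collect required tools that are missing from PATH
    missing = [tool for tool in required if shutil.which(tool) is None]
    for tool in optional:
        if shutil.which(tool) is None:
            print('warning: optional tool %s not found in PATH' % tool)
    if missing:
        sys.exit('missing required tools: ' + ', '.join(missing))

check_tools(required=['bgzip', 'tabix'], optional=['bcftools'])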
Code example #5
def check_file_changed(cnf, new, in_work):
    if not file_exists(in_work):
        cnf['reuse_intermediate'] = False

    if cnf.get('reuse_intermediate'):
        with open(in_work, 'rb') as in_work_f, open_gzipsafe(new, 'rb') as new_f:
            changed = (basename(in_work) != basename(new) or
                       md5_for_file(in_work_f) != md5_for_file(new_f))
        if changed:
            info('Input file %s changed, setting "reuse_intermediate" to False.' % str(new))
            cnf['reuse_intermediate'] = False
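
A possible implementation of the md5_for_file helper assumed above: hash the file object in fixed-size chunks so large inputs are never read into memory at once.

import hashlib

def md5_for_file(fileobj, block_size=1024 * 1024):
    md5 = hashlib.md5()
    # Read until read() returns the empty sentinel at EOF
    for chunk in iter(lambda: fileobj.read(block_size), b''):
        md5.update(chunk)
    return md5.hexdigest()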
Code example #6
def get_chr_len_fpath(cnf):
    chr_len_fpath = join(cnf.work_dir, 'chr_lengths.txt')
    if cnf.reuse_intermediate and file_exists(chr_len_fpath):
        info(chr_len_fpath + ' exists, reusing')
        return chr_len_fpath

    if not cnf.genome.seq:
        critical('There is no "seq" key in ' + cnf.sys_cnf + ' for the "' +
                 cnf.genome.name + '" section')
        return None

    chr_lengths = get_chr_lengths_from_seq(adjust_path(cnf.genome.seq))

    with file_transaction(cnf.work_dir, chr_len_fpath) as tx:
        with open(tx, 'w') as handle:
            for c, l in chr_lengths:
                handle.write(c + '\t' + str(l) + '\n')
    return chr_len_fpath
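
One plausible implementation of the get_chr_lengths_from_seq helper used above: scan the reference FASTA once and collect (chromosome, length) pairs in file order. A sketch only; the real helper may instead consult a .fai index.

def get_chr_lengths_from_seq(fasta_fpath):
    chr_lengths = []
    name, length = None, 0
    with open(fasta_fpath) as f:
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                # Flush the previous contig before starting a new one
                if name is not None:
                    chr_lengths.append((name, length))
                name, length = line[1:].split()[0], 0
            elif line:
                length += len(line)
    if name is not None:
        chr_lengths.append((name, length))
    return chr_lengths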
Code example #7
def _extract_fields(cnf, vcf_fpath, samplename, main_sample_index=0):
    fname, _ = splitext_plus(basename(vcf_fpath))
    tsv_fpath = join(cnf.work_dir, fname + '.tsv')

    if cnf.get('reuse_intermediate'):
        if file_exists(tsv_fpath):
            info(tsv_fpath + ' exists, reusing')
            return tsv_fpath

    manual_tsv_fields = cnf.annotation['tsv_fields']
    if not manual_tsv_fields:
        return None

    all_fields = []
    basic_fields = []
    info_fields = []
    eff_fields = []
    gt_fields = []
    tumor_gt = 'GEN[' + str(main_sample_index) + '].'
    normal_gt = 'GEN[' + str(1 - main_sample_index) + '].'

    lines = []

    with open(vcf_fpath) as inp:
        reader = vcf.Reader(inp)

        info('TSV saver: Building field list')
        for f in [rec.keys()[0] for rec in manual_tsv_fields]:
            if f.startswith('GEN'):
                _f = f.split('.')[1]
                if _f in reader.formats:
                    if len(reader.samples) > 0:
                        gt_fields.append(_f)
                        all_fields.append(f.replace('GEN[*].', tumor_gt))
                        if len(reader.samples) > 1:
                            all_fields.append(f.replace('GEN[*].', normal_gt))
                else:
                    warn('TSV Saver: Warning: ' + f + ' is not in VCF header FORMAT records')

            elif f in ['CHROM', 'POS', 'REF', 'ALT', 'ID', 'FILTER', 'QUAL']:
                all_fields.append(f)
                basic_fields.append(f)

            elif any(f.startswith(af) and af in reader.infos for af in ['EFF', 'ANN']):
                all_fields.append(f)
                eff_fields.append(f)

            else:
                if f in reader.infos:
                    info_fields.append(f)
                    all_fields.append(f)
                elif f == 'SAMPLE':
                    all_fields.append(f)
                else:
                    warn('TSV Saver: Warning: ' + f + ' is not in VCF header INFO records')

        info('TSV saver: Iterating over records...')
        for rec in reader:
            # A fresh dict per record, so values never leak between records
            d = OrderedDict()
            for f in basic_fields:
                d[f] = rec.__dict__[f]

            for f in info_fields:
                d[f] = rec.INFO[f] if f in rec.INFO else ''

            if 'SAMPLE' not in d:
                d['SAMPLE'] = samplename

            if eff_fields:
                eff = rec.INFO.get(eff_fields[0][:3])
                if not eff:
                    for f in eff_fields:
                        d[f] = ''
                else:
                    eff_fs = eff[0].split('|')
                    eff_d = dict()
                    headers = ['ALLELE', 'EFFECT', 'IMPACT', 'GENE', 'GENEID', 'FEATURE',
                               'FEATUREID', 'BIOTYPE', 'RANK', 'HGVS_C', 'HGVS_P',
                               'CDNA_POSLEN', 'CDS_POSLEN', 'AA_POSLEN', 'DISTANCE', 'LOG']
                    for val, header in zip(eff_fs, headers):
                        if 'POSLEN' in header:
                            eff_d[header.split('_')[0] + '_POS'] = val.split('/')[0] if val else ''
                            eff_d[header.split('_')[0] + '_LEN'] = val.split('/')[1] if val else ''
                        else:
                            eff_d[header] = val
                    #ANN=GA |3_prime_UTR_variant|MODIFIER|RPL22|RPL22|transcript|NM_000983.3|Coding|4/4|c.*173dupT|||||173|;
                    #Allele | Annotation | Annotation_Impact | Gene_Name | Gene_ID | Feature_Type | Feature_ID | Transcript_BioType | Rank | HGVS.c | HGVS.p | cDNA.pos / cDNA.length | CDS.pos / CDS.length | AA.pos / AA.length | Distance | ERRORS / WARNINGS / INFO'
                    for f in eff_fields:
                        d[f] = eff_d[f.split('.')[1]]

            if rec.FORMAT:
                for _f in gt_fields:
                    if _f in rec.FORMAT:
                        d[tumor_gt + _f] = rec.samples[main_sample_index][_f]
                        if len(rec.samples) > 1 - main_sample_index:
                            d[normal_gt + _f] = rec.samples[1 - main_sample_index][_f]
                        else:
                            d[normal_gt + _f] = ''
                    else:
                        d[tumor_gt + _f] = ''
                        d[normal_gt + _f] = ''

            fs = []
            for f in all_fields:
                v = d.get(f, '')  # fields unset for this record (e.g. FORMAT-less) become empty
                fs.append(v if v != '.' else '')
            lines.append(fs)

    info('TSV saver: Adding GEN[*] fields both for sample and for matched normal...')
    field_map = dict()
    for rec in manual_tsv_fields:
        k = rec.keys()[0]
        v = rec.values()[0]
        if k.startswith('GEN[*].'):
            _f = k.split('.')[1]
            field_map[tumor_gt + _f] = v
            field_map[normal_gt + _f] = 'Matched_' + v
        else:
            field_map[k] = v

    info('TSV saver: Writing TSV to ' + tsv_fpath)
    with file_transaction(cnf.work_dir, tsv_fpath) as tx:
        with open(tx, 'w') as out:
            out.write('\t'.join(field_map[f] for f in all_fields) + '\n')
            for fs in lines:
                new_fs = []
                for f in fs:
                    if isinstance(f, list):
                        new_fs.append(','.join(map(str, f)))
                    elif f is None:
                        new_fs.append('')
                    else:
                        new_fs.append(str(f))
                out.write('\t'.join(new_fs) + '\n')

    info('TSV saver: saved ' + tsv_fpath)
    return tsv_fpath
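
For context, the manual_tsv_fields structure read from cnf.annotation['tsv_fields'] is a list of one-entry mappings from a SnpSift-style field expression to an output column name; the entries below are illustrative, not the project's actual config.

manual_tsv_fields = [
    {'SAMPLE': 'Sample'},
    {'CHROM': 'Chromosome'},
    {'POS': 'Position'},
    {'REF': 'Ref'},
    {'ALT': 'Alt'},
    {'EFF[*].GENE': 'Gene'},
    {'EFF[*].EFFECT': 'Effect'},
    {'GEN[*].AF': 'Allele_Freq'},
]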
Code example #8
def run_annotators(cnf, vcf_fpath, bam_fpath):
    original_vcf = cnf.vcf

    db_section_by_name = OrderedDict(
        (dbname, cnf.annotation[dbname])
        for dbname in ['dbsnp', 'clinvar', 'cosmic', 'oncomine']
        if dbname in cnf.annotation
        and not cnf.annotation[dbname].get('skip-annotation'))

    # if not cnf.no_check:
    #     to_delete_id_ref = []
    #     if 'dbsnp' in db_section_by_name.keys():
    #         info('Removing IDs from dbsnp as rs*')
    #         to_delete_id_ref.append('rs')
    #     if 'cosmic' in db_section_by_name.keys():
    #         info('Removing IDs from dbsnp as COS*')
    #         to_delete_id_ref.append('COS')
    #
    #     def delete_ids(rec):  # deleting existing dbsnp and cosmic ID annotations
    #         if rec.ID:
    #             if isinstance(rec.ID, basestring):
    #                 if any(rec.ID.startswith(pref) for pref in to_delete_id_ref):
    #                     rec.ID = None
    #             else:
    #                 rec.ID = [id_ for id_ in rec.ID if not any(id_.startswith(pref) for pref in to_delete_id_ref)]
    #
    #         if not rec.FILTER:
    #             rec.FILTER = 'PASS'
    #
    #         return rec
    #
    #     info('Removing previous rs* and COS* IDs')
    #     vcf_fpath = iterate_vcf(cnf, vcf_fpath, delete_ids, suffix='delID')

    bcftools = get_system_path(cnf, 'bcftools')

    if not vcf_fpath.endswith('.gz') or not file_exists(vcf_fpath + '.tbi'):
        vcf_fpath = bgzip_and_tabix(cnf, vcf_fpath)

    cmdl = '{bcftools} annotate --remove ID {vcf_fpath}'
    res = call(cnf,
               cmdl.format(**locals()),
               output_fpath=add_suffix(rm_gz_ext(vcf_fpath), 'rmid'))
    if res:
        vcf_fpath = res
        vcf_fpath = bgzip_and_tabix(cnf, vcf_fpath)

    for dbname, dbconf in db_section_by_name.items() + cnf.annotation.get(
            'custom_vcfs', dict()).items():
        step_greetings('Annotating using ' + dbname)
        annotations = ','.join('INFO/' + a for a in dbconf.get('annotations'))
        if dbname in ('cosmic', 'dbsnp'):
            annotations += ',=ID'
        db_fpath = get_db_path(cnf, dbconf, dbname)
        if db_fpath:
            cmdl = '{bcftools} annotate -a ' + db_fpath + ' -c ' + annotations + ' {vcf_fpath}'
            res = call(cnf,
                       cmdl.format(**locals()),
                       output_fpath=add_suffix(rm_gz_ext(vcf_fpath), dbname))
            if res:
                vcf_fpath = res
                vcf_fpath = bgzip_and_tabix(cnf, vcf_fpath)

    verify_vcf(vcf_fpath, is_critical=True)

    if 'dbnsfp' in cnf.annotation:
        res = _snpsift_db_nsfp(cnf, vcf_fpath)
        if res:
            vcf_fpath = res

    if 'snpeff' in cnf.annotation:
        res, summary_fpath, genes_fpath = _snpeff(cnf, vcf_fpath)
        if res:
            vcf_fpath = res
            verify_vcf(vcf_fpath, is_critical=True)
            final_summary_fpath = join(cnf.output_dir, basename(summary_fpath))
            final_genes_fpath = join(cnf.output_dir, basename(genes_fpath))
            if isfile(final_summary_fpath): os.remove(final_summary_fpath)
            if isfile(final_genes_fpath): os.remove(final_genes_fpath)
            if file_exists(summary_fpath):
                shutil.move(summary_fpath, final_summary_fpath)
            if file_exists(genes_fpath):
                shutil.move(genes_fpath, final_genes_fpath)

    if cnf.annotation.get('tracks'):
        track_fpaths = []
        for track_name in cnf.annotation['tracks']:
            if isfile(track_name) and verify_file(track_name):
                track_fpaths.append(track_name)
            elif 'tracks' in cnf['genome'] and cnf['genome']['tracks'] and \
                    track_name in cnf['genome']['tracks']:
                track_fpath = cnf['genome']['tracks'][track_name]
                if verify_file(track_fpath):
                    track_fpaths.append(track_fpath)
        for track_fpath in track_fpaths:
            res = _tracks(cnf, track_fpath, vcf_fpath)
            if res:
                vcf_fpath = res

    if 'intersect_with' in cnf.annotation:
        step_greetings('Intersection with database VCFs...')
        for key, db_fpath in cnf.annotation['intersect_with'].items():
            res = intersect_vcf(cnf,
                                input_fpath=vcf_fpath,
                                db_fpath=db_fpath,
                                key=key)
            if res:
                vcf_fpath = res

    if 'mongo' in cnf.annotation:
        res = _mongo(cnf, vcf_fpath)
        if res:
            vcf_fpath = res

    return vcf_fpath
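
The core per-database step in isolation: a self-contained sketch of running bcftools annotate against one indexed database VCF via subprocess. Paths and INFO tags are placeholders; it assumes bcftools is on PATH and both VCFs are bgzipped and tabix-indexed.

import subprocess

def annotate_with_db(vcf_gz, db_vcf_gz, info_tags, out_vcf):
    # -a: annotation source; -c: comma-separated list of columns to transfer
    columns = ','.join('INFO/' + tag for tag in info_tags)
    with open(out_vcf, 'w') as out:
        # bcftools annotate writes the annotated VCF to stdout
        subprocess.check_call(
            ['bcftools', 'annotate', '-a', db_vcf_gz, '-c', columns, vcf_gz],
            stdout=out)
    return out_vcf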