Example #1
def total_merge_bed(cnf, bed_fpath):
    bedops = get_system_path(cnf, 'bedops')
    if bedops:
        cmdline = '{bedops} --merge {bed_fpath}'.format(**locals())
        output_fpath = intermediate_fname(cnf, bed_fpath, 'total_merged')
        call(cnf, cmdline, output_fpath)
        return output_fpath
    else:
        bedtools = get_system_path(cnf, 'bedtools')
        cmdline = '{bedtools} merge -i {bed_fpath}'.format(**locals())
        output_fpath = intermediate_fname(cnf, bed_fpath, 'total_merged')
        call(cnf, cmdline, output_fpath)
        return output_fpath
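A minimal usage sketch (not from the source): the path is hypothetical, and `cnf` is the pipeline configuration object assumed throughout these examples.

# Hypothetical usage: merge overlapping/adjacent intervals in a BED file.
merged_bed = total_merge_bed(cnf, '/path/to/targets.bed')
print('Merged BED written to ' + merged_bed)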
Example #2
def annotate_target(cnf, target_bed):
    output_fpath = intermediate_fname(cnf, target_bed, 'ann')
    if not cnf.genome.bed_annotation_features:
        return target_bed  # no features configured: nothing to annotate with
    if can_reuse(output_fpath, target_bed):
        info(output_fpath + ' exists, reusing')
        return output_fpath

    features_bed = verify_bed(
        cnf.genome.bed_annotation_features,
        is_critical=True,
        description='bed_annotation_features in system config')

    # annotate_bed_py = get_system_path(cnf, 'python', join('tools', 'bed_processing', 'annotate_bed.py'))
    # bedtools = get_system_path(cnf, 'bedtools')

    annotate_bed_py = which('annotate_bed.py')
    if not annotate_bed_py:
        critical(
            'Error: annotate_bed.py not found in PATH, please install TargQC.')

    cmdline = '{annotate_bed_py} {target_bed} --work-dir {cnf.work_dir} -g {cnf.genome.name} ' \
              '-o {output_fpath} --canonical'.format(**locals())
    # cmdline = '{annotate_bed_py} {target_bed} --work-dir {cnf.work_dir} --reference {features_bed} ' \
    #           '--genome {cnf.genome.name} --sys-cnf {cnf.sys_cnf} --run-cnf {cnf.run_cnf} ' \
    #           '-o {output_fpath}'.format(**locals())
    call(cnf, cmdline, output_fpath, stdout_to_outputfile=False)

    output_fpath = remove_comments(cnf, output_fpath)

    return output_fpath
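A hypothetical call, assuming TargQC's annotate_bed.py is installed and on PATH:

# Hypothetical usage: adds gene names from the configured features file.
annotated_bed = annotate_target(cnf, '/path/to/panel.bed')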
Example #3
def _rename_fields(cnf, inp_tsv_fpath, field_map):
    if cnf.get('keep_intermediate'):
        step_greetings('Renaming fields.')

    with open(inp_tsv_fpath) as f:
        first_line = f.readline()
    fields = first_line.split()
    new_fields = [field_map.get(f) or f for f in fields]
    new_first_line = '\t'.join(new_fields)

    if cnf.get('keep_intermediate'):
        out_tsv_fpath = intermediate_fname(cnf, inp_tsv_fpath, 'renamed')
    else:
        out_tsv_fpath = inp_tsv_fpath

    with file_transaction(cnf.work_dir, out_tsv_fpath) as tx_out_fpath:
        with open(tx_out_fpath, 'w') as out:
            out.write(new_first_line + '\n')
            with open(inp_tsv_fpath) as f:
                for i, l in enumerate(f):
                    if i >= 1:
                        out.write(l)

    if not cnf.get('keep_intermediate'):
        shutil.move(out_tsv_fpath, inp_tsv_fpath)
        return inp_tsv_fpath
    else:
        return out_tsv_fpath
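The header-renaming logic itself is pure Python; a runnable toy illustration (data made up, not from the source):

# Unknown column names fall through unchanged thanks to `get(f) or f`.
field_map = {'Chr': 'Chromosome', 'Pos': 'Position'}
fields = 'Chr\tPos\tRef\tAlt'.split()
print('\t'.join(field_map.get(f) or f for f in fields))
# Chromosome	Position	Ref	Alt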
Example #4
def _tracks(cnf, track_fpath, input_fpath):
    if not verify_file(track_fpath):
        return None

    field_name = splitext_plus(basename(track_fpath))[0]

    step_greetings('Intersecting with ' + field_name)

    output_fpath = intermediate_fname(cnf, input_fpath, field_name)
    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath

    toolpath = get_system_path(cnf, 'vcfannotate')
    if not toolpath:
        err('WARNING: Skipping annotation with tracks: vcfannotate '
            'executable not found; you probably need to specify the path in system_config, or '
            'load bcbio:  . /group/ngs/bin/bcbio-prod.sh')
        return None

    # self.all_fields.append(field_name)

    cmdline = '{toolpath} -b {track_fpath} -k {field_name} {input_fpath}'.format(
        **locals())

    assert input_fpath
    output_fpath = call_subprocess(cnf,
                                   cmdline,
                                   input_fpath,
                                   output_fpath,
                                   stdout_to_outputfile=True,
                                   overwrite=True)
    if not verify_vcf(output_fpath):
        err('Error: annotating with track ' + track_fpath +
            ' produced an invalid VCF: ' + str(output_fpath))
        return output_fpath

    # Set TRUE or FALSE for tracks
    def proc_line(line, i):
        if field_name in line:
            if not line.startswith('#'):
                fields = line.split('\t')
                info_line = fields[7]
                info_pairs = [attr.split('=') for attr in info_line.split(';')]
                info_pairs = [[pair[0], ('TRUE' if pair[1] else 'FALSE')] if
                              pair[0] == field_name and len(pair) > 1 else pair
                              for pair in info_pairs]
                info_line = ';'.join(
                    '='.join(pair) if len(pair) == 2 else pair[0]
                    for pair in info_pairs)
                fields = fields[:7] + [info_line] + fields[8:]
                return '\t'.join(fields)
        return line

    assert output_fpath
    output_fpath = iterate_file(cnf, output_fpath, proc_line, suffix='trk')
    return verify_vcf(output_fpath, is_critical=True)
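The TRUE/FALSE rewrite inside proc_line can be shown standalone; a toy illustration with made-up INFO pairs:

# Any value under the track's key is collapsed to TRUE/FALSE;
# valueless flags (single-element pairs) are left alone.
field_name = 'cosmic'
info_pairs = [['DP', '100'], ['cosmic', 'rs123'], ['SOMATIC']]
info_pairs = [[p[0], 'TRUE' if p[1] else 'FALSE']
              if p[0] == field_name and len(p) > 1 else p
              for p in info_pairs]
print(';'.join('='.join(p) if len(p) == 2 else p[0] for p in info_pairs))
# DP=100;cosmic=TRUE;SOMATIC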
Example #5
def get_padded_bed_file(cnf, bed, genome, padding):
    info('Making bed file for padded regions...')
    bedtools = get_system_path(cnf, 'bedtools')
    cmdline = '{bedtools} slop -i {bed} -g {genome} -b {padding}'.format(
        **locals())
    output_fpath = intermediate_fname(cnf, bed, 'padded')
    call(cnf, cmdline, output_fpath)
    return output_fpath
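A usage sketch; the genome file (chromosome sizes required by `bedtools slop`) and the padding value are hypothetical:

padded_bed = get_padded_bed_file(cnf, '/path/to/targets.bed', '/path/to/hg19.genome', 10)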
Example #6
def sort_bed(cnf, input_bed_fpath, output_bed_fpath=None):
    input_bed_fpath = verify_bed(input_bed_fpath)
    output_bed_fpath = adjust_path(
        output_bed_fpath) if output_bed_fpath else intermediate_fname(
            cnf, input_bed_fpath, 'sorted')

    class Region(SortableByChrom):
        def __init__(self, chrom, start, end, other_fields, chrom_ref_order):
            SortableByChrom.__init__(self, chrom, chrom_ref_order)
            self.start = start
            self.end = end
            self.chrom_ref_order = chrom_ref_order
            self.other_fields = tuple(other_fields)

        def get_key(self):
            return self.chrom_ref_order, self.start, self.end, self.other_fields

    regions = []
    chr_lengths = get_chr_lengths_from_seq(cnf.genome.seq)
    chr_order = {c: i for i, (c, l) in enumerate(chr_lengths)}

    info('Sorting regions in ' + input_bed_fpath)
    if cnf.reuse_intermediate and isfile(output_bed_fpath) and verify_bed(
            output_bed_fpath):
        info(output_bed_fpath + ' exists, reusing')
        return output_bed_fpath

    with open(input_bed_fpath) as f:
        with file_transaction(cnf.work_dir, output_bed_fpath) as tx:
            with open(tx, 'w') as out:
                for l in f:
                    if not l.strip():
                        continue
                    if l.strip().startswith('#'):
                        out.write(l)
                        continue

                    fs = l.strip().split('\t')
                    chrom = fs[0]
                    start = int(fs[1])
                    end = int(fs[2])
                    other_fields = fs[3:]
                    order = chr_order.get(chrom, -1)
                    regions.append(
                        Region(chrom, start, end, other_fields, order))

                for region in sorted(regions, key=lambda r: r.get_key()):
                    fs = [region.chrom, str(region.start), str(region.end)]
                    fs.extend(region.other_fields)
                    out.write('\t'.join(fs) + '\n')

    info('Sorted ' + str(len(regions)) + ' regions, saved to ' +
         output_bed_fpath + '\n')
    return output_bed_fpath
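The sort key can be illustrated standalone with toy regions; chromosomes missing from the reference order get -1 and therefore sort first:

chr_order = {'chr1': 0, 'chr2': 1}
regions = [('chr2', 5, 10), ('chr1', 7, 9), ('chr1', 3, 8)]
print(sorted(regions, key=lambda r: (chr_order.get(r[0], -1), r[1], r[2])))
# [('chr1', 3, 8), ('chr1', 7, 9), ('chr2', 5, 10)]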
Example #7
def vcf_one_per_line(cnf, vcf_fpath):
    info('Converting VCF to one-effect-per-line...')

    oneperline_vcf_fpath = intermediate_fname(cnf, vcf_fpath, 'opl')
    vcfoneperline_cmline = get_script_cmdline(cnf, 'perl', join('ext_tools', 'vcfOnePerLine.pl'))
    call(cnf, vcfoneperline_cmline, oneperline_vcf_fpath, stdin_fpath=vcf_fpath, exit_on_error=False)
    info()

    if not verify_file(oneperline_vcf_fpath):
        critical('Error: vcf_one_per_line didn\'t generate an output file.')
    return oneperline_vcf_fpath
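A hypothetical call; the Perl helper ships under ext_tools in this codebase:

opl_vcf = vcf_one_per_line(cnf, '/path/to/sample.vcf')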
Example #8
def _clip_vcf_by_bed(cnf, vcf_fpath, bed_fpath):
    info('Clipping VCF ' + vcf_fpath + ' using BED ' + bed_fpath)

    bedtools = get_system_path(cnf, 'bedtools')

    clipped_vcf_fpath = intermediate_fname(cnf, vcf_fpath, 'clip')
    cmdline = '{bedtools} intersect -header -a {vcf_fpath} -b {bed_fpath}'.format(
        **locals())
    res = call(cnf, cmdline, output_fpath=clipped_vcf_fpath)

    clipped_gz_vcf_fpath = bgzip_and_tabix(cnf, clipped_vcf_fpath)

    return clipped_gz_vcf_fpath
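A hypothetical call: keeps only variants overlapping the BED regions, then returns the bgzipped, tabix-indexed result:

clipped_gz_vcf = _clip_vcf_by_bed(cnf, '/path/to/sample.vcf', '/path/to/targets.bed')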
Example #9
def group_and_merge_regions_by_gene(cnf, bed_fpath, keep_genes=False):
    output_fpath = intermediate_fname(cnf, bed_fpath, 'grp_mrg')

    group_merge_bed_py = get_system_path(
        cnf, 'python',
        join('tools', 'bed_processing', 'group_and_merge_by_gene.py'))

    cmdline = '{group_merge_bed_py} {bed_fpath}'.format(**locals())
    if not keep_genes:
        cmdline += ' | grep -vw Gene'

    call(cnf, cmdline, output_fpath)

    return output_fpath
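A hypothetical call; with the default keep_genes=False, whole-gene records are stripped from the merged output via `grep -vw Gene`:

merged_bed = group_and_merge_regions_by_gene(cnf, '/path/to/targets.bed')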
Example #10
def fix_vcf_sample_name(cnf, sample_name, vcf_fpath, output_fpath=None):
    output_fpath = output_fpath or intermediate_fname(cnf, vcf_fpath, 'sample')
    def fix_sample_name(l, i):
        if l.startswith('#CHROM'):
            fs = l.split('\t')
            fs[9] = sample_name
            l = '\t'.join(fs)
        elif not l.startswith('#'):
            fs = l.split('\t')
            kvs = fs[7].split(';')
            for j, kv in enumerate(kvs[:]):  # 'j', to avoid shadowing the line index 'i'
                if kv.startswith('SAMPLE='):
                    kvs[j] = 'SAMPLE=' + sample_name
            l = '\t'.join(fs[:7]) + '\t' + ';'.join(kvs) + '\t' + '\t'.join(fs[8:])
            # l = re.sub("(?<=SAMPLE=)[^;](?=;)", sample_name, l)
        return l
    fixed_vcf = iterate_file(cnf, vcf_fpath, fix_sample_name, output_fpath=output_fpath)
    return bgzip_and_tabix(cnf, fixed_vcf)
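The #CHROM header fix can be shown standalone (toy line, not from the source):

l = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\told_name'
fs = l.split('\t')
fs[9] = 'new_sample'  # column 10 holds the (single) sample name
print('\t'.join(fs))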
Example #11
def split_bed_by_chrom(cnf, bed_fpath):
    info('Splitting the BED file ' + bed_fpath + ' by chromosome: ', ending='')
    bed_fpath_by_chrom = dict()
    cur_chr_f = None
    cur_chr = None

    with open(bed_fpath) as f:
        for l in f:
            fs = l.strip().split('\t')
            if fs:
                if fs[0] != cur_chr:
                    if cur_chr:
                        info(str(cur_chr), ending=', ', print_date=False)
                    cur_chr = fs[0]
                    cur_chr_fpath = intermediate_fname(cnf, bed_fpath, cur_chr)
                    cur_chr_f = open(cur_chr_fpath, 'w')
                    bed_fpath_by_chrom[cur_chr] = cur_chr_fpath
                cur_chr_f.write(l)
    info('Done.', print_date=False)
    return bed_fpath_by_chrom
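A hypothetical call; the result maps each chromosome to its per-chromosome BED file:

bed_by_chrom = split_bed_by_chrom(cnf, '/path/to/targets.bed')
chr1_bed = bed_by_chrom.get('chr1')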
Example #12
def remove_dups_picard(cnf, bam_fpath):
    picard = get_system_path(cnf, 'java', 'picard')
    if not picard:
        critical('No picard in the system')

    info('Running picard dedup for "' + basename(bam_fpath) + '"')

    dup_metrics_txt = join(cnf.work_dir, 'picard_dup_metrics.txt')
    output_fpath = intermediate_fname(cnf, bam_fpath, 'pcd_dedup')

    cmdline = '{picard} MarkDuplicates' \
              ' I={bam_fpath}' \
              ' O={output_fpath}' \
              ' METRICS_FILE={dup_metrics_txt}' \
              ' REMOVE_DUPLICATES=True' \
              ' VALIDATION_STRINGENCY=LENIENT'
    res = call(cnf,
               cmdline.format(**locals()),
               output_fpath=output_fpath,
               stdout_to_outputfile=False,
               exit_on_error=False)

    if res != output_fpath:  # error occurred, try to correct BAM and restart
        warn('Picard deduplication failed for "' + basename(bam_fpath) +
             '". Fixing BAM and restarting Picard...')
        bam_fpath = _fix_bam_for_picard(cnf, bam_fpath)
        res = call(cnf,
                   cmdline.format(**locals()),
                   output_fpath=output_fpath,
                   stdout_to_outputfile=False,
                   exit_on_error=False)

    if res == output_fpath:
        dup_rate = _parse_picard_dup_report(dup_metrics_txt)
        assert dup_rate is None or dup_rate <= 1.0, str(dup_rate)  # test None first, to avoid comparing None with a float
        info('Duplication rate (picard): ' + str(dup_rate))
        return output_fpath
    else:
        return None
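A hypothetical call; None signals that deduplication failed even after the BAM-fixing retry:

dedup_bam = remove_dups_picard(cnf, '/path/to/sample.bam')
if dedup_bam is None:
    err('Picard deduplication failed')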
Example #13
def _snpsift_db_nsfp(cnf, input_fpath):
    if 'dbnsfp' not in cnf.annotation or 'dbnsfp' not in cnf.genome:
        return None

    step_greetings('DB NSFP')

    output_fpath = intermediate_fname(cnf, input_fpath, 'db_nsfp')
    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath

    executable = get_java_tool_cmdline(cnf, 'snpsift')

    db_path = cnf['genome']['dbnsfp']
    if not verify_file(db_path, 'DB NSFP file'):
        err('DB NSFP file is incorrect. Skipping.')
        return None

    annotations = cnf.annotation['dbnsfp'].get('annotations') or []

    # all_fields.extend(['dbNSFP_' + ann for ann in annotations])

    ann_line = ('-f ' + ','.join(annotations)) if annotations else ''

    cmdline = '{executable} dbnsfp {ann_line} -v -db {db_path} ' \
              '{input_fpath}'.format(**locals())
    if call_subprocess(cnf,
                       cmdline,
                       input_fpath,
                       output_fpath,
                       stdout_to_outputfile=True,
                       exit_on_error=False,
                       overwrite=True):
        return verify_vcf(output_fpath, is_critical=True)
    else:
        return None
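A hypothetical call; 'dbnsfp' must be configured in both cnf.annotation and cnf.genome, otherwise None is returned immediately:

annotated_vcf = _snpsift_db_nsfp(cnf, '/path/to/sample.vcf')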
Example #14
def get_bedgraph_coverage(cnf,
                          bam_fpath,
                          chr_len_fpath=None,
                          output_fpath=None,
                          bed_fpath=None,
                          exit_on_error=True):
    chr_len_fpath = chr_len_fpath or get_chr_len_fpath(cnf)
    dedup_bam = intermediate_fname(cnf, bam_fpath, source.dedup_bam)
    if not verify_bam(dedup_bam, silent=True):
        info('Deduplicating bam file ' + bam_fpath)
        remove_dups(cnf, bam_fpath, dedup_bam)
    else:
        info(dedup_bam + ' exists')
    index_bam(cnf, dedup_bam)
    bam_bed_fpath = bam_to_bed(cnf, dedup_bam, to_gzip=False)
    if getsize(bam_bed_fpath) <= 0:
        info('No coverage for ' + bam_fpath + ', skipping.')
        return None

    sorted_bed_fpath = sort_bed_by_alphabet(cnf,
                                            bam_bed_fpath,
                                            chr_len_fpath=chr_len_fpath)
    if bed_fpath:
        in_bed_fpath = intersect_bed(cnf, sorted_bed_fpath, bed_fpath)
    else:
        in_bed_fpath = sorted_bed_fpath

    if not verify_file(in_bed_fpath, silent=True):
        info('No coverage in ' + in_bed_fpath)
        return None

    bedgraph_fpath = output_fpath or '%s.bedgraph' % splitext(bam_fpath)[0]
    with file_transaction(cnf.work_dir, bedgraph_fpath) as tx_fpath:
        bedtools = get_system_path(cnf, 'bedtools')
        cmdl = '{bedtools} genomecov -bg -split -g {chr_len_fpath} -i {in_bed_fpath}'.format(
            **locals())
        call(cnf, cmdl, exit_on_error=exit_on_error, output_fpath=tx_fpath)
    return bedgraph_fpath
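A hypothetical call; by default the bedgraph is written next to the BAM:

bedgraph = get_bedgraph_coverage(cnf, '/path/to/sample.bam',
                                 bed_fpath='/path/to/targets.bed')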
Example #15
def _mongo(cnf, input_fpath):
    step_greetings('Annotating from Mongo')

    if 'mongo' not in cnf.annotation:
        return None

    executable = get_java_tool_cmdline(
        cnf, join('ext_tools', 'mongo_loader', 'VCFStore.jar'))
    output_fpath = intermediate_fname(cnf, input_fpath, 'mongo')
    project_name = cnf.project_name

    cmdline = ('{executable} -module annotation -inputFile {input_fpath} '
               '-outputFile {output_fpath} -project {project_name} ').format(
                   **locals())
    if call_subprocess(cnf,
                       cmdline,
                       input_fpath,
                       output_fpath,
                       stdout_to_outputfile=False,
                       exit_on_error=False):
        return output_fpath
    else:
        return None
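A hypothetical call; returns None when 'mongo' is absent from cnf.annotation or the loader fails:

mongo_vcf = _mongo(cnf, '/path/to/sample.vcf')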
Example #16
def _read_vcf_records_per_bed_region_and_clip_vcf(cnf, vcf_fpath, bed_fpath,
                                                  region_type, sample):
    info()
    info('Intersecting VCF ' + vcf_fpath + ' using BED ' + bed_fpath)

    vcf_columns_num = count_bed_cols(vcf_fpath)
    bed_columns_num = count_bed_cols(bed_fpath)

    vcf_bed_intersect = join(
        cnf.work_dir,
        splitext(basename(vcf_fpath))[0] + '_' + region_type +
        '_vcf_bed.intersect')
    bedtools = get_system_path(cnf, 'bedtools')
    if not cnf.reuse_intermediate or not verify_file(
            vcf_bed_intersect, silent=True, is_critical=False):
        cmdline = '{bedtools} intersect -header -a {vcf_fpath} -b {bed_fpath} -wo'.format(
            **locals())
        res = call(cnf,
                   cmdline,
                   output_fpath=vcf_bed_intersect,
                   max_number_of_tries=1,
                   exit_on_error=False)
        if not res:
            return None, None, None, None

    regions_in_order = []
    regions_set = set()
    vars_by_region = defaultdict(dict)
    var_by_site = dict()

    clipped_vcf_fpath = intermediate_fname(cnf,
                                           splitext(basename(vcf_fpath))[0],
                                           '_' + region_type + '_clip')

    with open(vcf_bed_intersect) as f, open(clipped_vcf_fpath,
                                            'w') as clip_vcf:
        for l in f:
            l = l.strip()
            if not l or l.startswith('#'):
                clip_vcf.write(l + '\n')
                continue
            fs = l.split('\t')
            chrom, pos, id_, ref, alt, qual, filt, info_fields = fs[:8]
            chrom_b, start_b, end_b, symbol, strand, feature, biotype = None, None, None, None, None, None, None
            if bed_columns_num >= 8:
                chrom_b, start_b, end_b, symbol, _, strand, feature, biotype, _ = fs[
                    -(bed_columns_num + 1):][:9]
            elif bed_columns_num >= 4:
                chrom_b, start_b, end_b, symbol, _ = fs[-(bed_columns_num +
                                                          1):][:5]
            assert chrom == chrom_b, l
            r = chrom, id_, start_b, end_b, symbol, strand, feature, biotype
            if r not in regions_set:
                regions_set.add(r)
                regions_in_order.append(r)

            cls = None
            if '=Hotspot' in info_fields: cls = 'Hotspot'
            if '=Deleterious' in info_fields: cls = 'Deleterious'
            if cls:
                var = Variant(chrom, pos, ref, alt, cls)
                vars_by_region[r][(chrom, pos, ref, alt)] = var
                var_by_site[(chrom, pos, ref, alt)] = var
                clip_vcf.write('\t'.join(
                    [chrom, pos, id_, ref, alt, qual, filt, info_fields]) +
                               '\n')

    clipped_gz_vcf_fpath = bgzip_and_tabix(cnf,
                                           clipped_vcf_fpath,
                                           max_number_of_tries=1,
                                           exit_on_error=False)

    return clipped_gz_vcf_fpath, regions_in_order, vars_by_region, var_by_site
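How the BED columns are recovered from the tail of a `bedtools intersect -wo` line can be shown standalone (made-up line, bed_columns_num == 4; the final field is the overlap length appended by -wo):

fs = ['chr1', '100', '.', 'A', 'T', '50', 'PASS', 'DP=10',
      'chr1', '90', '200', 'GENE1', '110']
bed_columns_num = 4
chrom_b, start_b, end_b, symbol, _ = fs[-(bed_columns_num + 1):][:5]
print(chrom_b + ' ' + start_b + '-' + end_b + ' ' + symbol)  # chr1 90-200 GENE1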
Example #17
def _fix_bam_for_picard(cnf, bam_fpath):
    def __process_problem_read_aligns(read_aligns):
        # each alignment: 0:NAME 1:FLAG 2:CHR 3:COORD 4:MAPQUAL 5:CIGAR 6:MATE_CHR 7:MATE_COORD TLEN SEQ ...
        def __get_key(align):
            return align.split('\t')[2] + '@' + align.split('\t')[3]

        def __get_mate_key(align):
            return (align.split('\t')[6] if align.split('\t')[2] != '=' else align.split('\t')[2]) \
                   + '@' + align.split('\t')[7]

        chr_coord = OrderedDict()
        for align in read_aligns:
            key = __get_key(align)
            if key not in chr_coord:
                chr_coord[key] = []
            chr_coord[key].append(align)
        correct_pairs = []
        for align in read_aligns:
            mate_key = __get_mate_key(align)
            if mate_key in chr_coord:
                for pair_align in chr_coord[mate_key]:
                    if read_aligns.index(pair_align) <= read_aligns.index(
                            align):
                        continue
                    if __get_mate_key(pair_align) == __get_key(align):
                        correct_pairs.append((align, pair_align))
        if not correct_pairs:
            return []
        if len(correct_pairs) > 1:
            # sort by sum of mapping qualities of both alignments
            # (cast to int: concatenating the quality strings would sort lexically)
            correct_pairs.sort(key=lambda pair: int(pair[0].split('\t')[4]) +
                               int(pair[1].split('\t')[4]),
                               reverse=True)
        return [correct_pairs[0][0], correct_pairs[0][1]]

    samtools = get_system_path(cnf, 'samtools')
    try:
        import pysam
        without_pysam = False
    except ImportError:
        without_pysam = True

    # find reads that appear more than twice in the input BAM
    if without_pysam:
        qname_sorted_sam_fpath = intermediate_fname(
            cnf, bam_fpath, 'qname_sorted')[:-len('bam')] + 'sam'
        # queryname sorting; output is SAM
        cmdline = '{samtools} view {bam_fpath} | sort '.format(**locals())
        call(cnf, cmdline, qname_sorted_sam_fpath)
        qname_sorted_file = open(qname_sorted_sam_fpath, 'r')
    else:
        qname_sorted_bam_fpath = intermediate_fname(cnf, bam_fpath,
                                                    'qname_sorted')
        # queryname sorting (-n), to stdout (-o), 'prefix' is not used; output is BAM
        cmdline = '{samtools} sort -n -o {bam_fpath} prefix'.format(**locals())
        call(cnf, cmdline, qname_sorted_bam_fpath)
        qname_sorted_file = pysam.Samfile(qname_sorted_bam_fpath, 'rb')
    problem_reads = dict()
    cur_read_aligns = []
    for line in qname_sorted_file:
        line = str(line)
        if cur_read_aligns:
            if line.split('\t')[0] != cur_read_aligns[0].split('\t')[0]:
                if len(cur_read_aligns) > 2:
                    problem_reads[cur_read_aligns[0].split('\t')
                                  [0]] = cur_read_aligns
                cur_read_aligns = []
        flag = int(line.split('\t')[1])
        cur_read_aligns.append(line)
    if len(cur_read_aligns) > 2:
        problem_reads[cur_read_aligns[0].split('\t')[0]] = cur_read_aligns
    qname_sorted_file.close()

    for read_id, read_aligns in problem_reads.items():
        problem_reads[read_id] = __process_problem_read_aligns(read_aligns)

    # correct input BAM
    fixed_bam_fpath = intermediate_fname(cnf, bam_fpath, 'fixed_for_picard')
    fixed_sam_fpath = fixed_bam_fpath[:-len('bam')] + 'sam'
    if without_pysam:
        sam_fpath = intermediate_fname(cnf, bam_fpath,
                                       'tmp')[:-len('bam')] + 'sam'
        cmdline = '{samtools} view -h {bam_fpath}'.format(**locals())
        call(cnf, cmdline, sam_fpath)
        input_file = open(sam_fpath, 'r')
        fixed_file = open(fixed_sam_fpath, 'w')
    else:
        input_file = pysam.Samfile(bam_fpath, 'rb')
        fixed_file = pysam.Samfile(fixed_bam_fpath, 'wb', template=input_file)
    for line in input_file:
        if without_pysam and line.startswith('@'):  # header
            fixed_file.write(line)
            continue
        read_name = str(line).split('\t')[0]
        if read_name in problem_reads and str(
                line) not in problem_reads[read_name]:
            continue
        fixed_file.write(line)
    input_file.close()
    fixed_file.close()
    if without_pysam:
        cmdline = '{samtools} view -bS {fixed_sam_fpath}'.format(**locals())
        call(cnf, cmdline, fixed_bam_fpath)

    return fixed_bam_fpath
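A hypothetical call; typically used as the fallback in remove_dups_picard above:

fixed_bam = _fix_bam_for_picard(cnf, '/path/to/problematic.bam')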
Example #18
def _snpeff(cnf, input_fpath):
    if 'snpeff' not in cnf.annotation or 'snpeff' not in cnf.genome:
        return None, None, None

    step_greetings('SnpEff')

    output_fpath = intermediate_fname(cnf, input_fpath, 'snpEff')
    stats_fpath = join(
        cnf.work_dir, cnf.sample + (('-' + cnf.caller) if cnf.caller else '') +
        '.snpEff_summary.csv')

    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath, stats_fpath, splitext(
            stats_fpath)[0] + '.genes.txt'

    snpeff = get_java_tool_cmdline(cnf, 'snpeff')

    ref_name = cnf.genome.snpeff.reference or cnf.genome.name
    if ref_name.startswith('hg19') or ref_name.startswith('GRCh37'):
        ref_name = 'GRCh37.75'
    if ref_name.startswith('hg38'): ref_name = 'GRCh38.82'

    opts = ''
    if cnf.annotation.snpeff.cancer: opts += ' -cancer'

    assert cnf.transcripts_fpath, 'Transcript for annotation must be specified!'
    verify_file(cnf.transcripts_fpath,
                'Transcripts for snpEff -onlyTr',
                is_critical=True)
    opts += ' -onlyTr ' + cnf.transcripts_fpath + ' '

    db_path = adjust_system_path(cnf.genome.snpeff.data)
    if db_path:
        opts += ' -dataDir ' + db_path
    elif cnf.resources.snpeff.config:
        conf = get_system_path(cnf, cnf.resources.snpeff.config)
        if conf:
            opts += ' -c ' + conf + ' '
        else:
            err('Cannot find snpEff config file ' +
                str(cnf.resources.snpeff.config))

    if cnf.annotation.snpeff.extra_options:
        opts += ' ' + cnf.annotation.snpeff.extra_options

    if not cnf.no_check:
        info('Removing previous snpEff annotations...')
        res = remove_prev_eff_annotation(cnf, input_fpath)
        if not res:
            err('Could not remove previous snpEff annotations')
            return None, None, None
        input_fpath = res

    snpeff_type = get_snpeff_type(snpeff)
    if snpeff_type == "old":
        opts += ' -stats ' + stats_fpath + ' -csvStats'
    else:
        opts += ' -csvStats ' + stats_fpath

    cmdline = '{snpeff} eff {opts} -noLog -i vcf -o vcf {ref_name} {input_fpath}'.format(
        **locals())

    for i in range(1, 20):
        try:
            res = call_subprocess(cnf,
                                  cmdline,
                                  input_fpath,
                                  output_fpath,
                                  exit_on_error=False,
                                  stdout_to_outputfile=True,
                                  overwrite=True)
        except OSError:
            import traceback, time
            err(traceback.format_exc())
            warn()
            info('Waiting 1 minute')
            time.sleep(60)
            info('Rerunning ' + str(i))
        else:
            break

    output_fpath = verify_vcf(output_fpath, is_critical=True)

    snpeff_summary_html_fpath = 'snpEff_summary.html'
    if isfile(snpeff_summary_html_fpath):
        info('SnpEff created ' + snpeff_summary_html_fpath +
             ' in the cwd, removing it...')
        try:
            os.remove(snpeff_summary_html_fpath)
        except OSError:
            pass

    if res:
        return output_fpath, stats_fpath, splitext(
            stats_fpath)[0] + '.genes.txt'
    else:
        return None, None, None
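A hypothetical call; on success the stats CSV and the derived per-gene .genes.txt accompany the annotated VCF:

vcf, stats_csv, genes_txt = _snpeff(cnf, '/path/to/sample.vcf')
if vcf is None:
    err('snpEff annotation failed')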
Example #19
def _snpsift_annotate(cnf, vcf_conf, dbname, input_fpath):
    if not vcf_conf:
        err('No database for ' + dbname + ', skipping.')
        return None

    step_greetings('Annotating with ' + dbname)

    output_fpath = intermediate_fname(cnf, input_fpath, dbname)
    if output_fpath.endswith('.gz'):
        output_fpath = output_fpath[:-3]
    if cnf.reuse_intermediate and verify_vcf(output_fpath):
        info('VCF ' + output_fpath + ' exists, reusing...')
        return output_fpath

    executable = get_java_tool_cmdline(cnf, 'snpsift')
    java = get_system_path(cnf, 'java')
    info('Java version:')
    call(cnf, java + ' -version')
    info()

    db_path = cnf['genome'].get(dbname)
    if not db_path:
        db_path = vcf_conf.get('path')
        if not db_path:
            err('Please provide a path to ' + dbname +
                ' in the "genomes" section of the system config. The config is: '
                + str(cnf['genome']))
            return None
        verify_file(db_path, is_critical=True)

    annotations = vcf_conf.get('annotations')

    if not cnf.no_check:
        info('Removing previous annotations...')

        def delete_annos(rec):
            for anno in annotations:
                if anno in rec.INFO:
                    del rec.INFO[anno]
            return rec

        if annotations:
            input_fpath = iterate_vcf(cnf,
                                      input_fpath,
                                      delete_annos,
                                      suffix='d')

    anno_line = ''
    if annotations:
        anno_line = '-info ' + ','.join(annotations)

    cmdline = '{executable} annotate -v {anno_line} {db_path} {input_fpath}'.format(
        **locals())
    output_fpath = call_subprocess(cnf,
                                   cmdline,
                                   input_fpath,
                                   output_fpath,
                                   stdout_to_outputfile=True,
                                   exit_on_error=False,
                                   overwrite=True)
    if not output_fpath:
        err('Error: SnpSift returned no output (' + str(output_fpath) + ') for ' + dbname)
        return output_fpath
    verify_vcf(output_fpath, is_critical=True)
    # f = open(output_fpath)
    # l = f.readline()
    # if 'Cannot allocate memory' in l:
    #     f.close()
    #     f = open(output_fpath)
    #     contents = f.read()
    #     critical('SnpSift failed with memory issue:\n' + contents)
    #     f.close()
    #     return None

    if not cnf.no_check:
        info_pattern = re.compile(
            r'''\#\#INFO=<
            ID=(?P<id>[^,]+),\s*
            Number=(?P<number>-?\d+|\.|[AG]),\s*
            Type=(?P<type>Integer|Float|Flag|Character|String),\s*
            Description="(?P<desc>[^"]*)"
            >''', re.VERBOSE)

        def _fix_after_snpsift(line, i, ctx):
            if not line.startswith('#'):
                if not ctx['met_CHROM']:
                    return None
                line = line.replace(' ', '_')
                assert ' ' not in line

            # elif line.startswith('##INFO=<ID=om'):
            #     line = line.replace(' ', '')

            elif not ctx['met_CHROM'] and line.startswith('#CHROM'):
                ctx['met_CHROM'] = True

            elif line.startswith('##INFO'):
                m = info_pattern.match(line)
                if m:
                    line = '##INFO=<ID={0},Number={1},Type={2},Description="{3}">'.format(
                        m.group('id'), m.group('number'), m.group('type'),
                        m.group('desc'))
            return line

        output_fpath = iterate_file(cnf,
                                    output_fpath,
                                    _fix_after_snpsift,
                                    suffix='fx',
                                    ctx=dict(met_CHROM=False))

    return verify_vcf(output_fpath, is_critical=True)
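The ##INFO normalization regex above is runnable in isolation; a toy illustration showing how stray whitespace inside a header line is stripped:

import re
info_pattern = re.compile(
    r'''\#\#INFO=<
    ID=(?P<id>[^,]+),\s*
    Number=(?P<number>-?\d+|\.|[AG]),\s*
    Type=(?P<type>Integer|Float|Flag|Character|String),\s*
    Description="(?P<desc>[^"]*)"
    >''', re.VERBOSE)
m = info_pattern.match('##INFO=<ID=DP, Number=1, Type=Integer, Description="Depth">')
print('##INFO=<ID={0},Number={1},Type={2},Description="{3}">'.format(
    m.group('id'), m.group('number'), m.group('type'), m.group('desc')))
# ##INFO=<ID=DP,Number=1,Type=Integer,Description="Depth">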
Example #20
def prepare_beds(cnf, features_bed=None, target_bed=None, seq2c_bed=None):
    if features_bed is None and target_bed is None:
        warn('Neither an input target BED nor a features BED in the system '
             'config was specified. Not making detailed per-gene reports.')
        # return None, None, None, None

    if target_bed:
        target_bed = verify_bed(target_bed, is_critical=True)

    if seq2c_bed:
        seq2c_bed = verify_bed(seq2c_bed, is_critical=True)

    if features_bed:
        features_bed = verify_bed(features_bed, is_critical=True)

    # if features_bed and target_bed and abspath(features_bed) == abspath(target_bed):
    #     warn('Same file used for exons and amplicons: ' + features_bed)

    # Features
    features_no_genes_bed = None
    if features_bed:
        # info()
        # info('Merging regions within genes...')
        # exons_bed = group_and_merge_regions_by_gene(cnf, exons_bed, keep_genes=True)
        #
        # info()
        # info('Sorting exons by (chrom, gene name, start)')
        # exons_bed = sort_bed(cnf, exons_bed)

        info()
        info(
            'Filtering the features BED file to keep only non-gene, non-transcript records...'
        )
        features_no_genes_bed = intermediate_fname(cnf, features_bed,
                                                   'no_genes')
        call(cnf,
             'grep -vw Gene ' + features_bed + ' | grep -vw Transcript',
             output_fpath=features_no_genes_bed)

    ori_target_bed_path = target_bed
    if target_bed:
        info()
        info('Remove comments in target...')
        target_bed = remove_comments(cnf, target_bed)

        info()
        info('Cut -f1,2,3,4 target...')
        target_bed = cut(cnf, target_bed, 4)

        info()
        info('Sorting target...')
        target_bed = sort_bed(cnf, target_bed)

        cols = count_bed_cols(target_bed)
        if cnf.reannotate or cols < 4:
            info()
            if not features_bed:
                critical(
                    str(cols) +
                    ' columns in the target BED (less than 4), and no features file to annotate regions with. '
                    'Please make sure you have set the "features" key in the corresponding genome section '
                    '(' + cnf.genome.name + ') in ' + cnf.sys_cnf)
            info(
                'cnf.reannotate is ' + str(cnf.reannotate) +
                ', and cols in the target BED is ' + str(cols) +
                '. Annotating target with the gene names from the "features" file '
                + features_bed + '...')
            target_bed = annotate_target(cnf, target_bed)

    def remove_no_anno(l, i):
        if l.split('\t')[3].strip() == '.': return None
        else: return l

    if (not seq2c_bed and target_bed) or (seq2c_bed and seq2c_bed == ori_target_bed_path):
        info('Seq2C bed: remove regions with no gene annotation')
        seq2c_bed = target_bed
        seq2c_bed = iterate_file(cnf, seq2c_bed, remove_no_anno, suffix='filt')

    elif seq2c_bed:
        info()
        info('Remove comments in seq2c bed...')
        seq2c_bed = remove_comments(cnf, seq2c_bed)

        info()
        info('Sorting seq2c bed...')
        seq2c_bed = sort_bed(cnf, seq2c_bed)

        cols = count_bed_cols(seq2c_bed)
        if cols < 4:
            info()
            info('Number of columns in the seq2c BED is ' + str(cols) +
                 '. Annotating amplicons with gene names...')
            seq2c_bed = annotate_target(cnf, seq2c_bed)
        elif 8 > cols > 4:
            seq2c_bed = cut(cnf, seq2c_bed, 4)
        elif cols > 8:
            seq2c_bed = cut(cnf, seq2c_bed, 8)
        info('Filtering non-annotated entries in seq2c bed')
        seq2c_bed = iterate_file(cnf, seq2c_bed, remove_no_anno, suffix='filt')

    else:
        seq2c_bed = verify_bed(cnf.genome.cds)

    if target_bed:
        info()
        # info('Merging amplicons...')
        # target_bed = group_and_merge_regions_by_gene(cnf, target_bed, keep_genes=False)

        info('Sorting target by (chrom, gene name, start)')
        target_bed = sort_bed(cnf, target_bed)

    return features_bed, features_no_genes_bed, target_bed, seq2c_bed
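A hypothetical call; any of the three inputs may be None, and the seq2c BED falls back to the genome's CDS file:

features, features_no_genes, target, seq2c = prepare_beds(
    cnf,
    features_bed='/path/to/features.bed',
    target_bed='/path/to/panel.bed')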
Example #21
def cut(cnf, fpath, col_num):
    cut_fpath = intermediate_fname(cnf, fpath, 'cut')
    cmdline = 'cut -f' + ','.join(map(str, range(1,
                                                 col_num + 1))) + ' ' + fpath
    call(cnf, cmdline, cut_fpath)
    return cut_fpath
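A hypothetical call, equivalent to running `cut -f1,2,3,4` on the file:

cut_bed = cut(cnf, '/path/to/targets.bed', 4)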