Example #1
def sambamba_depth(work_dir,
                   bed,
                   bam,
                   depth_thresholds=None,
                   output_fpath=None,
                   sample_name=None,
                   threads=1):
    if not bam:
        return None
    sample_name = sample_name or splitext_plus(basename(bam))[0]
    depth_thresholds = depth_thresholds or []

    if isinstance(bed, BedTool):
        bed = bed.saveas().fn
    if not output_fpath:
        output_fpath = join(
            work_dir,
            splitext_plus(basename(bed))[0] + '_' + sample_name +
            '_sambamba_depth.txt')

    if can_reuse(output_fpath, [bam, bed]):
        return output_fpath

    thresholds_str = ''.join(
        [' -T' + str(int(d)) for d in depth_thresholds if d is not None])
    cmdline = (
        'depth region -F "not duplicate and not failed_quality_control" '
        '-t {threads} -L {bed} {thresholds_str} {bam}').format(**locals())

    call_sambamba(cmdline, bam_fpath=bam, output_fpath=output_fpath)
    return output_fpath
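
A minimal usage sketch, assuming sambamba is on PATH and the helpers used above (splitext_plus, can_reuse, call_sambamba) are importable; the file paths are hypothetical:

depth_file = sambamba_depth(work_dir='work',
                            bed='targets.bed',
                            bam='sample1.bam',
                            depth_thresholds=[1, 10, 100],
                            threads=4)
# reuses or creates work/targets_sample1_sambamba_depth.txt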
Example #2
def intersect_bed(work_dir, bed1, bed2):
    bed1_fname, _ = splitext_plus(basename(bed1))
    bed2_fname, _ = splitext_plus(basename(bed2))
    output_fpath = join(work_dir, bed1_fname + '__' + bed2_fname + '.bed')
    if can_reuse(output_fpath, [bed1, bed2]):
        return output_fpath
    bedtools = which('bedtools')
    cmdline = '{bedtools} intersect -u -a {bed1} -b {bed2}'.format(**locals())
    call_process.run(cmdline, output_fpath=output_fpath, checks=[call_process.file_exists_check])
    return output_fpath
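
A usage sketch, assuming bedtools is installed and both BED files exist (the names are hypothetical):

overlap = intersect_bed('work', 'panel_a.bed', 'panel_b.bed')
# `bedtools intersect -u` reports each panel_a.bed interval at most once
# if it overlaps panel_b.bed, producing work/panel_a__panel_b.bed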
Example #3
def _overlap_bed_files(bed_files, work_dir, genome):
    from clearup.panel import overlap_bed_files

    fnames = [basename(splitext_plus(fp)[0]) for fp in bed_files]
    overlapped_file = join(work_dir, f'{"__".join(fnames)}.{genome}.bed')
    if not can_reuse(overlapped_file, bed_files):
        overlap_bed_files(bed_files, overlapped_file)
    return overlapped_file
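
A usage sketch, assuming the clearup package and its overlap_bed_files helper are installed; the inputs are hypothetical:

merged_bed = _overlap_bed_files(['panel_a.bed', 'panel_b.bed'],
                                work_dir='work', genome='hg19')
# reuses or creates work/panel_a__panel_b.hg19.bed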
Example #4
def bam_to_bed(bam_fpath, to_gzip=True):
    debug('Converting the BAM to BED to save some memory.')  # from here: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    bam_bed_fpath = splitext_plus(bam_fpath)[0] + ('.bed.gz' if to_gzip else '.bed')
    if can_reuse(bam_bed_fpath, bam_fpath):
        return bam_bed_fpath
    bedtools = which('bedtools')
    gzip = which('gzip')
    cmdline = '{bedtools} bamtobed -i {bam_fpath}'.format(**locals())
    cmdline += ' | {gzip}'.format(**locals()) if to_gzip else ''
    call_process.run(cmdline, output_fpath=bam_bed_fpath)
    return bam_bed_fpath
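
A usage sketch (bedtools and gzip must be on PATH; the BAM path is hypothetical):

bed_gz = bam_to_bed('sample1.bam')              # -> sample1.bed.gz
bed = bam_to_bed('sample1.bam', to_gzip=False)  # -> sample1.bed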
Example #5
def bam_to_bed_nocnf(bam_fpath, bedtools='bedtools', gzip='gzip'):
    info('Converting the BAM to BED to save some memory.')  # from here: http://davetang.org/muse/2015/08/05/creating-a-coverage-plot-using-bedtools-and-r/
    bam_bed_fpath = splitext_plus(bam_fpath)[0] + '.bed.gz'
    cmdline = '{bedtools} bamtobed -i {bam_fpath} | {gzip} > {bam_bed_fpath}'.format(**locals())
    info(cmdline)
    os.system(cmdline)
    bam_bed_fpath = verify_file(bam_bed_fpath)
    if bam_bed_fpath:
        info('Done, saved to ' + bam_bed_fpath)
    else:
        err('Error, result is non-existent or empty')
    return bam_bed_fpath
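
A usage sketch for this dependency-free variant, which shells out via os.system and verifies the result itself (the path is hypothetical):

bed_gz = bam_to_bed_nocnf('sample1.bam')
if not bed_gz:  # verify_file returns a falsy value if the output is missing or empty
    raise RuntimeError('bedtools bamtobed failed')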
Example #6
def find_fastq_pairs(fpaths):
    info('Finding FastQ pairs...')
    fastqs_by_sample_name = dict()
    for fpath in fpaths:
        fn, ext = splitext_plus(basename(fpath))
        if ext in ['.fq', '.fq.gz', '.fastq', '.fastq.gz']:
            sname, l_fpath, r_fpath = None, None, None
            if fn.endswith('_1'):
                sname = fn[:-2]
                l_fpath = fpath
            if fn.endswith('_R1'):
                sname = fn[:-3]
                l_fpath = fpath
            if fn.endswith('_2'):
                sname = fn[:-2]
                r_fpath = fpath
            if fn.endswith('_R2'):
                sname = fn[:-3]
                r_fpath = fpath

            if sname:
                m = re.match(r'(.*)_S\d+', sname)
                if m:
                    sname = m.group(1)
                sname = sname.replace('-', '_')
            else:
                sname = fn
                info('Cannot detect file for ' + sname)

            l, r = fastqs_by_sample_name.get(sname, (None, None))
            if l and l_fpath:
                critical('Duplicated left FastQ files for ' + sname + ': ' +
                         l + ' and ' + l_fpath)
            if r and r_fpath:
                critical('Duplicated right FastQ files for ' + sname + ': ' +
                         r + ' and ' + r_fpath)
            fastqs_by_sample_name[sname] = l or l_fpath, r or r_fpath

    fixed_fastqs_by_sample_name = dict()
    for sname, (l, r) in fastqs_by_sample_name.items():
        if not l:
            err('ERROR: for sample ' + sname + ', left reads not found')
        if not r:
            err('ERROR: for sample ' + sname + ', right reads not found')
        if l and r:
            fixed_fastqs_by_sample_name[sname] = l, r

    return fixed_fastqs_by_sample_name
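
A usage sketch with hypothetical file names, showing how the _R1/_R2 suffix and the Illumina _S<n> chunk are stripped and dashes folded into underscores:

pairs = find_fastq_pairs(['Sample-A_S1_R1.fastq.gz',
                          'Sample-A_S1_R2.fastq.gz'])
# -> {'Sample_A': ('Sample-A_S1_R1.fastq.gz', 'Sample-A_S1_R2.fastq.gz')}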
Example #7
def _load_datasets(subdirs):
    vcf_by_project_by_genome = defaultdict(dict)
    # vcf_by_label = dict()
    # all_bed_files = []
    # project_names = []
    datasets = []

    for subdir in subdirs:
        dataset = Dataset()

        if ':' in subdir:
            subdir, dataset.genome = subdir.split(':')
        else:
            dataset.genome = 'hg19'

        dir_path = subdir
        if glob(join(dir_path, '*.vcf.gz')):
            log.info(f'Found .vcf.gz files in directory {dir_path}')
            # Simple directory with VCF files and an optional BED file?
            dataset.name = subdir.replace('/', '__')
            if glob(join(dir_path, '*.bed')):
                dataset.bed_file = glob(join(dir_path, '*.bed'))[0]
            for vcf_fpath in glob(join(dir_path, '*.vcf.gz')):
                label = join(subdir,
                             basename(splitext_plus(vcf_fpath)[0])).replace(
                                 '/', '__')
                dataset.vcf_by_label[label] = vcf_fpath
        else:
            log.info(f"No .vcf.gz files found in directory {dir_path}. "
                     f"Checking whether it's a bcbio folder.")
            # Bcbio directory?
            bcbio_proj = BcbioProject()
            bcbio_proj.load_from_bcbio_dir(subdir, proc_name='clearup')
            dataset.name = bcbio_proj.project_name
            dataset.genome = bcbio_proj.genome_build
            for s in bcbio_proj.samples:
                vcf_file = s.find_raw_vcf()
                if vcf_file:
                    dataset.vcf_by_label[bcbio_proj.project_name + '__' +
                                         s.name] = vcf_file
            if bcbio_proj.coverage_bed:
                dataset.bed_file = bcbio_proj.coverage_bed

        datasets.append(dataset)
    return datasets
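
A usage sketch, assuming a Dataset class exposing the name, genome, bed_file and vcf_by_label attributes used above; the directory names are hypothetical:

datasets = _load_datasets(['runs/project1:hg38', 'bcbio_final/project2'])
for ds in datasets:
    log.info(f'{ds.name} ({ds.genome}): {len(ds.vcf_by_label)} VCFs')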
Example #8
def make_fingerprint(vcf_file,
                     work_dir=None,
                     label=None,
                     fp_size=20,
                     bed_file=None):
    log.info('Starting processing file ' + vcf_file)
    work_dir = work_dir or dirname(vcf_file)

    if label: print_name = label
    else: print_name = splitext_plus(basename(vcf_file))[0]
    print_name += '.print' + str(fp_size)
    print_name += '_dist' + str(Params.MIN_DIST)
    print_name += '_af' + str(Params.MIN_AF)
    if not Params.INTERREGION_PAIRS:
        print_name += '_skip_interregion_pairs'

    raw_print_file = join(work_dir, print_name)
    if can_reuse(raw_print_file, vcf_file):
        # fingerprints are stored as flat binary arrays, so use binary mode
        with open(raw_print_file, 'rb') as f:
            raw = np.fromfile(f).reshape((len(index_by_key), fp_size))
    else:
        raw = _raw_fingerprint(vcf_file, fp_size=fp_size, bed_file=bed_file)
        with open(raw_print_file, 'wb') as f:
            raw.tofile(f)
        log.info(f'Saved raw fingerprints into {raw_print_file}')

    norm_print_name = print_name
    if Params.NORMALIZE_DIST: norm_print_name += '_normdist'
    if Params.NORMALIZE_VAR: norm_print_name += '_normvar'

    norm_print_file = join(work_dir, norm_print_name)
    if can_reuse(norm_print_file, raw_print_file):
        with open(norm_print_file, 'rb') as f:
            norm = np.fromfile(f).reshape((len(index_by_key), fp_size))
    else:
        norm = _normalize_fingerprint(raw)
        with open(norm_print_file, 'wb') as f:
            norm.tofile(f)
        log.info(f'Saved normalised fingerprints into {norm_print_file}')

    return label, norm
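
A usage sketch; Params, index_by_key and the _raw_fingerprint/_normalize_fingerprint helpers come from the surrounding module, and the VCF path is hypothetical:

label, fp = make_fingerprint('sample1.vcf.gz', work_dir='work',
                             label='project__sample1', fp_size=20)
# fp is a numpy array of shape (len(index_by_key), 20)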
Example #9
def main(output_dir=None,
         tumor_bam=None,
         normal_bam=None,
         normal_name=None,
         tumor_name=None,
         genome=None,
         input_genomes_url=None,
         ref_fa=None,
         viruses_fa=None,
         repeat_masker_bed=None,
         breakend_pon=None,
         bp_pon=None,
         bp_hotspots=None,
         min_tumor_af=None,
         requested_cores=None,
         unlock=False,
         dryrun=False,
         maxcoverage=None,
         chunksize_mil=None,
         jvm_heap=None,
         externalaligner=None):

    conf = {}

    output_dir = output_dir or 'gridss_results'
    output_dir = safe_mkdir(abspath(output_dir))
    log_dir = safe_mkdir(join(output_dir, 'log'))
    logger.init(log_fpath_=join(log_dir, 'gridss.log'), save_previous=True)
    if isfile(join(output_dir, 'work', 'all.done')):
        run_simple('rm ' + join(output_dir, 'work', 'all.done'))
    conf['output_dir'] = adjust_path(output_dir)

    tumor_name = tumor_name or splitext_plus(basename(tumor_bam))[0]\
        .replace('-ready', '').replace('-sorted', '')
    if normal_bam:
        normal_name = normal_name or splitext_plus(basename(normal_bam))[0]\
            .replace('-ready', '').replace('-sorted', '')
        conf['normal_bam'] = verify_file(normal_bam, 'Normal BAM, -N option')
        conf['normal_name'] = normal_name
    conf['tumor_bam'] = verify_file(tumor_bam, 'Tumor BAM, -T option')
    conf['tumor_name'] = tumor_name

    try:
        machine_cores = len(os.sched_getaffinity(0))
    except (AttributeError, OSError):  # sched_getaffinity is Linux-only
        machine_cores = 1
    cores = min(machine_cores, 8)
    if requested_cores:
        cores = min(cores, requested_cores)
    conf['cores'] = cores

    if maxcoverage:
        conf['maxcoverage'] = maxcoverage
    if chunksize_mil:
        conf['chunksize_mil'] = chunksize_mil
    if jvm_heap:
        conf['jvm_heap'] = jvm_heap
    if externalaligner:
        conf['externalaligner'] = externalaligner

    conf['genome'] = genome
    try:
        from reference_data import api as refdata
    except ImportError:
        pass
    else:
        # check reference_data can find the genomes dir, and error out if not
        genomes_dir = refdata.find_genomes_dir(input_genomes_url)
        if genomes_dir:
            conf['genomes_dir'] = genomes_dir

    if ref_fa:
        if externalaligner != 'minimap2' and not verify_file(ref_fa + '.bwt'):
            log.critical(f'Please index {ref_fa} using:\n'
                         f'    bwa index {ref_fa}')
        if not verify_file(ref_fa + '.fai'):
            log.critical(f'Please index {ref_fa} using:\n'
                         f'    samtools faidx {ref_fa}')
        conf['ref_fa'] = ref_fa
    if viruses_fa:
        if externalaligner != 'minimap2' and not verify_file(viruses_fa + '.bwt'):
            log.critical(f'Please index {viruses_fa} using:\n'
                         f'    bwa index {viruses_fa}')
        if not verify_file(viruses_fa + '.fai'):
            log.critical(f'Please index {viruses_fa} using:\n'
                         f'    samtools faidx {viruses_fa}')
        dict_file = viruses_fa.replace('.fa', '.dict')
        if not verify_file(dict_file):
            log.critical(f'Please index {viruses_fa} using:\n'
                         f'    samtools dict {viruses_fa} -o {dict_file}')
        img_file = viruses_fa + '.img'
        if not verify_file(img_file):
            log.critical(
                f'Please create an img file for {viruses_fa} using:\n'
                f'    gatk BwaMemIndexImageCreator -I {viruses_fa} -O {img_file}'
            )

        conf['viruses_fa'] = verify_file(viruses_fa)
    if repeat_masker_bed:
        conf['repeat_masker_bed'] = repeat_masker_bed
    if breakend_pon:
        conf['breakend_pon'] = breakend_pon
    if bp_pon:
        conf['bp_pon'] = bp_pon
    if bp_hotspots:
        conf['bp_hotspots'] = bp_hotspots
    if min_tumor_af:
        conf['min_tumor_af'] = min_tumor_af

    py_path = sys.executable  # e.g. /miniconda/envs/umccrise_hmf/bin/python
    env_path = dirname(dirname(py_path))  # e.g. /miniconda/envs/umccrise_hmf
    found = glob.glob(join(env_path, 'share/gridss-*/gridss.jar'))
    if not found:
        # fall back to a secondary 'hmf' conda env; error out if the jar
        # cannot be located there either
        hmf_env_path = secondary_conda_env('hmf', is_critical=False)
        if hmf_env_path:
            found = glob.glob(join(hmf_env_path, 'share/gridss-*/gridss.jar'))
            conf['gridss_env'] = hmf_env_path
        if not found:
            critical('Cannot find gridss JAR. Make sure you ran '
                     '`conda install -c bioconda gridss`')
    conf['gridss_jar'] = found[0]

    run_snakemake(join(package_path(), 'gridss', 'Snakefile'),
                  conf,
                  cores=cores,
                  output_dir=output_dir,
                  unlock=unlock,
                  dryrun=dryrun)
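
A usage sketch for this driver; it is normally wired to a CLI, but it can be called directly (all paths hypothetical, reference data assumed indexed):

main(output_dir='gridss_results',
     tumor_bam='tumor-ready.bam',
     normal_bam='normal-ready.bam',
     genome='GRCh37',
     ref_fa='GRCh37.fa',
     requested_cores=8)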
Example #10
def main(output_dir=None,
         normal_bam=None,
         tumor_bam=None,
         snv_vcf=None,
         normal_name=None,
         tumor_name=None,
         sample=None,
         genome=None,
         genomes_dir=None,
         gridss_ref_dir=None,
         ref_fa=None,
         threads=None,
         jvmheap=None):

    gridss_linx_dir = abspath(join(package_path(), '..', 'gridss-purple-linx'))
    gridss_scripts_dir = abspath(join(package_path(), '..', 'gridss/scripts'))

    normal_name = normal_name or splitext_plus(basename(normal_bam))[0]\
        .replace('-ready', '').replace('-sorted', '')
    tumor_name = tumor_name or splitext_plus(basename(tumor_bam))[0]\
        .replace('-ready', '').replace('-sorted', '')
    sample = sample or tumor_name

    output_dir = safe_mkdir(abspath(output_dir or 'gridss'))
    logger.init(log_fpath_=join(output_dir, 'gridss.log'), save_previous=True)
    output_vcf = join(output_dir, f'{sample}-gridss-purple-linx.vcf')

    assert genome == 'GRCh37', 'Only GRCh37 is supported for GRIDSS so far'

    if genomes_dir:
        refdata.find_genomes_dir(genomes_dir)
    if not gridss_ref_dir:
        gridss_ref_dir = refdata.get_ref_file(genome, 'gridss_purple_linx_dir')
    if not ref_fa:
        ref_fa = refdata.get_ref_file(genome, 'fa')

    hmf_env_path = conda_utils.secondary_conda_env('hmf')

    gridss_jar = glob.glob(join(hmf_env_path, 'share/gridss-*/gridss.jar'))[0]
    amber_jar = glob.glob(
        join(hmf_env_path, 'share/hmftools-amber-*/amber.jar'))[0]
    cobalt_jar = glob.glob(
        join(hmf_env_path, 'share/hmftools-cobalt-*/cobalt.jar'))[0]
    purple_jar = glob.glob(
        join(hmf_env_path, 'share/hmftools-purple-*/purple.jar'))[0]
    linx_jar = glob.glob(
        join(hmf_env_path, 'share/hmftools-linx-*/sv-linx.jar'))[0]

    cmd = f"""
PATH={hmf_env_path}/bin:$PATH \
THREADS={threads} \
GRIDSS_JAR={gridss_jar} \
AMBER_JAR={amber_jar} \
COBALT_JAR={cobalt_jar} \
PURPLE_JAR={purple_jar} \
LINX_JAR={linx_jar} \
bash -x {join(gridss_linx_dir, 'gridss-purple-linx.sh')} \
-n {normal_bam} \
-t {tumor_bam} \
-v {output_vcf} \
-s {sample} \
--normal_sample {normal_name} \
--tumour_sample {tumor_name} \
--snvvcf {snv_vcf} \
--ref_dir {gridss_ref_dir} \
--install_dir {gridss_scripts_dir} \
--reference {ref_fa} \
--output_dir {output_dir} \
{f"--jvmheap {jvmheap}" if jvmheap else ""}
""".strip()

    try:
        run_simple(cmd)
    except subprocess.SubprocessError:
        err('--------\n')
        err('Error running GRIDSS-PURPLE-LINX.\n')
        raise
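
A usage sketch mirroring the expected inputs (GRCh37 only, per the assert above; all paths hypothetical):

main(output_dir='gridss',
     normal_bam='normal-ready.bam',
     tumor_bam='tumor-ready.bam',
     snv_vcf='somatic-ensemble.vcf.gz',
     genome='GRCh37',
     threads=8)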