Example #1
def iter_vcf__pysam(input_file,
                    proc_rec=None,
                    proc_hdr=None,
                    output_file=None):
    import pysam
    import sys

    vcf = pysam.VariantFile(input_file)
    if output_file:
        w = open(output_file, 'w')
    else:
        w = sys.stdout

    # Header
    if proc_hdr is not None:
        proc_hdr(vcf)
    w.write(str(vcf.header))

    # Records
    for rec in vcf:
        if proc_rec:
            rec_res = proc_rec(rec)
            if rec_res is not None:
                w.write(str(rec_res))

    vcf.close()

    if output_file:
        w.close()
        out_ungz, out_gz = get_ungz_gz(output_file)
        run_simple(f'bgzip -f {out_ungz} && tabix -f -p vcf {out_gz}')
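A minimal usage sketch for the pysam-based iterator above (the file paths and the MY_ANN INFO tag are hypothetical):

def _add_hdr(vcf):
    # declare the INFO field before the header is written out
    vcf.header.add_line('##INFO=<ID=MY_ANN,Number=1,Type=String,Description="Example annotation">')

def _annotate(rec):
    rec.info['MY_ANN'] = 'example'
    return rec  # return None instead to drop the record

iter_vcf__pysam('input.vcf.gz', proc_rec=_annotate, proc_hdr=_add_hdr, output_file='annotated.vcf.gz')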
Example #2
def iter_vcf(input_file,
             output_file,
             proc_rec,
             proc_hdr=None,
             postproc_hdr=None,
             **kwargs):
    """
    :param input_file: path to input VCF file
    :param output_file: path to output VCF file (can be .vcf or .vcf.gz, but it will always bgzip/tabix and write with .vcf.gz extention)
    :param proc_rec: a function to process a single cyvcf Record object. Returns either a (new) Record object to write, or None to indicate that the record should be discarded
    :param proc_hdr: a function to process cyvcf object once (i.e. to add values to the header with vcf.add_info_to_header, etc)
    :param postproc_hdr: a function to postprocess finalized header string (vcf.rawheader), e.g. in order to remove values
    :param kwargs: any paramters to pass directly into proc_rec
    """
    import sys
    from cyvcf2 import VCF
    vcf = VCF(input_file, gts012=True)
    if proc_hdr is not None:
        proc_hdr(vcf)

    if output_file is not None:
        out_ungz, out_gz = get_ungz_gz(output_file)
        w = open(out_ungz, 'w')
    else:
        w = sys.stdout

    header = vcf.raw_header
    if postproc_hdr is not None:
        header = postproc_hdr(header)
    w.write(header)

    for rec in vcf:
        if proc_rec:
            rec_res = proc_rec(rec, vcf, **kwargs)
            if rec_res is not None:
                w.write(str(rec_res))

    sys.stderr.write(f'Finished writing {output_file}\n')
    vcf.close()
    if output_file is not None:
        w.close()
        run_simple(f'bgzip -f {out_ungz} && tabix -f -p vcf {out_gz}')
        sys.stderr.write(f'Compressed {output_file}\n')
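A comparable usage sketch for the cyvcf2-based version (tag name and paths are made up; add_info_to_header and INFO item assignment are standard cyvcf2 calls):

def _add_hdr(vcf):
    vcf.add_info_to_header({'ID': 'MY_ANN', 'Number': '1', 'Type': 'String',
                            'Description': 'Example annotation'})

def _annotate(rec, vcf):
    rec.INFO['MY_ANN'] = 'example'
    return rec  # or None to drop the record

iter_vcf('input.vcf.gz', 'annotated.vcf.gz', proc_rec=_annotate, proc_hdr=_add_hdr)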
Example #3
def setup_tibanna(tibanna_id=None, buckets=None):
    try:
        subprocess.check_call('tibanna --version', shell=True)
    except subprocess.CalledProcessError:
        logger.err('Error: tibanna is not installed. Please run `pip install tibanna`')
        sys.exit(1)

    if not tibanna_id:
        tibanna_id = ''.join(random.choice(string.ascii_uppercase + string.ascii_lowercase + string.digits) 
                             for _ in range(8))
        assert not check_tibanna_id_exists(tibanna_id), 'Random tibanna ID already exists: ' + tibanna_id

    step_func_name = f'tibanna_unicorn_{tibanna_id}'
    if not check_tibanna_id_exists(tibanna_id):
        buckets_str = '' if not buckets else ('-b ' + ','.join(buckets))
        run_simple(f'tibanna deploy_unicorn -g {step_func_name} {buckets_str} --no-setenv')

    return step_func_name
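A minimal sketch of how the returned step function name could be wired into a tibanna-backed Snakemake run (the bucket name is hypothetical; Example #10 below builds the same options internally):

step_func_name = setup_tibanna(buckets=['my-output-bucket'])
tibanna_opts = f'--tibanna --default-remote-prefix my-output-bucket/results --tibanna-sfn {step_func_name}'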
Example #4
def _test_pvac(bedpe_path):
    pvac_bedpe = bedpe_path.replace('.bedpe', '.pvac.bedpe')
    pvac_tsv_path = bedpe_path.replace('.bedpe', '.pvac.tsv')
    pvac_fasta_fpath = bedpe_path.replace('.bedpe', '.pvac.fasta')
    pvac_fasta_key_fpath = bedpe_path.replace('.bedpe', '.pvac.fasta_key')

    run_simple(f'grep -v ^chr {bedpe_path} > {pvac_bedpe}')

    from lib.fasta_generator import FusionFastaGenerator
    from lib.pipeline import MHCIPipeline

    class_i_arguments = {
        'input_file': pvac_bedpe,
        'input_file_type': 'bedpe',
        'sample_name': bedpe_path.replace('.bedpe', '.pvac'),
        'alleles': 'HLA-A*02:01',
        'prediction_algorithms': 'NetMHCcons',
        'output_dir': dirname(bedpe_path),
        'epitope_lengths': 11,
    }
    if isfile(pvac_tsv_path): os.remove(pvac_tsv_path)
    pipeline = MHCIPipeline(**class_i_arguments)
    pipeline.convert_vcf()

    generate_fasta_params = {
        'input_file': pvac_tsv_path,
        'epitope_length': 11,
        'output_file': pvac_fasta_fpath,
        'output_key_file': pvac_fasta_key_fpath,
        'downstream_sequence_length': 1000,
    }
    fasta_generator = FusionFastaGenerator(**generate_fasta_params)
    fasta_generator.execute()

    # repeat with a shorter epitope length; note this reuses the same output paths,
    # so the 11-mer fasta/key files above are overwritten
    generate_fasta_params = {
        'input_file': pvac_tsv_path,
        'epitope_length': 8,
        'output_file': pvac_fasta_fpath,
        'output_key_file': pvac_fasta_key_fpath,
        'downstream_sequence_length': 1000,
    }
    fasta_generator = FusionFastaGenerator(**generate_fasta_params)
    fasta_generator.execute()
Example #5
def ungzip_if_needed(cnf, fpath, silent=False):
    if fpath.endswith('.gz'):
        fpath = fpath[:-3]
    if not file_exists(fpath) and file_exists(fpath + '.gz'):
        gz_fpath = fpath + '.gz'
        cmdline = f'gunzip -c {gz_fpath} > {fpath}'
        res = run_simple(cmdline)
        if not silent: info()
        if not res:
            return None
    return fpath
Example #6
def requanitify_pizzly(pizzly_ref_fa, fusions_fasta, work_dir, fastq):
    """ Returns dict fusion-fasta-id -> {length  eff_length  est_counts   tpm}
    """
    trx_with_fusions = join(work_dir, 'transcripts_with_fusions.fasta.gz')
    kidx = join(work_dir, 'transcripts_with_fusions.kidx')

    if not isfile(trx_with_fusions):
        run_simple(
            f"cat {pizzly_ref_fa} {fusions_fasta} | gzip -c > {trx_with_fusions}"
        )

    if not isfile(kidx):
        run_simple(f"kallisto index -k31 -i {kidx} {trx_with_fusions}")

    abundance = join(work_dir, 'abundance.tsv')
    if not isfile(abundance):
        run_simple(f"kallisto quant -i {kidx} -o {work_dir} {' '.join(fastq)}")

    logger.debug(f'Reading expression from {abundance}')
    expr_by_fusion = dict()
    with open(abundance) as f:
        header = f.readline().strip().split('\t')
        for row in csv.DictReader(f, delimiter='\t', fieldnames=header):
            expr_by_fusion[row['target_id']] = row
    return expr_by_fusion
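A hypothetical lookup against the returned dict (paths are illustrative; the keys are target_id values from the fusion fasta, and the columns are the standard kallisto abundance.tsv fields):

expr_by_fusion = requanitify_pizzly('transcripts.fa.gz', 'fusions.fasta', 'work', ['R1.fq.gz', 'R2.fq.gz'])
for target_id, row in expr_by_fusion.items():
    print(target_id, row['est_counts'], row['tpm'])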
Example #7
def main(output_dir=None,
         tumor_bam=None,
         normal_bam=None,
         normal_name=None,
         tumor_name=None,
         genome=None,
         input_genomes_url=None,
         ref_fa=None,
         viruses_fa=None,
         repeat_masker_bed=None,
         breakend_pon=None,
         bp_pon=None,
         bp_hotspots=None,
         min_tumor_af=None,
         requested_cores=None,
         unlock=False,
         dryrun=False,
         maxcoverage=None,
         chunksize_mil=None,
         jvm_heap=None,
         externalaligner=None):

    conf = {}

    output_dir = output_dir or 'gridss_results'
    output_dir = safe_mkdir(abspath(output_dir))
    log_dir = safe_mkdir(join(output_dir, 'log'))
    logger.init(log_fpath_=join(log_dir, 'gridss.log'), save_previous=True)
    if isfile(join(output_dir, 'work', 'all.done')):
        run_simple('rm ' + join(output_dir, 'work', 'all.done'))
    conf['output_dir'] = adjust_path(output_dir)

    tumor_name = tumor_name or splitext_plus(basename(tumor_bam))[0]\
        .replace('-ready', '').replace('-sorted', '')
    if normal_bam:
        normal_name = normal_name or splitext_plus(basename(normal_bam))[0]\
            .replace('-ready', '').replace('-sorted', '')
        conf['normal_bam'] = verify_file(normal_bam, 'Normal BAM, -N option')
        conf['normal_name'] = normal_name
    conf['tumor_bam'] = verify_file(tumor_bam, 'Tumor BAM, -T option')
    conf['tumor_name'] = tumor_name

    try:
        machine_cores = len(os.sched_getaffinity(0))
    except (AttributeError, OSError):  # sched_getaffinity is not available on all platforms
        machine_cores = 1
    cores = min(machine_cores, 8)
    if requested_cores:
        cores = min(cores, requested_cores)
    conf['cores'] = cores

    if maxcoverage:
        conf['maxcoverage'] = maxcoverage
    if chunksize_mil:
        conf['chunksize_mil'] = chunksize_mil
    if jvm_heap:
        conf['jvm_heap'] = jvm_heap
    if externalaligner:
        conf['externalaligner'] = externalaligner

    conf['genome'] = genome
    try:
        from reference_data import api as refdata
    except ImportError:
        pass
    else:
        # check reference_data can find the genomes dir, and error out if not
        genomes_dir = refdata.find_genomes_dir(input_genomes_url)
        if genomes_dir:
            conf['genomes_dir'] = genomes_dir

    if ref_fa:
        if externalaligner != 'minimap2' and not verify_file(ref_fa + '.bwt'):
            log.critical(f'Please index {ref_fa} using:\n'
                         f'    bwa index {ref_fa}')
        if not verify_file(ref_fa + '.fai'):
            log.critical(f'Please index {ref_fa} using:\n'
                         f'    samtools faidx {ref_fa}')
        conf['ref_fa'] = ref_fa
    if viruses_fa:
        if externalaligner != 'minimap2' and not verify_file(viruses_fa + '.bwt'):
            log.critical(f'Please index {viruses_fa} using:\n'
                         f'    bwa index {viruses_fa}')
        if not verify_file(viruses_fa + '.fai'):
            log.critical(f'Please index {viruses_fa} using:\n'
                         f'    samtools faidx {viruses_fa}')
        dict_file = viruses_fa.replace('.fa', '.dict')
        if not verify_file(dict_file):
            log.critical(f'Please index {viruses_fa} using:\n'
                         f'    samtools dict {viruses_fa} -o {dict_file}')
        img_file = viruses_fa + '.img'
        if not verify_file(img_file):
            log.critical(
                f'Please, create an img file for {viruses_fa} using:\n'
                f'   gatk BwaMemIndexImageCreator -I  {viruses_fa} -O {img_file}'
            )

        conf['viruses_fa'] = verify_file(viruses_fa)
    if repeat_masker_bed:
        conf['repeat_masker_bed'] = repeat_masker_bed
    if breakend_pon:
        conf['breakend_pon'] = breakend_pon
    if bp_pon:
        conf['bp_pon'] = bp_pon
    if bp_hotspots:
        conf['bp_hotspots'] = bp_hotspots
    if min_tumor_af:
        conf['min_tumor_af'] = min_tumor_af

    py_path = sys.executable  # e.g. /miniconda/envs/umccrise_hmf/bin/python
    env_path = dirname(dirname(py_path))  # e.g. /miniconda/envs/umccrise_hmf
    found = glob.glob(join(env_path, 'share/gridss-*/gridss.jar'))
    if not found:
        hmf_env_path = secondary_conda_env('hmf', is_critical=False)
        if hmf_env_path:
            found = glob.glob(join(hmf_env_path, 'share/gridss-*/gridss.jar'))
            conf['gridss_env'] = hmf_env_path
        if not found:
            critical(
                'Cannot find gridss JAR. Make sure you ran `conda install -c bioconda gridss`'
            )
    conf['gridss_jar'] = found[0]

    run_snakemake(join(package_path(), 'gridss', 'Snakefile'),
                  conf,
                  cores=cores,
                  output_dir=output_dir,
                  unlock=unlock,
                  dryrun=dryrun)
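A hypothetical direct invocation of the wrapper above (all paths and names are illustrative; in practice the function is normally driven by a CLI entry point):

main(output_dir='gridss_results',
     tumor_bam='sample_tumor-ready.bam',
     normal_bam='sample_normal-ready.bam',
     genome='hg38',
     ref_fa='hg38.fa',
     requested_cores=8)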
Example #8
def main(output_dir=None,
         normal_bam=None,
         tumor_bam=None,
         snv_vcf=None,
         normal_name=None,
         tumor_name=None,
         sample=None,
         genome=None,
         genomes_dir=None,
         gridss_ref_dir=None,
         ref_fa=None,
         threads=None,
         jvmheap=None):

    gridss_linx_dir = abspath(join(package_path(), '..', 'gridss-purple-linx'))
    gridss_scripts_dir = abspath(join(package_path(), '..', 'gridss/scripts'))

    normal_name = normal_name or splitext_plus(basename(normal_bam))[0]\
        .replace('-ready', '').replace('-sorted', '')
    tumor_name = tumor_name or splitext_plus(basename(tumor_bam))[0]\
        .replace('-ready', '').replace('-sorted', '')
    sample = sample or tumor_name

    output_dir = safe_mkdir(abspath(output_dir or 'gridss'))
    logger.init(log_fpath_=join(output_dir, 'gridss.log'), save_previous=True)
    output_vcf = join(output_dir, f'{sample}-gridss-purple-linx.vcf')

    assert genome == 'GRCh37', 'Only GRCh37 is currently supported for GRIDSS'

    if genomes_dir:
        refdata.find_genomes_dir(genomes_dir)
    if not gridss_ref_dir:
        gridss_ref_dir = refdata.get_ref_file(genome, 'gridss_purple_linx_dir')
    if not ref_fa:
        ref_fa = refdata.get_ref_file(genome, 'fa')

    hmf_env_path = conda_utils.secondary_conda_env('hmf')

    gridss_jar = glob.glob(join(hmf_env_path, 'share/gridss-*/gridss.jar'))[0]
    amber_jar = glob.glob(
        join(hmf_env_path, 'share/hmftools-amber-*/amber.jar'))[0]
    cobalt_jar = glob.glob(
        join(hmf_env_path, 'share/hmftools-cobalt-*/cobalt.jar'))[0]
    purple_jar = glob.glob(
        join(hmf_env_path, 'share/hmftools-purple-*/purple.jar'))[0]
    linx_jar = glob.glob(
        join(hmf_env_path, 'share/hmftools-linx-*/sv-linx.jar'))[0]

    cmd = f"""
PATH={hmf_env_path}/bin:$PATH \
THREADS={threads} \
GRIDSS_JAR={gridss_jar} \
AMBER_JAR={amber_jar} \
COBALT_JAR={cobalt_jar} \
PURPLE_JAR={purple_jar} \
LINX_JAR={linx_jar} \
bash -x {join(gridss_linx_dir, 'gridss-purple-linx.sh')} \
-n {normal_bam} \
-t {tumor_bam} \
-v {output_vcf} \
-s {sample} \
--normal_sample {normal_name} \
--tumour_sample {tumor_name} \
--snvvcf {snv_vcf} \
--ref_dir {gridss_ref_dir} \
--install_dir {gridss_scripts_dir} \
--reference {ref_fa} \
--output_dir {output_dir} \
{f"--jvmheap {jvmheap}" if jvmheap else ""}
""".strip()

    try:
        run_simple(cmd)
    except subprocess.SubprocessError:
        err('--------\n')
        err('Error running GRIDSS-PURPLE-LINX.\n')
        raise
Example #9
def run_snakemake(snakefile, conf, jobs=None, output_dir=None, forcerun=None,
                  unlock=False, dryrun=False, target_rules=None, cluster=None, cluster_cmd=None,
                  log_dir=None, dag=None, report=None, restart_times=None):

    conf['total_cores'] = jobs

    #########################
    #### Setting cluster ####
    #########################

    cluster_param = ''
    cluster_log_dir = ''
    if cluster or cluster_cmd:
        assert log_dir, 'For cluster run, must also specify log_dir'
        if cluster_cmd:
            cluster_param = f' --cluster "{cluster_cmd}"'
        else:
            cluster_log_dir = safe_mkdir(join(log_dir, 'cluster'))
            cluster_param = make_cluster_cmdl(cluster_log_dir, 'umccrise')

    ##########################
    #### Preparing config ####
    ##########################

    if log_dir:
        safe_mkdir(log_dir)
        conf_f = open(join(log_dir, '.conf.yaml'), 'w')
    else:
        conf_f = tempfile.NamedTemporaryFile(mode='wt', delete=False)
    yaml.dump(conf, conf_f)
    conf_f.close()

    ###############################
    #### Building command line ####
    ###############################
    if forcerun:
        forcerun = " ".join(forcerun.split(','))

    cmd = (
        f'snakemake '
        f'{" ".join(flatten([target_rules])) if target_rules else ""} ' +
        f'--snakefile {snakefile} '
        f'--printshellcmds '
        f'{"--dryrun " if dryrun else ""}'
        f'{"--dag " if dag else ""}'
        f'{f"--report {report} " if report else ""}'
        f'{f"--directory {output_dir} " if output_dir else ""}'
        f'{f"-j {jobs} " if jobs else ""}'
        f'--rerun-incomplete '
        f'{f"--restart-times {restart_times} " if restart_times else ""}'
        f'{cluster_param} '
        f'--configfile {conf_f.name} ' +
        f'{"--dag " if dag else ""}'
        f'{f"--forcerun {forcerun}" if forcerun else ""}'
    )

    #################
    #### Running ####
    #################

    if unlock:
        print('* Unlocking previous run... *')
        run_simple(cmd + ' --unlock')
        print('* Now rerunning *')

    try:
        run_simple(cmd)
    except subprocess.CalledProcessError:
        logger.error('--------')
        logger.error(f'Error: snakemake returned a non-zero status. Working directory: {output_dir}')
        if cluster_log_dir:
            run_simple(f'chmod -R a+r {cluster_log_dir}', silent=True)
            logger.error(f'Review cluster job logs in {cluster_log_dir}')
        sys.exit(1)
    except KeyboardInterrupt:
        logger.error('--------')
        logger.error(f'Interrupted. Fixing logs permissions. Working directory: {output_dir}')
        if cluster_log_dir:
            run_simple(f'chmod -R a+r {cluster_log_dir}', silent=True)
            logger.error(f'Review cluster job logs in {cluster_log_dir}')
        sys.exit(1)
    else:
        logger.info('--------')
        if cluster_log_dir:
            run_simple(f'chmod -R a+r {cluster_log_dir}', silent=True)
        logger.info(f'Finished. Output directory: {output_dir}')
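A minimal usage sketch for run_snakemake above (the Snakefile path, config keys and directories are made up for illustration):

conf = {'tumor_bam': 'tumor.bam', 'genome': 'GRCh37'}
run_snakemake('pipeline/Snakefile', conf,
              jobs=32,
              output_dir='results',
              log_dir='results/log',
              restart_times=1)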
Example #10
def run_snakemake(snakefile, conf, cores=None, output_dir=None, forcerun=None,
                  unlock=False, dryrun=False, target_rules=None, debug=False,
                  log_dir=None, dag=None, report=None, restart_times=1,
                  tibanna_cfg=None,
                  resources=None, cluster_param=None, cluster_log_dir=None,
                  local_cores=None, ncpus_per_batch=None, ncpus_per_sample=None,
                  tmp_dirs:list = None):

    ##########################
    #### Preparing config ####
    ##########################

    if unlock: conf['unlock'] = 'yes'

    if debug:
        conf['debug'] = 'yes'
        if restart_times is None:
            restart_times = 0
    if restart_times is None:
        restart_times = DEFAULT_RESTART_TIMES
    restart_times = int(restart_times)

    if ncpus_per_batch:
        conf['threads_per_batch'] = ncpus_per_batch
    if ncpus_per_sample:
        conf['threads_per_sample'] = ncpus_per_sample

    if log_dir:
        safe_mkdir(log_dir)
        conf_f = open(join(log_dir, '.conf.yaml'), 'w')
    else:
        conf_f = tempfile.NamedTemporaryFile(mode='wt', delete=False)
    yaml.dump(conf, conf_f)
    conf_f.close()

    ###############################
    #### Building command line ####
    ###############################

    if forcerun:
        forcerun = " ".join(forcerun.split(','))

    tibanna_opts = ''
    if tibanna_cfg:
        output_s3 = tibanna_cfg['output_s3']
        output_bucket_name = output_s3.split('/')[0]
        if ':' in output_bucket_name:
            output_bucket_name = output_bucket_name.split(':')[1]
        step_func_name = setup_tibanna(tibanna_cfg['id'], [output_bucket_name])
        tibanna_opts = f'--tibanna --default-remote-prefix {output_s3} --tibanna-sfn {step_func_name}'

    cmd = (
        f'snakemake '
        f'{" ".join(flatten([target_rules])) if target_rules else ""} ' +
        f'--snakefile {snakefile} '
        f'--printshellcmds '
        f'{"--dryrun " if dryrun else ""}'
        f'--rerun-incomplete '
        f'{"--dag " if dag else ""}'
        f'{f"--report {report} " if report else ""}'
        f'{f"--directory {output_dir} " if output_dir else ""}'
        f'--cores {cores} '
        f'{f"--local-cores {local_cores} " if local_cores else ""}'
        f'{f"--restart-times {restart_times - 1} " if restart_times > 1 else ""}'
        f'{cluster_param if cluster_param else ""} '
        f'--configfile {conf_f.name} ' +
        f'{f"--forcerun {forcerun} " if forcerun else ""}' +
        f'{f"--resources {resources} " if resources else ""} '
        f'{tibanna_opts}'
    )

    #################
    #### Running ####
    #################

    if unlock:
        print('* Unlocking previous run... *')
        run_simple(cmd + ' --unlock')
        print('* Now rerunning *')

    try:
        run_simple(cmd)
    except subprocess.CalledProcessError:
        logger.error('--------')
        logger.error(f'Error: snakemake returned a non-zero status. Working directory: {output_dir}')
        if cluster_log_dir and isdir(cluster_log_dir):
            run_simple(f'chmod -R a+r {cluster_log_dir}', silent=True)
            logger.error(f'Review cluster job logs in {cluster_log_dir}')
        for tmp_dir in tmp_dirs or []: tmp_dir.cleanup()
        sys.exit(1)
    except KeyboardInterrupt:
        logger.error('--------')
        logger.error(f'Interrupted. Fixing logs permissions. Working directory: {output_dir}')
        if cluster_log_dir and isdir(cluster_log_dir):
            run_simple(f'chmod -R a+r {cluster_log_dir}', silent=True)
            logger.error(f'Review cluster job logs in {cluster_log_dir}')
        for tmp_dir in tmp_dirs or []: tmp_dir.cleanup()
        sys.exit(1)
    else:
        logger.info('--------')
        if cluster_log_dir and isdir(cluster_log_dir):
            run_simple(f'chmod -R a+r {cluster_log_dir}', silent=True)
        logger.info(f'Finished. Output directory: {output_dir}')
        for tmp_dir in tmp_dirs or []: tmp_dir.cleanup()
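A hedged sketch of calling the tibanna-aware variant above (bucket, prefix and Snakefile path are made up; passing tibanna_cfg makes it deploy a unicorn via setup_tibanna and append the --tibanna options to the snakemake command):

conf = {'genome': 'GRCh38'}
run_snakemake('pipeline/Snakefile', conf,
              cores=16,
              output_dir='results',
              tibanna_cfg={'id': None, 'output_s3': 'my-bucket/results-prefix'})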