def do(ref_fpath, samples, sample_ids, scratch_dirpath, output_dirpath):
    """Process every sample in parallel and collect the per-sample results.

    The reference is prepared once with ``utils.prepare_reference``; then
    ``process_single_sample`` is dispatched through joblib for each entry of
    ``samples``, paired with the id found at the same index of ``sample_ids``.

    Args:
        ref_fpath: path of the reference, forwarded to every worker.
        samples: sized sequence of per-sample inputs.
        sample_ids: sequence indexed in parallel with ``samples``.
        scratch_dirpath: scratch directory, forwarded unchanged.
        output_dirpath: output directory, forwarded unchanged.

    Returns:
        A list with one ``process_single_sample`` result per sample
        (presumably final BAM paths, judging by the original variable name).
        An empty list is returned when ``samples`` is empty.
    """
    # Without this guard n_jobs would be 0 and the thread-budget division
    # below would raise ZeroDivisionError.
    if not samples:
        return []

    from libs.joblib import Parallel, delayed

    n_jobs = min(len(samples), config.max_threads)
    # Split the global thread budget evenly between the concurrent workers,
    # giving each at least one thread.
    num_threads = max(1, config.max_threads // n_jobs)
    utils.prepare_reference(ref_fpath, scratch_dirpath)
    final_bam_fpaths = Parallel(n_jobs=n_jobs)(
        delayed(process_single_sample)(ref_fpath, sample_ids[i], sample, scratch_dirpath,
                                       output_dirpath, str(num_threads))
        for i, sample in enumerate(samples))
    return final_bam_fpaths
def process_single_file(ref_fpath, sampleID, bam_fpath, output_dirpath, scratch_dirpath):
    """Split every chromosome into fixed-size regions and process them in parallel.

    Each chromosome listed in the module-level ``chr_names`` is cut into
    consecutive 1-based, inclusive ranges of at most ``chunk_size`` positions
    (the last range is clipped to ``chr_lengths[name]``). Every range is then
    handed to ``process_single_chunk`` through joblib, together with the
    sample's log path ``<output_dirpath>/<sampleID>.log``.

    Returns:
        The list of ``process_single_chunk`` results, one per region
        (named raw VCF paths in the calling code).
    """
    sample_log_fpath = os.path.join(output_dirpath, sampleID + '.log')

    # Build (chromosome, part number, first position, last position) tuples.
    regions = []
    for chrom in chr_names:
        chrom_len = chr_lengths[chrom]
        part_no = 1
        first = 1
        while first <= chrom_len:
            last = min(first + chunk_size - 1, chrom_len)
            regions.append((chrom, part_no, first, last))
            first = last + 1
            part_no += 1

    workers = min(len(regions), config.max_gatk_threads)
    return Parallel(n_jobs=workers)(
        delayed(process_single_chunk)(ref_fpath, sampleID, bam_fpath, scratch_dirpath,
                                      sample_log_fpath, name, idx, lo, hi)
        for (name, idx, lo, hi) in regions)
def process_files(ref_fpath, sample_ids, bam_fpaths, scratch_dirpath, output_dirpath, project_id, sample_files, sample_names):
    """Run the project-level variant calling pipeline and return the final VCF path.

    Steps, in order:
      1. ``process_single_file`` is run for every BAM, one sample at a time.
      2. Each sample's chunk results are merged with ``merge_vcfs`` (in
         parallel through joblib).
      3. GATK ``GenotypeGVCFs`` joint-genotypes all merged per-sample VCFs.
      4. Unless the module-level ``reduced_workflow`` flag is set, GATK
         ``VariantRecalibrator`` / ``ApplyRecalibration`` are run for SNPs and
         then for indels. If a recalibration step returns a non-zero code,
         ``print_variant_filtering_warning`` is called and the remaining
         filtering steps are skipped.
      5. Two ``VariantEval`` reports are produced and passed to
         ``printReport``.
      6. Every per-sample VCF and the final VCF are compressed with ``bgzip``
         and indexed with ``tabix``.

    The stderr of every external tool is appended to
    ``<output_dirpath>/<project_id>.log``.

    Args:
        ref_fpath: reference path passed to every GATK invocation.
        sample_ids: sample identifiers, indexed in parallel with ``bam_fpaths``.
        bam_fpaths: per-sample BAM paths.
        scratch_dirpath: directory for intermediate files.
        output_dirpath: directory for the log and the final VCF.
        project_id: prefix used for all project-level file names.
        sample_files: forwarded to ``printReport`` unchanged.
        sample_names: forwarded to ``printReport`` unchanged.

    Returns:
        ``<output_dirpath>/<project_id>.vcf``.
        NOTE(review): the file is bgzipped at the end, so callers presumably
        need ``<returned path> + '.gz'`` -- confirm against the call sites.
    """
    log_fpath = os.path.join(output_dirpath, project_id + '.log')
    num_threads = str(config.max_threads)

    def run_logged(cmd):
        # Open the project log for this one call and close it as soon as the
        # call returns, instead of leaving one unclosed handle per tool run.
        with open(log_fpath, 'a') as log_file:
            return utils.call_subprocess(cmd, stderr=log_file)

    # Parenthesised single-argument print emits the same text under Python 2.
    print('Calling variants...')
    raw_vcf_fpaths = [process_single_file(ref_fpath, sample_ids[i], bam_fpath, output_dirpath, scratch_dirpath)
                      for i, bam_fpath in enumerate(bam_fpaths)]
    n_jobs = min(len(raw_vcf_fpaths), config.max_threads)
    g_vcf_fpaths = Parallel(n_jobs=n_jobs)(
        delayed(merge_vcfs)(output_dirpath, sample_ids[i], sample_raw_vcf_fpaths, ref_fpath)
        for i, sample_raw_vcf_fpaths in enumerate(raw_vcf_fpaths))
    raw_vcf_fpath = os.path.join(scratch_dirpath, project_id + '.raw.vcf')
    vcf_fpath = os.path.join(output_dirpath, project_id + '.vcf')
    if reduced_workflow:
        # No filtering stage follows, so joint genotyping writes the final VCF directly.
        raw_vcf_fpath = vcf_fpath

    print('Joint genotyping...')
    # Build explicit ['-V', path] pairs; joining and re-splitting on whitespace
    # would break any path that contains a space into several arguments.
    variants = []
    for g_vcf_fpath in g_vcf_fpaths:
        variants.extend(['-V', g_vcf_fpath])
    call_conf = config.low_call_conf if config.low_emit else config.stand_call_conf
    emit_conf = config.low_emit_conf if config.low_emit else config.stand_emit_conf
    cmd = ['java', '-jar', gatk_fpath, '-T', 'GenotypeGVCFs', '-R', ref_fpath, '-nt', num_threads,
           '-o', raw_vcf_fpath, '-stand_call_conf', call_conf,
           '-stand_emit_conf', emit_conf]

    run_logged(cmd + variants)
    if not reduced_workflow:
        print('Filtering variants...')
        mem_gb = str(config.max_memory)
        gatk_java = ['java', '-Xmx%sg' % mem_gb, '-jar', gatk_fpath]
        annotations = ['-an', 'DP', '-an', 'QD', '-an', 'FS', '-an', 'MQRankSum', '-an', 'ReadPosRankSum']

        recal_fpath = os.path.join(scratch_dirpath, project_id + '_SNP.recal')
        tranches_fpath = os.path.join(scratch_dirpath, project_id + '_SNP.tranches')

        raw_indels_vcf_fpath = os.path.join(scratch_dirpath, project_id + '_raw_indels.vcf')
        recal_indel_fpath = os.path.join(scratch_dirpath, project_id + '_INDEL.recal')
        tranches_indel_fpath = os.path.join(scratch_dirpath, project_id + '_INDEL.tranches')
        # variant filtering
        return_code = run_logged(
            gatk_java + ['-T', 'VariantRecalibrator', '-R', ref_fpath, '-input', raw_vcf_fpath,
                         '-resource:hapmap,known=false,training=true,truth=true,prior=15.0', hapmap_fpath,
                         '-resource:omni,known=false,training=true,truth=true,prior=12.0', omni_fpath,
                         '-resource:1000G,known=false,training=true,truth=false,prior=10.0', tg_indels_fpath,
                         '-resource:dbsnp,known=true,training=false,truth=false,prior=2.0', dbsnp_fpath]
            + annotations
            + ['-mode', 'SNP', '-recalFile', recal_fpath, '-tranchesFile', tranches_fpath])
        if return_code != 0:
            print_variant_filtering_warning(raw_vcf_fpath, vcf_fpath)
        else:
            # SNP-filtered output becomes the input of the indel recalibration.
            run_logged(
                gatk_java + ['-T', 'ApplyRecalibration', '-R', ref_fpath, '-input', raw_vcf_fpath, '-mode', 'SNP',
                             '--ts_filter_level', '99.5', '-recalFile', recal_fpath,
                             '-tranchesFile', tranches_fpath, '-o', raw_indels_vcf_fpath])

            return_code = run_logged(
                gatk_java + ['-T', 'VariantRecalibrator', '-R', ref_fpath, '-input', raw_indels_vcf_fpath,
                             '-resource:mills,known=true,training=true,truth=true,prior=12.0', mills_fpath,
                             '-resource:dbsnp,known=true,training=false,truth=false,prior=2.0', dbsnp_fpath]
                + annotations
                + ['-mode', 'INDEL', '--maxGaussians', '4', '-recalFile', recal_indel_fpath,
                   '-tranchesFile', tranches_indel_fpath])

            if return_code != 0:
                print_variant_filtering_warning(raw_vcf_fpath, vcf_fpath)
            else:
                run_logged(
                    gatk_java + ['-T', 'ApplyRecalibration', '-R', ref_fpath, '-input', raw_indels_vcf_fpath,
                                 '-mode', 'INDEL', '--ts_filter_level', '99.0', '-recalFile', recal_indel_fpath,
                                 '-tranchesFile', tranches_indel_fpath, '-o', vcf_fpath])

    report_vars_fpath = os.path.join(scratch_dirpath, project_id + '.var.txt')
    run_logged(['java', '-jar', gatk_fpath, '-T', 'VariantEval', '-R', ref_fpath, '-eval', vcf_fpath,
                '-noST', '-noEV', '-EV', 'CountVariants', '-ST', 'Sample', '-o', report_vars_fpath])

    report_tstv_fpath = os.path.join(scratch_dirpath, project_id + '.tv.txt')
    run_logged(['java', '-jar', gatk_fpath, '-T', 'VariantEval', '-R', ref_fpath, '-eval', vcf_fpath,
                '-noST', '-noEV', '-EV', 'TiTvVariantEvaluator', '-ST', 'Sample', '-o', report_tstv_fpath])

    printReport(report_vars_fpath, report_tstv_fpath, sample_names, sample_ids, sample_files, output_dirpath)

    # Compress and index the per-sample VCFs first, then the project VCF.
    for g_vcf_fpath in g_vcf_fpaths:
        run_logged(['bgzip', '-f', g_vcf_fpath])
        run_logged(['tabix', '-p', 'vcf', g_vcf_fpath + '.gz'])

    run_logged(['bgzip', '-f', vcf_fpath])
    run_logged(['tabix', '-p', 'vcf', vcf_fpath + '.gz'])
    return vcf_fpath