Example #1
import os
import re
import subprocess
import sys
def batch_qc(fn, cont, obj, t):
    # CLI parsing belongs in the caller (see the wiring sketch after this example); use the fn parameter directly
    fh = open(fn, 'r')
    src_cmd = '. ~/.novarc;'
    jobs = []
    for line in fh:
        line = line.rstrip('\n')
        # All files for current bnid to be stored in cwd
        swift_cmd = src_cmd + 'swift list ' + cont + ' --prefix ' + obj + '/' + line
        sys.stderr.write(date_time() + 'Checking for sequence files for sample ' + line + '\n' + swift_cmd + '\n')
        try:
            contents = subprocess.check_output(swift_cmd, shell=True).decode()
            # a listing shorter than the sample name means nothing was found
            if len(contents) < len(line):
                sys.stderr.write(date_time() + 'Can\'t find sequencing files for ' + line + ', skipping!\n')
                continue
        except subprocess.CalledProcessError:
            sys.stderr.write(date_time() + 'Can\'t find sequencing files for ' + line + ', skipping!\n')
            continue
        # the original pattern used a character class where alternation was intended
        seqfile = re.findall(r'(\S+(?:sequence|f\w*q)\S*\.gz)', contents)
        # expect paired-end read 1 and read 2
        sf1 = seqfile[0]
        end1 = os.path.basename(sf1)
        sf2 = seqfile[1]
        end2 = os.path.basename(sf2)
        swift_cmd = src_cmd + "swift download " + cont + " --skip-identical --prefix " + obj + '/' + line
        link_cmd = 'ln -s ' + sf1 + ' .;ln -s ' + sf2 + ' .'
        fastqc_cmd = 'mkdir -p PREQC/' + line + '; fastqc -t 2 -o PREQC/' + line + ' ' + sf1 + ' ' + sf2
        upload_cmd = src_cmd + 'swift upload ' + cont + ' PREQC/' + line
        cleanup_cmd = 'rm -rf RAW/' + line + ' PREQC/' + line + ' ' + end1 + ' ' + end2
        jobs.append(';'.join([swift_cmd, link_cmd, fastqc_cmd, upload_cmd, cleanup_cmd]))
    sys.stderr.write(date_time() + 'Job list created, running jobs!\n')
    job_manager(jobs, t)
    return 0
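batch_qc expects a caller to parse the command line and pass the values in; a minimal wiring sketch, assuming argparse and hypothetical flag names (-f, -c, -o, -t) that are illustrative rather than taken from the original module:

import argparse
import sys

# hypothetical CLI wiring for batch_qc; flag names and help text are illustrative only
parser = argparse.ArgumentParser(description='Run FastQC on swift-stored sequence files')
parser.add_argument('-f', '--fn', help='file listing one sample id per line')
parser.add_argument('-c', '--cont', help='swift container name')
parser.add_argument('-o', '--obj', help='object prefix inside the container')
parser.add_argument('-t', '--threads', type=int, default=4, help='max concurrent jobs for job_manager')

if __name__ == '__main__':
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    inputs = parser.parse_args()
    batch_qc(inputs.fn, inputs.cont, inputs.obj, inputs.threads)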
Example #2
import subprocess
def qc_bam_pipe(sample_list, config_file, ref_mnt):
    (cont, obj) = parse_config(config_file)
    job_list = []
    log_dir = 'LOGS/'
    src_cmd = '. ~/.novarc;'
    create_start_dirs = 'mkdir LOGS QC'
    subprocess.call(create_start_dirs, shell=True)
    for sample in open(sample_list):
        sample = sample.rstrip('\n')
        parts = sample.split('_')
        bam = sample + '.Aligned.sortedByCoord.out.bam'
        dl_list = (log_dir + sample + '.cutadapt.log', log_dir + sample + '.Log.final.out', 'QC/' + sample
                   + '_subset.insert_metrics.hist', 'QC/' + sample + '_1_sequence_fastqc/fastqc_data.txt', 'BAMS/'
                   + bam)
        dl_cmd = src_cmd
        prefix = obj + '/' + parts[0] + '/'
        for fn in dl_list:
            dl_cmd += 'swift download ' + cont + ' ' + prefix + fn + ';'
        mv_cmd = 'mv ' + prefix + dl_list[2] + ' .;mv ' + prefix + dl_list[4] + ' .;mv ' + prefix + dl_list[0] \
                 + ' LOGS/;mv ' + prefix + dl_list[1] + ' LOGS/;mv ' + prefix + 'QC/' + sample + '* QC/;'
        qc_cmd = '~/TOOLS/Scripts/alignment/qc_bam.py -sa ' + sample + ' -j ' + config_file + ' -m ' + ref_mnt + ';'
        rm_cmd = 'rm ' + bam + ';'
        parse_qc_cmd = '~/TOOLS/Scripts/alignment/parse_qc.py -j ' + config_file + ' -sa ' + sample + ';'
        full_cmd = dl_cmd + mv_cmd + qc_cmd + rm_cmd + parse_qc_cmd
        job_list.append(full_cmd)
    job_manager(job_list, 4)
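Every example on this page hands a list of shell command strings to job_manager(cmds, t), which is defined elsewhere in the pipeline package alongside date_time and parse_config and is not shown here. A minimal sketch of such a manager, assuming it runs each string through a shell with at most t processes in flight, returns 0 on success (Example #19 checks that return value), and raises when a job fails (the mutect examples wrap the call in try/except):

import subprocess
import time

def job_manager(cmd_list, max_t):
    # sketch only: run shell command strings with at most max_t processes in flight
    max_t = int(max_t)  # some callers pass the thread count as a string
    running = []
    failed = 0
    for cmd in cmd_list:
        # wait for a free slot before launching the next job
        while len(running) >= max_t:
            still_running = []
            for p in running:
                rc = p.poll()
                if rc is None:
                    still_running.append(p)
                elif rc != 0:
                    failed += 1
            running = still_running
            if len(running) >= max_t:
                time.sleep(1)  # avoid a busy wait
        running.append(subprocess.Popen(cmd, shell=True))
    # drain the remaining jobs
    for p in running:
        if p.wait() != 0:
            failed += 1
    if failed:
        raise RuntimeError('%d job(s) exited non-zero' % failed)
    return 0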
Example #4
import subprocess
import sys
def snpeff_pipe(config_file, sample_pairs, ref_mnt, cflag):
    # TODO: grab the max thread count from the config instead of hard-coding it
    max_t = 8
    (java, snpeff, snpsift, report, dbsnp, intervals) = parse_config(config_file)
    dbsnp = ref_mnt + '/' + dbsnp
    intervals = ref_mnt + '/' + intervals
    fh = open(sample_pairs)
    mk_log_dir = 'mkdir LOGS'
    subprocess.call(mk_log_dir, shell=True)
    cmd_list = []
    run_snpsift = java + ' -jar ' + snpsift + ' annotate ' + dbsnp
    run_snpeff = java + ' -jar ' + snpeff + ' eff -t hg19 '
    for line in fh:
        line = line.rstrip('\n')
        (sample, tumor_id, normal_id) = line.split('\t')
        # run snpsift first, then snpeff
        run_report = report + ' -i ' + sample + '.out.keep.eff.vcf -c '
        # the report tool takes either an intervals file or the literal 'n'
        if cflag == 'n':
            run_report += intervals
        else:
            run_report += 'n'
        run_report += ' > ' + sample + '.vcf.keep.eff.xls'
        run_snp = run_snpsift + ' ' + sample + '.out.keep > ' + sample + '.out.keep.sift.vcf 2> LOGS/' + sample \
                  + '.snpeff.log;' + run_snpeff + ' ' + sample + '.out.keep.sift.vcf -v > ' + sample \
                  + '.out.keep.eff.vcf  2>> LOGS/' + sample + '.snpeff.log;' + run_report
        cmd_list.append(run_snp)
    job_manager(cmd_list, max_t)
    sys.stderr.write(date_time() + 'SNP annotation completed!\n')
    return 0
Example #5
import os
import re
import subprocess
import sys
def calc_pos_cov(table, samtools, out):
    fh = open(table)
    head = next(fh)
    bnids = []
    head = head.rstrip('\n').split('\t')
    for i in range(1, len(head), 1):
        bnid = head[i].split('_')
        bnids.append(bnid[0])
    # create bed file to get coverage
    bed_fn = out + '.bed'
    bed = open(bed_fn, 'w')
    vlist = []
    # in the event an indel happens in one sample and snv in another at same position, don't process twice

    for line in fh:
        vlist.append(create_bed(line, bed))
    bed.close()
    fh.close()
    job_list = []
    src_cmd = '. ~/.novarc;'
    # get bams, then build jobs
    for i in range(0, len(bnids), 1):
        sys.stderr.write(date_time() + 'Getting bam for ' + bnids[i] + '\n')
        bam = 'ALIGN/' + bnids[i] + '/BAM/' + bnids[i] + '.merged.final.bam'
        # the prefix ends at '.ba' so both the .bam and its .bai index match
        dl_cmd = src_cmd + 'swift download PDX --prefix ALIGN/' + bnids[i] + '/BAM/' + bnids[i] + '.merged.final.ba;'
        subprocess.call(dl_cmd, shell=True)
        # try pdx container, if not, try pancan
        if os.path.isfile(bam):
            job_list.append(build_jobs(samtools, bed_fn, bnids[i]))
        else:
            sys.stderr.write(date_time() + dl_cmd + '\nBam for sample ' + bnids[i]
                             + ' not in PDX container, trying PANCAN\n')
            dl_cmd = src_cmd + 'swift download PANCAN --prefix ALIGN/' + bnids[i] + '/BAM/' + bnids[i] \
                     + '.merged.final.ba;'
            subprocess.call(dl_cmd, shell=True)
            if os.path.isfile(bam):
                job_list.append(build_jobs(samtools, bed_fn, bnids[i]))
            else:
                sys.stderr.write(date_time() + dl_cmd + '\nCould not find bam for ' + bnids[i] + '\n')
                exit(1)
    sys.stderr.write(date_time() + 'Running depth jobs\n')
    job_manager(job_list, '8')
    sys.stderr.write(date_time() + 'Compiling results\n')
    cov_dict = compile_results(bnids)
    sys.stderr.write(date_time() + 'Writing to output table\n')
    out_fh = open(out + '_variant_coverage_table.txt', 'w')
    out_fh.write('\t'.join(head) + '\n')
    for var in vlist:
        out_fh.write(var)
        # parse the position once per variant; the original re-ran the regex for every sample
        m = re.search(r'\S+-(chr\w+)_(\d+)_\w+->\w+', var)
        (chrom, pos) = (m.group(1), m.group(2))
        for i in range(0, len(bnids), 1):
            if bnids[i] in cov_dict[chrom][pos]:
                out_fh.write('\t' + cov_dict[chrom][pos][bnids[i]])
            else:
                out_fh.write('\t0')
        out_fh.write('\n')
    out_fh.close()
    sys.stderr.write(date_time() + 'Fin\n')
Example #6
import os
import re
import sys
def cnv_pipe(config_file, tum_bam, norm_bam, o_flag, project2):
    (project_dir, project, bedtools, ana, bed,
     cores) = parse_config(config_file)
    job_list = []
    tum_id = re.match(r'(\d+-\d+)\.', os.path.basename(tum_bam))
    tum_id = tum_id.group(1)
    norm_id = re.match(r'(\d+-\d+)\.', os.path.basename(norm_bam))
    norm_id = norm_id.group(1)
    pair = tum_id + '_' + norm_id
    bed_t1 = bed.replace('.bed', '_t1.bed')
    bed_t2 = bed.replace('.bed', '_t2.bed')
    t1_genes = get_genes(bed_t1)
    t2_genes = get_genes(bed_t2)
    t1_suffix = '.t1.bedtools.coverage.txt'
    t2_suffix = '.t2.bedtools.coverage.txt'
    cnv_dir = project_dir + project + '/' + ana + '/' + pair + '/OUTPUT/'
    if not os.path.isdir(cnv_dir):
        sys.stderr.write(date_time() + 'Output path ' + cnv_dir +
                         ' does not exist! Trying with backup project ' +
                         project2 + '\n')
        cnv_dir = project_dir + project2 + '/' + ana + '/' + pair + '/OUTPUT/'
        if not os.path.isdir(cnv_dir):
            sys.stderr.write(date_time() + 'Output path ' + cnv_dir +
                             ' does not exist! Check config and try again!\n')
            exit(1)
    clist = (tum_bam + ' -b ' + bed_t1 + ' > ',
             tum_bam + ' -b ' + bed_t2 + ' > ',
             norm_bam + ' -b ' + bed_t1 + ' > ',
             norm_bam + ' -b ' + bed_t2 + ' > ')
    flist = (cnv_dir + tum_id + t1_suffix, cnv_dir + tum_id + t2_suffix,
             cnv_dir + norm_id + t1_suffix, cnv_dir + norm_id + t2_suffix)
    if o_flag == 'y':
        sys.stderr.write(date_time() +
                         'Overwrite yes given, creating new coverage files\n')
        for i in range(0, len(flist)):
            job_list.append(bedtools + ' coverage -abam ' + clist[i] +
                            flist[i])
    else:
        sys.stderr.write(
            date_time() +
            'Overwrite no given, checking for existing coverage files first\n')
        for i in range(0, len(flist)):
            if not os.path.isfile(flist[i]):
                job_list.append(bedtools + ' coverage -abam ' + clist[i] +
                                flist[i])

    sys.stderr.write(date_time() + 'Calculating read depth for ' + pair + '\n')
    job_manager(job_list, cores)
    # process coverage files, assess cnv
    sys.stderr.write(date_time() +
                     'Collapsing read counts into tiers and genes\n')
    calc_tn_cov_ratios(cnv_dir, tum_id, norm_id, t1_genes, t2_genes, t1_suffix,
                       t2_suffix)
    sys.stderr.write(date_time() + 'CNV analysis complete for ' + pair + '\n')
    return 0
Example #7
def convert_vcf(config_file, sample_pairs, suffix):
    (java, sift, th) = parse_config(config_file)
    cmd_list = []
    for pair in open(sample_pairs, 'r'):
        pair = pair.rstrip('\n').split('\t')
        pair = pair[0]
        in_vcf = pair + '/' + pair + suffix
        out_xls = pair + '/' + pair + '.indels.xls'
        cmd = java + ' -jar ' + sift + ' extractFields ' + in_vcf + \
              ' CHROM POS REF ALT "EFF[0].EFFECT" "EFF[0].FUNCLASS" "EFF[0].CODON" "EFF[0].AA" "EFF[0].AA_LEN" ' \
              '"EFF[0].GENE" "EFF[0].BIOTYPE" "EFF[0].CODING" MINCOV ALTCOV COVRATIO ID > ' + out_xls
        cmd_list.append(cmd)
    job_manager(cmd_list, th)
Example #8
def varscan_germline(config_file, sample, ref_mnt):
    (samtools, varscan, region, fasta, th) = parse_config(config_file)
    region = ref_mnt + '/' + region
    fasta = ref_mnt + '/' + fasta
    rf = open(region, 'r')
    cmd_list = []
    for line in rf:
        chrom = line.split('\t')
        cmd = samtools + ' mpileup -r ' + chrom[0] + ' -B -f ' + fasta + ' ' + sample + \
              '.merged.final.bam | java -Xmx4000m -jar ' + varscan + ' mpileup2cns --output-vcf 1 --min-var-freq 0.35 --variants 1 > ' + \
              chrom[0] + '.vcf'
        cmd_list.append(cmd)
    rf.close()
    # reserve threads since each job pipes samtools into VarScan
    proc = int(th) - 2
    job_manager(cmd_list, str(proc))
Example #10
def calc_coverage(bedtools2_tool, sample, bedfile, cont, obj):
    src_cmd = '. /home/ubuntu/.novarc;'
    job_list = []
    for bnid in open(sample):
        bnid = bnid.rstrip('\n')
        (dl_cmd, bam, bai) = get_bam_name(bnid, src_cmd, cont, obj)
        # get_bam_name returns a single path, or lists when a sample has several bams
        if isinstance(bam, str):
            bed_cmd = bedtools2_tool + ' coverage -hist -abam ' + bam + ' -b ' + bedfile + ' > ' + bnid + '.hist;'
            cleanup = 'rm ' + bam + ' ' + bai + ';'
            final = dl_cmd + bed_cmd + cleanup
            job_list.append(final)
        else:
            for i in range(len(bam)):
                bed_cmd = bedtools2_tool + ' coverage -hist -abam ' + bam[i] + ' -b ' + bedfile + ' > ' + bnid \
                          + '_' + str(i) + '.hist;'
                cleanup = 'rm ' + bam[i] + ' ' + bai[i] + ';'
                final = dl_cmd[i] + bed_cmd + cleanup
                job_list.append(final)
    job_manager(job_list, '8')
Example #11
def filter_bam_pipe(config_file, lane, ref_mnt):
    (th, cont, obj, mouse_filter) = parse_config(config_file)
    job_list = []
    src_cmd = ". /home/ubuntu/.novarc;"
    fh = open(lane, 'r')
    for la in fh:
        la = la.rstrip('\n')
        info = la.split('\t')
        lanes = info[2].split(', ')
        for rg in lanes:
            fn = obj + '/' + info[0] + '/BAM/' + info[0] + '_' + rg + '.bam'
            stub = info[0] + '_' + rg
            swift_cmd = src_cmd + "swift download " + cont + " --skip-identical " + fn + " >> dl.log 2>> dl.log"
            mf = mouse_filter + ' -b ' + fn + ' -o ' + stub
            cmd = swift_cmd + '; ' + mf + '; rm ' + fn + ';'
            job_list.append(cmd)
    fh.close()
    job_manager(job_list, th)
Example #15
import os
def fastqc_pipe(flist, config_file):
    (cont, obj, fastqc_tool, threads) = parse_config(config_file)
    src_cmd = '. /home/ubuntu/.novarc;'
    job_list = []
    for fq in open(flist):
        fq = fq.rstrip('\n')
        root = os.path.basename(fq).replace('_sequence.txt.gz', '')
        parts = fq.split('/')
        dl_cmd = src_cmd + 'swift download ' + cont + ' ' + fq + ';'
        outdir = obj + '/' + parts[1] + '/QC/'
        logdir = obj + '/' + parts[1] + '/LOGS/'
        setup_cmd = 'mkdir -p ' + outdir + ' ' + logdir + ';'
        logfile = logdir + root + '.fastqc.log'
        fastqc_cmd = fastqc_tool + ' --extract -o ' + outdir + ' ' + fq + ' 2> ' + logfile + ';'
        up_cmd = src_cmd + 'swift upload ' + cont + ' ' + logfile + ';'
        # quote the -name pattern so the shell does not expand the glob before find sees it
        up_cmd += 'find ' + outdir + ' -name "' + root + '*" | xargs -IFN swift upload ' + cont + ' FN;'
        cleanup = 'rm ' + fq + ';'
        final_cmd = dl_cmd + setup_cmd + fastqc_cmd + up_cmd + cleanup
        job_list.append(final_cmd)
    job_manager(job_list, threads)
Example #16
import subprocess
def snpeff_pipe(config_file, sample_list, ref_mnt, novarc):
    (java, snpeff, snpsift, report, dbsnp, bed, cont, obj, max_t) = parse_config(config_file)
    dbsnp = ref_mnt + '/' + dbsnp
    bed = ref_mnt + '/' + bed
    fh = open(sample_list)
    mk_log_dir = 'mkdir LOGS'
    subprocess.call(mk_log_dir, shell=True)
    cmd_list = []
    run_snpsift = java + ' -jar ' + snpsift + ' annotate ' + dbsnp
    run_snpeff = java + ' -jar ' + snpeff + ' eff -t hg19 -interval ' + bed
    source_novarc(novarc)
    for line in fh:
        line = line.rstrip('\n')
        in_vcf = obj + '/' + line + '.merged.final.bam.germline_calls.vcf'
        sift_vcf = obj + '/' + line + '.snpSift.vcf'
        final_vcf = obj + '/' + line + '.snpSift.snpEff.vcf'
        dl_vcf = 'swift download ' + cont + ' ' + in_vcf + ';'
        log = 'LOGS/' + line + '.snpeff.log'
        run_cmd = dl_vcf + run_snpsift + ' ' + in_vcf + ' > ' + sift_vcf + ' 2> ' + log + ';' + run_snpeff + ' ' \
                  + sift_vcf + ' -v > ' + final_vcf + ' 2>> ' + log
        cmd_list.append(run_cmd)
    job_manager(cmd_list, max_t)
Example #18
import sys
from subprocess import Popen
def capture_coverage(bedtools2_tool, sample, capture_bed_ref, wait_flag):
    prefix = capture_bed_ref[:-4]  # strip the '.bed' extension
    cc_t1_cmd = bedtools2_tool + " coverage -hist -abam " + sample + ".rmdup.srt.bam -b " + prefix + '_t1.bed' \
                + " | grep all > " + sample + ".capture_t1.hist"
    cc_t2_cmd = bedtools2_tool + " coverage -hist -abam " + sample + ".rmdup.srt.bam -b " + prefix + '_t2.bed' \
                + " | grep all > " + sample + ".capture_t2.hist"
    sys.stderr.write(date_time() + cc_t1_cmd + "\n" + cc_t2_cmd + "\n")
    if wait_flag == 0:
        Popen(cc_t1_cmd,
              shell=True,
              stdin=None,
              stdout=None,
              stderr=None,
              close_fds=True)
        Popen(cc_t2_cmd,
              shell=True,
              stdin=None,
              stdout=None,
              stderr=None,
              close_fds=True)
    else:
        jobs = [cc_t1_cmd, cc_t2_cmd]
        job_manager(jobs, 2)
    return 0
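A hypothetical call site showing the two modes; the bedtools path, sample id, and bed path below are illustrative, not from the original pipeline:

# wait_flag=1 blocks until both tier histograms are written (via job_manager)
capture_coverage('/usr/bin/bedtools', '2015-1234', '/REFS/capture_regions.bed', 1)
# wait_flag=0 fires both bedtools jobs in the background and returns immediately
capture_coverage('/usr/bin/bedtools', '2015-1234', '/REFS/capture_regions.bed', 0)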
Example #19
import os
import re
import subprocess
import sys
def list_bam(cont, obj, sample, threads):
    list_cmd = '. /home/ubuntu/.novarc;swift list ' + cont + ' --prefix ' + obj + '/' + sample + '/'
    sys.stderr.write(date_time() + list_cmd + '\nGetting BAM list\n')
    flist = subprocess.check_output(list_cmd, shell=True).decode()
    # Use to check on download status
    p = []

    for fn in re.findall('(.*)\n', flist):
        # '.ba' matches both the .bam and its .bai index
        if re.match(r'.*\.merged\.final\.ba', fn):
            sys.stderr.write(date_time() + 'Downloading relevant BAM file ' + fn + '\n')
            dl_cmd = '. /home/ubuntu/.novarc;swift download ' + cont + ' --skip-identical ' + fn + ' --output '\
                     + os.path.basename(fn)
            p.append(dl_cmd)
    if len(p) < 1:
        sys.stderr.write(date_time() + 'No merged bam found for ' + sample + '\n')
        return 1
    f = job_manager(p, threads)
    if f == 0:
        sys.stderr.write(date_time() + 'BAM download complete\n')
        return 0
    else:
        sys.stderr.write(date_time() + 'BAM download failed\n')
        exit(1)
Example #20
import subprocess
import sys
def mutect_pipe(config_file, tumor_id, normal_id):
    (java, mutect, intervals, fa_ordered, max_t, ram, project_dir, project,
     align) = parse_config(config_file)

    # break up intervals into max_t chunks to run all in parallel
    int_fh = open(intervals, 'r')
    int_dict = {}
    i = 0
    # create temp directory
    tmp_cmd = 'mkdir temp'
    subprocess.call(tmp_cmd, shell=True)
    # create sub-interval files - split by chromosome
    mk_dir_bed = 'mkdir bed'
    subprocess.call(mk_dir_bed, shell=True)
    for interval in int_fh:
        (chrom, start, end) = interval.split('\t')
        try:
            int_dict[chrom]['fh'].write(interval)
        except KeyError:
            int_dict[chrom] = {}
            int_dict[chrom]['fn'] = 'bed/intervals_' + chrom + '.bed'
            int_dict[chrom]['fh'] = open(int_dict[chrom]['fn'], 'w')
            int_dict[chrom]['fh'].write(interval)
        i += 1
    job_ram = int(int(ram) / int(max_t))
    run_mut = java + ' -Djava.io.tmpdir=./temp -Xmx' + str(job_ram) + 'g -jar ' + mutect
    # ensure the log directory exists; sibling variants of this pipeline create LOGS explicitly
    subprocess.call('mkdir -p LOGS', shell=True)
    # commands collect here; job_manager handles execution via Popen
    cmd_list = []
    bam_dir = project_dir + project + '/' + align
    tumor_bam = bam_dir + '/' + tumor_id + '/BAM/' + tumor_id + '.merged.final.bam'
    normal_bam = bam_dir + '/' + normal_id + '/BAM/' + normal_id + '.merged.final.bam'
    sys.stderr.write(date_time() + 'Processing pair T: ' + tumor_bam + ' N: ' +
                     normal_bam + '\n')
    out = tumor_id + '_' + normal_id
    # make result directory for current pair
    i = 1
    for intvl in sorted(int_dict):
        int_dict[intvl]['fh'].close()
        cur = run_mut
        output_file = out + '.' + intvl + '.out'
        vcf_file = out + '.' + intvl + '.vcf'
        log_file = 'LOGS/' + out + '.mut.' + intvl + '.log'
        cur = cur + ' -T MuTect -fixMisencodedQuals -R ' + fa_ordered + ' --intervals ' + int_dict[intvl][
            'fn'] + '  --input_file:normal ' + normal_bam + '  --input_file:tumor ' + tumor_bam + \
              ' --max_alt_alleles_in_normal_count 1000 --max_alt_alleles_in_normal_qscore_sum 37 ' \
              '--max_alt_allele_in_normal_fraction 0.05 --out ' + output_file + ' -vcf ' + vcf_file \
              + ' --enable_extended_output --strand_artifact_power_threshold 0 -log ' + log_file \
              + ' >> ' + log_file + ' 2>> ' + log_file + '; cat ' + output_file \
              + ' | grep -v REJECT > ' + output_file + '.keep; cat ' + vcf_file \
              + ' | grep -v REJECT > ' + vcf_file + '.keep '
        cmd_list.append(cur)
        i += 1
    # -fixMisencodedQuals fails if quality scores are already phred 33; if a job fails, retry without it
    try:
        job_manager(cmd_list, max_t)
    except Exception:
        for i in range(0, len(cmd_list), 1):
            cmd_list[i] = cmd_list[i].replace('-fixMisencodedQuals ', '')
        job_manager(cmd_list, max_t)
    cleanup_temp_dirs = 'rm -rf temp bed'
    sys.stderr.write('Cleaning up temp dirs ' + cleanup_temp_dirs + '\n')
    subprocess.call(cleanup_temp_dirs, shell=True)
    sys.stderr.write(date_time() + 'SNV calling completed for ' + out + '\n')
    return 0
Example #21
import subprocess
import sys
def mutect_pipe(config_file, sample_pairs, ref_mnt):
    (java, gatk, intervals, fa_ordered, max_t, ram) = parse_config(config_file)
    intervals = ref_mnt + '/' + intervals
    # break up intervals into max_t chunks to run all in parallel
    int_fh = open(intervals, 'r')
    int_dict = {}
    i = 0
    # create temp directory
    tmp_cmd = 'mkdir temp'
    subprocess.call(tmp_cmd, shell=True)
    # create sub-interval files - split by chromosome
    mk_dir_bed = 'mkdir bed'
    subprocess.call(mk_dir_bed, shell=True)
    for interval in int_fh:
        (chrom, start, end) = interval.split('\t')
        intvl = start + '-' + end  # not needed with a standard interval file; unused below
        try:
            int_dict[chrom]['fh'].write(interval)
        except KeyError:
            int_dict[chrom] = {}
            int_dict[chrom]['fn'] = 'bed/intervals_' + chrom + '.bed'
            int_dict[chrom]['fh'] = open(int_dict[chrom]['fn'], 'w')
            int_dict[chrom]['fh'].write(interval)
        i += 1
    fa_ordered = ref_mnt + '/' + fa_ordered
    fh = open(sample_pairs)
    job_ram = int(int(ram) / int(max_t))  # keep an integer for the -Xmx heap size
    run_mut = java + ' -Djava.io.tmpdir=./temp -Xmx' + str(job_ram) + 'g -jar ' + gatk
    mk_log_dir = 'mkdir LOGS'
    subprocess.call(mk_log_dir, shell=True)
    for line in fh:
        # array will store commands to run, next def will take care of job management using popen
        cmd_list = []
        line = line.rstrip('\n')
        (sample, tumor_id, normal_id) = line.split('\t')
        tumor_bam = tumor_id + '.merged.final.bam'
        normal_bam = normal_id + '.merged.final.bam'
        sys.stderr.write(date_time() + 'Processing pair T: ' + tumor_bam + ' N: ' + normal_bam + '\n')
        out = tumor_id + '_' + normal_id
        # make result directory for current pair
        mk_res = 'mkdir ' + out
        subprocess.call(mk_res, shell=True)
        i = 1
        for intvl in sorted(int_dict):
            int_dict[intvl]['fh'].close()
            cur = run_mut
            vcf_file = out + '.' + intvl + '.vcf'
            log_file = 'LOGS/' + out + '.mut.' + intvl + '.log'
            cur = cur + ' -T MuTect2 -S LENIENT -R ' + fa_ordered + ' --intervals ' + int_dict[intvl]['fn'] + \
                  '  -I:normal ' + normal_bam + '  -I:tumor ' + tumor_bam + ' --max_alt_alleles_in_normal_count 1000'\
                  ' --max_alt_alleles_in_normal_qscore_sum 37 --max_alt_allele_in_normal_fraction 0.05 --out ' + out \
                  + '/' + vcf_file + ' 2>> ' + log_file + ';'
            cmd_list.append(cur)
            i += 1
        # -fixMisencodedQuals fails if quality scores are already phred 33; if a job fails, retry without it
        # (vestigial here: the MuTect2 command above never adds the flag, so the retry re-runs unchanged)
        try:
            job_manager(cmd_list, max_t)
        except Exception:
            for i in range(0, len(cmd_list), 1):
                cmd_list[i] = cmd_list[i].replace('-fixMisencodedQuals ', '')
            job_manager(cmd_list, max_t)
    sys.stderr.write(date_time() + 'Variant calling completed!\n')
    return 0
Example #25
import subprocess
import sys
def mutect_pipe(config_file, sample_pairs, ref_mnt):
    (java, mutect, intervals, fa_ordered, max_t, ram) = parse_config(config_file)
    intervals = ref_mnt + "/" + intervals
    # break up intervals into max_t chunks to run all in parallel
    int_fh = open(intervals, "r")
    int_dict = {}
    i = 0
    # create temp directory
    tmp_cmd = "mkdir temp"
    subprocess.call(tmp_cmd, shell=True)
    # create sub-interval files - split by chromosome
    mk_dir_bed = "mkdir bed"
    subprocess.call(mk_dir_bed, shell=True)
    for interval in int_fh:
        (chrom, start, end) = interval.split("\t")
        intvl = start + "-" + end  # normally not need if using normal interval file
        try:
            int_dict[chrom]["fh"].write(interval)
        except KeyError:
            int_dict[chrom] = {}
            int_dict[chrom]["fn"] = "bed/intervals_" + chrom + ".bed"
            int_dict[chrom]["fh"] = open(int_dict[chrom]["fn"], "w")
            int_dict[chrom]["fh"].write(interval)
        i += 1
    fa_ordered = ref_mnt + "/" + fa_ordered
    fh = open(sample_pairs)
    job_ram = int(int(ram) / int(max_t))  # keep an integer for the -Xmx heap size
    run_mut = java + " -Djava.io.tmpdir=./temp -Xmx" + str(job_ram) + "g -jar " + mutect
    mk_log_dir = "mkdir LOGS"
    subprocess.call(mk_log_dir, shell=True)
    for line in fh:
        # array will store commands to run, next def will take care of job management using popen
        cmd_list = []
        line = line.rstrip("\n")
        (sample, tumor_id, normal_id) = line.split("\t")
        tumor_bam = tumor_id + ".merged.final.bam"
        normal_bam = normal_id + ".merged.final.bam"
        sys.stderr.write(date_time() + "Processing pair T: " + tumor_bam + " N: " + normal_bam + "\n")
        out = tumor_id + "_" + normal_id
        # make result directory for current pair
        mk_res = "mkdir " + out
        subprocess.call(mk_res, shell=True)
        i = 1
        for intvl in sorted(int_dict):
            int_dict[intvl]["fh"].close()
            cur = run_mut
            output_file = out + "." + intvl + ".out"
            vcf_file = out + "." + intvl + ".vcf"
            log_file = "LOGS/" + out + ".mut." + intvl + ".log"
            cur = (
                cur
                + " -T MuTect -fixMisencodedQuals -R "
                + fa_ordered
                + " --intervals "
                + int_dict[intvl]["fn"]
                + "  --input_file:normal "
                + normal_bam
                + "  --input_file:tumor "
                + tumor_bam
                + " --max_alt_alleles_in_normal_count 1000 --max_alt_alleles_in_normal_qscore_sum 37 "
                "--max_alt_allele_in_normal_fraction 0.05 --out "
                + out
                + "/"
                + output_file
                + " -vcf "
                + out
                + "/"
                + vcf_file
                + " --enable_extended_output --strand_artifact_power_threshold 0 -log "
                + log_file
                + " >> "
                + log_file
                + " 2>> "
                + log_file
                + "; cat "
                + out
                + "/"
                + output_file
                + " | grep -v REJECT > "
                + out
                + "/"
                + output_file
                + ".keep; cat "
                + out
                + "/"
                + vcf_file
                + " | grep -v REJECT > "
                + out
                + "/"
                + vcf_file
                + ".keep "
            )
            cmd_list.append(cur)
            i += 1
        # -fixMisencodedQuals fails if quality scores are already phred 33; if a job fails, retry without it
        try:
            job_manager(cmd_list, max_t)
        except Exception:
            for i in range(0, len(cmd_list), 1):
                cmd_list[i] = cmd_list[i].replace("-fixMisencodedQuals ", "")
            job_manager(cmd_list, max_t)
    sys.stderr.write(date_time() + "Variant calling completed!\n")
    return 0