Example #1
def calc_coverage(sample_list, suffix, bed):
    slist = []
    sys.stderr.write(date_time() + 'Processing bed file ' + bed + '\n')
    (gene_list, gene_dict, master_dict) = process_bed(bed)
    for sample in open(sample_list):
        sys.stderr.write(date_time() + 'Processing sample ' + sample)
        sample = sample.rstrip('\n')
        slist.append(sample)
        cur = sample + suffix
        temp_dict = copy.deepcopy(gene_dict)
        for entry in open(cur):
            info = entry.rstrip('\n').split('\t')
            if info[0] == 'all':
                break
            parts = info[3].split('_')
            temp_dict[parts[0]]['tot_cov'] += (int(info[4]) * int(info[5]))
        for gene in gene_list:
            master_dict[gene][sample] = (float(temp_dict[gene]['tot_cov'])/temp_dict[gene]['len'])

    sys.stderr.write(date_time() + 'Outputting results\n')
    sys.stdout.write('Gene/Sample\t')
    print('\t'.join(slist))
    for gene in gene_list:
        sys.stdout.write(gene)
        for sample in slist:
            sys.stdout.write('\t' + str(master_dict[gene][sample]))
        print()
    sys.stderr.write(date_time() + 'Fin!\n')
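Every example in this collection prepends date_time() to its log messages, but the helper itself is never shown. A minimal sketch, assuming it simply returns a human-readable timestamp with a trailing space (the real codebase's format may differ):

from datetime import datetime


def date_time():
    # hypothetical format; the actual helper used by these pipelines may differ
    return datetime.now().strftime('%Y-%m-%d %H:%M:%S ')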
Example #2
def preprocess_bams(config_file, sample_pairs):
    # create sample list
    sample_list = 'sample_list.txt'
    fh = open(sample_pairs, 'r')
    sl = open(sample_list, 'w')
    temp = {}
    for line in fh:
        cur = line.rstrip('\n').split('\t')
        if len(cur) == 3:
            if cur[1] not in temp:
                sl.write(cur[1] + '\n')
                temp[cur[1]] = 1
            if cur[2] not in temp:
                sl.write(cur[2] + '\n')
                temp[cur[2]] = 1
        else:
            if cur[0] not in temp:
                sl.write(cur[0] + '\n')
                temp[cur[0]] = 1
    sl.close()
    fh.close()
    miss_list = check_for_merged_bams(config_file, sample_list)
    if len(miss_list) > 0:
        sys.stderr.write(date_time() + 'Missing files detected, merging lane files\n')
        temp_fn = 'temp_samp_list.txt'
        temp_fh = open(temp_fn, 'w')
        temp_fh.write('\n'.join(miss_list))
        temp_fh.close()
        run_novosort(config_file, temp_fn)
    else:
        sys.stderr.write(date_time() + 'All bams found. Ready for next step!\n')
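A hedged usage sketch for preprocess_bams: the parsing loop above implies sample_pairs is a tab-separated file with either three columns (pair id, tumor id, normal id) or a single sample id per line. The ids and file names below are placeholders, and a valid config plus the check_for_merged_bams/run_novosort helpers are assumed to be importable:

# write a tiny sample_pairs file in the implied format (placeholder ids)
with open('sample_pairs.txt', 'w') as f:
    f.write('pair1\ttumor_A\tnormal_A\n')
    f.write('sample_B\n')
preprocess_bams('config.json', 'sample_pairs.txt')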
Example #3
def platypus_germline(config_file, sample, log_dir, cflag):

    loc = log_dir + sample + ".platypus.log"
    # here for safety as python is confusing about whether variables exist outside of if-else statements or not
    platypus_cmd = ''
    if cflag == 'y':
        (platypus, fasta, threads, project_dir, project, align) = parse_config(config_file, cflag)
        bam = project_dir + project + '/' + align + '/' + sample + '/BAM/' + sample + '.merged.final.bam'
        platypus_cmd = "python2.7 " + platypus + " callVariants --nCPU=" + threads + " --refFile=" + fasta \
                       + " --bamFiles=" + bam + " -o " + sample + ".germline_calls.vcf --logFileName=" \
                       + log_dir + sample + ".platypus.log" + " >> " + loc + " 2>&1"
    else:
        (platypus, fasta, threads, region_file, minVAF, samtools, project_dir, project, align) \
            = parse_config(config_file, cflag)

        bam = project_dir + project + '/' + align + '/' + sample + '/BAM/' + sample + '.merged.final.bam'
        if not (os.path.isfile(bam + '.bai') or os.path.isfile(bam[:-1] + 'i')):
            log(loc, date_time() + bam + ' not indexed.  Indexing\n')
            cmd = samtools + ' index ' + bam
            log(loc, date_time() + cmd + '\n')
            subprocess.call(cmd, shell=True)
        platypus_cmd = "python2.7 " + platypus + " callVariants --nCPU=" + threads + " --refFile=" + fasta \
                       + " --bamFiles=" + bam + " --filterDuplicates=0 -o " + sample \
                       + ".germline_calls.vcf --minVarFreq=" + minVAF + " --regions=" + region_file \
                       + " --logFileName=" + loc + " >> " + loc + " 2>&1"
    log(loc, date_time() + platypus_cmd + "\n")
    try:
        f = subprocess.call(platypus_cmd, shell=True)
    except:
        log(loc, 'platypus germline variant calling failed for sample ' + sample + '\n')
        return 1

    return f
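Many of these functions log through a log(loc, msg) helper instead of stderr; its definition is not part of this collection. One plausible minimal sketch, assuming each call simply appends to the file at loc:

def log(loc, msg):
    # open-append-close per call keeps the log readable even if the pipeline dies mid-run
    with open(loc, 'a') as fh:
        fh.write(msg)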
Example #4
def download_from_swift(cont, obj, lane_list):
    src_cmd = ". /home/ubuntu/.novarc;"
    lanes = open(lane_list, 'r')
    head = ''
    print('BID\tread group\ttotal starting read pairs(rp)\t% r1 w/ adapter\t% r2 w/ adapter\trp too short\t% rp passed'
          '\ttotal starting base pairs(bp)\tread1 bp trimmed\tread2 bp trimmed\t% bp written')
    for line in lanes:
        line = line.rstrip('\n')
        (bid, seqtype, lane_csv) = line.split('\t')
        for lane in lane_csv.split(', '):
            cur = obj + '/' + bid + '/LOGS/' + bid + '_' + lane + '.cutadapt.log'
            swift_cmd = src_cmd + "swift download " + cont + " --skip-identical " + cur
            sys.stderr.write(date_time() + swift_cmd + "\n")
            try:
                check = check_output(swift_cmd, shell=True, stderr=subprocess.PIPE).decode()
            except:
                sys.stderr.write(date_time() + "Download of " + obj + " from " + cont + " failed\n")
                exit(1)

            temp = parseCUTADAPT(cur)

            print(bid + '\t' + lane + '\t' + '\t'.join(temp))

    lanes.close()
    sys.stdout.write(head)

    return 0
Example #5
def calc_coverage(sample_list, suffix, bed):
    slist = []
    sys.stderr.write(date_time() + 'Processing bed file ' + bed + '\n')
    (gene_list, gene_dict, master_dict) = process_bed(bed)
    for sample in open(sample_list):
        sys.stderr.write(date_time() + 'Processing sample ' + sample)
        sample = sample.rstrip('\n')
        slist.append(sample)
        cur = sample + suffix
        temp_dict = copy.deepcopy(gene_dict)
        for entry in open(cur):
            info = entry.rstrip('\n').split('\t')
            if info[0] == 'all':
                break
            parts = info[3].split('_')
            temp_dict[parts[0]]['tot_cov'] += (int(info[4]) * int(info[5]))
        for gene in gene_list:
            master_dict[gene][sample] = (float(temp_dict[gene]['tot_cov']) /
                                         temp_dict[gene]['len'])

    sys.stderr.write(date_time() + 'Outputting results\n')
    sys.stdout.write('Gene/Sample\t')
    print('\t'.join(slist))
    for gene in gene_list:
        sys.stdout.write(gene)
        for sample in slist:
            sys.stdout.write('\t' + str(master_dict[gene][sample]))
        print()
    sys.stderr.write(date_time() + 'Fin!\n')
Example #6
def fastqc(fastqc_tool, sample, end1, end2, t):
    # casual logging - look for a LOGS directory, otherwise assume current dir
    log_dir = './'
    if os.path.isdir('LOGS'):
        log_dir = 'LOGS/'
    loc = log_dir + sample + '.fastqc.log'
    fastqc_cmd = fastqc_tool + ' --extract -t ' + t + ' -o QC/ ' + end1 + ' ' + end2
    log(loc, date_time() + fastqc_cmd + "\n")
    f = Popen(fastqc_cmd,
              shell=True,
              stdin=None,
              stdout=None,
              stderr=None,
              close_fds=True)
    # check after 20 seconds whether the process is still good - shouldn't take too long to ascertain whether phred
    #  score didn't fit
    call('sleep 20s', shell=True)

    if f.poll() == 1:
        log(
            loc,
            date_time() +
            'fastqc returned an error.  Check your inputs and try again!\n')
        exit(1)
    return 0
Example #7
def lane_express_quant(bams, config_file):
    (stranded, strand, express, express_sl, transcriptome) = parse_config(config_file)
    for bam in open(bams):
        bam = bam.rstrip('\n')
        bam_dir = os.path.dirname(bam)
        root = os.path.basename(re.sub('.Aligned.toTranscriptome.out.*', '', bam))
        qc_dir = bam_dir.replace('BAMS', 'QC')
        qc_file = qc_dir + '/' + root + '.qc_stats.json'
        qc_data = json.loads(open(qc_file, 'r').read())
        (x, s) = (str(int(round(float(qc_data['picard_stats']['x_ins_size'])))),
                  str(int(round(float(qc_data['picard_stats']['s_ins_size'])))))
        wd = qc_dir + '/' + root + '/'
        loc = wd + root + '.log'
        express_cmd = 'mkdir ' + wd + ';'
        call(express_cmd, shell=True)
        sys.stderr.write(date_time() + 'Created dir ' + wd + ' to quantify ' + bam + '\n' + express_cmd + '\n')
        if stranded == 'N':
            express_cmd = express + ' ' + transcriptome + ' ' + bam + ' --no-update-check -o ' + wd + ' -m '\
                          + x + ' -s ' + s + ' --logtostderr 2>> ' + loc + ';'
        else:
            express_cmd = 'sbatch -c 4 --export=express="' + express + '",transcriptome="' + transcriptome + '",bam="' \
                          + bam + '",wd="' + wd + '",strand="' + strand + '",x="' + x + '",s="' + s + '",loc="' + loc \
                          + '",root="' + root + '" ' + express_sl
            # express + ' ' + transcriptome + ' ' + bam + ' --no-update-check -o ' + wd + ' --'\
            #              + strand + ' -m ' + x + ' -s ' + s + ' --logtostderr 2>> ' + loc + ';'

            # express_cmd += 'mv ' + wd + 'results.xprs ' + wd + root + '.express_quantification.txt; mv ' + wd \
            #               + 'params.xprs ' + wd + root + '.params.xprs;'
        sys.stderr.write(date_time() + 'Submitting quantification job\n' + express_cmd + '\n')
        call(express_cmd, shell=True)

    return 0
Example #8
def gen_report(vcf):
    # open out file and index counts, context, etc
    fn = os.path.basename(vcf)
    parts = fn.split('.')
    loc = 'LOGS/' + parts[0] + '.indels.vep_priority.report.log'
    log(loc, date_time() + 'Creating prioritized impact reports for ' + vcf + '\n')
    vcf_in = VariantFile(vcf)

    out = open(parts[0] + '.indels.vep.prioritized_impact.report.xls', 'w')
    desired = {'Consequence': 0, 'IMPACT': 0, 'SYMBOL': 0, 'Feature': 0, 'Protein_position': 0, 'Amino_acids': 0,
               'Codons': 0, 'Existing_variation': 0, 'ExAC_MAF': 0, 'BIOTYPE': 0, 'VARIANT_CLASS': 0}

    desc_string = vcf_in.header.info['ANN'].record['Description']
    desc_string = desc_string.lstrip('"')
    desc_string = desc_string.rstrip('"')
    desc_string = desc_string.replace('Consequence annotations from Ensembl VEP. Format: ', '')
    f_pos_list = []
    desc_list = desc_string.split('|')
    ann_size = len(desc_list)
    for i in range(0, ann_size, 1):
        if desc_list[i] in desired:
            f_pos_list.append(i)
            desired[desc_list[i]] = i
    out.write('chr\tpos\tref\talt\tsnp_ID\tExAC_MAF\tgene\ttranscript_id\tvariant_class_effect\teffect\timpact'
              '\tbiotype\tcodon_change\tamino_acid_change\talt_cov\tnon_alt_cov\tvaf\n')
    for record in vcf_in.fetch():
        (chrom, pos, ref, alt, alt_ct, non_alt_ct, vaf) = (record.contig, str(record.pos), record.ref, record.alts[0],
                                str(record.info['MINCOV']), str(record.info['ALTCOV']), str(record.info['COVRATIO']))
        ann_list = [_.split('|') for _ in record.info['ANN'].split(',')]
        output_highest_impact(chrom, pos, ref, alt, alt_ct, non_alt_ct, vaf, ann_list, desired, out)

    out.close()
    log(loc, date_time() + 'Creating prioritized report for ' + vcf + ' complete!\n')
    return 0
Example #9
def downsample_bam(samtools, bam, frac, out_dir, th):
    out_root = os.path.basename(bam.replace('.bam', ''))
    cmd = 'sbatch -c ' + th + ' ' + samtools + ' view --threads ' + th + ' -b ' + bam + ' -s ' + frac + ' > ' \
          + out_dir + '/' + out_root + '_subsample_' + frac + '.bam'
    sys.stderr.write(date_time() + 'Downsampling ' + bam + '\n' + cmd + '\n')
    subprocess.call(cmd, shell=True)
    sys.stderr.write(date_time() + 'process complete!\n')
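A hedged call sketch for downsample_bam: every argument arrives as a string, and frac is passed straight through to samtools view -s, so '0.1' keeps roughly 10% of reads. The paths below are placeholders:

downsample_bam('samtools', '/data/sample1.bam', '0.1', '/data/subsampled', '4')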
Example #10
def novosort_merge_pe(config_file, sample_list):
    fh = open(sample_list, 'r')
    (novosort, java_tool, picard_tool, project, project_dir, align, threads, ram, novo_merge_rmdup_slurm) \
        = parse_config(config_file)

    for sample in fh:
        sample = sample.rstrip('\n')
        loc = '../LOGS/' + sample + '.novosort_merge.log'
        job_loc = sample + '.novosort_merge.log'
        (bam_list, n) = list_bam(project, align, sample)
        bam_string = " ".join(bam_list)
        cur_dir = project_dir + project + '/' + align + '/' + sample + '/BAMS/'
        os.chdir(cur_dir)
        out_bam = sample + '.merged.transcriptome.bam'
        if n > 1:
            batch = 'sbatch -c ' + threads + ' --mem ' + ram + 'G -o ' + job_loc + ' --export=novosort="' \
                    + novosort + '",threads="' + threads + '",ram="' + ram + 'G",out_bam="' + out_bam \
                    + '",bam_string="' + bam_string + '",loc="' + loc + '"' + ' ' + novo_merge_rmdup_slurm
            log(loc, date_time() + 'Submitting merge bam job for sample ' + batch + "\n")
            subprocess.call(batch, shell=True)

        else:
            link_bam = 'ln -s ' + bam_list[0] + ' ' + sample + '.merged.transcriptome.bam;'
            log(loc, date_time() + 'Creating symlink for merged final bam since only one exists\n'
                + link_bam + '\n')
            subprocess.call(link_bam, shell=True)

    sys.stderr.write(date_time() + 'Merged file request submitted and processed, check logs.\n')
    return 0
Example #11
def preprocess_bams(config_file, sample_pairs):
    # create sample list
    sample_list = 'sample_list.txt'
    fh = open(sample_pairs, 'r')
    sl = open(sample_list, 'w')
    temp = {}
    for line in fh:
        cur = line.rstrip('\n').split('\t')
        if len(cur) == 3:
            if cur[1] not in temp:
                sl.write(cur[1] + '\n')
                temp[cur[1]] = 1
            if cur[2] not in temp:
                sl.write(cur[2] + '\n')
                temp[cur[2]] = 1
        else:
            if cur[0] not in temp:
                sl.write(cur[0] + '\n')
                temp[cur[0]] = 1
    sl.close()
    fh.close()
    miss_list = check_for_merged_bams(config_file, sample_list)
    if len(miss_list) > 0:
        sys.stderr.write(date_time() +
                         'Missing files detected, merging lane files\n')
        temp_fn = 'temp_samp_list.txt'
        temp_fh = open(temp_fn, 'w')
        temp_fh.write('\n'.join(miss_list))
        temp_fh.close()
        run_novosort(config_file, temp_fn)
    else:
        sys.stderr.write(date_time() +
                         'All bams found. Ready for next step!\n')
Example #12
def organize_dirs(self):
    # check for existing BAM, QC and LOG dirs one level up
    try:
        if not os.path.isdir('../' + self.bam_dir):
            mk_bam_dir = 'mkdir ../' + self.bam_dir
            log(self.loc, date_time() + 'Making BAM directory ' + mk_bam_dir + '\n')
            call(mk_bam_dir, shell=True)
        if not os.path.isdir('../' + self.qc_dir):
            mk_qc_dir = 'mkdir ../' + self.qc_dir
            log(self.loc, date_time() + 'Making QC directory ' + mk_qc_dir + '\n')
            call(mk_qc_dir, shell=True)
        if not os.path.isdir('../' + self.log_dir):
            mk_log_dir = 'mkdir ../' + self.log_dir
            log(self.loc, date_time() + 'Making LOGS directory ' + mk_log_dir + '\n')
            call(mk_log_dir, shell=True)
        reloc_files = 'mv ' + self.bam_dir + '* ../' + self.bam_dir + '; mv ' + self.log_dir + '* ../' \
                      + self.log_dir + '; mv ' + self.qc_dir + '* ../' + self.qc_dir
        log(self.loc, date_time() + 'Relocating files ' + reloc_files + '\n')
        call(reloc_files, shell=True)
        # need to reassign log file location since it's being moved!
        self.loc = '../' + self.loc
        rm_old = 'rmdir ' + ' '.join((self.bam_dir, self.log_dir, self.qc_dir))
        log(self.loc, date_time() + 'Clearing out working dirs ' + rm_old + '\n')
        call(rm_old, shell=True)
        return 0
    except:
        return 1
Example #13
def download_from_swift(cont, obj, lane_list):
    src_cmd = ". /home/ubuntu/.novarc;"
    lanes = open(lane_list, 'r')
    head = ''
    print(
        'BID\tread group\ttotal starting read pairs(rp)\t% r1 w/ adapter\t% r2 w/ adapter\trp too short\t% rp passed'
        '\ttotal starting base pairs(bp)\tread1 bp trimmed\tread2 bp trimmed\t% bp written'
    )
    for line in lanes:
        line = line.rstrip('\n')
        (bid, seqtype, lane_csv) = line.split('\t')
        for lane in lane_csv.split(', '):
            cur = obj + '/' + bid + '/LOGS/' + bid + '_' + lane + '.cutadapt.log'
            swift_cmd = src_cmd + "swift download " + cont + " --skip-identical " + cur
            sys.stderr.write(date_time() + swift_cmd + "\n")
            try:
                check = check_output(swift_cmd,
                                     shell=True,
                                     stderr=subprocess.PIPE).decode()
            except:
                sys.stderr.write(date_time() + "Download of " + obj +
                                 " from " + cont + " failed\n")
                exit(1)

            temp = parseCUTADAPT(cur)

            print(bid + '\t' + lane + '\t' + '\t'.join(temp))

    lanes.close()
    sys.stdout.write(head)

    return 0
Example #14
def batch_qc(fn, cont, obj, t):
    fh = open(fn, 'r')
    src_cmd = '. ~/.novarc;'
    jobs = []
    for line in fh:
        line = line.rstrip('\n')
        # All files for current bnid to be stored in cwd
        swift_cmd = src_cmd + 'swift list ' + cont + ' --prefix ' + obj + '/' + line
        sys.stderr.write(date_time() + 'Checking for sequence files for sample ' + line + '\n' + swift_cmd + '\n')
        try:
            contents = subprocess.check_output(swift_cmd, shell=True).decode()
            if len(contents) < len(line):
                sys.stderr.write(date_time() + 'Can\'t find sequencing files for ' + line + ' skipping!\n')
                continue
        except:
            sys.stderr.write(date_time() + 'Can\'t find sequencing files for ' + line + ' skipping!\n')
            continue
        seqfile = re.findall(r'(\S+\.gz)', contents)
        sf1 = seqfile[0]
        end1 = os.path.basename(sf1)
        sf2 = seqfile[1]
        end2 = os.path.basename(sf2)
        swift_cmd = src_cmd + "swift download " + cont + " --skip-identical --prefix " + obj + '/' + line
        link_cmd = 'ln -s ' + sf1 + ' .;ln -s ' + sf2
        fastqc_cmd = 'mkdir -p PREQC/' + line + '; fastqc -t 2 -o PREQC/' + line + ' ' + sf1 + ' ' + sf2
        upload_cmd = src_cmd + 'swift upload ' + cont + ' PREQC/' + line
        cleanup_cmd = 'rm -rf RAW/' + line + ' PREQC/' + line + ' ' + end1 + ' ' + end2
        jobs.append(';'.join([swift_cmd, link_cmd, fastqc_cmd, upload_cmd, cleanup_cmd]))
    sys.stderr.write(date_time() + 'Job list created, running jobs!\n')
    job_manager(jobs, t)
    return 0
Example #15
def gen_report(vcf, ref_flag):
    # open out file and index counts, context, etc
    fn = os.path.basename(vcf)
    parts = fn.split('.')
    sample = parts[0]
    loc = 'LOGS/' + sample + '.indels.vep_priority.report.log'
    log(loc,
        date_time() + 'Creating prioritized impact reports for ' + vcf + '\n')
    vcf_in = VariantFile(vcf)
    out_fn = sample + '.indels.vep.prioritized_impact.report.xls'
    out = open(out_fn, 'w')
    desired = {
        'Consequence': 0,
        'IMPACT': 0,
        'SYMBOL': 0,
        'Feature': 0,
        'Protein_position': 0,
        'Amino_acids': 0,
        'Codons': 0,
        'Existing_variation': 0,
        'ExAC_MAF': 0,
        'BIOTYPE': 0,
        'VARIANT_CLASS': 0
    }

    desc_string = vcf_in.header.info['ANN'].record['Description']
    desc_string = desc_string.lstrip('"')
    desc_string = desc_string.rstrip('"')
    desc_string = desc_string.replace(
        'Consequence annotations from Ensembl VEP. Format: ', '')
    f_pos_list = []
    desc_list = desc_string.split('|')
    ann_size = len(desc_list)
    for i in range(0, ann_size, 1):
        if desc_list[i] in desired:
            f_pos_list.append(i)
            desired[desc_list[i]] = i
    out.write(
        'chr\tpos\tref\talt\tsnp_ID\tExAC_MAF\tgene\ttranscript_id\tvariant_class_effect\teffect\timpact'
        '\tbiotype\tcodon_change\tamino_acid_change\talt_cov\tnon_alt_cov\tvaf\n'
    )
    if ref_flag != 'n':
        ref_flag = create_index(ref_flag)

    for record in vcf_in.fetch():
        (chrom, pos, ref, alt, alt_ct, non_alt_ct,
         vaf) = (record.contig, str(record.pos), record.ref, record.alts[0],
                 str(record.info['MINCOV']), str(record.info['ALTCOV']),
                 str(record.info['COVRATIO']))
        ann_list = [_.split('|') for _ in record.info['ANN']]
        output_highest_impact(chrom, pos, ref, alt, alt_ct, non_alt_ct, vaf,
                              ann_list, desired, out, ref_flag)

    out.close()
    log(
        loc,
        date_time() + 'Creating prioritized report for ' + vcf +
        ' complete!\n')
    return 0
Example #16
def job_manager(cmd_list, max_t):
    x = len(cmd_list)
    # cur position in command list
    cur = 0
    # completed
    comp = 0
    # initialize process list
    p = {}
    sys.stderr.write(date_time() + 'Initializing run\n')
    n = int(max_t)
    if n > x:
        n = x
    for i in range(0, n, 1):
        p[i] = {}
        p[i]['job'] = subprocess.Popen(cmd_list[i], shell=True)
        p[i]['cmd'] = cmd_list[i]
        p[i]['status'] = 'Running'
        sys.stderr.write(cmd_list[i] + '\n')
        cur += 1
    s = 0
    j = 30
    m = 30
    while comp < x:
        if s % m == 0:
            sys.stderr.write(date_time() + 'Checking job statuses. ' +
                             str(comp) + ' of ' + str(x) + ' completed. ' +
                             str(s) + ' seconds have passed\n')
            for i in range(0, n, 1):
                check = p[i]['job'].poll()
                if str(check) == '1':
                    sys.stderr.write(date_time() +
                                     'Job returned an error while running ' +
                                     p[i]['cmd'] + '  aborting!\n')
                    for k in range(0, n, 1):
                        p[k]['job'].kill()
                        sys.stderr.write('Killing job ' + str(k) + '\n')
                    exit(1)
                if str(check) == '0' and p[i]['status'] != str(check):
                    comp += 1
                    p[i]['status'] = str(check)
                    if comp <= (x - n):
                        try:
                            p[i]['job'] = subprocess.Popen(cmd_list[cur],
                                                           shell=True)
                            p[i]['cmd'] = cmd_list[cur]
                            p[i]['status'] = 'Running'
                            cur += 1
                        except:
                            sys.stderr.write(date_time() +
                                             "Tried to queue command " +
                                             p[i]['cmd'] + '\n was ' +
                                             str(cur) + ' in command list, ' +
                                             str(i) + ' in queue list\n')
                            exit(1)
        s += j
        sleep_cmd = 'sleep ' + str(j) + 's'
        subprocess.call(sleep_cmd, shell=True)
    sys.stderr.write(date_time() + str(comp) + ' jobs completed\n')
    return 0
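A minimal usage sketch for job_manager: it takes a list of shell commands and a concurrency cap as a string, keeps up to max_t of them running, and polls every 30 seconds until all finish. The toy commands below just sleep:

cmds = ['sleep 2' for _ in range(4)]
job_manager(cmds, '2')  # at most two concurrent shell jobs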
Example #17
def annot_platypus(config_file, sample, skip):
    (vep_tool, vep_cache, plugin_dir, fasta, threads, java, cadd_snv, cadd_indel, tx_index, project_dir, project,
     analysis, annotation, user, group) = parse_config(config_file)
    src_env = '. /etc/environment'
    subprocess.call(src_env, shell=True)
    ana_dir = project_dir + project + '/' + analysis + '/' + sample
    if skip == 'n':
        pass_filter(ana_dir + '/' + sample)
        set_acls(ana_dir, user, group)
    in_vcf = ana_dir + '/' + sample + '.germline_pass.vcf'
    out_vcf = sample + '.germline.vep91.vcf'
    buffer_size = '5000'
    ann_dir = project_dir + project + '/' + annotation + '/' + sample
    if not os.path.isdir(ann_dir):
        mk_ann = 'mkdir -p ' + ann_dir
        sys.stderr.write('Creating annotation output directories ' + mk_ann + '\n')
        subprocess.call(mk_ann, shell=True)
    os.chdir(ann_dir)
    sys.stderr.write(date_time() + 'Changed to working directory ' + ann_dir + '\n')
    if int(threads) > 1:
        threads = str(int(threads) - 1)
    run_cmd = run_vep(vep_tool, in_vcf, out_vcf, threads, fasta, vep_cache, cadd_snv, cadd_indel, sample, buffer_size,
                      plugin_dir)
    sys.stderr.write(date_time() + 'Annotating sample ' + in_vcf + ' ' + run_cmd + '\n')
    # from stack overflow to allow killing of spawned processes in main process fails for cleaner restart
    check = subprocess.Popen(run_cmd, stdout=subprocess.PIPE, shell=True, preexec_fn=os.setsid)
    check_run = watch_mem(check, sample)
    if check_run != 0:
        buffer_size = str(int(buffer_size) // 2)
        clean_up = 'rm ' + out_vcf + '*'
        sys.stderr.write(date_time() + 'VEP failed. Status of run was ' + str(check_run)
                         + ' Trying smaller buffer size of ' + buffer_size + '\n' + clean_up + '\n')
        try:
            os.killpg(os.getpgid(check.pid), signal.SIGINT)
        except:
            sys.stderr.write(date_time() + 'Killing process failed.  Might have already died for other reasons...\n')

        subprocess.call(clean_up, shell=True)
        run_cmd = run_vep(vep_tool, in_vcf, out_vcf, threads, fasta, vep_cache, cadd_snv, cadd_indel, sample,
                          buffer_size, plugin_dir)
        sys.stderr.write(date_time() + 'Annotating sample ' + sample + ' ' + in_vcf + '\n')
        check = subprocess.call(run_cmd, shell=True)
        if check != 0:
            sys.stderr.write(date_time() + 'VEP failed for sample ' + sample + '\n')
            exit(1)
    else:
        sys.stderr.write(date_time() + 'VEP annotation of ' + in_vcf + ' successful!\n')

    check = gen_report(out_vcf, sample, tx_index)
    if check == 0:
        sys.stderr.write(date_time() + 'Summary table of germline calls completed!\n')
    else:
        sys.stderr.write(date_time() + 'Summary table for ' + out_vcf + ' FAILED!\n')
        return 1
    set_acls(ann_dir, user, group)
    sys.stderr.write(date_time() + 'VEP91 annotation of ' + sample + ' complete!\n')

    return 0
Example #18
def vep(config_file, sample_pairs, ref_mnt, in_suffix, out_suffix, source):
    from annotation.annot_vcf_vep import annot_vcf_vep_pipe
    check = annot_vcf_vep_pipe(config_file, sample_pairs, ref_mnt, in_suffix, out_suffix, source)
    if check == 0:
        sys.stderr.write(date_time() + 'vep annotation of ' + source + ' output successful.\n')
    else:
        sys.stderr.write(date_time() + 'vep annotation of ' + source + ' output failed.\n')
        exit(1)
Example #19
def run_novosort(config_file, sample_list):
    check = novosort_merge_pe(config_file, sample_list)
    if check == 0:
        sys.stderr.write(date_time() + 'File merge complete!\n')

    else:
        sys.stderr.write(date_time() + 'File download and merge failed.\n')
        exit(1)
Example #20
def run_novosort(config_file, sample_list):
    check = novosort_merge_pe(config_file, sample_list)
    if check == 0:
        sys.stderr.write(date_time() + 'File merge complete!\n')

    else:
        sys.stderr.write(date_time() + 'File download and merge failed.\n')
        exit(1)
Example #21
def novosort_merge_pe(config_file, sample_list):
    fh = open(sample_list, 'r')
    (novosort, java_tool, picard_tool, project, project_dir, align, threads, ram, rmdup, novo_merge_rmdup_slurm,
     novo_picard_merge_rmdup_slurm) = parse_config(config_file)

    for sample in fh:
        sample = sample.rstrip('\n')
        loc = sample + '.novosort_merge.log'
        (bam_list, bai_list, n) = list_bam(project, align, sample)
        bam_string = " ".join(bam_list)
        cur_dir = project_dir + project + '/' + align + '/' + sample + '/BAM/'
        os.chdir(cur_dir)
        out_bam = sample + '.merged.final.bam'
        if n > 1:
            if rmdup == 'Y':
                job_loc = sample + '.novosort_merge.log'
                job_name = sample + '_novosort_merge'

                batch = 'sbatch -c ' + threads + ' -J ' + job_name + ' --mem ' + ram + 'G -o ' + job_loc \
                        + ' --export=novosort="' + novosort + '",threads="' + threads + '",ram="' + ram \
                        + 'G",out_bam="' + out_bam + '",bam_string="' + bam_string + '",loc="' + loc + '"' + ' ' \
                        + novo_merge_rmdup_slurm
                log(loc, date_time() + 'Submitting merge bam job for sample ' + batch + "\n")
                subprocess.call(batch, shell=True)

            else:
                # run legacy pipe for removing dups using picard
                picard_tmp = 'picard_tmp'
                job_loc = sample + '.novosort_merge.picard_rmdup.log'
                job_name = sample + '_novosort_merge.picard_rmdup'

                # setting max records in ram to half of ram
                recs = str(int((int(ram) / 2) * (1000000000 / 200)))
                in_bam = sample + '.merged.bam'
                in_bai = sample + '.merged.bam.bai'

                mets = sample + '.rmdup.srt.metrics'
                batch = 'sbatch -c ' + threads + ' --mem ' + ram + 'G -o ' + job_loc + ' -J ' + job_name \
                        + ' --export=novosort="' + novosort + '",threads="' + threads + '",ram="' + ram \
                        + 'G",in_bam="' + in_bam + '",bam_string="' + bam_string + '",loc="' + job_loc \
                        + '",java_tool="' + java_tool + '",picard_tool="' + picard_tool + '",tmp="' + picard_tmp \
                        + '",recs="' + recs + '",out_bam="' + out_bam + '",mets="' + mets + '",in_bai="' + in_bai \
                        + '" ' + novo_picard_merge_rmdup_slurm
                sys.stderr.write(date_time() + 'Merging with novosort and rmdup with picard for legacy reasons!\n'
                                 + batch + '\n')
                subprocess.call(batch, shell=True)

        else:
            link_bam = 'ln -s ' + bam_list[0] + ' ' + sample + '.merged.final.bam; ln -s ' + bai_list[0] + ' ' \
                       + sample + '.merged.final.bam.bai'
            log(loc, date_time() + 'Creating symlink for merged final bam since only one exists\n'
                + link_bam + '\n')
            subprocess.call(link_bam, shell=True)

    sys.stderr.write(date_time() + 'Merged file request submitted and processed, check logs.\n')
    return 0
Example #22
def filter_wrap(mmu_filter, star_tool, genome_ref, end1, end2, sample, log_dir,
                threads, novosort, mem):
    meta = sample.split('_')
    RGRP = "ID:" + sample + "\tLB:" + meta[0] + "\tPU:" + meta[
        4] + "\tSM:" + meta[0] + "\tPL:illumina"
    loc = log_dir + sample + ".mmu.star.pe.log"
    mk_srt_tmp = 'mkdir TMP'
    subprocess.call(mk_srt_tmp, shell=True)
    # split threads for star and novosort as well as memory
    nmem = 2
    ncpu = 2
    threads = int(threads)
    sthreads = threads
    if threads >= 10:
        if threads == 10:
            sthreads = 6
            ncpu = 4
        else:
            if threads % 2.0 == 0.0:
                sthreads = int(threads / 2)
                ncpu = int(threads / 2)
            else:
                sthreads = int(math.ceil(threads / 2.0))
                ncpu = int(math.floor(threads / 2.0))
    else:
        sthreads = int(sthreads) - 2
    mem = int(mem)
    if mem > 42:
        nmem = mem - 40
    star_cmd = "(" + star_tool + " --runMode alignReads --outSAMattrRGline " + RGRP + " --outFileNamePrefix " \
            + sample + ".mmu_filt. --runThreadN " + str(sthreads) + " --genomeDir " + genome_ref\
            + " --readFilesIn " + end1 + " " + end2 + " --readFilesCommand zcat --outSAMtype BAM Unsorted --outStd " \
            "BAM_Unsorted --outFilterType BySJout --outFilterMultimapNmax 20 --alignSJoverhangMin 8 " \
            "--alignSJDBoverhangMin 1 --outFilterMismatchNmax 0" + " --alignIntronMin 20 --alignIntronMax 1000000 " \
            "--alignMatesGapMax 1000000 --outSAMunmapped Within 2>> " + loc + "  | " + novosort + " - -n -c " \
            + str(ncpu) + " -m " + str(nmem) + "G -t TMP 2>> " + loc + " | tee " + sample + ".mmu.nsrt.bam | python " \
            + mmu_filter + " -s " + sample + " -n 0 -t RNA | gzip -4 -c - > " + sample \
            + "_1.filtered.fq.gz;) 2>&1 | gzip -4 -c - > " + sample + "_2.filtered.fq.gz"

    log(loc, date_time() + star_cmd + '\n')
    try:
        subprocess.call(star_cmd, shell=True)
    except:
        log(
            loc,
            date_time() +
            'Star alignment and filter against mouse genome failed\n')
        exit(1)
    log(loc, date_time() + 'Filtering completed, replacing fastq file\n')
    rn_fq = 'mv ' + sample + '_1.filtered.fq.gz ' + end1 + '; mv ' + sample + '_2.filtered.fq.gz ' + end2 \
            + ';rm -rf TMP'
    check = subprocess.call(rn_fq, shell=True)
    if check != 0:
        log(loc, date_time() + 'File rename failed\n' + rn_fq + '\n')
        exit(1)
    return 0
Example #23
def run_novosort(config_file, sample_list, obj):
    check = novosort_merge_pe(config_file, sample_list)
    if check == 0:
        sys.stderr.write(date_time() + 'File download and merge complete!\n')
        # rm unmerged bams, no longer needed
        rm_bam = 'rm -rf ' + obj
        call(rm_bam, shell=True)
    else:
        sys.stderr.write(date_time() + 'File download and merge failed.\n')
        exit(1)
Example #24
def find_project_files(file_dir, file_prefix):
    find_cmd = "find " + file_dir + " -name \'" + file_prefix + '*\''
    sys.stderr.write(date_time() + find_cmd + "\n")
    try:
        results = check_output(find_cmd, shell=True, stderr=subprocess.PIPE).decode()
        return results
    except:
        sys.stderr.write(date_time() + "Search of " + file_prefix + " from " + file_dir + " failed\n")
        exit(1)
    return 0
Example #25
def watch_mem(proc_obj, sample, loc):
    from time import sleep
    while proc_obj.poll() is None:
        mem_pct = psutil.virtual_memory().percent
        log(loc, date_time() + 'Current memory usage at ' + str(mem_pct) + '% processing sample ' + sample + '\n')
        if mem_pct >= 99:
            log(loc, date_time() + 'Memory exceeded while running VEP.')
            return 1
        sleep(30)

    return proc_obj.poll()
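A hedged usage sketch for this watch_mem variant: spawn the command, hand the Popen object over, and treat a return of 1 as a memory abort. The command and log path are placeholders; psutil and the log()/date_time() helpers are assumed:

import subprocess

proc = subprocess.Popen('some_long_command', shell=True)  # hypothetical command
status = watch_mem(proc, 'sample1', 'LOGS/sample1.log')
if status == 1:
    proc.kill()  # memory ceiling was hit before the process finished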
Example #26
def find_project_files(file_dir, file_prefix):
    find_cmd = "find " + file_dir + " -name \'" + file_prefix + '*\''
    sys.stderr.write(date_time() + find_cmd + "\n")
    try:
        results = check_output(find_cmd, shell=True,
                               stderr=subprocess.PIPE).decode()
        return results
    except:
        sys.stderr.write(date_time() + "Search of " + file_prefix + " from " +
                         file_dir + " failed\n")
        exit(1)
    return 0
Example #27
def watch_mem(proc_obj, sample):
    from time import sleep
    while proc_obj.poll() is None:
        mem_pct = psutil.virtual_memory().percent
        sys.stderr.write(date_time() + 'Current memory usage at ' + str(mem_pct) + '% processing sample ' + sample
            + ' from platypus ' + '\n')
        if mem_pct >= 99:
            sys.stderr.write(date_time() + 'Memory exceeded while running VEP.')
            return 1
        sleep(30)

    return proc_obj.poll()
Example #28
def picard_insert_size(java_tool, picard_tool, sample, log_dir, ram):
    loc = log_dir + sample + ".picard.insert_size.log"
    picard_insert_size_cmd = java_tool + " -Xmx" + ram + "g -jar " + picard_tool + " CollectInsertSizeMetrics I=" \
                             + sample + ".rmdup.srt.bam H=" + sample + ".insert_metrics.pdf O=" \
                             + sample + ".insert_metrics.hist  >> " + log_dir + sample + ".picard.insert_size.log 2>&1"
    log(loc, date_time() + picard_insert_size_cmd + "\n")
    try:
        call(picard_insert_size_cmd, shell=True)
        return 0
    except:
        log(loc, date_time() + 'Picard failed using java ' + java_tool + '\n')
        return 1
Example #29
def vep(config_file, sample_pairs, in_suffix, out_suffix, in_mutect, source, vep_cache):
    if vep_cache == '84':
        from annotation.deprecated.annot_vcf_vep import annot_vcf_vep_pipe
    else:
        from annotation.annot_vcf_VEP91 import annot_vcf_vep_pipe
    check = annot_vcf_vep_pipe(config_file, sample_pairs, in_suffix, out_suffix, in_mutect, source)
    if check == 0:
        sys.stderr.write(date_time() + 'vep annotation of ' + source + ' output successful.\n')
    else:
        sys.stderr.write(date_time() + 'vep annotation of ' + source + ' output failed.\n')
        exit(1)
    return 0
Example #30
def watch_mem(proc_obj, source, sample, loc):
    from time import sleep
    while proc_obj.poll() is None:
        mem_pct = psutil.virtual_memory().percent
        log(loc, date_time() + 'Current memory usage at ' + str(mem_pct) + '% processing sample ' + sample
            + ' from source ' + source + '\n')
        if mem_pct >= 99:
            log(loc, date_time() + 'Memory exceeded while running VEP.')
            return 1
        sleep(30)

    return proc_obj.poll()
Example #31
def picard_insert_size(java_tool, picard_tool, sample, log_dir, ram):
    loc = log_dir + sample + ".picard.insert_size.log"
    picard_insert_size_cmd = java_tool + " -Xmx" + ram + "g -jar " + picard_tool + " CollectInsertSizeMetrics I=" \
                             + sample + ".rmdup.srt.bam H=" + sample + ".insert_metrics.pdf O=" \
                             + sample + ".insert_metrics.hist  >> " + log_dir + sample + ".picard.insert_size.log 2>&1"
    log(loc, date_time() + picard_insert_size_cmd + "\n")
    try:
        call(picard_insert_size_cmd, shell=True)
        return 0
    except:
        log(loc, date_time() + 'Picard failed using java ' + java_tool + '\n')
        return 1
Example #32
def watch_mem(proc_obj, sample):
    from time import sleep
    while proc_obj.poll() is None:
        mem_pct = psutil.virtual_memory().percent
        sys.stderr.write(date_time() + 'Current memory usage at ' + str(mem_pct) + '% processing sample ' + sample
            + ' from platypus ' + '\n')
        if mem_pct >= 99:
            sys.stderr.write(date_time() + 'Memory exceeded while running VEP.')
            return 1
        sleep(30)

    return proc_obj.poll()
Example #33
def scalpel_indel(tumor_id, normal_id, log_dir, config_file):
    (scalpel, bedtools, bed, fasta, cpus, dustmask_flag, dustmask_bed, wg, project_dir, project, align) \
        = parse_config(config_file)

    sample_pair = tumor_id + '_' + normal_id
    loc = log_dir + sample_pair + '.scalpel.log'
    bam_dir = project_dir + project + '/' + align
    tumor_bam = bam_dir + '/' + tumor_id + '/BAM/' + tumor_id + '.merged.final.bam'
    normal_bam = bam_dir + '/' + normal_id + '/BAM/' + normal_id + '.merged.final.bam'
    if wg == 'n':
        scalpel_cmd = scalpel + ' --somatic --logs --numprocs ' + cpus + ' --tumor ' + tumor_bam + ' --normal ' \
                      + normal_bam + ' --bed ' + bed + ' --ref ' + fasta + ' 2>> ' + loc
        sys.stderr.write(date_time() + 'Starting indel calls for ' +
                         sample_pair + '\n')
        log(
            loc,
            date_time() + 'Starting indel calls for ' + sample_pair +
            ' in capture mode with command:\n' + scalpel_cmd + '\n')
        check = call(scalpel_cmd, shell=True)
        if check != 0:
            sys.stderr.write(date_time() + 'Indel calling failed for pair ' +
                             sample_pair + ' with command:\n' + scalpel_cmd +
                             '\n')
            exit(1)
    else:
        check = wg_mode(scalpel, tumor_bam, normal_bam, fasta, cpus,
                        sample_pair, config_file)
        if check[0] != 0:
            sys.stderr.write('Scalpel failed for ' + normal_id + ' at ' +
                             tumor_id + '\n')
            exit(1)
    log(
        loc,
        date_time() + 'Indel calling complete for pair ' + sample_pair +
        ' moving output files\n')
    mv_cmd = 'mv outdir/main/* .; rmdir outdir/main;'
    log(loc, date_time() + mv_cmd + '\n')
    call(mv_cmd, shell=True)
    sys.stderr.write(date_time() + 'Completed indel calls for ' + sample_pair +
                     '\n')
    if dustmask_flag == 'Y':
        log(loc, date_time() + 'Filter dustmask flag given\n')
        check = filter_indel(bedtools, dustmask_bed, sample_pair, loc)
        if check != 0:
            sys.stderr.write(date_time() + 'Dustmask failed for ' +
                             sample_pair + '\n')
            exit(1)
        else:
            log(loc,
                date_time() + 'Dustmask complete for ' + sample_pair + '\n')
    sys.stderr.write(date_time() + 'Indel call completed\n')
    return 0
Example #34
def list_bam(project, align, sample):
    bam_dir = '/cephfs/PROJECTS/' + project + '/' + align + '/' + sample + '/BAMS/'
    find_bam_cmd = 'find ' + bam_dir + ' -name \'*.Aligned.toTranscriptome.out.bam\''
    sys.stderr.write(date_time() + find_bam_cmd + '\nGetting BAM list\n')
    try:
        bam_find = subprocess.check_output(find_bam_cmd, shell=True).decode().rstrip('\n')
        bam_list = bam_find.split('\n')
        ct = len(bam_list)

        return bam_list, ct
    except:
        sys.stderr.write(date_time() + 'No bams found for ' + sample + '\n')
        exit(1)
Example #35
def filter_wrap(mmu_filter, star_tool, genome_ref, end1, end2, sample, log_dir, threads, novosort, mem):
    meta = sample.split('_')
    RGRP = "ID:" + sample + "\tLB:" + meta[0] + "\tPU:" + meta[4] + "\tSM:" + meta[0] + "\tPL:illumina"
    loc = log_dir + sample + ".mmu.star.pe.log"
    mk_srt_tmp = 'mkdir TMP'
    subprocess.call(mk_srt_tmp, shell=True)
    # split threads for star and novosort as well as memory
    nmem = 2
    ncpu = 2
    threads = int(threads)
    sthreads = threads
    if threads >= 10:
        if threads == 10:
            sthreads = 6
            ncpu = 4
        else:
            if threads % 2.0 == 0.0:
                sthreads = int(threads/2)
                ncpu = int(threads/2)
            else:
                sthreads = int(math.ceil(threads/2.0))
                ncpu = int(math.floor(threads/2.0))
    else:
        sthreads = int(sthreads) - 2
    mem = int(mem)
    if mem > 42:
        nmem = mem - 40
    star_cmd = "(" + star_tool + " --runMode alignReads --outSAMattrRGline " + RGRP + " --outFileNamePrefix " \
            + sample + ".mmu_filt. --runThreadN " + str(sthreads) + " --genomeDir " + genome_ref\
            + " --readFilesIn " + end1 + " " + end2 + " --readFilesCommand zcat --outSAMtype BAM Unsorted --outStd " \
            "BAM_Unsorted --outFilterType BySJout --outFilterMultimapNmax 20 --alignSJoverhangMin 8 " \
            "--alignSJDBoverhangMin 1 --outFilterMismatchNmax 0" + " --alignIntronMin 20 --alignIntronMax 1000000 " \
            "--alignMatesGapMax 1000000 --outSAMunmapped Within 2>> " + loc + "  | " + novosort + " - -n -c " \
            + str(ncpu) + " -m " + str(nmem) + "G -t TMP 2>> " + loc + " | tee " + sample + ".mmu.nsrt.bam | python " \
            + mmu_filter + " -s " + sample + " -n 0 -t RNA | gzip -4 -c - > " + sample \
            + "_1.filtered.fq.gz;) 2>&1 | gzip -4 -c - > " + sample + "_2.filtered.fq.gz"

    log(loc, date_time() + star_cmd + '\n')
    try:
        subprocess.call(star_cmd, shell=True)
    except:
        log(loc, date_time() + 'Star alignment and filter against mouse genome failed\n')
        exit(1)
    log(loc, date_time() + 'Filtering completed, replacing fastq file\n')
    rn_fq = 'mv ' + sample + '_1.filtered.fq.gz ' + end1 + '; mv ' + sample + '_2.filtered.fq.gz ' + end2 \
            + ';rm -rf TMP'
    check = subprocess.call(rn_fq, shell=True)
    if check != 0:
        log(loc, date_time() + 'File rename failed\n' + rn_fq + '\n')
        exit(1)
    return 0
Example #36
def annot_platypus(config_file, sample):
    (vep_tool, vep_cache, plugin_dir, fasta, threads, java, cadd, tx_index, project_dir, project, analysis) \
        = parse_config(config_file)
    ana_dir = project_dir + project + '/' + analysis + '/' + sample
    pass_filter(ana_dir + '/' + sample)
    in_vcf = ana_dir + '/' + sample + '.germline_pass.vcf'
    out_vcf = sample + '.germline_pass.vep.vcf'
    buffer_size = '2000'
    if int(threads) > 1:
        threads = str(int(threads) - 1)
    run_cmd = run_vep(vep_tool, in_vcf, out_vcf, threads, fasta, vep_cache,
                      cadd, sample, buffer_size, plugin_dir)
    sys.stderr.write(date_time() + 'Annotating sample ' + in_vcf + ' ' +
                     run_cmd + '\n')
    # from stack overflow to allow killing of spawned processes in main process fails for cleaner restart
    check = subprocess.Popen(run_cmd,
                             stdout=subprocess.PIPE,
                             shell=True,
                             preexec_fn=os.setsid)
    check_run = watch_mem(check, sample)
    if check_run != 0:
        buffer_size = str(int(buffer_size) // 2)
        clean_up = 'rm ' + out_vcf + '*'
        sys.stderr.write(date_time() + 'VEP failed. Status of run was ' +
                         str(check_run) + ' Trying smaller buffer size of ' +
                         buffer_size + '\n' + clean_up + '\n')
        try:
            os.killpg(os.getpgid(check.pid), signal.SIGINT)
        except:
            sys.stderr.write(
                date_time() +
                'Killing process failed.  Might have already died for other reasons...\n'
            )

        subprocess.call(clean_up, shell=True)
        run_cmd = run_vep(vep_tool, in_vcf, out_vcf, threads, fasta, vep_cache,
                          cadd, sample, buffer_size, plugin_dir)
        sys.stderr.write(date_time() + 'Annotating sample ' + sample + ' ' + in_vcf +
                         '\n')
        check = subprocess.call(run_cmd, shell=True)
        if check != 0:
            sys.stderr.write(date_time() + 'VEP failed for sample ' + sample +
                             '\n')
            exit(1)
    else:
        sys.stderr.write(date_time() + 'VEP annotation of ' + in_vcf +
                         ' successful!\n')

    check = gen_report(out_vcf, sample, tx_index)
    if check == 0:
        sys.stderr.write(date_time() +
                         'Summary table of germline calls completed!\n')
    else:
        sys.stderr.write(date_time() + 'Summary table for ' + out_vcf +
                         ' FAILED!\n')
        return 1

    return 0
Example #37
def gen_report(vcf, out, c, ref_flag):
    # open out file and index counts, context, etc
    fn = os.path.basename(vcf)
    parts = fn.split('.')
    loc = 'LOGS/' + parts[0] + '.substitutions.vep.priority_report.log'
    log(loc, date_time() + 'Creating prioritized impact reports for ' + vcf + '\n')
    mut_dict = create_mutect_ind(out)
    log(loc, date_time() + 'Created index for added mutect info\n')
    on_dict = {}
    if c != 'n':
        on_dict = create_target(c)
        log(loc, date_time() + 'Target file given, creating index for on target info\n')
    vcf_in = VariantFile(vcf)

    out = open(parts[0] + '.substitutions.vep.prioritized_impact.report.xls', 'w')
    desired = {'Consequence': 0, 'IMPACT': 0, 'SYMBOL': 0, 'Feature': 0, 'Protein_position': 0, 'Amino_acids': 0,
               'Codons': 0, 'Existing_variation': 0, 'ExAC_MAF': 0, 'BIOTYPE': 0}

    desc_string = vcf_in.header.info['ANN'].record['Description']
    desc_string = desc_string.lstrip('"')
    desc_string = desc_string.rstrip('"')
    desc_string = desc_string.replace('Consequence annotations from Ensembl VEP. Format: ', '')
    f_pos_list = []
    desc_list = desc_string.split('|')
    ann_size = len(desc_list)
    for i in range(0, ann_size, 1):
        if desc_list[i] in desired:
            f_pos_list.append(i)
            desired[desc_list[i]] = i
    out.write('chr\tpos\tcontext\tref\talt\tnormal_ref_count\tnormal_alt_count\t%_normal_alt\ttumor_ref_count\t'
              'tumor_alt_count\t%_tumor_alt\tT/N_%_alt_ratio\tsnp_ID\tgnomAD_AF\tgene\ttx_id\teffect\timpact\tbiotype\t'
              'codon_change\tamino_acid_change\ton/off-target\n')
    if ref_flag != 'n':
        ref_flag = create_index(ref_flag)
    for record in vcf_in.fetch():
        (chrom, pos, ref, alt) = record.contig, str(record.pos), record.ref, record.alts[0]
        ann_list = [_.split('|') for _ in record.info['ANN']]
        tflag = 'NA'
        if c != 'n':
            tflag = mark_target(chrom, pos, on_dict)
            # only outputting ON TARGET hits
            if tflag == 'OFF':
                continue
        output_highest_impact(chrom, pos, ref, alt, ann_list, mut_dict, desired, tflag, out, ref_flag)

    out.close()
    log(loc, date_time() + 'Creating prioritized report for ' + vcf + ' complete!\n')
    return 0
Example #38
def cutadapter(sample, end1, end2, config_file):
    # casual logging - look for a LOGS directory, otherwise assume current dir
    log_dir = './'
    # designed to be run in a subdirectory, keep original file names
    sf1 = end1
    sf2 = end2
    end1 = os.path.basename(sf1)
    end2 = os.path.basename(sf2)
    if os.path.isdir('LOGS'):
        log_dir = 'LOGS/'
    loc = log_dir + sample + '.cutadapt.log'
    (cutadapt_tool, threads, minlen, r1adapt, r2adapt, r1trim, r2trim, qual,
     mqual) = parse_config(config_file)
    cut_th = threads
    if int(cut_th) >= 4:
        cut_th = str(int(int(threads) / 2))

    cutadapt_cmd = cutadapt_tool + ' -j ' + cut_th + ' -m ' + minlen + ' --quality-base=' + qual + ' -q ' + mqual \
                   + ' -a ' + r1adapt + ' -A ' + r2adapt + ' -u ' + r1trim + ' -U ' + r2trim + ' -o ' + end1 \
                   + ' -p ' + end2 + ' ' + sf1 + ' ' + sf2 + ' >> ' + loc + ' 2>> ' + loc
    if r1adapt == '' and r2adapt == '':
        cutadapt_cmd = cutadapt_tool + ' -j ' + cut_th + ' -m ' + minlen + ' --quality-base=' + qual + ' -q ' + mqual \
                       + ' -u ' + r1trim + ' -U ' + r2trim + ' -o ' + end1 + ' -p ' + end2 + ' ' + sf1 + ' ' + sf2 \
                       + ' >> ' + loc + ' 2>> ' + loc
    log(loc, date_time() + cutadapt_cmd + "\n")
    call(cutadapt_cmd, shell=True)
    return 0
Example #39
def list_bam(project, project_dir, align_dir, sample):
    bam = project_dir + project + '/' + align_dir + '/' + sample + '/BAM/' + sample + '.merged.final.bam'
    check_file = os.path.isfile(bam)
    if not check_file:
        sys.stderr.write(date_time() + 'Merged bam ' + bam + ' not found.\n')
    return check_file
Example #40
def mutect_merge_sort(config_file, sample_pair):
    # use fasta index to get sort order for file output
    (fai) = parse_config(config_file)
    fai_list = []
    fai_fh = open(fai, 'r')
    for line in fai_fh:
        line = line.rstrip('\n')
        data = line.split('\t')
        fai_list.append(data[0])
    fai_fh.close()
    # output files should be in directory named after sample-pairs

    dir_list = os.listdir('./')
    suffix_dict = {}
    for fn in dir_list:
        parts = fn.split('.')
        if len(parts) >= 3:
            if parts[2] == 'out' or parts[2] == 'vcf':
                suffix = '.'.join(parts[2:])
                if suffix not in suffix_dict:
                    suffix_dict[suffix] = []
                suffix_dict[suffix].append(fn)
    merge_sort(suffix_dict, sample_pair, fai_list)
    sys.stderr.write(date_time() + 'File merging completed\n')
    return 0
Example #41
def merge_filter_stats(project_dir, project, align_dir, lane_list):
    lanes = open(lane_list, 'r')
    head = ''
    data = []
    print('BID\tread group\ttotal alignment pairs(ap)\t% unambiguous ap\t% ambiguous ap\t% total ap filtered'
          '\t%total ap kept')
    for line in lanes:
        line = line.rstrip('\n')
        (bid, seqtype, lane_csv) = line.split('\t')
        for lane in lane_csv.split(', '):
            cur = project_dir + project + '/' + align_dir + '/' + bid + '/QC/' + bid + '_' + lane + '.runlog.txt'
            if os.path.isfile(cur):
                stat = open(cur, 'r')
                skip_lines(stat, 4)
                temp = []
                group = process_line(stat, 2)
                # may need to adjust or switch to regex in a case % sign present
                unamb_pairs_pct = group[0][-1][:-1]
                amb_pairs_pct = group[1][-1][:-1]
                filt = str(100 - float(unamb_pairs_pct) - float(amb_pairs_pct))
                kept = str(float(unamb_pairs_pct) + float(amb_pairs_pct))
                temp.extend(
                    (group[0][6], unamb_pairs_pct, amb_pairs_pct, filt, kept))

                print(bid + '\t' + lane + '\t' + '\t'.join(temp))
                stat.close()
            else:
                sys.stderr.write(date_time() + 'Could not find ' + cur +
                                 ' SKIP!\n')

    lanes.close()
    sys.stdout.write(head)
    for datum in data:
        sys.stdout.write(datum)
    return 0
Example #42
def snpeff_pipe(config_file, sample_pairs, ref_mnt, cflag):
    # edit to grab from config max thread count
    max_t = 8
    (java, snpeff, snpsift, report, dbsnp, intervals) = parse_config(config_file)
    dbsnp = ref_mnt + '/' + dbsnp
    intervals = ref_mnt + '/' + intervals
    fh = open(sample_pairs)
    mk_log_dir = 'mkdir LOGS'
    subprocess.call(mk_log_dir, shell=True)
    cmd_list = []
    run_snpsift = java + ' -jar ' + snpsift + ' annotate ' + dbsnp
    run_snpeff = java + ' -jar ' + snpeff + ' eff -t hg19 '
    for line in fh:
        line = line.rstrip('\n')
        (sample, tumor_id, normal_id) = line.split('\t')
        # run snpsift first, then snpeff
        run_report = report + ' -i ' + sample + '.out.keep.eff.vcf -c '
        if cflag == 'n':
            run_report += intervals
        else:
            run_report += 'n'
        run_report += ' > ' + sample + '.vcf.keep.eff.xls'
        run_snp = run_snpsift + ' ' + sample + '.out.keep > ' + sample + '.out.keep.sift.vcf 2> LOGS/' + sample \
                  + '.snpeff.log;' + run_snpeff + ' ' + sample + '.out.keep.sift.vcf -v > ' + sample \
                  + '.out.keep.eff.vcf  2>> LOGS/' + sample + '.snpeff.log;' + run_report
        cmd_list.append(run_snp)
    job_manager(cmd_list, max_t)
    sys.stderr.write(date_time() + 'SNP annotation completed!\n')
    return 0
Example #43
def parse_config(json_config):
    config_data = json.loads(open(json_config, 'r').read())
    try:
        return config_data['tools']['slurm_wrap'], config_data['tools']['mojo_pipe'], \
               config_data['params']['threads'], config_data['params']['ram']
    except:
        try:
            sys.stderr.write(date_time() + 'Accessing keys failed.  Attempting to output current keys:\n')
            for key in config_data:
                sys.stderr.write(key + '\n')
                for subkey in config_data[key]:
                    sys.stderr.write(key + ":" + subkey + ":" + config_data[key][subkey] + '\n')
            exit(1)
        except:
            sys.stderr.write(date_time() + 'Could not read config file ' + json_config + '\n')
            exit(1)
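Judging from the keys it accesses, this parse_config expects a JSON file with tools.slurm_wrap, tools.mojo_pipe, params.threads and params.ram. A hedged example config (all values are placeholders):

import json

cfg = {'tools': {'slurm_wrap': '/path/to/slurm_wrap.sh',
                 'mojo_pipe': '/path/to/mojo_pipe.py'},
       'params': {'threads': '8', 'ram': '32'}}
with open('config.json', 'w') as f:
    json.dump(cfg, f)
print(parse_config('config.json'))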
Example #44
def fastqc(fastqc_tool, sample, end1, end2, t):
    # casual logging - look for a LOGS directory, otherwise assume current dir
    log_dir = './'
    if os.path.isdir('LOGS'):
        log_dir = 'LOGS/'
    loc = log_dir + sample + '.fastqc.log'
    fastqc_cmd = fastqc_tool + ' --extract -t ' + t + ' -o QC/ ' + end1 + ' ' + end2 + ' 2>> ' + loc
    log(loc, date_time() + fastqc_cmd + "\n")
    check = call(fastqc_cmd, shell=True)
    # call is synchronous here, so a nonzero return code means fastqc failed - possibly a phred score mismatch

    if check != 0:
        log(loc, date_time() + 'FastQC Failed for sample ' + sample + '\n')
        exit(1)
    return 0
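A hedged call sketch for this fastqc wrapper: the tool path, thread count and fastq paths are all strings, and a QC/ output directory (plus optionally LOGS/) is assumed to already exist. Names below are placeholders:

fastqc('fastqc', 'sample1', 'sample1_1.fq.gz', 'sample1_2.fq.gz', '4')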
Example #45
def list_bam(project, project_dir, align_dir, sample):
    bam = project_dir + project + '/' + align_dir + '/' + sample + '/BAMS/' + sample + '.merged.transcriptome.bam'
    check_file = os.path.isfile(bam)
    if not check_file:
        sys.stderr.write(date_time() + 'Merged bam ' + bam + ' not found.\n')
    return check_file
Example #46
0
def cutadapter(sample, end1, end2, config_file):
    # casual logging - look for a LOGS directory, otherwise assume current dir
    log_dir = './'
    # designed to be run in a subdirectory, keep original file names
    sf1 = end1
    sf2 = end2
    end1 = os.path.basename(sf1)
    end2 = os.path.basename(sf2)
    if os.path.isdir('LOGS'):
        log_dir = 'LOGS/'
    loc = log_dir + sample + '.cutadapt.log'
    (cutadapt_tool, threads, minlen, r1adapt, r2adapt, r1trim, r2trim, qual, mqual) = parse_config(config_file)
    cut_th = threads
    if int(cut_th) >= 4:
        # use only half the available threads for cutadapt when four or more are given
        cut_th = str(int(threads) // 2)

    if r1adapt == '' and r2adapt == '':
        # no adapters given - quality trim and length filter only
        cutadapt_cmd = cutadapt_tool + ' -j ' + cut_th + ' -m ' + minlen + ' --quality-base=' + qual + ' -q ' + mqual \
                       + ' -u ' + r1trim + ' -U ' + r2trim + ' -o ' + end1 + ' -p ' + end2 + ' ' + sf1 + ' ' + sf2 \
                       + ' >> ' + loc + ' 2>> ' + loc
    else:
        cutadapt_cmd = cutadapt_tool + ' -j ' + cut_th + ' -m ' + minlen + ' --quality-base=' + qual + ' -q ' + mqual \
                       + ' -a ' + r1adapt + ' -A ' + r2adapt + ' -u ' + r1trim + ' -U ' + r2trim + ' -o ' + end1 \
                       + ' -p ' + end2 + ' ' + sf1 + ' ' + sf2 + ' >> ' + loc + ' 2>> ' + loc
    log(loc, date_time() + cutadapt_cmd + "\n")
    call(cutadapt_cmd, shell=True)
    return 0
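For illustration, with threads='4', minlen='20', qual='33', mqual='20', the standard Illumina adapter on both reads, and no hard trimming, the assembled command would look roughly like:

cutadapt -j 2 -m 20 --quality-base=33 -q 20 -a AGATCGGAAGAGC -A AGATCGGAAGAGC -u 0 -U 0 -o R1.fq.gz -p R2.fq.gz /orig/path/R1.fq.gz /orig/path/R2.fq.gz >> LOGS/SAMPLE.cutadapt.log 2>> LOGS/SAMPLE.cutadapt.log

All concrete values above are hypothetical; only the flag layout comes from the function.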
Example #47
0
def vep(config_file, sample_pairs, in_suffix, out_suffix, in_mutect, source,
        vep_cache):
    if vep_cache == '84':
        from annotation.deprecated.annot_vcf_vep import annot_vcf_vep_pipe
    else:
        from annotation.annot_vcf_VEP91 import annot_vcf_vep_pipe
    check = annot_vcf_vep_pipe(config_file, sample_pairs, in_suffix,
                               out_suffix, in_mutect, source)
    if check == 0:
        sys.stderr.write(date_time() + 'vep annotation of ' + source +
                         ' output successful.\n')
    else:
        sys.stderr.write(date_time() + 'vep annotation of ' + source +
                         ' output failed.\n')
        exit(1)
    return 0
Example #48
0
def mutect_merge_sort(config_file, sample_pair):
    # use fasta index to get sort order for file output
    (fai) = parse_config(config_file)
    fai_list = []
    fai_fh = open(fai, 'r')
    for line in fai_fh:
        line = line.rstrip('\n')
        data = line.split('\t')
        fai_list.append(data[0])
    fai_fh.close()
    # output files should be in directory named after sample-pairs

    dir_list = os.listdir('./')
    suffix_dict = {}
    for fn in dir_list:
        parts = fn.split('.')
        if len(parts) >= 3:
            if parts[2] == 'out' or parts[2] == 'vcf':
                suffix = '.'.join(parts[2:])
                if suffix not in suffix_dict:
                    suffix_dict[suffix] = []
                suffix_dict[suffix].append(fn)
    merge_sort(suffix_dict, sample_pair, fai_list)
    sys.stderr.write(date_time() + 'File merging completed\n')
    return 0
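The file read at the top is samtools' fasta index (.fai): a five-column, tab-separated table of contig name, length, byte offset, bases per line, and bytes per line. Only the first column is used here, and because the rows appear in reference order, it doubles as a chromosome sort key. Two illustrative lines (GRCh38 contig lengths; offsets made up):

chr1	248956422	112	70	71
chr2	242193529	252513167	70	71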
Example #49
0
def merge_filter_stats(project_dir, project, align_dir, lane_list):
    lanes = open(lane_list, 'r')
    print 'BID\tread group\ttotal alignment pairs(ap)\t% unambiguous ap\t% ambiguous ap\t% total ap filtered' \
          '\t% total ap kept'
    for line in lanes:
        line = line.rstrip('\n')
        (bid, seqtype, lane_csv) = line.split('\t')
        for lane in lane_csv.split(', '):
            cur = project_dir + project + '/' + align_dir + '/' + bid + '/QC/' + bid + '_' + lane + '.runlog.txt'
            if os.path.isfile(cur):
                stat = open(cur, 'r')
                skip_lines(stat, 4)
                temp = []
                group = process_line(stat, 2)
                # may need to adjust, or switch to a regex, in case a % sign is present
                unamb_pairs_pct = group[0][-1][:-1]
                amb_pairs_pct = group[1][-1][:-1]
                filt = str(100 - float(unamb_pairs_pct) - float(amb_pairs_pct))
                kept = str(float(unamb_pairs_pct) + float(amb_pairs_pct))
                temp.extend((group[0][6], unamb_pairs_pct, amb_pairs_pct, filt, kept))

                print bid + '\t' + lane + '\t' + '\t'.join(temp)
                stat.close()
            else:
                sys.stderr.write(date_time() + 'Could not find ' + cur + ' SKIP!\n')

    lanes.close()
    return 0
Example #51
0
def align_stats(sample):
    # casual logging - look for a LOGS directory, otherwise assume current dir
    log_dir = './'
    if os.path.isdir('LOGS'):
        log_dir = 'LOGS/'
    loc = log_dir + sample + '.aln.log'
    log(loc, date_time() + "Converting to table summary format\n")
    fh = open(sample + '/' + 'align_summary.txt', 'r')
    fo = open(sample + '.align.txt', 'w')
    fo.write(
        'Sample\tMean insert size estimate(10k reads)\tStd dev read insert size estimate(10k reads)\tStarting left reads\t% mapped\tmultimapped(mm)\tgt 20 mm\tStarting right reads\t% mapped\t% mm\tgt 20 mm\tOverall map rate\tAligned pairs\t% mm\t% discordant\t% concordant\n'
        + sample + '\t')
    fi = open(sample + '_subset.insert_metrics.hist')
    for _ in range(7):
        # skip the 7-line header of the insert metrics file
        next(fi)
    stats = next(fi)
    fi.close()
    stat = stats.split('\t')
    fo.write('\t'.join([str(int(float(stat[4]))), str(int(float(stat[5])))]))
    next(fh)
    lstart = next(fh)
    m = re.search('(\d+)\n$', lstart)
    fo.write('\t' + m.group(1))
    pct = next(fh)
    m = re.search('\(\s*(\S+) of input\)\n', pct)
    fo.write('\t' + m.group(1))
    mm = next(fh)
    m = re.search('\(\s*(\S+)\).*\((\d+) have >20\)\n', mm)
    fo.write('\t' + m.group(1) + '\t' + m.group(2))

    next(fh)
    rstart = next(fh)
    m = re.search('(\d+)\n$', rstart)
    fo.write('\t' + m.group(1))
    pct = next(fh)
    m = re.search('\(\s*(\S+) of input\)\n', pct)
    fo.write('\t' + m.group(1))
    mm = next(fh)
    m = re.search('\(\s*(\S+)\).*\((\d+) have >20\)\n', mm)
    fo.write('\t' + m.group(1) + '\t' + m.group(2))
    ovr = next(fh)
    m = re.search('\s*(^\S+)', ovr)
    fo.write('\t' + m.group(1))
    next(fh)

    aln = next(fh)
    m = re.search('(\d+)\n$', aln)
    fo.write('\t' + m.group(1))
    mm = next(fh)
    m = re.search('\(\s*(\S+)\) have', mm)
    fo.write('\t' + m.group(1))
    dc = next(fh)
    m = re.search('\(\s*(\S+)\) are', dc)
    fo.write('\t' + m.group(1))
    cc = next(fh)
    m = re.search('^\s*(\S+)', cc)
    fo.write('\t' + m.group(1) + '\n')
    fh.close()
    fo.close()
    return 0
Example #52
0
def flagstats(samtools_tool, sample):

    # test for sorted bam, otherwise use unsorted bam
    raw_bam = sample + ".srt.bam"
    res_file = sample + ".srt.bam.flagstats"
    if not os.path.isfile(raw_bam):
        raw_bam = sample + ".bam"
        res_file = sample + ".bam.flagstats"

    flagstats_cmd = samtools_tool + " flagstat " + raw_bam + " > " + res_file
    sys.stderr.write(date_time() + flagstats_cmd + "\n")
    Popen(flagstats_cmd, shell=True, stdin=None, stdout=None, stderr=None, close_fds=True)

    flagstats_cmd = samtools_tool + " flagstat " + sample + ".rmdup.srt.bam > " + sample + ".rmdup.srt.bam.flagstats"
    sys.stderr.write(date_time() + flagstats_cmd + "\n")
    Popen(flagstats_cmd, shell=True, stdin=None, stdout=None, stderr=None, close_fds=True)
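Both flagstat jobs are launched with Popen and never waited on, so they run in the background and the function returns immediately. If a caller needs the .flagstats files to exist before the next step, a blocking variant might look like this sketch (it skips the sorted-vs-unsorted fallback above):

from subprocess import Popen


def flagstats_blocking(samtools_tool, sample):
    procs = []
    for bam in (sample + '.srt.bam', sample + '.rmdup.srt.bam'):
        cmd = samtools_tool + ' flagstat ' + bam + ' > ' + bam + '.flagstats'
        procs.append(Popen(cmd, shell=True))
    # wait on both; keep a non-zero exit status if either job failed
    rc = 0
    for p in procs:
        rc = p.wait() or rc
    return rc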
Example #53
0
def mojo_pipe(sample, config_file, fq1, fq2):
    (project_dir, project, align_dir, mojo, m_config, cores, mem, user,
     group) = parse_config(config_file)
    fq_dir = project_dir + project + '/' + align_dir + '/' + sample + '/TRIMMED_FQ/'
    out_dir = project_dir + project + '/' + align_dir + '/' + sample + '/MOJO/'
    os.mkdir(out_dir)
    loc = out_dir + sample + '.mojo_run.log'
    log(loc, date_time() + 'Made output directory ' + out_dir + '\n')
    log(loc, date_time() + 'Changing to fastq directory ' + fq_dir + '\n')
    os.chdir(fq_dir)
    run_mojo = mojo + ' --config ' + m_config + ' --sample_name ' + sample + ' --output_dir ' + out_dir + ' --fq1 ' \
               + fq1 + ' --fq2 ' + fq2 + ' --cores ' + cores + ' --mem ' + mem
    log(loc, date_time() + 'Running MOJO with command ' + run_mojo + '\n')
    # subprocess.call does not raise on a non-zero exit status,
    # so check the return code directly instead of wrapping in try/except
    ret = subprocess.call(run_mojo, shell=True)
    if ret != 0:
        sys.stderr.write(date_time() + 'MOJO failed!  Check logs in ' + loc +
                         '\n')
        return 1
    log(loc, date_time() + 'MOJO complete! Setting acls\n')
    check = set_acls(out_dir, user, group)
    if check == 0:
        log(loc,
            date_time() + 'Setting acls complete.  Pipeline complete!\n')
    else:
        log(loc, date_time() + 'Setting acls failed.  Check logs!\n')
    return 0
Example #54
0
def pre_report(mode, bam, sample, pos, config_file, ref_mnt):
    (samtools_tool, samtools_ref) = parse_config(config_file)
    samtools_ref = ref_mnt + '/' + samtools_ref
    create_pos_ref(pos)
    sys.stderr.write(date_time() + 'Creating mpileup with samtools\n')
    pre_rpt_cmd = samtools_tool + ' mpileup -D -d 500000 -l pos_list.txt -f ' + samtools_ref
    out = ''
    if mode == 'b':
        out = 'batch_pileup.txt'
        pre_rpt_cmd += ' -b ' + bam + ' > ' + out
    else:
        out = sample + '_pileup.txt'
        pre_rpt_cmd += ' ' + bam + ' > ' + out

    sys.stderr.write(date_time() + pre_rpt_cmd + "\n")
    # subprocess.call does not raise on a non-zero exit status, so check it directly
    ret = subprocess.call(pre_rpt_cmd, shell=True)
    if ret != 0:
        sys.stderr.write(date_time() + 'Pileup failed\n')
    cov = {}
    index = {}
    sys.stderr.write(date_time() + 'Parsing mpileup output\n')
    samp_list = parse_pileup(out, pos, sample, mode, cov, index)
    sys.stderr.write(
        date_time() +
        'Calculating means and standard deviations of base quality scores\n')
    calc_values(cov)
    sys.stderr.write(date_time() + 'Generating report\n')
    gen_report(cov, index, samp_list)
    sys.stderr.write(date_time() + 'Report complete\n')
    return 0
Example #55
0
def downsample_pipe(bam_list, config_file, depth):
    (samtools, threads) = parse_config(config_file)
    for bam in open(bam_list):
        sys.stderr.write(date_time() + 'Setting up for ' + bam)
        bam = bam.rstrip('\n')
        bam_root = bam.replace('.bam', '')
        bam_dir = os.path.dirname(bam)
        # clean up the sample name; these chained replacements are brittle but match current naming
        qc_root = bam_root.replace('BAMS', 'QC', 1)
        qc_root = qc_root.replace('.srt', '', 1)
        qc_root = qc_root.replace('.Aligned.toTranscriptome.out', '', 1)
        sys.stderr.write(date_time() + 'Calculating downsample fraction\n')
        frac = get_from_depth(qc_root, depth)
        # submit to job queue
        sys.stderr.write(date_time() + 'Submitting to queue ' + bam + '\n')
        downsample_bam(samtools, bam, frac, bam_dir, threads)
        # alternative sbatch submission, kept for reference:
        # cmd = ' '.join(('sbatch', '-c', threads, '--oversubscribe', downsample_bam, '-b ', bam, '-f', frac, '-o ',
        #                 bam_dir, '-t', threads, '-s', samtools))
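downsample_bam itself is assumed above. A minimal sketch using samtools' built-in subsampling (view -s keeps roughly the given fraction of reads), with the same argument order as the call site but an otherwise hypothetical body:

import os
import subprocess


def downsample_bam(samtools, bam, frac, out_dir, threads):
    # samtools view -s FRAC subsamples the bam; -@ adds extra threads
    out_bam = os.path.join(out_dir, os.path.basename(bam).replace('.bam', '.ds.bam'))
    cmd = samtools + ' view -@ ' + threads + ' -s ' + frac + ' -b ' + bam + ' > ' + out_bam
    return subprocess.call(cmd, shell=True)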
Example #56
0
def picard_rmdup(java_tool, picard_tool, picard_tmp, sample, log_dir, ram):
    picard_rmdup_cmd = java_tool + " -Xmx" + ram + "g -jar " + picard_tool + " MarkDuplicates CREATE_INDEX=true" \
                       + " TMP_DIR=" + picard_tmp + " REMOVE_DUPLICATES=true ASSUME_SORTED=true" \
                       + " MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500 INPUT=" + sample + ".srt.bam OUTPUT=" + sample \
                       + ".rmdup.srt.bam METRICS_FILE=" + sample + ".rmdup.srt.metrics VALIDATION_STRINGENCY=LENIENT" \
                       + " > " + log_dir + sample + ".picard.rmdup.pe.log 2>&1"
    log(log_dir + sample + ".picard.rmdup.pe.log",
        date_time() + picard_rmdup_cmd + "\n")
    call(picard_rmdup_cmd, shell=True)
Example #57
0
def get_bam_name(bnid, project_dir, project, align_dir):
    bam_dir = project_dir + project + '/' + align_dir + '/' + bnid + '/BAM/'
    bam = bam_dir + bnid + '.merged.final.bam'
    bai = bam_dir + bnid + '.merged.final.bai'
    f = 0
    if not os.path.isfile(bam):
        sys.stderr.write(date_time() + 'Bam not found in ' + bam_dir + '\n')
        f = 1
        return f, bam, bai
    if not os.path.isfile(bai):
        bai = bam_dir + bnid + '.merged.final.bam.bai'
        if not os.path.isfile(bai):
            sys.stderr.write(date_time() + 'Bam index file for ' + bnid +
                             ' not found!  Please index first\n')
            f = 1
            return f, bam, bai

    return f, bam, bai
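Callers are expected to check the returned flag before trusting the paths; a hypothetical use (IDs and directories made up):

(flag, bam, bai) = get_bam_name('2016-1234', '/cephfs/PROJECTS/', 'my_project', 'ALIGN')
if flag != 0:
    sys.exit(1)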