def run(self):
    """Run the mpileup phase for every configured region.

    For each entry of ``glv.conf.region_bed_list`` the expected output
    path is appended to ``glv.outlist.outfile['mpileup']``. Unless
    ``utl.progress_check`` reports that the phase is to be skipped, a
    ``bcftools mpileup | bcftools call | bcftools filter`` pipeline is
    executed through ``utl.try_exec`` and the result is passed to
    ``utl.tabix``.
    """
    mod_name = 'mpileup'
    out_dir = utl.mk_outdir(mod_name)

    # Output paths are recorded before the skip test, so the list is
    # populated even for regions that are not recomputed.
    glv.outlist.outfile[mod_name] = list()

    start = time.time()

    for region in glv.conf.region_bed_list:

        region_vcf = "{}/{}_{}.{}{}".format(
            out_dir, mod_name, region, 1, '.vcf.gz')
        glv.outlist.outfile[mod_name].append(region_vcf)
        log.debug("{}".format(region_vcf))

        if utl.progress_check(mod_name) == False:
            log.info("progress={} so skip {}.".format(
                glv.conf.progress, mod_name))
            continue

        log.info("go on {}".format(mod_name))

        # The three stages are assembled separately and piped together.
        stages = [
            "bcftools mpileup {} -O u -r {} -f {} {}".format(
                glv.conf.mpl_mpileup_param,
                region,
                glv.conf.ref_fasta,
                " ".join(glv.conf.bam_list)),
            "bcftools call {} -O u".format(
                glv.conf.mpl_call_param),
            "bcftools filter {} -O z --threads {} -o {}".format(
                glv.conf.mpl_filter_param,
                glv.conf.thread,
                region_vcf),
        ]
        pipeline = " | ".join(stages)

        # presumably sets aside an already existing output -- confirm
        # against utl.save_to_tmpfile
        utl.save_to_tmpfile(region_vcf)
        utl.try_exec(pipeline)
        utl.tabix(region_vcf)

    log.info("mpileup finished {}".format(
        utl.elapsed_time(time.time(), start)))
def _copy_ini_file(self):
    """Copy the ini file into the output directory as a backup.

    Reads ``self.ini_file_path`` and ``self.out_dir``. The destination
    path is first handed to ``utl.save_to_tmpfile`` and the copy itself
    is a shell ``cp`` run through ``utl.try_exec``.
    """
    # The backup keeps the ini file's own base name.
    backup_name = os.path.basename(self.ini_file_path)
    backup_path = "{}/{}".format(self.out_dir, backup_name)

    utl.save_to_tmpfile(backup_path)
    utl.try_exec("cp {} {}".format(self.ini_file_path, backup_path))
def run(self):
    """Concatenate the snpfilter and indelfilter outputs into one VCF.

    The result path is stored as the single entry of
    ``glv.outlist.outfile['concat']``. When the configured
    heterozygosity is ``'hetero'`` a plain ``bcftools concat`` writes
    the compressed output directly; otherwise the concat output is
    piped through ``bcftools view`` first. The result is then passed
    to ``utl.tabix``.
    """
    mod_name = 'concat'
    out_dir = utl.mk_outdir(mod_name)

    heterozygosity = glv.conf.heterozygosity
    run_label = os.path.basename(glv.conf.out_dir)

    merged_vcf = "{}/{}.{}.{}.{}{}".format(
        out_dir, mod_name, run_label,
        'SNP_INDEL', heterozygosity, '.vcf.gz')

    glv.outlist.outfile[mod_name] = [merged_vcf]
    log.debug("{}".format(merged_vcf))

    # SNP files first, then INDEL files, as one space separated list.
    source_vcfs = (glv.outlist.outfile['snpfilter']
                   + glv.outlist.outfile['indelfilter'])
    all_vcf = " ".join(source_vcfs)
    log.debug("{}".format(all_vcf))

    utl.save_to_tmpfile(merged_vcf)

    start = time.time()

    if heterozygosity == 'hetero':
        command = "bcftools concat {} -O z {} --threads {} -o {}".format(
            glv.conf.concat_hetero_param,
            all_vcf,
            glv.conf.thread,
            merged_vcf)
    else:
        concat_part = "bcftools concat {} -O v {} --threads {}".format(
            glv.conf.concat_nh_param,
            all_vcf,
            glv.conf.thread)
        view_part = "bcftools view {} -O z -o {}".format(
            glv.conf.concat_nh_view_param,
            merged_vcf)
        command = " | ".join([concat_part, view_part])

    utl.try_exec(command)
    utl.tabix(merged_vcf)

    log.info("concat finished {}".format(
        utl.elapsed_time(time.time(), start)))
def _make_bwaidx(self):
    """Create the BWA index of the reference fasta when it is missing.

    The file ``<glv.conf.ref_fasta>.bwt`` is used as the marker for an
    existing index. If it is absent, ``bwa index`` is run from inside
    ``glv.conf.ref_dir`` with the fasta's base name as index prefix.

    The working directory is always set back to ``glv.conf.cwd`` after
    the index command, including when the command raises, so a failed
    indexing run cannot leave the process inside ``ref_dir``.
    """
    bwaidx = "{}{}".format(glv.conf.ref_fasta, '.bwt')

    if os.path.isfile(bwaidx):
        log.debug("{} exist.".format(bwaidx))
        return

    bwaidx_title = os.path.basename(glv.conf.ref_fasta)
    cmd1 = "bwa index -p {} {}".format(bwaidx_title, glv.conf.ref_fasta)

    # The prefix is a bare file name, so the command has to run inside
    # ref_dir for the index files to land next to the fasta.
    os.chdir(glv.conf.ref_dir)
    try:
        utl.try_exec(cmd1)
        log.info("pwd {}".format(os.getcwd()))
    finally:
        os.chdir(glv.conf.cwd)
def run(self):
    """Filter, annotate and normalise the svaba INDEL calls per region.

    Inputs are the files listed in ``glv.outlist.outfile['svaba']``,
    paired in order with ``glv.conf.region_bed_list``. For each pair:

    1. the svaba VCF is rewritten so that, on data lines, columns after
       the ninth are kept only when they contain ``'/'``; the rewrite
       is bgzip-compressed and indexed;
    2. ``bcftools view | bcftools annotate`` produces the ``annote``
       file, ``bcftools norm`` produces the ``norm`` file, and the
       ``annote`` file and its index are removed;
    3. only when the configured heterozygosity equals ``'h**o'``, a
       further ``bcftools view`` produces the third file and the
       ``norm`` file and its index are removed.

    The path of the final file of each region is appended to
    ``glv.outlist.outfile['indelfilter']`` before the skip test.

    NOTE(review): the literal ``'h**o'`` is reproduced exactly as found;
    it looks like a masked value -- confirm against the heterozygosity
    settings actually accepted by the configuration.
    """
    mod_name = 'indelfilter'
    out_dir = utl.mk_outdir(mod_name)

    heterozygosity = glv.conf.heterozygosity

    glv.outlist.outfile[mod_name] = list()

    start = time.time()

    paired_inputs = zip(glv.outlist.outfile['svaba'],
                        glv.conf.region_bed_list)

    for input_file, region in paired_inputs:

        # Local copies of the svaba file: plain text and bgzipped.
        gz_name = os.path.basename(input_file)
        plain_name = re.sub(r"\.gz$", "", gz_name)
        cleaned_vcf = "{}/{}".format(out_dir, plain_name)
        cleaned_vcf_gz = "{}/{}".format(out_dir, gz_name)

        region_stem = "{}/{}_{}".format(out_dir, mod_name, region)
        annotated_vcf = "{}.{}{}".format(region_stem, 'annote', '.vcf.gz')
        normalized_vcf = "{}.{}{}".format(region_stem, 'norm', '.vcf.gz')
        selected_vcf = "{}.{}{}".format(region_stem, 'h**o', '.vcf.gz')

        final_vcf = (selected_vcf if heterozygosity == 'h**o'
                     else normalized_vcf)
        glv.outlist.outfile[mod_name].append(final_vcf)

        if utl.progress_check(mod_name) == False:
            log.info("progress={} so skip {}.".format(
                glv.conf.progress, mod_name))
            continue

        log.info("go on {}".format(mod_name))

        # Workaround for a svaba output problem: header lines pass
        # through, data lines lose every column past the ninth that
        # has no '/' in it.
        with open(cleaned_vcf, mode='w') as dst, \
                gzip.open(input_file, "rt") as src:
            for raw in src:
                record = raw.strip()
                if not record.startswith('#'):
                    columns = record.split('\t')
                    kept = columns[:9] + [
                        col for col in columns[9:] if '/' in col]
                    record = '\t'.join(kept)
                dst.write("{}\n".format(record))

        utl.save_to_tmpfile(cleaned_vcf_gz)
        utl.try_exec("bgzip -@ {} {}".format(glv.conf.thread, cleaned_vcf))
        utl.tabix(cleaned_vcf_gz)

        # view (region restricted) piped into annotate;
        # --threads is only used with -O z|b
        view_part = "bcftools view {} -O v -r {} {}".format(
            glv.conf.indf_view1_param, region, cleaned_vcf_gz)
        annotate_part = "bcftools annotate {} -O z --threads {} -o {}".format(
            glv.conf.indf_annotate_param, glv.conf.thread, annotated_vcf)
        utl.try_exec(" | ".join([view_part, annotate_part]))
        utl.tabix(annotated_vcf)

        norm_cmd = "bcftools norm {} -O z --threads {} -f {} -o {} {}".format(
            glv.conf.indf_norm_param,
            glv.conf.thread,
            glv.conf.ref_fasta,
            normalized_vcf,
            annotated_vcf)
        utl.try_exec(norm_cmd)
        utl.tabix(normalized_vcf)

        # The annotate stage output is no longer needed.
        annotated_tbi = "{}.tbi".format(annotated_vcf)
        for stale in (annotated_vcf, annotated_tbi):
            os.remove(stale)
        log.info("remove {} {}".format(annotated_vcf, annotated_tbi))

        if heterozygosity != 'h**o':
            continue

        select_cmd = "bcftools view {} -O z --threads {} -r {} -o {} {}".format(
            glv.conf.indf_view2_param,
            glv.conf.thread,
            region,
            selected_vcf,
            normalized_vcf)
        utl.try_exec(select_cmd)
        utl.tabix(selected_vcf)

        normalized_tbi = "{}.tbi".format(normalized_vcf)
        for stale in (normalized_vcf, normalized_tbi):
            os.remove(stale)
        log.info("remove {} {}".format(normalized_vcf, normalized_tbi))

    log.info("indelfilter finished {}".format(
        utl.elapsed_time(time.time(), start)))
def prepare_ref(self):
    """Stage the user's reference fasta in ``glv.conf.ref_dir`` and index it.

    Steps, in order:

    1. Resolve ``glv.conf.ref`` to an absolute path
       (``glv.conf.ref_fasta_user``); a relative path is joined onto
       ``glv.conf.cwd``. The process exits with status 1 when the file
       does not exist.
    2. Create a symlink ``<basename>.org_slink`` (or ``.org_slink.gz``
       for a ``.gz`` input) in ``ref_dir`` pointing at the user's file.
    3. Provide the working fasta ``glv.conf.ref_fasta``: a symlink to
       the user's file for uncompressed input, or the output of
       ``bgzip -cd`` (decompression to stdout) for ``.gz`` input.
    4. Create ``<ref_fasta>.fai`` with ``samtools faidx`` when missing
       and store its path in ``glv.conf.ref_fasta_fai``.
    5. Build the BWA index through ``self._make_bwaidx()``.

    Existing files are reused at every step.

    Returns:
        self
    """
    # user's fasta: convert relative path to absolute path based on cwd
    if glv.conf.ref.startswith('/'):
        glv.conf.ref_fasta_user = glv.conf.ref
    else:
        # The previous format string had no placeholders, so cwd and
        # the relative path were never actually combined.
        glv.conf.ref_fasta_user = os.path.join(glv.conf.cwd, glv.conf.ref)

    log.info("glv.conf.ref_fasta_user {}".format(glv.conf.ref_fasta_user))

    # ref_fasta_user: existence confirmation
    if os.path.isfile(glv.conf.ref_fasta_user):
        log.info("{} found.".format(glv.conf.ref_fasta_user))
    else:
        log.info("{} not found. exit.".format(glv.conf.ref_fasta_user))
        sys.exit(1)

    basename_user = os.path.basename(glv.conf.ref_fasta_user)
    without_ext, ext = os.path.splitext(glv.conf.ref_fasta_user)
    basename_without_ext = os.path.basename(without_ext)
    is_gzipped = (ext == '.gz')

    # Symlink to the user's original file, and the name of the working
    # fasta; for .gz input the working fasta drops the .gz suffix.
    if is_gzipped:
        glv.conf.ref_fasta_slink_system = "{}/{}{}".format(
            glv.conf.ref_dir, basename_user, '.org_slink.gz')
        glv.conf.ref_fasta = "{}/{}".format(
            glv.conf.ref_dir, basename_without_ext)
    else:
        glv.conf.ref_fasta_slink_system = "{}/{}{}".format(
            glv.conf.ref_dir, basename_user, '.org_slink')
        glv.conf.ref_fasta = "{}/{}".format(
            glv.conf.ref_dir, basename_user)

    if os.path.isfile(glv.conf.ref_fasta_slink_system):
        log.info("{} exist.".format(glv.conf.ref_fasta_slink_system))
    else:
        log.info("os.symlink {} {}.".format(
            glv.conf.ref_fasta_user, glv.conf.ref_fasta_slink_system))
        os.symlink(glv.conf.ref_fasta_user,
                   glv.conf.ref_fasta_slink_system)

    log.info("ext ({}).".format(ext))

    if is_gzipped:
        # Decompress the original into the working fasta.
        cmd1 = 'bgzip -cd -@ {} {} > {}'.format(
            glv.conf.thread,
            glv.conf.ref_fasta_slink_system,
            glv.conf.ref_fasta)

        if os.path.isfile(glv.conf.ref_fasta):
            log.debug("{} exist.".format(glv.conf.ref_fasta))
        else:
            log.debug("{} not exist. do cmd={}".format(
                glv.conf.ref_fasta, cmd1))
            utl.try_exec(cmd1)
    else:
        # Uncompressed input is used in place through a symlink.
        if os.path.isfile(glv.conf.ref_fasta):
            log.info("symlink exist {}".format(glv.conf.ref_fasta))
        else:
            os.symlink(glv.conf.ref_fasta_user, glv.conf.ref_fasta)
            log.info("symlink {} {}".format(
                glv.conf.ref_fasta_user, glv.conf.ref_fasta))

    # make fai file
    cmd2 = 'samtools faidx {}'.format(glv.conf.ref_fasta)
    glv.conf.ref_fasta_fai = "{}{}".format(glv.conf.ref_fasta, '.fai')

    if os.path.isfile(glv.conf.ref_fasta_fai):
        log.debug("{} exist.".format(glv.conf.ref_fasta_fai))
    else:
        log.debug("{} not exist. do {}".format(
            glv.conf.ref_fasta_fai, cmd2))
        utl.try_exec(cmd2)

    self._make_bwaidx()

    return self
def run(self):
    """Run svaba for every configured region and compress its VCFs.

    The process changes into the phase's output directory for the
    duration of the loop and changes back to ``glv.conf.cwd`` at the
    end. For each region the expected ``svaba.indel.vcf.gz`` path is
    appended to ``glv.outlist.outfile['svaba']``; unless
    ``utl.progress_check`` reports a skip, ``svaba run`` is executed
    and every ``*.vcf`` returned by ``utl.check_for_files`` for the
    output directory is bgzip-compressed and passed to ``utl.tabix``.
    """
    mod_name = 'svaba'
    out_dir = utl.mk_outdir(mod_name)

    glv.outlist.outfile[mod_name] = list()

    start = time.time()
    os.chdir(out_dir)

    for region in glv.conf.region_bed_list:

        title = "{}/indel_{}".format(out_dir, region)

        # normalize option: one "-n <bam>" per entry, or nothing
        if len(glv.conf.svaba_normalize_bams_list) == 0:
            norm = ''
        else:
            norm = "-n " + " -n ".join(glv.conf.svaba_normalize_bams_list)

        # The registered output is the same with or without the option.
        glv.outlist.outfile[mod_name].append(
            "{}.{}{}".format(title, 'svaba.indel.vcf', '.gz'))

        log.debug("{}".format(norm))

        if utl.progress_check(mod_name) == False:
            log.info("progress={} so skip {}.".format(
                glv.conf.progress, mod_name))
            continue

        log.info("go on {}".format(mod_name))

        bam_args = " -t ".join(glv.conf.bam_list)
        svaba_cmd = "svaba run -t {} -G {} -k {} {} -p {} {} -a {}".format(
            bam_args,
            glv.conf.ref_fasta,
            region,
            norm,
            glv.conf.thread,
            glv.conf.svb_svaba_param,
            title)
        utl.try_exec(svaba_cmd)

        # Compress and index whatever plain VCFs are in the directory.
        target_vcfs = utl.check_for_files("{}/*.vcf".format(out_dir))
        log.debug("{}".format(target_vcfs))

        for t_vcf in target_vcfs:
            utl.try_exec("bgzip -@ {} {}".format(glv.conf.thread, t_vcf))
            utl.tabix("{}{}".format(t_vcf, '.gz'))

    os.chdir(glv.conf.cwd)

    log.info("svaba finished {}".format(
        utl.elapsed_time(time.time(), start)))
def run(self):
    """Filter and annotate the mpileup SNP calls per region.

    Inputs are the files listed in ``glv.outlist.outfile['mpileup']``,
    paired in order with ``glv.conf.region_bed_list``. For each pair a
    ``bcftools view | bcftools annotate`` pipeline writes the
    ``annote`` file. Only when the configured heterozygosity equals
    ``'h**o'`` a second ``bcftools view`` writes the second file, after
    which the ``annote`` file and its index are removed.

    The path of the final file of each region is appended to
    ``glv.outlist.outfile['snpfilter']`` before the skip test.

    NOTE(review): the literal ``'h**o'`` is reproduced exactly as found;
    it looks like a masked value -- confirm against the heterozygosity
    settings actually accepted by the configuration.
    """
    mod_name = 'snpfilter'
    out_dir = utl.mk_outdir(mod_name)

    heterozygosity = glv.conf.heterozygosity

    glv.outlist.outfile[mod_name] = list()

    start = time.time()

    paired_inputs = zip(glv.outlist.outfile['mpileup'],
                        glv.conf.region_bed_list)

    for input_file, region in paired_inputs:

        region_stem = "{}/{}_{}".format(out_dir, mod_name, region)
        annotated_vcf = "{}.{}{}".format(region_stem, 'annote', '.vcf.gz')
        selected_vcf = "{}.{}{}".format(region_stem, 'h**o', '.vcf.gz')

        final_vcf = (selected_vcf if heterozygosity == 'h**o'
                     else annotated_vcf)
        glv.outlist.outfile[mod_name].append(final_vcf)

        if utl.progress_check(mod_name) == False:
            log.info("progress={} so skip {}.".format(
                glv.conf.progress, mod_name))
            continue

        log.info("go on {}".format(mod_name))

        # view (region restricted) piped into annotate
        view_part = "bcftools view {} -O v -r {} {}".format(
            glv.conf.snpf_view1_param, region, input_file)
        annotate_part = "bcftools annotate {} -O z --threads {} -o {}".format(
            glv.conf.snpf_annotate_param, glv.conf.thread, annotated_vcf)
        utl.try_exec(" | ".join([view_part, annotate_part]))
        utl.tabix(annotated_vcf)

        if heterozygosity != 'h**o':
            continue

        select_cmd = "bcftools view {} -O z --threads {} -r {} -o {} {}".format(
            glv.conf.snpf_view2_param,
            glv.conf.thread,
            region,
            selected_vcf,
            annotated_vcf)

        utl.save_to_tmpfile(selected_vcf)
        utl.try_exec(select_cmd)
        utl.tabix(selected_vcf)

        # The annotate stage output is no longer needed.
        annotated_tbi = "{}.tbi".format(annotated_vcf)
        for stale in (annotated_vcf, annotated_tbi):
            os.remove(stale)
        log.info("remove {} {}".format(annotated_vcf, annotated_tbi))

    log.info("snpfilter finished {}".format(
        utl.elapsed_time(time.time(), start)))