def preprocess_sam(sfsam):
    """Filter a SAM file and turn the result into a sorted, indexed BAM.

    ``filter_sam`` is called with a mapping-quality threshold of 30 and writes
    the surviving pairs to ``<sfsam>_temp.sam``.  That file is then run through
    ``samtools view`` (to ``<sfsam>.bam``), ``samtools sort`` (to
    ``<sfsam>.sort.bam``) and ``samtools index``.  Finally the filtered SAM and
    the unsorted BAM are deleted.

    Args:
        sfsam: Path of the SAM file to process.
    """
    samtools = get_samtools_path()

    # Filter out the unqualified pairs.
    filtered_sam = "{0}_temp.sam".format(sfsam)
    min_mapq = 30
    filter_sam(sfsam, min_mapq, filtered_sam)

    # Convert -> sort -> index; each step consumes the previous step's output.
    pipeline = (
        "{0} view -h -S -b {1}_temp.sam > {2}.bam".format(
            samtools, sfsam, sfsam),
        "{0} sort {1}.bam -o {2}.sort.bam".format(samtools, sfsam, sfsam),
        "{0} index {1}.sort.bam".format(samtools, sfsam),
    )
    for command in pipeline:
        print_command(command)
        Popen(command, shell=True, stdout=PIPE).communicate()

    # Remove the intermediates; ``sfsam`` itself is not removed by this
    # function.
    for leftover in (filtered_sam, "{0}.bam".format(sfsam)):
        os.remove(leftover)
def align_read_to_contigs(file_list):
    """Align paired read files to ``contigs.fa`` and build sorted, indexed BAMs.

    The contig FASTA ``<output folder>contigs.fa`` is indexed with
    ``samtools faidx`` and, if no ``.bwt`` file exists for it yet, with
    ``bwa index``.  Each consecutive pair of entries in ``file_list`` is then
    aligned with ``bwa mem`` (skipped when the target SAM already exists),
    filtered through ``filter_sam`` with a mapping-quality threshold of 30,
    and converted into ``<contigs>_<n>.sort.bam`` plus its index, where ``n``
    is the zero-based pair number.

    Args:
        file_list: Sequence whose entries hold a read-file path at index 0.
            Entries 0 and 1 form the first left/right pair, 2 and 3 the
            second, and so on.

    Returns:
        -1 when ``file_list`` has an odd number of entries (nothing is
        aligned in that case); otherwise None.
    """
    OUTPUT_FOLDER = get_output_folder()
    BWA_PATH = get_bwa_path()
    THREADS = get_threads_num()
    SAMTOOLS_PATH = get_samtools_path()

    sall = OUTPUT_FOLDER + "contigs.fa"
    cmd = "{0} faidx {1}".format(SAMTOOLS_PATH, sall)
    print_command(cmd)
    Popen(cmd, shell=True, stdout=PIPE).communicate()

    # Check whether the contigs are already indexed; if not, create the index.
    # ``bwa index`` is invoked on ``sall`` without a prefix option, so its
    # files are named after ``sall``.  ``sall`` already contains the output
    # folder, so the folder must not be prepended a second time here.
    bwa_index_path = sall + ".bwt"
    if not os.path.exists(bwa_index_path):
        cmd = "{0} index {1}".format(BWA_PATH, sall)
        print_command(cmd)
        Popen(cmd, shell=True, stdout=PIPE).communicate()

    nfiles = len(file_list)
    if nfiles % 2 != 0:
        print("Something wrong with the file list, it's not paired!!!")
        return -1

    # j numbers the pairs, i is the position of the left read file.
    for j, i in enumerate(range(0, nfiles, 2)):
        sfleft_reads = file_list[i][0]
        sfright_reads = file_list[i + 1][0]

        sfsam = "{0}_{1}.sam".format(sall, j)
        # Reuse an alignment that is already on disk.
        if not os.path.exists(sfsam):
            cmd = "{0} mem -t {1} {2} {3} {4} > {5}".format(
                BWA_PATH, THREADS, sall, sfleft_reads, sfright_reads, sfsam)
            print_command(cmd)
            Popen(cmd, shell=True, stdout=PIPE).communicate()

        # Filter out the unqualified pairs.
        sfsam_temp = "{0}_{1}_temp.sam".format(sall, j)
        mapq = 30
        filter_sam(sfsam, mapq, sfsam_temp)

        # Convert -> sort -> index the filtered alignment.
        post_filter_cmds = (
            "{0} view -h -S -b {1}_{2}_temp.sam > {3}_{4}.bam".format(
                SAMTOOLS_PATH, sall, j, sall, j),
            "{0} sort {1}_{2}.bam -o {3}_{4}.sort.bam".format(
                SAMTOOLS_PATH, sall, j, sall, j),
            "{0} index {1}_{2}.sort.bam".format(SAMTOOLS_PATH, sall, j),
        )
        for cmd in post_filter_cmds:
            print_command(cmd)
            Popen(cmd, shell=True, stdout=PIPE).communicate()
def remove_duplicate_contained(fcontig, foutput, cutoff, rm_contained):
    """Remove duplicate or contained contigs from a contig FASTA file.

    The refiner tool is first run with ``-U`` on ``fcontig``, writing back to
    ``fcontig``.  The contigs are then aligned against themselves with
    ``bwa mem -a`` and the alignment is converted to a sorted, indexed BAM.
    The refiner is run a second time on that BAM to produce ``foutput``, and
    the index and alignment files created along the way are deleted.

    Args:
        fcontig: Path of the contig FASTA file (rewritten by the ``-U`` step).
        foutput: Output path handed to the final refiner run (``-o``).
        cutoff: Value handed to the final refiner run (``-c``).
        rm_contained: Selects the final refiner invocation: ``0`` uses
            ``-P ... -g``, ``1`` uses ``-P`` without ``-g``, and any other
            value uses ``-K``.
    """
    bwa = get_bwa_path()
    samtools = get_samtools_path()
    refiner = get_refiner_path()

    def run(command):
        # Run through the shell and wait for completion.
        Popen(command, shell=True, stdout=PIPE).communicate()

    # Remove duplicate or contained contigs: prepare the self-alignment.
    preparation = [
        "{0} faidx {1}".format(samtools, fcontig),
        "{0} -U -r {1} -o {2}".format(refiner, fcontig, fcontig),
        # fcontig was rewritten by the previous step, so index it again.
        "{0} faidx {1}".format(samtools, fcontig),
        "{0} index {1}".format(bwa, fcontig),
        "{0} mem -a {1} {2} > {3}.itself.sam".format(
            bwa, fcontig, fcontig, fcontig),
        "{0} view -h -S -b {1}.itself.sam > {2}.itself.bam".format(
            samtools, fcontig, fcontig),
        "{0} sort {1}.itself.bam -o {2}.itself.sort.bam".format(
            samtools, fcontig, fcontig),
        "{0} index {1}.itself.sort.bam".format(samtools, fcontig),
    ]
    for command in preparation:
        run(command)

    # Pick the refiner invocation according to rm_contained.
    if rm_contained == 0:
        template = "{0} -P -b {1}.itself.sort.bam -r {2} -o {3} -c {4} -g"
    elif rm_contained == 1:
        template = "{0} -P -b {1}.itself.sort.bam -r {2} -o {3} -c {4}"
    else:
        template = "{0} -K -b {1}.itself.sort.bam -r {2} -o {3} -c {4}"
    refine_cmd = template.format(refiner, fcontig, fcontig, foutput, cutoff)
    print_command(refine_cmd)
    run(refine_cmd)

    # Clean all the temporary files, in the same order as they are listed.
    temp_suffixes = (
        ".sa",
        ".pac",
        ".bwt",
        ".ann",
        ".amb",
        ".itself.sam",
        ".itself.bam",
        ".itself.sort.bam",
        ".itself.sort.bam.bai",
        ".fai",
    )
    for suffix in temp_suffixes:
        run("rm {0}".format(fcontig + suffix))
def dispath_collect_jobs(self):
    """Build one collection command per line of ``self.sf_fai`` and run them.

    For every line of ``self.sf_fai`` the first whitespace-separated field is
    used as the region argument of ``samtools view`` on ``self.sf_bam``; the
    output is piped into ``collect_discordant_low_mapq_reads.py`` together
    with ``self.working_folder``.  The commands are executed by
    ``run_cmd_discordant`` on a thread pool of ``self.nthreads`` workers, and
    the method returns once the pool has been closed and joined.

    NOTE(review): ``self.sf_fai`` is presumably a ``samtools faidx`` index, so
    the first field would be a sequence name -- confirm against the caller.
    """
    samtools = get_samtools_path()
    template = ('{0} view {1} "{2}" | '
                'python collect_discordant_low_mapq_reads.py {3} -')

    with open(self.sf_fai) as fai_handle:
        jobs = [
            template.format(samtools, self.sf_bam, record.split()[0],
                            self.working_folder)
            for record in fai_handle
        ]

    workers = ThreadPool(self.nthreads)
    workers.map(run_cmd_discordant, jobs)
    workers.close()
    workers.join()
def filter_sam(sfsam, mapq, sfsam_output):
    """Drop unmapped reads from a SAM file and filter pairs by mapping quality.

    Steps, in order:

    1. ``samtools view -F 4`` writes the mapped reads of ``sfsam`` to
       ``<sfsam>_intermediate.sam``, which is also converted to
       ``<sfsam>_intermediate.sam.bam``.  ``sfsam`` is then deleted.
    2. The BAM is sorted and indexed as ``<sfsam>_for_coverage.sorted.bam``
       (kept, for calculating coverage).
    3. The BAM is sorted by read name and written back over
       ``<sfsam>_intermediate.sam``.
    4. ``filter_PE_by_map_quality`` filters that file with ``mapq`` into
       ``sfsam_output``.
    5. The intermediate SAM and the name-sorted BAM are deleted;
       ``<sfsam>_intermediate.sam.bam`` is left on disk.

    Args:
        sfsam: Path of the input SAM file; it is removed by this function.
        mapq: Mapping-quality threshold passed to ``filter_PE_by_map_quality``.
        sfsam_output: Path of the filtered SAM file to produce.
    """
    samtools = get_samtools_path()

    def run(command):
        # Run through the shell and wait for completion.
        Popen(command, shell=True, stdout=PIPE).communicate()

    print("First, filter out those unmapped reads in {0} ...".format(sfsam))
    # First, only keep the mapped reads and convert them to BAM.
    mapped_sam = "{0}_intermediate.sam".format(sfsam)
    run("{0} view -h -S -F 4 {1} > {2}".format(samtools, sfsam, mapped_sam))
    run("{0} view -h -S -b {1} > {2}.bam".format(
        samtools, mapped_sam, mapped_sam))
    os.remove(sfsam)

    # Keep the mapped reads (for calculating coverage): sort and index.
    coverage_prefix = "{0}_for_coverage".format(sfsam)
    print("Sort filtered {0} ...".format(mapped_sam))
    run("{0} sort {1}.bam -o {2}.sorted.bam".format(
        samtools, mapped_sam, coverage_prefix))
    run("{0} index {1}.sorted.bam".format(samtools, coverage_prefix))

    print("Sort filtered {0} by read name ...".format(sfsam))
    # Sort the BAM by read name ...
    run("{0} sort -n {1}.bam -o {2}.sortbyname.bam".format(
        samtools, mapped_sam, mapped_sam))
    # ... then convert back to SAM, overwriting the intermediate file.
    run("{0} view -h {1}.sortbyname.bam > {2}".format(
        samtools, mapped_sam, mapped_sam))

    print("Filter out pairs that mapping quality are low, or only one in pair is qualified....")
    # Then, filter through mapping quality.
    filter_PE_by_map_quality(mapped_sam, mapq, sfsam_output)

    # Remove useless files; the intermediate ".bam" is deliberately kept.
    os.remove(mapped_sam)
    os.remove("{0}.sortbyname.bam".format(mapped_sam))