def preprocess_sam(sfsam):
    """Filter a SAM file and turn the result into a sorted, indexed BAM.

    Steps, in order:
      1. ``filter_sam`` is called with a mapping-quality threshold of 30 and
         writes ``<sfsam>_temp.sam``.
      2. ``samtools view`` converts that file to ``<sfsam>.bam``.
      3. ``samtools sort`` writes ``<sfsam>.sort.bam``.
      4. ``samtools index`` indexes the sorted BAM.

    The filtered SAM and the unsorted BAM are deleted at the end. Exit
    statuses of the samtools commands are not inspected.

    Args:
        sfsam: Path of the SAM file to process; also used as the prefix of
            every file produced here.
    """
    samtools = get_samtools_path()
    min_mapq = 30

    filtered_sam = "{0}_temp.sam".format(sfsam)
    unsorted_bam = "{0}.bam".format(sfsam)

    # Drop the unqualified pairs before any BAM is built.
    filter_sam(sfsam, min_mapq, filtered_sam)

    steps = (
        "{0} view -h -S -b {1} > {2}".format(samtools, filtered_sam,
                                             unsorted_bam),
        "{0} sort {1} -o {2}.sort.bam".format(samtools, unsorted_bam, sfsam),
        "{0} index {1}.sort.bam".format(samtools, sfsam),
    )
    for step in steps:
        print_command(step)
        Popen(step, shell=True, stdout=PIPE).communicate()

    # Clean up the intermediates; this function itself does not delete
    # sfsam.
    os.remove(filtered_sam)
    os.remove(unsorted_bam)
def align_read_to_contigs(file_list):
    """Align paired read files to ``contigs.fa`` and build sorted BAMs.

    The reference is ``<output folder>contigs.fa``; the folder string is
    concatenated directly, so it presumably ends with a path separator
    (TODO confirm against ``get_output_folder``). The reference is indexed
    with ``samtools faidx`` and, when ``<reference>.bwt`` does not exist
    yet, with ``bwa index``.

    For the j-th pair (j starting at 0) the function:
      * runs ``bwa mem`` into ``<reference>_<j>.sam`` unless that file
        already exists,
      * calls ``filter_sam`` with a mapping-quality threshold of 30 to write
        ``<reference>_<j>_temp.sam``,
      * converts, sorts and indexes the result as ``<reference>_<j>.bam``,
        ``<reference>_<j>.sort.bam`` and its index.

    Exit statuses of the external commands are not inspected.

    Args:
        file_list: Sequence with an even number of entries. Entries ``2k``
            and ``2k + 1`` form one pair (left, right); element ``0`` of
            each entry is used as the reads file path.

    Returns:
        ``-1`` when ``file_list`` has an odd number of entries (reported
        after the reference has been indexed); ``None`` otherwise.
    """
    OUTPUT_FOLDER = get_output_folder()
    BWA_PATH = get_bwa_path()
    THREADS = get_threads_num()
    SAMTOOLS_PATH = get_samtools_path()

    sall = OUTPUT_FOLDER + "contigs.fa"
    cmd = "{0} faidx {1}".format(SAMTOOLS_PATH, sall)
    print_command(cmd)
    Popen(cmd, shell=True, stdout=PIPE).communicate()

    # Only build the bwa index when it is missing. `sall` already contains
    # the output folder and `bwa index` writes its files using the reference
    # path as prefix, so the index lives at "<sall>.bwt". Prefixing the
    # folder a second time made this check fail and forced a rebuild on
    # every call.
    bwa_index_path = "{0}.bwt".format(sall)
    if not os.path.exists(bwa_index_path):
        cmd = "{0} index {1}".format(BWA_PATH, sall)
        print_command(cmd)
        Popen(cmd, shell=True, stdout=PIPE).communicate()

    nfiles = len(file_list)
    if nfiles % 2 != 0:
        # Single parenthesised argument: prints identically on Python 2
        # and 3.
        print("Something wrong with the file list, it's not paired!!!")
        return -1

    mapq = 30
    for j in range(nfiles // 2):
        sfleft_reads = file_list[2 * j][0]
        sfright_reads = file_list[2 * j + 1][0]

        sfsam = "{0}_{1}.sam".format(sall, j)
        # Reuse an existing alignment for this pair instead of re-running
        # bwa mem.
        if not os.path.exists(sfsam):
            cmd = "{0} mem -t {1} {2} {3} {4} > {5}".format(
                BWA_PATH, THREADS, sall, sfleft_reads, sfright_reads, sfsam)
            print_command(cmd)
            Popen(cmd, shell=True, stdout=PIPE).communicate()

        # Filter out the unqualified pairs.
        sfsam_temp = "{0}_{1}_temp.sam".format(sall, j)
        filter_sam(sfsam, mapq, sfsam_temp)

        # SAM -> BAM -> coordinate-sorted BAM -> index.
        cmd = "{0} view -h -S -b {1} > {2}_{3}.bam".format(
            SAMTOOLS_PATH, sfsam_temp, sall, j)
        print_command(cmd)
        Popen(cmd, shell=True, stdout=PIPE).communicate()

        cmd = "{0} sort {1}_{2}.bam -o {1}_{2}.sort.bam".format(
            SAMTOOLS_PATH, sall, j)
        print_command(cmd)
        Popen(cmd, shell=True, stdout=PIPE).communicate()

        cmd = "{0} index {1}_{2}.sort.bam".format(SAMTOOLS_PATH, sall, j)
        print_command(cmd)
        Popen(cmd, shell=True, stdout=PIPE).communicate()
Beispiel #3
0
def remove_duplicate_contained(fcontig, foutput, cutoff, rm_contained):
    """Run the refiner on a contig file using a self-alignment of the contigs.

    Sequence of external commands:
      1. ``samtools faidx`` on ``fcontig``, then the refiner with ``-U``,
         reading from and writing to ``fcontig``.
      2. ``samtools faidx`` and ``bwa index`` on ``fcontig``, then
         ``bwa mem -a`` of ``fcontig`` against itself into
         ``<fcontig>.itself.sam``.
      3. Conversion to ``<fcontig>.itself.bam``, sorting into
         ``<fcontig>.itself.sort.bam`` and indexing.
      4. The refiner on the sorted BAM, writing ``foutput``.
      5. ``rm`` of every index / intermediate file created above.

    Exit statuses of the commands are not inspected.

    Args:
        fcontig: Path of the contig FASTA file (rewritten in step 1).
        foutput: Output path handed to the refiner via ``-o`` in step 4.
        cutoff: Value handed to the refiner via ``-c``.
        rm_contained: Selects the refiner flags in step 4: ``0`` gives
            ``-P ... -g``, ``1`` gives ``-P``, anything else gives ``-K``.
    """
    bwa = get_bwa_path()
    samtools = get_samtools_path()
    refiner = get_refiner_path()

    def run(command):
        # Run through the shell and wait for completion.
        Popen(command, shell=True, stdout=PIPE).communicate()

    # Remove duplicate or contained contigs (first refiner pass).
    run("{0} faidx {1}".format(samtools, fcontig))
    run("{0} -U -r {1} -o {1}".format(refiner, fcontig))

    # Re-index the rewritten contigs and align them against themselves.
    run("{0} faidx {1}".format(samtools, fcontig))
    run("{0} index {1}".format(bwa, fcontig))
    run("{0} mem -a {1} {1} > {1}.itself.sam".format(bwa, fcontig))

    run("{0} view -h -S -b {1}.itself.sam > {1}.itself.bam".format(
        samtools, fcontig))
    run("{0} sort {1}.itself.bam -o {1}.itself.sort.bam".format(
        samtools, fcontig))
    run("{0} index {1}.itself.sort.bam".format(samtools, fcontig))

    # Pick the refiner mode flag and optional trailing flag.
    if rm_contained == 0:
        mode, trailer = "-P", " -g"
    elif rm_contained == 1:
        mode, trailer = "-P", ""
    else:
        mode, trailer = "-K", ""
    refine_cmd = "{0} {1} -b {2}.itself.sort.bam -r {2} -o {3} -c {4}{5}".format(
        refiner, mode, fcontig, foutput, cutoff, trailer)
    print_command(refine_cmd)
    run(refine_cmd)

    # Clean all the temporary files, one `rm` per file.
    leftovers = (
        ".sa",
        ".pac",
        ".bwt",
        ".ann",
        ".amb",
        ".itself.sam",
        ".itself.bam",
        ".itself.sort.bam",
        ".itself.sort.bam.bai",
        ".fai",
    )
    for suffix in leftovers:
        run("rm {0}{1}".format(fcontig, suffix))
Beispiel #4
0
    def dispath_collect_jobs(self):
        """Build and run one shell pipeline per line of ``self.sf_fai``.

        For every line of the ``.fai`` file, the first whitespace-separated
        field is passed (quoted) as the region to ``samtools view`` on
        ``self.sf_bam``; the output is piped into
        ``collect_discordant_low_mapq_reads.py`` together with
        ``self.working_folder``. The command strings are then handed to
        ``run_cmd_discordant`` through a ``ThreadPool`` of
        ``self.nthreads`` workers, and the call waits for all of them.
        """
        samtools = get_samtools_path()
        cmd_template = "{0} view {1} \"{2}\" | python collect_discordant_low_mapq_reads.py {3} -"

        with open(self.sf_fai) as fai_handle:
            jobs = [
                cmd_template.format(samtools, self.sf_bam, record.split()[0],
                                    self.working_folder)
                for record in fai_handle
            ]

        workers = ThreadPool(self.nthreads)
        workers.map(run_cmd_discordant, jobs)
        workers.close()
        workers.join()
Beispiel #5
0
def filter_sam(sfsam, mapq, sfsam_output):
    """Filter a SAM file down to qualified pairs and write ``sfsam_output``.

    Steps, in order:
      1. ``samtools view -F 4`` writes ``<sfsam>_intermediate.sam``, which
         is also converted to ``<sfsam>_intermediate.sam.bam``.
      2. The input file ``sfsam`` is deleted.
      3. The intermediate BAM is sorted into
         ``<sfsam>_for_coverage.sorted.bam`` and indexed.
      4. The intermediate BAM is sorted by read name and converted back to
         SAM, overwriting ``<sfsam>_intermediate.sam``.
      5. ``filter_PE_by_map_quality`` is called on that SAM with ``mapq``
         and ``sfsam_output``.
      6. The intermediate SAM and the name-sorted BAM are deleted.

    ``<sfsam>_intermediate.sam.bam`` and the coverage BAM (with its index)
    are left on disk. Exit statuses of the samtools commands are not
    inspected.

    Args:
        sfsam: Path of the input SAM file (removed in step 2).
        mapq: Threshold forwarded to ``filter_PE_by_map_quality``.
        sfsam_output: Output path forwarded to ``filter_PE_by_map_quality``.
    """
    samtools = get_samtools_path()

    def shell(command):
        # Run through the shell and wait for completion.
        Popen(command, shell=True, stdout=PIPE).communicate()

    mapped_sam = "{0}_intermediate.sam".format(sfsam)
    mapped_bam = "{0}.bam".format(mapped_sam)
    name_sorted_bam = "{0}.sortbyname.bam".format(mapped_sam)
    coverage_prefix = "{0}_for_coverage".format(sfsam)

    print("First, filter out those unmapped reads in {0} ...".format(sfsam))
    # Keep only the reads without the "unmapped" flag, then make a BAM copy.
    shell("{0} view -h -S -F 4 {1} > {2}".format(samtools, sfsam, mapped_sam))
    shell("{0} view -h -S -b {1} > {2}".format(samtools, mapped_sam,
                                               mapped_bam))
    os.remove(sfsam)

    # Keep the mapped reads (for calculating coverage): sort and index them.
    print("Sort filtered {0} ...".format(mapped_sam))
    shell("{0} sort {1} -o {2}.sorted.bam".format(samtools, mapped_bam,
                                                  coverage_prefix))
    shell("{0} index {1}.sorted.bam".format(samtools, coverage_prefix))

    print("Sort filtered {0} by read name ...".format(sfsam))
    # Sort the BAM by read name, then convert back to SAM over the
    # intermediate SAM file.
    shell("{0} sort -n {1} -o {2}".format(samtools, mapped_bam,
                                          name_sorted_bam))
    shell("{0} view -h {1} > {2}".format(samtools, name_sorted_bam,
                                         mapped_sam))

    print("Filter out pairs that mapping quality are low, or only one in pair is qualified....")
    # Then filter through mapping quality.
    filter_PE_by_map_quality(mapped_sam, mapq, sfsam_output)

    # Remove files that are no longer needed; mapped_bam is kept.
    os.remove(mapped_sam)
    os.remove(name_sorted_bam)