Example #1
 def generator(self):
     '''
     The main method in the class
     1. Draw the Mbias plot and generate the Mbias table for each read length on each strand
     2. Decide the trimming positions based on the Mbias plot and generate the trimming file
     '''
     if len(self.trim_file) != 0:
         info("Used the trimming file from the user defined!! Ignore the step of automatically deciding trimming.")
         return self.user_defined_trimming()
     ref = GR.get_ref(self.ref_file)
     info("Calculate the M fraction for every position...")
     # check: are the input SAM files paired-end or single-end
     strand_p = {}
     if self.single_on:
         strand_p['++'] = {}
         strand_p['-+'] = {}
     else:
         strand_p['++'] = {}
         strand_p['-+'] = {}
         strand_p['+-'] = {}
         strand_p['--'] = {}
     strand_p = self.parser_sambam(strand_p, ref)
     # modified 2013-06-04
     strand_t_raw = []
     name_context = [self.name + '_CG', self.name + '_nonCG']
     for i in range(len(strand_p)):
         strand_t_each = self.decide_trim_bp(strand_p[i])
         strand_t_raw.append(strand_t_each)
         MR.mbias_generator(strand_p[i], strand_t_each, name_context[i])
     strand_t = self.decide_final_trimming(strand_t_raw)
     self.produce_final_trim_file(strand_t)
     return strand_t
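In decide_trim_bp, the trimming positions come from the shape of the M-bias curve. As a rough illustration (the threshold rule here is an assumption, not BSeQC's actual criterion), a minimal sketch might keep only the plateau of the per-position methylation fractions:

# Hypothetical sketch of an M-bias trimming decision: trim the 5'/3'
# positions whose methylation fraction deviates from the read-wide
# median by more than a tolerance. The 0.02 tolerance is an assumed value.
def decide_trim_positions(m_fraction, tolerance=0.02):
    '''m_fraction: one methylation fraction per read position.'''
    if not m_fraction:
        return 0, 0
    median = sorted(m_fraction)[len(m_fraction) // 2]
    five_p = 0
    while five_p < len(m_fraction) and abs(m_fraction[five_p] - median) > tolerance:
        five_p += 1
    three_p = len(m_fraction)
    while three_p > five_p and abs(m_fraction[three_p - 1] - median) > tolerance:
        three_p -= 1
    return five_p, three_p  # keep positions [five_p, three_p)

# e.g. decide_trim_positions([0.9, 0.75, 0.71, 0.70, 0.72, 0.71, 0.5])
# returns (2, 6): trim two 5' positions and one 3' position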
Example #2
 def generator(self):
     '''
     The main method in the class
     1. Draw the Mbias plot and generate the Mbias table for each read length on each strand
     2. Decide the trimming positions based on the Mbias plot and generate the trimming file
     '''
     if len(self.trim_file) != 0:
         info("Using the user-defined trimming file! Skipping the automatic trimming decision step.")
         return self.user_defined_trimming()
     ref = GR.get_ref(self.ref_file)
     info("Calculate the M fraction for every position...")
     # check: are the input SAM files paired-end or single-end
     strand_p = {}
     if self.single_on:
         strand_p['++'] = {}
         strand_p['-+'] = {}
     else:
         strand_p['++'] = {}
         strand_p['-+'] = {}
         strand_p['+-'] = {}
         strand_p['--'] = {}
     strand_p = self.parser_sambam(strand_p, ref)
     # modified 2013-06-04
     strand_t_raw = []
     name_context = [self.name + '_CG', self.name + '_nonCG']
     for i in range(len(strand_p)):
         strand_t_each = self.decide_trim_bp(strand_p[i])
         strand_t_raw.append(strand_t_each)
         MR.mbias_generator(strand_p[i], strand_t_each, name_context[i])
     strand_t = self.decide_final_trimming(strand_t_raw)
     self.produce_final_trim_file(strand_t)
     return strand_t
Example #3
def run(args):
    options = args.parse_args()
    if len(options.sam_file) == 0:
        error("Missing the SAM file, use -s or --sam option.")
    else:
        options.sam_file = options.sam_file.split(',')
    for s in options.sam_file:
        if not os.path.isfile(s):
            error("Can't open the SAM file: " + s)
            sys.exit(1)

    if len(options.ref_file) == 0:
        error("Missing the reference genome fasta file; use the -r or --ref option.")
    else:
        if not os.path.isfile(options.ref_file):
            error("Can't open the ref file: " + options.ref_file)

    if len(options.samtools) != 0:
        if options.samtools[-1] != '/':
            options.samtools += '/'

    if len(options.name) == 0:
        error("Missing the output file name, use -n or --name options.")

    sam_inf = options.sam_file
    ref_file = options.ref_file
    bsm = options.bsm
    s_path = options.samtools
    name = options.name
    dige_site = options.dige_site
    remove_overlap = options.remove_overlap
    not_mapping = options.not_mapping

    info("Get the all parameter!!")
    #check the input mapping files
    sam_format, read_inf = check.check_mapping_file_flag(sam_inf[0], s_path)
    pre_flag = read_inf.readline().split('\t')[1]
    if 'p' in pre_flag:
        single_on = False
        info("The input mapping files are paired-end sequencing!")
    else:
        single_on = True
        info("The input mapping files are single-end sequencing!")

    #get reference information
    ref = GR.get_ref(ref_file)

    ##scan MspI site and trim the end-repaired C
    dige_dict, all_reads, all_mapping_bp, not_mapping_reads, filter_not_mapping_reads, filter_MspI_endrepair_bp, filter_remove_overlap_bp = parser_trim_sambam(
        sam_inf, ref, bsm, s_path, dige_site, single_on, remove_overlap,
        not_mapping, name)

    ##produce MspI Mbias plot
    RR.generator(dige_dict, single_on, name)

    ##produce the filter report
    report(all_reads, all_mapping_bp, not_mapping_reads,
           filter_not_mapping_reads, filter_MspI_endrepair_bp,
           filter_remove_overlap_bp, single_on, name)
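The paired-end test above reads a letter-coded flag prepared by check.check_mapping_file_flag. With a raw SAM file the same decision can be made from FLAG bit 0x1; a self-contained sketch (the file path is a placeholder):

# Standalone sketch: detect paired-end data from the first alignment's
# numeric FLAG field (bit 0x1 = template has multiple segments).
def is_paired_end(sam_path):
    with open(sam_path) as handle:
        for line in handle:
            if line.startswith('@'):  # skip SAM header lines
                continue
            flag = int(line.split('\t')[1])
            return bool(flag & 0x1)
    return False  # no alignment records found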
Example #4
def run(args):
    options = args.parse_args()
    if len(options.sam_file) == 0:
        error("Missing the SAM file, use -s or --sam option.")
    else:
        options.sam_file = options.sam_file.split(',')
    for s in options.sam_file:
        if not os.path.isfile(s):
            error("Can't open the SAM file: " + s)
            sys.exit(1)

    if len(options.ref_file) == 0:
        error("Missing the reference genome fasta file, use -r or --ref option.")
    else:
        if not os.path.isfile(options.ref_file):
            error("Can't open the ref file: " + options.ref_file)

    if len(options.samtools) != 0:
        if options.samtools[-1] != '/':
            options.samtools += '/'

    if len(options.name) == 0:
        error("Missing the output file name, use -n or --name options.")

    sam_inf = options.sam_file
    ref_file = options.ref_file
    bsm = options.bsm
    s_path = options.samtools
    name = options.name
    dige_site = options.dige_site
    remove_overlap = options.remove_overlap
    not_mapping = options.not_mapping


    info("Get the all parameter!!")
    #check the input mapping files
    sam_format, read_inf = check.check_mapping_file_flag(sam_inf[0], s_path)
    pre_flag = read_inf.readline().split('\t')[1]
    if 'p' in pre_flag:
        single_on = False
        info("The input mapping files are paired-end sequencing!")
    else:
        single_on = True
        info("The input mapping files are single-end sequencing!")

    #get reference information
    ref = GR.get_ref(ref_file)

    ##scan MspI site and trim the end-repaired C
    dige_dict, all_reads, all_mapping_bp, not_mapping_reads, filter_not_mapping_reads, filter_MspI_endrepair_bp, filter_remove_overlap_bp = parser_trim_sambam(
        sam_inf, ref, bsm, s_path, dige_site, single_on, remove_overlap, not_mapping, name)


    ##produce MspI Mbias plot
    RR.generator(dige_dict, single_on, name)

    ##produce the filter report
    report(all_reads, all_mapping_bp, not_mapping_reads, filter_not_mapping_reads, filter_MspI_endrepair_bp,
           filter_remove_overlap_bp, single_on, name)
Example #5
    def generator(self):
        '''
        The main method in the class
        1. Draw the Mbias plot and generate the Mbias table for each read length on each strand
        2. Decide the trimming positions based on the Mbias plot and generate the trimming file
        3. Show the duplicate reads distribution
        '''

        ref = GR.get_ref(self.ref_file)
        # check: are the input SAM files paired-end or single-end
        strand_p = {}
        if self.single_on:
            strand_p['++'] = {}
            strand_p['-+'] = {}
        else:
            strand_p['++'] = {}
            strand_p['-+'] = {}
            strand_p['+-'] = {}
            strand_p['--'] = {}

        if len(self.trim_file) != 0:
            info("Used the trimming file from the user defined!!")
            info("Ignore both Mbias assessment and trimming decision.")
            loc_dict = self.parser_sambam(strand_p, ref)
            strand_t = self.user_defined_trimming()
        else:
            # modified 2013-06-04
            strand_t_raw = []
            name_context = [self.name + '_CG', self.name + '_nonCG']
            strand_p, loc_dict = self.parser_sambam(strand_p, ref)
            for i in range(len(strand_p)):
                strand_t_each = self.decide_trim_bp(strand_p[i])
                strand_t_raw.append(strand_t_each)
                MR.mbias_generator(strand_p[i], strand_t_each, name_context[i])
            strand_t = self.decide_final_trimming(strand_t_raw)
            self.produce_final_trim_file(strand_t)
        max_cov = DR.duplicate_report(loc_dict, self.gsize, self.p_poisson,
                                      self.name)
        return strand_t, loc_dict, max_cov
Example #6
    def generator(self):
        '''
        The main method in the class
        1. Draw the Mbias plot and generate the Mbias table for each read length on each strand
        2. Decide the trimming positions based on the Mbias plot and generate the trimming file
        3. Show the duplicate reads distribution
        '''

        ref = GR.get_ref(self.ref_file)
        # check: are the input SAM files paired-end or single-end
        strand_p = {}
        if self.single_on:
            strand_p['++'] = {}
            strand_p['-+'] = {}
        else:
            strand_p['++'] = {}
            strand_p['-+'] = {}
            strand_p['+-'] = {}
            strand_p['--'] = {}

        if len(self.trim_file) != 0:
            info("Used the trimming file from the user defined!!")
            info("Ignore both Mbias assessment and trimming decision.")
            loc_dict = self.parser_sambam(strand_p, ref)
            strand_t = self.user_defined_trimming()
        else:
            # modified 2013-06-04
            strand_t_raw = []
            name_context = [self.name + '_CG', self.name + '_nonCG']
            strand_p, loc_dict = self.parser_sambam(strand_p, ref)
            for i in range(len(strand_p)):
                strand_t_each = self.decide_trim_bp(strand_p[i])
                strand_t_raw.append(strand_t_each)
                MR.mbias_generator(strand_p[i], strand_t_each, name_context[i])
            strand_t = self.decide_final_trimming(strand_t_raw)
            self.produce_final_trim_file(strand_t)
        max_cov = DR.duplicate_report(loc_dict, self.gsize, self.p_poisson, self.name)
        return strand_t, loc_dict, max_cov
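DR.duplicate_report turns the genome size and the p_poisson cutoff into max_cov, the maximum coverage allowed at one location before extra reads count as duplicates. A plausible sketch of that computation (an assumption about its internals, not a transcription of them):

# Sketch of a Poisson-based coverage cutoff: with reads spread uniformly
# over gsize bp, per-position coverage is roughly Poisson with
# lambda = total_reads * read_len / gsize; return the smallest depth k
# whose right-tail probability P(X > k) falls below p_poisson.
from scipy.stats import poisson

def poisson_max_cov(total_reads, read_len, gsize, p_poisson=1e-5):
    lam = float(total_reads) * read_len / gsize  # expected per-bp coverage
    # smallest k with P(X <= k) >= 1 - p_poisson, i.e. P(X > k) < p_poisson
    return max(int(poisson.ppf(1.0 - p_poisson, lam)), 1)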
Example #7
def run(args):
    """
    Alternative module: Use the strategy in Bis-SNP to trim 5' bisulfite conversion failures
    """
    options = args.parse_args()

    if len(options.sam_file) == 0:
        error("Missing the SAM file, use -s or --sam option.")
    else:
        options.sam_file = options.sam_file.split(',')
    for s in options.sam_file:
        if not os.path.isfile(s):
            error("Can't open the SAM file: " + s)
            sys.exit(1)

    if len(options.ref_file) == 0:
        error("Missing the reference genome fasta file; use the -r or --ref option.")
    else:
        if not os.path.isfile(options.ref_file):
            error("Can't open the ref file: " + options.ref_file)

    if len(options.samtools) != 0:
        if options.samtools[-1] != '/':
            options.samtools += '/'

    if len(options.name) == 0:
        error("Missing the output file name, use -n or --name options.")

    sam_inf = options.sam_file
    ref_file = options.ref_file
    bsm = options.bsm
    s_path = options.samtools
    name = options.name
    remove_overlap = options.remove_overlap
    filter_dup = options.filter_dup
    p_poisson = options.p_poisson
    gsize = options.gsize
    not_mapping = options.not_mapping

    info("Get the all parameter!!")

    #check the input mapping files
    sam_format, read_inf = check.check_mapping_file_flag(sam_inf[0], s_path)
    pre_flag = read_inf.readline().split('\t')[1]
    if 'p' in pre_flag:
        single_on = False
        info("The input mapping files are paired-end sequencing!")
    else:
        single_on = True
        info("The input mapping files are single-end sequencing!")

    loc_dict = {}
    if filter_dup:
        ## if filter_dup is True, duplicate reads will be assessed and shown in Dup_dis.pdf
        info("The filter_dup has been set True.")
        info("Assess the duplicate reads...")
        for sam in sam_inf:
            #check the input mapping files
            sam_format, read_inf = check.check_mapping_file(sam, s_path)
            if single_on:
                for read in read_inf:
                    loc_dict = LI.Loc_single(read, loc_dict, bsm)
            else:
                for read in read_inf:
                    loc_dict = LI.Loc_paired(read, loc_dict, bsm)
        max_cov = DR.duplicate_report(loc_dict, gsize, p_poisson, name)
        info('Get the duplicate reads distribution!')

    #get reference information
    ref = GR.get_ref(ref_file)
    trim_position = []

    filter_duplicate_reads = 0
    filter_nonuniform_trim_bp = 0
    filter_nonuniform_trim_bp_CG = 0
    filter_remove_overlap_bp = 0
    filter_not_mapping_reads = 0
    all_reads = 0
    not_mapping_reads = 0
    all_mapping_bp = 0

    ##filter the 5' bisulfite failure
    for sam in sam_inf:
        out_sam = sam[:-4] + '_' + name + '_filter.sam'
        out = open(out_sam, 'w')
        #check the input mapping files
        record_mate = {}
        sam_format, read_inf = check.check_mapping_file_header(sam, s_path)

        for read in read_inf:
            #for sam header
            if read.startswith('@'):
                out.write(read)
                continue
            else:
                all_reads += 1  ##record the read number (2013-06-20)

                #Get the read information for trimming
                #If the read isn't uniquely mapped, we will get an empty list ([]).
                #In: single unique mapping read  Out: [flag,strand,chr,pos,CIGAR,seq,score]
                #In: paired unique mapping read  Out: [flag,strand,chr,pos1,CIGAR,pos2,insert,seq,score]
            read_info = RI(read, bsm)
            read_info = read_info.extract_information()

            if len(read_info) == 0:
                not_mapping_reads += 1
                if not_mapping:  #keep the not_unique mapping reads (or not paired mapping)
                    out.write(read)
                else:
                    filter_not_mapping_reads += 1  ##record the not mapping read number (2013-06-20)
                continue

            if len(loc_dict) > 0:  # the --filter_dup has been set True, have to remove duplicate reads
                duplicate, loc_dict = DF(read_info, loc_dict, max_cov, single_on)
            else:
                duplicate = False

            if single_on:
                all_mapping_bp += len(read_info[5])  ## record the mapping read basepair (2013-06-20)
            else:
                all_mapping_bp += len(read_info[7])  ## record the mapping read basepair (2013-06-20)

            record_mate, trim_position, filter_nonuniform_trim_bp_CG, filter_duplicate_reads, filter_remove_overlap_bp = NF.nonuniform_filter(
                read, out, read_info, ref, remove_overlap, duplicate,
                single_on, record_mate, trim_position,
                filter_nonuniform_trim_bp_CG, filter_duplicate_reads,
                filter_remove_overlap_bp)
        out.close()
        del record_mate
    NR.nonuniform_generator(trim_position, name)

    for i in range(len(trim_position)):
        filter_nonuniform_trim_bp += i * trim_position[i]

    ##produce the filter report
    info('Produce the report file...')
    report_out = open(name + "_BSeQC_nonuniform_filter_report.txt", 'w')
    report_out.write('Total reads: %d\n' % all_reads)
    if single_on:
        report_out.write('Not unique mapping reads: %d(%.2f%s all reads)\n' %
                         (not_mapping_reads,
                          float(not_mapping_reads) / all_reads * 100, "%"))
        report_out.write(
            'Unique mapping reads: %d(%.2f%s all reads)\n' %
            ((all_reads - not_mapping_reads),
             float(all_reads - not_mapping_reads) / all_reads * 100, "%"))
        report_out.write(
            'Skip not unique mapping reads: %d(%.2f%s all reads)\n' %
            (filter_not_mapping_reads,
             float(filter_not_mapping_reads) / all_reads * 100, "%"))
        report_out.write('In unique mapping reads:\n')
        report_out.write('All unique mapping basepairs: %d\n' % all_mapping_bp)
        report_out.write(
            'Filter Duplicate reads: %d(%.2f%s of unique mapping reads)\n' %
            (filter_duplicate_reads, float(filter_duplicate_reads) /
             (all_reads - not_mapping_reads) * 100, "%"))
        report_out.write(
            "Filter 5' nonconversion basepairs: %d(%.2f%s of unique mapping basepairs)\n"
            % (filter_nonuniform_trim_bp,
               float(filter_nonuniform_trim_bp) / all_mapping_bp * 100, "%"))
        report_out.write(
            "Filter 5' nonconversion CpG basepairs: %d(%.2f%s of unique mapping basepairs)\n"
            %
            (filter_nonuniform_trim_bp_CG,
             float(filter_nonuniform_trim_bp_CG) / all_mapping_bp * 100, "%"))

    else:
        report_out.write('Not unique paired mapping reads: %d(%.2f%s)\n' %
                         (not_mapping_reads,
                          float(not_mapping_reads) / all_reads * 100, "%"))
        report_out.write(
            'Unique paired mapping reads: %d(%.2f%s)\n' %
            ((all_reads - not_mapping_reads),
             float(all_reads - not_mapping_reads) / all_reads * 100, "%"))
        report_out.write(
            'Skip not paired unique mapping reads: %d(%.2f%s)\n' %
            (filter_not_mapping_reads,
             float(filter_not_mapping_reads) / all_reads * 100, "%"))
        report_out.write('In unique paired mapping reads:\n')
        report_out.write('All unique paired mapping basepairs: %d\n' %
                         all_mapping_bp)
        report_out.write(
            'Filter Duplicate reads: %d(%.2f%s of unique paired mapping reads)\n'
            % (filter_duplicate_reads, float(filter_duplicate_reads) /
               (all_reads - not_mapping_reads) * 100, "%"))
        report_out.write(
            "Filter 5' nonconversion basepairs: %d(%.2f%s of unique mapping basepairs)\n"
            % (filter_nonuniform_trim_bp,
               float(filter_nonuniform_trim_bp) / all_mapping_bp * 100, "%"))
        report_out.write(
            "Filter 5' nonconversion CpG basepairs: %d(%.2f%s of unique mapping basepairs)\n"
            %
            (filter_nonuniform_trim_bp_CG,
             float(filter_nonuniform_trim_bp_CG) / all_mapping_bp * 100, "%"))
        report_out.write(
            'Filter overlapped basepairs: %d(%.2f%s of unique paired mapping basepairs)\n'
            % (filter_remove_overlap_bp,
               float(filter_remove_overlap_bp) / all_mapping_bp * 100, "%"))
    report_out.close()
    info('Got the report file!')
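The module docstring says the filter follows the Bis-SNP strategy for 5' bisulfite conversion failures. My reading of that idea, as a standalone sketch (not the actual NF.nonuniform_filter code): on a forward-strand read, any cytosine outside a CpG context should read as 'T' after complete conversion, so the bases 5' of the first converted cytosine are treated as a conversion-failure region:

# Hypothetical sketch: length of the 5' region to trim on a
# forward-strand read, given aligned read and reference sequences of
# equal length; assumes an ungapped alignment.
def five_prime_trim_len(read_seq, ref_seq):
    for i, (rd, rf) in enumerate(zip(read_seq, ref_seq)):
        non_cpg_c = rf == 'C' and ref_seq[i + 1:i + 2] != 'G'
        if non_cpg_c and rd == 'T':  # first converted C: trust from here on
            return i
    return 0  # no converted C seen; leave the read untrimmed

# e.g. five_prime_trim_len('CCTAT', 'CCCAC') returns 2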
Example #8
def filter_sam(sam_inf, ref_file, bsmb, strand_t, read_l, single_on, name, s_path, auto, remove_overlap, loc_dict, max_cov,
               not_mapping):
    '''
    Trim the mapping files at the biased positions of every read length on every strand,
    which are saved in the variable strand_t.
    '''
    filter_duplicate_reads = 0
    filter_mbias_trim_bp = 0
    filter_mbias_trim_bp_CG = 0
    filter_remove_overlap_bp = 0
    filter_not_mapping_reads = 0
    all_reads = 0
    not_mapping_reads = 0
    all_mapping_bp = 0
    ref = GR.get_ref(ref_file)
    for sam in sam_inf:
        out_sam = sam[:-4] + '_' + name + '_filter.sam'
        out = open(out_sam, 'w')

        #check the input mapping files
        sam_format, read_inf = check.check_mapping_file_header(sam, s_path)

        #scan every read to qc_filter
        for read in read_inf:
            #for sam header
            if read.startswith('@'):
                out.write(read)
                continue
            else:
                all_reads += 1  ##record the read number (2013-06-20)

            #Get the read information for trimming
            #If the read isn't uniquely mapped, we will get an empty list ([]).
            #In: single unique mapping read  Out: [flag,strand,chr,pos,CIGAR,seq,score]
            #In: paired unique mapping read  Out: [flag,strand,chr,pos1,CIGAR,pos2,insert,seq,score]
            read_info = RI(read, bsmb)
            read_info = read_info.extract_information()
            if len(read_info) == 0:
                not_mapping_reads += 1
                if not_mapping:         #keep the not_unique mapping reads
                    out.write(read)
                else:
                    filter_not_mapping_reads += 1  ##record the not mapping read number (2013-06-20)
                continue

            if len(loc_dict) > 0: #the --filter_dup has been set True, have to remove duplicate reads
                duplicate, loc_dict = DF(read_info, loc_dict, max_cov, single_on)
            else:
                duplicate = False

            if single_on:
                all_mapping_bp += len(read_info[5])     ##record the mapping read basepair (2013-06-20)
                if auto:
                    if read_l[0] != '':
                        original_length = int(read_l[sam_inf.index(sam)])
                    else:
                        original_length = ''
                    filter_mbias_trim_bp, filter_duplicate_reads = SF(read, strand_t, out, read_info, original_length,
                        duplicate, filter_mbias_trim_bp, filter_duplicate_reads)
                else:
                    if not duplicate and len(loc_dict) > 0:
                        out.write(read)                 #not trimming, only output not_duplicate reads
                    else:
                        filter_duplicate_reads += 1     ##record the duplicate read (2013-06-20)
            else:
                all_mapping_bp += len(read_info[7])     ##record the mapping read basepair (2013-06-20)
                if auto or remove_overlap:
                    if read_l[0] != '':
                        original_length = [int(i) for i in read_l[sam_inf.index(sam)].split('_')]
                    else:
                        original_length = ''
                    filter_mbias_trim_bp, filter_mbias_trim_bp_CG, filter_duplicate_reads, filter_remove_overlap_bp = PF(read, ref, strand_t, out,
                        read_info, original_length, auto, remove_overlap, duplicate, filter_mbias_trim_bp, filter_mbias_trim_bp_CG,
                        filter_duplicate_reads, filter_remove_overlap_bp)
                else:
                    if not duplicate and len(loc_dict) > 0:
                        out.write(read)                  #not trimming, only output not_duplicate reads
                    else:
                        filter_duplicate_reads += 1     ##record the duplicate read (2013-06-20)
        out.close()

    ##produce the filter report
    info('Produce the report file...')
    report_out = open(name + "_BSeQC_mbias_filter_report.txt", 'w')
    report_out.write('Total reads: %d\n' % all_reads)
    if single_on:
        report_out.write('Not unique mapping reads: %d(%.2f%s all reads)\n' % (
        not_mapping_reads, float(not_mapping_reads) / all_reads * 100, "%"))
        report_out.write('Unique mapping reads: %d(%.2f%s all reads)\n' % (
        (all_reads - not_mapping_reads), float(all_reads - not_mapping_reads) / all_reads * 100, "%"))
        report_out.write('Skip not unique mapping reads: %d(%.2f%s all reads)\n' % (
        filter_not_mapping_reads, float(filter_not_mapping_reads) / all_reads * 100, "%"))
        report_out.write('In unique mapping reads:\n')
        report_out.write('All unique mapping basepairs: %d\n' % all_mapping_bp)
        report_out.write('Filter Duplicate reads: %d(%.2f%s of unique mapping reads)\n' % (
        filter_duplicate_reads, float(filter_duplicate_reads) / (all_reads - not_mapping_reads) * 100, "%"))
        #report_out.write('Filter Mbias CpG basepairs: %d(%.2f%s of unique mapping basepairs)\n' % (
        #filter_mbias_trim_bp_CG, float(filter_mbias_trim_bp_CG) / all_mapping_bp * 100, "%"))
        report_out.write('Filter Mbias basepairs: %d(%.2f%s of unique mapping basepairs)\n' % (
        filter_mbias_trim_bp, float(filter_mbias_trim_bp) / all_mapping_bp * 100, "%"))

    else:
        report_out.write('Not unique paired mapping reads: %d(%.2f%s)\n' % (
        not_mapping_reads, float(not_mapping_reads) / all_reads * 100, "%"))
        report_out.write('Unique paired mapping reads: %d(%.2f%s)\n' % (
        (all_reads - not_mapping_reads), float(all_reads - not_mapping_reads) / all_reads * 100, "%"))
        report_out.write('Skip not paired unique mapping reads: %d(%.2f%s)\n' % (
        filter_not_mapping_reads, float(filter_not_mapping_reads) / all_reads * 100, "%"))
        report_out.write('In unique paired mapping reads:\n')
        report_out.write('All unique paired mapping basepairs: %d\n' % all_mapping_bp)
        report_out.write('Filter Duplicate reads: %d(%.2f%s of unique paired mapping reads)\n' % (
        filter_duplicate_reads, float(filter_duplicate_reads) / (all_reads - not_mapping_reads) * 100, "%"))
        report_out.write('Filter Mbias basepairs: %d(%.2f%s of unique paired mapping basepairs)\n' % (
        filter_mbias_trim_bp, float(filter_mbias_trim_bp) / all_mapping_bp * 100, "%"))
        report_out.write("Filter 5' Mbias CpG basepairs: %d(%.2f%s of unique mapping basepairs)\n" % (
        filter_mbias_trim_bp_CG, float(filter_mbias_trim_bp_CG) / all_mapping_bp * 100, "%"))
        report_out.write('Filter overlapped basepairs: %d(%.2f%s of unique paired mapping basepairs)\n' % (
        filter_remove_overlap_bp, float(filter_remove_overlap_bp) / all_mapping_bp * 100, "%"))
    report_out.close()
    info('Got the report file!')
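The remove_overlap option exists because overlapping mates would count the same genomic positions twice. The core arithmetic is simple; a minimal sketch with illustrative names (PF's real signature differs):

# Sketch of mate-overlap measurement: given each mate's 1-based start
# and aligned length on the same chromosome, return how many basepairs
# they share. Assumes ungapped alignments (no indels in the CIGAR).
def overlap_bp(pos1, len1, pos2, len2):
    end1 = pos1 + len1 - 1  # inclusive end of mate 1
    end2 = pos2 + len2 - 1
    overlap = min(end1, end2) - max(pos1, pos2) + 1
    return max(overlap, 0)  # 0 when the mates do not overlap

# e.g. overlap_bp(100, 50, 130, 50) returns 20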
Example #9
def filter_sam(sam_inf, ref_file, bsmb, strand_t, read_l, single_on, name,
               s_path, auto, remove_overlap, loc_dict, max_cov, not_mapping):
    '''
    Trim the mapping files at the biased positions of every read length on every strand,
    which are saved in the variable strand_t.
    '''
    filter_duplicate_reads = 0
    filter_mbias_trim_bp = 0
    filter_mbias_trim_bp_CG = 0
    filter_remove_overlap_bp = 0
    filter_not_mapping_reads = 0
    all_reads = 0
    not_mapping_reads = 0
    all_mapping_bp = 0
    ref = GR.get_ref(ref_file)
    for sam in sam_inf:
        out_sam = sam[:-4] + '_' + name + '_filter.sam'
        out = open(out_sam, 'w')

        #check the input mapping files
        sam_format, read_inf = check.check_mapping_file_header(sam, s_path)

        #scan every read to qc_filter
        for read in read_inf:
            #for sam header
            if read.startswith('@'):
                out.write(read)
                continue
            else:
                all_reads += 1  ##record the read number (2013-06-20)

            #Get the read information for trimming
            #If the read isn't uniquely mapped, we will get an empty list ([]).
            #In: single unique mapping read  Out: [flag,strand,chr,pos,CIGAR,seq,score]
            #In: paired unique mapping read  Out: [flag,strand,chr,pos1,CIGAR,pos2,insert,seq,score]
            read_info = RI(read, bsmb)
            read_info = read_info.extract_information()
            if len(read_info) == 0:
                not_mapping_reads += 1
                if not_mapping:  #keep the not_unique mapping reads
                    out.write(read)
                else:
                    filter_not_mapping_reads += 1  ##record the not mapping read number (2013-06-20)
                continue

            if len(loc_dict) > 0:  # the --filter_dup has been set True, have to remove duplicate reads
                duplicate, loc_dict = DF(read_info, loc_dict, max_cov, single_on)
            else:
                duplicate = False

            if single_on:
                all_mapping_bp += len(read_info[5])  ## record the mapping read basepair (2013-06-20)
                if auto:
                    if read_l[0] != '':
                        original_length = int(read_l[sam_inf.index(sam)])
                    else:
                        original_length = ''
                    filter_mbias_trim_bp, filter_duplicate_reads = SF(
                        read, strand_t, out, read_info, original_length,
                        duplicate, filter_mbias_trim_bp,
                        filter_duplicate_reads)
                else:
                    if not duplicate and len(loc_dict) > 0:
                        out.write(read)  # not trimming, only output not_duplicate reads
                    else:
                        filter_duplicate_reads += 1  ##record the duplicate read (2013-06-20)
            else:
                all_mapping_bp += len(read_info[7])  ## record the mapping read basepair (2013-06-20)
                if auto or remove_overlap:
                    if read_l[0] != '':
                        original_length = [int(i) for i in read_l[sam_inf.index(sam)].split('_')]
                    else:
                        original_length = ''
                    filter_mbias_trim_bp, filter_mbias_trim_bp_CG, filter_duplicate_reads, filter_remove_overlap_bp = PF(
                        read, ref, strand_t, out, read_info, original_length,
                        auto, remove_overlap, duplicate, filter_mbias_trim_bp,
                        filter_mbias_trim_bp_CG, filter_duplicate_reads,
                        filter_remove_overlap_bp)
                else:
                    if not duplicate and len(loc_dict) > 0:
                        out.write(read)  # not trimming, only output not_duplicate reads
                    else:
                        filter_duplicate_reads += 1  ##record the duplicate read (2013-06-20)
        out.close()

    ##produce the filter report
    info('Produce the report file...')
    report_out = open(name + "_BSeQC_mbias_filter_report.txt", 'w')
    report_out.write('Total reads: %d\n' % all_reads)
    if single_on:
        report_out.write('Not unique mapping reads: %d(%.2f%s all reads)\n' %
                         (not_mapping_reads,
                          float(not_mapping_reads) / all_reads * 100, "%"))
        report_out.write(
            'Unique mapping reads: %d(%.2f%s all reads)\n' %
            ((all_reads - not_mapping_reads),
             float(all_reads - not_mapping_reads) / all_reads * 100, "%"))
        report_out.write(
            'Skip not unique mapping reads: %d(%.2f%s all reads)\n' %
            (filter_not_mapping_reads,
             float(filter_not_mapping_reads) / all_reads * 100, "%"))
        report_out.write('In unique mapping reads:\n')
        report_out.write('All unique mapping basepairs: %d\n' % all_mapping_bp)
        report_out.write(
            'Filter Duplicate reads: %d(%.2f%s of unique mapping reads)\n' %
            (filter_duplicate_reads, float(filter_duplicate_reads) /
             (all_reads - not_mapping_reads) * 100, "%"))
        #report_out.write('Filter Mbias CpG basepairs: %d(%.2f%s of unique mapping basepairs)\n' % (
        #filter_mbias_trim_bp_CG, float(filter_mbias_trim_bp_CG) / all_mapping_bp * 100, "%"))
        report_out.write(
            'Filter Mbias basepairs: %d(%.2f%s of unique mapping basepairs)\n'
            % (filter_mbias_trim_bp,
               float(filter_mbias_trim_bp) / all_mapping_bp * 100, "%"))

    else:
        report_out.write('Not unique paired mapping reads: %d(%.2f%s)\n' %
                         (not_mapping_reads,
                          float(not_mapping_reads) / all_reads * 100, "%"))
        report_out.write(
            'Unique paired mapping reads: %d(%.2f%s)\n' %
            ((all_reads - not_mapping_reads),
             float(all_reads - not_mapping_reads) / all_reads * 100, "%"))
        report_out.write(
            'Skip not paired unique mapping reads: %d(%.2f%s)\n' %
            (filter_not_mapping_reads,
             float(filter_not_mapping_reads) / all_reads * 100, "%"))
        report_out.write('In unique paired mapping reads:\n')
        report_out.write('All unique paired mapping basepairs: %d\n' %
                         all_mapping_bp)
        report_out.write(
            'Filter Duplicate reads: %d(%.2f%s of unique paired mapping reads)\n'
            % (filter_duplicate_reads, float(filter_duplicate_reads) /
               (all_reads - not_mapping_reads) * 100, "%"))
        report_out.write(
            'Filter Mbias basepairs: %d(%.2f%s of unique paired mapping basepairs)\n'
            % (filter_mbias_trim_bp,
               float(filter_mbias_trim_bp) / all_mapping_bp * 100, "%"))
        report_out.write(
            "Filter 5' Mbias CpG basepairs: %d(%.2f%s of unique mapping basepairs)\n"
            % (filter_mbias_trim_bp_CG,
               float(filter_mbias_trim_bp_CG) / all_mapping_bp * 100, "%"))
        report_out.write(
            'Filter overlapped basepairs: %d(%.2f%s of unique paired mapping basepairs)\n'
            % (filter_remove_overlap_bp,
               float(filter_remove_overlap_bp) / all_mapping_bp * 100, "%"))
    report_out.close()
    info('Get the report file!')
Example #10
def run(args):
    """
    Alternative module: Use the strategy in Bis-SNP to trim 5' bisulfite conversion failures
    """
    options = args.parse_args()

    if len(options.sam_file) == 0:
        error("Missing the SAM file, use -s or --sam option.")
    else:
        options.sam_file = options.sam_file.split(',')
    for s in options.sam_file:
        if not os.path.isfile(s):
            error("Can't open the SAM file: " + s)
            sys.exit(1)

    if len(options.ref_file) == 0:
        error("Missing the reference genome fasta file, use -r or --ref option.")
    else:
        if not os.path.isfile(options.ref_file):
            error("Can't open the ref file: " + options.ref_file)

    if len(options.samtools) != 0:
        if options.samtools[-1] != '/':
            options.samtools += '/'

    if len(options.name) == 0:
        error("Missing the output file name, use -n or --name options.")

    sam_inf = options.sam_file
    ref_file = options.ref_file
    bsm = options.bsm
    s_path = options.samtools
    name = options.name
    remove_overlap = options.remove_overlap
    filter_dup = options.filter_dup
    p_poisson = options.p_poisson
    gsize = options.gsize
    not_mapping = options.not_mapping

    info("Get the all parameter!!")

    #check the input mapping files
    sam_format, read_inf = check.check_mapping_file_flag(sam_inf[0], s_path)
    pre_flag = read_inf.readline().split('\t')[1]
    if 'p' in pre_flag:
        single_on = False
        info("The input mapping files are paired-end sequencing!")
    else:
        single_on = True
        info("The input mapping files are single-end sequencing!")

    loc_dict = {}
    if filter_dup:
        ## if filter_dup is True, duplicate reads will be assessed and shown in Dup_dis.pdf
        info("The filter_dup has been set True.")
        info("Assess the duplicate reads...")
        for sam in sam_inf:
            #check the input mapping files
            sam_format, read_inf = check.check_mapping_file(sam, s_path)
            if single_on:
                for read in read_inf:
                    loc_dict = LI.Loc_single(read, loc_dict, bsm)
            else:
                for read in read_inf:
                    loc_dict = LI.Loc_paired(read, loc_dict, bsm)
        max_cov = DR.duplicate_report(loc_dict, gsize, p_poisson, name)
        info('Get the duplicate reads distribution!')

    #get reference information
    ref = GR.get_ref(ref_file)
    trim_position = []

    filter_duplicate_reads = 0
    filter_nonuniform_trim_bp = 0
    filter_nonuniform_trim_bp_CG = 0
    filter_remove_overlap_bp = 0
    filter_not_mapping_reads = 0
    all_reads = 0
    not_mapping_reads = 0
    all_mapping_bp = 0

    ##filter the 5' bisulfite failure
    for sam in sam_inf:
        out_sam = sam[:-4] + '_' + name + '_filter.sam'
        out = open(out_sam, 'w')
        #check the input mapping files
        record_mate = {}
        sam_format, read_inf = check.check_mapping_file_header(sam, s_path)

        for read in read_inf:
            #for sam header
            if read.startswith('@'):
                out.write(read)
                continue
            else:
                all_reads += 1  ##record the read number (2013-06-20)

                #Get the read information for trimming
                #If the read isn't uniquely mapped, we will get an empty list ([]).
                #In: single unique mapping read  Out: [flag,strand,chr,pos,CIGAR,seq,score]
                #In: paired unique mapping read  Out: [flag,strand,chr,pos1,CIGAR,pos2,insert,seq,score]
            read_info = RI(read, bsm)
            read_info = read_info.extract_information()

            if len(read_info) == 0:
                not_mapping_reads += 1
                if not_mapping:         #keep the not_unique mapping reads (or not paired mapping)
                    out.write(read)
                else:
                    filter_not_mapping_reads += 1  ##record the not mapping read number (2013-06-20)
                continue

            if len(loc_dict) > 0: #the --filter_dup has been set True, have to remove duplicate reads
                duplicate, loc_dict = DF(read_info, loc_dict, max_cov, single_on)
            else:
                duplicate = False

            if single_on:
                all_mapping_bp += len(read_info[5])   ##record the mapping read basepair (2013-06-20)
            else:
                all_mapping_bp += len(read_info[7])   ##record the mapping read basepair (2013-06-20)

            record_mate, trim_position, filter_nonuniform_trim_bp_CG, filter_duplicate_reads, filter_remove_overlap_bp = NF.nonuniform_filter(
                read, out, read_info, ref, remove_overlap, duplicate, single_on,
                record_mate, trim_position, filter_nonuniform_trim_bp_CG,
                filter_duplicate_reads, filter_remove_overlap_bp)
        out.close()
        del record_mate
    NR.nonuniform_generator(trim_position, name)

    for i in range(len(trim_position)):
        filter_nonuniform_trim_bp += i * trim_position[i]

    ##produce the filter report
    info('Produce the report file...')
    report_out = open(name + "_BSeQC_nonuniform_filter_report.txt", 'w')
    report_out.write('Total reads: %d\n' % all_reads)
    if single_on:
        report_out.write('Not unique mapping reads: %d(%.2f%s all reads)\n' % (
            not_mapping_reads, float(not_mapping_reads) / all_reads * 100, "%"))
        report_out.write('Unique mapping reads: %d(%.2f%s all reads)\n' % (
            (all_reads - not_mapping_reads), float(all_reads - not_mapping_reads) / all_reads * 100, "%"))
        report_out.write('Skip not unique mapping reads: %d(%.2f%s all reads)\n' % (
            filter_not_mapping_reads, float(filter_not_mapping_reads) / all_reads * 100, "%"))
        report_out.write('In unique mapping reads:\n')
        report_out.write('All unique mapping basepairs: %d\n' % all_mapping_bp)
        report_out.write('Filter Duplicate reads: %d(%.2f%s of unique mapping reads)\n' % (
            filter_duplicate_reads, float(filter_duplicate_reads) / (all_reads - not_mapping_reads) * 100, "%"))
        report_out.write("Filter 5' nonconversion basepairs: %d(%.2f%s of unique mapping basepairs)\n" % (
            filter_nonuniform_trim_bp, float(filter_nonuniform_trim_bp) / all_mapping_bp * 100, "%"))
        report_out.write("Filter 5' nonconversion CpG basepairs: %d(%.2f%s of unique mapping basepairs)\n" % (
            filter_nonuniform_trim_bp_CG, float(filter_nonuniform_trim_bp_CG) / all_mapping_bp * 100, "%"))

    else:
        report_out.write('Not unique paired mapping reads: %d(%.2f%s)\n' % (
            not_mapping_reads, float(not_mapping_reads) / all_reads * 100, "%"))
        report_out.write('Unique paired mapping reads: %d(%.2f%s)\n' % (
            (all_reads - not_mapping_reads), float(all_reads - not_mapping_reads) / all_reads * 100, "%"))
        report_out.write('Skip not paired unique mapping reads: %d(%.2f%s)\n' % (
            filter_not_mapping_reads, float(filter_not_mapping_reads) / all_reads * 100, "%"))
        report_out.write('In unique paired mapping reads:\n')
        report_out.write('All unique paired mapping basepairs: %d\n' % all_mapping_bp)
        report_out.write('Filter Duplicate reads: %d(%.2f%s of unique paired mapping reads)\n' % (
            filter_duplicate_reads, float(filter_duplicate_reads) / (all_reads - not_mapping_reads) * 100, "%"))
        report_out.write("Filter 5' nonconversion basepairs: %d(%.2f%s of unique mapping basepairs)\n" % (
            filter_nonuniform_trim_bp, float(filter_nonuniform_trim_bp) / all_mapping_bp * 100, "%"))
        report_out.write("Filter 5' nonconversion CpG basepairs: %d(%.2f%s of unique mapping basepairs)\n" % (
            filter_nonuniform_trim_bp_CG, float(filter_nonuniform_trim_bp_CG) / all_mapping_bp * 100, "%"))
        report_out.write('Filter overlapped basepairs: %d(%.2f%s of unique paired mapping basepairs)\n' % (
            filter_remove_overlap_bp, float(filter_remove_overlap_bp) / all_mapping_bp * 100, "%"))
    report_out.close()
    info('Got the report file!')
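LI.Loc_single and LI.Loc_paired build loc_dict, the per-location read counts that DF later compares against max_cov. A plausible minimal version of the single-end bookkeeping (the key layout is an assumption; field positions follow the SAM spec):

# Sketch of duplicate-location bookkeeping: count reads sharing the
# same (chrom, start, strand) key; strand comes from FLAG bit 0x10.
def loc_single(read, loc_dict):
    fields = read.rstrip('\n').split('\t')
    flag, chrom, pos = int(fields[1]), fields[2], fields[3]
    strand = '-' if flag & 0x10 else '+'
    key = (chrom, pos, strand)
    loc_dict[key] = loc_dict.get(key, 0) + 1
    return loc_dict

# A DF-style check then treats a read as a duplicate once its location
# count exceeds max_cov:
def is_duplicate(key, loc_dict, max_cov):
    return loc_dict.get(key, 0) > max_cov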