def get_fisher_pvalue(self, base):
    """Return the -log10 two-sided Fisher exact p-value for ``base``.

    The 2x2 table handed to the module-level ``fisher`` callable is
    ``((tumor ref, control ref), (tumor base, control base))``, built from
    ``self.get_tumor_base_total`` / ``self.get_ctrl_base_total`` for
    ``self.ref`` and for ``base``.

    The score is clamped: p-values below 1e-60 give 60.0, and p-values
    within 1e-10 of 1 give 0.0.
    """
    reference_row = (
        int(self.get_tumor_base_total(self.ref)),
        int(self.get_ctrl_base_total(self.ref)),
    )
    variant_row = (
        int(self.get_tumor_base_total(base)),
        int(self.get_ctrl_base_total(base)),
    )
    _, p_value = fisher((reference_row, variant_row),
                        alternative='two-sided')

    # Clamp both extremes so the logarithm never sees 0 or returns -0.0.
    if p_value < 10 ** (-60):
        return float(60.0)
    if p_value > 1.0 - 10 ** (-10):
        return float(0.0)
    return -math.log(p_value, 10)
Beispiel #2
0
 def get_fisher_pvalue(self, base):
     """Score ``base`` against the reference base with Fisher's exact test.

     Builds the contingency table
     ``((tumor ref, control ref), (tumor base, control base))`` from
     ``self.get_tumor_base_total`` and ``self.get_ctrl_base_total``, runs the
     module-level ``fisher`` callable two-sided, and returns -log10(p).
     The result is 60.0 when p < 1e-60 and 0.0 when p > 1 - 1e-10.
     """
     tumor_ref_total = int(self.get_tumor_base_total(self.ref))
     ctrl_ref_total = int(self.get_ctrl_base_total(self.ref))
     tumor_base_total = int(self.get_tumor_base_total(base))
     ctrl_base_total = int(self.get_ctrl_base_total(base))

     contingency = ((tumor_ref_total, ctrl_ref_total),
                    (tumor_base_total, ctrl_base_total))
     _, pvalue = fisher(contingency, alternative='two-sided')

     too_small = pvalue < 10 ** (-60)
     if too_small:
         score = float(60.0)
     elif pvalue > 1.0 - 10 ** (-10):
         score = float(0.0)
     else:
         score = -math.log(pvalue, 10)
     return score
Beispiel #3
0
    def filter(self, in_tumor_bam, in_normal_bam, output, in_mutation_file):
        """Annotate mutation candidates with read counts from BLAT realignment.

        Every data row of ``in_mutation_file`` is tab-separated with
        chromosome, 1-based start, end, ref and alt in its first five
        columns.  For each row the reads around the candidate are extracted
        (``self.extractRead``), aligned with ``self.blat_cmds`` against the
        two sequences written by ``self.makeTwoReference``, and counted with
        ``self.summarizeRefAlt``.  A sample whose read count in the region
        reaches ``self.max_depth`` is not realigned and keeps ``'---'`` in
        its count columns.

        Modes:
            * tumor and normal BAM given: the row gets tumor ref/alt/other,
              normal ref/alt/other and the formatted result of
              ``self.math_log_fisher_pvalue`` for the two-sided Fisher test.
              It is written only when the tumor alt count is ``'---'`` or
              >= ``self.tumor_min_mismatch`` and the normal alt count is
              ``'---'`` or <= ``self.normal_max_mismatch``.
            * tumor BAM only: the row gets tumor ref/alt/other plus the 0.1
              quantile, ``(alt + 1) / (ref + alt + 2)`` and the 0.9 quantile
              of Beta(alt + 1, ref + 1).  It is written only when the alt
              count is ``'---'`` or >= ``self.tumor_min_mismatch``.
            * no tumor BAM: ``output`` is created empty.

        When ``self.header_flag`` is set, the first input line is treated as
        a header and extended with the new column names.

        Args:
            in_tumor_bam: path of the tumor BAM file (falsy to disable).
            in_normal_bam: path of the normal BAM file (falsy for tumor-only).
            output: path of the result file; ``output + ".tmp.*"`` files are
                used as scratch space and removed before returning.
            in_mutation_file: path of the candidate list.

        Raises:
            subprocess.CalledProcessError: when the BLAT command fails.
        """
        refalt_fa = output + ".tmp.refalt.fa"
        reads_fa = output + ".tmp.fa"
        psl_file = output + ".tmp.psl"
        missing = '---'

        def parse_candidate(row):
            # annovar input file (not zero-based number): shift start to 0-based
            items = row.split('\t')
            return items[0], int(items[1]) - 1, int(items[2]), items[3], items[4]

        def realign_and_count(samfile, chrom, start, end):
            # Extract the reads around the candidate, align them to the
            # ref/alt sequences with BLAT and summarize the alignments.
            self.extractRead(samfile, chrom, start, end, reads_fa)
            with open(os.devnull, 'w') as devnull:
                subprocess.check_call(
                    self.blat_cmds + [refalt_fa, reads_fa, psl_file],
                    stdout=devnull, stderr=subprocess.STDOUT)
            return self.summarizeRefAlt(psl_file)

        def write_row(handle, columns):
            # Same bytes as the former ``print >> handle`` statement, but
            # valid on both Python 2 and Python 3.
            handle.write("\t".join(str(column) for column in columns) + "\n")

        opened_bams = []
        try:
            with open(in_mutation_file, 'r') as srcfile, open(output, 'w') as hResult:

                if in_tumor_bam and in_normal_bam:
                    tumor_samfile = pysam.Samfile(in_tumor_bam, "rb")
                    opened_bams.append(tumor_samfile)
                    normal_samfile = pysam.Samfile(in_normal_bam, "rb")
                    opened_bams.append(normal_samfile)

                    if self.header_flag:
                        header = srcfile.readline().rstrip('\n')
                        # NOTE(review): data rows carry a 7th added column (the
                        # Fisher value) that has no name here; left unchanged to
                        # keep the output format stable -- confirm and add one.
                        newheader = ("RefNum_tumor\tAltNum_tumor\tOtherNum_tumor"
                                     + "\tRefNum_normal\tAltNum_normal\tOtherNum_normal")
                        write_row(hResult, (header, newheader))

                    for line in srcfile:
                        line = line.rstrip()
                        chrom, start, end, ref, alt = parse_candidate(line)

                        tumor_ref = tumor_alt = tumor_other = missing
                        normal_ref = normal_alt = normal_other = missing
                        log10_fisher_pvalue = missing
                        self.makeTwoReference(chrom, start, end, ref, alt, refalt_fa)

                        if tumor_samfile.count(chrom, start, end) < self.max_depth:
                            tumor_ref, tumor_alt, tumor_other = realign_and_count(
                                tumor_samfile, chrom, start, end)

                        if normal_samfile.count(chrom, start, end) < self.max_depth:
                            normal_ref, normal_alt, normal_other = realign_and_count(
                                normal_samfile, chrom, start, end)

                        # The test needs counts from both samples.
                        if (tumor_ref != '---' and tumor_alt != '---'
                                and normal_ref != '---' and normal_alt != '---'):
                            _, fisher_pvalue = fisher(
                                ((int(tumor_ref), int(normal_ref)),
                                 (int(tumor_alt), int(normal_alt))),
                                alternative='two-sided')
                            log10_fisher_pvalue = '{0:.3f}'.format(
                                float(self.math_log_fisher_pvalue(fisher_pvalue)))

                        if ((tumor_alt == '---' or tumor_alt >= self.tumor_min_mismatch) and
                                (normal_alt == '---' or normal_alt <= self.normal_max_mismatch)):
                            write_row(hResult, (line,
                                                tumor_ref, tumor_alt, tumor_other,
                                                normal_ref, normal_alt, normal_other,
                                                log10_fisher_pvalue))

                elif in_tumor_bam:
                    tumor_samfile = pysam.Samfile(in_tumor_bam, "rb")
                    opened_bams.append(tumor_samfile)

                    if self.header_flag:
                        header = srcfile.readline().rstrip('\n')
                        newheader = ("RefNum_tumor\tAltNum_tumor\tOtherNum_tumor\t0.1\tratio\t0.9")
                        write_row(hResult, (header, newheader))

                    for line in srcfile:
                        line = line.rstrip()
                        chrom, start, end, ref, alt = parse_candidate(line)

                        tumor_ref = tumor_alt = tumor_other = missing
                        beta_01 = beta_mid = beta_09 = missing

                        if tumor_samfile.count(chrom, start, end) < self.max_depth:
                            self.makeTwoReference(chrom, start, end, ref, alt, refalt_fa)
                            tumor_ref, tumor_alt, tumor_other = realign_and_count(
                                tumor_samfile, chrom, start, end)

                            # betaincinv is the supported spelling of the
                            # deprecated scipy.special.btdtri (same quantile).
                            alt_count = int(tumor_alt)
                            ref_count = int(tumor_ref)
                            beta_01 = '{0:.3f}'.format(float(
                                scipy.special.betaincinv(alt_count + 1, ref_count + 1, 0.1)))
                            beta_mid = '{0:.3f}'.format(
                                float(alt_count + 1) / float(ref_count + alt_count + 2))
                            beta_09 = '{0:.3f}'.format(float(
                                scipy.special.betaincinv(alt_count + 1, ref_count + 1, 0.9)))

                        if (tumor_alt == '---' or tumor_alt >= self.tumor_min_mismatch):
                            write_row(hResult, (line, tumor_ref, tumor_alt, tumor_other,
                                                beta_01, beta_mid, beta_09))
        finally:
            # Release BAM handles and scratch files even when a step failed.
            for samfile in opened_bams:
                samfile.close()
            for tmp_path in (refalt_fa, reads_fa, psl_file):
                if os.path.exists(tmp_path):
                    os.unlink(tmp_path)
def Pileup_out(mpileup, w, min_depth, min_variant_read, compare):
    """Turn one samtools mpileup line into a per-position count record.

    mpileup columns (tab separated)::

        chr1 272 T 24  ,.$.....,,.,.,...,,,.,..^+. <<<+;<<<<<<<<<<<=<;<;7<&

        0 chromosome
        1 1-based coordinate
        2 reference base
        3 the number of reads covering the site (1)
        4 read bases (1)
        5 base qualities (1)
        6 the number of reads covering the site (2)
        7 read bases (2)
        8 base qualities (2)

    The function relies on module-level names: the compiled patterns
    ``target`` (indel annotations) and ``remove_chr`` (read-start markers),
    the ``filter_quals`` collection of rejected quality characters, the
    ``const.POS_*`` indices, ``util.AutoVivification``, ``fisher`` and
    ``math_log_fisher_pvalue``.

    Args:
        mpileup: one mpileup line, as bytes or as text.
        w: not used by this function; kept for interface compatibility.
        min_depth: minimum raw depth, and minimum quality-filtered depth,
            a sample needs before its counts / SNV statistics are computed.
        min_variant_read: the position is skipped when
            ``depth - reference matches + '+' count + '-' count`` of the
            first sample is below this value.
        compare: true when the line carries two samples (columns 3-8).

    Returns:
        ``None`` when the position is skipped (zero depth, too few variant
        reads, reference base not one of A/C/G/T/N, or read bases and
        qualities of different length).  Otherwise a list laid out as::

            const.POS_CHR = 0          chromosome
            const.POS_COORD = 1        coordinate (int)
            const.POS_REF = 2          reference base as given
            const.POS_DATA1 = 3        dict of counts for sample 1
            const.POS_DATA2 = 4        dict of counts for sample 2
            const.POS_FISHER_SNV = 5   SNV Fisher value (default 1.0)
            const.POS_FISHER_INS = 6   'bases:value,...' (default 'N:1.0')
            const.POS_FISHER_DEL = 7   'bases:value,...' (default 'N:1.0')
            const.POS_COUNT = 8        number of samples (1 or 2)
    """
    #
    # Prepare mpileup data: accept bytes (Python 3 pipes) as well as text.
    #
    if isinstance(mpileup, bytes) and not isinstance(mpileup, str):
        mpileup_text = mpileup.decode()
    else:
        mpileup_text = mpileup
    mp_list = mpileup_text.strip('\n').split('\t')
    mp_list_len = len(mp_list)
    ref_base_U = mp_list[2].upper()
    coordinate = mp_list[0:3]

    #
    # skip if depth is 0
    #
    if mp_list[3] == '0' or (mp_list_len > 6 and mp_list[6] == '0'):
        return None

    #
    # Cheap pre-filter on the raw read-base string of the first sample.
    #
    ref_base_count = mp_list[4].count('.') + mp_list[4].count(',')
    ins_base_count = mp_list[4].count('+')
    del_base_count = mp_list[4].count('-')
    if (int(mp_list[3]) - ref_base_count + ins_base_count +
            del_base_count) < min_variant_read:
        return None

    # Membership in a tuple (not a substring test) so that an empty or
    # multi-character reference field is rejected here instead of failing
    # later on a missing 'total_<ref>' key.
    if ref_base_U not in ('A', 'C', 'G', 'T', 'N'):
        return None

    def new_sample_record():
        # Fresh per-sample counters; 'mis_base' starts as the reference base.
        return {
            'mis_base': ref_base_U,
            'mis_rate': 0,
            'proper_read_depth': 0,
            'proper_read_depth_plus': 0,
            'proper_read_depth_minus': 0,
            'proper_read_depth_indel': 0,
            'proper_read_depth_indel_plus': 0,
            'proper_read_depth_indel_minus': 0,
            'indel': util.AutoVivification()
        }

    data_pair = [
        mp_list[0],
        int(mp_list[1]),
        mp_list[2],
        new_sample_record(),
        new_sample_record(),
        1.0,
        'N:1.0',
        'N:1.0',
        0
    ]

    #
    # Loop for 2 bam file case
    #
    if compare:
        data_pair[const.POS_COUNT] = 2
        input_list = [(const.POS_DATA1, mp_list[3], mp_list[4], mp_list[5]),
                      (const.POS_DATA2, mp_list[6], mp_list[7], mp_list[8])]
    else:
        data_pair[const.POS_COUNT] = 1
        input_list = [(const.POS_DATA1, mp_list[3], mp_list[4], mp_list[5])]

    #
    # position id,
    # mpileup output 4th column (number of reads covering the site),
    # 5th column (read bases),
    # 6th column (base quality)
    #
    for data_id, depth, read_bases, qual_list in input_list:

        sample = data_pair[data_id]
        indel = util.AutoVivification()

        #
        # Look for deletion/insertion and save info in 'indel' dictionary
        #
        #   ([\+\-])[0-9]+[ACGTNacgtn]+
        #
        # m.group(1): + or - (insertion/deletion)
        # m.group(2): number of inserted/deleted bases
        # m.group(3): nucleotides
        #
        # finditer keeps scanning the ORIGINAL string, so 'deleted' tracks
        # how many characters were already cut out of 'read_bases'.
        #
        deleted = 0
        for m in target.finditer(read_bases):
            site = m.start()
            indel_type = m.group(1)
            num = m.group(2)
            bases = m.group(3)[0:int(num)]
            # lower-case bases mean the reverse strand
            if bases.islower():
                strand = ('-', '+')
            else:
                strand = ('+', '-')

            key = '\t'.join(coordinate + [bases.upper()])
            if indel_type in indel and key in indel[indel_type]:
                indel[indel_type][key][strand[0]] += 1
            else:
                indel[indel_type][key][strand[0]] = 1
                indel[indel_type][key][strand[1]] = 0

            read_bases = (read_bases[0:site - deleted] +
                          read_bases[site + int(num) + len(num) + 1 - deleted:])
            deleted += 1 + len(num) + int(num)

        #
        # Remove '^.' and '$'
        #
        read_bases = remove_chr.sub('', read_bases)
        read_bases = read_bases.replace('$', '')

        #
        # Error check
        #
        if len(read_bases) != len(qual_list):
            logging.error("mpileup data is not good: {0}, {1}".format(
                mpileup, read_bases))
            return None

        #
        # Count mismatch
        #
        mis_base_U = None
        if int(depth) < min_depth:
            continue

        read_bases = read_bases.replace('.', ref_base_U)
        read_bases = read_bases.replace(',', ref_base_U.lower())

        base_num = {
            "total_A": 0,
            "total_C": 0,
            "total_G": 0,
            "total_T": 0,
            "total_N": 0,
            "A": 0,
            "C": 0,
            "G": 0,
            "T": 0,
            "N": 0,
            "a": 0,
            "c": 0,
            "g": 0,
            "t": 0,
            "n": 0
        }

        #
        # Set data
        #
        sample['bases'] = read_bases
        sample['depth'] = int(depth)

        #
        # Count number: the *_indel depths ignore base quality, the other
        # counters only see bases whose quality is not in filter_quals.
        #
        for nuc, qual in zip(read_bases, qual_list):
            if nuc not in 'ATGCNacgtn':
                continue
            sample['proper_read_depth_indel'] += 1
            if nuc in 'ATGCN':
                sample['proper_read_depth_indel_plus'] += 1
            if nuc in 'acgtn':
                sample['proper_read_depth_indel_minus'] += 1
            if qual in filter_quals:
                continue
            base_num[nuc] += 1
            base_num['total_' + nuc.upper()] += 1
            if nuc in 'ATGCatgc':
                sample['proper_read_depth'] += 1
            if nuc in 'ATGC':
                sample['proper_read_depth_plus'] += 1
            if nuc in 'atgc':
                sample['proper_read_depth_minus'] += 1

        #
        # InsDel
        # Beta distribution
        #
        # scipy.special.betaincinv is the supported replacement of the
        # deprecated scipy.special.btdtri and returns the same quantile.
        #
        for indel_type in ('+', '-'):
            if indel_type not in indel:
                continue
            for key in indel[indel_type].keys():
                bases = key.split('\t')[3]
                plus_count = indel[indel_type][key]['+']
                minus_count = indel[indel_type][key]['-']
                indel_number = minus_count + plus_count
                depth_indel = sample['proper_read_depth_indel']

                entry = sample['indel'][indel_type][bases]
                entry['+'] = plus_count
                entry['-'] = minus_count
                entry['both'] = indel_number
                entry['0.1'] = scipy.special.betaincinv(
                    indel_number + 1,
                    float(depth_indel) - indel_number + 1, 0.1)
                entry['mid'] = (indel_number + 1) / (float(depth_indel) + 2)
                entry['0.9'] = scipy.special.betaincinv(
                    indel_number + 1,
                    int(depth_indel) - indel_number + 1, 0.9)
                entry['s_ratio'] = float(plus_count) / indel_number

        #
        # skip if reference is 'N'
        #
        if ref_base_U != 'N' and int(sample['proper_read_depth']) >= min_depth:
            ref_num = base_num['total_' + ref_base_U]

            # Most frequent non-reference base of this sample.
            mis_num = 0
            for nuc in ('A', 'C', 'G', 'T'):
                sample[nuc] = base_num[nuc]
                sample[nuc.lower()] = base_num[nuc.lower()]
                total_key = 'total_' + nuc
                sample[total_key] = base_num[total_key]

                if nuc != ref_base_U and base_num[total_key] > mis_num:
                    mis_num = base_num[total_key]
                    mis_base_U = nuc

            # The second sample is always scored on the base recorded for
            # the first sample, so that both describe the same allele.
            if (data_id == const.POS_DATA2
                    and data_pair[const.POS_DATA1]['mis_base']):
                mis_num = base_num['total_' +
                                   data_pair[const.POS_DATA1]['mis_base']]
                mis_base_U = data_pair[const.POS_DATA1]['mis_base']

            #
            # Calculate ratio
            #
            sample['mis_rate'] = mis_num / float(sample['proper_read_depth'])
            sample['mis_base'] = mis_base_U
            if mis_base_U and (base_num[mis_base_U] +
                               base_num[mis_base_U.lower()]) > 0:
                sample['s_ratio'] = float(base_num[mis_base_U]) / (
                    base_num[mis_base_U] + base_num[mis_base_U.lower()])

            #
            # Beta distribution for SNV
            #
            sample['0.1'] = scipy.special.betaincinv(
                mis_num + 1, ref_num + 1, 0.1)
            sample['mid'] = (mis_num + 1) / float(ref_num + mis_num + 2)
            sample['0.9'] = scipy.special.betaincinv(
                mis_num + 1, ref_num + 1, 0.9)

            sample['mis_num'] = mis_num

    #
    # Fisher (two-sample case only)
    #
    if data_pair[const.POS_COUNT] == 2:
        first = data_pair[const.POS_DATA1]
        second = data_pair[const.POS_DATA2]

        #
        # SNV: 'mid' is only present when the SNV statistics were computed.
        #
        if (ref_base_U
                and first['mis_base']
                and 'mid' in first
                and 'mid' in second
                and 'proper_read_depth' in first
                and 'proper_read_depth' in second):
            ref_key = 'total_' + ref_base_U
            mis_key = 'total_' + first['mis_base']
            odds_ratio, fisher_pvalue = fisher(
                ((int(first[ref_key]), int(second[ref_key])),
                 (int(first[mis_key]), int(second[mis_key]))),
                alternative='two-sided')

            data_pair[const.POS_FISHER_SNV] = math_log_fisher_pvalue(
                fisher_pvalue)

        #
        # INDEL
        #
        if 'indel' in first:
            for indel_type in first['indel']:
                for bases in first['indel'][indel_type].keys():

                    # An indel never seen in the second sample has no integer
                    # count yet: record it explicitly as zero.
                    if not isinstance(
                            second['indel'][indel_type][bases]['both'], int):
                        second['indel'][indel_type][bases]['both'] = 0
                        second['indel'][indel_type][bases]['+'] = 0
                        second['indel'][indel_type][bases]['-'] = 0

                    first_both = first['indel'][indel_type][bases]['both']
                    second_both = second['indel'][indel_type][bases]['both']
                    first_depth = first['proper_read_depth_indel']
                    second_depth = second['proper_read_depth_indel']

                    # Both table rows must have a non-negative remainder.
                    if not (second_depth >= second_both
                            and first_depth >= first_both):
                        continue

                    odds_ratio, fisher_pvalue = fisher(
                        ((first_depth - first_both, first_both),
                         (second_depth - second_both, second_both)),
                        alternative='two-sided')

                    if fisher_pvalue is not None:
                        if indel_type == '+':
                            fisher_pos = const.POS_FISHER_INS
                        else:
                            fisher_pos = const.POS_FISHER_DEL

                        score = bases + ':' + str(
                            math_log_fisher_pvalue(fisher_pvalue))
                        if data_pair[fisher_pos] == 'N:1.0':
                            data_pair[fisher_pos] = score
                        else:
                            data_pair[fisher_pos] += ',' + score

    return data_pair
Beispiel #5
0
def fsometests(sumotudict, ftable, iddict, sumalldict, otu_list, whattest="man-y"):
    """Compute one p-value per OTU for a comparison of two sample groups.

    The statistical functions are the module-level callables ``fisher``,
    ``chisq``, ``ttest``, ``man`` and ``wilc``; each is expected to return a
    sequence whose item ``[1]`` is the p-value.

    Args:
        sumotudict: mapping ``otu -> [count in group 1, count in group 2]``
            (used by ``"chi2"`` only).
        ftable: table object providing ``get_value_by_ids(otu, sample_id)``
            (used by the two-sample tests only).
        iddict: mapping whose first two values are the sample-id lists of
            group 1 and group 2 (used by the two-sample tests only).
        sumalldict: mapping whose first two values are the total counts of
            group 1 and group 2.
        otu_list: OTU ids to test; its length is the denominator of the
            progress counter.
        whattest: which test to run.

            * ``"chi2"``   -- 2x2 table ``[counts, totals - counts]`` per
              entry of ``sumotudict``; ``fisher`` when either count is <= 5,
              otherwise ``chisq(..., lambda_="log-likelihood")``.
            * ``"ttest"``  -- ``ttest`` on per-sample proportions; a NaN
              p-value is replaced by 1.
            * ``"man-y"``  -- ``man`` on per-sample proportions.
            * ``"kruscal"`` (sic) -- ``wilc`` on per-sample proportions.

    Returns:
        ``OrderedDict`` mapping OTU id to p-value, in processing order.

    Raises:
        ValueError: if ``whattest`` is not one of the names above.

    Progress is written to ``sys.stdout`` on a single line using ``\\r``.
    """
    leno = len(otu_list)
    # list(...) keeps indexing and np.array() working on Python 3 views.
    group_totals = list(sumalldict.values())
    pdict = OrderedDict()

    def report(label, done):
        # Overwrite the same console line with the current progress.
        sys.stdout.write('\r')
        sys.stdout.write("{} {}/{}".format(label, done, leno))
        sys.stdout.flush()

    if whattest == "chi2":
        for done, (otu, counts) in enumerate(sumotudict.items(), 1):
            # Reads of each group that do NOT belong to this OTU.
            other = list(map(operator.sub, group_totals, counts))
            table = np.array([counts, other])
            # Small counts: use the exact test instead of the G-test.
            if counts[1] <= 5 or counts[0] <= 5:
                p = fisher(table)[1]
            else:
                p = chisq(table, lambda_="log-likelihood")[1]
            pdict[otu] = p
            report("fisher", done)
        return pdict

    # Resolve the test function lazily so that only the selected name has
    # to exist at module level.
    nan_to_one = False
    if whattest == "ttest":
        test_func, label = ttest, "ttest"
        nan_to_one = True
    elif whattest == "man-y":
        test_func, label = man, "man-y"
    elif whattest == "kruscal":
        test_func, label = wilc, "kruskal"
    else:
        raise ValueError(
            "unknown whattest {!r}; expected 'chi2', 'ttest', 'man-y' or "
            "'kruscal'".format(whattest))

    sample_groups = list(iddict.values())
    for done, otu in enumerate(otu_list, 1):
        # Per-sample proportions; truediv avoids Python 2 integer division
        # turning every proportion of integer counts into 0.
        a = [operator.truediv(ftable.get_value_by_ids(otu, sample_id),
                              group_totals[0])
             for sample_id in sample_groups[0]]
        b = [operator.truediv(ftable.get_value_by_ids(otu, sample_id),
                              group_totals[1])
             for sample_id in sample_groups[1]]

        p = test_func(a, b)[1]
        if nan_to_one and p != p:  # NaN is the only value unequal to itself
            p = 1
        pdict[otu] = p
        report(label, done)

    if whattest == "kruscal":
        sys.stdout.write('\r')

    return pdict
def Pileup_out( mpileup, w, min_depth, min_variant_read, compare ):

    #
    # mpileup format
    #
    # chr1 272 T 24  ,.$.....,,.,.,...,,,.,..^+. <<<+;<<<<<<<<<<<=<;<;7<&
    #
    # 0 chromosome,
    # 1 1-based coordinate,
    # 2 reference base,
    # 3 the number of reads covering the site (1)
    # 4 read bases (1)
    # 5 base qualities (1)
    # 6 the number of reads covering the site (2)
    # 7 read bases (2)
    # 8 base qualities (2)
    #
    global target
    global remove_chr
    global filter_quals

    #
    # Prepare mpileup data
    #
    mp_list = str( mpileup.translate( None, '\n' ) ).split( '\t' )
    mp_list_len = len( mp_list )
    ref_base_U = mp_list[ 2 ].upper()
    coordinate = mp_list[ 0:3 ]

    #
    # skip if depth is 0
    #
    if mp_list[ 3 ] == '0' or ( mp_list_len > 6 and mp_list[ 6 ] == '0' ):
    # if int(mp_list[ 3 ]) < min_depth or ( mp_list_len > 6 and int(mp_list[ 6 ]) < min_depth ):
        return None

    ref_base_plus  = mp_list[ 4 ].count('.')
    ref_base_minus = mp_list[ 4 ].count(',')

    ref_base_count = mp_list[ 4 ].count('.') + mp_list[ 4 ].count(',')
    ins_base_count = mp_list[ 4 ].count('+')
    del_base_count = mp_list[ 4 ].count('-')
    if (int(mp_list[ 3 ]) - ref_base_count + ins_base_count + del_base_count) < min_variant_read:
        return None

    if ref_base_U not in 'ACGTN': return None
    #
    # data_pair IDs
    # POS_CHR = 0
    # POS_COORD = 1
    # POS_REF = 2
    # POS_DATA1 = 3
    # POS_DATA2 = 4
    # POS_FISHER_SNV = 5
    # POS_FISHER_INS = 6
    # POS_FISHER_DEL = 7
    # POS_COUNT = 8
    #
    data_pair = [ mp_list[ 0 ],
                  int( mp_list[ 1 ] ),
                  mp_list[ 2 ],
                  { 'mis_base': ref_base_U, 'mis_rate': 0, 'proper_read_depth': 0, 'proper_read_depth_plus': 0, 'proper_read_depth_minus': 0, 'proper_read_depth_indel': 0, 'proper_read_depth_indel_plus': 0, 'proper_read_depth_indel_minus': 0,'indel': AutoVivification() },
                  { 'mis_base': ref_base_U, 'mis_rate': 0, 'proper_read_depth': 0, 'proper_read_depth_plus': 0, 'proper_read_depth_minus': 0, 'proper_read_depth_indel': 0, 'proper_read_depth_indel_plus': 0, 'proper_read_depth_indel_minus': 0,'indel': AutoVivification() },
                  1.0,
                  'N:1.0',
                  'N:1.0',
                  0 ]


    #
    # Loop for 2 bam file case
    #
    if compare:
        data_pair[ POS_COUNT ] = 2
        input_list = [ ( POS_DATA1, mp_list[ 3 ], mp_list[ 4 ], mp_list[ 5 ] ),
                       ( POS_DATA2, mp_list[ 6 ], mp_list[ 7 ], mp_list[ 8 ] ) ]
    else:
        data_pair[ POS_COUNT ] = 1
        input_list = [ ( POS_DATA1, mp_list[ 3 ], mp_list[ 4 ], mp_list[ 5 ] ) ]

    #
    # position id,
    # mpileup output 4th row(number of read covering the site),
    # 5th row(read bases),
    # 6th row(base quality)
    #
    for data_id, depth, read_bases, qual_list in input_list:

        indel = AutoVivification()

        #
        # Look for deletion/insertion and save info in 'indel' dictionary
        #
        #   ([\+\-])[0-9]+[ACGTNacgtn]+
        #
        # m.group(1): + or - (deletion/insertion)
        # m.group(2): number of deletion/insertion
        # m.group(3): nucleotides
        #
        deleted = 0
        iter = target.finditer( read_bases )
        for m in iter:
            site = m.start()
            type = m.group( 1 )
            num = m.group( 2 )
            bases = m.group( 3 )[ 0:int( num ) ]
            if bases.islower():
                strand = ( '-', '+' )
            else:
                strand = ( '+', '-' )
    
            key = '\t'.join( coordinate + [ bases.upper() ] )
            if type in indel and key in indel[ type ]:
                indel[ type ][ key ][ strand[ 0 ] ] += 1
            else:
                indel[ type ][ key ][ strand[ 0 ] ] = 1
                indel[ type ][ key ][ strand[ 1 ] ] = 0
    
            read_bases = read_bases[ 0:site - deleted ] + read_bases[ site + int(num) + len( num ) + 1 - deleted: ]
            deleted += 1 + len( num ) + int( num )
    
        #
        # Remove '^.' and '$'
        #
        read_bases = remove_chr.sub( '', read_bases )
        read_bases = read_bases.translate( None, '$' ) 

        #
        # Error check
        #
        if len( read_bases ) != len( qual_list ):
            logging.error( "mpileup data is not good: {0}, {1}".format( mpileup, read_bases ) )
            return None

        #
        # Count mismatch
        #
        mis_base_U = None
        if int( depth ) >= min_depth:

            read_bases = read_bases.replace( '.', ref_base_U )
            read_bases = read_bases.replace( ',', ref_base_U.lower() )

            base_num = {
                "total_A": 0,
                "total_C": 0,
                "total_G": 0,
                "total_T": 0,
                "total_N": 0,
                "A": 0,
                "C": 0,
                "G": 0,
                "T": 0,
                "N": 0,
                "a": 0,
                "c": 0,
                "g": 0,
                "t": 0,
                "n": 0
            }

            #
            # Set data
            #
            data_pair[ data_id ][ 'bases' ] = read_bases
            data_pair[ data_id ][ 'depth' ] = int( depth )

            #
            # Count number
            #
            for nuc, qual in zip( read_bases, qual_list ):
                if nuc in 'ATGCNacgtn':
                    data_pair[ data_id ][ 'proper_read_depth_indel' ] += 1 
                if nuc in 'ATGCN':
                    data_pair[ data_id ][ 'proper_read_depth_indel_plus' ] += 1 
                if nuc in 'acgtn':
                    data_pair[ data_id ][ 'proper_read_depth_indel_minus' ] += 1 
                if nuc in 'ATGCNacgtn' and not ( qual in filter_quals) :
                    base_num[ nuc ] += 1
                    base_num[ 'total_' + nuc.upper() ] += 1
                if nuc in 'ATGCatgc' and not ( qual in filter_quals):
                    data_pair[ data_id ][ 'proper_read_depth' ] += 1 
                if nuc in 'ATGC' and not ( qual in filter_quals):
                    data_pair[ data_id ][ 'proper_read_depth_plus' ] += 1 
                if nuc in 'atgc' and not ( qual in filter_quals):
                    data_pair[ data_id ][ 'proper_read_depth_minus' ] += 1 

            #
            # InsDel
            # Beta distribution
            #
            for type in ( '+', '-' ):
                if type in indel:
                    for key in indel[ type ].keys():
                        bases = key.split( '\t' )[ 3 ]
                        data_pair[ data_id ][ 'indel' ][ type ][ bases ][ '+' ] = indel[ type ][ key ][ '+' ]
                        data_pair[ data_id ][ 'indel' ][ type ][ bases ][ '-' ] = indel[ type ][ key ][ '-' ]
                        indel_number = \
                        data_pair[ data_id ][ 'indel' ][ type ][ bases ][ 'both' ] = ( indel[ type ][ key ][ '-' ] +
                                                                                       indel[ type ][ key ][ '+' ] )
                        data_pair[ data_id ][ 'indel' ][ type ][ bases ][ '0.1' ] = \
                            scipy.special.btdtri( indel_number + 1, float( data_pair[ data_id ][ 'proper_read_depth_indel' ] ) - indel_number + 1, 0.1 )
                        data_pair[ data_id ][ 'indel' ][ type ][ bases ][ 'mid' ] = \
                            ( indel_number + 1 ) / ( float( data_pair[ data_id ][ 'proper_read_depth_indel' ] ) + 2 )
                        data_pair[ data_id ][ 'indel' ][ type ][ bases ][ '0.9' ] = \
                            scipy.special.btdtri( indel_number + 1, int( data_pair[ data_id ][ 'proper_read_depth_indel' ] ) - indel_number + 1, 0.9 )
                        data_pair[ data_id ][ 'indel' ][ type ][ bases ][ 's_ratio' ] = \
                            float( indel[ type ][ key ][ '+' ] ) / data_pair[ data_id ][ 'indel' ][ type ][ bases ][ 'both' ]

            #
            # skip if reference is 'N'
            #
            if ref_base_U != 'N' and int( data_pair[ data_id ][ 'proper_read_depth' ] ) >= min_depth:
                ref_num = base_num[ 'total_' + ref_base_U ]
                    
                mis_num = 0
                for nuc in ( 'A', 'C', 'G', 'T' ):
                    data_pair[ data_id ][ nuc ] = base_num[ nuc ]
                    tmp = nuc.lower()
                    data_pair[ data_id ][ tmp ] = base_num[ tmp ]
                    tmp = 'total_' + nuc
                    data_pair[ data_id ][ tmp ] = base_num[ tmp ]

                    if nuc != ref_base_U:
                        if base_num[ tmp ] > mis_num:
                            mis_num = base_num[ tmp ]
                            mis_base_U = nuc

                if data_id == POS_DATA2 and data_pair[ POS_DATA1 ][ 'mis_base' ]:
                    mis_num = base_num[ 'total_' + data_pair[ POS_DATA1 ][ 'mis_base' ] ]
                    mis_base_U = data_pair[ POS_DATA1 ][ 'mis_base' ]

            ####
                #
                # Calculate ratio
                #
                data_pair[ data_id ][ 'mis_rate' ] = mis_num / float( data_pair[ data_id ][ 'proper_read_depth' ] )
                data_pair[ data_id ][ 'mis_base' ] = mis_base_U
                if mis_base_U and ( base_num[ mis_base_U ] + base_num[ mis_base_U.lower() ] ) > 0:
                    data_pair[ data_id ][ 's_ratio' ]  = float( base_num[ mis_base_U ] ) / ( base_num[ mis_base_U ] + base_num[ mis_base_U.lower() ] )
                # else:
                #    data_pair[ data_id ][ 's_ratio' ]  = float(0)

                #
                # Beta distribution for SNV
                #
                data_pair[ data_id ][ '0.1' ] = scipy.special.btdtri( mis_num + 1, ref_num + 1, 0.1 )
                data_pair[ data_id ][ 'mid' ] = ( mis_num + 1 ) / float( ref_num + mis_num + 2 )
                data_pair[ data_id ][ '0.9' ] = scipy.special.btdtri( mis_num + 1, ref_num + 1, 0.9 )

                data_pair[ data_id ][ 'mis_num' ] = mis_num
        
        ###
    #
    # Fisher
    #
    # SNV
    #
    if ( data_pair[ POS_COUNT ] == 2 and
         ref_base_U and
         data_pair[ POS_DATA1 ][ 'mis_base' ] and
         'mid' in data_pair[ POS_DATA1 ].keys() and
         'mid' in data_pair[ POS_DATA2 ].keys() and
         'proper_read_depth' in data_pair[ POS_DATA1 ].keys() and
         'proper_read_depth' in data_pair[ POS_DATA2 ].keys() 
       ):
        odds_ratio, fisher_pvalue = fisher(
                    ( ( int( data_pair[ POS_DATA1 ][ 'total_' + ref_base_U ] ),
                        int( data_pair[ POS_DATA2 ][ 'total_' + ref_base_U ] ) ),
                      ( int( data_pair[ POS_DATA1 ][ 'total_' + data_pair[ POS_DATA1 ][ 'mis_base' ] ] ),
                        int( data_pair[ POS_DATA2 ][ 'total_' + data_pair[ POS_DATA1 ][ 'mis_base' ] ] ) ) ),
                    alternative='two-sided'
                    )

        data_pair[ POS_FISHER_SNV ] = math_log_fisher_pvalue(fisher_pvalue)

    #
    # INDEL
    #
    if ( data_pair[ POS_COUNT ] == 2 and 'indel' in data_pair[ POS_DATA1 ]
       ):
        fisher_pvalue = None
        for type in data_pair[ POS_DATA1 ][ 'indel' ]:
            for bases in data_pair[ POS_DATA1 ][ 'indel' ][ type ].keys():
              
                # if type in data_pair[ POS_DATA1 ][ 'indel' ] and bases in data_pair[ POS_DATA1 ][ 'indel' ][ type ]:

                if not isinstance( data_pair[ POS_DATA2 ][ 'indel' ][ type ][ bases ][ 'both' ], int ):
                    data_pair[ POS_DATA2 ][ 'indel' ][ type ][ bases ][ 'both' ] = 0
                    data_pair[ POS_DATA2 ][ 'indel' ][ type ][ bases ][ '+' ] = 0
                    data_pair[ POS_DATA2 ][ 'indel' ][ type ][ bases ][ '-' ] = 0

                if (data_pair[ POS_DATA2 ][ 'proper_read_depth_indel' ] >= data_pair[ POS_DATA2 ][ 'indel' ][ type ][ bases ][ 'both' ] and
                    data_pair[ POS_DATA1 ][ 'proper_read_depth_indel' ] >= data_pair[ POS_DATA1 ][ 'indel' ][ type ][ bases ][ 'both' ]
                    ):
                    odds_ratio, fisher_pvalue = fisher(
                        ( ( data_pair[ POS_DATA1 ][ 'proper_read_depth_indel' ] - data_pair[ POS_DATA1 ][ 'indel' ][ type ][ bases ][ 'both' ],
                            data_pair[ POS_DATA1 ][ 'indel' ][ type ][ bases ][ 'both' ] ),
                          ( data_pair[ POS_DATA2 ][ 'proper_read_depth_indel' ] - data_pair[ POS_DATA2 ][ 'indel' ][ type ][ bases ][ 'both' ],
                            data_pair[ POS_DATA2 ][ 'indel' ][ type ][ bases ][ 'both' ]) ),
                        alternative='two-sided' )
        
                    if fisher_pvalue != None:
                        if type == '+':
                            data_id = POS_FISHER_INS
                        elif type == '-':
                            data_id = POS_FISHER_DEL

                        if data_pair[ data_id ] == 'N:1.0':
                            data_pair[ data_id ] = bases + ':' + str( math_log_fisher_pvalue(fisher_pvalue) )
                        else:
                            data_pair[ data_id ] += ',' + bases + ':' + str( math_log_fisher_pvalue(fisher_pvalue) )


    return data_pair
 def fischers(db, antc, cons):
     """Run ``fisher`` on the first table built for the given arguments.

     Forwards ``db``, ``antc`` and ``cons`` unchanged to
     ``raMetricas.__tbContingencia`` and hands the first element of its
     result to ``fisher``.

     Returns:
         Element ``[1]`` of whatever ``fisher`` returns.

     NOTE(review): presumably ``antc``/``cons`` are the antecedent and
     consequent of an association rule, and element ``[1]`` is the p-value
     of Fisher's exact test -- confirm against ``__tbContingencia`` and the
     import that binds ``fisher``.
     """
     # Only the first item produced by the contingency helper is used here.
     tables = raMetricas.__tbContingencia(db, antc, cons)
     observed = tables[0]

     # Index rather than unpack so any sequence-like result keeps working.
     outcome = fisher(observed)
     return outcome[1]