Ejemplo n.º 1
0
cdscns_string = ""
outcns = open(options.outfileprename + "_cns.fa", 'w')
outaa = open(options.outfileprename + "_aa.fa", 'w')
outcdscns = open(options.outfileprename + "_cdscns.fa", 'w')
cdsmap = {}
if __name__ == '__main__':
    if options.genomedepth != None:
        depthfile = Util.GATK_depthfile(options.genomedepth,
                                        options.genomedepth + ".index")
        species_idx = depthfile.title.index("Depth_for_" + options.species)
        Considerdepth = True
    else:
        Considerdepth = False
        depthfile = None
        species_idx = -1
    vcfpop = VCFutil.VCF_Data(options.variants)  # new a class
    RefSeqMap, currentChromNO, nextChromNO = Util.getRefSeqMap(
        refFastafilehander=reffa)
    print(currentChromNO, nextChromNO)
    cns_string += currentChromNO + "\n"
    gtfMap = Util.getGtfMap(options.gtffile)

    lastposofdepthfp = 0  #because this time RefSeqMap[0] is 0
    vcfchrom = "begin"
    while currentChromNO != "end of the reffile":
        print("\t\twhile loop:", currentChromNO)
        currentBaselocinGenome = RefSeqMap[currentChromNO][0] + 1
        #        statue = depthfile.set_depthfilefp(currentChromNO, currentBaselocinGenome, lastposofdepthfp)
        #        depth_chrom, depth_pos, depth_linelist,lastposofdepthfp = depthfile.getnextposline()
        if currentChromNO in gtfMap:
            gtfListOfCurrentChrom = gtfMap[currentChromNO]
def _sum_bam_depth(samfiles, chrom, pos):
    """Return the read depth at 1-based position ``pos`` summed over BAMs.

    ``samfiles`` is an iterable of open pysam alignment handles; each one is
    queried for the single-base window ``[pos - 1, pos)``.
    """
    total = 0
    for samfile in samfiles:
        # count_coverage yields one array per nucleotide; the window is one
        # base wide, so element 0 of each array is the count at ``pos``.
        for per_base in samfile.count_coverage(chrom, pos - 1, pos):
            total += per_base[0]
    return total


def _sum_pool_allele_depths(format_field, samples):
    """Sum the first two ``AD`` values (ref, alt) over all called samples.

    ``format_field`` is the VCF FORMAT column (e.g. ``GT:AD:DP:GQ:PL``) and
    ``samples`` the per-sample genotype columns.  Returns ``(refdep, altdep)``.
    """
    ad_idx = format_field.split(":").index("AD")
    refdep = 0
    altdep = 0
    for sample in samples:
        fields = sample.split(":")
        if len(fields) == 1:  # uncalled genotype such as "./."
            continue
        ad_depth = fields[ad_idx].split(",")
        try:
            refdep += int(ad_depth[0])
            altdep += int(ad_depth[1])
        except ValueError:
            # Non-numeric depth (e.g. "."): report the sample and move on.
            print(sample, end="|")
    return refdep, altdep


def _population_alt_freq(vcf_record, vcf_entry, method, chrom, pos,
                         min_bam_depth, min_an=None, min_pool_depth=0,
                         verbose=False):
    """Return the ALT allele frequency of one population, or None to skip it.

    vcf_record     -- aligned record for this population (``[0]`` INFO,
                      ``[1]`` FORMAT, ``[2]`` sample columns) or None when the
                      population has no variant call at this position.
    vcf_entry      -- ``[VCF_Data, bam1, bam2, ...]`` for this population.
    method         -- "indvd" (use INFO ``AF=``) or "pool" (use summed AD).
    min_bam_depth  -- BAM depth needed to treat a missing call as AF == 0.
    min_an         -- if given, "indvd" records with ``AN`` below it are
                      skipped.
    min_pool_depth -- "pool" records with fewer total AD reads are skipped.
    verbose        -- print the reason when a population is skipped for lack
                      of data.
    """
    if vcf_record is None:
        if len(vcf_entry) == 1:  # only the VCF object, no BAM to consult
            if verbose:
                print("no depth file")
            return None
        if _sum_bam_depth(vcf_entry[1:], chrom, pos) >= min_bam_depth:
            # Well covered but not called as variant: fixed for the reference
            # allele in this population.
            return 0
        return None

    if method == "indvd":
        info = vcf_record[0]
        alt_freq = float(re.search(r"AF=([\d\.]+);", info).group(1))
        if min_an is not None:
            allele_number = float(re.search(r"AN=([\d\.]+);", info).group(1))
            if allele_number < min_an:
                return None
        return alt_freq

    # method == "pool": estimate the frequency from pooled read depths.
    refdep, altdep = _sum_pool_allele_depths(vcf_record[1], vcf_record[2])
    if refdep == 0 and altdep == 0:
        if verbose:
            print("no sample available in this pop")
        return None
    if altdep + refdep < min_pool_depth:
        return None
    return altdep / (altdep + refdep)


def make_freq_xaxisKEY_yaxisseqVALUERelation(a):
    """Collect reference-population DAF values binned by target-population DAF.

    ``a`` is a sequence (packed into a single argument; presumably so the
    function can be mapped over a worker pool -- verify against the caller):

    a[0] -- path of a whitespace-separated file, one ``chrID length`` per line
    a[1] -- name of the SNP table queried for every aligned position
    a[2] -- list of VCF/BAM config files of the target populations
    a[3] -- list of VCF/BAM config files of the reference populations
    a[4] -- number of target individuals; the interval (0, 1] is cut into
            ``2 * a[4]`` frequency bins

    Each config file holds a ``vcffilename=<path>`` line, optionally followed
    by BAM paths (one per line) used to look up depth where the VCF has no
    call.  The VCF path must contain ``indvd`` or ``pool``.

    Returns a deep copy of a dict mapping ``(bin_low, bin_high)`` to the list
    of mean reference-population derived-allele frequencies of all SNPs whose
    mean target-population DAF satisfies ``bin_low < DAF <= bin_high``.

    Raises ValueError when a config file lists a BAM before any
    ``vcffilename=`` line or contains no ``vcffilename=`` line at all.
    """
    chromlistfilename = a[0]
    topleveltablename = a[1]
    targetpopvcffile_withdepthconfig = a[2]
    refpopvcffile_withdepthconfig = a[3]
    numberofindvdoftargetpop_todividintobin = int(a[4])
    mindepthtojudefixed = 20

    # ---- build the frequency bins ------------------------------------
    d_increase = round(
        fractions.Fraction(1, 2 * numberofindvdoftargetpop_todividintobin),
        11)
    freq_xaxisKEY_yaxisVALUE_seq_list = {}
    minvalue = 0.000000000000
    for _ in range(numberofindvdoftargetpop_todividintobin * 2 - 1):
        # The small epsilon keeps the upper edge inclusive despite the
        # rounding of d_increase.
        freq_xaxisKEY_yaxisVALUE_seq_list[(minvalue, minvalue + d_increase +
                                           0.00000000004)] = []
        minvalue += d_increase
    freq_xaxisKEY_yaxisVALUE_seq_list[(minvalue, 1)] = []
    # The key set never changes after this point, so sort it only once.
    sorted_bins = sorted(freq_xaxisKEY_yaxisVALUE_seq_list)
    for bin_low, bin_high in sorted_bins:
        print(str(bin_low), str(bin_high))

    print("process ID:", os.getpid(), "start", chromlistfilename)
    dbvariantstools = dbm.DBTools(Util.ip, Util.username, Util.password,
                                  Util.vcfdbname)

    # ---- chromosome list ---------------------------------------------
    chromlist = []
    with open(chromlistfilename, "r") as chromlistfile:
        for chrrow in chromlistfile:
            chrrowlist = chrrow.split()
            if not chrrowlist:  # tolerate blank lines
                continue
            chromlist.append((chrrowlist[0], int(chrrowlist[1])))

    # ---- population configs ------------------------------------------
    vcfnamelist = []
    listofpopvcfmapOfAChr = []
    methodlist = []
    # vcfname -> [VCF_Data, bam_handle_1, bam_handle_2, ...]
    # NOTE(review): the VCF and BAM handles opened here are never closed.
    vcfnameKEY_vcfobj_pyBAMfilesVALUE = {}
    N_of_targetpop = len(targetpopvcffile_withdepthconfig)
    N_of_refpop = len(refpopvcffile_withdepthconfig)
    for vcfconfigfilename in (targetpopvcffile_withdepthconfig +
                              refpopvcffile_withdepthconfig):
        listofpopvcfmapOfAChr.append({})
        vcfname = None
        with open(vcfconfigfilename, "r") as vcfconfig:
            for line in vcfconfig:
                line = line.strip()
                vcffilename_obj = re.search(r"vcffilename=(.*)", line)
                if vcffilename_obj is not None:
                    vcfname = vcffilename_obj.group(1).strip()
                    vcfnamelist.append(vcfname)
                    vcfnameKEY_vcfobj_pyBAMfilesVALUE[vcfname] = [
                        VCFutil.VCF_Data(vcfname)
                    ]
                elif line:
                    if vcfname is None:
                        # Without this check the BAM would be attached to the
                        # VCF of the previously parsed config file.
                        raise ValueError(
                            "BAM listed before any 'vcffilename=' line in " +
                            vcfconfigfilename)
                    vcfnameKEY_vcfobj_pyBAMfilesVALUE[vcfname].append(
                        pysam.Samfile(line, 'rb'))
        if vcfname is None:
            raise ValueError("no 'vcffilename=' line in " + vcfconfigfilename)
        if re.search(r"indvd[^/]+", vcfname) is not None:
            methodlist.append("indvd")
        elif re.search(r"pool[^/]+", vcfname) is not None:
            methodlist.append("pool")
        else:
            print("vcfname must with 'pool' or 'indvd'")
            exit(-1)

    # ---- walk the chromosomes ----------------------------------------
    for currentchrID, _chr_len in chromlist:
        if not any(currentchrID in
                   vcfnameKEY_vcfobj_pyBAMfilesVALUE[name][0].VcfIndexMap
                   for name in vcfnamelist):
            print("this chr doesn't exist in anypop")
            continue
        for vcfobj_idx, name in enumerate(vcfnamelist):
            listofpopvcfmapOfAChr[vcfobj_idx] = {
                currentchrID:
                vcfnameKEY_vcfobj_pyBAMfilesVALUE[name][0].getVcfListByChrom(
                    currentchrID)
            }
        target_ref_SNPs = Util.alinmultPopSnpPos(listofpopvcfmapOfAChr, "o")
        for snp_aligned in target_ref_SNPs[currentchrID]:
            # snp_aligned: [pos, ref, alt, pop0_record, pop1_record, ...]
            if len(snp_aligned[1]) != 1 or len(snp_aligned[2]) != 1:
                print("multple allele", snp_aligned)
                continue
            curpos = int(snp_aligned[0])
            # NOTE(review): this query is assembled by string concatenation;
            # table name and chromosome ID must come from trusted input.
            snp = dbvariantstools.operateDB(
                "select",
                "select * from " + topleveltablename + " where chrID='" +
                currentchrID + "' and snp_pos=" + str(curpos) + "")
            if not snp:
                print(currentchrID, curpos, "snp not find in db,skip")
                continue

            # Judge the ancestral allele from the "ref,alt" depth pair stored
            # in column 9: the allele with zero reads is taken as derived,
            # provided the other one reaches the minimum depth.
            fanyadepthlist = re.split(r",", snp[0][9])
            A_base_idx = None
            if len(fanyadepthlist) == 2:
                if (int(fanyadepthlist[1]) >= mindepthtojudefixed
                        and fanyadepthlist[0].strip() == "0"):
                    A_base_idx = 1
                elif (int(fanyadepthlist[0]) >= mindepthtojudefixed
                      and fanyadepthlist[1].strip() == "0"):
                    A_base_idx = 0
            if A_base_idx is None:
                print("skip snp", snp[0][1], snp[0][7:])
                continue

            # Three-base context: flanking bases from column 5 around the
            # ancestral allele (column 3 = ref, column 4 = alt).
            context = snp[0][5].strip()
            ancestrallcontext = (context[0].upper() +
                                 snp[0][3 + A_base_idx].strip().upper() +
                                 context[2].upper())
            if "CG" in ancestrallcontext or "GC" in ancestrallcontext:
                print("skip CG site", ancestrallcontext)
                continue

            # ---- x-axis: mean DAF over the target populations --------
            countedAF = 0
            target_DAF_sum = 0
            for i in range(3, N_of_targetpop + 3):
                AF = _population_alt_freq(
                    snp_aligned[i],
                    vcfnameKEY_vcfobj_pyBAMfilesVALUE[vcfnamelist[i - 3]],
                    methodlist[i - 3],
                    currentchrID,
                    curpos,
                    mindepthtojudefixed,
                    verbose=True)
                if AF is None:
                    continue
                target_DAF_sum += (1 - AF) if A_base_idx == 0 else AF
                countedAF += 1
            if countedAF == 0:
                print(
                    "skip this snp,because it fiexd as ancestral or no covered in this pos in target pops",
                    snp_aligned, snp)
                continue
            target_DAF = target_DAF_sum / countedAF

            # ---- y-axis: mean DAF over the reference populations -----
            countedAF = 0
            rer_DAF_sum = 0
            for i in range(3 + N_of_targetpop,
                           N_of_refpop + N_of_targetpop + 3):
                AF = _population_alt_freq(
                    snp_aligned[i],
                    vcfnameKEY_vcfobj_pyBAMfilesVALUE[vcfnamelist[i - 3]],
                    methodlist[i - 3],
                    currentchrID,
                    curpos,
                    mindepthtojudefixed,
                    min_an=5,
                    min_pool_depth=10)
                if AF is None:
                    continue
                rer_DAF_sum += (1 - AF) if A_base_idx == 0 else AF
                countedAF += 1
            if countedAF == 0 or rer_DAF_sum == 0:
                print(
                    "skip this snp,because it  no covered in this pos in ref pops",
                    snp_aligned, snp)
                continue

            # ---- file the SNP under its target-DAF bin ---------------
            for bin_low, bin_high in sorted_bins:
                if bin_low < target_DAF <= bin_high:
                    freq_xaxisKEY_yaxisVALUE_seq_list[(
                        bin_low, bin_high)].append(rer_DAF_sum / countedAF)
                    break

    print("process ID:", os.getpid(), "done")
    return copy.deepcopy(freq_xaxisKEY_yaxisVALUE_seq_list)
Ejemplo n.º 3
0
sitesingap = open(sys.argv[5], 'w')
if __name__ == '__main__':
    win = Util.Window()
    i = 0
    interferf = open(sys.argv[5] + ".InterferingTEMP", 'w')
    for gapregion in gapf:
        i += 1
        filledsites = []
        gaplist = re.split(r"\s+", gapregion.strip())
        if not os.path.exists(sys.argv[5] + "temp" + str(i) + ".recode.vcf"):
            os.system(
                vcftools + " --vcf " + sys.argv[2] +
                " --recode --recode-INFO-all --remove-indv DSW33216 --chr " +
                gaplist[0] + " --from-bp " + str(gaplist[1]) + " --to-bp " +
                str(gaplist[2]) + " --out " + sys.argv[5] + "temp" + str(i))
        vcfobj = VCFutil.VCF_Data(sys.argv[5] + "temp" + str(i) +
                                  ".recode.vcf")
        vcflist = vcfobj.getVcfListByChrom(gaplist[0], MQfilter=0)

        findtagcaculator = Caculators.CaculatorToFindTAGs(
            mod="randomvcf", Interferingf=interferf)
        findtagcaculator.curchrom = gaplist[0]
        win.slidWindowOverlap(vcflist, int(gaplist[2]), winsize, winsize,
                              findtagcaculator, int(gaplist[1]))
        filledsites = copy.deepcopy(win.winValueL)
        for s, e, n, poss in filledsites:
            if poss[0] != "NA":
                RefSeqMap = Util.getRefSeqBypos_faster(reff, refidx,
                                                       gaplist[0],
                                                       poss[0][0] - 35,
                                                       poss[0][0] + 35,
                                                       chrlenm[gaplist[0]])
        dilute = 1
else:
    print("error")
    exit(-1)

print(chromlisttosub)
software = options.software.upper().strip()
Morganperbp = float(options.Morganperbp)
chromlistfile = open(options.chromlistfilename, "r")
chromlist = []
for chrrow in chromlistfile:
    chrrowlist = re.split(r'\s+', chrrow.strip())
    chromlist.append(chrrowlist[0].strip())
tempvcffile = open(outputprefix + ".vcf", "w")
if __name__ == '__main__':
    vcfdata = VCFutil.VCF_Data(options.vcffilename.strip())
    i = 0
    outputfilepart = 0
    sumRecOfVCF = 0
    if chromlisttosub == None:
        lastpos = 0
    for chrom in chromlist:
        if chrom not in vcfdata.chromOrder:
            continue
        vcfRecOfAChrom = vcfdata.getVcfListByChrom(
            chrom, dilute, dilutetodensity=dilutetodensity)
        if len(vcfRecOfAChrom) < 30:
            print("Call_geno_snp_ind_Style_software_cyclly",
                  "skip chrom with snps less than 100")
            continue
        else:
Ejemplo n.º 5
0
    affectedlist = []
    unaffectedlist = []
    affectunaffectmark = {}
    f = open(options.affectedlist, 'r')
    for line in f:
        affectedlist.append(line.strip())
        affectunaffectmark[line.strip()] = "2"
    f.close()
    f = open(options.unaffectedlist, 'r')
    for line in f:
        affectunaffectmark[line.strip()] = "1"
    f.close()

    mapfile = open(options.output + ".map", "w")
    pedfile = open(options.output + ".ped", "w")
    vcfobj = VCFutil.VCF_Data(options.vcffile[0])
    chromchangemap = {}
    excludesitesMapBchr = {}

    f = open(options.chrommap, 'r')
    for line in f:
        linelist = re.split(r'\s+', line.strip())
        chromchangemap[linelist[0].strip()] = linelist[1].strip()
    f.close()
    f = open(options.excludesites, 'r')
    for line in f:
        linelist = re.split(r'\s+', line.strip())
        if linelist[0] in excludesitesMapBchr:
            excludesitesMapBchr[linelist[0]].append(int(linelist[1]))
        else:
            excludesitesMapBchr[linelist[0]] = [int(linelist[1])]
Ejemplo n.º 6
0
                  default=True,
                  help="don't print status messages to stdout")

(options, args) = parser.parse_args()
vcfnameKEY_vcfobj_pyBAMfilesVALUE = {}
if options.ancenstralref == None:

    archicpopvcfbamconfig = options.vcfbamconfig.strip()
    vcfconfig = open(archicpopvcfbamconfig, "r")
    for line in vcfconfig:
        vcffilename_obj = re.search(r"vcffilename=(.*)", line.strip())
        if vcffilename_obj != None:
            vcfname = vcffilename_obj.group(1).strip()
            vcfnameKEY_vcfobj_pyBAMfilesVALUE[vcfname] = []
            vcfnameKEY_vcfobj_pyBAMfilesVALUE[vcfname].append(
                VCFutil.VCF_Data(vcfname))
        elif line.split():
            vcfnameKEY_vcfobj_pyBAMfilesVALUE[vcfname].append(
                pysam.Samfile(line.strip(), 'rb'))
    vcfconfig.close()
toplevelsnptablename = options.toplevelsnptable

flanklen = int(options.flanklen.strip())
if __name__ == '__main__':
    ancestralalleletabletools = AncestralAlleletabletools(
        database=Util.vcfdbname,
        ip=Util.ip,
        usrname=Util.username,
        pw=Util.password,
        dbgenome=Util.genomeinfodbname)
    if options.mode.strip() == "1":
Ejemplo n.º 7
0
    for k, v in MfileNameMap.items():
        if k in VfileNameMap:
            VtoMmap[VfileNameMap[k]] = v
            print(VfileNameMap[k], v, sep="\t", file=mf)
    mf.close()

    myformatNamelist = []
    f = open(options.myformatNamelistfile, "r")
    for my_Sample_name in f:
        myformatNamelist.append(my_Sample_name.strip())
    f.close()

    #find the same indvd END
    refFastahandle1 = open(refFastaName1, 'r')
    refFastahandle2 = open(refFastaName2, 'r')
    vcfdataset = VCFutil.VCF_Data(options.vcffilename)
    for k, v in VtoMmap.items():
        commsample_idxlistinM.append(myformatNamelist.index(v))
        commsample_idxlistinV.append(vcfdataset.VcfIndexMap["title"].index(k))

    #
#     bbb=Util.getRefSeqBypos_faster(refFastahandle, refidxByChr, "1", 1, 1)
#     print(bbb)
#     exit()
    for chrom in vcfdataset.chromOrder:
        vcfRecOfAChrom = vcfdataset.getVcfListByChrom(chrom, MQfilter=None)
        MFfRecOfAChrom = []
        #read -M file and change format
        try:
            curMyFormatfile = open(
                'Chr0' + chrom + options.myformatfilesuffix.strip(), 'r')
Ejemplo n.º 8
0
    def fillarchicpop(self,
                      archicpopVcfFile,
                      depthFile,
                      chromtable,
                      archicpopNameindepthFile,
                      tablename="derived_alle_ref",
                      archicpopfieldNameintable="archicpop"):
        """Fill the archaic-population column of ``tablename`` for every SNP.

        Only SNP positions already stored in ``tablename`` are updated; VCF
        records at any other position are ignored.  For each stored SNP the
        column ``archicpopfieldNameintable`` is set to

        * ``"<ALT>:<refdepth>,<altdepth>"`` when the archaic VCF has a record
          at that position (depths from ``DP4`` if present, otherwise summed
          from the per-sample ``AD`` field);
        * ``"<ALT>:<depth>,0"`` when there is no record but the depth file
          reports a depth above 1;
        * ``"no covered"`` otherwise.

        Parameters:
            archicpopVcfFile: path of the archaic population VCF.
            depthFile: path of the depth file; ``depthFile + ".index"`` is
                passed as its index.
            chromtable: table listing the chromosomes, walked 20 rows at a
                time ordered by ``chrlength`` descending.
            archicpopNameindepthFile: sample name; the depth column looked up
                is ``"Depth_for_" + archicpopNameindepthFile``.
            tablename: SNP table to update.
            archicpopfieldNameintable: column that receives the result.

        NOTE(review): all SQL here is assembled by string concatenation, so
        table, column and chromosome names must come from trusted input.
        """
        dp4_pattern = re.compile(r"DP4=(\d*),(\d*),(\d*),(\d*)")
        depthfile = Util.GATK_depthfile(depthFile, depthFile + ".index")
        try:
            species_idx = depthfile.title.index("Depth_for_" +
                                                archicpopNameindepthFile)
            archicpop = VCFutil.VCF_Data(archicpopVcfFile)
            totalChroms = self.dbtools.operateDB(
                "select", "select count(*) from " + chromtable)[0][0]
            for offset in range(0, totalChroms, 20):
                currentsql = ("select * from " + chromtable +
                              " order by chrlength desc limit " +
                              str(offset) + ",20")
                for row in self.dbtools.operateDB("select", currentsql):
                    currentchrID = row[0]
                    print(currentchrID + ":", end="")
                    # The VCF object already knows its file; like the other
                    # call sites, pass the chromosome as the first argument.
                    records = archicpop.getVcfListByChrom(currentchrID)
                    allsnpsInAchr = self.dbtools.operateDB(
                        "select", "select snp_pos,alt_base from " +
                        tablename + " where chrID='" + currentchrID + "'")
                    for snp in allsnpsInAchr:
                        snp_pos = int(snp[0])
                        ALT = snp[1]

                        # Binary search of the position-sorted VCF records.
                        record = None
                        low = 0
                        high = len(records) - 1
                        while low <= high:
                            mid = (low + high) >> 1
                            mid_pos = records[mid][0]
                            if mid_pos < snp_pos:
                                low = mid + 1
                            elif mid_pos > snp_pos:
                                high = mid - 1
                            else:
                                record = records[mid]
                                break

                        if record is not None:
                            # From here on ALT is the VCF's ALT allele, not
                            # the one stored in the table.
                            _pos, _ref, ALT, INFO, FORMAT, samples = record
                            refdep = 0
                            altalleledep = 0
                            dp4 = dp4_pattern.search(INFO)
                            if dp4 is not None:  # VCF written by samtools
                                refdep = int(dp4.group(1)) + int(dp4.group(2))
                                altalleledep = int(dp4.group(3)) + int(
                                    dp4.group(4))
                            else:  # GATK layout, e.g. GT:AD:DP:GQ:PL
                                AD_idx = FORMAT.split(":").index("AD")
                                for sample in samples:
                                    fields = sample.split(":")
                                    if len(fields) == 1:  # "./."
                                        continue
                                    AD_depth = fields[AD_idx].split(",")
                                    try:
                                        refdep += int(AD_depth[0])
                                        altalleledep += int(AD_depth[1])
                                    except ValueError:
                                        print(sample, end="")
                            popsdata = ALT + ":" + str(refdep) + "," + str(
                                altalleledep)
                        else:
                            # No variant call: fall back to the depth file.
                            depth_linelist = depthfile.getdepthByPos(
                                currentchrID, snp_pos)
                            if int(depth_linelist[species_idx]) <= 1:
                                popsdata = "no covered"
                            else:
                                popsdata = ALT + ":" + depth_linelist[
                                    species_idx] + ",0"

                        self.dbtools.operateDB(
                            "update", "update " + tablename + " set " +
                            archicpopfieldNameintable + " = '" + popsdata +
                            "' where chrID=" + "'" + currentchrID +
                            "' and snp_pos=" + str(snp[0]))
        finally:
            # Release the depth file even when a lookup or query fails.
            depthfile.closedepthfile()
Ejemplo n.º 9
0
    def filldata(self,
                 vcfFileName,
                 depthfileName,
                 tablename="derived_alle_ref",
                 posUniq=True,
                 continuechrom=None,
                 continuepos=None):
        """Load the records of a GATK VCF into ``tablename``.

        One column per sample named in the VCF header is added to the table
        (through the ``mysql_sp_add_column`` procedure).  Every record is then
        inserted unless a row with the same chromosome and position already
        exists.  Each sample column receives the sample's ``AD`` value; when
        the sample has no full genotype entry, the depth file decides between
        ``"<depth>,0"`` and ``"no covered"`` (depth of 1 or less).

        Parameters:
            vcfFileName: path of the VCF to read.
            depthfileName: path of the depth file; ``depthfileName +
                ".index"`` is passed as its index.
            tablename: destination table.
            posUniq: when true, a record at the same chromosome and position
                as the previously read one is skipped.
            continuechrom, continuepos: when both are given, reading resumes
                after the record at that chromosome/position instead of at
                the first record.

        The process exits with status -1 when the header line is missing or
        when the first record carries ``DP4`` (samtools VCFs are unsupported).
        """
        depthfile = Util.GATK_depthfile(depthfileName,
                                        depthfileName + ".index")
        try:
            with open(vcfFileName, 'r') as vcffile:
                # Skip the "##" meta lines; the next line must be the header.
                vcfline = vcffile.readline()
                while vcfline.startswith("##"):
                    vcfline = vcffile.readline()
                if not vcfline.startswith("#"):
                    print(
                        "need title'#CHROM    POS    ID    REF    ALT    QUAL    FILTER    INFO    FORMAT'"
                    )
                    exit(-1)
                poptitlelist = vcfline.split()[9:]
                print(poptitlelist)

                # NOTE(review): the schema name "life_pilot" is hard-coded.
                for pop in poptitlelist:
                    self.dbtools.operateDB("callproc",
                                           "mysql_sp_add_column",
                                           data=("life_pilot", tablename, pop,
                                                 "varchar(128)",
                                                 "default null"))

                # The statement is identical for every record, so build it
                # once.  Chromosome and position of the existence check are
                # bound as parameters like all other values.
                columns = ["chrID", "snp_pos", "snpID", "ref_base", "alt_base"
                           ] + poptitlelist
                insert_sql = ("insert into " + tablename + "(" +
                              ",".join(columns) + ") select " +
                              ",".join(["%s"] * len(columns)) +
                              " from dual where not exists( select * from " +
                              tablename + " where " + tablename +
                              ".chrID=%s and " + tablename + ".snp_pos=%s)")

                # Chromosome/position of the most recently read record.
                chrom = None
                pos = None
                if continuechrom is not None and continuepos is not None:
                    # Resume: jump to the chromosome, then read forward to
                    # the record that was processed last.
                    print("filldata", continuechrom, continuepos)
                    vcfpossearcher = VCFutil.VCF_Data(vcfFileName)
                    vcffile.seek(vcfpossearcher.VcfIndexMap[continuechrom])
                    vcfline = vcffile.readline()
                    while vcfline:
                        vcflist = vcfline.split()
                        chrom = vcflist[0]
                        pos = int(vcflist[1])
                        print(chrom, pos)
                        if chrom == continuechrom and pos == continuepos:
                            break
                        vcfline = vcffile.readline()
                else:
                    # The first record also tells which caller wrote the VCF.
                    vcflist = vcffile.readline().split()
                    if vcflist:  # a VCF without records has nothing to load
                        if re.search(r"DP4=(\d*),(\d*),(\d*),(\d*)",
                                     vcflist[7]) is not None:
                            print(
                                "function for samtools vcf is still need to be finish"
                            )
                            exit(-1)
                        chrom, pos = self._insert_vcf_record(
                            vcflist, poptitlelist, depthfile, insert_sql)

                for vcfline in vcffile:
                    vcflist = vcfline.split()
                    if not vcflist:  # tolerate blank lines
                        continue
                    print(vcfline)
                    # Compare the chromosome as well: equal positions on two
                    # different chromosomes are not duplicates.
                    if (posUniq and chrom == vcflist[0]
                            and pos == int(vcflist[1])):
                        continue
                    chrom, pos = self._insert_vcf_record(
                        vcflist, poptitlelist, depthfile, insert_sql)
        finally:
            depthfile.closedepthfile()

    def _insert_vcf_record(self, vcflist, poptitlelist, depthfile,
                           insert_sql):
        """Insert one split VCF record; return its ``(chrom, pos)``.

        ``vcflist`` holds the whitespace-split columns of the record,
        ``poptitlelist`` the sample names from the header (same order as the
        sample columns) and ``insert_sql`` the statement built by
        ``filldata``.
        """
        chrom = vcflist[0]
        pos = int(vcflist[1])
        snpID = vcflist[2]
        REF = vcflist[3]
        ALT = vcflist[4]
        format_keys = vcflist[8].split(":")  # GATK layout: GT:AD:DP:GQ:PL
        AD_idx = format_keys.index("AD")

        depth_linelist = None  # depth-file line, fetched at most once
        popsdata = []
        for sample_idx, sample in enumerate(vcflist[9:]):
            samplename = poptitlelist[sample_idx]
            species_idx = depthfile.title.index("Depth_for_" + samplename)
            sample_fields = sample.split(":")
            if len(sample_fields) != len(format_keys):
                # Incomplete genotype (e.g. "./."): fall back to the depth
                # file to tell "covered but not variant" from "not covered".
                if depth_linelist is None:
                    depth_linelist = depthfile.getdepthByPos(chrom, pos)
                if int(depth_linelist[species_idx]) <= 1:
                    popsdata.append('no covered')
                else:
                    popsdata.append(depth_linelist[species_idx] + ",0")
                continue
            popsdata.append(sample_fields[AD_idx])

        params = (chrom, pos, snpID, REF, ALT) + tuple(popsdata) + (chrom,
                                                                    pos)
        print(insert_sql, params)
        self.dbtools.operateDB("insert", insert_sql, data=params)
        return chrom, pos
Ejemplo n.º 10
0
                  "--outfileprename",
                  dest="outfileprename",
                  help="default infile1_infile2")
parser.add_option("-2",
                  "--ancenstral_or_derived",
                  dest="ancenstral_or_derived",
                  default="d",
                  help="ancenstral(a) or derived(d)")
(options, args) = parser.parse_args()
mindeptojudgefix = 15
#####################
VCFobj = {}
vcfnameKEY_depthfilename_titlenameVALUE_tojudgeancestrall = {}
vcfnameKEY_depthobjVALUE_tojudgeancestral = {}
VCFobj["wigeon"] = VCFutil.VCF_Data(
    "/home/bioinfo/liurui/data/vcffiles/uniqmap/taihudomesticgoose/taihudomesticgoose.pool.withindel.vcf"
)
VCFobj["fanya"] = VCFutil.VCF_Data(
    "/home/bioinfo/liurui/data/vcffiles/uniqmap/fanya/fanya._pool.withindel.vcf"
)
vcfnameKEY_depthfilename_titlenameVALUE_tojudgeancestrall[
    "wigeon"] = Util.GATK_depthfile(
        "/home/bioinfo/liurui/data/depth/g_j_sm_k_l_y_f_w_pool/gjsmklyfw_gatk.depth",
        "/home/bioinfo/liurui/data/depth/g_j_sm_k_l_y_f_w_pool/gjsmklyfw_gatk.depth.index"
    )  #here is a temp trick not a error
vcfnameKEY_depthfilename_titlenameVALUE_tojudgeancestrall[
    "fanya"] = Util.GATK_depthfile(
        "/home/bioinfo/liurui/data/depth/g_j_sm_k_l_y_f_w_pool/gjsmklyfw_gatk.depth",
        "/home/bioinfo/liurui/data/depth/g_j_sm_k_l_y_f_w_pool/gjsmklyfw_gatk.depth.index"
    )
vcfnameKEY_depthobjVALUE_tojudgeancestral["wigeon"] = [