def caculateDstatistics(self, p1, p2, p3, p4, caculator, currentchrID, currentchrLen, winwidth=None): win = Util.Window()
# if options.depthfile!=None: # print(options.depthfile,"no need") # originalspeciesref=options.ancenstralref # colname=re.search(r'[^/]*$',originalspeciesref).group(0) # colname=re.sub(r"[^\w^\d]","_",colname);colname=colname[:10] # print(colname) # ancestralalleletabletools.dbvariant.operateDB("callproc", "mysql_sp_add_column", data=(ancestralalleletabletools.dbvariant, toplevelsnptablename, colname, "char(128)", "default null")) OUTFILENAME = "ducksnpflankseq.fa" outfile = open(options.chromlistfilename + "snpflankseq.fa", 'w') duckrefhandler = open(options.ref, 'r') try: duckrefindex = pickle.load(open(options.ref + ".myfasteridx", 'rb')) # originalspeciesindex = pickle.load(open(originalspeciesref + ".myindex", 'rb')) except IOError: Util.generateFasterRefIndex(options.ref, options.ref + ".myfasteridx") duckrefindex = pickle.load(open(options.ref + ".myfasteridx", 'rb')) # try: # originalspeciesindex = pickle.load(open(originalspeciesref + ".myindex", 'rb')) # except IOError: # Util.generateIndexByChrom(originalspeciesref, originalspeciesref + ".myindex") # originalspeciesindex = pickle.load(open(originalspeciesref + ".myindex", 'rb')) chrom_lenlist = [] chromlistfile = open(options.chromlistfilename, "r") for chrrow in chromlistfile: chrrowlist = re.split(r'\s+', chrrow.strip()) chrom_lenlist.append( (chrrowlist[0].strip(), int(chrrowlist[1].strip()))) for currentchrID, currentchrLen in chrom_lenlist:
help="speciesName in table") # (options, args) = parser.parse_args() allpop_with_derived_alletable = options.topleveltable ancestralspeciescolname = options.ancestralspeciesname.strip() farsurebutfew = options.farsurebutfew.strip() mindepth = int(options.mindepth) if __name__ == '__main__': dbtools = dbm.DBTools(Util.ip, Util.username, Util.password, Util.genomeinfodbname) tableprename = "" TABLES = {} for bedfileName in args[:]: tableprename += re.search(r"[^/]*$", bedfileName).group(0)[0] tablename = tableprename + Util.random_str() TABLES[tablename] = ("CREATE TABLE " + tablename + " (" " `snpID` varchar(128) NOT NULL ," " `region` varchar(128) NOT NULL ," " `DAF` double default 100 ," " `MAF` double default 100 ," " PRIMARY KEY (`snpID`,`region`)" ")") tempdbtools = dbm.DBTools(Util.ip, Util.username, Util.password, Util.ghostdbname) tempdbtools.create_table(TABLES) titlelist = [ a[0].strip() for a in dbtools.operateDB( "select", "select column_name from information_schema.columns where table_schema='"
action="store_false", dest="verbose", default=True, help="don't print status messages to stdout") (options, args) = parser.parse_args() refFastaName1 = options.reffa[0] refFastaName2 = options.reffa[1] reffastaidxName1 = refFastaName1 + ".myfasteridx" reffastaidxName2 = refFastaName2 + ".myfasteridx" try: refidxByChr2 = pickle.load(open(reffastaidxName2, 'rb')) refidxByChr1 = pickle.load(open(reffastaidxName1, 'rb')) except IOError: Util.generateFasterRefIndex(refFastaName1, reffastaidxName1) Util.generateFasterRefIndex(refFastaName2, reffastaidxName2) refidxByChr1 = pickle.load(open(reffastaidxName1, 'rb')) refidxByChr2 = pickle.load(open(reffastaidxName2, 'rb')) commsample_idxlistinM = [] commsample_idxlistinV = [] degenerateM = { "R": "AG", "Y": "CT", "M": "AC", "K": "GT", "S": "GC", "W": "AT", "A": "AA", "T": "TT", "C": "CC",
help="ancenstral(a) or derived(d)") (options, args) = parser.parse_args() mindeptojudgefix = 15 ##################### VCFobj = {} vcfnameKEY_depthfilename_titlenameVALUE_tojudgeancestrall = {} vcfnameKEY_depthobjVALUE_tojudgeancestral = {} VCFobj["wigeon"] = VCFutil.VCF_Data( "/home/bioinfo/liurui/data/vcffiles/uniqmap/taihudomesticgoose/taihudomesticgoose.pool.withindel.vcf" ) VCFobj["fanya"] = VCFutil.VCF_Data( "/home/bioinfo/liurui/data/vcffiles/uniqmap/fanya/fanya._pool.withindel.vcf" ) vcfnameKEY_depthfilename_titlenameVALUE_tojudgeancestrall[ "wigeon"] = Util.GATK_depthfile( "/home/bioinfo/liurui/data/depth/g_j_sm_k_l_y_f_w_pool/gjsmklyfw_gatk.depth", "/home/bioinfo/liurui/data/depth/g_j_sm_k_l_y_f_w_pool/gjsmklyfw_gatk.depth.index" ) #here is a temp trick not a error vcfnameKEY_depthfilename_titlenameVALUE_tojudgeancestrall[ "fanya"] = Util.GATK_depthfile( "/home/bioinfo/liurui/data/depth/g_j_sm_k_l_y_f_w_pool/gjsmklyfw_gatk.depth", "/home/bioinfo/liurui/data/depth/g_j_sm_k_l_y_f_w_pool/gjsmklyfw_gatk.depth.index" ) vcfnameKEY_depthobjVALUE_tojudgeancestral["wigeon"] = [ "/home/bioinfo/liurui/data/depth/g_j_sm_k_l_y_f_w_pool/gjsmklyfw_gatk.depth", 9 ] vcfnameKEY_depthobjVALUE_tojudgeancestral["fanya"] = [ "/home/bioinfo/liurui/data/depth/g_j_sm_k_l_y_f_w_pool/gjsmklyfw_gatk.depth", 3 ] ####################################
"1": 42145699, '2': 49200776, '3': 50652576, "4": 40408058, "5": 47253416, "6": 36015257, "7": 35964515, "8": 40690061, "9": 58970518 } winsize = int(sys.argv[6]) reff = open(sys.argv[1], 'r') try: refidx = pickle.load(open(sys.argv[1] + ".myfasteridx", 'rb')) except IOError: Util.generateFasterRefIndex(sys.argv[1], sys.argv[1] + ".myfasteridx") refidx = pickle.load(open(sys.argv[1] + ".myfasteridx", 'rb')) vcftools = "vcftools" gapf = open(sys.argv[3], 'r') scoredsnp = open(sys.argv[4], 'r') scoredsnp.readline() sitesingap = open(sys.argv[5], 'w') if __name__ == '__main__': win = Util.Window() i = 0 interferf = open(sys.argv[5] + ".InterferingTEMP", 'w') for gapregion in gapf: i += 1 filledsites = [] gaplist = re.split(r"\s+", gapregion.strip()) if not os.path.exists(sys.argv[5] + "temp" + str(i) + ".recode.vcf"):
def make_freq_xaxisKEY_yaxisseqVALUERelation(a):
    """Bin SNPs by target-population DAF and collect the reference-population DAF per bin.

    For every aligned bi-allelic SNP the function decides which allele is
    ancestral (from a depth column of the DB row), computes the mean
    derived-allele frequency (DAF) over the target populations (x axis) and
    over the reference populations (y axis), and appends the reference mean to
    the bin that contains the target mean.

    Args:
        a: indexable with five items, read as
            a[0] chromosome list file name (rows "<chrID> <length>"),
            a[1] name of the DB table queried for each SNP,
            a[2] list of config file names of the target populations,
            a[3] list of config file names of the reference populations,
            a[4] number of target individuals (converted with ``int``); the
                 bin width is 1 / (2 * a[4]).
            In each config file a ``vcffilename=<path>`` line names a VCF and
            every other non-blank line is opened as a BAM file with pysam.

    Returns:
        A deep copy of the dict mapping ``(low, high)`` bin bounds to the list
        of reference-population mean DAF values collected for that bin.

    Side effects:
        Prints progress/skip messages to stdout; calls ``exit(-1)`` when a VCF
        name contains neither ``indvd`` nor ``pool``.
    """
    chromlistfilename = a[0]
    topleveltablename = a[1]
    targetpopvcffile_withdepthconfig = a[2]
    refpopvcffile_withdepthconfig = a[3]
    numberofindvdoftargetpop_todividintobin = int(a[4])
    # Minimum depth needed to call a site "fixed" (used both for the
    # ancestral-allele decision and for SNP-less populations below).
    mindepthtojudefixed = 20
    d_increase = fractions.Fraction(
        1, (2 * int(numberofindvdoftargetpop_todividintobin)))
    d_increase = round(d_increase, 11)
    minvalue = 0.000000000000
    freq_xaxisKEY_yaxisVALUE_seq_list = {}
    # Build 2N-1 bins of width d_increase (upper bound padded by 4e-11), then
    # one final bin that ends exactly at 1.  The loop has no `break`, so the
    # `else` clause always runs once after the loop.
    for i in range(numberofindvdoftargetpop_todividintobin * 2 - 1):
        freq_xaxisKEY_yaxisVALUE_seq_list[(minvalue, minvalue + d_increase +
                                           0.00000000004)] = []
        minvalue += d_increase
    else:
        freq_xaxisKEY_yaxisVALUE_seq_list[(minvalue, 1)] = []
    # NOTE(review): the loop variable `a` rebinds the parameter `a`; every
    # field of the parameter has already been copied out above.
    for a, b in sorted(freq_xaxisKEY_yaxisVALUE_seq_list.keys()):
        print(str(a), str(b))
    # while minvalue+d_increase<=1:
    #     freq_xaxisKEY_yaxisVALUE_seq_list[(minvalue,minvalue+d_increase+0.00000000004)]=[]
    #     print('%.12f'%minvalue,'%.12f'%(minvalue+d_increase+0.00000000004))
    #     minvalue+=d_increase
    # else:
    #     freq_xaxisKEY_yaxisVALUE_seq_list[]
    print("process ID:", os.getpid(), "start", chromlistfilename)
    dbvariantstools = dbm.DBTools(Util.ip, Util.username, Util.password,
                                  Util.vcfdbname)
    chromlistfile = open(chromlistfilename, "r")
    chromlistfilelines = chromlistfile.readlines()
    chromlistfile.close()
    # chromlist: [(chrID, length), ...] parsed from whitespace-separated rows.
    chromlist = []
    for chrrow in chromlistfilelines:
        chrrowlist = re.split(r'\s+', chrrow.strip())
        chromlist.append((chrrowlist[0].strip(), int(chrrowlist[1].strip())))
    vcfnamelist = []
    listofpopvcfmapOfAChr = []
    methodlist = []
    # vcfname -> [VCF_Data object, pysam handle, pysam handle, ...]
    vcfnameKEY_vcfobj_pyBAMfilesVALUE = {}
    N_of_targetpop = len(targetpopvcffile_withdepthconfig)
    N_of_refpop = len(refpopvcffile_withdepthconfig)
    #{ vcftablename1:[depthfilename1,name1,name2] , vcftablename2:[depthfilename2,name1,name2] } or {vcftablename1:None, vcftablename2:None}
    for vcfconfigfilename in targetpopvcffile_withdepthconfig[:] + refpopvcffile_withdepthconfig[:]:
        listofpopvcfmapOfAChr.append({})
        vcfconfig = open(vcfconfigfilename, "r")
        for line in vcfconfig:
            vcffilename_obj = re.search(r"vcffilename=(.*)", line.strip())
            if vcffilename_obj != None:
                vcfname = vcffilename_obj.group(1).strip()
                vcfnamelist.append(vcfname)
                vcfnameKEY_vcfobj_pyBAMfilesVALUE[vcfname] = []
                vcfnameKEY_vcfobj_pyBAMfilesVALUE[vcfname].append(
                    VCFutil.VCF_Data(vcfname))
            elif line.split():
                # Any other non-blank line is a BAM path for the most recently
                # seen VCF.
                vcfnameKEY_vcfobj_pyBAMfilesVALUE[vcfname].append(
                    pysam.Samfile(line.strip(), 'rb'))
        vcfconfig.close()
        # One method entry per config file, derived from the LAST vcfname seen
        # in that file.
        # NOTE(review): the later `methodlist[i - 3]` / `vcfnamelist[i - 3]`
        # indexing only lines up if every config file names exactly one VCF —
        # confirm.
        if re.search(r"indvd[^/]+", vcfname) != None:
            methodlist.append("indvd")
        elif re.search(r"pool[^/]+", vcfname) != None:
            methodlist.append("pool")
        else:
            print("vcfname must with 'pool' or 'indvd'")
            exit(-1)
    for currentchrID, currentchrLen in chromlist:
        # Skip chromosomes that no VCF index knows about (for/else: the else
        # runs only when no `break` happened).
        for vcfname in vcfnamelist:
            if currentchrID in vcfnameKEY_vcfobj_pyBAMfilesVALUE[vcfname][
                    0].VcfIndexMap:
                break
        else:
            print("this chr doesn't exist in anypop")
            continue
        for vcfobj_idx in range(len(vcfnamelist)):
            listofpopvcfmapOfAChr[vcfobj_idx] = {}
            listofpopvcfmapOfAChr[vcfobj_idx][
                currentchrID] = vcfnameKEY_vcfobj_pyBAMfilesVALUE[
                    vcfnamelist[vcfobj_idx]][0].getVcfListByChrom(currentchrID)
        target_ref_SNPs = Util.alinmultPopSnpPos(listofpopvcfmapOfAChr, "o")
        # As used below: snp_aligned[0] is the position, [1] and [2] must each
        # have length 1 (presumably REF and ALT — confirm), and [3 + k] is the
        # record of population k or None when that population has no record.
        for snp_aligned in target_ref_SNPs[currentchrID]:
            if len(snp_aligned[1]) != 1 or len(snp_aligned[2]) != 1:
                print("multple allele", snp_aligned)
                continue
            curpos = int(snp_aligned[0])
            snp = dbvariantstools.operateDB(
                "select", "select * from " + topleveltablename +
                " where chrID='" + currentchrID + "' and snp_pos=" +
                str(curpos) + "")
            if not snp or snp == 0:
                print(currentchrID, curpos, "snp not find in db,skip")
                continue
            else:
                #judge the ancenstrall allele
                # Column 9 of the row is split on "," into two depths; the
                # allele whose depth is >= mindepthtojudefixed while the other
                # is exactly "0" is taken as ancestral.
                # NOTE(review): presumably column 9 is "refdepth,altdepth" of
                # the outgroup — confirm against the table schema.
                fanyadepthlist = re.split(r",", snp[0][9])
                if len(fanyadepthlist) == 2 and int(
                        fanyadepthlist[1]
                ) >= mindepthtojudefixed and fanyadepthlist[0].strip() == "0":
                    A_base_idx = 1
                elif len(fanyadepthlist) == 2 and int(
                        fanyadepthlist[0]
                ) >= mindepthtojudefixed and fanyadepthlist[1].strip() == "0":
                    A_base_idx = 0
                else:
                    print("skip snp", snp[0][1], snp[0][7:])
                    continue
                # Three-character context: first and third characters of
                # column 5 around the ancestral allele (column 3 or 4).
                ancestrallcontext = snp[0][5].strip()[0].upper() + snp[0][
                    3 + A_base_idx].strip().upper() + snp[0][5].strip()[2].upper()
                if "CG" in ancestrallcontext or "GC" in ancestrallcontext:
                    print("skip CG site", ancestrallcontext)
                    continue
            ##########x-axis
            countedAF = 0
            target_DAF_sum = 0  #;noofnocoveredsample=0
            for i in range(3, N_of_targetpop + 3):
                if snp_aligned[i] == None:
                    # No VCF record in this population: without BAM files
                    # nothing can be said; with BAMs, enough total coverage
                    # means the site is counted with AF = 0.
                    if len(vcfnameKEY_vcfobj_pyBAMfilesVALUE[vcfnamelist[
                            i - 3]]) == 1:
                        print("no depth file")
                        continue
                    else:
                        sum_depth = 0
                        for samfile in vcfnameKEY_vcfobj_pyBAMfilesVALUE[
                                vcfnamelist[i - 3]][1:]:
                            ACGTdep = samfile.count_coverage(
                                currentchrID, curpos - 1, curpos)
                            for dep in ACGTdep:
                                sum_depth += dep[0]
                        if sum_depth >= mindepthtojudefixed:
                            AF = 0
                        else:
                            continue
                else:
                    if methodlist[i - 3] == "indvd":
                        # Individual-based VCF: take AF from the "AF=...;"
                        # field of element 0 of the record.
                        AF = float(
                            re.search(r"AF=([\d\.]+);",
                                      snp_aligned[i][0]).group(1))
                    elif methodlist[i - 3] == "pool":
                        # Pooled VCF: AF = alt depth / total depth, summed
                        # over the AD field of every sample.
                        refdep = 0
                        altalleledep = 0
                        AD_idx = (re.split(":", snp_aligned[i][1])).index(
                            "AD")  # gatk GT:AD:DP:GQ:PL
                        for sample in snp_aligned[i][2]:
                            if len(re.split(":", sample)) == 1:  # ./.
                                continue
                            AD_depth = re.split(",",
                                                re.split(":", sample)[AD_idx])
                            try:
                                refdep += int(AD_depth[0])
                                altalleledep += int(AD_depth[1])
                            except ValueError:
                                print(sample, end="|")
                        if refdep == altalleledep and altalleledep == 0:
                            print("no sample available in this pop")
                            # noofnocoveredsample+=1
                            continue
                        AF = altalleledep / (altalleledep + refdep)
                # Convert the ALT-allele frequency into a derived-allele
                # frequency.
                if A_base_idx == 0:
                    DAF = 1 - AF
                elif A_base_idx == 1:
                    DAF = AF
                target_DAF_sum += DAF
                countedAF += 1
            if countedAF == 0:  #or target_DAF_sum==0:
                print(
                    "skip this snp,because it fiexd as ancestral or no covered in this pos in target pops",
                    snp_aligned, snp)
                continue
            target_DAF = target_DAF_sum / countedAF
            ###############y-axis
            # Same computation for the reference populations, with extra
            # filters: AN >= 5 for individual VCFs and total depth >= 10 for
            # pooled VCFs.
            countedAF = 0
            rer_DAF_sum = 0
            for i in range(3 + N_of_targetpop,
                           N_of_refpop + N_of_targetpop + 3):
                if snp_aligned[i] == None:
                    if len(vcfnameKEY_vcfobj_pyBAMfilesVALUE[vcfnamelist[
                            i - 3]]) == 1:
                        continue
                    else:
                        # depth_linelist=vcfnameKEY_vcfobj_pyBAMfilesVALUE[vcfnamelist[i-3-N_of_targetpop]].getdepthByPos_optimized(currentchrID,curpos)
                        sum_depth = 0
                        for samfile in vcfnameKEY_vcfobj_pyBAMfilesVALUE[
                                vcfnamelist[i - 3]][1:]:
                            ACGTdep = samfile.count_coverage(
                                currentchrID, curpos - 1, curpos)
                            for dep in ACGTdep:
                                sum_depth += dep[0]
                        if sum_depth >= mindepthtojudefixed:
                            AF = 0
                        else:
                            continue
                else:
                    if methodlist[i - 3] == "indvd":
                        AF = float(
                            re.search(r"AF=([\d\.]+);",
                                      snp_aligned[i][0]).group(1))
                        AN = float(
                            re.search(r"AN=([\d\.]+);",
                                      snp_aligned[i][0]).group(1))
                        if AN < 5:
                            continue
                    elif methodlist[i - 3] == "pool":
                        refdep = 0
                        altalleledep = 0
                        AD_idx = (re.split(":", snp_aligned[i][1])).index(
                            "AD")  # gatk GT:AD:DP:GQ:PL
                        for sample in snp_aligned[i][2]:
                            if len(re.split(":", sample)) == 1:  # ./.
                                continue
                            AD_depth = re.split(",",
                                                re.split(":", sample)[AD_idx])
                            try:
                                refdep += int(AD_depth[0])
                                altalleledep += int(AD_depth[1])
                            except ValueError:
                                print(sample, end="|")
                        if (refdep == altalleledep and altalleledep == 0
                            ) or altalleledep + refdep < 10:
                            continue
                        AF = altalleledep / (altalleledep + refdep)
                if A_base_idx == 0:
                    DAF = 1 - AF
                elif A_base_idx == 1:
                    DAF = AF
                rer_DAF_sum += DAF
                countedAF += 1
            if countedAF == 0 or rer_DAF_sum == 0:
                print(
                    "skip this snp,because it no covered in this pos in ref pops",
                    snp_aligned, snp)
                continue
            ######collect according bins
            # Bins are half-open (low, high]: a target_DAF of exactly 0 matches
            # no bin and the SNP is silently dropped.
            # NOTE(review): confirm that dropping target_DAF == 0 is intended.
            for a, b in sorted(freq_xaxisKEY_yaxisVALUE_seq_list.keys()):
                if target_DAF > a and target_DAF <= b:
                    freq_xaxisKEY_yaxisVALUE_seq_list[(a, b)].append(
                        rer_DAF_sum / countedAF)
                    break
    # freq_xaxisKEY_yaxisVALUERelation={}
    # for a,b in sorted(freq_xaxisKEY_yaxisVALUE_seq_list.keys()):
    #     freq_xaxisKEY_yaxisVALUERelation[(a,b)]=numpy.mean(freq_xaxisKEY_yaxisVALUE_seq_list[(a,b)])
    #     print('%.12f'%a,'%.12f'%(b),'%.12f'%(freq_xaxisKEY_yaxisVALUERelation[(a,b)]),"process ID:",os.getpid(),"done",sep="\t")
    print("process ID:", os.getpid(), "done")
    return copy.deepcopy(freq_xaxisKEY_yaxisVALUE_seq_list)
def make_getElemBed(elementfold, targetseqnamesubstr, pathtoblastn, reffa):
    """Map element sequences onto a reference with blastn and cache the hits as a BED file.

    If ``<elementfold>/<targetseqnamesubstr>.bed`` already exists it is parsed
    and returned without running BLAST.  Otherwise every ``*fa`` / ``*fasta``
    file directly inside ``elementfold`` contributes query sequences, blastn is
    run against ``reffa``, the accepted hits are written to that BED file and
    returned.

    Args:
        elementfold: directory holding the element FASTA files (a trailing
            "/" or "\\" is ignored).
        targetseqnamesubstr: id of the record to take from each FASTA file
            (the text after ">" and before the first space).  The value
            "none" (case-insensitive) means "use every record of every file".
        pathtoblastn: blastn executable, inserted verbatim into the shell
            command.
        reffa: BLAST database name passed to ``-db``.

    Returns:
        dict mapping subject (chromosome) id to a list of tuples
        ``(sstart, send, query_id, qstart, qend, "forward"|"revcom",
        query_length, gap_open)``.  In the freshly computed case each list is
        sorted by ``send``.

    Notes:
        Calls ``exit(-1)`` when blastn returns a non-zero status.  The command
        is built by plain string concatenation and run through the shell, so
        the arguments must come from a trusted source and must not contain
        shell metacharacters.
    """
    allseqtobed = {
    }  #{chrID:[(sstart,send,elem,qstart,qend,revcom,len,gap_open),...],...}
    if elementfold.endswith("/") or elementfold.endswith("\\"):
        elementfold = elementfold[:-1]
    bedpath = elementfold + "/" + targetseqnamesubstr + ".bed"

    # Fast path: reuse a previously written BED file.
    if os.path.isfile(bedpath):
        with open(bedpath, "r") as bedfile:
            bedfile.readline()  # title
            for bedline in bedfile:
                if not bedline.strip():
                    continue  # tolerate blank / trailing empty lines
                bedlinelist = re.split(r"\t+", bedline)
                allseqtobed.setdefault(bedlinelist[0].strip(), []).append(
                    (int(bedlinelist[1]), int(bedlinelist[2]), bedlinelist[3],
                     int(bedlinelist[4]), int(bedlinelist[5]), bedlinelist[6],
                     int(bedlinelist[7]), int(bedlinelist[8])))
        return allseqtobed

    tempprefix = (elementfold + "/" + Util.random_str() + "_" +
                  targetseqnamesubstr)
    querypath = tempprefix + ".collectionfas"
    blastoutpath = tempprefix + ".blastout"
    targetseqnamesubstr_lenmap = {}  # query id -> sequence length
    collect_all = targetseqnamesubstr.lower().strip() == "none"

    try:
        # 1. Collect the query sequences into one temporary FASTA file.
        with open(querypath, 'w') as queryfafile:
            i = 0
            for elem in os.listdir(path=elementfold):
                path = elementfold + "/" + elem
                if os.path.isdir(path) or not path.endswith(("fa", "fasta")):
                    continue
                print(path, i)
                i += 1
                if collect_all:
                    seqname = None
                    with open(path, "r") as pathfile:
                        for line in pathfile:
                            print(line.strip(), file=queryfafile)
                            if line.startswith(">"):
                                # BLAST reports only the first whitespace
                                # delimited token of the header as query id,
                                # so the length map must use the same key.
                                header = line.strip()[1:]
                                tokens = header.split()
                                seqname = tokens[0] if tokens else header
                                targetseqnamesubstr_lenmap[seqname] = 0
                            elif seqname is not None:
                                # Sum over wrapped sequence lines instead of
                                # keeping only the last line's length.
                                targetseqnamesubstr_lenmap[seqname] += len(
                                    line.strip())
                else:
                    for seq_rec in SeqIO.parse(path, "fasta"):
                        if seq_rec.id == targetseqnamesubstr:
                            # Alignment gaps are removed; the file name
                            # becomes the query id.
                            seqstr = "".join(seq_rec.seq).replace("-", "")
                            print(">" + elem, file=queryfafile)
                            targetseqnamesubstr_lenmap[elem] = len(seqstr)
                            print(seqstr, file=queryfafile)
                            break
                    else:
                        print(targetseqnamesubstr, "dosenot exist", elem)

        # 2. Run blastn (tabular output with comment lines, -outfmt 7).
        shellstatment = (pathtoblastn + " -query " + querypath +
                         " -task blastn -db " + reffa + " -out " +
                         blastoutpath +
                         " -outfmt 7 -num_alignments 10 -num_threads 6")
        print(shellstatment)
        if os.system(shellstatment) != 0:
            print("error")
            exit(-1)

        # 3. Keep, per query, the hits whose alignment length is close to the
        #    previous hit's length.  A comment line starts a new query.
        with open(blastoutpath, "r") as blastout:
            lastblastlen = None
            for line in blastout:
                if line.startswith("#"):
                    lastblastlen = None
                    continue
                if not line.strip():
                    continue
                linelist = re.split(r"\s+", line)
                blastlen = int(linelist[3])
                if lastblastlen is None or (blastlen > lastblastlen - 10 or
                                            blastlen * 0.95 >= lastblastlen):
                    fafilename = linelist[0]
                    chrom = linelist[1]
                    sstartpos = int(linelist[8])
                    sendpos = int(linelist[9])
                    revcom = "forward"
                    if sstartpos > sendpos:
                        # Minus-strand hit: store coordinates in ascending
                        # order and remember the orientation.
                        sstartpos, sendpos = sendpos, sstartpos
                        revcom = "revcom"
                    qstartpos = int(linelist[6])
                    qendpos = int(linelist[7])
                    total_bases = targetseqnamesubstr_lenmap[fafilename]
                    gap_open = int(linelist[5])
                    allseqtobed.setdefault(chrom, []).append(
                        (sstartpos, sendpos, fafilename, qstartpos, qendpos,
                         revcom, total_bases, gap_open))
                # NOTE(review): the reference length is refreshed on every hit
                # line, accepted or not; confirm that comparing against the
                # previous line (not the last accepted hit) is intended.
                lastblastlen = blastlen

        # 4. Write the cache file.
        with open(bedpath, "w") as bedfile:
            print("chrNo", "Region_start", "Region_end", "fastafilename",
                  "startbase", "endbase", "revcom_forward", "total_bases",
                  "gap_open", sep="\t", file=bedfile)
            for chrom in allseqtobed:
                allseqtobed[chrom].sort(key=lambda listRec: listRec[1])
                for (startpos, endpos, fafilename, qs, qe, revcom,
                     total_bases, gap_open) in allseqtobed[chrom]:
                    print(chrom, startpos, endpos, fafilename, qs, qe, revcom,
                          total_bases, gap_open, sep="\t", file=bedfile)
        return allseqtobed
    finally:
        # Best-effort removal of the temporary query / BLAST output files
        # (previously placed after `return`, so it never ran).
        for temppath in (querypath, blastoutpath):
            try:
                os.remove(temppath)
            except OSError:
                pass  # file was never created or is already gone
paramsname.append(n) #random initial value # initvalue=random.gauss(float(v),0.01) # while initvalue>float(u) or initvalue<float(l): # initvalue=random.gauss(float(v),0.01) # paramslist.append(float(initvalue)) paramslist.append(float(v)) lower_boundlist.append(float(l)) upper_boundlist.append(float(u)) # ll_param_MAPlist[n].append() #produce command and run pythonpath=pythonpath+" -p "+n+" "+str(v)+" "+l+" "+u+" " # pythonpath=pythonpath+" -p "+n+" "+str(initvalue)+" "+l+" "+u+" " if randomstr!=None: os.system("rm "+namestr+options.tag+options.model+randomstr+".parameter") randomstr=Util.random_str() print(pythonpath+" -b "+randomstr+" "+str(int(options.bootstrap[1]))) sys.stdout.flush() a=call_system(pythonpath+" -b "+randomstr+" "+str(int(options.bootstrap[1]))) if a!=0: print("cycle",i,a,"wrong") continue #collection result print(options.fsfile+namestr+options.tag+options.model+"array.pickle") u=pickle._Unpickler(open(options.fsfile+namestr+options.tag+options.model+"array.pickle","rb")) u.encoding='latin1' residualarray=u.load() #pickle.load(open(options.fsfile+namestr+options.tag+options.model+"array.pickle","rb")) u=pickle._Unpickler(open(options.fsfile+namestr+options.tag+options.model+"hist.pickle","rb")) u.encoding='latin1' residualhis=u.load()#pickle.load(open(options.fsfile+namestr+options.tag+options.model+"hist.pickle","rb")) bif=open(options.fsfile+namestr+options.tag+options.model+randomstr+"btstrap.temp",'r')
def fillarchicpop(self,
                  archicpopVcfFile,
                  depthFile,
                  chromtable,
                  archicpopNameindepthFile,
                  tablename="derived_alle_ref",
                  archicpopfieldNameintable="archicpop"):
    """Fill the archaic-population column of the SNP table.

    abandon the snps which exist in archicpopVcfFile but absence in all
    others pop snp sets

    Chromosomes are read from ``chromtable`` 20 rows at a time.  For each SNP
    already stored in ``tablename`` the archaic VCF records of that chromosome
    are binary-searched by position:

    * found  -> ``"<ALT of the VCF record>:<refdepth>,<altdepth>"`` (depths
      from DP4 when present, otherwise summed from the samples' AD field);
    * absent -> ``"no covered"`` when the depth-file value is <= 1, otherwise
      ``"<alt_base of the table>:<depth>,0"``.

    The resulting string is written to column ``archicpopfieldNameintable``.
    """
    depth_reader = Util.GATK_depthfile(depthFile, depthFile + ".index")
    depth_col = depth_reader.title.index("Depth_for_" +
                                         archicpopNameindepthFile)
    archic_vcf = VCFutil.VCF_Data(archicpopVcfFile)
    count_rows = self.dbtools.operateDB("select",
                                        "select count(*) from " + chromtable)
    n_chroms = count_rows[0][0]

    for offset in range(0, n_chroms, 20):
        page_sql = ("select * from " + chromtable +
                    " order by chrlength desc limit " + str(offset) + ",20")
        for chrom_row in self.dbtools.operateDB("select", page_sql):
            chrom_id = chrom_row[0]
            print(chrom_id + ":", end="")
            currentchrLen = int(chrom_row[2])  # parsed but not used below
            records = archic_vcf.getVcfListByChrom(archicpopVcfFile, chrom_id)
            table_snps = self.dbtools.operateDB(
                "select", "select snp_pos,alt_base from " + tablename +
                " where chrID='" + chrom_id + "'")

            for table_snp in table_snps:
                target_pos = int(table_snp[0])
                allele = table_snp[1]

                # Binary search of the position-ordered VCF records.
                lo = 0
                hi = len(records) - 1
                found = False
                while lo <= hi:
                    mid = (lo + hi) // 2
                    if records[mid][0] < target_pos:
                        lo = mid + 1
                    elif records[mid][0] > target_pos:
                        hi = mid - 1
                    else:
                        found = True
                        break

                if found:
                    # The ALT of the VCF record replaces the table's alt_base.
                    _pos, _ref, allele, info, fmt, sample_cols = records[mid]
                    dp4 = re.search(r"DP4=(\d*),(\d*),(\d*),(\d*)", info)
                    refdep = 0
                    altalleledep = 0
                    if dp4 is not None:  #vcf from samtools
                        refdep = int(dp4.group(1)) + int(dp4.group(2))
                        altalleledep = int(dp4.group(3)) + int(dp4.group(4))
                    else:
                        ad_col = re.split(":", fmt).index(
                            "AD")  #gatk GT:AD:DP:GQ:PL
                        for sample in sample_cols:
                            fields = re.split(":", sample)
                            if len(fields) == 1:  # ./.
                                continue
                            ad_pair = re.split(",", fields[ad_col])
                            try:
                                refdep += int(ad_pair[0])
                                altalleledep += int(ad_pair[1])
                            except ValueError:
                                print(sample, end="")
                    popsdata = (allele + ":" + str(refdep) + "," +
                                str(altalleledep))
                else:
                    depth_fields = depth_reader.getdepthByPos(
                        chrom_id, target_pos)
                    if int(depth_fields[depth_col]) <= 1:
                        popsdata = "no covered"
                    else:
                        popsdata = allele + ":" + depth_fields[depth_col] + ",0"

                self.dbtools.operateDB(
                    "update", "update " + tablename + " set " +
                    archicpopfieldNameintable + " = '" + popsdata +
                    "' where chrID=" + "'" + chrom_id + "' and snp_pos=" +
                    str(table_snp[0]))
def extarctAncestryAlleleFromBlastOut(self,
                                      BlastOutFile,
                                      ancestryrefFile,
                                      ancestryrefidx,
                                      tablename="derived_alle_ref",
                                      ancestralsnptable=None):
    """Derive the ancestral allele of each SNP from BLAST hits of its flanking sequence.

    The BLAST output is pre-filtered with awk (non-comment lines with
    column 5 == 1, column 4 > 26 and column 6 == 0).  Hits are grouped by
    query id (column 1, expected to look like ``<chrID>_<pos>``).  For every
    hit the subject sequence is fetched from ``ancestryrefFile``; the base at
    offset ``26 - qstart`` is taken as the candidate ancestral allele.  When a
    group ends, the allele may be written to column ``ancestralallel`` of
    ``tablename``.

    Args:
        BlastOutFile: path of the tabular BLAST output (appended verbatim to
            the awk shell command).
        ancestryrefFile: FASTA file of the ancestral/outgroup reference.
        ancestryrefidx: index object passed to ``Util.getRefSeqBypos``.
        tablename: table to update; also the prefix of the
            ``<tablename>ancestrysnpflank.fa`` file written to the current
            directory.
        ancestralsnptable: table consulted (``chrID`` / ``snp_start_pos``)
            before an update is issued.

    NOTE(review): several things look unintended and should be confirmed
    before this function is relied on:
      * ``ancestralsnptable=None`` (the default) is concatenated into SQL
        strings in the ``else``/``elif`` paths below, which raises TypeError.
      * ``onegroup`` is never reset when a new query id starts.
      * ``onegroup`` is sorted ascending by length, so
        ``onegroup[0][1] - onegroup[1][1] >= 15`` cannot be true.
      * the last group of hits is never evaluated after the loop ends.
      * an empty awk result makes the "initial" block fail with IndexError.
      * ``ancestrysnpflank`` and the awk pipe ``a`` are never closed.
    """
    ancestryreffile = open(ancestryrefFile, 'r')
    ancestrysnpflank = open(tablename + "ancestrysnpflank.fa", 'w')
    # Presumably standard BLAST tabular columns (4 = alignment length,
    # 5 = mismatches, 6 = gap opens) — confirm against how the BLAST run was
    # configured.
    a = os.popen("awk '$1!~/^#/ && $5==1 && $4>26 && $6==0 {print $0}' " +
                 BlastOutFile)
    # hits=a.readlines()
    lastbasesAccur = {}  # candidate base -> [(chrom, sstart, send), ...]
    onegroup = []  # [(candidate base, alignment length), ...]
    revcom = False
    # initial
    # The first hit is handled outside the loop to seed lastsnpID and the
    # accumulators.
    hit = a.readline()
    hitlist = re.split(r"\s+", hit)
    sendpos = int(hitlist[9])
    sstartpos = int(hitlist[8])
    qstartpos = int(hitlist[6])
    blastlen = int(hitlist[3])
    # Offsets of the SNP computed from the hard-coded value 26 (presumably the
    # 1-based SNP position inside a 25+1+25 flank query — confirm).
    snp_loc_s = sstartpos + 26 - qstartpos
    snpindex = 26 - qstartpos
    if sstartpos > sendpos:
        # Minus-strand hit: order the coordinates and remember to
        # reverse-complement the fetched sequence.
        temp = sstartpos
        sstartpos = sendpos
        sendpos = temp
        revcom = True
    lastsnpID = hitlist[0]
    chrom = hitlist[1]
    RefSeqMap = Util.getRefSeqBypos(refFastahander=ancestryreffile,
                                    refindex=ancestryrefidx,
                                    currentChromNO=chrom,
                                    startpos=sstartpos,
                                    endpos=sendpos)
    if revcom:
        # Element 0 of RefSeqMap[chrom] is left untouched; only [1:] is
        # reversed and complemented.
        tempStr = RefSeqMap[chrom][1:]
        tempStr.reverse()
        RefSeqMap[chrom][1:] = Util.complementary(tempStr)
        revcom = False
    lastbasesAccur[RefSeqMap[chrom][snpindex + 1]] = [(chrom, sstartpos,
                                                       sendpos)]
    onegroup.append((RefSeqMap[chrom][snpindex + 1], blastlen))
    for hit in a:
        print(hit)
        hitlist = re.split(r"\s+", hit)
        chrom = hitlist[1]
        sstartpos = int(hitlist[8])
        sendpos = int(hitlist[9])
        qstartpos = int(hitlist[6])
        blastlen = int(hitlist[3])
        snp_loc_s = sstartpos + 26 - qstartpos
        snpindex = 26 - qstartpos
        if sstartpos > sendpos:
            temp = sstartpos
            sstartpos = sendpos
            sendpos = temp
            revcom = True
        if lastsnpID == hitlist[0]:
            # Same query as the previous hit: accumulate its candidate base.
            RefSeqMap = Util.getRefSeqBypos(refFastahander=ancestryreffile,
                                            refindex=ancestryrefidx,
                                            currentChromNO=chrom,
                                            startpos=sstartpos,
                                            endpos=sendpos)
            if revcom:
                tempStr = RefSeqMap[chrom][1:]
                tempStr.reverse()
                RefSeqMap[chrom][1:] = Util.complementary(tempStr)
                revcom = False
            print(lastsnpID,
                  RefSeqMap[chrom][snpindex + 1],
                  str(snp_loc_s),
                  "".join(RefSeqMap[chrom][1:]),
                  file=ancestrysnpflank)
            if RefSeqMap[chrom][snpindex + 1] in lastbasesAccur:
                lastbasesAccur[RefSeqMap[chrom][snpindex + 1]].append(
                    (chrom, sstartpos, sendpos))
            else:
                lastbasesAccur[RefSeqMap[chrom][snpindex + 1]] = [
                    (chrom, sstartpos, sendpos)
                ]
            onegroup.append((RefSeqMap[chrom][snpindex + 1], blastlen))
        else:
            # A new query id starts: evaluate the finished group first.
            # Read/write the database using a different primary key, i.e. what
            # used to be snpid is now replaced by something else
            # (chrID + snp_pos parsed out of the query id).
            snppos = re.search(r"_(\d+)", lastsnpID).group(1)
            snpChrom = re.search(r"(.+)_(\d+)", lastsnpID).group(1)
            onegroup.sort(key=lambda listRec: listRec[1])
            # NOTE(review): `chrom` and `snp_loc_s` used in the queries below
            # already belong to the NEW hit, not to the finished group.
            if len(onegroup) == 1 or onegroup[0][1] - onegroup[1][
                    1] >= 15:  #first , only one query id,second longest hit 15 bases greater than the second longest hit
                if ancestralsnptable != None and self.dbtools.operateDB(
                        "select", "select count(*) from " + ancestralsnptable +
                        " where chrID= '" + chrom + "' and snp_start_pos= " +
                        str(snp_loc_s))[0][0] == 0:
                    print("update " + tablename + " set ancestralallel='" +
                          onegroup[0][0] + "' where chrID='" + snpChrom +
                          "'and snp_pos=" + snppos)
                    self.dbtools.operateDB(
                        "update",
                        "update " + tablename + " set ancestralallel='" +
                        onegroup[0][0] + "' where chrID='" + snpChrom +
                        "'and snp_pos=" + snppos)
                else:
                    # Diagnostic only: print the lookup query and its result.
                    print(
                        "select count(*) from " + ancestralsnptable +
                        " where chrID= '" + chrom + "' and snp_start_pos= " +
                        str(snppos),
                        self.dbtools.operateDB(
                            "select", "select count(*) from " +
                            ancestralsnptable + " where chrID= '" + chrom +
                            "' and snp_start_pos= " + str(snppos)))
            elif (len(lastbasesAccur.keys()) == 1 and self.dbtools.operateDB(
                    "select", "select count(*) from " + ancestralsnptable +
                    " where chrID= '" + chrom + "' and snp_start_pos= " +
                    str(snp_loc_s))[0][0] == 0):
                # All hits of the group agree on a single base.
                for bases in lastbasesAccur:  #only once
                    print("update " + tablename + " set ancestralallel='" +
                          bases + "' where chrID='" + snpChrom +
                          "' and snp_pos=" + snppos)
                    self.dbtools.operateDB(
                        "update",
                        "update " + tablename + " set ancestralallel='" +
                        bases + "' where chrID='" + snpChrom +
                        "' and snp_pos=" + snppos)
            elif len(lastbasesAccur.keys()) == 0:
                print(" len(lastbasesAccur.keys()) == 0")
                exit(-1)
            # Start the next group with the current hit.
            RefSeqMap = Util.getRefSeqBypos(refFastahander=ancestryreffile,
                                            refindex=ancestryrefidx,
                                            currentChromNO=chrom,
                                            startpos=sstartpos,
                                            endpos=sendpos)
            if revcom:
                tempStr = RefSeqMap[chrom][1:]
                tempStr.reverse()
                RefSeqMap[chrom][1:] = Util.complementary(tempStr)
                revcom = False
            print(hitlist[0],
                  RefSeqMap[chrom][snpindex + 1],
                  str(snp_loc_s),
                  "".join(RefSeqMap[chrom][1:]),
                  file=ancestrysnpflank)
            # dbtools.operateDB("update", "update " + finaltable + " set chicken='" + RefSeqMap[chrom][snpindex + 1] + "' where snpID='" + hitlist[0] + "'")
            lastsnpID = hitlist[0]
            lastbasesAccur.clear()
            lastbasesAccur[RefSeqMap[chrom][snpindex + 1]] = [
                (chrom, sstartpos, sendpos)
            ]
    print("finish")
    ancestryreffile.close()
def getflankseqs(self,
                 chrom,
                 chromlen,
                 snpstartpos,
                 snpendpos,
                 idxedreffilehandler,
                 refindex,
                 flanklen,
                 outfile,
                 tablename="derived_alle_ref"):
    """Write the flanking sequence of every SNP in a region as FASTA records.

    SNPs of ``chrom`` with ``snpstartpos <= snp_pos <= snpendpos`` are read
    from ``tablename``.  Rows whose columns 3 or 4 are not exactly one
    character long are skipped (indels).  For each remaining SNP a record
    ``><chrom>_<pos>`` is written to ``outfile`` whose sequence is up to 25
    bases on each side with the SNP itself replaced by ``N``.  The original
    base at the SNP is appended to ``testsnpfile.txt`` in the current
    directory.

    Args:
        chrom: chromosome id, used in the query and as record-name prefix.
        chromlen: chromosome length, forwarded to ``Util.getRefSeqBypos``.
        snpstartpos, snpendpos: inclusive SNP position range.
        idxedreffilehandler, refindex: reference file handle and its index,
            forwarded to ``Util.getRefSeqBypos``.
        flanklen: padding added on both sides when fetching the reference
            window.  NOTE(review): the flank actually extracted is hard-coded
            to 25 bases regardless of this value — confirm before changing.
        outfile: writable text file object receiving the FASTA records.
        tablename: table holding the SNPs.

    Calls ``exit(-1)`` if a SNP matches none of the three window cases.
    """
    # `with` guarantees the side file is closed on exit(-1) and on errors.
    with open("testsnpfile.txt", 'a') as testfile:
        snps = self.dbtools.operateDB(
            "select", "select * from " + tablename + " where chrID='" + chrom +
            "' and snp_pos>= " + str(snpstartpos) + " and snp_pos<=" +
            str(snpendpos))
        RefSeqMap = Util.getRefSeqBypos(idxedreffilehandler, refindex, chrom,
                                        snpstartpos - flanklen,
                                        snpendpos + flanklen, chromlen)
        seq = RefSeqMap[chrom]
        # Element 0 is compared against positions below; presumably it is the
        # start coordinate of the fetched window — confirm against
        # Util.getRefSeqBypos.
        seqstart = seq[0]
        lastpos = seqstart + len(seq) - 1

        # Tolerate a falsy "no rows" result from the DB layer.
        for snp in snps or ():
            currentsnpPos = snp[1]
            if len(snp[3]) != 1 or len(snp[4]) != 1:
                continue  # skip indel
            currentsnpID = chrom + "_" + str(snp[1])
            offset = currentsnpPos - seqstart

            if currentsnpPos + 25 <= lastpos and currentsnpPos - 25 > seqstart:
                # Both flanks fit inside the fetched window.
                snpflankseq = ''.join(seq[(offset - 25):(offset + 25 + 1)])
                print(currentsnpID, snpflankseq[25], file=testfile)
                snpflankseq = snpflankseq[0:25] + 'N' + snpflankseq[26:]
            elif currentsnpPos <= lastpos and currentsnpPos + 25 > lastpos:
                # Right flank would run past the window: left flank only.
                snpflankseq = ''.join(seq[(offset - 25):(offset + 1)])
                print(currentsnpID, snpflankseq[25], file=testfile)
                snpflankseq = snpflankseq[0:25] + 'N'
            elif currentsnpPos - 25 <= seqstart:
                # Left flank would start before the window: right flank only.
                snpflankseq = ''.join(seq[offset:(offset + 25 + 1)])
                print(currentsnpID, snpflankseq[0], file=testfile)
                snpflankseq = 'N' + snpflankseq[1:26]
            else:
                print("what's wrong with the func getflankseqs ?")
                exit(-1)

            print(">" + currentsnpID + "\n" + snpflankseq,
                  end='\n',
                  file=outfile)
def filldata(self, vcfFileName, depthfileName, tablename="derived_alle_ref", posUniq=True, continuechrom=None, continuepos=None):
    """Load per-sample allele depths from a GATK VCF into ``tablename``.

    The ``#CHROM`` header line supplies the sample names; one
    ``varchar(128)`` column per sample is added to the table through the
    ``mysql_sp_add_column`` stored procedure.  Every VCF record then becomes
    one ``insert ... where not exists`` statement whose per-sample value is:

    * the record's ``AD`` field when the sample column has as many
      ``:``-separated fields as the FORMAT column;
    * otherwise (e.g. ``./.``) a value derived from the depth file at that
      position: ``'no covered'`` when the depth is <= 1, else ``"<depth>,0"``.

    Parameters:
        vcfFileName: path of the VCF file to read.
        depthfileName: path of the depth file; ``depthfileName + ".index"``
            is passed to ``Util.GATK_depthfile`` as its index.
        tablename: destination table.
        posUniq: when true, a record at the same chromosome and position as
            the previously seen record is skipped.
        continuechrom, continuepos: when both are given, reading resumes
            after the record at that chromosome/position (located via
            ``VCFutil.VCF_Data(...).VcfIndexMap``) instead of at the first
            record.

    Raises:
        SystemExit: with status -1 if the header line is missing or the first
            record carries a samtools ``DP4=`` INFO field (unsupported).
    """
    depthfile = Util.GATK_depthfile(depthfileName, depthfileName + ".index")
    try:
        with open(vcfFileName, 'r') as vcffile:
            # Skip the "##" meta lines; the next line must be the "#CHROM" header.
            vcfline = vcffile.readline()
            while vcfline.startswith("##"):
                vcfline = vcffile.readline()
            if not vcfline.startswith("#"):
                print(
                    "need title'#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT'"
                )
                raise SystemExit(-1)
            poptitlelist = re.split(r'\s+', vcfline.strip())[9:]
            print(poptitlelist)

            for pop in poptitlelist:
                self.dbtools.operateDB("callproc", "mysql_sp_add_column",
                                       data=("life_pilot", tablename, pop,
                                             "varchar(128)", "default null"))

            def store_record(vcflist):
                """Insert one split VCF record; return its (chrom, pos)."""
                chrom = vcflist[0].strip()
                pos = int(vcflist[1].strip())
                snpID = vcflist[2].strip()
                REF = vcflist[3].strip()
                ALT = vcflist[4].strip()
                formatfields = re.split(":", vcflist[8])
                AD_idx = formatfields.index("AD")  # gatk GT:AD:DP:GQ:PL

                popsdata = []  # depth for ref or alt, one entry per sample
                depth_linelist = None  # depth row, fetched at most once per record
                for sample_idx, sample in enumerate(vcflist[9:]):
                    samplename = poptitlelist[sample_idx]
                    species_idx = depthfile.title.index("Depth_for_" + samplename)
                    samplefields = re.split(":", sample)
                    if len(samplefields) != len(formatfields):
                        # "./." : no variant information, fall back to the depth file
                        if depth_linelist is None:
                            depth_linelist = depthfile.getdepthByPos(chrom, pos)
                        if int(depth_linelist[species_idx]) <= 1:
                            popsdata.append('no covered')
                        else:
                            popsdata.append(depth_linelist[species_idx] + ",0")
                        continue
                    popsdata.append(samplefields[AD_idx])

                # NOTE(review): table/column names and the chrom/pos in the
                # sub-select are concatenated into the statement; they must
                # come from trusted input.
                sql = ("insert into " + tablename +
                       "(chrID,snp_pos,snpID,ref_base,alt_base," +
                       ",".join(poptitlelist) + ") select %s,%s,%s,%s,%s," +
                       "%s," * (len(poptitlelist) - 1) +
                       "%s from dual where not exists( select * from " +
                       tablename + " where " + tablename + ".chrID='" + chrom +
                       "' and " + tablename + ".snp_pos=" + str(pos) + ")")
                row = (chrom, pos, snpID, REF, ALT) + tuple(popsdata)
                print(sql, row)
                self.dbtools.operateDB("insert", sql, data=row)
                return chrom, pos

            chrom = None
            pos = None
            if continuechrom is not None and continuepos is not None:
                # Resume: seek to the chromosome, then scan forward to the
                # record that was already stored; it is not inserted again.
                print("filldata", continuechrom, continuepos)
                vcfpossearcher = VCFutil.VCF_Data(vcfFileName)
                vcffile.seek(vcfpossearcher.VcfIndexMap[continuechrom])
                vcfline = vcffile.readline()
                while vcfline:
                    vcflist = re.split(r'\s+', vcfline.strip())
                    chrom = vcflist[0].strip()
                    pos = int(vcflist[1].strip())
                    print(chrom, pos)
                    if chrom == continuechrom and pos == continuepos:
                        break
                    vcfline = vcffile.readline()
            else:
                # The first record decides whether this is a GATK or a
                # samtools VCF.
                firstrecord = vcffile.readline()
                if not firstrecord:
                    return  # header only: nothing to load
                vcflist = re.split(r'\s+', firstrecord.strip())
                if re.search(r"DP4=(\d*),(\d*),(\d*),(\d*)", vcflist[7]) is not None:
                    # vcf from samtools
                    print("function for samtools vcf is still need to be finish")
                    raise SystemExit(-1)
                chrom, pos = store_record(vcflist)

            for vcfline in vcffile:
                print(vcfline)
                if not vcfline.strip():
                    continue  # tolerate blank lines (e.g. at end of file)
                vcflist = re.split(r'\s+', vcfline.strip())
                # A duplicate is the same position on the SAME chromosome;
                # comparing the position alone would drop the first record of
                # a new chromosome whenever it matched the previous position.
                if (posUniq and chrom == vcflist[0].strip()
                        and pos == int(vcflist[1].strip())):
                    continue
                chrom, pos = store_record(vcflist)
    finally:
        depthfile.closedepthfile()
if aafafileName != None and cdsfafileName != None: aa_cds_filemap[speciesname] = [ open(aafafileName, 'r'), open(cdsfafileName, 'r') ] aaindex = {} cdsindex = {} try: aa_cds_filemap[speciesname].append( pickle.load(open(aafafileName + ".myindex", 'rb'))) aa_cds_filemap[speciesname].append( pickle.load(open(cdsfafileName + ".myindex", 'rb'))) except IOError: print("generateIndexByChrom", speciesname) Util.generateIndexByChrom(aafafileName, aafafileName + ".myindex", "transcript:") Util.generateIndexByChrom(cdsfafileName, cdsfafileName + ".myindex") aa_cds_filemap[speciesname].append( pickle.load(open(aafafileName + ".myindex", 'rb'))) aa_cds_filemap[speciesname].append( pickle.load(open(cdsfafileName + ".myindex", 'rb'))) stat = os.system("rm " + aafafileName + ".myindex " + cdsfafileName + ".myindex") if stat != 0: print("rm " + aafafileName + ".myindex " + cdsfafileName + ".myindex" + " os.system return not 0") exit(-1) print( "rm " + aafafileName + ".myindex " + cdsfafileName +
(options, args) = parser.parse_args() reffa_linktoDB_Name = options.reffa_linktoDB.strip() reffa_linktoDB_hanlder = open(reffa_linktoDB_Name, 'r') reffa_linktoDB_idxName = reffa_linktoDB_Name + ".myindex" reffahanlder = open(options.reffa.strip(), 'r') outfile = open(options.reffa.strip() + "MAPTO" + reffa_linktoDB_Name, 'w') if __name__ == '__main__': dbtools = dbm.DBTools(Util.ip, Util.username, Util.password, Util.genomeinfodbname) try: refidxByChr = pickle.load(open(reffa_linktoDB_idxName, 'rb')) except IOError: Util.generateIndexByChrom(reffa_linktoDB_Name, reffa_linktoDB_idxName) refidxByChr = pickle.load(open(reffa_linktoDB_idxName, 'rb')) for onelineAscaffold in reffahanlder: onelineAscaffold = onelineAscaffold.lower() if re.search(r'^>', onelineAscaffold) != None: current_scaffold = re.search(r'^>(.*)', onelineAscaffold).group(1).strip() else: current_len = len(onelineAscaffold.strip()) selectedchr = dbtools.operateDB( "select", "select * from " + Util.pekingduckchromtable + " where chrlength=" + str(current_len)) print( current_scaffold,
def _findTrscpt_write_marked_copy(winfile, markedfilename):
    """Copy ``winfile`` adding a last column: 'mark' on the header, 'unknown' on data rows."""
    with open(winfile, 'r') as src, open(markedfilename, 'w') as dst:
        for lineno, line in enumerate(src):
            suffix = "\tmark" if lineno == 0 else "\tunknown"
            dst.write(line.rstrip("\n") + suffix + "\n")


def _findTrscpt_region_from_run(chrom, run, winType, morethan_lessthan, winwidth, slideSize):
    """Collapse a run of adjacent window rows into one region tuple."""
    if winType == "winvalue":
        valuecol = 5
    elif winType == "zvalue":
        valuecol = 6
    else:
        raise ValueError(
            "findTrscpt: winType must be 'winvalue' or 'zvalue', got %r" % (winType,))
    values = [float(rec[valuecol]) for rec in run]
    snpcounts = [int(rec[4]) for rec in run]
    # The reported value is the least extreme one of the run: the smallest
    # for "more than" selections, the largest for "less than" selections.
    if morethan_lessthan in ("m", "M"):
        extremeValue = min(values)
    else:
        extremeValue = max(values)
    return (chrom, int(run[0][1]) * slideSize,
            int(run[-1][1]) * slideSize + winwidth, len(run), extremeValue,
            min(snpcounts), max(snpcounts))


def _findTrscpt_merge_over_na(chrom, regions, winGenome, mergeNA, morethan_lessthan, winwidth, slideSize):
    """Join neighbouring regions separated only by at most ``mergeNA`` value-less windows."""
    if not regions:
        return regions
    merged = [regions[0]]
    for cur in regions[1:]:
        prev = merged[-1]
        winNo_end = str(int(cur[1] / slideSize))
        winNo_start = str(int((prev[2] - winwidth) / slideSize))
        gapfilter = (" where " + " chrID='" + chrom + "' and winNo>" +
                     winNo_start + " and winNo<" + winNo_end)
        print("select * from " + winGenome.wintablewithoutNA + gapfilter)
        wincount_to_determine = winGenome.windbtools.operateDB(
            "select", "select * from " + winGenome.wintablewithoutNA + gapfilter)
        wincount_to_add = winGenome.windbtools.operateDB(
            "select",
            "select * from " + winGenome.wintabletextvalueallwin + gapfilter)
        if len(wincount_to_determine) == 0 and len(wincount_to_add) <= int(mergeNA):
            if morethan_lessthan in ("m", "M"):
                extremeValue = min(cur[4], prev[4])
            else:
                extremeValue = max(cur[4], prev[4])
            # Region tuples are (chrom, start, end, Nwin, value, minSNP, maxSNP):
            # the SNP-count bounds live at indices 5 and 6, not at 3 (Nwin).
            merged[-1] = (chrom, prev[1], cur[2],
                          prev[3] + cur[3] + len(wincount_to_add), extremeValue,
                          min(cur[5], prev[5]), max(cur[6], prev[6]))
        else:
            merged.append(cur)
    return merged


def findTrscpt(winfile, outbedfilename, upextend, downextend, winwidth, slideSize, winType, morethan_lessthan, threshold_title_list=None, percentage=None, mergeNA=False, extendtodistal=0, anchorfile=None, found=False, mapfile=None):
    """Select outlier windows, merge them into regions and list overlapping transcripts.

    Steps:

    1. Produce ``winfile + "marked.sexchromseperatestandard"``: via
       ``Util.mapWinvaluefileToChrOfReletiveSpecie`` when ``anchorfile`` is
       given, otherwise a copy of ``winfile`` with an extra ``mark`` column
       whose data rows hold ``unknown``.
    2. If ``found`` is true, return the ``.wincopywithgene`` file name
       immediately without touching the database.
    3. Load the windows through ``Util.WinInGenome``, annotate them with
       ``appendGeneName`` and select windows either by ``percentage`` (top or
       bottom fraction of all windows) or by ``threshold_title_list``
       (first element for autosomes / everything, last element for sex
       chromosomes when ``anchorfile`` is used).
    4. Merge adjacent or overlapping selected windows per chromosome, and
       optionally (``mergeNA`` > 0) regions separated by at most ``mergeNA``
       windows that have no value.
    5. Write one line per region, with its transcripts, overlap codes and
       gene names, to ``outbedfilename + ".bed.selectedgene"``.

    Parameters:
        winfile: window value file; its header must contain ``winType``.
        outbedfilename: prefix of the output file; its directory part (or the
            current directory) is where the ``.wincopywithgene`` name points.
        upextend, downextend, extendtodistal: forwarded to
            ``appendGeneName`` / ``collectTrscptInWin``.
        winwidth, slideSize: window width and step used to turn window
            numbers into coordinates.
        winType: ``"winvalue"`` or ``"zvalue"``; also used as SQL column name.
        morethan_lessthan: ``"m"``/``"M"`` selects values >= threshold (or the
            top fraction), ``"l"``/``"L"`` values <= threshold (or the bottom
            fraction).
        threshold_title_list: list of threshold strings (exclusive with
            ``percentage``).
        percentage: fraction of all windows to select.
        mergeNA: maximum number of value-less windows bridged when merging.
        anchorfile, mapfile: forwarded to
            ``Util.mapWinvaluefileToChrOfReletiveSpecie``.
        found: return the gene-annotated window file name only.

    Returns:
        The path of the ``.wincopywithgene`` file.

    Raises:
        SystemExit: -1 when both ``percentage`` and ``threshold_title_list``
            are given; 0 when no window is selected.
        ValueError: for an unknown ``morethan_lessthan``, when neither
            selection criterion is given, or for an unsupported ``winType``.
    """
    if percentage is not None and threshold_title_list is not None:
        print("-t conflict with -p")
        raise SystemExit(-1)

    winFileName8Field = winfile + "marked.sexchromseperatestandard"
    if anchorfile:
        # Only the call's side effects are used; the two returned values are not.
        _winfilemark, _winfilearrangement = Util.mapWinvaluefileToChrOfReletiveSpecie(
            anchorfile, winfile, winwidth, slideSize, True, mapfile)
    else:
        # Previously an awk one-liner whose "NR=1" (assignment, always true)
        # tagged every row as the header; done in Python so data rows really
        # get "unknown" and no shell is involved.
        _findTrscpt_write_marked_copy(winfile, winFileName8Field)

    with open(winFileName8Field, "r") as f:
        title = re.split(r"\s+", f.readline().strip())
    Nocol = title.index(winType) + 1

    dirmatch = re.search(r'^.*/', outbedfilename)
    if dirmatch is not None:
        path = dirmatch.group(0)
    else:
        path = os.getcwd() + "/"
    outfileNameWINwithGENE = path + re.search(
        r"[^/]*$", winFileName8Field).group(0) + ".wincopywithgene"
    if found:
        return outfileNameWINwithGENE

    # Fail early with a clear message instead of a NameError further down.
    if morethan_lessthan not in ("m", "M", "l", "L"):
        raise ValueError(
            "findTrscpt: morethan_lessthan must be one of 'm', 'M', 'l', 'L', got %r"
            % (morethan_lessthan,))
    if percentage is None and threshold_title_list is None:
        raise ValueError(
            "findTrscpt: either percentage or threshold_title_list is required")
    morethan = morethan_lessthan in ("m", "M")

    with open(outbedfilename + ".bed.selectedgene", 'w') as outfile:
        print("chrNo\tRegion_start\tRegion_end\tNoofWin\textram" + winType +
              "\tminNoSNP\tmaxNoSNP\ttranscpt\toverlapcode\tgeneID",
              file=outfile)
        # Connection info for debugging; the password is deliberately not echoed.
        print(Util.ip, Util.username, Util.genomeinfodbname)
        genomedbtools = dbm.DBTools(Util.ip, Util.username, Util.password,
                                    Util.genomeinfodbname)
        winGenome = Util.WinInGenome(Util.ghostdbname, winFileName8Field, Nocol)
        try:
            time.sleep(SLEEP_FOR_NEXT_TRY)

            if percentage is not None:
                # Number of windows to keep is a fraction of all valued windows,
                # counted before gene names are appended.
                totalWin = winGenome.windbtools.operateDB(
                    "select",
                    "select count(*) from " + winGenome.wintablewithoutNA)[0][0]
                selectWinNos = int(float(percentage) * totalWin)

            winGenome.appendGeneName(Util.TranscriptGenetable, genomedbtools,
                                     winwidth, slideSize, outfileNameWINwithGENE,
                                     upextend, downextend,
                                     (10, morethan_lessthan))

            if percentage is not None:
                query = ("select * from " + winGenome.wintablewithoutNA +
                         " where 1 order by " + winType +
                         (" desc" if morethan else " asc") + " limit 0," +
                         str(selectWinNos))
                selectedWins = winGenome.windbtools.operateDB("select", query)
                print(query)
            else:
                comparison = ">=" if morethan else "<="
                if anchorfile:
                    wherestatment = (
                        " where (mark='autosome' and " + winType + comparison +
                        threshold_title_list[0] +
                        ") or (mark='sexchromosome' and " + winType +
                        comparison + threshold_title_list[-1] + ")")
                elif morethan:
                    wherestatment = (" where 1 and " + winType + ">=" +
                                     threshold_title_list[0])
                else:
                    wherestatment = (" where " + winType + "!= 'NA' and " +
                                     winType + "<=" + threshold_title_list[0])
                selectedWins = winGenome.windbtools.operateDB(
                    "select",
                    "select * from " + winGenome.wintablewithoutNA + wherestatment)
                selectWinNos = len(selectedWins)

            selectedWins = sorted(selectedWins,
                                  key=lambda listRec: float(listRec[5]))
            if selectWinNos == 0 or not selectedWins:
                print("selectWinNos==0")
                raise SystemExit(0)
            print(outbedfilename + ".bed.selectgene", selectWinNos, "~=",
                  len(selectedWins), selectedWins[0], selectedWins[-1])

            # Group the selected windows by chromosome.
            selectedWinMap = {}
            for win in selectedWins:
                selectedWinMap.setdefault(win[0], []).append(win)

            # Merge consecutive / overlapping windows into regions.
            selectedRegion = {}
            for chrom, wins in selectedWinMap.items():
                wins.sort(key=lambda listRec: int(listRec[1]))
                regions = []
                run = [wins[0]]
                for prevwin, curwin in zip(wins, wins[1:]):
                    prevNo = int(prevwin[1])
                    curNo = int(curwin[1])
                    if (prevNo + 1 == curNo or
                            prevNo * slideSize + winwidth >= curNo * slideSize):
                        run.append(curwin)  # continuous window
                    else:
                        regions.append(_findTrscpt_region_from_run(
                            chrom, run, winType, morethan_lessthan, winwidth,
                            slideSize))
                        run = [curwin]
                regions.append(_findTrscpt_region_from_run(
                    chrom, run, winType, morethan_lessthan, winwidth, slideSize))
                selectedRegion[chrom] = regions

            bridge_na = bool(mergeNA) and int(mergeNA) > 0
            for chrom in selectedRegion:
                selectedRegion[chrom].sort(key=lambda listRec: int(listRec[1]))
                if bridge_na:
                    selectedRegion[chrom] = _findTrscpt_merge_over_na(
                        chrom, selectedRegion[chrom], winGenome, mergeNA,
                        morethan_lessthan, winwidth, slideSize)

            # get final table
            print("getting final table")
            final_table = {}
            for chrom in selectedRegion:
                for region in selectedRegion[chrom]:
                    print(chrom, region)
                    if extendtodistal > 0:
                        final_table[region] = winGenome.collectTrscptInWin(
                            genomedbtools, Util.TranscriptGenetable, region,
                            upextend, downextend, extendtodistal)
                    else:
                        final_table[region] = winGenome.collectTrscptInWin(
                            genomedbtools, Util.TranscriptGenetable, region,
                            upextend, downextend)

            # Emit regions in the chromosome order known to winGenome.
            print("fill bedselectedtable")
            for chrom in winGenome.chromOrder:
                if chrom not in selectedRegion:
                    continue
                for region in selectedRegion[chrom]:
                    if chrom.strip() != region[0].strip():
                        continue
                    transcripts = final_table[region]
                    tcpts = ",".join(tcpt[0] for tcpt in transcripts)
                    tpcode = ",".join(str(tcpt[-1]) for tcpt in transcripts)
                    gnames = ",".join(tcpt[2] for tcpt in transcripts
                                      if tcpt[2].strip() != "")
                    print("\t".join(map(str, region)), tcpts, tpcode, gnames,
                          sep="\t", file=outfile)
        finally:
            # The working tables are temporary: drop them on every exit path,
            # including the "nothing selected" SystemExit.
            winGenome.windbtools.drop_table(winGenome.wintabletextvalueallwin)
            winGenome.windbtools.drop_table(winGenome.wintablewithoutNA)
    return outfileNameWINwithGENE
(options, args) = parser.parse_args() outfile = open(options.outfile, 'w') outfilewithvalue = open(options.outfile + "_withvalue", 'w') percentage = float(options.percentageofCovered) averagedepth = int(options.averagedepthThreshold) chromtable = Util.pekingduckchromtable windowWidth = int(options.winwidth) slideSize = int(options.slidesize) mindepth = int(options.mindepth) print(percentage) if __name__ == '__main__': depthbinmap = {} mywin = Util.Window() dbtools = dbm.DBTools(Util.ip, Util.username, Util.password, Util.genomeinfodbname) depthfile = Util.GATK_depthfile(options.genomedepth, options.genomedepth + ".index") if "" in depthfile.title: depthfile.title.remove("") print(depthfile.title, len(depthfile.title) - 3) if options.speciesnames == []: print("chrom", "start_pos", "end_pos", *depthfile.title[3:], sep="\t", file=outfile) print("chrom",
# for a,b in sorted(freq_xaxisKEY_yaxisVALUE_seq_list.keys()): # freq_xaxisKEY_yaxisVALUERelation[(a,b)]=numpy.mean(freq_xaxisKEY_yaxisVALUE_seq_list[(a,b)]) # print('%.12f'%a,'%.12f'%(b),'%.12f'%(freq_xaxisKEY_yaxisVALUERelation[(a,b)]),"process ID:",os.getpid(),"done",sep="\t") print("process ID:", os.getpid(), "done") return copy.deepcopy(freq_xaxisKEY_yaxisVALUE_seq_list) if __name__ == '__main__': filenamelistfilename = options.outfileprewithpath + ".freqcorrelationfilenamelist" parameterstuples = (options.chromlistfilename, options.topleveltablejudgeancestral, options.targetpopvcfconfig, options.refpopvcffileconfig, options.numberofindvdoftargetpop_todividintobin) print(parameterstuples, options.outfileprewithpath) freq_xaxisKEY_yaxisVALUE_seq_list = make_freq_xaxisKEY_yaxisseqVALUERelation( parameterstuples) outfilename = options.outfileprewithpath + "_part_" + str( os.getpid()) + Util.random_str() outfile = open(outfilename, 'w') filenamelistfile = open(filenamelistfilename, 'a') for a, b in sorted(freq_xaxisKEY_yaxisVALUE_seq_list.keys()): print(str(a), str(b), *freq_xaxisKEY_yaxisVALUE_seq_list[(a, b)], sep="\t", file=outfile) outfile.close() print(outfilename, file=filenamelistfile) print(sys.argv, outfilename) filenamelistfile.close() print("process ID:", os.getpid(), "finished") exit(0)
chromstable=options.chromtable primaryID = "chrID" OUTFILENAME="ducksnpflankseq.fa" # outfile=open("ducksnpflankseq.fa",'w') BlastOutFile="ducksnpflankseq.blast" if __name__ == '__main__': aaa=DAP.MakeDerivedAlleletable(database=dbname,ip="10.2.48.96",usrname="root",pw="1234567") # aaa=DAP.MakeDerivedAlleletable(database=dbname,ip="10.2.48.140",usrname="root",pw="1234567") # ddd=MP.Dstistics_allpop(allpop) # ddd.caculateDofAllpossibleCombination(database=dbname,ip="10.2.48.140",usrname="root",pw="1234567", allpopssnptable="derived_alle_ref", chromstable=chromstable, winwidth=None, minlengthOfchrom=minlengthOfchrom, filenamepre=options.prefilename) dbtoolsforchrom = dbm.DBTools(Util.ip, Util.username, Util.password, Util.genomeinfodbname) try: duckrefindex = pickle.load(open(options.reference + ".myindex", 'rb')) originalspeciesindex = pickle.load(open(originalspeciesref + ".myindex", 'rb')) except IOError: Util.generateIndexByChrom(options.reference, options.reference + ".myindex") Util.generateIndexByChrom(originalspeciesref, originalspeciesref + ".myindex") duckrefindex = pickle.load(open(options.reference + ".myindex", 'rb')) originalspeciesindex = pickle.load(open(originalspeciesref + ".myindex", 'rb')) # aaa.createtable() # aaa.filldata(vcfFileName=vcfFileName,depthfileName=DepthFileName,continuechrom=continuechrom,continuepos=continuepos) aaa.fillarchicpop(archicpopVcfFile,DepthFileName,chromstable,archicpopNameindepthFile) # totalChroms = dbtoolsforchrom.operateDB("select","select count(*) from "+chromstable)[0][0] # for i in range(0,totalChroms,20): # currentsql="select * from " + chromstable+" order by chrlength limit "+str(i)+",20" result=dbtoolsforchrom.operateDB("select",currentsql) # for row in result: # currentchrID=row[0] # currentchrLen=int(row[2]) # aaa.getflankseqs(currentchrID,currentchrLen, 1+flanklen, currentchrLen, idxedreffilehandler=duckrefhandler, refindex=duckrefindex, flanklen=flanklen,outfile=outfile, tablename="derived_alle_ref") # 
outfile.close()
dest="verbose", default=True, help="don't print status messages to stdout") test = open("test.txt", 'w') (options, args) = parser.parse_args() if __name__ == '__main__': phastConsfile = open(options.infilename, "r") L = [] firstline = re.split(r'\s+', phastConsfile.readline()) currentchrom = firstline[0] winStart = int(firstline[1]) L = [(winStart, int(firstline[2]), float(firstline[3]))] print(L) for line in phastConsfile: linelist = re.split(r'\s+', line) if currentchrom == linelist[0]: L.append((int(linelist[1]), int(linelist[2]), float(linelist[3]))) caculate_phastConsValue = Caculators.Caculate_phastConsValue() win = Util.Window() print(int(options.winwidth), winStart) win.forPhastConsFormat(L=L, L_End_Pos=len(L), windowWidth=int(options.winwidth), Caculator=caculate_phastConsValue, winStart=winStart) for e in win.winValueL: print(*e, sep='\t', file=test) phastConsfile.close() test.close()
minlength = options.minlength vcffileslist = options.vcffile sql = "select * from " + chromtable + " where chrlength>=" + minlength class SNPsPerBIN(): def __init__(self): self.SNPsPerBINMap = {} if __name__ == '__main__': speicesidxs_inbindepthmap = [] if len(vcffileslist[:]) == 1 and len( options.specieses) != 0 and options.coveragebin != None: bindepth = Util.BinDepth(options.coveragebin) for species in options.specieses: speicesidxs_inbindepthmap.append( bindepth.speciesname.index("Depth_for_" + species) + 2) consider_Depth = True else: consider_Depth = False dbtools = dbm.DBTools(Util.ip, Util.username, Util.password, Util.genomeinfodbname) print(vcffileslist[:]) for vcf in vcffileslist[:]: vcfname = re.search(r"[^/]*$", vcf).group(0) if re.search(r"indvd[^/]+", vcf) != None: snpcounter = Caculators.Caculate_SNPsPerBIN( windowWidth, considerINDEL=howtoIndel, MethodToSeq="indvd") elif re.search(r"pool[^/]+", vcf) != None:
testminintervalbetweengenes_basesperfaline.readline().strip()) print(minintervalbetweengenes_basesperfaline) #gtffile = open(options.gtffile, 'r') vcffile = open(options.variants, 'r') #covfile = open(options.genomedepth, 'r') cns_string = ">" aa_string = "" cdscns_string = "" outcns = open(options.outfileprename + "_cns.fa", 'w') outaa = open(options.outfileprename + "_aa.fa", 'w') outcdscns = open(options.outfileprename + "_cdscns.fa", 'w') cdsmap = {} if __name__ == '__main__': if options.genomedepth != None: depthfile = Util.GATK_depthfile(options.genomedepth, options.genomedepth + ".index") species_idx = depthfile.title.index("Depth_for_" + options.species) Considerdepth = True else: Considerdepth = False depthfile = None species_idx = -1 vcfpop = VCFutil.VCF_Data(options.variants) # new a class RefSeqMap, currentChromNO, nextChromNO = Util.getRefSeqMap( refFastafilehander=reffa) print(currentChromNO, nextChromNO) cns_string += currentChromNO + "\n" gtfMap = Util.getGtfMap(options.gtffile) lastposofdepthfp = 0 #because this time RefSeqMap[0] is 0 vcfchrom = "begin"
for chrlist, vcflikeFileName, corresponding_ref, flanklen in options.variantfilewithref: chromlistfile = open(chrlist, "r") chrmap = {} for rec in chromlistfile: reclist = re.split(r'\s+', rec.strip()) chrmap[reclist[0]] = reclist[1] flanklen = int(flanklen) duckrefhandler = open(corresponding_ref, 'r') try: duckrefindex = pickle.load( open(corresponding_ref + ".myfasteridx", 'rb')) # originalspeciesindex = pickle.load(open(originalspeciesref + ".myindex", 'rb')) except IOError: Util.generateFasterRefIndex(corresponding_ref, corresponding_ref + ".myfasteridx", chrsignal=options.chrsignal) duckrefindex = pickle.load( open(corresponding_ref + ".myfasteridx", 'rb')) vcflikefile = open(vcflikeFileName, 'r') vcflinesalchr = vcflikefile.readlines() #1,read variations chrom = None snpsOfOneChrom = [] startpostocollecteSNP = 1 while vcflinesalchr: snpline = vcflinesalchr.pop(0).strip() if snpline[0] == "#" or snpline.lower().find("chrom") == 0: #title continue else: