testminintervalbetweengenes_basesperfaline.readline().strip()) print(minintervalbetweengenes_basesperfaline) #gtffile = open(options.gtffile, 'r') vcffile = open(options.variants, 'r') #covfile = open(options.genomedepth, 'r') cns_string = ">" aa_string = "" cdscns_string = "" outcns = open(options.outfileprename + "_cns.fa", 'w') outaa = open(options.outfileprename + "_aa.fa", 'w') outcdscns = open(options.outfileprename + "_cdscns.fa", 'w') cdsmap = {} if __name__ == '__main__': if options.genomedepth != None: depthfile = Util.GATK_depthfile(options.genomedepth, options.genomedepth + ".index") species_idx = depthfile.title.index("Depth_for_" + options.species) Considerdepth = True else: Considerdepth = False depthfile = None species_idx = -1 vcfpop = VCFutil.VCF_Data(options.variants) # new a class RefSeqMap, currentChromNO, nextChromNO = Util.getRefSeqMap( refFastafilehander=reffa) print(currentChromNO, nextChromNO) cns_string += currentChromNO + "\n" gtfMap = Util.getGtfMap(options.gtffile) lastposofdepthfp = 0 #because this time RefSeqMap[0] is 0 vcfchrom = "begin"
def filldata(self,
             vcfFileName,
             depthfileName,
             tablename="derived_alle_ref",
             posUniq=True,
             continuechrom=None,
             continuepos=None):
    """Insert the per-sample AD value of every VCF record into ``tablename``.

    A column is added to ``tablename`` for each sample named in the
    ``#CHROM`` header line (through the ``mysql_sp_add_column`` procedure).
    Each record is then inserted with its chrom, pos, ID, REF, ALT and one
    value per sample, unless a row with the same chrID/snp_pos already
    exists.  For a sample whose genotype field does not have the same number
    of ':'-separated parts as FORMAT (e.g. ``./.``) the value comes from the
    depth file instead: ``'no covered'`` when the depth is <= 1, otherwise
    ``'<depth>,0'``.

    Args:
        vcfFileName: path of the VCF file to load; its FORMAT column must
            contain an ``AD`` key.
        depthfileName: path of the depth file; ``depthfileName + ".index"``
            is passed along as its index.
        tablename: destination table.
        posUniq: when true, a record whose chromosome and position both
            equal those of the previously handled record is skipped.
        continuechrom, continuepos: when both are given, loading resumes
            after the record at this chromosome/position (that record itself
            is not inserted again).

    The process exits with status -1 when the ``#CHROM`` header is missing,
    or when the first record carries a samtools-style ``DP4=`` INFO entry
    (not supported).
    """
    depthfile = Util.GATK_depthfile(depthfileName, depthfileName + ".index")
    try:
        # ``with`` + ``finally`` release both files on every path, including
        # the exit(-1) calls below (SystemExit still unwinds the stack).
        with open(vcfFileName, 'r') as vcffile:
            # Skip the '##' meta lines; the next line must be the header.
            vcfline = vcffile.readline()
            while vcfline.startswith("##"):
                vcfline = vcffile.readline()
            if not vcfline.startswith("#"):
                print(
                    "need title'#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT'"
                )
                exit(-1)
            poptitlelist = re.split(r'\s+', vcfline.strip())[9:]
            print(poptitlelist)

            for pop in poptitlelist:
                self.dbtools.operateDB("callproc",
                                       "mysql_sp_add_column",
                                       data=("life_pilot", tablename, pop,
                                             "varchar(128)", "default null"))

            # Column of every sample in the depth file, resolved once
            # instead of once per sample per record.
            # NOTE(review): assumes depthfile.title does not change while
            # the file is read -- confirm against Util.GATK_depthfile.
            species_indices = [
                depthfile.title.index("Depth_for_" + pop)
                for pop in poptitlelist
            ]

            # Record-independent part of the insert statement.
            # NOTE(review): chrom is still concatenated into the SQL text
            # (kept byte-identical to the previous statement); consider
            # binding it as a parameter like the inserted values.
            insert_head = ("insert into " + tablename +
                           "(chrID,snp_pos,snpID,ref_base,alt_base," +
                           ",".join(poptitlelist) +
                           ") select %s,%s,%s,%s,%s," + "%s," *
                           (len(poptitlelist) - 1) +
                           "%s from dual where not exists( select * from " +
                           tablename + " where " + tablename + ".chrID='")

            def insert_record(vcflist):
                """Insert one split VCF record and return its (chrom, pos)."""
                chrom = vcflist[0].strip()
                pos = int(vcflist[1].strip())
                snpID = vcflist[2].strip()
                REF = vcflist[3].strip()
                ALT = vcflist[4].strip()
                format_keys = vcflist[8].split(":")
                AD_idx = format_keys.index("AD")  # gatk GT:AD:DP:GQ:PL
                depth_linelist = None  # depth row, fetched at most once
                popsdata = []  # depth for ref or alt, one entry per sample
                for sample_idx, sample in enumerate(vcflist[9:]):
                    species_idx = species_indices[sample_idx]
                    fields = sample.split(":")
                    if len(fields) == len(format_keys):
                        popsdata.append(fields[AD_idx])
                        continue
                    # './.': no variant information for this sample, so
                    # fall back to the depth file.
                    if depth_linelist is None:
                        depth_linelist = depthfile.getdepthByPos(chrom, pos)
                    if int(depth_linelist[species_idx]) <= 1:
                        popsdata.append('no covered')
                    else:
                        popsdata.append(depth_linelist[species_idx] + ",0")
                sql = (insert_head + chrom + "' and " + tablename +
                       ".snp_pos=" + str(pos) + ")")
                data = (chrom, pos, snpID, REF, ALT) + tuple(popsdata)
                print(sql, data)
                self.dbtools.operateDB("insert", sql, data=data)
                return chrom, pos

            chrom = None
            pos = None
            if continuechrom is not None and continuepos is not None:
                # Resume: jump to the chromosome, then read forward to the
                # record that was handled last.
                print("filldata", continuechrom, continuepos)
                vcfpossearcher = VCFutil.VCF_Data(vcfFileName)
                vcffile.seek(vcfpossearcher.VcfIndexMap[continuechrom])
                vcfline = vcffile.readline()
                while vcfline:
                    vcflist = re.split(r'\s+', vcfline.strip())
                    chrom = vcflist[0].strip()
                    pos = int(vcflist[1].strip())
                    print(chrom, pos)
                    if chrom == continuechrom and pos == continuepos:
                        break
                    vcfline = vcffile.readline()
            else:
                firstline = vcffile.readline()
                if not firstline:
                    # Header only, no records: nothing to insert.
                    return
                vcflist = re.split(r'\s+', firstline.strip())
                # The first record tells GATK and samtools output apart.
                if re.search(r"DP4=(\d*),(\d*),(\d*),(\d*)",
                             vcflist[7]) is not None:
                    # vcf from samtools
                    print(
                        "function for samtools vcf is still need to be finish"
                    )
                    exit(-1)
                chrom, pos = insert_record(vcflist)

            for vcfline in vcffile:
                vcflist = re.split(r'\s+', vcfline.strip())
                print(vcfline)
                # A duplicate is the same position on the same chromosome;
                # comparing pos alone would drop the first record of a new
                # chromosome that shares a position with the previous one.
                if (posUniq and chrom == vcflist[0].strip()
                        and pos == int(vcflist[1].strip())):
                    continue
                chrom, pos = insert_record(vcflist)
    finally:
        depthfile.closedepthfile()
def fillarchicpop(self,
                  archicpopVcfFile,
                  depthFile,
                  chromtable,
                  archicpopNameindepthFile,
                  tablename="derived_alle_ref",
                  archicpopfieldNameintable="archicpop"):
    """Fill the archaic-population column for every SNP stored in ``tablename``.

    Only SNPs already present in ``tablename`` are visited, so SNPs that
    exist in ``archicpopVcfFile`` but in none of the other populations' SNP
    sets are ignored.

    Chromosomes are read from ``chromtable`` 20 rows at a time, ordered by
    ``chrlength`` descending.  For each SNP of a chromosome the archaic VCF
    records of that chromosome are binary-searched by position:

    * found: the column is set to ``'<ALT>:<refdepth>,<altdepth>'`` using
      the ALT of the archaic record.  Depths come from the ``DP4=`` INFO
      entry when present (samtools), otherwise from the AD values summed
      over all samples (GATK).
    * not found: the depth file is consulted; the column is set to
      ``'no covered'`` when the depth is <= 1, otherwise to
      ``'<ALT>:<depth>,0'`` using the ALT stored in the table.

    Args:
        archicpopVcfFile: path of the archaic population's VCF file.
        depthFile: path of the depth file; ``depthFile + ".index"`` is
            passed along as its index.
        chromtable: name of the table listing the chromosomes.
        archicpopNameindepthFile: sample name; the depth file column is
            located as ``"Depth_for_" + archicpopNameindepthFile``.
        tablename: table holding the SNPs to update.
        archicpopfieldNameintable: column of ``tablename`` to write.
    """

    def find_record(records, target_pos):
        """Binary-search ``records`` (sorted by item [0]); None if absent."""
        low = 0
        high = len(records) - 1
        while low <= high:
            mid = (low + high) >> 1
            if records[mid][0] < target_pos:
                low = mid + 1
            elif records[mid][0] > target_pos:
                high = mid - 1
            else:
                return records[mid]
        return None

    depthfile = Util.GATK_depthfile(depthFile, depthFile + ".index")
    try:
        species_idx = depthfile.title.index("Depth_for_" +
                                            archicpopNameindepthFile)
        archicpop = VCFutil.VCF_Data(archicpopVcfFile)
        totalChroms = self.dbtools.operateDB(
            "select", "select count(*) from " + chromtable)[0][0]
        for offset in range(0, totalChroms, 20):
            currentsql = ("select * from " + chromtable +
                          " order by chrlength desc limit " + str(offset) +
                          ",20")
            for row in self.dbtools.operateDB("select", currentsql):
                currentchrID = row[0]
                print(currentchrID + ":", end="")
                # NOTE(review): the binary search assumes this list is
                # sorted by position -- confirm against getVcfListByChrom.
                archicpopSeqOfAChr = archicpop.getVcfListByChrom(
                    archicpopVcfFile, currentchrID)
                allsnpsInAchr = self.dbtools.operateDB(
                    "select", "select snp_pos,alt_base from " + tablename +
                    " where chrID='" + currentchrID + "'")
                for snp in allsnpsInAchr:
                    snp_pos = int(snp[0])
                    record = find_record(archicpopSeqOfAChr, snp_pos)
                    if record is not None:
                        # ALT comes from the archaic VCF record here.
                        _pos, _ref, ALT, INFO, FORMAT, samples = record
                        refdep = 0
                        altalleledep = 0
                        dp4 = re.search(r"DP4=(\d*),(\d*),(\d*),(\d*)", INFO)
                        if dp4 is not None:  # vcf from samtools
                            refdep = int(dp4.group(1)) + int(dp4.group(2))
                            altalleledep = int(dp4.group(3)) + int(
                                dp4.group(4))
                        else:
                            AD_idx = FORMAT.split(":").index(
                                "AD")  # gatk GT:AD:DP:GQ:PL
                            for sample in samples:
                                fields = sample.split(":")
                                if len(fields) == 1:  # ./.
                                    continue
                                AD_depth = fields[AD_idx].split(",")
                                try:
                                    refdep += int(AD_depth[0])
                                    altalleledep += int(AD_depth[1])
                                except ValueError:
                                    # Non-numeric AD value: report the
                                    # sample and carry on (best effort).
                                    print(sample, end="")
                        popsdata = (ALT + ":" + str(refdep) + "," +
                                    str(altalleledep))
                    else:
                        # Position absent from the archaic VCF: use the
                        # depth file and the ALT stored in the table.
                        ALT = snp[1]
                        depth_linelist = depthfile.getdepthByPos(
                            currentchrID, snp_pos)
                        if int(depth_linelist[species_idx]) <= 1:
                            popsdata = "no covered"
                        else:
                            popsdata = (ALT + ":" +
                                        depth_linelist[species_idx] + ",0")
                    self.dbtools.operateDB(
                        "update", "update " + tablename + " set " +
                        archicpopfieldNameintable + " = '" + popsdata +
                        "' where chrID=" + "'" + currentchrID +
                        "' and snp_pos=" + str(snp[0]))
    finally:
        # Release the depth file even when a lookup or query fails.
        depthfile.closedepthfile()
help="ancenstral(a) or derived(d)") (options, args) = parser.parse_args() mindeptojudgefix = 15 ##################### VCFobj = {} vcfnameKEY_depthfilename_titlenameVALUE_tojudgeancestrall = {} vcfnameKEY_depthobjVALUE_tojudgeancestral = {} VCFobj["wigeon"] = VCFutil.VCF_Data( "/home/bioinfo/liurui/data/vcffiles/uniqmap/taihudomesticgoose/taihudomesticgoose.pool.withindel.vcf" ) VCFobj["fanya"] = VCFutil.VCF_Data( "/home/bioinfo/liurui/data/vcffiles/uniqmap/fanya/fanya._pool.withindel.vcf" ) vcfnameKEY_depthfilename_titlenameVALUE_tojudgeancestrall[ "wigeon"] = Util.GATK_depthfile( "/home/bioinfo/liurui/data/depth/g_j_sm_k_l_y_f_w_pool/gjsmklyfw_gatk.depth", "/home/bioinfo/liurui/data/depth/g_j_sm_k_l_y_f_w_pool/gjsmklyfw_gatk.depth.index" ) #here is a temp trick not a error vcfnameKEY_depthfilename_titlenameVALUE_tojudgeancestrall[ "fanya"] = Util.GATK_depthfile( "/home/bioinfo/liurui/data/depth/g_j_sm_k_l_y_f_w_pool/gjsmklyfw_gatk.depth", "/home/bioinfo/liurui/data/depth/g_j_sm_k_l_y_f_w_pool/gjsmklyfw_gatk.depth.index" ) vcfnameKEY_depthobjVALUE_tojudgeancestral["wigeon"] = [ "/home/bioinfo/liurui/data/depth/g_j_sm_k_l_y_f_w_pool/gjsmklyfw_gatk.depth", 9 ] vcfnameKEY_depthobjVALUE_tojudgeancestral["fanya"] = [ "/home/bioinfo/liurui/data/depth/g_j_sm_k_l_y_f_w_pool/gjsmklyfw_gatk.depth", 3 ] ####################################