Beispiel #1
0
 def setMode(self,mode,i_o,outputpath,windowWidth,slideSize,chromtable,minlength,tempdpip=None,tempdbusername=None,tempdbpw=None,tempdpname=None):
     """Store the run parameters and prepare the state used by the chosen mode.

     Every positional argument is copied onto ``self`` under the same name.

     Mode ``'R'``/``'r'``:
         * derives one species label per entry of ``self.vcffileslist`` (the
           part after the last ``/`` with ``.`` replaced by ``_``) and a
           table-name prefix made of the first character of each basename;
         * opens the phylip array-in file under ``outputpath`` for writing and
           keeps the handle on ``self.phyliparrayinfile``;
         * prints the table name and the species labels;
         * stores every unordered pair of VCF paths in ``self.allkindofpaire``;
         * connects through ``dbm.DBTools`` with the ``tempd*`` credentials and
           drops, then re-creates the ``<prefix>treearray`` table.

     Mode ``'G'``/``'g'``:
         initialises ``self.globalFstMapByChrom`` to an empty dict and stores
         the number of VCF files, as a string, in ``self.specisnum``.

     Any other mode only stores the parameters.
     """
     self.mode=mode
     self.i_o=i_o
     self.outputpath=outputpath
     self.windowWidth=windowWidth
     self.slideSize=slideSize
     self.chromtable=chromtable
     self.minlength=minlength
     if self.mode in ('R','r'):
         self.allspeices=[]
         self.treearrayprename=""
         for vcfpath in self.vcffileslist[:]:
             # Everything after the last '/' is the file's basename.
             basename=re.search(r"[^/]*$",vcfpath).group(0)
             self.allspeices.append(basename.replace('.','_'))
             self.treearrayprename+=basename[0]
         tablename=self.treearrayprename+"treearray"
         arrayin_path=outputpath+self.treearrayprename+"phylip.arrayin"+str(self.windowWidth)+"_"+str(self.slideSize)
         self.phyliparrayinfile=open(arrayin_path,'w')
         print("mysqltablename: "+tablename)
         # Header row followed by one (truncated) label per row.
         print("\t"+"".join(label+"\t" for label in self.allspeices)+"\n")
         for label in self.allspeices:
             print(label[0:8]+"\n")

         self.allkindofpaire=list(combinations(self.vcffileslist[:],2))

         self.tempdbtools=dbm.DBTools(tempdpip,tempdbusername,tempdbpw,tempdpname)
         create_stmt=(
             "CREATE TABLE "+self.treearrayprename+"treearray ("
             " `chrID` varchar(128) NOT NULL ,"
             " `winNo` int(18) NOT NULL,"
             " PRIMARY KEY (`chrID`,`winNo`)"
             ")engine=innodb default charset=utf8"
             )
         # Start from a clean table: drop any previous one, pause, re-create.
         self.tempdbtools.drop_table(tablename)
         time.sleep(SLEEP_FOR_NEXT_TRY)
         self.tempdbtools.create_table({tablename:create_stmt})
     elif self.mode in ('G','g'):
         self.globalFstMapByChrom={}
         self.specisnum=str(len(self.vcffileslist[:]))
Beispiel #2
0
# Echo the merge-NA setting that was parsed from the command line.
mergeNA = options.mergeNA
print(mergeNA)

# A percentage and a threshold cannot both be supplied.
if not (percentage is None or threshold is None):
    print("-t conflict with -p")
    exit(-1)

vcftable = None

# Open the result file and write its tab-separated header line.
outfile = open(outfilename, 'w')
outfile_header = ("chrNo\tRegion_start\tRegion_end\tNoofWin\textram" +
                  options.winType + "\ttranscpt\tgeneID")
print(outfile_header, file=outfile)

# The companion file is named after the window file's basename.
outfileNameWINwithGENE = (
    path +
    re.search(r"[^/]*$", winFileName7Field).group(0) +
    ".wincopywithgene")

if __name__ == '__main__':
    genomedbtools = dbm.DBTools(Util.ip, Util.username, Util.password,
                                Util.genomeinfodbname)
    winGenome = Util.WinInGenome(Util.ghostdbname, winFileName7Field)
    time.sleep(SLEEP_FOR_NEXT_TRY)
    winGenome.appendGeneName(Util.TranscriptGenetable, genomedbtools, winWidth,
                             slideSize, outfileNameWINwithGENE, upextend,
                             downextend, (total_outliers, morethan_lessthan))
    selectWinNos = "threshold method"
    if percentage != None:
        totalWin = winGenome.windbtools.operateDB(
            "select",
            "select count(*) from " + winGenome.wintablewithoutNA)[0][0]
        selectWinNos = int(float(percentage) * totalWin)
        if morethan_lessthan == "m" or morethan_lessthan == "M":
            selectedWins = winGenome.windbtools.operateDB(
                "select", "select * from " + winGenome.wintablewithoutNA +
                " where 1 order by " + options.winType + " desc limit 0," +
Beispiel #3
0
def findTrscpt(winfile,
               outbedfilename,
               upextend,
               downextend,
               winwidth,
               slideSize,
               winType,
               morethan_lessthan,
               threshold_title_list=None,
               percentage=None,
               mergeNA=False,
               extendtodistal=0,
               anchorfile=None,
               found=False,
               mapfile=None):
    """Select outlier windows, merge them into regions and write their transcripts.

    The window file is first copied to ``winfile +
    "marked.sexchromseperatestandard"`` with an extra trailing column, then
    loaded through ``Util.WinInGenome``.  Windows are selected either by
    threshold or by percentage, neighbouring selections on one chromosome are
    merged into regions, and one tab-separated line per region is written to
    ``outbedfilename + ".bed.selectedgene"``.

    Window rows coming back from the database are read positionally:
    index 0 chromosome, 1 window number, 4 SNP count, 5 ``winvalue``,
    6 ``zvalue``.

    Args:
        winfile: Path of the whitespace-separated window file.  Its first
            line must be a header that contains ``winType``.
        outbedfilename: Output prefix.  Its directory part (or the current
            working directory when it has none) also hosts the
            ``.wincopywithgene`` file.
        upextend: Forwarded to ``appendGeneName`` and ``collectTrscptInWin``.
        downextend: Forwarded to ``appendGeneName`` and
            ``collectTrscptInWin``.
        winwidth: Window width; a region ends at
            ``last winNo * slideSize + winwidth``.
        slideSize: Slide step; a region starts at ``first winNo * slideSize``.
        winType: Column used to rank windows.  Only ``"winvalue"`` and
            ``"zvalue"`` can be summarised into regions.
        morethan_lessthan: ``"m"``/``"M"`` selects windows at or above the
            cut-off (largest first in percentage mode); ``"l"``/``"L"``
            selects windows at or below it.
        threshold_title_list: Cut-off values.  The first entry is the general
            cut-off; with ``anchorfile`` the last entry is applied to rows
            whose ``mark`` is ``sexchromosome``.  Mutually exclusive with
            ``percentage``.
        percentage: Fraction of the non-NA windows to select.
        mergeNA: When it converts to a positive integer, two consecutive
            regions are merged if the gap between them holds no non-NA window
            and at most that many windows in total.
        extendtodistal: Forwarded to ``collectTrscptInWin`` when positive.
        anchorfile: When given, the marked copy is produced by
            ``Util.mapWinvaluefileToChrOfReletiveSpecie`` instead of locally.
        found: When true, only the marked copy is prepared and the
            ``.wincopywithgene`` path is returned; no database work is done.
        mapfile: Forwarded to ``Util.mapWinvaluefileToChrOfReletiveSpecie``.

    Returns:
        The path of the ``.wincopywithgene`` file.

    Raises:
        ValueError: ``winType`` is missing from the header, neither
            ``percentage`` nor ``threshold_title_list`` is given,
            ``morethan_lessthan`` is not one of m/M/l/L, or ``winType`` cannot
            be summarised.

    The process exits with status -1 when both ``percentage`` and
    ``threshold_title_list`` are given, and with status 0 when no window is
    selected.
    """
    if percentage is not None and threshold_title_list is not None:
        print("-t conflict with -p")
        exit(-1)

    winFileName8Field = winfile + "marked.sexchromseperatestandard"
    if anchorfile:
        # NOTE(review): presumably this helper writes winFileName8Field as a
        # side effect (the file is read right below) -- confirm in Util.
        Util.mapWinvaluefileToChrOfReletiveSpecie(anchorfile, winfile,
                                                  winwidth, slideSize, True,
                                                  mapfile)
    else:
        _write_unmarked_copy(winfile, winFileName8Field)

    with open(winFileName8Field, "r") as marked_file:
        title = re.split(r"\s+", marked_file.readline().strip())
    Nocol = title.index(winType) + 1  # 1-based column number of winType

    dir_match = re.search(r'^.*/', outbedfilename)
    if dir_match is not None:
        path = dir_match.group(0)
    else:
        path = os.getcwd() + "/"
    outfileNameWINwithGENE = (
        path + re.search(r"[^/]*$", winFileName8Field).group(0) +
        ".wincopywithgene")
    if found:
        return outfileNameWINwithGENE

    # Fail early with a clear message instead of an UnboundLocalError later.
    want_high = morethan_lessthan in ("m", "M")
    if not want_high and morethan_lessthan not in ("l", "L"):
        raise ValueError("morethan_lessthan must be one of 'm', 'M', 'l', "
                         "'L', got %r" % (morethan_lessthan,))
    if percentage is None and threshold_title_list is None:
        raise ValueError("either percentage or threshold_title_list is required")

    selectedgene_path = outbedfilename + ".bed.selectedgene"
    with open(selectedgene_path, 'w') as outfile:
        print("chrNo\tRegion_start\tRegion_end\tNoofWin\textram" + winType +
              "\tminNoSNP\tmaxNoSNP\ttranscpt\toverlapcode\tgeneID",
              file=outfile)
        # The password is deliberately not echoed to stdout.
        print(Util.ip, Util.username, Util.genomeinfodbname)
        genomedbtools = dbm.DBTools(Util.ip, Util.username, Util.password,
                                    Util.genomeinfodbname)
        winGenome = Util.WinInGenome(Util.ghostdbname, winFileName8Field,
                                     Nocol)
        time.sleep(SLEEP_FOR_NEXT_TRY)

        # NOTE(review): the SQL below is assembled from caller-supplied text
        # (winType, thresholds); operateDB's parameter API is not visible
        # here, so the values must come from a trusted source.
        win_table = winGenome.wintablewithoutNA
        if percentage is not None:
            totalWin = winGenome.windbtools.operateDB(
                "select", "select count(*) from " + win_table)[0][0]
            selectWinNos = int(float(percentage) * totalWin)
            query = ("select * from " + win_table + " where 1 order by " +
                     winType + (" desc" if want_high else " asc") +
                     " limit 0," + str(selectWinNos))
            print(query)
        else:
            query = "select * from " + win_table + _threshold_where(
                winType, threshold_title_list, want_high, anchorfile)

        winGenome.appendGeneName(Util.TranscriptGenetable, genomedbtools,
                                 winwidth, slideSize, outfileNameWINwithGENE,
                                 upextend, downextend,
                                 (10, morethan_lessthan))

        selectedWins = winGenome.windbtools.operateDB("select", query)
        if percentage is None:
            selectWinNos = len(selectedWins)
        selectedWins = sorted(selectedWins, key=lambda rec: float(rec[5]))
        if selectWinNos == 0 or not selectedWins:
            print("selectWinNos==0")
            exit(0)
        print(selectedgene_path, selectWinNos, "~=", len(selectedWins),
              selectedWins[0], selectedWins[-1])

        # Group the selected windows by chromosome.
        selectedWinMap = {}
        for win in selectedWins:
            selectedWinMap.setdefault(win[0], []).append(win)

        # Merge consecutive / overlapping windows into regions.
        selectedRegion = {}
        for chrom, wins in selectedWinMap.items():
            selectedRegion[chrom] = _merge_adjacent_windows(
                chrom, wins, slideSize, winwidth, winType, want_high)

        if mergeNA and int(mergeNA) > 0:
            for chrom in selectedRegion:
                selectedRegion[chrom] = _merge_across_na_gaps(
                    selectedRegion[chrom], chrom, winGenome, slideSize,
                    winwidth, int(mergeNA), want_high)
        else:
            for chrom in selectedRegion:
                selectedRegion[chrom].sort(key=lambda region: int(region[1]))

        print("getting final table")
        distal_args = (extendtodistal,) if extendtodistal > 0 else ()
        final_table = {}
        for chrom in selectedRegion:
            for region in selectedRegion[chrom]:
                print(chrom, region)
                final_table[region] = winGenome.collectTrscptInWin(
                    genomedbtools, Util.TranscriptGenetable, region, upextend,
                    downextend, *distal_args)

        print("fill bedselectedtable")
        # Emit regions in the chromosome order known to winGenome.
        for chrom in winGenome.chromOrder:
            for region in selectedRegion.get(chrom, ()):
                transcripts = final_table[region]
                tcpts = ",".join(tcpt[0] for tcpt in transcripts)
                tpcode = ",".join(str(tcpt[-1]) for tcpt in transcripts)
                gnames = ",".join(tcpt[2] for tcpt in transcripts
                                  if tcpt[2].strip() != "")
                print("\t".join(map(str, region)),
                      tcpts,
                      tpcode,
                      gnames,
                      sep="\t",
                      file=outfile)

    winGenome.windbtools.drop_table(winGenome.wintabletextvalueallwin)
    winGenome.windbtools.drop_table(winGenome.wintablewithoutNA)
    return outfileNameWINwithGENE


def _write_unmarked_copy(src_path, dst_path):
    """Copy src_path to dst_path, appending "mark" to line 1 and "unknown" to the rest."""
    # Binary mode keeps every byte of the original line untouched.
    with open(src_path, "rb") as src, open(dst_path, "wb") as dst:
        for line_no, line in enumerate(src):
            label = b"mark" if line_no == 0 else b"unknown"
            dst.write(line.rstrip(b"\n") + b"\t" + label + b"\n")


def _threshold_where(winType, threshold_title_list, want_high, anchorfile):
    """Build the SQL where-clause that applies the threshold cut-off(s)."""
    general_cut = str(threshold_title_list[0])
    if anchorfile:
        op = ">=" if want_high else "<="
        sex_cut = str(threshold_title_list[-1])
        return (" where (mark='autosome' and " + winType + op + general_cut +
                ") or (mark='sexchromosome' and " + winType + op + sex_cut +
                ")")
    if want_high:
        return " where 1 and " + winType + ">=" + general_cut
    return (" where " + winType + "!= 'NA' and " + winType + "<=" +
            general_cut)


def _summarize_run(chrom, run, slideSize, winwidth, winType, want_high):
    """Collapse a run of adjacent window rows into one region tuple."""
    if winType == "winvalue":
        value_idx = 5
    elif winType == "zvalue":
        value_idx = 6
    else:
        raise ValueError("cannot summarise windows for winType %r; expected "
                         "'winvalue' or 'zvalue'" % (winType,))
    values = [float(win[value_idx]) for win in run]
    snp_counts = [int(win[4]) for win in run]
    # Report the least extreme value inside the region.
    extreme = min(values) if want_high else max(values)
    return (chrom, int(run[0][1]) * slideSize,
            int(run[-1][1]) * slideSize + winwidth, len(run), extreme,
            min(snp_counts), max(snp_counts))


def _merge_adjacent_windows(chrom, wins, slideSize, winwidth, winType,
                            want_high):
    """Return the regions formed by consecutive or overlapping windows of one chromosome."""
    ordered = sorted(wins, key=lambda win: int(win[1]))
    regions = []
    run = [ordered[0]]
    for prev, cur in zip(ordered, ordered[1:]):
        prev_no = int(prev[1])
        cur_no = int(cur[1])
        if (prev_no + 1 == cur_no
                or prev_no * slideSize + winwidth >= cur_no * slideSize):
            run.append(cur)
        else:
            regions.append(
                _summarize_run(chrom, run, slideSize, winwidth, winType,
                               want_high))
            run = [cur]
    regions.append(
        _summarize_run(chrom, run, slideSize, winwidth, winType, want_high))
    return regions


def _merge_across_na_gaps(regions, chrom, winGenome, slideSize, winwidth,
                          max_gap, want_high):
    """Merge neighbouring regions separated only by at most max_gap NA windows."""
    regions = sorted(regions, key=lambda region: int(region[1]))
    absorbed = set()
    for i in range(1, len(regions)):
        prev = regions[i - 1]
        cur = regions[i]
        winNo_end = str(int(cur[1] / slideSize))
        winNo_start = str(int((prev[2] - winwidth) / slideSize))
        gap_clause = (" where " + " chrID='" + chrom + "' and winNo>" +
                      winNo_start + " and winNo<" + winNo_end)
        print("select * from " + winGenome.wintablewithoutNA + gap_clause)
        valued_gap_wins = winGenome.windbtools.operateDB(
            "select", "select * from " + winGenome.wintablewithoutNA +
            gap_clause)
        all_gap_wins = winGenome.windbtools.operateDB(
            "select", "select * from " + winGenome.wintabletextvalueallwin +
            gap_clause)
        if len(valued_gap_wins) == 0 and len(all_gap_wins) <= max_gap:
            if want_high:
                extreme = min(cur[4], prev[4])
            else:
                extreme = max(cur[4], prev[4])
            # Indices 5 and 6 hold the min / max SNP counts of a region.
            regions[i] = (chrom, prev[1], cur[2],
                          prev[3] + cur[3] + len(all_gap_wins), extreme,
                          min(prev[5], cur[5]), max(prev[6], cur[6]))
            absorbed.add(i - 1)
    return [region for idx, region in enumerate(regions)
            if idx not in absorbed]
Beispiel #4
0
import src.NGS.BasicUtil.DBManager as dbm


# Command-line interface: one option per input the script needs.
parser = OptionParser()
parser.add_option(
    "-t", "--toplevelsnptable",
    dest="toplevelsnptable",
    default="ducksnp_toplevel",
    help="depth of the folder to output",
)
parser.add_option(
    "-m", "--minlength",
    dest="minlength",
    help="require least chrom length",
)
parser.add_option("-A", "--minAN", dest="minAN")
parser.add_option("-d", "--snpperkb", dest="snpperkb")
parser.add_option("-o", "--outputfilename", dest="outputfilename")
# -v may be repeated; every occurrence is appended to the list.
parser.add_option(
    "-v", "--vcftablelist",
    dest="vcftablelist",
    action="append",
    default=[],
    help="",
)
options, args = parser.parse_args()

# Unpack the parsed options into module-level names used further down.
minlength = options.minlength
toplevelsnptable = options.toplevelsnptable
snpperkb = int(options.snpperkb)
vcftableslist = options.vcftablelist
minAN = options.minAN

# Output file for the SNP table.
dadisnpfile = open(options.outputfilename, 'w')

outgroupidx_in_topleveltable = 6
minoutgroupdepth = 30
if __name__ == '__main__':
    genomedbtools = dbm.DBTools(Util.ip, Util.username, Util.password, Util.genomeinfodbname)
    dbvariantstools=dbm.DBTools(Util.ip, Util.username,Util.password, Util.vcfdbname)
    toplevelsnptable_titlelist=[a[0].strip() for a in dbvariantstools.operateDB("select", "select column_name  from information_schema.columns where table_schema='" + "ninglabvariantdata" + "' and table_name='" + toplevelsnptable + "'")]
    selectedchroms=genomedbtools.operateDB("select","select * from "+Util.pekingduckchromtable+" where chrlength>="+minlength)
    ######################## title print ##############################
    print(Util.pekingduckchromtable[:9],toplevelsnptable_titlelist[outgroupidx_in_topleveltable],"Allele1",sep="\t",end="\t",file=dadisnpfile)
    for vcftable_name in vcftableslist:
        popName=re.split(r'_',vcftable_name)[0]
        print(popName,end="\t",file=dadisnpfile)
    print("Allele2",end="\t",file=dadisnpfile)
    for vcftable_name in vcftableslist:
        popName=re.split(r'_',vcftable_name)[0]
        print(popName,end="\t",file=dadisnpfile)
    print("Gene\tPosition",file=dadisnpfile)
    ############               finish title print ##################################
    for row in selectedchroms:
Beispiel #5
0
                  action="append",
                  default=[],
                  nargs=2,
                  help="vcffile minAN")
# Parse the command line: `options` holds the named values, `args` whatever
# positional arguments are left over.
(options, args) = parser.parse_args()

# Copy the parsed options into module-level names.
toplevelsnptable = options.toplevelsnptable
# snpperkb is kept as a float here; the raw option text is reused below to
# build the output file name.
snpperkb = float(options.snpperkb)
vcffilelist = options.vcffile  #;minlength=options.minlength
# Fixed settings for this script.
# NOTE(review): presumably column indexes of the outgroup fields in the
# top-level SNP table and a minimum outgroup depth -- confirm against the
# code that consumes them.
outgroupidx_in_topleveltable = [6, 8]
minoutgroupdepth = 30

noofindvds2quantizing = int(options.noofindvds2quantizing)
# Output file: <outputfilename>dilutetodensity<snpperkb>, truncated on open.
dadisnpfile = open(
    options.outputfilename + "dilutetodensity" + options.snpperkb.strip(), 'w')
# Connection to the variant database, shared with the ancestral-allele helper.
dbvariantstools = dbm.DBTools(Util.ip, Util.username, Util.password,
                              Util.vcfdbname)
dynamicIU_toptable_obj = Ancestralallele.dynamicInsertUpdateAncestralContext(
    dbvariantstools, Util.beijingreffa, options.toplevelsnptable)

# FASTA output named <outputfilename><basename of chromlist>.fa; opened in
# append mode, so content from earlier runs is kept.
flankseqfafile = open(
    options.outputfilename + re.search(r"[^/]*$", options.chromlist).group(0) +
    ".fa", "a")
# recf=open("recf","w")
if __name__ == '__main__':
    # Read the chromosome list: each non-blank line holds a chromosome name
    # followed by an integer, separated by whitespace.  The result is a list
    # of (name, integer) tuples in file order.
    selectedchroms = []
    # The context manager closes the file even if a line fails to parse.
    with open(options.chromlist, "r") as chromlistfile:
        for chrrow in chromlistfile:
            chrrow = chrrow.strip()
            if not chrrow:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            chrrowlist = re.split(r'\s+', chrrow)
            selectedchroms.append(
                (chrrowlist[0].strip(), int(chrrowlist[1].strip())))
Beispiel #6
0
'''
Created on 2013-8-23

@author: liurui
'''
# At least one input file must be named on the command line.
if not sys.argv[1:]:
    print("python prepareFaConsenusSeq.py [cns1.fq] [cns2.fq] [cns3.fq] ...")
    exit(-1)

# Table and key column that the chromosome queries are built from.
tablename, primaryID = 'chromosome', "chrID"
sql = "select * from " + tablename

# Accumulators, initially empty.
allchr = allseq = ""
if __name__ == '__main__':
#    ChromIndexMap = pickle.load(open(fastQFileName + ".myindex", 'rb'))
    dbtools = dbm.DBTools("localhost","root","1234567","life_pilot")
    print(dbtools,"ssssssssssssss")
    for fastQFileName in sys.argv[1:]:
        print(fastQFileName)
        outfile=open(fastQFileName+".fa",'w')
        seqMapByChrom = Util.FastQ_Util.getConsenusSeqMap(fastQFileName, dbtools)

        totalChroms = dbtools.operateDB("select","select count(*) from "+tablename)[0][0]
    #    currentchrID=dbtools.operateDB("select",sql+" limit 0,1")[0][0]
    #    seqMapByChrom[currentchrID]=""
        for i in range(0,totalChroms,20):
            currentsql=sql+" order by "+primaryID+" limit "+str(i)+",20"
            result=dbtools.operateDB("select",currentsql)
            for row in result:
                currentchrID=row[0]
                if currentchrID in seqMapByChrom: