Example 1
 def caculateDstatistics(self,
                         p1,
                         p2,
                         p3,
                         p4,
                         caculator,
                         currentchrID,
                         currentchrLen,
                         winwidth=None):
     win = Util.Window()
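
Only the method signature survives in this fragment; the four population arguments and the window object suggest a windowed Patterson's D (ABBA-BABA) calculation. A minimal sketch of the core statistic from per-window site-pattern counts (the counting itself is assumed to happen elsewhere in the class):

def patterson_d(abba, baba):
    """Patterson's D from ABBA/BABA site counts; 0.0 when there are no informative sites."""
    if abba + baba == 0:
        return 0.0
    return (abba - baba) / (abba + baba)

# hypothetical per-window usage:
# d = patterson_d(abba=120, baba=95)
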
Example 2
        #         if options.depthfile!=None:
        #             print(options.depthfile,"no need")
        #         originalspeciesref=options.ancenstralref
        #         colname=re.search(r'[^/]*$',originalspeciesref).group(0)
        #         colname=re.sub(r"[^\w^\d]","_",colname);colname=colname[:10]
        #         print(colname)
        #         ancestralalleletabletools.dbvariant.operateDB("callproc", "mysql_sp_add_column", data=(ancestralalleletabletools.dbvariant, toplevelsnptablename, colname, "char(128)", "default null"))
        OUTFILENAME = "ducksnpflankseq.fa"
        outfile = open(options.chromlistfilename + "snpflankseq.fa", 'w')
        duckrefhandler = open(options.ref, 'r')
        try:
            duckrefindex = pickle.load(open(options.ref + ".myfasteridx",
                                            'rb'))
#             originalspeciesindex = pickle.load(open(originalspeciesref + ".myindex", 'rb'))
        except IOError:
            Util.generateFasterRefIndex(options.ref,
                                        options.ref + ".myfasteridx")
            duckrefindex = pickle.load(open(options.ref + ".myfasteridx",
                                            'rb'))

#         try:
#             originalspeciesindex = pickle.load(open(originalspeciesref + ".myindex", 'rb'))
#         except IOError:
#             Util.generateIndexByChrom(originalspeciesref, originalspeciesref + ".myindex")
#             originalspeciesindex = pickle.load(open(originalspeciesref + ".myindex", 'rb'))
        chrom_lenlist = []
        chromlistfile = open(options.chromlistfilename, "r")
        for chrrow in chromlistfile:
            chrrowlist = re.split(r'\s+', chrrow.strip())
            chrom_lenlist.append(
                (chrrowlist[0].strip(), int(chrrowlist[1].strip())))
        for currentchrID, currentchrLen in chrom_lenlist:
Example 3
                  help="speciesName in table")  #

(options, args) = parser.parse_args()
allpop_with_derived_alletable = options.topleveltable
ancestralspeciescolname = options.ancestralspeciesname.strip()
farsurebutfew = options.farsurebutfew.strip()
mindepth = int(options.mindepth)
if __name__ == '__main__':
    dbtools = dbm.DBTools(Util.ip, Util.username, Util.password,
                          Util.genomeinfodbname)
    tableprename = ""
    TABLES = {}
    for bedfileName in args[:]:

        tableprename += re.search(r"[^/]*$", bedfileName).group(0)[0]
    tablename = tableprename + Util.random_str()
    TABLES[tablename] = ("CREATE TABLE " + tablename + " ("
                         " `snpID` varchar(128) NOT NULL ,"
                         " `region` varchar(128) NOT NULL ,"
                         " `DAF` double default 100 ,"
                         " `MAF` double default 100 ,"
                         " PRIMARY KEY (`snpID`,`region`)"
                         ")")

    tempdbtools = dbm.DBTools(Util.ip, Util.username, Util.password,
                              Util.ghostdbname)
    tempdbtools.create_table(TABLES)
    titlelist = [
        a[0].strip() for a in dbtools.operateDB(
            "select",
            "select column_name  from information_schema.columns where table_schema='"
Example 4
                  action="store_false",
                  dest="verbose",
                  default=True,
                  help="don't print status messages to stdout")

(options, args) = parser.parse_args()

refFastaName1 = options.reffa[0]
refFastaName2 = options.reffa[1]
reffastaidxName1 = refFastaName1 + ".myfasteridx"
reffastaidxName2 = refFastaName2 + ".myfasteridx"
try:
    refidxByChr2 = pickle.load(open(reffastaidxName2, 'rb'))
    refidxByChr1 = pickle.load(open(reffastaidxName1, 'rb'))
except IOError:
    Util.generateFasterRefIndex(refFastaName1, reffastaidxName1)
    Util.generateFasterRefIndex(refFastaName2, reffastaidxName2)
    refidxByChr1 = pickle.load(open(reffastaidxName1, 'rb'))
    refidxByChr2 = pickle.load(open(reffastaidxName2, 'rb'))
commsample_idxlistinM = []
commsample_idxlistinV = []
degenerateM = {
    "R": "AG",
    "Y": "CT",
    "M": "AC",
    "K": "GT",
    "S": "GC",
    "W": "AT",
    "A": "AA",
    "T": "TT",
    "C": "CC",
Example 5
                  help="ancenstral(a) or derived(d)")
(options, args) = parser.parse_args()
mindeptojudgefix = 15
#####################
VCFobj = {}
vcfnameKEY_depthfilename_titlenameVALUE_tojudgeancestrall = {}
vcfnameKEY_depthobjVALUE_tojudgeancestral = {}
VCFobj["wigeon"] = VCFutil.VCF_Data(
    "/home/bioinfo/liurui/data/vcffiles/uniqmap/taihudomesticgoose/taihudomesticgoose.pool.withindel.vcf"
)
VCFobj["fanya"] = VCFutil.VCF_Data(
    "/home/bioinfo/liurui/data/vcffiles/uniqmap/fanya/fanya._pool.withindel.vcf"
)
vcfnameKEY_depthfilename_titlenameVALUE_tojudgeancestrall[
    "wigeon"] = Util.GATK_depthfile(
        "/home/bioinfo/liurui/data/depth/g_j_sm_k_l_y_f_w_pool/gjsmklyfw_gatk.depth",
        "/home/bioinfo/liurui/data/depth/g_j_sm_k_l_y_f_w_pool/gjsmklyfw_gatk.depth.index"
    )  # a temporary trick here, not an error
vcfnameKEY_depthfilename_titlenameVALUE_tojudgeancestrall[
    "fanya"] = Util.GATK_depthfile(
        "/home/bioinfo/liurui/data/depth/g_j_sm_k_l_y_f_w_pool/gjsmklyfw_gatk.depth",
        "/home/bioinfo/liurui/data/depth/g_j_sm_k_l_y_f_w_pool/gjsmklyfw_gatk.depth.index"
    )
vcfnameKEY_depthobjVALUE_tojudgeancestral["wigeon"] = [
    "/home/bioinfo/liurui/data/depth/g_j_sm_k_l_y_f_w_pool/gjsmklyfw_gatk.depth",
    9
]
vcfnameKEY_depthobjVALUE_tojudgeancestral["fanya"] = [
    "/home/bioinfo/liurui/data/depth/g_j_sm_k_l_y_f_w_pool/gjsmklyfw_gatk.depth",
    3
]
####################################
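
The block above hard-codes every VCF and depth path before building VCFutil.VCF_Data and Util.GATK_depthfile objects. A small, hypothetical helper (check_paths is not part of the project) to fail fast on a mistyped path before those heavier objects are constructed:

import os

def check_paths(path_map):
    # path_map: {label: filesystem path}; raise before any VCF/depth object is built
    missing = [p for p in path_map.values() if not os.path.exists(p)]
    if missing:
        raise FileNotFoundError("missing input files: " + ", ".join(missing))
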
Example 6
    "1": 42145699,
    '2': 49200776,
    '3': 50652576,
    "4": 40408058,
    "5": 47253416,
    "6": 36015257,
    "7": 35964515,
    "8": 40690061,
    "9": 58970518
}
winsize = int(sys.argv[6])
reff = open(sys.argv[1], 'r')
try:
    refidx = pickle.load(open(sys.argv[1] + ".myfasteridx", 'rb'))
except IOError:
    Util.generateFasterRefIndex(sys.argv[1], sys.argv[1] + ".myfasteridx")
    refidx = pickle.load(open(sys.argv[1] + ".myfasteridx", 'rb'))
vcftools = "vcftools"
gapf = open(sys.argv[3], 'r')
scoredsnp = open(sys.argv[4], 'r')
scoredsnp.readline()
sitesingap = open(sys.argv[5], 'w')
if __name__ == '__main__':
    win = Util.Window()
    i = 0
    interferf = open(sys.argv[5] + ".InterferingTEMP", 'w')
    for gapregion in gapf:
        i += 1
        filledsites = []
        gaplist = re.split(r"\s+", gapregion.strip())
        if not os.path.exists(sys.argv[5] + "temp" + str(i) + ".recode.vcf"):
def make_freq_xaxisKEY_yaxisseqVALUERelation(a):
    chromlistfilename = a[0]
    topleveltablename = a[1]
    targetpopvcffile_withdepthconfig = a[2]
    refpopvcffile_withdepthconfig = a[3]
    numberofindvdoftargetpop_todividintobin = int(a[4])
    mindepthtojudefixed = 20
    d_increase = fractions.Fraction(
        1, (2 * int(numberofindvdoftargetpop_todividintobin)))
    d_increase = round(d_increase, 11)
    minvalue = 0.000000000000
    freq_xaxisKEY_yaxisVALUE_seq_list = {}
    for i in range(numberofindvdoftargetpop_todividintobin * 2 - 1):
        freq_xaxisKEY_yaxisVALUE_seq_list[(minvalue, minvalue + d_increase +
                                           0.00000000004)] = []
        minvalue += d_increase
    else:
        freq_xaxisKEY_yaxisVALUE_seq_list[(minvalue, 1)] = []
    for a, b in sorted(freq_xaxisKEY_yaxisVALUE_seq_list.keys()):
        print(str(a), str(b))
#     while minvalue+d_increase<=1:
#         freq_xaxisKEY_yaxisVALUE_seq_list[(minvalue,minvalue+d_increase+0.00000000004)]=[]
#         print('%.12f'%minvalue,'%.12f'%(minvalue+d_increase+0.00000000004))
#         minvalue+=d_increase
#     else:
#         freq_xaxisKEY_yaxisVALUE_seq_list[]
    print("process ID:", os.getpid(), "start", chromlistfilename)
    dbvariantstools = dbm.DBTools(Util.ip, Util.username, Util.password,
                                  Util.vcfdbname)
    chromlistfile = open(chromlistfilename, "r")
    chromlistfilelines = chromlistfile.readlines()
    chromlistfile.close()
    chromlist = []
    for chrrow in chromlistfilelines:
        chrrowlist = re.split(r'\s+', chrrow.strip())
        chromlist.append((chrrowlist[0].strip(), int(chrrowlist[1].strip())))

    vcfnamelist = []
    listofpopvcfmapOfAChr = []
    methodlist = []
    vcfnameKEY_vcfobj_pyBAMfilesVALUE = {}
    N_of_targetpop = len(targetpopvcffile_withdepthconfig)
    N_of_refpop = len(refpopvcffile_withdepthconfig)
    #{ vcftablename1:[depthfilename1,name1,name2] , vcftablename2:[depthfilename2,name1,name2] } or {vcftablename1:None, vcftablename2:None}
    for vcfconfigfilename in targetpopvcffile_withdepthconfig[:] + refpopvcffile_withdepthconfig[:]:
        listofpopvcfmapOfAChr.append({})
        vcfconfig = open(vcfconfigfilename, "r")
        for line in vcfconfig:
            vcffilename_obj = re.search(r"vcffilename=(.*)", line.strip())
            if vcffilename_obj != None:
                vcfname = vcffilename_obj.group(1).strip()
                vcfnamelist.append(vcfname)
                vcfnameKEY_vcfobj_pyBAMfilesVALUE[vcfname] = []
                vcfnameKEY_vcfobj_pyBAMfilesVALUE[vcfname].append(
                    VCFutil.VCF_Data(vcfname))
            elif line.split():
                vcfnameKEY_vcfobj_pyBAMfilesVALUE[vcfname].append(
                    pysam.Samfile(line.strip(), 'rb'))
        vcfconfig.close()
        if re.search(r"indvd[^/]+", vcfname) != None:
            methodlist.append("indvd")

        elif re.search(r"pool[^/]+", vcfname) != None:
            methodlist.append("pool")

        else:
            print("vcfname must with 'pool' or 'indvd'")
            exit(-1)
    for currentchrID, currentchrLen in chromlist:
        for vcfname in vcfnamelist:
            if currentchrID in vcfnameKEY_vcfobj_pyBAMfilesVALUE[vcfname][
                    0].VcfIndexMap:
                break
        else:
            print("this chr doesn't exist in anypop")
            continue
        for vcfobj_idx in range(len(vcfnamelist)):
            listofpopvcfmapOfAChr[vcfobj_idx] = {}
            listofpopvcfmapOfAChr[vcfobj_idx][
                currentchrID] = vcfnameKEY_vcfobj_pyBAMfilesVALUE[
                    vcfnamelist[vcfobj_idx]][0].getVcfListByChrom(currentchrID)
        target_ref_SNPs = Util.alinmultPopSnpPos(listofpopvcfmapOfAChr, "o")
        for snp_aligned in target_ref_SNPs[currentchrID]:
            if len(snp_aligned[1]) != 1 or len(snp_aligned[2]) != 1:
                print("multple allele", snp_aligned)
                continue
            curpos = int(snp_aligned[0])
            snp = dbvariantstools.operateDB(
                "select",
                "select * from " + topleveltablename + " where chrID='" +
                currentchrID + "' and snp_pos=" + str(curpos) + "")
            if not snp or snp == 0:
                print(currentchrID, curpos, "SNP not found in DB, skip")
                continue
            else:  # judge the ancestral allele
                fanyadepthlist = re.split(r",", snp[0][9])
                if len(fanyadepthlist) == 2 and int(
                        fanyadepthlist[1]
                ) >= mindepthtojudefixed and fanyadepthlist[0].strip() == "0":
                    A_base_idx = 1
                elif len(fanyadepthlist) == 2 and int(
                        fanyadepthlist[0]
                ) >= mindepthtojudefixed and fanyadepthlist[1].strip() == "0":
                    A_base_idx = 0
                else:
                    print("skip snp", snp[0][1], snp[0][7:])
                    continue
            ancestrallcontext = snp[0][5].strip()[0].upper() + snp[0][
                3 + A_base_idx].strip().upper() + snp[0][5].strip()[2].upper()
            if "CG" in ancestrallcontext or "GC" in ancestrallcontext:
                print("skip CG site", ancestrallcontext)
                continue
            ##########x-axis
            countedAF = 0
            target_DAF_sum = 0  #;noofnocoveredsample=0
            for i in range(3, N_of_targetpop + 3):
                if snp_aligned[i] == None:
                    if len(vcfnameKEY_vcfobj_pyBAMfilesVALUE[vcfnamelist[
                            i - 3]]) == 1:
                        print("no depth file")
                        continue
                    else:
                        sum_depth = 0
                        for samfile in vcfnameKEY_vcfobj_pyBAMfilesVALUE[
                                vcfnamelist[i - 3]][1:]:
                            ACGTdep = samfile.count_coverage(
                                currentchrID, curpos - 1, curpos)
                            for dep in ACGTdep:
                                sum_depth += dep[0]
                        if sum_depth >= mindepthtojudefixed:
                            AF = 0
                        else:
                            continue
                else:
                    if methodlist[i - 3] == "indvd":
                        AF = float(
                            re.search(r"AF=([\d\.]+);",
                                      snp_aligned[i][0]).group(1))
                    elif methodlist[i - 3] == "pool":
                        refdep = 0
                        altalleledep = 0
                        AD_idx = (re.split(":", snp_aligned[i][1])).index(
                            "AD")  # gatk GT:AD:DP:GQ:PL
                        for sample in snp_aligned[i][2]:
                            if len(re.split(":", sample)) == 1:  # ./.
                                continue
                            AD_depth = re.split(",",
                                                re.split(":", sample)[AD_idx])
                            try:
                                refdep += int(AD_depth[0])
                                altalleledep += int(AD_depth[1])
                            except ValueError:
                                print(sample, end="|")
                        if refdep == altalleledep and altalleledep == 0:
                            print("no sample available in this pop")
                            #                                 noofnocoveredsample+=1
                            continue
                        AF = altalleledep / (altalleledep + refdep)
                if A_base_idx == 0:
                    DAF = 1 - AF
                elif A_base_idx == 1:
                    DAF = AF
                target_DAF_sum += DAF
                countedAF += 1
            if countedAF == 0:  #or target_DAF_sum==0:
                print(
                    "skip this SNP because it is fixed as ancestral or not covered at this position in the target pops",
                    snp_aligned, snp)
                continue
            target_DAF = target_DAF_sum / countedAF
            ###############y-axis
            countedAF = 0
            rer_DAF_sum = 0
            for i in range(3 + N_of_targetpop,
                           N_of_refpop + N_of_targetpop + 3):
                if snp_aligned[i] == None:
                    if len(vcfnameKEY_vcfobj_pyBAMfilesVALUE[vcfnamelist[
                            i - 3]]) == 1:
                        continue
                    else:
                        #                         depth_linelist=vcfnameKEY_vcfobj_pyBAMfilesVALUE[vcfnamelist[i-3-N_of_targetpop]].getdepthByPos_optimized(currentchrID,curpos)
                        sum_depth = 0
                        for samfile in vcfnameKEY_vcfobj_pyBAMfilesVALUE[
                                vcfnamelist[i - 3]][1:]:
                            ACGTdep = samfile.count_coverage(
                                currentchrID, curpos - 1, curpos)
                            for dep in ACGTdep:
                                sum_depth += dep[0]
                        if sum_depth >= mindepthtojudefixed:
                            AF = 0
                        else:
                            continue
                else:
                    if methodlist[i - 3] == "indvd":
                        AF = float(
                            re.search(r"AF=([\d\.]+);",
                                      snp_aligned[i][0]).group(1))
                        AN = float(
                            re.search(r"AN=([\d\.]+);",
                                      snp_aligned[i][0]).group(1))
                        if AN < 5:
                            continue
                    elif methodlist[i - 3] == "pool":
                        refdep = 0
                        altalleledep = 0
                        AD_idx = (re.split(":", snp_aligned[i][1])).index(
                            "AD")  # gatk GT:AD:DP:GQ:PL
                        for sample in snp_aligned[i][2]:
                            if len(re.split(":", sample)) == 1:  # ./.
                                continue
                            AD_depth = re.split(",",
                                                re.split(":", sample)[AD_idx])
                            try:
                                refdep += int(AD_depth[0])
                                altalleledep += int(AD_depth[1])
                            except ValueError:
                                print(sample, end="|")
                        if (refdep == altalleledep and altalleledep
                                == 0) or altalleledep + refdep < 10:
                            continue
                        AF = altalleledep / (altalleledep + refdep)
                if A_base_idx == 0:
                    DAF = 1 - AF
                elif A_base_idx == 1:
                    DAF = AF
                rer_DAF_sum += DAF
                countedAF += 1
            if countedAF == 0 or rer_DAF_sum == 0:
                print(
                    "skip this SNP because it is not covered at this position in the ref pops",
                    snp_aligned, snp)
                continue
            ######collect according bins
            for a, b in sorted(freq_xaxisKEY_yaxisVALUE_seq_list.keys()):
                if target_DAF > a and target_DAF <= b:
                    freq_xaxisKEY_yaxisVALUE_seq_list[(a, b)].append(
                        rer_DAF_sum / countedAF)
                    break


#     freq_xaxisKEY_yaxisVALUERelation={}
#     for a,b in sorted(freq_xaxisKEY_yaxisVALUE_seq_list.keys()):
#         freq_xaxisKEY_yaxisVALUERelation[(a,b)]=numpy.mean(freq_xaxisKEY_yaxisVALUE_seq_list[(a,b)])
#         print('%.12f'%a,'%.12f'%(b),'%.12f'%(freq_xaxisKEY_yaxisVALUERelation[(a,b)]),"process ID:",os.getpid(),"done",sep="\t")
    print("process ID:", os.getpid(), "done")
    return copy.deepcopy(freq_xaxisKEY_yaxisVALUE_seq_list)
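
The commented-out block near the end hints at the intended post-processing: averaging the reference-population DAF values collected in each bin. A minimal sketch of that step, assuming the returned dict maps (low, high) bin tuples to lists of floats:

import numpy

def summarize_bins(freq_bins):
    # freq_bins: {(low, high): [ref-pop DAF values for SNPs whose target DAF fell in the bin]}
    for low, high in sorted(freq_bins.keys()):
        values = freq_bins[(low, high)]
        mean = numpy.mean(values) if values else float("nan")
        print('%.12f' % low, '%.12f' % high, '%.12f' % mean, sep="\t")
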
Example 8
def make_getElemBed(elementfold, targetseqnamesubstr, pathtoblastn, reffa):
    """
    targetseqnamesubstr is the string between the '>' and the first space in the FASTA header
    """
    allseqtobed = {
    }  #{chrID:[(sstart,send,elem,qstart,qend,revcom,len),(sstart,send,elem,qstart,qend,revcom,len),,,],,,,}
    if elementfold.endswith("/") or elementfold.endswith("\\"):
        elementfold = elementfold[:-1]
    if os.path.isfile(elementfold + "/" + targetseqnamesubstr + ".bed"):
        bedfile = open(elementfold + "/" + targetseqnamesubstr + ".bed", "r")
        bedfile.readline()  #title
        for bedline in bedfile:
            bedlinelist = re.split(r"\t+", bedline)
            if bedlinelist[0].strip() in allseqtobed:
                allseqtobed[bedlinelist[0].strip()].append(
                    (int(bedlinelist[1]), int(bedlinelist[2]), bedlinelist[3],
                     int(bedlinelist[4]), int(bedlinelist[5]), bedlinelist[6],
                     int(bedlinelist[7]), int(bedlinelist[8])))
            else:
                allseqtobed[bedlinelist[0].strip()] = [
                    (int(bedlinelist[1]), int(bedlinelist[2]), bedlinelist[3],
                     int(bedlinelist[4]), int(bedlinelist[5]), bedlinelist[6],
                     int(bedlinelist[7]), int(bedlinelist[8]))
                ]
        bedfile.close()
        return allseqtobed
    randomstr = Util.random_str()
    targetseqnamesubstr_lenmap = {}
    if targetseqnamesubstr == "none":
        shellstatment = pathtoblastn + " -query " + elementfold + "/" + randomstr + "_" + targetseqnamesubstr + ".fa" + " -task blastn -db " + reffa + " -out " + elementfold + "/" + randomstr + "_" + targetseqnamesubstr + ".blastout -outfmt 7 -num_alignments 10 -num_threads 6"
    queryfafile = open(
        elementfold + "/" + randomstr + "_" + targetseqnamesubstr +
        ".collectionfas", 'w')
    i = 0
    for elem in os.listdir(path=elementfold):
        path = elementfold + "/" + elem

        if (not os.path.isdir(path)) and (
                path.endswith("fa")
                or path.endswith("fasta")):  #True is fa file
            print(path, i)
            i += 1

            if targetseqnamesubstr.lower().strip() == "none":
                pathfile = open(path, "r")
                for line in pathfile:
                    print(line.strip(), file=queryfafile)
                    if line.startswith(">"):
                        seqname = line.strip()
                    else:
                        targetseqnamesubstr_lenmap[seqname[1:]] = len(
                            line.strip())
#                 print(targetseqnamesubstr_lenmap)
                pathfile.close()
            else:
                muscleout_seqgenerator = SeqIO.parse(path, "fasta")
                for seq_rec in muscleout_seqgenerator:
                    if seq_rec.id == targetseqnamesubstr:
                        seqstr = "".join(seq_rec.seq).replace("-", "")
                        print(">" + elem, file=queryfafile)
                        #                     allseqtobed[elem]=[]
                        targetseqnamesubstr_lenmap[elem] = len(seqstr)
                        print(seqstr, file=queryfafile)
                        break
                else:
                    print(targetseqnamesubstr, "dosenot exist", elem)
    queryfafile.close()
    shellstatment = pathtoblastn + " -query " + elementfold + "/" + randomstr + "_" + targetseqnamesubstr + ".collectionfas" + " -task blastn -db " + reffa + " -out " + elementfold + "/" + randomstr + "_" + targetseqnamesubstr + ".blastout -outfmt 7 -num_alignments 10 -num_threads 6"
    print(shellstatment)
    a = os.system(shellstatment)
    if a != 0:
        print("error")
        exit(-1)
    blastout = open(
        elementfold + "/" + randomstr + "_" + targetseqnamesubstr +
        ".blastout", "r")
    for line in blastout:
        if re.search(r"^#", line) != None:
            lastblastlen = None
            continue
        linelist = re.split(r"\s+", line)
        blastlen = int(linelist[3])
        if lastblastlen == None or (blastlen > lastblastlen - 10
                                    or blastlen * 0.95 >= lastblastlen):
            fafilename = linelist[0]
            chrom = linelist[1]
            sstartpos = int(linelist[8])
            sendpos = int(linelist[9])
            revcom = "forward"
            if sstartpos > sendpos:
                temp = sstartpos
                sstartpos = sendpos
                sendpos = temp
                revcom = "revcom"
            qstartpos = int(linelist[6])
            qendpos = int(linelist[7])
            total_bases = targetseqnamesubstr_lenmap[fafilename]
            gap_open = int(linelist[5])
            if chrom in allseqtobed:
                allseqtobed[chrom].append(
                    (sstartpos, sendpos, fafilename, qstartpos, qendpos,
                     revcom, total_bases, gap_open))
            else:
                allseqtobed[chrom] = [
                    (sstartpos, sendpos, fafilename, qstartpos, qendpos,
                     revcom, total_bases, gap_open)
                ]
            lastblastlen = blastlen
    bedfile = open(elementfold + "/" + targetseqnamesubstr + ".bed", "w")
    print("chrNo",
          "Region_start",
          "Region_end",
          "fastafilename",
          "startbase",
          "endbase",
          "revcom_forward",
          "total_bases",
          "gap_open",
          sep="\t",
          file=bedfile)
    for chrom in allseqtobed.keys():
        allseqtobed[chrom].sort(key=lambda listRec: listRec[1])
        for startpos, endpos, fafilename, qs, qe, revcom, total_bases, gap_open in allseqtobed[
                chrom]:
            print(chrom,
                  startpos,
                  endpos,
                  fafilename,
                  qs,
                  qe,
                  revcom,
                  total_bases,
                  gap_open,
                  sep="\t",
                  file=bedfile)
    blastout.close()
    bedfile.close()
    # remove the temporary query and blast output files before returning
    os.system("rm " + elementfold + "/" + randomstr + "_" +
              targetseqnamesubstr + ".collectionfas " + elementfold + "/" +
              randomstr + "_" + targetseqnamesubstr + ".blastout")
    return allseqtobed
Example 9
            paramsname.append(n)
            #random initial value
#             initvalue=random.gauss(float(v),0.01)
#             while initvalue>float(u) or initvalue<float(l):
#                 initvalue=random.gauss(float(v),0.01)
#             paramslist.append(float(initvalue))
            paramslist.append(float(v))
            lower_boundlist.append(float(l))
            upper_boundlist.append(float(u))
#             ll_param_MAPlist[n].append()
        #produce command and run
            pythonpath=pythonpath+" -p "+n+" "+str(v)+" "+l+" "+u+" "
#             pythonpath=pythonpath+" -p "+n+" "+str(initvalue)+" "+l+" "+u+" "
        if randomstr!=None:
            os.system("rm "+namestr+options.tag+options.model+randomstr+".parameter")
        randomstr=Util.random_str()
        print(pythonpath+" -b "+randomstr+" "+str(int(options.bootstrap[1])))
        sys.stdout.flush()
        a=call_system(pythonpath+" -b "+randomstr+" "+str(int(options.bootstrap[1])))
        if a!=0:
            print("cycle",i,a,"wrong")
            continue
        #collection result
        print(options.fsfile+namestr+options.tag+options.model+"array.pickle")
        u=pickle._Unpickler(open(options.fsfile+namestr+options.tag+options.model+"array.pickle","rb"))
        u.encoding='latin1'
        residualarray=u.load() #pickle.load(open(options.fsfile+namestr+options.tag+options.model+"array.pickle","rb"))
        u=pickle._Unpickler(open(options.fsfile+namestr+options.tag+options.model+"hist.pickle","rb"))
        u.encoding='latin1'
        residualhis=u.load()#pickle.load(open(options.fsfile+namestr+options.tag+options.model+"hist.pickle","rb"))
        bif=open(options.fsfile+namestr+options.tag+options.model+randomstr+"btstrap.temp",'r')
Example 10
    def fillarchicpop(self,
                      archicpopVcfFile,
                      depthFile,
                      chromtable,
                      archicpopNameindepthFile,
                      tablename="derived_alle_ref",
                      archicpopfieldNameintable="archicpop"):
        """
        abandon the SNPs which exist in archicpopVcfFile but are absent from all other pops' SNP sets
        """
        depthfile = Util.GATK_depthfile(depthFile, depthFile + ".index")
        species_idx = depthfile.title.index("Depth_for_" +
                                            archicpopNameindepthFile)
        archicpop = VCFutil.VCF_Data(archicpopVcfFile)
        totalChroms = self.dbtools.operateDB(
            "select", "select count(*) from " + chromtable)[0][0]
        for i in range(0, totalChroms, 20):
            currentsql = "select * from " + chromtable + " order by chrlength desc limit " + str(
                i) + ",20"
            result = self.dbtools.operateDB("select", currentsql)
            for row in result:

                currentchrID = row[0]
                print(currentchrID + ":", end="")
                currentchrLen = int(row[2])
                archicpopSeqOfAChr = {}
                archicpopSeqOfAChr[currentchrID] = archicpop.getVcfListByChrom(
                    archicpopVcfFile, currentchrID)
                allsnpsInAchr = self.dbtools.operateDB(
                    "select", "select snp_pos,alt_base from " + tablename +
                    " where chrID='" + currentchrID + "'")
                for snp in allsnpsInAchr:
                    snp_pos = int(snp[0])
                    ALT = snp[1]
                    low = 0
                    high = len(archicpopSeqOfAChr[currentchrID]) - 1
                    while low <= high:
                        mid = (low + high) >> 1
                        if archicpopSeqOfAChr[currentchrID][mid][0] < snp_pos:
                            low = mid + 1
                        elif archicpopSeqOfAChr[currentchrID][mid][0] > snp_pos:
                            high = mid - 1
                        else:  #find the pos
                            pos, REF, ALT, INFO, FORMAT, samples = archicpopSeqOfAChr[
                                currentchrID][mid]
                            dp4 = re.search(r"DP4=(\d*),(\d*),(\d*),(\d*)",
                                            INFO)
                            refdep = 0
                            altalleledep = 0
                            if dp4 != None:  #vcf from samtools
                                refdep = int(dp4.group(1)) + int(dp4.group(2))
                                altalleledep = int(dp4.group(3)) + int(
                                    dp4.group(4))
                            else:
                                AD_idx = (re.split(":", FORMAT)).index(
                                    "AD")  #gatk GT:AD:DP:GQ:PL
                                for sample in samples:
                                    if len(re.split(":", sample)) == 1:  # ./.
                                        continue
                                    AD_depth = re.split(
                                        ",",
                                        re.split(":", sample)[AD_idx])
                                    try:
                                        refdep += int(AD_depth[0])
                                        altalleledep += int(AD_depth[1])
                                    except ValueError:
                                        print(sample, end="")
                            popsdata = ALT + ":" + str(refdep) + "," + str(
                                altalleledep)
                            break
                    else:
                        depth_linelist = depthfile.getdepthByPos(
                            currentchrID, snp_pos)
                        if int(depth_linelist[species_idx]) <= 1:
                            popsdata = "no covered"
                        else:
                            popsdata = ALT + ":" + depth_linelist[
                                species_idx] + ",0"
#                     print(snp[0],end="\t")
                    self.dbtools.operateDB(
                        "update", "update " + tablename + " set " +
                        archicpopfieldNameintable + " = '" + popsdata +
                        "' where chrID=" + "'" + currentchrID +
                        "' and snp_pos=" + str(snp[0]))
Example 11
    def extarctAncestryAlleleFromBlastOut(self,
                                          BlastOutFile,
                                          ancestryrefFile,
                                          ancestryrefidx,
                                          tablename="derived_alle_ref",
                                          ancestralsnptable=None):
        ancestryreffile = open(ancestryrefFile, 'r')
        ancestrysnpflank = open(tablename + "ancestrysnpflank.fa", 'w')
        a = os.popen("awk '$1!~/^#/ && $5==1 && $4>26 && $6==0 {print $0}' " +
                     BlastOutFile)
        #    hits=a.readlines()

        lastbasesAccur = {}
        onegroup = []
        revcom = False
        #    initial
        hit = a.readline()
        hitlist = re.split(r"\s+", hit)

        sendpos = int(hitlist[9])
        sstartpos = int(hitlist[8])
        qstartpos = int(hitlist[6])
        blastlen = int(hitlist[3])
        snp_loc_s = sstartpos + 26 - qstartpos
        snpindex = 26 - qstartpos
        if sstartpos > sendpos:
            temp = sstartpos
            sstartpos = sendpos
            sendpos = temp
            revcom = True
        lastsnpID = hitlist[0]
        chrom = hitlist[1]
        RefSeqMap = Util.getRefSeqBypos(refFastahander=ancestryreffile,
                                        refindex=ancestryrefidx,
                                        currentChromNO=chrom,
                                        startpos=sstartpos,
                                        endpos=sendpos)
        if revcom:
            tempStr = RefSeqMap[chrom][1:]
            tempStr.reverse()
            RefSeqMap[chrom][1:] = Util.complementary(tempStr)
            revcom = False

        lastbasesAccur[RefSeqMap[chrom][snpindex + 1]] = [(chrom, sstartpos,
                                                           sendpos)]
        onegroup.append((RefSeqMap[chrom][snpindex + 1], blastlen))
        for hit in a:
            print(hit)
            hitlist = re.split(r"\s+", hit)
            chrom = hitlist[1]
            sstartpos = int(hitlist[8])
            sendpos = int(hitlist[9])
            qstartpos = int(hitlist[6])
            blastlen = int(hitlist[3])
            snp_loc_s = sstartpos + 26 - qstartpos
            snpindex = 26 - qstartpos
            if sstartpos > sendpos:
                temp = sstartpos
                sstartpos = sendpos
                sendpos = temp
                revcom = True
            if lastsnpID == hitlist[0]:
                RefSeqMap = Util.getRefSeqBypos(refFastahander=ancestryreffile,
                                                refindex=ancestryrefidx,
                                                currentChromNO=chrom,
                                                startpos=sstartpos,
                                                endpos=sendpos)
                if revcom:
                    tempStr = RefSeqMap[chrom][1:]
                    tempStr.reverse()
                    RefSeqMap[chrom][1:] = Util.complementary(tempStr)
                    revcom = False
                print(lastsnpID,
                      RefSeqMap[chrom][snpindex + 1],
                      str(snp_loc_s),
                      "".join(RefSeqMap[chrom][1:]),
                      file=ancestrysnpflank)
                if RefSeqMap[chrom][snpindex + 1] in lastbasesAccur:
                    lastbasesAccur[RefSeqMap[chrom][snpindex + 1]].append(
                        (chrom, sstartpos, sendpos))
                else:
                    lastbasesAccur[RefSeqMap[chrom][snpindex + 1]] = [
                        (chrom, sstartpos, sendpos)
                    ]
                onegroup.append((RefSeqMap[chrom][snpindex + 1], blastlen))
            else:
                # write to the database keyed by a different primary key: originally snpID, now replaced with something else

                snppos = re.search(r"_(\d+)", lastsnpID).group(1)
                snpChrom = re.search(r"(.+)_(\d+)", lastsnpID).group(1)
                onegroup.sort(key=lambda listRec: listRec[1])
                if len(onegroup) == 1 or onegroup[0][1] - onegroup[1][
                        1] >= 15:  # keep if there is only one hit for this query ID, or the longest hit is at least 15 bases longer than the second longest
                    if ancestralsnptable != None and self.dbtools.operateDB(
                            "select", "select count(*) from " +
                            ancestralsnptable + " where chrID= '" + chrom +
                            "' and snp_start_pos= " +
                            str(snp_loc_s))[0][0] == 0:
                        print("update " + tablename + " set ancestralallel='" +
                              onegroup[0][0] + "' where chrID='" + snpChrom +
                              "'and snp_pos=" + snppos)
                        self.dbtools.operateDB(
                            "update",
                            "update " + tablename + " set ancestralallel='" +
                            onegroup[0][0] + "' where chrID='" + snpChrom +
                            "'and snp_pos=" + snppos)
                    else:
                        print(
                            "select count(*) from " + ancestralsnptable +
                            " where chrID= '" + chrom +
                            "' and snp_start_pos= " + str(snppos),
                            self.dbtools.operateDB(
                                "select", "select count(*) from " +
                                ancestralsnptable + " where chrID= '" + chrom +
                                "' and snp_start_pos= " + str(snppos)))
                elif (len(lastbasesAccur.keys()) == 1
                      and self.dbtools.operateDB(
                          "select", "select count(*) from " +
                          ancestralsnptable + " where chrID= '" + chrom +
                          "' and snp_start_pos= " + str(snp_loc_s))[0][0]
                      == 0):
                    for bases in lastbasesAccur:  #only once
                        print("update " + tablename + " set ancestralallel='" +
                              bases + "' where chrID='" + snpChrom +
                              "' and snp_pos=" + snppos)
                        self.dbtools.operateDB(
                            "update",
                            "update " + tablename + " set ancestralallel='" +
                            bases + "' where chrID='" + snpChrom +
                            "' and snp_pos=" + snppos)
                elif len(lastbasesAccur.keys()) == 0:
                    print(" len(lastbasesAccur.keys()) == 0")
                    exit(-1)
                RefSeqMap = Util.getRefSeqBypos(refFastahander=ancestryreffile,
                                                refindex=ancestryrefidx,
                                                currentChromNO=chrom,
                                                startpos=sstartpos,
                                                endpos=sendpos)
                if revcom:
                    tempStr = RefSeqMap[chrom][1:]
                    tempStr.reverse()
                    RefSeqMap[chrom][1:] = Util.complementary(tempStr)
                    revcom = False
                print(hitlist[0],
                      RefSeqMap[chrom][snpindex + 1],
                      str(snp_loc_s),
                      "".join(RefSeqMap[chrom][1:]),
                      file=ancestrysnpflank)
                #            dbtools.operateDB("update", "update " + finaltable + " set chicken='" + RefSeqMap[chrom][snpindex + 1] + "' where snpID='" + hitlist[0] + "'")
                lastsnpID = hitlist[0]

                lastbasesAccur.clear()
                lastbasesAccur[RefSeqMap[chrom][snpindex + 1]] = [
                    (chrom, sstartpos, sendpos)
                ]
        print("finish")
        ancestryreffile.close()
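
The reverse-complement step above relies on tempStr.reverse() followed by Util.complementary(), whose implementation is not shown. An assumed equivalent for plain A/C/G/T/N strings, for reference:

_COMPLEMENT = str.maketrans("ACGTNacgtn", "TGCANtgcan")

def revcom(seq):
    # reverse complement of a nucleotide string, e.g. revcom("ACCGT") == "ACGGT"
    return seq.translate(_COMPLEMENT)[::-1]
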
Example 12
    def getflankseqs(self,
                     chrom,
                     chromlen,
                     snpstartpos,
                     snpendpos,
                     idxedreffilehandler,
                     refindex,
                     flanklen,
                     outfile,
                     tablename="derived_alle_ref"):

        testfile = open("testsnpfile.txt", 'a')
        snps = self.dbtools.operateDB(
            "select", "select * from " + tablename + " where chrID='" + chrom +
            "' and snp_pos>= " + str(snpstartpos) + " and snp_pos<=" +
            str(snpendpos))
        RefSeqMap = Util.getRefSeqBypos(idxedreffilehandler, refindex, chrom,
                                        snpstartpos - flanklen,
                                        snpendpos + flanklen, chromlen)

        for snp in snps:
            currentsnpPos = snp[1]
            if len(snp[3]) != 1 or len(snp[4]) != 1:
                #                        print(snp[4])
                continue  # skip indel
            currentsnpID = chrom + "_" + str(snp[1])
            if currentsnpPos + 25 <= RefSeqMap[chrom][0] + len(RefSeqMap[
                    chrom]) - 1 and currentsnpPos - 25 > RefSeqMap[chrom][0]:
                snpflankseq = ''.join(RefSeqMap[chrom][(
                    currentsnpPos - 25 -
                    RefSeqMap[chrom][0]):(currentsnpPos + 25 -
                                          RefSeqMap[chrom][0] + 1)])
                print(currentsnpID, snpflankseq[25], file=testfile)
                snpflankseq = snpflankseq[0:25] + 'N' + snpflankseq[26:]

            elif currentsnpPos <= RefSeqMap[chrom][0] + len(
                    RefSeqMap[chrom]) - 1 and currentsnpPos + 25 > RefSeqMap[
                        chrom][0] + len(RefSeqMap[chrom]) - 1:
                snpflankseq = ''.join(RefSeqMap[chrom][(
                    currentsnpPos - 25 -
                    RefSeqMap[chrom][0]):(currentsnpPos - RefSeqMap[chrom][0] +
                                          1)])
                print(currentsnpID, snpflankseq[25], file=testfile)
                snpflankseq = snpflankseq[0:25] + 'N'

            elif currentsnpPos - 25 <= RefSeqMap[chrom][0]:
                snpflankseq = ''.join(
                    RefSeqMap[chrom][(currentsnpPos - RefSeqMap[chrom][0]):(
                        currentsnpPos + 25 - RefSeqMap[chrom][0] + 1)])
                print(currentsnpID, snpflankseq[0], file=testfile)
                snpflankseq = 'N' + snpflankseq[1:26]

            else:
                print("what's wrong with the func getflankseqs ?")
                exit(-1)
#            if currentsnpPos + 25 <= RefSeqMap[lastchromNo][0] + len(RefSeqMap[lastchromNo]) - 1 and currentsnpPos - 25 > RefSeqMap[lastchromNo][0] :
#            snpflankseq = ''.join(RefSeqMap[chrom][(currentsnpPos - 25 - RefSeqMap[chrom][0]):(currentsnpPos + 25 - RefSeqMap[chrom][0] + 1)])
#            print(currentsnpID, snpflankseq[25], file=testfile)
#             snpflankseq = snpflankseq[0:25] + 'N' + snpflankseq[26:]
            print(">" + currentsnpID + "\n" + snpflankseq,
                  end='\n',
                  file=outfile)
        testfile.close()
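
getflankseqs slices 25 bp of flanking sequence on each side of a SNP out of a Util.getRefSeqBypos window and masks the SNP base with 'N'. A standalone sketch of the same idea on a chromosome held as a plain string (1-based SNP position), for clarity:

def snp_flank(chrom_seq, snp_pos, flank=25):
    # snp_pos is 1-based; the SNP base itself is replaced with 'N'
    start = max(snp_pos - 1 - flank, 0)
    end = min(snp_pos + flank, len(chrom_seq))
    return chrom_seq[start:snp_pos - 1] + "N" + chrom_seq[snp_pos:end]
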
Example 13
    def filldata(self,
                 vcfFileName,
                 depthfileName,
                 tablename="derived_alle_ref",
                 posUniq=True,
                 continuechrom=None,
                 continuepos=None):
        depthfile = Util.GATK_depthfile(depthfileName,
                                        depthfileName + ".index")
        depth_linelist = None
        vcffile = open(vcfFileName, 'r')
        vcfline = vcffile.readline()
        while re.search(r'^##', vcfline) != None:
            vcfline = vcffile.readline()

        if re.search(r'^#', vcfline) != None:
            poptitlelist = re.split(r'\s+', vcfline.strip())[9:]
            print(poptitlelist)
        else:
            print(
                "need header line '#CHROM    POS    ID    REF    ALT    QUAL    FILTER    INFO    FORMAT'"
            )
            exit(-1)
        for pop in poptitlelist:
            self.dbtools.operateDB("callproc",
                                   "mysql_sp_add_column",
                                   data=("life_pilot", tablename, pop,
                                         "varchar(128)", "default null"))
        popsdata = []  #depth for ref or alt
        if continuechrom != None and continuepos != None:
            print("filldata", continuechrom, continuepos)
            vcfpossearcher = VCFutil.VCF_Data(vcfFileName)
            vcffile.seek(vcfpossearcher.VcfIndexMap[continuechrom])
            vcfline = vcffile.readline()
            while vcfline:
                vcflist = re.split(r'\s+', vcfline.strip())
                chrom = vcflist[0].strip()
                pos = int(vcflist[1].strip())
                print(chrom, pos)
                if chrom == continuechrom and pos == continuepos:
                    break
                vcfline = vcffile.readline()
        else:
            justiceGATKorSamtools = vcffile.readline()
            vcflist = re.split(r'\s+', justiceGATKorSamtools.strip())
            dp4 = re.search(r"DP4=(\d*),(\d*),(\d*),(\d*)", vcflist[7])
            refdep = 0
            altalleledep = 0
            if dp4 != None:  #vcf from samtools
                print("function for samtools vcf is still need to be finish")
                exit(-1)
            else:
                chrom = vcflist[0].strip()
                pos = int(vcflist[1].strip())
                snpID = vcflist[2].strip()
                REF = vcflist[3].strip()
                ALT = vcflist[4].strip()

                AD_idx = (re.split(":", vcflist[8])).index(
                    "AD")  #gatk GT:AD:DP:GQ:PL
                sample_idx_in_vcf = 0
                for sample in vcflist[9:]:

                    samplename = poptitlelist[sample_idx_in_vcf]

                    sample_idx_in_vcf += 1
                    species_idx = depthfile.title.index("Depth_for_" +
                                                        samplename)
                    if len(re.split(":", sample)) != len(
                            re.split(":", vcflist[8])
                    ) and depth_linelist == None:  # ./. when lack of variantion information,then consider the depthfile
                        depth_linelist = depthfile.getdepthByPos(chrom, pos)

                        if int(depth_linelist[species_idx]) <= 1:
                            popsdata.append('no covered')
                        else:
                            popsdata.append(depth_linelist[species_idx] + ",0")
                        continue
                    elif len(re.split(":", sample)) != len(
                            re.split(":",
                                     vcflist[8])) and depth_linelist != None:
                        if int(depth_linelist[species_idx]) <= 1:
                            popsdata.append('no covered')
                        else:
                            popsdata.append(depth_linelist[species_idx] + ",0")
                        continue

                    popsdata.append(re.split(":", sample)[AD_idx])
                depth_linelist = None
                print(
                    "insert into " + tablename +
                    "(chrID,snp_pos,snpID,ref_base,alt_base," +
                    "".join([e + ","
                             for e in poptitlelist[:-1]] + poptitlelist[-1:]) +
                    ") select %s,%s,%s,%s,%s," + "%s," *
                    (len(poptitlelist) - 1) +
                    "%s from dual where not exists( select * from " +
                    tablename + " where " + tablename + ".chrID='" + chrom +
                    "' and " + tablename + ".snp_pos=" + str(pos) + ")",
                    (chrom, pos, snpID, REF, ALT) + tuple(popsdata))
                self.dbtools.operateDB(
                    "insert",
                    "insert into " + tablename +
                    "(chrID,snp_pos,snpID,ref_base,alt_base," +
                    "".join([e + ","
                             for e in poptitlelist[:-1]] + poptitlelist[-1:]) +
                    ") select %s,%s,%s,%s,%s," + "%s," *
                    (len(poptitlelist) - 1) +
                    "%s from dual where not exists( select * from " +
                    tablename + " where " + tablename + ".chrID='" + chrom +
                    "' and " + tablename + ".snp_pos=" + str(pos) + ")",
                    data=(chrom, pos, snpID, REF, ALT) + tuple(popsdata))

        for vcfline in vcffile:

            vcflist = re.split(r'\s+', vcfline.strip())
            print(vcfline)
            if posUniq and pos == int(vcflist[1].strip()):
                continue
            chrom = vcflist[0].strip()
            pos = int(vcflist[1].strip())
            snpID = vcflist[2].strip()
            REF = vcflist[3].strip()
            ALT = vcflist[4].strip()

            AD_idx = (re.split(":",
                               vcflist[8])).index("AD")  #gatk GT:AD:DP:GQ:PL
            sample_idx_in_vcf = 0
            popsdata = []
            for sample in vcflist[9:]:
                samplename = poptitlelist[sample_idx_in_vcf]
                sample_idx_in_vcf += 1
                species_idx = depthfile.title.index("Depth_for_" + samplename)
                if len(re.split(":", sample)) != len(re.split(
                        ":", vcflist[8])) and depth_linelist == None:  # ./.
                    depth_linelist = depthfile.getdepthByPos(chrom, pos)
                    if int(depth_linelist[species_idx]) <= 1:
                        popsdata.append('no covered')
                    else:
                        popsdata.append(depth_linelist[species_idx] + ",0")
                    continue
                elif len(re.split(":", sample)) != len(
                        re.split(":", vcflist[8])) and depth_linelist != None:
                    if int(depth_linelist[species_idx]) <= 1:
                        popsdata.append('no covered')
                    else:
                        popsdata.append(depth_linelist[species_idx] + ",0")
                    continue
#                 AD_depth = re.split(",", re.split(":", sample)[AD_idx])

                popsdata.append(re.split(":", sample)[AD_idx])
            depth_linelist = None
            print(
                "insert into " + tablename +
                "(chrID,snp_pos,snpID,ref_base,alt_base," +
                "".join([e + ","
                         for e in poptitlelist[:-1]] + poptitlelist[-1:]) +
                ") select %s,%s,%s,%s,%s," + "%s," * (len(poptitlelist) - 1) +
                "%s from dual where not exists( select * from " + tablename +
                " where " + tablename + ".chrID='" + chrom + "' and " +
                tablename + ".snp_pos=" + str(pos) + ")",
                (chrom, pos, snpID, REF, ALT) + tuple(popsdata))
            self.dbtools.operateDB(
                "insert",
                "insert into " + tablename +
                "(chrID,snp_pos,snpID,ref_base,alt_base," +
                "".join([e + ","
                         for e in poptitlelist[:-1]] + poptitlelist[-1:]) +
                ") select %s,%s,%s,%s,%s," + "%s," * (len(poptitlelist) - 1) +
                "%s from dual where not exists( select * from " + tablename +
                " where " + tablename + ".chrID='" + chrom + "' and " +
                tablename + ".snp_pos=" + str(pos) + ")",
                data=(chrom, pos, snpID, REF, ALT) + tuple(popsdata))
        depthfile.closedepthfile()
        vcffile.close()
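
The INSERT statements above interpolate chrID and snp_pos into the NOT EXISTS clause while passing the column values as %s parameters. A hypothetical fully parameterized variant, written against a plain DB-API cursor (the project's dbtools.operateDB wrapper is assumed to forward SQL and parameters in a similar way):

def insert_snp(cursor, tablename, colnames, values, chrom, pos):
    # colnames/values cover chrID, snp_pos, snpID, ref_base, alt_base plus the per-pop columns
    sql = ("insert into " + tablename + " (" + ",".join(colnames) + ")"
           " select " + ",".join(["%s"] * len(values)) + " from dual"
           " where not exists (select * from " + tablename +
           " where chrID=%s and snp_pos=%s)")
    cursor.execute(sql, tuple(values) + (chrom, pos))
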
Example 14
 if aafafileName != None and cdsfafileName != None:
     aa_cds_filemap[speciesname] = [
         open(aafafileName, 'r'),
         open(cdsfafileName, 'r')
     ]
     aaindex = {}
     cdsindex = {}
     try:
         aa_cds_filemap[speciesname].append(
             pickle.load(open(aafafileName + ".myindex", 'rb')))
         aa_cds_filemap[speciesname].append(
             pickle.load(open(cdsfafileName + ".myindex", 'rb')))
     except IOError:
         print("generateIndexByChrom", speciesname)
         Util.generateIndexByChrom(aafafileName,
                                   aafafileName + ".myindex",
                                   "transcript:")
         Util.generateIndexByChrom(cdsfafileName,
                                   cdsfafileName + ".myindex")
         aa_cds_filemap[speciesname].append(
             pickle.load(open(aafafileName + ".myindex", 'rb')))
         aa_cds_filemap[speciesname].append(
             pickle.load(open(cdsfafileName + ".myindex", 'rb')))
     stat = os.system("rm " + aafafileName + ".myindex " +
                      cdsfafileName + ".myindex")
     if stat != 0:
         print("rm " + aafafileName + ".myindex " + cdsfafileName +
               ".myindex" + " os.system return not 0")
         exit(-1)
     print(
         "rm " + aafafileName + ".myindex " + cdsfafileName +
Example 15
(options, args) = parser.parse_args()

reffa_linktoDB_Name = options.reffa_linktoDB.strip()
reffa_linktoDB_hanlder = open(reffa_linktoDB_Name, 'r')
reffa_linktoDB_idxName = reffa_linktoDB_Name + ".myindex"

reffahanlder = open(options.reffa.strip(), 'r')
outfile = open(options.reffa.strip() + "MAPTO" + reffa_linktoDB_Name, 'w')
if __name__ == '__main__':
    dbtools = dbm.DBTools(Util.ip, Util.username, Util.password,
                          Util.genomeinfodbname)
    try:
        refidxByChr = pickle.load(open(reffa_linktoDB_idxName, 'rb'))
    except IOError:
        Util.generateIndexByChrom(reffa_linktoDB_Name, reffa_linktoDB_idxName)
        refidxByChr = pickle.load(open(reffa_linktoDB_idxName, 'rb'))

    for onelineAscaffold in reffahanlder:
        onelineAscaffold = onelineAscaffold.lower()
        if re.search(r'^>', onelineAscaffold) != None:
            current_scaffold = re.search(r'^>(.*)',
                                         onelineAscaffold).group(1).strip()
        else:
            current_len = len(onelineAscaffold.strip())

            selectedchr = dbtools.operateDB(
                "select", "select * from " + Util.pekingduckchromtable +
                " where chrlength=" + str(current_len))
            print(
                current_scaffold,
Example 16
def findTrscpt(winfile,
               outbedfilename,
               upextend,
               downextend,
               winwidth,
               slideSize,
               winType,
               morethan_lessthan,
               threshold_title_list=None,
               percentage=None,
               mergeNA=False,
               extendtodistal=0,
               anchorfile=None,
               found=False,
               mapfile=None):

    if percentage != None and threshold_title_list != None:
        print("-t conflict with -p")
        exit(-1)
    threshold_title_list
    if anchorfile:
        #         winfile=standardseparately(anchorfile,winfile)
        winfilemark, winfilearrangement = Util.mapWinvaluefileToChrOfReletiveSpecie(
            anchorfile, winfile, winwidth, slideSize, True, mapfile)
    else:
        #         winfile=standardseparately(anchorfile,winfile)
        os.system("awk ' {if(NR=1){print $0" + '"\tmark"' + "}else{print $0" +
                  '"\tunknown"' + "}}' " + winfile + ">" + winfile +
                  "marked.sexchromseperatestandard")
    winFileName8Field = winfile + "marked.sexchromseperatestandard"
    f = open(winFileName8Field, "r")
    title = re.split(r"\s+", f.readline().strip())
    f.close()
    Nocol = title.index(winType) + 1
    re.search(r"[^/]*$", winFileName8Field).group(0)
    if re.search(r'^.*/', outbedfilename) != None:
        path = re.search(r'^.*/', outbedfilename).group(0)
    else:
        a = os.popen("pwd")
        path = a.readline().strip() + "/"
        a.close()
    if found:
        outfileNameWINwithGENE = path + re.search(
            r"[^/]*$", winFileName8Field).group(0) + ".wincopywithgene"
        return outfileNameWINwithGENE
    outfile = open(outbedfilename + ".bed.selectedgene", 'w')
    print("chrNo\tRegion_start\tRegion_end\tNoofWin\textram" + winType +
          "\tminNoSNP\tmaxNoSNP\ttranscpt\toverlapcode\tgeneID",
          file=outfile)
    outfileNameWINwithGENE = path + re.search(
        r"[^/]*$", winFileName8Field).group(0) + ".wincopywithgene"
    print(Util.ip, Util.username, Util.password, Util.genomeinfodbname)
    genomedbtools = dbm.DBTools(Util.ip, Util.username, Util.password,
                                Util.genomeinfodbname)

    winGenome = Util.WinInGenome(Util.ghostdbname, winFileName8Field, Nocol)

    time.sleep(SLEEP_FOR_NEXT_TRY)
    selectWinNos = "threshold method"
    totalWin = winGenome.windbtools.operateDB(
        "select", "select count(*) from " + winGenome.wintablewithoutNA)[0][0]
    #     selectWinNos = int(float(percentage) * totalWin)
    if anchorfile:
        wherestatmentmt = " where (mark='autosome' and " + winType + ">=" + threshold_title_list[
            0] + ") or (mark='sexchromosome' and " + winType + ">=" + threshold_title_list[
                -1] + ")"
        #         wherestatmentmp=" where 1 order by "+winType+" desc limit 0," + str(selectWinNos)
        wherestatmentlt = " where (mark='autosome' and " + winType + "<=" + threshold_title_list[
            0] + ") or (mark='sexchromosome' and " + winType + "<=" + threshold_title_list[
                -1] + ")"
#         wherestatmentlp=" where 1 order by "+winType+" asc limit 0," + str(selectWinNos)
    elif threshold_title_list != None:
        wherestatmentmt = " where 1 and " + winType + ">=" + threshold_title_list[
            0]
        #         wherestatmentmp=" where 1 order by "+winType+" desc limit 0," + str(selectWinNos)
        wherestatmentlt = " where " + winType + "!= 'NA' and " + winType + "<=" + threshold_title_list[
            0]
#         wherestatmentlp=" where 1 order by "+winType+" asc limit 0," + str(selectWinNos)
    winGenome.appendGeneName(Util.TranscriptGenetable, genomedbtools, winwidth,
                             slideSize, outfileNameWINwithGENE, upextend,
                             downextend, (10, morethan_lessthan))
    #    should be rewritten as a clearer statement
    if percentage != None:
        selectWinNos = int(float(percentage) * totalWin)
        if morethan_lessthan == "m" or morethan_lessthan == "M":
            selectedWins = winGenome.windbtools.operateDB(
                "select", "select * from " + winGenome.wintablewithoutNA +
                " where 1 order by " + winType + " desc limit 0," +
                str(selectWinNos))
            print("select * from " + winGenome.wintablewithoutNA +
                  " where 1 order by zvalue desc limit 0," + str(selectWinNos))
        elif morethan_lessthan == "l" or morethan_lessthan == "L":
            selectedWins = winGenome.windbtools.operateDB(
                "select", "select * from " + winGenome.wintablewithoutNA +
                " where 1 order by " + winType + " asc limit 0," +
                str(selectWinNos))
            print("select * from " + winGenome.wintablewithoutNA +
                  " where 1 order by " + winType + " asc limit 0," +
                  str(selectWinNos))
    elif threshold_title_list != None:
        if morethan_lessthan == "m" or morethan_lessthan == "M":
            selectedWins = winGenome.windbtools.operateDB(
                "select", "select * from " + winGenome.wintablewithoutNA +
                wherestatmentmt)

        elif morethan_lessthan == "l" or morethan_lessthan == "L":
            #             print("select", "select * from " + winGenome.wintablewithoutNA + " where "+winType+"!= 'NA' and "+winType+"<=" + threshold)
            selectedWins = winGenome.windbtools.operateDB(
                "select", "select * from " + winGenome.wintablewithoutNA +
                wherestatmentlt)
        selectWinNos = len(selectedWins)
    selectedWins.sort(key=lambda listRec: float(listRec[5]))
    if selectWinNos == 0:
        outfile.close()
        print("selectWinNos==0")
        exit(0)
    print(outbedfilename + ".bed.selectgene", selectWinNos, "~=",
          len(selectedWins), selectedWins[0], selectedWins[-1])
    selectedWinMap = {}
    for win in selectedWins:
        if win[0] in selectedWinMap:
            selectedWinMap[win[0]].append(win)
        else:
            selectedWinMap[win[0]] = [win]

    selectedRegion = {}

    for chrom in selectedWinMap:
        selectedWinMap[chrom].sort(key=lambda listRec: int(listRec[1]))
        selectedRegion[chrom] = []
        mergedRegion = [selectedWinMap[chrom][0]]
        i = 1
        while i < len(selectedWinMap[chrom]):
            #             print(chrom,selectedWinMap[chrom][i])
            #             try:
            if int(selectedWinMap[chrom][i - 1][1]) + 1 == int(
                    selectedWinMap[chrom][i][1]) or int(selectedWinMap[chrom][
                        i - 1][1]) * slideSize + winwidth >= int(
                            selectedWinMap[chrom][i]
                            [1]) * slideSize:  # contiguous window
                mergedRegion.append(selectedWinMap[chrom][i])
            else:  # not contiguous
                #process last region
                Region_start = int(mergedRegion[0][1]) * slideSize
                Region_end = int(mergedRegion[-1][1]) * slideSize + winwidth
                Nwin = len(mergedRegion)
                extremeValues = []
                noofsnps = []
                for e in mergedRegion:
                    if winType == "winvalue":
                        extremeValues.append(float(e[5]))
                    elif winType == "zvalue":
                        extremeValues.append(float(e[6]))
                    noofsnps.append(int(e[4]))

                if morethan_lessthan == "m" or morethan_lessthan == "M":
                    extremeValue = min(extremeValues)
                elif morethan_lessthan == "l" or morethan_lessthan == "L":
                    extremeValue = max(extremeValues)
                maxNoSNP = max(noofsnps)
                minNoSNP = min(noofsnps)
                selectedRegion[chrom].append(
                    (chrom, Region_start, Region_end, Nwin, extremeValue,
                     minNoSNP, maxNoSNP))
                #process this win
                mergedRegion = [selectedWinMap[chrom][i]]
            i += 1
#             except IndexError:
#                 print(i,len(selectedWinMap[chrom]),selectedWinMap[chrom])
#                 exit(-1)
        else:
            Region_start = int(mergedRegion[0][1]) * slideSize
            Region_end = int(mergedRegion[-1][1]) * slideSize + winwidth
            Nwin = len(mergedRegion)
            extremeValues = []
            noofsnps = []
            for e in mergedRegion:
                if winType == "winvalue":
                    extremeValues.append(float(e[5]))
                elif winType == "zvalue":
                    extremeValues.append(float(e[6]))
                noofsnps.append(int(e[4]))
            if morethan_lessthan == "m" or morethan_lessthan == "M":
                extremeValue = min(extremeValues)
            elif morethan_lessthan == "l" or morethan_lessthan == "L":
                extremeValue = max(extremeValues)
            maxNoSNP = max(noofsnps)
            minNoSNP = min(noofsnps)
            selectedRegion[chrom].append(
                (chrom, Region_start, Region_end, Nwin, extremeValue, minNoSNP,
                 maxNoSNP))
    if mergeNA != False and int(mergeNA) > 0:
        for chrom in selectedRegion:
            selectedRegion[chrom].sort(key=lambda listRec: int(listRec[1]))
            i = 1
            idxlist_to_pop = []
            while i < len(selectedRegion[chrom]):
                winNo_end = str(int(selectedRegion[chrom][i][1] / slideSize))
                winNo_start = str(
                    int((selectedRegion[chrom][i - 1][2] - winwidth) /
                        slideSize))
                print("select * from " + winGenome.wintablewithoutNA +
                      " where " + " chrID='" + chrom + "' and winNo>" +
                      winNo_start + " and  winNo<" + winNo_end)
                wincount_to_determine = winGenome.windbtools.operateDB(
                    "select", "select * from " + winGenome.wintablewithoutNA +
                    " where " + " chrID='" + chrom + "' and winNo>" +
                    winNo_start + " and winNo<" + winNo_end)
                wincount_to_add = winGenome.windbtools.operateDB(
                    "select",
                    "select * from " + winGenome.wintabletextvalueallwin +
                    " where " + " chrID='" + chrom + "' and winNo>" +
                    winNo_start + " and winNo<" + winNo_end)
                if len(wincount_to_determine
                       ) == 0 and len(wincount_to_add) <= int(mergeNA):
                    if morethan_lessthan == "m" or morethan_lessthan == "M":
                        extremeValue = min(selectedRegion[chrom][i][4],
                                           selectedRegion[chrom][i - 1][4])
                    elif morethan_lessthan == "l" or morethan_lessthan == "L":
                        extremeValue = max(selectedRegion[chrom][i][4],
                                           selectedRegion[chrom][i - 1][4])
                    maxNoSNP = max(selectedRegion[chrom][i][6],
                                   selectedRegion[chrom][i - 1][6])
                    minNoSNP = min(selectedRegion[chrom][i][5],
                                   selectedRegion[chrom][i - 1][5])
                    selectedRegion[chrom][i] = (
                        chrom, selectedRegion[chrom][i - 1][1],
                        selectedRegion[chrom][i][2],
                        selectedRegion[chrom][i - 1][3] +
                        selectedRegion[chrom][i][3] + len(wincount_to_add),
                        extremeValue, minNoSNP, maxNoSNP)
                    idxlist_to_pop.append(i - 1)
                i += 1
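            # while-else: once every region pair has been checked, remove the regions merged into their neighbours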
            else:
                idxlist_to_pop.reverse()
                for idx_to_pop in idxlist_to_pop:
                    selectedRegion[chrom].pop(idx_to_pop)
    else:
        for chrom in selectedRegion:
            selectedRegion[chrom].sort(key=lambda listRec: int(listRec[1]))
#    get final table
    print("getting final table")
    final_table = {}
    for chrom in selectedRegion:
        for region in selectedRegion[chrom]:
            print(chrom, region)
            if extendtodistal > 0:
                final_table[region] = winGenome.collectTrscptInWin(
                    genomedbtools, Util.TranscriptGenetable, region, upextend,
                    downextend, extendtodistal)
            else:
                final_table[region] = winGenome.collectTrscptInWin(
                    genomedbtools, Util.TranscriptGenetable, region, upextend,
                    downextend)


#process top outlier values
    print("fill bedselectedtable")
    for chrom in winGenome.chromOrder:
        if chrom not in selectedRegion:
            continue
        for region in selectedRegion[chrom]:
            if chrom.strip() == region[0].strip():
                tcpts = ""
                tpcode = ""
                gnames = ""
                for tcpt in final_table[region]:
                    tcpts += (tcpt[0] + ",")
                    tpcode += (str(tcpt[-1]) + ",")
                    if tcpt[2].strip() != "":
                        gnames += (tcpt[2] + ",")
                print("\t".join(map(str, region)),
                      tcpts[:-1],
                      tpcode[:-1],
                      gnames[:-1],
                      sep="\t",
                      file=outfile)

    winGenome.windbtools.drop_table(winGenome.wintabletextvalueallwin)
    winGenome.windbtools.drop_table(winGenome.wintablewithoutNA)
    outfile.close()
    return outfileNameWINwithGENE
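The region-merging logic inside findTrscpt is easy to lose among the SQL calls, so here is a compact, self-contained sketch of the same idea: group threshold-passing sliding windows into contiguous regions and summarise each one. The function name merge_windows and the (winNo, nSNP, value) record layout are assumptions made for this illustration; they are not part of the original Util module.

# Minimal sketch, not the original code: merge threshold-passing windows of one
# chromosome into contiguous regions. Each record is assumed to be
# (winNo, nSNP, value); winNo * slide_size gives the window start coordinate.
def merge_windows(wins, winwidth, slide_size, morethan_lessthan="m"):
    wins = sorted(wins, key=lambda w: int(w[0]))
    regions, block = [], [wins[0]]

    def flush(block):
        start = int(block[0][0]) * slide_size
        end = int(block[-1][0]) * slide_size + winwidth
        values = [float(w[2]) for w in block]
        snps = [int(w[1]) for w in block]
        # for "more than" selections the least extreme value in the region is kept
        extreme = min(values) if morethan_lessthan.lower() == "m" else max(values)
        return (start, end, len(block), extreme, min(snps), max(snps))

    for prev, cur in zip(wins, wins[1:]):
        if int(prev[0]) * slide_size + winwidth >= int(cur[0]) * slide_size:
            block.append(cur)          # windows overlap or touch: same region
        else:
            regions.append(flush(block))
            block = [cur]
    regions.append(flush(block))       # flush the last region
    return regions

For instance, with winwidth=40000 and slide_size=20000, merge_windows([(1, 10, 0.8), (2, 12, 0.9), (7, 5, 0.7)], 40000, 20000) returns one region spanning windows 1-2 and a second region for window 7.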
Esempio n. 17
0
(options, args) = parser.parse_args()

outfile = open(options.outfile, 'w')
outfilewithvalue = open(options.outfile + "_withvalue", 'w')
percentage = float(options.percentageofCovered)
averagedepth = int(options.averagedepthThreshold)

chromtable = Util.pekingduckchromtable
windowWidth = int(options.winwidth)
slideSize = int(options.slidesize)
mindepth = int(options.mindepth)
print(percentage)
if __name__ == '__main__':

    depthbinmap = {}
    mywin = Util.Window()
    dbtools = dbm.DBTools(Util.ip, Util.username, Util.password,
                          Util.genomeinfodbname)
    depthfile = Util.GATK_depthfile(options.genomedepth,
                                    options.genomedepth + ".index")
    if "" in depthfile.title:
        depthfile.title.remove("")
    print(depthfile.title, len(depthfile.title) - 3)
    if options.speciesnames == []:
        print("chrom",
              "start_pos",
              "end_pos",
              *depthfile.title[3:],
              sep="\t",
              file=outfile)
        print("chrom",
#     for a,b in sorted(freq_xaxisKEY_yaxisVALUE_seq_list.keys()):
#         freq_xaxisKEY_yaxisVALUERelation[(a,b)]=numpy.mean(freq_xaxisKEY_yaxisVALUE_seq_list[(a,b)])
#         print('%.12f'%a,'%.12f'%(b),'%.12f'%(freq_xaxisKEY_yaxisVALUERelation[(a,b)]),"process ID:",os.getpid(),"done",sep="\t")
    print("process ID:", os.getpid(), "done")
    return copy.deepcopy(freq_xaxisKEY_yaxisVALUE_seq_list)
if __name__ == '__main__':
    filenamelistfilename = options.outfileprewithpath + ".freqcorrelationfilenamelist"
    parameterstuples = (options.chromlistfilename,
                        options.topleveltablejudgeancestral,
                        options.targetpopvcfconfig,
                        options.refpopvcffileconfig,
                        options.numberofindvdoftargetpop_todividintobin)
    print(parameterstuples, options.outfileprewithpath)
    freq_xaxisKEY_yaxisVALUE_seq_list = make_freq_xaxisKEY_yaxisseqVALUERelation(
        parameterstuples)
    outfilename = options.outfileprewithpath + "_part_" + str(
        os.getpid()) + Util.random_str()
    outfile = open(outfilename, 'w')
    filenamelistfile = open(filenamelistfilename, 'a')
    for a, b in sorted(freq_xaxisKEY_yaxisVALUE_seq_list.keys()):
        print(str(a),
              str(b),
              *freq_xaxisKEY_yaxisVALUE_seq_list[(a, b)],
              sep="\t",
              file=outfile)
    outfile.close()
    print(outfilename, file=filenamelistfile)
    print(sys.argv, outfilename)
    filenamelistfile.close()
    print("process ID:", os.getpid(), "finished")
    exit(0)
Esempio n. 19
0
chromstable = options.chromtable
primaryID = "chrID"
OUTFILENAME = "ducksnpflankseq.fa"
# outfile=open("ducksnpflankseq.fa",'w')
BlastOutFile = "ducksnpflankseq.blast"
if __name__ == '__main__':
    aaa = DAP.MakeDerivedAlleletable(database=dbname, ip="10.2.48.96", usrname="root", pw="1234567")
#     aaa=DAP.MakeDerivedAlleletable(database=dbname,ip="10.2.48.140",usrname="root",pw="1234567")
#     ddd=MP.Dstistics_allpop(allpop)
#     ddd.caculateDofAllpossibleCombination(database=dbname,ip="10.2.48.140",usrname="root",pw="1234567", allpopssnptable="derived_alle_ref", chromstable=chromstable, winwidth=None, minlengthOfchrom=minlengthOfchrom, filenamepre=options.prefilename)
    dbtoolsforchrom = dbm.DBTools(Util.ip, Util.username, Util.password, Util.genomeinfodbname)
    try:
        duckrefindex = pickle.load(open(options.reference + ".myindex", 'rb'))
        originalspeciesindex = pickle.load(open(originalspeciesref + ".myindex", 'rb'))
    except IOError:
        Util.generateIndexByChrom(options.reference, options.reference + ".myindex")
        Util.generateIndexByChrom(originalspeciesref, originalspeciesref + ".myindex")
        duckrefindex = pickle.load(open(options.reference + ".myindex", 'rb'))
        originalspeciesindex = pickle.load(open(originalspeciesref + ".myindex", 'rb'))
#     aaa.createtable()
#     aaa.filldata(vcfFileName=vcfFileName,depthfileName=DepthFileName,continuechrom=continuechrom,continuepos=continuepos)
    aaa.fillarchicpop(archicpopVcfFile, DepthFileName, chromstable, archicpopNameindepthFile)
#     totalChroms = dbtoolsforchrom.operateDB("select","select count(*) from "+chromstable)[0][0]
#     for i in range(0,totalChroms,20):
#         currentsql="select * from " + chromstable+" order by chrlength limit "+str(i)+",20"
        result=dbtoolsforchrom.operateDB("select",currentsql)
#         for row in result:
#             currentchrID=row[0]
#             currentchrLen=int(row[2])
#             aaa.getflankseqs(currentchrID,currentchrLen, 1+flanklen, currentchrLen, idxedreffilehandler=duckrefhandler, refindex=duckrefindex, flanklen=flanklen,outfile=outfile, tablename="derived_alle_ref")
#     outfile.close()
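Most of the per-chromosome flanking-sequence extraction in this example is commented out. As a rough illustration of what getflankseqs is asked to produce, the sketch below slices the bases around a 1-based SNP position out of a chromosome sequence already held in memory; flank_seq is a hypothetical helper, not part of the original DAP/Util API.

# Illustrative helper, not the original DAP/Util API: slice the flanking bases
# around a 1-based SNP position out of an in-memory chromosome sequence.
def flank_seq(chrom_seq, snp_pos, flanklen):
    left_start = max(0, snp_pos - 1 - flanklen)        # 0-based slice bounds
    right_end = min(len(chrom_seq), snp_pos + flanklen)
    left = chrom_seq[left_start:snp_pos - 1]
    base = chrom_seq[snp_pos - 1]
    right = chrom_seq[snp_pos:right_end]
    return left, base, right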
Esempio n. 20
0
                  dest="verbose",
                  default=True,
                  help="don't print status messages to stdout")
test = open("test.txt", 'w')
(options, args) = parser.parse_args()
if __name__ == '__main__':
    phastConsfile = open(options.infilename, "r")
    L = []
    firstline = re.split(r'\s+', phastConsfile.readline())
    currentchrom = firstline[0]
    winStart = int(firstline[1])
    L = [(winStart, int(firstline[2]), float(firstline[3]))]
    print(L)
    for line in phastConsfile:
        linelist = re.split(r'\s+', line)
        if currentchrom == linelist[0]:
            L.append((int(linelist[1]), int(linelist[2]), float(linelist[3])))

    caculate_phastConsValue = Caculators.Caculate_phastConsValue()
    win = Util.Window()
    print(int(options.winwidth), winStart)
    win.forPhastConsFormat(L=L,
                           L_End_Pos=len(L),
                           windowWidth=int(options.winwidth),
                           Caculator=caculate_phastConsValue,
                           winStart=winStart)
    for e in win.winValueL:
        print(*e, sep='\t', file=test)

    phastConsfile.close()
    test.close()
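Util.Window().forPhastConsFormat and Caculators.Caculate_phastConsValue are opaque here. If the window value is simply the mean base-wise phastCons score, a plain-Python stand-in could look like the sketch below; this is an assumption about the calculation, not the library's actual implementation.

# Rough stand-in, assuming the window value is the mean base-wise score;
# the real Caculators.Caculate_phastConsValue may differ.
def window_mean_scores(runs, win_width, win_start):
    """runs: list of (start, end, score) with end exclusive; returns [(winNo, mean), ...]."""
    sums, counts = {}, {}
    for start, end, score in runs:
        for pos in range(start, end):
            win_no = (pos - win_start) // win_width
            sums[win_no] = sums.get(win_no, 0.0) + score
            counts[win_no] = counts.get(win_no, 0) + 1
    return [(w, sums[w] / counts[w]) for w in sorted(sums)]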
Esempio n. 21
0
minlength = options.minlength
vcffileslist = options.vcffile
sql = "select * from " + chromtable + " where chrlength>=" + minlength


class SNPsPerBIN():
    def __init__(self):
        self.SNPsPerBINMap = {}


if __name__ == '__main__':

    speicesidxs_inbindepthmap = []
    if len(vcffileslist[:]) == 1 and len(
            options.specieses) != 0 and options.coveragebin != None:
        bindepth = Util.BinDepth(options.coveragebin)
        for species in options.specieses:
            speicesidxs_inbindepthmap.append(
                bindepth.speciesname.index("Depth_for_" + species) + 2)
        consider_Depth = True
    else:
        consider_Depth = False
    dbtools = dbm.DBTools(Util.ip, Util.username, Util.password,
                          Util.genomeinfodbname)
    print(vcffileslist[:])
    for vcf in vcffileslist[:]:
        vcfname = re.search(r"[^/]*$", vcf).group(0)
        if re.search(r"indvd[^/]+", vcf) != None:
            snpcounter = Caculators.Caculate_SNPsPerBIN(
                windowWidth, considerINDEL=howtoIndel, MethodToSeq="indvd")
        elif re.search(r"pool[^/]+", vcf) != None:
Esempio n. 22
0
        testminintervalbetweengenes_basesperfaline.readline().strip())
    print(minintervalbetweengenes_basesperfaline)
#gtffile = open(options.gtffile, 'r')
vcffile = open(options.variants, 'r')
#covfile = open(options.genomedepth, 'r')

cns_string = ">"
aa_string = ""
cdscns_string = ""
outcns = open(options.outfileprename + "_cns.fa", 'w')
outaa = open(options.outfileprename + "_aa.fa", 'w')
outcdscns = open(options.outfileprename + "_cdscns.fa", 'w')
cdsmap = {}
if __name__ == '__main__':
    if options.genomedepth != None:
        depthfile = Util.GATK_depthfile(options.genomedepth,
                                        options.genomedepth + ".index")
        species_idx = depthfile.title.index("Depth_for_" + options.species)
        Considerdepth = True
    else:
        Considerdepth = False
        depthfile = None
        species_idx = -1
    vcfpop = VCFutil.VCF_Data(options.variants)  # instantiate the VCF parser class
    RefSeqMap, currentChromNO, nextChromNO = Util.getRefSeqMap(
        refFastafilehander=reffa)
    print(currentChromNO, nextChromNO)
    cns_string += currentChromNO + "\n"
    gtfMap = Util.getGtfMap(options.gtffile)

    lastposofdepthfp = 0  #because this time RefSeqMap[0] is 0
    vcfchrom = "begin"
Esempio n. 23
0
    for chrlist, vcflikeFileName, corresponding_ref, flanklen in options.variantfilewithref:
        chromlistfile = open(chrlist, "r")
        chrmap = {}
        for rec in chromlistfile:
            reclist = re.split(r'\s+', rec.strip())
            chrmap[reclist[0]] = reclist[1]

        flanklen = int(flanklen)
        duckrefhandler = open(corresponding_ref, 'r')
        try:
            duckrefindex = pickle.load(
                open(corresponding_ref + ".myfasteridx", 'rb'))
#             originalspeciesindex = pickle.load(open(originalspeciesref + ".myindex", 'rb'))
        except IOError:
            Util.generateFasterRefIndex(corresponding_ref,
                                        corresponding_ref + ".myfasteridx",
                                        chrsignal=options.chrsignal)
            duckrefindex = pickle.load(
                open(corresponding_ref + ".myfasteridx", 'rb'))
        vcflikefile = open(vcflikeFileName, 'r')
        vcflinesalchr = vcflikefile.readlines()
        # 1. read variants
        chrom = None
        snpsOfOneChrom = []
        startpostocollecteSNP = 1
        while vcflinesalchr:
            snpline = vcflinesalchr.pop(0).strip()

            if snpline[0] == "#" or snpline.lower().find("chrom") == 0:  #title
                continue
            else: