Ejemplo n.º 1
0
        testminintervalbetweengenes_basesperfaline.readline().strip())
    print(minintervalbetweengenes_basesperfaline)
# Module-level setup: input/output file handles and the string accumulators
# used while building the consensus sequences.
# NOTE(review): `options` is defined outside this excerpt -- presumably the
# parsed command-line options; confirm against the option parser.
# The GTF file is no longer opened here (disabled open() kept for reference).
#gtffile = open(options.gtffile, 'r')
vcffile = open(options.variants, 'r')  # variants file, opened for reading
# The genome depth file is no longer opened here either.
#covfile = open(options.genomedepth, 'r')

cns_string = ">"  # starts with ">"; the __main__ block appends the chromosome name to it
aa_string = ""
cdscns_string = ""
# Output files, all named from the options.outfileprename prefix.
outcns = open(options.outfileprename + "_cns.fa", 'w')
outaa = open(options.outfileprename + "_aa.fa", 'w')
outcdscns = open(options.outfileprename + "_cdscns.fa", 'w')
cdsmap = {}
# Script entry point (only the beginning of the block is visible in this
# excerpt).
if __name__ == '__main__':
    # The depth file is optional: when options.genomedepth is given, open it
    # together with its "<path>.index" companion and look up this species'
    # entry in the depth file's title.
    if options.genomedepth != None:
        depthfile = Util.GATK_depthfile(options.genomedepth,
                                        options.genomedepth + ".index")
        # Position of "Depth_for_<species>" within depthfile.title.
        species_idx = depthfile.title.index("Depth_for_" + options.species)
        Considerdepth = True
    else:
        # No depth information: use placeholder values.
        Considerdepth = False
        depthfile = None
        species_idx = -1
    vcfpop = VCFutil.VCF_Data(options.variants)  # VCF_Data instance for the variants file
    # NOTE(review): `reffa` is defined outside this excerpt -- presumably the
    # open reference FASTA handle; confirm.
    RefSeqMap, currentChromNO, nextChromNO = Util.getRefSeqMap(
        refFastafilehander=reffa)
    print(currentChromNO, nextChromNO)
    # Complete the FASTA header line started by the initial ">".
    cns_string += currentChromNO + "\n"
    gtfMap = Util.getGtfMap(options.gtffile)

    lastposofdepthfp = 0  # because at this point RefSeqMap[0] is 0
    vcfchrom = "begin"
Ejemplo n.º 2
0
    def filldata(self,
                 vcfFileName,
                 depthfileName,
                 tablename="derived_alle_ref",
                 posUniq=True,
                 continuechrom=None,
                 continuepos=None):
        """Load the records of a GATK-style VCF file into ``tablename``.

        A column is added to the table for every sample named on the VCF
        title line (via the ``mysql_sp_add_column`` procedure), then one row
        per VCF record is inserted unless a row with the same chrID and
        snp_pos already exists.  A sample's cell holds the ``AD`` field of its
        genotype entry; when the entry has fewer fields than FORMAT (for
        example ``./.``) the cell is derived from the depth file instead:
        ``'no covered'`` for a depth <= 1, otherwise ``"<depth>,0"``.

        Args:
            vcfFileName: path of the VCF file to load.
            depthfileName: path of the depth file; its index is expected at
                ``depthfileName + ".index"``.
            tablename: table receiving the rows.
            posUniq: when true, a record whose chromosome and position both
                equal those of the previously handled record is skipped.
            continuechrom: together with ``continuepos``, resume loading with
                the record that follows this chromosome/position instead of
                starting at the first record.
            continuepos: see ``continuechrom``.

        Raises:
            SystemExit: when the title line is missing, or when the first
                record carries a samtools ``DP4`` INFO entry (unsupported).
        """
        # Local import: the file's import block is outside this block.
        import sys

        depthfile = Util.GATK_depthfile(depthfileName,
                                        depthfileName + ".index")
        try:
            with open(vcfFileName, 'r') as vcffile:
                # Skip the "##" meta lines; the next line must be the title.
                vcfline = vcffile.readline()
                while vcfline.startswith("##"):
                    vcfline = vcffile.readline()
                if not vcfline.startswith("#"):
                    print(
                        "need title'#CHROM    POS    ID    REF    ALT    QUAL    FILTER    INFO    FORMAT'"
                    )
                    sys.exit(-1)
                # Sample names are the title columns after FORMAT.
                poptitlelist = re.split(r'\s+', vcfline.strip())[9:]
                print(poptitlelist)

                for pop in poptitlelist:
                    self.dbtools.operateDB("callproc",
                                           "mysql_sp_add_column",
                                           data=("life_pilot", tablename, pop,
                                                 "varchar(128)",
                                                 "default null"))

                # The statement is identical for every record, so build it
                # once.  chrID/snp_pos in the "not exists" guard are passed as
                # parameters like the inserted values instead of being
                # concatenated into the SQL text.
                insertsql = (
                    "insert into " + tablename +
                    "(chrID,snp_pos,snpID,ref_base,alt_base," +
                    ",".join(poptitlelist) + ") select %s,%s,%s,%s,%s," +
                    ",".join(["%s"] * len(poptitlelist)) +
                    " from dual where not exists( select * from " +
                    tablename + " where " + tablename + ".chrID=%s and " +
                    tablename + ".snp_pos=%s)")

                # (chrom, pos) of the most recently handled record.
                lastsite = None
                if continuechrom is not None and continuepos is not None:
                    # Resume: jump to the chromosome's offset and read up to
                    # (and including) the record to continue after.
                    print("filldata", continuechrom, continuepos)
                    vcfpossearcher = VCFutil.VCF_Data(vcfFileName)
                    vcffile.seek(vcfpossearcher.VcfIndexMap[continuechrom])
                    vcfline = vcffile.readline()
                    while vcfline:
                        vcflist = re.split(r'\s+', vcfline.strip())
                        lastsite = (vcflist[0].strip(),
                                    int(vcflist[1].strip()))
                        print(*lastsite)
                        if lastsite == (continuechrom, continuepos):
                            break
                        vcfline = vcffile.readline()
                else:
                    # The first record decides whether the file comes from
                    # GATK or from samtools (DP4 in INFO).
                    firstline = vcffile.readline()
                    if not firstline:
                        # Title line only: there is nothing to insert.
                        return
                    vcflist = re.split(r'\s+', firstline.strip())
                    if re.search(r"DP4=(\d*),(\d*),(\d*),(\d*)",
                                 vcflist[7]) is not None:
                        print(
                            "function for samtools vcf is still need to be finish"
                        )
                        sys.exit(-1)
                    lastsite = self._insert_vcf_record(insertsql, vcflist,
                                                       poptitlelist, depthfile)

                for vcfline in vcffile:
                    print(vcfline)
                    vcflist = re.split(r'\s+', vcfline.strip())
                    if len(vcflist) < 2:
                        # Blank line (e.g. at the end of the file).
                        continue
                    site = (vcflist[0].strip(), int(vcflist[1].strip()))
                    # Compare chromosome AND position: equal positions on
                    # different chromosomes are different sites.
                    if posUniq and site == lastsite:
                        continue
                    lastsite = self._insert_vcf_record(insertsql, vcflist,
                                                       poptitlelist, depthfile)
        finally:
            # Release the depth file on every exit path.
            depthfile.closedepthfile()

    def _vcf_record_depths(self, vcflist, poptitlelist, depthfile, chrom,
                           pos):
        """Return the per-sample cell values for one split VCF record."""
        formatfields = vcflist[8].split(":")
        AD_idx = formatfields.index("AD")  # gatk GT:AD:DP:GQ:PL
        depth_linelist = None  # depth row of this site, fetched at most once
        popsdata = []
        for sample_idx, sample in enumerate(vcflist[9:]):
            species_idx = depthfile.title.index("Depth_for_" +
                                                poptitlelist[sample_idx])
            samplefields = sample.split(":")
            if len(samplefields) == len(formatfields):
                popsdata.append(samplefields[AD_idx])
                continue
            # Incomplete genotype entry (./.): fall back to the depth file.
            if depth_linelist is None:
                depth_linelist = depthfile.getdepthByPos(chrom, pos)
            if int(depth_linelist[species_idx]) <= 1:
                popsdata.append('no covered')
            else:
                popsdata.append(depth_linelist[species_idx] + ",0")
        return popsdata

    def _insert_vcf_record(self, insertsql, vcflist, poptitlelist, depthfile):
        """Insert one split VCF record and return its ``(chrom, pos)``."""
        chrom = vcflist[0].strip()
        pos = int(vcflist[1].strip())
        snpID = vcflist[2].strip()
        REF = vcflist[3].strip()
        ALT = vcflist[4].strip()
        popsdata = self._vcf_record_depths(vcflist, poptitlelist, depthfile,
                                           chrom, pos)
        # The trailing (chrom, pos) feeds the "not exists" guard.
        data = (chrom, pos, snpID, REF, ALT) + tuple(popsdata) + (chrom, pos)
        print(insertsql, data)
        self.dbtools.operateDB("insert", insertsql, data=data)
        return chrom, pos
Ejemplo n.º 3
0
    def fillarchicpop(self,
                      archicpopVcfFile,
                      depthFile,
                      chromtable,
                      archicpopNameindepthFile,
                      tablename="derived_alle_ref",
                      archicpopfieldNameintable="archicpop"):
        """Fill the archaic-population column for every SNP in ``tablename``.

        Only SNPs already stored in ``tablename`` are updated; SNPs that occur
        in ``archicpopVcfFile`` but in none of the other populations' SNP sets
        are ignored.  Chromosomes are read from ``chromtable`` in pages of 20,
        ordered by ``chrlength`` descending.  For each stored SNP:

        * if the archaic VCF has a record at that position, the column is set
          to ``"<ALT>:<refdepth>,<altdepth>"`` using that record's ALT and
          depths (``DP4`` when present, otherwise the summed ``AD`` values);
        * otherwise the depth file is consulted: ``"no covered"`` for a depth
          <= 1, else ``"<ALT>:<depth>,0"`` using the ALT stored in the table.

        Args:
            archicpopVcfFile: path of the archaic population's VCF file.
            depthFile: path of the depth file; its index is expected at
                ``depthFile + ".index"``.
            chromtable: table listing the chromosomes.
            archicpopNameindepthFile: sample name used in the depth file's
                ``Depth_for_<name>`` title entry.
            tablename: table holding the SNPs to update.
            archicpopfieldNameintable: column receiving the value.
        """
        depthfile = Util.GATK_depthfile(depthFile, depthFile + ".index")
        try:
            species_idx = depthfile.title.index("Depth_for_" +
                                                archicpopNameindepthFile)
            archicpop = VCFutil.VCF_Data(archicpopVcfFile)
            totalChroms = self.dbtools.operateDB(
                "select", "select count(*) from " + chromtable)[0][0]
            for offset in range(0, totalChroms, 20):
                currentsql = ("select * from " + chromtable +
                              " order by chrlength desc limit " +
                              str(offset) + ",20")
                for row in self.dbtools.operateDB("select", currentsql):
                    currentchrID = row[0]
                    print(currentchrID + ":", end="")
                    archicrecords = archicpop.getVcfListByChrom(
                        archicpopVcfFile, currentchrID)
                    allsnpsInAchr = self.dbtools.operateDB(
                        "select", "select snp_pos,alt_base from " +
                        tablename + " where chrID='" + currentchrID + "'")
                    for snp in allsnpsInAchr:
                        snp_pos = int(snp[0])
                        record = self._find_archic_record(
                            archicrecords, snp_pos)
                        if record is not None:
                            popsdata = self._archic_record_depths(record)
                        else:
                            # Position absent from the archaic VCF: use the
                            # depth file and the ALT stored in the table.
                            depth_linelist = depthfile.getdepthByPos(
                                currentchrID, snp_pos)
                            if int(depth_linelist[species_idx]) <= 1:
                                popsdata = "no covered"
                            else:
                                popsdata = (snp[1] + ":" +
                                            depth_linelist[species_idx] +
                                            ",0")
                        self.dbtools.operateDB(
                            "update", "update " + tablename + " set " +
                            archicpopfieldNameintable + " = '" + popsdata +
                            "' where chrID='" + currentchrID +
                            "' and snp_pos=" + str(snp[0]))
        finally:
            # Release the depth file on every exit path.
            depthfile.closedepthfile()

    @staticmethod
    def _find_archic_record(records, snp_pos):
        """Binary-search ``records`` for ``snp_pos``; return the record or None.

        NOTE(review): assumes ``records`` is sorted by its first element (the
        position) -- confirm against ``getVcfListByChrom``.
        """
        low = 0
        high = len(records) - 1
        while low <= high:
            mid = (low + high) >> 1
            midpos = records[mid][0]
            if midpos < snp_pos:
                low = mid + 1
            elif midpos > snp_pos:
                high = mid - 1
            else:
                return records[mid]
        return None

    @staticmethod
    def _archic_record_depths(record):
        """Return ``"<ALT>:<refdepth>,<altdepth>"`` for one archaic VCF record."""
        _pos, _ref, ALT, INFO, FORMAT, samples = record
        refdep = 0
        altalleledep = 0
        dp4 = re.search(r"DP4=(\d*),(\d*),(\d*),(\d*)", INFO)
        if dp4 is not None:  # vcf from samtools
            refdep = int(dp4.group(1)) + int(dp4.group(2))
            altalleledep = int(dp4.group(3)) + int(dp4.group(4))
        else:
            AD_idx = FORMAT.split(":").index("AD")  # gatk GT:AD:DP:GQ:PL
            for sample in samples:
                samplefields = sample.split(":")
                if len(samplefields) == 1:  # ./.
                    continue
                AD_depth = samplefields[AD_idx].split(",")
                try:
                    refdep += int(AD_depth[0])
                    altalleledep += int(AD_depth[1])
                except ValueError:
                    # Non-numeric AD value: report the sample and go on.
                    print(sample, end="")
        return ALT + ":" + str(refdep) + "," + str(altalleledep)
Ejemplo n.º 4
0
                  help="ancenstral(a) or derived(d)")
options, args = parser.parse_args()
mindeptojudgefix = 15
#####################
# VCF_Data objects of the populations used to judge the ancestral allele,
# keyed by population name.
VCFobj = {
    "wigeon":
    VCFutil.VCF_Data(
        "/home/bioinfo/liurui/data/vcffiles/uniqmap/taihudomesticgoose/taihudomesticgoose.pool.withindel.vcf"
    ),
    "fanya":
    VCFutil.VCF_Data(
        "/home/bioinfo/liurui/data/vcffiles/uniqmap/fanya/fanya._pool.withindel.vcf"
    ),
}
# One GATK_depthfile object per population name; both read the same pooled
# depth file and index.
vcfnameKEY_depthfilename_titlenameVALUE_tojudgeancestrall = {
    # the "wigeon" entry is a temporary trick, not an error
    "wigeon":
    Util.GATK_depthfile(
        "/home/bioinfo/liurui/data/depth/g_j_sm_k_l_y_f_w_pool/gjsmklyfw_gatk.depth",
        "/home/bioinfo/liurui/data/depth/g_j_sm_k_l_y_f_w_pool/gjsmklyfw_gatk.depth.index"
    ),
    "fanya":
    Util.GATK_depthfile(
        "/home/bioinfo/liurui/data/depth/g_j_sm_k_l_y_f_w_pool/gjsmklyfw_gatk.depth",
        "/home/bioinfo/liurui/data/depth/g_j_sm_k_l_y_f_w_pool/gjsmklyfw_gatk.depth.index"
    ),
}
# [depth file path, integer] per population name.
# NOTE(review): the integer is presumably the population's column index in
# the depth file -- confirm against the code that reads this mapping.
vcfnameKEY_depthobjVALUE_tojudgeancestral = {
    "wigeon": [
        "/home/bioinfo/liurui/data/depth/g_j_sm_k_l_y_f_w_pool/gjsmklyfw_gatk.depth",
        9
    ],
    "fanya": [
        "/home/bioinfo/liurui/data/depth/g_j_sm_k_l_y_f_w_pool/gjsmklyfw_gatk.depth",
        3
    ],
}
####################################