Exemple #1
0
def getDataFrame():
    """Reshape the wide method-comparison table into a long ranking table.

    Reads ``compare.txt``, whose tab-separated columns are, in order, the
    ``our`` and ``their`` tissue lists for each of BD, CAD, Height, RA and TC
    (ten columns in total), and writes ``compareDataFrame.txt`` with one row
    per (disease, method, tissue) combination.

    The ``Rank`` written for a tissue is ``len(column) - first_index``, so the
    first entry of a column gets the largest number and the last entry gets 1.
    The tissues that are reported are the sorted entries of the ``BD``/``our``
    column.

    Raises:
        IndexError: if an input line has fewer than ten columns.
        ValueError: if a tissue of the ``BD``/``our`` column is missing from
            any other column.
    """
    in_path = 'F:\\Projects\\TEA\\GWAS_tmp\\analysis\\Rplot\\compare.txt'
    out_path = 'F:\\Projects\\TEA\\GWAS_tmp\\analysis\\Rplot\\compareDataFrame.txt'
    diseases = 'BD,CAD,Height,RA,TC'.split(',')
    methods = ('our', 'their')
    dataframe = {dis: {met: [] for met in methods} for dis in diseases}

    # Columns are laid out as <disease our>, <disease their> for each disease
    # in turn, so walking diseases x methods visits columns 0..9 in order.
    for line in FF.getLineByPath(in_path):
        arr = line.split('\t')
        col = 0
        for dis in diseases:
            for met in methods:
                dataframe[dis][met].append(arr[col])
                col += 1

    tissues = sorted(dataframe['BD']['our'])

    # The writer is opened only after the input was parsed successfully, so a
    # broken input no longer leaves a truncated output file behind.
    wt = FF.getWriter(out_path, False)
    try:
        wt.write('TraitDisease\tMethod\tTissueCellType\tRank\n')
        for dis in diseases:
            for met in methods:
                ranked = dataframe[dis][met]
                total = len(ranked)
                for tis in tissues:
                    rank = total - ranked.index(tis)
                    wt.write(dis + '\t' + met + '\t' + tis + '\t' +
                             str(rank) + '\n')
    finally:
        wt.close()
Exemple #2
0
def getNGData():
    """Convert the wide ``NGresults.txt`` matrix into a long table.

    ``overlap.txt`` supplies tab-separated name pairs: column 0 is "our"
    tissue name, column 1 the matching "their" name. ``NGresults.txt`` has a
    header row (first cell ignored, remaining cells are trait/disease names)
    followed by one row per tissue, keyed by "their" name in column 0.

    Rows whose tissue has no entry in the overlap table are skipped. For the
    remaining rows the tissue is renamed to "our" name and one output line
    ``TraitDisease<TAB>TissueCellType<TAB>Value`` is written per trait to
    ``NGdata.txt``.
    """
    overlap_path = 'F:\\Projects\\TEA\\GWAS_tmp\\analysis\\Rplot\\overlap.txt'
    out_path = 'F:\\Projects\\TEA\\GWAS_tmp\\analysis\\Rplot\\NGdata.txt'
    ng_path = 'F:\\Projects\\TEA\\GWAS_tmp\\analysis\\Rplot\\NGresults.txt'

    # their-name -> our-name. setdefault keeps the first pairing for a
    # repeated "their" name, which is what list.index() returned before.
    their_to_our = {}
    for line in FF.getLineByPath(overlap_path):
        arr = line.split('\t')
        their_to_our.setdefault(arr[1], arr[0])

    ng_iter = FF.getLineByPath(ng_path)
    diss = next(ng_iter).split('\t')[1:]
    print(diss)
    dis_tis = {dis: [] for dis in diss}
    tiss = []
    for line in ng_iter:
        arr = line.split('\t')
        if arr[0] in their_to_our:
            tiss.append(their_to_our[arr[0]])
            for i, dis in enumerate(diss):
                dis_tis[dis].append(arr[i + 1])

    wt = FF.getWriter(out_path, False)
    try:
        wt.write('TraitDisease\tTissueCellType\tValue\n')
        for dis, values in dis_tis.items():
            for i, value in enumerate(values):
                wt.write(dis + '\t' + tiss[i] + '\t' + str(value) + '\n')
    finally:
        wt.close()
def getConditionalGeneList(dir, outPath, outPathGC, pattern):
    """Collect the associated genes of every disease into two summary files.

    Every entry of ``dir`` is listed as a per-disease directory. Each file in
    it whose name contains ``pattern`` is passed to
    ``getGeneListFromKGGxlsx`` together with the cut-off that ``getFDR()``
    holds for the disease. The disease name is the part of the file name
    before the first ``-``.

    Args:
        dir: Directory containing one sub-directory per disease.
        outPath: Output file receiving ``DiseaseName<TAB>gene1,gene2,...``.
        outPathGC: Output file receiving ``DiseaseName<TAB>gene count``.
        pattern: Substring a file name must contain to be processed.

    Raises:
        KeyError: if ``getFDR()`` has no entry for a disease name.
    """
    import os
    import common.util.FileFunction as FF

    fdr = getFDR()

    # makedirs copes with nested directories; the guard avoids calling it
    # with '' when outPath has no directory component.
    out_dir = os.path.dirname(outPath)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    wt = FF.getWriter(outPath, False)
    try:
        wtgc = FF.getWriter(outPathGC, False)
        try:
            FF.gzWrite(wt, "DiseaseName\tAssociatedGenes\n", outPath)
            FF.gzWrite(wtgc, "DiseaseName\tAssociatedGenesCounts\n",
                       outPathGC)
            print(len(os.listdir(dir)))
            for disDir in os.listdir(dir):
                for file in os.listdir(os.path.join(dir, disDir)):
                    if pattern not in file:
                        continue
                    path = os.path.join(dir, disDir, file)
                    print(path)
                    disease = file.split("-")[0]
                    genelist = getGeneListFromKGGxlsx(path, fdr[disease])
                    FF.gzWrite(wt,
                               disease + "\t" + ",".join(genelist) + "\n",
                               outPath)
                    FF.gzWrite(wtgc,
                               disease + "\t" + str(len(genelist)) + "\n",
                               outPathGC)
        finally:
            # The counts writer used to be left open, so its content was
            # never guaranteed to reach the disk.
            wtgc.close()
    finally:
        wt.close()
    print("\n" + str(len(os.listdir(dir))) + " files finish!")
Exemple #4
0
def getRank(inPath, outDir):
    """Split a wide table into one two-column file per leading data column.

    The first row of ``inPath`` is the header. Only the first
    ``int((len(header) - 1) / 2)`` columns after column 0 are exported. Each
    goes to ``<outDir>/<header name without its last character>.txt`` as
    ``<column 0><TAB><column value>`` lines.

    Args:
        inPath: Table read through ``FF.getArrByPath`` (rows of cells).
        outDir: Output directory; created when it does not exist yet.
    """
    if not os.path.exists(outDir):
        os.mkdir(outDir)

    rows = FF.getArrByPath(inPath)
    header = next(rows)
    n_outputs = int((len(header) - 1) / 2)

    # One writer per exported column; the file name drops the final
    # character of the header cell.
    writers = [
        FF.getWriter(os.path.join(outDir, name[0:-1] + ".txt"), False)
        for name in header[1:n_outputs + 1]
    ]

    for row in rows:
        for col, writer in enumerate(writers, start=1):
            FF.gzWrite(writer, row[0] + "\t" + row[col] + "\n", ".txt")

    for writer in writers:
        writer.close()
def singleDiseaseDeal():
    """Write the associated-gene list of the BD data set to a summary file.

    Genes are taken from ``noCondGenes.txt`` through
    ``getGeneListFromKGGtxt`` with a fixed p-value cut-off, and written as a
    single ``BD<TAB>gene1,gene2,...`` line below a header row.

    NOTE: an earlier variant read ``BD-ECS-Cond-0.01.xlsx`` from the same
    directory through ``getGeneListFromKGGxlsx`` instead.
    """
    import os
    import common.util.FileFunction as FF

    # Backslashes are escaped explicitly: "\P", "\T", "\G" and "\B" are
    # invalid escape sequences that newer Python versions warn about. The
    # resulting path strings are unchanged.
    path = "F:\\Projects\\TEA\\GWAS_tmp\\BD\\noCondGenes.txt"
    pcut = 7.427493748198251E-4
    outPath = "F:\\Projects\\TEA\\GWAS_tmp\\BD\\noCondAssociatedGenes.txt"

    # Parse first so that an unreadable input does not truncate the output.
    genes = getGeneListFromKGGtxt(path, pcut)

    out_dir = os.path.dirname(outPath)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    wt = FF.getWriter(outPath, False)
    try:
        FF.gzWrite(wt, "DiseaseName\tAssociatedGenes\n", outPath)
        FF.gzWrite(wt, "BD\t" + ",".join(genes) + "\n", outPath)
    finally:
        wt.close()
def getKGGparameter(dir, outPath):
    """List the error rate encoded in each ``ECS-Cond`` result file name.

    Every entry of ``dir`` is listed as a directory. For each contained file
    whose name includes ``ECS-Cond``, the third ``-``-separated part of the
    name is inspected: without an underscore the rate is ``"0.05"``,
    otherwise it is the text after the first underscore with ``.xlsx``
    removed. One ``DiseaseName<TAB>ErrorRate`` line is written per file, the
    disease name being the part of the file name before the first ``-``.

    Args:
        dir: Directory containing one sub-directory per disease.
        outPath: Output file path.
    """
    import os
    import common.util.FileFunction as FF

    writer = FF.getWriter(outPath, False)
    FF.gzWrite(writer, "DiseaseName\tErrorRate\n", outPath)
    for sub_dir in os.listdir(dir):
        for fname in os.listdir(os.path.join(dir, sub_dir)):
            if "ECS-Cond" not in fname:
                continue
            dash_parts = fname.split("-")
            suffix_parts = dash_parts[2].split("_")
            if len(suffix_parts) == 1:
                error = "0.05"
            else:
                error = suffix_parts[1].replace(".xlsx", "")
            FF.gzWrite(writer, dash_parts[0] + "\t" + error + "\n", outPath)
    writer.close()
Exemple #7
0
def filterOurResult():
    """Keep only the enrichment rows whose tissue occurs in the overlap table.

    Column 0 of ``overlap.txt`` holds "our" tissue names. The header of the
    enrichment table is copied unchanged; a data row is copied when its
    second tab-separated column is one of those names.

    Input lines are assumed to arrive without their trailing newline, since
    one is appended on output (TODO confirm against ``FF.getLineByPath``).
    """
    overlap_path = 'F:\\Projects\\TEA\\GWAS_tmp\\analysis\\Rplot\\overlap.txt'
    out_path = 'F:\\Projects\\TEA\\GWAS_tmp\\analysis\\Rplot\\OurData_v3.tpm_0.01.AddCategories2.txt'
    # Fully escaped: the previous literal mixed "\\" with invalid escapes
    # such as "\P" and "\R". The resulting path string is unchanged.
    in_path = 'F:\\Projects\\TEA\\GWAS_tmp\\analysis\\Rplot\\tea.enrich_v3.tpm_0.01.forRplot.txt'

    # Only "our" names are needed here; a set gives constant-time lookups.
    our_names = {
        line.split('\t')[0] for line in FF.getLineByPath(overlap_path)
    }

    results = FF.getLineByPath(in_path)
    header = next(results)

    # Opened after the header was read, so a missing or empty input does not
    # leave an empty output file behind.
    wt = FF.getWriter(out_path, False)
    try:
        wt.write(header + '\n')
        for line in results:
            if line.split('\t')[1] in our_names:
                wt.write(line + '\n')
    finally:
        wt.close()
Exemple #8
0
def analysis(inPath, outPath):
    """Summarise each gene-list cell as ``<with id>/<without id>`` counts.

    The header row is copied as is. In every other row, column 0 is copied
    and each further cell - a comma-separated list of ``gene:pubmed``
    entries - is replaced by the number of entries with a non-empty part
    after the colon, a slash, and the number of entries with an empty one.
    Every data row ends with a trailing tab before the newline.

    Args:
        inPath: Table read through ``FF.getArrByPath`` (rows of cells).
        outPath: Output file path.

    Raises:
        IndexError: if an entry contains no ``:``.
    """
    rows = FF.getArrByPath(inPath)
    writer = FF.getWriter(outPath, False)
    FF.gzWrite(writer, "\t".join(next(rows)) + "\n", outPath)
    for row in rows:
        FF.gzWrite(writer, row[0], outPath)
        for cell in row[1:]:
            # True for entries that carry something after the colon.
            has_pubmed = [bool(entry.split(":")[1])
                          for entry in cell.split(",")]
            hit = sum(has_pubmed)
            nonhit = len(has_pubmed) - hit
            FF.gzWrite(writer, "\t" + str(hit) + "/" + str(nonhit), outPath)
        FF.gzWrite(writer, "\t\n", outPath)
    writer.close()
Exemple #9
0
 def writeLog(self, content):
     """Write ``content`` to the log file and echo it on stdout.

     When ``self.timeDate`` is truthy the message is prefixed with the
     current local time as ``YYYY-mm-dd HH:MM:SS ``. The log entry is
     written with a leading newline to the file at ``self.logPath``.

     NOTE(review): the writer is requested with ``True`` as second argument,
     presumably append mode - confirm against ``FF.getWriter``.
     """
     import time
     import common.util.FileFunction as FF

     log_writer = FF.getWriter(self.logPath, True)
     if self.timeDate:
         stamp = time.strftime("%Y-%m-%d %H:%M:%S ", time.localtime())
         content = stamp + content
     log_writer.write("\n" + content)
     print(content)
     log_writer.close()
def getGeneListFromKGGtxt(path, pcut):
    """Return the genes whose value in column 1 is below ``pcut``.

    The first line of the tab-separated file is skipped as a header. For
    every other line, column 0 is collected when ``float(column 1)`` is
    strictly smaller than ``pcut``. File order is preserved.

    Args:
        path: Tab-separated text file read through ``FF.getLineByPath``.
        pcut: Exclusive upper bound for the value in column 1.

    Returns:
        List of column-0 values of the selected lines.
    """
    import common.util.FileFunction as FF

    rows = FF.getLineByPath(path)
    next(rows)  # skip the header line
    cells = (row.split("\t") for row in rows)
    return [cols[0] for cols in cells if float(cols[1]) < pcut]
Exemple #11
0
def classTissues():
    """Append a tissue category column to the enrichment table.

    Each data row of ``tea.enrich_v3.tpm_0.01.txt`` is looked up by its
    second tab-separated column (the tissue name) in a fixed tissue-to-
    category table. Matching rows are written to
    ``tea.enrich_v3.tpm_0.01.forRplot.txt`` with the category appended.
    Rows whose tissue is not in the table are dropped. The header row gets an
    extra ``Categories`` column.

    NOTE: the same routine was previously also run on ``NGdata.txt`` to
    produce ``NGdataAddCategories2.txt`` in the same directory.
    """
    # Spelled out as lists instead of splitting string literals that embed
    # raw TAB characters, which break silently if an editor expands tabs.
    tissueClass = {
        'Brain': [
            'Brain-Amygdala',
            'Brain-Anteriorcingulatecortex(BA24)',
            'Brain-Caudate(basalganglia)',
            'Brain-CerebellarHemisphere',
            'Brain-Cerebellum',
            'Brain-Cortex',
            'Brain-FrontalCortex(BA9)',
            'Brain-Hippocampus',
            'Brain-Hypothalamus',
            'Brain-Nucleusaccumbens(basalganglia)',
            'Brain-Putamen(basalganglia)',
            'Brain-Spinalcord(cervicalc-1)',
            'Brain-Substantianigra',
        ],
        'Adipose': [
            'Adipose-Subcutaneous',
            'Adipose-Visceral(Omentum)',
        ],
        'Circulatory': [
            'Artery-Aorta',
            'Artery-Coronary',
            'Artery-Tibial',
            'Heart-AtrialAppendage',
            'Heart-LeftVentricle',
        ],
        'Reproductive (Female)': [
            'FallopianTube',
            'Cervix-Ectocervix',
            'Cervix-Endocervix',
            'Uterus',
            'Vagina',
            'Ovary',
            'Breast-MammaryTissue',
        ],
        'Digestive': [
            'MinorSalivaryGland',
            'Stomach',
            'Liver',
            'Esophagus-GastroesophagealJunction',
            'Esophagus-Mucosa',
            'Esophagus-Muscularis',
            'SmallIntestine-TerminalIleum',
            'Colon-Sigmoid',
            'Colon-Transverse',
        ],
        'Endocrine': ['AdrenalGland', 'Thyroid'],
        'Urinary': ['Bladder', 'Kidney-Cortex'],
        'Immune': ['Cells-EBV-transformedlymphocytes', 'Spleen'],
        'Connective': ['Cells-Transformedfibroblasts'],
        'Respiratory': ['Lung'],
        'Muscular': ['Muscle-Skeletal'],
        'Nerve': ['Nerve-Tibial'],
        'Reproductive (Male)': ['Prostate', 'Testis'],
        'Skin': [
            'Skin-NotSunExposed(Suprapubic)',
            'Skin-SunExposed(Lowerleg)',
        ],
    }

    # Reverse lookup tissue -> category. setdefault keeps the first category
    # in declaration order, matching the former first-match search.
    category_of = {}
    for category, tissues in tissueClass.items():
        for tis in tissues:
            category_of.setdefault(tis, category)

    # Backslashes are escaped explicitly; the previous literals relied on
    # invalid escape sequences ("\P", "\T", "\G", "\R"). The resulting path
    # strings are unchanged.
    orgpath = 'F:\\Projects\\TEA\\GWAS_tmp\\analysis\\Rplot\\tea.enrich_v3.tpm_0.01.txt'
    outpath = 'F:\\Projects\\TEA\\GWAS_tmp\\analysis\\Rplot\\tea.enrich_v3.tpm_0.01.forRplot.txt'

    line_iter = FF.getLineByPath(orgpath)
    header = next(line_iter)

    wt = FF.getWriter(outpath, False)
    try:
        FF.gzWrite(wt, header + '\tCategories\n')
        for line in line_iter:
            category = category_of.get(line.split("\t")[1])
            if category is not None:
                FF.gzWrite(wt, line + '\t' + category + '\n')
    finally:
        wt.close()
Exemple #12
0
def compareMethods():
    """Rewrite a tissue table so that every cell uses "our" tissue names.

    ``overlap.txt`` supplies tab-separated name pairs (column 0: our name,
    column 1: their name). Every tab-separated cell of ``orign.txt`` is
    written to ``out.txt`` followed by a tab: unchanged when it already is
    one of our names, translated when it is one of their names, and as an
    empty cell otherwise. The sizes of both name lists are printed first.
    """
    source_path = 'D:\\Users\\xuechao\\Desktop\\orign.txt'
    pairs_path = 'D:\\Users\\xuechao\\Desktop\\overlap.txt'
    target_path = 'D:\\Users\\xuechao\\Desktop\\out.txt'

    ours = []
    theirs = []
    for pair in FF.getLineByPath(pairs_path):
        cols = pair.split('\t')
        ours.append(cols[0])
        theirs.append(cols[1])
    print(len(ours))
    print(len(theirs))

    writer = FF.getWriter(target_path, False)
    for row in FF.getLineByPath(source_path):
        for name in row.split('\t'):
            if name in ours:
                cell = name + '\t'
            elif name in theirs:
                # Same position in both lists identifies the counterpart.
                cell = ours[theirs.index(name)] + '\t'
            else:
                cell = '\t'
            FF.gzWrite(writer, cell)
        FF.gzWrite(writer, '\n')
    writer.close()
Exemple #13
0
def mergeGenes(noPath, rankPath, outPath):
    """Compare two per-disease gene lists and report their differences.

    Both inputs are tables with a header row, a disease name in column 0 and
    a comma-separated gene list in column 1. For every pair of rows (one from
    each file) that share the disease name, two files are written:

    * ``outPath``: disease, genes only in the ``noPath`` list, genes only in
      the ``rankPath`` list (each comma-joined, original order kept).
    * ``outPath + ".count"``: disease, size of the ``noPath`` list, size of
      the ``rankPath`` list, and the sizes of the two difference lists.

    Args:
        noPath: Table of gene lists obtained without ranking.
        rankPath: Table of gene lists obtained with ranking.
        outPath: Path of the gene-difference output; the count file name is
            derived from it.
    """
    out_wt = FF.getWriter(outPath, False)
    try:
        out_wtc = FF.getWriter(outPath + ".count", False)
        try:
            # NOTE(review): writes to the count file pass ``outPath`` as the
            # third gzWrite argument, as before - confirm whether gzWrite
            # expects the path of the writer it is given.
            FF.gzWrite(out_wt, "Disease\tNoRankGenes\tRankGenes\n", outPath)
            FF.gzWrite(
                out_wtc,
                "Disease\tAllNoGenes\tAllRankGenes\tNoRankGenes\tRankGenes\n",
                outPath)

            no_iter = FF.getArrByPath(noPath)
            rank_iter = FF.getArrByPath(rankPath)
            next(no_iter)  # skip header rows
            next(rank_iter)
            no_rows = list(no_iter)

            for rank_row in rank_iter:
                disease = rank_row[0]
                for no_row in no_rows:
                    if no_row[0] != disease:
                        continue
                    rank_genes = rank_row[1].split(",")
                    no_genes = no_row[1].split(",")

                    # Sets are used for membership only; the comprehensions
                    # keep the original order and duplicates of each list.
                    rank_set = set(rank_genes)
                    no_set = set(no_genes)
                    only_no = [g for g in no_genes if g not in rank_set]
                    only_rank = [g for g in rank_genes if g not in no_set]

                    FF.gzWrite(
                        out_wt,
                        disease + "\t" + ",".join(only_no) + "\t" +
                        ",".join(only_rank) + "\n", outPath)
                    FF.gzWrite(
                        out_wtc,
                        disease + "\t" + str(len(no_genes)) + "\t" +
                        str(len(rank_genes)) + "\t" + str(len(only_no)) +
                        "\t" + str(len(only_rank)) + "\n", outPath)
        finally:
            # The count writer used to be left open, so its content was
            # never guaranteed to reach the disk.
            out_wtc.close()
    finally:
        out_wt.close()
Exemple #14
0
def getFinalResultXlsx(dir, noDir, removeNoPebmed=False):
    """Build a formatted .xls report with one sheet per disease.

    Reads ``<dir>/compareNCBI.txt`` (header row, then one row per disease:
    disease name in column 0, further columns holding comma-separated
    ``gene:pubmed`` entries) and writes
    ``<dir>/compareNCBI.stat.rmNoNCBI.print.xls``. The output name is the
    same whatever ``removeNoPebmed`` is.

    For every disease a sheet lists, per input column, the genes with their
    two p-values (looked up in the KGG workbooks found through
    ``getKGGXlsPath``) and their PubMed ids, followed by hit/non-hit counts,
    two one-sided Fisher's exact tests and an explanatory note.

    Args:
        dir: Working directory holding ``compareNCBI.txt`` and the
            ``TSBGene/DiseaseBased`` results.
        noDir: Directory searched for the ``ECS-rmHLA-Cond`` workbooks.
        removeNoPebmed: When true, genes without a PubMed id are left out of
            the sheet. They still count towards the statistics.

    Raises:
        IndexError: if a data row has fewer than two gene-list columns, as
            the statistics read four collected counts.

    NOTE(review): ``XFStyle``, ``Pattern``, ``Style``, ``Alignment``,
    ``xlrd``, ``stats``, ``os`` and ``FF`` must be provided at module level;
    ``stats`` is presumably ``scipy.stats`` - confirm.
    """
    inPath = os.path.join(dir, "compareNCBI.txt")
    outPath = os.path.join(dir, "compareNCBI.stat.rmNoNCBI.print.xls")
    # style1: aligned, grey background (section titles and statistics).
    # style2: aligned only (regular cells). style3 is created but never used.
    style1 = XFStyle()
    style2 = XFStyle()
    style3 = XFStyle()
    pattern = Pattern()
    pattern.pattern = Pattern.SOLID_PATTERN
    pattern.pattern_fore_colour = Style.colour_map['gray25']
    algn1 = Alignment()
    algn1.wrap = algn1.WRAP_AT_RIGHT
    algn1.horz = algn1.HORZ_LEFT
    algn1.vert = algn1.VERT_CENTER
    style2.alignment = algn1
    style1.alignment = algn1
    style1.pattern = pattern
    import xlwt
    wb = xlwt.Workbook()
    ncbi = FF.getArrByPath(inPath)
    head = ncbi.__next__()
    for arr in ncbi:
        rank_dir = os.path.join(dir, "TSBGene", "DiseaseBased")
        #rank_dir ="F:\Projects\TEA\GWAS_10_diseases"
        # Source workbooks for the P2 (rank) and P1 (no-rank) p-values.
        wb_rank = xlrd.open_workbook(
            getKGGXlsPath(rank_dir, arr[0], "ECS-rank-Cond"))
        wb_no = xlrd.open_workbook(
            getKGGXlsPath(noDir, arr[0], "ECS-rmHLA-Cond"))

        # Collects, per gene-list column: total entries, then entries with a
        # PubMed id -> [total_1, hit_1, total_2, hit_2, ...].
        fisher = []
        # Sheet named after the disease; True is passed as the second
        # add_sheet argument (cell_overwrite_ok in xlwt).
        ws = wb.add_sheet(arr[0], True)
        ws.col(0).width = 256 * 15
        ws.col(1).width = 256 * 15
        ws.col(2).width = 256 * 15
        ws.col(3).width = 256 * 100
        # rn is the next free row of the sheet.
        rn = 0
        ws.write(rn, 0, "Gene", style=style2)
        ws.write(rn, 1, "P1", style=style2)
        ws.write(rn, 2, "P2", style=style2)
        ws.write(rn, 3, "PubMedID", style=style2)
        rn += 1
        for i in range(len(arr) - 1):
            # Section title spanning columns 0-3 for this gene-list column.
            ws.write_merge(rn,
                           rn,
                           0,
                           3,
                           getRightColName(head[i + 1]),
                           style=style1)
            rn += 1
            pubmeds = arr[i + 1].split(",")
            fisher.append(len(pubmeds))
            hit = 0
            for pm in pubmeds:
                # gpm[0] is the gene, gpm[1] the PubMed ids (may be empty).
                gpm = pm.split(":")
                if gpm[1]:
                    hit += 1
                    ws.write(rn, 0, gpm[0], style=style2)
                    ws.write(rn, 3, gpm[1], style=style2)
                    ws.write(rn, 1, getGenePvalueFromKGGXls(gpm[0], wb_no),
                             style2)
                    ws.write(rn, 2, getGenePvalueFromKGGXls(gpm[0], wb_rank),
                             style2)
                    rn += 1
                else:
                    # Genes without a PubMed id are listed only on request.
                    if not removeNoPebmed:
                        ws.write(rn, 0, gpm[0], style=style2)
                        ws.write(rn, 1, getGenePvalueFromKGGXls(gpm[0], wb_no),
                                 style2)
                        ws.write(rn, 2,
                                 getGenePvalueFromKGGXls(gpm[0],
                                                         wb_rank), style2)
                        rn += 1
            fisher.append(hit)
        # Turn the totals into non-hit counts, giving the 2x2 table
        # [[non-hit_1, hit_1], [non-hit_2, hit_2]]. Only the first two
        # gene-list columns take part in the test.
        fisher[0] = fisher[0] - fisher[1]
        fisher[2] = fisher[2] - fisher[3]
        # Index [1] of the fisher_exact result is used as the p-value.
        no_bigger_rank = stats.fisher_exact([fisher[0:2], fisher[2:4]],
                                            alternative="less")[1]
        no_smaller_rank = stats.fisher_exact([fisher[0:2], fisher[2:4]],
                                             alternative="greater")[1]
        # Statistics block: two merged rows across columns 0-3.
        ws.write_merge(
            rn,
            rn + 1,
            0,
            3,
            "STATISTIC (hit counts/non-hit counts):  By p-value ranking:" +
            str(fisher[1]) + "/" + str(fisher[0]) +
            "; By selective expression ranking:" + str(fisher[3]) + "/" +
            str(fisher[2]) + "\n" +
            "Fisher's exact test: P(H1=p-value>selective expression)=" +
            str(no_bigger_rank) + "; P(H1=selective expression>p-value)=" +
            str(no_smaller_rank),
            style=style1)
        rn += 2
        # Explanatory note: three merged rows across columns 0-3.
        ws.write_merge(
            rn,
            rn + 2,
            0,
            3,
            "Note: P1: This is a conditional gene-based association p-value according "
            "to statistical significance order. \nP2: This is a conditional gene-based "
            "association p-value according to tissue-specific pathogenic potential. "
            "\nThe papers co-mentioning the gene and diseases/traits in the titles or "
            "abstracts in PubMed database were searched by the API function.",
            style=style2)
    wb.save(outPath)