def MAMA2Forest(gse_gpl_list, metaDir, exprDir, clinicalDir, clinicalTag,
                pCut, esCut, thread=4, gplMapFile='', outlierReplace=False,
                runningR=True, log=False):
    """Run :func:`forest` once for every (p, es) cutoff combination.

    For each pair the gene identifiers are read from the first column of
    ``<metaDir>/<p>_<es>/pVal_es.tsv`` (the header row is skipped).  When
    ``gplMapFile`` is given, identifiers are translated through that JSON
    mapping and the translated list is also saved as
    ``geneListMapped.json`` in the output directory.  The resulting jobs are
    executed with a ``multiprocessing.Pool``.

    Args:
        gse_gpl_list: Dataset descriptors, passed through to ``forest``.
        metaDir: Directory holding the ``<p>_<es>`` result folders.
        exprDir: Expression table directory, passed through to ``forest``.
        clinicalDir: Clinical table directory, passed through to ``forest``.
        clinicalTag: Passed through to ``forest``.
        pCut: Iterable of p-value cutoffs (used to build folder names).
        esCut: Iterable of effect-size cutoffs (used to build folder names).
        thread: Number of worker processes in the pool.
        gplMapFile: Optional path of a JSON object mapping an identifier to
            a gene name; an empty value disables the translation.
        outlierReplace: Passed through to ``forest``.
        runningR: Passed through to ``forest``.
        log: Passed through to ``forest``.
    """
    annot = None
    if gplMapFile:
        # The mapping is independent of the cutoffs: read it once instead of
        # once per (p, es) combination.
        with open(gplMapFile, 'r') as rf:
            annot = json.load(rf)

    paraList = list()
    for p in pCut:
        for es in esCut:
            outputDir = '{}/{}_{}/forest'.format(metaDir, p, es)
            mkdir(outputDir)
            with open('{}/{}_{}/pVal_es.tsv'.format(metaDir, p, es),
                      'r') as rf:
                lines = rf.readlines()[1:]
            geneList = [line.split('\t')[0] for line in lines]

            if gplMapFile:
                # A dict is used as an insertion-ordered set so that the
                # duplicate check is O(1) while first-seen order is kept.
                mappedGenes = dict()
                for gene in geneList:
                    if gene not in annot:
                        continue
                    mapped = annot[gene]
                    # Entries containing '///' list several genes for one
                    # identifier; they are ambiguous and left out.
                    if '///' in mapped:
                        continue
                    mappedGenes[mapped] = None
                geneList = list(mappedGenes)
                with open('{}/geneListMapped.json'.format(outputDir),
                          'w') as wf:
                    json.dump(geneList, wf, indent=2)

            paraList.append(
                (geneList, gse_gpl_list, exprDir, clinicalDir, outputDir,
                 clinicalTag, outlierReplace, runningR, log))

    with multiprocessing.Pool(processes=thread) as pool:
        pool.starmap(forest, paraList)
def mDEDS2Forest(gse_gpl_list, metaDir, exprDir, clinicalDir, clinicalTag,
                 outlierReplace=False, runningR=True, log=False):
    """Draw forest plots for the symbols listed in ``__meta_results.csv``.

    The ``symbol`` column of ``<metaDir>/__meta_results.csv`` is read with
    pandas.  Missing values (rendered as ``'nan'``) and symbols containing
    ``'/'`` are dropped, duplicates are removed keeping the first
    occurrence, and the remaining list is passed to :func:`forest` with
    ``<metaDir>/forest`` as output directory.
    """
    table = pd.read_csv('{}/__meta_results.csv'.format(metaDir))
    names = [str(value) for value in list(table['symbol'])]

    # dict.fromkeys keeps first-seen order while discarding repeats.
    uniqueNames = dict.fromkeys(
        name for name in names if name != 'nan' and '/' not in name)
    geneList = list(uniqueNames)

    outDir = '{}/forest'.format(metaDir)
    mkdir(outDir)
    forest(geneList, gse_gpl_list, exprDir, clinicalDir, outDir, clinicalTag,
           outlierReplace, runningR, log)
def forest(geneList, gse_gplList, exprDir, clinicalDir, outDir, clinicalTag,
           outlierReplace, runningR=True, log=False):
    """Summarise each gene per dataset and draw forest plots with R.

    For every dataset the clinical table ``<clinicalDir>/<gse>_<gpl>_clinical.csv``
    (sample id in the first column, group label ``'0'``/``'1'`` in the
    second) and the expression table ``<exprDir>/<gse>_<gpl>_expr.tsv`` are
    read.  For each requested gene the sample count, mean and standard
    deviation of both groups are collected.  Genes seen in more than one
    dataset get a CSV in ``<outDir>/tmp``; an R script ``<outDir>/forest.R``
    using ``meta::metacont`` is written and, if ``runningR`` is true,
    executed through ``Rscript``.

    Args:
        geneList: Gene identifiers; entries containing ``'/'`` are ignored.
        gse_gplList: Sequence of items whose first two elements are the GSE
            and GPL ids used to build the file names.
        exprDir: Directory with the ``*_expr.tsv`` files.
        clinicalDir: Directory with the ``*_clinical.csv`` files.
        outDir: Output directory for the CSVs, the R script and the plots.
        clinicalTag: Unused here; kept for interface compatibility.
        outlierReplace: If true, rescale each expression row around its
            median and clip values beyond 1.5 IQR before summarising.
        runningR: If true, run the generated script with ``Rscript``.
        log: If true, report progress of the R run through ``logging``.
    """
    geneList = [gene for gene in geneList if '/' not in gene]
    data = {gene: dict() for gene in geneList}

    for g in gse_gplList:
        gse = '{}_{}'.format(g[0], g[1])

        with open('{}/{}_clinical.csv'.format(clinicalDir, gse), 'r') as rf:
            lines = rf.readlines()[1:]
        rows = [[item.strip(' \n\t"\'') for item in line.strip().split(',')]
                for line in lines if len(line) > 3]
        clinic = {row[0]: row[1] for row in rows}
        control = [gsm.upper().split('.')[0]
                   for gsm, label in clinic.items() if label == '0']
        treat = [gsm.upper().split('.')[0]
                 for gsm, label in clinic.items() if label == '1']

        with open('{}/{}_expr.tsv'.format(exprDir, gse), 'r') as rf:
            lines = rf.readlines()
        gsmList = [gsm.strip(' \t\n\'"').upper().split('.')[0]
                   for gsm in lines[0].strip().split('\t')]

        # statistics.stdev needs at least two values per group; a dataset
        # that cannot provide them is skipped instead of aborting the run.
        measured = set(gsmList)
        nControl = sum(1 for gsm in control if gsm in measured)
        nTreat = sum(1 for gsm in treat if gsm in measured)
        if nControl < 2 or nTreat < 2:
            logging.warning(
                '%s skipped: fewer than two control or treated samples '
                'with expression data.', gse)
            continue

        for line in lines[1:]:
            if len(line) <= 3:
                continue
            fields = [item.strip('\t\n \'"') for item in line.split('\t')]
            gene = fields[0]
            # ``data`` has exactly the requested genes as keys: O(1) lookup.
            if gene not in data:
                continue

            expr = [float(i) for i in fields[1:]]
            if outlierReplace:
                q1, median, q3 = np.percentile(expr, [25, 50, 75])
                n = 1.5
                lower = median - n * (q3 - q1)
                upper = median + n * (q3 - q1)
                scaled = list()
                for e in expr:
                    # NOTE(review): outliers are clipped to -1/1 while
                    # in-range values are scaled into [-n, n]; confirm that
                    # this asymmetry is intended.
                    if e < lower:
                        e = -1
                    elif e > upper:
                        e = 1
                    else:
                        e = (e - median) / (upper - median) * n
                    scaled.append(e)
                expr = scaled

            expr = dict(zip(gsmList, expr))
            controlExpr = [float(expr[gsm]) for gsm in control if gsm in expr]
            treatExpr = [float(expr[gsm]) for gsm in treat if gsm in expr]

            # The sample counts must describe the values the mean and the
            # standard deviation were computed from, not the clinical table.
            data[gene][gse] = {
                'NControl': len(controlExpr),
                'MeanControl': statistics.mean(controlExpr),
                'StdControl': statistics.stdev(controlExpr),
                'NTreat': len(treatExpr),
                'MeanTreat': statistics.mean(treatExpr),
                'StdTreat': statistics.stdev(treatExpr),
            }

    mkdir('{}/tmp'.format(outDir))
    rGeneList = list()
    for i, gene in enumerate(geneList):
        studies = data[gene]
        # A forest plot is only drawn for genes seen in several datasets.
        if len(studies) <= 1:
            continue
        body = '\n'.join(
            '{},{},{},{},{},{},{}'.format(
                gse, study['NTreat'], study['MeanTreat'], study['StdTreat'],
                study['NControl'], study['MeanControl'], study['StdControl'])
            for gse, study in studies.items())
        with open('{}/tmp/{}_{}.csv'.format(outDir, i, gene), 'w') as wf:
            wf.write('study,n1,mean1,sd1,n2,mean2,sd2\n' + body)
        rGeneList.append('{}_{}'.format(i, gene))

    rScript = ''.join([
        'library(parallel)\n',
        'f <- function(gene) {\n library(meta)\n',
        ' a <- read.table(paste("{}/tmp/", gene, ".csv", sep=""), sep=",", header=T, stringsAsFactors=FALSE)\n'.format(
            outDir),
        ' metarsmd=metacont(n1,mean1,sd1,n2,mean2,sd2,data=a,sm="SMD",label.c="Non-metastasis",label.e="metastasis",comb.fixed=FALSE,comb.random=TRUE,studlab=study)\n',
        ' png(file=paste("{}/", gene, ".png", sep=""), width=12, height=6, units="in", res=600, bg="transparent")\n'.format(
            outDir),
        ' forest(metarsmd)\n dev.off()\n',
        ' c(gene, metarsmd$TE.random, metarsmd$lower.random, metarsmd$upper.random, metarsmd$k)\n}\n\n',
        'genes <- c("{}")\n'.format('", "'.join(rGeneList)),
        'detectCores(logical = F)\ncl <- makeCluster(getOption("cl.cores", 4))\n',
        'ci <- parLapply(cl, genes, f)\nstopCluster(cl)\n',
        'write.table(ci, file="{}/ci.csv", sep="\t", col.names=FALSE, row.names=FALSE, quote=FALSE)'.format(
            outDir),
    ])
    with open('{}/forest.R'.format(outDir), 'w') as wf:
        print(rScript, file=wf, end='')

    if runningR:
        if log:
            logging.info('Begin to calculate expression by R.')
        cmd = 'Rscript {}/forest.R > {}/forest.R.log'.format(outDir, outDir)
        r = os.system(cmd)
        if log:
            if r == 0:
                logging.info('Forest plot Successfully.')
            else:
                logging.error('Forest plot Failed.')
def mDEDS2Heatmap(gse_gpl_list, metaDir, exprDir, clinicalDir, clinicalTag,
                  differentGeneCount, stratification, ramdomTimes,
                  heatmapSampleCount, outlierReplace=True, runningR=True,
                  log=False):
    """Write heat-map input tables and an R script for every dataset.

    ``<metaDir>/geneOrder.tsv`` and ``<metaDir>/mDEDS_<max count>.csv`` are
    parsed into a gene -> fold-change table.  For each dataset, each entry
    of ``heatmapSampleCount`` and each of the ``ramdomTimes`` sample draws
    returned by ``samplingGse``, an expression table and a group table are
    written via ``heatMapExpr`` / ``heatMapGroup`` and matching plotting
    commands are appended to ``<metaDir>/heatmap.R``.  The script is run
    with ``Rscript`` when ``runningR`` is true.
    """
    orderToGene = dict()
    with open('{}/geneOrder.tsv'.format(metaDir), 'r') as rf:
        for row in rf:
            cells = row.split('\t')
            if len(cells) > 1:
                cells = [cell.strip('"\' \n') for cell in cells]
                orderToGene[cells[0]] = cells[1]

    with open('{}/mDEDS_{}.csv'.format(metaDir,
                                       max(differentGeneCount))) as rf:
        lines = rf.readlines()
    header = [cell.strip('"\'\n \t').lower() for cell in lines[0].split(',')]
    geneCol = header.index('geneorder')
    fcCol = header.index('fc')
    foldChange = dict()
    for row in lines[1:]:
        cells = [cell.strip('"\n\t \'') for cell in row.split(',')]
        if len(cells) > fcCol:
            foldChange[orderToGene[cells[geneCol]]] = float(cells[fcCol])

    # Genes ordered by decreasing fold change.
    selectedProbe = sorted(foldChange, key=foldChange.get, reverse=True)
    # NOTE(review): the ranking above is immediately replaced by this fixed
    # gene list, so the parsed fold changes never reach the heat maps.
    # Confirm whether the override is intentional.
    selectedProbe = [
        "KRT80", "SAPCD2", "FHOD1", "CCNB2", "ANLN", "COL4A1",
        "LOC100506119", "CDKN3", "IER5L", "MMP11", "FAM83D", "LOC286052",
        "AEBP1", "PCDH17", "SLITRK5", "BUB1B", "CCNB1", "SLC7A5", "DONSON",
        "SUGCT", "ST8SIA6-AS1", "AVEN", "PCAT6", "KIF14", "MEX3A", "ACKR1",
        "P2RY12", "WDR78", "MDH1B", "RP11-53O19.3", "C1orf21", "AGBL2",
        "RLN2", "CCDC176", "HAUS1", "DYNLRB2", "MED13L", "FCGBP", "KIAA1551",
        "UBXN10", "SUSD3", "PNRC2", "CASC1", "FAM120AOS", "CCDC170", "STC2",
        "CCR6", "FAM161B", "RP11-28F1.2", "LINC00472"
    ]

    distances = ('correlation', 'euclidean', 'maximum', 'manhattan',
                 'canberra', 'binary', 'minkowski')
    script = ['library(gplots)\nlibrary(pheatmap)\nlibrary(RColorBrewer)\n\n']

    for dataset in gse_gpl_list:
        gse, gpl = dataset[0], dataset[1]
        datasetDir = '{}/{}_{}'.format(metaDir, gse, gpl)
        mkdir(datasetDir)
        expr = exprGSE(gse, gpl, exprDir)

        for count in heatmapSampleCount:
            gsmGroupList = samplingGse(gse, gpl, count, clinicalDir,
                                       stratification=stratification,
                                       times=ramdomTimes)
            selectedSamples = [group.keys() for group in gsmGroupList]

            for i in range(ramdomTimes):
                if count < len(selectedProbe):
                    # Keep the first and the last count/2 probes of the list.
                    half = count / 2
                    probes = selectedProbe[0:int(half)]
                    probes.extend(
                        selectedProbe[int(len(selectedProbe) - half):])
                else:
                    probes = selectedProbe

                exprPath = '{}/{}_{}_expr.tsv'.format(datasetDir, count, i)
                groupPath = '{}/{}_{}_group.tsv'.format(datasetDir, count, i)
                heatMapExpr(expr, gse, gpl, selectedSamples[i], probes,
                            exprPath, outlierReplace)
                heatMapGroup(gse, gpl, gsmGroupList[i], groupPath)

                script.append(
                    '\nexpr <- read.table("{}")\n'.format(exprPath))
                script.append('x <- as.matrix(expr)\n')
                script.append(
                    'png(file="{}/1_{}_{}.png", width=1024, height=1024, bg="transparent")\n'
                    .format(datasetDir, count, i))
                script.append(
                    'pheatmap(x, cutree_rows=2, cutree_cols=2, color=greenred(75), border_color=NA)\n')
                script.append('dev.off()\n')
                script.append(
                    'png(file="{}/2_{}_{}.png", width=1024, height=1024, bg="transparent")\n'
                    .format(datasetDir, count, i))
                script.append(
                    'heatmap.2(x, col=greenred, scale="row", trace="none")\n')
                script.append('dev.off()\n')
                script.append(
                    '\nclinic <- read.table("{}", head=T, row.names=1)\n'
                    .format(groupPath))
                script.append('c <- clinic\n')
                script.append('annotation_c <- data.frame(c)\n')
                script.append('rownames(annotation_c) <- colnames(x)\n')
                for distance in distances:
                    script.append(
                        'png(file="{}/3_{}_{}_{}.png", width=1024, height=1024, bg="transparent")\n'
                        .format(datasetDir, count, i, distance))
                    script.append(
                        'pheatmap(as.matrix(x), annotation_col=annotation_c, color=bluered(200), border_color=NA, cutree_rows=2, cutree_cols=2, clustering_distance_cols="{}", scale="column")\n'
                        .format(distance))
                    script.append('dev.off()\n')

    rScript = ''.join(script)
    with open('{}/heatmap.R'.format(metaDir), 'w') as wf:
        print(rScript, file=wf)

    if not runningR:
        return
    if log:
        logging.info('Begin to calculate expression by R.')
    cmd = 'Rscript {}/heatmap.R > {}/heatmap.R.log'.format(metaDir, metaDir)
    r = os.system(cmd)
    if log:
        if r == 0:
            logging.info('Calculate expression Successfully.')
        else:
            logging.error('Calculate expression Failed.')
def MAMA2Heatmap(gse_gpl_list, metaDir, exprDir, clinicalDir, clinicalTag,
                 pCut, esCut, stratification, ramdomTimes, heatmapSampleCount,
                 outlierReplace=True, runningR=True, log=False):
    """Select probes per (p, es) cutoff and prepare heat maps for them.

    The ``c_pval`` column of ``<metaDir>/p_value`` and the ``zSco`` column
    of ``<metaDir>/es`` are read.  Probes present in both are written to
    ``<metaDir>/pVal_es.tsv``.  For every cutoff pair, probes with
    ``p < pCut`` and ``|es| > esCut`` are written, sorted by decreasing
    effect size, to ``<metaDir>/<p>_<es>/pVal_es.tsv``.  For every dataset,
    sample count and random draw, an expression table and a group table are
    produced through ``heatMapExpr`` / ``heatMapGroup`` and plotting
    commands are added to ``<metaDir>/heatmap.R``, which is run with
    ``Rscript`` when ``runningR`` is true.

    Args:
        gse_gpl_list: Sequence of items whose first two elements are the
            GSE and GPL ids.
        metaDir: Directory with the ``p_value`` and ``es`` tables; all
            output is written below it.
        exprDir: Passed to ``exprGSE``.
        clinicalDir: Passed to ``samplingGse``.
        clinicalTag: Unused here; kept for interface compatibility.
        pCut: Iterable of p-value cutoffs.
        esCut: Iterable of absolute effect-size cutoffs.
        stratification: Passed to ``samplingGse``.
        ramdomTimes: Number of sample draws per sample count.
        heatmapSampleCount: Iterable of sample counts.
        outlierReplace: Passed to ``heatMapExpr``.
        runningR: If true, run the generated script with ``Rscript``.
        log: If true, report progress of the R run through ``logging``.
    """

    def readColumn(path, columnName):
        """Return {first field of the row: float value of ``columnName``}."""
        with open(path, 'r') as rf:
            lines = rf.readlines()
        title = [i.strip('"\'\n \t') for i in lines[0].split('\t')]
        # Data fields are shifted by one against the header; presumably the
        # header row has no entry for the row-id column -- TODO confirm.
        col_id = title.index(columnName) + 1
        values = dict()
        for line in lines[1:]:
            s = [i.strip('"\n\t \'') for i in line.split('\t')]
            if len(s) > col_id:
                values[s[0]] = float(s[col_id])
        return values

    def formatRows(probes):
        """Render one ``probe<TAB>p<TAB>es`` line per probe."""
        return '\n'.join(
            '{}\t{}\t{}'.format(probe, pVal[probe], es[probe])
            for probe in probes)

    pVal = readColumn('{}/p_value'.format(metaDir), 'c_pval')
    es = readColumn('{}/es'.format(metaDir), 'zSco')
    shared = [probe for probe in pVal if probe in es]

    # The header and every row end with a newline separator, matching the
    # layout of the per-cutoff pVal_es.tsv files written below.
    with open('{}/pVal_es.tsv'.format(metaDir), 'w') as wf:
        wf.write('\tp\tes\n' + formatRows(shared))

    script = ['library(gplots)\nlibrary(pheatmap)\nlibrary(RColorBrewer)\n\n']
    for p in pCut:
        for e in esCut:
            cutDir = '{}/{}_{}'.format(metaDir, p, e)
            mkdir(cutDir)

            hits = {probe: es[probe] for probe in shared
                    if pVal[probe] < p and abs(es[probe]) > e}
            selectedProbe = sorted(hits, key=hits.get, reverse=True)
            with open('{}/pVal_es.tsv'.format(cutDir), 'w') as wf:
                wf.write('\tp\tes\n' + formatRows(selectedProbe))

            for dataset in gse_gpl_list:
                gse, gpl = dataset[0], dataset[1]
                datasetDir = '{}/{}_{}'.format(cutDir, gse, gpl)
                mkdir(datasetDir)
                expr = exprGSE(gse, gpl, exprDir)

                for count in heatmapSampleCount:
                    gsmGroupList = samplingGse(
                        gse, gpl, count, clinicalDir,
                        stratification=stratification, times=ramdomTimes)
                    selectedSamples = [d.keys() for d in gsmGroupList]

                    for i in range(ramdomTimes):
                        if count < len(selectedProbe):
                            # Keep the count/2 strongest probes from each
                            # end of the effect-size ranking.
                            half = count / 2
                            probes = selectedProbe[0:int(half)]
                            probes.extend(selectedProbe[
                                int(len(selectedProbe) - half):])
                        else:
                            probes = selectedProbe

                        exprPath = '{}/{}_{}_expr.tsv'.format(
                            datasetDir, count, i)
                        groupPath = '{}/{}_{}_group.tsv'.format(
                            datasetDir, count, i)
                        heatMapExpr(expr, gse, gpl, selectedSamples[i],
                                    probes, exprPath, outlierReplace)
                        heatMapGroup(gse, gpl, gsmGroupList[i], groupPath)

                        script.append(
                            '\nexpr <- read.table("{}")\n'.format(exprPath))
                        script.append('x <- as.matrix(expr)\n')
                        script.append(
                            'png(file="{}/1_{}_{}.png", width=1024, height=1024, bg="transparent")\n'
                            .format(datasetDir, count, i))
                        script.append(
                            'pheatmap(x, cutree_rows=2, cutree_cols=2, color=greenred(75), border_color=NA)\n')
                        script.append('dev.off()\n')
                        script.append(
                            'png(file="{}/2_{}_{}.png", width=1024, height=1024, bg="transparent")\n'
                            .format(datasetDir, count, i))
                        script.append(
                            'heatmap.2(x, col=greenred, scale="row", trace="none")\n')
                        script.append('dev.off()\n')

    rScript = ''.join(script)
    with open('{}/heatmap.R'.format(metaDir), 'w') as wf:
        print(rScript, file=wf)

    if runningR:
        if log:
            logging.info('Begin to calculate expression by R.')
        cmd = 'Rscript {}/heatmap.R > {}/heatmap.R.log'.format(
            metaDir, metaDir)
        r = os.system(cmd)
        if log:
            if r == 0:
                logging.info('Calculate expression Successfully.')
            else:
                logging.error('Calculate expression Failed.')
def getExprTable(gse, gpl, exprDir, gsmList=None, matrixOnly=False, log=True,
                 maxTrial=4, downloadMethod='wget', runningR=True,
                 rScriptDir=None, gseRawDir=None, celDir=None, matrixDir=None):
    """Create ``<exprDir>/<gse>_<gpl>_expr.tsv`` for one dataset.

    With ``matrixOnly`` the series matrix obtained through ``getGseMatrix``
    is converted with ``matrix2expr`` and written as a tab-separated table
    (header: sorted sample ids; rows: sorted identifiers).  Otherwise the
    raw archive obtained through ``getGseRaw`` is unpacked, each file is
    renamed to ``<GSM>.CEL``, and an R script running ``affy::rma`` is
    written to ``rScriptDir`` and, if ``runningR`` is true, executed.

    Directories that are not supplied are created inside a temporary
    directory which is removed when the function returns.

    Args:
        gse: GSE id of the dataset.
        gpl: GPL id of the dataset.
        exprDir: Directory receiving the expression table.
        gsmList: Optional sample ids to keep (compared case-insensitively);
            a falsy value keeps all samples.
        matrixOnly: Use the series matrix instead of the raw CEL files.
        log: If true, report progress through ``logging``.
        maxTrial: Passed to the download helper.
        downloadMethod: Passed to the download helper.
        runningR: If true, run the generated R script (CEL path only).
        rScriptDir: Directory for the R script; defaults to
            ``<exprDir>/rScript``.
        gseRawDir: Directory for the raw archive.
        celDir: Parent directory for the unpacked CEL files.
        matrixDir: Directory for the series matrix.

    Returns:
        ``(True, gse, gpl, path)`` when the table was produced, otherwise
        ``(False, gse, gpl, '')`` -- including the case where the R script
        was only written because ``runningR`` is false.
    """
    exprFile = '{}/{}_{}_expr.tsv'.format(exprDir, gse, gpl)
    failure = (False, gse, gpl, '')
    # Sample ids taken from the matrix and from CEL file names are
    # upper-cased, so the filter is normalised the same way for both paths.
    wanted = {gsm.upper() for gsm in gsmList} if gsmList else None

    with tempfile.TemporaryDirectory() as tmpDir:
        if matrixOnly:
            if not matrixDir:
                matrixDir = '{}/matrix'.format(tmpDir)
                mkdir(matrixDir)
            r = getGseMatrix(gse, gpl, matrixDir, log, maxTrial,
                             downloadMethod)
            # r[0] is used as the success flag, r[3] as the file name.
            if not r[0]:
                return failure

            matrixFile = extractGzipFile('{}/{}'.format(matrixDir, r[3]))
            expr = matrix2expr(matrixFile)
            if wanted is not None:
                expr = {
                    oligo: {sample: value
                            for sample, value in expr[oligo].items()
                            if sample in wanted}
                    for oligo in expr
                }
            if not expr:
                # Nothing to derive the sample columns from.
                if log:
                    logging.error('Calculate expression Failed.')
                return failure

            columns = sorted(next(iter(expr.values())))
            rows = [
                '{}\t{}'.format(
                    oligo,
                    '\t'.join([expr[oligo][sample] for sample in columns]))
                for oligo in sorted(expr.keys())
            ]
            with open(exprFile, 'w') as wf:
                print('\t'.join(columns) + '\n' + '\n'.join(rows),
                      end='', file=wf)
            return True, gse, gpl, exprFile

        if not gseRawDir:
            gseRawDir = '{}/gseRaw'.format(tmpDir)
            mkdir(gseRawDir)
        if not celDir:
            celDir = '{}/cel'.format(tmpDir)
            mkdir(celDir)
        celDir = '{}/{}_{}'.format(celDir, gse, gpl)
        mkdir(celDir)

        r = getGseRaw(gse, gseRawDir, log, maxTrial, downloadMethod)
        # r[0] is used as the success flag, r[2] as the archive name.
        if not r[0]:
            return failure

        # NOTE(review): the archive is downloaded data and extractall()
        # trusts member paths; consider an extraction filter where the
        # running Python version supports it.
        with tarfile.open('{}/{}'.format(gseRawDir, r[2]), 'r') as tf:
            tf.extractall(celDir)

        for name in os.listdir(celDir):
            gsm = name.split('.')[0].split('_')[0].split('-')[0].upper()
            if wanted is not None and gsm not in wanted:
                os.remove('{}/{}'.format(celDir, name))
                continue
            extracted = extractGzipFile('{}/{}'.format(celDir, name), rm=True)
            os.rename(extracted, '{}/{}.CEL'.format(celDir, gsm))

        if not rScriptDir:
            rScriptDir = '{}/rScript'.format(exprDir)
            mkdir(rScriptDir)

        R = ''.join([
            'rm(list = ls())\nlibrary("affyPLM")\nlibrary("affy")\n\n',
            'memory.limit(16000)\n\n',
            'setwd("{}")\n'.format(celDir),
            'raw.set <- ReadAffy()\nrma.data <- rma(raw.set)\n',
            '%s <- exprs(rma.data)\n' % gse,
            'write.table({}, "{}/{}_{}_expr.tsv", sep="\\t")\n'.format(
                gse, exprDir, gse, gpl),
            'rm(raw.set, rma.data)\ngc()\n\n',
        ])
        with open('{}/{}_{}_expr.R'.format(rScriptDir, gse, gpl), 'w') as wf:
            print(R, file=wf)

        if not runningR:
            # The script was only generated, so no table exists yet.
            return failure

        if log:
            logging.info('Begin to calculate expression by R.')
        cmd = 'Rscript {}/{}_{}_expr.R > {}/{}_{}_expr.R.log'.format(
            rScriptDir, gse, gpl, rScriptDir, gse, gpl)
        if os.system(cmd) == 0:
            if log:
                logging.info('Calculate expression Successfully.')
            return True, gse, gpl, exprFile
        if log:
            logging.error('Calculate expression Failed.')
        return failure