def meta2Heatmap(settingFile):
    """Draw the heatmap for the configured meta-analysis method.

    Loads the settings through ``config`` and forwards the relevant values
    to ``MAMA2Heatmap`` or ``mDEDS2Heatmap`` depending on ``'meta method'``.

    Args:
        settingFile: Value handed to ``config`` to obtain the settings
            mapping.

    Returns:
        Whatever the selected helper returns, or ``None`` when
        ``'meta method'`` is neither ``'MAMA'`` nor ``'mDEDS'``.
    """
    settings = config(settingFile)
    metaDir = settings['meta directory']

    # Gene-level tables are used when the meta-analysis ran on genes,
    # otherwise the probe-level tables are used.
    exprDirKey = ('gene expression table directory'
                  if settings['mapping to gene in meta']
                  else 'expression table directory')
    exprDir = settings[exprDirKey]

    clinicalDir = settings['simplified clinical data directory']
    clinicalTag = settings['clinical phenotype']
    pCut = settings['p-value cutoff']
    esCut = settings['effect size cutoff']

    # Only datasets flagged for the meta-analysis take part.
    gse_gpl_list = []
    for dataset in settings['datasets information']:
        if dataset['meta']:
            gse_gpl_list.append((dataset['gse'], dataset['gpl']))

    randomTimes = settings['randomized times']
    heatmapSampleCount = settings['max sample in heatmap']
    outlierReplace = settings['outliers replace']
    stratification = settings['randomized sampling']
    runningR = settings['run R in python']
    differentGeneCount = settings['max different expression gene included']
    log = settings['log']

    method = settings['meta method']
    if method == 'MAMA':
        return MAMA2Heatmap(
            gse_gpl_list, metaDir, exprDir, clinicalDir, clinicalTag,
            pCut, esCut, stratification, randomTimes, heatmapSampleCount,
            outlierReplace, runningR, log)
    if method == 'mDEDS':
        return mDEDS2Heatmap(
            gse_gpl_list, metaDir, exprDir, clinicalDir, clinicalTag,
            differentGeneCount, stratification, randomTimes,
            heatmapSampleCount, outlierReplace, runningR, log)
    return None
def geneExpr(settingFile, probeExprDone=False):
    """Build gene-level expression tables from the probe-level ones.

    Unless ``probeExprDone`` is true, ``probeExpr`` is run first. When the
    ``'probe mapping to gene'`` setting is enabled, the annotation tables of
    every platform are fetched, converted to probe maps with ``annot2Map``
    and ``probeExpr2geneExpr`` is run for each dataset in a process pool.

    Args:
        settingFile: Value handed to ``config`` (and to ``probeExpr``).
        probeExprDone: Set to true to skip the ``probeExpr`` step.

    Returns:
        None.
    """
    settings = config(settingFile)
    if not probeExprDone:
        probeExpr(settingFile)

    if not settings['probe mapping to gene']:
        return

    # One annotation table per distinct platform.
    gplList = {dataset['gpl'] for dataset in settings['datasets information']}
    getGplAnnotTableList(
        gplList,
        directory=settings['probe annotation directory'],
        log=settings['log'],
        maxTrial=settings['max trail in downloading files'],
        thread=settings['thread'])
    for gpl in gplList:
        annot2Map(
            gpl,
            settings['probe annotation directory'],
            settings['probe mapping to gene keyword'],
            settings['log'])

    # One argument tuple per dataset for probeExpr2geneExpr.
    tasks = []
    for dataset in settings['datasets information']:
        tasks.append((
            dataset['gse'],
            dataset['gpl'],
            settings['expression table directory'],
            settings['probe annotation directory'],
            settings['gene expression table directory'],
            settings['probe mapping to gene keyword'],
            settings['probe mapping to gene method'],
            settings['multiple mapped probe'],
        ))

    with multiprocessing.Pool(processes=settings['thread']) as pool:
        pool.starmap(probeExpr2geneExpr, tasks)
def probeExpr(settingFile):
    """Prepare clinical data and fetch the probe-level expression tables.

    ``extractClinical`` supplies a sample list per ``(gse, gpl)`` pair, which
    is stored on each dataset entry under ``'gsmList'``; a missing
    ``'matrixOnly'`` flag defaults to ``False``. The enriched settings are
    dumped to ``probeExpr.config.json`` in the expression table directory
    before ``getExprTables`` is called.

    Args:
        settingFile: Value handed to ``config`` to obtain the settings
            mapping.

    Returns:
        The return value of ``getExprTables``.
    """
    settings = config(settingFile)
    datasets = settings['datasets information']

    gsmList = extractClinical(
        datasets,
        settings['directory containing clinical csv files'],
        settings['simplified clinical data directory'],
        settings['clinical phenotype'],
        settings['log'])

    for dataset in datasets:
        dataset['gsmList'] = gsmList[(dataset['gse'], dataset['gpl'])]
        dataset.setdefault('matrixOnly', False)

    # Keep a record of the exact configuration used for this run.
    configPath = '{}/probeExpr.config.json'.format(
        settings['expression table directory'])
    with open(configPath, 'w') as wf:
        json.dump(settings, wf, indent=2)

    return getExprTables(
        gse_gpl_gsmList_matrixOnly_List=datasets,
        exprDir=settings['expression table directory'],
        log=settings['log'],
        maxTrial=settings['max trail in downloading files'],
        thread=settings['thread'],
        downloadMethod=settings['download method'],
        runningR=settings['run R in python'],
        rScriptDir=settings['R script directory'],
        gseRawDir=settings['GSE raw directory'],
        celDir=settings['CEL file directory'],
        matrixDir=settings['GSE matrix directory'])
def _buildMetaQCScript(studyNames, metaqcDir, gmtFile):
    """Return the text of the R script that runs MetaQC on ``studyNames``."""
    studyCount = len(studyNames)
    quotedNames = ', '.join('"%s"' % name for name in studyNames)
    parts = [
        'rm(list = ls())\nlibrary("MetaDE")\nlibrary("MetaQC")\n\n',
        'setwd("{}")\nmemory.limit(16000)\n\n'.format(metaqcDir),
        'study.names <- c({})\n'.format(quotedNames),
        'raw <- MetaDE.Read(study.names, skip=rep(1, {}), via="txt", matched=T, log=T)\n'.format(
            studyCount),
        'gc()\n',
        'merged <- MetaDE.merge(raw)\ngc()\n',
        'Data.QC<-list()\nfor(i in 1:%d){\n' % studyCount,
        ' colnames(merged[[i]][[1]])<-merged[[i]][[2]]\n',
        ' Data.QC[[i]]<-impute.knn(merged[[i]][[1]])$data\n}\n',
        'names(Data.QC)<-names(merged)\n',
        'QC <- MetaQC(Data.QC, "{}", filterGenes=F,verbose=TRUE, isParallel=T, resp.type="Twoclass")\n'.format(
            gmtFile),
        'gc()\nrunQC(QC, B=1e4, fileForCQCp="{}")\n'.format(gmtFile),
        'png(file="{}/metaQC.png", width=2048, height=2048, bg="transparent")\n'.format(
            metaqcDir),
        'plot(QC)\ndev.off()\n',
        'sink("{}/metaQC.txt")\nprint(QC)\nsink()\n'.format(metaqcDir),
        'save.image("{}/metaQC.RData")'.format(metaqcDir),
    ]
    return ''.join(parts)


def metaQC(settingFile):
    """Generate, and optionally run, the MetaQC R script.

    For every dataset flagged ``'metaQC'`` the MetaQC input table is built
    with ``geneExpr2metaqcTable`` in a process pool. ``metaQC.R`` is then
    written to the metaQC directory. When ``'run R in python'`` is enabled
    the script is executed with ``Rscript`` and its standard output is
    captured in ``metaQC.R.log``.

    Args:
        settingFile: Value handed to ``config`` to obtain the settings
            mapping.

    Returns:
        ``True`` if R ran and exited with status 0, ``False`` if it ran and
        failed (or could not be started), ``None`` when
        ``'run R in python'`` is disabled.
    """
    # Local import: this block cannot see the file's top-level imports.
    import subprocess

    settings = config(settingFile)
    metaqcDir = settings['metaQC directory']
    geneExprDir = settings['gene expression table directory']
    clinicalDir = settings['simplified clinical data directory']

    qcDatasets = [
        info for info in settings['datasets information'] if info['metaQC']
    ]
    studyNames = [
        '{}_{}'.format(info['gse'], info['gpl']) for info in qcDatasets
    ]
    gmtFile = settings['gmt File']

    tasks = [(info['gse'], info['gpl'], metaqcDir, geneExprDir, clinicalDir)
             for info in qcDatasets]
    with multiprocessing.Pool(processes=settings['thread']) as pool:
        pool.starmap(geneExpr2metaqcTable, tasks)

    scriptPath = '{}/metaQC.R'.format(metaqcDir)
    with open(scriptPath, 'w') as wf:
        wf.write(_buildMetaQCScript(studyNames, metaqcDir, gmtFile))

    if not settings['run R in python']:
        return None

    log = settings['log']
    if log:
        logging.info('Begin QC by R.')

    # An argument list (no shell) keeps directories with spaces or shell
    # metacharacters from breaking the command.
    try:
        with open(scriptPath + '.log', 'w') as logFile:
            exitCode = subprocess.call(['Rscript', scriptPath],
                                       stdout=logFile)
    except OSError:
        # Rscript missing or log file not writable: report as a failed run,
        # as a failing shell command would have been.
        exitCode = -1

    if exitCode == 0:
        if log:
            logging.info('QC Successfully.')
        return True
    if log:
        logging.error('QC Failed.')
    return False
def meta2Forest(settingFile):
    """Draw forest plots for the configured meta-analysis method.

    Loads the settings through ``config`` and forwards the relevant values
    to ``MAMA2Forest`` or ``mDEDS2Forest`` depending on ``'meta method'``.

    Args:
        settingFile: Value handed to ``config`` to obtain the settings
            mapping.

    Returns:
        Whatever the selected helper returns, or ``None`` when
        ``'meta method'`` is neither ``'MAMA'`` nor ``'mDEDS'``.

    Raises:
        IndexError: If no dataset is flagged ``'meta'``.
    """
    settings = config(settingFile)
    metaDir = settings['meta directory']

    # Validation may run on gene-level or on probe-level tables.
    exprDirKey = ('gene expression table directory'
                  if settings['mapping to gene in valid']
                  else 'expression table directory')
    exprDir = settings[exprDirKey]

    clinicalDir = settings['simplified clinical data directory']
    clinicalTag = settings['clinical phenotype']
    thread = settings['thread']
    outlierReplace = settings['outliers replace']

    gse_gpl_list = []
    for dataset in settings['datasets information']:
        gse_gpl_list.append((dataset['gse'], dataset['gpl']))

    # Platform of the first dataset that took part in the meta-analysis.
    metaGpls = [dataset['gpl']
                for dataset in settings['datasets information']
                if dataset['meta']]
    gpl = metaGpls[0]

    log = settings['log']
    runningR = settings['run R in python']

    # A probe-to-gene map file is only named when the meta-analysis ran on
    # probes but the validation runs on genes.
    needsProbeMap = (settings['probe mapping to gene']
                     and not settings['mapping to gene in meta']
                     and settings['mapping to gene in valid'])
    if needsProbeMap:
        gplMapFile = '{}/{}_probe2{}.json'.format(
            settings['probe annotation directory'], gpl,
            settings['probe mapping to gene keyword'])
    else:
        gplMapFile = ''

    method = settings['meta method']
    if method == 'MAMA':
        pCut = settings['p-value cutoff']
        esCut = settings['effect size cutoff']
        return MAMA2Forest(
            gse_gpl_list, metaDir, exprDir, clinicalDir, clinicalTag,
            pCut, esCut, thread, gplMapFile, outlierReplace, runningR, log)
    if method == 'mDEDS':
        # NOTE(review): this value is computed but never forwarded to
        # mDEDS2Forest (nor are thread / gplMapFile) - confirm intent.
        maxDEDSGeneCount = max(
            settings['max different expression gene included'])
        return mDEDS2Forest(
            gse_gpl_list, metaDir, exprDir, clinicalDir, clinicalTag,
            outlierReplace, runningR, log)
    return None
def _parseDelimitedLines(lines, sep):
    """Split lines on ``sep``, strip quotes/whitespace, skip lines without ``sep``."""
    rows = []
    for line in lines:
        cells = line.split(sep)
        if len(cells) > 1:
            rows.append([cell.replace('"', '').strip() for cell in cells])
    return rows


def _loadProbe2Symbol(annotDir, gpls):
    """Merge the ``<gpl>_probe2symbol.json`` maps of ``gpls`` into one dict."""
    probe2symbol = dict()
    # Sorted so that, if two platforms share a probe id, the winning entry
    # is the same on every run (set iteration order is not stable).
    for gpl in sorted(set(gpls)):
        # ``with`` closes each file even when parsing fails.
        with open('{}/{}_probe2symbol.json'.format(annotDir, gpl), 'r') as rf:
            probe2symbol.update(json.load(rf))
    return probe2symbol


def meta2Csv(settingFile):
    """Convert the mDEDS result table into ``__meta_results.csv``.

    For the ``'mDEDS'`` method this reads ``geneOrder.tsv`` and
    ``mDEDS_<n>.csv`` (``n`` being the largest configured gene count) from
    the meta directory, translates the ``geneOrder`` column into probe
    and/or symbol names, sorts the rows by ``deds`` and ``abs(fc)``, and
    writes the result to ``__meta_results.csv`` in the meta directory.
    Nothing is written for the ``'MAMA'`` method.

    Args:
        settingFile: Value handed to ``config`` to obtain the settings
            mapping.

    Returns:
        None.
    """
    settings = config(settingFile)
    metaDir = settings['meta directory']
    metaMethod = settings['meta method']

    probe2symbol = dict()
    if not settings['mapping to gene in meta'] \
            and settings['probe mapping to gene']:
        # Meta-analysis ran on probes: load maps to add gene symbols.
        probe2symbol = _loadProbe2Symbol(
            settings['probe annotation directory'],
            [ds['gpl'] for ds in settings['datasets information']
             if ds['meta']])

    if metaMethod != 'mDEDS':
        # 'MAMA' (and any other value) produces no csv here.
        return

    geneCount = max(settings['max different expression gene included'])

    # geneOrder.tsv maps the row index used by R to the row name.
    with open('{}/geneOrder.tsv'.format(metaDir), 'r') as rf:
        orderLines = rf.readlines()[1:]
    geneOrder = {row[0]: row[1]
                 for row in _parseDelimitedLines(orderLines, '\t')}

    with open('{}/mDEDS_{}.csv'.format(metaDir, geneCount), 'r') as rf:
        lines = rf.readlines()
    title = [cell.replace('"', '').strip() for cell in lines[0].split(',')]
    table = _parseDelimitedLines(lines[1:], ',')

    data = dict()
    for col, name in enumerate(title):
        values = [row[col] for row in table]
        if name.upper() == 'geneOrder'.upper():
            data['input order'] = values
            if settings['mapping to gene in meta']:
                data['symbol'] = [geneOrder[j] for j in values]
            else:
                data['probe'] = [geneOrder[j] for j in values]
                if settings['probe mapping to gene']:
                    # Probes without a known symbol get an empty string.
                    data['symbol'] = [probe2symbol.get(p, '')
                                      for p in data['probe']]
        elif name == '':
            # The unnamed first csv column holds R's row names.
            data['order'] = values
        elif name.lower() == 'fc'.lower():
            data['fc'] = values
            # Negated so that an ascending sort puts the largest |fc| first.
            data['abs(fc)'] = [-abs(float(v)) for v in values]
        else:
            data[name.lower()] = values

    # NOTE(review): every column except 'abs(fc)' holds strings, so 'deds'
    # is ordered lexicographically here - confirm that is intended.
    df = pd.DataFrame(data).sort_values(by=['deds', 'abs(fc)'])
    df.to_csv('{}/__meta_results.csv'.format(metaDir), index=False)
def _buildMAMAScript(studies, tableDir, clinicalDir, clinicalTag, metaDir):
    """Return the text of the R script for the MAMA/metaMA analysis."""
    parts = [
        'rm(list = ls())\nlibrary("MAMA")\nlibrary("metaMA")\n',
        'library("affyPLM")\nlibrary("affy")\n\n',
        'memory.limit(16000)\n\n',
    ]
    for study in studies:
        parts.append('{} <- as.matrix(read.table("{}/{}_expr.tsv"))\n'
                     .format(study, tableDir, study))
    studyList = ', '.join(studies)
    parts.append('GEDM<-list({})\n'.format(studyList))
    parts.append('rm({})\ngc()\n\n'.format(studyList))
    parts.append('setwd("{}")\n'.format(metaDir))
    for study in studies:
        parts.append('annot_%s <- read.csv("%s/%s_clinical.csv")\n'
                     % (study, clinicalDir, study))
        parts.append('row.names(annot_%s) <- annot_%s$CEL_Number\n'
                     % (study, study))
        parts.append('annot_%s$%s <- as.factor(annot_%s$%s)\n'
                     % (study, clinicalTag, study, clinicalTag))
    annotList = ', '.join('annot_%s' % study for study in studies)
    parts.append('Clinical <- list(%s)\n' % annotList)
    parts.append('rm(%s)\ngc()\n' % annotList)
    parts.append('datanames <- c(%s)\n\n'
                 % ', '.join('"%s"' % study for study in studies))
    parts.extend([
        'setClass("esets",slots=list(GEDM="list",clinical="list",datanames="character"),package = "MAMA")\n',
        'HCC_meta_data <- new("esets",GEDM=GEDM,clinical=Clinical,datanames=datanames)\n',
        'rm(Clinical,datanames)\ngc()\n\n',
        'pval <- metaMA(HCC_meta_data,"%s",which="pval")\n' % clinicalTag,
        'gc()\n',
        'es2 <- ES.GeneMeta(HCC_meta_data,"%s",nperm=1000)\n\n' % clinicalTag,
        'results1 <- join.results(pval,type=1,genenames=rownames(GEDM(HCC_meta_data)[[1]]))\n',
        'p_value <- as.data.frame(results1)\n',
        'rawpval = 2 * (1 - pnorm(abs(pval$TestStatistic)))\n',
        'FDR_pval <- p.adjust(rawpval, method="BY", n=length(rawpval))\n',
        'p_value$c_pval <- rawpval\n',
        'p_value$FDR <- FDR_pval\n',
        'rm(rawpval, FDR_pval)\ngc()\n',
        'es2_theScores <- es2$theScores\n',
        'es2_ScoresFDR <- es2$ScoresFDR\n',
        'es2_ScoresFDR <- es2_ScoresFDR$two.sided\n',
        'write.table(p_value, file="p_value", sep="\\t", col.names = T)\n',
        'write.table(es2_ScoresFDR, file="es", sep="\\t", col.names = T)\n',
    ])
    return ''.join(parts)


def _writeMergedClinic(studies, clinicalDir, clinicalTag, metaDir):
    """Concatenate the per-study clinical csv files into ``clinic.csv``."""
    with open('{}/clinic.csv'.format(metaDir), 'w') as wf:
        wf.write('gsm,{}'.format(clinicalTag))
        for study in studies:
            with open('{}/{}_clinical.csv'.format(clinicalDir, study),
                      'r') as rf:
                # Drop each file's own header line.
                rows = rf.readlines()[1:]
            wf.write('\n' + ''.join(rows))


def _buildMDEDSScript(studies, tableDir, metaDir, geneCounts):
    """Return the text of the R script for the XPN-merge + DEDS analysis."""
    parts = [
        'rm(list = ls())\nlibrary("MAMA")\nlibrary("metaMA")\n',
        'library("affyPLM")\nlibrary("affy")\n',
        'library("CONOR")\nlibrary("DEDS")\n\n',
        'memory.limit(16000)\n\n',
        'a <- read.table("%s/%s_expr.tsv")\n' % (tableDir, studies[0]),
    ]
    # Fold every further study into the running merged table ``a``.
    for study in studies[1:]:
        parts.extend([
            'b <- read.table("%s/%s_expr.tsv")\n' % (tableDir, study),
            'm <- xpn(a, b, iterations=10)\na <- m$x\nb <- m$y\n',
            'm <- merge(a, b, by="row.names")\n',
            'row.names(m) <- m$Row.names\na <- m[, -1]\n',
            'a <- a[, order(colnames(a))]\na <- as.matrix(a)\n',
            'rm(b, m)\ngc()\n\n',
        ])
    parts.extend([
        'merged.expr <- a\n',
        'save(merged.expr, file="%s/merged.expr.RData")\n' % metaDir,
        'rm(a)\ngc()\n\n',
        'annot <- read.csv("{}/clinic.csv", row.names=1, header=T)\n'.format(
            metaDir),
        'merged <- as.data.frame(merged.expr)\n',
        'annot <- annot[order(row.names(annot)), ]\n',
        'merged <- merged[, order(colnames(merged))]\n',
        'merged <- as.matrix(merged)\n',
        # NOTE: an un-logging step ('merged <- 2 ^ merged') was disabled in
        # the original script and is intentionally not emitted.
        'rm(merged.expr)\ngc()\n',
        'deds <- deds.stat.linkC(merged, annot, B=1300)\n',
        'save(deds, file="{}/deds.RData")\n'.format(metaDir),
        'gc()\n\n',
    ])
    for count in geneCounts:
        parts.append('r <- topgenes(deds, number=%d, sort.by="fc")\n' % count)
        parts.append('write.csv(r, file="%s/mDEDS_%d.csv")\n'
                     % (metaDir, count))
    # The separator is emitted as a literal tab character inside the R
    # string (unchanged from the original script text).
    parts.append(
        'write.table(rownames(merged), file="{}/geneOrder.tsv", sep="\t")'.format(
            metaDir))
    return ''.join(parts)


def _runMetaRScript(scriptPath, log):
    """Run ``scriptPath`` with Rscript, stdout to ``<scriptPath>.log``; True on exit 0."""
    # Local import: this block cannot see the file's top-level imports.
    import subprocess

    if log:
        logging.info('Begin meta-analysis by R.')
    # An argument list (no shell) keeps directories with spaces or shell
    # metacharacters from breaking the command.
    try:
        with open(scriptPath + '.log', 'w') as logFile:
            exitCode = subprocess.call(['Rscript', scriptPath],
                                       stdout=logFile)
    except OSError:
        # Rscript missing or log file not writable: report as a failed run,
        # as a failing shell command would have been.
        exitCode = -1

    succeeded = exitCode == 0
    if log:
        if succeeded:
            logging.info('meta-analysis Successfully.')
        else:
            logging.error('meta-analysis Failed.')
    return succeeded


def metaAnalysis(settingFile):
    """Generate, and optionally run, the meta-analysis R script.

    Depending on ``'meta method'`` (``'MAMA'`` or ``'mDEDS'``) the matching
    R script is written to ``<meta directory>/<method>.R``; for ``'mDEDS'``
    the clinical files of all studies are first merged into ``clinic.csv``.
    Only datasets flagged ``'meta'`` take part. When ``'run R in python'``
    is enabled the script is executed with ``Rscript`` and its standard
    output is captured in ``<method>.R.log``. All log messages honour the
    ``'log'`` setting.

    Args:
        settingFile: Value handed to ``config`` to obtain the settings
            mapping.

    Returns:
        ``True`` if R ran and exited with status 0, ``False`` if it ran and
        failed (or could not be started), ``None`` when R is not run or the
        method is unknown.

    Raises:
        IndexError: For ``'mDEDS'`` when no dataset is flagged ``'meta'``.
    """
    settings = config(settingFile)
    metaDir = settings['meta directory']
    geneExprDir = settings['gene expression table directory']
    exprDir = settings['expression table directory']
    clinicalDir = settings['simplified clinical data directory']
    clinicalTag = settings['clinical phenotype']
    gse_gpl_list = [
        '{}_{}'.format(info['gse'], info['gpl'])
        for info in settings['datasets information'] if info['meta']
    ]
    metaMethod = settings['meta method']

    # Gene-level or probe-level tables, depending on the configuration.
    tableDir = geneExprDir if settings['mapping to gene in meta'] else exprDir

    if metaMethod == 'MAMA':
        script = _buildMAMAScript(
            gse_gpl_list, tableDir, clinicalDir, clinicalTag, metaDir)
    elif metaMethod == 'mDEDS':
        _writeMergedClinic(gse_gpl_list, clinicalDir, clinicalTag, metaDir)
        script = _buildMDEDSScript(
            gse_gpl_list, tableDir, metaDir,
            settings['max different expression gene included'])
    else:
        return None

    scriptPath = '{}/{}.R'.format(metaDir, metaMethod)
    with open(scriptPath, 'w') as wf:
        print(script, file=wf)

    if settings['run R in python']:
        return _runMetaRScript(scriptPath, settings['log'])
    return None