def step2_NC_detection(conf_dict, logfile):
    '''
    analysis part
    mainly Rscript

    Build the ``detectNonCanonical.r`` command line from ``conf_dict``,
    append it to ``logfile`` and run it through ``sp``.

    conf_dict : configuration mapping; reads ``conf_dict['rscript']``,
                ``conf_dict['General']`` (outname, signalname, startdir)
                and ``conf_dict['options']`` (Pvalue, Alpha, Lambda,
                TopNcofactors).
    logfile   : path of the log file the command line is appended to.

    Returns ``conf_dict`` unchanged.

    NOTE(review): this file contains a second ``step2_NC_detection``
    definition further down; being defined later it replaces this one when
    the module is loaded -- confirm which of the two is intended.
    '''
    general = conf_dict['General']
    options = conf_dict['options']

    # Rscript detectNonCanonical.r outname signalname usePQ cutoff alpha lambdachoice topN tmpRpackgeDIR
    tmp_package_dir = general['startdir'] + "tmpPackage/"
    createDIR(tmp_package_dir)

    # signalname is built as a list by step0_check_data; pass it as a single
    # comma separated argument (the R side splits this argument on ",")
    # instead of letting %s inject a Python list repr into the shell command.
    signalname = general['signalname']
    if isinstance(signalname, (list, tuple)):
        signalname = ",".join(signalname)

    # Eight values are substituted (script path + 7 arguments); the original
    # format string only had seven placeholders and raised TypeError.
    cmd = "Rscript %s %s %s %s %s %s %s %s" % (
        conf_dict['rscript'] + "detectNonCanonical.r",
        general['outname'],
        signalname,
        options['Pvalue'],
        options['Alpha'],
        options['Lambda'],
        options['TopNcofactors'],
        tmp_package_dir)

    # record the command in the log before running it
    os.system('echo "[CMD] %s " >> %s' % (cmd, logfile))
    sp(cmd)
    return conf_dict
def step0_check_data(conf_dict, logfile):
    '''
    step0 integrate data
    check and complement parameter

    Validate the input files/folders named in ``conf_dict['General']``,
    normalise relative paths against ``startdir``, decide between "signal"
    and "binary" mode, sanitise the numeric options in
    ``conf_dict['options']`` and locate the external tools.

    conf_dict : configuration mapping with a 'General' and an 'options'
                section; it is updated in place.
    logfile   : log file path handed to ``wlog`` / ``ewlog``.

    Keys written in 'General': HMRpeak, signalname, signalfile, peakFolder,
    bwfolder, mode, peakfilenames, bwfilenames (signal mode only),
    usefilename, bwsummary, bedtools, latex.
    Keys written in 'options': ext, Pvalue, Alpha, TopNcofactors.
    Also writes ``<outname>_cofactor_candidate_list.txt`` in the current
    directory.

    Fatal problems are reported with ``ewlog`` (presumably it aborts the
    run -- it is defined outside this block), recoverable ones with ``wlog``.

    Returns the updated ``conf_dict``.
    '''
    general = conf_dict['General']
    options = conf_dict['options']

    ### check data path , format ,
    if "~" in general['HMRpeak']:
        ewlog(
            'require absolute path for HMRpeak bed file, HMRpeak file cannot contain "~", current HMRpeak file is %s'
            % (general['HMRpeak']), logfile)
    # 'signal' is iterated as a collection of paths below, so test every
    # path for "~" (a plain `"~" in list` only matches an element equal to "~")
    if any("~" in signalpath for signalpath in general['signal']):
        ewlog(
            'require absolute path for HMsignal bigwig file, signal file cannot contain "~", current signal file(s): %s'
            % (general['signal']), logfile)

    if not general['HMRpeak'].startswith('/'):
        general['HMRpeak'] = general['startdir'] + general['HMRpeak']

    if not os.path.isfile(general['HMRpeak']):
        ewlog("HMRpeak file %s not found" % (general['HMRpeak']), logfile)

    if not general['HMRpeak'].endswith('.bed'):
        ewlog('extenion of HMR peak file is not .bed', logfile)

    checkbed = checkbedformat(general['HMRpeak'], 1000)
    if checkbed == "fail":
        ewlog("HMRpeak file is not a bed file", logfile)
    elif checkbed == "lesspeak":
        ewlog("HMRpeak file contains less than 1000 peaks", logfile)

    ### check signal bigwig files; the signal name is the file name without
    ### its .bw / .bigwig extension
    general['signalname'] = []
    general['signalfile'] = []
    for bwsignalfile in general['signal']:
        if not bwsignalfile.startswith('/'):
            bwsignalfile = general['startdir'] + bwsignalfile
        if not os.path.isfile(bwsignalfile):
            wlog("signal bw file %s not found, ignored" % (bwsignalfile),
                 logfile)
            continue
        basename = bwsignalfile.split("/")[-1]
        if bwsignalfile.endswith(".bw"):
            signalname = basename[:-3]
        elif bwsignalfile.endswith(".bigwig"):
            signalname = basename[:-7]
        else:
            wlog('[WARNING] extension of signal bw file is not bw/bigwig',
                 logfile)
            signalname = basename
        general['signalfile'].append(bwsignalfile)
        general['signalname'].append(signalname)

    if len(general['signalfile']) == 0:
        ewlog("no signal bw file valid, exit", logfile)
    elif len(general['signalfile']) > 4:
        ewlog(
            "maximum signal bw file is limited to 4. There were %s signal file inputed, exit"
            % (len(general['signalfile'])), logfile)

    ### check TFpeak folder
    if "~" in general['peakFolder']:
        ewlog(
            'require absolute path for peak/track Folder, Folder cannot contain "~", current Folder is %s'
            % (general['peakFolder']), logfile)
    if not general['peakFolder'].startswith('/'):
        general['peakFolder'] = general['startdir'] + general['peakFolder']
    if not general['peakFolder'].endswith('/'):
        general['peakFolder'] += "/"
    if not os.path.isdir(general['peakFolder']):
        ewlog("Folder %s not found" % (general['peakFolder']), logfile)

    ### signal mode needs a usable bwfolder, otherwise fall back to binary mode
    if general['mode'] == "signal":
        wlog("signal mode is activated", logfile)
        if general['bwfolder']:
            wlog("bwFolder is specified, checking data for signal mode",
                 logfile)
            if "~" in general['bwfolder']:
                wlog(
                    'require absolute path for bwFolder, bwFolder cannot contain "~", current Folder is %s, use peak mode'
                    % (general['bwfolder']), logfile)
                general['mode'] = "binary"
            else:
                if not general['bwfolder'].startswith('/'):
                    general['bwfolder'] = (general['startdir'] +
                                           general['bwfolder'])
                if not general['bwfolder'].endswith('/'):
                    general['bwfolder'] += "/"
                if not os.path.isdir(general['bwfolder']):
                    # report the folder that is actually missing (the
                    # original printed peakFolder here)
                    wlog(
                        "bwFolder %s not found, use binary mode" %
                        (general['bwfolder']), logfile)
                    general['mode'] = "binary"
        else:
            wlog("bwfolder is not specified, use binary mode", logfile)
            general['mode'] = "binary"
    else:
        wlog("binary mode is activaed", logfile)

    ### collect cofactor candidates from the peak folder
    wlog(
        "Check the peak.bed files in the Folder, only '.bed' files with >1000 peaks are included in the following analysis",
        logfile)
    general['peakfilenames'] = []
    for f in os.listdir(general['peakFolder']):
        candidate_path = general['peakFolder'] + f
        if f.endswith(".bed") and os.path.isfile(candidate_path):
            # validate the candidate file itself; the original re-checked
            # the HMRpeak file here, so every .bed file was accepted
            checkbed = checkbedformat(candidate_path, 1000)
            if checkbed == "pass":
                general['peakfilenames'].append(f[:-4])

    if not general['peakfilenames']:
        ewlog(
            "no peak file (cofactor candidate) in (bed format & >1000peaks) are included, exit",
            logfile)

    if general['mode'] == "signal":
        general['bwfilenames'] = []
        for f in os.listdir(general['bwfolder']):
            if f.endswith(".bw") and os.path.isfile(general['bwfolder'] + f):
                general['bwfilenames'].append(f[:-3])
        ### compare the name from bwfiles and peak files
        peakname_set = set(general['peakfilenames'])
        general['usefilename'] = [
            name for name in general['bwfilenames'] if name in peakname_set
        ]
        ### if less than 50% peakfiles share name with bwfiles, change back to peak mode
        if len(general['usefilename']) < len(general['peakfilenames']) * 0.5:
            general['mode'] = "binary"
            wlog(
                "the number of shared peak&bw files is less than half of the number of peakfiles, use binary mode",
                logfile)
        else:
            wlog("all checks for signal mode passed, use signal mode",
                 logfile)

    if general['mode'] == "binary":
        general['usefilename'] = general['peakfilenames']

    wlog(
        "%s cofactor candidates are included" % (len(general['usefilename'])),
        logfile)

    # one candidate name per line
    with open(general['outname'] + "_cofactor_candidate_list.txt",
              'w') as outf:
        for cofactor in general['usefilename']:
            outf.write(cofactor + "\n")

    ### check options
    wlog('check option: ', logfile)

    try:
        ext = int(options['ext'])
    except (ValueError, TypeError):
        wlog(
            "extend length %s is not valid, use default value: 1000bp" %
            (options['ext']), logfile)
        options['ext'] = 1000
    else:
        wlog("extend length for HMsignal is %s bp" % (ext), logfile)
        options['ext'] = ext

    try:
        pvalue = float(options['Pvalue'])
    except (ValueError, TypeError):
        wlog(
            "input Pvalue %s is not recognized, use default Pvalue=0.001" %
            (options['Pvalue']), logfile)
        options['Pvalue'] = 0.001
    else:
        wlog("use Pvalue = %s as cutoff" % (str(pvalue)), logfile)
    if float(options['Pvalue']) >= 1:
        wlog(
            "input Pvalue %s is not valid, use default Pvalue=0.001" %
            (options['Pvalue']), logfile)
        options['Pvalue'] = 0.001

    try:
        usealpha = float(options['Alpha'])
    except (ValueError, TypeError):
        wlog(
            "input alpha (for elastic-net) %s is not valid, use alpha=0.5" %
            (options['Alpha']), logfile)
        # actually apply the default announced in the message above (the
        # original left the invalid value in place)
        options['Alpha'] = 0.5
    else:
        if usealpha >= 1:
            wlog("alpha (for elastic-net) cannot be >=1, use alpha=0.5",
                 logfile)
            options['Alpha'] = 0.5
        else:
            wlog("Alpha (for elastic-net) = %s" % (str(usealpha)), logfile)
            options['Alpha'] = usealpha

    wlog("Lambda choice is %s" % (options['Lambda']), logfile)

    if options['TopNcofactors'] == "all":
        wlog("all significant co-factors will be output", logfile)
    else:
        try:
            topTF = int(options['TopNcofactors'])
        except (ValueError, TypeError):
            wlog(
                "the topN number %s is not valid, output top5 co-factors" %
                (options['TopNcofactors']), logfile)
            options['TopNcofactors'] = 5
        else:
            wlog(
                "the topN number %s will be output" %
                (options['TopNcofactors']), logfile)
            options['TopNcofactors'] = topTF

    ### locate the bigWigSummary binary shipped inside the HMRpipe package
    OS = platform.system()
    if OS == "Linux":
        bwsum_software = "bigWigSummary_linux"
    elif OS == "Darwin":
        bwsum_software = "bigWigSummary_mac"
    else:
        wlog(
            "detected system is nither linux nor mac, try linux version of bigWigSummary",
            logfile)
        bwsum_software = "bigWigSummary_linux"

    general['bwsummary'] = HMRpipe.__path__[0] + "/%s" % bwsum_software
    # prefer a bedtools file inside the package directory, otherwise rely
    # on "bedtools" being resolvable by the shell
    if os.path.isfile(HMRpipe.__path__[0] + "/bedtools"):
        general['bedtools'] = HMRpipe.__path__[0] + "/bedtools"
    else:
        general['bedtools'] = "bedtools"

    ### check pdflatex: an empty first element from sp() is treated as
    ### "not installed"
    if sp('pdflatex --help')[0] == "":
        wlog(
            'pdflatex was not installed, ncHMR_detector is still processing but no summary report generated',
            logfile)
        general['latex'] = 0
    else:
        general['latex'] = 1

    return conf_dict
def step3_summary(conf_dict, logfile):
    '''
    analysis part
    collect results and generate the summary report

    Move the result files of the previous steps into ``summary/`` and
    ``tmpResults/``, write ``summary/<outname>_summary.tex`` and, when
    ``conf_dict['General']['latex'] == 1``, compile it with pdflatex.

    conf_dict : configuration mapping; reads 'General' (outname, HMRpeak,
                mode, signalname, peakfilenames, latex) and 'options'
                (ext, Alpha, Pvalue, TopNcofactors).
    logfile   : log file path handed to ``wlog``.

    Returns ``conf_dict`` unchanged.

    All LaTeX fragments are raw strings: the runtime text is the same as
    with the former mix of escaped and unescaped backslashes, but sequences
    such as ``\\usepackage`` no longer break parsing on Python 3.

    NOTE(review): in the pdflatex branch the working directory is changed
    to ``summary/`` and is not changed back (the trailing
    ``os.chdir("../")`` appears commented out in the original) -- confirm
    that callers expect this.
    '''
    general = conf_dict['General']
    options = conf_dict['options']
    outname = general['outname']

    wlog('collect results', logfile)

    # final results go to summary/
    summarydir = 'summary/'
    createDIR(summarydir)
    sp("mv %s_NCsummary.txt %s" % (outname, summarydir))
    sp("mv %s_elnet_lambdaSelection.pdf %s" % (outname, summarydir))
    if os.path.isfile("%s_cofactor_HMsignal.pdf" % outname):
        sp("mv %s_cofactor_HMsignal.pdf %s" % (outname, summarydir))

    # intermediate results go to tmpResults/
    tmpresult = 'tmpResults/'
    createDIR(tmpresult)
    sp("mv %s_HMsig.bed %s" % (outname, tmpresult))
    sp("mv %s_peakov.bed %s" % (outname, tmpresult))
    sp("mv %s_cofactor_candidate_list.txt %s" % (outname, tmpresult))
    sp("mv %s_filterNC.txt %s" % (outname, tmpresult))

    wlog('generate summary documents', logfile)

    ### initiate
    QCdoc = r"""\documentclass[11pt,a4paper]{article}
\usepackage{tabularx}
\usepackage[english]{babel}
\usepackage{array}
\usepackage{graphicx}
\usepackage{color}
\DeclareGraphicsExtensions{.eps,.png,.pdf,.ps}
\begin{document}
\title{Summary reports of non-classical function detection of : %s}
\vspace{-1cm}
\maketitle
\tableofcontents
\newpage
\newpage
\section{Data description}
\begin{quotation}
Table 1 mainly describes the input files, parameters and options.
\end{quotation}
\begin{table}[h]
\small
\caption{parameter description}\label{bstable}
\begin{tabularx}{\textwidth}{ |X|l| }
""" % (strlatexformat(outname))

    ### table1 prepare parameter
    NcoTF = len(general['peakfilenames'])
    QCdoc += r"""
\hline
parameter & value \\
\hline
output name & %s \\
\hline
HMRpeak(peak filename) & %s \\
\hline
mode & %s \\
\hline
HM signal(bw filename) & \begin{tabular}[c]{@{}l@{}}%s\end{tabular} \\
\hline
\#cofactor candidates & %s \\
\hline
options & value \\
\hline
extend size & %sbp \\
\hline
Alpha (Elastic net) & %s \\
\hline
Pvalue cutoff & %s \\
\hline
topN cofactors & %s \\
\hline
""" % (strlatexformat(outname),
       strlatexformat(general['HMRpeak'].split("/")[-1]),
       general['mode'],
       # one signal name per table line, separated by a LaTeX line break
       strlatexformat(r"\\ ".join(general['signalname'])),
       str(NcoTF),
       str(options['ext']),
       str(options['Alpha']),
       str(options['Pvalue']),
       str(options['TopNcofactors']))
    QCdoc += r"""
\end{tabularx}
\end{table}
"""

    ### cross validation in elastic net
    QCdoc += r"""
\newpage
\newpage
\section{ElasticNet co-factor selection}
In this step we use a feature selection (elastic-net. Zou, H. and Hastie T. (2005) to select potential co-factors which corresponded to the non-classical function. Below shows the cross-validation curve for the decison of lambda in elastic-net for each histone modification substrate.
\begin{figure}[h]
\caption{cross-validation curve for lambda decision} \label{fig:profileunion}
\setlength{\abovecaptionskip}{0pt}
\setlength{\belowcaptionskip}{10pt}
\centering
{\includegraphics[width=0.8\textwidth]{%s}}
\end{figure}
""" % (outname + "_elnet_lambdaSelection.pdf")

    ### co-factor section; the context manager closes the summary table on
    ### every path
    with open("summary/" + outname + "_NCsummary.txt") as inf_ncsummary:
        first_line = inf_ncsummary.readline()
        if first_line.startswith("no non-classical function detected"):
            QCdoc += r"""
\newpage
\newpage
\section{potential co-factors corresponded to non-classical function}
No significant co-factor was detected, indicating that the non-classical function of the HMR was not exist or none of the existing factor candidates act as a co-factor of the non-classical function.
"""
        else:
            # both tables carry one header line, hence the "- 1"
            n_predicted = int(
                sp("wc -l tmpResults/%s_filterNC.txt" %
                   (outname))[0].split()[0]) - 1
            n_listed = int(
                sp("wc -l summary/%s_NCsummary.txt" %
                   (outname))[0].split()[0]) - 1
            QCdoc += r"""
\newpage
\newpage
\section{potential co-factors corresponded to non-classical function}
In summary, %s factors were predicted to potentially act as a co-factor of the non-classical function. The top%s co-factors were listed.
\subsection{summary of co-factors}
\begin{quotation}
The corresponded histone modification substrate (HMsubstrate), empirical P-value, R-square (ordered) and the number of non-classical (NC) sites for each potential co-factor were listed below. The empirical P-value was calculated based on the comparison of foreground (observed) R-square and background R-square (distribution of random R-square generated from the 1,000 permutations of co-binding events) for each potential co-factor. The non-classical (NC) sites were defined by lower HMsubstrate signal (using Otus' method) and co-binding events of each potential co-factor.
\end{quotation}
\begin{table}[h]
\small
\caption{cofactor summary}\label{bstable}
\begin{tabular}{ |l|l|l|l|l| }
\hline
co-factor & HMsubstrate & Pval & Rsquare & NCsites \\
""" % (n_predicted, n_listed)

            # remaining lines: one co-factor per row; columns 0,1,2,3,5 are
            # printed as co-factor, HMsubstrate, Pval, Rsquare, NCsites
            for line in inf_ncsummary:
                if line.startswith("TFname"):
                    continue
                ll = line.split()
                if len(ll) < 6:
                    # blank or truncated row: nothing to report
                    continue
                QCdoc += r"""\hline
%s & %s & %s & %s & %s \\
""" % (strlatexformat(ll[0]), strlatexformat(ll[1]), ll[2],
       round(float(ll[3]), 3), ll[5])

            # closes the table opened in this branch, then adds the boxplot
            QCdoc += r"""
\hline
\end{tabular}
\end{table}
\newpage
\newpage
\subsection{Boxplot of HM on non-classical and classic sites}
\begin{quotation}
Boxplot was generated to compare the difference of the histone mark (HM) signal on either non-classical or classic sites(peak). The non-classical sites were defined by lower HM signal (using Otus' method) and co-binding events of each potential co-factor. The boxplot corresponded to top co-factors were displayed.
\end{quotation}
\begin{figure}[h]
\caption{boxplot cofactor HMsignal} \label{fig:profileunion}
\setlength{\abovecaptionskip}{0pt}
\setlength{\belowcaptionskip}{10pt}
\centering
{\includegraphics[width=0.8\textwidth]{%s}}
\end{figure}
""" % ((outname + "_cofactor_HMsignal.pdf"))

    ### output list
    QCdoc += r"""
\newpage
\newpage
\section{Output list}
\begin{quotation}
All the main output files were described in the following table
\end{quotation}
\begin{table}[h]
\small
\caption{output list}\label{bstable}
\begin{tabular}{ |l|l| }
\hline
description & filename \\
\hline
summary table of non-classical (NC) function & summary/%s \\
\hline
summary report (this doc) & summary/%s \\
\hline
cobinding matrix on HMR peaks & tmpResults/%s \\
\hline
histone mark signal on HMR peaks & tmpResults/%s \\
\hline
\end{tabular}
\end{table}
\end{document}
""" % (strlatexformat(outname + "_NCsummary.txt"),
       strlatexformat(outname + "_summary.pdf"),
       strlatexformat(outname + "_peakov.bed"),
       strlatexformat(outname + "_HMsig.bed"))

    latexfile = outname + '_summary.tex'
    with open(summarydir + latexfile, 'w') as outf:
        outf.write(QCdoc)

    cmd = "pdflatex %s" % (latexfile)
    cmd2 = 'cp %s ../' % (outname + '_summary.pdf')
    if general['latex'] == 1:
        wlog(
            'pdflatex was detected in default PATH, generate summary report %s'
            % (outname + '_summary.pdf'), logfile)
        os.chdir(summarydir)
        # compiled twice, presumably so \tableofcontents is resolved
        sp(cmd)
        sp(cmd)
        # copy the report next to summary/, then drop pdflatex by-products
        sp(cmd2)
        sp("rm %s_summary.aux" % outname)
        sp("rm %s_summary.log" % outname)
        sp("rm %s_summary.toc" % outname)
    else:
        wlog(
            'pdflatex was not detected in default PATH, generate summary report .tex file in summary/ folder, you can move the whole summary/ folder to the environment with pdflatex installed and run cmd in the summary/ folder: "pdflatex %s"'
            % (outname + '_summary.tex'), logfile)

    wlog('Step3 summary DONE, check %s for final outputs' % (summarydir),
         logfile)

    return conf_dict
def step2_NC_detection(conf_dict, logfile): ''' analysis part, mainly Rscript Detect ''' # Rscript detectNonClassic.r outname signalname usePQ cutoff alpha lambdachoice topN tmpRpackgeDIR if conf_dict['General']['mode'] == "binary": Rscript = """ ### read parameter a<-commandArgs(T) outname <- a[1] signalname <- unlist(strsplit(a[2],",")) cutoff <- as.numeric(a[3]) Alpha <- as.numeric(a[4]) LambdaChoice <- (a[5]) topN <- a[6] R2cutoff <- as.numeric(a[8]) R2classic <- 0.1 tmp_rpackage_dir <- a[7] ### install R packages if("foreach" %in% installed.packages()[,"Package"]){ library(foreach) }else{ install.packages("foreach",dependencies=TRUE,lib=tmp_rpackage_dir,repos="https://mirrors.tongji.edu.cn/CRAN/") library(foreach,lib.loc=tmp_rpackage_dir) } if("glmnet" %in% installed.packages()[,"Package"]){ library(glmnet) }else{ install.packages("glmnet",dependencies=TRUE,lib=tmp_rpackage_dir,repos="https://mirrors.tongji.edu.cn/CRAN/") library(glmnet,lib.loc=tmp_rpackage_dir) } library(methods) ### define functions unilinear_only <- function(TFov,HMsig){ # univariate linear regression lmresult <- summary(lm(HMsig ~ TFov)) fgR2 <- lmresult$adj.r.squared lmcoeff <- lmresult$coefficients['TFov','Estimate'] lmP <- lmresult$coefficients['TFov',c('Pr(>|t|)')] return(c(fgR2,lmcoeff)) } unilinear_permute <- function(TFov,HMsig){ # univariate linear regression + empirical pvalue lmresult <- summary(lm(HMsig ~ TFov)) fgR2 <- lmresult$adj.r.squared bgR2s <- c() for(i in 1:999){ bgTFov <- rep(0,length(TFov)) bgTFov[sample(length(TFov),length(which(TFov > 0)))] <- 1 bgR2 <- summary(lm(HMsig ~ bgTFov))$adj.r.squared bgR2s <- c(bgR2s,bgR2) } permuteP <- (length(which(fgR2 < bgR2s))+1)/(length(bgR2s)+1) return(permuteP) } maxG<-function(cutoff,usedata){ # function of estimating G given cutoff # G : inter-class variance # method refer to "Otus' method" w0 <- length(which(usedata < cutoff))/length(usedata) w1 <- length(which(usedata >= cutoff))/length(usedata) u0 <- 
mean(usedata[which(usedata < cutoff)]) u1 <- mean(usedata[which(usedata >= cutoff)]) g <- w0*w1*(u0-u1)**2 return(g) } signal2cutoff <- function(rawsig){ # function for separate NC peaks considering HMsignal # input data: a vector of signal (HMsignal on HMR peak, linear scale) # method: go through all possible cutoff, find the cutoff corresponding to maximum G # output: the cutoff, a vector of selected cutoff candidates (Gbins) and a vector of G value for each cutoff candidates (G) sig <- log10(rawsig[which(rawsig>0)]) # separate the section of (log) signal to N cutoffs Gbins <- seq(min(sig)+0.1,max(sig)-0.1,0.01) # estimate G for each cutoff candidates G <- unlist(lapply(Gbins,maxG, sig)) # select cutoff at the first time G meats its maximum value NCcut<-seq(min(sig)+0.1,max(sig)-0.1,0.01)[which(G==max(G))][1] group_detail <- rep(0, length(rawsig)) group_detail[which(rawsig >= 10**NCcut)] <- 1 # output the grouping result: list contains 3 items # item1: NC cutoff # item2: 2 column for cutoff candidates and corresponded G # item3: Nrow = peak number,Ncolumn = 2, c1 for signal, c2 for group number, 0 for lowHM group (solo, non-classical), 1 for highHM group (ensemble, classical) return(list(NCcut, cbind(Gbins,G), cbind(rawsig, group_detail) )) } # step1 data preprocess peakov <- read.table(paste0(outname,"_peakov.bed")) signal <- read.table(paste0(outname,"_HMsig.bed")) candidate_list <- as.vector(read.table(paste0(outname,"_cofactor_candidate_list.txt"))[,1]) bedncol <- ncol(peakov) - length(candidate_list) colnames(peakov) <- c(paste0("c",seq(1:bedncol)),candidate_list) colnames(signal) <- c(paste0("c",seq(1:bedncol)),signalname) rownames(peakov) <- paste0("r",seq(1,nrow(peakov))) rownames(signal) <- paste0("r",seq(1,nrow(peakov))) TFov <- peakov[,candidate_list] TFov[TFov > 1] <- 1 HMsig <- as.matrix(signal[,signalname]) colnames(HMsig) <- signalname rownames(HMsig) <- rownames(signal) ### step2, prepare X,Y for model selection lindata <- cbind(HMsig,TFov) 
bind_sum <- apply(TFov,1,sum) ## sites with 90% factors overlapped are excluded use1_lindata <- lindata[names(bind_sum[which(bind_sum < ncol(TFov)*0.9)]),] ## factors with < 100 or > 95% cobinding events are excluded TF_sum <- apply(use1_lindata[,colnames(TFov)],2,sum) use_lindata <- lindata[names(bind_sum[which(bind_sum < ncol(TFov)*0.9)]),c(colnames(HMsig),names(TF_sum)[which(TF_sum>=100 & TF_sum <= nrow(use1_lindata)*0.95)])] ## form X, Y ### raw Y is used in otsu' method rawY <- as.matrix(use_lindata[,colnames(HMsig)]) colnames(rawY) <- colnames(HMsig) Y <- as.matrix(scale(use_lindata[,colnames(HMsig)])) colnames(Y) <- colnames(HMsig) X <- as.matrix(use_lindata[,(ncol(HMsig)+1):ncol(use_lindata)]) peakX <- peakov[rownames(X),1:bedncol] ### elastic net model selection set.seed(1007) coTFusage <- matrix(rep(0,ncol(X)*ncol(Y)),nrow=ncol(X)) rownames(coTFusage) <- colnames(X) colnames(coTFusage) <- colnames(Y) if(ncol(Y) == 1){ pdf(file=paste0(outname,"_elnet_lambdaSelection.pdf"),height=6,width=6) par(mar=c(4,4,4,2)) }else if(ncol(Y) == 2){ pdf(file=paste0(outname,"_elnet_lambdaSelection.pdf"),height=6,width=12) par(mfrow=c(1,2),mar=c(4,4,4,2)) }else if(ncol(Y) == 3){ pdf(file=paste0(outname,"_elnet_lambdaSelection.pdf"),height=12,width=12) par(mfrow=c(2,2),mar=c(4,4,4,2)) }else{ pdf(file=paste0(outname,"_elnet_lambdaSelection.pdf"),height=12,width=12) par(mfrow=c(2,2),mar=c(4,4,4,2)) } for(i in 1:ncol(Y)){ thisY <- Y[,i] this_name <- colnames(Y)[i] cv.glmmod <- cv.glmnet(x=X,y=thisY,alpha=Alpha,family="gaussian") coeff_this <- coef(cv.glmmod,s=paste0("lambda.",LambdaChoice)) coTF_name <- rownames(coeff_this)[which(coeff_this[,1] < 0 & rownames(coeff_this)!= "(Intercept)")] coTFusage[coTF_name,this_name] <- 1 ### cross-validation curve for each HM if (i %in% 1:4){ plot(cv.glmmod) title(this_name,line=2.5) if (LambdaChoice == "1se"){ abline(v=log(cv.glmmod$lambda.1se),col="blue",lwd=2) legend("topleft",legend="lambda.1se",lwd=3,bty="n",col='blue') }else{ 
abline(v=log(cv.glmmod$lambda.min),col="blue",lwd=2) legend("topleft",legend="lambda.min",lwd=3,bty="n",col='blue') } } } dev.off() ### generate R2 and coeff table R2_mat <- matrix(rep(0,ncol(X)*ncol(Y)),nrow=ncol(X)) coeff_mat <- matrix(rep(0,ncol(X)*ncol(Y)),nrow=ncol(X)) rownames(R2_mat) <- colnames(X) colnames(R2_mat) <- colnames(Y) rownames(coeff_mat) <- colnames(X) colnames(coeff_mat) <- colnames(Y) for(TFname in colnames(X)){ for(HMname in colnames(Y)){ unilinResult <- unilinear_only(X[,TFname],Y[,HMname]) R2_mat[TFname,HMname] <- unilinResult[1] coeff_mat[TFname,HMname] <- unilinResult[2] } } summary_table <- c() ### select candidates # for each candidates detected in el-net step, require R2>0.1, coeff<0, R2 for any positive correlated HM signal (coeff>0) < 0.1 for(TFname in colnames(X)){ this_usage <- coTFusage[TFname,] this_R2 <- R2_mat[TFname,] this_coeff <- coeff_mat[TFname,] classicFun <- 0 nonClassicHMidx <- c() for(i in 1:ncol(coTFusage)){ if(this_usage[i]==1 & this_R2[i] >= R2cutoff & this_coeff[i] < 0){ nonClassicHMidx <- c(nonClassicHMidx, i) }else if(this_R2[i] >= R2classic & this_coeff[i] > 0 ){ classicFun <- 1 } } if(classicFun == 0){ for(idx in nonClassicHMidx){ nonClassicHM <- colnames(Y)[idx] R2 <- this_R2[idx] coeff <- this_coeff[idx] Pval <- unilinear_permute(X[,TFname],Y[,idx]) summary_table <- rbind(summary_table, c(TFname, nonClassicHM, Pval, R2, coeff)) } } } ### summary and output steps if(is.null(summary_table)){ print("no significant candidates detected") write.table("no non-classical function detected",file=paste0(outname,"_filterNC.txt"),quote=F,sep="\t",row.names=F,col.names=F) write.table("no non-classical function detected",file=paste0(outname,'_NCsummary.txt'),quote=F,sep="\t",row.names=F,col.names=F) }else{ colnames(summary_table) <- c("TFname","HMname","Pval","R2","coeff") if(nrow(summary_table) > 1){ summary_table <- summary_table[order(as.numeric(summary_table[,"R2"]),decreasing=TRUE),] } if(topN == "all"){ topN <- 
nrow(summary_table) }else{ topN <- as.numeric(topN) } if(topN == 1){ out_table_raw <- summary_table }else{ out_table_raw <- as.matrix(summary_table[1:topN,]) } ## separate NC/C peaks based on HMsignal using otsu's method peakgroup <- c() for(i in 1:ncol(rawY)){ thisY <- rawY[,i] thisY_NCotsu <- signal2cutoff(thisY) thisY_peakgroup <- thisY_NCotsu[[3]][,2] peakgroup <- cbind(peakgroup, thisY_peakgroup) } colnames(peakgroup) <- colnames(Y) ## for each predicted coTFvsHM pair, output the cobinding NC sites as a bed file if(!file.exists("nonClassicalPeaks/")){dir.create("nonClassicalPeaks")} if(topN == 1){ pdf(file=paste0(outname,"_cofactor_HMsignal.pdf"),height=6,width=6) par(mar=c(4,4,2,2)) }else if(topN == 2){ pdf(file=paste0(outname,"_cofactor_HMsignal.pdf"),height=6,width=12) par(mfrow=c(1,2),mar=c(4,4,2,2)) }else{ pdf(file=paste0(outname,"_cofactor_HMsignal.pdf"),height=12,width=12) par(mfrow=c(2,2),mar=c(4,4,2,2)) } num_NCsites <- c() for(topnum in 1:nrow(out_table_raw)){ coTF = out_table_raw[topnum,1] substrateHM = out_table_raw[topnum,2] coTF_cobinding <- X[,coTF] HMnc <- peakgroup[,substrateHM] cobinding_NC_peak <- peakX[which(coTF_cobinding>0 & HMnc == 0),] write.table(cobinding_NC_peak, file=paste0("nonClassicalPeaks/",outname,"_",coTF,"_",substrateHM,"_top",topnum,"nonclassical_peaks.bed"),quote=F,sep="\t",row.names=F,col.names=F) num_NCsites <- c(num_NCsites, nrow(cobinding_NC_peak)) if(topnum %in% 1:4){ ## boxplot compare the histone modification signal between on classical and non-classical peaks boxplot(Y[which(coTF_cobinding>0 & HMnc == 0)],Y[which(coTF_cobinding==0 | HMnc > 0)], names=c("non-classical peak","classical peak"),ylab=paste0(substrateHM," signal"),main=paste0(coTF," & ",substrateHM), outline=F,cex.main=1) legend("topleft",legend=paste0("#NCpeak = ",nrow(cobinding_NC_peak)),bty="n") } } dev.off() out_table <- cbind(out_table_raw, num_NCsites) 
write.table(summary_table,file=paste0(outname,"_filterNC.txt"),quote=F,sep="\t",row.names=F,col.names=T) write.table(out_table,file=paste0(outname,'_NCsummary.txt'),quote=F,sep="\t",row.names=F,col.names=T) } """ # Rscript detectNonClassic.r outname signalname usePQ cutoff alpha lambdachoice topN #tmpRpackgeDIR else: # conf_dict['General']['mode'] == "signal": Rscript = """ ### read parameter a<-commandArgs(T) outname <- a[1] signalname <- unlist(strsplit(a[2],",")) cutoff <- as.numeric(a[3]) Alpha <- as.numeric(a[4]) LambdaChoice <- (a[5]) topN <- a[6] R2cutoff <- as.numeric(a[8]) R2classic <- 0.1 tmp_rpackage_dir <- a[7] ### install R packages if("foreach" %in% installed.packages()[,"Package"]){ library(foreach) }else{ install.packages("foreach",dependencies=TRUE,lib=tmp_rpackage_dir,repos="https://mirrors.tongji.edu.cn/CRAN/") library(foreach,lib.loc=tmp_rpackage_dir) } if("glmnet" %in% installed.packages()[,"Package"]){ library(glmnet) }else{ install.packages("glmnet",dependencies=TRUE,lib=tmp_rpackage_dir,repos="https://mirrors.tongji.edu.cn/CRAN/") library(glmnet,lib.loc=tmp_rpackage_dir) } library(methods) ### define functions unilinear_only <- function(TFov,HMsig){ # univariate linear regression lmresult <- summary(lm(HMsig ~ TFov)) fgR2 <- lmresult$adj.r.squared lmcoeff <- lmresult$coefficients['TFov','Estimate'] lmP <- lmresult$coefficients['TFov',c('Pr(>|t|)')] return(c(fgR2,lmcoeff)) } unilinear_permute <- function(TFov,HMsig){ # univariate linear regression + empirical pvalue lmresult <- summary(lm(HMsig ~ TFov)) fgR2 <- lmresult$adj.r.squared bgR2s <- c() for(i in 1:999){ bgTFov <- rep(0,length(TFov)) bgTFov[sample(length(TFov),length(which(TFov > 0)))] <- 1 bgR2 <- summary(lm(HMsig ~ bgTFov))$adj.r.squared bgR2s <- c(bgR2s,bgR2) } permuteP <- (length(which(fgR2 < bgR2s))+1)/(length(bgR2s)+1) return(permuteP) } maxG<-function(cutoff,usedata){ # function of estimating G given cutoff # G : inter-class variance # method refer to "Otus' method" w0 <- 
length(which(usedata < cutoff))/length(usedata) w1 <- length(which(usedata >= cutoff))/length(usedata) u0 <- mean(usedata[which(usedata < cutoff)]) u1 <- mean(usedata[which(usedata >= cutoff)]) g <- w0*w1*(u0-u1)**2 return(g) } signal2cutoff <- function(rawsig){ # function for separate NC peaks considering HMsignal # input data: a vector of signal (HMsignal on HMR peak, linear scale) # method: go through all possible cutoff, find the cutoff corresponding to maximum G # output: the cutoff, a vector of selected cutoff candidates (Gbins) and a vector of G value for each cutoff candidates (G) sig <- log10(rawsig[which(rawsig>0)]) # separate the section of (log) signal to N cutoffs Gbins <- seq(min(sig)+0.1,max(sig)-0.1,0.01) # estimate G for each cutoff candidates G <- unlist(lapply(Gbins,maxG, sig)) # select cutoff at the first time G meats its maximum value NCcut<-seq(min(sig)+0.1,max(sig)-0.1,0.01)[which(G==max(G))][1] group_detail <- rep(0, length(rawsig)) group_detail[which(rawsig >= 10**NCcut)] <- 1 # output the grouping result: list contains 3 items # item1: NC cutoff # item2: 2 column for cutoff candidates and corresponded G # item3: Nrow = peak number,Ncolumn = 2, c1 for signal, c2 for group number, 0 for lowHM group (solo, non-classical), 1 for highHM group (ensemble, classical) return(list(NCcut, cbind(Gbins,G), cbind(rawsig, group_detail) )) } trim95 <- function(INdata){ indata <- INdata indata[indata>quantile(indata,0.95)] <- quantile(indata,0.95) return(indata) } # step1 data preprocess peakov <- read.table(paste0(outname,"_peakov.bed")) peaksig <- read.table(paste0(outname,"_TFsig.bed")) signal <- read.table(paste0(outname,"_HMsig.bed")) candidate_list <- as.vector(read.table(paste0(outname,"_cofactor_candidate_list.txt"))[,1]) bedncol <- ncol(peakov) - length(candidate_list) colnames(peakov) <- c(paste0("c",seq(1:bedncol)),candidate_list) colnames(peaksig) <- c(paste0("c",seq(1:bedncol)),candidate_list) colnames(signal) <- 
c(paste0("c",seq(1:bedncol)),signalname) rownames(peakov) <- paste0("r",seq(1,nrow(peakov))) rownames(peaksig) <- paste0("r",seq(1,nrow(peaksig))) rownames(signal) <- paste0("r",seq(1,nrow(peakov))) use_candidate_list <- c() for(i in candidate_list){ SDsig <- sd(as.numeric(peaksig[,i])) if (i != outname || SDsig == 0){ use_candidate_list <- c(use_candidate_list,i) } } TFov <- as.matrix(peakov[,use_candidate_list]) TFov[TFov > 1] <- 1 TFsig_raw <- as.matrix(peakov[,use_candidate_list]) TFsig <- apply(TFsig_raw,2,trim95) HMsig <- as.matrix(signal[,signalname]) colnames(HMsig) <- signalname rownames(HMsig) <- rownames(signal) ### step2, prepare X,Y for model selection lindata <- cbind(HMsig,TFov) bind_sum <- apply(TFov,1,sum) ## sites with 90% factors overlapped are excluded use1_lindata <- lindata[names(bind_sum[which(bind_sum < ncol(TFov)*0.9)]),] ## factors with < 100 or > 95% cobinding events are excluded TF_sum <- apply(use1_lindata[,colnames(TFov)],2,sum) use_lindata <- lindata[names(bind_sum[which(bind_sum < ncol(TFov)*0.9)]),c(colnames(HMsig),names(TF_sum)[which(TF_sum>=100 & TF_sum <= nrow(lindata)*0.95)])] ## form X, Y ### raw Y is used in otsu' method rawY <- as.matrix(use_lindata[,colnames(HMsig)]) colnames(rawY) <- colnames(HMsig) Y <- as.matrix(scale(use_lindata[,colnames(HMsig)])) colnames(Y) <- colnames(HMsig) X_peakOV <- as.matrix(use_lindata[,(ncol(HMsig)+1):ncol(use_lindata)]) peakX <- peakov[rownames(X_peakOV),1:bedncol] X_sig <- as.matrix(TFsig[rownames(X_peakOV),colnames(X_peakOV)]) X_sig[X_peakOV == 0] <- 0 X <- X_sig ### elastic net model selection set.seed(1007) coTFusage <- matrix(rep(0,ncol(X)*ncol(Y)),nrow=ncol(X)) rownames(coTFusage) <- colnames(X) colnames(coTFusage) <- colnames(Y) if(ncol(Y) == 1){ pdf(file=paste0(outname,"_elnet_lambdaSelection.pdf"),height=6,width=6) par(mar=c(4,4,4,2)) }else if(ncol(Y) == 2){ pdf(file=paste0(outname,"_elnet_lambdaSelection.pdf"),height=6,width=12) par(mfrow=c(1,2),mar=c(4,4,4,2)) }else if(ncol(Y) == 
3){ pdf(file=paste0(outname,"_elnet_lambdaSelection.pdf"),height=12,width=12) par(mfrow=c(2,2),mar=c(4,4,4,2)) }else{ pdf(file=paste0(outname,"_elnet_lambdaSelection.pdf"),height=12,width=12) par(mfrow=c(2,2),mar=c(4,4,4,2)) } for(i in 1:ncol(Y)){ thisY <- Y[,i] this_name <- colnames(Y)[i] cv.glmmod <- cv.glmnet(x=X,y=thisY,alpha=Alpha,family="gaussian") coeff_this <- coef(cv.glmmod,s=paste0("lambda.",LambdaChoice)) coTF_name <- rownames(coeff_this)[which(coeff_this[,1] < 0 & rownames(coeff_this)!= "(Intercept)")] coTFusage[coTF_name,this_name] <- 1 ### cross-validation curve for each HM if (i %in% 1:4){ plot(cv.glmmod) title(this_name,line=2.5) if (LambdaChoice == "1se"){ abline(v=log(cv.glmmod$lambda.1se),col="blue",lwd=2) legend("topleft",legend="lambda.1se",lwd=3,bty="n",col='blue') }else{ abline(v=log(cv.glmmod$lambda.min),col="blue",lwd=2) legend("topleft",legend="lambda.min",lwd=3,bty="n",col='blue') } } } dev.off() ### generate R2 and coeff table R2_mat <- matrix(rep(0,ncol(X)*ncol(Y)),nrow=ncol(X)) coeff_mat <- matrix(rep(0,ncol(X)*ncol(Y)),nrow=ncol(X)) rownames(R2_mat) <- colnames(X) colnames(R2_mat) <- colnames(Y) rownames(coeff_mat) <- colnames(X) colnames(coeff_mat) <- colnames(Y) for(TFname in colnames(X)){ for(HMname in colnames(Y)){ unilinResult <- unilinear_only(X[,TFname],Y[,HMname]) R2_mat[TFname,HMname] <- unilinResult[1] coeff_mat[TFname,HMname] <- unilinResult[2] } } summary_table <- c() ### select candidates # for each candidates detected in el-net step, require R2>0.1, coeff<0, R2 for any positive correlated HM signal (coeff>0) < 0.1 for(TFname in colnames(X)){ this_usage <- coTFusage[TFname,] this_R2 <- R2_mat[TFname,] this_coeff <- coeff_mat[TFname,] classicFun <- 0 nonClassicHMidx <- c() for(i in 1:ncol(coTFusage)){ if(this_usage[i]==1 & this_R2[i] >= R2cutoff & this_coeff[i] < 0){ nonClassicHMidx <- c(nonClassicHMidx, i) }else if(this_R2[i] >= R2classic & this_coeff[i] > 0 ){ classicFun <- 1 } } if(classicFun == 0){ for(idx in 
nonClassicHMidx){ nonClassicHM <- colnames(Y)[idx] R2 <- this_R2[idx] coeff <- this_coeff[idx] Pval <- unilinear_permute(X[,TFname],Y[,idx]) summary_table <- rbind(summary_table, c(TFname, nonClassicHM, Pval, R2, coeff)) } } } ### summary and output steps if(is.null(summary_table)){ print("no significant candidates detected") write.table("no non-classical function detected",file=paste0(outname,"_filterNC.txt"),quote=F,sep="\t",row.names=F,col.names=F) write.table("no non-classical function detected",file=paste0(outname,'_NCsummary.txt'),quote=F,sep="\t",row.names=F,col.names=F) }else{ colnames(summary_table) <- c("TFname","HMname","Pval","R2","coeff") if(nrow(summary_table) > 1){ summary_table <- summary_table[order(as.numeric(summary_table[,"R2"]),decreasing=TRUE),] } if(topN == "all"){ topN <- nrow(summary_table) }else{ topN <- as.numeric(topN) } if(topN == 1){ out_table_raw <- summary_table }else{ out_table_raw <- as.matrix(summary_table[1:topN,]) } ## separate NC/C peaks based on HMsignal using otsu's method peakgroup <- c() for(i in 1:ncol(rawY)){ thisY <- rawY[,i] thisY_NCotsu <- signal2cutoff(thisY) thisY_peakgroup <- thisY_NCotsu[[3]][,2] peakgroup <- cbind(peakgroup, thisY_peakgroup) } colnames(peakgroup) <- colnames(Y) ## for each predicted coTFvsHM pair, output the cobinding NC sites as a bed file if(!file.exists("nonClassicalPeaks/")){dir.create("nonClassicalPeaks")} if(topN == 1){ pdf(file=paste0(outname,"_cofactor_HMsignal.pdf"),height=6,width=6) par(mar=c(4,4,2,2)) }else if(topN == 2){ pdf(file=paste0(outname,"_cofactor_HMsignal.pdf"),height=6,width=12) par(mfrow=c(1,2),mar=c(4,4,2,2)) }else{ pdf(file=paste0(outname,"_cofactor_HMsignal.pdf"),height=12,width=12) par(mfrow=c(2,2),mar=c(4,4,2,2)) } num_NCsites <- c() for(topnum in 1:nrow(out_table_raw)){ coTF = out_table_raw[topnum,1] substrateHM = out_table_raw[topnum,2] coTF_cobinding <- X[,coTF] HMnc <- peakgroup[,substrateHM] cobinding_NC_peak <- peakX[which(coTF_cobinding>0 & HMnc == 0),] 
write.table(cobinding_NC_peak, file=paste0("nonClassicalPeaks/",outname,"_",coTF,"_",substrateHM,"_top",topnum,"nonclassical_peaks.bed"),quote=F,sep="\t",row.names=F,col.names=F) num_NCsites <- c(num_NCsites, nrow(cobinding_NC_peak)) if(topnum %in% 1:4){ ## boxplot compare the histone modification signal between on classical and non-classical peaks boxplot(Y[which(coTF_cobinding>0 & HMnc == 0)],Y[which(coTF_cobinding==0 | HMnc > 0)], names=c("non-classical peak","classical peak"),ylab=paste0(substrateHM," signal"),main=paste0(coTF," & ",substrateHM), outline=F,cex.main=1) legend("topleft",legend=paste0("#NCpeak = ",nrow(cobinding_NC_peak)),bty="n") } } dev.off() out_table <- cbind(out_table_raw, num_NCsites) write.table(summary_table,file=paste0(outname,"_filterNC.txt"),quote=F,sep="\t",row.names=F,col.names=T) write.table(out_table,file=paste0(outname,'_NCsummary.txt'),quote=F,sep="\t",row.names=F,col.names=T) } """ createDIR("tmpPackage/") outf = open("tmpPackage/detectNonClassic.r", 'w') outf.write(Rscript) outf.close() cmd = "Rscript %s %s %s %s %s %s %s %s %s" % ( "tmpPackage/detectNonClassic.r", conf_dict['General']['outname'], ",".join(conf_dict['General']['signalname']), conf_dict['options']['Pvalue'], conf_dict['options']['Alpha'], conf_dict['options']['Lambda'], conf_dict['options']['TopNcofactors'], conf_dict['General']['startdir'] + "tmpPackage/", conf_dict['options']['Rcutoff']) #rwlog(cmd,logfile) os.system('echo "[CMD] %s " >> %s' % (cmd, logfile)) tmpobj = sp(cmd) return conf_dict
def step0_check_data(conf_dict, logfile):
    '''
    step0: check the input data and complement the parameters in conf_dict

    Validates the HMR peak bed file, the HM signal bigwig file and the
    folder of cofactor candidate peak files, writes the accepted candidate
    names to <outname>_cofactor_candidate_list.txt, checks the options
    (replacing values that cannot be parsed with the defaults named in the
    log messages), and records the bigWigSummary executable name for the
    detected platform plus a flag for pdflatex availability.

    conf_dict : dict with 'General' and 'options' sections, updated in place
    logfile   : log file handed to wlog / ewlog
    return    : the same conf_dict

    NOTE(review): the checks below carry on after every ewlog call as if
    ewlog aborts the run -- confirm against ewlog's definition.
    '''
    # both aliases point at the nested dicts, so assignments below update conf_dict
    general = conf_dict['General']
    options = conf_dict['options']

    ### check data path , format
    if "~" in general['HMRpeak']:
        ewlog(
            'require absolute path for HMRpeak bed file, HMRpeak file cannot contain "~", current HMRpeak file is %s'
            % (general['HMRpeak']), logfile)
    if "~" in general['signal']:
        ewlog(
            'require absolute path for HMsignal bigwig file, signal file cannot contain "~", current signal file is %s'
            % (general['signal']), logfile)

    # paths that do not start with "/" are taken relative to startdir
    if not general['HMRpeak'].startswith('/'):
        general['HMRpeak'] = general['startdir'] + general['HMRpeak']
    if not general['signal'].startswith('/'):
        general['signal'] = general['startdir'] + general['signal']

    if not os.path.isfile(general['HMRpeak']):
        ewlog("HMRpeak file %s not found" % (general['HMRpeak']), logfile)
    if not os.path.isfile(general['signal']):
        ewlog("signal bw file %s not found" % (general['signal']), logfile)

    if not general['HMRpeak'].endswith('.bed'):
        ewlog('extenion of HMR peak file is not .bed', logfile)
    checkbed = checkbedformat(general['HMRpeak'], 1000)
    if checkbed == "fail":
        ewlog("HMRpeak file is not a bed file", logfile)
    elif checkbed == "lesspeak":
        # logfile was missing from this call; every other ewlog call passes it
        ewlog("HMRpeak file contains less than 1000 peaks", logfile)

    # signalname = file name of the signal file without the bw/bigwig extension
    signal_basename = general['signal'].split("/")[-1]
    if signal_basename.endswith('.bw'):
        general['signalname'] = signal_basename[:-3]
    elif signal_basename.endswith('.bigwig'):
        general['signalname'] = signal_basename[:-7]
    else:
        # NOTE(review): 'signalname' is not assigned in this branch -- confirm
        # that later steps can cope with the missing key
        wlog('[WARNING] extension of signal bw file is not bw/bigwig', logfile)

    ### check TFpeak folder
    if "~" in general['peakFolder']:
        ewlog(
            'require absolute path for peakFolder, peakFolder cannot contain "~", current peakFolder is %s'
            % (general['peakFolder']), logfile)
    if not general['peakFolder'].startswith('/'):
        general['peakFolder'] = general['startdir'] + general['peakFolder']
    if not general['peakFolder'].endswith('/'):
        general['peakFolder'] += "/"
    if not os.path.isdir(general['peakFolder']):
        ewlog("peakFolder %s not found" % (general['peakFolder']), logfile)

    wlog(
        "Check the peak.bed files in the peakFolder, only '.bed' files with >1000 peaks are included in the following analysis",
        logfile)
    general['peakfilenames'] = []
    for f in os.listdir(general['peakFolder']):
        peakfile = general['peakFolder'] + f
        if f.endswith(".bed") and os.path.isfile(peakfile):
            # check the candidate file itself; the HMRpeak file was checked
            # here before, so no candidate was ever actually validated
            if checkbedformat(peakfile, 1000) == "pass":
                general['peakfilenames'].append(f[:-4])

    if not general['peakfilenames']:
        ewlog(
            "no peak file (cofactor candidate) in bed format & >1000peaks are included, exit",
            logfile)
    else:
        wlog(
            "%s peak files (cofactor candidates) are included" %
            (len(general['peakfilenames'])), logfile)

    # one candidate name per line
    with open(general['outname'] + "_cofactor_candidate_list.txt",
              'w') as outf:
        for cofactor in general['peakfilenames']:
            outf.write(cofactor + "\n")

    ### check options
    wlog('check option: ', logfile)

    try:
        ext = int(options['ext'])
    except (TypeError, ValueError):
        wlog(
            "extend length %s is not valid, use default value: 1000bp" %
            (options['ext']), logfile)
        ext = 1000
    else:
        wlog("extend length is %s bp" % (ext), logfile)
    options['ext'] = ext

    # a parsable Pvalue is kept exactly as given; only the fallback replaces it
    try:
        pvalue = float(options['Pvalue'])
    except (TypeError, ValueError):
        wlog(
            "input Pvalue %s is not recognized, use default Pvalue=0.001" %
            (options['Pvalue']), logfile)
        options['Pvalue'] = 0.001
    else:
        wlog("use Pvalue = %s as cutoff" % (str(pvalue)), logfile)
    if float(options['Pvalue']) >= 1:
        wlog(
            "input Pvalue %s is not valid, use default Pvalue=0.001" %
            (options['Pvalue']), logfile)
        options['Pvalue'] = 0.001

    try:
        usealpha = float(options['Alpha'])
    except (TypeError, ValueError):
        wlog(
            "input alpha %s is not valid, use alpha=0.5" %
            (options['Alpha']), logfile)
        # the message promises 0.5, so store it
        options['Alpha'] = 0.5
    else:
        if usealpha >= 1:
            wlog("alpha cannot be >=1, use alpha=0.5", logfile)
            options['Alpha'] = 0.5
        else:
            # the "%" operator was missing here; the resulting TypeError was
            # swallowed and every valid alpha was reported as invalid
            wlog("Alpha = %s" % (str(usealpha)), logfile)
            options['Alpha'] = usealpha

    wlog("Lambda choice is %s" % (options['Lambda']), logfile)

    if options['TopNcofactors'] == "all":
        wlog("all significant co-factors will be output", logfile)
    else:
        try:
            topTF = int(options['TopNcofactors'])
        except (TypeError, ValueError):
            wlog(
                "the topN number %s is not valid, output top5 co-factors" %
                (options['TopNcofactors']), logfile)
            options['TopNcofactors'] = 5
        else:
            wlog(
                "the topN number %s will be output" %
                (options['TopNcofactors']), logfile)
            options['TopNcofactors'] = topTF

    ### choose the bigWigSummary executable name for this platform
    system_name = platform.system()
    if system_name == "Darwin":
        bwsum_software = "bigWigSummary_mac"
    else:
        if system_name != "Linux":
            wlog("detected system is nither linux nor mac, try linux version",
                 logfile)
        bwsum_software = "bigWigSummary_linux"
    general['software'] = bwsum_software

    ### check Rscript
    #if not 'Usage' in sperr('Rscript')[1] and not 'version' in sperr('Rscript')[1]:
    #    ewlog('require Rscript',logfile)

    ### check pdflatex
    # an empty first element in the result of sp() is treated as "pdflatex missing"
    if sp('pdflatex --help')[0] == "":
        wlog(
            'pdflatex was not installed, HMR is still processing but no summary report generated',
            logfile)
        general['latex'] = 0
    else:
        general['latex'] = 1

    return conf_dict