예제 #1
0
def step2_NC_detection(conf_dict,logfile):
    '''
    Analysis step: run the external R script detectNonCanonical.r.

    Builds the command line
        Rscript detectNonCanonical.r outname signalname Pvalue Alpha Lambda TopNcofactors tmpRpackageDIR
    appends it to the log file and executes it through sp().

    Parameters:
        conf_dict: nested configuration dict; reads conf_dict['rscript'],
            the 'General' keys startdir / outname / signalname and the
            'options' keys Pvalue / Alpha / Lambda / TopNcofactors.
        logfile: path of the log file the command line is appended to.

    Returns:
        conf_dict, unchanged.
    '''
    general = conf_dict['General']
    options = conf_dict['options']

    # Directory handed to the R script as its last argument.
    tmp_package_dir = general['startdir'] + "tmpPackage/"
    createDIR(tmp_package_dir)

    # A list of signal names must reach the command line as one
    # comma-separated token; formatting the list directly would embed
    # brackets, quotes and spaces and shift every following argument.
    # NOTE(review): assumes the R script splits this argument on "," -- confirm.
    signalname = general['signalname']
    if isinstance(signalname, (list, tuple)):
        signalname = ",".join(signalname)

    # Eight values -> eight placeholders (the original had only seven, which
    # raised "not all arguments converted during string formatting").
    cmd = "Rscript %s %s %s %s %s %s %s %s" % (
        conf_dict['rscript'] + "detectNonCanonical.r",
        general['outname'],
        signalname,
        options['Pvalue'],
        options['Alpha'],
        options['Lambda'],
        options['TopNcofactors'],
        tmp_package_dir)

    # Append the command to the log directly rather than through a shell
    # "echo", so quotes or "$" inside the command cannot be re-interpreted.
    with open(logfile, 'a') as log_handle:
        log_handle.write("[CMD] %s \n" % cmd)
    sp(cmd)

    return conf_dict
def step0_check_data(conf_dict, logfile):
    '''
    Step0: validate the input data and complete the run parameters.

    Checks the HMR peak file, the histone-mark signal file(s), the cofactor
    peak folder and (in signal mode) the bigWig folder; writes the cofactor
    candidate list to "<outname>_cofactor_candidate_list.txt"; validates the
    numeric options and records the locations of the helper tools.
    Relative paths are resolved by prefixing conf_dict['General']['startdir'].

    Parameters:
        conf_dict: nested configuration dict with 'General' and 'options'
            sections; it is updated in place.
        logfile: log file handed to wlog() / ewlog() (both defined elsewhere;
            ewlog is used for errors and presumably aborts -- confirm).

    Keys written under conf_dict['General']:
        signalname, signalfile, peakfilenames, usefilename, bwsummary,
        bedtools, latex, and bwfilenames when signal mode is kept.
        HMRpeak, peakFolder, bwfolder and mode may be rewritten.

    Returns:
        The updated conf_dict.
    '''
    general = conf_dict['General']
    options = conf_dict['options']

    ### check data path , format ,
    if "~" in general['HMRpeak']:
        ewlog(
            'require absolute path for HMRpeak bed file, HMRpeak file cannot contain "~", current HMRpeak file is %s'
            % (general['HMRpeak']), logfile)

    # The signal entry is iterated as a collection of paths below, so the "~"
    # test has to look inside every path; a plain `"~" in <list>` would only
    # match an element that is exactly "~".
    # NOTE(review): a single string is treated as one path -- confirm callers.
    signal_inputs = general['signal']
    if isinstance(signal_inputs, str):
        signal_inputs = [signal_inputs]
    if any("~" in signal_path for signal_path in signal_inputs):
        ewlog(
            'require absolute path for HMsignal bigwig file, signal file cannot contain "~", current signal file(s): %s'
            % (general['signal'],), logfile)

    if not general['HMRpeak'].startswith('/'):
        general['HMRpeak'] = general['startdir'] + general['HMRpeak']

    if not os.path.isfile(general['HMRpeak']):
        ewlog("HMRpeak file %s not found" % (general['HMRpeak']), logfile)

    if not general['HMRpeak'].endswith('.bed'):
        ewlog('extenion of HMR peak file is not .bed', logfile)
    checkbed = checkbedformat(general['HMRpeak'], 1000)
    if checkbed == "fail":
        ewlog("HMRpeak file is not a bed file", logfile)
    elif checkbed == "lesspeak":
        ewlog("HMRpeak file contains less than 1000 peaks", logfile)

    ### collect usable signal files and derive a display name for each
    general['signalname'] = []
    general['signalfile'] = []
    for bwsignalfile in signal_inputs:
        if not bwsignalfile.startswith('/'):
            bwsignalfile = general['startdir'] + bwsignalfile

        if not os.path.isfile(bwsignalfile):
            wlog("signal bw file %s not found, ignored" % (bwsignalfile),
                 logfile)
            continue

        basename = bwsignalfile.split("/")[-1]
        if bwsignalfile.endswith(".bw"):
            signal_label = basename[:-3]
        elif bwsignalfile.endswith(".bigwig"):
            signal_label = basename[:-7]
        else:
            # Unknown extension: still used, the full file name is the label.
            wlog('[WARNING] extension of signal bw file is not bw/bigwig',
                 logfile)
            signal_label = basename
        general['signalfile'].append(bwsignalfile)
        general['signalname'].append(signal_label)

    if len(general['signalfile']) == 0:
        ewlog("no signal bw file valid, exit", logfile)
    elif len(general['signalfile']) > 4:
        ewlog(
            "maximum signal bw file is limited to 4. There were %s signal file inputed, exit"
            % (len(general['signalfile'])), logfile)

    ### check TFpeak folder
    if "~" in general['peakFolder']:
        ewlog(
            'require absolute path for peak/track Folder, Folder cannot contain "~", current Folder is %s'
            % (general['peakFolder']), logfile)
    if not general['peakFolder'].startswith('/'):
        general['peakFolder'] = general['startdir'] + general['peakFolder']
    if not general['peakFolder'].endswith('/'):
        general['peakFolder'] += "/"
    if not os.path.isdir(general['peakFolder']):
        ewlog("Folder %s not found" % (general['peakFolder']), logfile)

    ### signal mode needs a usable bwfolder, otherwise fall back to binary mode
    if general['mode'] == "signal":
        wlog("signal mode is activated", logfile)
        if general['bwfolder']:
            wlog("bwFolder is specified, checking data for signal mode",
                 logfile)
            if "~" in general['bwfolder']:
                wlog(
                    'require absolute path for bwFolder, bwFolder cannot contain "~", current Folder is %s, use peak mode'
                    % (general['bwfolder']), logfile)
                general['mode'] = "binary"
            else:
                if not general['bwfolder'].startswith('/'):
                    general['bwfolder'] = general['startdir'] + general[
                        'bwfolder']
                if not general['bwfolder'].endswith('/'):
                    general['bwfolder'] += "/"
                if not os.path.isdir(general['bwfolder']):
                    # Report the folder that is actually missing (the
                    # original printed peakFolder here).
                    wlog(
                        "bwFolder %s not found, use binary mode" %
                        (general['bwfolder']), logfile)
                    general['mode'] = "binary"
        else:
            wlog("bwfolder is not specified, use binary mode", logfile)
            general['mode'] = "binary"
    else:
        wlog("binary mode is activaed", logfile)

    wlog(
        "Check the peak.bed files in the Folder, only '.bed' files with >1000 peaks are included in the following analysis",
        logfile)
    general['peakfilenames'] = []
    for f in os.listdir(general['peakFolder']):
        peakfile = general['peakFolder'] + f
        if f.endswith(".bed") and os.path.isfile(peakfile):
            # Validate the candidate file itself; the original re-checked the
            # HMRpeak file here, so every .bed file in the folder passed.
            checkbed = checkbedformat(peakfile, 1000)
            if checkbed == "pass":
                general['peakfilenames'].append(f[:-4])

    if len(general['peakfilenames']) == 0:
        ewlog(
            "no peak file (cofactor candidate) in (bed format & >1000peaks) are included, exit",
            logfile)

    if general['mode'] == "signal":
        general['bwfilenames'] = []
        for f in os.listdir(general['bwfolder']):
            if f.endswith(".bw") and os.path.isfile(general['bwfolder'] + f):
                general['bwfilenames'].append(f[:-3])
        ### compare the name from bwfiles and peak files
        general['usefilename'] = [
            name for name in general['bwfilenames']
            if name in general['peakfilenames']
        ]
        ### if less than 50% peakfiles share name with bwfiles, change back to peak mode
        if len(general['usefilename']) < len(general['peakfilenames']) * 0.5:
            general['mode'] = "binary"
            wlog(
                "the number of shared peak&bw files is less than half of the number of peakfiles, use binary mode",
                logfile)
        else:
            wlog("all checks for signal mode passed, use signal mode", logfile)

    if general['mode'] == "binary":
        general['usefilename'] = general['peakfilenames']

    wlog(
        "%s cofactor candidates are included" % (len(general['usefilename'])),
        logfile)

    # One candidate name per line; `with` closes the file even on error.
    with open(general['outname'] + "_cofactor_candidate_list.txt",
              'w') as outf:
        for cofactor in general['usefilename']:
            outf.write(cofactor + "\n")

    ### check options
    wlog('check option: ', logfile)

    try:
        ext = int(options['ext'])
    except (TypeError, ValueError):
        wlog(
            "extend length %s is not valid, use default value: 1000bp" %
            (options['ext']), logfile)
        options['ext'] = 1000
    else:
        wlog("extend length for HMsignal is %s bp" % (ext), logfile)
        options['ext'] = ext

    try:
        pvalue_text = str(float(options['Pvalue']))
    except (TypeError, ValueError):
        wlog(
            "input Pvalue %s is not recognized, use default Pvalue=0.001" %
            (options['Pvalue']), logfile)
        options['Pvalue'] = 0.001
    else:
        wlog("use Pvalue = %s as cutoff" % (pvalue_text), logfile)
    if float(options['Pvalue']) >= 1:
        wlog(
            "input Pvalue %s is not valid, use default Pvalue=0.001" %
            (options['Pvalue']), logfile)
        options['Pvalue'] = 0.001

    try:
        usealpha = float(options['Alpha'])
    except (TypeError, ValueError):
        wlog(
            "input alpha (for elastic-net) %s is not valid, use alpha=0.5" %
            (options['Alpha']), logfile)
        # Apply the default that the message announces (the original only
        # logged it and left the invalid value in place).
        options['Alpha'] = 0.5
    else:
        if usealpha >= 1:
            wlog("alpha (for elastic-net) cannot be >=1, use alpha=0.5",
                 logfile)
            options['Alpha'] = 0.5
        else:
            wlog("Alpha (for elastic-net) = %s" % (str(usealpha)), logfile)
            options['Alpha'] = usealpha

    wlog("Lambda choice is %s" % (options['Lambda']), logfile)
    if options['TopNcofactors'] == "all":
        wlog("all significant co-factors will be output", logfile)
    else:
        try:
            topTF = int(options['TopNcofactors'])
        except (TypeError, ValueError):
            wlog(
                "the topN number %s is not valid, output top5 co-factors" %
                (options['TopNcofactors']), logfile)
            options['TopNcofactors'] = 5
        else:
            wlog(
                "the topN number %s will be output" %
                (options['TopNcofactors']), logfile)
            options['TopNcofactors'] = topTF

    ### pick the bigWigSummary binary name matching the operating system
    OS = platform.system()
    if OS == "Linux":
        bwsum_software = "bigWigSummary_linux"
    elif OS == "Darwin":
        bwsum_software = "bigWigSummary_mac"
    else:
        wlog(
            "detected system is nither linux nor mac, try linux version of bigWigSummary",
            logfile)
        bwsum_software = "bigWigSummary_linux"

    general['bwsummary'] = HMRpipe.__path__[0] + "/%s" % bwsum_software
    # Use the bedtools file inside the HMRpipe package directory when it
    # exists, otherwise the plain command name "bedtools".
    if os.path.isfile(HMRpipe.__path__[0] + "/bedtools"):
        general['bedtools'] = HMRpipe.__path__[0] + "/bedtools"
    else:
        general['bedtools'] = "bedtools"

    ### check pdflatex
    # An empty first element from sp() is taken to mean pdflatex is missing.
    if sp('pdflatex --help')[0] == "":
        wlog(
            'pdflatex was not installed, ncHMR_detector is still processing but no summary report generated',
            logfile)
        general['latex'] = 0
    else:
        general['latex'] = 1

    return conf_dict
def step3_summary(conf_dict, logfile):
    '''
    Step3: collect the result files and write the LaTeX summary report.

    Moves the result files of the run into "summary/" and "tmpResults/",
    assembles a LaTeX document describing parameters, the elastic-net
    cross-validation figure, the detected co-factors and the output files,
    writes it to "summary/<outname>_summary.tex" and, when
    conf_dict['General']['latex'] == 1, runs pdflatex on it.

    Parameters:
        conf_dict: nested configuration dict; reads 'General' (outname,
            HMRpeak, mode, signalname, usefilename / peakfilenames, latex)
            and 'options' (ext, Alpha, Pvalue, TopNcofactors).
        logfile: log file handed to wlog().

    Returns:
        conf_dict, unchanged.

    The working directory on return is the same as on entry.
    '''
    general = conf_dict['General']
    options = conf_dict['options']
    outname = general['outname']

    wlog('collect results', logfile)
    # Report material goes to summary/.
    summarydir = 'summary/'
    createDIR(summarydir)
    sp("mv %s_NCsummary.txt %s" % (outname, summarydir))
    sp("mv %s_elnet_lambdaSelection.pdf %s" % (outname, summarydir))
    # The boxplot pdf is moved only if the file exists.
    if os.path.isfile("%s_cofactor_HMsignal.pdf" % outname):
        sp("mv %s_cofactor_HMsignal.pdf %s" % (outname, summarydir))

    # Intermediate tables go to tmpResults/.
    tmpresult = 'tmpResults/'
    createDIR(tmpresult)
    for suffix in ("_HMsig.bed", "_peakov.bed",
                   "_cofactor_candidate_list.txt", "_filterNC.txt"):
        sp("mv %s%s %s" % (outname, suffix, tmpresult))

    wlog('generate summary documents', logfile)
    # All LaTeX templates are raw strings: backslashes are literal, so no
    # sequence such as "\u" can be misread as a Python escape.
    ### initiate
    QCdoc = r"""\documentclass[11pt,a4paper]{article}
\usepackage{tabularx}
\usepackage[english]{babel}
\usepackage{array}
\usepackage{graphicx}
\usepackage{color}
\DeclareGraphicsExtensions{.eps,.png,.pdf,.ps}
\begin{document}
\title{Summary reports of non-classical function detection of : %s}

\vspace{-1cm}
\maketitle
\tableofcontents
\newpage
\newpage
\section{Data description}
\begin{quotation}
Table 1 mainly describes the input files, parameters and options.
\end{quotation}
\begin{table}[h]
\small
\caption{parameter description}\label{bstable}
\begin{tabularx}{\textwidth}{ |X|l| }

""" % (strlatexformat(outname))

    ### table1 prepare parameter
    # Count the candidates that were actually used (the list written to the
    # candidate file in step0); fall back to peakfilenames if it is absent.
    if 'usefilename' in general:
        NcoTF = len(general['usefilename'])
    else:
        NcoTF = len(general['peakfilenames'])
    QCdoc += r"""      
\hline
parameter & value  \\
\hline
output name & %s \\
\hline
HMRpeak(peak filename) & %s \\
\hline
mode & %s \\
\hline
HM signal(bw filename) & \begin{tabular}[c]{@{}l@{}}%s\end{tabular}  \\
\hline
\#cofactor candidates & %s \\
\hline
options & value \\
\hline
extend size & %sbp \\
\hline
Alpha (Elastic net) & %s \\
\hline
Pvalue cutoff & %s \\
\hline
topN cofactors & %s \\
\hline
""" % (strlatexformat(outname),
       strlatexformat(general['HMRpeak'].split("/")[-1]),
       general['mode'],
       strlatexformat("\\\\ ".join(general['signalname'])),
       str(NcoTF), str(options['ext']),
       str(options['Alpha']), str(options['Pvalue']),
       str(options['TopNcofactors']))
    QCdoc += r"""
\end{tabularx}
\end{table}
"""
    ### cross validation in elastic net
    QCdoc += r"""
\newpage
\newpage
\section{ElasticNet co-factor selection}
In this step we use a feature selection (elastic-net. Zou, H. and Hastie T. (2005) to select potential co-factors which corresponded to the non-classical function. Below shows the cross-validation curve for the decison of lambda in elastic-net for each histone modification substrate.  
\begin{figure}[h]
        \caption{cross-validation curve for lambda decision} \label{fig:profileunion}
        \setlength{\abovecaptionskip}{0pt}
        \setlength{\belowcaptionskip}{10pt}
        \centering
        {\includegraphics[width=0.8\textwidth]{%s}}
\end{figure}
""" % (outname + "_elnet_lambdaSelection.pdf")

    # Read the summary table once; the file is closed on every path.
    with open(summarydir + outname + "_NCsummary.txt") as inf_ncsummary:
        summary_lines = inf_ncsummary.readlines()
    first_line = summary_lines[0] if summary_lines else ""

    if first_line.startswith("no non-classical function detected"):
        QCdoc += r"""
\newpage
\newpage
\section{potential co-factors corresponded to non-classical function}
No significant co-factor was detected, indicating that the non-classical function of the HMR was not exist or none of the existing factor candidates act as a co-factor of the non-classical function.
"""
    else:
        # Both counts are "lines in the table minus the header line".
        n_predicted = int(
            sp("wc -l %s%s_filterNC.txt" %
               (tmpresult, outname))[0].split()[0]) - 1
        n_listed = int(
            sp("wc -l %s%s_NCsummary.txt" %
               (summarydir, outname))[0].split()[0]) - 1
        QCdoc += r"""
\newpage
\newpage
\section{potential co-factors corresponded to non-classical function}
In summary, %s factors were predicted to potentially act as a co-factor of the non-classical function. The top%s co-factors were listed.
\subsection{summary of co-factors}
\begin{quotation}
The corresponded histone modification substrate (HMsubstrate), empirical P-value, R-square (ordered) and the number of non-classical (NC) sites for each potential co-factor were listed below. The empirical P-value was calculated based on the comparison of foreground (observed) R-square and background R-square (distribution of random R-square generated from the 1,000 permutations of co-binding events) for each potential co-factor. The non-classical (NC) sites were defined by lower HMsubstrate signal (using Otus' method) and co-binding events of each potential co-factor.
\end{quotation}
\begin{table}[h]
\small
\caption{cofactor summary}\label{bstable}
\begin{tabular}{ |l|l|l|l|l| }
    
\hline
co-factor & HMsubstrate & Pval & Rsquare & NCsites \\
""" % (n_predicted, n_listed)

        # The first line was consumed above; the remaining lines are rows.
        for line in summary_lines[1:]:
            if line.startswith("TFname"):
                continue
            ll = line.split()
            if not ll:
                # a blank line carries no table row
                continue
            # Fields used: 0, 1, 2, 3 (rounded to 3 digits) and 5; field 4
            # is not shown in the report.
            QCdoc += r"""\hline
%s & %s & %s & %s & %s \\
""" % (strlatexformat(ll[0]), strlatexformat(ll[1]), ll[2],
       round(float(ll[3]), 3), ll[5])

        QCdoc += r"""
\hline
\end{tabular}
\end{table}
\newpage
\newpage
\subsection{Boxplot of HM on non-classical and classic sites}
\begin{quotation}
Boxplot was generated to compare the difference of the histone mark (HM) signal on either non-classical or classic sites(peak). The non-classical sites were defined by lower HM signal (using Otus' method) and co-binding events of each potential co-factor. The boxplot corresponded to top co-factors were displayed.  
\end{quotation}
\begin{figure}[h]
        \caption{boxplot cofactor HMsignal} \label{fig:profileunion}
        \setlength{\abovecaptionskip}{0pt}
        \setlength{\belowcaptionskip}{10pt}
        \centering
        {\includegraphics[width=0.8\textwidth]{%s}}
\end{figure}
""" % (outname + "_cofactor_HMsignal.pdf")

    QCdoc += r"""
\newpage
\newpage
\section{Output list}
\begin{quotation}
All the main output files were described in the following table
\end{quotation}
\begin{table}[h]
\small
\caption{output list}\label{bstable}
\begin{tabular}{ |l|l| }
    
\hline
description & filename \\
\hline
summary table of non-classical (NC) function & summary/%s \\
\hline
summary report (this doc) & summary/%s \\
\hline
cobinding matrix on HMR peaks & tmpResults/%s \\
\hline
histone mark signal on HMR peaks & tmpResults/%s \\
\hline

\end{tabular}
\end{table} 
\end{document} 

""" % (strlatexformat(outname + "_NCsummary.txt"),
       strlatexformat(outname + "_summary.pdf"),
       strlatexformat(outname + "_peakov.bed"),
       strlatexformat(outname + "_HMsig.bed"))

    latexfile = outname + '_summary.tex'
    with open(summarydir + latexfile, 'w') as outf:
        outf.write(QCdoc)

    cmd = "pdflatex %s" % (latexfile)
    cmd2 = 'cp %s ../' % (outname + '_summary.pdf')
    if general['latex'] == 1:
        wlog(
            'pdflatex was detected in default PATH, generate summary report %s'
            % (outname + '_summary.pdf'), logfile)
        # pdflatex is run from inside summary/; always return to the starting
        # directory afterwards, even if one of the commands raises.
        workdir = os.getcwd()
        os.chdir(summarydir)
        try:
            # The command is issued twice, as in the original (presumably so
            # the table of contents is resolved on the second pass).
            sp(cmd)
            sp(cmd)
            sp(cmd2)
            for aux_ext in ("aux", "log", "toc"):
                sp("rm %s_summary.%s" % (outname, aux_ext))
        finally:
            os.chdir(workdir)
    else:
        # No directory change happened in this branch, so none is undone
        # (the original unconditionally moved to "../" and left the process
        # one level above where it started).
        wlog(
            'pdflatex was not detected in default PATH, generate summary report .tex file in summary/ folder, you can move the whole summary/ folder to the environment with pdflatex installed and run cmd in the summary/ folder: "pdflatex %s"'
            % (outname + '_summary.tex'), logfile)

    wlog('Step3 summary DONE, check %s for final outputs' % (summarydir),
         logfile)

    return conf_dict
def step2_NC_detection(conf_dict, logfile):
    '''
    analysis part, mainly Rscript
    Detect 
    '''

    # Rscript detectNonClassic.r outname signalname usePQ cutoff alpha lambdachoice topN tmpRpackgeDIR
    if conf_dict['General']['mode'] == "binary":
        Rscript = """
### read parameter
a<-commandArgs(T)

outname <- a[1]
signalname <- unlist(strsplit(a[2],","))
cutoff <- as.numeric(a[3])
Alpha <- as.numeric(a[4])
LambdaChoice <- (a[5])
topN <- a[6]
R2cutoff <- as.numeric(a[8])
R2classic <- 0.1
tmp_rpackage_dir <- a[7]

### install R packages
if("foreach" %in% installed.packages()[,"Package"]){
    library(foreach)
}else{
    install.packages("foreach",dependencies=TRUE,lib=tmp_rpackage_dir,repos="https://mirrors.tongji.edu.cn/CRAN/")
    library(foreach,lib.loc=tmp_rpackage_dir)
}

if("glmnet" %in% installed.packages()[,"Package"]){
    library(glmnet)
}else{
    install.packages("glmnet",dependencies=TRUE,lib=tmp_rpackage_dir,repos="https://mirrors.tongji.edu.cn/CRAN/")
    library(glmnet,lib.loc=tmp_rpackage_dir)
}

library(methods)

### define functions
unilinear_only <- function(TFov,HMsig){
    # univariate linear regression 
    lmresult <- summary(lm(HMsig ~ TFov))
    fgR2 <- lmresult$adj.r.squared
    lmcoeff <- lmresult$coefficients['TFov','Estimate']
    lmP <- lmresult$coefficients['TFov',c('Pr(>|t|)')]
    return(c(fgR2,lmcoeff))
}

unilinear_permute <- function(TFov,HMsig){
    # univariate linear regression + empirical pvalue
    lmresult <- summary(lm(HMsig ~ TFov))
    fgR2 <- lmresult$adj.r.squared
    bgR2s <- c()
    for(i in 1:999){
        bgTFov <- rep(0,length(TFov))
        bgTFov[sample(length(TFov),length(which(TFov > 0)))] <- 1
        bgR2 <- summary(lm(HMsig ~ bgTFov))$adj.r.squared
        bgR2s <- c(bgR2s,bgR2)
    }
    permuteP <- (length(which(fgR2 < bgR2s))+1)/(length(bgR2s)+1)
    return(permuteP)
}

maxG<-function(cutoff,usedata){
    # function of estimating G given cutoff
    # G : inter-class variance
    # method refer to "Otus' method"
    w0 <- length(which(usedata < cutoff))/length(usedata)
    w1 <- length(which(usedata >= cutoff))/length(usedata)
    u0 <- mean(usedata[which(usedata < cutoff)])
    u1 <- mean(usedata[which(usedata >= cutoff)])
    g <- w0*w1*(u0-u1)**2
    return(g)
}

signal2cutoff <- function(rawsig){
    # function for separate NC peaks considering HMsignal
    # input data: a vector of signal (HMsignal on HMR peak, linear scale)
    # method: go through all possible cutoff, find the cutoff corresponding to maximum G
    # output: the cutoff, a vector of selected cutoff candidates (Gbins) and a vector of G value for each cutoff candidates (G) 
    
    sig <- log10(rawsig[which(rawsig>0)])
    # separate the section of (log) signal to N cutoffs 
    Gbins <- seq(min(sig)+0.1,max(sig)-0.1,0.01)
    
    # estimate G for each cutoff candidates
    G <- unlist(lapply(Gbins,maxG, sig))
    
    # select cutoff at the first time G meats its maximum value
    NCcut<-seq(min(sig)+0.1,max(sig)-0.1,0.01)[which(G==max(G))][1]

    group_detail <- rep(0, length(rawsig))
    group_detail[which(rawsig >= 10**NCcut)] <- 1
    
    # output the grouping result: list contains 3 items 
    # item1: NC cutoff
    # item2: 2 column for cutoff candidates and corresponded G
    # item3: Nrow = peak number,Ncolumn = 2, c1 for signal, c2 for group number, 0 for lowHM group (solo, non-classical), 1 for highHM group (ensemble, classical)
    
    return(list(NCcut, cbind(Gbins,G), cbind(rawsig, group_detail) ))
}

# step1 data preprocess
peakov <- read.table(paste0(outname,"_peakov.bed"))
signal <- read.table(paste0(outname,"_HMsig.bed"))

candidate_list <- as.vector(read.table(paste0(outname,"_cofactor_candidate_list.txt"))[,1])
bedncol <- ncol(peakov) - length(candidate_list)
colnames(peakov) <- c(paste0("c",seq(1:bedncol)),candidate_list)
colnames(signal) <- c(paste0("c",seq(1:bedncol)),signalname)
rownames(peakov) <- paste0("r",seq(1,nrow(peakov)))
rownames(signal) <- paste0("r",seq(1,nrow(peakov)))

TFov <- peakov[,candidate_list]
TFov[TFov > 1] <- 1
HMsig <- as.matrix(signal[,signalname])
colnames(HMsig) <- signalname
rownames(HMsig) <- rownames(signal)


### step2, prepare X,Y for model selection 
lindata <- cbind(HMsig,TFov)
bind_sum <- apply(TFov,1,sum)
## sites with 90% factors overlapped are excluded
use1_lindata <- lindata[names(bind_sum[which(bind_sum < ncol(TFov)*0.9)]),]
## factors with < 100 or > 95% cobinding events are excluded
TF_sum <- apply(use1_lindata[,colnames(TFov)],2,sum)
use_lindata <- lindata[names(bind_sum[which(bind_sum < ncol(TFov)*0.9)]),c(colnames(HMsig),names(TF_sum)[which(TF_sum>=100 & TF_sum <= nrow(use1_lindata)*0.95)])]
## form X, Y
### raw Y is used in otsu' method
rawY <- as.matrix(use_lindata[,colnames(HMsig)])
colnames(rawY) <- colnames(HMsig)
Y <- as.matrix(scale(use_lindata[,colnames(HMsig)]))
colnames(Y) <- colnames(HMsig)
X <- as.matrix(use_lindata[,(ncol(HMsig)+1):ncol(use_lindata)])
peakX <- peakov[rownames(X),1:bedncol]


### elastic net model selection
set.seed(1007)
coTFusage <- matrix(rep(0,ncol(X)*ncol(Y)),nrow=ncol(X))
rownames(coTFusage) <- colnames(X)
colnames(coTFusage) <- colnames(Y)

if(ncol(Y) == 1){
    pdf(file=paste0(outname,"_elnet_lambdaSelection.pdf"),height=6,width=6)
    par(mar=c(4,4,4,2))    
}else if(ncol(Y) == 2){
    pdf(file=paste0(outname,"_elnet_lambdaSelection.pdf"),height=6,width=12)
    par(mfrow=c(1,2),mar=c(4,4,4,2))
}else if(ncol(Y) == 3){
    pdf(file=paste0(outname,"_elnet_lambdaSelection.pdf"),height=12,width=12)
    par(mfrow=c(2,2),mar=c(4,4,4,2))
}else{
    pdf(file=paste0(outname,"_elnet_lambdaSelection.pdf"),height=12,width=12)
    par(mfrow=c(2,2),mar=c(4,4,4,2))
}

for(i in 1:ncol(Y)){
    thisY <- Y[,i]
    this_name <- colnames(Y)[i]
    cv.glmmod <- cv.glmnet(x=X,y=thisY,alpha=Alpha,family="gaussian")
    coeff_this <- coef(cv.glmmod,s=paste0("lambda.",LambdaChoice)) 
    coTF_name <- rownames(coeff_this)[which(coeff_this[,1] < 0 & rownames(coeff_this)!= "(Intercept)")]

    coTFusage[coTF_name,this_name] <- 1

    ### cross-validation curve for each HM
    if (i %in% 1:4){
        plot(cv.glmmod)
        title(this_name,line=2.5)
        if (LambdaChoice == "1se"){
            abline(v=log(cv.glmmod$lambda.1se),col="blue",lwd=2)
            legend("topleft",legend="lambda.1se",lwd=3,bty="n",col='blue')
        }else{
            abline(v=log(cv.glmmod$lambda.min),col="blue",lwd=2)
            legend("topleft",legend="lambda.min",lwd=3,bty="n",col='blue')
        }
    }
}

dev.off()

### generate R2 and coeff table
R2_mat <- matrix(rep(0,ncol(X)*ncol(Y)),nrow=ncol(X))
coeff_mat <- matrix(rep(0,ncol(X)*ncol(Y)),nrow=ncol(X))
rownames(R2_mat) <- colnames(X)
colnames(R2_mat) <- colnames(Y)
rownames(coeff_mat) <- colnames(X)
colnames(coeff_mat) <- colnames(Y)
for(TFname in colnames(X)){
    for(HMname in colnames(Y)){
        unilinResult <- unilinear_only(X[,TFname],Y[,HMname])
        R2_mat[TFname,HMname] <- unilinResult[1]
        coeff_mat[TFname,HMname] <- unilinResult[2]
    }
}

summary_table <- c()
### select candidates
# for each candidates detected in el-net step, require R2>0.1, coeff<0, R2 for any positive correlated HM signal (coeff>0) < 0.1 
for(TFname in colnames(X)){
    this_usage <- coTFusage[TFname,]
    this_R2 <- R2_mat[TFname,]
    this_coeff <- coeff_mat[TFname,]

    classicFun <- 0
    nonClassicHMidx <- c()

    for(i in 1:ncol(coTFusage)){
        if(this_usage[i]==1 & this_R2[i] >= R2cutoff & this_coeff[i] < 0){
            nonClassicHMidx <- c(nonClassicHMidx, i)
        }else if(this_R2[i] >= R2classic & this_coeff[i] > 0 ){
            classicFun <- 1
        }
    }

    if(classicFun == 0){
        for(idx in nonClassicHMidx){
            nonClassicHM <- colnames(Y)[idx]
            R2 <- this_R2[idx]
            coeff <- this_coeff[idx]
            Pval <- unilinear_permute(X[,TFname],Y[,idx])
            summary_table <- rbind(summary_table, c(TFname, nonClassicHM, Pval, R2, coeff))
        }        
    }
}

### summary and output steps
if(is.null(summary_table)){
    print("no significant candidates detected")
    write.table("no non-classical function detected",file=paste0(outname,"_filterNC.txt"),quote=F,sep="\t",row.names=F,col.names=F)
    write.table("no non-classical function detected",file=paste0(outname,'_NCsummary.txt'),quote=F,sep="\t",row.names=F,col.names=F)
}else{
    colnames(summary_table) <- c("TFname","HMname","Pval","R2","coeff")
    if(nrow(summary_table) > 1){
        summary_table <- summary_table[order(as.numeric(summary_table[,"R2"]),decreasing=TRUE),]
    }

    if(topN == "all"){
        topN <- nrow(summary_table)
    }else{
        topN <- as.numeric(topN)
    }
    
    if(topN == 1){
        out_table_raw <- summary_table
    }else{
        out_table_raw <- as.matrix(summary_table[1:topN,])
    }
    
    ## separate NC/C peaks based on HMsignal using otsu's method
    peakgroup <- c()
    for(i in 1:ncol(rawY)){
        thisY <- rawY[,i]
        thisY_NCotsu <- signal2cutoff(thisY)
        thisY_peakgroup <- thisY_NCotsu[[3]][,2]
        peakgroup <- cbind(peakgroup, thisY_peakgroup)
    }
    colnames(peakgroup) <- colnames(Y)

    ## for each predicted coTFvsHM pair, output the cobinding NC sites as a bed file
    if(!file.exists("nonClassicalPeaks/")){dir.create("nonClassicalPeaks")}
    if(topN == 1){
        pdf(file=paste0(outname,"_cofactor_HMsignal.pdf"),height=6,width=6)
        par(mar=c(4,4,2,2))    
    }else if(topN == 2){
        pdf(file=paste0(outname,"_cofactor_HMsignal.pdf"),height=6,width=12)
        par(mfrow=c(1,2),mar=c(4,4,2,2))
    }else{
        pdf(file=paste0(outname,"_cofactor_HMsignal.pdf"),height=12,width=12)
        par(mfrow=c(2,2),mar=c(4,4,2,2))
    }
    num_NCsites <- c()
    for(topnum in 1:nrow(out_table_raw)){
        coTF = out_table_raw[topnum,1]
        substrateHM = out_table_raw[topnum,2]
        coTF_cobinding <- X[,coTF]
        HMnc <- peakgroup[,substrateHM]
        cobinding_NC_peak <- peakX[which(coTF_cobinding>0 & HMnc == 0),]
        write.table(cobinding_NC_peak, file=paste0("nonClassicalPeaks/",outname,"_",coTF,"_",substrateHM,"_top",topnum,"nonclassical_peaks.bed"),quote=F,sep="\t",row.names=F,col.names=F)
        num_NCsites <- c(num_NCsites, nrow(cobinding_NC_peak))
        if(topnum %in% 1:4){
            ## boxplot compare the histone modification signal between on classical and non-classical peaks
            boxplot(Y[which(coTF_cobinding>0 & HMnc == 0)],Y[which(coTF_cobinding==0 | HMnc > 0)],
                names=c("non-classical peak","classical peak"),ylab=paste0(substrateHM," signal"),main=paste0(coTF," & ",substrateHM),
                outline=F,cex.main=1)
            legend("topleft",legend=paste0("#NCpeak = ",nrow(cobinding_NC_peak)),bty="n")
        }
    }
    dev.off()
    
    out_table <- cbind(out_table_raw, num_NCsites)
    write.table(summary_table,file=paste0(outname,"_filterNC.txt"),quote=F,sep="\t",row.names=F,col.names=T)
    write.table(out_table,file=paste0(outname,'_NCsummary.txt'),quote=F,sep="\t",row.names=F,col.names=T)
} 
  
"""
# Rscript detectNonClassic.r outname signalname usePQ cutoff alpha lambdachoice topN #tmpRpackgeDIR
    else:  # conf_dict['General']['mode'] == "signal":
        Rscript = """
### read parameter
a<-commandArgs(T)

outname <- a[1]
signalname <- unlist(strsplit(a[2],","))
cutoff <- as.numeric(a[3])
Alpha <- as.numeric(a[4])
LambdaChoice <- (a[5])
topN <- a[6]
R2cutoff <- as.numeric(a[8])
R2classic <- 0.1
tmp_rpackage_dir <- a[7]

### install R packages
if("foreach" %in% installed.packages()[,"Package"]){
    library(foreach)
}else{
    install.packages("foreach",dependencies=TRUE,lib=tmp_rpackage_dir,repos="https://mirrors.tongji.edu.cn/CRAN/")
    library(foreach,lib.loc=tmp_rpackage_dir)
}

if("glmnet" %in% installed.packages()[,"Package"]){
    library(glmnet)
}else{
    install.packages("glmnet",dependencies=TRUE,lib=tmp_rpackage_dir,repos="https://mirrors.tongji.edu.cn/CRAN/")
    library(glmnet,lib.loc=tmp_rpackage_dir)
}

library(methods)

### define functions
unilinear_only <- function(TFov,HMsig){
    # univariate linear regression 
    lmresult <- summary(lm(HMsig ~ TFov))
    fgR2 <- lmresult$adj.r.squared
    lmcoeff <- lmresult$coefficients['TFov','Estimate']
    lmP <- lmresult$coefficients['TFov',c('Pr(>|t|)')]
    return(c(fgR2,lmcoeff))
}

unilinear_permute <- function(TFov,HMsig){
    # univariate linear regression + empirical pvalue
    lmresult <- summary(lm(HMsig ~ TFov))
    fgR2 <- lmresult$adj.r.squared
    bgR2s <- c()
    for(i in 1:999){
        bgTFov <- rep(0,length(TFov))
        bgTFov[sample(length(TFov),length(which(TFov > 0)))] <- 1
        bgR2 <- summary(lm(HMsig ~ bgTFov))$adj.r.squared
        bgR2s <- c(bgR2s,bgR2)
    }
    permuteP <- (length(which(fgR2 < bgR2s))+1)/(length(bgR2s)+1)
    return(permuteP)
}

maxG<-function(cutoff,usedata){
    # function of estimating G given cutoff
    # G : inter-class variance
    # method refer to "Otus' method"
    w0 <- length(which(usedata < cutoff))/length(usedata)
    w1 <- length(which(usedata >= cutoff))/length(usedata)
    u0 <- mean(usedata[which(usedata < cutoff)])
    u1 <- mean(usedata[which(usedata >= cutoff)])
    g <- w0*w1*(u0-u1)**2
    return(g)
}

signal2cutoff <- function(rawsig){
    # function for separate NC peaks considering HMsignal
    # input data: a vector of signal (HMsignal on HMR peak, linear scale)
    # method: go through all possible cutoff, find the cutoff corresponding to maximum G
    # output: the cutoff, a vector of selected cutoff candidates (Gbins) and a vector of G value for each cutoff candidates (G) 
    
    sig <- log10(rawsig[which(rawsig>0)])
    # separate the section of (log) signal to N cutoffs 
    Gbins <- seq(min(sig)+0.1,max(sig)-0.1,0.01)
    
    # estimate G for each cutoff candidates
    G <- unlist(lapply(Gbins,maxG, sig))
    
    # select cutoff at the first time G meats its maximum value
    NCcut<-seq(min(sig)+0.1,max(sig)-0.1,0.01)[which(G==max(G))][1]

    group_detail <- rep(0, length(rawsig))
    group_detail[which(rawsig >= 10**NCcut)] <- 1
    
    # output the grouping result: list contains 3 items 
    # item1: NC cutoff
    # item2: 2 column for cutoff candidates and corresponded G
    # item3: Nrow = peak number,Ncolumn = 2, c1 for signal, c2 for group number, 0 for lowHM group (solo, non-classical), 1 for highHM group (ensemble, classical)
    
    return(list(NCcut, cbind(Gbins,G), cbind(rawsig, group_detail) ))
}

trim95 <- function(INdata){
	indata <- INdata
	indata[indata>quantile(indata,0.95)] <- quantile(indata,0.95)
	return(indata)
}
# step1 data preprocess
peakov <- read.table(paste0(outname,"_peakov.bed"))
peaksig <- read.table(paste0(outname,"_TFsig.bed"))
signal <- read.table(paste0(outname,"_HMsig.bed"))

candidate_list <- as.vector(read.table(paste0(outname,"_cofactor_candidate_list.txt"))[,1])
bedncol <- ncol(peakov) - length(candidate_list)
colnames(peakov) <- c(paste0("c",seq(1:bedncol)),candidate_list)
colnames(peaksig) <- c(paste0("c",seq(1:bedncol)),candidate_list)
colnames(signal) <- c(paste0("c",seq(1:bedncol)),signalname)

rownames(peakov) <- paste0("r",seq(1,nrow(peakov)))
rownames(peaksig) <- paste0("r",seq(1,nrow(peaksig)))
rownames(signal) <- paste0("r",seq(1,nrow(peakov)))

use_candidate_list <- c()
for(i in candidate_list){
	SDsig <- sd(as.numeric(peaksig[,i]))
    if (i != outname || SDsig == 0){
        use_candidate_list <- c(use_candidate_list,i)
    }
}

TFov <- as.matrix(peakov[,use_candidate_list])
TFov[TFov > 1] <- 1
TFsig_raw <- as.matrix(peakov[,use_candidate_list])
TFsig <- apply(TFsig_raw,2,trim95)

HMsig <- as.matrix(signal[,signalname])
colnames(HMsig) <- signalname
rownames(HMsig) <- rownames(signal)


### step2, prepare X,Y for model selection 
lindata <- cbind(HMsig,TFov)
bind_sum <- apply(TFov,1,sum)
## sites with 90% factors overlapped are excluded
use1_lindata <- lindata[names(bind_sum[which(bind_sum < ncol(TFov)*0.9)]),]
## factors with < 100 or > 95% cobinding events are excluded
TF_sum <- apply(use1_lindata[,colnames(TFov)],2,sum)
use_lindata <- lindata[names(bind_sum[which(bind_sum < ncol(TFov)*0.9)]),c(colnames(HMsig),names(TF_sum)[which(TF_sum>=100 & TF_sum <= nrow(lindata)*0.95)])]
## form X, Y
### raw Y is used in otsu' method
rawY <- as.matrix(use_lindata[,colnames(HMsig)])
colnames(rawY) <- colnames(HMsig)
Y <- as.matrix(scale(use_lindata[,colnames(HMsig)]))
colnames(Y) <- colnames(HMsig)
X_peakOV <- as.matrix(use_lindata[,(ncol(HMsig)+1):ncol(use_lindata)])
peakX <- peakov[rownames(X_peakOV),1:bedncol]
X_sig <- as.matrix(TFsig[rownames(X_peakOV),colnames(X_peakOV)])
X_sig[X_peakOV == 0] <- 0
X <- X_sig

### elastic net model selection
set.seed(1007)
coTFusage <- matrix(rep(0,ncol(X)*ncol(Y)),nrow=ncol(X))
rownames(coTFusage) <- colnames(X)
colnames(coTFusage) <- colnames(Y)

if(ncol(Y) == 1){
    pdf(file=paste0(outname,"_elnet_lambdaSelection.pdf"),height=6,width=6)
    par(mar=c(4,4,4,2))    
}else if(ncol(Y) == 2){
    pdf(file=paste0(outname,"_elnet_lambdaSelection.pdf"),height=6,width=12)
    par(mfrow=c(1,2),mar=c(4,4,4,2))
}else if(ncol(Y) == 3){
    pdf(file=paste0(outname,"_elnet_lambdaSelection.pdf"),height=12,width=12)
    par(mfrow=c(2,2),mar=c(4,4,4,2))
}else{
    pdf(file=paste0(outname,"_elnet_lambdaSelection.pdf"),height=12,width=12)
    par(mfrow=c(2,2),mar=c(4,4,4,2))
}

for(i in 1:ncol(Y)){
    thisY <- Y[,i]
    this_name <- colnames(Y)[i]
    cv.glmmod <- cv.glmnet(x=X,y=thisY,alpha=Alpha,family="gaussian")
    coeff_this <- coef(cv.glmmod,s=paste0("lambda.",LambdaChoice)) 
    coTF_name <- rownames(coeff_this)[which(coeff_this[,1] < 0 & rownames(coeff_this)!= "(Intercept)")]

    coTFusage[coTF_name,this_name] <- 1

    ### cross-validation curve for each HM
    if (i %in% 1:4){
        plot(cv.glmmod)
        title(this_name,line=2.5)
        if (LambdaChoice == "1se"){
            abline(v=log(cv.glmmod$lambda.1se),col="blue",lwd=2)
            legend("topleft",legend="lambda.1se",lwd=3,bty="n",col='blue')
        }else{
            abline(v=log(cv.glmmod$lambda.min),col="blue",lwd=2)
            legend("topleft",legend="lambda.min",lwd=3,bty="n",col='blue')
        }
    }
}

dev.off()

### generate R2 and coeff table
R2_mat <- matrix(rep(0,ncol(X)*ncol(Y)),nrow=ncol(X))
coeff_mat <- matrix(rep(0,ncol(X)*ncol(Y)),nrow=ncol(X))
rownames(R2_mat) <- colnames(X)
colnames(R2_mat) <- colnames(Y)
rownames(coeff_mat) <- colnames(X)
colnames(coeff_mat) <- colnames(Y)
for(TFname in colnames(X)){
    for(HMname in colnames(Y)){
        unilinResult <- unilinear_only(X[,TFname],Y[,HMname])
        R2_mat[TFname,HMname] <- unilinResult[1]
        coeff_mat[TFname,HMname] <- unilinResult[2]
    }
}

summary_table <- c()
### select candidates
# for each candidates detected in el-net step, require R2>0.1, coeff<0, R2 for any positive correlated HM signal (coeff>0) < 0.1 
for(TFname in colnames(X)){
    this_usage <- coTFusage[TFname,]
    this_R2 <- R2_mat[TFname,]
    this_coeff <- coeff_mat[TFname,]

    classicFun <- 0
    nonClassicHMidx <- c()

    for(i in 1:ncol(coTFusage)){
        if(this_usage[i]==1 & this_R2[i] >= R2cutoff & this_coeff[i] < 0){
            nonClassicHMidx <- c(nonClassicHMidx, i)
        }else if(this_R2[i] >= R2classic & this_coeff[i] > 0 ){
            classicFun <- 1
        }
    }

    if(classicFun == 0){
        for(idx in nonClassicHMidx){
            nonClassicHM <- colnames(Y)[idx]
            R2 <- this_R2[idx]
            coeff <- this_coeff[idx]
            Pval <- unilinear_permute(X[,TFname],Y[,idx])
            summary_table <- rbind(summary_table, c(TFname, nonClassicHM, Pval, R2, coeff))
        }        
    }
}

### summary and output steps
if(is.null(summary_table)){
    print("no significant candidates detected")
    write.table("no non-classical function detected",file=paste0(outname,"_filterNC.txt"),quote=F,sep="\t",row.names=F,col.names=F)
    write.table("no non-classical function detected",file=paste0(outname,'_NCsummary.txt'),quote=F,sep="\t",row.names=F,col.names=F)
}else{
    colnames(summary_table) <- c("TFname","HMname","Pval","R2","coeff")
    if(nrow(summary_table) > 1){
        summary_table <- summary_table[order(as.numeric(summary_table[,"R2"]),decreasing=TRUE),]
    }

    if(topN == "all"){
        topN <- nrow(summary_table)
    }else{
        topN <- as.numeric(topN)
    }
    
    if(topN == 1){
        out_table_raw <- summary_table
    }else{
        out_table_raw <- as.matrix(summary_table[1:topN,])
    }
    
    ## separate NC/C peaks based on HMsignal using otsu's method
    peakgroup <- c()
    for(i in 1:ncol(rawY)){
        thisY <- rawY[,i]
        thisY_NCotsu <- signal2cutoff(thisY)
        thisY_peakgroup <- thisY_NCotsu[[3]][,2]
        peakgroup <- cbind(peakgroup, thisY_peakgroup)
    }
    colnames(peakgroup) <- colnames(Y)

    ## for each predicted coTFvsHM pair, output the cobinding NC sites as a bed file
    if(!file.exists("nonClassicalPeaks/")){dir.create("nonClassicalPeaks")}
    if(topN == 1){
        pdf(file=paste0(outname,"_cofactor_HMsignal.pdf"),height=6,width=6)
        par(mar=c(4,4,2,2))    
    }else if(topN == 2){
        pdf(file=paste0(outname,"_cofactor_HMsignal.pdf"),height=6,width=12)
        par(mfrow=c(1,2),mar=c(4,4,2,2))
    }else{
        pdf(file=paste0(outname,"_cofactor_HMsignal.pdf"),height=12,width=12)
        par(mfrow=c(2,2),mar=c(4,4,2,2))
    }
    num_NCsites <- c()
    for(topnum in 1:nrow(out_table_raw)){
        coTF = out_table_raw[topnum,1]
        substrateHM = out_table_raw[topnum,2]
        coTF_cobinding <- X[,coTF]
        HMnc <- peakgroup[,substrateHM]
        cobinding_NC_peak <- peakX[which(coTF_cobinding>0 & HMnc == 0),]
        write.table(cobinding_NC_peak, file=paste0("nonClassicalPeaks/",outname,"_",coTF,"_",substrateHM,"_top",topnum,"nonclassical_peaks.bed"),quote=F,sep="\t",row.names=F,col.names=F)
        num_NCsites <- c(num_NCsites, nrow(cobinding_NC_peak))
        if(topnum %in% 1:4){
            ## boxplot compare the histone modification signal between on classical and non-classical peaks
            boxplot(Y[which(coTF_cobinding>0 & HMnc == 0)],Y[which(coTF_cobinding==0 | HMnc > 0)],
                names=c("non-classical peak","classical peak"),ylab=paste0(substrateHM," signal"),main=paste0(coTF," & ",substrateHM),
                outline=F,cex.main=1)
            legend("topleft",legend=paste0("#NCpeak = ",nrow(cobinding_NC_peak)),bty="n")
        }
    }
    dev.off()
    
    out_table <- cbind(out_table_raw, num_NCsites)
    write.table(summary_table,file=paste0(outname,"_filterNC.txt"),quote=F,sep="\t",row.names=F,col.names=T)
    write.table(out_table,file=paste0(outname,'_NCsummary.txt'),quote=F,sep="\t",row.names=F,col.names=T)
} 
 
"""
    createDIR("tmpPackage/")
    outf = open("tmpPackage/detectNonClassic.r", 'w')
    outf.write(Rscript)
    outf.close()
    cmd = "Rscript %s %s %s %s %s %s %s %s %s" % (
        "tmpPackage/detectNonClassic.r", conf_dict['General']['outname'],
        ",".join(conf_dict['General']['signalname']),
        conf_dict['options']['Pvalue'], conf_dict['options']['Alpha'],
        conf_dict['options']['Lambda'], conf_dict['options']['TopNcofactors'],
        conf_dict['General']['startdir'] + "tmpPackage/",
        conf_dict['options']['Rcutoff'])
    #rwlog(cmd,logfile)
    os.system('echo "[CMD] %s " >> %s' % (cmd, logfile))
    tmpobj = sp(cmd)

    return conf_dict
예제 #5
0
def step0_check_data(conf_dict, logfile):
    '''
    step0 integrate data
    check and complement parameter

    Validates the input paths stored in conf_dict['General'] (HMRpeak bed
    file, signal bigwig file, peakFolder), collects the usable cofactor
    candidate bed files, writes "<outname>_cofactor_candidate_list.txt",
    and checks / defaults the values in conf_dict['options'] (ext, Pvalue,
    Alpha, Lambda, TopNcofactors).

    Parameters:
        conf_dict: nested dict with at least the 'General' and 'options'
            sections; it is modified in place.
        logfile: log file handed through to wlog / ewlog.

    Returns:
        The same conf_dict, complemented with 'signalname',
        'peakfilenames', 'software' and 'latex' under 'General'.

    NOTE(review): wlog / ewlog / checkbedformat / sp are defined elsewhere
    in this file; ewlog presumably logs the error and aborts -- confirm.
    '''
    general = conf_dict['General']
    options = conf_dict['options']

    ### check data path , format ,
    if "~" in general['HMRpeak']:
        ewlog(
            'require absolute path for HMRpeak bed file, HMRpeak file cannot contain "~", current HMRpeak file is %s'
            % (general['HMRpeak']), logfile)
    if "~" in general['signal']:
        ewlog(
            'require absolute path for HMsignal bigwig file, signal file cannot contain "~", current signal file is %s'
            % (general['signal']), logfile)
    # relative paths are resolved against the start directory
    if not general['HMRpeak'].startswith('/'):
        general['HMRpeak'] = general['startdir'] + general['HMRpeak']
    if not general['signal'].startswith('/'):
        general['signal'] = general['startdir'] + general['signal']

    if not os.path.isfile(general['HMRpeak']):
        ewlog("HMRpeak file %s not found" % (general['HMRpeak']), logfile)
    if not os.path.isfile(general['signal']):
        ewlog("signal bw file %s not found" % (general['signal']), logfile)

    if not general['HMRpeak'].endswith('.bed'):
        ewlog('extenion of HMR peak file is not .bed', logfile)
    checkbed = checkbedformat(general['HMRpeak'], 1000)
    if checkbed == "fail":
        ewlog("HMRpeak file is not a bed file", logfile)
    elif checkbed == "lesspeak":
        # fix: the logfile argument was missing in this call
        ewlog("HMRpeak file contains less than 1000 peaks", logfile)

    # signal name = file name of the bigwig without its extension
    if general['signal'].endswith('.bw'):
        general['signalname'] = general['signal'].split("/")[-1][:-3]
    elif general['signal'].endswith('.bigwig'):
        general['signalname'] = general['signal'].split("/")[-1][:-7]
    else:
        # NOTE(review): 'signalname' is not assigned in this branch;
        # verify that it is set elsewhere before later steps read it.
        wlog('[WARNING] extension of signal bw file is not bw/bigwig', logfile)

    ### check TFpeak folder
    if "~" in general['peakFolder']:
        ewlog(
            'require absolute path for peakFolder, peakFolder cannot contain "~", current peakFolder is %s'
            % (general['peakFolder']), logfile)
    if not general['peakFolder'].startswith('/'):
        general['peakFolder'] = general['startdir'] + general['peakFolder']
    if not general['peakFolder'].endswith('/'):
        general['peakFolder'] += "/"
    if not os.path.isdir(general['peakFolder']):
        ewlog("peakFolder %s not found" % (general['peakFolder']), logfile)

    wlog(
        "Check the peak.bed files in the peakFolder, only '.bed' files with >1000 peaks are included in the following analysis",
        logfile)
    general['peakfilenames'] = []
    for f in os.listdir(general['peakFolder']):
        peak_path = general['peakFolder'] + f
        if f.endswith(".bed") and os.path.isfile(peak_path):
            # fix: validate the candidate file itself, not the HMRpeak file
            if checkbedformat(peak_path, 1000) == "pass":
                general['peakfilenames'].append(f[:-4])
    if (len(general['peakfilenames']) == 0):
        ewlog(
            "no peak file (cofactor candidate) in bed format & >1000peaks are included, exit",
            logfile)
    else:
        wlog(
            "%s peak files (cofactor candidates) are included" %
            (len(general['peakfilenames'])), logfile)

    # one candidate name per line
    with open(general['outname'] + "_cofactor_candidate_list.txt",
              'w') as outf:
        for cofactor in general['peakfilenames']:
            outf.write(cofactor + "\n")

    ### check options
    wlog('check option: ', logfile)

    try:
        ext = int(options['ext'])
    except (TypeError, ValueError):
        wlog(
            "extend length %s is not valid, use default value: 1000bp" %
            (options['ext']), logfile)
        ext = 1000
    else:
        wlog("extend length is %s bp" % (ext), logfile)
    options['ext'] = ext

    try:
        pvalue = float(options['Pvalue'])
    except (TypeError, ValueError):
        wlog(
            "input Pvalue %s is not recognized, use default Pvalue=0.001" %
            (options['Pvalue']), logfile)
        options['Pvalue'] = 0.001
    else:
        wlog("use Pvalue = %s as cutoff" % (str(pvalue)), logfile)
    if float(options['Pvalue']) >= 1:
        wlog(
            "input Pvalue %s is not valid, use default Pvalue=0.001" %
            (options['Pvalue']), logfile)
        options['Pvalue'] = 0.001

    try:
        usealpha = float(options['Alpha'])
    except (TypeError, ValueError):
        wlog(
            "input alpha %s is not valid, use alpha=0.5" %
            (options['Alpha']), logfile)
        # fix: actually apply the default announced in the log message
        options['Alpha'] = 0.5
    else:
        if usealpha >= 1:
            wlog("alpha cannot be >=1, use alpha=0.5", logfile)
            options['Alpha'] = 0.5
        else:
            # fix: the '%' operator was missing, so every valid alpha
            # raised TypeError and was reported as invalid
            wlog("Alpha = %s" % (str(usealpha)), logfile)
            options['Alpha'] = usealpha

    wlog("Lambda choice is %s" % (options['Lambda']), logfile)
    if options['TopNcofactors'] == "all":
        wlog("all significant co-factors will be output", logfile)
    else:
        try:
            topTF = int(options['TopNcofactors'])
        except (TypeError, ValueError):
            wlog(
                "the topN number %s is not valid, output top5 co-factors" %
                (options['TopNcofactors']), logfile)
            options['TopNcofactors'] = 5
        else:
            wlog(
                "the topN number %s will be output" %
                (options['TopNcofactors']), logfile)
            options['TopNcofactors'] = topTF

    # choose the bigWigSummary binary matching the operating system
    OS = platform.system()
    if OS == "Linux":
        bwsum_software = "bigWigSummary_linux"
    elif OS == "Darwin":
        bwsum_software = "bigWigSummary_mac"
    else:
        wlog("detected system is nither linux nor mac, try linux version",
             logfile)
        bwsum_software = "bigWigSummary_linux"

    general['software'] = bwsum_software
    ### check Rscript
    #if not 'Usage' in sperr('Rscript')[1] and not 'version' in sperr('Rscript')[1]:
    #    ewlog('require Rscript',logfile)

    ### check pdflatex
    # an empty first element of sp('pdflatex --help') is treated as
    # "pdflatex not available"
    if sp('pdflatex --help')[0] == "":
        wlog(
            'pdflatex was not installed, HMR is still processing but no summary report generated',
            logfile)
        general['latex'] = 0
    else:
        general['latex'] = 1

    return conf_dict