Exemple #1
0
def step1_generate_matrix(conf_dict, logfile):
    '''
    Step 1: build the two per-peak matrices consumed by the following steps.

    1. Co-binding matrix, written to <outname>_peakov.bed: the HMR peak file
       is piped through one `bedtools intersect -c` call per candidate peak
       file, so each candidate contributes one overlap-count column.
    2. HM signal matrix, written to <outname>_HMsig.bed: for every HMR peak,
       bwsigAve() is called once per signal bigwig file on the window
       [center - ext, center + ext] and the results are appended to the
       original columns of the peak line.

    conf_dict : configuration dictionary; reads General/peakfilenames,
                bedtools, HMRpeak, peakFolder, outname, signalfile, bwsummary
                and options/ext (expected to be an int already).
    logfile   : log file handed to wlog/rlogonly.

    Returns conf_dict unchanged.
    Raises ValueError when General/peakfilenames is empty.
    '''
    general = conf_dict['General']

    ### generate TF overlap matrix using bed tools
    wlog("generate TF overlap matrix", logfile)
    peak_names = general['peakfilenames']
    if not peak_names:
        # Without any candidate there is no command to build; fail with a
        # clear message instead of an unbound-variable error.
        raise ValueError(
            "no peak file (cofactor candidate) available to build the overlap matrix")

    stages = []
    source = general['HMRpeak']
    for name in peak_names:
        stages.append('%s intersect -a %s -b %s -c ' % (
            general['bedtools'], source,
            general['peakFolder'] + name + ".bed"))
        # every stage after the first reads the previous stage from stdin
        source = "-"
    cmd = "| ".join(stages) + '> %s' % (general['outname'] + "_peakov.bed")
    rlogonly(cmd, logfile)

    ### generate HMsig
    wlog("generate HMsignal matrix", logfile)
    ext = conf_dict['options']['ext']
    with open(general['HMRpeak']) as inf, \
            open(general['outname'] + "_HMsig.bed", 'w') as outf:
        for line in inf:
            ll = line.split()
            # floor division keeps the coordinates integral on Python 2 and 3
            center = (int(ll[1]) + int(ll[2])) // 2
            start = max(0, center - ext)
            end = center + ext
            signals = [
                bwsigAve(bwsigfile, ll[0], start, end, general['bwsummary'])
                for bwsigfile in general['signalfile']
            ]
            outf.write("\t".join(map(str, ll + signals)) + "\n")

    return conf_dict
def step0_check_data(conf_dict, logfile):
    '''
    Step 0: check the input data and complete the configuration.

    Validates the HMR peak file, the list of signal bigwig files, the
    candidate peak folder and (in signal mode) the bigwig folder, then
    normalises the numeric options and records the helper programs to use.

    conf_dict : configuration dictionary with 'General' and 'options'
                sections; it is modified in place.
    logfile   : log file handed to wlog/ewlog.

    Keys written in conf_dict['General']: HMRpeak, peakFolder, bwfolder
    (made absolute), signalname, signalfile, mode, peakfilenames,
    bwfilenames (signal mode only), usefilename, bwsummary, bedtools, latex.
    Keys normalised in conf_dict['options']: ext, Pvalue, Alpha,
    TopNcofactors.
    Side effect: writes <outname>_cofactor_candidate_list.txt.

    Returns the updated conf_dict.
    '''
    general = conf_dict['General']
    options = conf_dict['options']

    ### check data path , format ,
    if "~" in general['HMRpeak']:
        ewlog(
            'require absolute path for HMRpeak bed file, HMRpeak file cannot contain "~", current HMRpeak file is %s'
            % (general['HMRpeak']), logfile)
    # 'signal' is a list of paths: look for "~" inside each path rather than
    # for a list element that is exactly "~".
    if any("~" in bwsignalfile for bwsignalfile in general['signal']):
        ewlog(
            'require absolute path for HMsignal bigwig file, signal file cannot contain "~", current signal file(s): %s'
            % (general['signal']), logfile)
    if not general['HMRpeak'].startswith('/'):
        general['HMRpeak'] = general['startdir'] + general['HMRpeak']

    if not os.path.isfile(general['HMRpeak']):
        ewlog("HMRpeak file %s not found" % (general['HMRpeak']), logfile)

    if not general['HMRpeak'].endswith('.bed'):
        ewlog('extenion of HMR peak file is not .bed', logfile)
    checkbed = checkbedformat(general['HMRpeak'], 1000)
    if checkbed == "fail":
        ewlog("HMRpeak file is not a bed file", logfile)
    elif checkbed == "lesspeak":
        ewlog("HMRpeak file contains less than 1000 peaks", logfile)

    ### check signal bigwig files; missing files are skipped with a warning
    general['signalname'] = []
    general['signalfile'] = []
    for bwsignalfile in general['signal']:
        if not bwsignalfile.startswith('/'):
            bwsignalfile = general['startdir'] + bwsignalfile

        if not os.path.isfile(bwsignalfile):
            wlog("signal bw file %s not found, ignored" % (bwsignalfile),
                 logfile)
            continue

        basename = bwsignalfile.split("/")[-1]
        if bwsignalfile.endswith(".bw"):
            signalname = basename[:-3]
        elif bwsignalfile.endswith(".bigwig"):
            signalname = basename[:-7]
        else:
            wlog('[WARNING] extension of signal bw file is not bw/bigwig',
                 logfile)
            signalname = basename
        general['signalfile'].append(bwsignalfile)
        general['signalname'].append(signalname)

    if len(general['signalfile']) == 0:
        ewlog("no signal bw file valid, exit", logfile)
    elif len(general['signalfile']) > 4:
        ewlog(
            "maximum signal bw file is limited to 4. There were %s signal file inputed, exit"
            % (len(general['signalfile'])), logfile)

    ### check TFpeak folder
    if "~" in general['peakFolder']:
        ewlog(
            'require absolute path for peak/track Folder, Folder cannot contain "~", current Folder is %s'
            % (general['peakFolder']), logfile)
    if not general['peakFolder'].startswith('/'):
        general['peakFolder'] = general['startdir'] + general['peakFolder']
    if not general['peakFolder'].endswith('/'):
        general['peakFolder'] += "/"
    if not os.path.isdir(general['peakFolder']):
        ewlog("Folder %s not found" % (general['peakFolder']), logfile)

    ### signal mode needs a usable bwfolder, otherwise fall back to binary
    if general['mode'] == "signal":
        wlog("signal mode is activated", logfile)
        if general['bwfolder']:
            wlog("bwFolder is specified, checking data for signal mode",
                 logfile)
            if "~" in general['bwfolder']:
                wlog(
                    'require absolute path for bwFolder, bwFolder cannot contain "~", current Folder is %s, use peak mode'
                    % (general['bwfolder']), logfile)
                general['mode'] = "binary"
            else:
                if not general['bwfolder'].startswith('/'):
                    general['bwfolder'] = general['startdir'] + general[
                        'bwfolder']
                if not general['bwfolder'].endswith('/'):
                    general['bwfolder'] += "/"
                if not os.path.isdir(general['bwfolder']):
                    # report the folder that is actually missing
                    wlog(
                        "bwFolder %s not found, use binary mode" %
                        (general['bwfolder']), logfile)
                    general['mode'] = "binary"
        else:
            wlog("bwfolder is not specified, use binary mode", logfile)
            general['mode'] = "binary"
    else:
        wlog("binary mode is activaed", logfile)

    wlog(
        "Check the peak.bed files in the Folder, only '.bed' files with >1000 peaks are included in the following analysis",
        logfile)
    general['peakfilenames'] = []
    for f in os.listdir(general['peakFolder']):
        candidate = general['peakFolder'] + f
        if f.endswith(".bed") and os.path.isfile(candidate):
            # validate the candidate itself, not the HMR peak file again
            if checkbedformat(candidate, 1000) == "pass":
                general['peakfilenames'].append(f[:-4])

    if len(general['peakfilenames']) == 0:
        ewlog(
            "no peak file (cofactor candidate) in (bed format & >1000peaks) are included, exit",
            logfile)

    if general['mode'] == "signal":
        general['bwfilenames'] = []
        for f in os.listdir(general['bwfolder']):
            if f.endswith(".bw") and os.path.isfile(general['bwfolder'] + f):
                general['bwfilenames'].append(f[:-3])
        ### compare the name from bwfiles and peak files
        general['usefilename'] = [
            name for name in general['bwfilenames']
            if name in general['peakfilenames']
        ]
        ### if less than 50% peakfiles share name with bwfiles, change back to peak mode
        if len(general['usefilename']) < len(general['peakfilenames']) * 0.5:
            general['mode'] = "binary"
            wlog(
                "the number of shared peak&bw files is less than half of the number of peakfiles, use binary mode",
                logfile)
        else:
            wlog("all checks for signal mode passed, use signal mode", logfile)

    if general['mode'] == "binary":
        general['usefilename'] = general['peakfilenames']

    wlog(
        "%s cofactor candidates are included" % (len(general['usefilename'])),
        logfile)

    with open(general['outname'] + "_cofactor_candidate_list.txt",
              'w') as outf:
        for cofactor in general['usefilename']:
            outf.write(cofactor + "\n")

    ### check options
    wlog('check option: ', logfile)
    try:
        ext = int(options['ext'])
    except (ValueError, TypeError):
        wlog(
            "extend length %s is not valid, use default value: 1000bp" %
            (options['ext']), logfile)
        options['ext'] = 1000
    else:
        wlog("extend length for HMsignal is %s bp" % (ext), logfile)
        options['ext'] = ext

    try:
        wlog(
            "use Pvalue = %s as cutoff" % (str(float(options['Pvalue']))),
            logfile)
    except (ValueError, TypeError):
        wlog(
            "input Pvalue %s is not recognized, use default Pvalue=0.001" %
            (options['Pvalue']), logfile)
        options['Pvalue'] = 0.001
    if float(options['Pvalue']) >= 1:
        wlog(
            "input Pvalue %s is not valid, use default Pvalue=0.001" %
            (options['Pvalue']), logfile)
        options['Pvalue'] = 0.001

    try:
        usealpha = float(options['Alpha'])
    except (ValueError, TypeError):
        wlog(
            "input alpha (for elastic-net) %s is not valid, use alpha=0.5" %
            (options['Alpha']), logfile)
        # actually apply the default announced in the message
        options['Alpha'] = 0.5
    else:
        if usealpha >= 1:
            wlog("alpha (for elastic-net) cannot be >=1, use alpha=0.5",
                 logfile)
            options['Alpha'] = 0.5
        else:
            wlog("Alpha (for elastic-net) = %s" % (str(usealpha)), logfile)
            options['Alpha'] = usealpha

    wlog("Lambda choice is %s" % (options['Lambda']), logfile)
    if options['TopNcofactors'] == "all":
        wlog("all significant co-factors will be output", logfile)
    else:
        try:
            topTF = int(options['TopNcofactors'])
        except (ValueError, TypeError):
            wlog(
                "the topN number %s is not valid, output top5 co-factors" %
                (options['TopNcofactors']), logfile)
            options['TopNcofactors'] = 5
        else:
            wlog(
                "the topN number %s will be output" %
                (options['TopNcofactors']), logfile)
            options['TopNcofactors'] = topTF

    ### pick the bigWigSummary binary shipped for this platform
    OS = platform.system()
    if OS == "Linux":
        bwsum_software = "bigWigSummary_linux"
    elif OS == "Darwin":
        bwsum_software = "bigWigSummary_mac"
    else:
        wlog(
            "detected system is nither linux nor mac, try linux version of bigWigSummary",
            logfile)
        bwsum_software = "bigWigSummary_linux"

    package_dir = HMRpipe.__path__[0]
    general['bwsummary'] = package_dir + "/%s" % bwsum_software
    # prefer a bedtools binary found in the package folder, else rely on PATH
    if os.path.isfile(package_dir + "/bedtools"):
        general['bedtools'] = package_dir + "/bedtools"
    else:
        general['bedtools'] = "bedtools"

    ### check pdflatex
    if sp('pdflatex --help')[0] == "":
        wlog(
            'pdflatex was not installed, ncHMR_detector is still processing but no summary report generated',
            logfile)
        general['latex'] = 0
    else:
        general['latex'] = 1

    return conf_dict
def step3_summary(conf_dict, logfile):
    '''
    Step 3: collect the result files and write the summary report.

    - moves the NC summary table and the plots into summary/ and the
      intermediate files into tmpResults/ (shell `mv` through sp());
    - builds a LaTeX document describing the parameters, the elastic-net
      lambda selection, the detected co-factors (rows read from
      summary/<outname>_NCsummary.txt) and the list of output files, and
      writes it to summary/<outname>_summary.tex;
    - when conf_dict['General']['latex'] == 1, runs pdflatex twice inside
      summary/ (the second pass lets \\tableofcontents pick up the entries),
      copies the PDF one level up and removes the .aux/.log/.toc files.

    conf_dict : configuration dictionary; reads General/outname, HMRpeak,
                mode, signalname, peakfilenames, latex and options/ext,
                Alpha, Pvalue, TopNcofactors.
    logfile   : log file handed to wlog.

    Returns conf_dict unchanged. The working directory on return is the same
    as on entry, whether or not pdflatex was run.
    '''
    general = conf_dict['General']
    options = conf_dict['options']
    outname = general['outname']

    wlog('collect results', logfile)
    summarydir = 'summary/'
    createDIR(summarydir)
    sp("mv %s_NCsummary.txt %s" % (outname, summarydir))
    sp("mv %s_elnet_lambdaSelection.pdf %s" % (outname, summarydir))
    if os.path.isfile("%s_cofactor_HMsignal.pdf" % outname):
        sp("mv %s_cofactor_HMsignal.pdf %s" % (outname, summarydir))

    tmpresult = 'tmpResults/'
    createDIR(tmpresult)
    for suffix in ("_HMsig.bed", "_peakov.bed",
                   "_cofactor_candidate_list.txt", "_filterNC.txt"):
        sp("mv %s%s %s" % (outname, suffix, tmpresult))

    wlog('generate summary documents', logfile)
    # The templates are raw strings so that LaTeX backslashes are written
    # literally; in a normal literal "\usepackage" is a syntax error on
    # Python 3 because "\u" starts a unicode escape.
    ### initiate
    QCdoc = r"""\documentclass[11pt,a4paper]{article}
\usepackage{tabularx}
\usepackage[english]{babel}
\usepackage{array}
\usepackage{graphicx}
\usepackage{color}
\DeclareGraphicsExtensions{.eps,.png,.pdf,.ps}
\begin{document}
\title{Summary reports of non-classical function detection of : %s}

\vspace{-1cm}
\maketitle
\tableofcontents
\newpage
\newpage
\section{Data description}
\begin{quotation}
Table 1 mainly describes the input files, parameters and options.
\end{quotation}
\begin{table}[h]
\small
\caption{parameter description}\label{bstable}
\begin{tabularx}{\textwidth}{ |X|l| }

""" % (strlatexformat(outname))
    ### table1 prepare parameter
    NcoTF = len(general['peakfilenames'])
    QCdoc += r"""
\hline
parameter & value  \\
\hline
output name & %s \\
\hline
HMRpeak(peak filename) & %s \\
\hline
mode & %s \\
\hline
HM signal(bw filename) & \begin{tabular}[c]{@{}l@{}}%s\end{tabular}  \\
\hline
\#cofactor candidates & %s \\
\hline
options & value \\
\hline
extend size & %sbp \\
\hline
Alpha (Elastic net) & %s \\
\hline
Pvalue cutoff & %s \\
\hline
topN cofactors & %s \\
\hline
""" % (strlatexformat(outname),
       strlatexformat(general['HMRpeak'].split("/")[-1]),
       general['mode'],
       strlatexformat("\\\\ ".join(general['signalname'])),
       str(NcoTF), str(options['ext']),
       str(options['Alpha']), str(options['Pvalue']),
       str(options['TopNcofactors']))
    QCdoc += r"""
\end{tabularx}
\end{table}
"""
    ### cross validation in elastic net
    QCdoc += r"""
\newpage
\newpage
\section{ElasticNet co-factor selection}
In this step we use a feature selection (elastic-net. Zou, H. and Hastie T. (2005) to select potential co-factors which corresponded to the non-classical function. Below shows the cross-validation curve for the decison of lambda in elastic-net for each histone modification substrate.
\begin{figure}[h]
        \caption{cross-validation curve for lambda decision} \label{fig:profileunion}
        \setlength{\abovecaptionskip}{0pt}
        \setlength{\belowcaptionskip}{10pt}
        \centering
        {\includegraphics[width=0.8\textwidth]{%s}}
\end{figure}
""" % (outname + "_elnet_lambdaSelection.pdf")

    ### co-factor section; the context manager closes the summary table on
    ### both branches
    with open(summarydir + outname + "_NCsummary.txt") as inf_ncsummary:
        line = inf_ncsummary.readline()
        if line.startswith("no non-classical function detected"):
            QCdoc += r"""
\newpage
\newpage
\section{potential co-factors corresponded to non-classical function}
No significant co-factor was detected, indicating that the non-classical function of the HMR was not exist or none of the existing factor candidates act as a co-factor of the non-classical function.
"""
        else:
            # both counts are line counts minus one
            n_predicted = int(
                sp("wc -l tmpResults/%s_filterNC.txt" %
                   (outname))[0].split()[0]) - 1
            n_listed = int(
                sp("wc -l summary/%s_NCsummary.txt" %
                   (outname))[0].split()[0]) - 1
            QCdoc += r"""
\newpage
\newpage
\section{potential co-factors corresponded to non-classical function}
In summary, %s factors were predicted to potentially act as a co-factor of the non-classical function. The top%s co-factors were listed.
\subsection{summary of co-factors}
\begin{quotation}
The corresponded histone modification substrate (HMsubstrate), empirical P-value, R-square (ordered) and the number of non-classical (NC) sites for each potential co-factor were listed below. The empirical P-value was calculated based on the comparison of foreground (observed) R-square and background R-square (distribution of random R-square generated from the 1,000 permutations of co-binding events) for each potential co-factor. The non-classical (NC) sites were defined by lower HMsubstrate signal (using Otus' method) and co-binding events of each potential co-factor.
\end{quotation}
\begin{table}[h]
\small
\caption{cofactor summary}\label{bstable}
\begin{tabular}{ |l|l|l|l|l| }

\hline
co-factor & HMsubstrate & Pval & Rsquare & NCsites \\
""" % (n_predicted, n_listed)

            # the first line was consumed by readline() above; the remaining
            # lines are the table rows
            for line in inf_ncsummary:
                if line.startswith("TFname"):
                    continue
                ll = line.split()
                QCdoc += r"""\hline
%s & %s & %s & %s & %s \\
""" % (strlatexformat(ll[0]), strlatexformat(ll[1]), ll[2],
       round(float(ll[3]), 3), ll[5])

            QCdoc += r"""
\hline
\end{tabular}
\end{table}
\newpage
\newpage
\subsection{Boxplot of HM on non-classical and classic sites}
\begin{quotation}
Boxplot was generated to compare the difference of the histone mark (HM) signal on either non-classical or classic sites(peak). The non-classical sites were defined by lower HM signal (using Otus' method) and co-binding events of each potential co-factor. The boxplot corresponded to top co-factors were displayed.
\end{quotation}
\begin{figure}[h]
        \caption{boxplot cofactor HMsignal} \label{fig:profileunion}
        \setlength{\abovecaptionskip}{0pt}
        \setlength{\belowcaptionskip}{10pt}
        \centering
        {\includegraphics[width=0.8\textwidth]{%s}}
\end{figure}
""" % (outname + "_cofactor_HMsignal.pdf")

    QCdoc += r"""
\newpage
\newpage
\section{Output list}
\begin{quotation}
All the main output files were described in the following table
\end{quotation}
\begin{table}[h]
\small
\caption{output list}\label{bstable}
\begin{tabular}{ |l|l| }

\hline
description & filename \\
\hline
summary table of non-classical (NC) function & summary/%s \\
\hline
summary report (this doc) & summary/%s \\
\hline
cobinding matrix on HMR peaks & tmpResults/%s \\
\hline
histone mark signal on HMR peaks & tmpResults/%s \\
\hline

\end{tabular}
\end{table}
\end{document}

""" % (strlatexformat(outname + "_NCsummary.txt"),
       strlatexformat(outname + "_summary.pdf"),
       strlatexformat(outname + "_peakov.bed"),
       strlatexformat(outname + "_HMsig.bed"))

    latexfile = outname + '_summary.tex'
    with open(summarydir + latexfile, 'w') as outf:
        outf.write(QCdoc)

    if general['latex'] == 1:
        wlog(
            'pdflatex was detected in default PATH, generate summary report %s'
            % (outname + '_summary.pdf'), logfile)
        # Only change directory when pdflatex is actually run, and always
        # come back to where we started.
        startdir = os.getcwd()
        os.chdir(summarydir)
        try:
            compile_cmd = "pdflatex %s" % (latexfile)
            sp(compile_cmd)
            sp(compile_cmd)
            sp('cp %s ../' % (outname + '_summary.pdf'))
            for aux_ext in ("aux", "log", "toc"):
                sp("rm %s_summary.%s" % (outname, aux_ext))
        finally:
            os.chdir(startdir)
    else:
        wlog(
            'pdflatex was not detected in default PATH, generate summary report .tex file in summary/ folder, you can move the whole summary/ folder to the environment with pdflatex installed and run cmd in the summary/ folder: "pdflatex %s"'
            % (outname + '_summary.tex'), logfile)

    wlog('Step3 summary DONE, check %s for final outputs' % (summarydir),
         logfile)

    return conf_dict
Exemple #4
0
def step0_check_data(conf_dict, logfile):
    '''
    Step 0: check the input data and complete the configuration
    (single signal file version).

    Validates the HMR peak file, the signal bigwig file and the candidate
    peak folder, then normalises the numeric options and records which
    bigWigSummary binary matches the platform.

    conf_dict : configuration dictionary with 'General' and 'options'
                sections; it is modified in place.
    logfile   : log file handed to wlog/ewlog.

    Keys written in conf_dict['General']: HMRpeak, signal, peakFolder (made
    absolute), signalname, peakfilenames, software, latex.
    Keys normalised in conf_dict['options']: ext, Pvalue, Alpha,
    TopNcofactors.
    Side effect: writes <outname>_cofactor_candidate_list.txt.

    Returns the updated conf_dict.
    '''
    general = conf_dict['General']
    options = conf_dict['options']

    ### check data path , format ,
    if "~" in general['HMRpeak']:
        ewlog(
            'require absolute path for HMRpeak bed file, HMRpeak file cannot contain "~", current HMRpeak file is %s'
            % (general['HMRpeak']), logfile)
    if "~" in general['signal']:
        ewlog(
            'require absolute path for HMsignal bigwig file, signal file cannot contain "~", current signal file is %s'
            % (general['signal']), logfile)
    if not general['HMRpeak'].startswith('/'):
        general['HMRpeak'] = general['startdir'] + general['HMRpeak']
    if not general['signal'].startswith('/'):
        general['signal'] = general['startdir'] + general['signal']

    if not os.path.isfile(general['HMRpeak']):
        ewlog("HMRpeak file %s not found" % (general['HMRpeak']), logfile)
    if not os.path.isfile(general['signal']):
        ewlog("signal bw file %s not found" % (general['signal']), logfile)

    if not general['HMRpeak'].endswith('.bed'):
        ewlog('extenion of HMR peak file is not .bed', logfile)
    checkbed = checkbedformat(general['HMRpeak'], 1000)
    if checkbed == "fail":
        ewlog("HMRpeak file is not a bed file", logfile)
    elif checkbed == "lesspeak":
        ewlog("HMRpeak file contains less than 1000 peaks", logfile)

    signal_basename = general['signal'].split("/")[-1]
    if general['signal'].endswith('.bw'):
        general['signalname'] = signal_basename[:-3]
    elif general['signal'].endswith('.bigwig'):
        general['signalname'] = signal_basename[:-7]
    else:
        wlog('[WARNING] extension of signal bw file is not bw/bigwig', logfile)
        # still provide a name so that the key always exists afterwards
        general['signalname'] = signal_basename

    ### check TFpeak folder
    if "~" in general['peakFolder']:
        ewlog(
            'require absolute path for peakFolder, peakFolder cannot contain "~", current peakFolder is %s'
            % (general['peakFolder']), logfile)
    if not general['peakFolder'].startswith('/'):
        general['peakFolder'] = general['startdir'] + general['peakFolder']
    if not general['peakFolder'].endswith('/'):
        general['peakFolder'] += "/"
    if not os.path.isdir(general['peakFolder']):
        ewlog("peakFolder %s not found" % (general['peakFolder']), logfile)

    wlog(
        "Check the peak.bed files in the peakFolder, only '.bed' files with >1000 peaks are included in the following analysis",
        logfile)
    general['peakfilenames'] = []
    for f in os.listdir(general['peakFolder']):
        candidate = general['peakFolder'] + f
        if f.endswith(".bed") and os.path.isfile(candidate):
            # validate the candidate itself, not the HMR peak file again
            if checkbedformat(candidate, 1000) == "pass":
                general['peakfilenames'].append(f[:-4])
    if len(general['peakfilenames']) == 0:
        ewlog(
            "no peak file (cofactor candidate) in bed format & >1000peaks are included, exit",
            logfile)
    else:
        wlog(
            "%s peak files (cofactor candidates) are included" %
            (len(general['peakfilenames'])), logfile)

    with open(general['outname'] + "_cofactor_candidate_list.txt",
              'w') as outf:
        for cofactor in general['peakfilenames']:
            outf.write(cofactor + "\n")

    ### check options
    wlog('check option: ', logfile)

    try:
        ext = int(options['ext'])
    except (ValueError, TypeError):
        wlog(
            "extend length %s is not valid, use default value: 1000bp" %
            (options['ext']), logfile)
        options['ext'] = 1000
    else:
        wlog("extend length is %s bp" % (ext), logfile)
        options['ext'] = ext

    try:
        wlog(
            "use Pvalue = %s as cutoff" % (str(float(options['Pvalue']))),
            logfile)
    except (ValueError, TypeError):
        wlog(
            "input Pvalue %s is not recognized, use default Pvalue=0.001" %
            (options['Pvalue']), logfile)
        options['Pvalue'] = 0.001
    if float(options['Pvalue']) >= 1:
        wlog(
            "input Pvalue %s is not valid, use default Pvalue=0.001" %
            (options['Pvalue']), logfile)
        options['Pvalue'] = 0.001

    try:
        usealpha = float(options['Alpha'])
    except (ValueError, TypeError):
        wlog(
            "input alpha %s is not valid, use alpha=0.5" % (options['Alpha']),
            logfile)
        # actually apply the default announced in the message
        options['Alpha'] = 0.5
    else:
        if usealpha >= 1:
            wlog("alpha cannot be >=1, use alpha=0.5", logfile)
            options['Alpha'] = 0.5
        else:
            wlog("Alpha = %s" % (str(usealpha)), logfile)
            options['Alpha'] = usealpha

    wlog("Lambda choice is %s" % (options['Lambda']), logfile)
    if options['TopNcofactors'] == "all":
        wlog("all significant co-factors will be output", logfile)
    else:
        try:
            topTF = int(options['TopNcofactors'])
        except (ValueError, TypeError):
            wlog(
                "the topN number %s is not valid, output top5 co-factors" %
                (options['TopNcofactors']), logfile)
            options['TopNcofactors'] = 5
        else:
            wlog(
                "the topN number %s will be output" %
                (options['TopNcofactors']), logfile)
            options['TopNcofactors'] = topTF

    ### pick the bigWigSummary binary name for this platform
    OS = platform.system()
    if OS == "Linux":
        bwsum_software = "bigWigSummary_linux"
    elif OS == "Darwin":
        bwsum_software = "bigWigSummary_mac"
    else:
        wlog("detected system is nither linux nor mac, try linux version",
             logfile)
        bwsum_software = "bigWigSummary_linux"

    general['software'] = bwsum_software

    ### check pdflatex
    if sp('pdflatex --help')[0] == "":
        wlog(
            'pdflatex was not installed, HMR is still processing but no summary report generated',
            logfile)
        general['latex'] = 0
    else:
        general['latex'] = 1

    return conf_dict