コード例 #1
0
def step4_analysis(conf_dict, logfile):
    '''
    Step 4: run the R analysis scripts on the selected STAMP barcodes.

    Registers the output paths (cluster result, QC figures 10-14 and the
    PC / correlation / feature tables) in conf_dict, all located in the
    "analysis/" sub-directory of the configured output directory, then
    runs analysis.r followed by post_analysis.r through rwlog.

    conf_dict : nested configuration dict, updated in place
    logfile   : log target handed to wlog / rwlog
    returns   : the same conf_dict

    Note: the working directory of the process is changed to the
    analysis directory and is not restored.
    '''
    started = time.time()
    wlog('Step4: analysis', logfile)
    wlog(
        'dimentional reduction + clustering with own script, based on selected STAMP barcodes',
        logfile)

    # Everything produced by this step lives in <outputdirectory>/analysis/
    analysisdir = conf_dict['General']['outputdirectory'] + 'analysis/'
    createDIR(analysisdir)
    os.chdir(analysisdir)

    # (config section, key, file name suffix) -- kept in registration order
    # because later consumers iterate over these sections.
    registered_outputs = (
        ('Step4_Analysis', 'clusterresult', '_cluster.txt'),
        ('QCplots', 'gapstat', '_Figure10_GapStat.pdf'),
        ('QCplots', 'cluster', '_Figure11_cluster.pdf'),
        ('QCplots', 'silhouette', '_Figure12_silhouetteScore.pdf'),
        ('QCplots', 'umicolor', '_Figure13_totalUMIcolored.pdf'),
        ('QCplots', 'itrcolor', '_Figure14_intronRate_colored.pdf'),
        ('results', 'pctable', '_pctable.txt'),
        ('results', 'cortable', '_correlation_table.txt'),
        ('results', 'features', '_features_clustercell.txt'),
    )
    for section, key, suffix in registered_outputs:
        conf_dict[section][key] = (
            analysisdir + conf_dict['General']['outname'] + suffix)

    general = conf_dict['General']
    params = conf_dict['Step4_Analysis']

    # Main analysis script: all options are passed as positional arguments.
    analysis_args = (
        'Rscript',
        conf_dict['rscript'] + 'analysis.r',
        conf_dict['results']['expmatcc'],
        general['outname'],
        params['highvarz'],
        params['selectpccumvar'],
        params['rdnumber'],
        params['maxknum'],
        params['pctable'],
        params['cortable'],
        params['clustering_method'],
        params['custom_k'],
        params['custom_d'],
    )
    analysis_template = ' '.join(['%s'] * len(analysis_args))
    rwlog(analysis_template % analysis_args, logfile)

    # Follow-up script working on the cluster result and the QC matrix.
    post_args = (
        'Rscript',
        conf_dict['rscript'] + 'post_analysis.r',
        params['clusterresult'],
        conf_dict['Step2_ExpMat']['qcmatcc'],
        general['outname'],
    )
    post_template = ' '.join(['%s'] * len(post_args))
    rwlog(post_template % post_args, logfile)

    analysisqctime = time.time() - started
    wlog("time for analysis qc: %s" % analysisqctime, logfile)
    wlog("Step4 analysis QC DONE", logfile)

    return conf_dict
コード例 #2
0
def step1_generate_matrix(conf_dict, logfile):
    '''
    Run the alignment step (Step1) and build the expression / QC
    matrices (Step2).

    Step1: unless the reads file is already in sam format, align it with
    STAR or bowtie2 (selected in the config).  The sam file is then
    converted to bed by sample_down_transform_sam, which is also given a
    sample size of 5000000 and the q30filter flag.
    Step2: generate the annotation files, intersect the aligned reads
    with them using bedtools, attach the barcode information and call
    generate_matrix to write the expression and QC matrices.

    conf_dict : nested configuration dict; paths of generated files are
                stored in it and conf_dict['results'] is reset to {}
    logfile   : log target handed to wlog / rwlog / ewlog
    returns   : the same conf_dict

    Note: the working directory of the process is changed several times
    and is not restored.
    '''
    wlog("Step1: alignment", logfile)
    step_start = time.time()

    general = conf_dict['General']

    # --- mapping directory -------------------------------------------------
    mapping_dir = general['outputdirectory'] + 'mapping/'
    createDIR(mapping_dir)

    # --- alignment (skipped when the input is already a sam file) ----------
    if general['format'] != 'sam':
        wlog(
            'Now start mapping in %s , all mapping result will be here' %
            mapping_dir, logfile)
        os.chdir(mapping_dir)
        mapping_conf = conf_dict['Step1_Mapping']
        aligner = mapping_conf['mapping_software_main']

        if aligner == "STAR":
            wlog('user choose STAR as alignment software', logfile)
            if sp('which STAR')[0].strip() == "":
                ewlog(
                    'STAR is not detected in default PATH, make sure you installed STAR and export it into default PATH',
                    logfile)
            star_cmd = 'STAR --genomeDir %s --readFilesIn %s --runThreadN %s' % (
                mapping_conf['mapindex'],
                general['reads_file'],
                mapping_conf['mapping_p'])
            # STAR writes Aligned.out.sam; rename it to <outname>.sam
            rename_cmd = 'mv Aligned.out.sam %s.sam' % general['outname']
            rwlog(star_cmd, logfile)
            rwlog(rename_cmd, logfile)

        elif aligner == "bowtie2":
            wlog('user choose bowtie2 as alignment software', logfile)
            if sp('which bowtie2')[0].strip() == "":
                ewlog(
                    'bowtie2 is not detected in default PATH, make sure you installed bowtie2 and export it into default PATH',
                    logfile)
            bowtie_cmd = 'bowtie2 -p %s -x %s -U %s -S %s.sam   2>&1 >>/dev/null |tee -a %s.bowtieout' % (
                mapping_conf['mapping_p'],
                mapping_conf['mapindex'],
                general['reads_file'],
                general['outname'],
                general['outname'])
            rwlog(bowtie_cmd, logfile)

        else:
            ewlog("alignment tools can only be STAR and bowtie2", logfile)

        general['sam'] = mapping_dir + general['outname'] + '.sam'
    else:
        wlog('reads file format is sam, skip mapping step', logfile)
        general['sam'] = general['reads_file']

    # --- sam -> bed conversion ----------------------------------------------
    wlog("transfer sam file to aligned bed file with own script", logfile)
    outname = general['outname']
    mapped_prefix = mapping_dir + outname
    general['bed'] = mapped_prefix + '.bed'
    general['sampledownsam'] = mapped_prefix + '_sampledown.sam'
    general['sampledownbed'] = mapped_prefix + '_sampledown.bed'

    q30_flag = int(conf_dict['Step1_Mapping']['q30filter'])
    if q30_flag == 1:
        q30_message = "q30 filter is turned on"
    else:
        q30_message = "q30 filter is turned off"
    wlog(q30_message, logfile)

    # The conversion and the down-sampling are delegated to an in-house
    # helper (earlier revisions of this code used samtools/awk one-liners
    # here instead).
    sample_down_transform_sam(general['sam'],
                              general['bed'],
                              general['sampledownsam'],
                              general['sampledownbed'],
                              5000000,
                              q30_flag)

    # A missing or empty bed file means the alignment / filtering failed.
    bed_usable = (os.path.isfile(general['bed'])
                  and os.path.getsize(general['bed']) != 0)
    if not bed_usable:
        ewlog(
            'Alignment step / q30 filtering step failed, check your alignment parameter and samfile',
            logfile)

    s1time = time.time() - step_start
    wlog("time for alignment: %s" % s1time, logfile)
    wlog("Step1: alignment DONE", logfile)

    # --- annotation files ---------------------------------------------------
    step_start = time.time()
    wlog("Step2: transform expression matrix", logfile)
    wlog('generate related annotation file with own script', logfile)
    annotation_dir = general['outputdirectory'] + 'annotation/'
    createDIR(annotation_dir)
    os.chdir(annotation_dir)
    transform_refgene(general['gene_annotation'],
                      conf_dict['Step2_ExpMat']['ttsdistance'],
                      outname)
    expmat_conf = conf_dict['Step2_ExpMat']

    # --- expression matrix directory ----------------------------------------
    wlog(
        'generate expression matrix and individual cell qc matrix with own script',
        logfile)
    expdir = general['outputdirectory'] + 'expmatrix/'
    createDIR(expdir)
    os.chdir(expdir)

    # --- annotate reads with bedtools intersect, sorted by read name --------
    wlog('add gene annotation on aligned bed file', logfile)
    anno_prefix = annotation_dir + outname
    # Output files are relative to the current directory (expdir).
    on_symbol_bed = outname + '_on_symbol.bed'
    count_regions = ('cds', '3utr', '5utr', 'TTSdis')
    on_region_bed = dict(
        (region, outname + '_on_' + region + '.bed')
        for region in count_regions)

    intersect_cmds = [
        "bedtools intersect -a %s -b %s  -wo   | sort -k 4,4 - >  %s" % (
            general['bed'],
            anno_prefix + '_gene_anno_symbol.bed',
            on_symbol_bed)
    ]
    for region in count_regions:
        intersect_cmds.append(
            "bedtools intersect -a %s -b %s -c | sort -k 4,4 - > %s" % (
                general['bed'],
                anno_prefix + '_gene_anno_' + region + '.bed',
                on_region_bed[region]))
    for intersect_cmd in intersect_cmds:
        rwlog(intersect_cmd, logfile)

    # --- barcode file -> 3 column text [name, cell_barcode, umi] ------------
    if general['format1'] != 'txt':
        wlog('reform barcode files with own script', logfile)
        general['barcode_reform'] = expdir + outname + '_barcode_reform.txt'
        reform_barcode_fastq(general['barcode_file'],
                             general['barcode_reform'],
                             general['cell_barcode_length'],
                             general['umi_length'])
    else:
        wlog('barcode files is reformed txt format, skip reform step', logfile)
        general['barcode_reform'] = general['barcode_file']

    # Sort the barcode table on its first column; from here on
    # 'barcode_reform' refers to the sorted file.
    sorted_barcodes = expdir + outname + '_barcode_reform_sort.txt'
    rwlog('sort -k 1,1 %s > %s' % (general['barcode_reform'], sorted_barcodes),
          logfile)
    general['barcode_reform'] = sorted_barcodes

    # --- merge annotation, reads and barcodes -------------------------------
    wlog('combine annotation and barcode on reads with own script', logfile)
    combined_bed = outname + '_combined.bed'
    combine_reads(general['barcode_reform'],
                  on_region_bed['cds'],
                  on_region_bed['3utr'],
                  on_region_bed['5utr'],
                  on_symbol_bed,
                  on_region_bed['TTSdis'],
                  combined_bed,
                  expmat_conf['duplicate_measure'])

    # Sort the combined file (columns 7 then 5) ahead of duplicate detection.
    combined_sorted_bed = outname + '_combined_sort.bed'
    rwlog("sort -k 7,7 -k 5,5 %s > %s" % (combined_bed, combined_sorted_bed),
          logfile)

    # --- expression and QC matrices -----------------------------------------
    wlog('generate expression matrix and QC matrix with own script', logfile)
    # Per the original notes: qcmatfull holds every cell barcode, whereas
    # qcmat / expmat only hold barcodes passing covergncutoff.
    matrix_prefix = expdir + outname
    expmat_conf['qcmatfull'] = matrix_prefix + "_qcmatfull.txt"
    expmat_conf['qcmat'] = matrix_prefix + "_qcmat.txt"
    expmat_conf['expmat'] = matrix_prefix + "_expmat.txt"

    generate_matrix(general['gene_annotation'],
                    combined_sorted_bed,
                    expmat_conf['filterttsdistance'],
                    expmat_conf['qcmatfull'],
                    expmat_conf['qcmat'],
                    expmat_conf['expmat'],
                    expmat_conf['covergncutoff'],
                    expmat_conf['umidis1'])

    wlog("Step2 transform expression matrix DONE", logfile)
    s2time = time.time() - step_start
    wlog("time for transform expmat: %s" % s2time, logfile)

    # Start with an empty results section; it is filled by later steps.
    conf_dict['results'] = {}

    return conf_dict
コード例 #3
0
ファイル: step5_summary.py プロジェクト: pythseq/MAESTRO
def step5_summary(conf_dict,logfile):
    '''
    analysis part
    mainly Rscript
    dimentional reduction + clustering
    '''
    # start
    # create section for 
    
    wlog('Step5: summary',logfile)
    wlog('copy results',logfile)
# Rscript analysis.r expmat outname coverGN highvarZ selectPCcutoff rdnumber maxKnum
    summarydir = conf_dict['General']['outputdirectory'] + 'summary/'
    createDIR(summarydir)
    os.chdir(summarydir)
    
    plot_folder = summarydir + "plots/"
    createDIR(plot_folder)
    os.chdir(plot_folder)
    ### collect results 
    for i in conf_dict['QCplots']:
        if os.path.isfile(conf_dict['QCplots'][i]):
            #realname
            cmd = 'cp %s .'%conf_dict['QCplots'][i]
            rwlog(cmd,logfile)

    result_folder = summarydir + "results/"
    createDIR(result_folder)
    os.chdir(result_folder)
    for i in conf_dict['results']:
        if os.path.isfile(conf_dict['results'][i]):
            cmd = 'cp %s .'%conf_dict['results'][i]
            rwlog(cmd,logfile)

    os.chdir(summarydir)

    wlog('generate qc documents',logfile)
    ### initiate 
    QCdoc = """\\documentclass[11pt,a4paper]{article}
\\usepackage{tabularx}
\\usepackage[english]{babel}
\\usepackage{array}
\\usepackage{graphicx}
\\usepackage{color}
\\DeclareGraphicsExtensions{.eps,.png,.pdf,.ps}
\\begin{document}
\\title{QC and analysis reports for Drop-seq data : %s}

\\vspace{-1cm}
\\maketitle
\\tableofcontents
\\newpage
\\newpage
\\section{Data description}
\\begin{quotation}
Table 1 mainly describe the input file and mapping and analysis parameters.
\\end{quotation}
\\begin{table}[h]
\\caption{Data description}\\label{bstable}
\\begin{tabularx}{\\textwidth}{ |X|l| }

"""%(strlatexformat(conf_dict['General']['outname']))
    ### table1 prepare parameter
    if int(conf_dict['Step1_Mapping']['q30filter']) == 1:
        q30filter = "True"
    else:
        q30filter = "False"
    if int(conf_dict['Step2_ExpMat']['filterttsdistance']) == 1:
        filtertts = "True"
    else: 
        filtertts = "False"
    if int(conf_dict['Step2_ExpMat']['umidis1']) == 1:
        umidis1 = "True"
    else:
        umidis1 = "False"
    if int(conf_dict['Step3_QC']['remove_low_dup_cell']) == 1:
        rmnodup = "True"
    else:
        rmnodup = "False"
          
    QCdoc += """      
\\hline
parameter & value  \\\\
\\hline
output name & %s \\\\
\\hline
barcode file(file name only) & %s \\\\
\\hline
reads file(file name only) & %s \\\\
\\hline
reads file format & %s  \\\\
\\hline
cell barcode length &  %s \\\\
\\hline
UMI length & %s \\\\
\\hline
mapping software & %s \\\\
\\hline
Q30 filter mapped reads & %s \\\\
\\hline
remove reads away TTS & %s \\\\
\\hline
"""%(strlatexformat(conf_dict['General']['outname']),
     strlatexformat(conf_dict['General']['barcode_file'].split("/")[-1]),
     strlatexformat(conf_dict['General']['reads_file'].split("/")[-1]),
     conf_dict['General']['format'].upper(),
     str(conf_dict['General']['cell_barcode_length']),
     str(conf_dict['General']['umi_length']),
     conf_dict['Step1_Mapping']['mapping_software_main'],
     q30filter,
     filtertts
     )
    ### table1 part2
    if  filtertts == "True":
        QCdoc += """TTS distance (for remove) & %s bp \\\\
\\hline
"""%(str(conf_dict['Step2_ExpMat']['ttsdistance'])) 
    if  int(conf_dict['Step2_ExpMat']['duplicate_measure']) == 1:
        QCdoc += """duplicate rate in each cell & UMI $+$ location \\\\"""
    elif int(conf_dict['Step2_ExpMat']['duplicate_measure']) == 2:
        QCdoc += """duplicate rate in each cell & UMI only \\\\"""
    elif int(conf_dict['Step2_ExpMat']['duplicate_measure']) == 3:
        QCdoc += """duplicate rate in each cell & location only \\\\"""
    else:
        QCdoc += """duplicate rate in each cell & keep all reads \\\\"""
    if int(conf_dict['Step2_ExpMat']['duplicate_measure']) in [1,2]:
        QCdoc += """
\\hline
merge UMI ED = 1 & %s \\\\ 
\\hline"""%(umidis1)
    if  int(conf_dict['Step3_QC']['select_cell_measure']) == 1:
        QCdoc += """
select STAMPs & %s covered gene \\\\
\\hline"""%(str(conf_dict['Step3_QC']['covergncluster']))
    elif int(conf_dict['Step3_QC']['select_cell_measure']) == 2:
        QCdoc += """
select STAMPs & top %s UMI count \\\\
\\hline"""%(str(conf_dict['Step3_QC']['topumicellnumber']))
    QCdoc += """
remove low duplicate rate cell & %s \\\\ 
\\hline """%(rmnodup)
    if  rmnodup == "True":
        QCdoc += """
low duplicate rate cutoff & %s  \\\\
\\hline"""%(str(conf_dict['Step3_QC']['non_dup_cutoff']))
    QCdoc += """
z-score for highly variable gene & %s \\\\ 
\\hline 
cumulative variance for selecting PC & %s \\\\
\\hline """%(str(conf_dict['Step4_Analysis']['highvarz']),
     str(100*float(conf_dict['Step4_Analysis']['selectpccumvar']))+'\\%')
 
    if  int(conf_dict['Step4_Analysis']['clustering_method']) == 1:
        QCdoc += """
cluster method & k-means (Gap statistics, first stable) \\\\"""
    elif int(conf_dict['Step4_Analysis']['clustering_method']) == 2:
        QCdoc += """
cluster method & k-means (Gap statistics, maxSE) \\\\"""
    elif int(conf_dict['Step4_Analysis']['clustering_method']) == 3:
        QCdoc += """
cluster method & k-means (custom, k=%s) \\\\"""%(conf_dict['Step4_Analysis']['custom_k'])
    else:
        QCdoc += """
cluster method & DBScan (eps=%s) \\\\"""%(conf_dict['Step4_Analysis']['custom_d'])
    QCdoc += """
\\hline
\\end{tabularx}
\\end{table}
"""
    ### bulk QC
    QCdoc += """
\\newpage
\\newpage
\\section{Reads level QC}
In the reads level QC step we measured the quality of sequencing reads, including nucleotide quality and composition. In the reads level QC step and Bulk-cell level QC step we randomly sampled down total reads to 5 million and used a published package called ``RseQC" for reference.(Wang, L., Wang, S. and Li, W. (2012) )
\\newpage
\\newpage
\\subsection{Reads quality}
\\begin{quotation}
Reads quality is one of the basic reads level quality control methods. We plotted the distribution of a widely used Phred Quality Score at every position of sequence to measure the basic sequence quality of your data. Phred Quality Score was calculate by a python function $ord(Q) - 33$. Color in the heatmap represented frequency of this quality score observed at this position. Red represented higher frequency while blue was lower frequency. You may observe a decreasing of quality near the 3'end of sequence because of general degradation of quality over the duration of long runs. If the decreasing of quality influence the mappability (see ``Bulk-cell level QC") then the common remedy is to perform quality trimming where reads are truncated based on their average quality or you can trim serveal base pair near 3'end directly. If it doesn't help, you may consider your Drop-seq data poor quality. 
\\end{quotation}
\\begin{figure}[h]
        \\caption{Reads quality} \\label{fig:profileunion}
        \\setlength{\\abovecaptionskip}{0pt}
        \\setlength{\\belowcaptionskip}{10pt}
        \\centering
        {\includegraphics[width=0.8\\textwidth]{%s}}
\\end{figure}

\\newpage
\\newpage
\\subsection{Reads nucleotide composition}
\\begin{quotation}
We assess the nucleotide composition bias of a sample. The proportion of four different nucleotides was calculated at each position of reads. Theoretically four nucleotides had similar proportion at each position of reads. You may observe higher A/T count at 3'end of reads because of the 3'end polyA tail generated in sequencing cDNA libaray, otherwise the A/T count should be closer to C/G count. In any case, you should observe a stable pattern at least in the 3'end of reads. Spikes (un-stable pattern) which occur in the middle or tail of the reads indicate low sequence quality. You can trim serveral un-stable bases from the 3'end if low mappability (see ``Bulk-cell level QC") is also observed. If it doesn't help, you may consider your Drop-seq data poor quality. Note that t
he A/T vs G/C content can greatly vary from species to species. 
\\end{quotation}
\\begin{figure}[h]
        \\caption{Reads nucleotide composition} \\label{fig:profileunion}
        \\setlength{\\abovecaptionskip}{0pt}
        \\setlength{\\belowcaptionskip}{10pt}
        \\centering
        {\includegraphics[width=0.8\\textwidth]{%s}}
\\end{figure}

\\newpage
\\newpage
\\subsection{Reads GC content}
\\begin{quotation}
Distribution of GC content of each read. This module measures the general quality of the library. If the distribution looks different from a single bell (too sharp or too broad) then there may be a problem with the library. Sharp peaks on an otherwise smooth distribution are normally the result of a specific contaminant (adapter dimers for example), which may well be picked up by the overrepresented sequences module. Broader peaks may represent contamination with a different species. If you observe sharp peak or broder peak and also observe low mappability (see ``Bulk-cell level QC"), you may consider your Drop-seq data poor quality.
\\end{quotation}
\\begin{figure}[h]
        \\caption{Reads GC content} \\label{fig:profileunion}
        \\setlength{\\abovecaptionskip}{0pt}
        \\setlength{\\belowcaptionskip}{10pt}
        \\centering
        {\includegraphics[width=0.8\\textwidth]{%s}}
\\end{figure}
"""%((conf_dict['QCplots']['read_qul'].split("/")[-1]),
     (conf_dict['QCplots']['read_nvc'].split("/")[-1]),
     (conf_dict['QCplots']['read_gc'].split("/")[-1])
    )

    QCdoc += """
\\newpage
\\newpage
\\section{Bulk-cell level QC}
In the bulk-cell level QC step we measured the performance of total Drop-seq reads. In this step we did't separate cell or remove ``empty" cell barcodes, just like treated the sample as bulk RNA-seq sample.
\\newpage
\\newpage
\\subsection{Reads alignment summary}
\\begin{quotation}
The following table shows mappability and distribution of total Drop-seq reads. It measures the general quality of data as a RNA-seq sample. Low mappability indicates poor sequence quality(see ``Reads level QC") or library quality(caused by contaminant). High duplicate rate (low total UMI percentage observed, e.g. $<$ 10\\%%) indicate insufficient RNA material and Overamplification. In summary, if the percentage of ``total UMI count" is less than 5\\%%, users may consider reconstruct your library(redo the experiment), but first you should make sure you already trim the adapter and map your reads to the corresponded species(genome version). Note that UMI number was calculated by removing duplicate reads (which have identical genomic location, cell barcode and UMI sequences). Mappable reads was after Q30 filtering if Q30 filter function was turned on.\\\\
** the percentage was calculated by dividing total reads number \\\\
*** the percentage was calculated by divding total UMI number
\\end{quotation}
\\begin{table}[h]
\\caption{Reads alignment summary}\\label{bstable}
\\begin{tabularx}{\\textwidth}{ |X|X| }
    
\\hline
genomic region(Category) &  reads number \\\\
\\hline
total reads & %s \\\\
\\hline
mappble reads &  %s (%s\\%%)* \\\\
\\hline
total UMI count & %s (%s\\%%)* \\\\
\\hline
CDS exon UMI count & %s (%s\\%%)** \\\\
\\hline
3'UTR UMI count & %s (%s\\%%)** \\\\
\\hline
5'UTR UMI count & %s (%s\\%%)** \\\\
\\hline
intron UMI count & %s (%s\\%%)** \\\\
\\hline
intergenic UMI count & %s (%s\\%%)** \\\\
\\hline

\\end{tabularx}
\\end{table}
"""%(textformat(str(conf_dict['Mapping_stat']['totalreads'])),
     textformat(str(conf_dict['Mapping_stat']['q30reads'])),
     str( round(100*conf_dict['Mapping_stat']['q30reads']*1.0/conf_dict['Mapping_stat']['totalreads'], 2)),
     textformat(str(conf_dict['Mapping_stat']['umi_gene'])),
     str( round(100*conf_dict['Mapping_stat']['umi_gene']*1.0/conf_dict['Mapping_stat']['totalreads'], 2)),
     textformat(str(conf_dict['Mapping_stat']['cdsN'])),
     str( round(100*conf_dict['Mapping_stat']['cdsN']*1.0/conf_dict['Mapping_stat']['umi_gene'], 2)),
     textformat(str(conf_dict['Mapping_stat']['utr3N'])),
     str( round(100*conf_dict['Mapping_stat']['utr3N']*1.0/conf_dict['Mapping_stat']['umi_gene'], 2)),
     textformat(str(conf_dict['Mapping_stat']['utr5N'])),
     str( round(100*conf_dict['Mapping_stat']['utr5N']*1.0/conf_dict['Mapping_stat']['umi_gene'], 2)),
     textformat(str(conf_dict['Mapping_stat']['intronN'])),
     str( round(100*conf_dict['Mapping_stat']['intronN']*1.0/conf_dict['Mapping_stat']['umi_gene'], 2)),
     textformat(str(conf_dict['Mapping_stat']['intergenicN'])),
     str( round(100*conf_dict['Mapping_stat']['intergenicN']*1.0/conf_dict['Mapping_stat']['umi_gene'], 2)))
     ### genebody coverage
    QCdoc += """
\\newpage
\\newpage
\\subsection{Gene body coverage}
\\begin{quotation}
Aggregate plot of reads coverage on all genes. This module measures the general quality of the Drop-seq data. Theoretically we observe a unimodal (single bell) distribution, but for Drop-seq sample an enrichment at 3'end is observed due to library preparation using oligo-dT primers. In any case you should observe a smooth distritbuion. If loss of reads or spike are observed in certain part of gene body (e.g. middle or 3'end of gene body), poor quality of your library was indicated. Especially when low mappability and high intron rate are also observed (see ``Reads alignment summary" section).
\\end{quotation}
\\begin{figure}[h]
        \\caption{Gene body coverage} \\label{fig:profileunion}
        \\setlength{\\abovecaptionskip}{0pt}
        \\setlength{\\belowcaptionskip}{10pt}
        \\centering
        {\includegraphics[width=0.8\\textwidth]{%s}}
\\end{figure}
"""%((conf_dict['QCplots']['gb_cover'].split("/")[-1]))

    QCdoc += """

\\newpage
\\newpage
\\section{Individual-cell level QC}
In this step we focused on the quality of individual cell and distinguishing cell barcodes from STAMPs (single-cell transcriptomes attached to microparticles)
\\newpage
\\newpage
\\subsection{Reads duplicate rate distribution}
\\begin{quotation}
Drop-seq technology has an innate advantage of detecting duplicate reads and amplification bias due to the barcode and UMI information. This module displays the distribution of duplicate rate in each cell barcode and helps to discard barcodes with low duplicate rate (which usually caused by empty cell barcodes and ambient RNA). We plot the distribution of duplicate rate in each cell barcode (though most of cell barcodes don't contain cells, they still have RNA) and observed a bimodal distribution of duplicate rate. We set an option for you to discard cell barcodes with low duplicate rate in following steps. The vertical line represented the cutoff (duplicate rate $>=$ 0.1) of discarding cell barcodes with low duplicate rate. You can adjust the cutoff and rerun Dr.seq if current cutoff didn't separate two peaks from the distribution clearly (usually happened with insufficient sequencing depth). If the distribution didn't show clear bimodal or you don't want to discard cell barcodes according to duplicate rate, you can set cutoff to 0 to keep all cell barcodes for following steps. 
\\end{quotation}
\\begin{figure}[h]
        \\caption{Reads dupliate rate distribution} \\label{fig:profileunion}
        \\setlength{\\abovecaptionskip}{0pt}
        \\setlength{\\belowcaptionskip}{10pt}
        \\centering
        {\includegraphics[width=0.8\\textwidth]{%s}}
\\end{figure}
"""%(conf_dict['QCplots']['duprate'].split("/")[-1])
    if int(conf_dict['Step3_QC']['select_cell_measure']) == 1:
        QCdoc += """
\\newpage
\\newpage
\\subsection{Reads duplicate rate vs. cumulative covered gene number}
\\begin{quotation}
Reads duplicate rate versus cumulative covered gene numbers. This module measures whether each of your individual cell was sequenced and clearly separated from empty cell barcodes. Cell barcodes are ranked by the number of covered genes. The duplicate rate (y-axis, left side) is plotted as a function of ranked cell barcode. Red curve represents the number of genes covered by top N cell barcodes (y-axis, right side). N is displayed by x-axis. Theoretically you observe a ``knee" on your cumulative curve (slope $=$ 1 on the curve) and the cutoff of your selected STAMPs (dash line) should be close to the ``knee". The cutoff can also be far away from the ``knee" in some cases because you input too many cells and have insufficient average sequencing depth, then you should adjust your cutoff (to the position you get enough STAMPs and sufficient reads count) and rerun Dr.seq. See the description of the paramter ``select cell measure" in the Manual.
\\end{quotation}
\\begin{figure}[h]
        \\caption{Reads duplicate rate vs. cumulative covered gene number} \\label{fig:profileunion}
        \\setlength{\\abovecaptionskip}{0pt}
        \\setlength{\\belowcaptionskip}{10pt}
        \\centering
        {\includegraphics[width=0.8\\textwidth]{%s}}
\\end{figure}

\\newpage
\\newpage
\\subsection{UMI vs. covered gene number}
\\begin{quotation}
Covered gene number is plotted as a function of the number of UMI (i.e. unique read). This module measures the quality of Drop-seq experiment and helps to distinguish STAMPs from empty cell barcodes. We observe a clearly different pattern for two groups of cell barcodes with different reads duplicate rate (blue dots versus red and purple dots). Purple dots represented the selected STAMPs for the cell-clustering step. By default we select STAMPs with 1000 gene covered after discarding low duplicate cell barcodes. You may get few STAMPs according to this cutoff if the average sequencing depth of your cells was too low or too many cells were inputed. In this case you can adjust your cutoff or tell Dr.seq to directly select cell barcodes with highest reads count (see the description of the parameter ``select cell measure"). Note that we use only STAMPs selected in this step for following analysis. The other cell barcodes are discarded. 
\\end{quotation}
\\begin{figure}[h]
        \\caption{UMI v.s. covered gene number} \\label{fig:profileunion}
        \\setlength{\\abovecaptionskip}{0pt}
        \\setlength{\\belowcaptionskip}{10pt}
        \\centering
        {\includegraphics[width=0.8\\textwidth]{%s}}
\\end{figure}
"""%(conf_dict['QCplots']['cumumiduprate'].split("/")[-1],
     conf_dict['QCplots']['umicovergn'].split("/")[-1])
    else:
        QCdoc += """
\\newpage
\\newpage
\\subsection{Reads duplicate rate vs. cumulative covered gene number}
\\begin{quotation}
Reads duplicate rate versus cumulative covered gene numbers. This module measures whether each of your individual cell was sequenced and clearly separated from empty cell barcodes. Cell barcodes are ranked by the number of UMI count. The duplicate rate (y-axis, left side) is plotted as a function of ranked cell barcode. Red curve represents the number of genes covered by top N cell barcodes (y-axis, right side). N is displayed by x-axis. Theoretically you observe a ``knee" on your cumulative curve (slope $=$ 1 on the curve) and the cutoff of your selected STAMPs (dash line) should be close to the ``knee". The cutoff can also be far away from the ``knee" in some cases because you input too many cells and have insufficient average sequencing depth, then you should adjust your cutoff (to the position you get enough STAMPs and sufficient reads count) and rerun Dr.seq. See the description of the paramter ``select cell measure" in the Manual.
\\end{quotation}
\\begin{figure}[h]
        \\caption{Reads duplicate rate vs. cumulative covered gene number} \\label{fig:profileunion}
        \\setlength{\\abovecaptionskip}{0pt}
        \\setlength{\\belowcaptionskip}{10pt}
        \\centering
        {\includegraphics[width=0.8\\textwidth]{%s}}
\\end{figure}

\\newpage
\\newpage
\\subsection{UMI vs. covered gene number}
\\begin{quotation}
Covered gene number is plotted as a function of the number of UMI (i.e. unique read). This module measures the quality of Drop-seq experiment and helps to distinguish STAMPs from empty cell barcodes. We observe a clearly different pattern for two groups of cell barcodes with different reads duplicate rate (blue dots versus red and purple dots). Purple dots represented the selected STAMPs for the cell-clustering step. We select 1000 STAMPs with highest UMI count after discarding low duplicate cell barcodes. You may get few STAMPs according to this cutoff if the average sequencing depth of your cells was too low or too many cells were inputed. Note that we use only STAMPs selected in this step for following analysis. The other cell barcodes are discarded. 
\\end{quotation}
\\begin{figure}[h]
        \\caption{UMI v.s. covered gene number} \\label{fig:profileunion}
        \\setlength{\\abovecaptionskip}{0pt}
        \\setlength{\\belowcaptionskip}{10pt}
        \\centering
        {\includegraphics[width=0.8\\textwidth]{%s}}
\\end{figure}
"""%(conf_dict['QCplots']['cumumiduprate'].split("/")[-1],
     conf_dict['QCplots']['umicovergn'].split("/")[-1])
     
    QCdoc += """
\\newpage
\\newpage
\\subsection{Covered gene number distribution}
\\begin{quotation}
Histogram of covered gene number of selected STAMPs. The module measures whether the selected STAMPs have sufficient reads coverage. By default Dr.seq selects cell barcodes with $>=$ 1000 genes covered as STAMPs. If you choose to select STAMPs with highest reads count (``select cell measure" $=$ 2), then you should check this figure to make sure the STAMPs you select have enough gene covered. If most of your STAMPs have low covered gene number (e.g. $<$ 100 gene covered), you can make your cutoff more stringent (e.g. select less cell barcodes with higher reads count) to make sure you get reliable STAMPs.
\\end{quotation}
\\begin{figure}[h]
        \\caption{Covered gene number} \\label{fig:profileunion}
        \\setlength{\\abovecaptionskip}{0pt}
        \\setlength{\\belowcaptionskip}{10pt}
        \\centering
        {\includegraphics[width=0.8\\textwidth]{%s}}
\\end{figure}

\\newpage
\\newpage
\\subsection{Intron rate distribution}
\\begin{quotation}
Intron rate is a effective method to measure the quality of a RNA-seq sample. We plot a histogram of intron rate of every STAMP barcodes to check whether reads from each STAMPs enriched in the exon region. High intron rate (e.g. $>=$ 30\\%%) indicates low quality of RNA in each STAMPs (caused by different problem, for example contaminant). You may consider your Drop-seq data low quality if most of selected STAMPs have high intron rate and low covered gene number (see ``Covered gene number distribution" section). Intron rate is defined as $\\frac{intron\\ reads\\ number}{intron + exon\\ reads\\ number}$ 
\\end{quotation}
\\begin{figure}[h]
        \\caption{Intron rate distribution} \\label{fig:profileunion}
        \\setlength{\\abovecaptionskip}{0pt}
        \\setlength{\\belowcaptionskip}{10pt}
        \\centering
        {\includegraphics[width=0.8\\textwidth]{%s}}
\\end{figure}
 
"""%(conf_dict['QCplots']['covergn'].split("/")[-1],
     conf_dict['QCplots']['intronrate'].split("/")[-1])
    
    if int(conf_dict['Step4_Analysis']['clustering_method']) in [3,4]:
        pass
    else:
        if int(conf_dict['Step4_Analysis']['clustering_method']) in [1,2]:
            selectM = 'first stable gap'
        else:
            selectM = 'maxSE'
        QCdoc += """
\\newpage
\\newpage
\\section{Cell-clustering level QC}
This step composed by k-means clustering based on t-SNE dimentional reduction result and Gap statistics to determine best k.
\\newpage
\\newpage
\\subsection{Gap statistics}
\\begin{quotation}
We conducted a k-means clustering based on t-SNE dimensional reduction output to measure sample's ability to be separated to different cell subtypes. Gap statistics was performed to determine the best k in k-means clustering. In general, decreasing pattern (usually k $<=$ 2) is observed for pure cell type or cell line data, while increasing pattern with bigger k should be observed for mix cell types (or cell subtypes) data. If the cluster number predicted from the Gap statistics is largely different to what you expect, it indicated that your cells are not well characterized and separated by the Drop-seq experiment (due to the contaminant or the low capture efficiency of Droplets). In this case, you may consider your Drop-seq data poor quality. Alternatively, you may would like to use the parameter ``custom k" to specify the cluster number.
\\end{quotation}
\\begin{figure}[h]
        \\caption{Gap statistics} \\label{fig:profileunion}
        \\setlength{\\abovecaptionskip}{0pt}
        \\setlength{\\belowcaptionskip}{10pt}
        \\centering
        {\includegraphics[width=0.8\\textwidth]{%s}}
\\end{figure}

"""%(conf_dict['QCplots']['gapstat'].split("/")[-1])
    
    QCdoc += """
\\newpage
\\newpage
\\subsection{Clustering plot}
\\begin{quotation}
Scatter plot represented visualization of t-SNE dimensional reduction output of selected STAMP barcodes. STAMP barcodes are colored according to the clustering result and cluster numbers are printed in the center of each cluster. This figure is mainly for visualization and help you to know how your Drop-seq data look like. If you want to combine some small groups which are close to each other, you can use the cluster matrix (named ``cluster.txt") in the Dr.seq standard analysis output to conduct your own analysis.   
\\end{quotation}
\\begin{figure}[h]
        \\caption{Clustering plot} \\label{fig:profileunion}
        \\setlength{\\abovecaptionskip}{0pt}
        \\setlength{\\belowcaptionskip}{10pt}
        \\centering
        {\includegraphics[width=0.8\\textwidth]{%s}}
\\end{figure}
 
"""%(conf_dict['QCplots']['cluster'].split("/")[-1])
    if os.path.isfile(conf_dict['QCplots']['silhouette']):
        QCdoc += """
\\newpage
\\newpage
\\subsection{Silhouette of clustering}
\\begin{quotation}
Silhouette method is used to interprate and validate the consistency within clusters defined in previous steps. A poor Silhouette (e.g. average si $<$ 0.2 ) score indicate that Drop-seq experiments(if not properly done) may not separate well the subpopulations of cells. If most of your clusters have poor Silhouette score, it may indicate a poor quality of your Drop-seq experiments. 
\\end{quotation}
\\begin{figure}[h]
        \\caption{Silhouette score for clustered STAMPs} \\label{fig:profileunion}
        \\setlength{\\abovecaptionskip}{0pt}
        \\setlength{\\belowcaptionskip}{10pt}
        \\centering
        {\includegraphics[width=0.8\\textwidth]{%s}}
\\end{figure}
 
"""%(conf_dict['QCplots']['silhouette'].split("/")[-1])
    
#    QCdoc += """
#\\newpage
#\\newpage
#\subsection{STAMPs colored by total UMI count}
#\\begin{quotation}
#STAMPs was by the total number of UMI based on t-SNE visualization. 
#\end{quotation}
#\\begin{figure}[h]
#        \caption{STAMPs colored by total UMI count} \label{fig:profileunion}
#        \setlength{\\abovecaptionskip}{0pt}
#        \setlength{\\belowcaptionskip}{10pt}
#        \centering
#        {\includegraphics[width=0.8\\textwidth]{%s}}
#\end{figure}
 
#"""%(conf_dict['QCplots']['umicolor'].split("/")[-1])
   
#    QCdoc += """
#\\newpage
#\\newpage
#\subsection{STAMPs colored by intron rate}
#\\begin{quotation}
#STAMPs was by the intron rate based on t-SNE visualization. 
#\end{quotation}
#\\begin{figure}[h]
#        \caption{STAMPs colored by intron rate} \label{fig:profileunion}
#        \setlength{\\abovecaptionskip}{0pt}
#        \setlength{\\belowcaptionskip}{10pt}
#        \centering
#        {\includegraphics[width=0.8\\textwidth]{%s}}
#\end{figure}
# 
#"""%(conf_dict['QCplots']['itrcolor'].split("/")[-1])
      
    QCdoc += """
\\newpage
\\newpage
\\section{Output list}
\\begin{quotation}
All output files were described in the following table
\\end{quotation}
\\begin{table}[h]
\\caption{output list}\\label{bstable}
\\begin{tabularx}{\\textwidth}{ |X|l| }
    
\\hline
description & filename \\\\
\\hline
expression matrix for selected STAMPs & %s  \\\\
"""%(strlatexformat(conf_dict['results']['expmatcc'].split("/")[-1]))
    if int(conf_dict['Step4_Analysis']['pctable']) == 1:
        QCdoc += """
\\hline
top2 components of PCA dimentional reduction result & %s \\\\         
"""%(strlatexformat(conf_dict['results']['pctable'].split("/")[-1]))
    if int(conf_dict['Step4_Analysis']['cortable']) == 1:
        QCdoc += """
\\hline
pairwise correlation matrix & %s \\\\
"""%(strlatexformat(conf_dict['results']['cortable'].split("/")[-1]))
    QCdoc += """
\\hline
All features of selected STAMPs & %s \\\\
\\hline
summary QC report & %s \\\\
\\hline

\\end{tabularx}
\\end{table} 
\\end{document} 
"""%(strlatexformat(conf_dict['results']['features'].split("/")[-1]),strlatexformat(conf_dict['General']['outname'])+"\\_summary.pdf")

    os.chdir(plot_folder)

    latexfile = conf_dict['General']['outname'] + '_summary.tex'
    outf = open(latexfile,'w')
    outf.write(QCdoc)
    outf.close()
    cmd = "pdflatex %s"%(latexfile)
    cmd2 = 'cp %s %s'%(conf_dict['General']['outname'] + '_summary.pdf',summarydir)
    if conf_dict['General']['latex'] == 1:
        rwlog(cmd,logfile)
        rwlog(cmd,logfile)
        rwlog(cmd2,logfile)
        for files in os.listdir(plot_folder):
            if os.path.isfile(files) and files[-12:-4] == "_summary":
                if not files[-4:] in ['.tex','.pdf',',png','.txt']:
                    cmd = "rm %s"%(files)
                    rwlog(cmd,logfile)
        wlog('pdflatex was detected in default PATH, generate summary report %s'%('summary/'+conf_dict['General']['outname'] + '_summary.pdf'),logfile)
    else:
        wlog('pdflatex was not detected in default PATH, generate summary report .tex file in summary/plots folder, you can move the whole summary/plots/ folder to the environment with pdflatex installed and run cmd in the plots/ folder: "pdflatex %s"'%(conf_dict['General']['outname'] + '_summary.tex'),logfile)
   
        
    if conf_dict['clean']:
        wlog('--clean pararmeter was turned on, remove internal files with large size',logfile)
        rwlog("rm %s "%(conf_dict['General']['outputdirectory'] + 'expmatrix/' + conf_dict['General']['outname']+'_on_symbol.bed'),logfile)
        rwlog("rm %s "%(conf_dict['General']['outputdirectory'] + 'expmatrix/' + conf_dict['General']['outname']+'_on_cds.bed'),logfile)
        rwlog("rm %s "%(conf_dict['General']['outputdirectory'] + 'expmatrix/' + conf_dict['General']['outname']+'_on_3utr.bed'),logfile)
        rwlog("rm %s "%(conf_dict['General']['outputdirectory'] + 'expmatrix/' + conf_dict['General']['outname']+'_on_5utr.bed'),logfile)
        rwlog("rm %s "%(conf_dict['General']['outputdirectory'] + 'expmatrix/' + conf_dict['General']['outname']+'_on_TTSdis.bed'),logfile)
        rwlog("rm %s "%(conf_dict['General']['outputdirectory'] + 'expmatrix/' + conf_dict['General']['outname']+'_combined.bed'),logfile)
        rwlog("rm %s "%(conf_dict['General']['outputdirectory'] + 'expmatrix/' + conf_dict['General']['outname']+'_barcode_reform.txt'),logfile)

    wlog('Step5 summary DONE, check %s for final outputs'%(summarydir),logfile)


    return conf_dict
コード例 #4
0
def step0_integrate_data(conf_dict, logfile):
    '''
    step0 integrate data
    check and complement parameter

    Validates the user supplied configuration and fills in derived settings
    before any heavy processing starts.

    Parameters:
        conf_dict : nested dict of configuration sections; this function reads
                    'General', 'Step1_Mapping', 'Step2_ExpMat', 'Step3_QC' and
                    'Step4_Analysis'.
        logfile   : handed unchanged to wlog / ewlog.

    Returns:
        The same conf_dict object, modified in place:
          - General/barcode_file and General/reads_file get General/startdir
            prepended when they do not start with "/"
          - General/format1 is set to 'fastq' or 'txt' (barcode file type)
          - General/format is set to 'fastq' or 'sam' (reads file type)
          - General/cell_barcode_length and General/umi_length become int
          - General/latex is set to 1 or 0 depending on the pdflatex probe

    NOTE(review): every failed check is reported through ewlog, which is
    defined elsewhere; the code below presumably relies on it aborting the
    run -- confirm against its definition.
    '''
    wlog("Start Drseq", logfile)
    wlog("Step0: Data integrate", logfile)
    general = conf_dict['General']

    ### check output name
    if "/" in general['outname']:
        # The message is a single literal: splitting it around the "/" made
        # Python divide two strings and raise TypeError instead of reporting.
        ewlog(
            'outname is the name of all your output result, cannot contain "/", current outname is  %s'
            % (general['outname']), logfile)

    ### check data path , format ,
    if "~" in general['barcode_file']:
        ewlog(
            'require absolute path for barcode file, barcode file cannot contain "~", current barcode file is %s'
            % (general['barcode_file']), logfile)
    if "~" in general['reads_file']:
        ewlog(
            'require absolute path for reads file, reads file cannot contain "~", current reads file is %s'
            % (general['reads_file']), logfile)
    # relative input paths are resolved against the directory Dr.seq was started from
    if not general['barcode_file'].startswith('/'):
        general['barcode_file'] = general['startdir'] + general['barcode_file']
    if not general['reads_file'].startswith('/'):
        general['reads_file'] = general['startdir'] + general['reads_file']

    if not os.path.isfile(general['barcode_file']):
        ewlog("barcode file %s not found" % (general['barcode_file']),
              logfile)
    if not os.path.isfile(general['reads_file']):
        ewlog("reads file %s not found" % (general['reads_file']), logfile)

    # barcode file: raw fastq or an already reformed txt table
    if general['barcode_file'].endswith('.fastq'):
        general['format1'] = 'fastq'
    elif general['barcode_file'].endswith('.txt'):
        wlog('barcode file is reformed txt file', logfile)
        general['format1'] = 'txt'
    else:
        ewlog(
            "barcode file is not a fastq file: %s" %
            (general['barcode_file']), logfile)

    # reads file: fastq (to be mapped) or sam (already mapped)
    if general['reads_file'].endswith(('.fastq', '.fq')):
        general['format'] = 'fastq'
        wlog('Detected input file format is fastq', logfile)
    elif general['reads_file'].endswith('.sam'):
        general['format'] = 'sam'
        wlog('Detected input file format is sam', logfile)
    else:
        ewlog(
            "reads file is not a fastq or sam file: %s" %
            (general['reads_file']), logfile)

    ### check barcode length
    try:
        general['cell_barcode_length'] = int(general['cell_barcode_length'])
        general['umi_length'] = int(general['umi_length'])
    except (ValueError, TypeError):
        ewlog('barcode length should be int', logfile)

    ### check gene annotation file
    if general['gene_annotation'] == "":
        ewlog("gene annotation file cannot be empty", logfile)
    if not "/" in general['gene_annotation']:
        ewlog("absolute path for gene annotation file required", logfile)
    if not os.path.isfile(general['gene_annotation']):
        ewlog(
            "cannot find gene annotation file : %s" %
            (general['gene_annotation']), logfile)

    ### mapping index (only needed when the reads still have to be aligned)
    if general['format'] == 'fastq':
        mapping = conf_dict['Step1_Mapping']
        if mapping['mapping_software_main'] == "STAR":
            wlog('use STAR as alignment tools', logfile)
            if int(mapping['checkmem']) == 1:
                wlog('memory check is turned on, check total memory', logfile)
                totalMemory = detect_memory()
                if totalMemory == "NA":
                    ewlog(
                        '''cannot detect total memory (because your server don't have /proc/meminfo file or you are running Dr.seq on Mac computer), Dr.seq exit to protect your server from crash down. You can turn off the memory check and run Dr.seq again if you do want to use STAR as mapping software or you can use bowtie2 instead.''',
                        logfile)
                elif totalMemory < 40:
                    ewlog(
                        '''Total memory of your server/computer is %sG, less than 40G (memory cutoff for STAR), Dr.seq exit to protect your server from crash down. You can turn off the memory check and run Dr.seq again if you do want to use STAR as mapping software or you can use bowtie2 instead '''
                        % (str(totalMemory)), logfile)
                else:
                    wlog(
                        '''Total memory of your  server/computer is %sG, greater than 40G (memory cutoff for STAR), Dr.seq will use STAR as mapping software'''
                        % (str(totalMemory)), logfile)
            else:
                wlog(
                    'memory check is turned off, start mapping with STAR ### STAR consume > 30G memory, make sure your server have enough memory ###',
                    logfile)
            # for STAR, mapindex must be an existing index directory
            if not os.path.isdir(mapping['mapindex']):
                ewlog(
                    "cannot find STAR index folder : %s" %
                    (mapping['mapindex']), logfile)
        elif mapping['mapping_software_main'] == "bowtie2":
            wlog('use bowtie2 as alignment tools', logfile)
            # for bowtie2, mapindex is the index prefix; probe its first file
            indexfile1 = mapping['mapindex'] + '.1.bt2'
            if not os.path.isfile(indexfile1):
                ewlog("cannot find bowtie2 index file : %s " % (indexfile1),
                      logfile)
        else:
            ewlog("alignment tools can only be STAR and bowtie2", logfile)

    ### check options
    wlog('option setting: ', logfile)
    mapping = conf_dict['Step1_Mapping']
    expmat = conf_dict['Step2_ExpMat']
    qc = conf_dict['Step3_QC']
    analysis = conf_dict['Step4_Analysis']

    try:
        mapping_threads = int(mapping['mapping_p'])
    except (ValueError, TypeError):
        ewlog(
            'mapping_p should be int, current value is %s' %
            (mapping['mapping_p']), logfile)
    else:
        wlog('mapping thread is %s' % (str(mapping_threads)), logfile)

    if not int(mapping['q30filter']) in [0, 1]:
        ewlog(
            'q30filter measurement can only be 0/1, current value is %s' %
            (mapping['q30filter']), logfile)

    if not int(expmat['filterttsdistance']) in [0, 1]:
        ewlog(
            'filterttsdistance measurement can only be 0/1, current value is %s'
            % (expmat['filterttsdistance']), logfile)

    if not int(expmat['ttsdistance']) > 0:
        ewlog(
            'ttsdistance value should greater than 0, current value is %s' %
            (expmat['ttsdistance']), logfile)

    if int(expmat['covergncutoff']) > 10000:
        ewlog(
            'covergncutoff value cannot be greater than 10000, current value is %s'
            % (expmat['covergncutoff']), logfile)

    if not int(expmat['duplicate_measure']) in [0, 1, 2, 3]:
        ewlog(
            'duplicate_measure value can only be 0~3, current value is %s' %
            (expmat['duplicate_measure']), logfile)

    if not int(qc['select_cell_measure']) in [1, 2]:
        ewlog(
            'select_cell_measure value can only be 1 or 2, current value is %s'
            % (qc['select_cell_measure']), logfile)

    # each selection mode depends on its own integer cutoff
    if int(qc['select_cell_measure']) == 1:
        try:
            int(qc['covergncluster'])
        except (ValueError, TypeError):
            ewlog(
                'covergncluster value should be integer, current value is %s' %
                (qc['covergncluster']), logfile)
    elif int(qc['select_cell_measure']) == 2:
        try:
            int(qc['topumicellnumber'])
        except (ValueError, TypeError):
            # report the offending option itself, not covergncluster
            ewlog(
                'topumicellnumber value should be integer, current value is %s'
                % (qc['topumicellnumber']), logfile)
    else:
        ewlog(
            'select_cell_measure value can only be 1 or 2, current value is %s'
            % (qc['select_cell_measure']), logfile)

    if not int(qc['remove_non_dup_cell']) in [0, 1]:
        ewlog(
            'remove_non_dup_cell measurement can only be 0/1, current value is %s'
            % (qc['remove_non_dup_cell']), logfile)
    if float(qc['non_dup_cutoff']) <= 0 or float(qc['non_dup_cutoff']) >= 1:
        ewlog(
            'non_dup_cutoff measurement should be in 0~1, current value is %s'
            % (qc['non_dup_cutoff']), logfile)
    if float(analysis['highvarz']) <= 0:
        # message names the option that is actually being checked
        ewlog(
            'highvarz measurement cannot be <= 0, current value is %s' %
            (analysis['highvarz']), logfile)
    if float(analysis['selectpccumvar']) <= 0 or float(
            analysis['selectpccumvar']) >= 1:
        ewlog(
            'selectpccumvar measurement should be in 0~1, current value is %s'
            % (analysis['selectpccumvar']), logfile)
    if not int(analysis['clustering_method']) in [1, 2, 3, 4]:
        ewlog(
            'clustering_method measurement should be chosen from 1,2,3 and 4, current value is %s'
            % (analysis['clustering_method']), logfile)

    ### check pdflatex
    # an empty first element from sp() is treated as "pdflatex not available";
    # the pipeline then still runs but the summary PDF is not built
    if sp('pdflatex --help')[0] == "":
        wlog(
            'pdflatex was not installed, Dr.seq is still processing but no summary QC report generated',
            logfile)
        general['latex'] = 0
    else:
        general['latex'] = 1

    wlog('Step0 Data integrate DONE', logfile)

    return conf_dict
コード例 #5
0
ファイル: Drseq.py プロジェクト: TongjiZhanglab/drseq
def main():
    '''
    Command line entry point of Dr.seq.

    Reads the configuration file named by args.config, normalises the output
    name and output directory, creates / enters the output directory, copies
    the configuration file into it and then runs the pipeline steps in order
    (step0, step1, step3, step4, step5), logging the running time of the
    matrix generation, QC and clustering stages to progress_log.txt.

    Exits with status 1 when the output name or output directory is not
    usable (blank outname, "~" in either value, output path is an existing
    file, or an existing directory without the overwrite flag args.fover).
    '''
    args = parse_args()
    conf_dict = read_conf(args.config)
    general = conf_dict['General']
    ### read raw path of output dir, the startdir will be used when the input file is not in absolute path
    general['startdir'] = os.getcwd() + '/'

    ### check output name and dir from input parameter
    # print(...) with a single argument behaves the same on Python 2 and 3
    if general['outname'] == "":
        print('your outname cannot be left blank,exit')
        sys.exit(1)
    if "." in general['outname']:
        oldname = general['outname']
        newname = oldname.replace(".", "-")
        general['outname'] = newname
        print('replace outname from %s to %s for latex summary' % (oldname,
                                                                   newname))
    if general['outputdirectory'] == "":
        general['outputdirectory'] = general['outname']
        print('output directory is blank, use outname as directory name and set output directory in current folder')
    if "~" in general['outname']:
        print('ERROR: ~ cannot appeared in outname, current outname is %s' % (
            general['outname']))
        sys.exit(1)
    if "~" in general['outputdirectory']:
        print('ERROR: require absolute path for outputdirectory')
        sys.exit(1)
    if not general['outputdirectory'].endswith('/'):
        general['outputdirectory'] += '/'
    if not general['outputdirectory'].startswith('/'):
        general['outputdirectory'] = general['startdir'] + general[
            'outputdirectory']
    outdir = general['outputdirectory']

    ### creat output dir
    if os.path.isfile(outdir.rstrip("/")):
        print('ERROR: name of your output dir is exist as a file, cannot create a dir,Dr.seq exit')
        sys.exit(1)
    elif os.path.isdir(outdir):
        if not args.fover:
            print('ERROR: name of your output dir is exist as a dir, Dr.seq exit because overwrite function is turned off, you can add -f parameter to turn on overwite function')
            sys.exit(1)
        else:
            print('name of your output dir is exist as a dir, overwrite is turned on, write output result in existing dir')
    else:
        # no shell involved: safe for paths with spaces, and missing parent
        # directories are created as well
        os.makedirs(outdir)

    ### move to output dir
    os.chdir(outdir)
    ## cp config file to output folder
    # os.path.join keeps an absolute args.config untouched and resolves a
    # relative one against the start directory
    cmd = 'cp %s .' % (os.path.join(general['startdir'], args.config))
    CMD(cmd)
    ### specify the main progress log file
    logfile = outdir + 'progress_log.txt'
    ### remove existing log file.
    if os.path.isfile(logfile):
        CMD('rm %s' % logfile)

    ### Rscript location: the Rscript/ folder inside the installed Drseqpipe package
    conf_dict['rscript'] = os.path.join(Drseqpipe.__path__[0], "Rscript/")
    conf_dict['clean'] = args.Clean

    ### main step for Dr.seq , see individual script for detail note.
    # preparing step, integrate parameter, prepare for following step
    # (the first timer below covers step0 and step1 together)
    t = time.time()
    step0_integrate_data(conf_dict, logfile)
    # main data processing step, including mapping, generate expression matrix and QC matrix which is used in next step
    step1_generate_matrix(conf_dict, logfile)
    step1time = time.time() - t
    wlog("running time for expression matrix generation: %s" % (step1time),
         logfile)
    # QC step, including bulk RNAseq QC(option), individual cell QC
    t = time.time()
    step3_QC(conf_dict, logfile)
    step3time = time.time() - t
    wlog("running time for QC: %s" % (step3time), logfile)
    # analysis step, including  select cell, filter high variance gene, pca + t-SNE dimentional reduction, k-means + Gap stat clustering
    t = time.time()
    step4_analysis(conf_dict, logfile)
    step4time = time.time() - t
    wlog("running time for clustering: %s" % (step4time), logfile)
    # summary step, integrate all QC figure and expression matrix, generate qC report with latex
    step5_summary(conf_dict, logfile)
コード例 #6
0
def step3_QC(conf_dict,logfile):
    '''
    Step 3: mapping statistics, bulk read QC and individual cell QC.

    What this step does, in order:
      1. Sums the per-cell counts of the full QC matrix
         (conf_dict['Step2_ExpMat']['qcmatfull']) into
         conf_dict['Mapping_stat'], and counts the lines of the reformed
         barcode file and of the bed file with `wc -l`.
      2. Creates <outputdirectory>QC/, changes the working directory to it
         and writes the mapping summary table.
      3. Runs the bulk QC (readsqc, bedtools intersect, GBcover,
         readsbulkQC.r) on the sampled-down reads.
      4. Runs individual_qc.r to produce per-cell QC plots and the
         "clustercell" QC / expression matrices.

    Parameters:
      conf_dict : nested configuration dict; it is updated in place
                  ('Mapping_stat', 'QCplots', 'Step2_ExpMat', 'results').
      logfile   : path of the progress log handed to wlog / rwlog / ewlog.

    Returns:
      The same conf_dict object that was passed in.
    '''
    wlog('Step3: bulk and individual cell QC',logfile)
    general = conf_dict['General']
    outname = general['outname']

    ### preparing mapping state dict
    wlog('calculate mapping state',logfile)
    # (statistic name, column index in the QC matrix); indices are taken
    # as-is from the matrix layout this function has always relied on.
    stat_columns = (
        ('umi_gene', 2),
        ('cdsN', 3),
        ('utr3N', 4),
        ('utr5N', 5),
        ('intronN', 6),
        ('intergenicN', 7),
    )
    conf_dict['Mapping_stat'] = {}
    mapping_stat = conf_dict['Mapping_stat']
    for stat_name, _ in stat_columns:
        mapping_stat[stat_name] = 0

    ### calculate mapping state based on QC matrix
    # `with` guarantees the handle is released even if a row fails to parse.
    with open(conf_dict['Step2_ExpMat']['qcmatfull']) as inf:
        for line in inf:
            # header row
            if line.startswith('cellname'):
                continue
            fields = line.split()
            for stat_name, column in stat_columns:
                mapping_stat[stat_name] += int(fields[column])

    # line counts of the two files, taken from the first token of `wc -l`
    mapping_stat['totalreads'] = int(sp('wc -l %s'%(general['barcode_reform']))[0].split()[0])
    mapping_stat['q30reads'] = int(sp('wc -l %s'%(general['bed']))[0].split()[0])

    ### create  QC dir and conduct QC
    wlog('generate reads QC measurement with own script, based on sample down reads',logfile)
    qcdir = general['outputdirectory'] + 'QC/'
    createDIR(qcdir)
    # NOTE: relative output names used below are meant to land in qcdir.
    os.chdir(qcdir)
    qc_prefix = qcdir + outname

    conf_dict['QCplots'] = {}
    qcplots = conf_dict['QCplots']
    qcplots['map_summary'] = qc_prefix + '_map_summary.txt'
    # The text (including the "mappble" spelling and the trailing space on
    # that row) is kept byte-for-byte, since other steps may read this file.
    mapsummary_doc = (
        "genomic region(Category)\treads number\n"
        "total reads\t%s\n"
        "mappble reads\t%s \n"
        "total UMI count\t%s\n"
        "CDS exon UMI count\t%s\n"
        "3'UTR UMI count\t%s\n"
        "5'UTR UMI count\t%s\n"
        "intron UMI count\t%s\n"
        "intergenic UMI count\t%s\n"
    )%(str(mapping_stat['totalreads']),
       str(mapping_stat['q30reads']),
       str(mapping_stat['umi_gene']),
       str(mapping_stat['cdsN']),
       str(mapping_stat['utr3N']),
       str(mapping_stat['utr5N']),
       str(mapping_stat['intronN']),
       str(mapping_stat['intergenicN']))
    with open(qcplots['map_summary'],'w') as outf:
        outf.write(mapsummary_doc)

    ## reads quality
    t = time.time()
    readsqc(general['sampledownsam'],outname)
    wlog('generate bulk cell QC measurement with own script, based on sample down reads',logfile)

    # count sampled-down reads per annotated gene-body bin (bedtools -c)
    gbbin_bed = outname + '_sampledown_on_gbbin.bed'
    cmd = "bedtools intersect -a %s -b %s -c > %s"%(general['outputdirectory'] + 'annotation/'+ outname + '_gene_anno_binexon.bed', general['sampledownbed'],gbbin_bed )
    rwlog(cmd,logfile)
    GBcover(gbbin_bed,outname)
    cmd = "%s %s %s"%('Rscript',conf_dict['rscript']+'readsbulkQC.r',outname)
    rwlog(cmd,logfile)

    # An earlier, commented-out implementation shelled out to external
    # commands configured under conf_dict['Step3_QC'] (read_qul, read_nvc,
    # read_gc, gb_cover) and renamed their PDFs; that dead code was removed
    # here because the calls above replace it.

    qcplots['read_qul'] = qc_prefix + '_Figure1_quality_heatmap.pdf'
    qcplots['read_nvc'] = qc_prefix + '_Figure2_NVC.pdf'
    qcplots['read_gc'] = qc_prefix + '_Figure3_GC.pdf'
    qcplots['gb_cover'] = qc_prefix + '_Figure4_GBcover.pdf'
    bulkqctime = time.time() - t
    wlog("time for bulkqc: %s"%(bulkqctime),logfile)

    ### individual cell QC
    wlog('generate individual cell QC measurement',logfile)
    t = time.time()
    qcplots['duprate'] = qc_prefix + '_Figure5_duprate.pdf'
    qcplots['covergn'] = qc_prefix + '_Figure8_coverGN.pdf'
    qcplots['intronrate'] = qc_prefix + '_Figure9_intronrate.pdf'

    # dot plots are registered as png when png_for_dot == 1, else as pdf
    if general['png_for_dot'] == 1:
        dot_ext = '.png'
    else:
        dot_ext = '.pdf'
    qcplots['umicovergn'] = qc_prefix + '_Figure7_umi_coverGN' + dot_ext
    qcplots['cumumiduprate'] = qc_prefix + '_Figure6_cumUMI_duprate' + dot_ext

    conf_dict['Step2_ExpMat']['qcmatcc'] = qc_prefix + "_qcmat_clustercell.txt"
    conf_dict['Step2_ExpMat']['expmatcc'] = qc_prefix + "_expmat_clustercell.txt"
    conf_dict['results']['expmatcc'] = qc_prefix + "_expmat_clustercell.txt"

    step3_conf = conf_dict['Step3_QC']
    select_cell_measure = int(step3_conf['select_cell_measure'])
    if select_cell_measure == 1:
        use_cutoff = step3_conf['covergncluster']
    elif select_cell_measure == 2:
        use_cutoff = step3_conf['topumicellnumber']
    else:
        # Report the value that was actually validated. The previous code
        # read it from the 'Step4_Analysis' section, which is not where this
        # option is looked up and could fail with a KeyError instead of
        # producing the message.
        # NOTE(review): ewlog presumably aborts the run; if it returns,
        # use_cutoff is unbound below - confirm.
        ewlog('select_cell_measure value can only be 1 or 2, current value is %s'%(step3_conf['select_cell_measure']),logfile)

    cmd = "%s %s %s %s %s %s %s %s %s %s %s %s %s"%(
        'Rscript',
        conf_dict['rscript']+'individual_qc.r',
        conf_dict['Step2_ExpMat']['qcmat'],
        conf_dict['Step2_ExpMat']['expmat'],
        outname,
        step3_conf['select_cell_measure'],
        use_cutoff,
        step3_conf['remove_non_dup_cell'],
        step3_conf['non_dup_cutoff'],
        mapping_stat['umi_gene'],
        conf_dict['Step2_ExpMat']['qcmatcc'],
        conf_dict['Step2_ExpMat']['expmatcc'],
        general['png_for_dot'])
    rwlog(cmd,logfile)
    individualqctime = time.time() - t
    wlog("time for individualqc: %s"%(individualqctime),logfile)
    wlog("Step3 bulk and individual cell QC DONE",logfile)
    return conf_dict