예제 #1
0
def step4_analysis(conf_dict,logfile):
    '''
    Step 4: dimensional reduction + clustering, driven by R scripts.

    Creates <outputdirectory>/analysis/, makes it the working directory,
    registers the expected output paths in conf_dict and launches the R
    script selected by conf_dict['Step4_Analysis']['dimensionreduction_method']:
        1 -> DrSeq_analysis.r  (logged as t-SNE)
        2 -> DrSeq_SIMLR.r     (logged as SIMLR)
        3 -> DrSeq_PCA.r       (logged as PCA)
    Any other value only writes a log message and launches nothing.
    When a reduction script was launched, DrSeq_post_analysis.r is then run
    with the cluster result path, the clustered-cell QC matrix path and the
    output name.

    Parameters:
        conf_dict: nested dict of pipeline settings. Reads 'General',
                   'Step4_Analysis', 'Step2_ExpMat', 'results' and the
                   top-level 'rscript' prefix; writes into 'Step4_Analysis',
                   'QCplots' and 'results'.
        logfile:   handed unchanged to Log / LogCommand.

    Returns:
        The same conf_dict object, updated in place.
    '''
    t = time.time()
    Log('Step4: analysis',logfile)
    Log('dimentional reduction + clustering with own script, based on selected STAMP barcodes',logfile)

    general = conf_dict['General']
    step4 = conf_dict['Step4_Analysis']
    qcplots = conf_dict['QCplots']
    results = conf_dict['results']
    rscript = conf_dict['rscript']
    outname = general['outname']

    analysisdir = general['outputdirectory'] + 'analysis/'
    CreateDirectory(analysisdir)
    os.chdir(analysisdir)

    # Every output of this step is named <analysisdir><outname><suffix>.
    prefix = analysisdir + outname
    step4['clusterresult'] = prefix + '_cluster.txt'
    qcplots['gapstat'] = prefix + '_Figure10_GapStat.pdf'
    qcplots['cluster'] = prefix + '_Figure11_cluster.pdf'
    qcplots['silhouette'] = prefix + '_Figure12_silhouetteScore.pdf'
    qcplots['umicolor'] = prefix + '_Figure13_totalUMIcolored.pdf'
    qcplots['itrcolor'] = prefix + '_Figure14_intronRate_colored.pdf'
    results['cortable'] = prefix + '_correlation_table.txt'
    results['features'] = prefix + '_pctablefeatures_clustercell.txt'
    pctable_path = prefix + '_pctable.txt'

    def build_command(*parts):
        # Space-separated command line; each part is rendered with '%s'.
        return ' '.join('%s' % (part,) for part in parts)

    # Parse the method once instead of once per branch.
    method = int(step4["dimensionreduction_method"])
    if method == 1:
        Log("Using t-SNE to ceducted the dimension.",logfile)
        results['pctable'] = pctable_path
        reduction_cmd = build_command(
            'Rscript', rscript + 'DrSeq_analysis.r',
            results['expmatcc'], outname,
            step4['highvarz'], step4['selectpccumvar'],
            step4['rdnumber'], step4['maxknum'],
            step4['pctable'], step4['cortable'],
            step4['clustering_method'],
            step4['custom_k'], step4['custom_d'], step4['seed'])
    elif method == 2:
        Log("Using SIMLR to ceducted the dimension.",logfile)
        # The SIMLR branch registers no pctable result and zeroes the flag.
        step4['pctable'] = 0
        reduction_cmd = build_command(
            'Rscript', rscript + 'DrSeq_SIMLR.r',
            results['expmatcc'], outname,
            step4['highvarz'],
            step4['rdnumber'], step4['maxknum'],
            step4['cortable'],
            step4['clustering_method'],
            step4['custom_k'], step4['custom_d'], rscript)
    elif method == 3:
        Log("Using PCA to ceducted the dimension.",logfile)
        results['pctable'] = pctable_path
        reduction_cmd = build_command(
            'Rscript', rscript + 'DrSeq_PCA.r',
            results['expmatcc'], outname,
            step4['highvarz'],
            step4['rdnumber'], step4['maxknum'],
            step4['pctable'], step4['cortable'],
            step4['clustering_method'],
            step4['custom_k'], step4['custom_d'], rscript)
    else:
        Log("You can only choose the method of Dimentional reduction from t-SNE,SIMLR and PCA by now.!",logfile)
        reduction_cmd = None

    if reduction_cmd is not None:
        LogCommand(reduction_cmd,logfile)
        # The post-analysis command has to be executed, not just assembled.
        # It is skipped when no reduction script ran; presumably there is
        # then no cluster result for it to read -- TODO confirm.
        post_cmd = build_command(
            'Rscript', rscript + 'DrSeq_post_analysis.r',
            step4['clusterresult'],
            conf_dict['Step2_ExpMat']['qcmatcc'],
            outname)
        LogCommand(post_cmd,logfile)

    analysisqctime = time.time() - t
    Log("time for analysis qc: %s"%(analysisqctime),logfile)
    Log("Step4 analysis QC DONE",logfile)

    return conf_dict
예제 #2
0
def step1_generate_matrix(conf_dict, logfile):
    '''
    Run the alignment stage (logged as Step1) and the expression-matrix
    stage (logged as Step2).

    Step1:
        * If General/format is 'sam' the reads file is used as the sam file.
          Otherwise the aligner named in Step1_Mapping/mapping_software_main
          (STAR, bowtie2 or HISAT2) is run inside <outputdirectory>/mapping/.
        * SampleDownTransformSam turns the sam file into a bed file plus
          sampled-down sam/bed files; it receives 5000000 as the sample size
          and the integer q30filter flag.
    Step2:
        * GeneAnnotation is called inside <outputdirectory>/annotation/.
        * Inside <outputdirectory>/expmatrix/ the bed file is intersected with
          five annotation beds (bedtools + sort), the barcode file is reformed
          (unless General/format1 is 'txt') and sorted, CombineReads merges
          everything, and GenerateMatrix writes the qcmatfull / qcmat / expmat
          files.

    conf_dict is updated in place (new paths under 'General' and
    'Step2_ExpMat', and 'results' reset to an empty dict) and returned.
    logfile is passed through to the logging helpers.
    '''
    Log("Step1: alignment", logfile)
    started = time.time()

    general = conf_dict['General']
    outname = general['outname']

    # --- mapping ---------------------------------------------------------
    mapping_dir = general['outputdirectory'] + 'mapping/'
    CreateDirectory(mapping_dir)

    if general['format'] != 'sam':
        Log('Now start mapping in %s , all mapping result will be here' %
            (mapping_dir), logfile)
        os.chdir(mapping_dir)

        mapper_conf = conf_dict['Step1_Mapping']
        aligner = mapper_conf['mapping_software_main']
        pending_commands = []

        if aligner == "STAR":
            Log('user choose STAR as alignment software', logfile)
            if Get('which STAR')[0].strip() == "":
                LogError(
                    'STAR is not detected in default PATH, make sure you installed STAR and export it into default PATH',
                    logfile)
            pending_commands.append(
                'STAR --genomeDir %s --readFilesIn %s --runThreadN %s' %
                (mapper_conf['mapindex'], general['reads_file'],
                 mapper_conf['mapping_p']))
            # STAR writes Aligned.out.sam; rename it to <outname>.sam.
            pending_commands.append('mv Aligned.out.sam %s.sam' % (outname))

        elif aligner == "bowtie2":
            Log('user choose bowtie2 as alignment software', logfile)
            if Get('which bowtie2')[0].strip() == "":
                LogError(
                    'bowtie2 is not detected in default PATH, make sure you installed bowtie2 and export it into default PATH',
                    logfile)
            pending_commands.append(
                'bowtie2 -p %s -x %s -U %s -S %s.sam   2>&1 >>/dev/null |tee -a %s.bowtieout' %
                (mapper_conf['mapping_p'], mapper_conf['mapindex'],
                 general['reads_file'], outname, outname))

        elif aligner == "HISAT2":
            Log('user choose HISAT2 as alignment software', logfile)
            if Get('which hisat2')[0].strip() == "":
                LogError(
                    'hisat2 is not detected in default PATH, make sure you installed hisat2 and export it into default PATH',
                    logfile)
            pending_commands.append(
                'hisat2 -p %s -x %s -U %s -S %s.sam   2>&1 >>/dev/null |tee -a %s.hisat2out' %
                (mapper_conf['mapping_p'], mapper_conf['mapindex'],
                 general['reads_file'], outname, outname))

        else:
            LogError("alignment tools can only be HISAT2, STAR or bowtie2",
                     logfile)

        for command in pending_commands:
            LogCommand(command, logfile)

        general['sam'] = mapping_dir + outname + '.sam'
    else:
        Log('reads file format is sam, skip mapping step', logfile)
        general['sam'] = general['reads_file']

    # --- sam -> bed (optional q30 filter) and sample-down ------------------
    Log("transfer sam file to aligned bed file with own script", logfile)
    general['bed'] = mapping_dir + outname + '.bed'
    general['sampledownsam'] = mapping_dir + outname + '_sampledown.sam'
    general['sampledownbed'] = mapping_dir + outname + '_sampledown.bed'

    q30_flag = int(conf_dict['Step1_Mapping']['q30filter'])
    Log("q30 filter is turned on" if q30_flag == 1
        else "q30 filter is turned off", logfile)

    SampleDownTransformSam(general['sam'], general['bed'],
                           general['sampledownsam'],
                           general['sampledownbed'], 5000000, q30_flag)

    # A missing or empty bed file means the previous stage produced nothing.
    bed_path = general['bed']
    if not (os.path.isfile(bed_path) and os.path.getsize(bed_path) != 0):
        LogError(
            'Alignment step / q30 filtering step failed, check your alignment parameter and samfile',
            logfile)
    Log("time for alignment: %s" % (time.time() - started), logfile)
    Log("Step1: alignment DONE", logfile)

    # --- annotation files ----------------------------------------------------
    started = time.time()
    Log("Step2: transform expression matrix", logfile)
    Log('generate related annotation file with own script', logfile)
    annotation_dir = general['outputdirectory'] + 'annotation/'
    CreateDirectory(annotation_dir)
    os.chdir(annotation_dir)
    expmat_conf = conf_dict['Step2_ExpMat']
    GeneAnnotation(general['gene_annotation'], expmat_conf['ttsdistance'],
                   outname)

    # --- expression matrix directory ------------------------------------------
    Log(
        'generate expression matrix and individual cell qc matrix with own script',
        logfile)
    expdir = general['outputdirectory'] + 'expmatrix/'
    CreateDirectory(expdir)
    os.chdir(expdir)

    # Intersect the reads with each annotation bed and sort by column 4.
    # The symbol annotation uses -wo, the other four use -c.
    Log('add gene annotation on aligned bed file', logfile)
    overlap_template = "bedtools intersect -a %s -b %s  -wo | sort -k 4,4 --parallel=6 -T . -S 8%% - > %s"
    count_template = "bedtools intersect -a %s -b %s -c | sort -k 4,4 --parallel=6 -T . -S 8%% - > %s"
    intersect_jobs = (
        (overlap_template, 'symbol'),
        (count_template, 'cds'),
        (count_template, '3utr'),
        (count_template, '5utr'),
        (count_template, 'TTSdis'),
    )
    for template, tag in intersect_jobs:
        LogCommand(
            template % (general['bed'],
                        annotation_dir + outname + '_gene_anno_' + tag + '.bed',
                        outname + '_on_' + tag + '.bed'),
            logfile)

    # --- barcode table ---------------------------------------------------------
    if general['format1'] == 'txt':
        Log('barcode files is reformed txt format, skip reform step', logfile)
        general['barcode_reform'] = general['barcode_file']
    else:
        Log('reform barcode files with own script', logfile)
        general['barcode_reform'] = expdir + outname + '_barcode_reform.txt'
        ReformBarcodeFastq(general['barcode_file'],
                           general['barcode_reform'],
                           general['cell_barcode_range'],
                           general['umi_range'])

    # Sort the barcode table on its first column; from here on
    # 'barcode_reform' points at the sorted file.
    sorted_barcodes = expdir + outname + '_barcode_reform_sort.txt'
    LogCommand(
        'sort -k 1,1 --parallel=6 -T . -S 8%% %s > %s' %
        (general['barcode_reform'], sorted_barcodes), logfile)
    general['barcode_reform'] = sorted_barcodes

    # --- merge annotation, reads and barcodes ------------------------------------
    Log('combine annotation and barcode on reads with own script', logfile)
    combined_bed = outname + '_combined.bed'
    combined_sorted_bed = outname + '_combined_sort.bed'
    CombineReads(general['barcode_reform'],
                 outname + '_on_cds.bed',
                 outname + '_on_3utr.bed',
                 outname + '_on_5utr.bed',
                 outname + '_on_symbol.bed',
                 outname + '_on_TTSdis.bed',
                 combined_bed,
                 expmat_conf['duplicate_measure'])

    # Sort the combined file on columns 7 and 5 before building the matrices.
    LogCommand(
        "sort -k 7,7 -k 5,5 --parallel=6 -T . -S 8%% %s > %s" %
        (combined_bed, combined_sorted_bed), logfile)

    # --- expression and QC matrices ------------------------------------------------
    Log('generate expression matrix and QC matrix with own script', logfile)
    for matrix_key in ('qcmatfull', 'qcmat', 'expmat'):
        expmat_conf[matrix_key] = expdir + outname + '_' + matrix_key + '.txt'

    GenerateMatrix(general['gene_annotation'],
                   combined_sorted_bed,
                   expmat_conf['filterttsdistance'],
                   expmat_conf['qcmatfull'],
                   expmat_conf['qcmat'],
                   expmat_conf['expmat'],
                   expmat_conf['covergncutoff'],
                   expmat_conf['umidis1'])

    Log("Step2 transform expression matrix DONE", logfile)
    Log("time for transform expmat: %s" % (time.time() - started), logfile)

    # Start an empty results section.
    conf_dict['results'] = {}

    return conf_dict
예제 #3
0
def step3_QC(conf_dict, logfile):
    '''
    Step 3: mapping statistics, bulk QC and individual-cell QC.

    * Sums columns 2-7 of the full QC matrix (Step2_ExpMat/qcmatfull,
      header line starting with 'cellname' skipped) into
      conf_dict['Mapping_stat'], and adds the line counts of the sorted
      barcode file ('totalreads') and of the bed file ('q30reads').
    * Creates <outputdirectory>/QC/, makes it the working directory and
      writes the mapping summary table there.
    * Runs readsqc, a bedtools intersect against the binned-exon annotation,
      GBcover and DrSeq_readsbulk_QC.r on the sampled-down reads.
    * Registers the expected figure paths in conf_dict['QCplots'] and runs
      DrSeq_individual_QC.r to produce the clustered-cell QC and expression
      matrices.

    Parameters:
        conf_dict: nested dict of pipeline settings. Reads 'General',
                   'Step2_ExpMat', 'Step3_QC', 'results' and 'rscript';
                   (re)creates 'Mapping_stat' and 'QCplots'.
        logfile:   handed unchanged to Log / LogError / LogCommand.

    Returns:
        The same conf_dict object, updated in place.
    '''
    Log('Step3: bulk and individual cell QC', logfile)

    general = conf_dict['General']
    expmat_conf = conf_dict['Step2_ExpMat']
    qc_conf = conf_dict['Step3_QC']
    outname = general['outname']

    ### mapping statistics: counter name -> column index in the QC matrix
    Log('calculate mapping state', logfile)
    stat_columns = (
        ('umi_gene', 2),
        ('cdsN', 3),
        ('utr3N', 4),
        ('utr5N', 5),
        ('intronN', 6),
        ('intergenicN', 7),
    )
    mapping_stat = dict((stat_name, 0) for stat_name, _ in stat_columns)
    conf_dict['Mapping_stat'] = mapping_stat

    # 'with' guarantees the file is closed even if a row fails to parse.
    with open(expmat_conf['qcmatfull']) as qc_table:
        for line in qc_table:
            if line.startswith('cellname'):
                continue
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline).
                continue
            for stat_name, column in stat_columns:
                mapping_stat[stat_name] += int(fields[column])

    mapping_stat['totalreads'] = int(
        Get('wc -l %s' % (general['barcode_reform']))[0].split()[0])
    mapping_stat['q30reads'] = int(
        Get('wc -l %s' % (general['bed']))[0].split()[0])

    ### create QC dir and write the mapping summary
    Log(
        'generate reads QC measurement with own script, based on sample down reads',
        logfile)
    qcdir = general['outputdirectory'] + 'QC/'
    CreateDirectory(qcdir)
    os.chdir(qcdir)
    qc_prefix = qcdir + outname

    qcplots = {}
    conf_dict['QCplots'] = qcplots
    qcplots['map_summary'] = qc_prefix + '_map_summary.txt'

    # Row text (including the space after the "mappble reads" value) is kept
    # exactly as emitted before, in case other tools parse this file.
    mapsummary_doc = (
        "genomic region(Category)\treads number\n"
        "total reads\t%s\n"
        "mappble reads\t%s \n"
        "total UMI count\t%s\n"
        "CDS exon UMI count\t%s\n"
        "3'UTR UMI count\t%s\n"
        "5'UTR UMI count\t%s\n"
        "intron UMI count\t%s\n"
        "intergenic UMI count\t%s\n"
    ) % (str(mapping_stat['totalreads']),
         str(mapping_stat['q30reads']),
         str(mapping_stat['umi_gene']),
         str(mapping_stat['cdsN']),
         str(mapping_stat['utr3N']),
         str(mapping_stat['utr5N']),
         str(mapping_stat['intronN']),
         str(mapping_stat['intergenicN']))
    with open(qcplots['map_summary'], 'w') as outf:
        outf.write(mapsummary_doc)

    ### reads-level and bulk-cell QC on the sampled-down reads
    t = time.time()
    readsqc(general['sampledownsam'], outname)
    Log(
        'generate bulk cell QC measurement with own script, based on sample down reads',
        logfile)

    gbbin_bed = outname + '_sampledown_on_gbbin.bed'
    cmd = "bedtools intersect -a %s -b %s -c > %s" % (
        general['outputdirectory'] + 'annotation/' + outname +
        '_gene_anno_binexon.bed',
        general['sampledownbed'],
        gbbin_bed)
    LogCommand(cmd, logfile)
    GBcover(gbbin_bed, outname)
    cmd = "%s %s %s" % ('Rscript',
                        conf_dict['rscript'] + 'DrSeq_readsbulk_QC.r',
                        outname)
    LogCommand(cmd, logfile)

    qcplots['read_qul'] = qc_prefix + '_Figure1_quality_heatmap.pdf'
    qcplots['read_nvc'] = qc_prefix + '_Figure2_NVC.pdf'
    qcplots['read_gc'] = qc_prefix + '_Figure3_GC.pdf'
    qcplots['gb_cover'] = qc_prefix + '_Figure4_GBcover.pdf'
    bulkqctime = time.time() - t
    Log("time for bulkqc: %s" % (bulkqctime), logfile)

    ### individual cell QC
    Log('generate individual cell QC measurement', logfile)
    t = time.time()
    qcplots['duprate'] = qc_prefix + '_Figure5_duprate.pdf'
    qcplots['covergn'] = qc_prefix + '_Figure8_coverGN.pdf'
    qcplots['intronrate'] = qc_prefix + '_Figure9_intronrate.pdf'

    # Accept the flag both as the integer 1 and as the string '1'; a bare
    # "== 1" never matches a value that is still a string.
    # NOTE(review): assumes a string flag comes straight from a config
    # file -- confirm how png_for_dot is populated upstream.
    dot_ext = '.png' if general['png_for_dot'] in (1, '1') else '.pdf'
    qcplots['umicovergn'] = qc_prefix + '_Figure7_umi_coverGN' + dot_ext
    qcplots['cumumiduprate'] = qc_prefix + '_Figure6_cumUMI_duprate' + dot_ext

    expmat_conf['qcmatcc'] = qc_prefix + "_qcmat_clustercell.txt"
    expmat_conf['expmatcc'] = qc_prefix + "_expmat_clustercell.txt"
    conf_dict['results']['expmatcc'] = qc_prefix + "_expmat_clustercell.txt"

    select_measure = int(qc_conf['select_cell_measure'])
    if select_measure == 1:
        use_cutoff = qc_conf['covergncluster']
    elif select_measure == 2:
        use_cutoff = qc_conf['topumicellnumber']
    else:
        # Report the value from the section it is actually read from
        # (Step3_QC); looking it up elsewhere can raise KeyError and hide
        # this message.
        # NOTE(review): LogError is presumably fatal; if it returns,
        # use_cutoff is unbound below -- confirm.
        LogError(
            'select_cell_measure value can only be 1 or 2, current value is %s'
            % (qc_conf['select_cell_measure']), logfile)

    cmd = "%s %s %s %s %s %s %s %s %s %s %s %s %s" % (
        'Rscript', conf_dict['rscript'] + 'DrSeq_individual_QC.r',
        expmat_conf['qcmat'],
        expmat_conf['expmat'], outname,
        qc_conf['select_cell_measure'], use_cutoff,
        qc_conf['remove_low_dup_cell'],
        qc_conf['non_dup_cutoff'],
        mapping_stat['umi_gene'],
        expmat_conf['qcmatcc'],
        expmat_conf['expmatcc'],
        general['png_for_dot'])
    LogCommand(cmd, logfile)
    individualqctime = time.time() - t
    Log("time for individualqc: %s" % (individualqctime), logfile)
    Log("Step3 bulk and individual cell QC DONE", logfile)
    return conf_dict
예제 #4
0
def step5_summary(conf_dict,logfile):
    '''
    analysis part
    mainly Rscript
    dimentional reduction + clustering
    '''
    # start
    # create section for 
    
    Log('Step5: summary',logfile)
    Log('copy results',logfile)
    summarydir = conf_dict['General']['outputdirectory'] + 'summary/'
    CreateDirectory(summarydir)
    os.chdir(summarydir)
    
    plot_folder = summarydir + "plots/"
    CreateDirectory(plot_folder)
    os.chdir(plot_folder)
    ### collect results 
    for i in conf_dict['QCplots']:
        if os.path.isfile(conf_dict['QCplots'][i]):
            #realname
            cmd = 'cp %s .'%conf_dict['QCplots'][i]
            LogCommand(cmd,logfile)

    result_folder = summarydir + "results/"
    CreateDirectory(result_folder)
    os.chdir(result_folder)
    for i in conf_dict['results']:
        if os.path.isfile(conf_dict['results'][i]):
            cmd = 'cp %s .'%conf_dict['results'][i]
            LogCommand(cmd,logfile)

    os.chdir(summarydir)

    Log('generate qc documents',logfile)
    ### initiate 
    QCdoc = """\documentclass[11pt,a4paper]{article}
\usepackage{tabularx}
\usepackage[english]{babel}
\usepackage{array}
\usepackage{graphicx}
\usepackage{color}
\DeclareGraphicsExtensions{.eps,.png,.pdf,.ps}
\\begin{document}
\\title{QC and analysis reports for Drop-seq data : %s}

\maketitle
\\tableofcontents
\\newpage
\\newpage
\section{Data description}
\\begin{quotation}
Table 1 mainly describe the input file and mapping and analysis parameters.
\end{quotation}
\\begin{table}[h]
\caption{Data description}\label{bstable}
\\begin{tabularx}{\\textwidth}{ |X|l| }

"""%(LatexFormat(conf_dict['General']['outname']))
    ### table1 prepare parameter
    if int(conf_dict['Step1_Mapping']['q30filter']) == 1:
        q30filter = "True"
    else:
        q30filter = "False"
    if int(conf_dict['Step2_ExpMat']['filterttsdistance']) == 1:
        filtertts = "True"
    else: 
        filtertts = "False"
    if int(conf_dict['Step2_ExpMat']['umidis1']) == 1:
        umidis1 = "True"
    else:
        umidis1 = "False"
    if int(conf_dict['Step3_QC']['remove_low_dup_cell']) == 1:
        rmnodup = "True"
    else:
        rmnodup = "False"

    print conf_dict['General']
    QCdoc += """      
\hline
parameter & value  \\\\
\hline
output name & %s \\\\
\hline
barcode file(file name only) & %s \\\\
\hline
reads file(file name only) & %s \\\\
\hline
reads file format & %s  \\\\
\hline
cell barcode range &  %s \\\\
\hline
UMI range & %s \\\\
\hline
mapping software & %s \\\\
\hline
Q30 filter mapped reads & %s \\\\
\hline
remove reads away TTS & %s \\\\
\hline
"""%(LatexFormat(conf_dict['General']['outname']),
     LatexFormat(conf_dict['General']['barcode_file'].split("/")[-1]),
     LatexFormat(conf_dict['General']['reads_file'].split("/")[-1]),
     conf_dict['General']['format'].upper(),
     str(conf_dict['General']['cell_barcode_range']),
     str(conf_dict['General']['umi_range']),
     conf_dict['Step1_Mapping']['mapping_software_main'],
     q30filter,
     filtertts
     )
    ### table1 part2
    if  filtertts == "True":
        QCdoc += """TTS distance (for remove) & %s bp \\\\
\hline
"""%(str(conf_dict['Step2_ExpMat']['ttsdistance'])) 
    if  int(conf_dict['Step2_ExpMat']['duplicate_measure']) == 1:
        QCdoc += """duplicate rate in each cell & UMI $+$ location \\\\"""
    elif int(conf_dict['Step2_ExpMat']['duplicate_measure']) == 2:
        QCdoc += """duplicate rate in each cell & UMI only \\\\"""
    elif int(conf_dict['Step2_ExpMat']['duplicate_measure']) == 3:
        QCdoc += """duplicate rate in each cell & location only \\\\"""
    else:
        QCdoc += """duplicate rate in each cell & keep all reads \\\\"""
    if int(conf_dict['Step2_ExpMat']['duplicate_measure']) in [1,2]:
        QCdoc += """
\hline
merge UMI ED = 1 & %s \\\\ 
\hline"""%(umidis1)
    if  int(conf_dict['Step3_QC']['select_cell_measure']) == 1:
        QCdoc += """
select STAMPs & %s covered gene \\\\
\hline"""%(str(conf_dict['Step3_QC']['covergncluster']))
    elif int(conf_dict['Step3_QC']['select_cell_measure']) == 2:
        QCdoc += """
select STAMPs & top %s UMI count \\\\
\hline"""%(str(conf_dict['Step3_QC']['topumicellnumber']))
    QCdoc += """
remove low duplicate rate cell & %s \\\\ 
\hline """%(rmnodup)
    if  rmnodup == "True":
        QCdoc += """
low duplicate rate cutoff & %s  \\\\
\hline"""%(str(conf_dict['Step3_QC']['non_dup_cutoff']))
    QCdoc += """
z-score for highly variable gene & %s \\\\ 
\hline 
cumulative variance for selecting PC & %s \\\\
\hline """%(str(conf_dict['Step4_Analysis']['highvarz']),
     str(100*float(conf_dict['Step4_Analysis']['selectpccumvar']))+'\\%')
 
    if  int(conf_dict['Step4_Analysis']['clustering_method']) == 1:
        QCdoc += """
cluster method & k-means (Gap statistics, first stable) \\\\"""
    elif int(conf_dict['Step4_Analysis']['clustering_method']) == 2:
        QCdoc += """
cluster method & k-means (Gap statistics, maxSE) \\\\"""
    elif int(conf_dict['Step4_Analysis']['clustering_method']) == 3:
        QCdoc += """
cluster method & k-means (custom, k=%s) \\\\"""%(conf_dict['Step4_Analysis']['custom_k'])
    else:
        QCdoc += """
cluster method & DBScan (eps=%s) \\\\"""%(conf_dict['Step4_Analysis']['custom_d'])
    QCdoc += """
\hline
\end{tabularx}
\end{table}
"""
    ### bulk QC
    QCdoc += """
\\newpage
\\newpage
\section{Reads level QC}
In the reads level QC step we measured the quality of sequencing reads, including nucleotide quality and composition. In the reads level QC step and Bulk-cell level QC step we randomly sampled down total reads to 5 million and used a published package called ``RseQC" for reference.(Wang, L., Wang, S. and Li, W. (2012) )
\\newpage
\\newpage
\subsection{Reads quality}
\\begin{quotation}
Reads quality is one of the basic reads level quality control methods. We plotted the distribution of a widely used Phred Quality Score at every position of sequence to measure the basic sequence quality of your data. Phred Quality Score was calculate by a python function $ord(Q) - 33$. Color in the heatmap represented frequency of this quality score observed at this position. Red represented higher frequency while blue was lower frequency. You may observe a decreasing of quality near the 3'end of sequence because of general degradation of quality over the duration of long runs. If the decreasing of quality influence the mappability (see ``Bulk-cell level QC") then the common remedy is to perform quality trimming where reads are truncated based on their average quality or you can trim serveal base pair near 3'end directly. If it doesn't help, you may consider your Drop-seq data poor quality. 
\end{quotation}
\\begin{figure}[h]
        \caption{Reads quality} \label{fig:profileunion}
        \setlength{\\abovecaptionskip}{0pt}
        \setlength{\\belowcaptionskip}{10pt}
        \centering
        {\includegraphics[width=0.8\\textwidth]{%s}}
\end{figure}

\\newpage
\\newpage
\subsection{Reads nucleotide composition}
\\begin{quotation}
We assess the nucleotide composition bias of a sample. The proportion of four different nucleotides was calculated at each position of reads. Theoretically four nucleotides had similar proportion at each position of reads. You may observe higher A/T count at 3'end of reads because of the 3'end polyA tail generated in sequencing cDNA libaray, otherwise the A/T count should be closer to C/G count. In any case, you should observe a stable pattern at least in the 3'end of reads. Spikes (un-stable pattern) which occur in the middle or tail of the reads indicate low sequence quality. You can trim serveral un-stable bases from the 3'end if low mappability (see ``Bulk-cell level QC") is also observed. If it doesn't help, you may consider your Drop-seq data poor quality. Note that t
he A/T vs G/C content can greatly vary from Getecies to Getecies. 
\end{quotation}
\\begin{figure}[h]
        \caption{Reads nucleotide composition} \label{fig:profileunion}
        \setlength{\\abovecaptionskip}{0pt}
        \setlength{\\belowcaptionskip}{10pt}
        \centering
        {\includegraphics[width=0.8\\textwidth]{%s}}
\end{figure}

\\newpage
\\newpage
\subsection{Reads GC content}
\\begin{quotation}
Distribution of GC content of each read. This module measures the general quality of the library. If the distribution looks different from a single bell (too sharp or too broad) then there may be a problem with the library. Sharp peaks on an otherwise smooth distribution are normally the result of a Getecific contaminant (adapter dimers for example), which may well be picked up by the overrepresented sequences module. Broader peaks may represent contamination with a different Getecies. If you observe sharp peak or broder peak and also observe low mappability (see ``Bulk-cell level QC"), you may consider your Drop-seq data poor quality.
\end{quotation}
\\begin{figure}[h]
        \caption{Reads GC content} \label{fig:profileunion}
        \setlength{\\abovecaptionskip}{0pt}
        \setlength{\\belowcaptionskip}{10pt}
        \centering
        {\includegraphics[width=0.8\\textwidth]{%s}}
\end{figure}
"""%((conf_dict['QCplots']['read_qul'].split("/")[-1]),
     (conf_dict['QCplots']['read_nvc'].split("/")[-1]),
     (conf_dict['QCplots']['read_gc'].split("/")[-1])
    )

    QCdoc += """
\\newpage
\\newpage
\section{Bulk-cell level QC}
In the bulk-cell level QC step we measured the performance of total Drop-seq reads. In this step we did't separate cell or remove ``empty" cell barcodes, just like treated the sample as bulk RNA-seq sample.
\\newpage
\\newpage
\subsection{Reads alignment summary}
\\begin{quotation}
The following table shows mappability and distribution of total Drop-seq reads. It measures the general quality of data as a RNA-seq sample. Low mappability indicates poor sequence quality(see ``Reads level QC") or library quality(caused by contaminant). High duplicate rate (low total UMI percentage observed, e.g. $<$ 10\\%%) indicate insufficient RNA material and Overamplification. In summary, if the percentage of ``total UMI count" is less than 5\\%%, users may consider reconstruct your library(redo the experiment), but first you should make sure you already trim the adapter and map your reads to the correGetonded Getecies(genome version). Note that UMI number was calculated by removing duplicate reads (which have identical genomic location, cell barcode and UMI sequences). Mappable reads was after Q30 filtering if Q30 filter function was turned on.\\\\
** the percentage was calculated by dividing total reads number \\\\
*** the percentage was calculated by divding total UMI number
\end{quotation}
\\begin{table}[h]
\caption{Reads alignment summary}\label{bstable}
\\begin{tabularx}{\\textwidth}{ |X|X| }
    
\hline
genomic region(Category) &  reads number \\\\
\hline
total reads & %s \\\\
\hline
mappble reads &  %s (%s\\%%)* \\\\
\hline
total UMI count & %s (%s\\%%)* \\\\
\hline
CDS exon UMI count & %s (%s\\%%)** \\\\
\hline
3'UTR UMI count & %s (%s\\%%)** \\\\
\hline
5'UTR UMI count & %s (%s\\%%)** \\\\
\hline
intron UMI count & %s (%s\\%%)** \\\\
\hline
intergenic UMI count & %s (%s\\%%)** \\\\
\hline

\end{tabularx}
\end{table}
"""%(NumberFormat(str(conf_dict['Mapping_stat']['totalreads'])),
     NumberFormat(str(conf_dict['Mapping_stat']['q30reads'])),
     str( round(100*conf_dict['Mapping_stat']['q30reads']*1.0/conf_dict['Mapping_stat']['totalreads'], 2)),
     NumberFormat(str(conf_dict['Mapping_stat']['umi_gene'])),
     str( round(100*conf_dict['Mapping_stat']['umi_gene']*1.0/conf_dict['Mapping_stat']['totalreads'], 2)),
     NumberFormat(str(conf_dict['Mapping_stat']['cdsN'])),
     str( round(100*conf_dict['Mapping_stat']['cdsN']*1.0/conf_dict['Mapping_stat']['umi_gene'], 2)),
     NumberFormat(str(conf_dict['Mapping_stat']['utr3N'])),
     str( round(100*conf_dict['Mapping_stat']['utr3N']*1.0/conf_dict['Mapping_stat']['umi_gene'], 2)),
     NumberFormat(str(conf_dict['Mapping_stat']['utr5N'])),
     str( round(100*conf_dict['Mapping_stat']['utr5N']*1.0/conf_dict['Mapping_stat']['umi_gene'], 2)),
     NumberFormat(str(conf_dict['Mapping_stat']['intronN'])),
     str( round(100*conf_dict['Mapping_stat']['intronN']*1.0/conf_dict['Mapping_stat']['umi_gene'], 2)),
     NumberFormat(str(conf_dict['Mapping_stat']['intergenicN'])),
     str( round(100*conf_dict['Mapping_stat']['intergenicN']*1.0/conf_dict['Mapping_stat']['umi_gene'], 2)))
     ### genebody coverage
    QCdoc += """
\\newpage
\\newpage
\subsection{Gene body coverage}
\\begin{quotation}
Aggregate plot of reads coverage on all genes. This module measures the general quality of the Drop-seq data. Theoretically we observe a unimodal (single bell) distribution, but for Drop-seq sample an enrichment at 3'end is observed due to library preparation using oligo-dT primers. In any case you should observe a smooth distritbuion. If loss of reads or Getike are observed in certain part of gene body (e.g. middle or 3'end of gene body), poor quality of your library was indicated. EGetecially when low mappability and high intron rate are also observed (see ``Reads alignment summary" section).
\end{quotation}
\\begin{figure}[h]
        \caption{Gene body coverage} \label{fig:profileunion}
        \setlength{\\abovecaptionskip}{0pt}
        \setlength{\\belowcaptionskip}{10pt}
        \centering
        {\includegraphics[width=0.8\\textwidth]{%s}}
\end{figure}
"""%((conf_dict['QCplots']['gb_cover'].split("/")[-1]))

    QCdoc += """

\\newpage
\\newpage
\section{Individual-cell level QC}
In this step we focused on the quality of individual cell and distinguishing cell barcodes from STAMPs (single-cell transcriptomes attached to microparticles)
\\newpage
\\newpage
\subsection{Reads duplicate rate distribution}
\\begin{quotation}
Drop-seq technology has an innate advantage of detecting duplicate reads and amplification bias due to the barcode and UMI information. This module diGetlays the distribution of duplicate rate in each cell barcode and helps to discard barcodes with low duplicate rate (which usually caused by empty cell barcodes and ambient RNA). We plot the distribution of duplicate rate in each cell barcode (though most of cell barcodes don't contain cells, they still have RNA) and observed a bimodal distribution of duplicate rate. We set an option for you to discard cell barcodes with low duplicate rate in following steps. The vertical line represented the cutoff (duplicate rate $>=$ 0.1) of discarding cell barcodes with low duplicate rate. You can adjust the cutoff and rerun Dr.seq if current cutoff didn't separate two peaks from the distribution clearly (usually happened with insufficient sequencing depth). If the distribution didn't show clear bimodal or you don't want to discard cell barcodes according to duplicate rate, you can set cutoff to 0 to keep all cell barcodes for following steps. 
\end{quotation}
\\begin{figure}[h]
        \caption{Reads dupliate rate distribution} \label{fig:profileunion}
        \setlength{\\abovecaptionskip}{0pt}
        \setlength{\\belowcaptionskip}{10pt}
        \centering
        {\includegraphics[width=0.8\\textwidth]{%s}}
\end{figure}
"""%(conf_dict['QCplots']['duprate'].split("/")[-1])
    if int(conf_dict['Step3_QC']['select_cell_measure']) == 1:
        QCdoc += """
\\newpage
\\newpage
\subsection{Reads duplicate rate vs. cumulative covered gene number}
\\begin{quotation}
Reads duplicate rate versus cumulative covered gene numbers. This module measures whether each of your individual cell was sequenced and clearly separated from empty cell barcodes. Cell barcodes are ranked by the number of covered genes. The duplicate rate (y-axis, left side) is plotted as a function of ranked cell barcode. Red curve represents the number of genes covered by top N cell barcodes (y-axis, right side). N is diGetlayed by x-axis. Theoretically you observe a ``knee" on your cumulative curve (slope $=$ 1 on the curve) and the cutoff of your selected STAMPs (dash line) should be close to the ``knee". The cutoff can also be far away from the ``knee" in some cases because you input too many cells and have insufficient average sequencing depth, then you should adjust your cutoff (to the position you get enough STAMPs and sufficient reads count) and rerun Dr.seq. See the description of the paramter ``select cell measure" in the Manual.
\end{quotation}
\\begin{figure}[h]
        \caption{Reads duplicate rate vs. cumulative covered gene number} \label{fig:profileunion}
        \setlength{\\abovecaptionskip}{0pt}
        \setlength{\\belowcaptionskip}{10pt}
        \centering
        {\includegraphics[width=0.8\\textwidth]{%s}}
\end{figure}

\\newpage
\\newpage
\subsection{UMI vs. covered gene number}
\\begin{quotation}
Covered gene number is plotted as a function of the number of UMI (i.e. unique read). This module measures the quality of Drop-seq experiment and helps to distinguish STAMPs from empty cell barcodes. We observe a clearly different pattern for two groups of cell barcodes with different reads duplicate rate (blue dots versus red and purple dots). Purple dots represented the selected STAMPs for the cell-clustering step. By default we select STAMPs with 1000 gene covered after discarding low duplicate cell barcodes. You may get few STAMPs according to this cutoff if the average sequencing depth of your cells was too low or too many cells were inputed. In this case you can adjust your cutoff or tell Dr.seq to directly select cell barcodes with highest reads count (see the description of the parameter ``select cell measure"). Note that we use only STAMPs selected in this step for following analysis. The other cell barcodes are discarded. 
\end{quotation}
\\begin{figure}[h]
        \caption{UMI v.s. covered gene number} \label{fig:profileunion}
        \setlength{\\abovecaptionskip}{0pt}
        \setlength{\\belowcaptionskip}{10pt}
        \centering
        {\includegraphics[width=0.8\\textwidth]{%s}}
\end{figure}
"""%(conf_dict['QCplots']['cumumiduprate'].split("/")[-1],
     conf_dict['QCplots']['umicovergn'].split("/")[-1])
    else:
        QCdoc += """
\\newpage
\\newpage
\subsection{Reads duplicate rate vs. cumulative covered gene number}
\\begin{quotation}
Reads duplicate rate versus cumulative covered gene numbers. This module measures whether each of your individual cell was sequenced and clearly separated from empty cell barcodes. Cell barcodes are ranked by the number of UMI count. The duplicate rate (y-axis, left side) is plotted as a function of ranked cell barcode. Red curve represents the number of genes covered by top N cell barcodes (y-axis, right side). N is diGetlayed by x-axis. Theoretically you observe a ``knee" on your cumulative curve (slope $=$ 1 on the curve) and the cutoff of your selected STAMPs (dash line) should be close to the ``knee". The cutoff can also be far away from the ``knee" in some cases because you input too many cells and have insufficient average sequencing depth, then you should adjust your cutoff (to the position you get enough STAMPs and sufficient reads count) and rerun Dr.seq. See the description of the paramter ``select cell measure" in the Manual.
\end{quotation}
\\begin{figure}[h]
        \caption{Reads duplicate rate vs. cumulative covered gene number} \label{fig:profileunion}
        \setlength{\\abovecaptionskip}{0pt}
        \setlength{\\belowcaptionskip}{10pt}
        \centering
        {\includegraphics[width=0.8\\textwidth]{%s}}
\end{figure}

\\newpage
\\newpage
\subsection{UMI vs. covered gene number}
\\begin{quotation}
Covered gene number is plotted as a function of the number of UMI (i.e. unique read). This module measures the quality of Drop-seq experiment and helps to distinguish STAMPs from empty cell barcodes. We observe a clearly different pattern for two groups of cell barcodes with different reads duplicate rate (blue dots versus red and purple dots). Purple dots represented the selected STAMPs for the cell-clustering step. We select 1000 STAMPs with highest UMI count after discarding low duplicate cell barcodes. You may get few STAMPs according to this cutoff if the average sequencing depth of your cells was too low or too many cells were inputed. Note that we use only STAMPs selected in this step for following analysis. The other cell barcodes are discarded. 
\end{quotation}
\\begin{figure}[h]
        \caption{UMI v.s. covered gene number} \label{fig:profileunion}
        \setlength{\\abovecaptionskip}{0pt}
        \setlength{\\belowcaptionskip}{10pt}
        \centering
        {\includegraphics[width=0.8\\textwidth]{%s}}
\end{figure}
"""%(conf_dict['QCplots']['cumumiduprate'].split("/")[-1],
     conf_dict['QCplots']['umicovergn'].split("/")[-1])
     
    QCdoc += """
\\newpage
\\newpage
\subsection{Covered gene number distribution}
\\begin{quotation}
Histogram of covered gene number of selected STAMPs. The module measures whether the selected STAMPs have sufficient reads coverage. By default Dr.seq selects cell barcodes with $>=$ 1000 genes covered as STAMPs. If you choose to select STAMPs with highest reads count (``select cell measure" $=$ 2), then you should check this figure to make sure the STAMPs you select have enough gene covered. If most of your STAMPs have low covered gene number (e.g. $<$ 100 gene covered), you can make your cutoff more stringent (e.g. select less cell barcodes with higher reads count) to make sure you get reliable STAMPs.
\end{quotation}
\\begin{figure}[h]
        \caption{Covered gene number} \label{fig:profileunion}
        \setlength{\\abovecaptionskip}{0pt}
        \setlength{\\belowcaptionskip}{10pt}
        \centering
        {\includegraphics[width=0.8\\textwidth]{%s}}
\end{figure}

\\newpage
\\newpage
\subsection{Intron rate distribution}
\\begin{quotation}
Intron rate is a effective method to measure the quality of a RNA-seq sample. We plot a histogram of intron rate of every STAMP barcodes to check whether reads from each STAMPs enriched in the exon region. High intron rate (e.g. $>=$ 30\\%%) indicates low quality of RNA in each STAMPs (caused by different problem, for example contaminant). You may consider your Drop-seq data low quality if most of selected STAMPs have high intron rate and low covered gene number (see ``Covered gene number distribution" section). Intron rate is defined as $\\frac{intron\\ reads\\ number}{intron + exon\\ reads\\ number}$ 
\end{quotation}
\\begin{figure}[h]
        \caption{Intron rate distribution} \label{fig:profileunion}
        \setlength{\\abovecaptionskip}{0pt}
        \setlength{\\belowcaptionskip}{10pt}
        \centering
        {\includegraphics[width=0.8\\textwidth]{%s}}
\end{figure}
 
"""%(conf_dict['QCplots']['covergn'].split("/")[-1],
     conf_dict['QCplots']['intronrate'].split("/")[-1])
    
    if int(conf_dict['Step4_Analysis']['clustering_method']) in [3,4]:
        pass
    else:
        if int(conf_dict['Step4_Analysis']['clustering_method']) in [1,2]:
            selectM = 'first stable gap'
        else:
            selectM = 'maxSE'
        QCdoc += """
\\newpage
\\newpage
\section{Cell-clustering level QC}
This step composed by k-means clustering based on dimentional reduction result and Gap statistics to determine best k.
\\newpage
\\newpage
\subsection{Gap statistics}
\\begin{quotation}
We conducted a k-means clustering based on dimensional reduction output to measure sample's ability to be separated to different cell subtypes. Gap statistics was performed to determine the best k in k-means clustering. In general, decreasing pattern (usually k $<=$ 2) is observed for pure cell type or cell line data, while increasing pattern with bigger k should be observed for mix cell types (or cell subtypes) data. If the cluster number predicted from the Gap statistics is largely different to what you expect, it indicated that your cells are not well characterized and separated by the Drop-seq experiment (due to the contaminant or the low capture efficiency of Droplets). In this case, you may consider your Drop-seq data poor quality. Alternatively, you may would like to use the parameter ``custom k" to Getecify the cluster number.
\end{quotation}
\\begin{figure}[h]
        \caption{Gap statistics} \label{fig:profileunion}
        \setlength{\\abovecaptionskip}{0pt}
        \setlength{\\belowcaptionskip}{10pt}
        \centering
        {\includegraphics[width=0.8\\textwidth]{%s}}
\end{figure}

"""%(conf_dict['QCplots']['gapstat'].split("/")[-1])
    
    QCdoc += """
\\newpage
\\newpage
\subsection{Clustering plot}
\\begin{quotation}
Scatter plot represented visualization of dimensional reduction output of selected STAMP barcodes. STAMP barcodes are colored according to the clustering result and cluster numbers are printed in the center of each cluster. This figure is mainly for visualization and help you to know how your Drop-seq data look like. If you want to combine some small groups which are close to each other, you can use the cluster matrix (named ``cluster.txt") in the Dr.seq standard analysis output to conduct your own analysis.   
\end{quotation}
\\begin{figure}[h]
        \caption{Clustering plot} \label{fig:profileunion}
        \setlength{\\abovecaptionskip}{0pt}
        \setlength{\\belowcaptionskip}{10pt}
        \centering
        {\includegraphics[width=0.8\\textwidth]{%s}}
\end{figure}
 
"""%(conf_dict['QCplots']['cluster'].split("/")[-1])
    if os.path.isfile(conf_dict['QCplots']['silhouette']):
        QCdoc += """
\\newpage
\\newpage
\subsection{Silhouette of clustering}
\\begin{quotation}
Silhouette method is used to interprate and validate the consistency within clusters defined in previous steps. A poor Silhouette (e.g. average si $<$ 0.2 ) score indicate that Drop-seq experiments(if not properly done) may not separate well the subpopulations of cells. If most of your clusters have poor Silhouette score, it may indicate a poor quality of your Drop-seq experiments. 
\end{quotation}
\\begin{figure}[h]
        \caption{Silhouette score for clustered STAMPs} \label{fig:profileunion}
        \setlength{\\abovecaptionskip}{0pt}
        \setlength{\\belowcaptionskip}{10pt}
        \centering
        {\includegraphics[width=0.8\\textwidth]{%s}}
\end{figure}
 
"""%(conf_dict['QCplots']['silhouette'].split("/")[-1])
    
#    QCdoc += """
#\\newpage
#\\newpage
#\subsection{STAMPs colored by total UMI count}
#\\begin{quotation}
#STAMPs was by the total number of UMI based on t-SNE visualization. 
#\end{quotation}
#\\begin{figure}[h]
#        \caption{STAMPs colored by total UMI count} \label{fig:profileunion}
#        \setlength{\\abovecaptionskip}{0pt}
#        \setlength{\\belowcaptionskip}{10pt}
#        \centering
#        {\includegraphics[width=0.8\\textwidth]{%s}}
#\end{figure}
 
#"""%(conf_dict['QCplots']['umicolor'].split("/")[-1])
   
#    QCdoc += """
#\\newpage
#\\newpage
#\subsection{STAMPs colored by intron rate}
#\\begin{quotation}
#STAMPs was by the intron rate based on t-SNE visualization. 
#\end{quotation}
#\\begin{figure}[h]
#        \caption{STAMPs colored by intron rate} \label{fig:profileunion}
#        \setlength{\\abovecaptionskip}{0pt}
#        \setlength{\\belowcaptionskip}{10pt}
#        \centering
#        {\includegraphics[width=0.8\\textwidth]{%s}}
#\end{figure}
# 
#"""%(conf_dict['QCplots']['itrcolor'].split("/")[-1])
      
    QCdoc += """
\\newpage
\\newpage
\section{Output list}
\\begin{quotation}
All output files were described in the following table
\end{quotation}
\\begin{table}[h]
\caption{output list}\label{bstable}
\\begin{tabularx}{\\textwidth}{ |X|l| }
    
\hline
description & filename \\\\
\hline
expression matrix for selected STAMPs & %s  \\\\
"""%(LatexFormat(conf_dict['results']['expmatcc'].split("/")[-1]))
    if int(conf_dict['Step4_Analysis']['pctable']) == 1:
        QCdoc += """
\hline
top2 components of PCA dimentional reduction result & %s \\\\         
"""%(LatexFormat(conf_dict['results']['pctable'].split("/")[-1]))
    if int(conf_dict['Step4_Analysis']['cortable']) == 1:
        QCdoc += """
\hline
pairwise correlation matrix & %s \\\\
"""%(LatexFormat(conf_dict['results']['cortable'].split("/")[-1]))
    QCdoc += """
\hline
All features of selected STAMPs & %s \\\\
\hline
summary QC report & %s \\\\
\hline

\end{tabularx}
\end{table} 
\end{document} 
"""%(LatexFormat(conf_dict['results']['features'].split("/")[-1]),LatexFormat(conf_dict['General']['outname'])+"\_summary.pdf")

    os.chdir(plot_folder)

    latexfile = conf_dict['General']['outname'] + '_summary.tex'
    outf = open(latexfile,'w')
    outf.write(QCdoc)
    outf.close()
    cmd = "pdflatex %s >/dev/null"%(latexfile)
    cmd2 = 'cp %s %s'%(conf_dict['General']['outname'] + '_summary.pdf',summarydir)
    if conf_dict['General']['latex'] == 1:
        LogCommand(cmd,logfile)
        LogCommand(cmd,logfile)
        LogCommand(cmd2,logfile)
        for files in os.listdir(plot_folder):
            if os.path.isfile(files) and files[-12:-4] == "_summary":
                if not files[-4:] in ['.tex','.pdf',',png','.txt']:
                    cmd = "rm %s"%(files)
                    LogCommand(cmd,logfile)
        Log('pdflatex was detected in default PATH, generate summary report %s'%('summary/'+conf_dict['General']['outname'] + '_summary.pdf'),logfile)
    else:
        Log('pdflatex was not detected in default PATH, generate summary report .tex file in summary/plots folder, you can move the whole summary/plots/ folder to the environment with pdflatex installed and run cmd in the plots/ folder: "pdflatex %s"'%(conf_dict['General']['outname'] + '_summary.tex'),logfile)
   
        
    if conf_dict['clean']:
        Log('--clean pararmeter was turned on, remove internal files with large size',logfile)
        LogCommand("rm %s "%(conf_dict['General']['outputdirectory'] + 'expmatrix/' + conf_dict['General']['outname']+'_on_symbol.bed'),logfile)
        LogCommand("rm %s "%(conf_dict['General']['outputdirectory'] + 'expmatrix/' + conf_dict['General']['outname']+'_on_cds.bed'),logfile)
        LogCommand("rm %s "%(conf_dict['General']['outputdirectory'] + 'expmatrix/' + conf_dict['General']['outname']+'_on_3utr.bed'),logfile)
        LogCommand("rm %s "%(conf_dict['General']['outputdirectory'] + 'expmatrix/' + conf_dict['General']['outname']+'_on_5utr.bed'),logfile)
        LogCommand("rm %s "%(conf_dict['General']['outputdirectory'] + 'expmatrix/' + conf_dict['General']['outname']+'_on_TTSdis.bed'),logfile)
        LogCommand("rm %s "%(conf_dict['General']['outputdirectory'] + 'expmatrix/' + conf_dict['General']['outname']+'_combined.bed'),logfile)
        LogCommand("rm %s "%(conf_dict['General']['outputdirectory'] + 'expmatrix/' + conf_dict['General']['outname']+'_barcode_reform.txt'),logfile)

    Log('Step5 summary DONE, check %s for final outputs'%(summarydir),logfile)


    return conf_dict