def step3_QC(conf_dict, logfile):
    '''
    Step3: mapping statistics, bulk QC and individual cell QC.

    Work done, in order:
      1. Sum the per-cell counters of the full QC matrix
         (conf_dict['Step2_ExpMat']['qcmatfull']) into conf_dict['Mapping_stat']
         and add the line counts of the barcode file and the aligned bed file.
      2. Create <outputdirectory>QC/, chdir into it and write the
         <outname>_map_summary.txt table.
      3. Run the bulk QC (readsqc, bedtools intersect on the bin-exon
         annotation, GBcover, DrSeq_readsbulk_QC.r).
      4. Run the individual cell QC (DrSeq_individual_QC.r).

    Parameters:
      conf_dict : nested configuration dict; the sections 'General',
                  'Step2_ExpMat', 'Step3_QC', 'results' and the key 'rscript'
                  are read, 'Mapping_stat' and 'QCplots' are (re)created.
      logfile   : passed through unchanged to Log / LogCommand / LogError.

    Returns:
      conf_dict, updated in place with the statistics and output file paths.

    Side effect: the working directory of the process is changed to the QC dir.
    '''
    Log('Step3: bulk and individual cell QC', logfile)

    ### preparing mapping state dict
    Log('calculate mapping state', logfile)
    # (counter name, column index in the QC matrix)
    stat_columns = (
        ('umi_gene', 2),
        ('cdsN', 3),
        ('utr3N', 4),
        ('utr5N', 5),
        ('intronN', 6),
        ('intergenicN', 7),
    )
    mapping_stat = {}
    for key, _col in stat_columns:
        mapping_stat[key] = 0
    conf_dict['Mapping_stat'] = mapping_stat

    ### calculate mapping state based on QC matrix
    # 'with' guarantees the handle is released even if a row fails to parse
    with open(conf_dict['Step2_ExpMat']['qcmatfull']) as inf:
        for line in inf:
            if line.startswith('cellname'):
                # header row
                continue
            fields = line.split()
            if not fields:
                # tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            for key, col in stat_columns:
                mapping_stat[key] += int(fields[col])

    # first token of `wc -l <file>` is the line count
    mapping_stat['totalreads'] = int(
        Get('wc -l %s' % (conf_dict['General']['barcode_reform']))[0].split()[0])
    mapping_stat['q30reads'] = int(
        Get('wc -l %s' % (conf_dict['General']['bed']))[0].split()[0])

    ### create QC dir and conduct QC
    Log('generate reads QC measurement with own script, based on sample down reads',
        logfile)
    qcdir = conf_dict['General']['outputdirectory'] + 'QC/'
    CreateDirectory(qcdir)
    os.chdir(qcdir)
    outname = conf_dict['General']['outname']
    plot_prefix = qcdir + outname

    conf_dict['QCplots'] = {}
    conf_dict['QCplots']['map_summary'] = plot_prefix + '_map_summary.txt'
    # NOTE(review): one tab separated row per line -- confirm against whatever
    # consumes this summary file.
    mapsummary_doc = """genomic region(Category)\treads number
total reads\t%s
mappble reads\t%s
total UMI count\t%s
CDS exon UMI count\t%s
3'UTR UMI count\t%s
5'UTR UMI count\t%s
intron UMI count\t%s
intergenic UMI count\t%s
""" % (str(mapping_stat['totalreads']),
       str(mapping_stat['q30reads']),
       str(mapping_stat['umi_gene']),
       str(mapping_stat['cdsN']),
       str(mapping_stat['utr3N']),
       str(mapping_stat['utr5N']),
       str(mapping_stat['intronN']),
       str(mapping_stat['intergenicN']))
    with open(conf_dict['QCplots']['map_summary'], 'w') as outf:
        outf.write(mapsummary_doc)

    ## reads quality
    t = time.time()
    readsqc(conf_dict['General']['sampledownsam'], outname)

    Log('generate bulk cell QC measurement with own script, based on sample down reads',
        logfile)
    # relative output name: the file lands in qcdir because of the chdir above
    gbbin_bed = outname + '_sampledown_on_gbbin.bed'
    cmd = "bedtools intersect -a %s -b %s -c > %s" % (
        conf_dict['General']['outputdirectory'] + 'annotation/' + outname +
        '_gene_anno_binexon.bed',
        conf_dict['General']['sampledownbed'],
        gbbin_bed)
    LogCommand(cmd, logfile)
    GBcover(gbbin_bed, outname)
    cmd = "%s %s %s" % ('Rscript',
                        conf_dict['rscript'] + 'DrSeq_readsbulk_QC.r',
                        outname)
    LogCommand(cmd, logfile)

    # record the expected bulk QC figure paths
    conf_dict['QCplots']['read_qul'] = plot_prefix + '_Figure1_quality_heatmap.pdf'
    conf_dict['QCplots']['read_nvc'] = plot_prefix + '_Figure2_NVC.pdf'
    conf_dict['QCplots']['read_gc'] = plot_prefix + '_Figure3_GC.pdf'
    conf_dict['QCplots']['gb_cover'] = plot_prefix + '_Figure4_GBcover.pdf'
    bulkqctime = time.time() - t
    Log("time for bulkqc: %s" % (bulkqctime), logfile)

    ### individual cell QC
    Log('generate individual cell QC measurement', logfile)
    t = time.time()
    conf_dict['QCplots']['duprate'] = plot_prefix + '_Figure5_duprate.pdf'
    conf_dict['QCplots']['covergn'] = plot_prefix + '_Figure8_coverGN.pdf'
    conf_dict['QCplots']['intronrate'] = plot_prefix + '_Figure9_intronrate.pdf'
    # the two dot plots are recorded as png when png_for_dot == 1, else as pdf
    if conf_dict['General']['png_for_dot'] == 1:
        conf_dict['QCplots']['umicovergn'] = plot_prefix + '_Figure7_umi_coverGN.png'
        conf_dict['QCplots']['cumumiduprate'] = plot_prefix + '_Figure6_cumUMI_duprate.png'
    else:
        conf_dict['QCplots']['umicovergn'] = plot_prefix + '_Figure7_umi_coverGN.pdf'
        conf_dict['QCplots']['cumumiduprate'] = plot_prefix + '_Figure6_cumUMI_duprate.pdf'

    conf_dict['Step2_ExpMat']['qcmatcc'] = plot_prefix + "_qcmat_clustercell.txt"
    conf_dict['Step2_ExpMat']['expmatcc'] = plot_prefix + "_expmat_clustercell.txt"
    conf_dict['results']['expmatcc'] = plot_prefix + "_expmat_clustercell.txt"

    # select_cell_measure picks which cutoff is handed to the R script
    select_cell_measure = int(conf_dict['Step3_QC']['select_cell_measure'])
    if select_cell_measure == 1:
        use_cutoff = conf_dict['Step3_QC']['covergncluster']
    elif select_cell_measure == 2:
        use_cutoff = conf_dict['Step3_QC']['topumicellnumber']
    else:
        # Report the value from the section it was actually read from; the
        # previous lookup used conf_dict['Step4_Analysis'].
        # NOTE(review): use_cutoff stays unset on this path, so this relies on
        # LogError aborting the run -- confirm.
        LogError(
            'select_cell_measure value can only be 1 or 2, current value is %s'
            % (conf_dict['Step3_QC']['select_cell_measure']), logfile)

    cmd = "%s %s %s %s %s %s %s %s %s %s %s %s %s" % (
        'Rscript',
        conf_dict['rscript'] + 'DrSeq_individual_QC.r',
        conf_dict['Step2_ExpMat']['qcmat'],
        conf_dict['Step2_ExpMat']['expmat'],
        outname,
        conf_dict['Step3_QC']['select_cell_measure'],
        use_cutoff,
        conf_dict['Step3_QC']['remove_low_dup_cell'],
        conf_dict['Step3_QC']['non_dup_cutoff'],
        mapping_stat['umi_gene'],
        conf_dict['Step2_ExpMat']['qcmatcc'],
        conf_dict['Step2_ExpMat']['expmatcc'],
        conf_dict['General']['png_for_dot'])
    LogCommand(cmd, logfile)
    individualqctime = time.time() - t
    Log("time for individualqc: %s" % (individualqctime), logfile)
    Log("Step3 bulk and individual cell QC DONE", logfile)
    return conf_dict
def step1_generate_matrix(conf_dict, logfile):
    '''
    Step1 + Step2: alignment and expression matrix generation.

    Step1: unless the input is already sam, align the reads with the tool
           named in conf_dict['Step1_Mapping']['mapping_software_main']
           (STAR, bowtie2 or HISAT2), then convert the sam file to bed and
           write the down-sampled sam/bed files with SampleDownTransformSam.
    Step2: build the annotation files, intersect the aligned bed with them,
           reform and sort the barcode file, combine everything per read and
           generate the QC matrices and the expression matrix.

    conf_dict is updated in place with every generated file path (sections
    'General' and 'Step2_ExpMat'), conf_dict['results'] is reset to an empty
    dict, and conf_dict is returned.

    Side effect: the working directory is changed to the mapping, annotation
    and expmatrix directories in turn.
    '''
    Log("Step1: alignment", logfile)
    t = time.time()

    ### create mapping dir
    mapping_dir = conf_dict['General']['outputdirectory'] + 'mapping/'
    CreateDirectory(mapping_dir)

    ### run the aligner only when the reads are not already in sam format
    if conf_dict['General']['format'] != 'sam':
        Log('Now start mapping in %s , all mapping result will be here' %
            (mapping_dir), logfile)
        os.chdir(mapping_dir)
        aligner = conf_dict['Step1_Mapping']['mapping_software_main']
        if aligner == "STAR":
            Log('user choose STAR as alignment software', logfile)
            if Get('which STAR')[0].strip() == "":
                LogError(
                    'STAR is not detected in default PATH, make sure you installed STAR and export it into default PATH',
                    logfile)
            star_cmd = 'STAR --genomeDir %s --readFilesIn %s --runThreadN %s' % (
                conf_dict['Step1_Mapping']['mapindex'],
                conf_dict['General']['reads_file'],
                conf_dict['Step1_Mapping']['mapping_p'])
            # STAR writes Aligned.out.sam; rename it to <outname>.sam
            rename_cmd = 'mv Aligned.out.sam %s.sam' % (
                conf_dict['General']['outname'])
            LogCommand(star_cmd, logfile)
            LogCommand(rename_cmd, logfile)
        elif aligner == "bowtie2" or aligner == "HISAT2":
            # both tools are driven with the same command line layout; only
            # the executable name and the report file suffix differ
            if aligner == "bowtie2":
                executable, report_ext = 'bowtie2', 'bowtieout'
            else:
                executable, report_ext = 'hisat2', 'hisat2out'
            Log('user choose %s as alignment software' % (aligner), logfile)
            if Get('which %s' % (executable))[0].strip() == "":
                LogError(
                    '%s is not detected in default PATH, make sure you installed %s and export it into default PATH'
                    % (executable, executable), logfile)
            align_cmd = '%s -p %s -x %s -U %s -S %s.sam 2>&1 >>/dev/null |tee -a %s.%s' % (
                executable,
                conf_dict['Step1_Mapping']['mapping_p'],
                conf_dict['Step1_Mapping']['mapindex'],
                conf_dict['General']['reads_file'],
                conf_dict['General']['outname'],
                conf_dict['General']['outname'],
                report_ext)
            LogCommand(align_cmd, logfile)
        else:
            LogError("alignment tools can only be HISAT2, STAR or bowtie2",
                     logfile)
        conf_dict['General']['sam'] = (
            mapping_dir + conf_dict['General']['outname'] + '.sam')
    else:
        Log('reads file format is sam, skip mapping step', logfile)
        conf_dict['General']['sam'] = conf_dict['General']['reads_file']

    ### transform to bed file
    Log("transfer sam file to aligned bed file with own script", logfile)
    outname = conf_dict['General']['outname']
    mapped_prefix = mapping_dir + outname
    conf_dict['General']['bed'] = mapped_prefix + '.bed'
    conf_dict['General']['sampledownsam'] = mapped_prefix + '_sampledown.sam'
    conf_dict['General']['sampledownbed'] = mapped_prefix + '_sampledown.bed'

    q30_flag = int(conf_dict['Step1_Mapping']['q30filter'])
    Log("q30 filter is turned on" if q30_flag == 1 else "q30 filter is turned off",
        logfile)

    ### own script: sam -> bed plus the sampled-down sam/bed (5000000 is the
    ### sample size argument handed to the helper)
    SampleDownTransformSam(conf_dict['General']['sam'],
                           conf_dict['General']['bed'],
                           conf_dict['General']['sampledownsam'],
                           conf_dict['General']['sampledownbed'],
                           5000000,
                           q30_flag)

    # a missing or empty bed file means alignment / filtering produced nothing
    bed_path = conf_dict['General']['bed']
    bed_usable = os.path.isfile(bed_path) and os.path.getsize(bed_path) != 0
    if not bed_usable:
        LogError(
            'Alignment step / q30 filtering step failed, check your alignment parameter and samfile',
            logfile)
    s1time = time.time() - t
    Log("time for alignment: %s" % (s1time), logfile)
    Log("Step1: alignment DONE", logfile)

    ### create annotation dir and generate related annotation file
    t = time.time()
    Log("Step2: transform expression matrix", logfile)
    Log('generate related annotation file with own script', logfile)
    annotation_dir = conf_dict['General']['outputdirectory'] + 'annotation/'
    CreateDirectory(annotation_dir)
    os.chdir(annotation_dir)
    GeneAnnotation(conf_dict['General']['gene_annotation'],
                   conf_dict['Step2_ExpMat']['ttsdistance'],
                   outname)

    ### create expression matrix dir and generate matrix
    Log('generate expression matrix and individual cell qc matrix with own script',
        logfile)
    expdir = conf_dict['General']['outputdirectory'] + 'expmatrix/'
    CreateDirectory(expdir)
    os.chdir(expdir)

    ### intersect the aligned reads with each annotation track and sort the
    ### result by read name (column 4); the symbol track uses -wo, all others -c
    Log('add gene annotation on aligned bed file', logfile)
    intersect_template = (
        "bedtools intersect -a %s -b %s %s | sort -k 4,4 --parallel=6 -T . -S 8%% - > %s")
    tracks = (('symbol', '-wo'),
              ('cds', '-c'),
              ('3utr', '-c'),
              ('5utr', '-c'),
              ('TTSdis', '-c'))
    intersect_cmds = []
    for track, mode in tracks:
        intersect_cmds.append(intersect_template % (
            conf_dict['General']['bed'],
            annotation_dir + outname + '_gene_anno_' + track + '.bed',
            mode,
            outname + '_on_' + track + '.bed'))
    for intersect_cmd in intersect_cmds:
        LogCommand(intersect_cmd, logfile)

    ### transform barcode fastq to 3column txt file [name,cell_barcode,umi]
    if conf_dict['General']['format1'] != 'txt':
        Log('reform barcode files with own script', logfile)
        conf_dict['General']['barcode_reform'] = (
            expdir + outname + '_barcode_reform.txt')
        ReformBarcodeFastq(conf_dict['General']['barcode_file'],
                           conf_dict['General']['barcode_reform'],
                           conf_dict['General']['cell_barcode_range'],
                           conf_dict['General']['umi_range'])
    else:
        Log('barcode files is reformed txt format, skip reform step', logfile)
        conf_dict['General']['barcode_reform'] = conf_dict['General']['barcode_file']

    ### sort according name
    sorted_barcode = expdir + outname + '_barcode_reform_sort.txt'
    cmdsort = 'sort -k 1,1 --parallel=6 -T . -S 8%% %s > %s' % (
        conf_dict['General']['barcode_reform'], sorted_barcode)
    LogCommand(cmdsort, logfile)
    conf_dict['General']['barcode_reform'] = sorted_barcode

    ### combine gene annotation, reads, barcode together
    Log('combine annotation and barcode on reads with own script', logfile)
    combined_bed = outname + '_combined.bed'
    CombineReads(conf_dict['General']['barcode_reform'],
                 outname + '_on_cds.bed',
                 outname + '_on_3utr.bed',
                 outname + '_on_5utr.bed',
                 outname + '_on_symbol.bed',
                 outname + '_on_TTSdis.bed',
                 combined_bed,
                 conf_dict['Step2_ExpMat']['duplicate_measure'])

    ### sort combined file by umi+loci, for following duplicate detection
    combined_sorted_bed = outname + '_combined_sort.bed'
    cmd6 = "sort -k 7,7 -k 5,5 --parallel=6 -T . -S 8%% %s > %s" % (
        combined_bed, combined_sorted_bed)
    LogCommand(cmd6, logfile)

    ### generate expression and QC matrix based on combined file
    Log('generate expression matrix and QC matrix with own script', logfile)
    ### qcmatfull contains all cell_barcodes, while qcmat,expmat only contain
    ### cell_barcodes >= covergncutoff(100, default)
    matrix_prefix = expdir + outname
    conf_dict['Step2_ExpMat']['qcmatfull'] = matrix_prefix + "_qcmatfull.txt"
    conf_dict['Step2_ExpMat']['qcmat'] = matrix_prefix + "_qcmat.txt"
    conf_dict['Step2_ExpMat']['expmat'] = matrix_prefix + "_expmat.txt"

    GenerateMatrix(conf_dict['General']['gene_annotation'],
                   combined_sorted_bed,
                   conf_dict['Step2_ExpMat']['filterttsdistance'],
                   conf_dict['Step2_ExpMat']['qcmatfull'],
                   conf_dict['Step2_ExpMat']['qcmat'],
                   conf_dict['Step2_ExpMat']['expmat'],
                   conf_dict['Step2_ExpMat']['covergncutoff'],
                   conf_dict['Step2_ExpMat']['umidis1'])

    Log("Step2 transform expression matrix DONE", logfile)
    s2time = time.time() - t
    Log("time for transform expmat: %s" % (s2time), logfile)
    # fresh container for result paths filled in by later steps
    conf_dict['results'] = {}
    return conf_dict
def Step0IntegrateData(conf_dict, logfile):
    '''
    Step0: check the inputs and complete the configuration.

    Checks, in order: the output name, the three input files (fastq_1,
    fastq_2, barcode_file: no "~", made absolute with General/startdir,
    must exist), the fastq extension, the gene annotation file, the bowtie2
    index, the numeric options of Step1_Mapping, and the availability of
    Rscript and pdflatex.

    Parameters:
      conf_dict : nested configuration dict; sections 'General' and
                  'Step1_Mapping' are read. General/fastq_1, fastq_2 and
                  barcode_file may be rewritten to absolute paths, and
                  General/format and General/latex are set.
      logfile   : passed through unchanged to Log / LogError.

    Returns:
      conf_dict, updated in place.

    NOTE(review): every failed check is reported through LogError; the code
    below is written as if LogError aborts the run -- confirm.
    '''
    Log("Start ATAC", logfile)
    Log("Step0: Data integrate", logfile)
    general = conf_dict['General']
    mapping = conf_dict['Step1_Mapping']

    ### check output name
    if "/" in general['outname']:
        # single quoted so that the "/" stays inside the message (it used to
        # be parsed as a division of two strings, which raises TypeError)
        LogError(
            'outname is the name of all your output result, cannot contain "/", current outname is %s'
            % (general['outname']), logfile)

    ### check data path , format ,
    # (config key, name used in the "~" message)
    input_files = (('fastq_1', 'fastq_1'),
                   ('fastq_2', 'fastq_2'),
                   ('barcode_file', 'barcode'))
    for key, label in input_files:
        if "~" in general[key]:
            # the barcode message used to read the non-matching key 'barcode'
            LogError(
                'require absolute path for %s file, cannot contain "~", current %s file is %s'
                % (label, label, general[key]), logfile)
    for key, _label in input_files:
        # relative paths are interpreted relative to the start directory
        if not general[key].startswith('/'):
            general[key] = general['startdir'] + general[key]
    for key, _label in input_files:
        if not os.path.isfile(general[key]):
            LogError("%s file %s not found" % (key, general[key]), logfile)

    if not (general['fastq_1'].endswith('.fastq')
            and general['fastq_2'].endswith('.fastq')):
        LogError("input files should be fastq files.", logfile)
    else:
        Log('Detected input file format is fastq', logfile)
        general['format'] = 'fastq'

    ### check gene annotation file
    if general['gene_annotation'] == "":
        LogError("gene annotation file cannot be empty", logfile)
    if "/" not in general['gene_annotation']:
        LogError("absolute path for gene annotation file required", logfile)
    if not os.path.isfile(general['gene_annotation']):
        LogError(
            "cannot find gene annotation file : %s" % (general['gene_annotation']),
            logfile)

    ### mapping index
    if general['format'] == 'fastq':
        if mapping['mapping_software'] == "bowtie2":
            Log('use bowtie2 as alignment tools', logfile)
            # the presence of <mapindex>.1.bt2 is used as the index check
            indexfile1 = mapping['mapindex'] + '.1.bt2'
            if not os.path.isfile(indexfile1):
                LogError("cannot find bowtie2 index file : %s " % (indexfile1),
                         logfile)
        else:
            LogError("alignment tools can only be bowtie2 by now", logfile)

    ### check options
    Log('option setting: ', logfile)
    try:
        thread_number = int(mapping['p'])
    except (ValueError, TypeError):
        # only a non-integer value is an option error; anything else
        # should surface unchanged
        LogError('p should be int, current value is %s' % (mapping['p']),
                 logfile)
    else:
        Log('mapping thread is %s' % (str(thread_number)), logfile)

    if int(mapping['q30filter']) not in [0, 1]:
        LogError(
            'q30filter measurement can only be 0/1, current value is %s' %
            (mapping['q30filter']), logfile)
    if int(mapping['filter_reads_length']) not in [0, 1]:
        LogError(
            'filter_reads_length measurement can only be 0/1, current value is %s'
            % (mapping['filter_reads_length']), logfile)

    ### check Rscript
    # run the probe once and inspect element [1] of its result for either marker
    rscript_message = GetError('Rscript')[1]
    if 'Usage' not in rscript_message and 'version' not in rscript_message:
        LogError('require Rscript', logfile)

    ### check pdflatex
    # no output from `pdflatex --help` is taken as "not installed"; the run
    # continues, only the summary report is skipped
    if Get('pdflatex --help')[0] == "":
        Log(
            'pdflatex was not installed, ATAC is still processing but no summary QC report generated',
            logfile)
        general['latex'] = 0
    else:
        general['latex'] = 1

    Log('Step0 Data integrate DONE', logfile)
    return conf_dict