예제 #1
0
def step3_QC(conf_dict, logfile):
    '''
    Step 3: bulk-level and individual-cell QC.

    Sums the per-cell columns of the full QC matrix into
    conf_dict['Mapping_stat'], stores the `wc -l` line counts of the barcode
    file and the aligned bed file as total / mappable reads, writes the
    mapping summary table into <outputdirectory>/QC/, runs the bulk QC
    helpers (readsqc, bedtools intersect, GBcover, DrSeq_readsbulk_QC.r) and
    finally the individual-cell QC script (DrSeq_individual_QC.r).

    conf_dict : nested configuration dict. Reads the 'General',
                'Step2_ExpMat', 'Step3_QC' and 'rscript' entries, expects
                'results' to exist already, and adds 'Mapping_stat' and
                'QCplots'.
    logfile   : handed unchanged to Log / LogCommand / LogError.

    Side effect: the working directory is changed to the QC directory.
    Returns the updated conf_dict.
    '''
    Log('Step3: bulk and individual cell QC', logfile)

    ### preparing mapping state dict
    Log('calculate mapping state', logfile)
    # (stat name, 0-based column of the QC matrix holding that count)
    stat_columns = (('umi_gene', 2), ('cdsN', 3), ('utr3N', 4),
                    ('utr5N', 5), ('intronN', 6), ('intergenicN', 7))
    mapping_stat = {}
    for stat_name, _ in stat_columns:
        mapping_stat[stat_name] = 0
    conf_dict['Mapping_stat'] = mapping_stat

    ### calculate mapping state based on QC matrix
    # `with` guarantees the file is closed even if a line fails to parse
    with open(conf_dict['Step2_ExpMat']['qcmatfull']) as inf:
        for line in inf:
            if line.startswith('cellname'):
                # header line
                continue
            fields = line.split()
            for stat_name, column in stat_columns:
                mapping_stat[stat_name] += int(fields[column])

    general = conf_dict['General']
    # first token of the `wc -l` output is the line count
    mapping_stat['totalreads'] = int(
        Get('wc -l %s' % (general['barcode_reform']))[0].split()[0])
    mapping_stat['q30reads'] = int(
        Get('wc -l %s' % (general['bed']))[0].split()[0])

    ### create  QC dir and conduct QC
    Log(
        'generate reads QC measurement with own script, based on sample down reads',
        logfile)
    outname = general['outname']
    qcdir = general['outputdirectory'] + 'QC/'
    CreateDirectory(qcdir)
    os.chdir(qcdir)
    plot_prefix = qcdir + outname
    qcplots = {}
    conf_dict['QCplots'] = qcplots
    qcplots['map_summary'] = plot_prefix + '_map_summary.txt'
    # Written line by line so the trailing blank after the "mappble reads"
    # value (present in the original table) stays visible and intact.
    mapsummary_doc = ("genomic region(Category)\treads number\n"
                      "total reads\t%s\n"
                      "mappble reads\t%s \n"
                      "total UMI count\t%s\n"
                      "CDS exon UMI count\t%s\n"
                      "3'UTR UMI count\t%s\n"
                      "5'UTR UMI count\t%s\n"
                      "intron UMI count\t%s\n"
                      "intergenic UMI count\t%s\n") % (
                          mapping_stat['totalreads'],
                          mapping_stat['q30reads'],
                          mapping_stat['umi_gene'],
                          mapping_stat['cdsN'],
                          mapping_stat['utr3N'],
                          mapping_stat['utr5N'],
                          mapping_stat['intronN'],
                          mapping_stat['intergenicN'])
    with open(qcplots['map_summary'], 'w') as outf:
        outf.write(mapsummary_doc)

    ## reads quality
    t = time.time()
    readsqc(general['sampledownsam'], outname)
    Log(
        'generate bulk cell QC measurement with own script, based on sample down reads',
        logfile)

    # count sampled-down reads per annotated exon bin; written relative to
    # the QC directory entered above
    gbbin_bed = outname + '_sampledown_on_gbbin.bed'
    cmd = "bedtools intersect -a %s -b %s -c > %s" % (
        general['outputdirectory'] + 'annotation/' + outname +
        '_gene_anno_binexon.bed', general['sampledownbed'], gbbin_bed)
    LogCommand(cmd, logfile)
    GBcover(gbbin_bed, outname)
    cmd = "%s %s %s" % ('Rscript',
                        conf_dict['rscript'] + 'DrSeq_readsbulk_QC.r',
                        outname)
    LogCommand(cmd, logfile)

    # record the bulk QC figure paths
    qcplots['read_qul'] = plot_prefix + '_Figure1_quality_heatmap.pdf'
    qcplots['read_nvc'] = plot_prefix + '_Figure2_NVC.pdf'
    qcplots['read_gc'] = plot_prefix + '_Figure3_GC.pdf'
    qcplots['gb_cover'] = plot_prefix + '_Figure4_GBcover.pdf'
    bulkqctime = time.time() - t
    Log("time for bulkqc: %s" % (bulkqctime), logfile)

    ### individual cell QC
    Log('generate individual cell QC measurement', logfile)
    t = time.time()
    qcplots['duprate'] = plot_prefix + '_Figure5_duprate.pdf'
    qcplots['covergn'] = plot_prefix + '_Figure8_coverGN.pdf'
    qcplots['intronrate'] = plot_prefix + '_Figure9_intronrate.pdf'

    # the two dot plots are recorded as png only when png_for_dot equals 1
    if general['png_for_dot'] == 1:
        dot_ext = '.png'
    else:
        dot_ext = '.pdf'
    qcplots['umicovergn'] = plot_prefix + '_Figure7_umi_coverGN' + dot_ext
    qcplots['cumumiduprate'] = plot_prefix + '_Figure6_cumUMI_duprate' + dot_ext

    expmat_conf = conf_dict['Step2_ExpMat']
    expmat_conf['qcmatcc'] = plot_prefix + "_qcmat_clustercell.txt"
    expmat_conf['expmatcc'] = plot_prefix + "_expmat_clustercell.txt"
    conf_dict['results']['expmatcc'] = plot_prefix + "_expmat_clustercell.txt"

    qc_conf = conf_dict['Step3_QC']
    select_cell_measure = int(qc_conf['select_cell_measure'])
    if select_cell_measure == 1:
        use_cutoff = qc_conf['covergncluster']
    elif select_cell_measure == 2:
        use_cutoff = qc_conf['topumicellnumber']
    else:
        # Report the value that was actually checked (Step3_QC); the message
        # used to read it from the unrelated 'Step4_Analysis' section.
        # NOTE(review): assumes LogError aborts the run -- otherwise
        # use_cutoff is undefined below; confirm.
        LogError(
            'select_cell_measure value can only be 1 or 2, current value is %s'
            % (qc_conf['select_cell_measure']), logfile)

    cmd = "%s %s %s %s %s %s %s %s %s %s %s %s %s" % (
        'Rscript', conf_dict['rscript'] + 'DrSeq_individual_QC.r',
        expmat_conf['qcmat'], expmat_conf['expmat'], outname,
        qc_conf['select_cell_measure'], use_cutoff,
        qc_conf['remove_low_dup_cell'], qc_conf['non_dup_cutoff'],
        mapping_stat['umi_gene'], expmat_conf['qcmatcc'],
        expmat_conf['expmatcc'], general['png_for_dot'])
    LogCommand(cmd, logfile)
    individualqctime = time.time() - t
    Log("time for individualqc: %s" % (individualqctime), logfile)
    Log("Step3 bulk and individual cell QC DONE", logfile)
    return conf_dict
예제 #2
0
def step1_generate_matrix(conf_dict, logfile):
    '''
    Align the reads and build the expression / QC matrices (Step1 + Step2).

    Step1: unless the input is already in sam format, run the configured
    aligner (STAR, bowtie2 or HISAT2) inside <outputdirectory>/mapping/, then
    convert the sam file to bed and sample it down with the in-house script.
    Step2: generate the annotation files, intersect the aligned reads with
    them, attach the barcode information and write the full QC matrix, the
    filtered QC matrix and the expression matrix.

    Newly created paths are stored in conf_dict['General'] and
    conf_dict['Step2_ExpMat'], and conf_dict['results'] is reset to an empty
    dict. The working directory is changed along the way.
    Returns the updated conf_dict.
    '''
    Log("Step1: alignment", logfile)
    started = time.time()
    general = conf_dict['General']
    outname = general['outname']

    ### create mapping dir
    mapping_dir = general['outputdirectory'] + 'mapping/'
    CreateDirectory(mapping_dir)

    ### mapping is only needed when the reads are not already in sam format
    if general['format'] != 'sam':
        Log(
            'Now start mapping in %s , all mapping result will be here' %
            (mapping_dir), logfile)
        os.chdir(mapping_dir)
        mapping_conf = conf_dict['Step1_Mapping']
        aligner = mapping_conf['mapping_software_main']
        # aligner name -> (log message, PATH probe, message when not found)
        aligner_setup = {
            "STAR": (
                'user choose STAR as alignment software',
                'which STAR',
                'STAR is not detected in default PATH, make sure you installed STAR and export it into default PATH',
            ),
            "bowtie2": (
                'user choose bowtie2 as alignment software',
                'which bowtie2',
                'bowtie2 is not detected in default PATH, make sure you installed bowtie2 and export it into default PATH',
            ),
            "HISAT2": (
                'user choose HISAT2 as alignment software',
                'which hisat2',
                'hisat2 is not detected in default PATH, make sure you installed hisat2 and export it into default PATH',
            ),
        }
        if aligner in aligner_setup:
            chosen_msg, probe_cmd, missing_msg = aligner_setup[aligner]
            Log(chosen_msg, logfile)
            if Get(probe_cmd)[0].strip() == "":
                LogError(missing_msg, logfile)

            if aligner == "STAR":
                # STAR writes Aligned.out.sam; rename it to <outname>.sam
                commands = [
                    'STAR --genomeDir %s --readFilesIn %s --runThreadN %s' %
                    (mapping_conf['mapindex'], general['reads_file'],
                     mapping_conf['mapping_p']),
                    'mv Aligned.out.sam %s.sam' % (outname),
                ]
            else:
                if aligner == "bowtie2":
                    template = 'bowtie2 -p %s -x %s -U %s -S %s.sam   2>&1 >>/dev/null |tee -a %s.bowtieout'
                else:
                    template = 'hisat2 -p %s -x %s -U %s -S %s.sam   2>&1 >>/dev/null |tee -a %s.hisat2out'
                commands = [
                    template %
                    (mapping_conf['mapping_p'], mapping_conf['mapindex'],
                     general['reads_file'], outname, outname)
                ]
            for command in commands:
                LogCommand(command, logfile)
        else:
            LogError("alignment tools can only be HISAT2, STAR or bowtie2",
                     logfile)

        general['sam'] = mapping_dir + outname + '.sam'
    else:
        Log('reads file format is sam, skip mapping step', logfile)
        general['sam'] = general['reads_file']

    ### sam -> bed conversion, with optional q30 filtering
    Log("transfer sam file to aligned bed file with own script", logfile)
    mapping_prefix = mapping_dir + outname
    general['bed'] = mapping_prefix + '.bed'
    general['sampledownsam'] = mapping_prefix + '_sampledown.sam'
    general['sampledownbed'] = mapping_prefix + '_sampledown.bed'
    q30filter = int(conf_dict['Step1_Mapping']['q30filter'])
    if q30filter == 1:
        q30_message = "q30 filter is turned on"
    else:
        q30_message = "q30 filter is turned off"
    Log(q30_message, logfile)

    ### use own script to transform sam to bed, and random sampling 5M mappable reads
    SampleDownTransformSam(general['sam'], general['bed'],
                           general['sampledownsam'],
                           general['sampledownbed'], 5000000, q30filter)

    # a missing or empty bed file means the conversion produced nothing
    bed_is_usable = (os.path.isfile(general['bed'])
                     and os.path.getsize(general['bed']) != 0)
    if not bed_is_usable:
        LogError(
            'Alignment step / q30 filtering step failed, check your alignment parameter and samfile',
            logfile)
    s1time = time.time() - started
    Log("time for alignment: %s" % (s1time), logfile)
    Log("Step1: alignment DONE", logfile)

    ### create annotation dir and generate related annotation file
    started = time.time()
    Log("Step2: transform expression matrix", logfile)
    Log('generate related annotation file with own script', logfile)
    annotation_dir = general['outputdirectory'] + 'annotation/'
    CreateDirectory(annotation_dir)
    os.chdir(annotation_dir)
    expmat_conf = conf_dict['Step2_ExpMat']
    GeneAnnotation(general['gene_annotation'], expmat_conf['ttsdistance'],
                   outname)

    ### create expression matrix dir and generate matrix
    Log(
        'generate expression matrix and individual cell qc matrix with own script',
        logfile)
    expdir = general['outputdirectory'] + 'expmatrix/'
    CreateDirectory(expdir)
    os.chdir(expdir)

    ### intersect the aligned reads with every annotation track and sort
    ### each result by read name (column 4)
    Log('add gene annotation on aligned bed file', logfile)
    overlap_template = "bedtools intersect -a %s -b %s  -wo | sort -k 4,4 --parallel=6 -T . -S 8%% - > %s"
    count_template = "bedtools intersect -a %s -b %s -c | sort -k 4,4 --parallel=6 -T . -S 8%% - > %s"
    # (command template, annotation file suffix, output file suffix)
    intersect_jobs = (
        (overlap_template, '_gene_anno_symbol.bed', '_on_symbol.bed'),
        (count_template, '_gene_anno_cds.bed', '_on_cds.bed'),
        (count_template, '_gene_anno_3utr.bed', '_on_3utr.bed'),
        (count_template, '_gene_anno_5utr.bed', '_on_5utr.bed'),
        (count_template, '_gene_anno_TTSdis.bed', '_on_TTSdis.bed'),
    )
    # build every command first, then run them in order
    intersect_commands = [
        template % (general['bed'], annotation_dir + outname + anno_suffix,
                    outname + out_suffix)
        for template, anno_suffix, out_suffix in intersect_jobs
    ]
    for command in intersect_commands:
        LogCommand(command, logfile)

    ### transform barcode fastq to 3column txt file [name,cell_barcode,umi]
    if general['format1'] == 'txt':
        Log('barcode files is reformed txt format, skip reform step', logfile)
        general['barcode_reform'] = general['barcode_file']
    else:
        Log('reform barcode files with own script', logfile)
        general['barcode_reform'] = expdir + outname + '_barcode_reform.txt'
        ReformBarcodeFastq(general['barcode_file'],
                           general['barcode_reform'],
                           general['cell_barcode_range'],
                           general['umi_range'])

    ### sort the barcode table by read name; the sorted copy replaces it
    sorted_barcodes = expdir + outname + '_barcode_reform_sort.txt'
    LogCommand(
        'sort -k 1,1 --parallel=6 -T . -S 8%% %s > %s' %
        (general['barcode_reform'], sorted_barcodes), logfile)
    general['barcode_reform'] = sorted_barcodes

    ### combine gene annotation, reads, barcode together
    Log('combine annotation and barcode on reads with own script', logfile)
    combined_bed = outname + '_combined.bed'
    combined_sorted_bed = outname + '_combined_sort.bed'
    CombineReads(general['barcode_reform'], outname + '_on_cds.bed',
                 outname + '_on_3utr.bed', outname + '_on_5utr.bed',
                 outname + '_on_symbol.bed', outname + '_on_TTSdis.bed',
                 combined_bed, expmat_conf['duplicate_measure'])

    ### sort combined file by umi+loci, for following duplicate detection
    LogCommand(
        "sort -k 7,7 -k 5,5 --parallel=6 -T . -S 8%% %s > %s" %
        (combined_bed, combined_sorted_bed), logfile)

    ### generate expression and QC matrix based on combined file
    Log('generate expression matrix and QC matrix with own script', logfile)
    ### qcmatfull contains all cell_barcodes, while qcmat,expmat only contain cell_barcodes >= covergncutoff(100, default)
    matrix_prefix = expdir + outname
    expmat_conf['qcmatfull'] = matrix_prefix + "_qcmatfull.txt"
    expmat_conf['qcmat'] = matrix_prefix + "_qcmat.txt"
    expmat_conf['expmat'] = matrix_prefix + "_expmat.txt"

    GenerateMatrix(general['gene_annotation'], combined_sorted_bed,
                   expmat_conf['filterttsdistance'],
                   expmat_conf['qcmatfull'], expmat_conf['qcmat'],
                   expmat_conf['expmat'], expmat_conf['covergncutoff'],
                   expmat_conf['umidis1'])

    Log("Step2 transform expression matrix DONE", logfile)
    s2time = time.time() - started
    Log("time for transform expmat: %s" % (s2time), logfile)
    # start with an empty results section
    conf_dict['results'] = {}

    return conf_dict
예제 #3
0
def Step0IntegrateData(conf_dict, logfile):
    '''
    Step 0: integrate data -- check and complement the parameters.

    Validates the output name, the two fastq inputs and the barcode file
    (paths not starting with "/" are prefixed with General/startdir), the
    gene annotation file, the bowtie2 index and the numeric mapping options,
    then checks whether Rscript and pdflatex can be run.

    conf_dict : nested configuration dict. Its 'General' section is updated
                in place ('fastq_1', 'fastq_2', 'barcode_file', 'format',
                'latex'); the 'Step1_Mapping' section is only read.
    logfile   : handed unchanged to Log / LogError.

    Returns the updated conf_dict.
    '''
    Log("Start ATAC", logfile)
    Log("Step0: Data integrate", logfile)
    general = conf_dict['General']

    ### check output name
    if "/" in general['outname']:
        # The message was previously written as two string literals around a
        # bare `/`, i.e. a str / str division that raised TypeError instead
        # of reporting the problem.
        LogError(
            'outname is the name of all your output result, cannot contain "/", current outname is  %s'
            % (general['outname']), logfile)

    ### check data path , format ,
    # (config key, label used in the "~" error message)
    input_files = (('fastq_1', 'fastq_1 file'),
                   ('fastq_2', 'fastq_2 file'),
                   ('barcode_file', 'barcode file'))
    # Each check runs over all three inputs before the next check starts,
    # so errors are reported in the same order as before.
    for key, label in input_files:
        if "~" in general[key]:
            # reads the same key that was tested (the barcode message used to
            # look up the non-matching key 'barcode')
            LogError(
                'require absolute path for %s, cannot contain "~", current %s is %s'
                % (label, label, general[key]), logfile)
    for key, _ in input_files:
        if not general[key].startswith('/'):
            general[key] = general['startdir'] + general[key]
    for key, _ in input_files:
        if not os.path.isfile(general[key]):
            LogError("%s file %s not found" % (key, general[key]), logfile)

    if not (general['fastq_1'].endswith('.fastq')
            and general['fastq_2'].endswith('.fastq')):
        LogError("input files should be fastq files.", logfile)
    else:
        Log('Detected input file format is fastq', logfile)
        general['format'] = 'fastq'

    ### check gene annotation file
    if general['gene_annotation'] == "":
        LogError("gene annotation file cannot be empty", logfile)
    if "/" not in general['gene_annotation']:
        LogError("absolute path for gene annotation file required", logfile)
    if not os.path.isfile(general['gene_annotation']):
        LogError(
            "cannot find gene annotation file : %s" %
            (general['gene_annotation']), logfile)

    ### mapping index
    mapping_conf = conf_dict['Step1_Mapping']
    if general['format'] == 'fastq':
        if mapping_conf['mapping_software'] == "bowtie2":
            Log('use bowtie2 as alignment tools', logfile)
            # the index is considered present when its .1.bt2 file exists
            indexfile1 = mapping_conf['mapindex'] + '.1.bt2'
            if not os.path.isfile(indexfile1):
                LogError("cannot find bowtie2 index file : %s " % (indexfile1),
                         logfile)
        else:
            LogError("alignment tools can only be bowtie2 by now", logfile)

    ### check options
    Log('option setting: ', logfile)
    # Only the int() conversion is guarded, so unrelated failures (for
    # example inside Log) are no longer swallowed by a bare except.
    try:
        mapping_threads = int(mapping_conf['p'])
    except (ValueError, TypeError):
        LogError(
            'p should be int, current value is %s' % (mapping_conf['p']),
            logfile)
    else:
        Log('mapping thread is %s' % (mapping_threads), logfile)

    if int(mapping_conf['q30filter']) not in [0, 1]:
        LogError(
            'q30filter measurement can only be 0/1, current value is %s' %
            (mapping_conf['q30filter']), logfile)

    if int(mapping_conf['filter_reads_length']) not in [0, 1]:
        LogError(
            'filter_reads_length measurement can only be 0/1, current value is %s'
            % (mapping_conf['filter_reads_length']), logfile)

    ### check Rscript
    # run the probe once and inspect its second returned element for either
    # marker
    rscript_probe = GetError('Rscript')[1]
    if 'Usage' not in rscript_probe and 'version' not in rscript_probe:
        LogError('require Rscript', logfile)

    ### check pdflatex
    if Get('pdflatex --help')[0] == "":
        Log(
            'pdflatex was not installed, ATAC is still processing but no summary QC report generated',
            logfile)
        general['latex'] = 0
    else:
        general['latex'] = 1

    Log('Step0 Data integrate DONE', logfile)

    return conf_dict