Example #1
def statistics_time(out_dir, sample, process, logger_statistics_process,
                    logger_statistics_errors):
    if not os.path.isfile(process):
        store_statistics_logs('null', logger_statistics_errors,
                              process + " does not exist!\n")
        # print(process + ' does not exist!')
    time_statistics = out_dir + '/' + sample + '_' + 'time_Cost'
    scriptdir = os.path.dirname(os.path.abspath(__file__))
    command = 'Rscript ' + scriptdir + '/statistics_time.R' + ' -p ' + process + ' -o ' + time_statistics
    os.system(command)
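A minimal usage sketch, assuming the statistics_time.R script sits next to this module and store_statistics_logs is importable; all paths below are hypothetical:

# Summarize per-step run times from the pipeline process log via statistics_time.R
statistics_time('/path/to/out/statistics', 'sampleA',
                '/path/to/out/sampleA/log/process.log',
                '/path/to/out/sampleA/log',   # process logger (a log dir path here)
                '/path/to/out/sampleA/log')   # error logger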
Example #2
def statistics_sam_bam(samtools_dir, sam_bam, out_dir, sample, module,
                       logger_statistics_process, logger_statistics_errors,
                       renew):
    # statistics of a SAM/BAM file with 'samtools stats'
    if not os.path.isfile(sam_bam):
        store_statistics_logs('null', logger_statistics_errors,
                              sam_bam + " does not exist!\n")
        # print(sam_bam + ' does not exist!')
    align_statistics = out_dir + '/' + sample + '_' + module + '_statistics.txt'
    if not os.path.isfile(align_statistics) or renew == 'T':
        command = samtools_dir + ' stats ' + sam_bam + ' | grep ^SN | cut -f 2-3  > ' + align_statistics
        os.system(command)
    return align_statistics
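A usage sketch, assuming samtools is installed and the helpers above are importable; paths are hypothetical:

# Write the 'samtools stats' SN summary for a sorted BAM; renew='T' would force a rerun
stats_txt = statistics_sam_bam('samtools', '/path/to/sampleA_Align_sorted.bam',
                               '/path/to/out/statistics', 'sampleA', 'Align',
                               '/path/to/log', '/path/to/log', 'F')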
Example #3
def merge_statistics_sam_bam(logger_statistics_process,
                             logger_statistics_errors, out_dir, sample, names,
                             *args):
    for arg in args:
        if not os.path.isfile(arg):
            store_statistics_logs('null', logger_statistics_errors,
                                  arg + " does not exist!\n")
            # print(arg + ' does not exist!')
    statisticsfiles = ','.join(args)
    mergestatistics = out_dir + '/' + sample + '_merge_sam_bam_statisticsfile.txt'
    scriptdir = os.path.dirname(os.path.abspath(__file__))
    command = 'Rscript ' + scriptdir + '/statistics_merge_sam_bam_statisticsfile.R' + \
              ' -p ' + statisticsfiles + ' -g ' + names + ' -o ' + mergestatistics
    os.system(command)
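A usage sketch, assuming the statistics_merge_sam_bam_statisticsfile.R script sits next to this module; paths are hypothetical:

# Merge the per-module 'samtools stats' tables into one file; names matches the file order
merge_statistics_sam_bam('/path/to/log', '/path/to/log', '/path/to/out/statistics',
                         'sampleA', ','.join(['Align', 'Filter']),
                         '/path/to/sampleA_Align_statistics.txt',
                         '/path/to/sampleA_Filter_statistics.txt')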
Example #4
def qc_raw_reads(fastQC_dir, out_dir, sample, module, read1, read2,
                 logger_statistics_process, logger_statistics_errors):
    qc_read1 = out_dir + '/' + os.path.basename(read1).split(
        ".fastq")[0] + '_fastqc.zip'
    qc_read2 = out_dir + '/' + os.path.basename(read2).split(
        ".fastq")[0] + '_fastqc.zip'
    if not (os.path.isfile(qc_read1) and os.path.isfile(qc_read2)):
        command1 = '{0} {1} {2} -o {3}'.format(fastQC_dir, read1, read2,
                                               out_dir)
        stdout, stderr = stdout_err(command1)
        store_statistics_logs(logger_statistics_process, 'null', stdout)
        store_statistics_logs('null', logger_statistics_errors, stderr)
        store_statistics_logs(logger_statistics_process, 'null',
                              'QC-{0} has been completed.\n'.format(module))
    else:
        store_statistics_logs(logger_statistics_process, 'null',
                              'QC-{0} exists.\n'.format(module))
    qc_statistics = out_dir + '/' + sample + '.' + module + '.statistics.txt'
    qc_result1 = getinfo(out_dir + '/' + os.path.basename(read1),
                         logger_statistics_process, logger_statistics_errors)
    qc_result2 = getinfo(out_dir + '/' + os.path.basename(read2),
                         logger_statistics_process, logger_statistics_errors)
    # lstrip/rstrip strip character *sets*, not prefixes/suffixes, so derive
    # the read label (e.g. 'R1_001') explicitly instead.
    def read_label(read):
        label = os.path.basename(read)
        if label.startswith(sample + '_'):
            label = label[len(sample) + 1:]
        if label.endswith('.fastq.gz'):
            label = label[:-len('.fastq.gz')]
        return label

    with open(qc_statistics, 'w') as fout:
        fout.write('\t'.join([
            'SampleID', 'Sequence direction', 'raw reads', 'min length',
            'max length', 'GC content', 'mean of per base quality',
            'low quality bases position', 'Q20', 'Q30', 'N bases position'
        ]) + '\n')
        fout.write('\t'.join([sample, read_label(read1)] +
                             qc_result1[0:9]) + '\n')
        fout.write('\t'.join([sample, read_label(read2)] +
                             qc_result2[0:9]) + '\n')
    return qc_result1, qc_result2
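A usage sketch, assuming FastQC is installed and getinfo/stdout_err are importable; paths are hypothetical:

# Run FastQC on a read pair and get the parsed metric lists back
r1_metrics, r2_metrics = qc_raw_reads('fastqc', '/path/to/out/QC', 'sampleA', 'QC',
                                      '/path/to/fastq/sampleA_R1_001.fastq.gz',
                                      '/path/to/fastq/sampleA_R2_001.fastq.gz',
                                      '/path/to/log', '/path/to/log')
print(r1_metrics[7])  # Q30 ratio, e.g. '93.500%'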
Example #5
def main_run_germline_variant_calling(path_sampleID_sub):
    time_start1 = time.time()
    fields = path_sampleID_sub[0].split('\t')
    source = fields[0]
    sample = fields[1]
    if len(fields) > 2:
        tailname = fields[2]
        sample = sample + '_' + tailname
    # parameters
    (output, fastqc_dir, primers_file, exome_target_bed, min_read_len,
     common_seq1, common_seq2, num_threads, edit_dist, min_mapq, max_soft_clip,
     max_dist, memory_size, snp_filter, indel_filter, ref_ens, bwa_dir,
     samtools_dir, umitools_dir, gatk_dir, ref_index_name, ref_fa_file,
     total_ref_fa_file, total_ref_fa_dict, known_sites, erc, db_cosmic,
     db_clinvar, db_g1000, test_level, exome_target, calling, tabix, bgzip,
     bcftools_dir, varsan2_dir, strelka2_dir, total_ref_chrom_fa_file,
     datasets_dir, smcounter, mtdepth, rpb, ncpu, minbq, minmq, hplen,
     mismatchthr, mtdrop, maxmt, primerdist, bedtandemrepeats,
     bedrepeatmaskersubset, bedtools_dir, renew) = path_sampleID_sub[1:55]
    # check the output
    out_dir = output + '/' + sample
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    # pipeline log file
    log_dir = out_dir + '/' + 'log'
    try:
        os.makedirs(log_dir)
    except OSError as e:
        if e.errno != 17:  # 17 == EEXIST: the log dir already exists
            raise
    logger_pipeline_process = log_dir
    logger_pipeline_errors = log_dir
    # time cost
    time_start = time.time()
    module = "QC"

    read1 = source + '/' + sample + '_R1_001.fastq.gz'
    read2 = source + '/' + sample + '_R2_001.fastq.gz'

    # 'tools' is assumed to be a module-level global listing the selected steps
    if 'qc' in tools or 'all' in tools:
        print("Test QC module!\n")
        # qc_dir
        qc_dir = out_dir + '/' + 'QC'
        if not os.path.exists(qc_dir):
            os.makedirs(qc_dir)

        logger_statistics_process = log_dir
        logger_statistics_errors = log_dir
        qc_result1, qc_result2 = qc_raw_reads(fastqc_dir, qc_dir, sample,
                                              module, read1, read2,
                                              logger_statistics_process,
                                              logger_statistics_errors)
        # check the quality of the raw reads
        if float(qc_result1[7].strip('%')) > 70 and float(
                qc_result2[7].strip('%')) > 70:
            print("The Q30 ratios of read1 and read2 are both higher than 70%.")
        else:
            exit("The Q30 ratio of read1 or read2 is not higher than 70%!")
        store_pipeline_logs(
            logger_pipeline_process, 'null',
            "--{0}--QC of reads is completed after {1} min.\n".format(
                sample, str('%.3f' % ((time.time() - time_start) / 60))))
        store_statistics_logs(
            logger_statistics_process, 'null',
            "--{0}--QC of reads is completed after {1} min.\n".format(
                sample, str('%.3f' % ((time.time() - time_start) / 60))))
        print("--" * 20 + '\n\n')

    ##########################################################################################
    # trim
    ##########################################################################################
    # time cost
    time_start = time.time()
    # undetermined_dir
    undetermined_dir = out_dir + '/' + 'undetermined'
    trimmed1 = undetermined_dir + '/' + sample + '_R1_undetermined.fastq'
    trimmed2 = undetermined_dir + '/' + sample + '_R2_undetermined.fastq'
    stats_file = undetermined_dir + '/' + sample + '_basic_stats.txt'
    # if tools in ['all', 'trim']:
    if 'trim' in tools or 'all' in tools:
        print("please check the QC subprocess result--the min read length!")
        print("The cutoff of the min read length is the default: {0}".format(
            min_read_len))
        print("Test trim module!\n\n\n")
        # mkdir undetermined_dir
        if not os.path.exists(undetermined_dir):
            os.makedirs(undetermined_dir)

        logger_trim_process = log_dir
        logger_trim_errors = log_dir

        trim_read_pairs(read1, read2, trimmed1, trimmed2, min_read_len,
                        common_seq1, common_seq2, stats_file,
                        logger_trim_process, logger_trim_errors)
        store_pipeline_logs(
            logger_pipeline_process, 'null',
            "--{0}--Trim of reads is completed after {1} min.\n".format(
                sample, str('%.3f' % ((time.time() - time_start) / 60))))
        store_trim_logs(
            logger_trim_process, 'null',
            "--{0}--Trimming of reads is completed after {1} min.\n".format(
                sample, str('%.3f' % ((time.time() - time_start) / 60))))
        print("--" * 20 + '\n\n')

    ##########################################################################################
    # align
    ##########################################################################################
    # time cost
    time_start = time.time()
    # aligned_dir
    aligned_dir = out_dir + '/' + 'aligned'
    trim_read1 = out_dir + '/' + 'undetermined' + '/' + sample + '_R1_undetermined.fastq'
    trim_read2 = out_dir + '/' + 'undetermined' + '/' + sample + '_R2_undetermined.fastq'

    out_file = aligned_dir + '/' + sample + '_aligned.sam'
    # if tools in ['all', 'align']:
    if 'align' in tools or 'all' in tools:
        print("please check the Trim subprocess result--undetermined.fastq!")
        print("Test align module!\n")
        if not os.path.exists(aligned_dir):
            os.makedirs(aligned_dir)
        logger_bwa_process = log_dir
        logger_bwa_errors = log_dir
        align_reads_bwa(bwa_dir, samtools_dir, ref_fa_file, ref_index_name,
                        exome_target_bed, total_ref_fa_file, trim_read1,
                        trim_read2, out_file, num_threads, logger_bwa_process,
                        logger_bwa_errors, renew)
        store_align_logs(
            logger_bwa_process, 'null',
            "--{0}--Alignment of reads is completed after {1} min.".format(
                sample, str('%.3f' % ((time.time() - time_start) / 60))))
        store_pipeline_logs(
            logger_pipeline_process, 'null',
            "--{0}--Align of reads is completed after {1} min.".format(
                sample, str('%.3f' % ((time.time() - time_start) / 60))))
        print("--" * 20 + '\n\n')

    # #########################################################################################
    # post_align
    # #########################################################################################
    # time cost
    time_start = time.time()
    # post_aligned_dir
    filtered_dir = out_dir + '/' + 'filtered'
    # out_file from the align
    alignment_sam = out_file
    out_file1 = filtered_dir + '/' + sample + '_tmp.sam'
    stats_file = filtered_dir + '/' + sample + '_align_stats.txt'
    primer_stats_file = filtered_dir + '/' + sample + '_primer_stats.csv'
    out_file2 = filtered_dir + '/' + sample + '_filtered.sam'

    # if tools in ['all', 'post_align']:
    if 'post_align' in tools or 'all' in tools:
        print("please check the Algin subprocess result--aligned.sam!")
        print("Test post align module!\n")
        if not os.path.exists(filtered_dir):
            os.makedirs(filtered_dir)
        logger_filter_process = log_dir
        logger_filter_errors = log_dir
        filter_alignment_samtools(samtools_dir, alignment_sam, min_mapq,
                                  max_soft_clip, out_file1, stats_file,
                                  logger_filter_process, logger_filter_errors)
        identify_gs_primers(samtools_dir, out_file1, primers_file, max_dist,
                            out_file2, stats_file, primer_stats_file,
                            logger_filter_process, logger_filter_errors)
        store_filter_logs(
            logger_filter_process, 'null',
            "--{0}--Post Alignment of reads is completed after {1} min.".
            format(sample, ('%.2f' % ((time.time() - time_start) / 60))))
        store_pipeline_logs(
            logger_pipeline_process, 'null',
            "--{0}--Post_align of reads is completed after {1} min.".format(
                sample, ('%.2f' % ((time.time() - time_start) / 60))))
        print("--" * 20 + '\n\n')

    ##########################################################################################
    # barcode clustering
    ##########################################################################################
    # time cost
    time_start = time.time()
    # clustering_dir
    clustered_dir = out_dir + '/' + 'clustered'
    filtered_sam = out_dir + '/' + 'filtered' + '/' + sample + '_filtered.sam'
    filtered_bam = clustered_dir + '/' + sample + '_filtered.bam'
    sorted_bam = clustered_dir + '/' + sample + '_filtered_sorted.bam'
    umitool_stats = clustered_dir + '/' + sample + '_deduplicated'
    #umitool_stats = clustered_dir + '/' + sample + '_group.tsv'
    umis_sam = clustered_dir + '/' + sample + '_umis.sam'
    #if tools in ['all', 'cluster']:
    if 'cluster' in tools or 'all' in tools:
        print("please check the post algin subprocess result--filtered.sam!")
        print("Test cluster module!\n")
        if not os.path.exists(clustered_dir):
            os.makedirs(clustered_dir)
        logger_umi_process = log_dir
        logger_umi_errors = log_dir
        umitool(samtools_dir, umitools_dir, filtered_sam, filtered_bam,
                sorted_bam, umitool_stats, umis_sam, edit_dist,
                logger_umi_process, logger_umi_errors)
        store_cluster_logs(
            logger_umi_process, 'null',
            "--{0}--UMIs tools clustering of reads is completed after {1} min."
            .format(sample, ('%.2f' % ((time.time() - time_start) / 60))))
        store_pipeline_logs(
            logger_pipeline_process, 'null',
            "--{0}--Cluster of reads is completed after {1} min.".format(
                sample, ('%.2f' % ((time.time() - time_start) / 60))))
        print("--" * 20 + '\n\n')

    ##########################################################################################
    # reformat
    ##########################################################################################
    # time cost
    time_start = time.time()
    # reformated_dir
    reformated_dir = out_dir + '/' + 'reformated'
    alignment_sam = out_dir + '/' + 'clustered' + '/' + sample + '_umis.sam'
    output_sam = reformated_dir + '/' + sample + '_vcready.sam'
    # if tools in ['all', 'reformat']:
    if 'reformat' in tools or 'all' in tools:
        print("please check the cluster subprocess result--umis.sam!")
        print("Test reformat module!\n")
        if not os.path.exists(reformated_dir):
            os.makedirs(reformated_dir)
        logger_reformat_process = log_dir
        logger_reformat_errors = log_dir
        reformat_sam(alignment_sam, output_sam, logger_reformat_process,
                     logger_reformat_errors)
        store_reformat_logs(
            logger_reformat_process, 'null',
            '--{0}--Reformatting of the alignment SAM file is completed after {1} min.'
            .format(sample, ('%.2f' % ((time.time() - time_start) / 60))))
        store_pipeline_logs(
            logger_pipeline_process, 'null',
            '--{0}--Reformat alignment SAM file is completed after {1} min.'.
            format(sample, ('%.2f' % ((time.time() - time_start) / 60))))
        print("--" * 20 + '\n\n')

    # #########################################################################################
    # Germline variant calling
    # #########################################################################################
    # time cost
    time_start = time.time()
    # germline_vc_dir
    germline_vc_dir = out_dir + '/' + 'germline_vc'
    # modify the known-sites
    known_sites = known_sites.replace(',',
                                      ' --known-sites ' + datasets_dir + '/')
    known_sites = datasets_dir + '/' + known_sites
    vready_sam = out_dir + '/' + 'reformated' + '/' + sample + '_vcready.sam'
    # marked_bqsr_bam = germline_vc_dir + '/' + sample + '_sorted.MarkDuplicates.BQSR.bam'
    if os.path.basename(exome_target_bed) != 'all':
        exon_interval = germline_vc_dir + '/' + 'target_interval.list'
    else:
        exon_interval = 'all'
    # if tools in ['all', 'variant_call']:
    if 'variant_call' in tools or 'all' in tools:
        print("please check the reformat subprocess result--vcready.sam!")
        print("Test variant_call module!\n")
        if not os.path.exists(germline_vc_dir):
            os.makedirs(germline_vc_dir)
        logger_germline_vc_process = log_dir
        logger_germline_vc_errors = log_dir

        bqsr = 'n'
        bam_to_variant, bqsr_bam_to_variant = sam_to_bam(
            gatk_dir, samtools_dir, vready_sam, sample, germline_vc_dir,
            memory_size, exome_target_bed, total_ref_fa_file,
            total_ref_fa_dict, known_sites, logger_germline_vc_process,
            logger_germline_vc_errors, bqsr, renew)
        callings = calling.split(',')
        # if calling == 'GATK':
        if 'GATK' in callings:
            germline_variant_calling(gatk_dir, bam_to_variant, sample,
                                     germline_vc_dir, memory_size,
                                     total_ref_fa_file, exon_interval, erc,
                                     snp_filter, indel_filter,
                                     logger_germline_vc_process,
                                     logger_germline_vc_errors)
        # elif calling == 'strelka2':
        if 'strelka2' in callings:
            strelka2_call(strelka2_dir, bgzip, tabix, total_ref_chrom_fa_file,
                          germline_vc_dir, sample, bam_to_variant,
                          exome_target_bed, logger_germline_vc_process,
                          logger_germline_vc_errors, renew)
        # elif calling == 'samtools':
        if 'samtools' in callings:
            samtools_call(samtools_dir, bcftools_dir, bam_to_variant, sample,
                          germline_vc_dir, total_ref_fa_file,
                          logger_germline_vc_process,
                          logger_germline_vc_errors)
        # elif calling == 'varscan2':
        if 'varscan2' in callings:
            varsan2_call(samtools_dir, varsan2_dir, total_ref_fa_file,
                         germline_vc_dir, sample, bam_to_variant,
                         logger_germline_vc_process, logger_germline_vc_errors)
        if 'smcounter' in callings:
            # rstrip strips a char set, not a suffix; swap the suffix explicitly
            suffix = '.MarkDuplicates.RG.bam'
            if bam_to_variant.endswith(suffix):
                bam_to_variant = bam_to_variant[:-len(suffix)] + '.bam'
            threshold = 0
            logfile = germline_vc_dir + '/smcountlog'
            smcounter_call(smcounter, germline_vc_dir + '/' + sample,
                           bam_to_variant, exome_target_bed, mtdepth, rpb,
                           ncpu, minbq, minmq, hplen, mismatchthr, mtdrop,
                           maxmt, primerdist, threshold, total_ref_fa_file,
                           bedtandemrepeats, bedrepeatmaskersubset,
                           bedtools_dir, logfile, logger_germline_vc_process,
                           logger_germline_vc_errors, renew)
        store_germline_vc_logs(
            logger_germline_vc_process, 'null',
            '--{0}--Germline variant calling is completed after {1} min.'.
            format(sample, ('%.2f' % ((time.time() - time_start) / 60))))
        store_pipeline_logs(
            logger_pipeline_process, 'null',
            '--{0}--variant_calling is completed after {1} min.'.format(
                sample, ('%.2f' % ((time.time() - time_start) / 60))))
        print("--" * 20 + '\n\n')

    # #########################################################################################
    # Annotation variant calling
    # #########################################################################################
    # time cost
    time_start = time.time()
    # Annotation dir
    annotation_dir = out_dir + '/' + 'annotation'
    raw_vcf = germline_vc_dir + '/' + sample + '.raw_variants.vcf'
    snp_vcf = germline_vc_dir + '/' + sample + '.raw_variants_SNP.vcf'
    indel_vcf = germline_vc_dir + '/' + sample + '.raw_variants_indel.vcf'
    # annotation
    # if tools in ['all', 'annotation']:
    if 'annotation' in tools or 'all' in tools:
        print("please check the variant_call subprocess result--VCF!")
        print("Test annotation module!\n")
        # Annotation dir
        if not os.path.exists(annotation_dir):
            os.makedirs(annotation_dir)
        logger_annotation_process = log_dir
        logger_annotation_errors = log_dir
        snp_limit, indel_limit = read_vcf_filter(snp_filter, indel_filter)
        callings = calling.split(',')
        for callingsub in callings:
            annotationmain(db_cosmic, db_clinvar, db_g1000, ref_ens, raw_vcf,
                           sample, snp_limit, indel_limit, annotation_dir,
                           logger_annotation_process, logger_annotation_errors,
                           callingsub)
        if 'GATK' in callings:
            callingsub = 'GATK'
            annotationmain(db_cosmic, db_clinvar, db_g1000, ref_ens, snp_vcf,
                           sample, snp_limit, indel_limit, annotation_dir,
                           logger_annotation_process, logger_annotation_errors,
                           callingsub)
            annotationmain(db_cosmic, db_clinvar, db_g1000, ref_ens, indel_vcf,
                           sample, snp_limit, indel_limit, annotation_dir,
                           logger_annotation_process, logger_annotation_errors,
                           callingsub)

        store_annotation_logs(
            logger_annotation_process, 'null',
            '--{0}--Annotation of variants is completed after {1} min.'.
            format(sample, ('%.2f' % ((time.time() - time_start) / 60))))
        store_pipeline_logs(
            logger_pipeline_process, 'null',
            '--{0}--Annotation of variants is completed after {1} min.'.format(
                sample, ('%.2f' % ((time.time() - time_start) / 60))))
        print("--" * 20 + '\n\n')

    ##########################################################################################
    # statistics of the variant calling pipeline
    ##########################################################################################
    # time cost
    time_start = time.time()
    # statistics dir
    statistics_dir = out_dir + '/' + 'statistics'
    if not os.path.exists(statistics_dir):
        os.makedirs(statistics_dir)
    # statistics the clean reads
    # statistics trim dir
    statistics_trim_dir = statistics_dir + '/' + 'trim_QC'
    if not os.path.exists(statistics_trim_dir):
        os.makedirs(statistics_trim_dir)
    # if tools in ['all', 'statis']:
    if 'statis' in tools or 'all' in tools:
        print("please check the others subprocess results!")
        print("Test statistics module!\n")
        #if tools == 'statis':
        logger_statistics_process = log_dir
        logger_statistics_errors = log_dir
        # statistics dir
        if not os.path.exists(statistics_dir):
            os.makedirs(statistics_dir)
        module = "Trim"
        trim_result1, trim_result2 = qc_raw_reads(fastqc_dir,
                                                  statistics_trim_dir, sample,
                                                  module, trimmed1, trimmed2,
                                                  logger_statistics_process,
                                                  logger_statistics_errors)
        # statistics the align
        module1 = "Align"
        align_sorted_bam = statistics_depth_coverage(
            samtools_dir, out_file, statistics_dir, sample, module1,
            exome_target, exome_target_bed, logger_statistics_process,
            logger_statistics_errors, renew)
        align_statistics = statistics_sam_bam(samtools_dir, align_sorted_bam,
                                              statistics_dir, sample, module1,
                                              logger_statistics_process,
                                              logger_statistics_errors, renew)
        # statistics the filter
        # the cluster module used to build the filtered, sorted BAM, but that
        # step has been replaced by UMI-tools
        module2 = "Filter"
        filtered_sorted_bam = statistics_depth_coverage(
            samtools_dir, filtered_sam, statistics_dir, sample, module2,
            exome_target, exome_target_bed, logger_statistics_process,
            logger_statistics_errors, renew)
        filter_statistics = statistics_sam_bam(samtools_dir,
                                               filtered_sorted_bam,
                                               statistics_dir, sample, module2,
                                               logger_statistics_process,
                                               logger_statistics_errors, renew)
        # statistics the umi-tools
        module3 = "Cluster_reformat"
        cr_sorted_bam = statistics_depth_coverage(
            samtools_dir, vready_sam, statistics_dir, sample, module3,
            exome_target, exome_target_bed, logger_statistics_process,
            logger_statistics_errors, renew)
        cr_statistics = statistics_sam_bam(samtools_dir, cr_sorted_bam,
                                           statistics_dir, sample, module3,
                                           logger_statistics_process,
                                           logger_statistics_errors, renew)
        # statistics of the bases MT depth
        statistics_mtdepth_coverage(germline_vc_dir, statistics_dir, sample,
                                    exome_target, logger_statistics_process,
                                    logger_statistics_errors)
        # merge the sorted bam
        merge_statistics_sam_bam(logger_statistics_process,
                                 logger_statistics_errors, statistics_dir,
                                 sample, ','.join([module1, module2,
                                                   module3]), align_statistics,
                                 filter_statistics, cr_statistics)
        store_statistics_logs(
            logger_statistics_process, 'null',
            '--{0}--Statistics is completed after {1} min.'.format(
                sample, ('%.2f' % ((time.time() - time_start) / 60))))
        store_pipeline_logs(
            logger_pipeline_process, 'null',
            '--{0}--Statistics is completed after {1} min.'.format(
                sample, ('%.2f' % ((time.time() - time_start) / 60))))
        store_pipeline_logs(
            logger_pipeline_process, 'null',
            '--{0}--Pipeline : {0} is completed after {1} min.'.format(
                sample, ('%.2f' % ((time.time() - time_start1) / 60))))
        # statistics the time cost
        process_log = out_dir + '/' + 'log' + '/' + 'process.log'
        statistics_time(statistics_dir, sample, process_log,
                        logger_statistics_process, logger_statistics_errors)
        print("--" * 20 + '\n\n')
    return 'Pipeline : {0} is completed after {1} min.'.format(
        sample, ('%.2f' % ((time.time() - time_start1) / 60)))
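A usage sketch, assuming the function is fed from a worker pool as its name suggests; path_sampleID_sub packs the sample line plus the 54 pipeline parameters into one argument (params_54 below is a hypothetical placeholder list):

from multiprocessing import Pool

# First element: 'source<TAB>sampleID[<TAB>tailname]'; the rest: the 54 parameters
job = ['/path/to/fastq\tsampleA'] + params_54
with Pool(processes=2) as pool:
    results = pool.map(main_run_germline_variant_calling, [job])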
Example #6
def getinfo(fastqc, logger_statistics_process, logger_statistics_errors):
    qc_read = fastqc.split(".fastq")[0] + '_fastqc.zip'
    # unzip produces a directory, so test with isdir (isfile is always False here)
    if not os.path.isdir(fastqc.split(".fastq")[0] + '_fastqc'):
        command1 = 'unzip' + ' -o ' + qc_read + ' -d ' + os.path.dirname(
            qc_read)
        stdout, stderr = stdout_err(command1)
        store_statistics_logs(logger_statistics_process, 'null', stdout)
        store_statistics_logs('null', logger_statistics_errors, stderr)
    # print('{0} has been completed.'.format(qc_read))
    store_statistics_logs(logger_statistics_process, 'null',
                          '{0} has been completed.\n'.format(qc_read))
    qc_data = qc_read[:-len('.zip')] + '/' + 'fastqc_data.txt'
    qcdata = open(qc_data, "r")
    modules = 0
    #
    value = re.compile(r'\d+')
    perbasesequencequalit_posi = []
    perbasesequencequalit = []
    persequencequalityscores = []
    persequencequalityreads = []
    per_basen1 = []
    perbase_ncontent = []
    f = qcdata.readlines()
    for line in f:
        line = line.strip()
        if '>>END_MODULE' in line:
            modules = modules + 1
        if 'Total Sequences' in line:
            raw_reads = re.findall(r'\d+', line)
        if 'Sequence length' in line:
            seq_length = line.split('\t')[1]
            if '-' in seq_length:
                seq_length_min, seq_length_max = seq_length.split('-')
            else:
                seq_length_min = str(seq_length)
                seq_length_max = str(seq_length)
        if '%GC' in line:
            gc = re.findall(r'\d+', line)
        # value.match(line) is safe on empty lines, unlike line[0]
        if modules == 1 and value.match(line):
            perbasesequencequalit_posi.append(line.split('\t')[0])
            perbasesequencequalit.append(float(line.split('\t')[1]))
        if modules == 3 and value.match(line):
            persequencequalityscores.append(line.split('\t')[0])
            persequencequalityreads.append(float(line.split('\t')[1]))
        if modules == 6 and value.match(line):
            per_basen1.append(line.split('\t')[0])
            perbase_ncontent.append(line.split('\t')[1])

    # --mean of Per base sequence quality
    all_perbasesequencequalit = 0
    for i in range(0, len(perbasesequencequalit)):
        if '-' not in perbasesequencequalit_posi[i]:
            qual = float(perbasesequencequalit[i])
            all_perbasesequencequalit += qual
        else:
            num1, num2 = perbasesequencequalit_posi[i].split('-')
            qual = float(perbasesequencequalit[i])
            all_perbasesequencequalit += qual * (int(num2) - int(num1) + 1)
    perbasesequencequalit_mean = all_perbasesequencequalit / int(
        seq_length_max)

    # low qual base : qual < 25
    lowqualit_bases = []
    for i in range(0, len(perbasesequencequalit)):
        if float(perbasesequencequalit[i]) < 25:
            lowqualit_bases.append(perbasesequencequalit_posi[i])
    if len(lowqualit_bases) == 0:
        lowqualit_bases.append('NULL')
    lowqualit_bases_region = split_n_bases(','.join(lowqualit_bases))[0]

    # percent of Q20 and Q30
    persequencequalityreads1 = []
    for i in range(0, len(persequencequalityreads)):
        persequencequalityreads1.append(
            sum(persequencequalityreads[i:]) / int(raw_reads[0]))
    for i in range(0, len(persequencequalityscores)):
        if persequencequalityscores[i] == '20':
            q20 = persequencequalityreads1[i]
        if persequencequalityscores[i] == '30':
            q30 = persequencequalityreads1[i]
    if '20' not in persequencequalityscores:
        q20 = 1.0000
    # mirror the Q20 fallback so q30 is always defined (assumes the histogram
    # starts above 30 whenever '30' is absent)
    if '30' not in persequencequalityscores:
        q30 = 1.0000

    # N content
    nbases = []
    for i in range(0, len(perbase_ncontent)):
        if float(perbase_ncontent[i]) > 0:
            nbases.append(per_basen1[i])
    if len(nbases) == 0:
        nbases.append('NULL')
    nbase_region = split_n_bases(','.join(nbases))

    return [
        raw_reads[0], seq_length_min, seq_length_max, gc[0],
        str(round(perbasesequencequalit_mean, 3)), lowqualit_bases_region,
        str('%.3f%%' % (q20 * 100)),
        str('%.3f%%' % (q30 * 100)), nbase_region[0]
    ]
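A usage sketch, assuming qc_raw_reads has already produced the FastQC zip next to the given fastq name; the path is hypothetical:

# Parse one FastQC report; returns the nine-field metric list used by qc_raw_reads
metrics = getinfo('/path/to/out/QC/sampleA_R1_001.fastq.gz',
                  '/path/to/log', '/path/to/log')
raw_reads, q20, q30 = metrics[0], metrics[6], metrics[7]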
Example #7
def statistics_depth_coverage(samtools_dir, sam_bam, out_dir, sample, module,
                              exome_target, exome_target_bed,
                              logger_statistics_process,
                              logger_statistics_errors, renew):
    # get the path
    scriptdir = os.path.dirname(os.path.abspath(__file__))

    if not os.path.isfile(sam_bam):
        store_statistics_logs('null', logger_statistics_errors,
                              sam_bam + " does not exist!\n")
    # strip the '.sam' suffix explicitly; rstrip('.sam') would remove a
    # trailing character set and can mangle names such as '_umis.sam'
    base = sam_bam[:-len('.sam')] if sam_bam.endswith('.sam') else sam_bam
    sorted_bam = base + '_sorted.bam'
    bam = base + '.bam'
    if not os.path.isfile(sorted_bam) or renew == 'T':
        if not os.path.isfile(bam):
            command1 = samtools_dir + ' view -bS ' + sam_bam + ' -o ' + bam
            stdout, stderr = stdout_err(command1)
            store_statistics_logs(logger_statistics_process, 'null', stdout)
            store_statistics_logs('null', logger_statistics_errors, stderr)
            store_statistics_logs(
                logger_statistics_process, 'null',
                '{0} has been transformed to bam.\n'.format(sam_bam))
            command2 = samtools_dir + ' sort ' + bam + ' -o ' + sorted_bam
            stdout, stderr = stdout_err(command2)
            store_statistics_logs(logger_statistics_process, 'null', stdout)
            store_statistics_logs('null', logger_statistics_errors, stderr)
            os.system('rm -rf {0}'.format(bam))
        else:
            command2 = samtools_dir + ' sort ' + bam + ' -o ' + sorted_bam
            stdout, stderr = stdout_err(command2)
            store_statistics_logs(logger_statistics_process, 'null', stdout)
            store_statistics_logs('null', logger_statistics_errors, stderr)
    sorted_bam_index = sorted_bam + '.bai'
    if not os.path.isfile(sorted_bam_index) or renew == 'T':
        # print(sorted_bam_index + ' does not exist!')
        command3 = samtools_dir + ' index ' + sorted_bam
        stdout, stderr = stdout_err(command3)
        store_statistics_logs(logger_statistics_process, 'null', stdout)
        store_statistics_logs('null', logger_statistics_errors, stderr)
    # -numbers of reads in target region
    num_reads_in_target_region = out_dir + '/' + sample + '_' + module + '_numbersReadsInTargetRegion.txt'
    if not os.path.isfile(num_reads_in_target_region) or renew == 'T':
        command4 = samtools_dir + ' idxstats ' + sorted_bam + ' -o ' + num_reads_in_target_region
        stdout, stderr = stdout_err(command4)
        store_statistics_logs(logger_statistics_process, 'null', stdout)
        store_statistics_logs('null', logger_statistics_errors, stderr)
    # -coverage of reads in target region
    coverage_in_target_region = out_dir + '/' + sample + '_' + module + '_coverageInTargetRegion.txt'
    if not os.path.isfile(coverage_in_target_region) or renew == 'T':
        command5 = '{0} mpileup {1} | perl -alne \'{2}\' > {3}'.format(
            samtools_dir, sorted_bam,
            '{$pos{$F[0]}++;$depth{$F[0]}+=$F[3]} END{print "$_\t$pos{$_}\t$depth{$_}" foreach sort keys %pos}',
            coverage_in_target_region)
        os.system(command5)
    # -
    # -statistics and plot of  the depth and coverage in target region
    statistics_plot = out_dir + '/' + sample + '_' + module + '_depth_coverageInTargetRegion'
    if not os.path.isfile(statistics_plot + '.pdf') or renew == 'T':
        # scriptdir = os.path.dirname(os.path.abspath(__file__))
        command6 = 'Rscript ' + scriptdir + '/statistics_depth_coverage.R' + ' -p ' \
                   + num_reads_in_target_region + ' -s ' + coverage_in_target_region\
                   + ' -r ' + exome_target_bed + ' -o ' + statistics_plot
        stdout, stderr = stdout_err(command6)
        store_statistics_logs(logger_statistics_process, 'null', stdout)
        store_statistics_logs('null', logger_statistics_errors, stderr)
    # - statistics of the depth of the bases in region
    if module == 'Filter':
        bases_depth_in_region = out_dir + '/' + sample + '_' + module + '_basesDepthInRegion.txt'
        if not os.path.isfile(bases_depth_in_region) or renew == 'T':
            command7 = '{0} depth {1} > {2}'.format(samtools_dir, sorted_bam,
                                                    bases_depth_in_region)
            os.system(command7)
        # statistics of the depth of exon region
        statistics_plot1 = out_dir + '/' + sample + '_' + module + '_basesDepthInTargetExon'
        if os.path.isfile(statistics_plot1 + '.exon_statis.txt'):
            os.system('rm {0}'.format(statistics_plot1 + '.exon_statis.txt'))
        command7 = ''.join([
            'Rscript ', scriptdir,
            '/statistics_bases_coverage_on_target_exon.R', ' -p ',
            bases_depth_in_region, ' -s ', exome_target, ' -t True ', ' -o ',
            statistics_plot1
        ])
        os.system(command7)
    # depth of the bases in target region
    bases_depth_in_target_region = out_dir + '/' + sample + '_' + module + '_basesDepthInTargetRegion.txt'
    if not os.path.isfile(bases_depth_in_target_region) or renew == 'T':
        command7 = '{0} mpileup {1} | perl -alne \'{2}\' > {3}'.format(
            samtools_dir, sorted_bam,
            '{$depth{$F[3]}++}END{print "$_\t$depth{$_}" foreach sort{$a <=> $b}keys %depth}',
            bases_depth_in_target_region)
        os.system(command7)
    # -statistics and plot of the depth and coverage in target region
    statistics_plot2 = out_dir + '/' + sample + '_' + module + '_basesDepthInTargetRegion'
    command8 = 'Rscript ' + scriptdir + '/statistics_bases_depth.R' + ' -p ' \
               + bases_depth_in_target_region + ' -o ' + statistics_plot2
    stdout, stderr = stdout_err(command8)
    store_statistics_logs(logger_statistics_process, 'null', stdout)
    store_statistics_logs('null', logger_statistics_errors, stderr)
    return sorted_bam
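A usage sketch, assuming samtools plus the statistics R scripts next to this module; paths are hypothetical:

# Convert/sort/index the alignment, then compute depth and coverage over the target BED
sorted_bam = statistics_depth_coverage('samtools', '/path/to/sampleA_aligned.sam',
                                       '/path/to/out/statistics', 'sampleA', 'Align',
                                       '/path/to/target_exons.txt', '/path/to/target.bed',
                                       '/path/to/log', '/path/to/log', 'F')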