Ejemplo n.º 1
0
def _indel_proximal_positions(indel_positions, flank=5):
    """Return the set of coordinates lying within `flank` bp of any indel.

    :param indel_positions: iterable of POS values (strings or ints).
    :param flank: number of bases on each side of an indel to include.
    :return: set of ints covering [pos - flank, pos + flank] for every indel.
    """
    proximal = set()
    for pos in indel_positions:
        pos = int(pos)
        proximal.update(range(pos - flank, pos + flank + 1))
    return proximal


def _write_vcf_excluding_positions(raw_vcf_file, filtered_vcf_file,
                                   excluded_positions):
    """Copy a VCF, dropping every record whose POS is in `excluded_positions`.

    Header lines (starting with '#') are always copied unchanged.
    """
    # 'rU' (universal newlines) is kept from the original Python 2 code.
    with open(raw_vcf_file, 'rU') as vcf_in, \
            open(filtered_vcf_file, 'w+') as vcf_out:
        for line in vcf_in:
            if line.startswith('#'):
                vcf_out.write(line)
            elif int(line.split('\t')[1]) not in excluded_positions:
                vcf_out.write(line)


def remove_5_bp_snp_indel(raw_vcf_file, out_path, analysis, reference, logger,
                          Config):
    """Write a copy of `raw_vcf_file` without records within 5 bp of an indel.

    Indel positions are collected according to the configured variant caller:

    * ``samtools``: records whose INFO column starts with ``INDEL;``.
    * ``gatkhaplotypecaller``: records of ``<raw_vcf_file>_indel.vcf``, which
      is produced here by running GATK ``SelectVariants -selectType INDEL``.

    Every record of the raw VCF whose POS falls within +/- 5 bp of an indel
    position (the indel records themselves included) is dropped. Only the POS
    column is compared; the CHROM column is not consulted.

    :param raw_vcf_file: path to the input VCF.
    :param out_path: unused; kept for interface compatibility.
    :param analysis: unused; kept for interface compatibility.
    :param reference: reference fasta passed to GATK (``-R``).
    :param logger: logger handed to ``call`` / ``keep_logging``.
    :param Config: parsed configuration handed to ``ConfigSectionMap``.
    :return: path ``<raw_vcf_file>_5bp_indel_removed.vcf``, or None when the
        configured variant caller is neither of the two handled values.
    """
    variant_caller = ConfigSectionMap("pipeline", Config)['variant_caller']
    remove_snps_5_bp_snp_indel_file_name = raw_vcf_file + "_5bp_indel_removed.vcf"

    # The position list is local to each call; the original appended to a
    # name that was never initialised inside the function.
    indel_positions = []

    if variant_caller == "samtools":
        print("Samtools: Removing SNPs proximate to Indel by 5bp")
        with open(raw_vcf_file, 'rU') as vcf_in:
            for line in vcf_in:
                if line.startswith('#'):
                    continue
                fields = line.split('\t')
                if fields[7].startswith('INDEL;'):
                    indel_positions.append(fields[1])

    elif variant_caller == "gatkhaplotypecaller":
        print("GATK Haplotype caller: Removing SNPs proximate to Indel by 5bp")
        indel_file_name = raw_vcf_file + "_indel.vcf"
        base_cmd = ConfigSectionMap(
            "bin_path", Config)['binbase'] + "/" + ConfigSectionMap(
                "gatk", Config)['gatk_bin'] + "/" + ConfigSectionMap(
                    "gatk", Config)['base_cmd']
        cmd = "java -jar %s -T SelectVariants -R %s -V %s -selectType INDEL -o %s" % (
            base_cmd, reference, raw_vcf_file, indel_file_name)
        # Log before running so the command is recorded even if it fails.
        keep_logging('Running Command: [%s]' % cmd,
                     'Running Command: [%s]' % cmd, logger, 'info')
        call(cmd, logger)
        with open(indel_file_name, 'rU') as vcf_in:
            for line in vcf_in:
                if not line.startswith('#'):
                    indel_positions.append(line.split('\t')[1])

    else:
        # Unhandled variant caller: nothing is written.
        return None

    # A set gives O(1) membership tests while filtering the raw VCF.
    excluded_positions = _indel_proximal_positions(indel_positions)
    _write_vcf_excluding_positions(raw_vcf_file,
                                   remove_snps_5_bp_snp_indel_file_name,
                                   excluded_positions)
    return remove_snps_5_bp_snp_indel_file_name
Ejemplo n.º 2
0
def extract_only_ref_variant_fasta_unique_positions_with_unmapped():
    """Build per-sample consensus fasta files from the SNP allele matrix.

    Reads ``SNP_matrix_allele_new.csv`` (tab-delimited) from
    ``args.filter2_only_snp_vcf_dir`` and looks for the column whose header
    matches the sample derived from ``args.filter2_only_snp_vcf_filename``.
    For a matching column it:

    1. writes ``<sample>_allele_variants.fa`` with the variant alleles;
    2. writes ``<sample>_ref_allele_variants.vcf`` (one record per matrix
       position), then runs bgzip, tabix and vcf-consensus on it;
    3. writes ``<sample>_unmapped.vcf`` from ``<sample>_unmapped.bed_positions``
       with ALT ``-``, runs bgzip and tabix on it, merges it with the variant
       VCF using ``bcftools merge`` and builds a second consensus fasta.

    The shell commands are executed with ``subprocess.call(..., shell=True)``
    and also written to helper ``.sh`` files. Takes no parameters and returns
    nothing; all inputs come from the module-level ``args`` and ``Config``.
    """
    # Get reference genome ID from reference fasta file
    get_reference = Fasta(args.reference)
    # NOTE(review): ref_id is only bound when the reference has exactly one
    # record; with any other count the later ref_id[0] lookups would raise
    # NameError -- confirm multi-contig references are never passed here.
    if len(get_reference.keys()) == 1:
        ref_id = get_reference.keys()

    # Read in the SNP Matrix file and separate the columns.
    # The file is opened twice: once to transpose rows into columns and once
    # to count the fields of the first row. Neither handle is closed explicitly.
    c_reader = csv.reader(open('%s/SNP_matrix_allele_new.csv' % args.filter2_only_snp_vcf_dir, 'r'), delimiter='\t')
    c_reader_2 = csv.reader(open('%s/SNP_matrix_allele_new.csv' % args.filter2_only_snp_vcf_dir, 'r'), delimiter='\t')
    columns = list(zip(*c_reader))
    ncol = len(next(c_reader_2))

    # Generate an array of all the unique variant positions that were called in all the samples
    # Row labels are space-separated; the position is token 3 unless the
    # first token is "None", in which case it is token 2.
    unique_position_array = []
    for i in columns[0][1:]:
        replace_string = i.split(' ')
        if replace_string[0] != "None":
            unique_position_array.append(int(replace_string[3]))
        else:
            unique_position_array.append(int(replace_string[2]))




    # `counts` is assigned but not used below.
    counts = 1
    end = ncol
    # Loop over each column, check if the column name matches the sample name provided with argument args.filter2_only_snp_vcf_filename
    for i in xrange(1, end, 1):
        print_string = ""
        ref_print_string = ""
        # Length of the expected sample name (VCF basename minus its suffix);
        # the column header is truncated to this length before comparing.
        grab_vcf_filename = len(os.path.basename(args.filter2_only_snp_vcf_filename).replace('_filter2_final.vcf_no_proximate_snp.vcf', ''))
        #print grab_vcf_filename

        sample_name_re = columns[i][0][:grab_vcf_filename]
        #print sample_name_re

        # Replaced this with a more stable check
        #sample_name = str(columns[i][0])
        # sample_name_re = re.sub('_R1.fastq.gz', '', sample_name)
        # sample_name_re = re.sub('_R1_001.fastq.gz', '', sample_name_re)
        # sample_name_re = re.sub('_L001.fastq.gz', '', sample_name_re)
        # sample_name_re = re.sub('_*1*.fastq.gz', '', sample_name_re)
        # sample_name_re = re.sub('_S.*', '', sample_name_re)

        #print len(columns[i][1:])
        # Match when the truncated header equals the expected sample name or
        # is a substring of it.
        if sample_name_re == os.path.basename(args.filter2_only_snp_vcf_filename).replace('_filter2_final.vcf_no_proximate_snp.vcf', '') or sample_name_re in os.path.basename(args.filter2_only_snp_vcf_filename).replace('_filter2_final.vcf_no_proximate_snp.vcf', ''):

            vcf_header = "##fileformat=VCFv4.2\n#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t%s\n" % sample_name_re
            print_string = print_string + ">%s\n" % sample_name_re
            ref_print_string = ref_print_string + ">%s\n" % sample_name_re
            #variant_allele = ''.join(columns[i][1:])
            # Concatenate one character per position: entries containing "/"
            # or longer than one character contribute only their first char.
            variant_allele = ""
            for ntd in columns[i][1:]:
                #if "/" in ntd:
                if "/" in ntd or len(ntd) > 1:
                    variant_allele = variant_allele + ntd[0]
                else:
                    variant_allele = variant_allele + ntd
            #print variant_allele
            print_string = print_string + str(variant_allele) + "\n"
            allele_variant_fasta = open("%s/%s_allele_variants.fa" % (args.filter2_only_snp_vcf_dir, sample_name_re), 'w+')
            # NOTE(review): this handle is never written to or closed in this
            # function; opening with 'w+' still creates/truncates the file.
            allele_ref_variant_fasta = open("%s/%s_ref_allele_variants.fa" % (args.filter2_only_snp_vcf_dir, sample_name_re), 'w+')
            allele_ref_variant_vcf = open("%s/%s_ref_allele_variants.vcf" % (args.filter2_only_snp_vcf_dir, sample_name_re), 'w+')
            allele_ref_variant_vcf.write(vcf_header)
            allele_variant_fasta.write(print_string)
            allele_variant_fasta.close()
            variant_allele_array = []
            variant_allele_array_dict = {}
            #variant_allele_array.append(columns[i][1:])
            count_index = 0
            end_index = len(unique_position_array) + 1
            # Map each matrix position to this sample's allele, using the same
            # row-label parsing as for unique_position_array above.
            for start_count in xrange(1, end_index, 1):
                pos = columns[0][start_count]
                get_positions_string = pos.split(' ')
                if get_positions_string[0] != "None":
                    get_positions = int(get_positions_string[3])
                else:
                    get_positions = int(get_positions_string[2])

                variant_allele_array_dict[get_positions] = columns[i][start_count]
            # print len(variant_allele_array_dict)
            # print len(unique_position_array)
            # sample_ref_id is assigned here but not used further below.
            get_sample_reference = Fasta("%s/%s_allele_variants.fa" % (args.filter2_only_snp_vcf_dir, sample_name_re))
            if len(get_sample_reference.keys()) == 1:
                sample_ref_id = get_sample_reference.keys()
            # Emit one VCF record per position: REF is the single reference
            # base fetched from the first reference record, ALT is the first
            # character of the sample's allele.
            for positions in unique_position_array:
                #print positions
                #pos_index = unique_position_array.index(positions)

                if "/" in str(variant_allele_array_dict[positions]) or len(variant_allele_array_dict[positions]) > 1:
                    allele_var = str(variant_allele_array_dict[positions][0])
                    #print allele_var
                else:
                    allele_var = str(variant_allele_array_dict[positions])
                # if str(positions) == "1477126":
                #     print allele_var
                ref_allele = str(get_reference.sequence({'chr': str(get_reference.keys()[0]), 'start': int(positions), 'stop': int(positions)}))
                generate_vcf_string = "%s\t%s\t.\t%s\t%s\t221.999\t.\t.\t.\t.\n" % (ref_id[0].split(' ')[0], positions, ref_allele, allele_var)
                allele_ref_variant_vcf.write(generate_vcf_string)
            allele_ref_variant_vcf.close()
            # Shared script file: commands are appended ('a+') for every call.
            filename = "%s/consensus_ref_allele_variant.sh" % args.filter2_only_snp_vcf_dir

            vcf_filename = "%s/%s_ref_allele_variants.vcf" % (args.filter2_only_snp_vcf_dir, sample_name_re)
            f1 = open(filename, 'a+')
            # Each command is both recorded in the script and executed
            # immediately; the return codes of subprocess.call are not checked.
            bgzip_cmd = "%s/%s/bgzip -f %s\n" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("vcftools", Config)['tabix_bin'], vcf_filename)
            f1.write(bgzip_cmd)
            subprocess.call([bgzip_cmd], shell=True)
            tabix_cmd = "%s/%s/tabix -f -p vcf %s.gz\n" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("vcftools", Config)['tabix_bin'], vcf_filename)
            f1.write(tabix_cmd)
            subprocess.call([tabix_cmd], shell=True)
            base_vcftools_bin = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("vcftools", Config)['vcftools_bin']
            # The consensus fasta path has no directory prefix, so it is
            # written relative to the current working directory.
            fasta_cmd = "cat %s | %s/vcf-consensus %s.gz > %s_ref_allele_variants.fa\n" % (args.reference, base_vcftools_bin, vcf_filename, sample_name_re)
            f1.write(fasta_cmd)
            subprocess.call([fasta_cmd], shell=True)

            # Rename the fasta header of the consensus to the sample name.
            sed_command = "sed -i 's/>.*/>%s/g' %s_ref_allele_variants.fa\n" % (sample_name_re, sample_name_re)
            subprocess.call([sed_command], shell=True)
            f1.write(sed_command)

            #os.system("bash %s" % filename)
            #sequence_lgth_cmd = "for i in %s/*.fa; do %s/%s/bioawk -c fastx \'{ print $name, length($seq) }\' < $i; done" % (args.filter2_only_snp_vcf_dir, ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("bioawk", Config)['bioawk_bin'])
            #os.system(sequence_lgth_cmd)
            #call("%s" % sequence_lgth_cmd, logger)

            # Build a VCF of unmapped positions: each line of the
            # *_unmapped.bed_positions file is parsed as one integer position
            # and written with ALT "-".
            unmapped_positions_file = "%s/%s_unmapped.bed_positions" % (args.filter2_only_snp_vcf_dir, os.path.basename(args.filter2_only_snp_vcf_filename).replace('_filter2_final.vcf_no_proximate_snp.vcf', ''))
            #print unmapped_positions_file
            unmapped_vcf_file = "%s/%s_unmapped.vcf" % (args.filter2_only_snp_vcf_dir, sample_name_re)
            unmapped_vcf = open(
                "%s/%s_unmapped.vcf" % (args.filter2_only_snp_vcf_dir, sample_name_re), 'w+')
            unmapped_vcf.write(vcf_header)
            with open(unmapped_positions_file, 'r') as fpp:
                for lines in fpp:
                    lines = lines.strip()
                    ref_allele = str(get_reference.sequence(
                        {'chr': str(get_reference.keys()[0]), 'start': int(lines), 'stop': int(lines)}))
                    generate_vcf_string_unmapped = "%s\t%s\t.\t%s\t-\t221.999\t.\t.\t.\t.\n" % (
                    ref_id[0].split(' ')[0], lines, ref_allele)
                    unmapped_vcf.write(generate_vcf_string_unmapped)
            unmapped_vcf.close()

            # Compress and index the unmapped VCF so bcftools can merge it.
            bgzip_cmd = "%s/%s/bgzip -f %s\n" % (
            ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("vcftools", Config)['tabix_bin'],
            unmapped_vcf_file)
            print bgzip_cmd
            tabix_cmd = "%s/%s/tabix -f -p vcf %s.gz\n" % (
            ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("vcftools", Config)['tabix_bin'],
            unmapped_vcf_file)
            print tabix_cmd
            subprocess.call([bgzip_cmd], shell=True)
            subprocess.call([tabix_cmd], shell=True)
            #allele_ref_variant_unmapped_vcf = open("%s/%s_ref_allele_variants_unmapped.vcf" % (args.filter2_only_snp_vcf_dir, sample_name_re), 'w+')

            # Merge the unmapped and variant VCFs into one uncompressed VCF.
            vcf_filename_unmapped = "%s/%s_ref_allele_unmapped.vcf" % (args.filter2_only_snp_vcf_dir, sample_name_re)
            bcftools_merge_cmd =  "%s/%s/bcftools merge --merge snps --force-samples %s.gz %s.gz -O v -o %s" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("bcftools", Config)['bcftools_bin'], unmapped_vcf_file, vcf_filename, vcf_filename_unmapped)

            # The bgzip/tabix/consensus commands for the merged VCF are built
            # here but only executed further down, after the merge has run.
            bgzip_cmd = "%s/%s/bgzip -f %s\n" % (
            ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("vcftools", Config)['tabix_bin'],
            vcf_filename_unmapped)

            subprocess.call([bcftools_merge_cmd], shell=True)

            tabix_cmd = "%s/%s/tabix -f -p vcf %s.gz\n" % (
                ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("vcftools", Config)['tabix_bin'],
                vcf_filename_unmapped)

            fasta_cmd = "cat %s | %s/vcf-consensus %s.gz > %s_ref_allele_unmapped_variants.fa\n" % (
                args.reference, base_vcftools_bin, vcf_filename_unmapped, sample_name_re)

            #filename = "%s/consensus_ref_allele_unmapped_variant.sh" % args.filter2_only_snp_vcf_dir
            filename = "%s/%s_consensus_ref_allele_unmapped_variant.sh" % (args.filter2_only_snp_vcf_dir, sample_name_re)
            # NOTE(review): f1 is rebound here without an explicit close() of
            # the shared script opened earlier in this branch.
            f1 = open(filename, 'w+')
            f1.write(bgzip_cmd)
            f1.write(tabix_cmd)
            f1.write(fasta_cmd)
            print "print here: %s" % filename
            subprocess.call(['pwd'], shell=True)
            subprocess.call(bgzip_cmd, shell=True)
            subprocess.call(tabix_cmd, shell=True)
            subprocess.call(fasta_cmd, shell=True)
            sed_command = "sed -i 's/>.*/>%s/g' %s_ref_allele_unmapped_variants.fa\n" % (sample_name_re, sample_name_re)
            subprocess.call([sed_command], shell=True)
            f1.write(sed_command)
            f1.close()

        else:
            # Printed once for every matrix column that is not this sample.
            print "Sample name %s does not match with column name %s" % (os.path.basename(args.filter2_only_snp_vcf_filename).replace('_filter2_final.vcf_no_proximate_snp.vcf', ''), sample_name_re)
Ejemplo n.º 3
0
def gatk_filter_contamination(final_raw_vcf, out_path, analysis, reference,
                              logger, Config, Avg_dp):
    """Apply the ``contamination_filters`` thresholds to a samtools VCF.

    For the ``samtools`` variant caller this runs GATK VariantFiltration with
    an expression built from the ``contamination_filters`` config section,
    greps the header and ``PASS_filter2`` records into
    ``<out_path>/<analysis>_filter2_final_contamination.vcf``, and then
    extracts DP, POS, FQ, MQ and AF1 values into side files. POS and AF1 are
    finally pasted into ``<out_path>/<analysis>_INFO.txt`` under a
    ``pos,af`` header.

    For ``gatkhaplotypecaller`` nothing is filtered; the function only prints
    ``filter``.

    :param final_raw_vcf: input VCF; ``<final_raw_vcf>.gz`` is used instead
        when this path does not exist.
    :param out_path: directory that receives all output files.
    :param analysis: prefix used for the output file names.
    :param reference: reference fasta passed to GATK (``-R``).
    :param logger: logger handed to ``call`` / ``keep_logging``.
    :param Config: parsed configuration handed to ``ConfigSectionMap``.
    :param Avg_dp: average depth; used for the DP window when the config
        option ``avg_depth`` is ``yes``.
    :return: path of the final contamination VCF for ``samtools``,
        otherwise None.
    """
    variant_caller = ConfigSectionMap("pipeline", Config)['variant_caller']
    if variant_caller == "gatkhaplotypecaller":
        print("filter")
        return None
    if variant_caller != "samtools":
        return None

    base_cmd = ConfigSectionMap(
        "bin_path", Config)['binbase'] + "/" + ConfigSectionMap(
            "gatk", Config)['gatk_bin'] + "/" + ConfigSectionMap(
                "gatk", Config)['base_cmd']
    filter_criteria = "contamination_filters"
    if ConfigSectionMap(filter_criteria, Config)['avg_depth'] == "yes":
        keep_logging('The average depth filter is turned on.',
                     'The average depth filter is turned on.', logger,
                     'info')
        # Depth window: half to five times the average depth.
        low_Dp = float(Avg_dp) / 2
        high_Dp = float(Avg_dp) * 5
        DP_filter = "DP > %s && DP < %s" % (int(low_Dp), int(high_Dp))
    else:
        DP_filter = "DP > %s" % float(
            ConfigSectionMap(filter_criteria, Config)['dp'])
    MQ_filter = "MQ > %s" % float(
        ConfigSectionMap(filter_criteria, Config)['mq'])
    FQ_filter = "FQ > %s" % float(
        ConfigSectionMap(filter_criteria, Config)['fq'])
    QUAL_filter = "QUAL > %s" % float(
        ConfigSectionMap(filter_criteria, Config)['qual'])
    AF_filter = "AF1 < %s" % float(
        ConfigSectionMap(filter_criteria, Config)['af'])
    gatk_filter2_parameter_expression = "%s && %s && %s && %s && %s" % (
        FQ_filter, MQ_filter, QUAL_filter, DP_filter, AF_filter)

    # Fall back to the gzipped VCF when the plain file is absent.
    if os.path.exists(final_raw_vcf):
        variant_input = "%s" % final_raw_vcf
    else:
        variant_input = "%s.gz" % final_raw_vcf
    gatk_filter2_command = "java -jar %s -T VariantFiltration -R %s -o %s/%s_filter2_contamination.vcf --variant %s --filterExpression \"%s\" --filterName PASS_filter2" % (
        base_cmd, reference, out_path, analysis, variant_input,
        gatk_filter2_parameter_expression)
    filter_flag_command = "grep '#\\|PASS_filter2' %s/%s_filter2_contamination.vcf > %s/%s_filter2_final_contamination.vcf" % (
        out_path, analysis, out_path, analysis)
    keep_logging(gatk_filter2_command, gatk_filter2_command, logger,
                 'debug')
    keep_logging(filter_flag_command, filter_flag_command, logger, 'debug')
    try:
        call(gatk_filter2_command, logger)
        call(filter_flag_command, logger)
    except sp.CalledProcessError:
        keep_logging('Error in GATK filter step. Exiting.',
                     'Error in GATK filter step. Exiting.', logger,
                     'exception')
        sys.exit(1)

    gatk_filter2_final_contamination_vcf = "%s/%s_filter2_final_contamination.vcf" % (
        out_path, analysis)
    extract_dp = "egrep -v \"^#\" %s | cut -f 8 | grep -Po 'DP=[0-9]*;?' | sed 's/DP=//g' | sed 's/;//g' > %s/%s_depth_values.txt" % (
        gatk_filter2_final_contamination_vcf, out_path, analysis)
    extract_pos = "grep -v '^#' %s | awk -F'\t' '{print $2}' > %s/%s_POS_values.txt" % (
        gatk_filter2_final_contamination_vcf, out_path, analysis)
    extract_fq = "awk -F'\t' '{print $8}' %s  | grep -o 'FQ=.*' | sed 's/FQ=//g' | awk -F';' '{print $1}' > %s/%s_FQ_values.txt" % (
        gatk_filter2_final_contamination_vcf, out_path, analysis)
    # The backslashes are doubled on purpose: in a plain Python string "\1"
    # is the octal escape for chr(1), so sed would receive a control
    # character instead of the back-reference to the captured MQ value.
    extract_mq = "egrep -v \"^#\" %s | cut -f 8 | sed 's/^.*MQ=\\([0-9]*\\);.*$/\\1/' > %s/%s_MQ_values.txt" % (
        gatk_filter2_final_contamination_vcf, out_path, analysis)
    extract_af = "awk -F'\t' '{print $8}' %s  | grep -o 'AF1=.*' | sed 's/AF1=//g' | awk -F';' '{print $1}' > %s/%s_AF1_values.txt" % (
        gatk_filter2_final_contamination_vcf, out_path, analysis)
    try:
        for extract_cmd in (extract_dp, extract_pos, extract_fq, extract_mq,
                            extract_af):
            # Log each command under its own text, before running it, so a
            # failing command is identifiable from the log.
            keep_logging(extract_cmd, extract_cmd, logger, 'debug')
            call(extract_cmd, logger)
    except sp.CalledProcessError:
        keep_logging('Error in GATK contamination filter step. Exiting.',
                     'Error in GATK contamination filter step. Exiting.',
                     logger, 'exception')
        sys.exit(1)

    # Assemble <analysis>_INFO.txt: a "pos,af" header followed by the POS and
    # AF1 columns joined with commas.
    header = "pos,af"
    header_cmd = "echo \"%s\" > %s/header.txt" % (header, out_path)
    call(header_cmd, logger)
    paste_command = "paste -d, %s/%s_POS_values.txt %s/%s_AF1_values.txt > %s/%s_temp_paste_file.txt" % (
        out_path, analysis, out_path, analysis, out_path, analysis)
    call(paste_command, logger)
    combine_file_cmd = "cat %s/header.txt %s/%s_temp_paste_file.txt > %s/%s_INFO.txt" % (
        out_path, out_path, analysis, out_path, analysis)
    call(combine_file_cmd, logger)
    return gatk_filter2_final_contamination_vcf
Ejemplo n.º 4
0
def gatk_filter_indel(final_raw_vcf, out_path, analysis, reference, logger,
                      Config, Avg_dp):
    """Filter indel calls with GATK VariantFiltration.

    The thresholds come from the config section named by
    ``SNP_filters/filter_criteria``. Records are tagged ``PASS_filter2`` by
    GATK, and the header plus tagged records are then grepped into
    ``<out_path>/<analysis>_filter2_indel_final.vcf``.

    :param final_raw_vcf: input VCF; ``<final_raw_vcf>.gz`` is used instead
        when this path does not exist.
    :param out_path: directory that receives the output files.
    :param analysis: prefix used for the output file names.
    :param reference: reference fasta passed to GATK (``-R``).
    :param logger: logger handed to ``call`` / ``keep_logging``.
    :param Config: parsed configuration handed to ``ConfigSectionMap``.
    :param Avg_dp: average depth; used for the DP window when the config
        option ``avg_depth`` is ``yes``.
    :return: path of the final filtered indel VCF.
    """
    gatk_jar = "/".join([
        ConfigSectionMap("bin_path", Config)['binbase'],
        ConfigSectionMap("gatk", Config)['gatk_bin'],
        ConfigSectionMap("gatk", Config)['base_cmd'],
    ])

    criteria_section = ConfigSectionMap("SNP_filters",
                                        Config)['filter_criteria']
    criteria_message = ("Using variant filter parameters from: %s"
                        % criteria_section)
    keep_logging(criteria_message, criteria_message, logger, 'info')

    # Depth: either a window around the average depth or a fixed minimum.
    if ConfigSectionMap(criteria_section, Config)['avg_depth'] == "yes":
        depth_message = 'The average depth filter is turned on.'
        keep_logging(depth_message, depth_message, logger, 'info')
        minimum_depth = int(float(Avg_dp) / 2)
        maximum_depth = int(float(Avg_dp) * 5)
        depth_clause = "DP > %s && DP < %s" % (minimum_depth, maximum_depth)
    else:
        depth_clause = "DP > %s" % float(
            ConfigSectionMap(criteria_section, Config)['dp'])

    mapq_clause = "MQ > %s" % float(
        ConfigSectionMap(criteria_section, Config)['mq'])
    qd_clause = "QD > %s" % float(
        ConfigSectionMap(criteria_section, Config)['qd'])
    af_clause = "AF > %s" % float(
        ConfigSectionMap(criteria_section, Config)['af'])
    filter_expression = " && ".join(
        [mapq_clause, qd_clause, depth_clause, af_clause])

    # Use the gzipped VCF when the plain file is not on disk.
    if os.path.exists(final_raw_vcf):
        variant_input = "%s" % final_raw_vcf
    else:
        variant_input = "%s.gz" % final_raw_vcf

    tagged_vcf = "%s/%s_filter2_indel_gatk.vcf" % (out_path, analysis)
    final_vcf = "%s/%s_filter2_indel_final.vcf" % (out_path, analysis)

    gatk_command = (
        "java -jar %s -T VariantFiltration -R %s -o %s --variant %s"
        " --filterExpression \"%s\" --filterName PASS_filter2"
    ) % (gatk_jar, reference, tagged_vcf, variant_input, filter_expression)
    grep_command = "grep '#\\|PASS_filter2' %s > %s" % (tagged_vcf, final_vcf)

    for command in (gatk_command, grep_command):
        keep_logging(command, command, logger, 'debug')

    try:
        call(gatk_command, logger)
        call(grep_command, logger)
    except sp.CalledProcessError:
        failure_message = 'Error in GATK filter step. Exiting.'
        keep_logging(failure_message, failure_message, logger, 'exception')
        sys.exit(1)

    return final_vcf
Ejemplo n.º 5
0
def file_exists(path1, path2, reference):
    """Validate the input files and make sure the reference is indexed.

    Checks that ``path1``, ``path2`` (when not None) and ``reference`` exist,
    logging an error and exiting with status 1 otherwise. Then, depending on
    the configured aligner, creates the aligner index, the ``.fai`` index and
    the sequence dictionary whenever the corresponding file is missing.

    Relies on the module-level ``logger``, ``Config`` and ``args``.

    :param path1: first input file (required).
    :param path2: second input file, or None when there is none.
    :param reference: path of the reference fasta file.
    :raises SystemExit: when an input or reference file is missing, or when
        the configured aligner is neither ``bwa`` nor ``bowtie``.
    """
    import sys

    def _exit_if_missing(path, first_template, second_template):
        # Log both message variants for a missing file and stop with a
        # non-zero status so callers can detect the failure.
        if os.path.isfile(path):
            return
        file_basename = os.path.basename(path)
        keep_logging(first_template.format(file_basename),
                     second_template.format(file_basename), logger,
                     'exception')
        sys.exit(1)

    input_first = 'The input file {} does not exists. Please provide another file with full path or check the files path.\n'
    input_second = 'The input file {} does not exists. Please provide another file or check the files path.\n'
    _exit_if_missing(path1, input_first, input_second)
    if path2 is not None:
        _exit_if_missing(path2, input_first, input_second)
    _exit_if_missing(
        reference,
        'The reference fasta file {} does not exists. Please provide another with full path file with full path or check the files path.\n',
        'The reference fasta file {} does not exists. Please provide another file or check the files path.\n')

    aligner = ConfigSectionMap("pipeline", Config)['aligner']
    if aligner == "bwa":
        index_suffixes = (".bwt", ".amb", ".ann", ".sa", ".pac")
    elif aligner == "bowtie":
        # NOTE(review): ".4.ebwt" is kept from the original list, although
        # every other entry uses the ".bt2" extension -- confirm it is
        # intended. The original also built a sixth name (".rev.2.bt2")
        # that was never used.
        index_suffixes = (".1.bt2", ".2.bt2", ".3.bt2", ".4.ebwt",
                          ".rev.1.bt2")
    else:
        # Previously this fell through and failed with a NameError on the
        # first index file name.
        unsupported = 'Unsupported aligner {!r}: expected "bwa" or "bowtie".'.format(aligner)
        keep_logging(unsupported, unsupported, logger, 'exception')
        sys.exit(1)
    index_files = [reference + suffix for suffix in index_suffixes]

    # Only the first index file is probed to decide whether to (re)build.
    if not os.path.isfile(index_files[0]):
        missing_index = 'The reference index files given below does not exists:\n {}\n {}\n {}\n {}\n {}'.format(*index_files)
        keep_logging(missing_index, missing_index, logger, 'warning')
        create_index(reference, *index_files)
    else:
        keep_logging('Index file already exists.',
                     'Index file already exists.', logger, 'info')

    ref_fai_index = reference + ".fai"
    if not os.path.isfile(ref_fai_index):
        missing_fai = 'The reference fai index file {} required for samtools does not exists.'.format(ref_fai_index)
        keep_logging(missing_fai, missing_fai, logger, 'warning')
        create_fai_index(reference, ref_fai_index)
    else:
        keep_logging('Samtools fai Index file already exists.',
                     'Samtools fai Index file already exists.', logger, 'info')

    # The dictionary is looked up in the configured ref_path directory, not
    # next to the `reference` argument.
    dict_name = os.path.splitext(os.path.basename(reference))[0] + ".dict"
    if not os.path.isfile(
            ConfigSectionMap(args.index, Config)['ref_path'] + "/" +
            dict_name):
        missing_dict = 'The reference seq dict file {} required for GATK and PICARD does not exists.'.format(dict_name)
        keep_logging(missing_dict, missing_dict, logger, 'warning')
        picard_seqdict(dict_name, reference)
    else:
        keep_logging(
            'The reference seq dict file required for GATK and PICARD exists.',
            'The reference seq dict file required for GATK and PICARD exists.',
            logger, 'info')
Ejemplo n.º 6
0
def velvetoptimiser(forward_paired, reverse_paired, forward_unpaired,
                    reverse_unpaired, out_path):
    """Run VelvetOptimiser on the given reads and copy contigs.fa to out_path.

    The pair of flags returned by check_cleanreads decides which read files
    are handed to VelvetOptimiser:

    * paired == 0, unpaired == 0: forward_paired/reverse_paired are used as
      raw FASTQ input.
    * paired == 1, unpaired == 0: only the paired reads are used.
    * paired == 0, unpaired == 1: nothing is run (not implemented yet).
    * anything else: paired and unpaired reads are all used, after changing
      the working directory to out_path.

    Returns:
        A tuple (contigs, scaffolds) where contigs is out_path + "/contigs.fa"
        and scaffolds is an empty string. The unpaired-only case returns None.
    """
    print(
        "\n################## Running VELVET on input files ##################\n"
    )
    velvet_dir = out_path + "velvet_results"

    # Per-file option fragments understood by velveth (via VelvetOptimiser -f).
    fwd_paired_opt = "-shortPaired -fastq.gz " + forward_paired
    fwd_unpaired_opt = " -short -fastq.gz " + forward_unpaired
    rev_paired_opt = " -shortPaired2 -fastq.gz " + reverse_paired
    rev_unpaired_opt = " -short2 -fastq.gz " + reverse_unpaired

    (paired, unpaired) = check_cleanreads(forward_paired, reverse_paired,
                                          forward_unpaired, reverse_unpaired)
    outputs = (out_path + "/contigs.fa", "")

    def optimiser_command(read_options):
        # Full VelvetOptimiser command line for the given read options.
        return (ConfigSectionMap("bin_path")['binbase'] +
                "VelvetOptimiser/VelvetOptimiser.pl -s 71 -e 121 -x 20" +
                " --d " + velvet_dir + " -f '" + read_options + "'")

    def assemble(command):
        # Run the assembly, then copy the final contigs file to out_path.
        print("Running: %s \n" % command)
        os.system(command)
        os.system("cp " + velvet_dir + "/contigs.fa " + out_path)
        print("\n################## END: VELVET ASSEMBLY ##################\n")
        return outputs

    if paired == 0 and unpaired == 0:
        # No clean reads at all: treat the paired inputs as raw FASTQ files.
        print(
            "No clean Paired and unpaired reads. Considering forward_paired and reverse_paired as raw Fastq files for assembly.\n"
        )
        return assemble(
            optimiser_command(fwd_paired_opt + " " + rev_paired_opt))

    if paired == 1 and unpaired == 0:
        # Only clean paired reads exist.
        print("Taking only paired reads for assembly.\n")
        return assemble(optimiser_command(fwd_paired_opt + rev_paired_opt))

    if paired == 0 and unpaired == 1:
        # Only clean unpaired reads exist. Pending: no assembly is run and
        # the caller receives None rather than a (contigs, scaffolds) tuple.
        print("Running: %s \n" % "This can be single reads......")
        return None

    # Clean paired and unpaired reads exist: use all of them.
    os.chdir(out_path)
    command = optimiser_command(fwd_paired_opt + " " + rev_paired_opt + " " +
                                fwd_unpaired_opt + " " + rev_unpaired_opt)
    print("Running with all input file parameters.\n")
    return assemble(command)
Ejemplo n.º 7
0
def nucmer_repeat(reference, outdir, logger, Config):
    """Find repeat regions in the reference genome using MUMmer tools.

    Runs, through ``call``: nucmer (reference aligned against itself),
    show-coords, exact-tandems and repeat-match. Two position files are
    then written to ``outdir``:

    * ``inexact_repeat_region_positions.txt`` - every position covered by
      the ranges read from the show-coords output (duplicates included).
      These positions are reported only; they are not part of the result.
    * ``repeat_region_positions.txt`` - sorted, de-duplicated positions
      covered by the tandem repeats reported by exact-tandems.

    Args:
        reference: path of the reference genome. Everything before the
            first '.' in this string is used as the prefix of the
            intermediate files.
        outdir: directory that receives the two position files.
        logger: logger passed on to keep_logging and call.
        Config: configuration object passed to ConfigSectionMap; the
            "bin_path" and "mummer" sections are read.

    Returns:
        Path of ``repeat_region_positions.txt`` inside ``outdir``.

    Raises:
        StopIteration: if a tool output has fewer lines than the header
            that is skipped (6 lines for .coords, 5 for the tandem file).
    """
    keep_logging(
        '\nFinding repeat region in reference genome: %s\n' % reference,
        '\nFinding repeat region in reference genome: %s\n' % reference,
        logger, 'info')
    prefix = str(reference.split('.')[0]) + "_repeat"

    # Look the configuration up once instead of once per command.
    mummer_cfg = ConfigSectionMap("mummer", Config)
    mummer_dir = "%s/%s" % (ConfigSectionMap("bin_path", Config)['binbase'],
                            mummer_cfg['mummer_bin'])

    nucmer_repeat_cmd = "%s/%s --maxmatch --nosimplify --prefix=%s %s %s" % (
        mummer_dir, mummer_cfg['nucmer_base_cmd'], prefix, reference,
        reference)
    keep_logging('Running: %s' % nucmer_repeat_cmd,
                 'Running: %s' % nucmer_repeat_cmd, logger, 'debug')
    call(nucmer_repeat_cmd, logger)

    showcoords_cmd = "%s/show-coords -I %s -r %s.delta > %s.coords" % (
        mummer_dir, mummer_cfg['percent_id'], prefix, prefix)
    keep_logging('Running: %s' % showcoords_cmd,
                 'Running: %s' % showcoords_cmd, logger, 'debug')
    call(showcoords_cmd, logger)

    repeat_match_cmd = "%s/repeat-match %s > %s.repeat_match" % (
        mummer_dir, reference, prefix)
    tandem_repeats_cmd = "%s/exact-tandems %s %s > %s_tandem_repeats_file" % (
        mummer_dir, reference, mummer_cfg['min_tandem_repeat_length'], prefix)
    keep_logging('Running: %s' % tandem_repeats_cmd,
                 'Running: %s' % tandem_repeats_cmd, logger, 'debug')
    keep_logging('Running: %s' % repeat_match_cmd,
                 'Running: %s' % repeat_match_cmd, logger, 'debug')
    call(tandem_repeats_cmd, logger)
    call(repeat_match_cmd, logger)

    # Collect every position covered by the first two '|'-separated columns
    # of each row in the show-coords output.
    inexact_repeat_positions = []
    with open("%s.coords" % prefix) as fp:
        # Skip the first six lines of the file before reading rows.
        for _ in range(6):
            next(fp)
        for line in fp:
            line = line.strip()
            if not line:
                continue
            columns = line.split('|')
            for column in (columns[0], columns[1]):
                bounds = column.split()
                # NOTE(review): a row whose start is greater than its end
                # yields an empty range and adds nothing - confirm whether
                # such rows should be counted.
                inexact_repeat_positions.extend(
                    range(int(bounds[0]), int(bounds[1]) + 1))

    # Write inexact repeat positions to inexact_repeat_region_positions.txt
    with open("%s/inexact_repeat_region_positions.txt" % outdir,
              'w+') as f_inexact:
        for position in inexact_repeat_positions:
            f_inexact.write(str(position) + '\n')

    inexact_count = len(set(inexact_repeat_positions))
    keep_logging('No. of inexact repeat matches positions: %s' % inexact_count,
                 'No. of inexact repeat matches: %s' % inexact_count, logger,
                 'info')

    keep_logging(
        'Note: The pipeline will not remove these inexact repeat positions. Writing these postions to %s/inexact_repeat_region_positions.txt'
        % outdir,
        'Note: The pipeline will not remove these inexact repeat positions. Writing these postions to %s/inexact_repeat_region_positions.txt'
        % outdir, logger, 'info')

    # Find tandem repeats: each row gives a start position and a length.
    tandem_repeats = []
    with open("%s_tandem_repeats_file" % prefix) as fp:
        # Skip the first five lines of the file before reading rows.
        for _ in range(5):
            next(fp)
        for line in fp:
            fields = line.split()
            if not fields:
                continue
            start = int(fields[0])
            tandem_repeats.extend(range(start, start + int(fields[1])))

    # Only tandem repeats are filtered; inexact repeats are left out.
    all_repeats = sorted(set(tandem_repeats))
    keep_logging(
        'No. of Tandem repeat matches positions: %s' % len(all_repeats),
        'No. of Tandem repeat matches positions: %s' % len(all_repeats),
        logger, 'info')

    keep_logging(
        'Repeat positions in this file %s/repeat_region_positions.txt will be filtered out'
        % outdir,
        'Repeat positions in this file %s/repeat_region_positions.txt will be filtered out'
        % outdir, logger, 'info')

    repeat_positions_file = "%s/repeat_region_positions.txt" % outdir
    with open(repeat_positions_file, 'w+') as f_open:
        for pos in all_repeats:
            f_open.write(str(pos) + '\n')
    return repeat_positions_file
Ejemplo n.º 8
0
def pipeline(args, logger):
    """Run the variant calling pipeline from the stage named in ``args.steps``.

    ``args.steps`` is split on commas.

    * One entry: it selects a single action - ``coverage_depth_stats``,
      ``filter`` (followed by stats), ``stats``, ``All``, ``bedtools`` or
      ``varcall``. Any other single entry runs nothing.
    * Several entries: only the first entry is inspected and the pipeline
      runs from that stage through to the statistics stage. Recognised
      first entries are ``clean``, ``align``, ``post-align``, ``varcall``,
      ``filter`` and ``stats``.

    ``Config`` and ``files_to_delete`` are not parameters of this function;
    they are resolved from an enclosing scope.

    Args:
        args: parsed command line options. Attributes read here: index,
            type, forward_raw, reverse_raw, steps, cluster, output_folder,
            croplength, analysis_name and downsample.
        logger: logger handed to keep_logging and to every tool wrapper.
    """
    keep_logging('START: Pipeline', 'START: Pipeline', logger, 'info')

    # ---- Sanitation checks ----

    # Check Subroutines: Arguments, Input FASTQ files, Reference Index
    keep_logging('START: Checking Dependencies...', 'Checking Dependencies',
                 logger, 'info')

    # Reference Genome file name
    reference = ConfigSectionMap(args.index,
                                 Config)['ref_path'] + "/" + ConfigSectionMap(
                                     args.index, Config)['ref_name']
    keep_logging(
        'Getting Reference Genome name from config file: {}'.format(reference),
        'Getting Reference Genome name from config file: {}'.format(reference),
        logger, 'info')

    # Check if FASTQ files exists
    if args.type != "PE" and args.type != "BAM":
        # Single-end input: the forward file stands in for both reads.
        file_exists(args.forward_raw, args.forward_raw, reference)
    elif args.type != "PE" and args.type != "SE":
        print("BAM type... Not Integrated... continue")
    else:
        file_exists(args.forward_raw, args.reverse_raw, reference)

    # Check Java Version
    java_check()
    keep_logging('END: Checking Dependencies...', 'END: Checking Dependencies',
                 logger, 'info')

    # ---- Start the pipeline ----

    # split values provided with -steps argument and decide the starting point of pipeline
    steps_list = args.steps.split(',')

    # Check cluster parameter and set cluster variable, used for running pipeline locally or parallelly on local or on cluster
    if args.cluster:
        cluster = args.cluster
    else:
        cluster = "local"

    # ---- Individual sub-process for each pipeline step ----
    # The nested functions below read out_sorted_bam and final_raw_vcf from
    # this function's scope, so those names must be assigned before the
    # corresponding step is invoked.

    ## 1. Pre-Processing Raw reads using Trimmomatic
    def clean():
        keep_logging('START: Pre-Processing Raw reads using Trimmomatic',
                     'START: Pre-Processing Raw reads using Trimmomatic',
                     logger, 'info')
        if args.type == "PE":
            trimmomatic(args.forward_raw, args.reverse_raw, args.output_folder,
                        args.croplength, logger, Config)
        else:
            reverse_raw = "None"
            trimmomatic(args.forward_raw, reverse_raw, args.output_folder,
                        args.croplength, logger, Config)
        keep_logging('END: Pre-Processing Raw reads using Trimmomatic',
                     'END: Pre-Processing Raw reads using Trimmomatic', logger,
                     'info')

    ## 2. Stages: Alignment using BWA
    def align_reads():
        keep_logging('START: Mapping Reads using BWA',
                     'START: Mapping Reads using BWA', logger, 'info')
        split_field = prepare_readgroup(
            args.forward_raw,
            ConfigSectionMap("pipeline", Config)['aligner'], logger)
        out_sam = align(args.output_folder, args.index, split_field,
                        args.analysis_name, files_to_delete, logger, Config,
                        args.type)
        keep_logging('END: Mapping Reads using BWA',
                     'END: Mapping Reads using BWA', logger, 'info')
        return out_sam

    # Run Depth of Coverage Module after read mapping and stop. Dont proceed to variant calling step.
    def coverage_depth_stats():
        gatk_DepthOfCoverage_file = gatk_DepthOfCoverage(
            out_sorted_bam, args.output_folder, args.analysis_name, reference,
            logger, Config)
        alignment_stats(out_sorted_bam, args.output_folder,
                        args.analysis_name, logger, Config)
        return gatk_DepthOfCoverage_file

    ## 3. Stages: Post-Alignment using SAMTOOLS, PICARD etc
    def post_align(out_sam):
        keep_logging('START: Post-Alignment using SAMTOOLS, PICARD etc...',
                     'START: Post-Alignment using SAMTOOLS, PICARD etc...',
                     logger, 'info')
        out_sorted_bam = prepare_bam(out_sam, args.output_folder,
                                     args.analysis_name, files_to_delete,
                                     logger, Config)
        keep_logging('END: Post-Alignment using SAMTOOLS, PICARD etc...',
                     'END: Post-Alignment using SAMTOOLS, PICARD etc...',
                     logger, 'info')
        keep_logging('START: Creating BedGraph Coverage',
                     'START: Creating BedGraph Coverage', logger, 'info')
        bedgraph_coverage(out_sorted_bam, args.output_folder,
                          args.analysis_name, reference, logger, Config)
        bedtools(out_sorted_bam, args.output_folder, args.analysis_name,
                 logger, Config)
        keep_logging('END: Creating BedGraph Coverage',
                     'END: Creating BedGraph Coverage', logger, 'info')
        return out_sorted_bam

    ## 4. Stages: Variant Calling
    def varcall():
        keep_logging('START: Variant Calling', 'START: Variant Calling',
                     logger, 'info')
        caller = ConfigSectionMap("pipeline", Config)['variant_caller']
        if caller == "gatkhaplotypecaller":
            keep_logging('START: Variant Calling using GATK haplotyper.',
                         'START: Variant Calling using GATK haplotyper.',
                         logger, 'info')
            final_raw_vcf_mpileup = variant_calling(out_sorted_bam,
                                                    args.output_folder,
                                                    args.index,
                                                    args.analysis_name, logger,
                                                    Config)
            final_raw_vcf = remove_5_bp_snp_indel(final_raw_vcf_mpileup,
                                                  args.output_folder,
                                                  args.analysis_name,
                                                  reference, logger, Config)
            final_raw_indel_vcf = prepare_indel(final_raw_vcf_mpileup,
                                                args.output_folder,
                                                args.analysis_name, reference,
                                                logger, Config)
            keep_logging('The final raw VCF file: {}'.format(final_raw_vcf),
                         'The final raw VCF file: {}'.format(final_raw_vcf),
                         logger, 'debug')
            keep_logging(
                'The final raw Indel VCF file: {}'.format(final_raw_indel_vcf),
                'The final raw Indel VCF file: {}'.format(final_raw_indel_vcf),
                logger, 'debug')
            # This END message used to name Samtools (copy-paste slip); it
            # now matches the START message of this branch.
            keep_logging('END: Variant Calling using GATK haplotyper.',
                         'END: Variant Calling using GATK haplotyper.',
                         logger, 'info')
            return final_raw_vcf, final_raw_indel_vcf

        elif caller == "samtools":
            keep_logging(
                'START: Variant Calling using Samtools without post-align bam input files.',
                'START: Variant Calling using Samtools without post-align bam input files.',
                logger, 'info')
            # GATK indel calling integration: indels come from
            # prepare_indel_gatk rather than from the mpileup VCF.
            final_raw_indel_vcf = prepare_indel_gatk(out_sorted_bam,
                                                     args.output_folder,
                                                     args.analysis_name,
                                                     args.index, logger,
                                                     Config)
            final_raw_vcf_mpileup = variant_calling(out_sorted_bam,
                                                    args.output_folder,
                                                    args.index,
                                                    args.analysis_name, logger,
                                                    Config)
            final_raw_vcf = remove_5_bp_snp_indel(final_raw_vcf_mpileup,
                                                  args.output_folder,
                                                  args.analysis_name,
                                                  reference, logger, Config)
            keep_logging('The final raw VCF file: {}'.format(final_raw_vcf),
                         'The final raw VCF file: {}'.format(final_raw_vcf),
                         logger, 'debug')
            keep_logging(
                'END: Variant Calling using Samtools without post-align bam input files.',
                'END: Variant Calling using Samtools without post-align bam input files.',
                logger, 'info')
            return final_raw_vcf, final_raw_indel_vcf
        else:
            keep_logging(
                'Please provide Variant Caller name in config file under the section [pipeline]. Options for Variant caller: 1. samtools 2. gatkhaplotypecaller',
                'Please provide Variant Caller name in config file under the section [pipeline]. Options for Variant caller: 1. samtools 2. gatkhaplotypecaller',
                logger, 'info')
            exit()

    ## 5. Stages: Variant Filteration
    def filter(gatk_depth_of_coverage_file):
        keep_logging('START: Variant Filteration',
                     'START: Variant Filteration', logger, 'info')
        final_raw_vcf_mpileup = "%s/%s_aln_mpileup_raw.vcf" % (
            args.output_folder, args.analysis_name)
        if not os.path.isfile(gatk_depth_of_coverage_file):
            file_basename = os.path.basename(gatk_depth_of_coverage_file)
            keep_logging(
                'The input file {} does not exists. Please provide another file with full path or check the files path.\n'
                .format(file_basename),
                'The input file {} does not exists. Please provide another file or check the files path.\n'
                .format(file_basename), logger, 'exception')
            exit()
        # Third tab-separated field of the line starting with "Total" in the
        # depth-of-coverage summary is used as the average depth.
        Avg_dp_cmd = "grep \'^Total\' %s | awk -F\'\t\' \'{print $3}\'" % gatk_depth_of_coverage_file
        proc = sp.Popen([Avg_dp_cmd], stdout=sp.PIPE, shell=True)
        (out, err) = proc.communicate()
        Avg_dp = float(out)
        print("The Average Depth per reference genome base is: %s" % Avg_dp)
        filter_variants(final_raw_vcf, args.output_folder, args.analysis_name,
                        args.index, logger, Config, Avg_dp)
        final_raw_indel_vcf = final_raw_vcf_mpileup + "_indel.vcf"
        filter_indels(final_raw_indel_vcf, args.output_folder,
                      args.analysis_name, args.index, logger, Config, Avg_dp)
        keep_logging('END: Variant Filteration', 'END: Variant Filteration',
                     logger, 'info')

    ## 6. Stages: Statistics
    def stats():
        keep_logging('START: Generating Statistics Reports',
                     'START: Generating Statistics Reports', logger, 'info')
        alignment_stats(out_sorted_bam, args.output_folder,
                        args.analysis_name, logger, Config)
        vcf_stats(final_raw_vcf, args.output_folder, args.analysis_name,
                  logger, Config)
        picardstats(out_sorted_bam, args.output_folder, args.analysis_name,
                    args.index, logger, Config)
        keep_logging('END: Generating Statistics Reports',
                     'END: Generating Statistics Reports', logger, 'info')

    # NOTE: removal of the intermediate files collected in files_to_delete
    # is currently disabled; nothing in this function deletes them.

    if args.downsample == "yes":
        read1, read2 = downsample(args, logger)
        args.forward_raw = read1
        args.reverse_raw = read2
        print("Using downsampled forward reads %s" % args.forward_raw)
        print("Using downsampled reverse reads %s" % args.reverse_raw)

    if len(steps_list) == 1:
        if steps_list[0] == "coverage_depth_stats":
            out_sorted_bam = "%s/%s_aln_sort.bam" % (args.output_folder,
                                                     args.analysis_name)
            gatk_DepthOfCoverage_file = coverage_depth_stats()

        elif steps_list[0] == "filter":
            #Sanity Check Post-varcall vcf and other files here
            out_sorted_bam = "%s/%s_aln_sort.bam" % (args.output_folder,
                                                     args.analysis_name)
            final_raw_vcf = "%s/%s_aln_mpileup_raw.vcf_5bp_indel_removed.vcf" % (
                args.output_folder, args.analysis_name)
            gatk_depth_of_coverage_file = "%s/%s_depth_of_coverage.sample_summary" % (
                args.output_folder, args.analysis_name)
            final_raw_vcf_mpileup = "%s/%s_aln_mpileup_raw.vcf" % (
                args.output_folder, args.analysis_name)
            if not os.path.exists(gatk_depth_of_coverage_file):
                gatk_depth_of_coverage_file = coverage_depth_stats()
            if os.path.exists(out_sorted_bam) and os.path.exists(
                    final_raw_vcf) and os.path.exists(
                        gatk_depth_of_coverage_file) and os.path.exists(
                            final_raw_vcf_mpileup):
                filter(gatk_depth_of_coverage_file)
                stats()
            else:
                keep_logging(
                    'The required intermediate files does not exists. Please rerun the variant calling pipeline to generate the files\n',
                    'The required intermediate files does not exists. Please rerun the variant calling pipeline to generate the files',
                    logger, 'exception')
                exit()

        elif steps_list[0] == "stats":
            #Sanity Check Post-varcall vcf and other files here
            out_sorted_bam = "%s/%s_aln_sort.bam" % (args.output_folder,
                                                     args.analysis_name)
            final_raw_vcf = "%s/%s_aln_mpileup_raw.vcf_5bp_indel_removed.vcf" % (
                args.output_folder, args.analysis_name)
            gatk_depth_of_coverage_file = "%s/%s_depth_of_coverage.sample_summary" % (
                args.output_folder, args.analysis_name)
            final_raw_vcf_mpileup = "%s/%s_aln_mpileup_raw.vcf" % (
                args.output_folder, args.analysis_name)
            if not os.path.exists(gatk_depth_of_coverage_file):
                print(gatk_depth_of_coverage_file)
                gatk_depth_of_coverage_file = coverage_depth_stats()
            if os.path.exists(out_sorted_bam) and os.path.exists(
                    final_raw_vcf) and os.path.exists(
                        gatk_depth_of_coverage_file) and os.path.exists(
                            final_raw_vcf_mpileup):
                stats()
            else:
                keep_logging(
                    'The required intermediate files does not exists. Please rerun the variant calling pipeline to generate the files\n',
                    'The required intermediate files does not exists. Please rerun the variant calling pipeline to generate the files',
                    logger, 'exception')
                exit()

        elif steps_list[0] == "All":
            clean()
            out_sam = align_reads()
            out_sorted_bam = post_align(out_sam)
            out_sorted_bam = "%s/%s_aln_sort.bam" % (args.output_folder,
                                                     args.analysis_name)
            gatk_depth_of_coverage_file = "%s/%s_depth_of_coverage.sample_summary" % (
                args.output_folder, args.analysis_name)
            if not os.path.exists(gatk_depth_of_coverage_file):
                gatk_depth_of_coverage_file = coverage_depth_stats()
            final_raw_vcf, final_raw_indel_vcf = varcall()
            final_raw_vcf = "%s/%s_aln_mpileup_raw.vcf_5bp_indel_removed.vcf" % (
                args.output_folder, args.analysis_name)
            filter(gatk_depth_of_coverage_file)
            stats()

        elif steps_list[0] == "bedtools":
            out_sorted_bam = "%s/%s_aln_sort.bam" % (args.output_folder,
                                                     args.analysis_name)
            only_unmapped_positions_file = bedtools(out_sorted_bam,
                                                    args.output_folder,
                                                    args.analysis_name, logger,
                                                    Config)

        elif steps_list[0] == "varcall":
            out_sorted_bam = "%s/%s_aln_sort.bam" % (args.output_folder,
                                                     args.analysis_name)
            gatk_depth_of_coverage_file = "%s/%s_depth_of_coverage.sample_summary" % (
                args.output_folder, args.analysis_name)
            if not os.path.exists(gatk_depth_of_coverage_file):
                gatk_depth_of_coverage_file = coverage_depth_stats()
            final_raw_vcf, final_raw_indel_vcf = varcall()

    # Run individual variant calling steps: clean, align, post-align, varcall, filter, stats etc
    else:

        if steps_list[0] == "clean":
            clean()
            out_sam = align_reads()
            # post_align requires the SAM file produced by align_reads.
            out_sorted_bam = post_align(out_sam)
            gatk_depth_of_coverage_file = "%s/%s_depth_of_coverage.sample_summary" % (
                args.output_folder, args.analysis_name)
            if not os.path.exists(gatk_depth_of_coverage_file):
                gatk_depth_of_coverage_file = coverage_depth_stats()
            final_raw_vcf, final_raw_indel_vcf = varcall()
            filter(gatk_depth_of_coverage_file)
            stats()
        elif steps_list[0] == "align":
            #Sanity Check clean reads here
            out_sam = align_reads()
            # Post-alignment runs once, on the SAM file from align_reads.
            out_sorted_bam = post_align(out_sam)
            gatk_depth_of_coverage_file = "%s/%s_depth_of_coverage.sample_summary" % (
                args.output_folder, args.analysis_name)
            if not os.path.exists(gatk_depth_of_coverage_file):
                gatk_depth_of_coverage_file = coverage_depth_stats()
            final_raw_vcf, final_raw_indel_vcf = varcall()
            filter(gatk_depth_of_coverage_file)
            stats()
        elif steps_list[0] == "post-align":
            #Sanity Check BAM file here
            out_sam = "%s/%s_aln.sam" % (args.output_folder,
                                         args.analysis_name)
            out_sorted_bam = post_align(out_sam)
            gatk_depth_of_coverage_file = "%s/%s_depth_of_coverage.sample_summary" % (
                args.output_folder, args.analysis_name)
            if not os.path.exists(gatk_depth_of_coverage_file):
                gatk_depth_of_coverage_file = coverage_depth_stats()
            final_raw_vcf, final_raw_indel_vcf = varcall()
            filter(gatk_depth_of_coverage_file)
            stats()

        elif steps_list[0] == "varcall":
            #Sanity Check Post-aligned-BAM and Bed files here
            out_sorted_bam = "%s/%s_aln_sort.bam" % (args.output_folder,
                                                     args.analysis_name)
            if not os.path.exists("%s.bai" % out_sorted_bam):
                index_bam(out_sorted_bam, args.output_folder, logger, Config)

            gatk_depth_of_coverage_file = "%s/%s_depth_of_coverage.sample_summary" % (
                args.output_folder, args.analysis_name)
            if not os.path.exists(gatk_depth_of_coverage_file):
                gatk_depth_of_coverage_file = coverage_depth_stats()
            final_raw_vcf, final_raw_indel_vcf = varcall()
            filter(gatk_depth_of_coverage_file)
            stats()

        elif steps_list[0] == "filter":
            #Sanity Check Post-varcall vcf and other files here
            out_sorted_bam = "%s/%s_aln_sort.bam" % (args.output_folder,
                                                     args.analysis_name)
            gatk_depth_of_coverage_file = "%s/%s_depth_of_coverage.sample_summary" % (
                args.output_folder, args.analysis_name)
            if not os.path.exists(gatk_depth_of_coverage_file):
                gatk_depth_of_coverage_file = coverage_depth_stats()
            final_raw_vcf = "%s/%s_aln_mpileup_raw.vcf_5bp_indel_removed.vcf" % (
                args.output_folder, args.analysis_name)
            filter(gatk_depth_of_coverage_file)
            stats()
        elif steps_list[0] == "stats":
            #Sanity check BAM and vcf files
            # out_sorted_bam must be set before coverage_depth_stats() can
            # run, because that step reads it from this scope.
            out_sorted_bam = "%s/%s_aln_sort.bam" % (args.output_folder,
                                                     args.analysis_name)
            gatk_depth_of_coverage_file = "%s/%s_depth_of_coverage.sample_summary" % (
                args.output_folder, args.analysis_name)
            if not os.path.exists(gatk_depth_of_coverage_file):
                gatk_depth_of_coverage_file = coverage_depth_stats()
            final_raw_vcf = "%s/%s_aln_mpileup_raw.vcf_5bp_indel_removed.vcf" % (
                args.output_folder, args.analysis_name)
            stats()
        else:
            keep_logging(
                'Seems like the Analysis Steps are not in sequential order. Please recheck the -steps argument and run the pipeline again',
                'Seems like the Analysis Steps are not in sequential order. Please recheck the -steps argument and run the pipeline again',
                logger, 'exception')
Ejemplo n.º 9
0
def get_scheduler_directive(scheduler, Config):
    """Build the job-script header lines for the requested cluster scheduler.

    Args:
        scheduler: Scheduler name. "PBS" selects PBS directives; "SLURM" and
            every other value (including an empty/None value) select SLURM
            directives.
        Config: Config object handed to ConfigSectionMap.

    Returns:
        Tuple ``(scheduler_directives, script_Directive, job_name_flag)``:
        the multi-line directive text, the directive prefix ("#SBATCH" or
        "#PBS") and the flag used to name a job ("--job-name=" or "-N").
    """
    if scheduler == "PBS":
        script_Directive = "#PBS"
        job_name_flag = "-N"
        section = "scheduler"
        template = "#PBS -M %s\n#PBS -m %s\n#PBS -V\n#PBS -l %s\n#PBS -q %s\n#PBS -A %s\n#PBS -l qos=flux\n"
        fields = ('email', 'notification', 'resources', 'queue',
                  'flux_account')
    else:
        # SLURM is both an explicit choice and the fallback for anything
        # that is not PBS.
        script_Directive = "#SBATCH"
        job_name_flag = "--job-name="
        section = "slurm"
        template = "#SBATCH --mail-user=%s\n#SBATCH --mail-type=%s\n#SBATCH --export=ALL\n#SBATCH --partition=%s\n#SBATCH --account=%s\n#SBATCH %s\n"
        fields = ('email', 'notification', 'partition', 'flux_account',
                  'resources')

    # One ConfigSectionMap lookup per field, in template order.
    values = tuple(ConfigSectionMap(section, Config)[field]
                   for field in fields)
    scheduler_directives = template % values
    return scheduler_directives, script_Directive, job_name_flag
Ejemplo n.º 10
0
def picard_seqdict(reference_filename, reference):
    """Run Picard CreateSequenceDictionary for a reference FASTA.

    The dictionary is named after the reference file's basename with its
    extension replaced by ".dict" and is written into the 'ref_path'
    directory configured for ``reference``.

    Args:
        reference_filename: Path of the reference FASTA passed as REFERENCE=.
        reference: Config section name whose 'ref_path' is the output folder.

    NOTE(review): ``base_cmd`` and ``Config`` are not parameters; they are
    presumably module-level names -- confirm they are defined before calling.
    """
    reference_stem = os.path.splitext(os.path.basename(reference_filename))[0]
    dict_name = reference_stem + ".dict"
    command_fields = (base_cmd, reference_filename,
                      ConfigSectionMap(reference, Config)['ref_path'],
                      dict_name)
    cmd = "java -jar %s CreateSequenceDictionary REFERENCE=%s OUTPUT=%s/%s" % command_fields
    print("\nRunning:\n [%s] \n" % cmd)
    os.system(cmd)
Ejemplo n.º 11
0
# Write (or overwrite) <out_dir>/README.md from the metadata config given in
# args.readme_meta.
#
# The ``global`` statements are no-ops at module scope and are retained
# unchanged.
global Config_readme
global logger

# Timestamp of this run, formatted as YYYY_MM_DD_HH_MM_SS.
log_unique_time = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')

Config_readme = ConfigParser.ConfigParser()
Config_readme.read(args.readme_meta)

readme_file = args.out_dir + "/README.md"

print(readme_file)

if not os.path.isfile(readme_file):
    # NOTE(review): a brand-new README only receives the submitter line (with
    # no trailing newline), while the overwrite branch below writes every
    # field -- confirm whether that asymmetry is intended.
    with open(readme_file, 'w+') as f:
        f.write("Request submitted by: %s" %
                ConfigSectionMap("Main", Config_readme)['submitter'])

else:
    print("README file already exists: Overwriting this file")
    # The context manager closes (and therefore flushes) the handle; the
    # previous code left it open in this branch.
    with open(readme_file, 'w+') as f:
        f.write("Request submitted by: %s\n" %
                ConfigSectionMap("Main", Config_readme)['submitter'])
        f.write("Project Name: %s\n" %
                ConfigSectionMap("Main", Config_readme)['project_name'])
        f.write("Date when pipeline was run: %s\n" %
                ConfigSectionMap("Main", Config_readme)['date'])
        f.write("Piepline Version: %s\n" %
                ConfigSectionMap("Main", Config_readme)['version'])
        f.write("Comments: %s\n" %
                ConfigSectionMap("Description", Config_readme)['comments'])
Ejemplo n.º 12
0
def abacas(reference_genome_path, final_l500_contig, out_path, first_part,
           logger, Config):
    """Reorder contigs with ABACAS and merge them into a single FASTA record.

    Runs ABACAS, concatenates its MULTIFASTA and contigsInbin outputs,
    replaces every FASTA header line with a fixed linker sequence, strips all
    whitespace and prepends a single ">first_part" header.

    Args:
        reference_genome_path: Reference passed to ABACAS with -r.
        final_l500_contig: Query contigs file passed to ABACAS with -q.
        out_path: Directory that receives all intermediate and final files.
        first_part: Sample prefix used for output file names and the header.
        logger: Logger handed to keep_logging and call.
        Config: Config object handed to ConfigSectionMap.

    Returns:
        Path "<out_path>/<first_part>_contigs_ordered.fasta".

    Exits the process with status 1 if any command raises
    sp.CalledProcessError.
    """
    keep_logging('Contig Reordering using ABACAS',
                 'Contig Reordering using ABACAS', logger, 'info')

    abacas_cmd = "perl %s/%s/%s -r %s -q %s %s -o %s/%s_contigs_ordered" % (
        ConfigSectionMap("bin_path", Config)['binbase'],
        ConfigSectionMap("abacas", Config)['abacas_bin'],
        ConfigSectionMap("abacas", Config)['base_cmd'], reference_genome_path,
        final_l500_contig, ConfigSectionMap(
            "abacas", Config)['abacas_parameters'], out_path, first_part)

    fasta_header = ">%s" % first_part
    header_cmd = "echo \"%s\" > %s/fasta_header" % (fasta_header, out_path)

    abacas_ordered_multifasta = "%s/%s_contigs_ordered.MULTIFASTA.fa" % (
        out_path, first_part)
    abacas_ordered_contigsInbin = "%s/%s_contigs_ordered.contigsInbin.fas" % (
        out_path, first_part)
    join_all_contigs = "cat %s %s > %s/all_contigs.fasta" % (
        abacas_ordered_multifasta, abacas_ordered_contigsInbin, out_path)

    # Every header line becomes a linker so the contigs form one sequence.
    add_linker = "sed -i 's/>.*/NNNNNCATTCCATTCATTAATTAATTAATGAATGAATGNNNNN/g' %s/all_contigs.fasta" % out_path
    remove_spaces = "tr -d '[:space:]' < %s/all_contigs.fasta > %s/all_contigs.fasta_changed.fasta" % (
        out_path, out_path)
    join_files = "cat %s/fasta_header %s/all_contigs.fasta_changed.fasta > %s/%s_contigs_ordered.fasta" % (
        out_path, out_path, out_path, first_part)

    def _log_and_call(shell_cmd):
        # Record the exact command at debug level, then execute it.
        keep_logging(shell_cmd, shell_cmd, logger, 'debug')
        call(shell_cmd, logger)

    try:
        _log_and_call(abacas_cmd)
        print(header_cmd)
        # The header command itself is logged here; the previous code logged
        # abacas_cmd a second time at this point.
        _log_and_call(header_cmd)
        _log_and_call(join_all_contigs)
        _log_and_call(add_linker)
        _log_and_call(remove_spaces)
        _log_and_call(join_files)
    except sp.CalledProcessError:
        keep_logging('Error in reordering Contigs using Abacas. Exiting.',
                     'Error in reordering Contigs using Abacas. Exiting.',
                     logger, 'exception')
        sys.exit(1)

    final_ordered_contigs = "%s/%s_contigs_ordered.fasta" % (out_path,
                                                             first_part)
    return final_ordered_contigs
Ejemplo n.º 13
0
def _spades_result_paths(out_path):
    """Return (contigs, scaffolds, plasmid_contigs, plasmid_scaffolds) paths under out_path."""
    contigs = out_path + "spades_results" + "/contigs.fasta"
    scaffolds = out_path + "spades_results" + "/scaffolds.fasta"
    plasmid_contigs = out_path + "spades_plasmid_results" + "/contigs.fasta"
    # NOTE(review): the plasmid scaffolds path points at contigs.fasta, as it
    # always has in this function -- confirm whether scaffolds.fasta was meant.
    plasmid_scaffolds = out_path + "spades_plasmid_results" + "/contigs.fasta"
    return contigs, scaffolds, plasmid_contigs, plasmid_scaffolds


def _copy_wga_results(out_path):
    """Copy the whole-genome contigs and scaffolds files into out_path."""
    contigs, scaffolds = _spades_result_paths(out_path)[:2]
    cp_cmdstring = "cp %s %s %s" % (contigs, scaffolds, out_path)
    os.system(cp_cmdstring)


def _run_spades_pair(cmdstring, plasmid_cmdstring, out_path):
    """Run the SPAdes and plasmid SPAdes commands with os.system, echoing progress."""
    print("Running: %s \n" % cmdstring)
    print("Running: %s \n" % plasmid_cmdstring)
    os.system(cmdstring)
    os.system(plasmid_cmdstring)
    print("Spades assembly results can be found in " + out_path +
          "spades_results")
    print("plasmid Spades assembly results can be found in " + out_path +
          "spades_plasmid_results")
    # Copy final contigs/scaffolds file to output directory
    _copy_wga_results(out_path)
    print("\n################## End: SPADES ASSEMBLY ##################\n")


def _run_wga_assembly(cmdstring, out_path, logger):
    """Run the whole-genome SPAdes command via call(); exit(1) on CalledProcessError."""
    try:
        keep_logging(cmdstring, cmdstring, logger, 'debug')
        call(cmdstring, logger)
        # Copy final contigs/scaffolds file to output directory
        _copy_wga_results(out_path)
        print("")
        keep_logging(
            'Spades assembly results can be found in {}spades_results'.
            format(out_path),
            'Spades assembly results can be found in {}spades_results'.
            format(out_path), logger, 'info')
    except sp.CalledProcessError:
        keep_logging(
            'Error in Spades Assembly step. Exiting. Please check spades.log file in spades_results folder',
            'Error in Spades Assembly step. Exiting. Please check spades.log file in spades_results folder',
            logger, 'exception')
        sys.exit(1)


def _run_plasmid_assembly(plasmid_cmdstring, out_path, logger):
    """Run the plasmid SPAdes command via call(); exit(1) on CalledProcessError."""
    try:
        keep_logging(plasmid_cmdstring, plasmid_cmdstring, logger, 'debug')
        call(plasmid_cmdstring, logger)
        print("")
        keep_logging(
            'Spades plasmid assembly results can be found in {}spades_plasmid_results'
            .format(out_path),
            'Spades plasmid assembly results can be found in {}spades_plasmid_results'
            .format(out_path), logger, 'info')
    except sp.CalledProcessError:
        keep_logging(
            'Error in Spades Plasmid Assembly step. Exiting. Please check spades.log file in spades_results folder',
            'Error in Spades Plasmid Assembly step. Exiting. Please check spades.log file in spades_results folder',
            logger, 'exception')
        sys.exit(1)


def spades_assembly(forward_paired, reverse_paired, forward_unpaired,
                    reverse_unpaired, out_path, logger, Config, do_assembly):
    """Assemble reads with SPAdes and/or plasmid SPAdes.

    Which inputs are used depends on the (paired, unpaired) flags returned by
    check_cleanreads:

    * ("0", "0"): the forward/reverse paired arguments are treated as raw
      FASTQ files and both assemblies are run.
    * ("1", "0"): only the paired reads are used; if reverse_paired and
      reverse_unpaired are both the string "None", forward_paired is passed
      as a single-end library (--s1).
    * ("0", "1"): placeholder branch, not implemented.
    * otherwise: paired plus unpaired reads are used (unpaired reads are
      dropped when get_uncompressed_size reports either file as 0), and
      ``do_assembly`` selects which assemblies run.

    Args:
        forward_paired, reverse_paired: Paired read files.
        forward_unpaired, reverse_unpaired: Unpaired read files.
        out_path: Output prefix; result folder names are appended directly,
            so it is expected to end with a path separator.
        logger: Logger handed to keep_logging and call.
        Config: Config object handed to ConfigSectionMap.
        do_assembly: "both", "wga" or "plasmid"; only consulted in the last
            case above.

    Returns:
        Tuple (contigs, scaffolds, plasmid_contigs, plasmid_scaffolds) of
        result paths. The paths are returned whether or not the
        corresponding assembly was run.

    Raises:
        ValueError: if ``do_assembly`` is not one of the three known modes
            when it is consulted (this used to surface as an
            UnboundLocalError at the return statement).
    """
    # check if the clean reads from Trimmomatic exists in the output folder.
    # Set the paired and unpaired string constants based on their availability
    (paired, unpaired) = check_cleanreads(forward_paired, reverse_paired,
                                          forward_unpaired, reverse_unpaired)
    result_paths = _spades_result_paths(out_path)
    paired_inputs = " --pe1-1 " + forward_paired + " --pe1-2 " + reverse_paired

    # Pending Changes
    if paired == "0" and unpaired == "0":
        # Clean Paired and unpaired reads doesn't exist. Take raw Input PE files for assembly
        message = "No clean Paired and unpaired reads. Considering forward_paired and reverse_paired as raw Fastq files for assembly.\n"
        print(message)
        spades_cfg = ConfigSectionMap("spades", Config)
        cmdstring = spades_cfg['base_cmd'] + paired_inputs + " -o " + out_path + "spades_results " + spades_cfg['spades_parameters']
        plasmid_cmdstring = spades_cfg['base_cmd'] + paired_inputs + " -o " + out_path + "spades_plasmid_results " + spades_cfg['plasmid_spades_parameters']
        # The plasmid command used to be passed to os.system as a keyword
        # argument, which raises TypeError; it is now run like the others.
        _run_spades_pair(cmdstring, plasmid_cmdstring, out_path)
        # Returns four paths like every other branch (this branch previously
        # could not reach its two-value return at all).
        return result_paths

    # Pending Changes
    if paired == "1" and unpaired == "0":
        # Only clean Paired PE files exists. Take these files for assembly input.
        message = "Taking only paired reads for assembly.\n"
        print(message)
        if reverse_paired == "None" and reverse_unpaired == "None":
            read_inputs = " --s1 " + forward_paired
        else:
            ##pending changes
            read_inputs = paired_inputs
        spades_cfg = ConfigSectionMap("spades", Config)
        cmdstring = spades_cfg['base_cmd'] + read_inputs + " -o " + out_path + "spades_results/ " + spades_cfg['spades_parameters']
        plasmid_cmdstring = spades_cfg['base_cmd'] + read_inputs + " -o " + out_path + "spades_plasmid_results/ " + spades_cfg['plasmid_spades_parameters']
        _run_spades_pair(cmdstring, plasmid_cmdstring, out_path)
        return result_paths

    # Pending Changes
    if paired == "0" and unpaired == "1":
        # Only clean unpaired PE files exists. Pending...
        # TODO: this branch is a placeholder; the "command" below is not a
        # real SPAdes invocation and is kept only to preserve the existing
        # flow until single-read support is implemented.
        cmdstring = "This can be single reads......"
        print("Running: %s \n" % cmdstring)
        os.system(cmdstring)
        print("Spades assembly results can be found in " + out_path +
              "spades_results")
        # Copy final contigs/scaffolds file to output directory
        _copy_wga_results(out_path)
        print("\n################## End: SPADES ASSEMBLY ##################\n")
        # The plasmid paths were never assigned here before (NameError).
        return result_paths

    # Clean paired and unpaired files exists. Take all these files as input.
    spades_cfg = ConfigSectionMap("spades", Config)

    # Check if unpaired files are empty
    fwd_unpaired_size = get_uncompressed_size(forward_unpaired)
    rev_unpaired_size = get_uncompressed_size(reverse_unpaired)
    if fwd_unpaired_size == 0 or rev_unpaired_size == 0:
        read_inputs = paired_inputs
    else:
        read_inputs = paired_inputs + " --pe1-s " + forward_unpaired + " --pe1-s " + reverse_unpaired

    cmdstring = spades_cfg['base_cmd'] + " " + spades_cfg['spades_parameters'] + read_inputs + " -o " + out_path + "spades_results"
    plasmid_cmdstring = spades_cfg['base_cmd'] + " " + spades_cfg['plasmid_spades_parameters'] + read_inputs + " -o " + out_path + "spades_plasmid_results"

    if do_assembly == "both":
        keep_logging('Running Spades and plasmid Spades assembly',
                     'Running Spades and plasmid Spades assembly', logger,
                     'debug')
        _run_wga_assembly(cmdstring, out_path, logger)
        _run_plasmid_assembly(plasmid_cmdstring, out_path, logger)
    elif do_assembly == "wga":
        keep_logging('Running Spades assembly', 'Running Spades assembly',
                     logger, 'debug')
        _run_wga_assembly(cmdstring, out_path, logger)
    elif do_assembly == "plasmid":
        keep_logging('Running plasmid Spades assembly',
                     'Running plasmid Spades assembly', logger, 'debug')
        _run_plasmid_assembly(plasmid_cmdstring, out_path, logger)
    else:
        raise ValueError(
            "do_assembly must be 'both', 'wga' or 'plasmid', got %r" %
            (do_assembly,))

    return result_paths
Ejemplo n.º 14
0
def extract_only_ref_variant_fasta_unique_positions():
    """Build per-sample allele FASTA/VCF files from the SNP allele matrix.

    Reads "<filter2_only_snp_vcf_dir>/SNP_matrix_allele_new.csv"
    (tab-delimited; first column holds position labels, first row holds
    sample names). For every sample column whose name matches the sample
    derived from args.filter2_only_snp_vcf_filename it:

    * writes "<sample>_allele_variants.fa" with the concatenated alleles,
    * writes "<sample>_ref_allele_variants.vcf" with one record per position,
    * appends the bgzip / tabix / vcf-consensus / sed commands to
      "consensus_ref_allele_variant.sh" and also runs each of them.

    Non-matching columns only produce a printed message. Takes no arguments
    (it reads the module-level ``args`` and ``Config``) and returns None.

    NOTE(review): ``get_reference.keys()[0]`` indexes the result of keys()
    directly, which assumes keys() returns a list.
    """
    # Get reference genome ID
    get_reference = Fasta(args.reference)
    if len(get_reference.keys()) == 1:
        ref_id = get_reference.keys()

    # Read the matrix once and close the handle (it used to be opened twice
    # and never closed).
    matrix_path = '%s/SNP_matrix_allele_new.csv' % args.filter2_only_snp_vcf_dir
    with open(matrix_path, 'r') as matrix_handle:
        rows = list(csv.reader(matrix_handle, delimiter='\t'))
    columns = list(zip(*rows))
    ncol = len(rows[0])

    # Position labels are space separated; the numeric position is the 4th
    # token, or the 3rd when the label starts with "None".
    unique_position_array = []
    for label in columns[0][1:]:
        label_tokens = label.split(' ')
        if label_tokens[0] != "None":
            unique_position_array.append(int(label_tokens[3]))
        else:
            unique_position_array.append(int(label_tokens[2]))

    expected_sample = os.path.basename(args.filter2_only_snp_vcf_filename).replace('_filter2_final.vcf_no_proximate_snp.vcf', '')
    grab_vcf_filename = len(expected_sample)

    for i in range(1, ncol):
        # Column header truncated to the length of the expected sample name.
        sample_name_re = columns[i][0][:grab_vcf_filename]

        # Substring containment also covers exact equality.
        if sample_name_re not in expected_sample:
            print("Sample name %s does not match with column name %s" % (expected_sample, sample_name_re))
            continue

        vcf_header = "##fileformat=VCFv4.2\n#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t%s\n" % sample_name_re

        # For cells such as "A/T" only the first character is kept.
        alleles = [ntd[0] if "/" in ntd else ntd for ntd in columns[i][1:]]
        print_string = ">%s\n" % sample_name_re + "".join(alleles) + "\n"

        sample_fasta_path = "%s/%s_allele_variants.fa" % (args.filter2_only_snp_vcf_dir, sample_name_re)
        with open(sample_fasta_path, 'w+') as allele_variant_fasta:
            allele_variant_fasta.write(print_string)

        # This file was opened with 'w+' but never written or closed; keep
        # the create/truncate side effect and release the handle at once.
        open("%s/%s_ref_allele_variants.fa" % (args.filter2_only_snp_vcf_dir, sample_name_re), 'w+').close()

        vcf_filename = "%s/%s_ref_allele_variants.vcf" % (args.filter2_only_snp_vcf_dir, sample_name_re)
        with open(vcf_filename, 'w+') as allele_ref_variant_vcf:
            allele_ref_variant_vcf.write(vcf_header)

            # The object is not used afterwards; the construction is kept in
            # case it has side effects such as building an index -- TODO
            # confirm.
            get_sample_reference = Fasta(sample_fasta_path)

            # Positions and alleles are parallel sequences; pairing them
            # avoids a list.index() scan per position, which was quadratic
            # and picked the first row for any repeated position.
            for positions, allele_var in zip(unique_position_array, alleles):
                ref_allele = str(get_reference.sequence({'chr': str(get_reference.keys()[0]), 'start': int(positions), 'stop': int(positions)}))
                generate_vcf_string = "%s\t%s\t.\t%s\t%s\t221.999\t.\t.\t.\n" % (ref_id[0].split(' ')[0], positions, ref_allele, allele_var)
                allele_ref_variant_vcf.write(generate_vcf_string)

        filename = "%s/consensus_ref_allele_variant.sh" % args.filter2_only_snp_vcf_dir
        base_tabix_bin = "%s/%s" % (ConfigSectionMap("bin_path", Config)['binbase'], ConfigSectionMap("vcftools", Config)['tabix_bin'])

        # Each command is both appended to the shell script and run now.
        with open(filename, 'a+') as f1:
            bgzip_cmd = "%s/bgzip -f %s\n" % (base_tabix_bin, vcf_filename)
            f1.write(bgzip_cmd)
            subprocess.call([bgzip_cmd], shell=True)

            tabix_cmd = "%s/tabix -f -p vcf %s.gz\n" % (base_tabix_bin, vcf_filename)
            f1.write(tabix_cmd)
            subprocess.call([tabix_cmd], shell=True)

            base_vcftools_bin = ConfigSectionMap("bin_path", Config)['binbase'] + "/" + ConfigSectionMap("vcftools", Config)['vcftools_bin']
            # The consensus FASTA path is relative to the current working
            # directory.
            fasta_cmd = "cat %s | %s/vcf-consensus %s.gz > %s_ref_allele_variants.fa\n" % (args.reference, base_vcftools_bin, vcf_filename, sample_name_re)
            f1.write(fasta_cmd)
            subprocess.call([fasta_cmd], shell=True)

            # Rename the consensus record after the sample.
            sed_command = "sed -i 's/>.*/>%s/g' %s_ref_allele_variants.fa\n" % (sample_name_re, sample_name_re)
            subprocess.call([sed_command], shell=True)
            f1.write(sed_command)
Ejemplo n.º 15
0
    # Tail of the pipeline driver: load the config file, create the logger,
    # normalise the output folder, resolve the reference path, run
    # pipeline() and log the elapsed time.
    Config.read(config_file)

    # NOTE(review): the logger is created from args.output_folder before the
    # folder is normalised and created below -- confirm generate_logger does
    # not need the directory to exist yet.
    logger = generate_logger(args.output_folder, analysis_string, log_unique_time)




    # Set output directory paths
    if args.output_folder != '':
        args.output_folder += '/'
        make_sure_path_exists(args.output_folder)
    # NOTE(review): this second block appends another '/', so a non-empty
    # output folder ends in '//' from here on -- looks like a duplicated
    # statement; confirm before removing.
    if args.output_folder != '':
        args.output_folder += '/'

    # Set reference genome path to map the samples and calculate coverage depth
    if "coverage_depth" in args.analysis_names:
        try:
            # Reference path is "<ref_path>/<ref_name>" from the config
            # section named by args.reference.
            reference = ConfigSectionMap(args.reference, Config)['ref_path'] + "/" + ConfigSectionMap(args.reference, Config)['ref_name']
        except OSError as exception:
            # NOTE(review): only OSError is handled here; whether a failed
            # config lookup raises OSError is not visible in this fragment.
            # If an OSError with errno EEXIST is caught, `reference` is left
            # unassigned and the pipeline() call below will fail.
            if exception.errno != errno.EEXIST:
                print "Please provide reference genome name or Check the reference genome path in config file.\n"
                exit()
    else:
        # Placeholder string used when no coverage_depth analysis is requested.
        reference = "NONE"

    # Main Workflow
    pipeline(args, logger, Config, args.output_folder, args.prefix, reference)

    keep_logging('End: Pipeline\n', 'End: Pipeline', logger, 'info')
    # start_time_2 is set outside this fragment; the difference is the
    # elapsed time reported in the final log line.
    time_taken = datetime.now() - start_time_2
    keep_logging('Total Time taken: {}'.format(time_taken), 'Total Time taken: {}'.format(time_taken), logger, 'info')
Ejemplo n.º 16
0
def coverage_depth_analysis(filenames_array, Config, logger, output_folder,
                            type, samples, coverage_depth_directory, cluster,
                            reference, scheduler):
    """Generate the coverage-depth command script or cluster job per sample.

    For every forward read file the command list is assembled by chaining
    align_bwa -> samtobam -> sort_bam -> index_bam -> gatk_DepthOfCoverage ->
    flagstat. The resulting commands are submitted through
    generate_cluster_jobs() when ``cluster == "cluster"``; otherwise they are
    written to ``<coverage_depth_directory>/<sample>_commands.sh``.

    Parameters:
        filenames_array: iterable of forward read file paths.
        Config: configuration object handed to ConfigSectionMap() and helpers.
        logger: logger handed to keep_logging() and helpers.
        output_folder: working directory used for uncompressed SE input.
        type: "PE" or "SE"; any other value makes the function a no-op.
        samples: accepted for interface compatibility; not used here.
        coverage_depth_directory: directory for results and command scripts.
        cluster: "cluster" to submit jobs, anything else to write scripts.
        reference: reference genome path forwarded to the helpers.
        scheduler: scheduler name forwarded to generate_cluster_jobs().

    Returns:
        None.

    Changes against the previous revision:
        * the PE branch for non-".gz" input unpacked two values from helpers
          that return three and read an undefined ``out_sorted_bam``; it now
          shares the working code path of the ".gz" branch;
        * the SE branch read ``command_list`` and ``analysis`` before they were
          assigned and called generate_cluster_jobs() without ``scheduler``;
        * the SE log message now carries the file name instead of a bare "%s";
        * command scripts are written through a context manager so the file
          handle is closed.
    """
    files_to_delete = []
    if type == "PE":
        for forward_file in filenames_array:
            reverse_file, first_part = _split_fastq_pair_names(forward_file)
            file_prefix = coverage_depth_directory + "/" + first_part
            coverage_depth_cmd, files_to_delete = _build_coverage_depth_commands(
                forward_file, reverse_file, first_part,
                ConfigSectionMap("bwa", Config)['base_cmd'],
                coverage_depth_directory, coverage_depth_directory,
                reference, files_to_delete, type, Config, logger)
            keep_logging('', coverage_depth_cmd, logger, 'debug')
            _submit_coverage_depth_commands(coverage_depth_cmd, file_prefix,
                                            cluster, scheduler, Config, logger)
    elif type == "SE":
        # NOTE(review): the original SE branch was marked "Pending Changes" and
        # could not run; the bwa command and directories below keep the values
        # it intended to use -- confirm them against align_bwa().
        for forward_file in filenames_array:
            reverse_file, first_part = _split_fastq_pair_names(forward_file)
            file_prefix = coverage_depth_directory + "/" + first_part
            if forward_file.endswith('.gz'):
                bwa_cmd = ConfigSectionMap("bwa", Config)['bwa_bin']
                work_dir = coverage_depth_directory
            else:
                bwa_cmd = (ConfigSectionMap("bin_path", Config)['binbase'] +
                           ConfigSectionMap("bwa", Config)['bwa_bin'])
                work_dir = output_folder
            coverage_depth_cmd, files_to_delete = _build_coverage_depth_commands(
                forward_file, reverse_file, first_part, bwa_cmd, work_dir,
                coverage_depth_directory, reference, files_to_delete, type,
                Config, logger)
            header = "The coverage Depth commands for file %s are:\n" % forward_file
            keep_logging(header, header, logger, 'info')
            keep_logging(coverage_depth_cmd, coverage_depth_cmd, logger,
                         'debug')
            _submit_coverage_depth_commands(coverage_depth_cmd, file_prefix,
                                            cluster, scheduler, Config, logger)


def _split_fastq_pair_names(forward_file):
    """Return ``(reverse_file, first_part)`` derived from a forward read path.

    The first known forward suffix found in the base name decides how the
    mate file name is built; ``first_part`` is the base name in front of that
    suffix with any "_L001" removed.
    """
    filename_base = os.path.basename(forward_file)
    # Order matters: the first suffix contained in the base name wins.
    suffix_pairs = (
        ("R1_001_final.fastq.gz", "R2_001_final.fastq.gz"),
        ("R1.fastq.gz", "R2.fastq.gz"),
        ("1_combine.fastq.gz", "2_combine.fastq.gz"),
        ("1_sequence.fastq.gz", "2_sequence.fastq.gz"),
        ("_forward.fastq.gz", "_reverse.fastq.gz"),
        ("R1_001.fastq.gz", "R2_001.fastq.gz"),
        ("_1.fastq.gz", "_2.fastq.gz"),
    )
    for forward_suffix, reverse_suffix in suffix_pairs:
        if forward_suffix in filename_base:
            reverse_file = forward_file.replace(forward_suffix, reverse_suffix)
            first_part = filename_base.split(forward_suffix)[0]
            break
    else:
        print("Using Standard second file naming convention")
        reverse_file = forward_file.replace("_R1_", "_R2_")
        first_part = filename_base.split('_R1.fastq.gz')[0]
    return reverse_file, first_part.replace('_L001', '')


def _build_coverage_depth_commands(forward_file, reverse_file, first_part,
                                   bwa_cmd, work_dir, flagstat_dir, reference,
                                   files_to_delete, read_type, Config, logger):
    """Chain the per-sample helpers and return ``(command_text, files_to_delete)``.

    ``command_text`` holds one command per line. The helper call signatures
    and return shapes follow the paired-end ".gz" branch of the original code.
    """
    keep_logging("Generating command list to create cluster jobs",
                 "Generating command list to create cluster jobs",
                 logger, 'info')
    analysis = first_part
    command_list = []
    split_field = prepare_readgroup(forward_file, logger)
    command_list, files_to_delete, out_sam = align_bwa(
        bwa_cmd, forward_file, reverse_file, work_dir, reference,
        split_field, first_part, files_to_delete, logger, Config, read_type,
        command_list)
    command_list, files_to_delete, out_bam = samtobam(
        out_sam, work_dir, analysis, files_to_delete, logger, Config,
        command_list)
    command_list, files_to_delete, out_sort_bam = sort_bam(
        out_bam, work_dir, analysis, logger, Config, command_list,
        files_to_delete)
    command_list = index_bam(out_sort_bam, work_dir, logger, Config,
                             command_list, files_to_delete)
    # The depth-of-coverage output path is returned but not needed here.
    command_list, _ = gatk_DepthOfCoverage(
        out_sort_bam, work_dir, analysis, reference, logger, Config,
        command_list)
    command_list = flagstat(out_sort_bam, flagstat_dir, analysis, logger,
                            Config, command_list)
    return "".join(cmd + "\n" for cmd in command_list), files_to_delete


def _submit_coverage_depth_commands(coverage_depth_cmd, file_prefix, cluster,
                                    scheduler, Config, logger):
    """Submit the commands as a cluster job or write ``<prefix>_commands.sh``."""
    if cluster == "cluster":
        generate_cluster_jobs(coverage_depth_cmd, file_prefix, scheduler,
                              Config, logger)
    else:
        with open(file_prefix + '_commands.sh', 'w+') as script:
            script.write(coverage_depth_cmd)
Ejemplo n.º 17
0
def pipeline(args, logger, Config, output_folder, prefix, reference):
    """Validate the sample list and run every analysis named in ``args``.

    Parameters:
        args: parsed command-line namespace; this function reads ``samples``,
            ``directory``, ``type``, ``analysis_names``, ``cluster``,
            ``output_folder``, ``prefix``, ``size``, ``downsample``,
            ``scheduler``, ``dryrun`` and ``mlst_db``.
        logger: logger handed to keep_logging() and the analysis functions.
        Config: configuration object handed to the analysis functions.
        output_folder: destination folder for the copied sample list and
            for analysis output.
        prefix: prefix used for the coverage and summary reports.
        reference: reference genome path used by the coverage-depth analysis.

    Returns:
        None.

    Changes against the previous revision: blank lines in the sample list are
    skipped instead of being turned into a bogus "<directory>/" entry, and the
    sample list is copied with an argument list rather than a shell string so
    paths containing spaces or shell metacharacters are handled.
    """
    # Imported locally so this block does not depend on the module's imports.
    import subprocess

    keep_logging('\nSTART: Pipeline', 'START: Pipeline', logger, 'info')

    # Check subroutines: arguments, input files, reference index.
    keep_logging('Checking Dependencies...', 'Checking Dependencies', logger, 'info')

    # Check java availability.
    java_check()

    # Check that every input file exists.
    # NOTE(review): filenames_array is not created in this function; it is
    # presumably a module-level list -- confirm.
    with open(args.samples) as fp:
        for line in fp:
            line = line.strip()
            if not line:
                continue
            sample_path = args.directory + "/" + line
            filenames_array.append(sample_path)
            if args.type != "PE":
                file_exists(sample_path, "None")
            else:
                file_exists(sample_path, sample_path)

    keep_logging('Total no. of Samples %s' % len(filenames_array), 'Total no. of Samples %s' % len(filenames_array), logger, 'info')

    # Start the pipeline.
    analysis_list = args.analysis_names.split(',')

    keep_logging('Running analysis - %s' % args.analysis_names, 'Running analysis - %s' % args.analysis_names,
                 logger, 'info')

    # Copy the sample list to the output folder; best effort, as before: a
    # failing cp reports on stderr and the pipeline carries on.
    subprocess.call(["cp", args.samples, output_folder])

    # Default to local execution when no cluster mode was requested.
    if args.cluster:
        cluster = args.cluster
    else:
        cluster = "local"

    # Run each requested analysis in the order given.
    for analysis in analysis_list:
        if analysis == "coverage":
            keep_logging("Step: Calculating Coverage...\n", "Calculating Coverage", logger, 'info')
            coverage(filenames_array, Config, logger, output_folder, args.type, args.samples, args.size, prefix)
        elif analysis == "quality":
            keep_logging("Step: Analysing Fastqc Quality...\n", "Analysing Fastqc Quality...", logger, 'info')
            fastqc_main_directory = args.output_folder + "/%s_Fastqc" % args.prefix
            make_sure_path_exists(fastqc_main_directory)
            fastqc_forward_directory = fastqc_main_directory + "/%s_Forward" % args.prefix
            make_sure_path_exists(fastqc_forward_directory)
            fastqc_reverse_directory = fastqc_main_directory + "/%s_Reverse" % args.prefix
            make_sure_path_exists(fastqc_reverse_directory)
            Multiqc_reports_directory = args.output_folder + "/%s_Multiqc_reports" % args.prefix
            make_sure_path_exists(Multiqc_reports_directory)
            quality(filenames_array, Config, logger, output_folder, args.type, args.samples, fastqc_forward_directory, fastqc_reverse_directory)
            multiqc(fastqc_forward_directory, "%s_Forward_fastqc" % args.prefix, Config, logger, Multiqc_reports_directory)
            multiqc(fastqc_reverse_directory, "%s_Reverse_fastqc" % args.prefix, Config, logger, Multiqc_reports_directory)
        elif analysis == "screen_contamination":
            keep_logging("Step: Screening Fastq reads against Reference Database...\n", "Screening Fastq reads against Reference Database...", logger, 'info')
            fastq_screen_directory = args.output_folder + "/%s_Fastqc_screen" % args.prefix
            make_sure_path_exists(fastq_screen_directory)
            screen_contamination(filenames_array, Config, logger, output_folder, args.type, args.samples, fastq_screen_directory, cluster)
            Multiqc_reports_directory = args.output_folder + "/%s_Multiqc_reports" % args.prefix
            make_sure_path_exists(Multiqc_reports_directory)
            multiqc(fastq_screen_directory, "%s_Fastq_screen" % args.prefix, Config, logger, Multiqc_reports_directory)
            keep_logging('MultiQC Report of FastQC results can be found in - %s\n' % Multiqc_reports_directory, 'MultiQC Report of FastQC results can be found in - %s\n' % Multiqc_reports_directory, logger, 'info')
        elif analysis == "kraken_contamination":
            keep_logging("Step: Running Kraken on Input reads...\n", "Running Kraken on Input reads...", logger, 'info')
            kraken_directory = args.output_folder + "/%s_Kraken_results" % args.prefix
            make_sure_path_exists(kraken_directory)
            kraken_contamination(filenames_array, Config, logger, output_folder, args.type, args.samples, kraken_directory, cluster, args.downsample, args.scheduler, args.size, args.dryrun)
        elif analysis == "kraken_report":
            keep_logging("Step: Generating Kraken report on Kraken Results...\n", "Generating Kraken report on Kraken Results...", logger, 'info')
            kraken_directory = args.output_folder + "/%s_Kraken_results" % args.prefix
            make_sure_path_exists(kraken_directory)
            kraken_report(filenames_array, Config, logger, output_folder, args.type, args.samples, kraken_directory, cluster, args.scheduler)
        elif analysis == "coverage_depth":
            keep_logging("Step: Running Coverage Depth analysis on Input reads...\n", "Running Coverage Depth analysis on Input reads...", logger, 'info')
            coverage_depth_directory = args.output_folder + "/%s_Coverage_depth" % args.prefix
            make_sure_path_exists(coverage_depth_directory)
            coverage_depth_analysis(filenames_array, Config, logger, output_folder, args.type, args.samples, coverage_depth_directory, cluster, reference, args.scheduler)
        elif analysis == "mlst":
            keep_logging("Step: Running Ariba MLST sequence typing on Input reads...\n", "Running MLST sequence typing on Input reads...", logger, 'info')
            # A database given on the command line overrides the configured one.
            if args.mlst_db:
                mlstdb = args.mlst_db
            else:
                mlstdb = ConfigSectionMap("ariba", Config)['mlst_db_path']
            keep_logging(
                '',
                "Using Ariba MLST Database from this path - %s" % mlstdb,
                logger, 'debug')
            mlst_directory = args.output_folder + "/%s_MLST_results" % args.prefix
            make_sure_path_exists(mlst_directory)
            mlst(filenames_array, Config, logger, mlst_directory, args.type, args.samples, mlst_directory, cluster, args.scheduler, mlstdb)
        elif analysis == "summary":
            keep_logging('', "Generating Summary report for QC'd analysis - %s" % args.prefix, logger, 'debug')
            summary(filenames_array, Config, logger, args.prefix, output_folder)
            keep_logging("Summary report - %s/%s_summary.tsv" % (output_folder, prefix), "Summary report - %s/%s_summary.tsv" % (output_folder, prefix), logger, 'info')
Ejemplo n.º 18
0
    def varcall():
        """Call variants with the configured caller and return the raw VCFs.

        The caller is read from the ``variant_caller`` key of the [pipeline]
        config section. ``args``, ``logger``, ``Config``, ``reference`` and
        ``out_sorted_bam`` are taken from the enclosing scope.

        Returns:
            tuple ``(final_raw_vcf, final_raw_indel_vcf)``: the SNP VCF with
            indel-proximate SNPs removed and the indel VCF.

        An unknown caller name is logged and the process exits.

        Changes against the previous revision: the GATK branch no longer logs
        the Samtools "END" message, and the closing 'END: Variant Calling'
        message, previously unreachable behind the returns, is now emitted.
        """
        keep_logging('START: Variant Calling', 'START: Variant Calling',
                     logger, 'info')
        caller = ConfigSectionMap("pipeline", Config)['variant_caller']
        if caller == "gatkhaplotypecaller":
            keep_logging('START: Variant Calling using GATK haplotyper.',
                         'START: Variant Calling using GATK haplotyper.',
                         logger, 'info')
            final_raw_vcf_mpileup = variant_calling(out_sorted_bam,
                                                    args.output_folder,
                                                    args.index,
                                                    args.analysis_name, logger,
                                                    Config)
            final_raw_vcf = remove_5_bp_snp_indel(final_raw_vcf_mpileup,
                                                  args.output_folder,
                                                  args.analysis_name,
                                                  reference, logger, Config)
            # Indels are extracted from the same raw call set.
            final_raw_indel_vcf = prepare_indel(final_raw_vcf_mpileup,
                                                args.output_folder,
                                                args.analysis_name, reference,
                                                logger, Config)
            keep_logging('The final raw VCF file: {}'.format(final_raw_vcf),
                         'The final raw VCF file: {}'.format(final_raw_vcf),
                         logger, 'debug')
            keep_logging(
                'The final raw Indel VCF file: {}'.format(final_raw_indel_vcf),
                'The final raw Indel VCF file: {}'.format(final_raw_indel_vcf),
                logger, 'debug')
            keep_logging('END: Variant Calling using GATK haplotyper.',
                         'END: Variant Calling using GATK haplotyper.',
                         logger, 'info')
        elif caller == "samtools":
            keep_logging(
                'START: Variant Calling using Samtools without post-align bam input files.',
                'START: Variant Calling using Samtools without post-align bam input files.',
                logger, 'info')
            # Indels come from prepare_indel_gatk() on the sorted BAM rather
            # than from the Samtools call set.
            final_raw_indel_vcf = prepare_indel_gatk(out_sorted_bam,
                                                     args.output_folder,
                                                     args.analysis_name,
                                                     args.index, logger,
                                                     Config)
            final_raw_vcf_mpileup = variant_calling(out_sorted_bam,
                                                    args.output_folder,
                                                    args.index,
                                                    args.analysis_name, logger,
                                                    Config)
            final_raw_vcf = remove_5_bp_snp_indel(final_raw_vcf_mpileup,
                                                  args.output_folder,
                                                  args.analysis_name,
                                                  reference, logger, Config)
            keep_logging('The final raw VCF file: {}'.format(final_raw_vcf),
                         'The final raw VCF file: {}'.format(final_raw_vcf),
                         logger, 'debug')
            keep_logging(
                'END: Variant Calling using Samtools without post-align bam input files.',
                'END: Variant Calling using Samtools without post-align bam input files.',
                logger, 'info')
        else:
            keep_logging(
                'Please provide Variant Caller name in config file under the section [pipeline]. Options for Variant caller: 1. samtools 2. gatkhaplotypecaller',
                'Please provide Variant Caller name in config file under the section [pipeline]. Options for Variant caller: 1. samtools 2. gatkhaplotypecaller',
                logger, 'info')
            exit()
        keep_logging('END: Variant Calling', 'END: Variant Calling', logger,
                     'info')
        return final_raw_vcf, final_raw_indel_vcf
Ejemplo n.º 19
0
def pipeline(args, logger):
    """Run the variant-calling pipeline from the stage named in ``args.steps``.

    The stages are, in order: read trimming (``clean``), BWA alignment
    (``align``), post-alignment BAM preparation plus BedGraph coverage
    (``post-align``), variant calling (``varcall``), variant filtering
    (``filter``) and statistics (``stats``).

    ``args.steps`` is split on commas. When it holds exactly one entry, only
    ``coverage_depth_stats`` and ``All`` are acted on. Otherwise the first
    entry selects the stage to start from and every later stage runs after
    it; an unknown first entry is logged at 'exception' level.

    ``Config`` and ``files_to_delete`` are not defined in this function and
    must be provided by the enclosing module.

    Args:
        args: Parsed command-line arguments. This function reads ``index``,
            ``type``, ``forward_raw``, ``reverse_raw``, ``steps``,
            ``cluster``, ``output_folder``, ``croplength`` and
            ``analysis_name``.
        logger: Logger handed to ``keep_logging`` and to the stage helpers.

    Returns:
        None.

    Raises:
        SystemExit: via ``exit()`` when the configured variant caller is not
            one of the supported names.
    """
    keep_logging('START: Pipeline', 'START: Pipeline', logger, 'info')

    # Check Subroutines and create logger object: Arguments, Input files, Reference Index
    keep_logging('START: Checking Dependencies...', 'Checking Dependencies',
                 logger, 'info')

    # Reference Genome file name
    reference = ConfigSectionMap(args.index,
                                 Config)['ref_path'] + "/" + ConfigSectionMap(
                                     args.index, Config)['ref_name']
    keep_logging(
        'Getting Reference Genome name from config file: {}'.format(reference),
        'Getting Reference Genome name from config file: {}'.format(reference),
        logger, 'info')

    # Check FASTQ files
    if args.type != "PE":
        reverse_raw = "None"
        # Single-end run: the forward file is passed in both read slots.
        file_exists(args.forward_raw, args.forward_raw, reference)
    else:
        file_exists(args.forward_raw, args.reverse_raw, reference)

    # Check Java Version
    java_check()
    keep_logging('END: Checking Dependencies...', 'END: Checking Dependencies',
                 logger, 'info')

    # Start the pipeline
    steps_list = args.steps.split(',')
    # NOTE(review): `cluster` is not consumed by any stage visible in this
    # function — confirm whether it is still needed.
    if args.cluster:
        cluster = args.cluster
    else:
        cluster = "local"

    ## 1. Pre-Processing Raw reads using Trimmomatic
    def clean():
        """Trim the raw reads; single-end runs pass "None" as the mate."""
        keep_logging('START: Pre-Processing Raw reads using Trimmomatic',
                     'START: Pre-Processing Raw reads using Trimmomatic',
                     logger, 'info')
        if args.type == "PE":
            trimmomatic(args.forward_raw, args.reverse_raw, args.output_folder,
                        args.croplength, logger, Config)
        else:
            reverse_raw = "None"
            trimmomatic(args.forward_raw, reverse_raw, args.output_folder,
                        args.croplength, logger, Config)
        keep_logging('END: Pre-Processing Raw reads using Trimmomatic',
                     'END: Pre-Processing Raw reads using Trimmomatic', logger,
                     'info')

    ## 2. Stages: Alignment using BWA
    def align_reads():
        """Align the reads and return whatever ``align`` produces."""
        keep_logging('START: Mapping Reads using BWA',
                     'START: Mapping Reads using BWA', logger, 'info')
        split_field = prepare_readgroup(args.forward_raw, logger)
        out_sam = align(args.output_folder, args.index, split_field,
                        args.analysis_name, files_to_delete, logger, Config,
                        args.type)
        keep_logging('END: Mapping Reads using BWA',
                     'END: Mapping Reads using BWA', logger, 'info')
        return out_sam

    # Run Depth of Coverage Module after read mapping and stop. Dont proceed to variant calling step.
    def coverage_depth_stats():
        """Run depth-of-coverage and alignment stats on ``out_sorted_bam``.

        ``out_sorted_bam`` is read from the enclosing scope, so it must be
        assigned before this helper is called.
        """
        gatk_DepthOfCoverage_file = gatk_DepthOfCoverage(
            out_sorted_bam, args.output_folder, args.analysis_name, reference,
            logger, Config)
        alignment_stats_file = alignment_stats(out_sorted_bam,
                                               args.output_folder,
                                               args.analysis_name, logger,
                                               Config)
        return gatk_DepthOfCoverage_file

    ## 3. Stages: Post-Alignment using SAMTOOLS, PICARD etc
    def post_align():
        """Prepare the sorted BAM from ``out_sam`` and build coverage files.

        ``out_sam`` is read from the enclosing scope, so it must be assigned
        before this helper is called.
        """
        keep_logging('START: Post-Alignment using SAMTOOLS, PICARD etc...',
                     'START: Post-Alignment using SAMTOOLS, PICARD etc...',
                     logger, 'info')
        out_sorted_bam = prepare_bam(out_sam, args.output_folder,
                                     args.analysis_name, files_to_delete,
                                     logger, Config)
        keep_logging('END: Post-Alignment using SAMTOOLS, PICARD etc...',
                     'END: Post-Alignment using SAMTOOLS, PICARD etc...',
                     logger, 'info')
        keep_logging('START: Creating BedGraph Coverage',
                     'START: Creating BedGraph Coverage', logger, 'info')
        bedgraph_coverage(out_sorted_bam, args.output_folder,
                          args.analysis_name, reference, logger, Config)
        only_unmapped_positions_file = bedtools(out_sorted_bam,
                                                args.output_folder,
                                                args.analysis_name, logger,
                                                Config)
        keep_logging('END: Creating BedGraph Coverage',
                     'END: Creating BedGraph Coverage', logger, 'info')
        return out_sorted_bam

    ## 4. Stages: Variant Calling
    def varcall():
        """Call variants with the configured caller and return the raw VCF.

        Every supported caller branch now returns its VCF; previously only
        the ``samtools`` branch did and the others yielded ``None``.
        """
        keep_logging('START: Variant Calling', 'START: Variant Calling',
                     logger, 'info')
        caller = ConfigSectionMap("pipeline", Config)['variant_caller']
        # NOTE(review): the two post-align-bam branches pass four arguments
        # to post_align_bam/variant_calling, while the samtools branch also
        # passes logger and Config — confirm against the helper signatures.
        if caller == "samtoolswithpostalignbam":
            keep_logging(
                'START: Variant Calling using Samtools and post-align bam input files',
                'START: Variant Calling using Samtools and post-align bam input files',
                logger, 'info')
            out_finalbam = post_align_bam(out_sorted_bam, args.output_folder,
                                          args.index, args.analysis_name)
            final_raw_vcf = variant_calling(out_finalbam, args.output_folder,
                                            args.index, args.analysis_name)
            keep_logging('The final raw VCF file: {}'.format(final_raw_vcf),
                         'The final raw VCF file: {}'.format(final_raw_vcf),
                         logger, 'debug')
            keep_logging(
                'END: Variant Calling using Samtools and post-align bam input files',
                'END: Variant Calling using Samtools and post-align bam input files',
                logger, 'info')
        elif caller == "gatkhaplotypecaller":
            keep_logging(
                'START: Variant Calling using GATK haplotyper and post-align bam input files',
                'START: Variant Calling using GATK haplotyper and post-align bam input files',
                logger, 'info')
            out_finalbam = post_align_bam(out_sorted_bam, args.output_folder,
                                          args.index, args.analysis_name)
            final_raw_vcf = variant_calling(out_finalbam, args.output_folder,
                                            args.index, args.analysis_name)
            keep_logging('The final raw VCF file: {}'.format(final_raw_vcf),
                         'The final raw VCF file: {}'.format(final_raw_vcf),
                         logger, 'debug')
            keep_logging(
                'END: Variant Calling using GATK haplotyper and post-align bam input files',
                'END: Variant Calling using GATK haplotyper and post-align bam input files',
                logger, 'info')
        elif caller == "samtools":
            keep_logging(
                'START: Variant Calling using Samtools without post-align bam input files.',
                'START: Variant Calling using Samtools without post-align bam input files.',
                logger, 'info')
            final_raw_vcf_mpileup = variant_calling(out_sorted_bam,
                                                    args.output_folder,
                                                    args.index,
                                                    args.analysis_name, logger,
                                                    Config)
            final_raw_vcf = remove_5_bp_snp_indel(final_raw_vcf_mpileup,
                                                  args.output_folder,
                                                  args.analysis_name,
                                                  reference, logger, Config)
            keep_logging('The final raw VCF file: {}'.format(final_raw_vcf),
                         'The final raw VCF file: {}'.format(final_raw_vcf),
                         logger, 'debug')
            keep_logging(
                'END: Variant Calling using Samtools without post-align bam input files.',
                'END: Variant Calling using Samtools without post-align bam input files.',
                logger, 'info')
        else:
            keep_logging(
                'Please provide Variant Caller name in config file under the section [pipeline]. Options for Variant caller: 1. samtools 2. samtoolswithpostalignbam 3. gatkhaplotypecaller',
                'Please provide Variant Caller name in config file under the section [pipeline]. Options for Variant caller: 1. samtools 2. samtoolswithpostalignbam 3. gatkhaplotypecaller',
                logger, 'info')
            exit()
        keep_logging('END: Variant Calling', 'END: Variant Calling', logger,
                     'info')
        return final_raw_vcf

    ## 5. Stages: Variant Filteration
    def filter_variants(gatk_depth_of_coverage_file):
        """Filter ``final_raw_vcf`` using the average depth from the summary.

        The average depth is the third tab-separated field of the line
        starting with "Total" in ``gatk_depth_of_coverage_file``.
        ``final_raw_vcf`` is read from the enclosing scope.
        """
        keep_logging('START: Variant Filteration',
                     'START: Variant Filteration', logger, 'info')
        Avg_dp_cmd = "grep \'^Total\' %s | awk -F\'\t\' \'{print $3}\'" % gatk_depth_of_coverage_file
        proc = sp.Popen([Avg_dp_cmd], stdout=sp.PIPE, shell=True)
        (out, err) = proc.communicate()
        # float() raises ValueError when grep matched nothing (empty output).
        Avg_dp = float(out)
        # Single-argument parenthesised form prints identically on Python 2
        # and also parses on Python 3.
        print("The Average Depth per reference genome base is: %s" % Avg_dp)
        filter2_variants(final_raw_vcf, args.output_folder, args.analysis_name,
                         args.index, logger, Config, Avg_dp)
        keep_logging('END: Variant Filteration', 'END: Variant Filteration',
                     logger, 'info')

    ## 6. Stages: Statistics
    def stats():
        """Write alignment and VCF statistics for the current BAM and VCF.

        ``out_sorted_bam`` and ``final_raw_vcf`` are read from the enclosing
        scope.
        """
        keep_logging('START: Generating Statistics Reports',
                     'START: Generating Statistics Reports', logger, 'info')
        alignment_stats_file = alignment_stats(out_sorted_bam,
                                               args.output_folder,
                                               args.analysis_name, logger,
                                               Config)
        vcf_stats_file = vcf_stats(final_raw_vcf, args.output_folder,
                                   args.analysis_name, logger, Config)
        keep_logging('END: Generating Statistics Reports',
                     'END: Generating Statistics Reports', logger, 'info')

    if len(steps_list) == 1:
        # NOTE(review): any other single step name falls through here
        # without running anything or logging — confirm that is intended.
        if steps_list[0] == "coverage_depth_stats":
            clean()
            out_sam = align_reads()
            out_sorted_bam = post_align()
            gatk_DepthOfCoverage_file = coverage_depth_stats()

        elif steps_list[0] == "All":
            clean()
            out_sam = align_reads()
            out_sorted_bam = post_align()
            # NOTE(review): the BAM and VCF paths returned by the stages are
            # replaced by fixed file names on this path; the VCF name is the
            # samtools one — confirm for the other variant callers.
            out_sorted_bam = "%s/%s_aln_sort.bam" % (args.output_folder,
                                                     args.analysis_name)
            gatk_depth_of_coverage_file = "%s/%s_depth_of_coverage.sample_summary" % (
                args.output_folder, args.analysis_name)
            if not os.path.exists(gatk_depth_of_coverage_file):
                gatk_depth_of_coverage_file = coverage_depth_stats()
            final_raw_vcf = varcall()
            final_raw_vcf = "%s/%s_aln_mpileup_raw.vcf_5bp_indel_removed.vcf" % (
                args.output_folder, args.analysis_name)
            filter_variants(gatk_depth_of_coverage_file)
            stats()

        #####Individual steps
    else:

        if steps_list[0] == "clean":
            clean()
            out_sam = align_reads()
            out_sorted_bam = post_align()
            gatk_depth_of_coverage_file = "%s/%s_depth_of_coverage.sample_summary" % (
                args.output_folder, args.analysis_name)
            if not os.path.exists(gatk_depth_of_coverage_file):
                gatk_depth_of_coverage_file = coverage_depth_stats()
            final_raw_vcf = varcall()
            filter_variants(gatk_depth_of_coverage_file)
            stats()
        elif steps_list[0] == "align":
            #Check clean reads here
            out_sam = align_reads()
            out_sorted_bam = post_align()
            gatk_depth_of_coverage_file = "%s/%s_depth_of_coverage.sample_summary" % (
                args.output_folder, args.analysis_name)
            if not os.path.exists(gatk_depth_of_coverage_file):
                gatk_depth_of_coverage_file = coverage_depth_stats()
            final_raw_vcf = varcall()
            filter_variants(gatk_depth_of_coverage_file)
            stats()
        elif steps_list[0] == "post-align":
            #Check BAM file here
            # NOTE(review): out_sam is never assigned on this path, yet
            # post_align() reads it from this scope — confirm which SAM file
            # should be used when starting from post-align.
            out_sorted_bam = post_align()
            gatk_depth_of_coverage_file = "%s/%s_depth_of_coverage.sample_summary" % (
                args.output_folder, args.analysis_name)
            if not os.path.exists(gatk_depth_of_coverage_file):
                gatk_depth_of_coverage_file = coverage_depth_stats()
            final_raw_vcf = varcall()
            filter_variants(gatk_depth_of_coverage_file)
            stats()

        elif steps_list[0] == "varcall":
            #Check Post-aligned-BAM and Bed files here
            out_sorted_bam = "%s/%s_aln_sort.bam" % (args.output_folder,
                                                     args.analysis_name)
            gatk_depth_of_coverage_file = "%s/%s_depth_of_coverage.sample_summary" % (
                args.output_folder, args.analysis_name)
            if not os.path.exists(gatk_depth_of_coverage_file):
                gatk_depth_of_coverage_file = coverage_depth_stats()
            final_raw_vcf = varcall()
            filter_variants(gatk_depth_of_coverage_file)
            stats()

        elif steps_list[0] == "filter":
            #Check Post-varcall vcf and other files here
            out_sorted_bam = "%s/%s_aln_sort.bam" % (args.output_folder,
                                                     args.analysis_name)
            gatk_depth_of_coverage_file = "%s/%s_depth_of_coverage.sample_summary" % (
                args.output_folder, args.analysis_name)
            if not os.path.exists(gatk_depth_of_coverage_file):
                gatk_depth_of_coverage_file = coverage_depth_stats()
            final_raw_vcf = "%s/%s_aln_mpileup_raw.vcf_5bp_indel_removed.vcf" % (
                args.output_folder, args.analysis_name)
            filter_variants(gatk_depth_of_coverage_file)
            stats()
        elif steps_list[0] == "stats":
            #check BAM and vcf files
            # The BAM path is set first because coverage_depth_stats() reads
            # out_sorted_bam from this scope.
            out_sorted_bam = "%s/%s_aln_sort.bam" % (args.output_folder,
                                                     args.analysis_name)
            final_raw_vcf = "%s/%s_aln_mpileup_raw.vcf_5bp_indel_removed.vcf" % (
                args.output_folder, args.analysis_name)
            gatk_depth_of_coverage_file = "%s/%s_depth_of_coverage.sample_summary" % (
                args.output_folder, args.analysis_name)
            if not os.path.exists(gatk_depth_of_coverage_file):
                gatk_depth_of_coverage_file = coverage_depth_stats()

            stats()

        else:
            keep_logging(
                'Seems like the Analysis Steps are not in sequential order. Please recheck the -steps argument and run the pipeline again',
                'Seems like the Analysis Steps are not in sequential order. Please recheck the -steps argument and run the pipeline again',
                logger, 'exception')
Ejemplo n.º 20
0
def trim(input1, input2, out_path, crop, logger, Config):
    """Build and run the Trimmomatic command for single- or paired-end reads.

    Tool locations and trimming parameters come from the "bin_path" and
    "Trimmomatic" sections of ``Config``. Trimmomatic's stderr is redirected
    to ``<out_path>/<name>_trim_out.log``, where ``<name>`` is the basename
    of ``os.path.dirname(out_path)``.

    Args:
        input1: Forward (or only) reads file.
        input2: Reverse reads file, or the string "None" for single-end data.
        out_path: Prefix prepended directly (no separator is inserted) to the
            configured output file names; also the log directory.
        crop: Value for Trimmomatic's CROP step as a string. When falsy, no
            CROP step is added and a HEADCROP step is appended instead.
        logger: Logger handed to ``keep_logging`` and ``call``.
        Config: Parsed configuration handed to ``ConfigSectionMap``.

    Returns:
        None.

    Raises:
        SystemExit: with status 1 when ``call`` raises
            ``sp.CalledProcessError``.
    """
    paired = input2 != "None"
    mode = "PE" if paired else "SE"
    start_msg = 'Pre-processing %s reads using Trimmomatic.' % mode
    keep_logging(start_msg, start_msg, logger, 'info')

    # Each config section is looked up once and reused for every option.
    binbase = ConfigSectionMap("bin_path", Config)['binbase']
    trim_cfg = ConfigSectionMap("Trimmomatic", Config)
    trimmomatic_bin = trim_cfg['trimmomatic_bin']
    colon = trim_cfg['colon']

    adapter_file = (binbase + "/" + trimmomatic_bin + "/" +
                    trim_cfg['adaptor_filepath'])

    if paired:
        reads = [input1, input2]
        output_keys = ['f_p', 'f_up', 'r_p', 'r_up']
        # changing this parameter for KPC variant analysis for keeping both reads. date: 31 August
        clip_keys = ['seed_mismatches', 'palindrome_clipthreshold',
                     'simple_clipthreshold', 'minadapterlength',
                     'keep_both_reads']
    else:
        reads = [input1]
        output_keys = ['f_p']
        clip_keys = ['seed_mismatches', 'palindrome_clipthreshold',
                     'simple_clipthreshold']

    clean_filenames = " ".join(
        [out_path + trim_cfg[key] for key in output_keys])
    illumina_string = 'ILLUMINACLIP:' + colon.join(
        [adapter_file] + [trim_cfg[key] for key in clip_keys])
    sliding_string = ('SLIDINGWINDOW:' + trim_cfg['window_size'] + colon +
                      trim_cfg['window_size_quality'])
    minlen_string = 'MINLEN:' + trim_cfg['minlength']

    # The jar path is binbase + trimmomatic_bin + jar name with no separator
    # added, exactly as the configuration values are expected to provide it.
    command = [
        "java -jar " + binbase + trimmomatic_bin + "trimmomatic-0.36.jar",
        mode,
    ]
    # NOTE(review): -phred33 is only passed for paired-end runs without
    # cropping, and HEADCROP is only applied when no crop is requested —
    # kept as found; confirm this asymmetry is intentional.
    if paired and not crop:
        command.append("-phred33")
    command.extend(reads)
    command.append(clean_filenames)
    if crop:
        command.append('CROP:' + crop)
    command.extend([illumina_string, sliding_string, minlen_string])
    if not crop:
        command.append('HEADCROP:' + trim_cfg['headcrop_length'])

    log_file = "%s/%s_trim_out.log" % (
        out_path, os.path.basename(os.path.dirname(out_path)))
    # Joining the parts avoids the stray unary "+" that made the single-end
    # crop command raise TypeError before it could run.
    cmdstring = " ".join(command) + " 2> " + log_file

    keep_logging(cmdstring, cmdstring, logger, 'debug')
    try:
        call(cmdstring, logger)
    except sp.CalledProcessError:
        keep_logging('Error in Trimming step. Exiting.',
                     'Error in Trimming step. Exiting.', logger,
                     'exception')
        sys.exit(1)
    keep_logging('End: Data Pre-processing',
                 'End: Data Pre-processing', logger, 'info')
Ejemplo n.º 21
0
def pipeline(args, logger):
    """Run the variant calling pipeline end to end for one sample.

    Stages, in order: dependency checks, Trimmomatic pre-processing, BWA
    alignment and post-alignment BAM preparation. After that the function
    either produces coverage/alignment statistics only (when
    ``args.coverage_depth_stats`` is set) or continues with BedGraph
    coverage, variant calling, variant filtering and statistics reports.

    Settings are read from the module-level ``Config`` object.

    Args:
        args: Parsed command-line arguments. The attributes read here are
            ``index``, ``type``, ``forward_raw``, ``reverse_raw`` (paired-end
            runs only), ``output_folder``, ``croplength``, ``bam_input``,
            ``analysis_name`` and ``coverage_depth_stats``.
        logger: Logger object handed to ``keep_logging`` and to the stage
            functions.

    Raises:
        SystemExit: With status 1 when the ``variant_caller`` option of the
            ``[pipeline]`` config section is not one of the supported values.
    """
    # Function-scope import keeps this block self-contained; sys is only
    # needed for the fatal-configuration exit below.
    import sys

    def log(message, level='info'):
        # keep_logging takes two message arguments; almost every call in this
        # function passes the same text for both.
        keep_logging(message, message, logger, level)

    log('START: Pipeline')

    # Check Subroutines and create logger object: Arguments, Input files, Reference Index
    keep_logging('START: Checking Dependencies...', 'Checking Dependencies', logger, 'info')

    # Reference Genome file name
    index_settings = ConfigSectionMap(args.index, Config)
    reference = index_settings['ref_path'] + "/" + index_settings['ref_name']
    log('Getting Reference Genome name from config file: {}'.format(reference))

    # Check FASTQ files
    paired_end = args.type == "PE"
    if paired_end:
        file_exists(args.forward_raw, args.reverse_raw, reference)
    else:
        # Single-end run: there is no reverse file, so the forward file is
        # passed for both read arguments.
        file_exists(args.forward_raw, args.forward_raw, reference)

    # Check Java Version
    java_check()
    keep_logging('END: Checking Dependencies...', 'END: Checking Dependencies', logger, 'info')

    ## 1. Pre-Processing Raw reads using Trimmomatic
    log('START: Pre-Processing Raw reads using Trimmomatic')
    # Single-end runs pass the literal string "None" as the reverse read file.
    reverse_raw = args.reverse_raw if paired_end else "None"
    trimmomatic(args.forward_raw, reverse_raw, args.output_folder, args.croplength, logger, Config)
    log('END: Pre-Processing Raw reads using Trimmomatic')

    ## 2. Stages: Alignment using BWA
    log('START: Mapping Reads using BWA')
    split_field = prepare_readgroup(args.forward_raw, logger)
    files_to_delete = []
    out_sam = align(args.bam_input, args.output_folder, args.index, split_field, args.analysis_name, files_to_delete, logger, Config, args.type)
    log('END: Mapping Reads using BWA')

    ## 3. Stages: Post-Alignment using SAMTOOLS, PICARD etc
    log('START: Post-Alignment using SAMTOOLS, PICARD etc...')
    out_sorted_bam = prepare_bam(out_sam, args.output_folder, args.analysis_name, files_to_delete, logger, Config)
    log('END: Post-Alignment using SAMTOOLS, PICARD etc...')
    # NOTE(review): the value returned by prepare_bam is replaced with a path
    # built from the output folder and analysis name; kept as in the original,
    # confirm whether the override is still needed.
    out_sorted_bam = "%s/%s_aln_sort.bam" % (args.output_folder, args.analysis_name)

    # Run Depth of Coverage Module after read mapping and stop. Dont proceed to variant calling step.
    if args.coverage_depth_stats:
        gatk_DepthOfCoverage(out_sorted_bam, args.output_folder, args.analysis_name, reference, logger, Config)
        alignment_stats(out_sorted_bam, args.output_folder, args.analysis_name, logger, Config)
    else:
        ## Continue: 3. Stages: Post-Alignment using SAMTOOLS, PICARD etc
        log('START: Creating BedGraph Coverage')
        bedgraph_coverage(out_sorted_bam, args.output_folder, args.analysis_name, reference, logger, Config)
        bedtools(out_sorted_bam, args.output_folder, args.analysis_name, logger, Config)
        log('END: Creating BedGraph Coverage')

        ## 4. Stages: Variant Calling
        log('START: Variant Calling')
        caller = ConfigSectionMap("pipeline", Config)['variant_caller']
        # Callers that run on a post-aligned BAM differ only in the tool name
        # that appears in their log messages.
        post_align_labels = {
            "samtoolswithpostalignbam": "Samtools",
            "gatkhaplotypecaller": "GATK haplotyper",
        }
        if caller in post_align_labels:
            stage = 'Variant Calling using {} and post-align bam input files'.format(post_align_labels[caller])
            log('START: ' + stage)
            # NOTE(review): post_align_bam is called without logger/Config,
            # unlike the other stage functions; left as in the original
            # because its signature is not visible here.
            out_finalbam = post_align_bam(out_sorted_bam, args.output_folder, args.index, args.analysis_name)
            # logger and Config are passed so this call matches the
            # six-argument variant_calling call in the samtools branch.
            final_raw_vcf = variant_calling(out_finalbam, args.output_folder, args.index, args.analysis_name, logger, Config)
            log('The final raw VCF file: {}'.format(final_raw_vcf), 'debug')
            log('END: ' + stage)
        elif caller == "samtools":
            stage = 'Variant Calling using Samtools without post-align bam input files.'
            log('START: ' + stage)
            final_raw_vcf_mpileup = variant_calling(out_sorted_bam, args.output_folder, args.index, args.analysis_name, logger, Config)
            final_raw_vcf = remove_5_bp_snp_indel(final_raw_vcf_mpileup, args.output_folder, args.analysis_name, reference, logger, Config)
            log('The final raw VCF file: {}'.format(final_raw_vcf), 'debug')
            log('END: ' + stage)
        else:
            log('Please provide Variant Caller name in config file under the section [pipeline]. Options for Variant caller: 1. samtools 2. samtoolswithpostalignbam 3. gatkhaplotypecaller')
            # A bad configuration is a failure: exit non-zero, as the other
            # fatal paths in this file do.
            sys.exit(1)
        log('END: Variant Calling')

        ## 5. Stages: Variant Filteration
        log('START: Variant Filteration')
        filter2_variants(final_raw_vcf, args.output_folder, args.analysis_name, args.index, logger, Config)
        log('END: Variant Filteration')

        ## 6. Stages: Statistics
        log('START: Generating Statistics Reports')
        alignment_stats(out_sorted_bam, args.output_folder, args.analysis_name, logger, Config)
        gatk_DepthOfCoverage(out_sorted_bam, args.output_folder, args.analysis_name, reference, logger, Config)
        vcf_stats(final_raw_vcf, args.output_folder, args.analysis_name, logger, Config)
        log('END: Generating Statistics Reports')