Esempio n. 1
0
    def filter_bad_variants(self, reference_file, input_vcf, output_prefix, snp_filter_name='ambiguous_snp', snp_QD=2.0,
                            snp_FS=60.0, snp_MQ=40.0, snp_HaplotypeScore=13.0, snp_MappingQualityRankSum=-12.5,
                            snp_ReadPosRankSum=-8.0, indel_filter_name='ambiguous_indel', indel_QD=2.0,
                            indel_ReadPosRankSum=-20.0, indel_FS=200.0, combine_vcf=False, sequence_dict_file=None,
                            picard_memory="1g", picard_dir=None):
        """
        Split a VCF into SNPs and indels, annotate each with filters and keep the entries without filters.

        All output files are named from output_prefix:
            <prefix>.snp.raw.vcf / <prefix>.indel.raw.vcf                    - variants selected from input_vcf
            <prefix>.snp.with_filters.vcf / <prefix>.indel.with_filters.vcf  - output of filter_bad_SNP / filter_bad_indel
            <prefix>.snp.good.vcf / <prefix>.indel.good.vcf                  - output of remove_entries_with_filters
            <prefix>.combined.*.unsorted.vcf                                 - written only if combine_vcf is set
            <prefix>.combined.*.sorted.vcf                                   - written only if combine_vcf and
                                                                               sequence_dict_file are both set

        :param reference_file: reference passed to the SelectVariants4 and filtering calls
        :param input_vcf: vcf file to split and filter
        :param output_prefix: prefix of all output files
        :param snp_filter_name: passed as filter_name to filter_bad_SNP
        :param snp_QD, snp_FS, snp_MQ, snp_MappingQualityRankSum, snp_ReadPosRankSum: thresholds forwarded
               to filter_bad_SNP
        :param snp_HaplotypeScore: accepted but NOT forwarded - the corresponding argument is commented out below
        :param indel_filter_name: passed as filter_name to filter_bad_indel
        :param indel_QD, indel_ReadPosRankSum, indel_FS: thresholds forwarded to filter_bad_indel
        :param combine_vcf: if true, SNP and indel files are merged with VCFRoutines.combine_same_samples_vcfs
        :param sequence_dict_file: if set (and combine_vcf is true), combined files are sorted with SortVcf
        :param picard_memory: assigned to SortVcf.max_memory
        :param picard_dir: assigned to SortVcf.jar_path
        """

        from RouToolPa.Tools.GATK4 import SelectVariants4
        from RouToolPa.Tools.Picard import SortVcf
        snp_raw_vcf = "%s.snp.raw.vcf" % output_prefix
        indel_raw_vcf = "%s.indel.raw.vcf" % output_prefix

        snp_filtered_vcf = "%s.snp.with_filters.vcf" % output_prefix
        indel_filtered_vcf = "%s.indel.with_filters.vcf" % output_prefix

        snp_good_vcf = "%s.snp.good.vcf" % output_prefix
        indel_good_vcf = "%s.indel.good.vcf" % output_prefix

        unsorted_combined_filtered_vcf = "%s.combined.with_filters.unsorted.vcf" % output_prefix
        unsorted_combined_good_vcf = "%s.combined.good.unsorted.vcf" % output_prefix

        combined_filtered_vcf = "%s.combined.with_filters.sorted.vcf" % output_prefix
        combined_good_vcf = "%s.combined.good.sorted.vcf" % output_prefix

        # The helper tools are configured by assigning to the imported names directly,
        # so these settings are not local to this call.
        SelectVariants4.path = self.path
        SortVcf.jar_path = picard_dir
        SortVcf.max_memory = picard_memory
        #CombineVariants.jar_path = self.jar_path

        # Step 1: split the input into separate SNP and indel files
        SelectVariants4.get_SNP(reference_file, input_vcf, snp_raw_vcf)
        SelectVariants4.get_indel(reference_file, input_vcf, indel_raw_vcf)

        # Step 2: apply type-specific filters (snp_HaplotypeScore is intentionally not passed, see below)
        self.filter_bad_SNP(reference_file, snp_raw_vcf, snp_filtered_vcf, filter_name=snp_filter_name, QD=snp_QD,
                            FS=snp_FS, MQ=snp_MQ, #HaplotypeScore=snp_HaplotypeScore,
                            MappingQualityRankSum=snp_MappingQualityRankSum, ReadPosRankSum=snp_ReadPosRankSum)
        self.filter_bad_indel(reference_file, indel_raw_vcf, indel_filtered_vcf, filter_name=indel_filter_name,
                              QD=indel_QD, ReadPosRankSum=indel_ReadPosRankSum, FS=indel_FS)

        # Step 3: write the "good" files from the filtered ones
        SelectVariants4.remove_entries_with_filters(reference_file, snp_filtered_vcf, snp_good_vcf)
        SelectVariants4.remove_entries_with_filters(reference_file, indel_filtered_vcf, indel_good_vcf)

        # Step 4 (optional): merge SNP and indel files back together, for both the filtered and the good sets
        if combine_vcf:
            VCFRoutines.combine_same_samples_vcfs(unsorted_combined_filtered_vcf, vcf_list=[snp_filtered_vcf, indel_filtered_vcf],
                                                  order_vcf_files=False, sort=True, chunk_folder=None, chunk_prefix=None,
                                                  chunk_suffix=None, starting_chunk=None, chunk_number_list=None,
                                                  close_fd_after=False, extension_list=[".vcf", ])

            VCFRoutines.combine_same_samples_vcfs(unsorted_combined_good_vcf, vcf_list=[snp_good_vcf, indel_good_vcf],
                                                  order_vcf_files=False, sort=True, chunk_folder=None, chunk_prefix=None,
                                                  chunk_suffix=None, starting_chunk=None, chunk_number_list=None,
                                                  close_fd_after=False, extension_list=[".vcf", ])
            # The "*.sorted.vcf" files are produced only when a sequence dictionary was supplied
            if sequence_dict_file:
                SortVcf.sort_vcf(unsorted_combined_filtered_vcf, combined_filtered_vcf, seq_dict=sequence_dict_file)
                SortVcf.sort_vcf(unsorted_combined_good_vcf, combined_good_vcf, seq_dict=sequence_dict_file)
            """
Esempio n. 2
0
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'
import sys
import argparse
from RouToolPa.Routines import VCFRoutines



# Command-line wrapper around VCFRoutines.combine_same_samples_vcfs
parser = argparse.ArgumentParser()

parser.add_argument(
    "-o", "--output",
    action="store",
    dest="output",
    default=sys.stdout,
    help="Output file. Default: stdout",
)
parser.add_argument(
    "-i", "--vcf_list",
    action="store",
    dest="vcf_list",
    required=True,
    # the raw option value is converted by the VCFRoutines helper before being stored
    type=VCFRoutines.make_list_of_path_to_files_from_string,
    help="Comma-separated list of vcf files",
)
parser.add_argument(
    "-s", "--sort",
    action="store_true",
    dest="sort",
    default=False,
    help="Sort vcf files. Default:False",
)
parser.add_argument(
    "-r", "--order_vcf_files",
    action="store_true",
    dest="order_vcf_files",
    default=False,
    help="Order vcf files by name using natural sorting. Default:False",
)

args = parser.parse_args()

# Only files with the ".vcf" extension are requested; close_fd_after is always disabled here.
VCFRoutines.combine_same_samples_vcfs(
    args.output,
    vcf_list=args.vcf_list,
    order_vcf_files=args.order_vcf_files,
    sort=args.sort,
    close_fd_after=False,
    extension_list=[".vcf"],
)
Esempio n. 3
0
from RouToolPa.Routines import VCFRoutines

# Command-line wrapper around VCFRoutines.check_gvcf_integrity
parser = argparse.ArgumentParser()

parser.add_argument("-i", "--input_gvcf", action="store", dest="input_gvcf", required=True,
                    help="Input gvcf file")
parser.add_argument("-o", "--output_prefix", action="store", dest="output_prefix", required=True,
                    help="Prefix of output files")
parser.add_argument("-r", "--reference", action="store", dest="reference", required=True,
                    help="Fasta with reference genome")

args = parser.parse_args()

# No length dictionary is supplied; the reference is passed instead, with parsing_mode fixed to "parse".
VCFRoutines.check_gvcf_integrity(args.input_gvcf, args.output_prefix, reference=args.reference,
                                 length_dict=None, parsing_mode="parse")
Esempio n. 4
0
    def parallel_call(self, reference, alignment, output_dir, output_prefix,
                      stand_call_conf=30, max_region_length=1000000, max_seqs_per_region=100,
                      length_dict=None, parsing_mode="parse", region_list=None,
                      region_file_format='simple',
                      remove_intermediate_files=False,
                      cpus_per_task=1,
                      handling_mode="local",
                      job_name=None,
                      log_prefix=None,
                      error_log_prefix=None,
                      max_running_jobs=None,
                      max_running_time=None,
                      max_memmory_per_cpu=None,
                      modules_list=None,
                      environment_variables_dict=None,
                      black_list_scaffold_id_file=None,
                      gvcf_mode=False,
                      ignore_softclipped_bases=False):
        """
        Run HaplotypeCaller region by region, either locally or as a slurm array job.

        In 'local' mode one command per group of regions is executed via self.parallel_execute,
        per-region files are combined with VCFRoutines.combine_same_samples_vcfs, sorted with SortVcf4
        into <output_dir>/<output_prefix>.(vcf|g.vcf), and intermediate files are removed.
        In 'slurm' mode a single array job is submitted via self.slurm_run_job; nothing is combined.
        Any other handling_mode only prints an error message.

        :param reference: reference fasta; the sequence dictionary is expected at the same path with
                          the extension replaced by ".dict"
        :param alignment: alignment passed to self.parse_options_for_parallel_run
        :param output_dir: directory for final and intermediate files
        :param output_prefix: prefix of output file names
        :param stand_call_conf: forwarded to self.parse_options_for_parallel_run
        :param max_region_length, max_seqs_per_region, length_dict, parsing_mode, region_file_format:
               forwarded to self.prepare_region_list_by_length when region_list is not supplied
        :param region_list: precomputed list of region groups; each region is a string, a 1-element
                            or a 3-element (scaffold, start, end) sequence. If None, it is generated.
        :param remove_intermediate_files: currently unused - in 'local' mode intermediate files are
                                          always removed
        :param handling_mode: 'local' or 'slurm'
        :param job_name, log_prefix, error_log_prefix, max_running_jobs, max_running_time, cpus_per_task,
               max_memmory_per_cpu, modules_list, environment_variables_dict: forwarded to self.slurm_run_job
        :param black_list_scaffold_id_file: path to a file read with IdList, or an already prepared
                                            collection of scaffold ids to exclude
        :param gvcf_mode: if true, output extension is "g.vcf" instead of "vcf"
        :param ignore_softclipped_bases: forwarded to self.parse_options_for_parallel_run
        """
        splited_dir = "%s/splited/" % output_dir
        regions_dir = "%s/regions/" % output_dir

        from RouToolPa.Tools.GATK4 import SortVcf4
        # Replace whatever extension the reference has (".fasta", ".fa", ...) by ".dict";
        # slicing a fixed number of characters only worked for ".fasta"
        sequence_dict = os.path.splitext(reference)[0] + ".dict"
        SortVcf4.max_memory = self.max_memory
        SortVcf4.path = self.path

        for directory in output_dir, splited_dir:
            self.safe_mkdir(directory)

        if black_list_scaffold_id_file:
            if isinstance(black_list_scaffold_id_file, str):
                black_scaffolds_list = IdList(filename=black_list_scaffold_id_file)
            else:
                black_scaffolds_list = black_list_scaffold_id_file
        else:
            black_scaffolds_list = []

        # Generate regions only if caller did not supply them. A supplied region_list must be used
        # as is: it is a plain list and must not be unpacked as the (regions, correspondence) pair
        # returned by prepare_region_list_by_length.
        if region_list is None:
            region_list, _ = self.prepare_region_list_by_length(max_length=max_region_length,
                                                                max_seq_number=max_seqs_per_region,
                                                                length_dict=length_dict,
                                                                reference=None if length_dict is not None else reference,
                                                                parsing_mode=parsing_mode,
                                                                output_dir=regions_dir,
                                                                black_list_scaffolds=black_scaffolds_list,
                                                                region_file_format=region_file_format if handling_mode != "slurm" else 'GATK')

        options = self.parse_options_for_parallel_run(reference, alignment,
                                                      stand_call_conf=stand_call_conf,
                                                      gvcf_mode=gvcf_mode,
                                                      ignore_softclipped_bases=ignore_softclipped_bases)
        #options += " -nct 1"

        output_extension = "g.vcf" if gvcf_mode else "vcf"

        if handling_mode == 'local':
            options_list = []
            output_file_list = []

            # output files are numbered starting from 1
            for output_index, regions in enumerate(region_list, start=1):
                output_file = "%s/%s_%i.%s" % (splited_dir, output_prefix, output_index, output_extension)
                region_options = " -O %s" % output_file
                output_file_list.append(output_file)

                for region in regions:
                    if isinstance(region, str):
                        region_options += " -L %s" % region
                    elif len(region) == 1:
                        region_options += " -L %s" % region[0]
                    elif len(region) == 3:
                        region_options += " -L %s:%i-%i" % (region[0], region[1], region[2])

                options_list.append(options + region_options)

            print("Variant calling....")
            self.parallel_execute(options_list,
                                  cmd=("gatk --java-options -Xmx%s HaplotypeCaller" % self.max_memory) if self.max_memory else None)
            unsorted_combined_vcf = "%s/%s.unsorted.%s" % (output_dir, output_prefix, output_extension)
            sorted_combined_vcf = "%s/%s.%s" % (output_dir, output_prefix, output_extension)
            print("Combining variants...")
            VCFRoutines.combine_same_samples_vcfs(unsorted_combined_vcf,
                                                  vcf_list=output_file_list,
                                                  order_vcf_files=True,
                                                  close_fd_after=False,
                                                  extension_list=[".vcf",])
            print("Sorting...")
            SortVcf4.sort_vcf(unsorted_combined_vcf, sorted_combined_vcf, sequence_dict)
            # NOTE(review): intermediate files are removed regardless of remove_intermediate_files;
            # kept as is to preserve existing behaviour - confirm whether the flag should be honoured
            shutil.rmtree(splited_dir)
            # this method only creates output_dir and splited_dir itself, so regions_dir
            # may be absent when region_list was supplied by the caller
            if os.path.isdir(regions_dir):
                shutil.rmtree(regions_dir)
            os.remove(unsorted_combined_vcf)

        elif handling_mode == 'slurm':
            number_of_regions = len(region_list)
            # ${SLURM_ARRAY_TASK_ID} is left unexpanded here so that each array task gets its own files
            region_file = "%s/splited/region_${SLURM_ARRAY_TASK_ID}.list" % regions_dir
            output_file = "%s/%s_${SLURM_ARRAY_TASK_ID}.%s" % (splited_dir, output_prefix, output_extension)
            options += " -O %s" % output_file
            options += " -L %s" % region_file

            slurm_cmd = "gatk --java-options -Xmx%s HaplotypeCaller" % self.max_memory if self.max_memory else "gatk HaplotypeCaller"
            slurm_cmd += " %s" % options

            last_job_id = self.slurm_run_job(job_name,
                                             log_prefix,
                                             slurm_cmd,
                                             error_log_prefix,
                                             "%s%s.slurm" % (output_dir, output_prefix),
                                             task_index_list=None,
                                             start_task_index=1,
                                             end_task_index=number_of_regions,
                                             max_running_jobs=max_running_jobs,
                                             max_running_time=max_running_time,
                                             cpus_per_task=cpus_per_task,
                                             max_memmory_per_cpu=max_memmory_per_cpu,
                                             modules_list=modules_list,
                                             environment_variables_dict=environment_variables_dict)

            print("Submitted job  %s" % last_job_id)

            #VCFRoutines.combine_same_samples_vcfs(output_file_list,
            #                                      output,
            #                                      close_fd_after=False,
            #                                      extension_list=gvcf_extension_list)
        else:
            print("ERROR!!! Unrecognized handling mode!")
Esempio n. 5
0
    def parallel_gvcf_call(self,
                           reference,
                           alignment,
                           output_dir,
                           output_prefix,
                           output,
                           genotyping_mode="DISCOVERY",
                           stand_call_conf=30,
                           max_region_length=1000000,
                           max_seqs_per_region=100,
                           length_dict=None,
                           parsing_mode="parse",
                           region_list=None,
                           region_file_format='simple',
                           remove_intermediate_files=False,
                           gvcf_extension_list=[
                               "g.vcf",
                           ],
                           cpus_per_task=1,
                           handling_mode="local",
                           job_name=None,
                           log_prefix=None,
                           error_log_prefix=None,
                           max_running_jobs=None,
                           max_running_time=None,
                           max_memmory_per_cpu=None,
                           modules_list=None,
                           environment_variables_dict=None):
        """
        Call variants in gvcf mode region by region, either locally or as a slurm array job.

        In 'local' mode one command per group of regions is executed via self.parallel_execute and
        the per-region gvcf files are combined into output with VCFRoutines.combine_same_samples_vcfs.
        In 'slurm' mode a single array job is submitted via self.slurm_run_job; nothing is combined.
        Any other handling_mode only prints an error message.

        :param reference: reference fasta
        :param alignment: alignment passed to self.parse_options_for_parallel_run
        :param output_dir: directory for intermediate files
        :param output_prefix: prefix of per-region output files
        :param output: combined gvcf, written in 'local' mode only
        :param genotyping_mode, stand_call_conf: forwarded to self.parse_options_for_parallel_run
        :param max_region_length, max_seqs_per_region, length_dict, parsing_mode, region_file_format:
               forwarded to self.prepare_region_list_by_length when region_list is not supplied
        :param region_list: precomputed list of region groups; each region is a string, a 1-element
                            or a 3-element (scaffold, start, end) sequence. If None, it is generated.
        :param remove_intermediate_files: currently unused by this method
        :param gvcf_extension_list: passed as extension_list when combining per-region files
        :param handling_mode: 'local' or 'slurm'
        :param job_name, log_prefix, error_log_prefix, max_running_jobs, max_running_time, cpus_per_task,
               max_memmory_per_cpu, modules_list, environment_variables_dict: forwarded to self.slurm_run_job
        """
        splited_dir = "%s/splited_gvcf/" % output_dir
        regions_dir = "%s/regions/" % output_dir

        for directory in output_dir, splited_dir:
            self.safe_mkdir(directory)

        # Generate regions only if caller did not supply them. A supplied region_list must be used
        # as is: it is a plain list and must not be unpacked as the (regions, correspondence) pair
        # returned by prepare_region_list_by_length.
        if region_list is None:
            region_list, _ = self.prepare_region_list_by_length(max_length=max_region_length,
                                                                max_seq_number=max_seqs_per_region,
                                                                length_dict=length_dict,
                                                                reference=None if length_dict is not None else reference,
                                                                parsing_mode=parsing_mode,
                                                                output_dir=regions_dir,
                                                                region_file_format=region_file_format if handling_mode != "slurm" else 'GATK')

        options = self.parse_options_for_parallel_run(
            reference,
            alignment,
            genotyping_mode=genotyping_mode,
            stand_call_conf=stand_call_conf,
            gvcf_mode=True)
        options += " -nct 1"

        if handling_mode == 'local':
            options_list = []
            output_file_list = []

            # output files are numbered starting from 1
            for output_index, regions in enumerate(region_list, start=1):
                output_file = "%s/%s_%i.g.vcf" % (splited_dir, output_prefix,
                                                  output_index)
                region_options = " -o %s" % output_file
                output_file_list.append(output_file)

                for region in regions:
                    if isinstance(region, str):
                        region_options += " -L %s" % region
                    elif len(region) == 1:
                        region_options += " -L %s" % region[0]
                    elif len(region) == 3:
                        region_options += " -L %s:%i-%i" % (
                            region[0], region[1], region[2])

                options_list.append(options + region_options)

            self.parallel_execute(options_list)

            # a copy is passed so that the callee can not modify the shared default list
            VCFRoutines.combine_same_samples_vcfs(
                output,
                vcf_list=output_file_list,
                order_vcf_files=True,
                close_fd_after=False,
                extension_list=list(gvcf_extension_list))

        elif handling_mode == 'slurm':
            number_of_regions = len(region_list)
            # ${SLURM_ARRAY_TASK_ID} is left unexpanded here so that each array task gets its own files
            region_file = "%s/splited/region_${SLURM_ARRAY_TASK_ID}.list" % regions_dir
            output_file = "%s/%s_${SLURM_ARRAY_TASK_ID}.g.vcf" % (
                splited_dir, output_prefix)
            options += " -o %s" % output_file
            options += " -L %s" % region_file

            slurm_cmd = "java -Xmx%s -jar %s/%s %s" % (
                self.max_memory, self.jar_path, self.jar, options)

            last_job_id = self.slurm_run_job(
                job_name,
                log_prefix,
                slurm_cmd,
                error_log_prefix,
                "%s%s.slurm" % (output_dir, output_prefix),
                task_index_list=None,
                start_task_index=1,
                end_task_index=number_of_regions,
                max_running_jobs=max_running_jobs,
                max_running_time=max_running_time,
                cpus_per_task=cpus_per_task,
                max_memmory_per_cpu=max_memmory_per_cpu,
                modules_list=modules_list,
                environment_variables_dict=environment_variables_dict)

            print("Submitted job  %s" % last_job_id)

            #VCFRoutines.combine_same_samples_vcfs(output_file_list,
            #                                      output,
            #                                      close_fd_after=False,
            #                                      extension_list=gvcf_extension_list)
        else:
            print("ERROR!!! Unrecognized handling mode!")
Esempio n. 6
0
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'
import argparse
from RouToolPa.Routines import VCFRoutines


# Command-line wrapper around VCFRoutines.extract_heterozygous_variants
parser = argparse.ArgumentParser()

parser.add_argument(
    "-i", "--input",
    action="store",
    dest="input",
    required=True,
    help="Input vcf file with mutations",
)
parser.add_argument(
    "-o", "--output_prefix",
    action="store",
    dest="output_prefix",
    required=True,
    help="Prefix of output files",
)
parser.add_argument(
    "-m", "--mode",
    action="store",
    dest="mode",
    default="one",
    help="Operation mode. Allowed: 'one'(default) - variant will be treated as heterozygous if "
         "there is at least one heterozygous sample, 'all' - all samples have to be heterozygous",
)

args = parser.parse_args()

# verbose output is always enabled when run from the command line
VCFRoutines.extract_heterozygous_variants(
    args.input,
    args.output_prefix,
    mode=args.mode,
    verbose=True,
)
Esempio n. 7
0
    def parallel_genotype(
            self,
            reference,
            gvcf_list,
            splited_dir,
            splited_prefix,
            output_vcf,
            max_total_scaffold_length_per_chunk=100000,
            max_scaffold_number_per_chunk=5,
            length_dict=None,
            parsing_mode="parse",
            region_list=None,
            extension_list=[
                "g.vcf",
            ],
            #disable_auto_index_creation_and_locking_when_reading_rods=True,
            max_alternate_alleles=None,
            picard_jar_path=None):
        """
        Genotype gvcf files region by region, then combine and sort the per-region vcf files.

        One command per group of regions is executed via self.parallel_execute. Per-region files
        <splited_dir>/<splited_prefix>_<N>.vcf are combined into
        <splited_dir>/<splited_prefix>.unsorted.vcf, which is then sorted into output_vcf with SortVcf.

        :param reference: reference fasta; the sequence dictionary is expected at the same path with
                          the extension replaced by ".dict"
        :param gvcf_list: gvcf input passed to self.parse_options_for_parallel_run
        :param splited_dir: directory for per-region files (created if necessary)
        :param splited_prefix: prefix of per-region file names
        :param output_vcf: final sorted vcf
        :param max_total_scaffold_length_per_chunk, max_scaffold_number_per_chunk, length_dict, parsing_mode:
               forwarded to self.prepare_region_list_by_length when region_list is not supplied
        :param region_list: precomputed list of region groups; each region is a string, a 1-element
                            or a 3-element (scaffold, start, end) sequence. If None, it is generated.
        :param extension_list: forwarded to self.parse_options_for_parallel_run
        :param max_alternate_alleles: forwarded to self.parse_options_for_parallel_run
        :param picard_jar_path: assigned to SortVcf.jar_path
        """
        import os

        self.safe_mkdir(splited_dir)

        # Generate regions only if caller did not supply them. A supplied region_list must be used
        # as is: it is a plain list and must not be unpacked as the (regions, correspondence) pair
        # returned by prepare_region_list_by_length.
        if region_list is None:
            regions_list, _ = self.prepare_region_list_by_length(max_length=max_total_scaffold_length_per_chunk,
                                                                 max_seq_number=max_scaffold_number_per_chunk,
                                                                 length_dict=length_dict,
                                                                 reference=None if length_dict is not None else reference,
                                                                 parsing_mode=parsing_mode,
                                                                 output_dir="%s/regions/" % splited_dir,
                                                                 split_scaffolds=False)
        else:
            regions_list = region_list

        options = self.parse_options_for_parallel_run(
            reference,
            gvcf_list,
            extension_list=extension_list,
            max_alternate_alleles=max_alternate_alleles,
            #disable_auto_index_creation_and_locking_when_reading_rods=disable_auto_index_creation_and_locking_when_reading_rods
        )

        options_list = []
        region_vcf_list = []

        # output files are numbered starting from 1
        for output_index, regions in enumerate(regions_list, start=1):
            region_vcf = "%s/%s_%i.vcf" % (splited_dir, splited_prefix, output_index)
            region_vcf_list.append(region_vcf)
            region_options = " -O %s" % region_vcf

            for region in regions:
                if isinstance(region, str):
                    region_options += " -L %s" % region
                elif len(region) == 1:
                    region_options += " -L %s" % region[0]
                elif len(region) == 3:
                    region_options += " -L %s:%i-%i" % (region[0], region[1],
                                                        region[2])

            options_list.append(options + region_options)

        self.parallel_execute(options_list)

        # The template used to be left unformatted, which produced a file literally named
        # "%s.unsorted.vcf" in the current directory. Keep the intermediate file next to the
        # per-region files instead.
        unsorted_vcf = "%s/%s.unsorted.vcf" % (splited_dir, splited_prefix)
        VCFRoutines.combine_same_samples_vcfs(unsorted_vcf,
                                              vcf_list=region_vcf_list,
                                              order_vcf_files=True,
                                              close_fd_after=False,
                                              extension_list=[
                                                  ".vcf",
                                              ])
        # Replace whatever extension the reference has (".fasta", ".fa", ...) by ".dict";
        # slicing a fixed number of characters only worked for ".fasta"
        sequence_dict = os.path.splitext(reference)[0] + ".dict"

        SortVcf.jar_path = picard_jar_path

        SortVcf.sort_vcf(unsorted_vcf, output_vcf, seq_dict=sequence_dict)