Esempio n. 1
0
    def transcriptome_regions_path(self, alignment_path, parameters):
        """Return the path of the BED file derived from the alignment coverage.

        The file lives at ``alignment_path + "aligned_coverage_regions.bed"``.
        When it does not exist yet it is created in two Docker steps:
        ``bedtools genomecov`` writes a per-base coverage file for
        ``Out.bam``, and ``base_coverage_to_bed.py`` converts that coverage
        file into the BED file.

        Args:
            alignment_path: Prefix of the alignment directory. File names are
                appended directly, so it presumably ends with a path
                separator (verify against callers).
            parameters: Pipeline parameters forwarded to ``run_docker``;
                ``parameters["destination"]`` is the prefix of the coverage
                log file.

        Returns:
            The path of the BED file.
        """
        regions_bed_path = alignment_path + "aligned_coverage_regions.bed"
        if os.path.exists(regions_bed_path):
            # Nothing to do: the file was produced by an earlier call.
            return regions_bed_path

        bam_file_path = alignment_path + "Out.bam"
        base_coverage_path = alignment_path + "Out.base_coverage"
        # Passed as the second argument to base_coverage_to_bed.py.
        min_coverage = 2

        # Step 1: per-base coverage of the alignment.
        genomecov_command = "bedtools genomecov -d -ibam /{}".format(
            bam_file_path)
        genomecov_output = {
            "log_is_output": True,
            "out_file_path": base_coverage_path,
            "log_file_path": parameters["destination"] + "Coverage.log"
        }
        self.run_docker(genomecov_command, parameters, genomecov_output)
        file_utils.validate_file_content(base_coverage_path)

        # Step 2: turn the coverage file into a BED file.
        to_bed_command = "python base_coverage_to_bed.py /{} {} /{}".format(
            base_coverage_path, min_coverage, regions_bed_path)
        # NOTE(review): this call uses a `log_file_name` keyword instead of
        # the output-parameters dict used in step 1 -- confirm that
        # run_docker accepts it.
        self.run_docker(to_bed_command,
                        parameters,
                        log_file_name="CoverageToBed.log")
        file_utils.validate_file_content(regions_bed_path)

        return regions_bed_path
Esempio n. 2
0
    def run(self, parameters):
        """Deduplicate the input BAM and split its reads with GATK.

        Two Docker steps are executed:

        1. ``gatk MarkDuplicates`` reads ``Out.bam`` from the input directory
           of this step and writes ``Deduplicated.bam`` plus a metrics file
           into the destination.
        2. ``gatk SplitNCigarReads`` turns ``Deduplicated.bam`` into the final
           ``Out.bam``; the intermediate file is deleted afterwards.

        Args:
            parameters: Pipeline parameters. The keys ``"destination"``,
                ``"experiment"`` and ``"data_handler"`` are read here; the
                whole dict is forwarded to ``run_docker``.
        """
        destination = parameters["destination"]
        experiment = parameters["experiment"]
        data_handler = parameters["data_handler"]

        source_bam = experiment.get_input_directory(self.id) + "Out.bam"
        reference_path = data_handler.reference_path(experiment)

        # Step 1: remove duplicates.
        deduplicated_bam = destination + "Deduplicated.bam"
        mark_duplicates_command = " ".join([
            "gatk MarkDuplicates",
            "-I /{}".format(source_bam),
            "-O /{}".format(deduplicated_bam),
            "-M /{}".format(destination + "Deduplicate.metrics"),
            "--VALIDATION_STRINGENCY=SILENT",
        ])
        self.run_docker(
            mark_duplicates_command,
            parameters,
            {"log_file_path": destination + "Deduplicate.log"})
        file_utils.validate_file_content(deduplicated_bam)

        # Step 2: remove introns (split reads at N CIGAR operations).
        final_bam = destination + "Out.bam"
        split_command = " ".join([
            "gatk SplitNCigarReads",
            "-R /{}".format(reference_path),
            "-I /{}".format(deduplicated_bam),
            "-O /{}".format(final_bam),
            "--tmp-dir /{}".format(destination),
        ])
        self.run_docker(
            split_command,
            parameters,
            {"log_file_path": destination + "SplitN.log"})
        file_utils.validate_file_content(final_bam)

        # The deduplicated BAM was only an intermediate result.
        file_utils.delete(deduplicated_bam)
Esempio n. 3
0
 def align(self, parameters, sam_file_path):
     """Run the aligner in Docker and validate the resulting SAM file.

     The command comes from ``alignment_command``. ``log_is_output`` is set
     to the negation of ``self.creates_output``. After the Docker run,
     ``conclude_alignment`` is invoked and the SAM file content is validated.

     Args:
         parameters: Pipeline parameters forwarded to the helper methods.
         sam_file_path: Path of the SAM file the alignment should produce.
     """
     docker_command = self.alignment_command(parameters)
     log_is_output = not self.creates_output
     self.run_docker(
         docker_command,
         parameters,
         {"log_is_output": log_is_output, "out_file_path": sam_file_path},
     )
     self.conclude_alignment(parameters, sam_file_path)
     file_utils.validate_file_content(sam_file_path)
Esempio n. 4
0
 def alignment_command(self, parameters):
     """Assemble the novoalign command line for the current dataset.

     Args:
         parameters: Pipeline parameters. ``parameters["dataset"]`` must
             offer ``get("data")`` returning a mapping whose values carry a
             ``"path"`` entry; ``parameters["genome_index_path"]`` is the
             index file, whose content is validated before use.

     Returns:
         The command string, with every path prefixed by ``/``.
     """
     dataset = parameters["dataset"]
     genome_index_path = parameters["genome_index_path"]
     file_utils.validate_file_content(genome_index_path)

     arguments = ["novoalign -o SAM -f"]
     # One read file per entry of the dataset's "data" mapping.
     arguments.extend(
         "/{}".format(specification["path"])
         for specification in dataset.get("data").values()
     )
     arguments.append("-d /{}".format(genome_index_path))
     # Report max. 10 alignments per read.
     arguments.append("-r All 10")
     # Group junction and exon sequences together.
     arguments.append("-v 0 70 70 '[>]([^:]*)'")
     if self.__fasta_input(parameters):
         arguments.append("-F FA")
     return " ".join(arguments)
Esempio n. 5
0
 def run(self, parameters):
     """Run Opossum on the input BAM and post-process the result.

     ``Out.bam`` from the input directory of this step is passed to
     ``Opossum.py`` in Docker, which writes ``Out.bam`` into the destination.
     The output is validated and then handed to ``__post_process``.

     Args:
         parameters: Pipeline parameters. ``"experiment"`` and
             ``"destination"`` are read here; the whole dict is forwarded
             to ``run_docker`` and ``__post_process``.
     """
     experiment = parameters["experiment"]
     soft_clips_flag = experiment.get_aligner_soft_clips()
     source_bam = experiment.get_input_directory(self.id) + "Out.bam"
     result_bam = parameters["destination"] + "Out.bam"

     opossum_options = [
         "--BamFile=/{}".format(source_bam),
         "--OutFile=/{}".format(result_bam),
         "--SoftClipsExist={}".format(soft_clips_flag),
     ]
     self.run_docker(
         "python Opossum.py " + " ".join(opossum_options), parameters)

     file_utils.validate_file_content(result_bam)
     self.__post_process(parameters, result_bam)
Esempio n. 6
0
    def run(self, parameters):
        """Evaluate ``Out.vcf`` in the destination with hap.py.

        Steps:

        1. Ensure the confidence-regions BED file
           (``<alignment directory>confidence_calls.bed``) exists. If it
           does not, ``data/giab/<reference>/confidence_calls.bed`` is
           intersected with the transcriptome regions via ``self.bedtools``.
        2. Run ``./hap.py`` in Docker against
           ``/data/giab/<reference>/confidence_calls.vcf``, restricted to the
           confidence regions and, if the action handler defines a non-empty
           ``chromosomes`` attribute, to those locations.
        3. Delete every ``Evaluation*`` file in the destination that has no
           content.

        Args:
            parameters: Pipeline parameters. The keys ``"experiment"``,
                ``"destination"`` and ``"action_handler"`` are read here; the
                whole dict is forwarded to the helper methods.
        """
        experiment = parameters["experiment"]
        reference_id = experiment.get("reference")
        destination = parameters["destination"]
        alignment_path = experiment.get("pipeline")["alignment"]["directory"]
        confidence_regions_path = alignment_path + "confidence_calls.bed"

        # Intersect confidence regions with transcriptome regions if not
        # already done.
        if not os.path.exists(confidence_regions_path):
            confidence_genome_regions_path = (
                "data/giab/{}/confidence_calls.bed".format(reference_id))
            transcriptome_regions_path = self.transcriptome_regions_path(
                alignment_path, parameters)
            self.bedtools("intersect", confidence_genome_regions_path,
                          transcriptome_regions_path, confidence_regions_path,
                          parameters)
            file_utils.validate_file_content(confidence_regions_path)

        command = (
            "./hap.py /data/giab/{0}/confidence_calls.vcf /{1}Out.vcf "
            "-f /{2} "
            "-o /{1}Evaluation "
            "-r /data/references/{0}.fa"
        ).format(reference_id, destination, confidence_regions_path)

        # Restrict the evaluation to selected chromosomes if requested. The
        # flag is appended exactly once and only together with a value, so
        # the command never carries a duplicated or value-less --location.
        chromosomes = getattr(parameters["action_handler"], "chromosomes", None)
        if chromosomes:
            # "%%" stands in for the space between flag and value (spaces
            # are escaped for bash); presumably it is resolved downstream of
            # run_docker -- verify there.
            space_escape = "%%"
            command += " --location{}{}".format(
                space_escape, ",".join(chromosomes))

        output_parameters = {"log_file_path": destination + "Evaluation.log"}
        self.run_docker(command, parameters, output_parameters)

        # Remove evaluation outputs (including the log) that have no content.
        for file_name in os.listdir(destination):
            if not file_name.startswith("Evaluation"):
                continue
            file_path = destination + file_name
            if not file_utils.file_has_content(file_path):
                file_utils.delete(file_path)
0
    def run(self, parameters):
        """Filter the input VCF with ``gatk VariantFiltration``.

        ``Out.vcf`` from the input directory of this step is filtered in
        Docker and written to ``Out.vcf`` in the destination, whose content
        is validated afterwards.

        Args:
            parameters: Pipeline parameters. The keys ``"destination"``,
                ``"experiment"`` and ``"data_handler"`` are read here; the
                whole dict is forwarded to ``run_docker``.
        """
        destination = parameters["destination"]
        experiment = parameters["experiment"]
        data_handler = parameters["data_handler"]

        source_vcf = experiment.get_input_directory(self.id) + "Out.vcf"
        reference_path = data_handler.reference_path(experiment)
        filtered_vcf = destination + "Out.vcf"

        io_arguments = "gatk VariantFiltration -R /{} -V /{} -O /{} ".format(
            reference_path, source_vcf, filtered_vcf)
        # The leading space of the last fragment is kept on purpose so the
        # command string stays exactly as it has always been emitted.
        filter_arguments = (
            "-window 35 -cluster 3 "
            "--filter-name FS -filter 'FS > 30.0' "
            " --filter-name QD -filter 'QD < 2.0'"
        )

        self.run_docker(io_arguments + filter_arguments, parameters)
        file_utils.validate_file_content(filtered_vcf)
0
    def post_process(self, parameters, sam_file_path, bam_file_path):
        """Convert the aligned SAM to a sorted BAM and prepare the reference.

        Steps:

        1. ``gatk AddOrReplaceReadGroups`` converts the SAM file into a
           coordinate-sorted, indexed BAM with read groups.
        2. The SAM file is deleted unless the dataset's evaluation is of
           type ``"beers"``, which still needs it.
        3. The reference ``.fa.fai`` index and ``.dict`` sequence dictionary
           are generated when they do not exist yet.

        Args:
            parameters: Pipeline parameters. The keys ``"destination"``,
                ``"dataset"``, ``"data_handler"`` and ``"experiment"`` are
                read here; the whole dict is forwarded to ``run_docker``.
            sam_file_path: Path of the SAM file produced by the aligner.
            bam_file_path: Path the BAM file is written to.
        """
        destination = parameters["destination"]
        dataset = parameters["dataset"]

        # Convert to BAM, add read groups and sort. The temp directory gets
        # the same "/" prefix as every other path handed to the container.
        command = "gatk AddOrReplaceReadGroups -I /{} -O /{} -SO coordinate " \
            "-ID foo -LB bar -PL illumina -SM Sample1 -PU foo.bar " \
            "--TMP_DIR /{} " \
            "--CREATE_INDEX".format(
                sam_file_path,
                bam_file_path,
                destination
            )
        output_parameters = {"log_file_path": destination + "Conversion.log"}
        self.run_docker(command, parameters, output_parameters)
        file_utils.validate_file_content(bam_file_path)

        # Delete SAM file if not needed in evaluation (which is for BEERS sets)
        evaluation = dataset.get("evaluation")
        if evaluation is None or evaluation["type"] != "beers":
            file_utils.delete(sam_file_path)

        # Locate the reference and its index files
        data_handler = parameters["data_handler"]
        experiment = parameters["experiment"]
        reference_path = data_handler.reference_path(experiment)
        reference_index_path = data_handler.reference_path(
            experiment, alternate_file_ending=".fa.fai")
        reference_dict_path = data_handler.reference_path(
            experiment, alternate_file_ending=".dict")

        # Generate index of reference if not there
        if not os.path.exists(reference_index_path):
            command = "samtools faidx /{}".format(reference_path)
            output_parameters = {"log_file_path": destination + "Index.log"}
            self.run_docker(command, parameters, output_parameters)

        # Generate dict of reference if not there
        if not os.path.exists(reference_dict_path):
            command = "gatk CreateSequenceDictionary -R /{} -O /{}".format(
                reference_path, reference_dict_path)
            output_parameters = {"log_file_path": destination + "Dict.log"}
            self.run_docker(command, parameters, output_parameters)
Esempio n. 9
0
    def run(self, parameters, in_file_path=None):
        """Call variants with ``gatk HaplotypeCaller``.

        The command is extended by ``self.add_filters`` before it is executed
        in Docker. The result is written to ``Out.vcf`` in the destination
        and its content is validated afterwards.

        Args:
            parameters: Pipeline parameters. The keys ``"experiment"``,
                ``"destination"`` and ``"data_handler"`` are read here; the
                whole dict is forwarded to ``run_docker``.
            in_file_path: Optional BAM file to call variants on. When it is
                ``None`` (or otherwise falsy), ``Out.bam`` from the input
                directory of this step is used.
        """
        experiment = parameters["experiment"]
        destination = parameters["destination"]
        data_handler = parameters["data_handler"]

        # Resolved once and reused in the command below.
        reference_path = data_handler.reference_path(experiment)

        # Run variant calling
        in_file_path = in_file_path or (
            experiment.get_input_directory(self.id) + "Out.bam")
        out_file_path = destination + "Out.vcf"
        command = "gatk HaplotypeCaller -I /{} -O /{} -R /{} " \
            "--dont-use-soft-clipped-bases " \
            "--standard-min-confidence-threshold-for-calling 20".format(
                in_file_path,
                out_file_path,
                reference_path
            )
        command = self.add_filters(command)
        self.run_docker(command, parameters)
        file_utils.validate_file_content(out_file_path)