Example 1
    def run(self, parameters):
        destination = parameters["destination"]
        experiment = parameters["experiment"]
        data_handler = parameters["data_handler"]

        in_file_path = experiment.get_input_directory(self.id) + "Out.bam"
        reference_path = data_handler.reference_path(experiment)

        # Remove duplicates
        deduplicated_path = destination + "Deduplicated.bam"
        metrics_path = destination + "Deduplicate.metrics"
        command = "gatk MarkDuplicates -I /{} -O /{} -M /{} " \
            "--VALIDATION_STRINGENCY=SILENT".format(
                in_file_path,
                deduplicated_path,
                metrics_path
            )
        output_parameters = {"log_file_path": destination + "Deduplicate.log"}
        self.run_docker(command, parameters, output_parameters)
        file_utils.validate_file_content(deduplicated_path)

        # Remove introns
        out_file_path = destination + "Out.bam"
        command = "gatk SplitNCigarReads -R /{} -I /{} -O /{} --tmp-dir /{}".format(
            reference_path, deduplicated_path, out_file_path, destination)
        output_parameters = {"log_file_path": destination + "SplitN.log"}
        self.run_docker(command, parameters, output_parameters)
        file_utils.validate_file_content(out_file_path)
        file_utils.delete(deduplicated_path)
Example 2
def get_p_falciparum(genome_id, file_path):
    url = "http://bp1.s3.amazonaws.com/malaria.tar.bz2"
    download_path = reference_directory + "malaria.tar.bz2"
    file_utils.download(url, download_path)
    print("Unzipping {}...".format(genome_id), flush=True)
    unzipped_directory = file_utils.unzip(download_path)
    os.rename(unzipped_directory + "/genome_sequence_pfal.fa", file_path)
    file_utils.delete(download_path)
    file_utils.delete(unzipped_directory)
Example 3
    def run(self, parameters):
        docker_client = parameters["docker_client"]
        data_handler = parameters["data_handler"]
        experiment = parameters["experiment"]
        destination = parameters["destination"]
        dataset = data_handler.datasets.select(experiment.get("dataset"))

        sam_file_path = destination + "Out.sam"
        bam_file_path = destination + "Out.bam"

        # Define genome index path and temp path (will be renamed if successful)
        parameters["reference_id"] = experiment.get("reference")
        genome_index_path = data_handler.genome_index_path(experiment, self.id)
        temp_genome_index_path = genome_index_path + ".running"

        # If necessary, build genome index
        if not os.path.exists(genome_index_path):
            try:
                index_parameters = {
                    "docker_client": docker_client,
                    "destination": destination,
                    "genome_index_path": temp_genome_index_path,
                    "reference_path": data_handler.reference_path(experiment),
                    "dataset": dataset,
                    "reference_base_path": data_handler.reference_directory,
                    "reference_id": parameters["reference_id"]
                }
                self.build_genome_index(index_parameters)
            except:
                file_utils.delete(temp_genome_index_path)
                raise
            os.rename(temp_genome_index_path, genome_index_path)

        # Run alignment
        alignment_parameters = {
            "docker_client": docker_client,
            "destination": destination,
            "genome_index_path": genome_index_path,
            "dataset": dataset,
            "reference_id": parameters["reference_id"],
            "reference_base_path": data_handler.reference_directory
        }
        self.align(alignment_parameters, sam_file_path)

        # Create sorted BAM file from SAM file
        post_processing_parameters = {
            "docker_client": docker_client,
            "docker_image": "gatk",
            "destination": destination,
            "data_handler": data_handler,
            "experiment": experiment,
            "dataset": dataset
        }
        self.post_process(post_processing_parameters, sam_file_path,
                          bam_file_path)
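A side note on Example 3: the genome index is built under a temporary ".running" suffix and renamed to its final name only after the build succeeds, so an interrupted run never leaves an index that looks complete (Example 11's clean_up deletes any leftover ".running" entries at startup). A minimal standalone sketch of this build-then-rename pattern, with hypothetical names:

import os
import shutil


def build_atomically(final_path, build):
    # Build into a temporary path; only the final rename marks success.
    temp_path = final_path + ".running"
    try:
        build(temp_path)
    except Exception:
        shutil.rmtree(temp_path, ignore_errors=True)
        raise
    # os.rename is atomic on POSIX when both paths are on the same filesystem.
    os.rename(temp_path, final_path)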
Example 4
    def setup(self):
        super().setup()
        os.mkdir(self.directory)
        try:
            self.content["error"] = False
            self.__store_data()
        except Exception as error:
            file_utils.delete(self.directory)
            file_utils.delete(self.path)
            self.content["error"] = True
            raise error
Example 5
def get_human_genome(genome_id, file_path):
    url = "http://hgdownload.soe.ucsc.edu/goldenPath/"
    url += "{0}/bigZips/{0}.2bit".format(genome_id)
    two_bit_path = file_path + ".2bit"
    started_tasks.append(two_bit_path)
    file_utils.download(url, two_bit_path)
    finished_tasks.append(two_bit_path)
    # Convert .2bit file to .fa
    print("Extracting {} from 2bit file...".format(genome_id), flush=True)
    os.system("chmod +x {0}twoBitToFa && {0}twoBitToFa {1} {2}".format(
        reference_directory, two_bit_path, file_path))
    file_utils.delete(two_bit_path)
Example 6
    def run(self, parameters):
        dataset = parameters["dataset"]
        destination = parameters["destination"]
        command = "bash evaluate_alignment.sh {} {} /{}".format(
            dataset.get("readLength"), destination,
            dataset.get("evaluation")["truth_file"]["path"])
        output_parameters = {"log_file_path": destination + "Evaluation.log"}
        self.run_docker(command, parameters, output_parameters)

        for file_name in ["Evaluation.multi.txt", "Evaluation.txt"]:
            file_path = destination + file_name
            if not file_utils.file_has_content(file_path):
                file_utils.delete(file_path)
Example 7
    def run(self, parameters):
        experiment = parameters["experiment"]
        reference_id = experiment.get("reference")
        destination = parameters["destination"]
        vcf_file_path = destination + "Out.vcf"
        alignment_path = experiment.get("pipeline")["alignment"]["directory"]
        confidence_regions_path = alignment_path + "confidence_calls.bed"

        # Intersect confidence regions with transcriptome regions if not already done
        if not os.path.exists(confidence_regions_path):
            confidence_genome_regions_path = "data/giab/{}/confidence_calls.bed".format(
                reference_id)
            transcriptome_regions_path = self.transcriptome_regions_path(
                alignment_path, parameters)
            self.bedtools("intersect", confidence_genome_regions_path,
                          transcriptome_regions_path, confidence_regions_path,
                          parameters)
            file_utils.validate_file_content(confidence_regions_path)

        # Filter data if necessary
        action_handler = parameters["action_handler"]
        additional_commands = ""
        if hasattr(action_handler, "chromosomes"):
            # Escape spaces for bash
            space_escape = "%%"
            additional_commands = "--location{}{}".format(
                space_escape, ",".join(action_handler.chromosomes))

        command = "./hap.py /data/giab/{0}/confidence_calls.vcf /{1}Out.vcf " \
            "-f /{2} " \
            "-o /{1}Evaluation " \
            "-r /data/references/{0}.fa " \
            "--location {3}".format(
                reference_id,
                destination,
                confidence_regions_path,
                additional_commands
            )
        output_parameters = {"log_file_path": destination + "Evaluation.log"}
        self.run_docker(command, parameters, output_parameters)

        for file_name in os.listdir(destination):
            if file_name.startswith("Evaluation"):
                file_path = destination + file_name
                if not file_utils.file_has_content(file_path):
                    file_utils.delete(file_path)
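The "%%" space escape in Example 7 implies that the command string is later split on whitespace and the escape substituted back, so the comma-separated chromosome list survives as a single argument. A hedged guess at that unescaping step, names hypothetical:

# Hypothetical: split the command on spaces, then restore escaped spaces
# inside individual arguments.
command_parts = [part.replace("%%", " ") for part in command.split()]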
Example 8
    def get_file(file_id, direction, directory):
        print("Downloading {} file...".format(direction), flush=True)
        zip_name = "{}.fastq.gz".format(file_id)
        url = "https://www.encodeproject.org/files/{}/@@download/{}".format(
            file_id, zip_name)
        download_path = directory + "/" + zip_name
        file_utils.download(url, download_path)
        print("Unzipping {} file...".format(direction), flush=True)
        file_utils.unzip(download_path)
        file_utils.delete(download_path)

        original_name = "{}.fastq".format(file_id)
        file_origin = "{}/{}".format(directory, original_name)
        file_destination = "{}/{}{}".format(directory, direction,
                                            fastq_file_ending)
        os.rename(file_origin, file_destination)
        return original_name, file_destination
Example 9
    def post_process(self, parameters, sam_file_path, bam_file_path):
        destination = parameters["destination"]
        dataset = parameters["dataset"]

        # Convert to BAM, add read groups and sort
        command = "gatk AddOrReplaceReadGroups -I /{} -O /{} -SO coordinate " \
            "-ID foo -LB bar -PL illumina -SM Sample1 -PU foo.bar " \
            "--TMP_DIR {} " \
            "--CREATE_INDEX".format(
                sam_file_path,
                bam_file_path,
                destination
        )
        output_parameters = {"log_file_path": destination + "Conversion.log"}
        self.run_docker(command, parameters, output_parameters)
        file_utils.validate_file_content(bam_file_path)

        # Delete the SAM file unless the evaluation still needs it
        # (only BEERS datasets do)
        evaluation = dataset.get("evaluation")
        if evaluation is None or evaluation["type"] != "beers":
            file_utils.delete(sam_file_path)

        # Create reference indices
        data_handler = parameters["data_handler"]
        experiment = parameters["experiment"]
        reference_path = data_handler.reference_path(experiment)
        reference_index_path = data_handler.reference_path(
            experiment, alternate_file_ending=".fa.fai")
        reference_dict_path = data_handler.reference_path(
            experiment, alternate_file_ending=".dict")

        # Generate index of reference if not there
        if not os.path.exists(reference_index_path):
            command = "samtools faidx /{}".format(reference_path)
            output_parameters = {"log_file_path": destination + "Index.log"}
            self.run_docker(command, parameters, output_parameters)

        # Generate dict of reference if not there
        if not os.path.exists(reference_dict_path):
            command = "gatk CreateSequenceDictionary -R /{} -O /{}".format(
                reference_path, reference_dict_path)
            output_parameters = {"log_file_path": destination + "Dict.log"}
            self.run_docker(command, parameters, output_parameters)
Example 10
    def log_output(docker_container):
        if stdout_file_path is not None:
            with open(stdout_file_path, "ab") as out_file:
                for line in docker_container.logs(stdout=True, stderr=False,
                                                  stream=True):
                    out_file.write(line)
        if stderr_file_path is not None:
            with open(stderr_file_path, "ab") as log_file:
                for line in docker_container.logs(stdout=False, stderr=True,
                                                  stream=True):
                    log_file.write(line)
        docker_container.reload()
        if docker_container.status != "exited":
            docker_container.stop()
        # Delete the log file if it is empty. If only one path was written,
        # it is expected to be the log file; if both were written, the log
        # file is expected to be stderr_file_path.
        log_file_path = stderr_file_path or stdout_file_path
        if not file_utils.file_has_content(log_file_path):
            file_utils.delete(log_file_path)
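log_output above is evidently nested inside the run_docker machinery, with stdout_file_path and stderr_file_path coming from the enclosing scope. A hedged sketch of a plausible call site using the Docker SDK for Python; everything except the SDK calls themselves is an assumption:

import docker

# Hypothetical call site: stdout_file_path / stderr_file_path are assumed
# to be set in the enclosing scope, as in the example above.
client = docker.from_env()
container = client.containers.run("broadinstitute/gatk", "gatk --version",
                                  detach=True)
log_output(container)  # stream logs to the configured files
exit_code = container.wait()["StatusCode"]
container.remove()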
Example 11
    def clean_up(self):
        # In case of a server stop, clean up leftover references and experiments
        for reference in os.listdir(self.reference_directory):
            if reference.endswith(".running"):
                file_utils.delete(
                    os.path.join(self.reference_directory, reference))

        for experiment_id, experiment in self.experiments.all().items():
            status = experiment.get("status")
            pipeline = experiment.get("pipeline")
            error_message = "Server stopped unexpectedly"
            errored_action = list(pipeline.keys())[0]
            if status == self.constants["experiment"]["WAITING"]:
                experiment.mark_error(errored_action, error_message)
            if status == self.constants["experiment"]["RUNNING"]:
                for action, pipeline_step in pipeline.items():
                    started = "started" in pipeline_step and pipeline_step[
                        "started"]
                    completed = "completed" in pipeline_step and pipeline_step[
                        "completed"]
                    if started and not completed:
                        errored_action = action
                        self.cache.clean_up(experiment, action)
                experiment.mark_error(errored_action, error_message)
Example 12
def get_baruzzo(dataset, directory):
    zip_name = "{}.tar.bz2".format(dataset["file_name"])
    url = "http://bp1.s3.amazonaws.com/{}".format(zip_name)
    download_path = directory + "/" + zip_name
    file_utils.download(url, download_path)

    print("Unzipping {}...".format(dataset["name"]), flush=True)
    file_utils.unzip(download_path)

    # Move files to /beers directory
    beers_directory = directory + "/beers/"
    file_utils.create_directory(beers_directory)
    for file_name in os.listdir(directory):
        file_path = directory + "/" + file_name
        if not os.path.isdir(file_path) and file_path != download_path:
            shutil.move(file_path, beers_directory + file_name)

    # Move FASTQ files to root and rename
    def setup_file(direction):
        file_name = "{}.{}.fa".format(dataset["id"], direction)
        file_origin = beers_directory + file_name
        file_destination = "{}/{}{}".format(directory, direction,
                                            fastq_file_ending)
        os.rename(file_origin, file_destination)
        return file_name, file_destination

    forward_file_name, forward_file_path = setup_file(
        constants["dataset"]["FORWARD"])
    reverse_file_name, reverse_file_path = setup_file(
        constants["dataset"]["REVERSE"])

    # Move CIG file to root and rename
    truth_file_name = "{}.cig".format(dataset["id"])
    truth_file_path = directory + "/truth.cig"
    os.rename(beers_directory + truth_file_name, truth_file_path)

    file_utils.delete(download_path)
    file_utils.delete(beers_directory)

    write_dataset_json({
        "id": dataset["id"],
        "name": dataset["name"],
        "readLength": "100",
        "data": {
            constants["dataset"]["FORWARD"]: {
                "name": forward_file_name,
                "path": forward_file_path,
            },
            constants["dataset"]["REVERSE"]: {
                "name": reverse_file_name,
                "path": reverse_file_path,
            }
        },
        "evaluation": {
            "type": "beers",
            "truth_file": {
                "name": truth_file_name,
                "path": truth_file_path
            }
        }
    })
Example 13
def remove_tools():
    for tool_name in tools:
        tool_path = reference_directory + tool_name
        file_utils.delete(tool_path)
Example 14
        if not os.path.isdir(dataset_directory):
            file_utils.create_directory(dataset_directory)
            log_task_start(dataset["name"], dataset_directory)
            dataset_getter(dataset, dataset_directory)
            log_task_end(dataset["name"], dataset_directory)
        else:
            log_data_present(dataset["name"])


###################
# SCRIPT EXECUTION
###################

print("", flush=True)
print("Downloading data", flush=True)
print("", flush=True)

file_utils.create_directory(reference_directory)
file_utils.create_directory(datasets_directory)

try:
    get_tools()
    get_genomes()
    get_datasets()
    remove_tools()
finally:
    for path in started_tasks:
        if path not in finished_tasks:
            print("An error occurred, deleting {}".format(path))
            file_utils.delete(path)
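Examples 5 and 14 together implement a simple crash-cleanup protocol: every long-running task registers its output path in started_tasks before it begins and in finished_tasks once it completes, so the finally block deletes exactly the partial outputs. A condensed sketch of the bookkeeping, with hypothetical names:

started_tasks = []
finished_tasks = []


def run_tracked(path, task):
    # Register the output before starting; only successful completion
    # marks it finished, so a crash leaves it eligible for deletion.
    started_tasks.append(path)
    task(path)
    finished_tasks.append(path)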
Example 15
    def clean_up(self):
        file_utils.delete(self.directory)
Example 16
    def delete(self):
        file_utils.delete(self.path)
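All of these examples lean on a project-specific file_utils module that is not shown on this page. The following is a minimal sketch of the helpers they call most often, inferred purely from how they are used above; the real module may differ:

import os
import shutil


def delete(path):
    # Both directories (e.g. genome index folders) and plain files are
    # passed in; missing paths are ignored.
    if os.path.isdir(path):
        shutil.rmtree(path, ignore_errors=True)
    elif os.path.exists(path):
        os.remove(path)


def file_has_content(file_path):
    # True if the path exists as a file and is non-empty.
    return os.path.isfile(file_path) and os.path.getsize(file_path) > 0


def validate_file_content(file_path):
    # Fail a pipeline step early if a tool silently produced no output.
    if not file_has_content(file_path):
        raise RuntimeError("{} is missing or empty".format(file_path))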