Example #1
    def test_imputation(self):
        job = ProcessorJob()
        job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
        job.save()

        # MICROARRAY TECH
        experiment = Experiment()
        experiment.accession_code = "GSE1234"
        experiment.save()

        result = ComputationalResult()
        result.save()

        qn_target = ComputedFile()
        qn_target.filename = "danio_target.tsv"
        qn_target.absolute_file_path = "/home/user/data_store/QN/danio_target.tsv"
        qn_target.is_qn_target = True
        qn_target.size_in_bytes = "12345"
        qn_target.sha1 = "aabbccddeeff"
        qn_target.result = result
        qn_target.save()

        danio_rerio = Organism(name="DANIO_RERIO",
                               taxonomy_id=1,
                               qn_target=result)
        danio_rerio.save()

        cra = ComputationalResultAnnotation()
        cra.data = {}
        cra.data["organism_id"] = danio_rerio.id
        cra.data["is_qn"] = True
        cra.result = result
        cra.save()

        result = ComputationalResult()
        result.save()

        micros = []
        for file in os.listdir("/home/user/data_store/raw/TEST/MICROARRAY/"):

            if "microarray.txt" in file:
                continue

            create_sample_for_experiment(
                {
                    "organism": danio_rerio,
                    "accession_code": file,
                    "technology": "MICROARRAY",
                    "filename": file,
                    "data_dir": "/home/user/data_store/raw/TEST/MICROARRAY/",
                },
                experiment,
            )

            micros.append(file)

        experiment = Experiment()
        experiment.accession_code = "GSE5678"
        experiment.save()

        result = ComputationalResult()
        result.save()
        rnas = []
        for file in os.listdir("/home/user/data_store/raw/TEST/RNASEQ/"):

            if "rnaseq.txt" in file:
                continue

            create_sample_for_experiment(
                {
                    "organism": danio_rerio,
                    "accession_code": file,
                    "technology": "RNA-SEQ",
                    "filename": file,
                    "data_dir": "/home/user/data_store/raw/TEST/RNASEQ/",
                },
                experiment,
            )

            rnas.append(file)

        # Missing sample that will be filtered
        sample = create_sample_for_experiment(
            {
                "organism": danio_rerio,
                "accession_code": "GSM1487222",
                "title": "this sample will be filtered",
                "technology": "RNA-SEQ",
                "filename": None,
            },
            experiment,
        )
        rnas.append(sample.accession_code)

        dset = Dataset()
        dset.data = {"GSE1234": micros, "GSE5678": rnas}
        dset.scale_by = "NONE"
        dset.aggregate_by = "SPECIES"
        dset.svd_algorithm = "ARPACK"
        dset.quantile_normalize = True
        dset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dset
        pjda.save()

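        # Run every pipeline step that comes before imputation so that we have
        # a job_context ready to feed into _perform_imputation on its own.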
        imputation_index = create_compendia.COMPENDIA_PIPELINE.index(
            create_compendia._perform_imputation)

        pipeline = Pipeline(name=PipelineEnum.CREATE_COMPENDIA.value)
        job_context = utils.run_pipeline(
            {
                "job_id": job.id,
                "pipeline": pipeline
            },
            create_compendia.COMPENDIA_PIPELINE[:imputation_index],
        )

        # First, run the imputation step without removing anything to get a baseline
        expected_context = utils.run_pipeline(
            job_context.copy(),
            [create_compendia.COMPENDIA_PIPELINE[imputation_index]])

        # Now pick some rows to remove according to the instructions from
        # https://github.com/AlexsLemonade/refinebio/pull/2879#issuecomment-895143336

        random.seed(42)

        # Select some rows randomly and mask a little bit less than 30% of the values
        rare_rows = random.sample(list(job_context["microarray_matrix"].index),
                                  k=25)
        rare_genes = {}
        for row in rare_rows:
            cols = random.sample(
                list(job_context["microarray_matrix"].columns),
                # There are around 840 samples, and we want to pick a little bit
                # less than 30% of them
                k=int(0.28 * 840),
            )
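            # int(0.28 * 840) == 235, so each of these rows gets 235 of its
            # values masked.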
            rare_genes[row] = cols
            for col in cols:
                job_context["microarray_matrix"].loc[row, col] = np.nan

        # Now randomly select some entries from the other rows to mask
        individual_indices = random.sample(
            list(
                itertools.product(
                    set(job_context["microarray_matrix"].index) -
                    set(rare_rows),
                    job_context["microarray_matrix"].columns,
                )),
            k=1000,
        )
        for row, col in individual_indices:
            job_context["microarray_matrix"].loc[row, col] = np.nan

        final_context = utils.run_pipeline(
            job_context,
            [create_compendia.COMPENDIA_PIPELINE[imputation_index]])
        self.assertDidNotFail(job)

        index = set(final_context["merged_no_qn"].index) & set(
            expected_context["merged_no_qn"].index)
        columns = set(final_context["merged_no_qn"].columns) & set(
            expected_context["merged_no_qn"].columns)

        # Calculate the Root-Mean-Square Error (RMSE) of the imputed values.
        # See https://en.wikipedia.org/wiki/Root-mean-square_deviation
        # for a description of the formula.
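        # RMSE = sqrt( sum((imputed - baseline) ** 2) / N ), computed only over
        # the entries that were masked above.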

        N = 0
        squared_error = 0
        affected_entries = {
            *individual_indices,
            *((row, col) for row, cols in rare_genes.items() for col in cols),
        }
        for row, col in affected_entries:
            if row in index and col in columns:
                actual = final_context["merged_no_qn"].loc[row, col]
                expected = expected_context["merged_no_qn"].loc[row, col]

                N += 1
                squared_error += (actual - expected)**2

        rmse = math.sqrt(squared_error / N)

        # The results of a previous run plus a little bit of leeway
        self.assertLess(abs(rmse - 0.2868600293662542), 0.05)
Example #2
    def test_get_tximport_inputs(self):
        """"Tests that tximport only considers RNA-Seq samples from GEO.
        """
        # Create one experiment and two related samples, based on:
        #   https://www.ncbi.nlm.nih.gov/sra/?term=SRP040623
        # (We don't need any original files because
        # get_tximport_inputs doesn't consider them.)
        experiment_accession = 'PRJNA242809'
        experiment = Experiment.objects.create(accession_code=experiment_accession)

        c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

        ## Sample 1
        sample1_accession = 'SRR1206053'
        sample1 = Sample.objects.create(accession_code=sample1_accession,
                                        organism=c_elegans)
        sample1.source_database = 'GEO'
        sample1.technology = 'RNA-SEQ'
        ExperimentSampleAssociation.objects.create(experiment=experiment, sample=sample1)

        ## Sample 2
        sample2_accession = 'SRR1206054'
        sample2 = Sample.objects.create(accession_code=sample2_accession,
                                        organism=c_elegans)
        sample2.source_database = 'GEO'
        sample2.technology = 'RNA-SEQ'
        ExperimentSampleAssociation.objects.create(experiment=experiment, sample=sample2)

        computational_result1 = ComputationalResult(processor=utils.find_processor('SALMON_QUANT'))
        computational_result1.save()

        sample_result_assoc = SampleResultAssociation(sample=sample1, result=computational_result1)
        sample_result_assoc.save()

        comp_file = ComputedFile()
        comp_file.absolute_file_path = "/doesnt/matter"
        comp_file.result = computational_result1
        comp_file.size_in_bytes = 1337
        comp_file.sha1 = "ABC"
        comp_file.s3_key = "key"
        comp_file.s3_bucket = "bucket"
        comp_file.save()

        computational_result2 = ComputationalResult(processor=utils.find_processor('SALMON_QUANT'))
        computational_result2.save()

        sample_result_assoc = SampleResultAssociation(sample=sample2, result=computational_result2)
        sample_result_assoc.save()

        comp_file = ComputedFile()
        comp_file.absolute_file_path = "/doesnt/matter"
        comp_file.result = computational_result2
        comp_file.size_in_bytes = 1337
        comp_file.sha1 = "ABC"
        comp_file.s3_key = "key"
        comp_file.s3_bucket = "bucket"
        comp_file.save()

        quantified_experiments = salmon.get_tximport_inputs({"sample": sample1})['tximport_inputs']

        self.assertEqual({}, quantified_experiments)
Example #3
    def test_create_compendia_danio(self):
        job = ProcessorJob()
        job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
        job.save()

        # MICROARRAY TECH
        experiment = Experiment()
        experiment.accession_code = "GSE1234"
        experiment.save()

        result = ComputationalResult()
        result.save()

        qn_target = ComputedFile()
        qn_target.filename = "danio_target.tsv"
        qn_target.absolute_file_path = "/home/user/data_store/QN/danio_target.tsv"
        qn_target.is_qn_target = True
        qn_target.size_in_bytes = "12345"
        qn_target.sha1 = "aabbccddeeff"
        qn_target.result = result
        qn_target.save()

        danio_rerio = Organism(name="DANIO_RERIO",
                               taxonomy_id=1,
                               qn_target=result)
        danio_rerio.save()

        cra = ComputationalResultAnnotation()
        cra.data = {}
        cra.data["organism_id"] = danio_rerio.id
        cra.data["is_qn"] = True
        cra.result = result
        cra.save()

        result = ComputationalResult()
        result.save()

        micros = []
        for file in os.listdir("/home/user/data_store/raw/TEST/MICROARRAY/"):

            if "microarray.txt" in file:
                continue

            create_sample_for_experiment(
                {
                    "organism": danio_rerio,
                    "accession_code": file,
                    "technology": "MICROARRAY",
                    "filename": file,
                    "data_dir": "/home/user/data_store/raw/TEST/MICROARRAY/",
                },
                experiment,
            )

            micros.append(file)

        experiment = Experiment()
        experiment.accession_code = "GSE5678"
        experiment.save()

        result = ComputationalResult()
        result.save()
        rnas = []
        for file in os.listdir("/home/user/data_store/raw/TEST/RNASEQ/"):

            if "rnaseq.txt" in file:
                continue

            create_sample_for_experiment(
                {
                    "organism": danio_rerio,
                    "accession_code": file,
                    "technology": "RNA-SEQ",
                    "filename": file,
                    "data_dir": "/home/user/data_store/raw/TEST/RNASEQ/",
                },
                experiment,
            )

            rnas.append(file)

        # Missing sample that will be filtered
        sample = create_sample_for_experiment(
            {
                "organism": danio_rerio,
                "accession_code": "GSM1487222",
                "title": "this sample will be filtered",
                "technology": "RNA-SEQ",
                "filename": None,
            },
            experiment,
        )
        rnas.append(sample.accession_code)

        dset = Dataset()
        dset.data = {"GSE1234": micros, "GSE5678": rnas}
        dset.scale_by = "NONE"
        dset.aggregate_by = "SPECIES"
        dset.svd_algorithm = "ARPACK"
        dset.quantile_normalize = True
        dset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dset
        pjda.save()

        final_context = create_compendia.create_compendia(job.id)

        self.assertSucceeded(job)

        # Verify result
        self.assertEqual(
            final_context["compendium_result"].result.computedfile_set.count(),
            1)
        for file in final_context["compendium_result"].result.computedfile_set.all():
            self.assertTrue(os.path.exists(file.absolute_file_path))

        # test compendium_result
        self.assertEqual(final_context["compendium_result"].svd_algorithm,
                         "ARPACK")
        self.assertEqual(
            final_context["compendium_result"].primary_organism.name,
            final_context["organism_name"],
        )
        self.assertEqual(
            final_context["compendium_result"].primary_organism.name,
            "DANIO_RERIO")
        self.assertEqual(final_context["compendium_result"].organisms.count(),
                         1)

        self.assertEqual(len(final_context["filtered_samples"]), 10)

        # check that sample with no computed file was skipped
        self.assertTrue("GSM1487222" in final_context["filtered_samples"])
        self.assertEqual(
            final_context["filtered_samples"]["GSM1487222"]["experiment_accession_code"],
            "GSE5678",
        )
        self.assertIn(
            "This sample did not have a processed file",
            final_context["filtered_samples"]["GSM1487222"]["reason"],
        )

        # check that the 9 files with lots of missing measurements were filtered
        self.assertEqual(
            len(
                list(
                    filter(
                        lambda x: "less than 50% present values" in x["reason"
                                                                      ],
                        final_context["filtered_samples"].values(),
                    ))),
            9,
        )

        zf = zipfile.ZipFile(
            final_context["compendium_result"].result.computedfile_set.first().absolute_file_path
        )
        with zf.open("aggregated_metadata.json") as f:
            metadata = json.load(f)

            self.assertFalse(metadata.get("quant_sf_only"))
            self.assertEqual(metadata.get("compendium_version"), 1)

            # 420 microarray + 420 RNA seq
            # -1 that is filtered for a missing file
            # -9 that are filtered for having less than 50% present values
            self.assertEqual(metadata.get("num_samples"), 830)

            self.assertEqual(metadata.get("num_experiments"), 2)

            # Make sure the data were quantile normalized
            self.assertTrue(metadata.get("quantile_normalized"))

        self.assertIn("ks_statistic", final_context)
        self.assertIn("ks_pvalue", final_context)
        self.assertEqual(final_context["ks_pvalue"], 1.0)
Example #4
    def test_create_compendia_microarray_only(self):
        """
        Make sure that we can actually create a compendium with just microarray samples.
        """
        job = ProcessorJob()
        job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
        job.save()

        # MICROARRAY TECH
        experiment = Experiment()
        experiment.accession_code = "GSE1234"
        experiment.save()

        result = ComputationalResult()
        result.save()

        qn_target = ComputedFile()
        qn_target.filename = "danio_target.tsv"
        qn_target.absolute_file_path = "/home/user/data_store/QN/danio_target.tsv"
        qn_target.is_qn_target = True
        qn_target.size_in_bytes = "12345"
        qn_target.sha1 = "aabbccddeeff"
        qn_target.result = result
        qn_target.save()

        danio_rerio = Organism(name="DANIO_RERIO",
                               taxonomy_id=1,
                               qn_target=result)
        danio_rerio.save()

        cra = ComputationalResultAnnotation()
        cra.data = {}
        cra.data["organism_id"] = danio_rerio.id
        cra.data["is_qn"] = True
        cra.result = result
        cra.save()

        result = ComputationalResult()
        result.save()

        micros = []
        for file in os.listdir("/home/user/data_store/raw/TEST/MICROARRAY/"):

            if "microarray.txt" in file:
                continue

            create_sample_for_experiment(
                {
                    "organism": danio_rerio,
                    "accession_code": file,
                    "technology": "MICROARRAY",
                    "filename": file,
                    "data_dir": "/home/user/data_store/raw/TEST/MICROARRAY/",
                },
                experiment,
            )

            micros.append(file)

        dset = Dataset()
        dset.data = {"GSE1234": micros}
        dset.scale_by = "NONE"
        dset.aggregate_by = "SPECIES"
        dset.svd_algorithm = "ARPACK"
        dset.quantile_normalize = True
        dset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dset
        pjda.save()

        final_context = create_compendia.create_compendia(job.id)

        self.assertSucceeded(job)

        # Verify result
        self.assertEqual(
            final_context["compendium_result"].result.computedfile_set.count(),
            1)
        for file in final_context["compendium_result"].result.computedfile_set.all():
            self.assertTrue(os.path.exists(file.absolute_file_path))

        # test compendium_result
        self.assertEqual(final_context["compendium_result"].svd_algorithm,
                         "ARPACK")
        self.assertEqual(
            final_context["compendium_result"].primary_organism.name,
            final_context["organism_name"],
        )
        self.assertEqual(
            final_context["compendium_result"].primary_organism.name,
            "DANIO_RERIO")
        self.assertEqual(final_context["compendium_result"].organisms.count(),
                         1)

        zf = zipfile.ZipFile(
            final_context["compendium_result"].result.computedfile_set.first().absolute_file_path
        )
        with zf.open("aggregated_metadata.json") as f:
            metadata = json.load(f)

            self.assertFalse(metadata.get("quant_sf_only"))
            # 420 microarray
            self.assertEqual(metadata.get("num_samples"), 420)
            self.assertEqual(metadata.get("num_experiments"), 1)

            # Make sure the data were quantile normalized
            self.assertTrue(metadata.get("quantile_normalized"))

        self.assertIn("ks_statistic", final_context)
        self.assertIn("ks_pvalue", final_context)
        self.assertEqual(final_context["ks_pvalue"], 1.0)
Example #5
def _run_salmon(job_context: Dict) -> Dict:
    """Runs Salmon Quant."""
    logger.debug("Running Salmon..")

    # Salmon needs to be run differently for different sample types.
    # SRA files also get processed differently as we don't want to use fasterq-dump to extract
    # them to disk.
    if job_context.get("sra_input_file_path", None):

        # Single reads
        if job_context["sra_num_reads"] == 1:

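            # Create a named pipe (FIFO) so fastq-dump can stream the reads to
            # Salmon without ever writing the full FASTQ to disk.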
            fifo = "/tmp/barney"
            os.mkfifo(fifo)

            dump_str = "fastq-dump --stdout {input_sra_file} > {fifo} &"
            formatted_dump_command = dump_str.format(
                input_sra_file=job_context["sra_input_file_path"], fifo=fifo)
            subprocess.Popen(formatted_dump_command,
                             shell=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT)

            command_str = (
                "salmon --no-version-check quant -l A -i {index} "
                "-r {fifo} -p 16 -o {output_directory} --seqBias --dumpEq --writeUnmappedNames"
            )
            formatted_command = command_str.format(
                index=job_context["index_directory"],
                input_sra_file=job_context["sra_input_file_path"],
                fifo=fifo,
                output_directory=job_context["output_directory"],
            )
        # Paired are trickier
        else:

            # For some reason I can't explain, this only works when the FIFOs
            # live in the temp directory; otherwise the `tee` part will
            # non-deterministically output to one stream or the other, but
            # not both.
            alpha = "/tmp/alpha"
            os.mkfifo(alpha)
            beta = "/tmp/beta"
            os.mkfifo(beta)

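            # fastq-dump writes both mates to stdout; tee plus the two greps
            # split them into the alpha/beta FIFOs by their .1/.2 read-name
            # suffixes so Salmon can consume them as separate streams.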
            dump_str = (
                "fastq-dump --stdout --split-files -I {input_sra_file}"
                "| tee >(grep '@.*\.1\s' -A3 --no-group-separator > {fifo_alpha}) "
                ">(grep '@.*\.2\s' -A3 --no-group-separator > {fifo_beta}) > /dev/null &"
            )
            formatted_dump_command = dump_str.format(
                input_sra_file=job_context["sra_input_file_path"],
                fifo_alpha=alpha,
                fifo_beta=beta)
            subprocess.Popen(
                formatted_dump_command,
                shell=True,
                executable="/bin/bash",
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
            )

            command_str = (
                "salmon --no-version-check quant -l A -i {index} "
                "-1 {fifo_alpha} -2 {fifo_beta} -p 16 -o {output_directory} "
                "--seqBias --dumpEq --writeUnmappedNames")
            formatted_command = command_str.format(
                index=job_context["index_directory"],
                input_sra_file=job_context["sra_input_file_path"],
                fifo_alpha=alpha,
                fifo_beta=beta,
                output_directory=job_context["output_directory"],
            )

    else:
        if "input_file_path_2" in job_context:
            second_read_str = " -2 {}".format(job_context["input_file_path_2"])

            # Rob recommends 16 threads/process, which fits snugly on
            # an x1 at 8GB RAM per Salmon container:

            # (2 threads/core * 16 cores/socket * 64 vCPU) / (1TB/8GB) = ~17
            command_str = (
                "salmon --no-version-check quant -l A --biasSpeedSamp 5 -i {index}"
                " -1 {input_one}{second_read_str} -p 16 -o {output_directory}"
                " --seqBias --gcBias --dumpEq --writeUnmappedNames")

            formatted_command = command_str.format(
                index=job_context["index_directory"],
                input_one=job_context["input_file_path"],
                second_read_str=second_read_str,
                output_directory=job_context["output_directory"],
            )
        else:
            # Related: https://github.com/COMBINE-lab/salmon/issues/83
            command_str = ("salmon --no-version-check quant -l A -i {index}"
                           " -r {input_one} -p 16 -o {output_directory}"
                           " --seqBias --dumpEq --writeUnmappedNames")

            formatted_command = command_str.format(
                index=job_context["index_directory"],
                input_one=job_context["input_file_path"],
                output_directory=job_context["output_directory"],
            )

    logger.debug(
        "Running Salmon Quant using the following shell command: %s",
        formatted_command,
        processor_job=job_context["job_id"],
    )

    # Salmon probably shouldn't take longer than three hours.
    timeout = 60 * 60 * 3
    job_context["time_start"] = timezone.now()
    try:
        completed_command = subprocess.run(
            formatted_command.split(),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        failure_reason = "Salmon timed out because it failed to complete within 3 hours."
        logger.error(
            failure_reason,
            sample_accession_code=job_context["sample"].accession_code,
            processor_job=job_context["job_id"],
        )
        job_context["job"].failure_reason = failure_reason
        job_context["job"].no_retry = True
        job_context["success"] = False
        return job_context

    job_context["time_end"] = timezone.now()

    if completed_command.returncode == 1:
        stderr = completed_command.stderr.decode().strip()
        error_start = stderr.upper().find("ERROR:")
        error_start = error_start if error_start != -1 else 0
        logger.error(
            "Shell call to salmon failed with error message: %s",
            stderr[error_start:],
            processor_job=job_context["job_id"],
        )

        # If salmon has an error exit code then we don't want to retry it.
        job_context["job"].no_retry = True
        job_context["job"].failure_reason = (
            "Shell call to salmon failed because: " + stderr[error_start:])
        job_context["success"] = False
    else:
        result = ComputationalResult()
        result.commands.append(formatted_command)
        result.time_start = job_context["time_start"]
        result.time_end = job_context["time_end"]
        result.organism_index = job_context["organism_index"]
        result.is_ccdl = True

        try:
            processor_key = "SALMON_QUANT"
            result.processor = utils.find_processor(processor_key)
        except Exception as e:
            return utils.handle_processor_exception(job_context, processor_key,
                                                    e)

        # Zip up the output of Salmon Quant
        try:
            with tarfile.open(job_context["output_archive"], "w:gz") as tar:
                tar.add(job_context["output_directory"], arcname=os.sep)
        except Exception:
            logger.exception(
                "Exception caught while zipping processed directory %s",
                job_context["output_directory"],
                processor_job=job_context["job_id"],
            )
            failure_template = "Exception caught while zipping processed directory {}"
            job_context["job"].failure_reason = failure_template.format(
                job_context["output_archive"])
            job_context["success"] = False
            return job_context

        salmon_quant_archive = ComputedFile()
        salmon_quant_archive.absolute_file_path = job_context["output_archive"]
        salmon_quant_archive.filename = os.path.split(
            job_context["output_archive"])[-1]
        salmon_quant_archive.calculate_sha1()
        salmon_quant_archive.calculate_size()
        salmon_quant_archive.is_public = True
        salmon_quant_archive.is_smashable = False
        salmon_quant_archive.is_qc = False

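        # Track the raw quant.sf as its own ComputedFile so tximport can find
        # and use it later.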
        quant_file = ComputedFile()
        quant_file.s3_bucket = S3_BUCKET_NAME
        timestamp = str(timezone.now().timestamp()).split(".")[0]
        quant_file.s3_key = "quant_files/sample_{0}_{1}_quant.sf".format(
            job_context["sample"].id, timestamp)
        quant_file.filename = "quant.sf"
        quant_file.absolute_file_path = job_context["output_directory"] + "quant.sf"
        quant_file.is_public = False
        quant_file.is_smashable = False
        quant_file.is_qc = False
        quant_file.calculate_sha1()
        quant_file.calculate_size()

        # If we're running in the cloud we need to upload the quant.sf
        # file so that a job running on any machine can use it to run
        # tximport. We can't use sync_to_s3 here because the upload has
        # to finish before we save the ComputedFile record; otherwise
        # other jobs could discover the file before it is actually on S3.
        if settings.RUNNING_IN_CLOUD:
            try:
                S3.upload_file(
                    quant_file.absolute_file_path,
                    quant_file.s3_bucket,
                    quant_file.s3_key,
                    ExtraArgs={"StorageClass": "STANDARD_IA"},
                )
            except Exception as e:
                logger.exception(e,
                                 processor_job=job_context["job_id"],
                                 sample=job_context["sample"].id)
                failure_template = "Exception caught while uploading quantfile to S3: {}"
                job_context["job"].failure_reason = failure_template.format(
                    quant_file.absolute_file_path)
                job_context["success"] = False
                return job_context

        # Here select_for_update() is used as a mutex that forces multiple
        # jobs to execute this block of code in a serial manner. See:
        # https://docs.djangoproject.com/en/1.11/ref/models/querysets/#select-for-update
        # Theoretically rows in any table could be locked here; we're
        # locking all existing rows in the ComputationalResult table.
        with transaction.atomic():
            ComputationalResult.objects.select_for_update()
            result.save()
            job_context["quant_result"] = result
            quant_file.result = result
            quant_file.save()

            job_context["result"] = result

            job_context["pipeline"].steps.append(result.id)
            SampleResultAssociation.objects.get_or_create(
                sample=job_context["sample"], result=result)
            job_context["sample"].most_recent_quant_file = quant_file
            job_context["sample"].save()

            salmon_quant_archive.result = result
            salmon_quant_archive.save()
            job_context["computed_files"].append(salmon_quant_archive)

        kv = ComputationalResultAnnotation()
        kv.data = {
            "index_length": job_context["index_length"],
            "index_length_get": job_context.get("index_length_raw", None),
        }
        kv.result = result
        kv.is_public = True
        kv.save()

        try:
            with open(
                    os.path.join(job_context["output_directory"],
                                 "lib_format_counts.json")) as lfc_file:
                format_count_data = json.load(lfc_file)
                kv = ComputationalResultAnnotation()
                kv.data = format_count_data
                kv.result = result
                kv.is_public = True
                kv.save()
        except Exception:
            # See: https://github.com/AlexsLemonade/refinebio/issues/1167
            logger.exception(
                "Error parsing Salmon lib_format_counts JSON output!",
                processor_job=job_context["job_id"],
            )

        try:
            with open(
                    os.path.join(job_context["output_directory"], "aux_info",
                                 "meta_info.json")) as mi_file:
                meta_info = json.load(mi_file)
                kv = ComputationalResultAnnotation()
                kv.data = meta_info
                kv.result = result
                kv.is_public = True
                kv.save()
        except Exception:
            # See: https://github.com/AlexsLemonade/refinebio/issues/1167
            logger.exception("Error parsing Salmon meta_info JSON output!",
                             processor_job=job_context["job_id"])

        job_context["success"] = True

    return job_context
Example #6
def _create_result_objects(job_context: Dict) -> Dict:
    """
    Store and host the result as a ComputationalResult object.
    """
    archive_path = job_context["archive_path"]
    compendia_organism = job_context["compendia_organism"]
    compendium_version = job_context["compendium_version"]

    result = ComputationalResult()
    result.commands.append(" ".join(job_context["formatted_command"]))
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]
    try:
        processor_key = "CREATE_QUANTPENDIA"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)
    result.save()

    archive_computed_file = ComputedFile()
    archive_computed_file.absolute_file_path = archive_path
    archive_computed_file.filename = FileUtils.get_filename(archive_path)
    archive_computed_file.calculate_sha1()
    archive_computed_file.calculate_size()
    archive_computed_file.is_smashable = False
    archive_computed_file.is_qn_target = False
    archive_computed_file.result = result
    archive_computed_file.is_compendia = True
    archive_computed_file.quant_sf_only = True
    archive_computed_file.compendia_organism = compendia_organism
    archive_computed_file.compendium_version = compendium_version
    archive_computed_file.save()

    compendium_result = CompendiumResult()
    compendium_result.quant_sf_only = True
    compendium_result.result = result
    compendium_result.primary_organism = compendia_organism
    compendium_result.compendium_version = compendium_version
    compendium_result.save()

    logger.info("Quantpendia created! Uploading to S3.",
                job_id=job_context["job_id"],
                archive_path=archive_path,
                organism_name=compendia_organism.name,
                **get_process_stats())

    # Upload the result to S3
    timestamp = str(int(time.time()))
    s3_key = compendia_organism.name + "_" + str(compendium_version) + "_" + timestamp + ".zip"
    uploaded_to_s3 = archive_computed_file.sync_to_s3(S3_COMPENDIA_BUCKET_NAME,
                                                      s3_key)

    if not uploaded_to_s3:
        archive_computed_file.delete()

        raise utils.ProcessorJobError(
            "Failed to upload compendia to S3",
            success=False,
            computed_file_id=archive_computed_file.id,
        )

    if settings.RUNNING_IN_CLOUD:
        archive_computed_file.delete_local_file()

    job_context["result"] = result
    job_context["success"] = True

    return job_context
Example #7
    def test_create_compendia(self):
        job = ProcessorJob()
        job.pipeline_applied = "COMPENDIA"
        job.save()

        # MICROARRAY TECH
        experiment = Experiment()
        experiment.accession_code = "GSE1487313"
        experiment.save()

        result = ComputationalResult()
        result.save()

        gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS")

        sample = Sample()
        sample.accession_code = 'GSM1487313'
        sample.title = 'GSM1487313'
        sample.organism = gallus_gallus
        sample.technology = "MICROARRAY"
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "GSM1487313_liver.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        # RNASEQ TECH
        experiment2 = Experiment()
        experiment2.accession_code = "SRS332914"
        experiment2.save()

        result2 = ComputationalResult()
        result2.save()

        sample2 = Sample()
        sample2.accession_code = 'SRS332914'
        sample2.title = 'SRS332914'
        sample2.organism = gallus_gallus
        sample2.technology = "RNA-SEQ"
        sample2.save()

        sra2 = SampleResultAssociation()
        sra2.sample = sample2
        sra2.result = result2
        sra2.save()

        esa2 = ExperimentSampleAssociation()
        esa2.experiment = experiment2
        esa2.sample = sample2
        esa2.save()

        computed_file2 = ComputedFile()
        computed_file2.filename = "SRP149598_gene_lengthScaledTPM.tsv"
        computed_file2.absolute_file_path = "/home/user/data_store/PCL/" + computed_file2.filename
        computed_file2.result = result2
        computed_file2.size_in_bytes = 234
        computed_file2.is_smashable = True
        computed_file2.save()

        assoc2 = SampleComputedFileAssociation()
        assoc2.sample = sample2
        assoc2.computed_file = computed_file2
        assoc2.save()

        dset = Dataset()
        dset.data = {'GSE1487313': ['GSM1487313'], 'SRS332914': ['SRS332914']}
        dset.scale_by = 'NONE'
        dset.aggregate_by = 'SPECIES'
        dset.quantile_normalize = False
        dset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dset
        pjda.save()

        final_context = create_compendia.create_compendia(job.id)
Example #8
def _run_salmontools(job_context: Dict) -> Dict:
    """Run Salmontools to extract unmapped genes."""

    logger.debug("Running SalmonTools ...")
    unmapped_filename = job_context["output_directory"] + "aux_info/unmapped_names.txt"

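    # Build the salmontools command: paired-end samples pass both reads via
    # -1/-2, single-end samples use -r.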
    command_str = "salmontools extract-unmapped -u {unmapped_file} -o {output} "
    output_prefix = job_context["salmontools_directory"] + "unmapped_by_salmon"
    command_str = command_str.format(unmapped_file=unmapped_filename,
                                     output=output_prefix)
    if "input_file_path_2" in job_context:
        command_str += "-1 {input_1} -2 {input_2}"
        command_str = command_str.format(
            input_1=job_context["input_file_path"],
            input_2=job_context["input_file_path_2"])
    else:
        command_str += "-r {input_1}"
        command_str = command_str.format(
            input_1=job_context["input_file_path"])

    start_time = timezone.now()
    logger.debug(
        "Running the following SalmonTools command: %s",
        command_str,
        processor_job=job_context["job_id"],
    )

    completed_command = subprocess.run(command_str.split(),
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
    end_time = timezone.now()

    # As of SalmonTools 0.1.0, completed_command.returncode is always 0,
    # (even if error happens).  completed_command.stderr is not totally
    # reliable either, because it will output the following line even
    # when the execution succeeds:
    #  "There were <N> unmapped reads\n"
    # in which "<N>" is the number of lines in input unmapped_names.txt.
    #
    # As a workaround, we are using a regular expression here to test
    # the status of SalmonTools execution.  Any text in stderr that is
    # not in the above format is treated as error message.
    status_str = completed_command.stderr.decode().strip()
    success_pattern = r"^There were \d+ unmapped reads$"
    if re.match(success_pattern, status_str):
        # Zip up the output of salmontools
        try:
            with tarfile.open(job_context["salmontools_archive"],
                              "w:gz") as tar:
                tar.add(job_context["salmontools_directory"], arcname=os.sep)
        except Exception:
            logger.exception(
                "Exception caught while zipping processed directory %s",
                job_context["salmontools_directory"],
                processor_job=job_context["job_id"],
            )
            failure_template = "Exception caught while zipping salmontools directory {}"
            job_context["job"].failure_reason = failure_template.format(
                job_context["salmontools_archive"])
            job_context["success"] = False
            return job_context

        result = ComputationalResult()
        result.commands.append(command_str)
        result.time_start = start_time
        result.time_end = end_time
        result.is_ccdl = True

        try:
            processor_key = "SALMONTOOLS"
            result.processor = utils.find_processor(processor_key)
        except Exception as e:
            return utils.handle_processor_exception(job_context, processor_key,
                                                    e)

        result.save()
        job_context["pipeline"].steps.append(result.id)

        assoc = SampleResultAssociation()
        assoc.sample = job_context["sample"]
        assoc.result = result
        assoc.save()

        computed_file = ComputedFile()
        computed_file.filename = job_context["salmontools_archive"].split("/")[-1]
        computed_file.absolute_file_path = job_context["salmontools_archive"]
        computed_file.calculate_sha1()
        computed_file.calculate_size()
        computed_file.is_public = True
        computed_file.is_smashable = False
        computed_file.is_qc = True
        computed_file.result = result
        computed_file.save()
        job_context["computed_files"].append(computed_file)

        assoc = SampleComputedFileAssociation()
        assoc.sample = job_context["sample"]
        assoc.computed_file = computed_file
        assoc.save()

        job_context["result"] = result
        job_context["success"] = True
    else:  # error in salmontools
        logger.error(
            "Shell call to salmontools failed with error message: %s",
            status_str,
            processor_job=job_context["job_id"],
        )
        job_context["job"].failure_reason = (
            "Shell call to salmontools failed because: " + status_str)
        job_context["success"] = False

    return job_context
Example #9
    def test_dataset_adding_non_downloadable_samples_fails(self):
        # Make a sample that is not downloadable
        sample1 = Sample()
        sample1.title = "456"
        sample1.accession_code = "456"
        sample1.platform_name = "AFFY"
        sample1.is_processed = False
        sample1.organism = self.homo_sapiens
        sample1.save()

        experiment_sample_association = ExperimentSampleAssociation()
        experiment_sample_association.sample = sample1
        experiment_sample_association.experiment = self.experiment
        experiment_sample_association.save()

        # Bad, 456 is not processed
        jdata = json.dumps({
            "email_address": "*****@*****.**",
            "data": {
                "GSE123": ["456"]
            }
        })
        response = self.client.post(
            reverse("create_dataset", kwargs={"version": API_VERSION}),
            jdata,
            content_type="application/json",
        )
        self.assertEqual(response.status_code, 400)
        self.assertIn(
            "Non-downloadable sample(s) in dataset",
            response.json()["message"][0],
        )
        self.assertEqual(response.json()["non_downloadable_samples"], ["456"])

        # Bad, 567 does not exist
        jdata = json.dumps({
            "email_address": "*****@*****.**",
            "data": {
                "GSE123": ["567"]
            }
        })
        response = self.client.post(
            reverse("create_dataset", kwargs={"version": API_VERSION}),
            jdata,
            content_type="application/json",
        )
        self.assertIn(
            "Sample(s) in dataset do not exist on refine",
            response.json()["message"][0],
        )
        self.assertEqual(response.status_code, 400)

        # Good, 789 is processed
        jdata = json.dumps({
            "email_address": "*****@*****.**",
            "data": {
                "GSE123": ["789"]
            }
        })
        response = self.client.post(
            reverse("create_dataset", kwargs={"version": API_VERSION}),
            jdata,
            content_type="application/json",
        )
        self.assertEqual(response.status_code, 201)

        # Bad, 456 does not have a quant.sf file
        post_data = {"email_address": "*****@*****.**", "data": {}}
        response = self.client.post(
            reverse("create_dataset", kwargs={"version": API_VERSION}),
            json.dumps(post_data),
            content_type="application/json",
        )
        self.assertEqual(response.status_code, 201)

        put_data = {
            **post_data, "data": {
                "GSE123": ["456"]
            },
            "quant_sf_only": True
        }
        response = self.client.put(
            reverse("dataset",
                    kwargs={
                        "id": response.json()["id"],
                        "version": API_VERSION
                    }),
            json.dumps(put_data),
            content_type="application/json",
        )
        self.assertEqual(response.status_code, 400)
        self.assertIn(
            "Sample(s) in dataset are missing quant.sf files",
            response.json()["message"][0],
        )
        self.assertEqual(response.json()["non_downloadable_samples"], ["456"])

        # Bad, none of the samples in GSE123 have a quant.sf file
        response = self.client.post(
            reverse("create_dataset", kwargs={"version": API_VERSION}),
            json.dumps(post_data),
            content_type="application/json",
        )
        self.assertEqual(response.status_code, 201)
        response = self.client.put(
            reverse("dataset",
                    kwargs={
                        "id": response.json()["id"],
                        "version": API_VERSION
                    }),
            json.dumps({
                **put_data, "data": {
                    "GSE123": ["ALL"]
                }
            }),
            content_type="application/json",
        )
        self.assertEqual(response.status_code, 400)
        self.assertIn(
            "Experiment(s) in dataset have zero downloadable samples",
            response.json()["message"][0],
        )
        self.assertEqual(response.json()["non_downloadable_experiments"],
                         ["GSE123"])

        # Make 456 have a quant.sf file
        result = ComputationalResult()
        result.save()

        sra = SampleResultAssociation()
        sra.sample = sample1
        sra.result = result
        sra.save()

        computed_file = ComputedFile()
        computed_file.s3_key = "smasher-test-quant.sf"
        computed_file.s3_bucket = "data-refinery-test-assets"
        computed_file.filename = "quant.sf"
        computed_file.result = result
        computed_file.size_in_bytes = 42
        computed_file.save()

        # Good, 456 does have a quant.sf file
        response = self.client.post(
            reverse("create_dataset", kwargs={"version": API_VERSION}),
            json.dumps(post_data),
            content_type="application/json",
        )
        self.assertEqual(response.status_code, 201)

        response = self.client.put(
            reverse("dataset",
                    kwargs={
                        "id": response.json()["id"],
                        "version": API_VERSION
                    }),
            json.dumps(put_data),
            content_type="application/json",
        )
        self.assertEqual(response.status_code, 200)

        # Good, a sample in GSE123 has a quant.sf file
        response = self.client.post(
            reverse("create_dataset", kwargs={"version": API_VERSION}),
            json.dumps(post_data),
            content_type="application/json",
        )
        self.assertEqual(response.status_code, 201)
        response = self.client.put(
            reverse("dataset",
                    kwargs={
                        "id": response.json()["id"],
                        "version": API_VERSION
                    }),
            json.dumps({
                **put_data, "data": {
                    "GSE123": ["ALL"]
                }
            }),
            content_type="application/json",
        )
        self.assertEqual(response.status_code, 200)
Example #10
    def test_create_compendia_danio(self):
        job = ProcessorJob()
        job.pipeline_applied = "COMPENDIA"
        job.save()

        # MICROARRAY TECH
        experiment = Experiment()
        experiment.accession_code = "GSE1234"
        experiment.save()

        result = ComputationalResult()
        result.save()

        danio_rerio = Organism.get_object_for_name("DANIO_RERIO")

        micros = []
        for file in os.listdir('/home/user/data_store/raw/TEST/MICROARRAY/'):

            if 'microarray.txt' in file:
                continue

            sample = Sample()
            sample.accession_code = file
            sample.title = file
            sample.organism = danio_rerio
            sample.technology = "MICROARRAY"
            sample.save()

            sra = SampleResultAssociation()
            sra.sample = sample
            sra.result = result
            sra.save()

            esa = ExperimentSampleAssociation()
            esa.experiment = experiment
            esa.sample = sample
            esa.save()

            computed_file = ComputedFile()
            computed_file.filename = file
            computed_file.absolute_file_path = "/home/user/data_store/raw/TEST/MICROARRAY/" + file
            computed_file.result = result
            computed_file.size_in_bytes = 123
            computed_file.is_smashable = True
            computed_file.save()

            assoc = SampleComputedFileAssociation()
            assoc.sample = sample
            assoc.computed_file = computed_file
            assoc.save()

            micros.append(file)

        experiment = Experiment()
        experiment.accession_code = "GSE5678"
        experiment.save()

        result = ComputationalResult()
        result.save()
        rnas = []
        for file in os.listdir('/home/user/data_store/raw/TEST/RNASEQ/'):

            if 'rnaseq.txt' in file:
                continue

            sample = Sample()
            sample.accession_code = file
            sample.title = file
            sample.organism = danio_rerio
            sample.technology = "RNASEQ"
            sample.save()

            sra = SampleResultAssociation()
            sra.sample = sample
            sra.result = result
            sra.save()

            esa = ExperimentSampleAssociation()
            esa.experiment = experiment
            esa.sample = sample
            esa.save()

            computed_file = ComputedFile()
            computed_file.filename = file
            computed_file.absolute_file_path = "/home/user/data_store/raw/TEST/RNASEQ/" + file
            computed_file.result = result
            computed_file.size_in_bytes = 123
            computed_file.is_smashable = True
            computed_file.save()

            assoc = SampleComputedFileAssociation()
            assoc.sample = sample
            assoc.computed_file = computed_file
            assoc.save()

            rnas.append(file)

        result = ComputationalResult()
        result.save()

        qn_target = ComputedFile()
        qn_target.filename = "danio_target.tsv"
        qn_target.absolute_file_path = '/home/user/data_store/QN/danio_target.tsv'
        qn_target.is_qn_target = True
        qn_target.size_in_bytes = "12345"
        qn_target.sha1 = "aabbccddeeff"
        qn_target.result = result
        qn_target.save()

        cra = ComputationalResultAnnotation()
        cra.data = {}
        cra.data['organism_id'] = danio_rerio.id
        cra.data['is_qn'] = True
        cra.result = result
        cra.save()

        dset = Dataset()
        dset.data = {'GSE1234': micros, 'GSE5678': rnas}
        dset.scale_by = 'NONE'
        dset.aggregate_by = 'SPECIES'
        dset.quantile_normalize = False
        dset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dset
        pjda.save()

        final_context = create_compendia.create_compendia(job.id)

        # Verify result
        self.assertEqual(len(final_context['computed_files']), 3)
        for file in final_context['computed_files']:
            self.assertTrue(os.path.exists(file.absolute_file_path))
Example #11
    def setUp(self):
        # Saving this for if we have protected endpoints
        # self.superuser = User.objects.create_superuser('john', '*****@*****.**', 'johnpassword')
        # self.client.login(username='******', password='******')
        # self.user = User.objects.create(username="******")

        experiment = Experiment()
        experiment.accession_code = "GSE000"
        experiment.alternate_accession_code = "E-GEOD-000"
        experiment.title = "NONONONO"
        experiment.description = "Boooooourns. Wasabi."
        experiment.technology = "RNA-SEQ"
        experiment.save()

        experiment = Experiment()
        experiment.accession_code = "GSE123"
        experiment.title = "Hey Ho Let's Go"
        experiment.description = (
            "This is a very exciting test experiment. Faygo soda. Blah blah blah."
        )
        experiment.technology = "MICROARRAY"
        experiment.save()
        self.experiment = experiment

        experiment_annotation = ExperimentAnnotation()
        experiment_annotation.data = {"hello": "world", "123": 456}
        experiment_annotation.experiment = experiment
        experiment_annotation.save()

        ailuropoda = Organism(name="AILUROPODA_MELANOLEUCA",
                              taxonomy_id=9646,
                              is_scientific_name=True)
        ailuropoda.save()
        self.homo_sapiens = Organism(name="HOMO_SAPIENS",
                                     taxonomy_id=9606,
                                     is_scientific_name=True)
        self.homo_sapiens.save()
        self.danio_rerio = Organism(name="DANIO_RERIO",
                                    taxonomy_id=1337,
                                    is_scientific_name=True)
        self.danio_rerio.save()

        sample = Sample()
        sample.title = "123"
        sample.accession_code = "123"
        sample.is_processed = True
        sample.organism = ailuropoda
        sample.save()

        sample = Sample()
        sample.title = "789"
        sample.accession_code = "789"
        sample.is_processed = True
        sample.organism = ailuropoda
        sample.save()
        self.sample = sample

        # add qn target for sample organism
        result = ComputationalResult()
        result.commands.append("create_qn_target.py")
        result.is_ccdl = True
        result.is_public = True
        result.processor = None
        result.save()

        cra = ComputationalResultAnnotation()
        cra.result = result
        cra.data = {"organism_id": ailuropoda.id, "is_qn": True}
        cra.save()

        ailuropoda.qn_target = result
        ailuropoda.save()

        sample_annotation = SampleAnnotation()
        sample_annotation.data = {"goodbye": "world", "789": 123}
        sample_annotation.sample = sample
        sample_annotation.save()

        original_file = OriginalFile()
        original_file.save()

        original_file_sample_association = OriginalFileSampleAssociation()
        original_file_sample_association.sample = sample
        original_file_sample_association.original_file = original_file
        original_file_sample_association.save()

        downloader_job = DownloaderJob()
        downloader_job.save()

        download_assoc = DownloaderJobOriginalFileAssociation()
        download_assoc.original_file = original_file
        download_assoc.downloader_job = downloader_job
        download_assoc.save()

        processor_job = ProcessorJob()
        processor_job.save()

        processor_assoc = ProcessorJobOriginalFileAssociation()
        processor_assoc.original_file = original_file
        processor_assoc.processor_job = processor_job
        processor_assoc.save()

        experiment_sample_association = ExperimentSampleAssociation()
        experiment_sample_association.sample = sample
        experiment_sample_association.experiment = experiment
        experiment_sample_association.save()
        experiment.num_total_samples = 1
        experiment.num_processed_samples = 1
        experiment.save()

        result = ComputationalResult()
        result.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        result = ComputationalResult()
        result.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        processor = Processor()
        processor.name = "Salmon Quant"
        processor.version = "v9.9.9"
        processor.docker_image = "dr_salmon"
        processor.environment = '{"some": "environment"}'
        processor.save()

        computational_result_short = ComputationalResult(processor=processor)
        computational_result_short.save()

        organism_index = OrganismIndex()
        organism_index.index_type = "TRANSCRIPTOME_SHORT"
        organism_index.organism = self.danio_rerio
        organism_index.result = computational_result_short
        organism_index.absolute_directory_path = (
            "/home/user/data_store/salmon_tests/TRANSCRIPTOME_INDEX/SHORT")
        organism_index.is_public = True
        organism_index.s3_url = "not_blank"
        organism_index.save()

        return
Example No. 12
def _create_result_objects(job_context: Dict) -> Dict:
    """
    Store and host the result as a ComputationalResult object.
    """
    result_start = log_state("start create result object",
                             job_context["job"].id)
    result = ComputationalResult()
    result.commands.append(" ".join(job_context["formatted_command"]))
    result.is_ccdl = True
    # Temporary until we re-enable the QN test step.
    result.is_public = False
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]
    try:
        processor_key = "CREATE_COMPENDIA"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)
    result.save()

    # Write the compendia dataframe to a file
    job_context["csv_outfile"] = job_context["output_dir"] + job_context[
        "organism_name"] + ".tsv"
    job_context["merged_qn"].to_csv(job_context["csv_outfile"],
                                    sep="\t",
                                    encoding="utf-8")

    organism_key = list(job_context["samples"].keys())[0]
    annotation = ComputationalResultAnnotation()
    annotation.result = result

    annotation.data = {
        "organism_id": job_context["samples"][organism_key][0].organism_id,
        "organism_name": job_context["organism_name"],
        "is_qn": False,
        "is_compendia": True,
        "samples": [
            sample.accession_code for sample in job_context["samples"][organism_key]
        ],
        "num_samples": len(job_context["samples"][organism_key]),
        "experiment_accessions": [e.accession_code for e in job_context["experiments"]],
        "total_percent_imputed": job_context["total_percent_imputed"],
    }
    annotation.save()

    # Create the resulting archive
    final_zip_base = SMASHING_DIR + str(
        job_context["dataset"].pk) + "_compendia"
    # Copy LICENSE.txt and correct README.md files.
    if job_context["dataset"].quant_sf_only:
        readme_file = "/home/user/README_QUANT.md"
    else:
        readme_file = "/home/user/README_NORMALIZED.md"

    shutil.copy(readme_file, job_context["output_dir"] + "/README.md")
    shutil.copy("/home/user/LICENSE_DATASET.txt",
                job_context["output_dir"] + "/LICENSE.TXT")
    archive_path = shutil.make_archive(final_zip_base, "zip",
                                       job_context["output_dir"])

    archive_computed_file = ComputedFile()
    archive_computed_file.absolute_file_path = archive_path
    archive_computed_file.filename = archive_path.split("/")[-1]
    archive_computed_file.calculate_sha1()
    archive_computed_file.calculate_size()
    archive_computed_file.is_smashable = False
    archive_computed_file.is_qn_target = False
    archive_computed_file.result = result
    archive_computed_file.save()

    # Compendia Result Helpers
    primary_organism = Organism.get_object_for_name(
        job_context["primary_organism"])
    organisms = [
        Organism.get_object_for_name(organism)
        for organism in job_context["all_organisms"]
    ]
    compendium_version = (CompendiumResult.objects.filter(
        primary_organism=primary_organism, quant_sf_only=False).count() + 1)
    # Save Compendia Result
    compendium_result = CompendiumResult()
    compendium_result.quant_sf_only = job_context["dataset"].quant_sf_only
    compendium_result.svd_algorithm = job_context["dataset"].svd_algorithm
    compendium_result.compendium_version = compendium_version
    compendium_result.result = result
    compendium_result.primary_organism = primary_organism
    compendium_result.save()

    # create relations to all organisms contained in the compendia

    compendium_result_organism_associations = []
    for compendium_organism in organisms:
        association = CompendiumResultOrganismAssociation()
        association.compendium_result = compendium_result
        association.organism = compendium_organism
        compendium_result_organism_associations.append(association)

    CompendiumResultOrganismAssociation.objects.bulk_create(
        compendium_result_organism_associations)

    job_context["compendium_result"] = compendium_result

    logger.info("Compendium created!",
                archive_path=archive_path,
                organism_name=job_context["organism_name"])

    # Upload the result to S3
    timestamp = str(int(time.time()))
    key = job_context["organism_name"] + "_" + str(
        compendium_version) + "_" + timestamp + ".zip"
    uploaded_to_s3 = archive_computed_file.sync_to_s3(S3_COMPENDIA_BUCKET_NAME,
                                                      key)

    if not uploaded_to_s3:
        raise utils.ProcessorJobError(
            "Failed to upload compendia to S3",
            success=False,
            computed_file_id=archive_computed_file.id,
        )

    if settings.RUNNING_IN_CLOUD:
        archive_computed_file.delete_local_file()

    job_context["result"] = result
    job_context["success"] = True

    log_state("end create result object", job_context["job"].id, result_start)

    # TEMPORARY for iterating on compendia more quickly.
    # Reset this so the end_job does clean up the job's non-input-data stuff.
    job_context["work_dir"] = job_context["old_work_dir"]

    return job_context
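# A minimal, self-contained sketch of the archive step above: copy metadata
# files into the output directory, then zip that directory with
# shutil.make_archive. The temp paths and file contents here are placeholders,
# not the real compendium README/LICENSE files.
import os
import shutil
import tempfile

output_dir = tempfile.mkdtemp()
with open(os.path.join(output_dir, "README.md"), "w") as readme:
    readme.write("placeholder readme\n")
with open(os.path.join(output_dir, "LICENSE.TXT"), "w") as license_file:
    license_file.write("placeholder license\n")

final_zip_base = os.path.join(tempfile.mkdtemp(), "example_compendia")
archive_path = shutil.make_archive(final_zip_base, "zip", output_dir)
print(archive_path)  # .../example_compendia.zip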
Example No. 13
    @classmethod
    def setUpClass(cls):
        """Set up the class-level test fixtures."""
        # ref https://stackoverflow.com/a/29655301/763705
        super(ESTestCases, cls).setUpClass()
        experiment = Experiment()
        experiment.accession_code = "GSE000-X"
        experiment.title = "NONONONO"
        experiment.description = "Boooooourns. Wasabi."
        experiment.technology = "RNA-SEQ"
        experiment.save()

        experiment = Experiment()
        experiment.accession_code = "GSE123-X"
        experiment.title = "Hey Ho Let's Go"
        experiment.description = (
            "This is a very exciting test experiment. Faygo soda. Blah blah blah."
        )
        experiment.technology = "MICROARRAY"
        experiment.num_processed_samples = 1  # added below
        experiment.num_total_samples = 1
        experiment.num_downloadable_samples = 1
        experiment.save()

        experiment_annotation = ExperimentAnnotation()
        experiment_annotation.data = {"hello": "world", "123": 456}
        experiment_annotation.experiment = experiment
        experiment_annotation.save()

        sample = Sample()
        sample.title = "123"
        sample.accession_code = "123"
        sample.save()

        organism = Organism(
            name=ECOLI_STRAIN_NAME,
            taxonomy_id=879462,
            is_scientific_name=True,
        )
        organism.save()

        sample = Sample()
        sample.title = "789"
        sample.accession_code = "789"
        sample.is_processed = True
        sample.organism = organism
        sample.save()

        sample_annotation = SampleAnnotation()
        sample_annotation.data = {"goodbye": "world", "789": 123}
        sample_annotation.sample = sample
        sample_annotation.save()

        original_file = OriginalFile()
        original_file.save()

        original_file_sample_association = OriginalFileSampleAssociation()
        original_file_sample_association.sample = sample
        original_file_sample_association.original_file = original_file
        original_file_sample_association.save()

        downloader_job = DownloaderJob()
        downloader_job.save()

        download_assoc = DownloaderJobOriginalFileAssociation()
        download_assoc.original_file = original_file
        download_assoc.downloader_job = downloader_job
        download_assoc.save()

        processor_job = ProcessorJob(downloader_job=downloader_job)
        processor_job.save()

        processor_assoc = ProcessorJobOriginalFileAssociation()
        processor_assoc.original_file = original_file
        processor_assoc.processor_job = processor_job
        processor_assoc.save()

        # associate the experiment with the sample
        experiment_sample_association = ExperimentSampleAssociation()
        experiment_sample_association.sample = sample
        experiment_sample_association.experiment = experiment
        experiment_sample_association.save()

        result = ComputationalResult()
        result.save()

        # and create a QN target for the sample
        computational_result = ComputationalResultAnnotation()
        computational_result.result = result
        computational_result.data = {
            "is_qn": True,
            "organism_id": sample.organism.id
        }
        computational_result.save()

        # and associate it with the sample organism
        sample.organism.qn_target = result
        sample.organism.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        result = ComputationalResult()
        result.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        # Clear the default cache and reindex;
        # otherwise the organisms with qn_targets will stay cached.
        cache.clear()
        call_command("search_index", "--rebuild", "-f")
Example No. 14
def run_tximport_at_progress_point(complete_accessions: List[str], incomplete_accessions: List[str]) -> Dict:
    """Create an experiment and associated objects and run tximport on it.

    Creates a sample for each accession contained in either input
    list. The samples in complete_accessions will be simulated as
    already having salmon quant run on them. The samples in
    incomplete_accessions won't.
    """
    # Create the experiment
    experiment_accession = 'SRP095529'
    data_dir = '/home/user/data_store/salmon_tests/'
    experiment_dir = data_dir + experiment_accession
    experiment = Experiment.objects.create(accession_code=experiment_accession)

    zebrafish = Organism.get_object_for_name("DANIO_RERIO")

    # This is a lie, but this image doesn't have the dependencies for TRANSCRIPTOME_INDEX
    computational_result_short = ComputationalResult(processor=utils.find_processor('SALMON_QUANT'))
    computational_result_short.save()

    organism_index = OrganismIndex()
    organism_index.index_type = "TRANSCRIPTOME_SHORT"
    organism_index.organism = zebrafish
    organism_index.result = computational_result_short
    organism_index.absolute_directory_path = "/home/user/data_store/salmon_tests/ZEBRAFISH_INDEX/SHORT"
    organism_index.save()

    comp_file = ComputedFile()
    # This path will not be used because we already have the files extracted.
    comp_file.absolute_file_path = "/home/user/data_store/salmon_tests/ZEBRAFISH_INDEX/SHORT/zebrafish_short.tar.gz"
    comp_file.result = computational_result_short
    comp_file.size_in_bytes = 1337
    comp_file.sha1 = "ABC"
    comp_file.s3_key = "key"
    comp_file.s3_bucket = "bucket"
    comp_file.save()

    for accession_code in incomplete_accessions:
        last_sample = Sample.objects.create(
            accession_code=accession_code,
            organism=zebrafish,
            source_database='SRA',
            technology='RNA-SEQ'
        )
        ExperimentSampleAssociation.objects.create(experiment=experiment, sample=last_sample)

    # Create tximport result and files
    quant_processor = utils.find_processor("SALMON_QUANT")
    tximport_processor = utils.find_processor("TXIMPORT")

    # Create the already processed samples along with their
    # ComputationalResults and ComputedFiles. They don't need
    # original files for this test because we aren't going to run
    # salmon quant on them.
    for accession_code in complete_accessions:
        sample = Sample.objects.create(
            accession_code=accession_code,
            organism=zebrafish,
            source_database='SRA',
            technology='RNA-SEQ'
        )
        ExperimentSampleAssociation.objects.create(experiment=experiment, sample=sample)

        if accession_code == "SRR5125622":
            current_sample = sample

        # Create and associate quant result and files.
        quant_result = ComputationalResult()
        quant_result.is_ccdl = True
        quant_result.processor = quant_processor
        quant_result.save()

        # In prod the filename pattern will involve the timestamp
        # but here we're using the accession code so we can find
        # the archive file for the current sample.
        archive_filename = "result-" + accession_code + ".tar.gz"
        archive_file = ComputedFile()
        archive_file.filename = archive_filename
        archive_file.absolute_file_path = os.path.join(experiment_dir, archive_filename)
        archive_file.is_public = False
        archive_file.is_smashable = False
        archive_file.is_qc = False
        archive_file.result = quant_result
        archive_file.size_in_bytes = 12345
        archive_file.save()

        quant_file = ComputedFile()
        quant_file.filename = "quant.sf"
        quant_file.absolute_file_path = experiment_dir + "/quant_files/" + accession_code + "_output/quant.sf"
        quant_file.is_public = False
        quant_file.is_smashable = False
        quant_file.is_qc = False
        quant_file.result = quant_result
        quant_file.size_in_bytes = 12345
        quant_file.s3_bucket = "bucket"
        quant_file.s3_key = "key"
        quant_file.save()

        SampleResultAssociation.objects.get_or_create(
            sample=sample,
            result=quant_result
        )

    # Processor jobs need at least one original file associated with
    # them so they know what they're processing.
    current_og = OriginalFile()
    current_og.absolute_file_path = os.path.join(experiment_dir, 'SRR5125622.fastq.gz')
    current_og.filename = "SRR5125622.fastq.gz"
    current_og.save()

    OriginalFileSampleAssociation.objects.create(original_file=current_og, sample=current_sample).save()

    pj = ProcessorJob()
    pj.pipeline_applied = "TXIMPORT"
    pj.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = current_og
    assoc1.processor_job = pj
    assoc1.save()

    # Prep our job context
    job_context = tximport._prepare_files({"job_dir_prefix": "TEST3",
                                           "job_id": "TEST3",
                                           "job": pj,
                                           "index_directory": organism_index.absolute_directory_path,
                                           "pipeline": Pipeline(name="Salmon"),
                                           "computed_files": [],
                                           "original_files": [current_og]})

    # We don't have the raw file to run _determine_index_length so
    # just pick one, it doesn't matter that much because we aren't
    # checking the output data.
    job_context["index_length"] = "short"
    job_context = salmon._find_or_download_index(job_context)

    job_context = salmon.get_tximport_inputs(job_context)
    job_context = salmon.tximport(job_context)

    return job_context
Example No. 15
def _run_tximport_for_experiment(
        job_context: Dict,
        experiment: Experiment,
        quant_files: List[ComputedFile]) -> Dict:

    # Download all the quant.sf files for this experiment. Write all of
    # their paths to a file so we can pass that path to tximport.R rather
    # than having to pass in one argument per sample.
    tximport_path_list_file = job_context["work_dir"] + "tximport_inputs.txt"
    quant_file_paths = {}
    with open(tximport_path_list_file, "w") as input_list:
        for quant_file in quant_files:
            # We create a directory in the work directory for each quant.sf file, as
            # tximport assigns column names based on the parent directory name,
            # and we need those names so that we can reassociate with the samples later.
            # e.g., a file with absolute_file_path /processor_job_1/SRR123_output/quant.sf
            # downloads to /processor_job_2/SRR123_output/quant.sf,
            # so the result file has the frame "SRR123_output", which we can associate with sample SRR123.
            sample_output = job_context["work_dir"] + str(quant_file.absolute_file_path.split('/')[-2]) + "/"
            os.makedirs(sample_output, exist_ok=True)
            quant_work_path = sample_output + quant_file.filename
            quant_file_path = quant_file.get_synced_file_path(path=quant_work_path)
            input_list.write(quant_file_path + "\n")
            quant_file_paths[quant_file_path] = os.stat(quant_file_path).st_size


    rds_filename = "txi_out.RDS"
    rds_file_path = job_context["work_dir"] + rds_filename
    tpm_filename = "gene_lengthScaledTPM.tsv"
    tpm_file_path = job_context["work_dir"] + tpm_filename
    result = ComputationalResult()
    cmd_tokens = [
        "/usr/bin/Rscript", "--vanilla",
        "/home/user/data_refinery_workers/processors/tximport.R",
        "--file_list", tximport_path_list_file,
        "--gene2txmap", job_context["genes_to_transcripts_path"],
        "--rds_file", rds_file_path,
        "--tpm_file", tpm_file_path
    ]
    result.time_start = timezone.now()

    logger.debug("Running tximport with: %s",
                 str(cmd_tokens),
                 processor_job=job_context['job_id'],
                 experiment=experiment.id)

    try:
        tximport_result = subprocess.run(cmd_tokens, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except Exception as e:
        error_template = ("Encountered error in R code while running tximport.R: {}")
        error_message = error_template.format(str(e))
        logger.error(error_message, processor_job=job_context["job_id"], experiment=experiment.id)
        job_context["job"].failure_reason = error_message
        job_context["success"] = False
        return job_context

    if tximport_result.returncode != 0:
        error_template = ("Found non-zero exit code from R code while running tximport.R: {}")
        error_message = error_template.format(tximport_result.stderr.decode().strip())
        logger.error(error_message,
            processor_job=job_context["job_id"],
            experiment=experiment.id,
            quant_files=quant_files,
            cmd_tokens=cmd_tokens,
            quant_file_paths=quant_file_paths,
            )
        job_context["job"].failure_reason = error_message
        job_context["success"] = False
        return job_context

    result.time_end = timezone.now()
    result.commands.append(" ".join(cmd_tokens))
    result.is_ccdl = True
    try:
        processor_key = "TXIMPORT"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context['pipeline'].steps.append(result.id)

    # Associate this result with all samples in this experiment.
    # TODO: This may not be completely sensible, because `tximport` is
    # done at experiment level, not at sample level.
    # Could be very problematic if SRA's data model allows many
    # Experiments to one Run.
    # https://github.com/AlexsLemonade/refinebio/issues/297
    for sample in experiment.samples.all():
        s_r = SampleResultAssociation(sample=sample, result=result)
        s_r.save()

    rds_file = ComputedFile()
    rds_file.absolute_file_path = rds_file_path
    rds_file.filename = rds_filename
    rds_file.result = result
    rds_file.is_smashable = False
    rds_file.is_qc = False
    rds_file.is_public = True
    rds_file.calculate_sha1()
    rds_file.calculate_size()
    rds_file.save()
    job_context['computed_files'].append(rds_file)

    # Split the tximport result into smashable subfiles
    data = pd.read_csv(tpm_file_path, sep='\t', header=0, index_col=0)
    individual_files = []
    frames = np.split(data, len(data.columns), axis=1)
    for frame in frames:
        # Create sample-specific TPM file.
        sample_file_name = frame.columns.values[0] + '_' + tpm_filename
        frame_path = os.path.join(job_context["work_dir"], sample_file_name)
        frame.to_csv(frame_path, sep='\t', encoding='utf-8')

        # The frame column header is based off of the path, which includes _output.
        sample = Sample.objects.get(accession_code=frame.columns.values[0].replace("_output", ""))

        computed_file = ComputedFile()
        computed_file.absolute_file_path = frame_path
        computed_file.filename = sample_file_name
        computed_file.result = result
        computed_file.is_smashable = True
        computed_file.is_qc = False
        computed_file.is_public = True
        computed_file.calculate_sha1()
        computed_file.calculate_size()
        computed_file.save()
        job_context['computed_files'].append(computed_file)
        job_context['smashable_files'].append(computed_file)

        SampleResultAssociation.objects.get_or_create(
            sample=sample,
            result=result)

        # Create association with the RDS file.
        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample,
            computed_file=rds_file)

        # Create association with TPM file.
        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample,
            computed_file=computed_file)

        individual_files.append(computed_file)
        job_context['samples'].append(sample)

    # Salmon-processed samples aren't marked as is_processed
    # until they are fully tximported, this value sets that
    # for the end_job function.
    job_context['tximported'] = True
    job_context['individual_files'] = individual_files
    return job_context
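# A self-contained sketch of the subprocess pattern used above to run
# tximport.R and report failures. The command here is a stand-in Python
# invocation rather than the real Rscript call.
import subprocess
import sys

cmd_tokens = [sys.executable, "-c", "import sys; sys.stderr.write('boom'); sys.exit(1)"]
completed = subprocess.run(cmd_tokens, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if completed.returncode != 0:
    # Mirror the handling above: surface stderr as the failure reason.
    failure_reason = "Found non-zero exit code: {}".format(completed.stderr.decode().strip())
    print(failure_reason)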
Example No. 16
    def test_qn_reference(self):
        job = ProcessorJob()
        job.pipeline_applied = "QN_REFERENCE"
        job.save()

        homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

        experiment = Experiment()
        experiment.accession_code = "12345"
        experiment.save()

        for code in ['1', '2', '3', '4', '5']:
            sample = Sample()
            sample.accession_code = code
            sample.title = code
            sample.platform_accession_code = 'A-MEXP-1171'
            sample.manufacturer = "SLIPPERY DICK'S DISCOUNT MICROARRAYS"
            sample.organism = homo_sapiens
            sample.technology = "MICROARRAY"
            sample.is_processed = True
            sample.save()

            cr = ComputationalResult()
            cr.save()

            file = ComputedFile()
            file.filename = code + ".tsv"
            file.absolute_file_path = "/home/user/data_store/QN/" + code + ".tsv"
            file.size_in_bytes = int(code)
            file.result = cr
            file.is_smashable = True
            file.save()

            scfa = SampleComputedFileAssociation()
            scfa.sample = sample
            scfa.computed_file = file
            scfa.save()

            exsa = ExperimentSampleAssociation()
            exsa.experiment = experiment
            exsa.sample = sample
            exsa.save()

        dataset = Dataset()
        dataset.data = {"12345": ["1", "2", "3", "4", "5"]}
        dataset.aggregate_by = "ALL"
        dataset.scale_by = "NONE"
        dataset.quantile_normalize = False  # We don't QN because we're creating the target now
        dataset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dataset
        pjda.save()

        final_context = qn_reference.create_qn_reference(job.pk)
        self.assertTrue(final_context['success'])
        self.assertTrue(final_context['result_verified'])

        self.assertTrue(os.path.exists(final_context['target_file']))
        self.assertEqual(os.path.getsize(final_context['target_file']), 519)

        target = utils.get_most_recent_qn_target_for_organism(homo_sapiens)
        self.assertEqual(target.sha1,
                         'a38ae13de860e47e0251dd02d1a8e88f576d83ad')

        ###
        # Smasher with QN
        ###

        pj = ProcessorJob()
        pj.pipeline_applied = "SMASHER"
        pj.save()

        ds = Dataset()
        ds.data = {"12345": ["1", "2", "3", "4", "5"]}
        ds.aggregate_by = 'SPECIES'
        ds.scale_by = 'STANDARD'
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = True
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = pj
        pjda.dataset = ds
        pjda.save()

        final_context = smasher.smash(pj.pk, upload=False)
        self.assertTrue(final_context['success'])

        self.assertEqual(final_context['merged_qn']['1'][0],
                         -0.437948852881293)
        self.assertEqual(final_context['original_merged']['1'][0],
                         -0.576210936113982)

        ##
        # Test via management command
        ##

        from django.core.management import call_command
        from django.test import TestCase
        from django.utils.six import StringIO

        out = StringIO()
        try:
            call_command('create_qn_target',
                         organism='homo_sapiens',
                         min=1,
                         stdout=out)
        except SystemExit as e:  # this is okay!
            pass

        stdout = out.getvalue()
        self.assertTrue('Target file' in stdout)
        path = stdout.split('\n')[0].split(':')[1].strip()
        self.assertTrue(os.path.exists(path))
        self.assertEqual(
            path,
            utils.get_most_recent_qn_target_for_organism(
                homo_sapiens).absolute_file_path)
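# A Django-free sketch of the capture-and-parse step above: call_command
# writes into a StringIO via its stdout argument, and the test then pulls the
# path out of the "Target file:" line. Here redirect_stdout and a toy function
# stand in for the real create_qn_target command.
import io
from contextlib import redirect_stdout


def fake_create_qn_target():
    print("Target file: /tmp/example_qn_target.tsv")


out = io.StringIO()
with redirect_stdout(out):
    fake_create_qn_target()

stdout = out.getvalue()
path = stdout.split("\n")[0].split(":")[1].strip()
print(path)  # /tmp/example_qn_target.tsv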
Example No. 17
def setup_experiment(new_version_accessions: List[str],
                     old_version_accessions: List[str]) -> Experiment:
    """Create an experiment where some samples were processed with the newest
    version of Salmon and others with an older one.
    """
    # Create the experiment
    experiment_accession = "SRP095529"
    data_dir = "/home/user/data_store/"
    experiment_dir = data_dir + experiment_accession
    experiment = Experiment.objects.create(accession_code=experiment_accession,
                                           technology="RNA-SEQ")

    zebrafish = Organism.get_object_for_name("DANIO_RERIO")

    # Create the transcriptome processor and result:
    transcriptome_processor = Processor()
    transcriptome_processor.name = "Transcriptome"
    transcriptome_processor.version = "salmon 0.9.1"
    transcriptome_processor.docker_image = "dr_transcriptome"
    transcriptome_processor.environment = '{"some": "environment"}'
    transcriptome_processor.save()
    computational_result_short = ComputationalResult(
        processor=transcriptome_processor)
    computational_result_short.save()

    organism_index = OrganismIndex()
    organism_index.index_type = "TRANSCRIPTOME_SHORT"
    organism_index.organism = zebrafish
    organism_index.result = computational_result_short
    organism_index.absolute_directory_path = "/home/user/data_store/ZEBRAFISH_INDEX/SHORT"
    organism_index.salmon_version = "salmon 0.9.1"
    organism_index.save()

    comp_file = ComputedFile()
    # This path will not be used because we already have the files extracted.
    comp_file.absolute_file_path = (
        "/home/user/data_store/ZEBRAFISH_INDEX/SHORT/zebrafish_short.tar.gz")
    comp_file.result = computational_result_short
    comp_file.size_in_bytes = 1337
    comp_file.sha1 = "ABC"
    comp_file.s3_key = "key"
    comp_file.s3_bucket = "bucket"
    comp_file.save()

    quant_processor = Processor()
    quant_processor.name = "Salmon Quant"
    quant_processor.version = "salmon 0.9.1"
    quant_processor.docker_image = "dr_salmon"
    quant_processor.environment = '{"some": "environment"}'
    quant_processor.save()

    for accession_code in old_version_accessions:
        sample = Sample.objects.create(
            accession_code=accession_code,
            organism=zebrafish,
            source_database="SRA",
            technology="RNA-SEQ",
            platform_accession_code="IlluminaHiSeq1000",
        )
        ExperimentSampleAssociation.objects.create(experiment=experiment,
                                                   sample=sample)

        original_file = OriginalFile()
        original_file.filename = accession_code + ".SRA"
        original_file.source_filename = accession_code + ".SRA"
        original_file.save()

        OriginalFileSampleAssociation.objects.get_or_create(
            original_file=original_file, sample=sample)

        # Create and associate quant result and files.
        quant_result = ComputationalResult()
        quant_result.is_ccdl = True
        quant_result.processor = quant_processor
        quant_result.organism_index = organism_index  # associate with OLD organism index
        quant_result.save()

        kv = ComputationalResultAnnotation()
        kv.data = {"index_length": "short"}
        kv.result = quant_result
        kv.is_public = True
        kv.save()

        # In prod the filename pattern will involve the timestamp
        # but here we're using the accession code so we can find
        # the archive file for the current sample.
        archive_filename = "result-" + accession_code + ".tar.gz"
        archive_file = ComputedFile()
        archive_file.filename = archive_filename
        archive_file.absolute_file_path = os.path.join(experiment_dir,
                                                       archive_filename)
        archive_file.is_public = False
        archive_file.is_smashable = False
        archive_file.is_qc = False
        archive_file.result = quant_result
        archive_file.size_in_bytes = 12345
        archive_file.save()

        quant_file = ComputedFile()
        quant_file.filename = "quant.sf"
        quant_file.absolute_file_path = (experiment_dir + "/quant_files/" +
                                         accession_code + "_output/quant.sf")
        quant_file.is_public = False
        quant_file.is_smashable = False
        quant_file.is_qc = False
        quant_file.result = quant_result
        quant_file.size_in_bytes = 12345
        quant_file.s3_bucket = "bucket"
        quant_file.s3_key = "key"
        quant_file.save()

        SampleResultAssociation.objects.get_or_create(sample=sample,
                                                      result=quant_result)

    # Create another OrganismIndex built with a newer version of Salmon.
    transcriptome_processor = Processor()
    transcriptome_processor.name = "Transcriptome"
    transcriptome_processor.version = "salmon 0.13.1"
    transcriptome_processor.docker_image = "dr_transcriptome"
    transcriptome_processor.environment = '{"some": "environment"}'
    transcriptome_processor.save()
    computational_result_short = ComputationalResult(
        processor=transcriptome_processor)
    computational_result_short.save()

    organism_index = OrganismIndex()
    organism_index.index_type = "TRANSCRIPTOME_SHORT"
    organism_index.organism = zebrafish
    organism_index.result = computational_result_short
    organism_index.absolute_directory_path = "/home/user/data_store/ZEBRAFISH_INDEX/SHORT"
    organism_index.salmon_version = "salmon 0.13.1"  # DIFFERENT SALMON VERSION
    organism_index.save()

    comp_file = ComputedFile()
    # This path will not be used because we already have the files extracted.
    comp_file.absolute_file_path = (
        "/home/user/data_store/ZEBRAFISH_INDEX/SHORT/zebrafish_short.tar.gz")
    comp_file.result = computational_result_short
    comp_file.size_in_bytes = 1337
    comp_file.sha1 = "ABC"
    comp_file.s3_key = "key"
    comp_file.s3_bucket = "bucket"
    comp_file.save()

    for accession_code in new_version_accessions:
        sample = Sample.objects.create(
            accession_code=accession_code,
            organism=zebrafish,
            source_database="SRA",
            technology="RNA-SEQ",
            platform_accession_code="IlluminaHiSeq1000",
        )
        ExperimentSampleAssociation.objects.create(experiment=experiment,
                                                   sample=sample)

        original_file = OriginalFile()
        original_file.filename = accession_code + ".SRA"
        original_file.source_filename = accession_code + ".SRA"
        original_file.save()

        OriginalFileSampleAssociation.objects.get_or_create(
            original_file=original_file, sample=sample)

        # Create and associate quant result and files.
        quant_result = ComputationalResult()
        quant_result.is_ccdl = True
        quant_result.processor = quant_processor
        quant_result.organism_index = organism_index  # NEWER VERSION
        quant_result.save()

        kv = ComputationalResultAnnotation()
        kv.data = {"index_length": "short"}
        kv.result = quant_result
        kv.is_public = True
        kv.save()

        # In prod the filename pattern will involve the timestamp
        # but here we're using the accession code so we can find
        # the archive file for the current sample.
        archive_filename = "result-" + accession_code + ".tar.gz"
        archive_file = ComputedFile()
        archive_file.filename = archive_filename
        archive_file.absolute_file_path = os.path.join(experiment_dir,
                                                       archive_filename)
        archive_file.is_public = False
        archive_file.is_smashable = False
        archive_file.is_qc = False
        archive_file.result = quant_result
        archive_file.size_in_bytes = 12345
        archive_file.save()

        quant_file = ComputedFile()
        quant_file.filename = "quant.sf"
        quant_file.absolute_file_path = (experiment_dir + "/quant_files/" +
                                         accession_code + "_output/quant.sf")
        quant_file.is_public = False
        quant_file.is_smashable = False
        quant_file.is_qc = False
        quant_file.result = quant_result
        quant_file.size_in_bytes = 12345
        quant_file.s3_bucket = "bucket"
        quant_file.s3_key = "key"
        quant_file.save()

        SampleResultAssociation.objects.get_or_create(sample=sample,
                                                      result=quant_result)

    return experiment
Example No. 18
def _run_tximport_for_experiment(job_context: Dict, experiment: Experiment,
                                 quant_files: List[ComputedFile]) -> Dict:

    # Download all the quant.sf files for this experiment. Write all of
    # their paths to a file so we can pass that path to tximport.R rather
    # than having to pass in one argument per sample.
    tximport_path_list_file = job_context["work_dir"] + "tximport_inputs.txt"
    quant_file_paths = {}
    with open(tximport_path_list_file, "w") as input_list:
        for quant_file in quant_files:
            # We create a directory in the work directory for each quant.sf file, as
            # tximport assigns column names based on the parent directory name,
            # and we need those names so that we can reassociate with the samples later.
            # e.g., a file with absolute_file_path /processor_job_1/SRR123_output/quant.sf
            # downloads to /processor_job_2/SRR123_output/quant.sf,
            # so the result file has the frame "SRR123_output",
            # which we can associate with sample SRR123.
            sample_output = (
                job_context["work_dir"] +
                str(quant_file.absolute_file_path.split("/")[-2]) + "/")
            os.makedirs(sample_output, exist_ok=True)
            quant_work_path = sample_output + quant_file.filename
            quant_file_path = quant_file.get_synced_file_path(
                path=quant_work_path)
            input_list.write(quant_file_path + "\n")
            quant_file_paths[quant_file_path] = os.stat(
                quant_file_path).st_size

    rds_filename = "txi_out.RDS"
    rds_file_path = job_context["work_dir"] + rds_filename
    tpm_filename = "gene_lengthScaledTPM.tsv"
    tpm_file_path = job_context["work_dir"] + tpm_filename
    result = ComputationalResult()
    cmd_tokens = [
        "/usr/bin/Rscript",
        "--vanilla",
        "/home/user/data_refinery_workers/processors/tximport.R",
        "--file_list",
        tximport_path_list_file,
        "--gene2txmap",
        job_context["genes_to_transcripts_path"],
        "--rds_file",
        rds_file_path,
        "--tpm_file",
        tpm_file_path,
    ]
    result.time_start = timezone.now()

    logger.debug(
        "Running tximport with: %s",
        str(cmd_tokens),
        processor_job=job_context["job_id"],
        experiment=experiment.id,
    )

    try:
        tximport_result = subprocess.run(cmd_tokens,
                                         stdout=subprocess.PIPE,
                                         stderr=subprocess.PIPE)
    except Exception as e:
        raise utils.ProcessorJobError(
            "Encountered error in R code while running tximport.R: {}".format(
                str(e)),
            success=False,
            experiment=experiment.id,
        )

    if tximport_result.returncode != 0:
        raise utils.ProcessorJobError(
            "Found non-zero exit code from R code while running tximport.R: {}"
            .format(tximport_result.stderr.decode().strip()),
            success=False,
            experiment=experiment.id,
            quant_files=quant_files,
            cmd_tokens=cmd_tokens,
            quant_file_paths=quant_file_paths,
        )

    result.time_end = timezone.now()
    result.commands.append(" ".join(cmd_tokens))
    result.is_ccdl = True
    try:
        processor_key = "TXIMPORT"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        raise utils.ProcessorJobError("Failed to set processor: {}".format(e),
                                      success=False,
                                      processor_key=processor_key)

    result.save()
    job_context["pipeline"].steps.append(result.id)

    rds_file = ComputedFile()
    rds_file.absolute_file_path = rds_file_path
    rds_file.filename = rds_filename
    rds_file.result = result
    rds_file.is_smashable = False
    rds_file.is_qc = False
    rds_file.is_public = True
    rds_file.calculate_sha1()
    rds_file.calculate_size()
    rds_file.save()
    job_context["computed_files"].append(rds_file)

    # Split the tximport result into smashable subfiles
    data = pd.read_csv(tpm_file_path, sep="\t", header=0, index_col=0)
    individual_files = []
    frames = np.split(data, len(data.columns), axis=1)
    for frame in frames:
        # Create sample-specific TPM file.
        sample_file_name = frame.columns.values[0] + "_" + tpm_filename
        frame_path = os.path.join(job_context["work_dir"], sample_file_name)
        frame.to_csv(frame_path, sep="\t", encoding="utf-8")

        # The frame column header is based off of the path, which includes _output.
        sample_accession_code = frame.columns.values[0].replace("_output", "")
        sample = Sample.objects.get(accession_code=sample_accession_code)

        computed_file = ComputedFile()
        computed_file.absolute_file_path = frame_path
        computed_file.filename = sample_file_name
        computed_file.result = result
        computed_file.is_smashable = True
        computed_file.is_qc = False
        computed_file.is_public = True
        computed_file.calculate_sha1()
        computed_file.calculate_size()
        computed_file.save()
        job_context["computed_files"].append(computed_file)
        job_context["smashable_files"].append(computed_file)

        SampleResultAssociation.objects.get_or_create(sample=sample,
                                                      result=result)

        # Create association with the RDS file.
        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=rds_file)

        # Create association with TPM file.
        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=computed_file)

        individual_files.append(computed_file)
        job_context["samples"].append(sample)

    # Salmon-processed samples aren't marked as is_processed
    # until they are fully tximported, this value sets that
    # for the end_job function.
    job_context["tximported"] = True
    job_context["individual_files"] = individual_files
    return job_context
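# A toy, self-contained sketch of the per-sample split performed above with
# np.split: each one-column frame keeps the full gene index, and its column
# header maps back to a sample accession code by stripping "_output". The
# accession codes and values are made up.
import numpy as np
import pandas as pd

data = pd.DataFrame(
    {"SRR0001_output": [1.0, 2.0], "SRR0002_output": [3.0, 4.0]},
    index=["GENE_A", "GENE_B"],
)
for frame in np.split(data, len(data.columns), axis=1):
    accession_code = frame.columns.values[0].replace("_output", "")
    print(accession_code, list(frame.iloc[:, 0]))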
Example No. 19
def run_tximport_at_progress_point(complete_accessions: List[str],
                                   incomplete_accessions: List[str]) -> Dict:
    """Create an experiment and associated objects and run tximport on it.

    Creates a sample for each accession contained in either input
    list. The samples in complete_accessions will be simulated as
    already having salmon quant run on them. The samples in
    incomplete_accessions won't.
    """
    # Create the experiment
    experiment_accession = "SRP095529"
    data_dir = "/home/user/data_store/"
    experiment_dir = data_dir + experiment_accession
    experiment = Experiment.objects.create(accession_code=experiment_accession,
                                           technology="RNA-SEQ")

    zebrafish = Organism.get_object_for_name("DANIO_RERIO")

    ExperimentOrganismAssociation.objects.get_or_create(experiment=experiment,
                                                        organism=zebrafish)

    # Create the transcriptome processor and result:
    transcriptome_processor = Processor()
    transcriptome_processor.name = "Transcriptome"
    transcriptome_processor.version = "salmon 0.13.1"
    transcriptome_processor.docker_image = "dr_transcriptome"
    transcriptome_processor.environment = '{"some": "environment"}'
    transcriptome_processor.save()
    computational_result_short = ComputationalResult(
        processor=transcriptome_processor)
    computational_result_short.save()

    organism_index = OrganismIndex()
    organism_index.index_type = "TRANSCRIPTOME_SHORT"
    organism_index.organism = zebrafish
    organism_index.result = computational_result_short
    organism_index.absolute_directory_path = "/home/user/data_store/ZEBRAFISH_INDEX/SHORT"
    organism_index.salmon_version = "salmon 0.13.1"
    organism_index.save()

    comp_file = ComputedFile()
    # This path will not be used because we already have the files extracted.
    comp_file.absolute_file_path = (
        "/home/user/data_store/ZEBRAFISH_INDEX/SHORT/zebrafish_short.tar.gz")
    comp_file.result = computational_result_short
    comp_file.size_in_bytes = 1337
    comp_file.sha1 = "ABC"
    comp_file.s3_key = "key"
    comp_file.s3_bucket = "bucket"
    comp_file.save()

    for accession_code in incomplete_accessions:
        sample = Sample.objects.create(
            accession_code=accession_code,
            organism=zebrafish,
            source_database="SRA",
            technology="RNA-SEQ",
        )
        ExperimentSampleAssociation.objects.create(experiment=experiment,
                                                   sample=sample)

        original_file = OriginalFile()
        original_file.filename = accession_code + ".SRA"
        original_file.source_filename = accession_code + ".SRA"
        original_file.save()

        OriginalFileSampleAssociation.objects.get_or_create(
            original_file=original_file, sample=sample)

    quant_processor = Processor()
    quant_processor.name = "Salmon Quant"
    quant_processor.version = "salmon 0.13.1"
    quant_processor.docker_image = "dr_salmon"
    quant_processor.environment = '{"some": "environment"}'
    quant_processor.save()
    tximport_processor = Processor()
    tximport_processor.name = "Tximport"
    tximport_processor.version = "salmon 0.13.1"
    tximport_processor.docker_image = "dr_salmon"
    tximport_processor.environment = '{"some": "environment"}'
    tximport_processor.save()

    # Create the already processed samples along with their
    # ComputationalResults and ComputedFiles. They don't need
    # original files for this test because we aren't going to run
    # salmon quant on them.
    for accession_code in complete_accessions:
        sample = Sample.objects.create(
            accession_code=accession_code,
            organism=zebrafish,
            source_database="SRA",
            technology="RNA-SEQ",
        )
        ExperimentSampleAssociation.objects.create(experiment=experiment,
                                                   sample=sample)

        original_file = OriginalFile()
        original_file.filename = accession_code + ".SRA"
        original_file.source_filename = accession_code + ".SRA"
        original_file.save()

        OriginalFileSampleAssociation.objects.get_or_create(
            original_file=original_file, sample=sample)

        # Create and associate quant result and files.
        quant_result = ComputationalResult()
        quant_result.is_ccdl = True
        quant_result.processor = quant_processor
        quant_result.organism_index = organism_index
        quant_result.save()

        kv = ComputationalResultAnnotation()
        kv.data = {"index_length": "short"}
        kv.result = quant_result
        kv.is_public = True
        kv.save()

        # In prod the filename pattern will involve the timestamp
        # but here we're using the accession code so we can find
        # the archive file for the current sample.
        archive_filename = "result-" + accession_code + ".tar.gz"
        archive_file = ComputedFile()
        archive_file.filename = archive_filename
        archive_file.absolute_file_path = os.path.join(experiment_dir,
                                                       archive_filename)
        archive_file.is_public = False
        archive_file.is_smashable = False
        archive_file.is_qc = False
        archive_file.result = quant_result
        archive_file.size_in_bytes = 12345
        archive_file.save()

        quant_file = ComputedFile()
        quant_file.filename = "quant.sf"
        quant_file.absolute_file_path = (experiment_dir + "/quant_files/" +
                                         accession_code + "_output/quant.sf")
        quant_file.is_public = False
        quant_file.is_smashable = False
        quant_file.is_qc = False
        quant_file.result = quant_result
        quant_file.size_in_bytes = 12345
        quant_file.s3_bucket = "bucket"
        quant_file.s3_key = "key"
        quant_file.save()

        SampleResultAssociation.objects.get_or_create(sample=sample,
                                                      result=quant_result)

    # Setup is done, actually run the command.
    run_tximport.run_tximport()
Example No. 20
def _populate_index_object(job_context: Dict) -> Dict:
    """ """

    result = ComputationalResult()
    result.commands.append(job_context["salmon_formatted_command"])
    try:
        processor_key = "TX_INDEX"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.is_ccdl = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]
    result.save()
    job_context['pipeline'].steps.append(result.id)

    computed_file = ComputedFile()
    computed_file.absolute_file_path = job_context["computed_archive"]
    computed_file.filename = os.path.split(job_context["computed_archive"])[-1]
    computed_file.calculate_sha1()
    computed_file.calculate_size()
    computed_file.result = result
    computed_file.is_smashable = False
    computed_file.is_qc = False
    computed_file.save()

    organism_object = Organism.get_object_for_name(
        job_context['organism_name'])
    index_object = OrganismIndex()
    index_object.organism = organism_object
    index_object.source_version = job_context["assembly_version"]
    index_object.assembly_name = job_context["assembly_name"]
    index_object.salmon_version = job_context["salmon_version"]
    index_object.index_type = "TRANSCRIPTOME_" + job_context['length'].upper()
    # This is where the index will be extracted to.
    index_object.absolute_directory_path = LOCAL_ROOT_DIR + "/TRANSCRIPTOME_INDEX/" \
                                           + organism_object.name + "/" + job_context['length']
    index_object.result = result

    if S3_TRANSCRIPTOME_INDEX_BUCKET_NAME:
        logger.info("Uploading %s %s to s3",
                    job_context['organism_name'],
                    job_context['length'],
                    processor_job=job_context["job_id"])
        timestamp = str(timezone.now().timestamp()).split('.')[0]
        s3_key = organism_object.name + '_' + index_object.index_type + "_" + timestamp + '.tar.gz'
        sync_result = computed_file.sync_to_s3(
            S3_TRANSCRIPTOME_INDEX_BUCKET_NAME, s3_key)
        if sync_result:
            computed_file.delete_local_file()
    else:
        logger.warn(
            "S3_TRANSCRIPTOME_INDEX_BUCKET_NAME not configured, therefore %s %s will not be uploaded.",
            job_context['organism_name'],
            job_context['length'],
            processor_job=job_context["job_id"])

    index_object.save()

    # We uploaded the file ourselves since we wanted it to go to a
    # different bucket than end_job would put it in, therefore empty
    # this list so end_job doesn't try to upload it again.
    job_context['computed_files'] = []

    job_context['result'] = result
    job_context['computed_file'] = computed_file
    job_context['index'] = index_object

    # If there's not a long and a short index for this organism yet,
    # don't delete the input.
    # XXX: This will break once we introduce additional versions of these.
    short_indices = OrganismIndex.objects.filter(
        organism=organism_object,
        index_type="TRANSCRIPTOME_SHORT",
        source_version=job_context["assembly_version"])
    long_indices = OrganismIndex.objects.filter(
        organism=organism_object,
        index_type="TRANSCRIPTOME_LONG",
        source_version=job_context["assembly_version"])
    if short_indices.count() < 1 or long_indices.count() < 1:
        # utils.end_job deletes these, so remove them so it doesn't.
        job_context["original_files"] = []

    return job_context
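# A hedged sketch of the timestamped S3 key assembled above for transcriptome
# indices; the organism name and index type are placeholder values, and stdlib
# datetime stands in for django.utils.timezone.
from datetime import datetime, timezone

timestamp = str(datetime.now(timezone.utc).timestamp()).split(".")[0]
s3_key = "DANIO_RERIO" + "_" + "TRANSCRIPTOME_SHORT" + "_" + timestamp + ".tar.gz"
print(s3_key)  # e.g. DANIO_RERIO_TRANSCRIPTOME_SHORT_1589000000.tar.gz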
Example No. 21
    def test_create_quantpendia(self):
        job = ProcessorJob()
        job.pipeline_applied = ProcessorPipeline.CREATE_QUANTPENDIA.value
        job.save()

        experiment = Experiment()
        experiment.accession_code = "GSE51088"
        experiment.save()

        result = ComputationalResult()
        result.save()

        homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS",
                                                    taxonomy_id=9606)

        sample = Sample()
        sample.accession_code = "GSM1237818"
        sample.title = "GSM1237818"
        sample.organism = homo_sapiens
        sample.technology = "RNA-SEQ"
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        # Add a second non-downloadable sample. This one should not be included
        # in the count of samples available in the metadata
        sample2 = Sample()
        sample2.accession_code = "GSM1237819"
        sample2.title = "GSM1237819"
        sample2.organism = homo_sapiens
        sample2.technology = "RNA-SEQ"
        sample2.save()

        esa2 = ExperimentSampleAssociation()
        esa2.experiment = experiment
        esa2.sample = sample2
        esa2.save()

        computed_file = ComputedFile()
        computed_file.s3_key = "smasher-test-quant.sf"
        computed_file.s3_bucket = "data-refinery-test-assets"
        computed_file.filename = "quant.sf"
        computed_file.absolute_file_path = "/home/user/data_store/QUANT/smasher-test-quant.sf"
        computed_file.result = result
        computed_file.is_smashable = True
        computed_file.size_in_bytes = 123123
        computed_file.sha1 = (
            "08c7ea90b66b52f7cd9d9a569717a1f5f3874967"  # this matches with the downloaded file
        )
        computed_file.save()

        computed_file = ComputedFile()
        computed_file.filename = "logquant.tsv"
        computed_file.is_smashable = True
        computed_file.size_in_bytes = 123123
        computed_file.result = result
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        ds = Dataset()
        ds.data = {"GSE51088": ["GSM1237818", "GSM1237819"]}
        ds.aggregate_by = "EXPERIMENT"
        ds.scale_by = "STANDARD"
        ds.email_address = "*****@*****.**"
        ds.quant_sf_only = True  # Make the dataset include quant.sf files only
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = ds
        pjda.save()

        final_context = create_quantpendia(job.id)

        self.assertTrue(final_context["success"])
        self.assertTrue(
            os.path.exists(final_context["output_dir"] +
                           "/GSE51088/GSM1237818_quant.sf"))
        self.assertTrue(
            os.path.exists(final_context["output_dir"] + "/README.md"))
        self.assertTrue(
            os.path.exists(final_context["output_dir"] + "/LICENSE.TXT"))
        self.assertTrue(
            os.path.exists(final_context["output_dir"] +
                           "/aggregated_metadata.json"))

        # test that archive exists
        quantpendia_file = ComputedFile.objects.filter(
            is_compendia=True, quant_sf_only=True).latest()
        self.assertTrue(os.path.exists(quantpendia_file.absolute_file_path))

        zf = zipfile.ZipFile(quantpendia_file.absolute_file_path)
        with zf.open("aggregated_metadata.json") as f:
            metadata = json.load(f)

            self.assertTrue(metadata.get("quant_sf_only"))
            self.assertEqual(metadata.get("compendium_version"), 1)
            self.assertEqual(metadata.get("num_samples"), 1)
            self.assertEqual(metadata.get("num_experiments"), 1)

            # Make sure the data were not quantile normalized
            self.assertFalse(metadata.get("quantile_normalized"))