Ejemplo n.º 1
0
def make_test_data(organism):
    experiment = Experiment()
    experiment.accession_code = "GSE51088"
    experiment.technology = "RNA-SEQ"
    experiment.save()

    xoa = ExperimentOrganismAssociation()
    xoa.experiment = experiment
    xoa.organism = organism
    xoa.save()

    result = ComputationalResult()
    result.save()

    sample = Sample()
    sample.accession_code = "GSM1237818"
    sample.title = "GSM1237818"
    sample.organism = organism
    sample.technology = "RNA-SEQ"
    sample.is_processed = True
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.s3_key = "smasher-test-quant.sf"
    computed_file.s3_bucket = "data-refinery-test-assets"
    computed_file.filename = "quant.sf"
    computed_file.absolute_file_path = "/home/user/data_store/QUANT/smasher-test-quant.sf"
    computed_file.result = result
    computed_file.is_smashable = True
    computed_file.size_in_bytes = 123123
    computed_file.sha1 = (
        "08c7ea90b66b52f7cd9d9a569717a1f5f3874967"  # this matches with the downloaded file
    )
    computed_file.save()

    computed_file = ComputedFile()
    computed_file.filename = "logquant.tsv"
    computed_file.is_smashable = True
    computed_file.size_in_bytes = 123123
    computed_file.result = result
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()
Ejemplo n.º 2
0
def _create_result_objects(job_context: Dict) -> Dict:
    """ Create the ComputationalResult objects after a Scan run is complete """

    result = ComputationalResult()
    result.commands.append("SCAN.UPC::SCAN_TwoColor")
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]
    try:
        processor_key = "AGILENT_TWOCOLOR"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context["pipeline"].steps.append(result.id)

    # Create a ComputedFile for the sample,
    # sync it S3 and save it.
    try:
        computed_file = ComputedFile()
        computed_file.absolute_file_path = job_context["output_file_path"]
        computed_file.filename = os.path.split(
            job_context["output_file_path"])[-1]
        computed_file.calculate_sha1()
        computed_file.calculate_size()
        computed_file.result = result
        computed_file.is_smashable = True
        computed_file.is_qc = False
        computed_file.save()
        job_context["computed_files"].append(computed_file)
    except Exception:
        logger.exception(
            "Exception caught while moving file %s to S3",
            computed_file.filename,
            processor_job=job_context["job_id"],
        )
        failure_reason = "Exception caught while moving file to S3"
        job_context["job"].failure_reason = failure_reason
        job_context["success"] = False
        return job_context

    for sample in job_context["samples"]:
        assoc = SampleResultAssociation()
        assoc.sample = sample
        assoc.result = result
        assoc.save()

        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=computed_file)

    logger.info("Created %s", result)
    job_context["success"] = True

    return job_context
Ejemplo n.º 3
0
    def test_fail(self):
        """ Test our ability to fail """

        result = ComputationalResult()
        result.save()

        sample = Sample()
        sample.accession_code = 'XXX'
        sample.title = 'XXX'
        sample.organism = Organism.get_object_for_name("HOMO_SAPIENS")
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        computed_file = ComputedFile()
        computed_file.filename = "NOT_REAL.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        ds = Dataset()
        ds.data = {'GSE51081': ['XXX']}
        ds.aggregate_by = 'EXPERIMENT'
        ds.scale_by = 'MINMAX'
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = False
        ds.save()
        dsid = ds.id

        job = ProcessorJob()
        job.pipeline_applied = "SMASHER"
        job.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = ds
        pjda.save()

        final_context = smasher.smash(job.pk, upload=False)
        ds = Dataset.objects.get(id=dsid)
        print(ds.failure_reason)
        print(final_context['dataset'].failure_reason)
        self.assertNotEqual(final_context['unsmashable_files'], [])
Ejemplo n.º 4
0
def _create_result_objects(job_context: Dict) -> Dict:
    """ Create the ComputationalResult objects after a Scan run is complete """

    result = ComputationalResult()
    result.commands.append('SCAN.UPC::SCANfast')
    result.is_ccdl = True
    result.is_public = True

    result.time_start = job_context['time_start']
    result.time_end = job_context['time_end']
    try:
        processor_key = "AFFYMETRIX_SCAN"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context['pipeline'].steps.append(result.id)

    # Create a ComputedFile for the sample
    computed_file = ComputedFile()
    computed_file.absolute_file_path = job_context["output_file_path"]
    computed_file.filename = os.path.split(job_context["output_file_path"])[-1]
    computed_file.calculate_sha1()
    computed_file.calculate_size()
    computed_file.result = result
    computed_file.is_smashable = True
    computed_file.is_qc = False
    computed_file.save()
    job_context['computed_files'].append(computed_file)

    for sample in job_context['samples']:
        assoc = SampleResultAssociation()
        assoc.sample = sample
        assoc.result = result
        assoc.save()

        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=computed_file)

    logger.debug("Created %s", result, processor_job=job_context["job_id"])
    job_context["success"] = True

    return job_context
Ejemplo n.º 5
0
def _create_result(job_context: Dict) -> Dict:
    """ Create the actual Result object"""

    # This is a NO-OP, but we make a ComputationalResult regardless.
    result = ComputationalResult()
    result.commands.append(job_context["script_name"])
    result.is_ccdl = True
    try:
        processor_key = "SUBMITTER_PROCESSED"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context["pipeline"].steps.append(result.id)

    # Create a ComputedFile for the computed file,
    # sync it S3 and save it.
    computed_file = ComputedFile()
    computed_file.absolute_file_path = job_context["output_file_path"]
    computed_file.filename = job_context["output_file_path"].split("/")[-1]
    computed_file.calculate_sha1()
    computed_file.calculate_size()
    computed_file.result = result
    computed_file.is_smashable = True
    computed_file.is_qc = False
    computed_file.save()

    # utils.end_job will sync this to S3 for us.
    job_context["computed_files"] = [computed_file]

    for sample in job_context["samples"]:
        assoc = SampleResultAssociation()
        assoc.sample = sample
        assoc.result = result
        assoc.save()

        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=computed_file
        )

    logger.debug("Created %s", result)
    job_context["success"] = True
    return job_context
Ejemplo n.º 6
0
def create_sample_for_experiment(sample_info: Dict,
                                 experiment: Experiment) -> Sample:
    result = ComputationalResult()
    result.save()

    sample = Sample()
    sample.accession_code = sample_info["accession_code"]
    sample.title = sample_info.get("title",
                                   None) or sample_info["accession_code"]
    sample.organism = sample_info["organism"]
    sample.technology = sample_info["technology"]
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    if sample_info.get("filename") is not None:
        computed_file = ComputedFile()
        computed_file.filename = sample_info["filename"]
        computed_file.absolute_file_path = sample_info[
            "data_dir"] + sample_info["filename"]
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

    return sample
Ejemplo n.º 7
0
    def test_no_smash_dupe(self):
        """ """

        job = ProcessorJob()
        job.pipeline_applied = "SMASHER"
        job.save()

        experiment = Experiment()
        experiment.accession_code = "GSE51081"
        experiment.save()

        result = ComputationalResult()
        result.save()

        homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

        sample = Sample()
        sample.accession_code = 'GSM1237810'
        sample.title = 'GSM1237810'
        sample.organism = homo_sapiens
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "GSM1237810_T09-1084.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        result = ComputationalResult()
        result.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        sample = Sample()
        sample.accession_code = 'GSM1237811'
        sample.title = 'GSM1237811'
        sample.organism = homo_sapiens
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        result = ComputationalResult()
        result.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        ds = Dataset()
        ds.data = {'GSE51081': ['GSM1237810', 'GSM1237811']}
        ds.aggregate_by = 'ALL'
        ds.scale_by = 'STANDARD'
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = False
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = ds
        pjda.save()

        final_context = smasher.smash(job.pk, upload=False)

        dsid = ds.id
        ds = Dataset.objects.get(id=dsid)

        self.assertTrue(ds.success)
        for column in final_context['original_merged'].columns:
            self.assertTrue('_x' not in column)
Ejemplo n.º 8
0
    def test_create_quantpendia(self):
        job = ProcessorJob()
        job.pipeline_applied = ProcessorPipeline.CREATE_QUANTPENDIA.value
        job.save()

        experiment = Experiment()
        experiment.accession_code = "GSE51088"
        experiment.save()

        result = ComputationalResult()
        result.save()

        homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS",
                                                    taxonomy_id=9606)

        sample = Sample()
        sample.accession_code = "GSM1237818"
        sample.title = "GSM1237818"
        sample.organism = homo_sapiens
        sample.technology = "RNA-SEQ"
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.s3_key = "smasher-test-quant.sf"
        computed_file.s3_bucket = "data-refinery-test-assets"
        computed_file.filename = "quant.sf"
        computed_file.absolute_file_path = "/home/user/data_store/QUANT/smasher-test-quant.sf"
        computed_file.result = result
        computed_file.is_smashable = True
        computed_file.size_in_bytes = 123123
        computed_file.sha1 = (
            "08c7ea90b66b52f7cd9d9a569717a1f5f3874967"  # this matches with the downloaded file
        )
        computed_file.save()

        computed_file = ComputedFile()
        computed_file.filename = "logquant.tsv"
        computed_file.is_smashable = True
        computed_file.size_in_bytes = 123123
        computed_file.result = result
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        ds = Dataset()
        ds.data = {"GSE51088": ["GSM1237818"]}
        ds.aggregate_by = "EXPERIMENT"
        ds.scale_by = "STANDARD"
        ds.email_address = "*****@*****.**"
        ds.quant_sf_only = True  # Make the dataset include quant.sf files only
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = ds
        pjda.save()

        final_context = create_quantpendia(job.id)

        self.assertTrue(
            os.path.exists(final_context["output_dir"] +
                           "/GSE51088/GSM1237818_quant.sf"))
        self.assertTrue(
            os.path.exists(final_context["output_dir"] + "/README.md"))
        self.assertTrue(
            os.path.exists(final_context["output_dir"] + "/LICENSE.TXT"))
        self.assertTrue(
            os.path.exists(final_context["output_dir"] +
                           "/aggregated_metadata.json"))

        self.assertTrue(final_context["metadata"]["quant_sf_only"])
        self.assertEqual(final_context["metadata"]["num_samples"], 1)
        self.assertEqual(final_context["metadata"]["num_experiments"], 1)

        # test that archive exists
        quantpendia_file = ComputedFile.objects.filter(
            is_compendia=True, quant_sf_only=True).latest()
        self.assertTrue(os.path.exists(quantpendia_file.absolute_file_path))
Ejemplo n.º 9
0
    def test_no_smash_all_diff_species(self):
        """ Smashing together with 'ALL' with different species is a really weird behavior. 
        This test isn't really testing a normal case, just make sure that it's marking the
        unsmashable files.
        """

        job = ProcessorJob()
        job.pipeline_applied = "SMASHER"
        job.save()

        experiment = Experiment()
        experiment.accession_code = "GSE51081"
        experiment.save()

        result = ComputationalResult()
        result.save()

        homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

        sample = Sample()
        sample.accession_code = 'GSM1237810'
        sample.title = 'GSM1237810'
        sample.organism = homo_sapiens
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "GSM1237810_T09-1084.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        result = ComputationalResult()
        result.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        experiment = Experiment()
        experiment.accession_code = "GSE51084"
        experiment.save()

        mus_mus = Organism.get_object_for_name("MUS_MUSCULUS")

        sample = Sample()
        sample.accession_code = 'GSM1238108'
        sample.title = 'GSM1238108'
        sample.organism = homo_sapiens
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "GSM1238108-tbl-1.txt"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        ds = Dataset()
        ds.data = {'GSE51081': ['GSM1237810'], 'GSE51084': ['GSM1238108']}
        ds.aggregate_by = 'ALL'
        ds.scale_by = 'STANDARD'
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = False
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = ds
        pjda.save()

        final_context = smasher.smash(job.pk, upload=False)

        dsid = ds.id
        ds = Dataset.objects.get(id=dsid)
        print(ds.failure_reason)
        print(final_context['dataset'].failure_reason)

        self.assertEqual(final_context['unsmashable_files'], ['GSM1238108'])
Ejemplo n.º 10
0
    def setUpClass(cls):
        super(ESTestCases, cls).setUpClass()  # ref https://stackoverflow.com/a/29655301/763705

        """Set up class."""
        experiment = Experiment()
        experiment.accession_code = "GSE000-X"
        experiment.title = "NONONONO"
        experiment.description = "Boooooourns. Wasabi."
        experiment.technology = "RNA-SEQ"
        experiment.save()

        experiment = Experiment()
        experiment.accession_code = "GSE123-X"
        experiment.title = "Hey Ho Let's Go"
        experiment.description = (
            "This is a very exciting test experiment. Faygo soda. Blah blah blah."
        )
        experiment.technology = "MICROARRAY"
        experiment.num_processed_samples = 1  # added below
        experiment.num_total_samples = 1
        experiment.num_downloadable_samples = 1
        experiment.save()

        experiment_annotation = ExperimentAnnotation()
        experiment_annotation.data = {"hello": "world", "123": 456}
        experiment_annotation.experiment = experiment
        experiment_annotation.save()

        sample = Sample()
        sample.title = "123"
        sample.accession_code = "123"
        sample.save()

        organism = Organism(
            name="AILUROPODA_MELANOLEUCA", taxonomy_id=9646, is_scientific_name=True
        )
        organism.save()

        sample = Sample()
        sample.title = "789"
        sample.accession_code = "789"
        sample.is_processed = True
        sample.organism = organism
        sample.save()

        sample_annotation = SampleAnnotation()
        sample_annotation.data = {"goodbye": "world", "789": 123}
        sample_annotation.sample = sample
        sample_annotation.save()

        original_file = OriginalFile()
        original_file.save()

        original_file_sample_association = OriginalFileSampleAssociation()
        original_file_sample_association.sample = sample
        original_file_sample_association.original_file = original_file
        original_file_sample_association.save()

        downloader_job = DownloaderJob()
        downloader_job.save()

        download_assoc = DownloaderJobOriginalFileAssociation()
        download_assoc.original_file = original_file
        download_assoc.downloader_job = downloader_job
        download_assoc.save()

        processor_job = ProcessorJob()
        processor_job.save()

        processor_assoc = ProcessorJobOriginalFileAssociation()
        processor_assoc.original_file = original_file
        processor_assoc.processor_job = processor_job
        processor_assoc.save()

        # associate the experiment with the sample
        experiment_sample_association = ExperimentSampleAssociation()
        experiment_sample_association.sample = sample
        experiment_sample_association.experiment = experiment
        experiment_sample_association.save()

        result = ComputationalResult()
        result.save()

        # and create a qn tarjet for the sample
        computational_result = ComputationalResultAnnotation()
        computational_result.result = result
        computational_result.data = {"is_qn": True, "organism_id": sample.organism.id}
        computational_result.save()

        # and associate it with the sample organism
        sample.organism.qn_target = result
        sample.organism.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        result = ComputationalResult()
        result.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        # clear default cache and reindex
        # otherwise the organisms with qn_targes will be cached.
        cache.clear()
        call_command("search_index", "--rebuild", "-f")
Ejemplo n.º 11
0
    def test_get_results(self):
        """ Test our ability to collect the appropriate samples. """

        sample = Sample()
        sample.accession_code = 'GSM45588'
        sample.save()

        result = ComputationalResult()
        result.save()

        computed_file1 = ComputedFile()
        computed_file1.filename = "oh_boy.txt"
        computed_file1.result = result
        computed_file1.size_in_bytes = 123
        computed_file1.is_smashable = True
        computed_file1.save()

        computed_file2 = ComputedFile()
        computed_file2.filename = "gee_whiz.bmp"
        computed_file2.result = result
        computed_file2.size_in_bytes = 123
        computed_file2.is_smashable = False
        computed_file2.save()

        assoc = SampleResultAssociation()
        assoc.sample = sample
        assoc.result = result
        assoc.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file1
        assoc.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file2
        assoc.save()

        computed_files = sample.get_result_files()
        self.assertEqual(computed_files.count(), 2)
Ejemplo n.º 12
0
    def test_dualtech_smash(self):
        """ """

        pj = ProcessorJob()
        pj.pipeline_applied = "SMASHER"
        pj.save()

        # MICROARRAY TECH
        experiment = Experiment()
        experiment.accession_code = "GSE1487313"
        experiment.save()

        result = ComputationalResult()
        result.save()

        gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS")

        sample = Sample()
        sample.accession_code = 'GSM1487313'
        sample.title = 'GSM1487313'
        sample.organism = gallus_gallus
        sample.technology = "MICROARRAY"
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "GSM1487313_liver.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        # RNASEQ TECH
        experiment2 = Experiment()
        experiment2.accession_code = "SRS332914"
        experiment2.save()

        result2 = ComputationalResult()
        result2.save()

        sample2 = Sample()
        sample2.accession_code = 'SRS332914'
        sample2.title = 'SRS332914'
        sample2.organism = gallus_gallus
        sample2.technology = "RNA-SEQ"
        sample2.save()

        sra2 = SampleResultAssociation()
        sra2.sample = sample2
        sra2.result = result2
        sra2.save()

        esa2 = ExperimentSampleAssociation()
        esa2.experiment = experiment2
        esa2.sample = sample2
        esa2.save()

        computed_file2 = ComputedFile()
        computed_file2.filename = "SRP149598_gene_lengthScaledTPM.tsv"
        computed_file2.absolute_file_path = "/home/user/data_store/PCL/" + computed_file2.filename
        computed_file2.result = result2
        computed_file2.size_in_bytes = 234
        computed_file2.is_smashable = True
        computed_file2.save()

        assoc2 = SampleComputedFileAssociation()
        assoc2.sample = sample2
        assoc2.computed_file = computed_file2
        assoc2.save()

        # CROSS-SMASH BY SPECIES
        ds = Dataset()
        ds.data = {'GSE1487313': ['GSM1487313'], 'SRX332914': ['SRS332914']}
        ds.aggregate_by = 'SPECIES'
        ds.scale_by = 'STANDARD'
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = False
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = pj
        pjda.dataset = ds
        pjda.save()

        self.assertTrue(ds.is_cross_technology())
        final_context = smasher.smash(pj.pk, upload=False)
        self.assertTrue(os.path.exists(final_context['output_file']))
        os.remove(final_context['output_file'])
        self.assertEqual(len(final_context['final_frame'].columns), 2)

        # THEN BY EXPERIMENT
        ds.aggregate_by = 'EXPERIMENT'
        ds.save()

        dsid = ds.id
        ds = Dataset.objects.get(id=dsid)

        pj.start_time = None
        pj.end_time = None
        pj.save()

        final_context = smasher.smash(pj.pk, upload=False)
        self.assertTrue(os.path.exists(final_context['output_file']))
        os.remove(final_context['output_file'])
        self.assertEqual(len(final_context['final_frame'].columns), 1)

        # THEN BY ALL
        ds.aggregate_by = 'ALL'
        ds.save()

        dsid = ds.id
        ds = Dataset.objects.get(id=dsid)

        pj.start_time = None
        pj.end_time = None
        pj.save()
        final_context = smasher.smash(pj.pk, upload=False)
        self.assertTrue(os.path.exists(final_context['output_file']))
        self.assertEqual(len(final_context['final_frame'].columns), 2)
Ejemplo n.º 13
0
    def setUp(self):
        # Saving this for if we have protected endpoints
        # self.superuser = User.objects.create_superuser('john', '*****@*****.**', 'johnpassword')
        # self.client.login(username='******', password='******')
        # self.user = User.objects.create(username="******")

        experiment = Experiment()
        experiment.accession_code = "GSE000"
        experiment.alternate_accession_code = "E-GEOD-000"
        experiment.title = "NONONONO"
        experiment.description = "Boooooourns. Wasabi."
        experiment.technology = "RNA-SEQ"
        experiment.save()

        experiment = Experiment()
        experiment.accession_code = "GSE123"
        experiment.title = "Hey Ho Let's Go"
        experiment.description = (
            "This is a very exciting test experiment. Faygo soda. Blah blah blah."
        )
        experiment.technology = "MICROARRAY"
        experiment.save()
        self.experiment = experiment

        experiment_annotation = ExperimentAnnotation()
        experiment_annotation.data = {"hello": "world", "123": 456}
        experiment_annotation.experiment = experiment
        experiment_annotation.save()

        # Create 26 test organisms numbered 0-25 for pagination test, so there should be 29 organisms total (with the 3 others below)
        for i in range(26):
            Organism(name=("TEST_ORGANISM_{}".format(i)),
                     taxonomy_id=(1234 + i)).save()

        ailuropoda = Organism(name="AILUROPODA_MELANOLEUCA",
                              taxonomy_id=9646,
                              is_scientific_name=True)
        ailuropoda.save()
        self.homo_sapiens = Organism(name="HOMO_SAPIENS",
                                     taxonomy_id=9606,
                                     is_scientific_name=True)
        self.homo_sapiens.save()
        self.danio_rerio = Organism(name="DANIO_RERIO",
                                    taxonomy_id=1337,
                                    is_scientific_name=True)
        self.danio_rerio.save()

        sample = Sample()
        sample.title = "123"
        sample.accession_code = "123"
        sample.is_processed = True
        sample.organism = ailuropoda
        sample.save()

        sample = Sample()
        sample.title = "789"
        sample.accession_code = "789"
        sample.is_processed = True
        sample.organism = ailuropoda
        sample.save()
        self.sample = sample

        # add qn target for sample organism
        result = ComputationalResult()
        result.commands.append("create_qn_target.py")
        result.is_ccdl = True
        result.is_public = True
        result.processor = None
        result.save()

        cra = ComputationalResultAnnotation()
        cra.result = result
        cra.data = {"organism_id": ailuropoda.id, "is_qn": True}
        cra.save()

        ailuropoda.qn_target = result
        ailuropoda.save()

        sample_annotation = SampleAnnotation()
        sample_annotation.data = {"goodbye": "world", "789": 123}
        sample_annotation.sample = sample
        sample_annotation.save()

        original_file = OriginalFile()
        original_file.save()

        original_file_sample_association = OriginalFileSampleAssociation()
        original_file_sample_association.sample = sample
        original_file_sample_association.original_file = original_file
        original_file_sample_association.save()

        downloader_job = DownloaderJob()
        downloader_job.save()

        download_assoc = DownloaderJobOriginalFileAssociation()
        download_assoc.original_file = original_file
        download_assoc.downloader_job = downloader_job
        download_assoc.save()

        processor_job = ProcessorJob()
        processor_job.save()

        processor_assoc = ProcessorJobOriginalFileAssociation()
        processor_assoc.original_file = original_file
        processor_assoc.processor_job = processor_job
        processor_assoc.save()

        experiment_sample_association = ExperimentSampleAssociation()
        experiment_sample_association.sample = sample
        experiment_sample_association.experiment = experiment
        experiment_sample_association.save()
        experiment.num_total_samples = 1
        experiment.num_processed_samples = 1
        experiment.save()

        result = ComputationalResult()
        result.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        result = ComputationalResult()
        result.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        processor = Processor()
        processor.name = "Salmon Quant"
        processor.version = "v9.9.9"
        processor.docker_image = "dr_salmon"
        processor.environment = '{"some": "environment"}'
        processor.save()

        computational_result_short = ComputationalResult(processor=processor)
        computational_result_short.save()

        organism_index = OrganismIndex()
        organism_index.index_type = "TRANSCRIPTOME_SHORT"
        organism_index.organism = self.danio_rerio
        organism_index.result = computational_result_short
        organism_index.absolute_directory_path = (
            "/home/user/data_store/salmon_tests/TRANSCRIPTOME_INDEX/SHORT")
        organism_index.is_public = True
        organism_index.s3_url = "not_blank"
        organism_index.save()

        return
Ejemplo n.º 14
0
    def test_create_compendia(self):
        job = ProcessorJob()
        job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
        job.save()

        # MICROARRAY TECH
        experiment = Experiment()
        experiment.accession_code = "GSE1487313"
        experiment.save()

        result = ComputationalResult()
        result.save()

        gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS",
                                                     taxonomy_id=1001)

        sample = Sample()
        sample.accession_code = "GSM1487313"
        sample.title = "GSM1487313"
        sample.organism = gallus_gallus
        sample.technology = "MICROARRAY"
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "GSM1487313_liver.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        # Missing sample that will be filtered
        sample = Sample()
        sample.accession_code = "GSM1487222"
        sample.title = "this sample will be filtered"
        sample.organism = gallus_gallus
        sample.technology = "MICROARRAY"
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "GSM1487222_empty.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/doesnt_exists.PCL"
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        # RNASEQ TECH
        experiment2 = Experiment()
        experiment2.accession_code = "SRS332914"
        experiment2.save()

        result2 = ComputationalResult()
        result2.save()

        sample2 = Sample()
        sample2.accession_code = "SRS332914"
        sample2.title = "SRS332914"
        sample2.organism = gallus_gallus
        sample2.technology = "RNA-SEQ"
        sample2.save()

        sra2 = SampleResultAssociation()
        sra2.sample = sample2
        sra2.result = result2
        sra2.save()

        esa2 = ExperimentSampleAssociation()
        esa2.experiment = experiment2
        esa2.sample = sample2
        esa2.save()

        computed_file2 = ComputedFile()
        computed_file2.filename = "SRP149598_gene_lengthScaledTPM.tsv"
        computed_file2.absolute_file_path = "/home/user/data_store/PCL/" + computed_file2.filename
        computed_file2.result = result2
        computed_file2.size_in_bytes = 234
        computed_file2.is_smashable = True
        computed_file2.save()

        assoc2 = SampleComputedFileAssociation()
        assoc2.sample = sample2
        assoc2.computed_file = computed_file2
        assoc2.save()

        dset = Dataset()
        dset.data = {
            "GSE1487313": ["GSM1487313", "GSM1487222"],
            "SRX332914": ["SRS332914"]
        }
        dset.scale_by = "NONE"
        dset.aggregate_by = "SPECIES"
        dset.svd_algorithm = "ARPACK"
        dset.quantile_normalize = False
        dset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dset
        pjda.save()

        final_context = create_compendia.create_compendia(job.id)

        self.assertFalse(job.success)

        # check that sample with no computed file was skipped
        self.assertTrue("GSM1487222" in final_context["filtered_samples"])
        self.assertEqual(
            final_context["filtered_samples"]["GSM1487222"]
            ["experiment_accession_code"],
            "GSE1487313",
        )
Ejemplo n.º 15
0
    def test_get_tximport_inputs(self):
        """"Tests that tximport only considers RNA-Seq samples from GEO.
        """
        # Create one experiment and two related samples, based on:
        #   https://www.ncbi.nlm.nih.gov/sra/?term=SRP040623
        # (We don't need any original files because
        # get_tximport_inputs doesn't consider them.)
        experiment_accession = 'PRJNA242809'
        experiment = Experiment.objects.create(accession_code=experiment_accession)

        c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

        ## Sample 1
        sample1_accession = 'SRR1206053'
        sample1 = Sample.objects.create(accession_code=sample1_accession,
                                        organism=c_elegans)
        sample1.source_database = 'GEO'
        sample1.technology = 'RNA-SEQ'
        ExperimentSampleAssociation.objects.create(experiment=experiment, sample=sample1)

        ## Sample 2
        sample2_accession = 'SRR1206054'
        sample2 = Sample.objects.create(accession_code=sample2_accession,
                                        organism=c_elegans)
        sample2.source_database = 'GEO'
        sample2.technology = 'RNA-SEQ'
        ExperimentSampleAssociation.objects.create(experiment=experiment, sample=sample2)

        computational_result1 = ComputationalResult(processor=utils.find_processor('SALMON_QUANT'))
        computational_result1.save()

        sample_result_assoc = SampleResultAssociation(sample=sample1, result=computational_result1)
        sample_result_assoc.save()

        comp_file = ComputedFile()
        comp_file.absolute_file_path = "/doesnt/matter"
        comp_file.result = computational_result1
        comp_file.size_in_bytes=1337
        comp_file.sha1="ABC"
        comp_file.s3_key = "key"
        comp_file.s3_bucket = "bucket"
        comp_file.save()

        computational_result2 = ComputationalResult(processor=utils.find_processor('SALMON_QUANT'))
        computational_result2.save()

        sample_result_assoc = SampleResultAssociation(sample=sample2, result=computational_result2)
        sample_result_assoc.save()

        comp_file = ComputedFile()
        comp_file.absolute_file_path = "/doesnt/matter"
        comp_file.result = computational_result2
        comp_file.size_in_bytes=1337
        comp_file.sha1="ABC"
        comp_file.s3_key = "key"
        comp_file.s3_bucket = "bucket"
        comp_file.save()

        quantified_experiments = salmon.get_tximport_inputs({"sample": sample1})['tximport_inputs']

        self.assertEqual({}, quantified_experiments)
Ejemplo n.º 16
0
    def test_create_compendia(self):
        job = ProcessorJob()
        job.pipeline_applied = "COMPENDIA"
        job.save()

        # MICROARRAY TECH
        experiment = Experiment()
        experiment.accession_code = "GSE1487313"
        experiment.save()

        result = ComputationalResult()
        result.save()

        gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS")

        sample = Sample()
        sample.accession_code = 'GSM1487313'
        sample.title = 'GSM1487313'
        sample.organism = gallus_gallus
        sample.technology = "MICROARRAY"
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "GSM1487313_liver.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        # RNASEQ TECH
        experiment2 = Experiment()
        experiment2.accession_code = "SRS332914"
        experiment2.save()

        result2 = ComputationalResult()
        result2.save()

        sample2 = Sample()
        sample2.accession_code = 'SRS332914'
        sample2.title = 'SRS332914'
        sample2.organism = gallus_gallus
        sample2.technology = "RNA-SEQ"
        sample2.save()

        sra2 = SampleResultAssociation()
        sra2.sample = sample2
        sra2.result = result2
        sra2.save()

        esa2 = ExperimentSampleAssociation()
        esa2.experiment = experiment2
        esa2.sample = sample2
        esa2.save()

        computed_file2 = ComputedFile()
        computed_file2.filename = "SRP149598_gene_lengthScaledTPM.tsv"
        computed_file2.absolute_file_path = "/home/user/data_store/PCL/" + computed_file2.filename
        computed_file2.result = result2
        computed_file2.size_in_bytes = 234
        computed_file2.is_smashable = True
        computed_file2.save()

        assoc2 = SampleComputedFileAssociation()
        assoc2.sample = sample2
        assoc2.computed_file = computed_file2
        assoc2.save()

        dset = Dataset()
        dset.data = {'GSE1487313': ['GSM1487313'], 'SRX332914': ['SRS332914']}
        dset.scale_by = 'NONE'
        dset.aggregate_by = 'SPECIES'
        dset.quantile_normalize = False
        dset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dset
        pjda.save()

        final_context = create_compendia.create_compendia(job.id)
Ejemplo n.º 17
0
    def test_create_compendia_danio(self):
        job = ProcessorJob()
        job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
        job.save()

        # MICROARRAY TECH
        experiment = Experiment()
        experiment.accession_code = "GSE1234"
        experiment.save()

        result = ComputationalResult()
        result.save()

        qn_target = ComputedFile()
        qn_target.filename = "danio_target.tsv"
        qn_target.absolute_file_path = "/home/user/data_store/QN/danio_target.tsv"
        qn_target.is_qn_target = True
        qn_target.size_in_bytes = "12345"
        qn_target.sha1 = "aabbccddeeff"
        qn_target.result = result
        qn_target.save()

        danio_rerio = Organism(name="DANIO_RERIO",
                               taxonomy_id=1,
                               qn_target=result)
        danio_rerio.save()

        cra = ComputationalResultAnnotation()
        cra.data = {}
        cra.data["organism_id"] = danio_rerio.id
        cra.data["is_qn"] = True
        cra.result = result
        cra.save()

        result = ComputationalResult()
        result.save()

        micros = []
        for file in os.listdir("/home/user/data_store/raw/TEST/MICROARRAY/"):

            if "microarray.txt" in file:
                continue

            sample = Sample()
            sample.accession_code = file
            sample.title = file
            sample.organism = danio_rerio
            sample.technology = "MICROARRAY"
            sample.save()

            sra = SampleResultAssociation()
            sra.sample = sample
            sra.result = result
            sra.save()

            esa = ExperimentSampleAssociation()
            esa.experiment = experiment
            esa.sample = sample
            esa.save()

            computed_file = ComputedFile()
            computed_file.filename = file
            computed_file.absolute_file_path = "/home/user/data_store/raw/TEST/MICROARRAY/" + file
            computed_file.result = result
            computed_file.size_in_bytes = 123
            computed_file.is_smashable = True
            computed_file.save()

            assoc = SampleComputedFileAssociation()
            assoc.sample = sample
            assoc.computed_file = computed_file
            assoc.save()

            micros.append(file)

        experiment = Experiment()
        experiment.accession_code = "GSE5678"
        experiment.save()

        result = ComputationalResult()
        result.save()
        rnas = []
        for file in os.listdir("/home/user/data_store/raw/TEST/RNASEQ/"):

            if "rnaseq.txt" in file:
                continue

            sample = Sample()
            sample.accession_code = file
            sample.title = file
            sample.organism = danio_rerio
            sample.technology = "RNASEQ"
            sample.save()

            sra = SampleResultAssociation()
            sra.sample = sample
            sra.result = result
            sra.save()

            esa = ExperimentSampleAssociation()
            esa.experiment = experiment
            esa.sample = sample
            esa.save()

            computed_file = ComputedFile()
            computed_file.filename = file
            computed_file.absolute_file_path = "/home/user/data_store/raw/TEST/RNASEQ/" + file
            computed_file.result = result
            computed_file.size_in_bytes = 123
            computed_file.is_smashable = True
            computed_file.save()

            assoc = SampleComputedFileAssociation()
            assoc.sample = sample
            assoc.computed_file = computed_file
            assoc.save()

            rnas.append(file)

        # Missing sample that will be filtered
        sample = Sample()
        sample.accession_code = "GSM1487222"
        sample.title = "this sample will be filtered"
        sample.organism = danio_rerio
        sample.technology = "RNASEQ"
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        rnas.append(sample.accession_code)

        dset = Dataset()
        dset.data = {"GSE1234": micros, "GSE5678": rnas}
        dset.scale_by = "NONE"
        dset.aggregate_by = "SPECIES"
        dset.svd_algorithm = "ARPACK"
        dset.quantile_normalize = False
        dset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dset
        pjda.save()

        final_context = create_compendia.create_compendia(job.id)

        # Verify result
        self.assertEqual(
            final_context["compendium_result"].result.computedfile_set.count(),
            1)
        for file in final_context[
                "compendium_result"].result.computedfile_set.all():
            self.assertTrue(os.path.exists(file.absolute_file_path))

        # test compendium_result
        self.assertEqual(final_context["compendium_result"].svd_algorithm,
                         "ARPACK")
        self.assertEqual(
            final_context["compendium_result"].primary_organism.name,
            final_context["organism_name"])
        self.assertEqual(
            final_context["compendium_result"].primary_organism.name,
            "DANIO_RERIO")
        self.assertEqual(final_context["compendium_result"].organisms.count(),
                         1)

        # check that sample with no computed file was skipped
        self.assertTrue("GSM1487222" in final_context["filtered_samples"])
        self.assertEqual(
            final_context["filtered_samples"]["GSM1487222"]
            ["experiment_accession_code"], "GSE5678")
Ejemplo n.º 18
0
    def test_create_compendia_danio(self):
        job = ProcessorJob()
        job.pipeline_applied = "COMPENDIA"
        job.save()

        # MICROARRAY TECH
        experiment = Experiment()
        experiment.accession_code = "GSE1234"
        experiment.save()

        result = ComputationalResult()
        result.save()

        danio_rerio = Organism.get_object_for_name("DANIO_RERIO")

        micros = []
        for file in os.listdir('/home/user/data_store/raw/TEST/MICROARRAY/'):

            if 'microarray.txt' in file:
                continue

            sample = Sample()
            sample.accession_code = file
            sample.title = file
            sample.organism = danio_rerio
            sample.technology = "MICROARRAY"
            sample.save()

            sra = SampleResultAssociation()
            sra.sample = sample
            sra.result = result
            sra.save()

            esa = ExperimentSampleAssociation()
            esa.experiment = experiment
            esa.sample = sample
            esa.save()

            computed_file = ComputedFile()
            computed_file.filename = file
            computed_file.absolute_file_path = "/home/user/data_store/raw/TEST/MICROARRAY/" + file
            computed_file.result = result
            computed_file.size_in_bytes = 123
            computed_file.is_smashable = True
            computed_file.save()

            assoc = SampleComputedFileAssociation()
            assoc.sample = sample
            assoc.computed_file = computed_file
            assoc.save()

            micros.append(file)

        experiment = Experiment()
        experiment.accession_code = "GSE5678"
        experiment.save()

        result = ComputationalResult()
        result.save()
        rnas = []
        for file in os.listdir('/home/user/data_store/raw/TEST/RNASEQ/'):

            if 'rnaseq.txt' in file:
                continue

            sample = Sample()
            sample.accession_code = file
            sample.title = file
            sample.organism = danio_rerio
            sample.technology = "RNASEQ"
            sample.save()

            sra = SampleResultAssociation()
            sra.sample = sample
            sra.result = result
            sra.save()

            esa = ExperimentSampleAssociation()
            esa.experiment = experiment
            esa.sample = sample
            esa.save()

            computed_file = ComputedFile()
            computed_file.filename = file
            computed_file.absolute_file_path = "/home/user/data_store/raw/TEST/RNASEQ/" + file
            computed_file.result = result
            computed_file.size_in_bytes = 123
            computed_file.is_smashable = True
            computed_file.save()

            assoc = SampleComputedFileAssociation()
            assoc.sample = sample
            assoc.computed_file = computed_file
            assoc.save()

            rnas.append(file)

        result = ComputationalResult()
        result.save()

        qn_target = ComputedFile()
        qn_target.filename = "danio_target.tsv"
        qn_target.absolute_file_path = '/home/user/data_store/QN/danio_target.tsv'
        qn_target.is_qn_target = True
        qn_target.size_in_bytes = "12345"
        qn_target.sha1 = "aabbccddeeff"
        qn_target.result = result
        qn_target.save()

        cra = ComputationalResultAnnotation()
        cra.data = {}
        cra.data['organism_id'] = danio_rerio.id
        cra.data['is_qn'] = True
        cra.result = result
        cra.save()

        dset = Dataset()
        dset.data = {'GSE1234': micros, 'GSE5678': rnas}
        dset.scale_by = 'NONE'
        dset.aggregate_by = 'SPECIES'
        dset.quantile_normalize = False
        dset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dset
        pjda.save()

        final_context = create_compendia.create_compendia(job.id)

        # Verify result
        self.assertEqual(len(final_context['computed_files']), 3)
        for file in final_context['computed_files']:
            self.assertTrue(os.path.exists(file.absolute_file_path))
Ejemplo n.º 19
0
def _run_salmontools(job_context: Dict) -> Dict:
    """ Run Salmontools to extract unmapped genes. """

    logger.debug("Running SalmonTools ...")
    unmapped_filename = job_context[
        "output_directory"] + "aux_info/unmapped_names.txt"

    command_str = "salmontools extract-unmapped -u {unmapped_file} -o {output} "
    output_prefix = job_context["salmontools_directory"] + "unmapped_by_salmon"
    command_str = command_str.format(unmapped_file=unmapped_filename,
                                     output=output_prefix)
    if "input_file_path_2" in job_context:
        command_str += "-1 {input_1} -2 {input_2}"
        command_str = command_str.format(
            input_1=job_context["input_file_path"],
            input_2=job_context["input_file_path_2"])
    else:
        command_str += "-r {input_1}"
        command_str = command_str.format(
            input_1=job_context["input_file_path"])

    start_time = timezone.now()
    logger.debug(
        "Running the following SalmonTools command: %s",
        command_str,
        processor_job=job_context["job_id"],
    )

    completed_command = subprocess.run(command_str.split(),
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
    end_time = timezone.now()

    # As of SalmonTools 0.1.0, completed_command.returncode is always 0,
    # (even if error happens).  completed_command.stderr is not totally
    # reliable either, because it will output the following line even
    # when the execution succeeds:
    #  "There were <N> unmapped reads\n"
    # in which "<N>" is the number of lines in input unmapped_names.txt.
    #
    # As a workaround, we are using a regular expression here to test
    # the status of SalmonTools execution.  Any text in stderr that is
    # not in the above format is treated as error message.
    status_str = completed_command.stderr.decode().strip()
    success_pattern = r"^There were \d+ unmapped reads$"
    if re.match(success_pattern, status_str):
        # Zip up the output of salmontools
        try:
            with tarfile.open(job_context["salmontools_archive"],
                              "w:gz") as tar:
                tar.add(job_context["salmontools_directory"], arcname=os.sep)
        except Exception:
            logger.exception(
                "Exception caught while zipping processed directory %s",
                job_context["salmontools_directory"],
                processor_job=job_context["job_id"],
            )
            failure_template = "Exception caught while zipping salmontools directory {}"
            job_context["job"].failure_reason = failure_template.format(
                job_context["salmontools_archive"])
            job_context["success"] = False
            return job_context

        result = ComputationalResult()
        result.commands.append(command_str)
        result.time_start = start_time
        result.time_end = end_time
        result.is_ccdl = True

        try:
            processor_key = "SALMONTOOLS"
            result.processor = utils.find_processor(processor_key)
        except Exception as e:
            return utils.handle_processor_exception(job_context, processor_key,
                                                    e)

        result.save()
        job_context["pipeline"].steps.append(result.id)

        assoc = SampleResultAssociation()
        assoc.sample = job_context["sample"]
        assoc.result = result
        assoc.save()

        computed_file = ComputedFile()
        computed_file.filename = job_context["salmontools_archive"].split(
            "/")[-1]
        computed_file.absolute_file_path = job_context["salmontools_archive"]
        computed_file.calculate_sha1()
        computed_file.calculate_size()
        computed_file.is_public = True
        computed_file.is_smashable = False
        computed_file.is_qc = True
        computed_file.result = result
        computed_file.save()
        job_context["computed_files"].append(computed_file)

        assoc = SampleComputedFileAssociation()
        assoc.sample = job_context["sample"]
        assoc.computed_file = computed_file
        assoc.save()

        job_context["result"] = result
        job_context["success"] = True
    else:  # error in salmontools
        logger.error(
            "Shell call to salmontools failed with error message: %s",
            status_str,
            processor_job=job_context["job_id"],
        )
        job_context["job"].failure_reason = (
            "Shell call to salmontools failed because: " + status_str)
        job_context["success"] = False

    return job_context
Ejemplo n.º 20
0
def prepare_computed_files():
    # MICROARRAY TECH
    experiment = Experiment()
    experiment.accession_code = "GSE1487313"
    experiment.num_processed_samples = 1
    experiment.save()

    result = ComputationalResult()
    result.save()

    gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS",
                                                 taxonomy_id=1001)

    sample = Sample()
    sample.accession_code = "GSM1487313"
    sample.title = "GSM1487313"
    sample.organism = gallus_gallus
    sample.technology = "MICROARRAY"
    sample.is_processed = True
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1487313_liver.PCL"
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.s3_key = "GSM1487313_liver.PCL"
    computed_file.s3_bucket = TEST_DATA_BUCKET
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    # RNASEQ TECH
    experiment2 = Experiment()
    experiment2.accession_code = "SRP332914"
    experiment2.num_processed_samples = 1
    experiment2.save()

    result2 = ComputationalResult()
    result2.save()

    sample2 = Sample()
    sample2.accession_code = "SRR332914"
    sample2.title = "SRR332914"
    sample2.organism = gallus_gallus
    sample2.technology = "RNA-SEQ"
    sample2.is_processed = True
    sample2.save()

    sra2 = SampleResultAssociation()
    sra2.sample = sample2
    sra2.result = result2
    sra2.save()

    esa2 = ExperimentSampleAssociation()
    esa2.experiment = experiment2
    esa2.sample = sample2
    esa2.save()

    computed_file2 = ComputedFile()
    computed_file2.filename = "SRP149598_gene_lengthScaledTPM.tsv"
    computed_file2.result = result2
    computed_file2.size_in_bytes = 234
    computed_file2.is_smashable = True
    computed_file2.s3_key = "SRP149598_gene_lengthScaledTPM.tsv"
    computed_file2.s3_bucket = TEST_DATA_BUCKET
    computed_file2.save()

    assoc2 = SampleComputedFileAssociation()
    assoc2.sample = sample2
    assoc2.computed_file = computed_file2
    assoc2.save()
Ejemplo n.º 21
0
    def test_no_smash_dupe_two(self):
        """ Tests the SRP051449 case, where the titles collide. Also uses a real QN target file."""

        job = ProcessorJob()
        job.pipeline_applied = "SMASHER"
        job.save()

        experiment = Experiment()
        experiment.accession_code = "SRP051449"
        experiment.save()

        result = ComputationalResult()
        result.save()

        danio_rerio = Organism.get_object_for_name("DANIO_RERIO")

        sample = Sample()
        sample.accession_code = 'SRR1731761'
        sample.title = 'Danio rerio'
        sample.organism = danio_rerio
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "SRR1731761_output_gene_lengthScaledTPM.tsv"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        result = ComputationalResult()
        result.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        sample = Sample()
        sample.accession_code = 'SRR1731762'
        sample.title = 'Danio rerio'
        sample.organism = danio_rerio
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "SRR1731762_output_gene_lengthScaledTPM.tsv"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        result = ComputationalResult()
        result.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        ds = Dataset()
        ds.data = {'SRP051449': ['SRR1731761', 'SRR1731762']}
        ds.aggregate_by = 'SPECIES'
        ds.scale_by = 'NONE'
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = True
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = ds
        pjda.save()

        cr = ComputationalResult()
        cr.save()

        computed_file = ComputedFile()
        computed_file.filename = "danio_target.tsv"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = cr
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = False
        computed_file.save()

        cra = ComputationalResultAnnotation()
        cra.data = {'organism_id': danio_rerio.id, 'is_qn': True}
        cra.result = cr
        cra.save()

        final_context = smasher.smash(job.pk, upload=False)
        self.assertTrue(final_context['success'])
Ejemplo n.º 22
0
def _tximport(job_context: Dict, experiment: Experiment, quant_files: List[ComputedFile]) -> Dict:
    """Run tximport R script based on input quant files and the path
    of genes_to_transcripts.txt.
    """
    # Download all the quant.sf fles for this experiment. Write all
    # their paths to a file so we can pass a path to that to
    # tximport.R rather than having to pass in one argument per
    # sample.
    tximport_path_list_file = job_context["work_dir"] + "tximport_inputs.txt"
    with open(tximport_path_list_file, "w") as input_list:
        for quant_file in quant_files:
            input_list.write(quant_file.get_synced_file_path() + "\n")

    rds_filename = "txi_out.RDS"
    rds_file_path = job_context["work_dir"] + rds_filename
    tpm_filename = "gene_lengthScaledTPM.tsv"
    tpm_file_path = job_context["work_dir"] + tpm_filename
    result = ComputationalResult()
    cmd_tokens = [
        "/usr/bin/Rscript", "--vanilla",
        "/home/user/data_refinery_workers/processors/tximport.R",
        "--file_list", tximport_path_list_file,
        "--gene2txmap", job_context["genes_to_transcripts_path"],
        "--rds_file", rds_file_path,
        "--tpm_file", tpm_file_path
    ]
    result.time_start = timezone.now()

    logger.debug("Running tximport with: %s",
                 str(cmd_tokens),
                 processor_job=job_context['job_id'],
                 experiment=experiment.id)

    try:
        tximport_result = subprocess.run(cmd_tokens, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except Exception as e:
        error_template = ("Encountered error in R code while running tximport.R: {}")
        error_message = error_template.format(str(e))
        logger.error(error_message, processor_job=job_context["job_id"], experiment=experiment.id)
        job_context["job"].failure_reason = error_message
        job_context["success"] = False
        return job_context

    if tximport_result.returncode != 0:
        error_template = ("Found non-zero exit code from R code while running tximport.R: {}")
        error_message = error_template.format(tximport_result.stderr.decode().strip())
        logger.error(error_message, processor_job=job_context["job_id"], experiment=experiment.id)
        job_context["job"].failure_reason = error_message
        job_context["success"] = False
        return job_context

    result.time_end = timezone.now()
    result.commands.append(" ".join(cmd_tokens))
    result.is_ccdl = True
    try:
        processor_key = "TXIMPORT"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context['pipeline'].steps.append(result.id)

    # Associate this result with all samples in this experiment.
    # TODO: This may not be completely sensible, because `tximport` is
    # done at experiment level, not at sample level.
    # Could be very problematic if SRA's data model allows many
    # Experiments to one Run.
    # https://github.com/AlexsLemonade/refinebio/issues/297
    for sample in experiment.samples.all():
        s_r = SampleResultAssociation(sample=sample, result=result)
        s_r.save()

    rds_file = ComputedFile()
    rds_file.absolute_file_path = rds_file_path
    rds_file.filename = rds_filename
    rds_file.result = result
    rds_file.is_smashable = False
    rds_file.is_qc = False
    rds_file.is_public = True
    rds_file.calculate_sha1()
    rds_file.calculate_size()
    rds_file.save()
    job_context['computed_files'].append(rds_file)

    # Split the tximport result into smashable subfiles
    data = pd.read_csv(tpm_file_path, sep='\t', header=0, index_col=0)
    individual_files = []
    frames = np.split(data, len(data.columns), axis=1)
    for frame in frames:
        # Create sample-specific TPM file.
        sample_file_name = frame.columns.values[0] + '_' + tpm_filename
        frame_path = os.path.join(job_context["work_dir"], sample_file_name)
        frame.to_csv(frame_path, sep='\t', encoding='utf-8')

        # The frame column header is based off of the path, which includes _output.
        sample = Sample.objects.get(accession_code=frame.columns.values[0].replace("_output", ""))

        computed_file = ComputedFile()
        computed_file.absolute_file_path = frame_path
        computed_file.filename = sample_file_name
        computed_file.result = result
        computed_file.is_smashable = True
        computed_file.is_qc = False
        computed_file.is_public = True
        computed_file.calculate_sha1()
        computed_file.calculate_size()
        computed_file.save()
        job_context['computed_files'].append(computed_file)
        job_context['smashable_files'].append(computed_file)

        SampleResultAssociation.objects.get_or_create(
            sample=sample,
            result=result)

        # Create association with the RDS file.
        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample,
            computed_file=rds_file)

        # Create association with TPM file.
        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample,
            computed_file=computed_file)

        individual_files.append(computed_file)
        job_context['samples'].append(sample)

    # Clean up quant.sf files that were created just for this.
    for quant_file in quant_files:
        quant_file.delete_s3_file()

        # It's only okay to delete the local file because the full
        # output directory has already been zipped up.
        quant_file.delete_local_file()
        quant_file.delete()

    # Salmon-processed samples aren't marked as is_processed
    # until they are fully tximported, this value sets that
    # for the end_job function.
    job_context['tximported'] = True
    job_context['individual_files'] = individual_files
    return job_context
Ejemplo n.º 23
0
    def test_log2(self):
        pj = ProcessorJob()
        pj.pipeline_applied = "SMASHER"
        pj.save()

        # Has non-log2 data:
        # https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE44421
        # ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE44nnn/GSE44421/miniml/GSE44421_family.xml.tgz
        experiment = Experiment()
        experiment.accession_code = "GSE44421"
        experiment.save()

        result = ComputationalResult()
        result.save()

        homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

        sample = Sample()
        sample.accession_code = 'GSM1084806'
        sample.title = 'GSM1084806'
        sample.organism = homo_sapiens
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "GSM1084806-tbl-1.txt"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        sample = Sample()
        sample.accession_code = 'GSM1084807'
        sample.title = 'GSM1084807'
        sample.organism = homo_sapiens
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "GSM1084807-tbl-1.txt"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        ds = Dataset()
        ds.data = {'GSE44421': ['GSM1084806', 'GSM1084807']}
        ds.aggregate_by = 'EXPERIMENT'
        ds.scale_by = 'MINMAX'
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = False
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = pj
        pjda.dataset = ds
        pjda.save()

        final_context = smasher.smash(pj.pk, upload=False)
        ds = Dataset.objects.get(id=ds.id)

        self.assertTrue(final_context['success'])
Ejemplo n.º 24
0
    def test_bad_overlap(self):

        pj = ProcessorJob()
        pj.pipeline_applied = "SMASHER"
        pj.save()

        experiment = Experiment()
        experiment.accession_code = "GSE51081"
        experiment.save()

        result = ComputationalResult()
        result.save()

        homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

        sample = Sample()
        sample.accession_code = 'GSM1237810'
        sample.title = 'GSM1237810'
        sample.organism = homo_sapiens
        sample.save()

        sample_annotation = SampleAnnotation()
        sample_annotation.data = {'hi': 'friend'}
        sample_annotation.sample = sample
        sample_annotation.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "big.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/BADSMASH/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        sample = Sample()
        sample.accession_code = 'GSM1237812'
        sample.title = 'GSM1237812'
        sample.organism = homo_sapiens
        sample.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        computed_file = ComputedFile()
        computed_file.filename = "small.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/BADSMASH/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        ds = Dataset()
        ds.data = {'GSE51081': ['GSM1237810', 'GSM1237812']}
        ds.aggregate_by = 'ALL'  # [ALL or SPECIES or EXPERIMENT]
        ds.scale_by = 'NONE'  # [NONE or MINMAX or STANDARD or ROBUST]
        ds.email_address = "*****@*****.**"
        #ds.email_address = "*****@*****.**"
        ds.quantile_normalize = False
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = pj
        pjda.dataset = ds
        pjda.save()

        final_context = smasher.smash(pj.pk, upload=False)
        ds = Dataset.objects.get(id=ds.id)

        pj = ProcessorJob()
        pj.pipeline_applied = "SMASHER"
        pj.save()

        # Now, make sure the bad can't zero this out.
        sample = Sample()
        sample.accession_code = 'GSM999'
        sample.title = 'GSM999'
        sample.organism = homo_sapiens
        sample.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        computed_file = ComputedFile()
        computed_file.filename = "bad.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/BADSMASH/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        ds = Dataset()
        ds.data = {'GSE51081': ['GSM1237810', 'GSM1237812', 'GSM999']}
        ds.aggregate_by = 'ALL'  # [ALL or SPECIES or EXPERIMENT]
        ds.scale_by = 'NONE'  # [NONE or MINMAX or STANDARD or ROBUST]
        ds.email_address = "*****@*****.**"
        #ds.email_address = "*****@*****.**"
        ds.quantile_normalize = False
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = pj
        pjda.dataset = ds
        pjda.save()

        final_context = smasher.smash(pj.pk, upload=False)
        ds = Dataset.objects.get(id=ds.id)

        self.assertEqual(len(final_context['final_frame']), 4)
Ejemplo n.º 25
0
    def test_dataset_adding_non_downloadable_samples_fails(self):
        # Make a sample that is not downloadable
        sample1 = Sample()
        sample1.title = "456"
        sample1.accession_code = "456"
        sample1.platform_name = "AFFY"
        sample1.is_processed = False
        sample1.organism = self.homo_sapiens
        sample1.save()

        experiment_sample_association = ExperimentSampleAssociation()
        experiment_sample_association.sample = sample1
        experiment_sample_association.experiment = self.experiment
        experiment_sample_association.save()

        # Bad, 456 is not processed
        jdata = json.dumps({
            "email_address": "*****@*****.**",
            "data": {
                "GSE123": ["456"]
            }
        })
        response = self.client.post(
            reverse("create_dataset", kwargs={"version": API_VERSION}),
            jdata,
            content_type="application/json",
        )
        self.assertEqual(response.status_code, 400)
        self.assertIn(
            "Non-downloadable sample(s) in dataset",
            response.json()["message"],
        )
        self.assertEqual(response.json()["details"], ["456"])
        cache.clear()

        # Bad, 567 does not exist
        jdata = json.dumps({
            "email_address": "*****@*****.**",
            "data": {
                "GSE123": ["567"]
            }
        })
        response = self.client.post(
            reverse("create_dataset", kwargs={"version": API_VERSION}),
            jdata,
            content_type="application/json",
        )
        self.assertIn(
            "Sample(s) in dataset do not exist on refine",
            response.json()["message"],
        )
        self.assertEqual(response.status_code, 400)
        cache.clear()

        # Good, 789 is processed
        jdata = json.dumps({
            "email_address": "*****@*****.**",
            "data": {
                "GSE123": ["789"]
            }
        })
        response = self.client.post(
            reverse("create_dataset", kwargs={"version": API_VERSION}),
            jdata,
            content_type="application/json",
        )
        self.assertEqual(response.status_code, 201)
        cache.clear()

        # Bad, 456 does not have a quant.sf file
        post_data = {"email_address": "*****@*****.**", "data": {}}
        response = self.client.post(
            reverse("create_dataset", kwargs={"version": API_VERSION}),
            json.dumps(post_data),
            content_type="application/json",
        )
        self.assertEqual(response.status_code, 201)
        cache.clear()

        put_data = {
            **post_data, "data": {
                "GSE123": ["456"]
            },
            "quant_sf_only": True
        }
        response = self.client.put(
            reverse("dataset",
                    kwargs={
                        "id": response.json()["id"],
                        "version": API_VERSION
                    }),
            json.dumps(put_data),
            content_type="application/json",
        )
        self.assertEqual(response.status_code, 400)
        self.assertIn(
            "Sample(s) in dataset are missing quant.sf files",
            response.json()["message"],
        )
        self.assertEqual(response.json()["details"], ["456"])
        cache.clear()

        # Bad, none of the samples in GSE123 have a quant.sf file
        response = self.client.post(
            reverse("create_dataset", kwargs={"version": API_VERSION}),
            json.dumps(post_data),
            content_type="application/json",
        )
        self.assertEqual(response.status_code, 201)
        cache.clear()
        response = self.client.put(
            reverse("dataset",
                    kwargs={
                        "id": response.json()["id"],
                        "version": API_VERSION
                    }),
            json.dumps({
                **put_data, "data": {
                    "GSE123": ["ALL"]
                }
            }),
            content_type="application/json",
        )
        self.assertEqual(response.status_code, 400)
        self.assertIn(
            "Experiment(s) in dataset have zero downloadable samples",
            response.json()["message"],
        )
        self.assertEqual(response.json()["details"], ["GSE123"])
        cache.clear()

        # Make 456 have a quant.sf file
        result = ComputationalResult()
        result.save()

        sra = SampleResultAssociation()
        sra.sample = sample1
        sra.result = result
        sra.save()

        computed_file = ComputedFile()
        computed_file.s3_key = "smasher-test-quant.sf"
        computed_file.s3_bucket = "data-refinery-test-assets"
        computed_file.filename = "quant.sf"
        computed_file.result = result
        computed_file.size_in_bytes = 42
        computed_file.save()

        # Good, 456 does have a quant.sf file
        response = self.client.post(
            reverse("create_dataset", kwargs={"version": API_VERSION}),
            json.dumps(post_data),
            content_type="application/json",
        )
        self.assertEqual(response.status_code, 201)
        cache.clear()

        response = self.client.put(
            reverse("dataset",
                    kwargs={
                        "id": response.json()["id"],
                        "version": API_VERSION
                    }),
            json.dumps(put_data),
            content_type="application/json",
        )
        self.assertEqual(response.status_code, 200)
        cache.clear()

        # Good, a sample in GSE123 has a quant.sf file
        response = self.client.post(
            reverse("create_dataset", kwargs={"version": API_VERSION}),
            json.dumps(post_data),
            content_type="application/json",
        )
        cache.clear()
        self.assertEqual(response.status_code, 201)
        response = self.client.put(
            reverse("dataset",
                    kwargs={
                        "id": response.json()["id"],
                        "version": API_VERSION
                    }),
            json.dumps({
                **put_data, "data": {
                    "GSE123": ["ALL"]
                }
            }),
            content_type="application/json",
        )
        self.assertEqual(response.status_code, 200)
Ejemplo n.º 26
0
def prepare_job():
    pj = ProcessorJob()
    pj.pipeline_applied = "SMASHER"
    pj.save()

    experiment = Experiment()
    experiment.accession_code = "GSE51081"
    experiment.save()

    result = ComputationalResult()
    result.save()

    homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

    sample = Sample()
    sample.accession_code = 'GSM1237810'
    sample.title = 'GSM1237810'
    sample.organism = homo_sapiens
    sample.save()

    sample_annotation = SampleAnnotation()
    sample_annotation.data = {'hi': 'friend'}
    sample_annotation.sample = sample
    sample_annotation.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1237810_T09-1084.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    sample = Sample()
    sample.accession_code = 'GSM1237812'
    sample.title = 'GSM1237812'
    sample.organism = homo_sapiens
    sample.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1237812_S97-PURE.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1237812_S97-PURE.DAT"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = False
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    ds = Dataset()
    ds.data = {'GSE51081': ['GSM1237810', 'GSM1237812']}
    ds.aggregate_by = 'EXPERIMENT'  # [ALL or SPECIES or EXPERIMENT]
    ds.scale_by = 'STANDARD'  # [NONE or MINMAX or STANDARD or ROBUST]
    ds.email_address = "*****@*****.**"
    #ds.email_address = "*****@*****.**"
    ds.quantile_normalize = False
    ds.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = pj
    pjda.dataset = ds
    pjda.save()

    return pj
Ejemplo n.º 27
0
    def test_make_experiment_result_associations(self):
        """Tests that the correct associations are made.

        The situation we're setting up is basically this:
          * tximport has been run for an experiment.
          * It made associations between the samples in
            the experiment and the ComputationalResult.
          * It didn't make associations between the
            experiment itself and the ComputationalResult.
          * There is a second experiment that hasn't had
            tximport run but shares a sample with the
            other experiment.
          * This second experiment has a sample which has
            not yet had tximport run on it.

        And what we're going to test for is:
          * An association is created between the tximport
            result and the first experiment.
          * An association is NOT created between the
            tximport result and the second experiment.
        """
        # Get an organism to set on samples:
        homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS",
                                                    taxonomy_id=9606)

        # Create the tximport processor and result:
        processor = Processor()
        processor.name = "Tximport"
        processor.version = "v9.9.9"
        processor.docker_image = "dr_salmon"
        processor.environment = '{"some": "environment"}'
        processor.save()

        result = ComputationalResult()
        result.commands.append("tximport invocation")
        result.is_ccdl = True
        result.processor = processor
        result.save()

        # Create the first experiment and it's samples:
        processed_experiment = Experiment()
        processed_experiment.accession_code = "SRP12345"
        processed_experiment.save()

        processed_sample_one = Sample()
        processed_sample_one.accession_code = "SRX12345"
        processed_sample_one.title = "SRX12345"
        processed_sample_one.organism = homo_sapiens
        processed_sample_one.save()

        sra = SampleResultAssociation()
        sra.sample = processed_sample_one
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = processed_experiment
        esa.sample = processed_sample_one
        esa.save()

        processed_sample_two = Sample()
        processed_sample_two.accession_code = "SRX12346"
        processed_sample_two.title = "SRX12346"
        processed_sample_two.organism = homo_sapiens
        processed_sample_two.save()

        sra = SampleResultAssociation()
        sra.sample = processed_sample_two
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = processed_experiment
        esa.sample = processed_sample_two
        esa.save()

        # Create the second experiment and it's additional sample.
        unprocessed_experiment = Experiment()
        unprocessed_experiment.accession_code = "SRP6789"
        unprocessed_experiment.save()

        unprocessed_sample = Sample()
        unprocessed_sample.accession_code = "SRX6789"
        unprocessed_sample.title = "SRX6789"
        unprocessed_sample.organism = homo_sapiens
        unprocessed_sample.save()

        sra = SampleResultAssociation()
        sra.sample = unprocessed_sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = unprocessed_experiment
        esa.sample = unprocessed_sample
        esa.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = unprocessed_experiment
        esa.sample = processed_sample_two
        esa.save()

        # Run the function we're testing:
        make_experiment_result_associations()

        # Test that only one association was created and that it was
        # to the processed experiment:
        eras = ExperimentResultAssociation.objects.all()

        self.assertEqual(len(eras), 1)
        self.assertEqual(eras.first().experiment, processed_experiment)
Ejemplo n.º 28
0
def _run_multiqc(job_context: Dict) -> Dict:
    """Runs the `MultiQC` package to generate the QC report.

    TODO: These seem to consume a lot of RAM, even for small files.
    We should consider tuning these or breaking them out into their
    own processors. JVM settings may reduce RAM footprint.
    """
    command_str = ("multiqc {input_directory} --outdir {qc_directory} --zip-data-dir")
    formatted_command = command_str.format(input_directory=job_context["qc_input_directory"],
                                           qc_directory=job_context["qc_directory"])

    logger.debug("Running MultiQC using the following shell command: %s",
                formatted_command,
                processor_job=job_context["job_id"])

    qc_env = os.environ.copy()
    qc_env["LC_ALL"] = "C.UTF-8"
    qc_env["LANG"] = "C.UTF-8"

    time_start = timezone.now()
    completed_command = subprocess.run(formatted_command.split(),
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE,
                                       env=qc_env)
    time_end = timezone.now()

    if completed_command.returncode != 0:

        stderr = completed_command.stderr.decode().strip()
        error_start = stderr.upper().find("ERROR:")
        error_start = error_start if error_start != -1 else 0
        logger.error("Shell call to MultiQC failed with error message: %s",
                     stderr[error_start:],
                     processor_job=job_context["job_id"])

        job_context["job"].failure_reason = ("Shell call to MultiQC failed because: "
                                             + stderr[error_start:])
        job_context["success"] = False

    result = ComputationalResult()
    result.commands.append(formatted_command)
    result.time_start = time_start
    result.time_end = time_end
    result.is_ccdl = True

    try:
        processor_key = "MULTIQC"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context['pipeline'].steps.append(result.id)

    assoc = SampleResultAssociation()
    assoc.sample = job_context["sample"]
    assoc.result = result
    assoc.save()

    job_context['qc_result'] = result

    data_file = ComputedFile()
    data_file.filename = "multiqc_data.zip" # This is deterministic
    data_file.absolute_file_path = os.path.join(job_context["qc_directory"], data_file.filename)
    data_file.calculate_sha1()
    data_file.calculate_size()
    data_file.is_public = True
    data_file.result = job_context['qc_result']
    data_file.is_smashable = False
    data_file.is_qc = True
    data_file.save()
    job_context['computed_files'].append(data_file)

    SampleComputedFileAssociation.objects.get_or_create(
        sample=job_context["sample"],
        computed_file=data_file)

    report_file = ComputedFile()
    report_file.filename = "multiqc_report.html" # This is deterministic
    report_file.absolute_file_path = os.path.join(job_context["qc_directory"], report_file.filename)
    report_file.calculate_sha1()
    report_file.calculate_size()
    report_file.is_public = True
    report_file.is_smashable = False
    report_file.is_qc = True
    report_file.result = job_context['qc_result']
    report_file.save()
    job_context['computed_files'].append(report_file)

    job_context['qc_files'] = [data_file, report_file]

    return job_context