def create_compendia(job_id: int) -> None:
    pipeline = Pipeline(name=PipelineEnum.CREATE_COMPENDIA.value)
    job_context = utils.run_pipeline({
        "job_id": job_id,
        "pipeline": pipeline
    }, COMPENDIA_PIPELINE)
    return job_context
def build_transcriptome_index(job_id: int, length="long") -> None:
    """The main function for the Transcriptome Index Processor.

    The steps in this process are as follows:
      * First, files are retrieved from Temporary Storage.
      * Next, they are prepared by removing pseudogenes from the gtf file.
      * Next, RSEM's prepare-reference tool is run.
      * Finally, the salmon index command is run.
    The output of salmon index is a directory which is pushed in full
    to Permanent Storage.
    """
    pipeline = Pipeline(name=PipelineEnum.TX_INDEX.value)
    return utils.run_pipeline(
        {
            "job_id": job_id,
            "length": length,
            "pipeline": pipeline
        },
        [
            utils.start_job,
            _compute_paths,
            _prepare_files,
            _extract_assembly_information,
            _process_gtf,
            _create_index,
            _zip_index,
            _populate_index_object,
            utils.end_job,
        ],
    )
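# The processor entry points in this listing all follow the same convention:
# build an initial job context dict (the job_id plus a Pipeline record), hand it
# to utils.run_pipeline together with an ordered list of step functions, and
# return the final context. The sketch below is illustrative only, not
# refinebio's actual utils.run_pipeline; it assumes each step returns the
# (possibly modified) context and signals failure by setting "success" to False,
# which is the flag the tests later in this listing inspect.
from typing import Callable, Dict, List


def run_pipeline_sketch(job_context: Dict, steps: List[Callable[[Dict], Dict]]) -> Dict:
    """Thread job_context through each step in order, stopping on failure."""
    for step in steps:
        job_context = step(job_context)
        if job_context.get("success") is False:
            break
    return job_context


def _example_step(job_context: Dict) -> Dict:
    # A step reads what it needs from the context, does its work, and writes
    # its outputs (or a failure reason) back onto the same dict.
    if "job_id" not in job_context:
        job_context["success"] = False
        job_context["failure_reason"] = "Missing job_id"
        return job_context
    job_context["files_prepared"] = True
    return job_context


# Example usage of the sketch:
final_context = run_pipeline_sketch({"job_id": 123}, [_example_step])
assert final_context["files_prepared"]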
Example #3
def salmon(job_id: int) -> None:
    """Main processor function for the Salmon Processor.

    Runs salmon quant command line tool, specifying either a long or
    short read length. Also runs Salmontools and Tximport.
    """
    pipeline = Pipeline(name=PipelineEnum.SALMON.value)
    final_context = utils.run_pipeline(
        {
            "job_id": job_id,
            "pipeline": pipeline
        },
        [
            utils.start_job,
            _set_job_prefix,
            _prepare_files,
            _determine_index_length,
            _find_or_download_index,
            _run_salmon,
            get_tximport_inputs,
            tximport,
            _run_salmontools,
            utils.end_job,
        ],
    )
    return final_context
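# The docstring above mentions choosing between a long and a short read-length
# index (_determine_index_length / _find_or_download_index). The sketch below
# illustrates one plausible way to make that decision; the 75-base cutoff and
# the sampling of the first 1000 reads are assumptions for illustration, not
# refinebio's exact rule.
import gzip
import itertools
import statistics

READ_LENGTH_CUTOFF = 75  # assumed cutoff separating "short" from "long" indices


def determine_index_length_sketch(fastq_path: str, sample_size: int = 1000) -> str:
    """Pick "long" or "short" from the mean read length of a (gzipped) FASTQ."""
    opener = gzip.open if fastq_path.endswith(".gz") else open
    lengths = []
    with opener(fastq_path, "rt") as handle:
        # FASTQ records are four lines each; the sequence is the second line.
        for record in itertools.islice(zip(*[handle] * 4), sample_size):
            lengths.append(len(record[1].strip()))
    if not lengths:
        return "short"
    return "long" if statistics.mean(lengths) > READ_LENGTH_CUTOFF else "short"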
Example #4
def run_janitor(job_id: int) -> None:
    pipeline = Pipeline(name=utils.PipelineEnum.JANITOR.value)
    job_context = utils.run_pipeline({
        "job_id": job_id,
        "pipeline": pipeline
    }, [utils.start_job, _find_and_remove_expired_jobs, utils.end_job])
    return job_context
Example #5
    def test_single_read(self):
        """Test outputs when the sample has one read only."""
        job_context = {
            'job_id': 456,
            'job': ProcessorJob(),
            'pipeline': Pipeline(name="Salmon"),
            'input_file_path': self.test_dir + 'single_input/single_read.fastq',
            'output_directory': self.test_dir + 'single_output/',
            'salmontools_directory': self.test_dir + 'single_salmontools/',
            'salmontools_archive': self.test_dir + 'salmontools-result.tar.gz',
            'computed_files': []
        }
        os.makedirs(job_context["salmontools_directory"], exist_ok=True)

        homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")
        sample = Sample()
        sample.organism = homo_sapiens
        sample.save()
        job_context["sample"] = sample

        salmon._run_salmontools(job_context)

        # Confirm job status
        self.assertTrue(job_context["success"])

        # Unpack result for checking
        os.system('gunzip ' + job_context['salmontools_directory'] + "*.gz")

        # Check output file
        output_file = job_context['salmontools_directory'] + 'unmapped_by_salmon.fa'
        expected_output_file = self.test_dir + 'expected_single_output/unmapped_by_salmon.fa'
        self.assertTrue(identical_checksum(output_file, expected_output_file))
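# identical_checksum above is a helper from the surrounding test module. A
# plausible reconstruction (hypothetical, not the project's actual helper)
# simply compares SHA-1 digests of the two files:
import hashlib


def identical_checksum(path_a: str, path_b: str) -> bool:
    """Return True if both files hash to the same SHA-1 digest."""
    def sha1_of(path: str) -> str:
        digest = hashlib.sha1()
        with open(path, "rb") as handle:
            for chunk in iter(lambda: handle.read(8192), b""):
                digest.update(chunk)
        return digest.hexdigest()

    return sha1_of(path_a) == sha1_of(path_b)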
Example #6
    def test_salmontools_with_bad_processor(self):
        """Test salmontools with a bad processor key."""
        test_dir = '/home/user/data_store/salmontools/'
        job_context = {
            'job_id': 123,
            'job': ProcessorJob.objects.create(),
            'pipeline': Pipeline(name="Salmon"),
            'input_file_path': test_dir + 'double_input/reads_1.fastq',
            'input_file_path_2': test_dir + 'double_input/reads_2.fastq',
            'salmontools_directory': test_dir + 'double_salmontools/',
            'salmontools_archive': test_dir + 'salmontools-result.tar.gz',
            'output_directory': test_dir + 'double_output/'
        }
        os.makedirs(job_context["salmontools_directory"], exist_ok=True)
        homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")
        sample = Sample()
        sample.organism = homo_sapiens
        sample.save()
        job_context["sample"] = sample

        # Set the wrong yml filename on purpose to mess up Salmontools processor
        original_yml_file = utils.ProcessorEnum['SALMONTOOLS'].value['yml_file']
        utils.ProcessorEnum['SALMONTOOLS'].value['yml_file'] = 'foobar.yml'

        salmon._run_salmontools(job_context)
        self.assertEqual(job_context["success"], False)
        self.assertTrue(job_context["job"].failure_reason.startswith('Failed to set processor:'))

        # Change yml filename back
        utils.ProcessorEnum['SALMONTOOLS'].value['yml_file'] = original_yml_file
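        # Note: the test above swaps the enum's yml_file by hand and restores it at
        # the end, so a failing assertion in between would leave the broken value in
        # place for later tests. A sketch of the same check using
        # unittest.mock.patch.dict (assuming the enum value is a plain dict, as the
        # subscripting above suggests) restores it automatically:
        #
        #     from unittest import mock
        #
        #     with mock.patch.dict(utils.ProcessorEnum["SALMONTOOLS"].value,
        #                          {"yml_file": "foobar.yml"}):
        #         salmon._run_salmontools(job_context)
        #         self.assertEqual(job_context["success"], False)
        #         self.assertTrue(job_context["job"].failure_reason.startswith(
        #             "Failed to set processor:"))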
Example #7
def no_op_processor(job_id: int) -> None:
    pipeline = Pipeline(name=utils.PipelineEnum.NO_OP.value)
    return utils.run_pipeline({"job_id": job_id, "pipeline": pipeline},
                              [utils.start_job,
                               _prepare_files,
                               _convert_genes,
                               _create_result,
                               utils.end_job])
Example #8
def illumina_to_pcl(job_id: int) -> None:
    pipeline = Pipeline(name=utils.PipelineEnum.ILLUMINA.value)
    return utils.run_pipeline({
        "job_id": job_id,
        "pipeline": pipeline
    }, [
        utils.start_job, _prepare_files, _detect_columns, _detect_platform,
        _run_illumina, _create_result_objects, utils.end_job
    ])
Example #9
def affy_to_pcl(job_id: int) -> None:
    pipeline = Pipeline(name=utils.PipelineEnum.ARRAY_EXPRESS.value)
    utils.run_pipeline({
        "job_id": job_id,
        "pipeline": pipeline
    }, [
        utils.start_job, _prepare_files, _determine_brainarray_package,
        _run_scan_upc, _create_result_objects, utils.end_job
    ])
Example #10
    def test_salmon_quant_one_sample_double_reads(self):
        """Test `salmon quant` on a sample that has double reads."""
        # Set up organism index database objects.
        prepare_organism_indices()

        # Create an Experiment that includes two samples.
        # (The first sample has test data available, but the second does not.)
        experiment_accession = 'test_experiment'
        experiment = Experiment.objects.create(accession_code=experiment_accession)

        c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

        # test_sample record
        sample_accession = 'test_sample'
        test_sample = Sample.objects.create(accession_code=sample_accession,
                                            organism=c_elegans,
                                            source_database='SRA',
                                            technology='RNA-SEQ')
        ExperimentSampleAssociation.objects.create(experiment=experiment, sample=test_sample)
        # fake_sample record (created to prevent tximport step in this experiment)
        fake_sample = Sample.objects.create(accession_code='fake_sample',
                                            source_database='SRA',
                                            technology='RNA-SEQ')
        ExperimentSampleAssociation.objects.create(experiment=experiment, sample=fake_sample)

        experiment_dir = '/home/user/data_store/salmon_tests/test_experiment'

        og_read_1 = OriginalFile()
        og_read_1.absolute_file_path = os.path.join(experiment_dir, 'raw/reads_1.fastq')
        og_read_1.filename = "reads_1.fastq"
        og_read_1.save()

        OriginalFileSampleAssociation.objects.create(original_file=og_read_1, sample=test_sample).save()

        og_read_2 = OriginalFile()
        og_read_2.absolute_file_path = os.path.join(experiment_dir, "raw/reads_2.fastq")
        og_read_2.filename = "reads_2.fastq"
        og_read_2.save()

        OriginalFileSampleAssociation.objects.create(original_file=og_read_2, sample=test_sample).save()

        sample_dir = os.path.join(experiment_dir, 'test_sample')

        job_context = salmon._prepare_files({"job_dir_prefix": "TEST",
                                             "job_id": "TEST",
                                             "job": ProcessorJob(),
                                             'pipeline': Pipeline(name="Salmon"),
                                             'computed_files': [],
                                             "original_files": [og_read_1, og_read_2]})

        # Run salmon.
        self.check_salmon_quant(job_context, sample_dir)

        # Confirm that this experiment is not ready for tximport yet,
        # because `salmon quant` is not run on 'fake_sample'.
        experiments_ready = salmon.get_tximport_inputs(job_context)['tximport_inputs']
        self.assertEqual(len(experiments_ready), 0)
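# The last assertion relies on get_tximport_inputs holding back an experiment
# while some of its samples still lack salmon quant output; because 'fake_sample'
# was never quantified, the experiment above is not reported. The function below
# is a simplified, illustrative sketch of that readiness rule, not the project's
# implementation (which works against the database rather than plain dicts).
from typing import Dict, List, Set


def experiments_ready_for_tximport_sketch(
    samples_by_experiment: Dict[str, List[str]],
    samples_with_quant_sf: Set[str],
) -> Dict[str, List[str]]:
    """Return only the experiments whose samples all have quant.sf results."""
    ready = {}
    for accession, samples in samples_by_experiment.items():
        if samples and all(sample in samples_with_quant_sf for sample in samples):
            ready[accession] = samples
    return ready


# Mirroring the test above: only 'test_sample' has been quantified, so the
# experiment is not ready and nothing is returned.
assert experiments_ready_for_tximport_sketch(
    {"test_experiment": ["test_sample", "fake_sample"]}, {"test_sample"}
) == {}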
Example #11
def create_compendia(job_id: int) -> None:
    pipeline = Pipeline(name=utils.PipelineEnum.COMPENDIA.value)
    job_context = utils.run_pipeline({"job_id": job_id, "pipeline": pipeline},
                       [utils.start_job,
                        _prepare_input,
                        _perform_imputation,
                        _create_result_objects,
                        utils.end_job])
    return job_context
Example #12
def agilent_twocolor_to_pcl(job_id: int) -> None:
    pipeline = Pipeline(name=utils.PipelineEnum.AGILENT_TWOCOLOR.value)
    utils.run_pipeline({
        "job_id": job_id,
        "pipeline": pipeline
    }, [
        utils.start_job, _prepare_files, _run_scan_twocolor,
        _create_result_objects, utils.end_job
    ])
Example #13
def create_qn_reference(job_id: int) -> None:
    pipeline = Pipeline(name=utils.PipelineEnum.QN_REFERENCE.value)
    job_context = utils.run_pipeline({"job_id": job_id, "pipeline": pipeline},
                       [utils.start_job,
                        _prepare_input,
                        _quantile_normalize,
                        _verify_result,
                        _create_result_objects,
                        utils.end_job])
    return job_context
Example #14
    def test_fastqc(self):
        job, og_files = prepare_job()
        win_context = {
            'job': job,
            'job_id': 789,
            'job_dir_prefix': "processor_job_789",
            'pipeline': Pipeline(name="Salmon"),
            'qc_directory': "/home/user/data_store/raw/TEST/SALMON/qc",
            'original_files': og_files,
            'input_file_path': og_files[0],
            'input_file_path_2': og_files[1],
            "computed_files": [],
            'success': True
        }

        # Ensure clean testdir
        shutil.rmtree(win_context['qc_directory'], ignore_errors=True)
        os.makedirs(win_context['qc_directory'], exist_ok=True)
        win_context = salmon._prepare_files(win_context)

        win = salmon._run_fastqc(win_context)
        self.assertTrue(win['success'])
        win = salmon._run_multiqc(win_context)
        self.assertTrue(win['success'])

        for file in win['qc_files']:
            self.assertTrue(os.path.isfile(file.absolute_file_path))

        fail_context = {
            'job': job,
            'job_id': 'hippityhoppity',
            'pipeline': Pipeline(name="Salmon"),
            'qc_directory': "/home/user/data_store/raw/TEST/SALMON/derp",
            'original_files': [],
            'success': True,
            'computed_files': []
        }
        fail = salmon._run_fastqc(fail_context)
        self.assertFalse(fail['success'])
Example #15
def smash(job_id: int, upload=True) -> None:
    """Main Smasher interface."""

    pipeline = Pipeline(name=utils.PipelineEnum.SMASHER.value)
    return utils.run_pipeline(
        {
            "job_id": job_id,
            "upload": upload,
            "pipeline": pipeline
        }, [
            utils.start_job, _prepare_files, _smash, _upload, _notify,
            _update_result_objects, utils.end_job
        ])
Example #16
def create_qn_reference(job_id: int, create_results=True) -> None:
    pipeline = Pipeline(name=PipelineEnum.QN_REFERENCE.value)
    job_context = utils.run_pipeline(
        {"job_id": job_id, "pipeline": pipeline, "create_results": create_results},
        [
            utils.start_job,
            _prepare_input,
            _build_qn_target,
            _create_result_objects,
            _update_caches,
            utils.end_job,
        ],
    )
    return job_context
Example #17
def tximport(job_id: int) -> None:
    """Main processor function for the Tximport Processor.

    Runs tximport command line tool on an experiment.
    """
    pipeline = Pipeline(name=utils.PipelineEnum.TXIMPORT.value)
    final_context = utils.run_pipeline({
        "job_id": job_id,
        "pipeline": pipeline
    }, [
        utils.start_job, _set_job_prefix, _prepare_files,
        salmon._find_or_download_index, salmon.tximport, utils.end_job
    ])
    return final_context
Example #18
def create_quantpendia(job_id: int) -> None:
    pipeline = Pipeline(name=PipelineEnum.CREATE_QUANTPENDIA.value)
    job_context = utils.run_pipeline(
        {"job_id": job_id, "pipeline": pipeline},
        [
            utils.start_job,
            _make_dirs,
            _download_files,
            _add_metadata,
            _make_archive,
            _create_result_objects,
            _remove_job_dir,
            utils.end_job,
        ],
    )
    return job_context
Example #19
    def test_queryset_iterator(self):
        """Test the queryset iterator by using it to actually iterate over a queryset.

        Uses Pipeline just because it's easy to init."""
        # Page size defaults to 2000, so use something bigger than
        # that so there's more than one page.
        for i in range(3000):
            Pipeline(name=str(i)).save()

        pipelines = Pipeline.objects.all()

        # Build a list of the names just to do something with the data
        # so we know the query actually resolved.
        names = []
        for pipeline in utils.queryset_iterator(pipelines):
            names.append(pipeline.name)

        self.assertEqual(len(names), 3000)
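# utils.queryset_iterator, exercised above, pages through a queryset instead of
# loading all rows at once (the comment notes a default page size of 2000). The
# sketch below shows the general keyset-paging idea under the assumption that it
# slices on the primary key; it is illustrative, not the project's implementation.
def queryset_iterator_sketch(queryset, page_size=2000):
    """Yield objects from a Django queryset one page at a time, ordered by pk."""
    queryset = queryset.order_by("pk")
    last_pk = None
    while True:
        page = queryset if last_pk is None else queryset.filter(pk__gt=last_pk)
        page = list(page[:page_size])
        if not page:
            return
        for obj in page:
            yield obj
        last_pk = page[-1].pk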
Example #20
def create_compendia(job_id: int) -> None:
    pipeline = Pipeline(name=PipelineEnum.CREATE_COMPENDIA.value)
    job_context = utils.run_pipeline(
        {
            "job_id": job_id,
            "pipeline": pipeline
        },
        [
            utils.start_job,
            _prepare_input,
            _prepare_frames,
            _perform_imputation,
            smashing_utils.write_non_data_files,
            _create_result_objects,
            utils.end_job,
        ],
    )
    return job_context
Example #21
    def test_detect_columns(self):
        organism = Organism(name="HOMO_SAPIENS",
                            taxonomy_id=9606,
                            is_scientific_name=True)
        organism.save()

        job = prepare_illumina_job({**GSE22427, "organism": organism})

        pipeline = Pipeline(name=PipelineEnum.ILLUMINA.value)

        final_context = utils.run_pipeline(
            {
                "job_id": job.id,
                "pipeline": pipeline
            },
            [
                utils.start_job,
                illumina._prepare_files,
                illumina._detect_encoding,
                illumina._sanitize_input_file,
                illumina._convert_sanitized_to_tsv,
                illumina._detect_columns,
            ],
        )

        self.assertNotEqual(final_context.get("success"), False)

        # For this experiment, the probe ID is the first column
        self.assertEqual(final_context.get("probeId"), GSE22427_HEADER[0])

        expected_column_ids = ",".join(
            map(
                lambda t: str(t[0]),
                filter(
                    # For this header file, the samples all have the prefix LV-
                    lambda t: t[1].startswith("LV-"),
                    # We use start=1 here because the column IDs are formatted
                    # for R code so they treat the header as a 1-indexed list
                    enumerate(GSE22427_HEADER, start=1),
                ),
            ))
        self.assertEqual(final_context.get("columnIds"), expected_column_ids)
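# The comments above note that the column IDs are 1-indexed because they are
# handed to R code, which treats the header as a 1-indexed list. On a toy header
# (illustrative only) the same mapping looks like this:
toy_header = ["ID_REF", "LV-1", "Detection-1", "LV-2"]
toy_column_ids = ",".join(
    str(i) for i, name in enumerate(toy_header, start=1) if name.startswith("LV-")
)
assert toy_column_ids == "2,4"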
Example #22
def smash(job_id: int, upload=True) -> None:
    """Main Smasher interface"""
    pipeline = Pipeline(name=PipelineEnum.SMASHER.value)
    job_context = utils.run_pipeline(
        {
            "job_id": job_id,
            "upload": upload,
            "pipeline": pipeline
        },
        [
            utils.start_job,
            smashing_utils.prepare_files,
            _smash_all,
            _upload,
            _update_result_objects,
            utils.end_job,
        ],
    )
    # Ensure that `_notify` is always called so that users get an email whether processing fails or succeeds.
    job_context = _notify(job_context)
    return job_context
Example #23
def salmon(job_id: int) -> None:
    """Main processor function for the Salmon Processor.

    Runs salmon quant command line tool, specifying either a long or
    short read length. Also runs FastQC, MultiQC, and Salmontools.
    """
    pipeline = Pipeline(name=utils.PipelineEnum.SALMON.value)
    final_context = utils.run_pipeline({"job_id": job_id, "pipeline": pipeline},
                       [utils.start_job,
                        _set_job_prefix,
                        _prepare_files,
                        _extract_sra,

                        _determine_index_length,
                        _find_or_download_index,

                        _run_fastqc,
                        _run_salmon,
                        _run_salmontools,
                        _run_multiqc,
                        utils.end_job])
    return final_context
Example #24
    def test_imputation(self):
        job = ProcessorJob()
        job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
        job.save()

        # MICROARRAY TECH
        experiment = Experiment()
        experiment.accession_code = "GSE1234"
        experiment.save()

        result = ComputationalResult()
        result.save()

        qn_target = ComputedFile()
        qn_target.filename = "danio_target.tsv"
        qn_target.absolute_file_path = "/home/user/data_store/QN/danio_target.tsv"
        qn_target.is_qn_target = True
        qn_target.size_in_bytes = "12345"
        qn_target.sha1 = "aabbccddeeff"
        qn_target.result = result
        qn_target.save()

        danio_rerio = Organism(name="DANIO_RERIO",
                               taxonomy_id=1,
                               qn_target=result)
        danio_rerio.save()

        cra = ComputationalResultAnnotation()
        cra.data = {}
        cra.data["organism_id"] = danio_rerio.id
        cra.data["is_qn"] = True
        cra.result = result
        cra.save()

        result = ComputationalResult()
        result.save()

        micros = []
        for file in os.listdir("/home/user/data_store/raw/TEST/MICROARRAY/"):

            if "microarray.txt" in file:
                continue

            create_sample_for_experiment(
                {
                    "organism": danio_rerio,
                    "accession_code": file,
                    "technology": "MICROARRAY",
                    "filename": file,
                    "data_dir": "/home/user/data_store/raw/TEST/MICROARRAY/",
                },
                experiment,
            )

            micros.append(file)

        experiment = Experiment()
        experiment.accession_code = "GSE5678"
        experiment.save()

        result = ComputationalResult()
        result.save()
        rnas = []
        for file in os.listdir("/home/user/data_store/raw/TEST/RNASEQ/"):

            if "rnaseq.txt" in file:
                continue

            create_sample_for_experiment(
                {
                    "organism": danio_rerio,
                    "accession_code": file,
                    "technology": "RNA-SEQ",
                    "filename": file,
                    "data_dir": "/home/user/data_store/raw/TEST/RNASEQ/",
                },
                experiment,
            )

            rnas.append(file)

        # Missing sample that will be filtered
        sample = create_sample_for_experiment(
            {
                "organism": danio_rerio,
                "accession_code": "GSM1487222",
                "title": "this sample will be filtered",
                "technology": "RNA-SEQ",
                "filename": None,
            },
            experiment,
        )
        rnas.append(sample.accession_code)

        dset = Dataset()
        dset.data = {"GSE1234": micros, "GSE5678": rnas}
        dset.scale_by = "NONE"
        dset.aggregate_by = "SPECIES"
        dset.svd_algorithm = "ARPACK"
        dset.quantile_normalize = True
        dset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dset
        pjda.save()

        imputation_index = create_compendia.COMPENDIA_PIPELINE.index(
            create_compendia._perform_imputation)

        pipeline = Pipeline(name=PipelineEnum.CREATE_COMPENDIA.value)
        job_context = utils.run_pipeline(
            {
                "job_id": job.id,
                "pipeline": pipeline
            },
            create_compendia.COMPENDIA_PIPELINE[:imputation_index],
        )

        # First, run the imputation step without removing anything to get a baseline
        expected_context = utils.run_pipeline(
            job_context.copy(),
            [create_compendia.COMPENDIA_PIPELINE[imputation_index]])

        # Now pick some rows to remove according to the instructions from
        # https://github.com/AlexsLemonade/refinebio/pull/2879#issuecomment-895143336

        random.seed(42)

        # Select some rows randomly and mask a little bit less than 30% of the values
        rare_rows = random.sample(list(job_context["microarray_matrix"].index),
                                  k=25)
        rare_genes = {}
        for row in rare_rows:
            cols = random.sample(
                list(job_context["microarray_matrix"].columns),
                # There are around 840 samples, and we want to pick a little bit
                # less than 30% of them
                k=int(0.28 * 840),
            )
            rare_genes[row] = cols
            for col in cols:
                job_context["microarray_matrix"].loc[row, col] = np.nan

        # Now randomly select some entries from the other rows to mask
        individual_indices = random.sample(
            list(
                itertools.product(
                    set(job_context["microarray_matrix"].index) -
                    set(rare_rows),
                    job_context["microarray_matrix"].columns,
                )),
            k=1000,
        )
        for row, col in individual_indices:
            job_context["microarray_matrix"].loc[row, col] = np.nan

        final_context = utils.run_pipeline(
            job_context,
            [create_compendia.COMPENDIA_PIPELINE[imputation_index]])
        self.assertDidNotFail(job)

        index = set(final_context["merged_no_qn"].index) & set(
            expected_context["merged_no_qn"].index)
        columns = set(final_context["merged_no_qn"].columns) & set(
            expected_context["merged_no_qn"].columns)

        # Calculate the Root-Mean-Square Error (RMSE) of the imputed values.
        # See https://en.wikipedia.org/wiki/Root-mean-square_deviation
        # for a description of the formula.

        N = 0
        squared_error = 0
        affected_entries = {
            *individual_indices,
            *((row, col) for row, cols in rare_genes.items() for col in cols),
        }
        for row, col in affected_entries:
            if row in index and col in columns:
                actual = final_context["merged_no_qn"].loc[row, col]
                expected = expected_context["merged_no_qn"].loc[row, col]

                N += 1
                squared_error += (actual - expected)**2

        rmse = math.sqrt(squared_error / N)
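        # This implements the RMSE formula referenced in the comment above:
        # rmse = sqrt(sum((actual - expected)**2) / N), taken only over entries
        # that were masked earlier. With the values collected into numpy arrays
        # this would be equivalent to (illustrative):
        #
        #     rmse = np.sqrt(np.mean((actual_values - expected_values) ** 2))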

        # The results of a previous run plus a little bit of leeway
        self.assertLess(abs(rmse - 0.2868600293662542), 0.05)
Example #25
def run_tximport_at_progress_point(complete_accessions: List[str], incomplete_accessions: List[str]) -> Dict:
    """Create an experiment and associated objects and run tximport on it.

    Creates a sample for each accession contained in either input
    list. The samples in complete_accessions will be simulated as
    already having salmon quant run on them. The samples in
    incomplete_accessions won't.
    """
    # Create the experiment
    experiment_accession = 'SRP095529'
    data_dir = '/home/user/data_store/salmon_tests/'
    experiment_dir = data_dir + experiment_accession
    experiment = Experiment.objects.create(accession_code=experiment_accession)

    zebrafish = Organism.get_object_for_name("DANIO_RERIO")

    # This is a lie, but this image doesn't have the dependencies for TRANSCRIPTOME_INDEX
    computational_result_short = ComputationalResult(processor=utils.find_processor('SALMON_QUANT'))
    computational_result_short.save()

    organism_index = OrganismIndex()
    organism_index.index_type = "TRANSCRIPTOME_SHORT"
    organism_index.organism = zebrafish
    organism_index.result = computational_result_short
    organism_index.absolute_directory_path = "/home/user/data_store/salmon_tests/ZEBRAFISH_INDEX/SHORT"
    organism_index.save()

    comp_file = ComputedFile()
    # This path will not be used because we already have the files extracted.
    comp_file.absolute_file_path = "/home/user/data_store/salmon_tests/ZEBRAFISH_INDEX/SHORT/zebrafish_short.tar.gz"
    comp_file.result = computational_result_short
    comp_file.size_in_bytes = 1337
    comp_file.sha1 = "ABC"
    comp_file.s3_key = "key"
    comp_file.s3_bucket = "bucket"
    comp_file.save()

    for accession_code in incomplete_accessions:
        last_sample = Sample.objects.create(
            accession_code=accession_code,
            organism=zebrafish,
            source_database='SRA',
            technology='RNA-SEQ'
        )
        ExperimentSampleAssociation.objects.create(experiment=experiment, sample=last_sample)

    # Create tximport result and files
    quant_processor = utils.find_processor("SALMON_QUANT")
    tximport_processor = utils.find_processor("TXIMPORT")

    # Create the already processed samples along with their
    # ComputationalResults and ComputedFiles. They don't need
    # original files for this test because we aren't going to run
    # salmon quant on them.
    for accession_code in complete_accessions:
        sample = Sample.objects.create(
            accession_code=accession_code,
            organism=zebrafish,
            source_database='SRA',
            technology='RNA-SEQ'
        )
        ExperimentSampleAssociation.objects.create(experiment=experiment, sample=sample)

        if accession_code == "SRR5125622":
            current_sample = sample

        # Create and associate quant result and files.
        quant_result = ComputationalResult()
        quant_result.is_ccdl = True
        quant_result.processor = quant_processor
        quant_result.save()

        # In prod the filename pattern will involve the timestamp
        # but here we're using the accession code so we can find
        # the archive file for the current sample.
        archive_filename = "result-" + accession_code + ".tar.gz"
        archive_file = ComputedFile()
        archive_file.filename = archive_filename
        archive_file.absolute_file_path = os.path.join(experiment_dir, archive_filename)
        archive_file.is_public = False
        archive_file.is_smashable = False
        archive_file.is_qc = False
        archive_file.result = quant_result
        archive_file.size_in_bytes = 12345
        archive_file.save()

        quant_file = ComputedFile()
        quant_file.filename = "quant.sf"
        quant_file.absolute_file_path = experiment_dir + "/quant_files/" + accession_code + "_output/quant.sf"
        quant_file.is_public = False
        quant_file.is_smashable = False
        quant_file.is_qc = False
        quant_file.result = quant_result
        quant_file.size_in_bytes = 12345
        quant_file.s3_bucket = "bucket"
        quant_file.s3_key = "key"
        quant_file.save()

        SampleResultAssociation.objects.get_or_create(
            sample=sample,
            result=quant_result
        )

    # Processor jobs need at least one original file associated with
    # them so they know what they're processing.
    current_og = OriginalFile()
    current_og.absolute_file_path = os.path.join(experiment_dir, 'SRR5125622.fastq.gz')
    current_og.filename = "SRR5125622.fastq.gz"
    current_og.save()

    OriginalFileSampleAssociation.objects.create(original_file=current_og, sample=current_sample).save()

    pj = ProcessorJob()
    pj.pipeline_applied = "TXIMPORT"
    pj.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = current_og
    assoc1.processor_job = pj
    assoc1.save()

    # Prep our job context
    job_context = tximport._prepare_files({"job_dir_prefix": "TEST3",
                                           "job_id": "TEST3",
                                           "job": pj,
                                           "index_directory": organism_index.absolute_directory_path,
                                           "pipeline": Pipeline(name="Salmon"),
                                           "computed_files": [],
                                           "original_files": [current_og]})

    # We don't have the raw file to run _determine_index_length so
    # just pick one, it doesn't matter that much because we aren't
    # checking the output data.
    job_context["index_length"] = "short"
    job_context = salmon._find_or_download_index(job_context)

    job_context = salmon.get_tximport_inputs(job_context)
    job_context = salmon.tximport(job_context)

    return job_context
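# Hypothetical call site for the helper above. Note that the helper assumes
# "SRR5125622" appears in complete_accessions, because it uses that sample's
# original file to anchor the TXIMPORT processor job; the other accession codes
# here are made up for illustration.
job_context = run_tximport_at_progress_point(
    complete_accessions=["SRR5125622", "SRR5125623"],
    incomplete_accessions=["SRR5125624"],
)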
Example #26
    def test_salmon_quant_two_samples_single_read(self):
        """Test `salmon quant` outputs on two samples that each have a single
        read and belong to the same experiment.
        """
        prepare_organism_indices()

        # Create one experiment and two related samples, based on:
        #   https://www.ncbi.nlm.nih.gov/sra/?term=SRP040623
        # (For testing purpose, only two of the four samples' data are included.)
        experiment_accession = 'PRJNA242809'
        experiment = Experiment.objects.create(accession_code=experiment_accession)

        c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

        ## Sample 1
        sample1_accession = 'SRR1206053'
        sample1 = Sample.objects.create(accession_code=sample1_accession,
                                        organism=c_elegans)
        ExperimentSampleAssociation.objects.create(experiment=experiment, sample=sample1)

        experiment_dir = "/home/user/data_store/salmon_tests/PRJNA242809"

        og_file_1 = OriginalFile()
        og_file_1.absolute_file_path = os.path.join(experiment_dir, "raw/SRR1206053.fastq.gz")
        og_file_1.filename = "SRR1206053.fastq.gz"
        og_file_1.save()

        OriginalFileSampleAssociation.objects.create(original_file=og_file_1, sample=sample1).save()

        ## Sample 2
        sample2_accession = 'SRR1206054'
        sample2 = Sample.objects.create(accession_code=sample2_accession,
                                        organism=c_elegans)
        ExperimentSampleAssociation.objects.create(experiment=experiment, sample=sample2)

        og_file_2 = OriginalFile()
        og_file_2.absolute_file_path = os.path.join(experiment_dir, "raw/SRR1206054.fastq.gz")
        og_file_2.filename = "SRR1206054.fastq.gz"
        og_file_2.save()

        OriginalFileSampleAssociation.objects.create(original_file=og_file_2, sample=sample2).save()

        # Test `salmon quant` on sample1 (SRR1206053)
        sample1_dir = os.path.join(experiment_dir, sample1_accession)

        job1_context = salmon._prepare_files({"job_dir_prefix": "TEST",
                                              "job_id": "TEST",
                                              'pipeline': Pipeline(name="Salmon"),
                                              'computed_files': [],
                                              "original_files": [og_file_1]})

        # Check quant.sf in `salmon quant` output dir of sample1
        self.check_salmon_quant(job1_context, sample1_dir)
        # Confirm that this experiment is not ready for tximport yet.
        experiments_ready = salmon._get_tximport_inputs(job1_context)
        self.assertEqual(len(experiments_ready), 0)
        # This job should not have produced any tximport output
        # because the other sample isn't ready yet.
        self.assertFalse(os.path.exists(os.path.join(job1_context["work_dir"], 'txi_out.RDS')))

        # Now run `salmon quant` on sample2 (SRR1206054) too
        sample2_dir = os.path.join(experiment_dir, sample2_accession)
        job2_context = salmon._prepare_files({"job_dir_prefix": "TEST2",
                                              "job_id": "TEST2",
                                              'pipeline': Pipeline(name="Salmon"),
                                              'computed_files': [],
                                              "original_files": [og_file_2]})

        # Clean up tximport output:
        rds_filename = os.path.join(job2_context["work_dir"], 'txi_out.RDS')
        if os.path.isfile(rds_filename):
            os.remove(rds_filename)

        # Check quant.sf in `salmon quant` output dir of sample2
        self.check_salmon_quant(job2_context, sample2_dir)

        # rds_filename should have been generated by tximport at this point.
        # Note: `tximport` step is launched by subprocess module in Python.
        # If input "quant.sf" files are too large, we may have to wait for
        # a few seconds before testing the existence of rds_filename.
        self.assertTrue(os.path.exists(rds_filename))

        for computed_file in job2_context['computed_files']:
            if computed_file.filename[-4:] == '.RDS':
                rds_file_path = computed_file.absolute_file_path

        cmd_tokens = [
            "/usr/bin/Rscript", "--vanilla",
            "/home/user/data_refinery_workers/processors/test_tximport.R",
            "--txi_out", rds_file_path,
            "--gene2txmap", job2_context["genes_to_transcripts_path"]
        ]

        tximport_test_result = subprocess.run(cmd_tokens, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        if tximport_test_result.returncode != 0:
            # A non-zero exit code means tximport failed, so fail the test.
            self.fail("tximport test script exited with code %d" % tximport_test_result.returncode)

        # Check the individual files
        self.assertEqual(len(job2_context['individual_files']), 2)
        for file in job2_context['individual_files']:
            self.assertTrue(os.path.isfile(file.absolute_file_path))