def test_salmon_quant_one_sample_double_reads(self):
    """Test `salmon quant` on a sample that has double reads.

    Also confirms the experiment is NOT ready for tximport afterwards,
    because the second (fake) sample has no `salmon quant` results.
    """
    # Set up organism index database objects.
    prepare_organism_indices()

    # Create an Experiment that includes two samples.
    # (The first sample has test data available, but the second does not.)
    experiment_accession = 'test_experiment'
    experiment = Experiment.objects.create(accession_code=experiment_accession)

    c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

    # test_sample record
    sample_accession = 'test_sample'
    test_sample = Sample.objects.create(accession_code=sample_accession,
                                        organism=c_elegans,
                                        source_database='SRA',
                                        technology='RNA-SEQ')
    ExperimentSampleAssociation.objects.create(experiment=experiment, sample=test_sample)

    # fake_sample record (created to prevent tximport step in this experiment)
    fake_sample = Sample.objects.create(accession_code='fake_sample',
                                        source_database='SRA',
                                        technology='RNA-SEQ')
    ExperimentSampleAssociation.objects.create(experiment=experiment, sample=fake_sample)

    experiment_dir = '/home/user/data_store/salmon_tests/test_experiment'

    og_read_1 = OriginalFile()
    og_read_1.absolute_file_path = os.path.join(experiment_dir, 'raw/reads_1.fastq')
    og_read_1.filename = "reads_1.fastq"
    og_read_1.save()

    OriginalFileSampleAssociation.objects.create(
        original_file=og_read_1, sample=test_sample).save()

    og_read_2 = OriginalFile()
    og_read_2.absolute_file_path = os.path.join(experiment_dir, "raw/reads_2.fastq")
    # BUGFIX: this previously set the filename to "reads_1.fastq", which
    # did not match the second read's absolute_file_path above.
    og_read_2.filename = "reads_2.fastq"
    og_read_2.save()

    OriginalFileSampleAssociation.objects.create(
        original_file=og_read_2, sample=test_sample).save()

    sample_dir = os.path.join(experiment_dir, 'test_sample')

    job_context = salmon._prepare_files({"job_dir_prefix": "TEST",
                                         "job_id": "TEST",
                                         "job": ProcessorJob(),
                                         'pipeline': Pipeline(name="Salmon"),
                                         'computed_files': [],
                                         "original_files": [og_read_1, og_read_2]})

    # Run salmon.
    self.check_salmon_quant(job_context, sample_dir)

    # Confirm that this experiment is not ready for tximport yet,
    # because `salmon quant` is not run on 'fake_sample'.
    experiments_ready = salmon.get_tximport_inputs(job_context)['tximport_inputs']
    self.assertEqual(len(experiments_ready), 0)
def test_salmon_determine_index_length_double_read(self):
    """A double-read sample yields raw length 41, i.e. a 'short' index."""
    job, original_files = prepare_job()

    # Build the job context the same way the processor pipeline would.
    context = salmon._set_job_prefix({'original_files': original_files,
                                      'job_id': job})
    context = salmon._prepare_files(context)

    outcome = salmon._determine_index_length(context)

    self.assertEqual(41, outcome['index_length_raw'])
    self.assertEqual('short', outcome['index_length'])
def test_prepare_files_failure(self):
    """`_prepare_files` marks the job failed when a raw file is missing."""
    batch, _, _ = init_objects()
    job = ProcessorJob.create_job_and_relationships(batches=[batch])

    context = utils.start_job({
        "job": job,
        "job_id": job.id
    })
    context = salmon._prepare_files(context)

    self.assertFalse(context["success"])
    self.assertEqual(
        job.failure_reason,
        "Exception caught while retrieving raw file ERR003000_2.fastq.gz")
    # The temp copy of the raw file must not have been left behind.
    self.assertFalse(os.path.isfile(batch.files[0].get_temp_pre_path()))
def test_download_index_not_found(self):
    """`_download_index` fails when no index exists for the kmer_size."""
    batch, _, _ = init_objects()
    processor_job = ProcessorJob.create_job_and_relationships(
        batches=[batch])
    job_context = utils.start_job({
        "job": processor_job,
        "job_id": processor_job.id,
        "kmer_size": "23"
    })
    job_context = salmon._prepare_files(job_context)

    # Function we're testing.
    salmon._download_index(job_context)

    self.assertFalse(job_context["success"])
    # BUGFIX: the expected organism name was garbled ("H**O SAPIENS",
    # an artifact of text masking); restored to the upper-cased
    # scientific name used elsewhere in these tests.
    self.assertEqual(
        processor_job.failure_reason,
        "Failed to find an index for organism HOMO SAPIENS with kmer_size of 23."
    )
def test_fastqc(self):
    """FastQC/MultiQC succeed on real files and fail on an empty file list."""
    job, og_files = prepare_job()

    good_context = {
        'job': job,
        'job_id': 789,
        'job_dir_prefix': "processor_job_789",
        'pipeline': Pipeline(name="Salmon"),
        'qc_directory': "/home/user/data_store/raw/TEST/SALMON/qc",
        'original_files': og_files,
        'input_file_path': og_files[0],
        'input_file_path_2': og_files[1],
        "computed_files": [],
        'success': True
    }

    # Ensure clean testdir
    shutil.rmtree(good_context['qc_directory'], ignore_errors=True)
    os.makedirs(good_context['qc_directory'], exist_ok=True)

    good_context = salmon._prepare_files(good_context)

    outcome = salmon._run_fastqc(good_context)
    self.assertTrue(outcome['success'])
    outcome = salmon._run_multiqc(good_context)
    self.assertTrue(outcome['success'])

    for qc_file in outcome['qc_files']:
        self.assertTrue(os.path.isfile(qc_file.absolute_file_path))

    bad_context = {
        'job': job,
        'job_id': 'hippityhoppity',
        'pipeline': Pipeline(name="Salmon"),
        'qc_directory': "/home/user/data_store/raw/TEST/SALMON/derp",
        'original_files': [],
        'success': True,
        'computed_files': []
    }
    outcome = salmon._run_fastqc(bad_context)
    self.assertFalse(outcome['success'])
def test_download_index_missing(self, mock_download_processed_file):
    """`_download_index` fails when the index tarball cannot be downloaded.

    NOTE(review): `mock_download_processed_file` is injected by a patch
    decorator that is outside this excerpt — presumably patching the
    file-download helper used by `_download_index`; confirm at the decorator.
    """
    batch, _, _ = init_objects()
    _insert_salmon_index()
    processor_job = ProcessorJob.create_job_and_relationships(
        batches=[batch])
    job_context = utils.start_job({
        "job": processor_job,
        "job_id": processor_job.id,
        "job_dir_prefix": "dummy",
        "kmer_size": "23"
    })
    job_context = salmon._prepare_files(job_context)

    # Simulate the download helper raising because the file is absent.
    mock_download_processed_file.side_effect = FileNotFoundError()

    # The function being testing.
    salmon._download_index(job_context)

    # The job must be marked failed with a descriptive reason.
    self.assertFalse(job_context["success"])
    self.assertEqual(
        processor_job.failure_reason,
        "Failed to download and extract index tarball Homo_sapiens_short.gtf.gz"
    )
def test_salmon_quant_two_samples_single_read(self):
    """Test `salmon quant` outputs on two samples that have single
    read and that belong to same experiment.

    tximport should only run after BOTH samples have quant results.
    """
    prepare_organism_indices()

    # Create one experiment and two related samples, based on:
    # https://www.ncbi.nlm.nih.gov/sra/?term=SRP040623
    # (For testing purpose, only two of the four samples' data are included.)
    experiment_accession = 'PRJNA242809'
    experiment = Experiment.objects.create(accession_code=experiment_accession)

    c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

    ## Sample 1
    sample1_accession = 'SRR1206053'
    sample1 = Sample.objects.create(accession_code=sample1_accession,
                                    organism=c_elegans)
    ExperimentSampleAssociation.objects.create(experiment=experiment, sample=sample1)

    experiment_dir = "/home/user/data_store/salmon_tests/PRJNA242809"
    og_file_1 = OriginalFile()
    og_file_1.absolute_file_path = os.path.join(experiment_dir, "raw/SRR1206053.fastq.gz")
    og_file_1.filename = "SRR1206053.fastq.gz"
    og_file_1.save()

    OriginalFileSampleAssociation.objects.create(
        original_file=og_file_1, sample=sample1).save()

    ## Sample 2
    sample2_accession = 'SRR1206054'
    sample2 = Sample.objects.create(accession_code=sample2_accession,
                                    organism=c_elegans)
    ExperimentSampleAssociation.objects.create(experiment=experiment, sample=sample2)

    og_file_2 = OriginalFile()
    og_file_2.absolute_file_path = os.path.join(experiment_dir, "raw/SRR1206054.fastq.gz")
    og_file_2.filename = "SRR1206054.fastq.gz"
    og_file_2.save()

    OriginalFileSampleAssociation.objects.create(
        original_file=og_file_2, sample=sample2).save()

    # Test `salmon quant` on sample1 (SRR1206053)
    sample1_dir = os.path.join(experiment_dir, sample1_accession)
    job1_context = salmon._prepare_files({"job_dir_prefix": "TEST",
                                          "job_id": "TEST",
                                          'pipeline': Pipeline(name="Salmon"),
                                          'computed_files': [],
                                          "original_files": [og_file_1]})

    # Check quant.sf in `salmon quant` output dir of sample1
    self.check_salmon_quant(job1_context, sample1_dir)

    # Confirm that this experiment is not ready for tximport yet.
    experiments_ready = salmon._get_tximport_inputs(job1_context)
    self.assertEqual(len(experiments_ready), 0)
    # This job should not have produced any tximport output
    # because the other sample isn't ready yet.
    self.assertFalse(os.path.exists(os.path.join(job1_context["work_dir"], 'txi_out.RDS')))

    # Now run `salmon quant` on sample2 (SRR1206054) too
    sample2_dir = os.path.join(experiment_dir, sample2_accession)
    job2_context = salmon._prepare_files({"job_dir_prefix": "TEST2",
                                          "job_id": "TEST2",
                                          'pipeline': Pipeline(name="Salmon"),
                                          'computed_files': [],
                                          "original_files": [og_file_2]})

    # Clean up tximport output:
    rds_filename = os.path.join(job2_context["work_dir"], 'txi_out.RDS')
    if os.path.isfile(rds_filename):
        os.remove(rds_filename)

    # Check quant.sf in `salmon quant` output dir of sample2
    self.check_salmon_quant(job2_context, sample2_dir)

    # rds_filename should have been generated by tximport at this point.
    # Note: `tximport` step is launched by subprocess module in Python.
    # If input "quant.sf" files are too large, we may have to wait for
    # a few seconds before testing the existence of rds_filename.
    self.assertTrue(os.path.exists(rds_filename))

    for computed_file in job2_context['computed_files']:
        if computed_file.filename[-4:] == '.RDS':
            rds_file_path = computed_file.absolute_file_path

    cmd_tokens = [
        "/usr/bin/Rscript", "--vanilla",
        "/home/user/data_refinery_workers/processors/test_tximport.R",
        "--txi_out", rds_file_path,
        "--gene2txmap", job2_context["genes_to_transcripts_path"]
    ]
    tximport_test_result = subprocess.run(cmd_tokens,
                                          stdout=subprocess.PIPE,
                                          stderr=subprocess.PIPE)
    # BUGFIX: was `if returncode != 0: self.assertTrue(False)`; assert on
    # the exit code directly so a failure reports something meaningful.
    self.assertEqual(tximport_test_result.returncode, 0,
                     "tximport validation script failed")

    # Check the individual files.
    # BUGFIX: this was `self.assertTrue(len(...), 2)`, which passes for ANY
    # non-empty list because the 2 is treated as the assertion *message*.
    self.assertEqual(len(job2_context['individual_files']), 2)
    for file in job2_context['individual_files']:
        self.assertTrue(os.path.isfile(file.absolute_file_path))
def test_success(self):
    """Tests the successful path of the module under test."""
    logger.info("STARTING SALMON SUCCESS TEST!!!!!!!!")

    # Set up test environment.
    batch, first_file, second_file = init_objects()
    _insert_salmon_index()

    # Change the batch/files to point to test-specific locations
    batch.platform_accession_code = "TEST"
    batch.save()
    first_file.internal_location = "TEST/SALMON"
    first_file.save()
    second_file.internal_location = "TEST/SALMON"
    second_file.save()

    processor_job = ProcessorJob.create_job_and_relationships(
        batches=[batch])
    processor_job.save()
    job_context = utils.start_job({
        "job": processor_job,
        "job_id": processor_job.id
    })
    job_context = salmon._set_job_prefix(job_context)

    # Ensure temp dir isn't leftover from a previous test.
    temp_dir = first_file.get_temp_dir(job_context["job_dir_prefix"])
    shutil.rmtree(temp_dir, ignore_errors=True)

    # One of the functions being tested:
    job_context = salmon._prepare_files(job_context)

    # Both input read files must exist on disk after preparation.
    input_file_path = job_context["input_file_path"]
    self.assertIsInstance(input_file_path, str)
    self.assertTrue(os.path.isfile(input_file_path))
    input_file_path_2 = job_context["input_file_path_2"]
    self.assertIsInstance(input_file_path_2, str)
    self.assertTrue(os.path.isfile(input_file_path_2))
    output_directory_path = job_context["output_directory"]
    self.assertIsInstance(output_directory_path, str)
    self.assertTrue(os.path.isdir(output_directory_path))

    job_context = salmon._determine_index_length(job_context)

    # The 'kmer_size' key has been added to job_context with the
    # correct value.
    self.assertEqual(job_context["kmer_size"], "23")

    # Another function being tested
    job_context = salmon._download_index(job_context)
    self.assertTrue(job_context["success"])
    self.assertTrue("index_directory" in job_context)
    self.assertTrue(os.path.isdir(job_context["index_directory"]))
    # NOTE(review): 9 is the expected number of files in an extracted
    # salmon index directory for this fixture — confirm against the tarball.
    self.assertEqual(9, len(os.listdir(job_context["index_directory"])))

    # Another function being tested
    job_context = salmon._run_salmon(job_context)
    self.assertTrue(job_context["success"])
    self.assertGreater(len(os.listdir(output_directory_path)), 1)

    # The last function to test
    job_context = salmon._zip_and_upload(job_context)
    self.assertTrue(job_context["success"])
    self.assertTrue(os.path.exists(first_file.get_processed_path()))

    # Clean up both input and output files
    first_file.remove_temp_directory()
    shutil.rmtree(first_file.get_processed_dir())

    logger.info("ENDING SALMON SUCCESS TEST!!!!!!!!")