def create_downloader_job(self):
    """Create and save an SRA DownloaderJob linked to two placeholder files.

    Returns:
        The saved DownloaderJob.
    """
    downloader_job = DownloaderJob(
        downloader_task="SRA",
        nomad_job_id="DOWNLOADER/dispatch-1528945054-e8eaf540",
        num_retries=0,
        accession_code="NUNYA",
        success=None)
    downloader_job.save()

    # Two identical placeholder files, each tied to the job through its
    # own association row.
    for _ in range(2):
        placeholder = OriginalFile()
        placeholder.source_filename = "doesn't matter"
        placeholder.filename = "this either"
        placeholder.absolute_file_path = "nor this"
        placeholder.save()

        link = DownloaderJobOriginalFileAssociation()
        link.original_file = placeholder
        link.downloader_job = downloader_job
        link.save()

    return downloader_job
def prepare_original_files(length: str) -> List[OriginalFile]:
    """Create and save the FASTA and GTF OriginalFile records for an index test.

    Args:
        length: Inserted into the file names, e.g.
            "aegilops_tauschii_<length>.fa.gz". The absolute paths always
            point at the "aegilops_tauschii_short" fixture files regardless
            of this value.

    Returns:
        A two-element list: the saved FASTA file followed by the saved GTF
        file.
    """
    og_file = OriginalFile()
    og_file.source_filename = "aegilops_tauschii_" + length + ".fa.gz"
    og_file.filename = "aegilops_tauschii_" + length + ".fa.gz"
    og_file.absolute_file_path = (
        "/home/user/data_store/raw/TEST/TRANSCRIPTOME_INDEX/"
        "AEGILOPS_TAUSCHII/aegilops_tauschii_short.fa.gz")
    # We need to add the URL here so that _extract_assembly_information works properly
    og_file.source_url = (
        "ftp://ftp.ensemblgenomes.org/pub/release-39/plants/fasta/"
        "aegilops_tauschii/dna/Aegilops_tauschii.ASM34733v1.dna.toplevel.fa.gz"
    )
    og_file.is_downloaded = True
    og_file.save()

    og_file2 = OriginalFile()
    og_file2.source_filename = "aegilops_tauschii_" + length + ".gtf.gz"
    og_file2.filename = "aegilops_tauschii_" + length + ".gtf.gz"
    og_file2.absolute_file_path = (
        "/home/user/data_store/raw/TEST/TRANSCRIPTOME_INDEX/"
        "AEGILOPS_TAUSCHII/aegilops_tauschii_short.gtf.gz")
    # We need to add the URL here so that _extract_assembly_information works properly
    og_file2.source_url = (
        "ftp://ftp.ensemblgenomes.org/pub/release-39/plants/gtf/"
        "aegilops_tauschii/Aegilops_tauschii.ASM34733v1.39.gtf.gz")
    og_file2.is_downloaded = True
    og_file2.save()

    return [og_file, og_file2]
def create_processor_job(self, pipeline="AFFY_TO_PCL", ram_amount=2048):
    """Create and save a ProcessorJob linked to two placeholder files.

    Args:
        pipeline: Stored as the job's `pipeline_applied`.
        ram_amount: Stored as the job's `ram_amount`.

    Returns:
        The saved ProcessorJob.
    """
    processor_job = ProcessorJob(
        pipeline_applied=pipeline,
        nomad_job_id="PROCESSOR/dispatch-1528945054-e8eaf540",
        ram_amount=ram_amount,
        num_retries=0,
        volume_index="1",
        success=None)
    processor_job.save()

    # Two identical placeholder files, each associated with the job.
    for _ in range(2):
        placeholder = OriginalFile()
        placeholder.source_filename = "doesn't matter"
        placeholder.filename = "this either"
        placeholder.absolute_file_path = "nor this"
        placeholder.save()

        link = ProcessorJobOriginalFileAssociation()
        link.original_file = placeholder
        link.processor_job = processor_job
        link.save()

    return processor_job
def create_downloader_job():
    """Build a saved SRA DownloaderJob that has two placeholder OriginalFiles.

    Returns:
        The saved DownloaderJob.
    """
    dl_job = DownloaderJob(
        downloader_task="SRA",
        batch_job_id="DEFAULT",
        num_retries=0,
        accession_code="NUNYA",
        success=None,
    )
    dl_job.save()

    def _attach_placeholder_file():
        # Each call creates one throwaway file row plus its association.
        stub_file = OriginalFile()
        stub_file.source_filename = "doesn't matter"
        stub_file.filename = "this either"
        stub_file.absolute_file_path = "nor this"
        stub_file.save()

        file_link = DownloaderJobOriginalFileAssociation()
        file_link.original_file = stub_file
        file_link.downloader_job = dl_job
        file_link.save()

    _attach_placeholder_file()
    _attach_placeholder_file()

    return dl_job
def create_processor_job(pipeline="AFFY_TO_PCL", ram_amount=2048, start_time=None):
    """Create a saved ProcessorJob with two placeholder OriginalFiles.

    When `pipeline` is "AFFY_TO_PCL" a DownloaderJob is also created, linked
    to the same two files, and set as the processor job's `downloader_job`;
    otherwise `downloader_job` is None.

    Args:
        pipeline: Stored as the job's `pipeline_applied`.
        ram_amount: Stored as the job's `ram_amount`.
        start_time: Stored as the job's `start_time`.

    Returns:
        The saved ProcessorJob.
    """
    placeholders = []
    for _ in range(2):
        stub = OriginalFile()
        stub.source_filename = "doesn't matter"
        stub.filename = "this either"
        stub.absolute_file_path = "nor this"
        stub.save()
        placeholders.append(stub)
    first_file, second_file = placeholders

    downloader_job = None
    if pipeline == "AFFY_TO_PCL":
        downloader_job = DownloaderJob(
            downloader_task="SRA",
            batch_job_id="DEFAULT",
            num_retries=0,
            accession_code="NUNYA",
            success=None,
        )
        downloader_job.save()

        # The second file is linked before the first one here.
        for linked_file in (second_file, first_file):
            dl_link = DownloaderJobOriginalFileAssociation()
            dl_link.original_file = linked_file
            dl_link.downloader_job = downloader_job
            dl_link.save()

    processor_job = ProcessorJob(
        downloader_job=downloader_job,
        pipeline_applied=pipeline,
        batch_job_id="PROCESSOR/dispatch-1528945054-e8eaf540",
        ram_amount=ram_amount,
        num_retries=0,
        success=None,
        start_time=start_time,
    )
    processor_job.save()

    for linked_file in (first_file, second_file):
        pj_link = ProcessorJobOriginalFileAssociation()
        pj_link.original_file = linked_file
        pj_link.processor_job = processor_job
        pj_link.save()

    return processor_job
def test_salmon_quant_one_sample_double_reads(self):
    """Test `salmon quant` on a sample that has double reads."""
    # Set up organism index database objects.
    prepare_organism_indices()

    # Create an Experiment that includes two samples.
    # (The first sample has test data available, but the second does not.)
    experiment_accession = 'test_experiment'
    experiment = Experiment.objects.create(accession_code=experiment_accession)

    c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

    # test_sample record
    sample_accession = 'test_sample'
    test_sample = Sample.objects.create(accession_code=sample_accession,
                                        organism=c_elegans,
                                        source_database='SRA',
                                        technology='RNA-SEQ')
    ExperimentSampleAssociation.objects.create(experiment=experiment, sample=test_sample)

    # fake_sample record (created to prevent tximport step in this experiment)
    fake_sample = Sample.objects.create(accession_code='fake_sample',
                                        source_database='SRA',
                                        technology='RNA-SEQ')
    ExperimentSampleAssociation.objects.create(experiment=experiment, sample=fake_sample)

    experiment_dir = '/home/user/data_store/salmon_tests/test_experiment'

    og_read_1 = OriginalFile()
    og_read_1.absolute_file_path = os.path.join(experiment_dir, 'raw/reads_1.fastq')
    og_read_1.filename = "reads_1.fastq"
    og_read_1.save()

    # `objects.create()` already persists the row, so no trailing `.save()`
    # is needed.
    OriginalFileSampleAssociation.objects.create(original_file=og_read_1, sample=test_sample)

    og_read_2 = OriginalFile()
    og_read_2.absolute_file_path = os.path.join(experiment_dir, "raw/reads_2.fastq")
    # NOTE(review): the filename repeats "reads_1.fastq" although the path
    # points at reads_2.fastq. Looks like a copy-paste slip; left unchanged
    # because salmon._prepare_files is not visible here -- confirm.
    og_read_2.filename = "reads_1.fastq"
    og_read_2.save()

    OriginalFileSampleAssociation.objects.create(original_file=og_read_2, sample=test_sample)

    sample_dir = os.path.join(experiment_dir, 'test_sample')

    job_context = salmon._prepare_files({"job_dir_prefix": "TEST",
                                         "job_id": "TEST",
                                         "job": ProcessorJob(),
                                         'pipeline': Pipeline(name="Salmon"),
                                         'computed_files': [],
                                         "original_files": [og_read_1, og_read_2]})

    # Run salmon.
    self.check_salmon_quant(job_context, sample_dir)

    # Confirm that this experiment is not ready for tximport yet,
    # because `salmon quant` is not run on 'fake_sample'.
    experiments_ready = salmon.get_tximport_inputs(job_context)['tximport_inputs']
    self.assertEqual(len(experiments_ready), 0)
def prepare_job():
    """Create a SALMON ProcessorJob with a paired-end sample and its two reads.

    Returns:
        A tuple of (processor job, [first read file, second read file]).
    """
    processor_job = ProcessorJob()
    processor_job.pipeline_applied = "SALMON"
    processor_job.save()

    c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

    sample = Sample()
    sample.accession_code = "SALMON"  # So the test files go to the right place
    sample.organism = c_elegans
    sample.source_database = 'SRA'
    sample.technology = 'RNA-SEQ'
    sample.save()

    prepare_organism_indices()

    # Create both read files first, then the sample links, then the job
    # links, so rows are written in the same order as before.
    read_files = []
    for fastq_name in ("ERR1562482_1.fastq.gz", "ERR1562482_2.fastq.gz"):
        read_file = OriginalFile()
        read_file.source_filename = fastq_name
        read_file.filename = fastq_name
        read_file.absolute_file_path = "/home/user/data_store/raw/TEST/SALMON/" + fastq_name
        read_file.is_downloaded = True
        read_file.save()
        read_files.append(read_file)

    for read_file in read_files:
        sample_link = OriginalFileSampleAssociation()
        sample_link.original_file = read_file
        sample_link.sample = sample
        sample_link.save()

    for read_file in read_files:
        job_link = ProcessorJobOriginalFileAssociation()
        job_link.original_file = read_file
        job_link.processor_job = processor_job
        job_link.save()

    return processor_job, read_files
def prepare_job(length):
    """Create a TRANSCRIPTOME_INDEX_<LENGTH> ProcessorJob with FASTA/GTF inputs.

    Args:
        length: Used upper-cased in the pipeline name, appended to the sample
            accession ("derp" + length) and inserted in the file names. The
            absolute paths always use the "aegilops_tauschii_short" files.

    Returns:
        The saved ProcessorJob.
    """
    processor_job = ProcessorJob()
    processor_job.pipeline_applied = "TRANSCRIPTOME_INDEX_" + length.upper()
    processor_job.save()

    homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS", taxonomy_id=1001)

    sample = Sample()
    sample.organism = homo_sapiens
    sample.accession_code = "derp" + length
    sample.save()

    fixture_base = (
        "/home/user/data_store/raw/TEST/TRANSCRIPTOME_INDEX/"
        "AEGILOPS_TAUSCHII/aegilops_tauschii_short")

    # Files first, then sample links, then job links (original write order).
    input_files = []
    for extension in (".fa.gz", ".gtf.gz"):
        input_file = OriginalFile()
        input_file.source_filename = "aegilops_tauschii_" + length + extension
        input_file.filename = "aegilops_tauschii_" + length + extension
        input_file.absolute_file_path = fixture_base + extension
        input_file.is_downloaded = True
        input_file.save()
        input_files.append(input_file)

    for input_file in input_files:
        sample_link = OriginalFileSampleAssociation()
        sample_link.original_file = input_file
        sample_link.sample = sample
        sample_link.save()

    for input_file in input_files:
        job_link = ProcessorJobOriginalFileAssociation()
        job_link.original_file = input_file
        job_link.processor_job = processor_job
        job_link.save()

    return processor_job
def prepare_job():
    """Create an AFFY_TO_PCL ProcessorJob with one CEL file and one sample.

    Returns:
        The saved ProcessorJob.
    """
    processor_job = ProcessorJob()
    processor_job.pipeline_applied = "AFFY_TO_PCL"
    processor_job.save()

    cel_file = OriginalFile()
    cel_file.source_filename = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip"
    cel_file.filename = "GSM1426071_CD_colon_active_1.CEL"
    cel_file.absolute_file_path = "/home/user/data_store/raw/TEST/CEL/GSM1426071_CD_colon_active_1.CEL"
    cel_file.save()

    job_link = ProcessorJobOriginalFileAssociation()
    job_link.processor_job = processor_job
    job_link.original_file = cel_file
    job_link.save()

    c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

    unprocessed_sample = Sample()
    unprocessed_sample.title = "Heyo"
    unprocessed_sample.organism = c_elegans
    unprocessed_sample.is_processed = False
    unprocessed_sample.save()

    sample_link = OriginalFileSampleAssociation()
    sample_link.original_file = cel_file
    sample_link.sample = unprocessed_sample
    sample_link.save()

    return processor_job
def prepare_dotsra_job(filename="ERR1562482.sra"):
    """Create a SALMON ProcessorJob whose single input is a .sra file.

    Args:
        filename: Name of the file under /home/user/data_store/raw/TEST/SALMON/.

    Returns:
        A tuple of (processor job, [original file]).
    """
    processor_job = ProcessorJob()
    processor_job.pipeline_applied = "SALMON"
    # The job id is picked at random rather than left to the database.
    processor_job.id = random.randint(111, 999999)
    processor_job.save()

    c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

    sample = Sample()
    sample.accession_code = "SALMON"  # So the test files go to the right place
    sample.organism = c_elegans
    sample.save()

    prepare_organism_indices()

    sra_file = OriginalFile()
    sra_file.source_filename = filename
    sra_file.filename = filename
    sra_file.absolute_file_path = "/home/user/data_store/raw/TEST/SALMON/" + filename
    sra_file.save()

    sample_link = OriginalFileSampleAssociation()
    sample_link.sample = sample
    sample_link.original_file = sra_file
    sample_link.save()

    job_link = ProcessorJobOriginalFileAssociation()
    job_link.processor_job = processor_job
    job_link.original_file = sra_file
    job_link.save()

    return processor_job, [sra_file]
def _get_actual_file_if_queueable(
        extracted_subfile: Dict,
        original_file: OriginalFile,
        samples: List[Sample]) -> OriginalFile:
    """Returns the actual file from the archive if it should be queued.

    If a matching record already exists and its `needs_processing()` is
    false, None is returned (note that the return annotation does not
    reflect this).

    `extracted_subfile` should be a Dict containing metadata about the
    file that was extracted from an archive; its 'filename' and
    'absolute_path' keys are read.

    `original_file` should be the file associated with the CURRENT
    DownloaderJob.

    `samples` are the samples that the actual file should be associated
    with if it has to be created.
    """
    # Check to see if we've made this original file before. `first()`
    # fetches at most one row in a single query and, for an unordered
    # queryset, orders by primary key, so the pick is deterministic when
    # several matching records exist.
    existing_file = OriginalFile.objects.filter(
        source_filename=original_file.source_filename,
        filename=extracted_subfile['filename'],
        is_archive=False
    ).first()

    if existing_file is not None:
        # We've already created this record, let's see if we actually
        # needed to download it or if we just got it because we needed
        # a file in the same archive.
        if not existing_file.needs_processing():
            return None

        if not existing_file.is_downloaded:
            existing_file.is_downloaded = True
            existing_file.save()
        return existing_file

    actual_file = OriginalFile()
    actual_file.is_downloaded = True
    actual_file.is_archive = False
    actual_file.absolute_file_path = extracted_subfile['absolute_path']
    actual_file.filename = extracted_subfile['filename']
    # Size and SHA1 are computed only after the path has been set above.
    actual_file.calculate_size()
    actual_file.calculate_sha1()
    actual_file.has_raw = True
    actual_file.source_url = original_file.source_url
    actual_file.source_filename = original_file.source_filename
    actual_file.save()

    for sample in samples:
        original_file_sample_association = OriginalFileSampleAssociation()
        original_file_sample_association.sample = sample
        original_file_sample_association.original_file = actual_file
        original_file_sample_association.save()

    return actual_file
def prepare_illumina_job(organism):
    """Create an ILLUMINA_TO_PCL ProcessorJob for the GSE22427 fixture.

    One OriginalFile is created and associated with twelve samples, each of
    which gets a SampleAnnotation whose description is the sample name.
    Afterwards the last sample is renamed to "ignoreme_for_description".

    Args:
        organism: Assigned to every created sample.

    Returns:
        The saved ProcessorJob.
    """
    processor_job = ProcessorJob()
    processor_job.pipeline_applied = "ILLUMINA_TO_PCL"
    processor_job.save()

    data_file = OriginalFile()
    data_file.source_filename = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE22nnn/GSE22427/suppl/GSE22427%5Fnon%2Dnormalized%2Etxt.gz"
    data_file.filename = "GSE22427_non-normalized.txt"
    data_file.absolute_file_path = (
        "/home/user/data_store/raw/TEST/ILLUMINA/GSE22427_non-normalized.txt")
    data_file.is_downloaded = True
    data_file.save()

    job_link = ProcessorJobOriginalFileAssociation()
    job_link.processor_job = processor_job
    job_link.original_file = data_file
    job_link.save()

    sample_names = [
        "LV-C&si-Control-1",
        "LV-C&si-Control-2",
        "LV-C&si-Control-3",
        "LV-C&si-EZH2-1",
        "LV-C&si-EZH2-2",
        "LV-C&si-EZH2-3",
        "LV-EZH2&si-EZH2-1",
        "LV-EZH2&si-EZH2-2",
        "LV-EZH2&si-EZH2-3",
        "LV-T350A&si-EZH2-1",
        "LV-T350A&si-EZH2-2",
        "LV-T350A&si-EZH2-3",
    ]

    for sample_name in sample_names:
        new_sample = Sample()
        new_sample.accession_code = sample_name
        new_sample.title = sample_name
        new_sample.organism = organism
        new_sample.save()

        annotation = SampleAnnotation()
        annotation.sample = new_sample
        annotation.data = {"description": [sample_name]}
        annotation.is_ccdl = False
        annotation.save()

        file_link = OriginalFileSampleAssociation()
        file_link.original_file = data_file
        file_link.sample = new_sample
        file_link.save()

    # Rename the last sample so its title and accession no longer match its
    # annotation's description.
    renamed = Sample.objects.get(title="LV-T350A&si-EZH2-3")
    renamed.title = "ignoreme_for_description"
    renamed.accession_code = "ignoreme_for_description"
    renamed.save()

    return processor_job
def _make_original_file_with_contents(contents: str) -> OriginalFile:
    """Write `contents` to a new temporary .txt file and wrap it in an OriginalFile.

    Args:
        contents: Text written to the temporary file.

    Returns:
        A saved OriginalFile, marked as downloaded, whose paths point at the
        temporary file. The file itself is not deleted by this function.
    """
    fd, path = tempfile.mkstemp(suffix=".txt")
    # mkstemp hands back an already-open OS-level descriptor; write through
    # it so it gets closed instead of being leaked.
    with os.fdopen(fd, "w") as f:
        f.write(contents)

    og_file = OriginalFile()
    og_file.source_filename = path
    og_file.filename = os.path.basename(path)
    og_file.absolute_file_path = os.path.realpath(path)
    og_file.is_downloaded = True
    og_file.save()

    return og_file
def prepare_illumina_job(job_info: Dict) -> ProcessorJob:
    """Create an ILLUMINA_TO_PCL ProcessorJob described by `job_info`.

    Args:
        job_info: Reads the keys "source_filename", "filename",
            "absolute_file_path", "organism" and "samples". Each entry of
            "samples" is one of:
              * a str, used as both accession code and title;
              * a (accession_code, title) tuple of two strs;
              * a (accession_code, title, annotation) tuple where
                annotation is a dict stored as the SampleAnnotation data.

    Returns:
        The saved ProcessorJob.

    Raises:
        ValueError: If a sample entry matches none of the shapes above.
    """
    pj = ProcessorJob()
    pj.pipeline_applied = "ILLUMINA_TO_PCL"
    pj.save()

    og_file = OriginalFile()
    og_file.source_filename = job_info["source_filename"]
    og_file.filename = job_info["filename"]
    og_file.absolute_file_path = job_info["absolute_file_path"]
    og_file.is_downloaded = True
    og_file.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file
    assoc1.processor_job = pj
    assoc1.save()

    for s in job_info["samples"]:
        # For convenience, if you give a list of strings we'll just use the
        # strings as both titles and accessions.
        annotation = None
        if isinstance(s, str):
            accession_code = s
            title = s
        elif (isinstance(s, tuple) and len(s) == 2
              and all(isinstance(part, str) for part in s)):
            accession_code, title = s
        elif (isinstance(s, tuple) and len(s) == 3
              and isinstance(s[0], str)
              and isinstance(s[1], str)
              and isinstance(s[2], dict)):
            accession_code, title, annotation = s
        else:
            raise ValueError(f"Invalid sample type for sample {s}")

        sample = Sample()
        sample.accession_code = accession_code
        sample.title = title
        sample.organism = job_info["organism"]
        sample.save()

        sa = SampleAnnotation()
        sa.sample = sample
        # Without an explicit annotation, fall back to the title as the
        # description.
        sa.data = annotation if annotation is not None else {
            "description": [title]
        }
        sa.is_ccdl = False
        sa.save()

        sample_assoc = OriginalFileSampleAssociation()
        sample_assoc.original_file = og_file
        sample_assoc.sample = sample
        sample_assoc.save()

    return pj
def test_convert_processed_illumina(self):
    # Runs the NO_OP processor on a processed Illumina sample table and
    # checks that it succeeds and writes an output of the expected size.
    noop_job = ProcessorJob()
    noop_job.pipeline_applied = "NO_OP"
    noop_job.save()

    # ex:
    # Reporter Identifier VALUE Detection Pval
    # ILMN_1343291 14.943602 0
    # ILMN_1343295 13.528082 0
    table_file = OriginalFile()
    table_file.source_filename = "https://www.ebi.ac.uk/arrayexpress/experiments/E-GEOD-22433/"
    table_file.filename = "GSM557500_sample_table.txt"
    table_file.absolute_file_path = (
        "/home/user/data_store/raw/TEST/NO_OP/GSM557500_sample_table.txt")
    table_file.is_downloaded = True
    table_file.save()

    human = Organism(name="HOMO_SAPIENS",
                     taxonomy_id=9606,
                     is_scientific_name=True)
    human.save()

    illumina_sample = Sample()
    illumina_sample.accession_code = "GSM557500"
    illumina_sample.title = "GSM557500"
    illumina_sample.platform_accession_code = "A-MEXP-1171"
    illumina_sample.manufacturer = "ILLUMINA"
    illumina_sample.organism = human
    illumina_sample.save()

    sample_link = OriginalFileSampleAssociation()
    sample_link.sample = illumina_sample
    sample_link.original_file = table_file
    sample_link.save()

    job_link = ProcessorJobOriginalFileAssociation()
    job_link.processor_job = noop_job
    job_link.original_file = table_file
    job_link.save()

    # To:
    # ENSG00000156508 14.943602
    # ENSG00000111640 13.528082
    final_context = no_op.no_op_processor(noop_job.pk)
    output_path = final_context["output_file_path"]

    self.assertTrue(final_context["success"])
    self.assertTrue(os.path.exists(output_path))
    self.assertEqual(os.path.getsize(output_path), 920374)
    self.assertTrue(
        no_op.check_output_quality(final_context["output_file_path"]))
def test_convert_illumina_no_header(self):
    # Runs the NO_OP processor on an Illumina table that has no header row
    # and checks that it succeeds with an output of the expected size.
    noop_job = ProcessorJob()
    noop_job.pipeline_applied = "NO_OP"
    noop_job.save()

    # ex:
    # ILMN_1885639 10.0000 0.7931
    # ILMN_2209417 10.0000 0.2029
    # ILMN_1765401 152.0873 0.0000
    headerless_file = OriginalFile()
    headerless_file.source_filename = (
        "https://github.com/AlexsLemonade/refinebio/files/2255178/GSM1089291-tbl-1.txt"
    )
    headerless_file.filename = "GSM1089291-tbl-1.txt"
    headerless_file.absolute_file_path = "/home/user/data_store/raw/TEST/NO_OP/GSM1089291-tbl-1.txt"
    headerless_file.is_downloaded = True
    headerless_file.save()

    human = Organism(name="HOMO_SAPIENS",
                     taxonomy_id=9606,
                     is_scientific_name=True)
    human.save()

    illumina_sample = Sample()
    illumina_sample.accession_code = "GSM557500"
    illumina_sample.title = "GSM557500"
    illumina_sample.platform_accession_code = "A-MEXP-1171"
    illumina_sample.manufacturer = "ILLUMINA"
    illumina_sample.organism = human
    illumina_sample.save()

    sample_link = OriginalFileSampleAssociation()
    sample_link.sample = illumina_sample
    sample_link.original_file = headerless_file
    sample_link.save()

    job_link = ProcessorJobOriginalFileAssociation()
    job_link.processor_job = noop_job
    job_link.original_file = headerless_file
    job_link.save()

    # To:
    # ENSG00000105675 10
    # ENSG00000085721 152.0873
    # ENSG00000278494 152.0873
    final_context = no_op.no_op_processor(noop_job.pk)
    output_path = final_context["output_file_path"]

    self.assertTrue(final_context["success"])
    self.assertTrue(os.path.exists(output_path))
    self.assertEqual(os.path.getsize(output_path), 786207)
def test_good_detection(self):
    """GSE54661 appears to be mislabeled (illuminaHumanv4) on GEO.

    Shows our detector works.
    """
    from data_refinery_workers.processors import illumina

    pj = ProcessorJob()
    pj.pipeline_applied = "ILLUMINA_TO_PCL"
    pj.save()

    og_file = OriginalFile()
    og_file.source_filename = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE54nnn/GSE54661/suppl/GSE54661%5Fnon%5Fnormalized%2Etxt%2Egz"
    og_file.filename = "GSE54661_non_normalized.txt"
    og_file.absolute_file_path = (
        "/home/user/data_store/raw/TEST/ILLUMINA/GSE54661_non_normalized.txt"
    )
    og_file.is_downloaded = True
    og_file.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file
    assoc1.processor_job = pj
    assoc1.save()

    organism = Organism(name="HOMO_SAPIENS",
                        taxonomy_id=9606,
                        is_scientific_name=True)
    organism.save()

    sample = Sample()
    sample.accession_code = "ABCD-1234"
    sample.title = "hypoxia_Signal"
    sample.organism = organism
    sample.save()

    sample_assoc = OriginalFileSampleAssociation()
    sample_assoc.original_file = og_file
    sample_assoc.sample = sample
    sample_assoc.save()

    final_context = illumina.illumina_to_pcl(pj.pk)
    try:
        self.assertEqual(final_context["platform"], "illuminaHumanv3")

        # Only detection-related keys are expected on the first sample's
        # first annotation.
        allowed_keys = [
            "detected_platform", "detection_percentage", "mapped_percentage"
        ]
        annotation = final_context["samples"][0].sampleannotation_set.all()[0]
        for key in annotation.data.keys():
            self.assertIn(key, allowed_keys)
    finally:
        # Cleanup after the job since it won't since we aren't running in
        # cloud. Done in `finally` so a failed assertion does not leave the
        # work dir behind.
        shutil.rmtree(final_context["work_dir"], ignore_errors=True)
def test_convert_illumina_bad_cols(self):
    """
    In future, this test may be deprecated. For now it just alerts that it needs attention.
    """
    noop_job = ProcessorJob()
    noop_job.pipeline_applied = "NO_OP"
    noop_job.save()

    # ex:
    # ILMN_1885639 10.0000 0.7931 11.0000 0.123
    # ILMN_2209417 10.0000 0.2029 11.1234 0.543
    # LMN_1765401 152.0873 0.0000 99.999 0.19
    extra_cols_file = OriginalFile()
    extra_cols_file.source_filename = (
        "https://github.com/AlexsLemonade/refinebio/files/2255178/GSM1089291-tbl-1-modified.txt"
    )
    extra_cols_file.filename = "GSM1089291-tbl-1-modified.txt"
    extra_cols_file.absolute_file_path = (
        "/home/user/data_store/raw/TEST/NO_OP/GSM1089291-tbl-1-modified.txt"
    )
    extra_cols_file.is_downloaded = True
    extra_cols_file.save()

    human = Organism(name="HOMO_SAPIENS",
                     taxonomy_id=9606,
                     is_scientific_name=True)
    human.save()

    illumina_sample = Sample()
    illumina_sample.accession_code = "GSM557500"
    illumina_sample.title = "GSM557500"
    illumina_sample.platform_accession_code = "A-MEXP-1171"
    illumina_sample.manufacturer = "ILLUMINA"
    illumina_sample.organism = human
    illumina_sample.save()

    sample_link = OriginalFileSampleAssociation()
    sample_link.sample = illumina_sample
    sample_link.original_file = extra_cols_file
    sample_link.save()

    job_link = ProcessorJobOriginalFileAssociation()
    job_link.processor_job = noop_job
    job_link.original_file = extra_cols_file
    job_link.save()

    # The processor is expected to fail and record a reason containing
    # "Tell Rich!".
    final_context = no_op.no_op_processor(noop_job.pk)
    failure_reason = final_context["job"].failure_reason
    self.assertFalse(final_context["success"])
    self.assertTrue("Tell Rich!" in failure_reason)
def prepare_non_ba_job():
    """Create an AFFY_TO_PCL ProcessorJob for the GSM45588.CEL fixture.

    Returns:
        The saved ProcessorJob.
    """
    processor_job = ProcessorJob()
    processor_job.pipeline_applied = "AFFY_TO_PCL"
    processor_job.save()

    cel_file = OriginalFile()
    cel_file.source_filename = "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM45nnn/GSM45588/suppl/GSM45588.CEL.gz"
    cel_file.filename = "GSM45588.CEL"
    cel_file.absolute_file_path = "/home/user/data_store/raw/TEST/CEL/GSM45588.CEL"
    cel_file.save()

    # Link the CEL file to the job.
    job_link = ProcessorJobOriginalFileAssociation()
    job_link.processor_job = processor_job
    job_link.original_file = cel_file
    job_link.save()

    return processor_job
def prepare_ba_job():
    """Create an AFFY_TO_PCL ProcessorJob for the GSM1426071 CEL fixture.

    Returns:
        The saved ProcessorJob.
    """
    processor_job = ProcessorJob()
    processor_job.pipeline_applied = "AFFY_TO_PCL"
    processor_job.save()

    cel_file = OriginalFile()
    cel_file.source_filename = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip"
    cel_file.filename = "GSM1426071_CD_colon_active_1.CEL"
    cel_file.absolute_file_path = "/home/user/data_store/raw/TEST/CEL/GSM1426071_CD_colon_active_1.CEL"
    cel_file.save()

    # Link the CEL file to the job.
    job_link = ProcessorJobOriginalFileAssociation()
    job_link.processor_job = processor_job
    job_link.original_file = cel_file
    job_link.save()

    return processor_job
def prepare_agilent_twocolor_job():
    """Create an AGILENT_TWOCOLOR_TO_PCL ProcessorJob with one input file.

    Returns:
        The saved ProcessorJob.
    """
    processor_job = ProcessorJob()
    processor_job.pipeline_applied = "AGILENT_TWOCOLOR_TO_PCL"
    processor_job.save()

    agilent_file = OriginalFile()
    agilent_file.source_filename = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE22900&format=file"
    agilent_file.filename = "GSM466597_95899_agilent.txt"
    agilent_file.absolute_file_path = "/home/user/data_store/raw/TEST/AGILENT_TWOCOLOR/GSM466597_95899_agilent.txt"
    agilent_file.save()

    # Link the input file to the job.
    job_link = ProcessorJobOriginalFileAssociation()
    job_link.processor_job = processor_job
    job_link.original_file = agilent_file
    job_link.save()

    return processor_job
def prepare_job(job_info: dict) -> ProcessorJob:
    """Create a NO_OP ProcessorJob with one file and one sample from `job_info`.

    Args:
        job_info: Reads "source_filename", "filename", "absolute_file_path",
            "accession_code" (also used as the sample title) and
            "platform_accession_code". The optional "manufacturer" is copied
            to the sample when present; when it is "ILLUMINA" a new
            HOMO_SAPIENS Organism is saved and assigned to the sample.

    Returns:
        The saved ProcessorJob.
    """
    noop_job = ProcessorJob()
    noop_job.pipeline_applied = "NO_OP"
    noop_job.save()

    input_file = OriginalFile()
    input_file.source_filename = job_info["source_filename"]
    input_file.filename = job_info["filename"]
    input_file.absolute_file_path = job_info["absolute_file_path"]
    input_file.is_downloaded = True
    input_file.save()

    new_sample = Sample()
    new_sample.accession_code = job_info["accession_code"]
    new_sample.title = job_info["accession_code"]
    new_sample.platform_accession_code = job_info["platform_accession_code"]

    manufacturer = job_info.get("manufacturer", None)
    if manufacturer is not None:
        new_sample.manufacturer = manufacturer

    # The illumina samples need the human organism
    if manufacturer == "ILLUMINA":
        human = Organism(name="HOMO_SAPIENS",
                         taxonomy_id=9606,
                         is_scientific_name=True)
        human.save()
        new_sample.organism = human

    new_sample.save()

    sample_link = OriginalFileSampleAssociation()
    sample_link.sample = new_sample
    sample_link.original_file = input_file
    sample_link.save()

    job_link = ProcessorJobOriginalFileAssociation()
    job_link.processor_job = noop_job
    job_link.original_file = input_file
    job_link.save()

    return noop_job
def prepare_huex_v1_job():
    """Create an AFFY_TO_PCL ProcessorJob for the GSM1364667 CEL fixture.

    Returns:
        The saved ProcessorJob.
    """
    processor_job = ProcessorJob()
    processor_job.pipeline_applied = "AFFY_TO_PCL"
    processor_job.save()

    cel_file = OriginalFile()
    cel_file.source_filename = "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM1364nnn/GSM1364667/suppl/GSM1364667_U_110208_7-02-10_S2.CEL.gz"
    cel_file.filename = "GSM1364667_U_110208_7-02-10_S2.CEL"
    cel_file.absolute_file_path = (
        "/home/user/data_store/raw/TEST/CEL/GSM1364667_U_110208_7-02-10_S2.CEL"
    )
    cel_file.is_downloaded = True
    cel_file.save()

    # Link the CEL file to the job.
    job_link = ProcessorJobOriginalFileAssociation()
    job_link.processor_job = processor_job
    job_link.original_file = cel_file
    job_link.save()

    return processor_job
def test_no_rnaseq(self):
    """Makes sure that no RNA-Seq data gets downloaded even if there's a job for it.
    """
    downloader_job = DownloaderJob()
    downloader_job.accession_code = 'GSE103217'
    downloader_job.save()

    archive = OriginalFile()
    archive.filename = "GSE103217_family.xml.tgz"
    archive.source_url = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE103nnn/GSE103217/miniml/GSE103217_family.xml.tgz"
    archive.source_filename = "GSE103217_family.xml.tgz"
    archive.save()

    job_link = DownloaderJobOriginalFileAssociation()
    job_link.downloader_job = downloader_job
    job_link.original_file = archive
    job_link.save()

    rnaseq_sample = Sample()
    rnaseq_sample.accession_code = 'GSE103217'
    rnaseq_sample.technology = "RNA-SEQ"
    rnaseq_sample.manufacturer = "ILLUMINA"
    rnaseq_sample.platform_accession_code = "Illumina HiSeq 2500"
    rnaseq_sample.save()

    sample_link = OriginalFileSampleAssociation()
    sample_link.original_file = archive
    sample_link.sample = rnaseq_sample
    sample_link.save()

    download_result = geo.download_geo(downloader_job.id)
    self.assertFalse(download_result)

    # Re-read the job so the assertions see what the downloader stored.
    downloader_job.refresh_from_db()
    self.assertFalse(downloader_job.success)

    # It's not necessarily that we didn't extract any files, but
    # none that were usable so it looks like none.
    self.assertEqual(downloader_job.failure_reason,
                     "Failed to extract any downloaded files.")
def _create_short_organism_index(organism, salmon_version):
    """Create a TRANSCRIPTOME_SHORT OrganismIndex built with `salmon_version`.

    Also creates the "Transcriptome" Processor, the ComputationalResult and
    the ComputedFile that belong to the index. Returns the saved index.
    """
    transcriptome_processor = Processor()
    transcriptome_processor.name = "Transcriptome"
    transcriptome_processor.version = salmon_version
    transcriptome_processor.docker_image = "dr_transcriptome"
    transcriptome_processor.environment = '{"some": "environment"}'
    transcriptome_processor.save()

    computational_result_short = ComputationalResult(
        processor=transcriptome_processor)
    computational_result_short.save()

    organism_index = OrganismIndex()
    organism_index.index_type = "TRANSCRIPTOME_SHORT"
    organism_index.organism = organism
    organism_index.result = computational_result_short
    organism_index.absolute_directory_path = "/home/user/data_store/ZEBRAFISH_INDEX/SHORT"
    organism_index.salmon_version = salmon_version
    organism_index.save()

    comp_file = ComputedFile()
    # This path will not be used because we already have the files extracted.
    comp_file.absolute_file_path = (
        "/home/user/data_store/ZEBRAFISH_INDEX/SHORT/zebrafish_short.tar.gz")
    comp_file.result = computational_result_short
    comp_file.size_in_bytes = 1337
    comp_file.sha1 = "ABC"
    comp_file.s3_key = "key"
    comp_file.s3_bucket = "bucket"
    comp_file.save()

    return organism_index


def _create_quantified_sample(experiment, experiment_dir, organism,
                              accession_code, quant_processor, organism_index):
    """Create one sample of `experiment` with a quant result for `organism_index`.

    Creates the Sample, its .SRA OriginalFile, a ComputationalResult with its
    annotation, and the archive and quant.sf ComputedFiles.
    """
    sample = Sample.objects.create(
        accession_code=accession_code,
        organism=organism,
        source_database="SRA",
        technology="RNA-SEQ",
        platform_accession_code="IlluminaHiSeq1000",
    )
    ExperimentSampleAssociation.objects.create(experiment=experiment,
                                               sample=sample)

    original_file = OriginalFile()
    original_file.filename = accession_code + ".SRA"
    original_file.source_filename = accession_code + ".SRA"
    original_file.save()

    OriginalFileSampleAssociation.objects.get_or_create(
        original_file=original_file, sample=sample)

    # Create and associate quant result and files.
    quant_result = ComputationalResult()
    quant_result.is_ccdl = True
    quant_result.processor = quant_processor
    quant_result.organism_index = organism_index
    quant_result.save()

    kv = ComputationalResultAnnotation()
    kv.data = {"index_length": "short"}
    kv.result = quant_result
    kv.is_public = True
    kv.save()

    # In prod the filename pattern will involve the timestamp
    # but here we're using the accession code so we can find
    # the archive file for the current sample.
    archive_filename = "result-" + accession_code + ".tar.gz"
    archive_file = ComputedFile()
    archive_file.filename = archive_filename
    archive_file.absolute_file_path = os.path.join(experiment_dir,
                                                   archive_filename)
    archive_file.is_public = False
    archive_file.is_smashable = False
    archive_file.is_qc = False
    archive_file.result = quant_result
    archive_file.size_in_bytes = 12345
    archive_file.save()

    quant_file = ComputedFile()
    quant_file.filename = "quant.sf"
    quant_file.absolute_file_path = (experiment_dir + "/quant_files/" +
                                     accession_code + "_output/quant.sf")
    quant_file.is_public = False
    quant_file.is_smashable = False
    quant_file.is_qc = False
    quant_file.result = quant_result
    quant_file.size_in_bytes = 12345
    quant_file.s3_bucket = "bucket"
    quant_file.s3_key = "key"
    quant_file.save()

    SampleResultAssociation.objects.get_or_create(sample=sample,
                                                  result=quant_result)


def setup_experiment(new_version_accessions: List[str],
                     old_version_accessions: List[str]) -> "Experiment":
    """Create an experiment where some samples were processed with the newest
    version of salmon and other with an older one.

    Args:
        new_version_accessions: Accession codes of samples whose quant
            results use the "salmon 0.13.1" organism index.
        old_version_accessions: Accession codes of samples whose quant
            results use the "salmon 0.9.1" organism index.

    Returns:
        The created Experiment (accession code "SRP095529").
    """
    # Create the experiment
    experiment_accession = "SRP095529"
    data_dir = "/home/user/data_store/"
    experiment_dir = data_dir + experiment_accession
    experiment = Experiment.objects.create(accession_code=experiment_accession,
                                           technology="RNA-SEQ")

    zebrafish = Organism.get_object_for_name("DANIO_RERIO")

    # Create the transcriptome processor, result and OLD organism index.
    old_index = _create_short_organism_index(zebrafish, "salmon 0.9.1")

    quant_processor = Processor()
    quant_processor.name = "Salmon Quant"
    quant_processor.version = "salmon 0.9.1"
    quant_processor.docker_image = "dr_salmon"
    quant_processor.environment = '{"some": "environment"}'
    quant_processor.save()

    for accession_code in old_version_accessions:
        _create_quantified_sample(experiment, experiment_dir, zebrafish,
                                  accession_code, quant_processor, old_index)

    # Create another OrganismIndex with a newer version of salmon. It is
    # created only after the old-version samples, as before.
    new_index = _create_short_organism_index(zebrafish, "salmon 0.13.1")

    for accession_code in new_version_accessions:
        _create_quantified_sample(experiment, experiment_dir, zebrafish,
                                  accession_code, quant_processor, new_index)

    return experiment
def test_organism_shepherd_command(self, mock_nomad, mock_send_job, mock_get_active_volumes):
    """Check that the organism shepherd requeues failed jobs by experiment progress.

    Fixture layout:
      * Two experiments share one organism.
      * One experiment has 1 of its 2 samples processed, the other has
        0 of 1.
      * The 0% experiment owns a failed DownloaderJob; the 50% experiment
        owns a failed ProcessorJob.

    Expectations:
      * Both failed jobs are requeued.
      * The job belonging to the 50% experiment is sent first, because
        that experiment has the higher completion percentage.
    """

    def persist(instance, **attrs):
        """Assign ``attrs`` onto ``instance`` in order, save it and return it."""
        for attr_name, attr_value in attrs.items():
            setattr(instance, attr_name, attr_value)
        instance.save()
        return instance

    # Stub out everything that would otherwise reach the network.
    mock_send_job.return_value = True
    active_volumes = {"1", "2", "3"}
    mock_get_active_volumes.return_value = active_volumes

    def fake_nomad_client(host, port=0, timeout=0):
        # A client whose job listing is always a fresh empty list.
        client = MagicMock()
        client.jobs.get_jobs.side_effect = lambda: []
        return client

    mock_nomad.side_effect = fake_nomad_client

    zebrafish = persist(
        Organism(name="DANIO_RERIO", taxonomy_id=1337, is_scientific_name=True))

    # ---- Experiment at 0%: one sample, only a failed downloader job. ----
    zero_percent_experiment = persist(
        Experiment(accession_code="ERP037000"), technology="RNA-SEQ")
    ExperimentOrganismAssociation.objects.create(
        organism=zebrafish, experiment=zero_percent_experiment)

    zero_percent_og = persist(
        OriginalFile(),
        filename="ERR037001.fastq.gz",
        source_filename="ERR037001.fastq.gz",
        source_url="ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR037/ERR037001/ERR037001_1.fastq.gz",
        is_archive=True,
    )
    zero_percent_sample = persist(
        Sample(), accession_code="ERR037001", organism=zebrafish)
    persist(OriginalFileSampleAssociation(),
            sample=zero_percent_sample, original_file=zero_percent_og)
    persist(ExperimentSampleAssociation(),
            sample=zero_percent_sample, experiment=zero_percent_experiment)

    # TODO: fix names of all the variables to be appropriate for this test case.
    failed_downloader_job = persist(
        DownloaderJob(),
        accession_code=zero_percent_sample.accession_code,
        downloader_task="SRA",
        start_time=timezone.now(),
        end_time=timezone.now(),
        success=False,
    )
    persist(DownloaderJobOriginalFileAssociation(),
            downloader_job=failed_downloader_job, original_file=zero_percent_og)

    # ---- Experiment at 50%: two samples, one of them processed. ----
    fifty_percent_experiment = persist(
        Experiment(accession_code="ERP036000"), technology="RNA-SEQ")
    ExperimentOrganismAssociation.objects.create(
        organism=zebrafish, experiment=fifty_percent_experiment)

    # Sample 1 of 2: has a successful processor job.
    successful_pj = persist(
        ProcessorJob(),
        accession_code="ERR036000",
        pipeline_applied="SALMON",
        ram_amount=12288,
        start_time=timezone.now(),
        end_time=timezone.now(),
        success=True,
    )
    successful_og = persist(
        OriginalFile(),
        filename="ERR036000.fastq.gz",
        source_filename="ERR036000.fastq.gz",
        source_url="ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036000/ERR036000_1.fastq.gz",
        is_archive=True,
    )
    successful_sample = persist(
        Sample(), accession_code="ERR036000", organism=zebrafish)
    persist(OriginalFileSampleAssociation(),
            sample=successful_sample, original_file=successful_og)
    persist(ProcessorJobOriginalFileAssociation(),
            processor_job=successful_pj, original_file=successful_og)
    persist(ExperimentSampleAssociation(),
            sample=successful_sample, experiment=fifty_percent_experiment)

    # Sample 2 of 2: its only processor job failed.
    unprocessed_og = persist(
        OriginalFile(),
        filename="ERR036001.fastq.gz",
        source_filename="ERR036001.fastq.gz",
        source_url="ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036001/ERR036001_1.fastq.gz",
        is_archive=True,
    )
    unprocessed_sample = persist(
        Sample(), accession_code="ERR036001", organism=zebrafish)
    persist(OriginalFileSampleAssociation(),
            sample=unprocessed_sample, original_file=unprocessed_og)
    persist(ExperimentSampleAssociation(),
            sample=unprocessed_sample, experiment=fifty_percent_experiment)

    failed_processor_job = persist(
        ProcessorJob(),
        pipeline_applied="SALMON",
        accession_code=unprocessed_sample.accession_code,
        ram_amount=12288,
        start_time=timezone.now(),
        end_time=timezone.now(),
        success=False,
    )
    persist(ProcessorJobOriginalFileAssociation(),
            processor_job=failed_processor_job, original_file=unprocessed_og)

    # Fixtures are in place; run the management command under test.
    call_command("organism_shepherd", organism_name="DANIO_RERIO")

    # The processor job of the 50% experiment must have been sent first.
    sent_calls = mock_send_job.mock_calls

    first_call = sent_calls[0]
    first_job_type = first_call[1][0]
    first_job = first_call[2]["job"]
    self.assertEqual(first_job_type, ProcessorPipeline.SALMON)
    self.assertEqual(first_job.pipeline_applied,
                     failed_processor_job.pipeline_applied)
    self.assertEqual(first_job.ram_amount, failed_processor_job.ram_amount)
    self.assertIn(first_job.volume_index, active_volumes)

    failed_processor_job.refresh_from_db()
    self.assertEqual(first_job, failed_processor_job.retried_job)

    # ...followed by the downloader job of the 0% experiment.
    second_call = sent_calls[1]
    second_job_type = second_call[1][0]
    second_job = second_call[2]["job"]
    self.assertEqual(second_job_type, Downloaders.SRA)
    self.assertEqual(second_job.accession_code,
                     failed_downloader_job.accession_code)
    self.assertEqual(second_job.downloader_task,
                     failed_downloader_job.downloader_task)

    failed_downloader_job.refresh_from_db()
    self.assertEqual(second_job, failed_downloader_job.retried_job)
def create_samples_from_api(self, experiment: Experiment, platform_dict: Dict) -> List[Sample]:
    """Generate a Sample for each sample in an ArrayExpress experiment.

    There are many possible data situations for a sample:

        - If the sample only has raw data available:
            - If it is on a platform that we support:
                Download this raw data and process it
            - If it is not on a platform we support:
                Don't download anything, don't process anything
        - If the sample has both raw and derived data:
            - If the raw data is on a platform we support:
                Download the raw data and process it, abandon the
                derived data
            - If the raw data is not on a platform we support
                Download the derived data and no-op it, abandon the
                raw data
        - If the sample only has derived data:
            Download the derived data and no-op it.

    See an example at:
    https://www.ebi.ac.uk/arrayexpress/json/v3/experiments/E-MTAB-3050/samples

    Args:
        experiment: The experiment whose samples are fetched. Its
            accession code, protocol description and source URL are read.
        platform_dict: Must provide the keys "platform_accession_name",
            "platform_accession_code" and "manufacturer".

    Returns:
        The Sample objects that were newly created by this call. Samples
        that already existed are still associated with the experiment
        but are not part of the returned list.
    """
    created_samples = []

    samples_endpoint = SAMPLES_URL.format(experiment.accession_code)
    response = utils.requests_retry_session().get(samples_endpoint, timeout=60)
    samples = response.json()["experiment"]["sample"]

    # The SDRF is the complete metadata record on a sample/property basis.
    # We run this through our harmonizer and then attach the properties
    # to our created samples.
    SDRF_URL_TEMPLATE = "https://www.ebi.ac.uk/arrayexpress/files/{code}/{code}.sdrf.txt"
    sdrf_url = SDRF_URL_TEMPLATE.format(code=experiment.accession_code)
    sdrf_samples = harmony.parse_sdrf(sdrf_url)
    harmonized_samples = harmony.harmonize(sdrf_samples)

    # An experiment can have many samples
    for sample_data in samples:

        # For some reason, this sample has no files associated with it.
        if not sample_data.get("file"):
            continue

        # Each sample is given an experimentally-unique title.
        flat_sample = utils.flatten(sample_data)
        title = harmony.extract_title(flat_sample)

        # A sample may actually have many sub files.
        # If there is raw data, take that.
        # If not, take the derived.
        has_raw = False
        for sub_file in sample_data["file"]:

            # For ex: E-GEOD-15645
            # This rebinds the entry inside sample_data itself (no copy
            # is made), so the download-URL loop below only ever sees
            # the first comment of such a file.
            # NOTE(review): kept as-is to preserve existing behavior;
            # confirm whether working on a copy was intended.
            if isinstance(sub_file["comment"], list):
                sub_file["comment"] = sub_file["comment"][0]
            comment_value = sub_file["comment"].get("value")

            # Some have the 'data' field, but not the actual data
            # Ex: E-GEOD-9656
            if sub_file["type"] == "data" and comment_value is not None:
                has_raw = True
            # Guard against a null value, which cannot be searched.
            if "raw" in (comment_value or ""):
                has_raw = True

        # Don't get the raw data if it's only a 1-color sample.
        sample_repr = str(sample_data)
        if "Cy3" in sample_repr and "Cy5" not in sample_repr:
            has_raw = False

        # Start from a clean slate for every sample so that values from
        # a previous sample can never leak into this one.
        download_url = None
        filename = None
        skip_sample = False
        for sub_file in sample_data["file"]:

            # Skip derived data if we have it raw.
            if has_raw and "derived data" in sub_file["type"]:
                continue

            download_url = None
            filename = sub_file["name"]

            # sub_file["comment"] is only a list if there's
            # more than one comment...
            comments = sub_file["comment"]
            if isinstance(comments, list):
                # Could be: "Derived ArrayExpress Data Matrix FTP
                # file" or: "ArrayExpress FTP file". If there is
                # no comment with a name including "FTP file" then
                # we don't know where to download it, so the sample
                # is reported and skipped below.
                for comment in comments:
                    if "FTP file" in comment["name"]:
                        download_url = comment["value"]
                        break
            else:
                download_url = comments["value"]

            # The accession code is not known yet at this point, so the
            # sample is identified by its title in these messages.
            if not download_url:
                logger.error(
                    "Sample %s did not specify a download url, skipping.",
                    title,
                    experiment_accession_code=experiment.accession_code,
                    survey_job=self.survey_job.id,
                    sub_file=sub_file)
                skip_sample = True
                continue

            if not filename:
                logger.error(
                    "Sample %s did not specify a filename, skipping.",
                    title,
                    experiment_accession_code=experiment.accession_code,
                    survey_job=self.survey_job.id,
                    sub_file=sub_file)
                skip_sample = True
                continue

        if skip_sample:
            continue

        # Every sub file was passed over as derived data, so there is
        # nothing to download for this sample.
        if not download_url or not filename:
            logger.error(
                "Sample %s has no usable file to download, skipping.",
                title,
                experiment_accession_code=experiment.accession_code,
                survey_job=self.survey_job.id)
            continue

        # The accession code is not a simple matter to determine.
        sample_source_name = sample_data["source"].get("name", "")
        sample_assay_name = sample_data["assay"].get("name", "")
        sample_accession_code = self.determine_sample_accession(
            experiment.accession_code,
            sample_source_name,
            sample_assay_name,
            filename)

        # Figure out the Organism for this sample
        organism_name = UNKNOWN
        for characteristic in sample_data["characteristic"]:
            if characteristic["category"].upper() == "ORGANISM":
                organism_name = characteristic["value"].upper()

        if organism_name == UNKNOWN:
            logger.error(
                "Sample %s did not specify the organism name.",
                sample_accession_code,
                experiment_accession_code=experiment.accession_code,
                survey_job=self.survey_job.id)
            continue
        organism = Organism.get_object_for_name(organism_name)

        # Create the sample object
        try:
            sample_object = Sample.objects.get(
                accession_code=sample_accession_code)
        except Sample.DoesNotExist:
            sample_object = Sample()

            # The basics
            sample_object.source_database = "ARRAY_EXPRESS"
            sample_object.title = title
            sample_object.accession_code = sample_accession_code
            sample_object.source_archive_url = samples_endpoint
            sample_object.organism = organism
            sample_object.platform_name = platform_dict[
                "platform_accession_name"]
            sample_object.platform_accession_code = platform_dict[
                "platform_accession_code"]
            sample_object.manufacturer = platform_dict["manufacturer"]
            sample_object.technology = "MICROARRAY"

            protocol_info, is_updated = self.update_sample_protocol_info(
                existing_protocols=[],
                experiment_protocol=experiment.protocol_description,
                protocol_url=experiment.source_url + '/protocols')
            # Do not check is_updated the first time because we must
            # save a list so we can append to it later.
            sample_object.protocol_info = protocol_info

            sample_object.save()

            # Directly assign the harmonized properties
            harmonized_sample = harmonized_samples[title]
            for key, value in harmonized_sample.items():
                setattr(sample_object, key, value)
            sample_object.save()

            sample_annotation = SampleAnnotation()
            sample_annotation.data = sample_data
            sample_annotation.sample = sample_object
            sample_annotation.is_ccdl = False
            sample_annotation.save()

            original_file = OriginalFile()
            original_file.filename = filename
            original_file.source_filename = filename
            original_file.source_url = download_url
            original_file.is_downloaded = False
            original_file.is_archive = True
            original_file.has_raw = has_raw
            original_file.save()

            original_file_sample_association = OriginalFileSampleAssociation()
            original_file_sample_association.original_file = original_file
            original_file_sample_association.sample = sample_object
            original_file_sample_association.save()

            created_samples.append(sample_object)

            logger.debug(
                "Created " + str(sample_object),
                experiment_accession_code=experiment.accession_code,
                survey_job=self.survey_job.id,
                sample=sample_object.id)
        else:
            # The sample already exists, so it already has original
            # files associated with it and has already been downloaded:
            # associate it with the experiment below, but don't add it
            # to created_samples.

            # If input experiment includes new protocol information,
            # update sample's protocol_info.
            existing_protocols = sample_object.protocol_info
            protocol_info, is_updated = self.update_sample_protocol_info(
                existing_protocols,
                experiment.protocol_description,
                experiment.source_url + '/protocols')
            if is_updated:
                sample_object.protocol_info = protocol_info
                sample_object.save()

            logger.debug(
                "Sample %s already exists, skipping object creation.",
                sample_accession_code,
                experiment_accession_code=experiment.accession_code,
                survey_job=self.survey_job.id)

        # Create associations if they don't already exist
        ExperimentSampleAssociation.objects.get_or_create(
            experiment=experiment, sample=sample_object)

        ExperimentOrganismAssociation.objects.get_or_create(
            experiment=experiment, organism=organism)

    return created_samples
def test_download_geo(self, mock_send_task):
    """Test the main ``download_geo`` function on one GEO series archive.

    A downloader job, one gzipped original file and one sample are set
    up and linked together, then ``geo.download_geo`` is run for the job.
    The test expects the sample to end up with two original files (the
    archive, no longer marked as downloaded, plus one extracted file),
    a job without a failure reason, and a processor job for the Agilent
    two-color pipeline.

    ``mock_send_task`` is injected by the test's patching and is not
    inspected here.
    """
    dlj = DownloaderJob()
    dlj.accession_code = 'GSE22427'
    dlj.save()

    original_file = OriginalFile()
    original_file.filename = "GSE22427_non-normalized.txt.gz"
    original_file.source_url = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE22nnn/GSE22427/suppl/GSE22427_non-normalized.txt.gz"
    original_file.source_filename = "GSE22427_non-normalized.txt.gz"
    original_file.save()

    assoc = DownloaderJobOriginalFileAssociation()
    assoc.original_file = original_file
    assoc.downloader_job = dlj
    assoc.save()

    sample = Sample()
    sample.accession_code = 'GSE22427'
    sample.technology = "MICROARRAY"
    sample.manufacturer = "AGILENT"
    sample.has_raw = True
    # This is fake, but we don't currently support any agilent
    # platforms so we're using a platform that is supported.
    sample.platform_accession_code = "Illumina_RatRef-12_V1.0"
    sample.save()

    sample_annotation = SampleAnnotation()
    sample_annotation.sample = sample
    sample_annotation.data = {
        'label_protocol_ch1': 'Agilent',
        'label_protocol_ch2': 'Agilent'
    }
    sample_annotation.save()

    og_assoc = OriginalFileSampleAssociation()
    og_assoc.sample = sample
    og_assoc.original_file = original_file
    og_assoc.save()

    download_result = geo.download_geo(dlj.id)

    file_assocs = OriginalFileSampleAssociation.objects.filter(sample=sample)
    self.assertEqual(file_assocs.count(), 2)

    for file_assoc in file_assocs:
        extracted_or_archive = file_assoc.original_file
        if extracted_or_archive.filename.endswith(".gz"):
            # We delete the archive after we extract from it
            self.assertFalse(extracted_or_archive.is_downloaded)
        else:
            self.assertTrue(extracted_or_archive.is_downloaded)

    # Make sure it worked
    self.assertTrue(download_result)

    # The local job object predates the download; reload it so the
    # assertion looks at what is stored rather than at the stale copy.
    dlj.refresh_from_db()
    self.assertIsNone(dlj.failure_reason)

    self.assertGreater(ProcessorJob.objects.count(), 0)
    processor_job = ProcessorJob.objects.all()[0]
    self.assertEqual(processor_job.pipeline_applied, "AGILENT_TWOCOLOR_TO_PCL")
    self.assertEqual(processor_job.ram_amount, 2048)
def test_salmon_quant_two_samples_single_read(self):
    """Test `salmon quant` outputs on two samples that have single
    read and that belong to same experiment.

    The first sample is quantified on its own and must not trigger
    tximport; once the second sample is quantified as well, the
    tximport output must exist, pass the R verification script and
    yield one individual file per sample.
    """
    prepare_organism_indices()

    # Create one experiment and two related samples, based on:
    #   https://www.ncbi.nlm.nih.gov/sra/?term=SRP040623
    # (For testing purpose, only two of the four samples' data are included.)
    experiment_accession = 'PRJNA242809'
    experiment = Experiment.objects.create(accession_code=experiment_accession)

    c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

    ## Sample 1
    sample1_accession = 'SRR1206053'
    sample1 = Sample.objects.create(accession_code=sample1_accession,
                                    organism=c_elegans)
    ExperimentSampleAssociation.objects.create(experiment=experiment, sample=sample1)

    experiment_dir = "/home/user/data_store/salmon_tests/PRJNA242809"

    og_file_1 = OriginalFile()
    og_file_1.absolute_file_path = os.path.join(experiment_dir, "raw/SRR1206053.fastq.gz")
    og_file_1.filename = "SRR1206053.fastq.gz"
    og_file_1.save()

    # objects.create() already saves the row; no extra save() is needed.
    OriginalFileSampleAssociation.objects.create(original_file=og_file_1, sample=sample1)

    ## Sample 2
    sample2_accession = 'SRR1206054'
    sample2 = Sample.objects.create(accession_code=sample2_accession,
                                    organism=c_elegans)
    ExperimentSampleAssociation.objects.create(experiment=experiment, sample=sample2)

    og_file_2 = OriginalFile()
    og_file_2.absolute_file_path = os.path.join(experiment_dir, "raw/SRR1206054.fastq.gz")
    og_file_2.filename = "SRR1206054.fastq.gz"
    og_file_2.save()

    OriginalFileSampleAssociation.objects.create(original_file=og_file_2, sample=sample2)

    # Test `salmon quant` on sample1 (SRR1206053)
    sample1_dir = os.path.join(experiment_dir, sample1_accession)

    job1_context = salmon._prepare_files({"job_dir_prefix": "TEST",
                                          "job_id": "TEST",
                                          'pipeline': Pipeline(name="Salmon"),
                                          'computed_files': [],
                                          "original_files": [og_file_1]})

    # Check quant.sf in `salmon quant` output dir of sample1
    self.check_salmon_quant(job1_context, sample1_dir)
    # Confirm that this experiment is not ready for tximport yet.
    experiments_ready = salmon._get_tximport_inputs(job1_context)
    self.assertEqual(len(experiments_ready), 0)
    # This job should not have produced any tximport output
    # because the other sample isn't ready yet.
    self.assertFalse(os.path.exists(os.path.join(job1_context["work_dir"], 'txi_out.RDS')))

    # Now run `salmon quant` on sample2 (SRR1206054) too
    sample2_dir = os.path.join(experiment_dir, sample2_accession)
    job2_context = salmon._prepare_files({"job_dir_prefix": "TEST2",
                                          "job_id": "TEST2",
                                          'pipeline': Pipeline(name="Salmon"),
                                          'computed_files': [],
                                          "original_files": [og_file_2]})

    # Clean up tximport output:
    rds_filename = os.path.join(job2_context["work_dir"], 'txi_out.RDS')
    if os.path.isfile(rds_filename):
        os.remove(rds_filename)

    # Check quant.sf in `salmon quant` output dir of sample2
    self.check_salmon_quant(job2_context, sample2_dir)

    # rds_filename should have been generated by tximport at this point.
    # Note: `tximport` step is launched by subprocess module in Python.
    # If input "quant.sf" files are too large, we may have to wait for
    # a few seconds before testing the existence of rds_filename.
    self.assertTrue(os.path.exists(rds_filename))

    # Pick the path of the (last) computed .RDS file; fail with a clear
    # message rather than a NameError if none was recorded.
    rds_file_path = None
    for computed_file in job2_context['computed_files']:
        if computed_file.filename.endswith('.RDS'):
            rds_file_path = computed_file.absolute_file_path
    self.assertIsNotNone(rds_file_path, "no .RDS computed file was recorded")

    cmd_tokens = [
        "/usr/bin/Rscript", "--vanilla",
        "/home/user/data_refinery_workers/processors/test_tximport.R",
        "--txi_out", rds_file_path,
        "--gene2txmap", job2_context["genes_to_transcripts_path"]
    ]

    tximport_test_result = subprocess.run(cmd_tokens, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # If the exit code is not 0 then tximport failed so fail the tests,
    # and surface the script's stderr to make the failure diagnosable.
    self.assertEqual(tximport_test_result.returncode, 0,
                     tximport_test_result.stderr)

    # Check the individual files: one per sample in the experiment.
    self.assertEqual(len(job2_context['individual_files']), 2)
    for individual_file in job2_context['individual_files']:
        self.assertTrue(os.path.isfile(individual_file.absolute_file_path))
def test_create_missing_jobs(self):
    """Verify that files lacking a downloader job get exactly one created.

    Two samples are prepared: one whose original file is already linked
    to a downloader job and one whose original file is not. After the
    command runs, each original file must be linked to exactly one
    downloader job.
    """

    def build(model_cls, **attrs):
        """Instantiate ``model_cls``, assign ``attrs`` in order, save, return."""
        record = model_cls()
        for attr_name, attr_value in attrs.items():
            setattr(record, attr_name, attr_value)
        record.save()
        return record

    # 1. A sample whose original file already has a downloader job.
    covered_file = build(
        OriginalFile,
        filename="processed.CEL",
        source_filename="processed.CEL",
        is_downloaded=True,
        is_archive=False,
    )
    covered_sample = build(
        Sample,
        accession_code="MA_doesnt_need_processor",
        technology="MICROARRAY",
        source_database="GEO",
        platform_accession_code="bovine",
    )
    OriginalFileSampleAssociation.objects.get_or_create(
        sample=covered_sample, original_file=covered_file)

    existing_job = build(
        DownloaderJob,
        success=True,
        worker_id="worker_1",
        volume_index="1",
    )
    DownloaderJobOriginalFileAssociation.objects.get_or_create(
        downloader_job=existing_job, original_file=covered_file)

    # 2. A sample whose original file has no downloader job at all.
    uncovered_file = build(
        OriginalFile,
        filename="tarball.gz",
        source_filename="tarball.gz",
        is_downloaded=True,
        is_archive=True,
    )
    uncovered_sample = build(
        Sample,
        accession_code="sample_no_downloader",
        technology="MICROARRAY",
        source_database="GEO",
        platform_accession_code="bovine",  # must be a supported platform
    )
    OriginalFileSampleAssociation.objects.get_or_create(
        sample=uncovered_sample, original_file=uncovered_file)

    # 3. Fixtures are ready; run the command under test.
    Command().handle()

    # The file that had no downloader job must now have one.
    jobs_for_uncovered = DownloaderJobOriginalFileAssociation.objects.filter(
        original_file=uncovered_file)
    self.assertEqual(1, jobs_for_uncovered.count())

    # The file that already had a job (created above) must not have
    # gained a second one, i.e. the count is still 1.
    jobs_for_covered = DownloaderJobOriginalFileAssociation.objects.filter(
        original_file=covered_file)
    self.assertEqual(1, jobs_for_covered.count())