def _get_actual_file_if_queueable(
        extracted_subfile: Dict,
        original_file: OriginalFile,
        samples: List[Sample]) -> OriginalFile:
    """Returns the actual file from the archive if it should be queued.

    If the file has been processed or has an unstarted DownloaderJob,
    None will be returned.

    `extracted_subfile` should be a Dict containing metadata about the
    file that was extracted from an archive.

    `original_file` should be the file associated with the CURRENT
    DownloaderJob.

    `samples` are the samples that the actual file should be associated
    with if it has to be created.
    """
    # Check to see if we've made this original file before. `.first()`
    # issues a single query instead of the count()-then-index pair.
    actual_file = OriginalFile.objects.filter(
        source_filename=original_file.source_filename,
        filename=extracted_subfile['filename'],
        is_archive=False
    ).first()

    if actual_file:
        # We've already created this record, let's see if we actually
        # needed to download it or if we just got it because we needed
        # a file in the same archive.
        if not actual_file.needs_processing():
            return None

        if not actual_file.is_downloaded:
            actual_file.is_downloaded = True
            actual_file.save()
        return actual_file

    # First time we've seen this file: create a record for it and
    # associate it with every relevant sample.
    actual_file = OriginalFile()
    actual_file.is_downloaded = True
    actual_file.is_archive = False
    actual_file.absolute_file_path = extracted_subfile['absolute_path']
    actual_file.filename = extracted_subfile['filename']
    actual_file.calculate_size()
    actual_file.calculate_sha1()
    actual_file.has_raw = True
    actual_file.source_url = original_file.source_url
    actual_file.source_filename = original_file.source_filename
    actual_file.save()

    for sample in samples:
        original_file_sample_association = OriginalFileSampleAssociation()
        original_file_sample_association.sample = sample
        original_file_sample_association.original_file = actual_file
        original_file_sample_association.save()

    return actual_file
def test_queue_downloader_jobs_for_original_files(self, mock_send_task):
    """Make sure that queue_downloader_jobs queues all expected
    Downloader jobs for a given experiment.
    """
    # First, create an experiment with two samples associated with it
    # and create two original files for each of those samples.
    experiment_object = Experiment()
    experiment_object.accession_code = "Experiment1"
    experiment_object.save()

    sample_object_1 = Sample()
    sample_object_1.accession_code = "Sample1"
    sample_object_1.platform_accession_code = "Illumina Genome Analyzer"
    sample_object_1.platform_accession_name = "Illumina Genome Analyzer"
    sample_object_1.technology = "RNA-SEQ"
    sample_object_1.manufacturer = "ILLUMINA"
    sample_object_1.source_database = "SRA"
    sample_object_1.save()

    sample_object_2 = Sample()
    sample_object_2.accession_code = "Sample2"
    sample_object_2.platform_accession_code = "Illumina Genome Analyzer"
    sample_object_2.platform_accession_name = "Illumina Genome Analyzer"
    sample_object_2.technology = "RNA-SEQ"
    sample_object_2.manufacturer = "ILLUMINA"
    sample_object_2.source_database = "SRA"
    sample_object_2.save()

    association = ExperimentSampleAssociation()
    association.experiment = experiment_object
    association.sample = sample_object_1
    association.save()

    association = ExperimentSampleAssociation()
    association.experiment = experiment_object
    association.sample = sample_object_2
    association.save()

    sample_1_original_files = []
    sample_2_original_files = []

    original_file = OriginalFile()
    original_file.source_url = "first_url"
    original_file.source_filename = "first_filename"
    original_file.is_downloaded = False
    original_file.has_raw = True
    original_file.save()
    sample_1_original_files.append(original_file)

    original_file_sample_association = OriginalFileSampleAssociation()
    original_file_sample_association.original_file = original_file
    original_file_sample_association.sample = sample_object_1
    original_file_sample_association.save()

    original_file = OriginalFile()
    original_file.source_url = "second_url"
    original_file.source_filename = "second_filename"
    original_file.is_downloaded = False
    original_file.has_raw = True
    original_file.save()
    # BUGFIX: this file is associated with sample_object_1 below, so it
    # belongs to sample 1's list (two files per sample, per the
    # docstring); it was previously appended to sample_2_original_files.
    sample_1_original_files.append(original_file)

    original_file_sample_association = OriginalFileSampleAssociation()
    original_file_sample_association.original_file = original_file
    original_file_sample_association.sample = sample_object_1
    original_file_sample_association.save()

    original_file = OriginalFile()
    original_file.source_url = "third_url"
    original_file.source_filename = "third_filename"
    original_file.is_downloaded = False
    original_file.has_raw = True
    original_file.save()
    sample_2_original_files.append(original_file)

    original_file_sample_association = OriginalFileSampleAssociation()
    original_file_sample_association.original_file = original_file
    original_file_sample_association.sample = sample_object_2
    original_file_sample_association.save()

    original_file = OriginalFile()
    original_file.source_url = "fourth_url"
    original_file.source_filename = "fourth_filename"
    original_file.is_downloaded = False
    original_file.has_raw = True
    original_file.save()
    sample_2_original_files.append(original_file)

    original_file_sample_association = OriginalFileSampleAssociation()
    original_file_sample_association.original_file = original_file
    original_file_sample_association.sample = sample_object_2
    original_file_sample_association.save()

    survey_job = SurveyJob(source_type="SRA")
    survey_job.save()
    surveyor = SraSurveyor(survey_job)

    # One DownloaderJob should be queued per call, for two total.
    surveyor.queue_downloader_job_for_original_files(
        sample_1_original_files, experiment_object.accession_code
    )
    surveyor.queue_downloader_job_for_original_files(
        sample_2_original_files, experiment_object.accession_code
    )

    self.assertEqual(DownloaderJob.objects.all().count(), 2)
def test_no_repeat_jobs(self):
    """Make sure that queue_downloader_jobs queues all expected Downloader
    jobs for a given experiment.
    """
    # Build an experiment with one sample and two original files, then
    # pre-create a DownloaderJob that already covers both files.
    experiment_object = Experiment(accession_code="Experiment1")
    experiment_object.save()

    sample_object = Sample(
        accession_code="Sample1",
        platform_accession_code="Illumina Genome Analyzer",
        platform_accession_name="Illumina Genome Analyzer",
        technology="RNA-SEQ",
        manufacturer="ILLUMINA",
        source_database="SRA",
    )
    sample_object.save()

    original_file_1 = OriginalFile(
        source_url="first_url",
        source_filename="first_filename",
        is_downloaded=False,
        has_raw=True,
    )
    original_file_1.save()

    OriginalFileSampleAssociation(
        original_file=original_file_1,
        sample=sample_object,
    ).save()

    original_file_2 = OriginalFile(
        source_url="second_url",
        source_filename="second_filename",
        is_downloaded=False,
        has_raw=True,
    )
    original_file_2.save()

    OriginalFileSampleAssociation(
        original_file=original_file_2,
        sample=sample_object,
    ).save()

    # The pre-existing job associated with both files.
    dlj = DownloaderJob()
    dlj.save()
    for existing_file in (original_file_1, original_file_2):
        DownloaderJobOriginalFileAssociation(
            downloader_job=dlj, original_file=existing_file
        ).save()

    survey_job = SurveyJob(source_type="SRA")
    survey_job.save()

    surveyor = SraSurveyor(survey_job)
    surveyor.queue_downloader_job_for_original_files(
        [original_file_1, original_file_2],
        experiment_object.accession_code
    )

    # We made one DownloaderJob in this test, so
    # queue_downloader_job_for_original_files didn't have anything
    # to do, so there should still be only one:
    self.assertEqual(1, DownloaderJob.objects.all().count())
def create_samples_from_api(self,
                            experiment: Experiment,
                            platform_dict: Dict) -> List[Sample]:
    """Generates a Sample item for each sample in an AE experiment.

    There are many possible data situations for a sample:

        - If the sample only has raw data available:
            - If it is on a platform that we support:
                Download this raw data and process it
            - If it is not on a platform we support:
                Don't download anything, don't process anything
        - If the sample has both raw and derived data:
            - If the raw data is on a platform we support:
                Download the raw data and process it, abandon the derived data
            - If the raw data is not on a platform we support
                Download the derived data and no-op it, abandon the raw data
        - If the sample only has derived data:
            Download the derived data and no-op it.

    See an example at: https://www.ebi.ac.uk/arrayexpress/json/v3/experiments/E-MTAB-3050/samples
    """
    created_samples = []

    samples_endpoint = SAMPLES_URL.format(experiment.accession_code)
    r = utils.requests_retry_session().get(samples_endpoint, timeout=60)
    samples = r.json()["experiment"]["sample"]

    # The SDRF is the complete metadata record on a sample/property basis.
    # We run this through our harmonizer and then attach the properties
    # to our created samples.
    SDRF_URL_TEMPLATE = "https://www.ebi.ac.uk/arrayexpress/files/{code}/{code}.sdrf.txt"
    sdrf_url = SDRF_URL_TEMPLATE.format(code=experiment.accession_code)
    sdrf_samples = harmony.parse_sdrf(sdrf_url)
    harmonized_samples = harmony.harmonize(sdrf_samples)

    # An experiment can have many samples
    for sample_data in samples:

        # For some reason, this sample has no files associated with it.
        if "file" not in sample_data or len(sample_data['file']) == 0:
            continue

        # Each sample is given an experimentally-unique title.
        flat_sample = utils.flatten(sample_data)
        title = harmony.extract_title(flat_sample)

        # A sample may actually have many sub files.
        # If there is raw data, take that.
        # If not, take the derived.
        has_raw = False
        for sub_file in sample_data['file']:

            # For ex: E-GEOD-15645
            if isinstance(sub_file['comment'], list):
                sub_file_mod = sub_file
                sub_file_mod['comment'] = sub_file['comment'][0]
            else:
                sub_file_mod = sub_file

            # Some have the 'data' field, but not the actual data
            # Ex: E-GEOD-9656
            if sub_file_mod['type'] == "data" \
                    and sub_file_mod['comment'].get('value', None) is not None:
                has_raw = True
            if 'raw' in sub_file_mod['comment'].get('value', ''):
                has_raw = True

        skip_sample = False
        for sub_file in sample_data['file']:

            # Don't get the raw data if it's only a 1-color sample.
            if 'Cy3' in str(sample_data) and 'Cy5' not in str(sample_data):
                has_raw = False

            # Skip derived data if we have it raw.
            if has_raw and "derived data" in sub_file['type']:
                continue

            download_url = None
            filename = sub_file["name"]

            # sub_file["comment"] is only a list if there's
            # more than one comment...
            comments = sub_file["comment"]
            if isinstance(comments, list):
                # Could be: "Derived ArrayExpress Data Matrix FTP
                # file" or: "ArrayExpress FTP file". If there is
                # no comment with a name including "FTP file" then
                # we don't know where to download it so we need to
                # mark this job as an error. Therefore don't catch
                # the potential exception where download_url
                # doesn't get defined.
                for comment in comments:
                    if "FTP file" in comment["name"]:
                        download_url = comment["value"]
                        break
            else:
                download_url = comments["value"]

            # BUGFIX: these two error paths previously logged
            # `sample_accession_code`, which is not assigned until after
            # this loop and raised a NameError here. Log the sample's
            # title, which is already known, instead.
            if not download_url:
                logger.error(
                    "Sample %s did not specify a download url, skipping.",
                    title,
                    experiment_accession_code=experiment.accession_code,
                    survey_job=self.survey_job.id,
                    sub_file=sub_file)
                skip_sample = True
                continue

            if not filename:
                logger.error(
                    "Sample %s did not specify a filename, skipping.",
                    title,
                    experiment_accession_code=experiment.accession_code,
                    survey_job=self.survey_job.id,
                    sub_file=sub_file)
                skip_sample = True
                continue

        if skip_sample:
            continue

        # The accession code is not a simple matter to determine.
        sample_source_name = sample_data["source"].get("name", "")
        sample_assay_name = sample_data["assay"].get("name", "")
        sample_accession_code = self.determine_sample_accession(
            experiment.accession_code,
            sample_source_name,
            sample_assay_name,
            filename)

        # Figure out the Organism for this sample
        organism_name = UNKNOWN
        for characteristic in sample_data["characteristic"]:
            if characteristic["category"].upper() == "ORGANISM":
                organism_name = characteristic["value"].upper()

        if organism_name == UNKNOWN:
            logger.error(
                "Sample %s did not specify the organism name.",
                sample_accession_code,
                experiment_accession_code=experiment.accession_code,
                survey_job=self.survey_job.id)
            organism = None
            continue
        else:
            organism = Organism.get_object_for_name(organism_name)

        # Create the sample object
        try:
            # Associate it with the experiment, but since it
            # already exists it already has original files
            # associated with it and it's already been downloaded,
            # so don't add it to created_samples.
            sample_object = Sample.objects.get(
                accession_code=sample_accession_code)

            # If input experiment includes new protocol information,
            # update sample's protocol_info.
            existing_protocols = sample_object.protocol_info
            protocol_info, is_updated = self.update_sample_protocol_info(
                existing_protocols,
                experiment.protocol_description,
                experiment.source_url + '/protocols')
            if is_updated:
                sample_object.protocol_info = protocol_info
                # BUGFIX: this was `sample_obejct.save()`, a typo that
                # raised a NameError whenever new protocol info arrived.
                sample_object.save()

            logger.debug(
                "Sample %s already exists, skipping object creation.",
                sample_accession_code,
                experiment_accession_code=experiment.accession_code,
                survey_job=self.survey_job.id)
        except Sample.DoesNotExist:
            sample_object = Sample()

            # The basics
            sample_object.source_database = "ARRAY_EXPRESS"
            sample_object.title = title
            sample_object.accession_code = sample_accession_code
            sample_object.source_archive_url = samples_endpoint
            sample_object.organism = organism
            sample_object.platform_name = platform_dict[
                "platform_accession_name"]
            sample_object.platform_accession_code = platform_dict[
                "platform_accession_code"]
            sample_object.manufacturer = platform_dict["manufacturer"]
            sample_object.technology = "MICROARRAY"

            protocol_info, is_updated = self.update_sample_protocol_info(
                existing_protocols=[],
                experiment_protocol=experiment.protocol_description,
                protocol_url=experiment.source_url + '/protocols')
            # Do not check is_updated the first time because we must
            # save a list so we can append to it later.
            sample_object.protocol_info = protocol_info

            sample_object.save()

            # Directly assign the harmonized properties
            harmonized_sample = harmonized_samples[title]
            for key, value in harmonized_sample.items():
                setattr(sample_object, key, value)
            sample_object.save()

            sample_annotation = SampleAnnotation()
            sample_annotation.data = sample_data
            sample_annotation.sample = sample_object
            sample_annotation.is_ccdl = False
            sample_annotation.save()

            # `filename`, `download_url` and `has_raw` hold the values
            # from the last sub-file examined in the loop above.
            original_file = OriginalFile()
            original_file.filename = filename
            original_file.source_filename = filename
            original_file.source_url = download_url
            original_file.is_downloaded = False
            original_file.is_archive = True
            original_file.has_raw = has_raw
            original_file.save()

            original_file_sample_association = OriginalFileSampleAssociation()
            original_file_sample_association.original_file = original_file
            original_file_sample_association.sample = sample_object
            original_file_sample_association.save()

            created_samples.append(sample_object)

            logger.debug(
                "Created " + str(sample_object),
                experiment_accession_code=experiment.accession_code,
                survey_job=self.survey_job.id,
                sample=sample_object.id)

        # Create associations if they don't already exist
        ExperimentSampleAssociation.objects.get_or_create(
            experiment=experiment, sample=sample_object)

        ExperimentOrganismAssociation.objects.get_or_create(
            experiment=experiment, organism=organism)

    return created_samples
def download_geo(job_id: int) -> bool:
    """The main function for the GEO Downloader.

    Downloads a single tar file containing the files representing samples
    relating to a single experiment stored in GEO.

    Returns True when at least one file was extracted and queued for
    processing, False otherwise (including download/extraction failures).
    """
    job = utils.start_job(job_id)
    accession_code = job.accession_code
    original_file = job.original_files.first()

    if not original_file:
        job.failure_reason = "No files associated with the job."
        logger.error("No files associated with the job.", downloader_job=job_id)
        utils.end_downloader_job(job, success=False)
        # BUGFIX: the function is documented/used as returning a success
        # flag; the error paths previously returned None.
        return False

    url = original_file.source_url
    related_samples = original_file.samples.exclude(technology="RNA-SEQ")

    # First, download the sample archive URL.
    # Then, unpack all the ones downloaded.
    # Then create processor jobs!

    # The files for all of the samples are contained within the same zip file. Therefore only
    # download the one.
    os.makedirs(LOCAL_ROOT_DIR + "/" + accession_code, exist_ok=True)
    dl_file_path = LOCAL_ROOT_DIR + "/" + accession_code + "/" + url.split("/")[-1]

    logger.debug("Starting to download: " + url, job_id=job_id, accession_code=accession_code)
    _download_file(url, dl_file_path, job)
    original_file.absolute_file_path = dl_file_path
    original_file.is_downloaded = True
    original_file.save()

    unpacked_sample_files = []

    try:
        # enumerate all files inside the archive
        archived_files = list(ArchivedFile(dl_file_path).get_files())
    except FileExtractionError as e:
        # BUGFIX: store the message, not the exception object, in the
        # failure_reason text field.
        job.failure_reason = str(e)
        logger.exception(
            "Error occurred while extracting file.", path=dl_file_path, exception=str(e)
        )
        utils.end_downloader_job(job, success=False)
        return False

    for og_file in archived_files:
        sample = og_file.get_sample()

        # We don't want RNA-Seq data from GEO:
        # https://github.com/AlexsLemonade/refinebio/issues/966
        if sample and sample.technology == "RNA-SEQ":
            logger.warn("RNA-Seq sample found in GEO downloader job.", sample=sample)
            continue

        if not sample and (
            not og_file.is_processable() or og_file.experiment_accession_code() != accession_code
        ):
            # skip the files that we know are not processable and can't be
            # associated with a sample, and also the files where we couldn't
            # find a sample and they don't mention the current experiment
            continue

        potential_existing_file = OriginalFile.objects.filter(
            source_filename=original_file.source_filename,
            filename=og_file.filename,
            is_archive=False,
        ).first()
        if potential_existing_file:
            # We've already created this record, let's see if we actually
            # needed to download it or if we just got it because we needed
            # a file in the same archive.
            if potential_existing_file.needs_processing():
                if not potential_existing_file.is_downloaded:
                    potential_existing_file.is_downloaded = True
                    potential_existing_file.save()

                unpacked_sample_files.append(potential_existing_file)
            continue

        # Then this is a new file and we should create an original file for it
        actual_file = OriginalFile()
        actual_file.is_downloaded = True
        actual_file.is_archive = False
        actual_file.absolute_file_path = og_file.file_path
        actual_file.filename = og_file.filename
        actual_file.calculate_size()
        actual_file.calculate_sha1()
        actual_file.has_raw = True
        actual_file.source_url = original_file.source_url
        actual_file.source_filename = original_file.source_filename
        actual_file.save()

        # try to see if the file should be associated with a sample
        if sample:
            original_file_sample_association = OriginalFileSampleAssociation()
            original_file_sample_association.sample = sample
            original_file_sample_association.original_file = actual_file
            original_file_sample_association.save()
        else:
            # if not, we can associate this file with all samples in the experiment
            for sample in related_samples:
                original_file_sample_association = OriginalFileSampleAssociation()
                original_file_sample_association.sample = sample
                original_file_sample_association.original_file = actual_file
                original_file_sample_association.save()

        unpacked_sample_files.append(actual_file)

    if len(unpacked_sample_files) > 0:
        success = True
        logger.debug(
            "File downloaded and extracted successfully.",
            url=url,
            dl_file_path=dl_file_path,
            downloader_job=job_id,
        )
    else:
        success = False
        logger.info(
            "Unable to extract any files.",
            url=url,
            dl_file_path=dl_file_path,
            downloader_job=job_id,
        )
        job.failure_reason = "Failed to extract any downloaded files."

    if success:
        create_processor_jobs_for_original_files(unpacked_sample_files, job)

    if original_file.is_archive:
        original_file.delete_local_file()

    utils.end_downloader_job(job, success)

    return success
def download_geo(job_id: int) -> None:
    """The main function for the GEO Downloader.

    Downloads a single tar file containing the files representing samples
    relating to a single experiment stored in GEO.
    """
    job = utils.start_job(job_id)

    # The job is expected to have exactly one archive OriginalFile
    # associated with it; everything below derives from that archive.
    file_assocs = DownloaderJobOriginalFileAssociation.objects.filter(
        downloader_job=job)
    original_file = file_assocs[0].original_file
    url = original_file.source_url
    accession_code = job.accession_code

    # Samples already linked to the archive; used as a fallback when an
    # extracted file cannot be matched to a single sample by name.
    sample_assocs = OriginalFileSampleAssociation.objects.filter(
        original_file=original_file)
    related_samples = Sample.objects.filter(
        id__in=sample_assocs.values('sample_id'))

    # First, download the sample archive URL.
    # Then, unpack all the ones downloaded.
    # Then create processor jobs!

    # The files for all of the samples are
    # contained within the same zip file. Therefore only
    # download the one.
    os.makedirs(LOCAL_ROOT_DIR + '/' + accession_code, exist_ok=True)
    dl_file_path = LOCAL_ROOT_DIR + '/' + accession_code + '/' + url.split(
        '/')[-1]
    logger.debug("Starting to download: " + url,
                 job_id=job_id,
                 accession_code=accession_code)
    _download_file(url, dl_file_path, job)
    original_file.absolute_file_path = dl_file_path
    original_file.is_downloaded = True
    original_file.save()

    # has_raw is flipped off below for preprocessed MINiML archives.
    has_raw = True
    unpacked_sample_files = []

    # These files are tarred, and also subsequently gzipped
    if '.tar' in dl_file_path:
        try:
            extracted_files = _extract_tar(dl_file_path, accession_code)
        except Exception as e:
            job.failure_reason = e
            logger.exception("Error occured while extracting tar file.",
                             path=dl_file_path,
                             exception=str(e))
            utils.end_downloader_job(job, success=False)
            return

        for og_file in extracted_files:
            filename = og_file['filename']
            # Filenames are expected to encode the sample accession as
            # the prefix before '_' (or '.' when there is no '_').
            if '_' in filename:
                sample_id = filename.split('_')[0]
            else:
                sample_id = filename.split('.')[0]

            try:
                sample = Sample.objects.get(accession_code=sample_id)
            except Exception as e:
                # We don't have this sample, but it's not a total failure. This happens.
                continue

            try:
                # Files from the GEO supplemental file are gzipped inside of the tarball. Great!
                archive_file = OriginalFile.objects.get(
                    source_filename__contains=sample_id)
                archive_file.is_downloaded = True
                archive_file.is_archive = True
                archive_file.absolute_file_path = og_file['absolute_path']
                archive_file.calculate_size()
                archive_file.calculate_sha1()
                archive_file.save()

                # Unpack the inner gzip layer if present; otherwise the
                # extracted file itself is the payload.
                if '.gz' in og_file['filename']:
                    extracted_subfile = _extract_gz(og_file['absolute_path'],
                                                    accession_code)
                else:
                    extracted_subfile = [og_file]

                # Record the fully-unpacked file and link it to its sample.
                actual_file = OriginalFile()
                actual_file.is_downloaded = True
                actual_file.is_archive = False
                actual_file.absolute_file_path = extracted_subfile[0][
                    'absolute_path']
                actual_file.filename = extracted_subfile[0]['filename']
                actual_file.calculate_size()
                actual_file.calculate_sha1()
                actual_file.has_raw = True
                actual_file.source_url = original_file.source_url
                actual_file.source_filename = original_file.source_filename
                actual_file.save()

                original_file_sample_association = OriginalFileSampleAssociation(
                )
                original_file_sample_association.sample = sample
                original_file_sample_association.original_file = actual_file
                original_file_sample_association.save()

                # The inner archive is no longer needed on disk.
                archive_file.delete_local_file()
                archive_file.is_downloaded = False
                archive_file.save()

                unpacked_sample_files.append(actual_file)
            except Exception as e:
                # TODO - is this worth failing a job for?
                logger.debug(
                    "Found a file we didn't have an OriginalFile for! Why did this happen?: "
                    + og_file['filename'],
                    exc_info=1,
                    file=og_file['filename'],
                    sample_id=sample_id,
                    accession_code=accession_code)
                # If we don't know why we have it, get rid of it.
                os.remove(og_file["absolute_path"])

    # This is a .tgz file.
    elif '.tgz' in dl_file_path:
        # If this is the MINiML file, it has been preprocessed
        if '_family.xml.tgz' in dl_file_path:
            has_raw = False

        try:
            extracted_files = _extract_tgz(dl_file_path, accession_code)
        except Exception as e:
            job.failure_reason = e
            logger.exception("Error occured while extracting tgz file.",
                             path=dl_file_path,
                             exception=str(e))
            utils.end_downloader_job(job, success=False)
            return

        for og_file in extracted_files:
            # Only the .txt members carry sample data; the GSM accession
            # is the filename prefix before '-'.
            if '.txt' in og_file['filename']:
                try:
                    gsm_id = og_file['filename'].split('-')[0]
                    sample = Sample.objects.get(accession_code=gsm_id)
                except Exception as e:
                    # Unmatched file: remove it and move on.
                    os.remove(og_file["absolute_path"])
                    continue

                actual_file = OriginalFile()
                actual_file.is_downloaded = True
                actual_file.is_archive = False
                actual_file.absolute_file_path = og_file['absolute_path']
                actual_file.filename = og_file['filename']
                actual_file.calculate_size()
                actual_file.calculate_sha1()
                actual_file.has_raw = has_raw
                actual_file.source_url = original_file.source_url
                actual_file.source_filename = original_file.source_filename
                actual_file.save()

                original_file_sample_association = OriginalFileSampleAssociation(
                )
                original_file_sample_association.sample = sample
                original_file_sample_association.original_file = actual_file
                original_file_sample_association.save()

                unpacked_sample_files.append(actual_file)

    # These files are only gzipped.
    # These are generally the _actually_ raw (rather than the non-raw data in a RAW file) data
    elif '.gz' in dl_file_path:
        try:
            extracted_files = _extract_gz(dl_file_path, accession_code)
        except Exception as e:
            job.failure_reason = e
            logger.exception("Error occured while extracting gz file.",
                             path=dl_file_path,
                             exception=str(e))
            utils.end_downloader_job(job, success=False)
            return

        for og_file in extracted_files:
            filename = og_file['filename']
            sample_id = filename.split('.')[0]

            try:
                # The archive we downloaded
                archive_file = OriginalFile.objects.get(
                    source_filename__contains=filename)
                archive_file.is_downloaded = True
                archive_file.is_archive = True
                archive_file.absolute_file_path = dl_file_path
                archive_file.calculate_size()
                archive_file.calculate_sha1()
                archive_file.save()

                actual_file = OriginalFile()
                actual_file.is_downloaded = True
                actual_file.is_archive = False
                actual_file.absolute_file_path = og_file['absolute_path']
                actual_file.filename = og_file['filename']
                actual_file.calculate_size()
                actual_file.calculate_sha1()
                actual_file.has_raw = True
                actual_file.source_url = original_file.source_url
                actual_file.source_filename = original_file.source_filename
                actual_file.save()

                # No per-file sample match here; associate with every
                # sample already linked to the archive.
                for sample in related_samples:
                    new_association = OriginalFileSampleAssociation()
                    new_association.original_file = actual_file
                    new_association.sample = sample
                    new_association.save()

                archive_file.delete_local_file()
                archive_file.is_downloaded = False
                archive_file.save()

                unpacked_sample_files.append(actual_file)
            except Exception as e:
                logger.debug(
                    "Found a file we didn't have an OriginalFile for! Why did this happen?: "
                    + og_file['filename'],
                    exc_info=1,
                    file=og_file['filename'],
                    sample_id=sample_id,
                    accession_code=accession_code)
                os.remove(og_file["absolute_path"])

    # This is probably just a .txt file
    else:
        filename = dl_file_path.split('/')[-1]
        sample_id = filename.split('_')[0]

        actual_file = OriginalFile()
        actual_file.is_downloaded = True
        actual_file.is_archive = False
        actual_file.absolute_file_path = dl_file_path
        actual_file.filename = filename
        actual_file.calculate_size()
        actual_file.calculate_sha1()
        actual_file.has_raw = True
        actual_file.source_url = original_file.source_url
        actual_file.source_filename = original_file.source_filename
        actual_file.save()

        # Associate the lone file with every sample linked to the archive.
        for sample in related_samples:
            new_association = OriginalFileSampleAssociation()
            new_association.original_file = actual_file
            new_association.sample = sample
            new_association.save()

        unpacked_sample_files.append(actual_file)

    if len(unpacked_sample_files) > 0:
        success = True
        logger.debug("File downloaded and extracted successfully.",
                     url=url,
                     dl_file_path=dl_file_path,
                     downloader_job=job_id)
    else:
        success = False
        logger.info("Unable to extract any files.",
                    url=url,
                    dl_file_path=dl_file_path,
                    downloader_job=job_id)
        job.failure_reason = "Failed to extract any downloaded files."

    if success:
        utils.create_processor_jobs_for_original_files(unpacked_sample_files,
                                                       job)

    # The downloaded archive itself is no longer needed once unpacked.
    if original_file.is_archive:
        original_file.delete_local_file()

    utils.end_downloader_job(job, success)

    # NOTE(review): annotated -> None but returns a bool here; callers
    # appear to treat this as a success flag — confirm and fix annotation.
    return success