def set_downloaded(self, absolute_file_path, filename=None):
    """Flag this file as downloaded and refresh its derived attributes.

    `absolute_file_path` is stored on the instance and handed to
    `FileUtils.is_archive` to set `is_archive`. When `filename` is missing
    or empty, the basename of `absolute_file_path` is used instead.
    Afterwards `calculate_size()` and `calculate_sha1_and_md5()` are called
    and the instance is saved.
    """
    self.is_downloaded = True
    self.is_archive = FileUtils.is_archive(absolute_file_path)
    self.absolute_file_path = absolute_file_path
    # Any falsy `filename` (None, "") falls back to the path's basename.
    self.filename = filename or os.path.basename(absolute_file_path)

    self.calculate_size()
    self.calculate_sha1_and_md5()
    self.save()
def create_experiment_and_samples_from_api(
        self, experiment_accession_code) -> (Experiment, List[Sample]):
    """ The main surveyor - find the Experiment and Samples from NCBI GEO.

    Uses the GEOParse library, for which docs can be found here:
    https://geoparse.readthedocs.io/en/latest/usage.html#working-with-geo-objects

    The series is fetched into the surveyor's temp path, its samples are
    preprocessed and harmonized, and then:

    * the Experiment (plus an ExperimentAnnotation) is created unless one
      with this accession code already exists;
    * every sample of the series is either associated with the experiment
      (when it already exists) or created together with its annotation and
      the OriginalFile records for its supplementary files;
    * experiment-level supplementary files and the MINiML family file are
      recorded as OriginalFiles and dropped again if no sample uses them.

    Args:
        experiment_accession_code: accession code passed to
            `GEOparse.get_GEO` and used to look up / create the Experiment.

    Returns:
        A tuple `(experiment_object, created_samples)`. `created_samples`
        only contains samples created by this call whose technology is not
        "RNA-SEQ"; samples that already existed are associated with the
        experiment but are not part of the list.
    """
    # Supplementary files containing any of these substrings are skipped.
    unwanted_markers = (
        "idat.gz",
        "chp.gz",
        "ndf.gz",
        "pos.gz",
        "pair.gz",
        "gff.gz",
    )
    # Substrings identifying non-normalized text files, treated as raw data.
    non_normalized_markers = (
        "_non_normalized.txt",
        "_non-normalized.txt",
        "-non-normalized.txt",
        "-non_normalized.txt",
    )

    # Cleaning up is tracked here: https://github.com/guma44/GEOparse/issues/41
    gse = GEOparse.get_GEO(experiment_accession_code,
                           destdir=self.get_temp_path(),
                           how="brief",
                           silent=True)
    preprocessed_samples = harmony.preprocess_geo(gse.gsms.items())
    harmonized_samples = harmony.harmonize(preprocessed_samples)

    # Create the experiment object
    try:
        experiment_object = Experiment.objects.get(
            accession_code=experiment_accession_code)
        logger.debug(
            "Experiment %s already exists, skipping object creation.",
            experiment_accession_code,
            survey_job=self.survey_job.id,
        )
    except Experiment.DoesNotExist:
        experiment_object = Experiment()
        experiment_object.accession_code = experiment_accession_code
        GeoSurveyor._apply_metadata_to_experiment(experiment_object, gse)
        experiment_object.save()

        experiment_annotation = ExperimentAnnotation()
        experiment_annotation.data = gse.metadata
        experiment_annotation.experiment = experiment_object
        experiment_annotation.is_ccdl = False
        experiment_annotation.save()

    # Okay, here's the situation!
    # Sometimes, samples have a direct single representation for themselves.
    # Othertimes, there is a single file with references to every sample in it.
    created_samples = []

    # The experiment-level file lookups further down read `has_raw` from the
    # most recently handled sample. Pre-binding the name keeps a series
    # without any samples from failing with UnboundLocalError there.
    sample_object = None

    for sample_accession_code, sample in gse.gsms.items():
        try:
            sample_object = Sample.objects.get(
                accession_code=sample_accession_code)
            logger.debug(
                "Sample %s from experiment %s already exists, skipping object creation.",
                sample_accession_code,
                experiment_object.accession_code,
                survey_job=self.survey_job.id,
            )

            # Associate it with the experiment, but since it
            # already exists it already has original files
            # associated with it and it's already been downloaded,
            # so don't add it to created_samples.
            ExperimentSampleAssociation.objects.get_or_create(
                experiment=experiment_object, sample=sample_object)
            ExperimentOrganismAssociation.objects.get_or_create(
                experiment=experiment_object,
                organism=sample_object.organism)
        except Sample.DoesNotExist:
            organism = Organism.get_object_for_name(
                sample.metadata["organism_ch1"][0].upper())

            sample_object = Sample()
            sample_object.source_database = "GEO"
            sample_object.accession_code = sample_accession_code
            sample_object.organism = organism

            # If data processing step, it isn't raw.
            sample_object.has_raw = not sample.metadata.get(
                "data_processing", None)

            ExperimentOrganismAssociation.objects.get_or_create(
                experiment=experiment_object, organism=organism)
            sample_object.title = sample.metadata["title"][0]

            self.set_platform_properties(sample_object, sample.metadata, gse)

            GeoSurveyor._apply_harmonized_metadata_to_sample(
                sample_object, harmonized_samples[sample_object.title])

            # Sample-level protocol_info
            sample_object.protocol_info = self.get_sample_protocol_info(
                sample.metadata, sample_accession_code)

            sample_object.save()
            logger.debug("Created Sample: %s", sample_object)

            sample_annotation = SampleAnnotation()
            sample_annotation.sample = sample_object
            sample_annotation.data = sample.metadata
            sample_annotation.is_ccdl = False
            sample_annotation.save()

            sample_supplements = sample.metadata.get(
                "supplementary_file", [])
            for supplementary_file_url in sample_supplements:
                # Why do they give us this?
                if supplementary_file_url == "NONE":
                    break

                # Lowercase once; every substring check below uses it.
                lower_file_url = supplementary_file_url.lower()

                # We never want these!
                if any(marker in lower_file_url
                       for marker in unwanted_markers):
                    continue

                # Sometimes, we are lied to about the data processing step.
                if ".cel" in lower_file_url or any(
                        marker in lower_file_url
                        for marker in non_normalized_markers):
                    sample_object.has_raw = True
                    sample_object.save()

                # filename and source_filename are the same for these
                filename = FileUtils.get_filename(supplementary_file_url)
                original_file = OriginalFile.objects.get_or_create(
                    source_url=supplementary_file_url,
                    filename=filename,
                    source_filename=filename,
                    has_raw=sample_object.has_raw,
                    is_archive=FileUtils.is_archive(filename),
                )[0]
                logger.debug("Created OriginalFile: %s", original_file)

                OriginalFileSampleAssociation.objects.get_or_create(
                    original_file=original_file, sample=sample_object)

                if original_file.is_affy_data():
                    # Only Affymetrix Microarrays produce .CEL files
                    sample_object.technology = "MICROARRAY"
                    sample_object.manufacturer = "AFFYMETRIX"
                    sample_object.save()

            # It's okay to survey RNA-Seq samples from GEO, but we
            # don't actually want to download/process any RNA-Seq
            # data unless it comes from SRA.
            if sample_object.technology != "RNA-SEQ":
                created_samples.append(sample_object)

            # Now that we've determined the technology at the
            # sample level, we can set it at the experiment level,
            # just gotta make sure to only do it once. There can
            # be more than one technology, this should be changed
            # as part of:
            # https://github.com/AlexsLemonade/refinebio/issues/1099
            if not experiment_object.technology:
                experiment_object.technology = sample_object.technology
                experiment_object.save()

            ExperimentSampleAssociation.objects.get_or_create(
                experiment=experiment_object, sample=sample_object)

    # These supplementary files _may-or-may-not_ contain the type of raw data we can process.
    for experiment_supplement_url in gse.metadata.get(
            "supplementary_file", []):
        # filename and source_filename are the same for these
        filename = experiment_supplement_url.split("/")[-1]
        original_file = OriginalFile.objects.get_or_create(
            source_url=experiment_supplement_url,
            filename=filename,
            source_filename=filename,
            # Falls back to False only when the series had no samples at
            # all; in that case nothing can reference the file and it is
            # removed again just below.
            has_raw=(sample_object.has_raw
                     if sample_object is not None else False),
            is_archive=True,
        )[0]
        logger.debug("Created OriginalFile: %s", original_file)

        lower_supplement_url = experiment_supplement_url.lower()
        if any(marker in lower_supplement_url
               for marker in non_normalized_markers):
            # NOTE: this loop rebinds `sample_object`, so later iterations of
            # the outer loop (and the MINiML lookup) read `has_raw` from the
            # last created sample. Kept as-is to preserve existing lookups.
            for sample_object in created_samples:
                sample_object.has_raw = True
                sample_object.save()

                OriginalFileSampleAssociation.objects.get_or_create(
                    sample=sample_object, original_file=original_file)

        # Delete this Original file if it isn't being used.
        if (OriginalFileSampleAssociation.objects.filter(
                original_file=original_file).count() == 0):
            original_file.delete()

    # These are the Miniml/Soft/Matrix URLs that are always(?) provided.
    # GEO describes different types of data formatting as "families"
    family_url = self.get_miniml_url(experiment_accession_code)
    miniml_original_file = OriginalFile.objects.get_or_create(
        source_url=family_url,
        source_filename=family_url.split("/")[-1],
        # Same fallback as above for a series without samples.
        has_raw=(sample_object.has_raw
                 if sample_object is not None else False),
        is_archive=True,
    )[0]
    for sample_object in created_samples:
        # We don't need a .txt if we have a .CEL
        if sample_object.has_raw:
            continue
        OriginalFileSampleAssociation.objects.get_or_create(
            sample=sample_object, original_file=miniml_original_file)

    # Delete this Original file if it isn't being used.
    if (OriginalFileSampleAssociation.objects.filter(
            original_file=miniml_original_file).count() == 0):
        miniml_original_file.delete()

    # Trash the temp path
    try:
        shutil.rmtree(self.get_temp_path())
    except Exception:
        # There was a problem during surveying so this didn't get created.
        # It's not a big deal. Cleanup is deliberately best-effort.
        pass

    return experiment_object, created_samples
def _create_result_objects(job_context: Dict) -> Dict:
    """ Record the quantpendia archive as result objects and upload it to S3.

    Reads `archive_path`, `samples`, `formatted_command`, `time_start`,
    `time_end` and `job_id` from `job_context`. Creates and saves a
    ComputationalResult, a ComputedFile for the archive and a
    CompendiumResult, then syncs the archive to `S3_COMPENDIA_BUCKET_NAME`.

    Returns `job_context` with `result` set and `success` set to True. If the
    processor lookup raises, the return value of
    `utils.handle_processor_exception` is returned instead. Raises
    `utils.ProcessorJobError` when the S3 sync reports failure.
    """
    archive_path = job_context["archive_path"]
    organism = _get_organisms(job_context["samples"]).first()
    version = _get_next_compendia_version(organism)

    result = ComputationalResult()
    result.commands.append(" ".join(job_context["formatted_command"]))
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]

    processor_key = "CREATE_QUANTPENDIA"
    try:
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)
    result.save()

    # The computed file describing the archive on disk.
    archive_file = ComputedFile()
    archive_file.absolute_file_path = archive_path
    archive_file.filename = FileUtils.get_filename(archive_path)
    archive_file.calculate_sha1()
    archive_file.calculate_size()
    archive_file.is_smashable = False
    archive_file.is_qn_target = False
    archive_file.result = result
    archive_file.is_compendia = True
    archive_file.quant_sf_only = True
    archive_file.compendia_organism = organism
    archive_file.compendia_version = version
    archive_file.save()

    # The compendium-level record tied to the same result.
    compendium = CompendiumResult()
    compendium.quant_sf_only = True
    compendium.result = result
    compendium.primary_organism = organism
    compendium.compendium_version = version
    compendium.save()

    logger.info(
        "Quantpendia created! Uploading to S3.",
        job_id=job_context["job_id"],
        archive_path=archive_path,
        organism_name=organism.name,
        **get_process_stats()
    )

    # Upload the result to S3; the key is <organism>_<version>_<unix time>.zip
    timestamp = str(int(time.time()))
    s3_key = "_".join((organism.name, str(version), timestamp)) + ".zip"

    if not archive_file.sync_to_s3(S3_COMPENDIA_BUCKET_NAME, s3_key):
        raise utils.ProcessorJobError(
            "Failed to upload compendia to S3",
            success=False,
            computed_file_id=archive_file.id,
        )

    if settings.RUNNING_IN_CLOUD:
        archive_file.delete_local_file()

    job_context["result"] = result
    job_context["success"] = True
    return job_context
def get_extension(self):
    """ Return the extension of `self.filename`.

    Delegates to `FileUtils.get_extension`, which is expected to produce
    the lowercased extension.
    Thanks to https://stackoverflow.com/a/541408/763705
    """
    extension = FileUtils.get_extension(self.filename)
    return extension