def prepare_illumina_job(organism):
    """Build the database fixtures for an ILLUMINA_TO_PCL processor job.

    Saves a ProcessorJob, one OriginalFile flagged as downloaded and linked
    to that job, and twelve samples (each with a description annotation)
    linked to the file. The last sample is renamed afterwards.

    Args:
        organism: value assigned to the ``organism`` of every created sample.

    Returns:
        The saved ProcessorJob.
    """

    def _store(record, **fields):
        """Set each field on ``record`` in order, save it and return it."""
        for field_name, field_value in fields.items():
            setattr(record, field_name, field_value)
        record.save()
        return record

    job = _store(ProcessorJob(), pipeline_applied="ILLUMINA_TO_PCL")

    raw_file = _store(
        OriginalFile(),
        source_filename="ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE22nnn/GSE22427/suppl/GSE22427%5Fnon%2Dnormalized%2Etxt.gz",
        filename="GSE22427_non-normalized.txt",
        absolute_file_path="/home/user/data_store/raw/TEST/ILLUMINA/GSE22427_non-normalized.txt",
        is_downloaded=True,
    )
    _store(
        ProcessorJobOriginalFileAssociation(),
        original_file=raw_file,
        processor_job=job,
    )

    # Each name doubles as the sample's accession code and its title.
    sample_titles = (
        "LV-C&si-Control-1",
        "LV-C&si-Control-2",
        "LV-C&si-Control-3",
        "LV-C&si-EZH2-1",
        "LV-C&si-EZH2-2",
        "LV-C&si-EZH2-3",
        "LV-EZH2&si-EZH2-1",
        "LV-EZH2&si-EZH2-2",
        "LV-EZH2&si-EZH2-3",
        "LV-T350A&si-EZH2-1",
        "LV-T350A&si-EZH2-2",
        "LV-T350A&si-EZH2-3",
    )
    for sample_title in sample_titles:
        new_sample = _store(
            Sample(),
            accession_code=sample_title,
            title=sample_title,
            organism=organism,
        )
        _store(
            SampleAnnotation(),
            sample=new_sample,
            data={"description": [sample_title]},
            is_ccdl=False,
        )
        _store(
            OriginalFileSampleAssociation(),
            original_file=raw_file,
            sample=new_sample,
        )

    # Rename the last sample; its annotation still carries the original name.
    # NOTE(review): presumably this forces a lookup through the description
    # annotation rather than the title -- confirm against the calling test.
    renamed_sample = Sample.objects.get(title="LV-T350A&si-EZH2-3")
    _store(
        renamed_sample,
        title="ignoreme_for_description",
        accession_code="ignoreme_for_description",
    )

    return job
def prepare_illumina_job(job_info: Dict) -> ProcessorJob:
    """Build the database fixtures for an ILLUMINA_TO_PCL job from a spec.

    Args:
        job_info: dict read with the keys ``source_filename``, ``filename``,
            ``absolute_file_path``, ``organism`` and ``samples``. Each entry
            of ``samples`` is one of:

            * a ``str`` -- used as both accession code and title;
            * a ``(accession_code, title)`` tuple of two ``str``;
            * a ``(accession_code, title, annotation)`` tuple whose last
              element is a ``dict`` stored as the sample annotation data.

    Returns:
        The saved ProcessorJob.

    Raises:
        ValueError: if a sample entry matches none of the shapes above.
    """

    def _store(record, **fields):
        """Set each field on ``record`` in order, save it and return it."""
        for field_name, field_value in fields.items():
            setattr(record, field_name, field_value)
        record.save()
        return record

    job = _store(ProcessorJob(), pipeline_applied="ILLUMINA_TO_PCL")

    raw_file = _store(
        OriginalFile(),
        source_filename=job_info["source_filename"],
        filename=job_info["filename"],
        absolute_file_path=job_info["absolute_file_path"],
        is_downloaded=True,
    )
    _store(
        ProcessorJobOriginalFileAssociation(),
        original_file=raw_file,
        processor_job=job,
    )

    for s in job_info["samples"]:
        # Exact element types of a tuple entry; None for anything that is
        # not exactly a tuple.
        shape = tuple(map(type, s)) if type(s) is tuple else None

        annotation = None
        if type(s) is str:
            # For convenience a bare string serves as both title and accession.
            accession_code = title = s
        elif shape == (str, str):
            accession_code, title = s
        elif shape == (str, str, dict):
            accession_code, title, annotation = s
        else:
            raise ValueError(f"Invalid sample type for sample {s}")

        new_sample = _store(
            Sample(),
            accession_code=accession_code,
            title=title,
            organism=job_info["organism"],
        )

        # Without an explicit annotation, fall back to describing the sample
        # by its title.
        if annotation is None:
            annotation_data = {"description": [title]}
        else:
            annotation_data = annotation
        _store(
            SampleAnnotation(),
            sample=new_sample,
            data=annotation_data,
            is_ccdl=False,
        )

        _store(
            OriginalFileSampleAssociation(),
            original_file=raw_file,
            sample=new_sample,
        )

    return job
def setUp(self):
    """Populate the test database with the fixtures used by the tests.

    Creates two experiments, 29 organisms, two processed samples, a QN
    target result for the samples' organism, file/job associations for the
    second sample, and a short transcriptome index. ``self.experiment``,
    ``self.sample``, ``self.homo_sapiens`` and ``self.danio_rerio`` are
    kept on the test case.
    """
    # Saving this for if we have protected endpoints
    # self.superuser = User.objects.create_superuser('john', '*****@*****.**', 'johnpassword')
    # self.client.login(username='******', password='******')
    # self.user = User.objects.create(username="******")

    def _store(record, **fields):
        """Set each field on ``record`` in order, save it and return it."""
        for field_name, field_value in fields.items():
            setattr(record, field_name, field_value)
        record.save()
        return record

    _store(
        Experiment(),
        accession_code="GSE000",
        alternate_accession_code="E-GEOD-000",
        title="NONONONO",
        description="Boooooourns. Wasabi.",
        technology="RNA-SEQ",
    )

    main_experiment = _store(
        Experiment(),
        accession_code="GSE123",
        title="Hey Ho Let's Go",
        description="This is a very exciting test experiment. Faygo soda. Blah blah blah.",
        technology="MICROARRAY",
    )
    self.experiment = main_experiment

    _store(
        ExperimentAnnotation(),
        data={"hello": "world", "123": 456},
        experiment=main_experiment,
    )

    # Create 26 test organisms numbered 0-25 for pagination test, so there
    # should be 29 organisms total (with the 3 others below)
    for index in range(26):
        _store(
            Organism(
                name="TEST_ORGANISM_{}".format(index),
                taxonomy_id=1234 + index,
            )
        )

    panda = _store(
        Organism(
            name="AILUROPODA_MELANOLEUCA",
            taxonomy_id=9646,
            is_scientific_name=True,
        )
    )
    self.homo_sapiens = _store(
        Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    )
    self.danio_rerio = _store(
        Organism(name="DANIO_RERIO", taxonomy_id=1337, is_scientific_name=True)
    )

    _store(
        Sample(),
        title="123",
        accession_code="123",
        is_processed=True,
        organism=panda,
    )
    main_sample = _store(
        Sample(),
        title="789",
        accession_code="789",
        is_processed=True,
        organism=panda,
    )
    self.sample = main_sample

    # add qn target for sample organism
    qn_result = ComputationalResult()
    qn_result.commands.append("create_qn_target.py")
    _store(qn_result, is_ccdl=True, is_public=True, processor=None)

    _store(
        ComputationalResultAnnotation(),
        result=qn_result,
        data={"organism_id": panda.id, "is_qn": True},
    )

    _store(panda, qn_target=qn_result)

    _store(
        SampleAnnotation(),
        data={"goodbye": "world", "789": 123},
        sample=main_sample,
    )

    raw_file = _store(OriginalFile())
    _store(
        OriginalFileSampleAssociation(),
        sample=main_sample,
        original_file=raw_file,
    )

    download_job = _store(DownloaderJob())
    _store(
        DownloaderJobOriginalFileAssociation(),
        original_file=raw_file,
        downloader_job=download_job,
    )

    process_job = _store(ProcessorJob())
    _store(
        ProcessorJobOriginalFileAssociation(),
        original_file=raw_file,
        processor_job=process_job,
    )

    _store(
        ExperimentSampleAssociation(),
        sample=main_sample,
        experiment=main_experiment,
    )
    _store(main_experiment, num_total_samples=1, num_processed_samples=1)

    # Two otherwise-empty results, both attached to the main sample.
    for _ in range(2):
        blank_result = _store(ComputationalResult())
        _store(
            SampleResultAssociation(),
            sample=main_sample,
            result=blank_result,
        )

    salmon = _store(
        Processor(),
        name="Salmon Quant",
        version="v9.9.9",
        docker_image="dr_salmon",
        environment='{"some": "environment"}',
    )

    short_index_result = _store(ComputationalResult(processor=salmon))

    _store(
        OrganismIndex(),
        index_type="TRANSCRIPTOME_SHORT",
        organism=self.danio_rerio,
        result=short_index_result,
        absolute_directory_path="/home/user/data_store/salmon_tests/TRANSCRIPTOME_INDEX/SHORT",
        is_public=True,
        s3_url="not_blank",
    )
def _convert_illumina_genes(job_context: Dict) -> Dict:
    """Convert to Ensembl genes if we can.

    Runs ``detect_database.R`` once per candidate Illumina platform for the
    organism of the first sample, keeps the platform with the highest
    detection score, records the outcome as a SampleAnnotation on every
    sample, and then runs ``gene_convert_illumina.R`` with that platform.

    Args:
        job_context: pipeline context. Reads ``samples``,
            ``input_file_path``, ``output_file_path``, ``job`` and the
            optional ``column_name`` (defaults to "Reporter Identifier").
            Writes ``script_name`` and ``success``; on failure also sets
            ``failure_reason`` and ``no_retry`` on ``job_context["job"]``.

    Returns:
        The same ``job_context`` with ``success`` set to True or False.

    Raises:
        KeyError: if the first sample's organism has no known platforms.
        IndexError: if ``job_context["samples"]`` is empty.
    """

    def _fail(message: str) -> Dict:
        """Log ``message`` and mark the job as failed without retry."""
        logger.error(message, job_context=job_context)
        job_context["job"].failure_reason = message
        job_context["success"] = False
        job_context["job"].no_retry = True
        return job_context

    all_databases = {
        "HOMO_SAPIENS": [
            "illuminaHumanv1",
            "illuminaHumanv2",
            "illuminaHumanv3",
            "illuminaHumanv4",
        ],
        "MUS_MUSCULUS": [
            "illuminaMousev1",
            "illuminaMousev1p1",
            "illuminaMousev2",
        ],
        "RATTUS_NORVEGICUS": ["illuminaRatv1"],
    }

    sample0 = job_context["samples"][0]
    databases = all_databases[sample0.organism.name]

    # Loop over all of the possible platforms and find the one with the best match.
    highest = 0.0
    high_mapped_percent = 0.0
    high_db = None
    for platform in databases:
        try:
            result = subprocess.check_output([
                "/usr/bin/Rscript",
                "--vanilla",
                "/home/user/data_refinery_workers/processors/detect_database.R",
                "--platform",
                platform,
                "--inputFile",
                job_context["input_file_path"],
                "--column",
                job_context.get("column_name", "Reporter Identifier"),
            ])

            # The first output line is read as the detection score and the
            # second as the mapped percentage.
            results = result.decode().split("\n")
            cleaned_result = float(results[0].strip())

            if cleaned_result > highest:
                highest = cleaned_result
                high_db = platform
                high_mapped_percent = float(results[1].strip())
        except Exception:
            # Best effort: a platform that cannot be scored is skipped.
            logger.exception("Could not detect database for file!",
                             platform=platform,
                             job_context=job_context)
            continue

    # Record our sample detection outputs for every sample.
    for sample in job_context["samples"]:
        sa = SampleAnnotation()
        sa.sample = sample
        sa.data = {
            "detected_platform": high_db,
            "detection_percentage": highest,
            "mapped_percentage": high_mapped_percent,
        }
        sa.save()

    job_context["script_name"] = "gene_convert_illumina.R"

    # No platform scored above zero (or every detection run failed). Passing
    # None in the argument list would only raise a TypeError that used to be
    # reported as an "error in R code", so fail here with an accurate reason.
    if high_db is None:
        return _fail(
            "Could not detect an Illumina platform for {0}, so {1} cannot be run.".format(
                job_context["input_file_path"], job_context["script_name"]))

    try:
        subprocess.check_output(
            [
                "/usr/bin/Rscript",
                "--vanilla",
                "/home/user/data_refinery_workers/processors/" + job_context["script_name"],
                "--platform",
                high_db,
                "--inputFile",
                job_context["input_file_path"],
                "--outputFile",
                job_context["output_file_path"],
            ],
            stderr=subprocess.PIPE,
        )
    except subprocess.CalledProcessError as e:
        error_template = "Status code {0} from {1}: {2}"
        return _fail(
            error_template.format(e.returncode, job_context["script_name"], e.stderr))
    except Exception as e:
        error_template = ("Encountered error in R code while running {0}"
                          " pipeline during processing of {1}: {2}")
        return _fail(
            error_template.format(job_context["script_name"],
                                  job_context["input_file_path"], str(e)))

    job_context["success"] = True
    return job_context
def create_experiment_and_samples_from_api(
        self, experiment_accession_code) -> (Experiment, List[Sample]):
    """ The main surveyor - find the Experiment and Samples from NCBI GEO.

    Uses the GEOParse library, for which docs can be found here:
    https://geoparse.readthedocs.io/en/latest/usage.html#working-with-geo-objects

    The experiment is fetched with ``GEOparse.get_GEO`` into
    ``self.get_temp_path()``. An existing Experiment with this accession
    code is reused; otherwise one is created together with an
    ExperimentAnnotation holding the GEO metadata.

    For every sample of the series:

    * if a Sample with that accession code already exists, only its
      experiment and organism associations are ensured;
    * otherwise a Sample and SampleAnnotation are created and each of its
      supplementary file URLs (except the skipped suffixes below) is
      registered as an OriginalFile associated with the sample.

    Experiment-level supplementary files and the MINiML family file are
    then registered and associated with the newly created samples, and
    deleted again if no sample ends up associated with them.

    Args:
        experiment_accession_code: accession code passed to GEOparse and
            used to look up or create the Experiment.

    Returns:
        A tuple ``(experiment_object, created_samples)`` where
        ``created_samples`` holds only the samples created by this call
        whose technology is not "RNA-SEQ".
    """
    # Cleaning up is tracked here: https://github.com/guma44/GEOparse/issues/41
    gse = GEOparse.get_GEO(experiment_accession_code,
                           destdir=self.get_temp_path(),
                           how="brief",
                           silent=True)
    preprocessed_samples = harmony.preprocess_geo(gse.gsms.items())
    harmonized_samples = harmony.harmonize(preprocessed_samples)

    # Create the experiment object
    try:
        experiment_object = Experiment.objects.get(
            accession_code=experiment_accession_code)
        logger.debug(
            "Experiment %s already exists, skipping object creation.",
            experiment_accession_code,
            survey_job=self.survey_job.id,
        )
    except Experiment.DoesNotExist:
        experiment_object = Experiment()
        experiment_object.accession_code = experiment_accession_code
        GeoSurveyor._apply_metadata_to_experiment(experiment_object, gse)
        experiment_object.save()

        experiment_annotation = ExperimentAnnotation()
        experiment_annotation.data = gse.metadata
        experiment_annotation.experiment = experiment_object
        experiment_annotation.is_ccdl = False
        experiment_annotation.save()

    # Okay, here's the situation!
    # Sometimes, samples have a direct single representation for themselves.
    # Other times, there is a single file with references to every sample in it.
    created_samples = []
    for sample_accession_code, sample in gse.gsms.items():

        try:
            sample_object = Sample.objects.get(
                accession_code=sample_accession_code)
            logger.debug(
                "Sample %s from experiment %s already exists, skipping object creation.",
                sample_accession_code,
                experiment_object.accession_code,
                survey_job=self.survey_job.id,
            )

            # Associate it with the experiment, but since it
            # already exists it already has original files
            # associated with it and it's already been downloaded,
            # so don't add it to created_samples.
            ExperimentSampleAssociation.objects.get_or_create(
                experiment=experiment_object, sample=sample_object)

            ExperimentOrganismAssociation.objects.get_or_create(
                experiment=experiment_object, organism=sample_object.organism)
        except Sample.DoesNotExist:
            organism = Organism.get_object_for_name(
                sample.metadata["organism_ch1"][0].upper())

            sample_object = Sample()
            sample_object.source_database = "GEO"
            sample_object.accession_code = sample_accession_code
            sample_object.organism = organism

            # If data processing step, it isn't raw.
            sample_object.has_raw = not sample.metadata.get(
                "data_processing", None)

            ExperimentOrganismAssociation.objects.get_or_create(
                experiment=experiment_object, organism=organism)
            sample_object.title = sample.metadata["title"][0]

            self.set_platform_properties(sample_object, sample.metadata, gse)

            # The harmonized metadata is looked up by sample title; a title
            # missing from harmonized_samples raises KeyError here.
            GeoSurveyor._apply_harmonized_metadata_to_sample(
                sample_object, harmonized_samples[sample_object.title])

            # Sample-level protocol_info
            sample_object.protocol_info = self.get_sample_protocol_info(
                sample.metadata, sample_accession_code)

            sample_object.save()
            logger.debug("Created Sample: " + str(sample_object))

            sample_annotation = SampleAnnotation()
            sample_annotation.sample = sample_object
            sample_annotation.data = sample.metadata
            sample_annotation.is_ccdl = False
            sample_annotation.save()

            sample_supplements = sample.metadata.get(
                "supplementary_file", [])
            for supplementary_file_url in sample_supplements:

                # Why do they give us this?
                # "NONE" stops processing of all remaining supplements.
                if supplementary_file_url == "NONE":
                    break

                # We never want these!
                if "idat.gz" in supplementary_file_url.lower():
                    continue
                if "chp.gz" in supplementary_file_url.lower():
                    continue
                if "ndf.gz" in supplementary_file_url.lower():
                    continue
                if "pos.gz" in supplementary_file_url.lower():
                    continue
                if "pair.gz" in supplementary_file_url.lower():
                    continue
                if "gff.gz" in supplementary_file_url.lower():
                    continue

                # Sometimes, we are lied to about the data processing step.
                lower_file_url = supplementary_file_url.lower()
                if (".cel" in lower_file_url
                        or ("_non_normalized.txt" in lower_file_url)
                        or ("_non-normalized.txt" in lower_file_url)
                        or ("-non-normalized.txt" in lower_file_url)
                        or ("-non_normalized.txt" in lower_file_url)):
                    sample_object.has_raw = True
                    sample_object.save()

                # filename and source_filename are the same for these
                filename = FileUtils.get_filename(supplementary_file_url)
                original_file = OriginalFile.objects.get_or_create(
                    source_url=supplementary_file_url,
                    filename=filename,
                    source_filename=filename,
                    has_raw=sample_object.has_raw,
                    is_archive=FileUtils.is_archive(filename),
                )[0]

                logger.debug("Created OriginalFile: " + str(original_file))

                # NOTE(review): this name is assigned but not read again in
                # this function; only the get_or_create side effect matters.
                original_file_sample_association = OriginalFileSampleAssociation.objects.get_or_create(
                    original_file=original_file, sample=sample_object)

                if original_file.is_affy_data():
                    # Only Affymetrix Microarrays produce .CEL files
                    sample_object.technology = "MICROARRAY"
                    sample_object.manufacturer = "AFFYMETRIX"
                    sample_object.save()

            # It's okay to survey RNA-Seq samples from GEO, but we
            # don't actually want to download/process any RNA-Seq
            # data unless it comes from SRA.
            if sample_object.technology != "RNA-SEQ":
                created_samples.append(sample_object)

            # Now that we've determined the technology at the
            # sample level, we can set it at the experiment level,
            # just gotta make sure to only do it once. There can
            # be more than one technology, this should be changed
            # as part of:
            # https://github.com/AlexsLemonade/refinebio/issues/1099
            if not experiment_object.technology:
                experiment_object.technology = sample_object.technology
                experiment_object.save()

            ExperimentSampleAssociation.objects.get_or_create(
                experiment=experiment_object, sample=sample_object)

    # These supplementary files _may-or-may-not_ contain the type of raw data we can process.
    for experiment_supplement_url in gse.metadata.get(
            "supplementary_file", []):

        # filename and source_filename are the same for these
        filename = experiment_supplement_url.split("/")[-1]
        # NOTE(review): `sample_object` here is whatever the loops above last
        # bound it to (and it is rebound by the inner loop below on later
        # iterations). If gse.gsms is empty it is unbound and this raises.
        # Confirm which sample's has_raw is intended.
        original_file = OriginalFile.objects.get_or_create(
            source_url=experiment_supplement_url,
            filename=filename,
            source_filename=filename,
            has_raw=sample_object.has_raw,
            is_archive=True,
        )[0]

        logger.debug("Created OriginalFile: " + str(original_file))

        lower_supplement_url = experiment_supplement_url.lower()
        if (("_non_normalized.txt" in lower_supplement_url)
                or ("_non-normalized.txt" in lower_supplement_url)
                or ("-non-normalized.txt" in lower_supplement_url)
                or ("-non_normalized.txt" in lower_supplement_url)):
            # A non-normalized file marks every newly created sample as
            # having raw data and is attached to each of them.
            for sample_object in created_samples:
                sample_object.has_raw = True
                sample_object.save()

                OriginalFileSampleAssociation.objects.get_or_create(
                    sample=sample_object, original_file=original_file)

        # Delete this Original file if it isn't being used.
        if (OriginalFileSampleAssociation.objects.filter(
                original_file=original_file).count() == 0):
            original_file.delete()

    # These are the Miniml/Soft/Matrix URLs that are always(?) provided.
    # GEO describes different types of data formatting as "families"
    family_url = self.get_miniml_url(experiment_accession_code)
    # NOTE(review): same leaked `sample_object` as above -- unbound when the
    # series has no samples at all.
    miniml_original_file = OriginalFile.objects.get_or_create(
        source_url=family_url,
        source_filename=family_url.split("/")[-1],
        has_raw=sample_object.has_raw,
        is_archive=True,
    )[0]
    for sample_object in created_samples:
        # We don't need a .txt if we have a .CEL
        if sample_object.has_raw:
            continue
        OriginalFileSampleAssociation.objects.get_or_create(
            sample=sample_object, original_file=miniml_original_file)

    # Delete this Original file if it isn't being used.
    if (OriginalFileSampleAssociation.objects.filter(
            original_file=miniml_original_file).count() == 0):
        miniml_original_file.delete()

    # Trash the temp path
    try:
        shutil.rmtree(self.get_temp_path())
    except Exception:
        # There was a problem during surveying so this didn't get created.
        # It's not a big deal.
        pass

    return experiment_object, created_samples
def prepare_job():
    """Build the database fixtures for a SMASHER processor job.

    Saves one experiment (GSE51081) with two samples, three computed files
    sharing one result, and a Dataset aggregated by experiment that is
    linked to the job.

    Returns:
        The saved ProcessorJob.
    """
    pcl_directory = "/home/user/data_store/PCL/"

    def _store(record, **fields):
        """Set each field on ``record`` in order, save it and return it."""
        for field_name, field_value in fields.items():
            setattr(record, field_name, field_value)
        record.save()
        return record

    def _computed_file(name, smashable):
        """Save a 123-byte ComputedFile called ``name`` under the PCL dir."""
        return _store(
            ComputedFile(),
            filename=name,
            absolute_file_path=pcl_directory + name,
            result=shared_result,
            size_in_bytes=123,
            is_smashable=smashable,
        )

    job = _store(ProcessorJob(), pipeline_applied="SMASHER")
    experiment = _store(Experiment(), accession_code="GSE51081")
    shared_result = _store(ComputationalResult())

    human = Organism.get_object_for_name("HOMO_SAPIENS")

    # First sample: annotation, result, experiment, then its own PCL file.
    first_sample = _store(
        Sample(),
        accession_code="GSM1237810",
        title="GSM1237810",
        organism=human,
    )
    _store(SampleAnnotation(), data={"hi": "friend"}, sample=first_sample)
    _store(SampleResultAssociation(), sample=first_sample, result=shared_result)
    _store(ExperimentSampleAssociation(), experiment=experiment, sample=first_sample)

    first_pcl = _computed_file("GSM1237810_T09-1084.PCL", True)
    _store(
        SampleComputedFileAssociation(),
        sample=first_sample,
        computed_file=first_pcl,
    )

    # Second sample: note it is linked to the FIRST sample's PCL file before
    # its own files are created.
    second_sample = _store(
        Sample(),
        accession_code="GSM1237812",
        title="GSM1237812",
        organism=human,
    )
    _store(ExperimentSampleAssociation(), experiment=experiment, sample=second_sample)
    _store(
        SampleComputedFileAssociation(),
        sample=second_sample,
        computed_file=first_pcl,
    )
    _store(SampleResultAssociation(), sample=second_sample, result=shared_result)

    # This smashable file is saved but never associated with a sample.
    _computed_file("GSM1237812_S97-PURE.PCL", True)

    non_smashable = _computed_file("GSM1237812_S97-PURE.DAT", False)
    _store(
        SampleComputedFileAssociation(),
        sample=second_sample,
        computed_file=non_smashable,
    )

    dataset = _store(
        Dataset(),
        data={"GSE51081": ["GSM1237810", "GSM1237812"]},
        aggregate_by="EXPERIMENT",  # [ALL or SPECIES or EXPERIMENT]
        scale_by="STANDARD",  # [NONE or MINMAX or STANDARD or ROBUST]
        email_address="*****@*****.**",
        quantile_normalize=False,
    )
    _store(ProcessorJobDatasetAssociation(), processor_job=job, dataset=dataset)

    return job
def test_bad_overlap(self):
    """Smash two samples, then add a third file and check the frame size.

    The first smash covers the samples backed by ``big.PCL`` and
    ``small.PCL``. A third sample backed by ``bad.PCL`` is then added to a
    new dataset and smashed again; the final frame must still have 4 rows.
    """
    smash_directory = "/home/user/data_store/BADSMASH/"
    email = "*****@*****.**"

    def _store(record, **fields):
        """Set each field on ``record`` in order, save it and return it."""
        for field_name, field_value in fields.items():
            setattr(record, field_name, field_value)
        record.save()
        return record

    def _smashable_file(name):
        """Save a smashable 123-byte ComputedFile under the BADSMASH dir."""
        return _store(
            ComputedFile(),
            filename=name,
            absolute_file_path=smash_directory + name,
            result=shared_result,
            size_in_bytes=123,
            is_smashable=True,
        )

    def _unscaled_dataset(accession_codes):
        """Save a Dataset over GSE51081 aggregated by ALL with no scaling."""
        return _store(
            Dataset(),
            data={"GSE51081": accession_codes},
            aggregate_by="ALL",  # [ALL or SPECIES or EXPERIMENT]
            scale_by="NONE",  # [NONE or MINMAX or STANDARD or ROBUST]
            email_address=email,
            quantile_normalize=False,
        )

    job = _store(ProcessorJob(), pipeline_applied="SMASHER")
    experiment = _store(Experiment(), accession_code="GSE51081")
    shared_result = _store(ComputationalResult())

    human = Organism.get_object_for_name("HOMO_SAPIENS")

    first_sample = _store(
        Sample(),
        accession_code="GSM1237810",
        title="GSM1237810",
        organism=human,
    )
    _store(SampleAnnotation(), data={"hi": "friend"}, sample=first_sample)
    _store(SampleResultAssociation(), sample=first_sample, result=shared_result)
    _store(ExperimentSampleAssociation(), experiment=experiment, sample=first_sample)

    big_file = _smashable_file("big.PCL")
    _store(
        SampleComputedFileAssociation(),
        sample=first_sample,
        computed_file=big_file,
    )

    # The second sample is linked to big.PCL as well as its own small.PCL.
    second_sample = _store(
        Sample(),
        accession_code="GSM1237812",
        title="GSM1237812",
        organism=human,
    )
    _store(ExperimentSampleAssociation(), experiment=experiment, sample=second_sample)
    _store(
        SampleComputedFileAssociation(),
        sample=second_sample,
        computed_file=big_file,
    )
    _store(SampleResultAssociation(), sample=second_sample, result=shared_result)

    small_file = _smashable_file("small.PCL")
    _store(
        SampleComputedFileAssociation(),
        sample=second_sample,
        computed_file=small_file,
    )

    dataset = _unscaled_dataset(["GSM1237810", "GSM1237812"])
    _store(ProcessorJobDatasetAssociation(), processor_job=job, dataset=dataset)

    final_context = smasher.smash(job.pk, upload=False)
    dataset = Dataset.objects.get(id=dataset.id)

    job = _store(ProcessorJob(), pipeline_applied="SMASHER")

    # Now, make sure the bad can't zero this out.
    third_sample = _store(
        Sample(),
        accession_code="GSM999",
        title="GSM999",
        organism=human,
    )
    _store(ExperimentSampleAssociation(), experiment=experiment, sample=third_sample)
    # Linked to small.PCL first, then to its own bad.PCL.
    _store(
        SampleComputedFileAssociation(),
        sample=third_sample,
        computed_file=small_file,
    )
    _store(SampleResultAssociation(), sample=third_sample, result=shared_result)

    bad_file = _smashable_file("bad.PCL")
    _store(
        SampleComputedFileAssociation(),
        sample=third_sample,
        computed_file=bad_file,
    )

    dataset = _unscaled_dataset(["GSM1237810", "GSM1237812", "GSM999"])
    _store(ProcessorJobDatasetAssociation(), processor_job=job, dataset=dataset)

    final_context = smasher.smash(job.pk, upload=False)
    dataset = Dataset.objects.get(id=dataset.id)

    self.assertEqual(len(final_context["final_frame"]), 4)
def create_samples_from_api(self, experiment: Experiment,
                            platform_dict: Dict) -> List[Sample]:
    """Generates a Sample item for each sample in an AE experiment.

    There are many possible data situations for a sample:

        - If the sample only has raw data available:
            - If it is on a platform that we support:
                Download this raw data and process it
            - If it is not on a platform we support:
                Don't download anything, don't process anything
        - If the sample has both raw and derived data:
            - If the raw data is on a platform we support:
                Download the raw data and process it, abandon the derived data
            - If the raw data is not on a platform we support
                Download the derived data and no-op it, abandon the raw data
        - If the sample only has derived data:
            Download the derived data and no-op it.

    See an example at: https://www.ebi.ac.uk/arrayexpress/json/v3/experiments/E-MTAB-3050/samples

    Args:
        experiment: the experiment whose samples are surveyed. Its
            ``accession_code``, ``protocol_description`` and ``source_url``
            are read.
        platform_dict: read with the keys ``platform_accession_name``,
            ``platform_accession_code`` and ``manufacturer``.

    Returns:
        The samples created by this call. Samples that already existed only
        get their protocol info and associations updated and are not
        included.
    """
    created_samples = []

    samples_endpoint = SAMPLES_URL.format(experiment.accession_code)
    r = utils.requests_retry_session().get(samples_endpoint, timeout=60)
    samples = r.json()["experiment"]["sample"]

    # The SDRF is the complete metadata record on a sample/property basis.
    # We run this through our harmonizer and then attach the properties
    # to our created samples.
    SDRF_URL_TEMPLATE = "https://www.ebi.ac.uk/arrayexpress/files/{code}/{code}.sdrf.txt"
    sdrf_url = SDRF_URL_TEMPLATE.format(code=experiment.accession_code)
    sdrf_samples = harmony.parse_sdrf(sdrf_url)
    harmonized_samples = harmony.harmonize(sdrf_samples)

    # An experiment can have many samples
    for sample_data in samples:

        # For some reason, this sample has no files associated with it.
        if not sample_data.get("file"):
            continue

        # Each sample is given an experimentally-unique title.
        flat_sample = utils.flatten(sample_data)
        title = harmony.extract_title(flat_sample)

        # A sample may actually have many sub files.
        # If there is raw data, take that.
        # If not, take the derived.
        has_raw = False
        for sub_file in sample_data['file']:

            # For ex: E-GEOD-15645
            # NOTE: this collapses a list of comments to its first entry IN
            # PLACE, so the download loop below sees the collapsed comment.
            if isinstance(sub_file['comment'], list):
                sub_file_mod = sub_file
                sub_file_mod['comment'] = sub_file['comment'][0]
            else:
                sub_file_mod = sub_file

            # Some have the 'data' field, but not the actual data
            # Ex: E-GEOD-9656
            if (sub_file_mod['type'] == "data"
                    and sub_file_mod['comment'].get('value', None) is not None):
                has_raw = True
            if 'raw' in sub_file_mod['comment'].get('value', ''):
                has_raw = True

        # Don't get the raw data if it's only a 1-color sample.
        # (Checked once: it does not depend on the individual sub file.)
        sample_text = str(sample_data)
        if 'Cy3' in sample_text and 'Cy5' not in sample_text:
            has_raw = False

        # Start each sample from a clean slate so a sample whose sub files
        # are all skipped can never reuse the previous sample's file.
        download_url = None
        filename = None
        skip_sample = False
        for sub_file in sample_data['file']:

            # Skip derived data if we have it raw.
            if has_raw and "derived data" in sub_file['type']:
                continue

            download_url = None
            filename = sub_file["name"]

            # sub_file["comment"] is only a list if there's
            # more than one comment...
            comments = sub_file["comment"]
            if isinstance(comments, list):
                # Could be: "Derived ArrayExpress Data Matrix FTP
                # file" or: "ArrayExpress FTP file". If there is
                # no comment with a name including "FTP file" then
                # we don't know where to download it, so the sample
                # is skipped below.
                for comment in comments:
                    if "FTP file" in comment["name"]:
                        download_url = comment["value"]
                        break
            else:
                download_url = comments["value"]

            # The accession code is only determined further down, so the
            # sample is identified by its title in these messages.
            if not download_url:
                logger.error(
                    "Sample %s did not specify a download url, skipping.",
                    title,
                    experiment_accession_code=experiment.accession_code,
                    survey_job=self.survey_job.id,
                    sub_file=sub_file)
                skip_sample = True
                continue

            if not filename:
                logger.error(
                    "Sample %s did not specify a filename, skipping.",
                    title,
                    experiment_accession_code=experiment.accession_code,
                    survey_job=self.survey_job.id,
                    sub_file=sub_file)
                skip_sample = True
                continue

        if skip_sample:
            continue

        # Every sub file was skipped as derived data, so there is nothing to
        # download for this sample.
        if not download_url or not filename:
            logger.error(
                "Sample %s had no usable files, skipping.",
                title,
                experiment_accession_code=experiment.accession_code,
                survey_job=self.survey_job.id)
            continue

        # The accession code is not a simple matter to determine.
        sample_source_name = sample_data["source"].get("name", "")
        sample_assay_name = sample_data["assay"].get("name", "")
        sample_accession_code = self.determine_sample_accession(
            experiment.accession_code, sample_source_name,
            sample_assay_name, filename)

        # Figure out the Organism for this sample
        organism_name = UNKNOWN
        for characteristic in sample_data["characteristic"]:
            if characteristic["category"].upper() == "ORGANISM":
                organism_name = characteristic["value"].upper()

        if organism_name == UNKNOWN:
            logger.error(
                "Sample %s did not specify the organism name.",
                sample_accession_code,
                experiment_accession_code=experiment.accession_code,
                survey_job=self.survey_job.id)
            continue

        organism = Organism.get_object_for_name(organism_name)

        # Create the sample object
        try:
            # Associate it with the experiment, but since it
            # already exists it already has original files
            # associated with it and it's already been downloaded,
            # so don't add it to created_samples.
            sample_object = Sample.objects.get(
                accession_code=sample_accession_code)

            # If input experiment includes new protocol information,
            # update sample's protocol_info.
            existing_protocols = sample_object.protocol_info
            protocol_info, is_updated = self.update_sample_protocol_info(
                existing_protocols, experiment.protocol_description,
                experiment.source_url + '/protocols')
            if is_updated:
                sample_object.protocol_info = protocol_info
                sample_object.save()

            logger.debug(
                "Sample %s already exists, skipping object creation.",
                sample_accession_code,
                experiment_accession_code=experiment.accession_code,
                survey_job=self.survey_job.id)
        except Sample.DoesNotExist:
            sample_object = Sample()

            # The basics
            sample_object.source_database = "ARRAY_EXPRESS"
            sample_object.title = title
            sample_object.accession_code = sample_accession_code
            sample_object.source_archive_url = samples_endpoint
            sample_object.organism = organism
            sample_object.platform_name = platform_dict[
                "platform_accession_name"]
            sample_object.platform_accession_code = platform_dict[
                "platform_accession_code"]
            sample_object.manufacturer = platform_dict["manufacturer"]
            sample_object.technology = "MICROARRAY"

            protocol_info, is_updated = self.update_sample_protocol_info(
                existing_protocols=[],
                experiment_protocol=experiment.protocol_description,
                protocol_url=experiment.source_url + '/protocols')
            # Do not check is_updated the first time because we must
            # save a list so we can append to it later.
            sample_object.protocol_info = protocol_info

            sample_object.save()

            # Directly assign the harmonized properties
            harmonized_sample = harmonized_samples[title]
            for key, value in harmonized_sample.items():
                setattr(sample_object, key, value)
            sample_object.save()

            sample_annotation = SampleAnnotation()
            sample_annotation.data = sample_data
            sample_annotation.sample = sample_object
            sample_annotation.is_ccdl = False
            sample_annotation.save()

            original_file = OriginalFile()
            original_file.filename = filename
            original_file.source_filename = filename
            original_file.source_url = download_url
            original_file.is_downloaded = False
            original_file.is_archive = True
            original_file.has_raw = has_raw
            original_file.save()

            original_file_sample_association = OriginalFileSampleAssociation()
            original_file_sample_association.original_file = original_file
            original_file_sample_association.sample = sample_object
            original_file_sample_association.save()

            created_samples.append(sample_object)

            logger.debug(
                "Created " + str(sample_object),
                experiment_accession_code=experiment.accession_code,
                survey_job=self.survey_job.id,
                sample=sample_object.id)

        # Create associations if they don't already exist
        ExperimentSampleAssociation.objects.get_or_create(
            experiment=experiment, sample=sample_object)

        ExperimentOrganismAssociation.objects.get_or_create(
            experiment=experiment, organism=organism)

    return created_samples
def test_download_aspera_and_ftp(self):
    """Tests ``geo._download_file`` over Aspera and FTP.

    Downloads the same GEO supplementary file twice, once with
    ``force_ftp=False`` and once with ``force_ftp=True``, and then checks
    that ``geo._download_file_aspera`` reports a failure for a non-GEO URL
    and records a failure reason on the downloader job.
    """
    dlj = DownloaderJob()
    dlj.accession_code = 'GSE22427'
    dlj.save()

    original_file = OriginalFile()
    original_file.source_url = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE22nnn/GSE22427/suppl/GSE22427_non-normalized.txt.gz"
    original_file.source_filename = "GSE22427_non-normalized.txt.gz"
    original_file.save()

    assoc = DownloaderJobOriginalFileAssociation()
    assoc.original_file = original_file
    assoc.downloader_job = dlj
    assoc.save()

    sample = Sample()
    sample.accession_code = 'GSE22427'
    sample.save()

    sample_annotation = SampleAnnotation()
    sample_annotation.sample = sample
    sample_annotation.data = {
        'label_protocol_ch1': 'Agilent',
        'label_protocol_ch2': 'Agilent'
    }
    sample_annotation.save()

    og_assoc = OriginalFileSampleAssociation()
    og_assoc.sample = sample
    og_assoc.original_file = original_file
    og_assoc.save()

    LOCAL_ROOT_DIR = "/home/user/data_store"
    download_dir = os.path.join(LOCAL_ROOT_DIR, sample.accession_code)
    os.makedirs(download_dir, exist_ok=True)
    # The target file keeps the last path component of the source URL.
    dl_file_path = os.path.join(download_dir,
                                original_file.source_url.split('/')[-1])

    # Aspera
    result = geo._download_file(original_file.source_url,
                                file_path=dl_file_path,
                                job=dlj,
                                force_ftp=False)
    self.assertTrue(result)
    self.assertTrue(os.path.exists(dl_file_path))
    # Remove the file so the FTP run cannot pass on a leftover download.
    os.remove(dl_file_path)

    # FTP
    result = geo._download_file(original_file.source_url,
                                file_path=dl_file_path,
                                job=dlj,
                                force_ftp=True)
    self.assertTrue(result)
    self.assertTrue(os.path.exists(dl_file_path))
    os.remove(dl_file_path)

    # Aspera, fail
    result = geo._download_file_aspera("https://rich.zone/cool_horse.jpg",
                                       target_file_path=dl_file_path,
                                       downloader_job=dlj,
                                       attempt=5)
    self.assertFalse(result)
    self.assertIsNotNone(dlj.failure_reason)
def test_download_geo(self, mock_send_task):
    """ Tests the main 'download_geo' function.

    Sets up a downloader job for a single GEO archive attached to one
    sample, runs `geo.download_geo`, and checks that:

    * the sample ends up associated with two original files, of which only
      the non-archive one is still marked as downloaded;
    * the download succeeded without a failure reason;
    * a processor job was created with the expected pipeline and RAM.

    `mock_send_task` is not used in the body; presumably it is injected by
    a patch decorator on this method.
    """
    dlj = DownloaderJob()
    dlj.accession_code = 'GSE22427'
    dlj.save()

    original_file = OriginalFile()
    original_file.filename = "GSE22427_non-normalized.txt.gz"
    original_file.source_url = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE22nnn/GSE22427/suppl/GSE22427_non-normalized.txt.gz"
    original_file.source_filename = "GSE22427_non-normalized.txt.gz"
    original_file.save()

    assoc = DownloaderJobOriginalFileAssociation()
    assoc.original_file = original_file
    assoc.downloader_job = dlj
    assoc.save()

    sample = Sample()
    sample.accession_code = 'GSE22427'
    sample.technology = "MICROARRAY"
    sample.manufacturer = "AGILENT"
    sample.has_raw = True
    # This is fake, but we don't currently support any agilent
    # platforms so we're using a platform that is supported.
    sample.platform_accession_code = "Illumina_RatRef-12_V1.0"
    sample.save()

    sample_annotation = SampleAnnotation()
    sample_annotation.sample = sample
    sample_annotation.data = {
        'label_protocol_ch1': 'Agilent',
        'label_protocol_ch2': 'Agilent',
    }
    sample_annotation.save()

    og_assoc = OriginalFileSampleAssociation()
    og_assoc.sample = sample
    og_assoc.original_file = original_file
    og_assoc.save()

    download_result = geo.download_geo(dlj.id)

    file_assocs = OriginalFileSampleAssociation.objects.filter(
        sample=sample)
    self.assertEqual(file_assocs.count(), 2)

    for file_assoc in file_assocs:
        original_file = file_assoc.original_file
        if original_file.filename.endswith(".gz"):
            # We delete the archive after we extract from it
            self.assertFalse(original_file.is_downloaded)
        else:
            self.assertTrue(original_file.is_downloaded)

    # Make sure it worked
    self.assertTrue(download_result)
    # download_geo only received the job's id, so whatever it recorded is
    # in the database, not on this local instance. Reload before checking.
    dlj.refresh_from_db()
    self.assertIsNone(dlj.failure_reason)

    self.assertTrue(ProcessorJob.objects.exists())
    # Fetch once so both assertions look at the same row.
    processor_job = ProcessorJob.objects.all()[0]
    self.assertEqual(processor_job.pipeline_applied,
                     "AGILENT_TWOCOLOR_TO_PCL")
    self.assertEqual(processor_job.ram_amount, 2048)
def _detect_platform(job_context: Dict) -> Dict:
    """
    Determine the platform/database to process this sample with.
    They often provide something like "V2" or "V 2", but we don't trust them
    so we detect it ourselves.

    Every candidate database for the organism of the first sample is scored
    by running `detect_database.R`; the first two lines of its output are
    read as the detection percentage and the mapped percentage. The
    candidate with the highest detection percentage wins.

    Related: https://github.com/AlexsLemonade/refinebio/issues/232

    Reads `samples`, `input_file_path`, `probeId`, `job`, and, on the
    NO_OP path, `job_id` and `original_files` from `job_context`.

    Returns `job_context`. When the winner's mapped percentage is above 75,
    `platform` is set to the detected database. Otherwise a NO_OP processor
    job is created and dispatched for the first original file and `abort`
    is set to True.

    Raises KeyError if the organism of the first sample has no candidate
    databases listed here.
    """
    all_databases = {
        'HOMO_SAPIENS': [
            'illuminaHumanv1',
            'illuminaHumanv2',
            'illuminaHumanv3',
            'illuminaHumanv4',
        ],
        'MUS_MUSCULUS': [
            'illuminaMousev1',
            'illuminaMousev1p1',
            'illuminaMousev2',
        ],
        'RATTUS_NORVEGICUS': ['illuminaRatv1'],
    }

    sample0 = job_context['samples'][0]
    databases = all_databases[sample0.organism.name]

    # Loop over all of the possible platforms and find the one with the best match.
    highest = 0.0
    high_mapped_percent = 0.0
    high_db = None
    for platform in databases:
        try:
            result = subprocess.check_output([
                "/usr/bin/Rscript",
                "--vanilla",
                "/home/user/data_refinery_workers/processors/detect_database.R",
                "--platform",
                platform,
                "--inputFile",
                job_context['input_file_path'],
                "--column",
                job_context['probeId'],
            ])

            results = result.decode().split('\n')
            cleaned_result = float(results[0].strip())

            if cleaned_result > highest:
                # Parse the mapped percentage BEFORE committing anything, so
                # a malformed second line cannot leave the new platform
                # paired with the previous platform's mapped percentage.
                mapped_percent = float(results[1].strip())
                highest = cleaned_result
                high_db = platform
                high_mapped_percent = mapped_percent

        except Exception as e:
            # Best effort: a candidate that cannot be scored is skipped.
            logger.exception(e, processor_job_id=job_context["job"].id)
            continue

    # Record our sample detection outputs for every sample.
    for sample in job_context['samples']:
        sa = SampleAnnotation()
        sa.sample = sample
        sa.is_ccdl = True
        sa.data = {
            "detected_platform": high_db,
            "detection_percentage": highest,
            "mapped_percentage": high_mapped_percent
        }
        sa.save()

    # If the match is over 75%, record this and process it on that platform.
    if high_mapped_percent > 75.0:
        job_context['platform'] = high_db
    # The match percentage is too low - send this to the no-opper instead.
    else:
        logger.info("Match percentage too low, NO_OP'ing and aborting.",
                    job=job_context['job_id'])

        processor_job = ProcessorJob()
        processor_job.pipeline_applied = "NO_OP"
        processor_job.volume_index = job_context["job"].volume_index
        processor_job.ram_amount = job_context["job"].ram_amount
        processor_job.save()

        assoc = ProcessorJobOriginalFileAssociation()
        assoc.original_file = job_context["original_files"][0]
        assoc.processor_job = processor_job
        assoc.save()

        try:
            send_job(ProcessorPipeline.NO_OP, processor_job)
        except Exception as e:
            # Nomad dispatch error, likely during local test.
            logger.error(e, job=processor_job)

        job_context['abort'] = True

    return job_context
def create_experiment_and_samples_from_api(
        self, experiment_accession_code) -> (Experiment, List[Sample]):
    """ The main surveyor - find the Experiment and Samples from NCBI GEO.

    Uses the GEOParse library, for which docs can be found here:
    https://geoparse.readthedocs.io/en/latest/usage.html#working-with-geo-objects

    The experiment is fetched or created from the GSE metadata. For each
    GSM a Sample is fetched, or created together with its annotation and
    the OriginalFiles for its supplementary files. Experiment-level
    supplementary files and the MINiML family file are then attached to
    the newly created samples where applicable.

    Returns a tuple of the experiment and the list of samples created by
    this call. Samples that already existed, and samples whose technology
    is 'RNA-SEQ', are not in that list.
    """
    # Supplementary file types that we never want.
    unwanted_markers = ("idat.gz", "chp.gz", "ndf.gz", "pos.gz", "pair.gz",
                        "gff.gz")
    # File name fragments that mark a non-normalized (raw) text file.
    non_normalized_markers = ('_non_normalized.txt', '_non-normalized.txt',
                              '-non-normalized.txt', '-non_normalized.txt')

    # Cleaning up is tracked here: https://github.com/guma44/GEOparse/issues/41
    gse = GEOparse.get_GEO(experiment_accession_code,
                           destdir=self.get_temp_path(),
                           how="brief",
                           silent=True)
    preprocessed_samples = harmony.preprocess_geo(gse.gsms.items())
    harmonized_samples = harmony.harmonize(preprocessed_samples)

    # Create the experiment object
    try:
        experiment_object = Experiment.objects.get(
            accession_code=experiment_accession_code)
        logger.debug(
            "Experiment %s already exists, skipping object creation.",
            experiment_accession_code,
            survey_job=self.survey_job.id)
    except Experiment.DoesNotExist:
        experiment_object = Experiment()
        experiment_object.accession_code = experiment_accession_code
        experiment_object.source_url = (
            "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=" +
            experiment_accession_code)
        experiment_object.source_database = "GEO"
        experiment_object.title = gse.metadata.get('title', [''])[0]
        experiment_object.description = gse.metadata.get('summary',
                                                         [''])[0]

        # Source doesn't provide time information, assume midnight.
        submission_date = gse.metadata["submission_date"][
            0] + " 00:00:00 UTC"
        experiment_object.source_first_published = dateutil.parser.parse(
            submission_date)
        last_updated_date = gse.metadata["last_update_date"][
            0] + " 00:00:00 UTC"
        experiment_object.source_last_updated = dateutil.parser.parse(
            last_updated_date)

        unique_institutions = list(set(gse.metadata["contact_institute"]))
        experiment_object.submitter_institution = ", ".join(
            unique_institutions)
        experiment_object.pubmed_id = gse.metadata.get("pubmed_id",
                                                       [""])[0]

        # Scrape publication title and authorship from Pubmed
        if experiment_object.pubmed_id:
            pubmed_metadata = utils.get_title_and_authors_for_pubmed_id(
                experiment_object.pubmed_id)
            experiment_object.publication_title = pubmed_metadata[0]
            experiment_object.publication_authors = pubmed_metadata[1]

        experiment_object.save()

        experiment_annotation = ExperimentAnnotation()
        experiment_annotation.data = gse.metadata
        experiment_annotation.experiment = experiment_object
        experiment_annotation.is_ccdl = False
        experiment_annotation.save()

    # Okay, here's the situation!
    # Sometimes, samples have a direct single representation for themselves.
    # Othertimes, there is a single file with references to every sample in it.
    created_samples = []
    for sample_accession_code, sample in gse.gsms.items():

        try:
            sample_object = Sample.objects.get(
                accession_code=sample_accession_code)
            logger.debug(
                "Sample %s from experiment %s already exists, skipping object creation.",
                sample_accession_code,
                experiment_object.accession_code,
                survey_job=self.survey_job.id)

            # Associate it with the experiment, but since it
            # already exists it already has original files
            # associated with it and it's already been downloaded,
            # so don't add it to created_samples.
            ExperimentSampleAssociation.objects.get_or_create(
                experiment=experiment_object, sample=sample_object)

            ExperimentOrganismAssociation.objects.get_or_create(
                experiment=experiment_object,
                organism=sample_object.organism)
        except Sample.DoesNotExist:
            organism = Organism.get_object_for_name(
                sample.metadata['organism_ch1'][0].upper())

            sample_object = Sample()
            sample_object.source_database = "GEO"
            sample_object.accession_code = sample_accession_code
            sample_object.organism = organism

            # If data processing step, it isn't raw.
            sample_object.has_raw = not sample.metadata.get(
                'data_processing', None)

            ExperimentOrganismAssociation.objects.get_or_create(
                experiment=experiment_object, organism=organism)
            sample_object.title = sample.metadata['title'][0]

            self.set_platform_properties(sample_object, sample.metadata,
                                         gse)

            # Directly assign the harmonized properties
            harmonized_sample = harmonized_samples[sample_object.title]
            for key, value in harmonized_sample.items():
                setattr(sample_object, key, value)

            # Sample-level protocol_info
            sample_object.protocol_info = self.get_sample_protocol_info(
                sample.metadata, sample_accession_code)

            sample_object.save()
            logger.debug("Created Sample: " + str(sample_object))

            sample_annotation = SampleAnnotation()
            sample_annotation.sample = sample_object
            sample_annotation.data = sample.metadata
            sample_annotation.is_ccdl = False
            sample_annotation.save()

            sample_supplements = sample.metadata.get(
                'supplementary_file', [])
            for supplementary_file_url in sample_supplements:

                # Why do they give us this?
                if supplementary_file_url == "NONE":
                    break

                lower_file_url = supplementary_file_url.lower()

                # We never want these!
                if any(marker in lower_file_url
                       for marker in unwanted_markers):
                    continue

                # Sometimes, we are lied to about the data processing step.
                if '.cel' in lower_file_url or any(
                        marker in lower_file_url
                        for marker in non_normalized_markers):
                    sample_object.has_raw = True
                    sample_object.save()

                # filename and source_filename are the same for these
                filename = supplementary_file_url.split('/')[-1]
                original_file = OriginalFile.objects.get_or_create(
                    source_url=supplementary_file_url,
                    filename=filename,
                    source_filename=filename,
                    has_raw=sample_object.has_raw,
                    is_archive=True)[0]

                logger.debug("Created OriginalFile: " + str(original_file))

                OriginalFileSampleAssociation.objects.get_or_create(
                    original_file=original_file, sample=sample_object)

                if original_file.is_affy_data():
                    # Only Affymetrix Microarrays produce .CEL files
                    sample_object.technology = 'MICROARRAY'
                    # Previously misspelled as 'AFFYMETRTIX'.
                    sample_object.manufacturer = 'AFFYMETRIX'
                    sample_object.save()

            # It's okay to survey RNA-Seq samples from GEO, but we
            # don't actually want to download/process any RNA-Seq
            # data unless it comes from SRA.
            if sample_object.technology != 'RNA-SEQ':
                created_samples.append(sample_object)

            # Now that we've determined the technology at the
            # sample level, we can set it at the experiment level,
            # just gotta make sure to only do it once. There can
            # be more than one technology, this should be changed
            # as part of:
            # https://github.com/AlexsLemonade/refinebio/issues/1099
            if not experiment_object.technology:
                experiment_object.technology = sample_object.technology
                experiment_object.save()

            ExperimentSampleAssociation.objects.get_or_create(
                experiment=experiment_object, sample=sample_object)

    # These supplementary files _may-or-may-not_ contain the type of raw data we can process.
    # NOTE(review): `sample_object` below is whatever the preceding loops
    # last bound it to (and is unbound if the GSE has no samples at all).
    # Left unchanged because it feeds the get_or_create lookup.
    for experiment_supplement_url in gse.metadata.get(
            'supplementary_file', []):

        # filename and source_filename are the same for these
        filename = experiment_supplement_url.split('/')[-1]
        original_file = OriginalFile.objects.get_or_create(
            source_url=experiment_supplement_url,
            filename=filename,
            source_filename=filename,
            has_raw=sample_object.has_raw,
            is_archive=True)[0]

        logger.debug("Created OriginalFile: " + str(original_file))

        lower_supplement_url = experiment_supplement_url.lower()
        if any(marker in lower_supplement_url
               for marker in non_normalized_markers):
            for sample_object in created_samples:
                sample_object.has_raw = True
                sample_object.save()

                OriginalFileSampleAssociation.objects.get_or_create(
                    sample=sample_object, original_file=original_file)

        # Delete this Original file if it isn't being used.
        if OriginalFileSampleAssociation.objects.filter(
                original_file=original_file).count() == 0:
            original_file.delete()

    # These are the Miniml/Soft/Matrix URLs that are always(?) provided.
    # GEO describes different types of data formatting as "families"
    family_url = self.get_miniml_url(experiment_accession_code)
    miniml_original_file = OriginalFile.objects.get_or_create(
        source_url=family_url,
        source_filename=family_url.split('/')[-1],
        has_raw=sample_object.has_raw,
        is_archive=True)[0]
    for sample_object in created_samples:
        # We don't need a .txt if we have a .CEL
        if sample_object.has_raw:
            continue
        OriginalFileSampleAssociation.objects.get_or_create(
            sample=sample_object, original_file=miniml_original_file)

    # Delete this Original file if it isn't being used.
    if OriginalFileSampleAssociation.objects.filter(
            original_file=miniml_original_file).count() == 0:
        miniml_original_file.delete()

    # Trash the temp path
    try:
        shutil.rmtree(self.get_temp_path())
    except Exception:
        # There was a problem during surveying so this didn't get created.
        # It's not a big deal.
        pass

    return experiment_object, created_samples
def setUpClass(cls):
    """Set up class.

    Creates the fixture shared by the tests of this class: two experiments
    (one RNA-SEQ, one MICROARRAY), an organism, two samples, the second of
    which is processed and linked to the MICROARRAY experiment, an original
    file with downloader and processor jobs, and two computational results
    attached to that sample. The first result is set as the organism's
    `qn_target`. Finally the cache is cleared and the search index rebuilt.
    """
    super(ESTestCases, cls).setUpClass()  # ref https://stackoverflow.com/a/29655301/763705

    experiment = Experiment()
    experiment.accession_code = "GSE000-X"
    experiment.title = "NONONONO"
    experiment.description = "Boooooourns. Wasabi."
    experiment.technology = "RNA-SEQ"
    experiment.save()

    experiment = Experiment()
    experiment.accession_code = "GSE123-X"
    experiment.title = "Hey Ho Let's Go"
    experiment.description = (
        "This is a very exciting test experiment. Faygo soda. Blah blah blah."
    )
    experiment.technology = "MICROARRAY"
    experiment.num_processed_samples = 1  # added below
    experiment.num_total_samples = 1
    experiment.num_downloadable_samples = 1
    experiment.save()

    experiment_annotation = ExperimentAnnotation()
    experiment_annotation.data = {"hello": "world", "123": 456}
    experiment_annotation.experiment = experiment
    experiment_annotation.save()

    # A bare sample with no organism and no experiment association.
    sample = Sample()
    sample.title = "123"
    sample.accession_code = "123"
    sample.save()

    organism = Organism(
        name="AILUROPODA_MELANOLEUCA", taxonomy_id=9646, is_scientific_name=True
    )
    organism.save()

    sample = Sample()
    sample.title = "789"
    sample.accession_code = "789"
    sample.is_processed = True
    sample.organism = organism
    sample.save()

    sample_annotation = SampleAnnotation()
    sample_annotation.data = {"goodbye": "world", "789": 123}
    sample_annotation.sample = sample
    sample_annotation.save()

    original_file = OriginalFile()
    original_file.save()

    original_file_sample_association = OriginalFileSampleAssociation()
    original_file_sample_association.sample = sample
    original_file_sample_association.original_file = original_file
    original_file_sample_association.save()

    downloader_job = DownloaderJob()
    downloader_job.save()

    download_assoc = DownloaderJobOriginalFileAssociation()
    download_assoc.original_file = original_file
    download_assoc.downloader_job = downloader_job
    download_assoc.save()

    processor_job = ProcessorJob()
    processor_job.save()

    processor_assoc = ProcessorJobOriginalFileAssociation()
    processor_assoc.original_file = original_file
    processor_assoc.processor_job = processor_job
    processor_assoc.save()

    # associate the experiment with the sample
    experiment_sample_association = ExperimentSampleAssociation()
    experiment_sample_association.sample = sample
    experiment_sample_association.experiment = experiment
    experiment_sample_association.save()

    result = ComputationalResult()
    result.save()

    # and create a QN target annotation for the sample's organism
    computational_result = ComputationalResultAnnotation()
    computational_result.result = result
    computational_result.data = {"is_qn": True, "organism_id": sample.organism.id}
    computational_result.save()

    # and associate it with the sample organism
    sample.organism.qn_target = result
    sample.organism.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    # A second, plain result attached to the same sample.
    result = ComputationalResult()
    result.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    # clear default cache and reindex
    # otherwise the organisms with qn_targets will be cached.
    cache.clear()
    call_command("search_index", "--rebuild", "-f")