def test_supported_microarray_platforms(self):
    """Test that supported microarray platforms setting is set correctly."""
    supported_microarray_platforms = utils.get_supported_microarray_platforms()

    has_equgene11st = False
    has_A_AFFY_59 = False
    has_GPL23026 = False
    has_AGEOD23026 = False
    for platform in supported_microarray_platforms:
        if platform["platform_accession"] == "equgene11st" and platform["is_brainarray"]:
            has_equgene11st = True

        if platform["external_accession"] == "A-AFFY-59" and not platform["is_brainarray"]:
            has_A_AFFY_59 = True

        if platform["external_accession"] == "GPL23026" and not platform["is_brainarray"]:
            has_GPL23026 = True

        if platform["external_accession"] == "A-GEOD-23026" and not platform["is_brainarray"]:
            has_AGEOD23026 = True

    self.assertTrue(has_equgene11st)
    self.assertTrue(has_A_AFFY_59)
    self.assertTrue(has_GPL23026)
    self.assertTrue(has_AGEOD23026)

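# A minimal sketch (not part of the original test suite) of a helper that the
# boolean flags above could share. The helper name and keyword arguments are
# hypothetical; the dict keys ("platform_accession", "external_accession",
# "is_brainarray") are the ones the test already relies on.
def has_platform(platforms, is_brainarray, platform_accession=None, external_accession=None):
    for platform in platforms:
        if platform["is_brainarray"] != is_brainarray:
            continue
        if platform_accession and platform["platform_accession"] == platform_accession:
            return True
        if external_accession and platform["external_accession"] == external_accession:
            return True
    return False

# e.g. self.assertTrue(has_platform(supported_microarray_platforms,
#                                   is_brainarray=False,
#                                   external_accession="A-AFFY-59"))
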
def _is_platform_supported(platform: str) -> bool:
    """Determines whether `platform` is a platform accession we support.

    It does so by trying to correct for common string issues such as case
    and spacing, and then comparing against our configuration files, which
    specify which platforms are supported.
    """
    upper_platform = platform.upper()

    # Check if this is a supported Microarray platform.
    for supported_platform in utils.get_supported_microarray_platforms():
        if (supported_platform["platform_accession"].upper() == upper_platform
                or supported_platform["external_accession"].upper() == upper_platform):
            return True

    # Check if this is a supported RNASeq platform.
    # GEO RNASeq platform titles often have organisms appended to
    # an otherwise recognizable platform. The list of supported
    # RNASeq platforms isn't long, so see if any of them are
    # contained within what GEO gave us.
    # Example: GSE69572 has a platform title of:
    # 'Illumina Genome Analyzer IIx (Glycine max)'
    # Which should match 'Illumina Genome Analyzer IIx'
    # because RNASeq platforms are organism agnostic.
    for supported_platform in utils.get_supported_rnaseq_platforms():
        # Spacing can be inconsistent, easiest to just remove it entirely.
        if supported_platform.upper().replace(" ", "") in upper_platform.replace(" ", ""):
            return True

    return False

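# A self-contained illustration (hypothetical, hardcoded platform lists) of the
# two normalization strategies _is_platform_supported() uses: exact
# case-insensitive comparison for microarray accessions, and case- and
# space-insensitive substring matching for RNASeq platform titles.
MICROARRAY_ACCESSIONS = ["A-AFFY-59", "GPL23026"]        # assumed examples
RNASEQ_PLATFORMS = ["Illumina Genome Analyzer IIx"]      # assumed example

def is_supported(platform: str) -> bool:
    upper_platform = platform.upper()
    if any(accession.upper() == upper_platform for accession in MICROARRAY_ACCESSIONS):
        return True
    squashed = upper_platform.replace(" ", "")
    return any(p.upper().replace(" ", "") in squashed for p in RNASEQ_PLATFORMS)

assert is_supported("a-affy-59")
assert is_supported("Illumina GenomeAnalyzer IIx (Glycine max)")
assert not is_supported("Some Unsupported Platform")
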
def get_queryset(self):
    """Exclude from our ES instance any experiments that only contain
    unsupported platforms."""
    supported_microarray_platforms = [
        x['platform_accession'] for x in get_supported_microarray_platforms()
    ]
    supported_rnaseq_platforms = [
        x.replace(' ', '') for x in get_supported_rnaseq_platforms()
    ]
    all_supported_platforms = supported_microarray_platforms + supported_rnaseq_platforms

    # https://www.postgresql.org/docs/9.1/functions-array.html
    return super(ExperimentDocument, self).get_queryset().filter(
        platform_accession_codes__contained_by=all_supported_platforms
    )

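# Note on the `contained_by` lookup above: for a Django ArrayField it maps to
# Postgres's `<@` (is-contained-by) operator, so an experiment matches only if
# every code in its platform_accession_codes array appears in
# all_supported_platforms; experiments with even one unsupported platform are
# excluded. A rough SQL equivalent (table and column names assumed):
#
#   SELECT * FROM experiment
#   WHERE platform_accession_codes <@ ARRAY[/* all supported platform codes */];
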
def handle(self, *args, **options):
    """Requeues downloader jobs for samples that haven't been processed and
    whose original files have no downloader jobs associated with them.
    """
    supported_microarray_platforms = [
        x["platform_accession"] for x in get_supported_microarray_platforms()
    ]
    supported_rnaseq_platforms = [x.replace(" ", "") for x in get_supported_rnaseq_platforms()]
    all_supported_platforms = (
        supported_microarray_platforms + supported_rnaseq_platforms
    )

    # https://www.postgresql.org/docs/9.1/functions-array.html
    # Ensure selected samples have valid platforms.
    samples_without_downloader = (
        Sample.objects.all()
        .filter(platform_accession_code__in=all_supported_platforms)
        .annotate(
            original_files_count=Count("original_files"),
            downloader_job_count=Count("original_files__downloader_jobs"),
        )
        .filter(is_processed=False, original_files_count__gt=0, downloader_job_count=0)
    )

    if options.get("created_after", None):
        samples_without_downloader = samples_without_downloader.filter(
            created_at__gt=options["created_after"]
        )

    samples_without_downloader = samples_without_downloader.prefetch_related("original_files")

    logger.info(
        "Found %d samples without downloader jobs, starting to create them now.",
        samples_without_downloader.count(),
    )

    paginator = Paginator(samples_without_downloader, PAGE_SIZE)
    page = paginator.page(1)

    while True:
        for sample in page.object_list:
            logger.debug("Creating downloader job for a sample.", sample=sample.accession_code)
            create_downloader_job(sample.original_files.all())

        logger.info(
            "Created %d new downloader jobs because their samples didn't have any.", PAGE_SIZE
        )

        if not page.has_next():
            break

        page = paginator.page(page.next_page_number())

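# A minimal sketch (not taken from the original command) of the add_arguments()
# hook that would supply the "created_after" option read by handle() above.
# The flag name and the ISO-date parsing are assumptions made for illustration.
from datetime import datetime

def add_arguments(self, parser):
    parser.add_argument(
        "--created-after",
        dest="created_after",
        type=datetime.fromisoformat,  # e.g. "2020-01-31"
        default=None,
        help="Only requeue samples created after this date.",
    )
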
def set_platform_properties(self, sample_object: Sample, sample_metadata: Dict,
                            gse: GEOparse.GSM) -> Sample:
    """Sets platform-related properties on `sample_object`.

    Uses metadata from `gse` to populate platform_name,
    platform_accession_code, and technology on `sample_object`.
    """
    # Determine platform information
    external_accession = get_normalized_platform(
        gse.metadata.get("platform_id", [UNKNOWN])[0])

    if external_accession == UNKNOWN:
        sample_object.platform_accession_code = UNKNOWN
        sample_object.platform_name = UNKNOWN
        sample_object.manufacturer = UNKNOWN
        # If this sample is Affy, we potentially can extract the
        # platform information from the .CEL file. If it's not, we
        # can't do anything. Therefore assume the technology is
        # microarray when we have no platform information.
        sample_object.technology = "MICROARRAY"
        return sample_object

    platform_accession_code = UNKNOWN

    gpl = GEOparse.get_GEO(external_accession,
                           destdir=self.get_temp_path(),
                           how="brief",
                           silent=True)
    platform_title = gpl.metadata.get("title", [UNKNOWN])[0]

    # Check if this is a supported microarray platform.
    for platform in get_supported_microarray_platforms():
        if platform["external_accession"] == external_accession:
            platform_accession_code = platform["platform_accession"]

    if platform_accession_code != UNKNOWN:
        # It's a supported microarray platform.
        # We are using the brain array package as the platform accession code,
        # so, for instance, GPL3213 becomes 'chicken'.
        sample_object.platform_accession_code = platform_accession_code
        sample_object.technology = "MICROARRAY"
        try:
            # Related: https://github.com/AlexsLemonade/refinebio/issues/354
            # If it's Affy we can get a readable name:
            sample_object.platform_name = get_readable_affymetrix_names()[
                platform_accession_code]
            sample_object.manufacturer = "AFFYMETRIX"

            # Sometimes Affymetrix samples have weird channel
            # protocol metadata, so if we find that it's
            # Affymetrix, return it now. Example: GSE113945
            return sample_object
        except KeyError:
            # Otherwise we'll use what we've got.
            sample_object.platform_name = platform_title

        # Determine manufacturer
        platform = sample_object.pretty_platform.upper()
        if "AGILENT" in platform:
            sample_object.manufacturer = "AGILENT"
        elif "ILLUMINA" in platform or "NEXTSEQ" in platform:
            sample_object.manufacturer = "ILLUMINA"
        elif "AFFYMETRIX" in platform:
            sample_object.manufacturer = "AFFYMETRIX"
        else:
            sample_object.manufacturer = UNKNOWN

        return sample_object

    # Check to see if this is a supported RNASeq technology:
    # GEO RNASeq platform titles often have organisms appended to
    # an otherwise recognizable platform. The list of supported
    # RNASeq platforms isn't long, so see if any of them are
    # contained within what GEO gave us.
    # Example: GSE69572 has a platform title of:
    # 'Illumina Genome Analyzer IIx (Glycine max)'
    # Which should really just be 'Illumina Genome Analyzer IIx'
    # because RNASeq platforms are organism agnostic.
    # However, the platforms 'Illumina Genome Analyzer' and 'Illumina
    # Genome Analyzer II' would also be matched, so make sure that
    # the longest platform names are tested first:
    sorted_platform_list = get_supported_rnaseq_platforms().copy()
    sorted_platform_list.sort(key=len, reverse=True)

    for platform in sorted_platform_list:
        if platform.upper() in platform_title.upper():
            sample_object.technology = "RNA-SEQ"
            sample_object.platform_name = platform
            # We just use RNASeq platform titles as accessions
            sample_object.platform_accession_code = platform

            if "ILLUMINA" in sample_object.platform_name.upper():
                sample_object.manufacturer = "ILLUMINA"
            elif "NEXTSEQ" in sample_object.platform_name.upper():
                sample_object.manufacturer = "NEXTSEQ"
            elif "ION TORRENT" in sample_object.platform_name.upper():
                sample_object.manufacturer = "ION_TORRENT"
            else:
                sample_object.manufacturer = UNKNOWN

            return sample_object

    # If we've made it this far, we don't know what this platform
    # is, and therefore we can't know what its technology is. What we
    # do know is what GEO said its platform accession and title are,
    # and that it's unsupported.
    sample_object.platform_name = platform_title
    sample_object.platform_accession_code = external_accession
    sample_object.technology = UNKNOWN
    sample_object.manufacturer = UNKNOWN

    return sample_object

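# A standalone sketch (hypothetical helper, not part of the surveyor) showing
# why the RNASeq platform list is sorted longest-first above: without that
# ordering, 'Illumina Genome Analyzer' would match before the more specific
# 'Illumina Genome Analyzer IIx'.
def match_rnaseq_platform(platform_title, supported_platforms):
    for candidate in sorted(supported_platforms, key=len, reverse=True):
        if candidate.upper() in platform_title.upper():
            return candidate
    return None

assert match_rnaseq_platform(
    "Illumina Genome Analyzer IIx (Glycine max)",
    ["Illumina Genome Analyzer", "Illumina Genome Analyzer IIx"],
) == "Illumina Genome Analyzer IIx"
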
def create_experiment_from_api(
        self, experiment_accession_code: str) -> (Experiment, Dict):
    """Given an experiment accession code, create an Experiment object.

    Also returns a dictionary of additional information about the
    platform discovered for the experiment.

    Will raise an UnsupportedPlatformException if this experiment was
    conducted using a platform which we don't support.

    See an example at: https://www.ebi.ac.uk/arrayexpress/json/v3/experiments/E-MTAB-3050/sample
    """
    request_url = EXPERIMENTS_URL + experiment_accession_code
    experiment_request = utils.requests_retry_session().get(request_url, timeout=60)

    try:
        parsed_json = experiment_request.json()["experiments"]["experiment"][0]
    except KeyError:
        logger.error("Remote experiment has no Experiment data!",
                     experiment_accession_code=experiment_accession_code,
                     survey_job=self.survey_job.id)
        raise

    experiment = {}
    experiment["name"] = parsed_json["name"]
    experiment["experiment_accession_code"] = experiment_accession_code

    # This experiment has no platform at all, and is therefore useless.
    if 'arraydesign' not in parsed_json or len(parsed_json["arraydesign"]) == 0:
        logger.warn("Remote experiment has no arraydesign listed.",
                    experiment_accession_code=experiment_accession_code,
                    survey_job=self.survey_job.id)
        raise UnsupportedPlatformException
    # If there is more than one arraydesign listed in the experiment
    # then there is no other way to determine which array was used
    # for which sample other than looking at the header of the CEL
    # file. That obviously cannot happen until the CEL file has been
    # downloaded, so we can just mark it as UNKNOWN and let the
    # downloader inspect the downloaded file to determine the
    # array then.
    elif len(parsed_json["arraydesign"]) != 1 or "accession" not in parsed_json["arraydesign"][0]:
        experiment["platform_accession_code"] = UNKNOWN
        experiment["platform_accession_name"] = UNKNOWN
        experiment["manufacturer"] = UNKNOWN
    else:
        external_accession = parsed_json["arraydesign"][0]["accession"]

        for platform in get_supported_microarray_platforms():
            if platform["external_accession"] == external_accession:
                experiment["platform_accession_code"] = get_normalized_platform(
                    platform["platform_accession"])

                # Illumina appears in the accession codes for
                # platforms manufactured by Illumina
                if "ILLUMINA" in experiment["platform_accession_code"].upper():
                    experiment["manufacturer"] = "ILLUMINA"
                    experiment["platform_accession_name"] = platform["platform_accession"]
                else:
                    # It's not Illumina; the only other supported Microarray platform is
                    # Affy. As our list of supported platforms grows, this logic will
                    # need to get more sophisticated.
                    experiment["manufacturer"] = "AFFYMETRIX"
                    platform_mapping = get_readable_affymetrix_names()
                    experiment["platform_accession_name"] = platform_mapping[
                        platform["platform_accession"]]

        if "platform_accession_code" not in experiment:
            # We don't know what platform this accession corresponds to.
experiment["platform_accession_code"] = external_accession experiment["platform_accession_name"] = UNKNOWN experiment["manufacturer"] = UNKNOWN experiment["release_date"] = parsed_json["releasedate"] if "lastupdatedate" in parsed_json: experiment["last_update_date"] = parsed_json["lastupdatedate"] else: experiment["last_update_date"] = parsed_json["releasedate"] # Create the experiment object try: experiment_object = Experiment.objects.get( accession_code=experiment_accession_code) logger.debug( "Experiment already exists, skipping object creation.", experiment_accession_code=experiment_accession_code, survey_job=self.survey_job.id) except Experiment.DoesNotExist: # We aren't sure these fields will be populated, or how many there will be. # Try to join them all together, or set a sensible default. experiment_descripton = "" if "description" in parsed_json and len( parsed_json["description"]) > 0: for description_item in parsed_json["description"]: if "text" in description_item: experiment_descripton = experiment_descripton + description_item[ "text"] + "\n" if experiment_descripton == "": experiment_descripton = "Description not available.\n" experiment_object = Experiment() experiment_object.accession_code = experiment_accession_code experiment_object.source_url = request_url experiment_object.source_database = "ARRAY_EXPRESS" experiment_object.title = parsed_json["name"] # This will need to be updated if we ever use Array # Express to get other kinds of data. experiment_object.technology = "MICROARRAY" experiment_object.description = experiment_descripton experiment_object.source_first_published = parse_datetime( experiment["release_date"]) experiment_object.source_last_modified = parse_datetime( experiment["last_update_date"]) experiment_object.save() json_xa = ExperimentAnnotation() json_xa.experiment = experiment_object json_xa.data = parsed_json json_xa.is_ccdl = False json_xa.save() ## Fetch and parse the IDF/SDRF file for any other fields IDF_URL_TEMPLATE = "https://www.ebi.ac.uk/arrayexpress/files/{code}/{code}.idf.txt" idf_url = IDF_URL_TEMPLATE.format(code=experiment_accession_code) idf_text = utils.requests_retry_session().get(idf_url, timeout=60).text lines = idf_text.split('\n') idf_dict = {} for line in lines: keyval = line.strip().split('\t') if len(keyval) == 2: idf_dict[keyval[0]] = keyval[1] elif len(keyval) > 2: idf_dict[keyval[0]] = keyval[1:] idf_xa = ExperimentAnnotation() idf_xa.data = idf_dict idf_xa.experiment = experiment_object idf_xa.is_ccdl = False idf_xa.save() if 'Investigation Title' in idf_dict: experiment_object.title = idf_dict['Investigation Title'] if 'Person Affiliation' in idf_dict: # This is very rare, ex: E-MEXP-32 if isinstance(idf_dict['Person Affiliation'], list): unique_people = list(set(idf_dict['Person Affiliation'])) experiment_object.submitter_institution = ", ".join( unique_people)[:255] else: experiment_object.submitter_institution = idf_dict[ 'Person Affiliation'] # Get protocol_description from "<experiment_url>/protocols" # instead of from idf_dict, because the former provides more # details. protocol_url = request_url + '/protocols' protocol_request = utils.requests_retry_session().get(protocol_url, timeout=60) try: experiment_object.protocol_description = protocol_request.json( )['protocols'] except KeyError: logger.warning( "Remote experiment has no protocol data!", experiment_accession_code=experiment_accession_code, survey_job=self.survey_job.id) if 'Publication Title' in idf_dict: # This will happen for some superseries. 
            # Ex: E-GEOD-29536
            # Assume most recent is "best", store the rest in experiment annotation.
            if isinstance(idf_dict['Publication Title'], list):
                experiment_object.publication_title = "; ".join(idf_dict['Publication Title'])
            else:
                experiment_object.publication_title = idf_dict['Publication Title']
            experiment_object.has_publication = True
        if 'Publication DOI' in idf_dict:
            if isinstance(idf_dict['Publication DOI'], list):
                experiment_object.publication_doi = ", ".join(idf_dict['Publication DOI'])
            else:
                experiment_object.publication_doi = idf_dict['Publication DOI']
            experiment_object.has_publication = True
        if 'PubMed ID' in idf_dict:
            if isinstance(idf_dict['PubMed ID'], list):
                experiment_object.pubmed_id = ", ".join(idf_dict['PubMed ID'])
            else:
                experiment_object.pubmed_id = idf_dict['PubMed ID']
            experiment_object.has_publication = True

        # Scrape publication title and authorship from Pubmed
        if experiment_object.pubmed_id:
            pubmed_metadata = utils.get_title_and_authors_for_pubmed_id(
                experiment_object.pubmed_id)
            experiment_object.publication_title = pubmed_metadata[0]
            experiment_object.publication_authors = pubmed_metadata[1]

        experiment_object.save()

    platform_dict = {}
    for k in ('platform_accession_code', 'platform_accession_name', 'manufacturer'):
        platform_dict[k] = experiment[k]

    return experiment_object, platform_dict

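# A self-contained sketch (with made-up IDF lines) of the tab-delimited IDF
# parsing used above: rows with a single value become strings, rows with
# several values become lists, which is why the idf_dict consumers check
# isinstance(..., list).
idf_text = (
    "Investigation Title\tA hypothetical experiment\n"
    "Person Affiliation\tLab A\tLab B\n"
)

idf_dict = {}
for line in idf_text.split('\n'):
    keyval = line.strip().split('\t')
    if len(keyval) == 2:
        idf_dict[keyval[0]] = keyval[1]
    elif len(keyval) > 2:
        idf_dict[keyval[0]] = keyval[1:]

assert idf_dict["Investigation Title"] == "A hypothetical experiment"
assert idf_dict["Person Affiliation"] == ["Lab A", "Lab B"]
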
def download_array_express(job_id: int) -> None:
    """The main function for the Array Express Downloader.

    Downloads a single zip file containing the .PCL files representing
    samples relating to a single experiment stored in ArrayExpress.
    """
    job = utils.start_job(job_id)
    success = True

    file_assocs = DownloaderJobOriginalFileAssociation.objects.filter(downloader_job=job)
    # AE will have multiple files per DownloaderJob, but they are all
    # pieces of the same zip file so they're all referencing the same
    # URL.
    original_file = file_assocs[0].original_file
    url = original_file.source_url
    accession_code = job.accession_code

    # First, get all the unique sample archive URLs.
    # There may be more than one!
    # Then, unpack all the ones downloaded.
    # Then create processor jobs!
    og_files = []

    # The files for all of the samples are contained within the same
    # zip file. Therefore only download the one.
    os.makedirs(LOCAL_ROOT_DIR + '/' + accession_code, exist_ok=True)

    # Add a timestamp in milliseconds to the filename to prevent
    # multiple jobs from using the same file.
    filename = url.split('/')[-1] + "." + str(int(time.time() * 1000))
    dl_file_path = LOCAL_ROOT_DIR + '/' + accession_code + '/' + filename + ".zip"
    _download_file(url, dl_file_path, job)

    extracted_files = _extract_files(dl_file_path, accession_code, job)

    for og_file in extracted_files:
        try:
            original_file = OriginalFile.objects.get(
                source_filename=og_file['filename'],
                source_url=original_file.source_url)
            original_file.is_downloaded = True
            original_file.is_archive = False
            original_file.absolute_file_path = og_file['absolute_path']
            original_file.filename = og_file['absolute_path'].split('/')[-1]
            original_file.calculate_size()
            original_file.save()
            original_file.calculate_sha1()
            og_files.append(original_file)
        except Exception:
            # The suspicion is that there are extra files related to
            # another experiment that we don't want associated with
            # this one.
            logger.debug("Found a file we didn't have an OriginalFile for! Why did this happen?: "
                         + og_file['filename'],
                         downloader_job=job_id)
            os.remove(og_file["absolute_path"])
            continue

        sample_objects = Sample.objects.filter(originalfile=original_file).order_by('created_at')
        if sample_objects.count() > 1:
            logger.warn("Found an Array Express OriginalFile with more than one sample: %s",
                        filename,
                        downloader_job=job_id)

        # If the file is a .CEL file, it is the ultimate
        # source of truth about the sample's platform.
        sample_object = sample_objects[0]
        if og_file["filename"].upper()[-4:] == ".CEL" and sample_object.has_raw:
            cel_file_platform = None
            platform_accession_code = "UNSUPPORTED"

            try:
                cel_file_platform = microarray.get_platform_from_CEL(
                    original_file.absolute_file_path)

                for platform in get_supported_microarray_platforms():
                    if platform["platform_accession"] == cel_file_platform:
                        platform_accession_code = platform["platform_accession"]
            except Exception as e:
                platform_accession_code = "UNDETERMINABLE"
                logger.warn("Unable to determine platform from CEL file: "
                            + original_file.absolute_file_path,
                            downloader_job=job_id)

            if platform_accession_code == "UNSUPPORTED":
                logger.error("Found a raw .CEL file with an unsupported platform!",
                             file_name=original_file.absolute_file_path,
                             sample=sample_object.id,
                             downloader_job=job_id,
                             cel_file_platform=cel_file_platform)
                job.failure_reason = ("Found a raw .CEL file with an unsupported platform: "
                                      + original_file.absolute_file_path + " ("
                                      + str(cel_file_platform) + ")")
                job.no_retry = True
                success = False

                # The file is unsupported, delete it!
                original_file.delete_local_file()
                original_file.delete()
            elif platform_accession_code == "UNDETERMINABLE":
                # If we cannot determine the platform from the
                # .CEL file, the platform discovered via metadata
                # may be correct, so just leave it be.
                pass
            else:
                # We determined the file was collected with a supported Affymetrix platform.
                sample_object.platform_accession_code = platform_accession_code
                sample_object.platform_name = get_readable_affymetrix_names()[
                    platform_accession_code]

                # However, if the filename contains '.CEL' we know
                # it's an Affymetrix Microarray
                sample_object.technology = "MICROARRAY"
                sample_object.manufacturer = "AFFYMETRIX"
                sample_object.save()

    if success:
        logger.debug("File downloaded and extracted successfully.",
                     url=url,
                     downloader_job=job_id)
        utils.create_processor_jobs_for_original_files(og_files, job)

    utils.end_downloader_job(job, success)

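# A simplified sketch (hypothetical helper, not part of the downloader) of the
# three outcomes of the raw .CEL platform check above: a supported platform
# accession, "UNSUPPORTED" when the CEL header names a platform we don't
# support, and "UNDETERMINABLE" when the header cannot be read at all.
def classify_cel_platform(read_platform, supported_accessions):
    try:
        cel_file_platform = read_platform()  # e.g. microarray.get_platform_from_CEL(path)
    except Exception:
        return "UNDETERMINABLE"
    if cel_file_platform in supported_accessions:
        return cel_file_platform
    return "UNSUPPORTED"

assert classify_cel_platform(lambda: "hgu133plus2", {"hgu133plus2"}) == "hgu133plus2"
assert classify_cel_platform(lambda: "some-unknown-chip", {"hgu133plus2"}) == "UNSUPPORTED"
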