def _run_scan_upc(job_context: Dict) -> Dict:
    """Processes an input CEL file to an output PCL file.

    Does so using the SCAN.UPC package's SCANfast method using R.
    Expects job_context to contain the keys 'input_file_path',
    'output_file_path', and 'brainarray_package'.
    """
    input_file = job_context["input_file_path"]

    try:
        # It's necessary to load the foreach library before calling SCANfast
        # because it doesn't load the library before calling functions
        # from it.
        ro.r("suppressMessages(library('foreach'))")

        # Prevents:
        # RRuntimeWarning: There were 50 or more warnings (use warnings()
        # to see the first 50)
        ro.r("options(warn=1)")

        # All R messages are turned into Python 'warnings' by rpy2. By
        # filtering all of them we silence a lot of useless output.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            scan_upc = ro.r['::']('SCAN.UPC', 'SCANfast')
            job_context['time_start'] = timezone.now()

            # Related: https://github.com/AlexsLemonade/refinebio/issues/64
            if job_context["brainarray_package"]:
                # If we've detected the platform using affy, then this
                # is the best source of truth we'll be able to get, so
                # update the sample to match it.
                platform_accession_code = job_context["platform_accession_code"]
                platform_name = get_readable_affymetrix_names()[
                    platform_accession_code]

                for sample in job_context["samples"]:
                    sample.platform_accession_code = platform_accession_code
                    sample.platform_name = platform_name
                    sample.save()

                scan_upc(input_file,
                         job_context["output_file_path"],
                         probeSummaryPackage=job_context["brainarray_package"])
            else:
                scan_upc(input_file, job_context["output_file_path"])

            job_context['time_end'] = timezone.now()

    except RRuntimeError as e:
        error_template = ("Encountered error in R code while running AFFY_TO_PCL"
                          " pipeline during processing of {0}: {1}")
        error_message = error_template.format(input_file, str(e))
        logger.error(error_message, processor_job=job_context["job_id"])
        job_context["job"].failure_reason = error_message
        job_context["success"] = False
        job_context["job"].no_retry = True

    return job_context
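# The ro.r['::'] construct above is worth a standalone illustration: indexing
# ro.r with '::' fetches R's namespace-access operator, which lets us grab a
# single function from a package without attaching the whole package. A
# minimal sketch, using base R's stats::rnorm instead of SCAN.UPC::SCANfast
# so it runs without Bioconductor installed:
import rpy2.robjects as ro

# Equivalent to evaluating `stats::rnorm` in R.
rnorm = ro.r['::']('stats', 'rnorm')

# The returned object is callable from Python; keyword arguments are passed
# through as named R arguments.
values = rnorm(5, mean=0, sd=1)
print(list(values))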
def test_readable_affymetrix_names(self):
    """Tests that the mapping from Affymetrix accession codes to human
    readable names is correct."""
    readable_platform_names = utils.get_readable_affymetrix_names()

    expected_readable_name = "[ChiGene-1_0-st] Affymetrix Chicken Gene 1.0 ST Array"
    self.assertEqual(readable_platform_names["chigene10st"], expected_readable_name)

    expected_readable_name = "[Xenopus_laevis] Affymetrix Xenopus laevis Genome Array"
    self.assertEqual(readable_platform_names["xenopuslaevis"], expected_readable_name)
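# A possible companion check (hypothetical, not part of the original suite):
# since code elsewhere assigns these values directly to Sample.platform_name,
# it may be worth asserting that every entry in the mapping is a non-empty
# string.
def test_readable_affymetrix_names_are_nonempty(self):
    readable_platform_names = utils.get_readable_affymetrix_names()
    for accession, name in readable_platform_names.items():
        self.assertIsInstance(name, str)
        self.assertTrue(len(name) > 0)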
def handle(self, *args, **options):
    """Main function for this command.

    Basically does what is described at the top of this file.
    """
    # Create working dir
    LOCAL_ROOT_DIR = get_env_variable("LOCAL_ROOT_DIR", "/home/user/data_store")
    work_dir = LOCAL_ROOT_DIR + "/affy_correction/"
    os.makedirs(work_dir, exist_ok=True)

    for sample in Sample.objects.filter(technology="RNA-SEQ", source_database="GEO"):
        for original_file in sample.original_files.all():
            if original_file.is_affy_data():
                input_file_path = work_dir + original_file.source_filename
                download_success = _download_file(original_file.source_url,
                                                  input_file_path)

                if download_success:
                    try:
                        brainarray_package = _determine_brainarray_package(
                            input_file_path)

                        if brainarray_package:
                            logger.info("Determined the package for sample %d is: %s",
                                        sample.id,
                                        brainarray_package)
                            # If we've detected the platform using affy, then this
                            # is the best source of truth we'll be able to get, so
                            # update the sample to match it.
                            platform_name = get_readable_affymetrix_names()[
                                brainarray_package]

                            sample.platform_accession_code = brainarray_package
                            sample.platform_name = platform_name
                    except Exception:
                        logger.exception(
                            "Failed to detect platform from downloaded file %s.",
                            input_file_path)

                # Regardless of whether we could detect the
                # platform successfully or not, we definitely know
                # it's an Affymetrix Microarray because that's the
                # only one that makes .CEL files.
                sample.technology = "MICROARRAY"
                sample.manufacturer = "AFFYMETRIX"
                sample.save()

                # If there's other original files associated with
                # this sample, we don't need them because we
                # already corrected the platform.
                break

    # Cleanup after ourselves:
    shutil.rmtree(work_dir)
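# _determine_brainarray_package is referenced above but not shown. A minimal
# sketch of one way such a helper could work, assuming rpy2 and the
# Bioconductor 'affyio' package are available; this is illustrative, not the
# project's actual implementation:
import string

import rpy2.robjects as ro


def _determine_brainarray_package_sketch(input_file_path: str) -> str:
    # affyio::read.celfile.header returns an R list whose first element
    # holds the chip type, e.g. 'HG-U133_Plus_2'.
    read_header = ro.r['::']('affyio', 'read.celfile.header')
    header = read_header(input_file_path)

    # Normalize toward a Brainarray-style package name by lowercasing and
    # stripping punctuation, e.g. 'HG-U133_Plus_2' -> 'hgu133plus2'.
    punctuation_table = str.maketrans(dict.fromkeys(string.punctuation))
    return str(header[0][0]).translate(punctuation_table).lower()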
def set_platform_properties(self,
                            sample_object: Sample,
                            sample_metadata: Dict,
                            gse: GEOparse.GSM) -> Sample:
    """Sets platform-related properties on `sample_object`.

    Uses metadata from `gse` to populate platform_name,
    platform_accession_code, and technology on `sample_object`.
    """
    # Determine platform information
    external_accession = get_normalized_platform(
        gse.metadata.get("platform_id", [UNKNOWN])[0])

    if external_accession == UNKNOWN:
        sample_object.platform_accession_code = UNKNOWN
        sample_object.platform_name = UNKNOWN
        sample_object.manufacturer = UNKNOWN
        # If this sample is Affy, we potentially can extract the
        # platform information from the .CEL file. If it's not we
        # can't do anything. Therefore assume the technology is
        # microarray when we have no platform information.
        sample_object.technology = "MICROARRAY"
        return sample_object

    platform_accession_code = UNKNOWN

    gpl = GEOparse.get_GEO(external_accession,
                           destdir=self.get_temp_path(),
                           how="brief",
                           silent=True)
    platform_title = gpl.metadata.get("title", [UNKNOWN])[0]

    # Check if this is a supported microarray platform.
    for platform in get_supported_microarray_platforms():
        if platform["external_accession"] == external_accession:
            platform_accession_code = platform["platform_accession"]

    if platform_accession_code != UNKNOWN:
        # It's a supported microarray platform.

        # We are using the brain array package as the platform accession code,
        # so, for instance, GPL3213 becomes 'chicken'.
        sample_object.platform_accession_code = platform_accession_code
        sample_object.technology = "MICROARRAY"
        try:
            # Related: https://github.com/AlexsLemonade/refinebio/issues/354
            # If it's Affy we can get a readable name:
            sample_object.platform_name = get_readable_affymetrix_names()[
                platform_accession_code]
            sample_object.manufacturer = "AFFYMETRIX"

            # Sometimes Affymetrix samples have weird channel
            # protocol metadata, so if we find that it's
            # Affymetrix return it now. Example: GSE113945
            return sample_object
        except KeyError:
            # Otherwise we'll use what we've got.
            sample_object.platform_name = platform_title

        # Determine manufacturer
        platform = sample_object.pretty_platform.upper()
        if "AGILENT" in platform:
            sample_object.manufacturer = "AGILENT"
        elif "ILLUMINA" in platform or "NEXTSEQ" in platform:
            sample_object.manufacturer = "ILLUMINA"
        elif "AFFYMETRIX" in platform:
            sample_object.manufacturer = "AFFYMETRIX"
        else:
            sample_object.manufacturer = UNKNOWN

        return sample_object

    # Check to see if this is a supported RNASeq technology:

    # GEO RNASeq platform titles often have organisms appended to
    # an otherwise recognizable platform. The list of supported
    # RNASeq platforms isn't long, so see if any of them are
    # contained within what GEO gave us.
    # Example: GSE69572 has a platform title of:
    # 'Illumina Genome Analyzer IIx (Glycine max)'
    # Which should really just be 'Illumina Genome Analyzer IIx'
    # because RNASeq platforms are organism agnostic.
    # However, the platforms 'Illumina Genome Analyzer' and 'Illumina
    # Genome Analyzer II' would also be matched, so make sure that
    # the longest platform names are tested first:
    sorted_platform_list = get_supported_rnaseq_platforms().copy()
    sorted_platform_list.sort(key=len, reverse=True)

    for platform in sorted_platform_list:
        if platform.upper() in platform_title.upper():
            sample_object.technology = "RNA-SEQ"
            sample_object.platform_name = platform
            # We just use RNASeq platform titles as accessions
            sample_object.platform_accession_code = platform

            if "ILLUMINA" in sample_object.platform_name.upper():
                sample_object.manufacturer = "ILLUMINA"
            elif "NEXTSEQ" in sample_object.platform_name.upper():
                sample_object.manufacturer = "NEXTSEQ"
            elif "ION TORRENT" in sample_object.platform_name.upper():
                sample_object.manufacturer = "ION_TORRENT"
            else:
                sample_object.manufacturer = UNKNOWN

            return sample_object

    # If we've made it this far, we don't know what this platform
    # is, therefore we can't know what its technology is. What we
    # do know is what GEO said its platform's accession and title
    # are, and that it's unsupported.
    sample_object.platform_name = platform_title
    sample_object.platform_accession_code = external_accession
    sample_object.technology = UNKNOWN
    sample_object.manufacturer = UNKNOWN

    return sample_object
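# A short, self-contained demonstration of why the longest-first sort above
# matters for substring matching. The platform names here are illustrative,
# not the real supported list:
platforms = ['Illumina Genome Analyzer',
             'Illumina Genome Analyzer II',
             'Illumina Genome Analyzer IIx']
title = 'Illumina Genome Analyzer IIx (Glycine max)'

# Unsorted, the shortest name matches first and wins incorrectly.
first_match = next(p for p in platforms if p.upper() in title.upper())
assert first_match == 'Illumina Genome Analyzer'

# Sorted longest-first, the most specific name matches.
platforms.sort(key=len, reverse=True)
best_match = next(p for p in platforms if p.upper() in title.upper())
assert best_match == 'Illumina Genome Analyzer IIx'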
def create_experiment_from_api(
        self, experiment_accession_code: str) -> (Experiment, Dict):
    """Given an experiment accession code, create an Experiment object.

    Also returns a dictionary of additional information about the
    platform discovered for the experiment.

    Will raise an UnsupportedPlatformException if this experiment was
    conducted using a platform which we don't support.

    See an example at: https://www.ebi.ac.uk/arrayexpress/json/v3/experiments/E-MTAB-3050/sample
    """
    request_url = EXPERIMENTS_URL + experiment_accession_code
    experiment_request = utils.requests_retry_session().get(request_url,
                                                            timeout=60)

    try:
        parsed_json = experiment_request.json()["experiments"]["experiment"][0]
    except KeyError:
        logger.error("Remote experiment has no Experiment data!",
                     experiment_accession_code=experiment_accession_code,
                     survey_job=self.survey_job.id)
        raise

    experiment = {}
    experiment["name"] = parsed_json["name"]
    experiment["experiment_accession_code"] = experiment_accession_code

    # This experiment has no platform at all, and is therefore useless.
    if 'arraydesign' not in parsed_json or len(parsed_json["arraydesign"]) == 0:
        logger.warn("Remote experiment has no arraydesign listed.",
                    experiment_accession_code=experiment_accession_code,
                    survey_job=self.survey_job.id)
        raise UnsupportedPlatformException
    # If there is more than one arraydesign listed in the experiment
    # then there is no other way to determine which array was used
    # for which sample other than looking at the header of the CEL
    # file. That obviously cannot happen until the CEL file has been
    # downloaded so we can just mark it as UNKNOWN and let the
    # downloader inspect the downloaded file to determine the
    # array then.
    elif (len(parsed_json["arraydesign"]) != 1
          or "accession" not in parsed_json["arraydesign"][0]):
        experiment["platform_accession_code"] = UNKNOWN
        experiment["platform_accession_name"] = UNKNOWN
        experiment["manufacturer"] = UNKNOWN
    else:
        external_accession = parsed_json["arraydesign"][0]["accession"]
        for platform in get_supported_microarray_platforms():
            if platform["external_accession"] == external_accession:
                experiment["platform_accession_code"] = get_normalized_platform(
                    platform["platform_accession"])

                # Illumina appears in the accession codes for
                # platforms manufactured by Illumina
                if "ILLUMINA" in experiment["platform_accession_code"].upper():
                    experiment["manufacturer"] = "ILLUMINA"
                    experiment["platform_accession_name"] = platform[
                        "platform_accession"]
                else:
                    # It's not Illumina, the only other supported Microarray
                    # platform is Affy. As our list of supported platforms
                    # grows this logic will need to get more sophisticated.
                    experiment["manufacturer"] = "AFFYMETRIX"
                    platform_mapping = get_readable_affymetrix_names()
                    experiment["platform_accession_name"] = platform_mapping[
                        platform["platform_accession"]]

        if "platform_accession_code" not in experiment:
            # We don't know what platform this accession corresponds to.
experiment["platform_accession_code"] = external_accession experiment["platform_accession_name"] = UNKNOWN experiment["manufacturer"] = UNKNOWN experiment["release_date"] = parsed_json["releasedate"] if "lastupdatedate" in parsed_json: experiment["last_update_date"] = parsed_json["lastupdatedate"] else: experiment["last_update_date"] = parsed_json["releasedate"] # Create the experiment object try: experiment_object = Experiment.objects.get( accession_code=experiment_accession_code) logger.debug( "Experiment already exists, skipping object creation.", experiment_accession_code=experiment_accession_code, survey_job=self.survey_job.id) except Experiment.DoesNotExist: # We aren't sure these fields will be populated, or how many there will be. # Try to join them all together, or set a sensible default. experiment_descripton = "" if "description" in parsed_json and len( parsed_json["description"]) > 0: for description_item in parsed_json["description"]: if "text" in description_item: experiment_descripton = experiment_descripton + description_item[ "text"] + "\n" if experiment_descripton == "": experiment_descripton = "Description not available.\n" experiment_object = Experiment() experiment_object.accession_code = experiment_accession_code experiment_object.source_url = request_url experiment_object.source_database = "ARRAY_EXPRESS" experiment_object.title = parsed_json["name"] # This will need to be updated if we ever use Array # Express to get other kinds of data. experiment_object.technology = "MICROARRAY" experiment_object.description = experiment_descripton experiment_object.source_first_published = parse_datetime( experiment["release_date"]) experiment_object.source_last_modified = parse_datetime( experiment["last_update_date"]) experiment_object.save() json_xa = ExperimentAnnotation() json_xa.experiment = experiment_object json_xa.data = parsed_json json_xa.is_ccdl = False json_xa.save() ## Fetch and parse the IDF/SDRF file for any other fields IDF_URL_TEMPLATE = "https://www.ebi.ac.uk/arrayexpress/files/{code}/{code}.idf.txt" idf_url = IDF_URL_TEMPLATE.format(code=experiment_accession_code) idf_text = utils.requests_retry_session().get(idf_url, timeout=60).text lines = idf_text.split('\n') idf_dict = {} for line in lines: keyval = line.strip().split('\t') if len(keyval) == 2: idf_dict[keyval[0]] = keyval[1] elif len(keyval) > 2: idf_dict[keyval[0]] = keyval[1:] idf_xa = ExperimentAnnotation() idf_xa.data = idf_dict idf_xa.experiment = experiment_object idf_xa.is_ccdl = False idf_xa.save() if 'Investigation Title' in idf_dict: experiment_object.title = idf_dict['Investigation Title'] if 'Person Affiliation' in idf_dict: # This is very rare, ex: E-MEXP-32 if isinstance(idf_dict['Person Affiliation'], list): unique_people = list(set(idf_dict['Person Affiliation'])) experiment_object.submitter_institution = ", ".join( unique_people)[:255] else: experiment_object.submitter_institution = idf_dict[ 'Person Affiliation'] # Get protocol_description from "<experiment_url>/protocols" # instead of from idf_dict, because the former provides more # details. protocol_url = request_url + '/protocols' protocol_request = utils.requests_retry_session().get(protocol_url, timeout=60) try: experiment_object.protocol_description = protocol_request.json( )['protocols'] except KeyError: logger.warning( "Remote experiment has no protocol data!", experiment_accession_code=experiment_accession_code, survey_job=self.survey_job.id) if 'Publication Title' in idf_dict: # This will happen for some superseries. 
            # Ex: E-GEOD-29536
            # Assume most recent is "best", store the rest in experiment
            # annotation.
            if isinstance(idf_dict['Publication Title'], list):
                experiment_object.publication_title = "; ".join(
                    idf_dict['Publication Title'])
            else:
                experiment_object.publication_title = idf_dict[
                    'Publication Title']
            experiment_object.has_publication = True
        if 'Publication DOI' in idf_dict:
            if isinstance(idf_dict['Publication DOI'], list):
                experiment_object.publication_doi = ", ".join(
                    idf_dict['Publication DOI'])
            else:
                experiment_object.publication_doi = idf_dict['Publication DOI']
            experiment_object.has_publication = True
        if 'PubMed ID' in idf_dict:
            if isinstance(idf_dict['PubMed ID'], list):
                experiment_object.pubmed_id = ", ".join(idf_dict['PubMed ID'])
            else:
                experiment_object.pubmed_id = idf_dict['PubMed ID']
            experiment_object.has_publication = True

        # Scrape publication title and authorship from Pubmed
        if experiment_object.pubmed_id:
            pubmed_metadata = utils.get_title_and_authors_for_pubmed_id(
                experiment_object.pubmed_id)
            experiment_object.publication_title = pubmed_metadata[0]
            experiment_object.publication_authors = pubmed_metadata[1]

        experiment_object.save()

    platform_dict = {}
    for k in ('platform_accession_code', 'platform_accession_name',
              'manufacturer'):
        platform_dict[k] = experiment[k]

    return experiment_object, platform_dict
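# A quick illustration of the IDF parsing above: keys with one tab-separated
# value map to a string, keys with several map to a list, which is why the
# isinstance(..., list) checks are needed downstream. The sample lines are
# fabricated for demonstration:
idf_text = ("Investigation Title\tA hypothetical study\n"
            "Person Affiliation\tInstitute A\tInstitute B\n")

idf_dict = {}
for line in idf_text.split('\n'):
    keyval = line.strip().split('\t')
    if len(keyval) == 2:
        idf_dict[keyval[0]] = keyval[1]
    elif len(keyval) > 2:
        idf_dict[keyval[0]] = keyval[1:]

assert idf_dict['Investigation Title'] == 'A hypothetical study'
assert idf_dict['Person Affiliation'] == ['Institute A', 'Institute B']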
def download_array_express(job_id: int) -> None:
    """The main function for the Array Express Downloader.

    Downloads a single zip file containing the .PCL files representing
    samples relating to a single experiment stored in ArrayExpress.
    """
    job = utils.start_job(job_id)
    success = True

    file_assocs = DownloaderJobOriginalFileAssociation.objects.filter(
        downloader_job=job)
    # AE will have multiple files per DownloaderJob, but they are all
    # pieces of the same zip file so they're all referencing the same
    # URL.
    original_file = file_assocs[0].original_file
    url = original_file.source_url
    accession_code = job.accession_code

    # First, get all the unique sample archive URLs.
    # There may be more than one!
    # Then, unpack all the ones downloaded.
    # Then create processor jobs!
    og_files = []

    # The files for all of the samples are contained within the same
    # zip file. Therefore only download the one.
    os.makedirs(LOCAL_ROOT_DIR + '/' + accession_code, exist_ok=True)

    # Add a timestamp in milliseconds to the filename to prevent multiple
    # jobs from using the same file.
    filename = url.split('/')[-1] + "." + str(int(time.time() * 1000))
    dl_file_path = LOCAL_ROOT_DIR + '/' + accession_code + '/' + filename + ".zip"
    _download_file(url, dl_file_path, job)

    extracted_files = _extract_files(dl_file_path, accession_code, job)

    for og_file in extracted_files:
        try:
            original_file = OriginalFile.objects.get(
                source_filename=og_file['filename'],
                source_url=original_file.source_url)
            original_file.is_downloaded = True
            original_file.is_archive = False
            original_file.absolute_file_path = og_file['absolute_path']
            original_file.filename = og_file['absolute_path'].split('/')[-1]
            original_file.calculate_size()
            original_file.save()
            original_file.calculate_sha1()
            og_files.append(original_file)
        except Exception:
            # The suspicion is that there are extra files related to
            # another experiment, that we don't want associated with
            # this one.
            logger.debug("Found a file we didn't have an OriginalFile for! "
                         "Why did this happen?: " + og_file['filename'],
                         downloader_job=job_id)
            os.remove(og_file["absolute_path"])
            continue

        sample_objects = Sample.objects.filter(
            originalfile=original_file).order_by('created_at')
        if sample_objects.count() > 1:
            logger.warn("Found an Array Express OriginalFile with more than"
                        " one sample: %s",
                        filename,
                        downloader_job=job_id)

        # If the file is a .CEL file, it is the ultimate source of
        # truth about the sample's platform.
        sample_object = sample_objects[0]
        if og_file["filename"].upper()[-4:] == ".CEL" and sample_object.has_raw:
            cel_file_platform = None
            platform_accession_code = "UNSUPPORTED"

            try:
                cel_file_platform = microarray.get_platform_from_CEL(
                    original_file.absolute_file_path)

                for platform in get_supported_microarray_platforms():
                    if platform["platform_accession"] == cel_file_platform:
                        platform_accession_code = platform["platform_accession"]
            except Exception:
                platform_accession_code = "UNDETERMINABLE"
                logger.warn("Unable to determine platform from CEL file: "
                            + original_file.absolute_file_path,
                            downloader_job=job_id)

            if platform_accession_code == "UNSUPPORTED":
                logger.error("Found a raw .CEL file with an unsupported platform!",
                             file_name=original_file.absolute_file_path,
                             sample=sample_object.id,
                             downloader_job=job_id,
                             cel_file_platform=cel_file_platform)
                job.failure_reason = (
                    "Found a raw .CEL file with an unsupported platform: "
                    + original_file.absolute_file_path + " ("
                    + str(cel_file_platform) + ")")
                job.no_retry = True
                success = False

                # The file is unsupported, delete it!
                original_file.delete_local_file()
                original_file.delete()
            elif platform_accession_code == "UNDETERMINABLE":
                # If we cannot determine the platform from the
                # .CEL file, the platform discovered via metadata
                # may be correct so just leave it be.
                pass
            else:
                # We determined the file was collected with a supported
                # Affymetrix platform.
                sample_object.platform_accession_code = platform_accession_code
                sample_object.platform_name = get_readable_affymetrix_names()[
                    platform_accession_code]

            # However, if the filename contains '.CEL' we know
            # it's an Affymetrix Microarray
            sample_object.technology = "MICROARRAY"
            sample_object.manufacturer = "AFFYMETRIX"
            sample_object.save()

    if success:
        logger.debug("File downloaded and extracted successfully.",
                     url=url,
                     downloader_job=job_id)
        utils.create_processor_jobs_for_original_files(og_files, job)

    utils.end_downloader_job(job, success)
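# A small sketch of the collision-avoidance trick used when naming the
# downloaded archive above: appending a millisecond timestamp means two
# concurrent jobs for the same URL never write to the same path. The URL and
# root directory here are illustrative:
import time

url = "https://www.ebi.ac.uk/arrayexpress/files/E-MTAB-3050/E-MTAB-3050.raw.1.zip"
accession_code = "E-MTAB-3050"
LOCAL_ROOT_DIR = "/home/user/data_store"

filename = url.split('/')[-1] + "." + str(int(time.time() * 1000))
dl_file_path = LOCAL_ROOT_DIR + '/' + accession_code + '/' + filename + ".zip"
print(dl_file_path)
# e.g. /home/user/data_store/E-MTAB-3050/E-MTAB-3050.raw.1.zip.1712345678901.zip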