def discover_species(self):
    ensembl_division = (
        SurveyJobKeyValue
        .objects
        .get(survey_job_id=self.survey_job.id,
             key__exact="ensembl_division")
        .value
    )

    logger.info("Surveying %s division of ensembl.",
                ensembl_division,
                survey_job=self.survey_job.id)

    # The main division has a different base URL for its REST API.
    if ensembl_division == "Ensembl":
        r = utils.requests_retry_session().get(MAIN_DIVISION_URL_TEMPLATE)

        # Yes I'm aware that specieses isn't a word. However I need to
        # distinguish between a singular species and multiple species.
        specieses = r.json()["species"]
    else:
        r = utils.requests_retry_session().get(
            DIVISION_URL_TEMPLATE.format(division=ensembl_division))
        specieses = r.json()

    try:
        organism_name = SurveyJobKeyValue.objects.get(
            survey_job_id=self.survey_job.id,
            key__exact="organism_name").value
        organism_name = organism_name.lower().replace(' ', "_")
    except SurveyJobKeyValue.DoesNotExist:
        organism_name = None

    all_new_species = []
    if organism_name:
        for species in specieses:
            # This key varies based on whether the division is the
            # main one or not... why couldn't they just make them
            # consistent?
            if ('species' in species and species['species'] == organism_name) \
                    or ('name' in species and species['name'] == organism_name):
                all_new_species.append(self._generate_files(species))
                break
    else:
        for species in specieses:
            all_new_species.append(self._generate_files(species))

    if len(all_new_species) == 0:
        logger.error("Unable to find any species!",
                     ensembl_division=ensembl_division,
                     organism_name=organism_name)

    return all_new_species

def gather_sample_metadata(metadata: Dict) -> None:
    formatted_metadata_URL = ENA_METADATA_URL_TEMPLATE.format(
        metadata["sample_accession"])
    response = utils.requests_retry_session().get(formatted_metadata_URL)
    sample_xml = ET.fromstring(response.text)

    sample = sample_xml[0]

    if "center_name" in sample.attrib:
        metadata["sample_center_name"] = sample.attrib["center_name"]

    for child in sample:
        if child.tag == "TITLE":
            metadata["sample_title"] = child.text
        elif child.tag == "SAMPLE_NAME":
            for grandchild in child:
                if grandchild.tag == "TAXON_ID":
                    metadata["organism_id"] = grandchild.text
                elif grandchild.tag == "SCIENTIFIC_NAME":
                    metadata["organism_name"] = grandchild.text.upper()
        elif child.tag == "SAMPLE_ATTRIBUTES":
            for grandchild in child:
                key, value = SraSurveyor.parse_attribute(grandchild, "sample_")
                metadata[key] = value

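# A minimal sketch (not from the original source) of the traversal above, run
# against a small hand-written XML document shaped like an ENA sample record.
# The accession, center name, and values are hypothetical.
import xml.etree.ElementTree as ET

sample_xml_text = """
<SAMPLE_SET>
  <SAMPLE accession="ERS000001" center_name="Some Center">
    <TITLE>A hypothetical sample</TITLE>
    <SAMPLE_NAME>
      <TAXON_ID>9606</TAXON_ID>
      <SCIENTIFIC_NAME>Homo sapiens</SCIENTIFIC_NAME>
    </SAMPLE_NAME>
  </SAMPLE>
</SAMPLE_SET>
"""

metadata = {}
sample = ET.fromstring(sample_xml_text)[0]  # first child of the root element
if "center_name" in sample.attrib:
    metadata["sample_center_name"] = sample.attrib["center_name"]
for child in sample:
    if child.tag == "TITLE":
        metadata["sample_title"] = child.text
    elif child.tag == "SAMPLE_NAME":
        for grandchild in child:
            if grandchild.tag == "TAXON_ID":
                metadata["organism_id"] = grandchild.text
            elif grandchild.tag == "SCIENTIFIC_NAME":
                metadata["organism_name"] = grandchild.text.upper()

assert metadata["sample_center_name"] == "Some Center"
assert metadata["organism_name"] == "HOMO SAPIENS"
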
def __init__(self, species: Dict):
    """Species is a Dict containing parsed JSON from the Division API."""
    self.url_root = "ensemblgenomes.org/pub/release-{assembly_version}/{short_division}"
    self.division = species["division"]
    self.short_division = DIVISION_LOOKUP[species["division"]]

    mapping = get_strain_mapping_for_organism(species["name"])
    if mapping:
        self.assembly = mapping["assembly"]
        self.strain = mapping["strain"]
    else:
        self.assembly = species["assembly_name"].replace(" ", "_")
        self.strain = None

    assembly_response = utils.requests_retry_session().get(DIVISION_RELEASE_URL)
    self.assembly_version = assembly_response.json()["version"]

    self.species_sub_dir = species["name"]
    self.filename_species = species["name"].capitalize()

    # These fields aren't needed for the URL, but they vary between
    # the two REST APIs.
    self.scientific_name = species["name"].upper()

    # This field can be stored in multiple keys, but if
    # `species_taxonomy_id` is there it's the one we want because
    # it's not strain-specific.
    if "species_taxonomy_id" in species:
        self.taxonomy_id = species["species_taxonomy_id"]
    else:
        self.taxonomy_id = species["taxonomy_id"]

    # This field is only needed for EnsemblBacteria and EnsemblFungi.
    self.collection = ""

def test_ordering_mismatch(self):
    """Makes sure that the order samples' keys are in does not affect the
    title chosen.

    Related: https://github.com/AlexsLemonade/refinebio/pull/304
    """
    experiment_accession_code = "E-TABM-38"
    samples_endpoint = SAMPLES_URL.format(experiment_accession_code)
    r = utils.requests_retry_session().get(samples_endpoint, timeout=60)
    json_samples = r.json()["experiment"]["sample"]
    flattened_json_samples = [
        utils.flatten(json_sample) for json_sample in json_samples
    ]

    SDRF_URL_TEMPLATE = "https://www.ebi.ac.uk/arrayexpress/files/{code}/{code}.sdrf.txt"
    sdrf_url = SDRF_URL_TEMPLATE.format(code=experiment_accession_code)
    parsed_samples = parse_sdrf(sdrf_url)

    title_field = determine_title_field(parsed_samples, flattened_json_samples)
    sdrf_samples = harmonize_all_samples(parsed_samples, title_field)
    json_titles = [
        extract_title(json_sample, title_field)
        for json_sample in flattened_json_samples
    ]

    # The titles won't match up if the order of the sample dicts
    # isn't corrected for, resulting in a KeyError being raised.
    # So if this doesn't raise a KeyError, then we're good.
    for title in json_titles:
        sdrf_samples[title]

def gather_study_metadata(metadata: Dict) -> None:
    formatted_metadata_URL = ENA_METADATA_URL_TEMPLATE.format(
        metadata["study_accession"])
    response = utils.requests_retry_session().get(formatted_metadata_URL)
    study_xml = ET.fromstring(response.text)

    study = study_xml[0]
    for child in study:
        if child.tag == "DESCRIPTOR":
            for grandchild in child:
                # STUDY_TYPE is the only tag which uses attributes
                # instead of the text for whatever reason
                if grandchild.tag == "STUDY_TYPE":
                    metadata[grandchild.tag.lower()] = \
                        grandchild.attrib["existing_study_type"]
                else:
                    metadata[grandchild.tag.lower()] = grandchild.text
        elif child.tag == "STUDY_ATTRIBUTES":
            for grandchild in child:
                key, value = SraSurveyor.parse_attribute(grandchild, "study_")
                metadata[key] = value
        elif child.tag == "STUDY_LINKS":
            for grandchild in child:
                for ggc in grandchild:
                    # Index into the element directly rather than using the
                    # deprecated getchildren() method.
                    if ggc[0].text == "pubmed":
                        metadata["pubmed_id"] = ggc[1].text
                        break
        elif child.tag == "IDENTIFIERS":
            for grandchild in child:
                if (grandchild.tag == "EXTERNAL_ID"
                        and grandchild.attrib.get("namespace", "") == "GEO"):
                    metadata["external_id"] = grandchild.text

def __init__(self, species: Dict):
    """Species is a Dict containing parsed JSON from the Division API."""
    self.url_root = "ensemblgenomes.org/pub/release-{assembly_version}/{short_division}"
    self.short_division = DIVISION_LOOKUP[species["division"]]
    self.assembly = species["assembly_name"].replace(" ", "_")
    self.assembly_version = utils.requests_retry_session().get(
        DIVISION_RELEASE_URL).json()["version"]

    # Some species are nested within a collection directory. If
    # this is the case, then we need to add that extra directory
    # to the URL, and for whatever reason the filename is not
    # capitalized.
    COLLECTION_REGEX = r"^(.*_collection).*"
    match_object = re.search(COLLECTION_REGEX, species["dbname"])
    if match_object:
        self.species_sub_dir = match_object.group(1) + "/" + species["species"]
        self.filename_species = species["species"]
    else:
        self.species_sub_dir = species["species"]
        self.filename_species = species["species"].capitalize()

    # These fields aren't needed for the URL, but they vary between
    # the two REST APIs.
    self.scientific_name = species["name"].upper()
    self.taxonomy_id = species["taxonomy_id"]

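# A minimal sketch (not from the original source) of how the collection regex
# above behaves. The dbname values are hypothetical, but follow the Ensembl
# Genomes convention where collection databases contain "_collection".
import re

COLLECTION_REGEX = r"^(.*_collection).*"

# A species nested inside a collection directory:
match = re.search(COLLECTION_REGEX, "bacteria_0_collection_core_41_94_1")
assert match.group(1) == "bacteria_0_collection"

# A species that is not part of a collection gets no match:
assert re.search(COLLECTION_REGEX, "homo_sapiens_core_94_38") is None
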
def __init__(self, species: Dict):
    self.url_root = "ensembl.org/pub/release-{assembly_version}"
    self.short_division = None
    self.species_sub_dir = species["name"]
    self.filename_species = species["name"].capitalize()
    self.assembly = species["assembly"]
    self.assembly_version = utils.requests_retry_session().get(
        MAIN_RELEASE_URL).json()["release"]
    self.scientific_name = self.filename_species.replace("_", " ")
    self.taxonomy_id = species["taxon_id"]

def gather_run_metadata(run_accession: str) -> Dict:
    """A run refers to a specific read in an experiment."""
    discoverable_accessions = [
        "study_accession", "sample_accession", "submission_accession"
    ]

    response = utils.requests_retry_session().get(
        ENA_METADATA_URL_TEMPLATE.format(run_accession))
    try:
        run_xml = ET.fromstring(response.text)
    except Exception:
        logger.exception("Unable to decode response", response=response.text)
        return {}

    # Necessary because ERP000263 has only one ROOT element containing this error:
    # Entry: ERR15562 display type is either not supported or entry is not found.
    if len(run_xml) == 0:
        return {}

    run_item = run_xml[0]

    useful_attributes = [
        "center_name", "run_center", "run_date", "broker_name", "alias"
    ]
    metadata = {}
    for attribute in useful_attributes:
        if attribute in run_item.attrib:
            metadata[attribute] = run_item.attrib[attribute]
    metadata["run_accession"] = run_accession

    for child in run_item:
        if child.tag == "EXPERIMENT_REF":
            metadata["experiment_accession"] = child.attrib["accession"]
        elif child.tag == "RUN_LINKS":
            for grandchild in child:
                key, value = SraSurveyor.parse_run_link(grandchild)
                if value != "" and key in discoverable_accessions:
                    metadata[key] = value
        elif child.tag == "RUN_ATTRIBUTES":
            for grandchild in child:
                key, value = SraSurveyor.parse_attribute(grandchild, "run_")
                metadata[key] = value

    return metadata

def gather_study_metadata(metadata: Dict) -> None:
    formatted_metadata_URL = ENA_METADATA_URL_TEMPLATE.format(
        metadata["study_accession"])
    response = utils.requests_retry_session().get(formatted_metadata_URL)
    study_xml = ET.fromstring(response.text)

    study = study_xml[0]
    for child in study:
        if child.tag == "DESCRIPTOR":
            for grandchild in child:
                # STUDY_TYPE is the only tag which uses attributes
                # instead of the text for whatever reason
                if grandchild.tag == "STUDY_TYPE":
                    metadata[grandchild.tag.lower()] = \
                        grandchild.attrib["existing_study_type"]
                else:
                    metadata[grandchild.tag.lower()] = grandchild.text
        elif child.tag == "STUDY_ATTRIBUTES":
            for grandchild in child:
                key, value = SraSurveyor.parse_attribute(grandchild, "study_")
                metadata[key] = value
        elif child.tag == "STUDY_LINKS":
            for grandchild in child:
                for ggc in grandchild:
                    # Index into the element directly rather than using the
                    # deprecated getchildren() method.
                    if ggc[0].text == "pubmed":
                        metadata["pubmed_id"] = ggc[1].text
                        break
        elif child.tag == "IDENTIFIERS":
            for grandchild in child:
                if (
                        # Check for GEO accessions. These live inside an
                        # EXTERNAL_ID tag with namespace GEO
                        grandchild.tag == "EXTERNAL_ID"
                        and grandchild.attrib.get("namespace", "") == "GEO"
                        and re.match(r"^GSE\d{2,6}", grandchild.text)
                ) or (
                        # Check for ArrayExpress accessions. These live inside a
                        # SUBMITTER_ID tag, but the namespace is not standardized
                        grandchild.tag == "SUBMITTER_ID"
                        and re.match(r"^E-[A-Z]{4}-\d{2,6}", grandchild.text)):
                    metadata["external_id"] = grandchild.text
                    break

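# A minimal sketch (not from the original source) showing what the two
# accession regexes above accept. The example accession strings are
# hypothetical but follow the GEO and ArrayExpress formats.
import re

assert re.match(r"^GSE\d{2,6}", "GSE12345")            # GEO series accession
assert re.match(r"^GSE\d{2,6}", "GSM12345") is None    # GEO sample, rejected

assert re.match(r"^E-[A-Z]{4}-\d{2,6}", "E-MTAB-3050")       # ArrayExpress accession
assert re.match(r"^E-[A-Z]{4}-\d{2,6}", "E-MTAB") is None    # missing numeric part
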
def gather_submission_metadata(metadata: Dict) -> None:
    formatted_metadata_URL = ENA_METADATA_URL_TEMPLATE.format(
        metadata["submission_accession"])
    response = utils.requests_retry_session().get(formatted_metadata_URL)
    submission_xml = ET.fromstring(response.text)[0]
    submission_metadata = submission_xml.attrib

    # We already have these
    submission_metadata.pop("accession", '')
    submission_metadata.pop("alias", '')

    metadata.update(submission_metadata)

    for child in submission_xml:
        if child.tag == "TITLE":
            metadata["submission_title"] = child.text
        elif child.tag == "SUBMISSION_ATTRIBUTES":
            for grandchild in child:
                metadata[grandchild.find("TAG").text.lower()] = \
                    grandchild.find("VALUE").text

def parse_sdrf(sdrf_url: str) -> List:
    """Given a URL to an SDRF file, downloads and parses it into a list of
    per-sample dicts.
    """
    try:
        sdrf_response = requests_retry_session().get(sdrf_url, timeout=60)
    except Exception:
        logger.exception("Unable to fetch URL: " + sdrf_url)
        return []

    if sdrf_response.status_code != 200:
        logger.error("Unable to fetch URL: " + sdrf_url,
                     response_code=sdrf_response.status_code)
        return []

    sdrf_text = sdrf_response.text

    samples = []

    reader = csv.reader(StringIO(sdrf_text), delimiter="\t")
    for offset, line in enumerate(reader):
        # Get the keys from the header row
        if offset == 0:
            keys = line
            continue

        sample_values = line

        # Skip malformed lines
        if len(sample_values) != len(keys):
            continue

        sample = {}
        for col, value in enumerate(sample_values):
            key = keys[col]
            sample[key] = value
        samples.append(sample)

    return samples

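# A minimal sketch (not from the original source) of the header/row pairing
# logic used by parse_sdrf above, run on a tiny in-memory SDRF. The column
# names and values are hypothetical.
import csv
from io import StringIO

sdrf_text = (
    "Source Name\tCharacteristics [organism]\n"
    "sample 1\tHomo sapiens\n"
    "sample 2\tHomo sapiens\n"
)

reader = csv.reader(StringIO(sdrf_text), delimiter="\t")
rows = list(reader)
keys, value_rows = rows[0], rows[1:]
# Each data row becomes a dict keyed by the header row.
samples = [dict(zip(keys, values)) for values in value_rows]

assert samples[0]["Source Name"] == "sample 1"
assert samples[1]["Characteristics [organism]"] == "Homo sapiens"
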
def gather_experiment_metadata(metadata: Dict) -> None:
    formatted_metadata_URL = ENA_METADATA_URL_TEMPLATE.format(
        metadata["experiment_accession"])
    response = utils.requests_retry_session().get(formatted_metadata_URL)
    experiment_xml = ET.fromstring(response.text)

    experiment = experiment_xml[0]
    for child in experiment:
        if child.tag == "TITLE":
            metadata["experiment_title"] = child.text
        elif child.tag == "DESIGN":
            for grandchild in child:
                if grandchild.tag == "DESIGN_DESCRIPTION":
                    metadata["experiment_design_description"] = grandchild.text
                elif grandchild.tag == "LIBRARY_DESCRIPTOR":
                    SraSurveyor.gather_library_metadata(metadata, grandchild)
                elif grandchild.tag == "SPOT_DESCRIPTOR":
                    SraSurveyor.gather_spot_metadata(metadata, grandchild)
        elif child.tag == "PLATFORM":
            # This structure is extraneously nested.
            metadata["platform_instrument_model"] = child[0][0].text

def gather_file_report(run_accession: str) -> List[Dict]:
    """Get stats about files and check for unmated reads.

    This endpoint returns a weird format, so some custom parsing is required:

    run_accession fastq_ftp fastq_bytes fastq_md5 submitted_ftp submitted_bytes submitted_md5 sra_ftp sra_bytes sra_md5
    SRR7353755 ftp.sra.ebi.ac.uk/vol1/fastq/SRR735/005/SRR7353755/SRR7353755.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/SRR735/005/SRR7353755/SRR7353755_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/SRR735/005/SRR7353755/SRR7353755_2.fastq.gz 25176;2856704;3140575 7ef1ba010dcb679217112efa380798b2;6bc5651b7103306d4d65018180ab8d0d;3856c14164612d9879d576a046a9879f
    """
    response = utils.requests_retry_session().get(
        ENA_FILE_REPORT_URL_TEMPLATE.format(accession=run_accession))

    lines = response.text.split("\n")
    split_lines = [line.split("\t") for line in lines]
    header_row = split_lines[0]
    sample_row = split_lines[1]

    file_info = []
    for i, key in enumerate(header_row):
        if key in ["fastq_ftp", "fastq_bytes", "fastq_md5"]:
            # Use a separate index for the inner loop so it doesn't
            # shadow the column index.
            for j, value in enumerate(sample_row[i].split(";")):
                if j >= len(file_info):
                    file_info.append({key: value})
                else:
                    file_info[j][key] = value

    return file_info

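# A minimal sketch (not from the original source) of the semicolon-splitting
# logic above, applied to the example row from the docstring. Each fastq_*
# column holds one ';'-separated entry per file, and the parser zips them
# back together into one dict per file.
header_row = ["run_accession", "fastq_ftp", "fastq_bytes", "fastq_md5"]
sample_row = [
    "SRR7353755",
    "ftp.sra.ebi.ac.uk/vol1/fastq/SRR735/005/SRR7353755/SRR7353755.fastq.gz;"
    "ftp.sra.ebi.ac.uk/vol1/fastq/SRR735/005/SRR7353755/SRR7353755_1.fastq.gz;"
    "ftp.sra.ebi.ac.uk/vol1/fastq/SRR735/005/SRR7353755/SRR7353755_2.fastq.gz",
    "25176;2856704;3140575",
    "7ef1ba010dcb679217112efa380798b2;6bc5651b7103306d4d65018180ab8d0d;"
    "3856c14164612d9879d576a046a9879f",
]

file_info = []
for i, key in enumerate(header_row):
    if key in ["fastq_ftp", "fastq_bytes", "fastq_md5"]:
        for j, value in enumerate(sample_row[i].split(";")):
            if j >= len(file_info):
                file_info.append({key: value})
            else:
                file_info[j][key] = value

assert len(file_info) == 3
assert file_info[1]["fastq_bytes"] == "2856704"
assert file_info[2]["fastq_ftp"].endswith("SRR7353755_2.fastq.gz")
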
def create_experiment_from_api(
        self, experiment_accession_code: str) -> (Experiment, Dict):
    """Given an experiment accession code, create an Experiment object.

    Also returns a dictionary of additional information about the
    platform discovered for the experiment.

    Will raise an UnsupportedPlatformException if this experiment was
    conducted using a platform which we don't support.

    See an example at: https://www.ebi.ac.uk/arrayexpress/json/v3/experiments/E-MTAB-3050/sample
    """
    request_url = EXPERIMENTS_URL + experiment_accession_code
    experiment_request = utils.requests_retry_session().get(request_url,
                                                            timeout=60)

    try:
        parsed_json = experiment_request.json()["experiments"]["experiment"][0]
    except KeyError:
        logger.error("Remote experiment has no Experiment data!",
                     experiment_accession_code=experiment_accession_code,
                     survey_job=self.survey_job.id)
        raise

    experiment = {}
    experiment["name"] = parsed_json["name"]
    experiment["experiment_accession_code"] = experiment_accession_code

    # This experiment has no platform at all, and is therefore useless.
    if 'arraydesign' not in parsed_json or len(parsed_json["arraydesign"]) == 0:
        logger.warn("Remote experiment has no arraydesign listed.",
                    experiment_accession_code=experiment_accession_code,
                    survey_job=self.survey_job.id)
        raise UnsupportedPlatformException
    # If there is more than one arraydesign listed in the experiment
    # then there is no other way to determine which array was used
    # for which sample other than looking at the header of the CEL
    # file. That obviously cannot happen until the CEL file has been
    # downloaded so we can just mark it as UNKNOWN and let the
    # downloader inspect the downloaded file to determine the
    # array then.
    elif len(parsed_json["arraydesign"]) != 1 \
            or "accession" not in parsed_json["arraydesign"][0]:
        experiment["platform_accession_code"] = UNKNOWN
        experiment["platform_accession_name"] = UNKNOWN
        experiment["manufacturer"] = UNKNOWN
    else:
        external_accession = parsed_json["arraydesign"][0]["accession"]
        for platform in get_supported_microarray_platforms():
            if platform["external_accession"] == external_accession:
                experiment["platform_accession_code"] = \
                    get_normalized_platform(platform["platform_accession"])

                # Illumina appears in the accession codes for
                # platforms manufactured by Illumina
                if "ILLUMINA" in experiment["platform_accession_code"].upper():
                    experiment["manufacturer"] = "ILLUMINA"
                    experiment["platform_accession_name"] = \
                        platform["platform_accession"]
                else:
                    # It's not Illumina, the only other supported Microarray
                    # platform is Affy. As our list of supported platforms
                    # grows this logic will need to get more sophisticated.
                    experiment["manufacturer"] = "AFFYMETRIX"
                    platform_mapping = get_readable_affymetrix_names()
                    experiment["platform_accession_name"] = \
                        platform_mapping[platform["platform_accession"]]

        if "platform_accession_code" not in experiment:
            # We don't know what platform this accession corresponds to.
experiment["platform_accession_code"] = external_accession experiment["platform_accession_name"] = UNKNOWN experiment["manufacturer"] = UNKNOWN experiment["release_date"] = parsed_json["releasedate"] if "lastupdatedate" in parsed_json: experiment["last_update_date"] = parsed_json["lastupdatedate"] else: experiment["last_update_date"] = parsed_json["releasedate"] # Create the experiment object try: experiment_object = Experiment.objects.get( accession_code=experiment_accession_code) logger.debug( "Experiment already exists, skipping object creation.", experiment_accession_code=experiment_accession_code, survey_job=self.survey_job.id) except Experiment.DoesNotExist: # We aren't sure these fields will be populated, or how many there will be. # Try to join them all together, or set a sensible default. experiment_descripton = "" if "description" in parsed_json and len( parsed_json["description"]) > 0: for description_item in parsed_json["description"]: if "text" in description_item: experiment_descripton = experiment_descripton + description_item[ "text"] + "\n" if experiment_descripton == "": experiment_descripton = "Description not available.\n" experiment_object = Experiment() experiment_object.accession_code = experiment_accession_code experiment_object.source_url = request_url experiment_object.source_database = "ARRAY_EXPRESS" experiment_object.title = parsed_json["name"] # This will need to be updated if we ever use Array # Express to get other kinds of data. experiment_object.technology = "MICROARRAY" experiment_object.description = experiment_descripton experiment_object.source_first_published = parse_datetime( experiment["release_date"]) experiment_object.source_last_modified = parse_datetime( experiment["last_update_date"]) experiment_object.save() json_xa = ExperimentAnnotation() json_xa.experiment = experiment_object json_xa.data = parsed_json json_xa.is_ccdl = False json_xa.save() ## Fetch and parse the IDF/SDRF file for any other fields IDF_URL_TEMPLATE = "https://www.ebi.ac.uk/arrayexpress/files/{code}/{code}.idf.txt" idf_url = IDF_URL_TEMPLATE.format(code=experiment_accession_code) idf_text = utils.requests_retry_session().get(idf_url, timeout=60).text lines = idf_text.split('\n') idf_dict = {} for line in lines: keyval = line.strip().split('\t') if len(keyval) == 2: idf_dict[keyval[0]] = keyval[1] elif len(keyval) > 2: idf_dict[keyval[0]] = keyval[1:] idf_xa = ExperimentAnnotation() idf_xa.data = idf_dict idf_xa.experiment = experiment_object idf_xa.is_ccdl = False idf_xa.save() if 'Investigation Title' in idf_dict: experiment_object.title = idf_dict['Investigation Title'] if 'Person Affiliation' in idf_dict: # This is very rare, ex: E-MEXP-32 if isinstance(idf_dict['Person Affiliation'], list): unique_people = list(set(idf_dict['Person Affiliation'])) experiment_object.submitter_institution = ", ".join( unique_people)[:255] else: experiment_object.submitter_institution = idf_dict[ 'Person Affiliation'] # Get protocol_description from "<experiment_url>/protocols" # instead of from idf_dict, because the former provides more # details. protocol_url = request_url + '/protocols' protocol_request = utils.requests_retry_session().get(protocol_url, timeout=60) try: experiment_object.protocol_description = protocol_request.json( )['protocols'] except KeyError: logger.warning( "Remote experiment has no protocol data!", experiment_accession_code=experiment_accession_code, survey_job=self.survey_job.id) if 'Publication Title' in idf_dict: # This will happen for some superseries. 
            # Ex: E-GEOD-29536
            # Assume most recent is "best", store the rest in experiment
            # annotation.
            if isinstance(idf_dict['Publication Title'], list):
                experiment_object.publication_title = "; ".join(
                    idf_dict['Publication Title'])
            else:
                experiment_object.publication_title = \
                    idf_dict['Publication Title']
            experiment_object.has_publication = True
        if 'Publication DOI' in idf_dict:
            if isinstance(idf_dict['Publication DOI'], list):
                experiment_object.publication_doi = ", ".join(
                    idf_dict['Publication DOI'])
            else:
                experiment_object.publication_doi = \
                    idf_dict['Publication DOI']
            experiment_object.has_publication = True
        if 'PubMed ID' in idf_dict:
            if isinstance(idf_dict['PubMed ID'], list):
                experiment_object.pubmed_id = ", ".join(idf_dict['PubMed ID'])
            else:
                experiment_object.pubmed_id = idf_dict['PubMed ID']
            experiment_object.has_publication = True

        # Scrape publication title and authorship from Pubmed
        if experiment_object.pubmed_id:
            pubmed_metadata = utils.get_title_and_authors_for_pubmed_id(
                experiment_object.pubmed_id)
            experiment_object.publication_title = pubmed_metadata[0]
            experiment_object.publication_authors = pubmed_metadata[1]

        experiment_object.save()

    platform_dict = {}
    for k in ('platform_accession_code', 'platform_accession_name',
              'manufacturer'):
        platform_dict[k] = experiment[k]

    return experiment_object, platform_dict

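# A minimal sketch (not from the original source) of the IDF line parsing
# above: one tab-separated key per line, stored as a scalar when there is a
# single value and as a list when there are several. The IDF content here is
# hypothetical.
idf_text = (
    "Investigation Title\tA hypothetical experiment\n"
    "Person Affiliation\tLab A\tLab B\n"
)

idf_dict = {}
for line in idf_text.split('\n'):
    keyval = line.strip().split('\t')
    if len(keyval) == 2:
        idf_dict[keyval[0]] = keyval[1]
    elif len(keyval) > 2:
        idf_dict[keyval[0]] = keyval[1:]

assert idf_dict["Investigation Title"] == "A hypothetical experiment"
assert idf_dict["Person Affiliation"] == ["Lab A", "Lab B"]
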
def handle(self, *args, **options):
    """Refreshes the metadata for all experiments, or experiments from a
    specific database
    """
    possible_source_databases = ["ARRAY_EXPRESS", "GEO", "SRA"]

    if options.get("source_database", None) is None:
        experiments = Experiment.objects.all()
    elif options["source_database"] in possible_source_databases:
        source_database = options["source_database"]
        experiments = Experiment.objects.filter(source_database=source_database)
    else:
        logger.error('Invalid source database "{}"'.format(
            options["source_database"]) +
            "\nPossible source databases: {}".format(
                ", ".join(possible_source_databases)))
        sys.exit(1)

    paginator = PerformantPaginator(experiments, PAGE_SIZE)
    page = paginator.page()

    while True:
        for experiment in page.object_list:
            logger.debug("Refreshing metadata for an experiment.",
                         experiment=experiment.accession_code)
            try:
                if experiment.source_database == "SRA":
                    metadata = SraSurveyor.gather_all_metadata(
                        experiment.samples.first().accession_code)
                    SraSurveyor._apply_metadata_to_experiment(
                        experiment, metadata)

                elif experiment.source_database == "GEO":
                    gse = GEOparse.get_GEO(
                        experiment.accession_code,
                        destdir="/tmp/management",
                        silent=True,
                    )
                    GeoSurveyor._apply_metadata_to_experiment(experiment, gse)

                elif experiment.source_database == "ARRAY_EXPRESS":
                    request_url = EXPERIMENTS_URL + experiment.accession_code
                    experiment_request = utils.requests_retry_session().get(
                        request_url, timeout=60)
                    try:
                        parsed_json = experiment_request.json(
                        )["experiments"]["experiment"][0]
                    except KeyError:
                        # Note: this is a management command, so there is no
                        # survey job to attach to this log message.
                        logger.error(
                            "Remote experiment has no Experiment data!",
                            experiment_accession_code=experiment.accession_code,
                        )
                        continue
                    ArrayExpressSurveyor._apply_metadata_to_experiment(
                        experiment, parsed_json)

                experiment.save()

            # If there are any errors, just continue. It's likely that it's
            # just a problem with this experiment.
            except Exception:
                logger.exception(
                    "exception caught while updating metadata for {}".format(
                        experiment.accession_code))

        if not page.has_next():
            break
        else:
            page = paginator.page(page.next_page_number())

        # 2000 samples queued up every five minutes should be fast
        # enough and also not thrash the DB.
        time.sleep(60 * 5)

def discover_species(self):
    ensembl_division = SurveyJobKeyValue.objects.get(
        survey_job_id=self.survey_job.id,
        key__exact="ensembl_division").value

    logger.info(
        "Surveying %s division of ensembl.",
        ensembl_division,
        survey_job=self.survey_job.id,
    )

    try:
        organism_name = SurveyJobKeyValue.objects.get(
            survey_job_id=self.survey_job.id,
            key__exact="organism_name").value
        organism_name = organism_name.lower().replace(" ", "_")
    except SurveyJobKeyValue.DoesNotExist:
        organism_name = None

    strain_mapping = None
    if ensembl_division in ["EnsemblFungi", "EnsemblBacteria"]:
        if organism_name is None:
            logger.error(
                "Organism name must be specified for Fungi and Bacteria divisions.",
                ensembl_division=ensembl_division,
                organism_name=organism_name,
            )
            return []
        else:
            strain_mapping = get_strain_mapping_for_organism(organism_name)
            if strain_mapping is None:
                logger.error(
                    ("Organism name must be listed in config/organism_strain_"
                     "mappings.csv for Fungi and Bacteria divisions."),
                    ensembl_division=ensembl_division,
                    organism_name=organism_name,
                )
                return []

    # The main division has a different base URL for its REST API.
    if ensembl_division == "Ensembl":
        r = utils.requests_retry_session().get(MAIN_DIVISION_URL_TEMPLATE)

        # Yes I'm aware that specieses isn't a word. However I need to
        # distinguish between a singular species and multiple species.
        specieses = r.json()["species"]
    else:
        formatted_division_url = DIVISION_URL_TEMPLATE.format(
            division=ensembl_division)
        r = utils.requests_retry_session().get(formatted_division_url)
        specieses = r.json()

    all_new_species = []
    if organism_name:
        if strain_mapping:
            organism_name = organism_name + "_" + \
                strain_mapping["strain"].lower()

        for species in specieses:
            if (ensembl_division in ["EnsemblFungi", "EnsemblBacteria"]
                    and organism_name in species["name"]):
                # Fungi and Bacteria have a strain identifier in their
                # names. This is different than everything else,
                # so we're going to handle this special case by
                # just overwriting this. This is okay because we
                # just have to discover one species for the
                # organism, and then our strain mapping will make
                # sure we use the correct strain and assembly.
                species["name"] = organism_name
                all_new_species.append(self._generate_files(species))
                break
            elif "name" in species and organism_name == species["name"]:
                all_new_species.append(self._generate_files(species))
                break
    else:
        for species in specieses:
            all_new_species.append(self._generate_files(species))

    if len(all_new_species) == 0:
        logger.error(
            "Unable to find any species!",
            ensembl_division=ensembl_division,
            organism_name=organism_name,
        )

    return all_new_species

def discover_experiment_and_samples(self):
    """Returns an experiment and a list of samples for an SRA accession"""
    survey_job = SurveyJob.objects.get(id=self.survey_job.id)
    survey_job_properties = survey_job.get_properties()
    accession = survey_job_properties["experiment_accession_code"]

    # SRA Surveyor is mainly designed for SRRs, this handles SRPs
    if "SRP" in accession or "ERP" in accession or "DRP" in accession:
        response = utils.requests_retry_session().get(
            ENA_METADATA_URL_TEMPLATE.format(accession))
        experiment_xml = ET.fromstring(response.text)[0]
        study_links = experiment_xml[2]  # STUDY_LINKS

        accessions_to_run = []
        for child in study_links:
            if child[0][0].text == "ENA-RUN":
                all_runs = child[0][1].text

                # Ranges can be disjoint, separated by commas
                run_segments = all_runs.split(",")
                for segment in run_segments:
                    if "-" in segment:
                        start, end = segment.split("-")
                    else:
                        start = segment
                        end = segment
                    start_id = start[3::]
                    end_id = end[3::]

                    for run_id in range(int(start_id), int(end_id) + 1):
                        run_id = str(run_id).zfill(len(start_id))
                        accessions_to_run.append(accession[0] + "RR" + run_id)
                break

        experiment = None
        all_samples = []
        for run_id in accessions_to_run:
            logger.debug(
                "Surveying SRA Run Accession %s for Experiment %s",
                run_id,
                accession,
                survey_job=self.survey_job.id,
            )
            returned_experiment, samples = \
                self._generate_experiment_and_samples(run_id, accession)

            # Some runs may return (None, None). If this happens
            # we don't want to set experiment to None.
            if returned_experiment:
                experiment = returned_experiment

            if samples:
                all_samples += samples

        # So we prevent duplicate downloads, ex for SRP111553
        all_samples = list(set(all_samples))

        # Experiment will always be the same
        return experiment, all_samples
    else:
        logger.debug("Surveying SRA Run Accession %s",
                     accession,
                     survey_job=self.survey_job.id)
        return self._generate_experiment_and_samples(accession)

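# A minimal sketch (not from the original source) of the run-range expansion
# above. ENA lists a study's runs as comma-separated segments, where each
# segment is either a single accession or a "start-end" range; the numeric
# part is zero-padded back to its original width. The accessions here are
# hypothetical.
all_runs = "SRR0000001-SRR0000003,SRR0000007"

accessions_to_run = []
for segment in all_runs.split(","):
    if "-" in segment:
        start, end = segment.split("-")
    else:
        start = segment
        end = segment
    start_id = start[3:]  # strip the three-letter prefix, e.g. "SRR"
    end_id = end[3:]
    for run_id in range(int(start_id), int(end_id) + 1):
        accessions_to_run.append("SRR" + str(run_id).zfill(len(start_id)))

assert accessions_to_run == [
    "SRR0000001", "SRR0000002", "SRR0000003", "SRR0000007"
]
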
def create_samples_from_api(self, experiment: Experiment,
                            platform_dict: Dict) -> List[Sample]:
    """Generates a Sample item for each sample in an AE experiment.

    There are many possible data situations for a sample:

        - If the sample only has raw data available:
            - If it is on a platform that we support:
                Download this raw data and process it
            - If it is not on a platform we support:
                Don't download anything, don't process anything
        - If the sample has both raw and derived data:
            - If the raw data is on a platform we support:
                Download the raw data and process it, abandon the derived data
            - If the raw data is not on a platform we support:
                Download the derived data and no-op it, abandon the raw data
        - If the sample only has derived data:
            Download the derived data and no-op it.

    See an example at: https://www.ebi.ac.uk/arrayexpress/json/v3/experiments/E-MTAB-3050/samples
    """
    created_samples = []

    samples_endpoint = SAMPLES_URL.format(experiment.accession_code)
    r = utils.requests_retry_session().get(samples_endpoint, timeout=60)
    samples = r.json()["experiment"]["sample"]

    # The SDRF is the complete metadata record on a sample/property basis.
    # We run this through our harmonizer and then attach the properties
    # to our created samples.
    SDRF_URL_TEMPLATE = "https://www.ebi.ac.uk/arrayexpress/files/{code}/{code}.sdrf.txt"
    sdrf_url = SDRF_URL_TEMPLATE.format(code=experiment.accession_code)
    sdrf_samples = harmony.parse_sdrf(sdrf_url)
    harmonized_samples = harmony.harmonize(sdrf_samples)

    # An experiment can have many samples
    for sample_data in samples:

        # For some reason, this sample has no files associated with it.
        if "file" not in sample_data or len(sample_data['file']) == 0:
            continue

        # Each sample is given an experimentally-unique title.
        flat_sample = utils.flatten(sample_data)
        title = harmony.extract_title(flat_sample)

        # A sample may actually have many sub files.
        # If there is raw data, take that.
        # If not, take the derived.
        has_raw = False
        for sub_file in sample_data['file']:

            # For ex: E-GEOD-15645
            if isinstance(sub_file['comment'], list):
                sub_file_mod = sub_file
                sub_file_mod['comment'] = sub_file['comment'][0]
            else:
                sub_file_mod = sub_file

            # Some have the 'data' field, but not the actual data
            # Ex: E-GEOD-9656
            if sub_file_mod['type'] == "data" \
                    and sub_file_mod['comment'].get('value', None) is not None:
                has_raw = True
            if 'raw' in sub_file_mod['comment'].get('value', ''):
                has_raw = True

        skip_sample = False
        for sub_file in sample_data['file']:

            # Don't get the raw data if it's only a 1-color sample.
            if 'Cy3' in str(sample_data) and 'Cy5' not in str(sample_data):
                has_raw = False

            # Skip derived data if we have it raw.
            if has_raw and "derived data" in sub_file['type']:
                continue

            download_url = None
            filename = sub_file["name"]

            # sub_file["comment"] is only a list if there's
            # more than one comment...
            comments = sub_file["comment"]
            if isinstance(comments, list):
                # Could be: "Derived ArrayExpress Data Matrix FTP
                # file" or: "ArrayExpress FTP file". If there is
                # no comment with a name including "FTP file" then
                # we don't know where to download it so we need to
                # mark this job as an error. Therefore don't catch
                # the potential exception where download_url
                # doesn't get defined.
                for comment in comments:
                    if "FTP file" in comment["name"]:
                        download_url = comment["value"]
                        break
            else:
                download_url = comments["value"]

            if not download_url:
                # The sample's accession code hasn't been determined at
                # this point, so log the experiment's accession instead.
                logger.error(
                    "Sample in experiment %s did not specify a download url, skipping.",
                    experiment.accession_code,
                    experiment_accession_code=experiment.accession_code,
                    survey_job=self.survey_job.id,
                    sub_file=sub_file)
                skip_sample = True
                continue

            if not filename:
                logger.error(
                    "Sample in experiment %s did not specify a filename, skipping.",
                    experiment.accession_code,
                    experiment_accession_code=experiment.accession_code,
                    survey_job=self.survey_job.id,
                    sub_file=sub_file)
                skip_sample = True
                continue

        if skip_sample:
            continue

        # The accession code is not a simple matter to determine.
        sample_source_name = sample_data["source"].get("name", "")
        sample_assay_name = sample_data["assay"].get("name", "")
        sample_accession_code = self.determine_sample_accession(
            experiment.accession_code, sample_source_name,
            sample_assay_name, filename)

        # Figure out the Organism for this sample
        organism_name = UNKNOWN
        for characteristic in sample_data["characteristic"]:
            if characteristic["category"].upper() == "ORGANISM":
                organism_name = characteristic["value"].upper()

        if organism_name == UNKNOWN:
            logger.error("Sample %s did not specify the organism name.",
                         sample_accession_code,
                         experiment_accession_code=experiment.accession_code,
                         survey_job=self.survey_job.id)
            organism = None
            continue
        else:
            organism = Organism.get_object_for_name(organism_name)

        # Create the sample object
        try:
            # Associate it with the experiment, but since it
            # already exists it already has original files
            # associated with it and it's already been downloaded,
            # so don't add it to created_samples.
            sample_object = Sample.objects.get(
                accession_code=sample_accession_code)

            # If input experiment includes new protocol information,
            # update sample's protocol_info.
            existing_protocols = sample_object.protocol_info
            protocol_info, is_updated = self.update_sample_protocol_info(
                existing_protocols, experiment.protocol_description,
                experiment.source_url + '/protocols')
            if is_updated:
                sample_object.protocol_info = protocol_info
                sample_object.save()

            logger.debug("Sample %s already exists, skipping object creation.",
                         sample_accession_code,
                         experiment_accession_code=experiment.accession_code,
                         survey_job=self.survey_job.id)
        except Sample.DoesNotExist:
            sample_object = Sample()

            # The basics
            sample_object.source_database = "ARRAY_EXPRESS"
            sample_object.title = title
            sample_object.accession_code = sample_accession_code
            sample_object.source_archive_url = samples_endpoint
            sample_object.organism = organism
            sample_object.platform_name = \
                platform_dict["platform_accession_name"]
            sample_object.platform_accession_code = \
                platform_dict["platform_accession_code"]
            sample_object.manufacturer = platform_dict["manufacturer"]
            sample_object.technology = "MICROARRAY"

            protocol_info, is_updated = self.update_sample_protocol_info(
                existing_protocols=[],
                experiment_protocol=experiment.protocol_description,
                protocol_url=experiment.source_url + '/protocols')
            # Do not check is_updated the first time because we must
            # save a list so we can append to it later.
            sample_object.protocol_info = protocol_info
            sample_object.save()

            # Directly assign the harmonized properties
            harmonized_sample = harmonized_samples[title]
            for key, value in harmonized_sample.items():
                setattr(sample_object, key, value)
            sample_object.save()

            sample_annotation = SampleAnnotation()
            sample_annotation.data = sample_data
            sample_annotation.sample = sample_object
            sample_annotation.is_ccdl = False
            sample_annotation.save()

            original_file = OriginalFile()
            original_file.filename = filename
            original_file.source_filename = filename
            original_file.source_url = download_url
            original_file.is_downloaded = False
            original_file.is_archive = True
            original_file.has_raw = has_raw
            original_file.save()

            original_file_sample_association = OriginalFileSampleAssociation()
            original_file_sample_association.original_file = original_file
            original_file_sample_association.sample = sample_object
            original_file_sample_association.save()

            created_samples.append(sample_object)

            logger.debug("Created " + str(sample_object),
                         experiment_accession_code=experiment.accession_code,
                         survey_job=self.survey_job.id,
                         sample=sample_object.id)

        # Create associations if they don't already exist
        ExperimentSampleAssociation.objects.get_or_create(
            experiment=experiment, sample=sample_object)

        ExperimentOrganismAssociation.objects.get_or_create(
            experiment=experiment, organism=organism)

    return created_samples