def data_providers_process(self, data):
    """Get data providers.

    Creates 4 attributes.
    data_provider: provider name/symbol
    data_providers: list of providers
    data_provider_pages: pages
    data_provider_cross_ref_set: list of xref dicts
    """
    provider_block = data['metaData']['dataProvider']
    provider_xref = provider_block.get('crossReference')

    self.data_provider = provider_xref.get('id')
    self.data_provider_pages = provider_xref.get('pages')
    self.data_providers = []
    self.data_provider_cross_ref_set = []

    # Nothing further to build when the submission lists no pages.
    if self.data_provider_pages is None:
        return

    provider = self.data_provider
    for page in self.data_provider_pages:
        page_url = self.etlh.rdh2.return_url_from_key_value(
            provider, provider, alt_page=page)
        self.data_provider_cross_ref_set.append(
            ETLHelper.get_xref_dict(
                provider,
                provider,
                page,
                page,
                provider,
                page_url,
                provider + page))
        self.data_providers.append(provider)
        self.logger.info("data provider: %s", provider)
def crossref_process(self, record, global_id, cross_reference_list):
    """Get xref."""
    allowed_pages = {
        'allele', 'allele/references', 'transgene', 'construct',
        'transgene/references', 'construct/references'
    }

    if 'crossReferences' not in record:
        return

    for xref_entry in record['crossReferences']:
        xref_id = xref_entry.get('id')
        local_id = xref_id.split(":")[1]
        prefix = xref_entry.get('id').split(":")[0]
        pages = xref_entry.get('pages')

        # some pages collections have 0 elements
        if not pages:
            continue

        for page in pages:
            if page not in allowed_pages:
                continue
            page_url = self.etlh.rdh2.return_url_from_key_value(
                prefix, local_id, page)
            xref = ETLHelper.get_xref_dict(
                local_id, prefix, page, page, xref_id, page_url,
                xref_id + page)
            xref['dataId'] = global_id
            cross_reference_list.append(xref)
def get_generators(self, expression_atlas_gene_pages, data_provider, batch_size):
    """Get Generators.

    Queries Neo4j for genes that have Expression Atlas links and yields
    lists of xref dicts in batches of *batch_size*.

    expression_atlas_gene_pages: dict keyed by lower-cased gene primary key
        mapping to the Expression Atlas URL for that gene.
    data_provider: provider symbol used to build the xref unique key.
    batch_size: maximum number of xrefs per yielded batch.
    """
    return_set = Neo4jHelper.run_single_parameter_query(
        ExpressionAtlasETL.get_genes_with_expression_atlas_links_query,
        list(expression_atlas_gene_pages.keys()))

    counter = 0
    cross_reference_list = []
    for record in return_set:
        counter += 1
        cross_reference = ETLHelper.get_xref_dict(
            record["g.primaryKey"].split(":")[1],
            "ExpressionAtlas_gene",
            "gene/expression-atlas",
            "gene/expressionAtlas",
            record["g.modLocalId"],
            expression_atlas_gene_pages[record["g.primaryKey"].lower()],
            data_provider + ":" + record["g.modLocalId"] + "gene/expression-atlas")
        cross_reference["genePrimaryKey"] = record["g.primaryKey"]
        cross_reference_list.append(cross_reference)

        # Fix: was 'counter > batch_size', which emitted batches of
        # batch_size + 1; '>=' matches the 'counter == batch_size'
        # batching used by the other generators in this codebase.
        if counter >= batch_size:
            yield [cross_reference_list]
            counter = 0
            cross_reference_list = []

    # Flush the final partial batch, if any.
    if counter > 0:
        yield [cross_reference_list]
def ortho_xrefs(self, o_xrefs, ident, xrefs):
    """Generate xrefs for orthos.

    o_xrefs: usually a list of {'val': 'PREFIX:localid'} dicts; a bare
        'PREFIX:localid' string is tolerated; None is a no-op.
    ident: ontology term id stored on each generated xref as 'oid'.
    xrefs: output list the generated xref dicts are appended to.
    """
    if o_xrefs is None:
        return

    page = "ontology_provided_cross_reference"

    # Fix: was 'type(o_xrefs) != list' followed by an unconditional loop,
    # so a string input was logged and then crashed when the loop indexed
    # each character with ["val"]. Branch explicitly instead.
    if isinstance(o_xrefs, list):
        for xref_id_dict in o_xrefs:
            xref_id = xref_id_dict["val"]
            if ":" not in xref_id:
                continue
            local_id = xref_id.split(":")[1].strip()
            prefix = xref_id.split(":")[0].strip()
            complete_url = self.etlh.get_complete_url_ont(local_id, xref_id)
            generated_xref = ETLHelper.get_xref_dict(
                local_id, prefix, page, page, xref_id, complete_url,
                xref_id + page)
            generated_xref["oid"] = ident
            xrefs.append(generated_xref)
        return

    # Non-list input: keep the diagnostic, then handle the single
    # identifier string directly.
    self.logger.critical(
        "BOB: o_xrefs is not a list but is a '{}'".format(type(o_xrefs)))
    if ":" in o_xrefs:
        local_id = o_xrefs.split(":")[1].strip()
        prefix = o_xrefs.split(":")[0].strip()
        complete_url = self.etlh.get_complete_url_ont(local_id, o_xrefs)
        generated_xref = ETLHelper.get_xref_dict(
            local_id, prefix, page, page, o_xrefs, complete_url, o_xrefs)
        generated_xref["oid"] = ident
        xrefs.append(generated_xref)
def get_generators(self, sub_type, batch_size, species_encoded):
    """Get Generators"""
    # Parse the efetch XML payload into a plain nested-dict structure.
    xml_text = Path(sub_type.get_filepath()).read_text()
    parsed = json.loads(json.dumps(xmltodict.parse(xml_text)))

    # IdList is a value returned from the efetch XML spec; within IdList
    # there is another map with "Id" as the key and the entrez local ids
    # as a list value.
    entrez_ids = []
    for efetch_value in dict(parsed.items()).values():
        for sub_key, sub_value in efetch_value.items():
            if sub_key != 'IdList':
                continue
            for id_group in dict(sub_value.items()).values():
                for entrez_id in id_group:
                    self.logger.debug("here is the entrez id: %s", entrez_id)
                    entrez_ids.append("NCBI_Gene:" + entrez_id)

    records = Neo4jHelper.run_single_parameter_query(
        self.gene_crossref_query_template, entrez_ids)

    # Static GEO Profiles link; only the local id varies per record.
    geo_url_prefix = ("https://www.ncbi.nlm.nih.gov/sites/entrez?"
                      "Db=geoprofiles"
                      "&DbFrom=gene"
                      "&Cmd=Link"
                      "&LinkName=gene_geoprofiles"
                      "&LinkReadableName=GEO%20Profiles"
                      "&IdsFromResult=")

    geo_data_list = []
    for record in records:
        gene_primary_key = record["g.primaryKey"]
        mod_local_id = record["g.modLocalId"]
        global_cross_ref_id = record["cr.globalCrossRefId"]
        local_ref = global_cross_ref_id.split(":")[1]

        geo_xref = ETLHelper.get_xref_dict(
            local_ref,
            "NCBI_Gene",
            "gene/other_expression",
            "gene/other_expression",
            "GEO",
            geo_url_prefix + local_ref,
            global_cross_ref_id + "gene/other_expression")
        geo_xref["genePrimaryKey"] = gene_primary_key
        geo_xref["modLocalId"] = mod_local_id
        geo_data_list.append(geo_xref)

    yield [geo_data_list]
def get_generators(self, sub_type, batch_size, species_encoded):
    """Get Generators."""
    # Parse the efetch XML payload into a plain nested-dict structure.
    xml_text = Path(sub_type.get_filepath()).read_text()
    parsed = json.loads(json.dumps(xmltodict.parse(xml_text)))

    # IdList is a value returned from the efetch XML spec; within IdList
    # there is another map with "Id" as the key and the entrez local ids
    # as a list value.
    entrez_ids = []
    for efetch_value in dict(parsed.items()).values():
        for sub_key, sub_value in efetch_value.items():
            if sub_key != 'IdList':
                continue
            for id_group in dict(sub_value.items()).values():
                for entrez_id in id_group:
                    self.logger.debug("here is the entrez id: %s", entrez_id)
                    entrez_ids.append("NCBI_Gene:" + entrez_id)

    records = Neo4jHelper.run_single_parameter_query(
        self.gene_crossref_query_template, entrez_ids)

    geo_data_list = []
    for record in records:
        global_cross_ref_id = record["cr.globalCrossRefId"]
        local_ref = global_cross_ref_id.split(":")[1]

        # Resolve the GEO profile URL through the resource descriptor.
        geo_url = self.etlh.rdh2.return_url_from_key_value(
            'GEO', local_ref, 'entrezgene')

        geo_xref = ETLHelper.get_xref_dict(
            local_ref,
            "NCBI_Gene",
            "gene/other_expression",
            "gene/other_expression",
            "GEO",
            geo_url,
            global_cross_ref_id + "gene/other_expression")
        geo_xref["genePrimaryKey"] = record["g.primaryKey"]
        geo_xref["modLocalId"] = record["g.modLocalId"]
        geo_data_list.append(geo_xref)

    yield [geo_data_list]
def process_pages(self, dp, xrefs, pages):
    """Process pages to get xrefs."""
    annotation_type = dp.get('type')
    if annotation_type is None:
        annotation_type = 'curated'

    cross_ref_id = dp.get('crossReference').get('id')
    if ":" in cross_ref_id:
        local_crossref_id = cross_ref_id.split(":")[1]
        prefix = cross_ref_id.split(":")[0]
    else:
        local_crossref_id = ""
        prefix = cross_ref_id

    for page in pages:
        # RGD/HUMAN providers present DOID/OMIM xrefs under fixed names.
        rgd_or_human = self.data_provider in ('RGD', 'HUMAN')
        if rgd_or_human and prefix == 'DOID':
            display_name = 'RGD'
        elif rgd_or_human and prefix == 'OMIM':
            display_name = 'OMIM'
        else:
            display_name = cross_ref_id.split(":")[0]
            if display_name == 'DOID':
                display_name = self.data_provider

        page_url = self.etlh.rdh2.return_url_from_key_value(
            prefix, local_crossref_id, page)
        passing_xref = ETLHelper.get_xref_dict(
            local_crossref_id, prefix, page, page, display_name,
            page_url, cross_ref_id + page + annotation_type)
        passing_xref['dataId'] = self.disease_unique_key

        # 'loaded' annotations are flagged as loaded rather than curated.
        if 'loaded' in annotation_type:
            passing_xref['loadedDB'] = 'true'
            passing_xref['curatedDB'] = 'false'
        else:
            passing_xref['curatedDB'] = 'true'
            passing_xref['loadedDB'] = 'false'

        xrefs.append(passing_xref)
def xref_process(self, construct_record, cross_reference_list):
    """Process the xrefs."""
    global_id = construct_record['primaryId']

    if 'crossReferences' not in construct_record:
        return

    for cross_ref in construct_record.get('crossReferences'):
        cross_ref_id = cross_ref.get('id')
        local_crossref_id = cross_ref_id.split(":")[1]
        prefix = cross_ref.get('id').split(":")[0]
        pages = cross_ref.get('pages')

        # some pages collections have 0 elements
        if not pages:
            continue

        for page in pages:
            if page != 'construct':
                continue
            page_url = self.etlh.rdh2.return_url_from_key_value(
                prefix, local_crossref_id, page)
            xref = ETLHelper.get_xref_dict(
                local_crossref_id, prefix, page, page, cross_ref_id,
                page_url, cross_ref_id + page)
            xref['dataId'] = global_id
            cross_reference_list.append(xref)
def xref_process(self, basic_genetic_entity, cross_references, urls):  # noqa
    """Process xrefs.

    Builds xref dicts for one BGI record and appends them to
    *cross_references*; also fills well-known URL slots in *urls*
    ('mod_cross_reference_complete_url', 'genetic_entity_external_url',
    'gene_literature_url') as a side effect.
    Returns early if the record carries no 'crossReferences'.
    """
    primary_id = basic_genetic_entity.get('primaryId')
    global_id = basic_genetic_entity.get('primaryId')
    local_id = global_id.split(":")[1]
    taxon_id = basic_genetic_entity.get("taxonId")
    if 'crossReferences' not in basic_genetic_entity:
        return
    for cross_ref in basic_genetic_entity.get('crossReferences'):
        # Skip malformed ids with no CURIE prefix.
        if ':' not in cross_ref.get('id'):
            continue
        cross_ref_id = cross_ref.get('id')
        local_cross_ref_id = cross_ref_id.split(":")[1]
        prefix = cross_ref.get('id').split(":")[0]
        pages = cross_ref.get('pages')
        global_xref_id = cross_ref.get('id')
        display_name = global_xref_id
        # some pages collection have 0 elements
        if pages is not None and len(pages) > 0:
            for page in pages:
                # display_name is blanked per page; only 'gene/spell'
                # sets a human-readable name below.
                display_name = ""
                cross_ref_complete_url = self.etlh.rdh2.return_url_from_key_value(
                    prefix, local_cross_ref_id, page)
                # NOTE(review): this branch recomputes the same URL as
                # the line above — looks redundant; confirm intent.
                if page == 'gene/expression_images':
                    cross_ref_complete_url = self.etlh.rdh2.return_url_from_key_value(
                        prefix, local_cross_ref_id, page)
                elif page == 'gene':
                    urls['mod_cross_reference_complete_url'] = self.etlh.rdh2.return_url_from_key_value(
                        prefix, local_cross_ref_id, page)
                    urls['genetic_entity_external_url'] = self.etlh.rdh2.return_url_from_key_value(
                        prefix, local_cross_ref_id, page)
                if page == 'gene/references':
                    urls['gene_literature_url'] = self.etlh.rdh2.return_url_from_key_value(
                        prefix, local_cross_ref_id, page)
                if page == 'gene/spell':
                    display_name = 'Serial Patterns of Expression Levels Locator (SPELL)'
                # TODO: fix generic_cross_reference in SGD, RGD
                if page == 'generic_cross_reference':
                    cross_ref_complete_url = self.etlh.get_no_page_complete_url(
                        local_cross_ref_id, prefix, primary_id)
                # TODO: fix gene/disease xrefs for SGD once
                # resourceDescriptor change in develop
                # makes its way to the release branch.
                if page == 'gene/disease' and taxon_id == 'NCBITaxon:559292':
                    cross_ref_complete_url = self.etlh.rdh2.return_url_from_key_value(
                        'SGD', local_id, page)
                xref_map = ETLHelper.get_xref_dict(
                    local_cross_ref_id, prefix, page, page, display_name,
                    cross_ref_complete_url, global_xref_id + page)
                xref_map['dataId'] = primary_id
                cross_references.append(xref_map)
        else:
            # No pages listed: emit a single xref with a provider-specific
            # synthetic page name.
            if prefix == 'PANTHER':
                cross_ref_primary_id = cross_ref.get(
                    'id') + '_' + primary_id
                cross_ref_complete_url = self.etlh.get_no_page_complete_url(
                    local_cross_ref_id, prefix, primary_id)
                page = "gene/panther"
            elif prefix == 'RGD':
                cross_ref_primary_id = cross_ref.get('id')
                cross_ref_complete_url = self.etlh.rdh2.return_url_from_key_value(
                    'RGD', local_cross_ref_id)
                page = "generic_cross_reference"
            else:
                cross_ref_primary_id = cross_ref.get('id')
                cross_ref_complete_url = self.etlh.get_no_page_complete_url(
                    local_cross_ref_id, prefix, primary_id)
                page = "generic_cross_reference"
            # display_name here is still the full global id set above.
            xref_map = ETLHelper.get_xref_dict(
                local_cross_ref_id, prefix, page, page, display_name,
                cross_ref_complete_url, cross_ref_primary_id + page)
            xref_map['dataId'] = primary_id
            cross_references.append(xref_map)
def get_generators(self, phenotype_data, batch_size):
    """Get Generators.

    Walks the phenotype submission and yields batches of
    [phenotype, phenotype, phenotype, pge, pge] lists sized by
    *batch_size* (the phenotype list is repeated for the three
    phenotype queries run downstream).
    """
    list_to_yield = []
    pge_list_to_yield = []
    date_produced = phenotype_data['metaData']['dateProduced']
    data_providers = []
    data_provider_object = phenotype_data['metaData']['dataProvider']
    counter = 0
    data_provider_cross_ref = data_provider_object.get('crossReference')
    data_provider = data_provider_cross_ref.get('id')
    data_provider_pages = data_provider_cross_ref.get('pages')
    data_provider_cross_ref_set = []
    pge_key = ''
    load_key = date_produced + data_provider + "_phenotype"

    # Build one provider xref per listed page (legacy xref_url_map API).
    if data_provider_pages is not None:
        for data_provider_page in data_provider_pages:
            cross_ref_complete_url = ETLHelper.get_page_complete_url(
                data_provider, ETL.xref_url_map, data_provider,
                data_provider_page)
            data_provider_cross_ref_set.append(ETLHelper.get_xref_dict(
                data_provider, data_provider, data_provider_page,
                data_provider_page, data_provider,
                cross_ref_complete_url,
                data_provider + data_provider_page))
            data_providers.append(data_provider)
            self.logger.debug("data provider: %s", data_provider)

    for pheno in phenotype_data['data']:
        pecj_primary_key = str(uuid.uuid4())
        counter = counter + 1
        pub_med_id = None
        pub_mod_id = None
        pub_med_url = None
        pub_mod_url = None
        primary_id = pheno.get('objectId')
        phenotype_statement = pheno.get('phenotypeStatement')

        # In test mode, only keep records whose id is in the test set.
        if self.test_object.using_test_data() is True:
            is_it_test_entry = self.test_object.check_for_test_id_entry(
                primary_id)
            if is_it_test_entry is False:
                counter = counter - 1
                continue

        evidence = pheno.get('evidence')
        if 'publicationId' in evidence:
            if evidence.get('publicationId').startswith('PMID:'):
                pub_med_id = evidence['publicationId']
                local_pub_med_id = pub_med_id.split(":")[1]
                pub_med_prefix = pub_med_id.split(":")[0]
                pub_med_url = ETLHelper.get_no_page_complete_url(
                    local_pub_med_id, self.xref_url_map, pub_med_prefix,
                    primary_id)
                # NOTE(review): pub_med_id was just assigned above, so
                # this None check can never fire — confirm intent.
                if pub_med_id is None:
                    pub_med_id = ""
                if 'crossReference' in evidence:
                    pub_xref = evidence.get('crossReference')
                    pub_mod_id = pub_xref.get('id')
                    pub_mod_local_id = pub_mod_id.split(":")[1]
                    if pub_mod_id is not None:
                        pub_mod_url = ETLHelper.get_complete_pub_url(
                            pub_mod_local_id, pub_mod_id)
            else:
                # Non-PMID publication id: treat it as a MOD pub id.
                pub_mod_id = evidence.get('publicationId')
                if pub_mod_id is not None:
                    pub_mod_local_id = pub_mod_id.split(":")[1]
                    pub_mod_url = ETLHelper.get_complete_pub_url(
                        pub_mod_local_id, pub_mod_id)
            if pub_mod_id is None:
                pub_mod_id = ""

        # Normalise missing ids to empty strings for key building.
        if pub_med_id is None:
            pub_med_id = ""
        if pub_mod_id is None:
            pub_mod_id = ""

        date_assigned = pheno.get('dateAssigned')

        # NOTE(review): both ids were normalised to "" above, so this
        # warning can never trigger — likely intended to compare to "".
        if pub_mod_id is None and pub_med_id is None:
            self.logger.info("%s is missing pubMed and pubMod id",
                             primary_id)

        if 'primaryGeneticEntityIDs' in pheno:
            pge_ids = pheno.get('primaryGeneticEntityIDs')
            for pge in pge_ids:
                pge_key = pge_key + pge
                pge_map = {
                    "pecjPrimaryKey": pecj_primary_key,
                    "pgeId": pge
                }
                pge_list_to_yield.append(pge_map)

        phenotype = {
            "primaryId": primary_id,
            "phenotypeUniqueKey": primary_id + phenotype_statement.strip(),
            "phenotypeStatement": phenotype_statement.strip(),
            "dateAssigned": date_assigned,
            "loadKey": load_key,
            "type": "gene",
            "dataProviders": data_providers,
            "dataProvider": data_provider,
            "dateProduced": date_produced,
            "pubMedId": pub_med_id,
            "pubMedUrl": pub_med_url,
            "pubModId": pub_mod_id,
            "pubModUrl": pub_mod_url,
            "pubPrimaryKey": pub_med_id + pub_mod_id,
            "pecjPrimaryKey": pecj_primary_key
        }
        list_to_yield.append(phenotype)

        if counter == batch_size:
            yield [
                list_to_yield, list_to_yield, list_to_yield,
                pge_list_to_yield, pge_list_to_yield
            ]
            list_to_yield = []
            pge_list_to_yield = []
            counter = 0

    # Flush any final partial batch.
    if counter > 0:
        yield [
            list_to_yield, list_to_yield, list_to_yield,
            pge_list_to_yield, pge_list_to_yield
        ]
def get_generators(self, sqtr_data, data_provider, batch_size):
    """Get Generators.

    Walks the sequence-targeting-reagent submission and yields batches
    of [sqtrs, secondary_ids, synonyms, target_genes] lists sized by
    *batch_size*. Note the *data_provider* parameter is immediately
    overwritten from the file's metaData below.
    """
    data_providers = []
    sqtrs = []
    sqtr_synonyms = []
    sqtr_secondary_ids = []
    mod_global_cross_ref_url = ""
    tgs = []
    counter = 0
    date_produced = sqtr_data['metaData']['dateProduced']
    data_provider_object = sqtr_data['metaData']['dataProvider']
    data_provider_cross_ref = data_provider_object.get('crossReference')
    data_provider = data_provider_cross_ref.get('id')
    data_provider_pages = data_provider_cross_ref.get('pages')
    data_provider_cross_ref_set = []
    load_key = date_produced + data_provider + "_SqTR"

    # Build one provider xref per listed page (legacy xref_url_map API).
    if data_provider_pages is not None:
        for data_provider_page in data_provider_pages:
            cross_ref_complete_url = ETLHelper.get_page_complete_url(
                data_provider, self.xref_url_map, data_provider,
                data_provider_page)
            data_provider_cross_ref_set.append(ETLHelper.get_xref_dict(
                data_provider, data_provider, data_provider_page,
                data_provider_page, data_provider,
                cross_ref_complete_url,
                data_provider + data_provider_page))
            data_providers.append(data_provider)
            self.logger.info("data provider: %s", data_provider)

    for sqtr_record in sqtr_data['data']:
        counter = counter + 1
        global_id = sqtr_record['primaryId']
        local_id = global_id.split(":")[1]

        # In test mode, only keep records whose id is in the test set.
        if self.test_object.using_test_data() is True:
            is_it_test_entry = self.test_object.check_for_test_id_entry(
                global_id)
            if is_it_test_entry is False:
                counter = counter - 1
                continue

        if sqtr_record.get('secondaryIds') is not None:
            for sid in sqtr_record.get('secondaryIds'):
                sqtr_secondary_id_dataset = {
                    "primaryId": sqtr_record.get('primaryId'),
                    "secondaryId": sid
                }
                sqtr_secondary_ids.append(sqtr_secondary_id_dataset)

        if sqtr_record.get('synonyms') is not None:
            for syn in sqtr_record.get('synonyms'):
                syn_dataset = {
                    "primaryId": sqtr_record.get('primaryId'),
                    "synonym": syn
                }
                sqtr_synonyms.append(syn_dataset)

        if sqtr_record.get('targetGeneIds') is not None:
            for target_gene_id in sqtr_record.get('targetGeneIds'):
                tg_dataset = {
                    "primaryId": sqtr_record.get('primaryId'),
                    "geneId": target_gene_id
                }
                tgs.append(tg_dataset)

        # NOTE(review): the guard tests 'crossReferences' but iterates
        # sqtr_record['modCrossReference'] — key mismatch; this raises
        # KeyError unless both keys are present. Confirm the schema.
        if 'crossReferences' in sqtr_record:
            for cross_ref in sqtr_record['modCrossReference']:
                cross_ref_id = cross_ref.get('id')
                local_crossref_id = cross_ref_id.split(":")[1]
                prefix = cross_ref.get('id').split(":")[0]
                pages = cross_ref.get('pages')
                # some pages collection have 0 elements
                if pages is None or len(pages) == 0:
                    continue
                if 'sequence_targeting_reagent' in pages:
                    page = 'sequence_targeting_reagent'
                    mod_global_cross_ref_url = ETLHelper.get_page_complete_url(
                        local_crossref_id, self.xref_url_map, prefix, page)

        sqtr_dataset = {
            "primaryId": sqtr_record.get('primaryId'),
            "name": sqtr_record.get('name'),
            "globalId": global_id,
            "localId": local_id,
            "soTerm": sqtr_record.get('soTermId'),
            "taxonId": sqtr_record.get('taxonId'),
            "dataProviders": data_providers,
            "dateProduced": date_produced,
            "loadKey": load_key,
            "modGlobalCrossRefUrl": mod_global_cross_ref_url,
            "dataProvider": data_provider
        }
        sqtrs.append(sqtr_dataset)

        if counter == batch_size:
            yield [sqtrs, sqtr_secondary_ids, sqtr_synonyms, tgs]
            sqtrs = []
            sqtr_secondary_ids = []
            sqtr_synonyms = []
            tgs = []
            counter = 0

    # Flush any final partial batch.
    if counter > 0:
        yield [sqtrs, sqtr_secondary_ids, sqtr_synonyms, tgs]
def get_generators(self, construct_data, data_provider, batch_size):
    """Create Generators.

    Walks the construct submission and yields batches of
    [constructs, secondary_ids, synonyms, xrefs, non_bgi_components,
    component_details, component_no_gene_details] lists sized by
    *batch_size*. Note the *data_provider* parameter is immediately
    overwritten from the file's metaData below.
    """
    data_providers = []
    release = ""
    constructs = []
    construct_synonyms = []
    construct_secondary_ids = []
    cross_reference_list = []
    component_details = []
    component_no_gene_details = []
    non_bgi_components = []
    counter = 0
    date_produced = construct_data['metaData']['dateProduced']
    data_provider_object = construct_data['metaData']['dataProvider']
    data_provider_cross_ref = data_provider_object.get('crossReference')
    data_provider = data_provider_cross_ref.get('id')
    self.logger.info("DataProvider: " + data_provider)
    data_provider_pages = data_provider_cross_ref.get('pages')
    data_provider_cross_ref_set = []
    load_key = date_produced + data_provider + "_construct"

    # TODO: get SGD to fix their files.
    # Build one provider xref per listed page (legacy xref_url_map API).
    if data_provider_pages is not None:
        for data_provider_page in data_provider_pages:
            cross_ref_complete_url = ETLHelper.get_page_complete_url(
                data_provider, self.xref_url_map, data_provider,
                data_provider_page)
            data_provider_cross_ref_set.append(
                ETLHelper.get_xref_dict(
                    data_provider, data_provider, data_provider_page,
                    data_provider_page, data_provider,
                    cross_ref_complete_url,
                    data_provider + data_provider_page))
            data_providers.append(data_provider)
            self.logger.info("data provider: %s", data_provider)

    if 'release' in construct_data['metaData']:
        release = construct_data['metaData']['release']

    for construct_record in construct_data['data']:
        counter = counter + 1
        global_id = construct_record['primaryId']
        local_id = global_id.split(":")[1]
        mod_global_cross_ref_id = ""

        # In test mode, only keep records whose id is in the test set.
        if self.test_object.using_test_data() is True:
            is_it_test_entry = self.test_object.check_for_test_id_entry(
                global_id)
            if is_it_test_entry is False:
                counter = counter - 1
                continue

        name_text = TextProcessingHelper.cleanhtml(
            construct_record.get('name'))

        # NOTE(review): this dict is built before the crossReferences
        # loop below assigns mod_global_cross_ref_id, so the stored
        # "modGlobalCrossRefId" is always "" — confirm intent.
        construct_dataset = {
            "symbol": construct_record.get('name'),
            "primaryId": construct_record.get('primaryId'),
            "globalId": global_id,
            "localId": local_id,
            "dataProviders": data_providers,
            "dateProduced": date_produced,
            "loadKey": load_key,
            "release": release,
            "modGlobalCrossRefId": mod_global_cross_ref_id,
            "uuid": str(uuid.uuid4()),
            "dataProvider": data_provider,
            "nameText": name_text,
            "name": construct_record.get('name')
        }
        constructs.append(construct_dataset)

        if 'crossReferences' in construct_record:
            for cross_ref in construct_record.get('crossReferences'):
                cross_ref_id = cross_ref.get('id')
                local_crossref_id = cross_ref_id.split(":")[1]
                prefix = cross_ref.get('id').split(":")[0]
                pages = cross_ref.get('pages')
                # some pages collection have 0 elements
                if pages is not None and len(pages) > 0:
                    for page in pages:
                        if page == 'construct':
                            mod_global_cross_ref_id = ETLHelper.get_page_complete_url(
                                local_crossref_id, self.xref_url_map,
                                prefix, page)
                            xref = ETLHelper.get_xref_dict(
                                local_crossref_id, prefix, page, page,
                                cross_ref_id, mod_global_cross_ref_id,
                                cross_ref_id + page)
                            xref['dataId'] = global_id
                            cross_reference_list.append(xref)

        if 'constructComponents' in construct_record:
            for component in construct_record.get('constructComponents'):
                # NOTE(review): upper() applied here and again below.
                component_relation = component.get(
                    'componentRelation').upper()
                component_symbol = component.get('componentSymbol')
                component_id = component.get('componentID')
                if component_id is not None:
                    component_detail = {
                        "componentRelation": component_relation.upper(),
                        "componentSymbol": component_symbol,
                        "componentID": component_id,
                        "constructID": construct_record.get('primaryId')
                    }
                    component_details.append(component_detail)
                else:
                    # Component with no gene id: track symbol separately.
                    component_detail = {
                        "componentRelation": component_relation.upper(),
                        "componentSymbol": component_symbol,
                        "constructID": construct_record.get('primaryId')
                    }
                    non_bgi_component = {
                        "componentSymbol": component_symbol
                    }
                    non_bgi_components.append(non_bgi_component)
                    component_no_gene_details.append(component_detail)

        if 'synonyms' in construct_record:
            for syn in construct_record.get('synonyms'):
                construct_synonym = {
                    "data_id": construct_record.get('primaryId'),
                    "synonym": syn.strip()
                }
                construct_synonyms.append(construct_synonym)

        if 'secondaryIds' in construct_record:
            for secondary_id in construct_record.get('secondaryIds'):
                construct_secondary_id = {
                    "data_id": construct_record.get('primaryId'),
                    "secondary_id": secondary_id
                }
                construct_secondary_ids.append(construct_secondary_id)

        if counter == batch_size:
            yield [
                constructs, construct_secondary_ids, construct_synonyms,
                cross_reference_list, non_bgi_components,
                component_details, component_no_gene_details
            ]
            constructs = []
            construct_secondary_ids = []
            construct_synonyms = []
            cross_reference_list = []
            non_bgi_components = []
            component_details = []
            component_no_gene_details = []
            counter = 0

    # Flush any final partial batch.
    if counter > 0:
        yield [
            constructs, construct_secondary_ids, construct_synonyms,
            cross_reference_list, non_bgi_components, component_details,
            component_no_gene_details
        ]
def get_generators(self, variant_data, batch_size):  # noqa
    """Get Generators.

    Walks the variant submission, fetches reference/padding sequence
    from the assembly, and yields batches of
    [variants, genomic_locations, so_terms, cross_references] lists
    sized by *batch_size*.

    Fix: the batch-yield branch never reset ``counter``, so after the
    first batch ``counter == batch_size`` could never hold again and all
    remaining records accumulated into one oversized final yield.
    """
    data_providers = []
    release = ""
    variants = []
    variant_genomic_locations = []
    variant_so_terms = []
    cross_references = []
    counter = 0
    date_produced = variant_data['metaData']['dateProduced']
    self.data_providers_process(variant_data)
    load_key = date_produced + self.data_provider + "_VARIATION"

    if 'release' in variant_data['metaData']:
        release = variant_data['metaData']['release']

    # Lazily construct one AssemblySequenceHelper per assembly seen.
    assemblies = {}
    for allele_record in variant_data['data']:
        chromosome = allele_record["chromosome"]
        if chromosome.startswith("chr"):
            chromosome_str = chromosome[3:]
        else:
            chromosome_str = chromosome

        assembly = allele_record["assembly"]
        if assembly not in assemblies:
            self.logger.info(assembly)
            context_info = ContextInfo()
            data_manager = DataFileManager(
                context_info.config_file_location)
            assemblies[assembly] = AssemblySequenceHelper(
                assembly, data_manager)

        so_term_id = allele_record.get('type')
        genomic_reference_sequence = allele_record.get(
            'genomicReferenceSequence')
        genomic_variant_sequence = allele_record.get(
            'genomicVariantSequence')

        if genomic_reference_sequence == 'N/A':
            genomic_reference_sequence = ""
        if genomic_variant_sequence == 'N/A':
            genomic_variant_sequence = ""

        padding_left = ""
        padding_right = ""
        if allele_record.get('start') != "" and allele_record.get(
                'end') != "":
            # not insertion
            if so_term_id != "SO:0000667" and chromosome_str != "Unmapped_Scaffold_8_D1580_D1567":
                genomic_reference_sequence = assemblies[
                    assembly].get_sequence(chromosome_str,
                                           allele_record.get('start'),
                                           allele_record.get('end'))

            if allele_record.get('start') < allele_record.get('end'):
                start = allele_record.get('start')
                end = allele_record.get('end')
            else:
                start = allele_record.get('end')
                end = allele_record.get('start')

            padding_width = 500
            if so_term_id != "SO:0000667":  # not insertion
                start = start - 1
                end = end + 1

            left_padding_start = start - padding_width
            if left_padding_start < 1:
                left_padding_start = 1
            padding_left = assemblies[assembly].get_sequence(
                chromosome_str, left_padding_start, start)

            right_padding_end = end + padding_width
            padding_right = assemblies[assembly].get_sequence(
                chromosome_str, end, right_padding_end)

        counter = counter + 1
        global_id = allele_record.get('alleleId')
        mod_global_cross_ref_id = ""
        # NOTE(review): resetting per record means only the last
        # record's xrefs survive into each batch yield — confirm intent.
        cross_references = []

        # In test mode, only keep records whose id is in the test set.
        if self.test_object.using_test_data() is True:
            is_it_test_entry = self.test_object.check_for_test_id_entry(
                global_id)
            if is_it_test_entry is False:
                counter = counter - 1
                continue

        cross_ref_primary_id = allele_record.get(
            'sequenceOfReferenceAccessionNumber')
        local_cross_ref_id = cross_ref_primary_id.split(":")[1]
        prefix = cross_ref_primary_id.split(":")[0]
        cross_ref_complete_url = self.etlh.get_no_page_complete_url(
            local_cross_ref_id, prefix, global_id)
        xref_map = ETLHelper.get_xref_dict(
            local_cross_ref_id, prefix,
            "variant_sequence_of_reference",
            "sequence_of_reference_accession_number",
            global_id,
            cross_ref_complete_url,
            cross_ref_primary_id + "variant_sequence_of_reference")
        xref_map['dataId'] = global_id
        if cross_ref_primary_id is not None:
            cross_references.append(xref_map)

        if genomic_reference_sequence is not None:
            if len(genomic_reference_sequence) > 1000 and (
                    allele_record.get('type') == 'SO:1000002'
                    or allele_record.get('type') == 'SO:1000008'):
                self.logger.debug("%s genomicReferenceSequence",
                                  allele_record.get('alleleId'))
        if genomic_variant_sequence is not None:
            if len(genomic_variant_sequence) > 1000 and (
                    allele_record.get('type') in
                    ['SO:1000002', 'SO:1000008']):
                self.logger.debug("%s genomicVariantSequence",
                                  allele_record.get('alleleId'))

        hgvs_nomenclature, hgvs_synonym = self.get_hgvs_nomenclature(
            allele_record.get('sequenceOfReferenceAccessionNumber'),
            allele_record.get('type'),
            allele_record.get('start'),
            allele_record.get('end'),
            genomic_reference_sequence,
            genomic_variant_sequence,
            allele_record.get('assembly'),
            chromosome_str)

        # Fix: was '(x is not None and len(x)) > 30000' — the comparison
        # applied to the whole 'and' expression; same runtime result, but
        # the parentheses now say what was meant.
        if (genomic_reference_sequence is not None
                and len(genomic_reference_sequence) > 30000) \
                or (genomic_variant_sequence is not None
                    and len(genomic_variant_sequence) > 30000):
            self.logger.debug(
                "%s has too long of a sequence potentionally",
                allele_record.get('alleleId'))

        # TODO: fix typo in MGI Submission for this variant so
        # that it doesn't list a 40K bp point mutation.
        if allele_record.get('alleleId') != 'MGI:6113870':
            variant_dataset = {
                "hgvs_nomenclature": hgvs_nomenclature,
                "genomicReferenceSequence": genomic_reference_sequence,
                "genomicVariantSequence": genomic_variant_sequence,
                "paddingLeft": padding_left,
                "paddingRight": padding_right,
                "alleleId": allele_record.get('alleleId'),
                "dataProviders": data_providers,
                "dateProduced": date_produced,
                "loadKey": load_key,
                "release": release,
                "modGlobalCrossRefId": mod_global_cross_ref_id,
                "dataProvider": self.data_provider,
                "variantHGVSSynonym": hgvs_synonym
            }
            variant_genomic_location_dataset = {
                "variantId": hgvs_nomenclature,
                "assembly": allele_record.get('assembly'),
                "chromosome": chromosome_str,
                "start": allele_record.get('start'),
                "end": allele_record.get('end'),
                "uuid": str(uuid.uuid4()),
                "dataProvider": self.data_provider
            }
            variant_so_term = {
                "variantId": hgvs_nomenclature,
                "soTermId": allele_record.get('type')
            }
            variant_so_terms.append(variant_so_term)
            variant_genomic_locations.append(
                variant_genomic_location_dataset)
            variants.append(variant_dataset)

        if counter == batch_size:
            yield [
                variants, variant_genomic_locations, variant_so_terms,
                cross_references
            ]
            variants = []
            variant_genomic_locations = []
            variant_so_terms = []
            cross_references = []
            counter = 0  # Fix: was missing, breaking all later batches.

    # Flush any final partial batch.
    if counter > 0:
        yield [
            variants, variant_genomic_locations, variant_so_terms,
            cross_references
        ]
def get_generators(self, filepath, batch_size):
    """Get Generators.

    Parses the DO obo file via ontobio and yields batches of
    [terms, isas, synonyms, xrefs, alt_ids] lists sized by *batch_size*.
    """
    ont = OntologyFactory().create(filepath)
    # NOTE(review): '.node' (singular) is the pre-2.4 networkx attribute;
    # newer networkx exposes '.nodes' — confirm the pinned version.
    parsed_line = ont.graph.copy().node
    do_term_list = []
    do_isas_list = []
    do_synonyms_list = []
    do_alt_ids_list = []
    xrefs = []
    counter = 0

    # Convert parsed obo term into a schema-friendly AGR dictionary.
    for key, line in parsed_line.items():
        counter = counter + 1
        node = ont.graph.node[key]
        if len(node) == 0:
            continue

        # Switching id to curie form and saving URI in "uri"
        # - might wildly break things later on???
        node["uri"] = node["id"]
        node["id"] = key

        syns = []
        local_id = None
        def_links_unprocessed = []
        def_links_processed = []
        subset = []
        definition = ""
        is_obsolete = "false"
        ident = key
        prefix = ident.split(":")[0]

        if "meta" in node:
            if "synonyms" in node["meta"]:
                syns = [s["val"] for s in node["meta"]["synonyms"]]
                for synonym in syns:
                    do_synonym = {
                        "primary_id": key,
                        "synonym": synonym
                    }
                    do_synonyms_list.append(do_synonym)
            if "basicPropertyValues" in node["meta"]:
                # Only DOID-prefixed property values count as alt ids.
                alt_ids = [s["val"]
                           for s in node["meta"]["basicPropertyValues"]]
                for alt_id in alt_ids:
                    if "DOID:" in alt_id:
                        secondary_id = {
                            "primary_id": key,
                            "secondary_id": alt_id
                        }
                        do_alt_ids_list.append(secondary_id)
            if "xrefs" in node["meta"]:
                o_xrefs = node["meta"].get('xrefs')
                if o_xrefs is not None:
                    for xref_id_dict in o_xrefs:
                        xref_id = xref_id_dict["val"]
                        if ":" in xref_id:
                            local_id = xref_id.split(":")[1].strip()
                            prefix = xref_id.split(":")[0].strip()
                            complete_url = ETLHelper.get_complete_url_ont(
                                local_id, xref_id)
                            generated_xref = ETLHelper.get_xref_dict(
                                local_id, prefix,
                                "ontology_provided_cross_reference",
                                "ontology_provided_cross_reference",
                                xref_id, complete_url,
                                xref_id + "ontology_provided_cross_reference")
                            generated_xref["oid"] = ident
                            xrefs.append(generated_xref)
                        else:
                            # TODO Need to make sure this else is correct
                            # NOTE(review): '":" in o_xrefs' on a list
                            # is a membership test and is effectively
                            # dead here — confirm intended branch.
                            if ":" in o_xrefs:
                                local_id = o_xrefs.split(":")[1].strip()
                                prefix = o_xrefs.split(":")[0].strip()
                                complete_url = ETLHelper.get_complete_url_ont(
                                    local_id, o_xrefs)
                                generated_xref = ETLHelper.get_xref_dict(
                                    local_id, prefix,
                                    "ontology_provided_cross_reference",
                                    "ontology_provided_cross_reference",
                                    o_xrefs, complete_url, o_xrefs)
                                generated_xref["oid"] = ident
                                xrefs.append(generated_xref)
            if node["meta"].get('is_obsolete'):
                is_obsolete = "true"
            elif node["meta"].get('deprecated'):
                is_obsolete = "true"
            if "definition" in node["meta"]:
                definition = node["meta"]["definition"]["val"]
                def_links_unprocessed = node["meta"]["definition"]["xrefs"]
            if "subsets" in node["meta"]:
                new_subset = node['meta'].get('subsets')
                if isinstance(new_subset, (list, tuple)):
                    subset = new_subset
                else:
                    if new_subset is not None:
                        subset.append(new_subset)
            # NOTE(review): '> 1' skips the '#' cleanup for a
            # single-element subset list — confirm '>= 1' wasn't meant.
            if len(subset) > 1:
                converted_subsets = []
                for subset_str in subset:
                    if "#" in subset_str:
                        subset_str = subset_str.split("#")[-1]
                    converted_subsets.append(subset_str)
                subset = converted_subsets

        all_parents = ont.parents(key)
        all_parents.append(key)
        # Improves performance when traversing relations
        all_parents_subont = ont.subontology(all_parents)
        isas_without_names = all_parents_subont.parents(
            key, relations=['subClassOf'])
        for item in isas_without_names:
            dictionary = {
                "primary_id": key,
                "primary_id2": item
            }
            do_isas_list.append(dictionary)

        def_links_processed = []
        def_links = ""
        if definition is None:
            definition = ""
        else:
            # Remove new lines that cause this to split across two lines in the file
            # definition = definition.replace('\n', ' ')
            # Remove any extra double space that might have been introduces in the last replace
            # definition = definition.replace('  ', ' ')
            # Pull the trailing "[...]" link block off a quoted definition.
            if definition is not None and "\"" in definition:
                split_definition = re.split(r'(?<!\\)"', definition)
                if len(split_definition) > 1:
                    if len(split_definition) > 2 and "[" in split_definition[2].strip():
                        def_links = split_definition[2].strip()
                        def_links = def_links.rstrip("]").replace("[", "")
                        def_links_unprocessed.append(def_links)

        for def_link in def_links_unprocessed:
            # Normalise the various url:/URL: and escaped forms.
            def_link = def_link.replace("url:www", "http://www")
            def_link = def_link.replace("url:", "")
            def_link = def_link.replace("URL:", "")
            def_link = def_link.replace("\\:", ":")
            def_link = def_link.replace('\\', '')
            if "," in def_link:
                def_link = def_link.split(",")
                for link in def_link:
                    if link.strip().startswith("http"):
                        def_links_processed.append(link)
            else:
                if def_link.strip().startswith("http"):
                    def_links_processed.append(def_link)

        # TODO: make this a generic section based on the resourceDescriptor.yaml file.
        # need to have MODs add disease pages to their yaml stanzas
        alt_ids = node.get('alt_id')
        if alt_ids:
            if not isinstance(alt_ids, (list, tuple)):
                alt_ids = [alt_ids]
        else:
            alt_ids = []

        dict_to_append = {
            'oid': node['id'],
            'name': node.get('label'),
            'name_key': node.get('label'),
            'definition': definition,
            'defLinksProcessed': def_links_processed,
            'is_obsolete': is_obsolete,
            'subset': subset,
            'oUrl': "http://www.disease-ontology.org/?id=" + node['id'],
            'rgd_link': 'http://rgd.mcw.edu'
                        + '/rgdweb/ontology/annot.html?species=All&x=1&acc_id='
                        + node['id'] + '#annot',
            'rat_only_rgd_link': 'http://rgd.mcw.edu'
                                 + '/rgdweb/ontology/annot.html?species=Rat&x=1&acc_id='
                                 + node['id'] + '#annot',
            'human_only_rgd_link': 'http://rgd.mcw.edu'
                                   + '/rgdweb/ontology/annot.html?species=Human&x=1&acc_id='
                                   + node['id'] + '#annot',
            'mgi_link': 'http://www.informatics.jax.org/disease/' + node['id'],
            'zfin_link': 'https://zfin.org/' + node['id'],
            'flybase_link': 'http://flybase.org/cgi-bin/cvreport.html?id=' + node['id'],
            'wormbase_link': 'http://www.wormbase.org/resources/disease/' + node['id'],
            'sgd_link': 'https://yeastgenome.org/disease/' + node['id']
        }
        do_term_list.append(dict_to_append)

        if counter == batch_size:
            yield [do_term_list, do_isas_list, do_synonyms_list, xrefs,
                   do_alt_ids_list]
            do_term_list = []
            do_isas_list = []
            do_synonyms_list = []
            do_alt_ids_list = []
            xrefs = []
            counter = 0

    # Flush any final partial batch.
    if counter > 0:
        yield [do_term_list, do_isas_list, do_synonyms_list, xrefs,
               do_alt_ids_list]
def get_generators(self, htp_dataset_data, batch_size):
    """Yield batches of HTP (high-throughput) dataset records parsed from a DQM submission.

    Walks ``htp_dataset_data['data']`` and accumulates five parallel lists,
    yielding them together every ``batch_size`` records (and once more at the
    end for any remainder):

    :param htp_dataset_data: parsed submission dict; reads
        ``metaData.dataProvider.crossReference.id`` and the ``data`` list.
    :param batch_size: number of dataset records per yielded batch.
    :yields: ``[htp_datasets, dataset_tags, publications,
        cross_reference_list, secondaryIds]``

    Fixes over the previous revision: removed the duplicated
    ``pub_med_url = pub_med_url = ...`` assignment and four locals that were
    computed but never used (``data_providers``, ``date_produced``,
    ``data_provider_pages``, ``data_provider_cross_ref_set``).
    """
    dataset_tags = []
    htp_datasets = []
    publications = []
    secondaryIds = []
    cross_reference_list = []
    counter = 0

    data_provider_object = htp_dataset_data['metaData']['dataProvider']
    data_provider_cross_ref = data_provider_object.get('crossReference')
    data_provider = data_provider_cross_ref.get('id')

    for dataset_record in htp_dataset_data['data']:
        counter = counter + 1
        dataset = dataset_record.get('datasetId')
        datasetId = dataset.get('primaryId')

        # spoke to RGD and they wish to remove these datasets as they overlap with SGD.
        # NOTE: counter is intentionally left incremented here (matches prior behavior).
        if (datasetId == 'GEO:GSE18157' or datasetId == 'GEO:GSE33497') \
                and data_provider == 'RGD':
            continue

        if 'secondaryIds' in dataset:
            for secId in dataset.get('secondaryIds'):
                secid = {"datasetId": datasetId, "secondaryId": secId}
                secondaryIds.append(secid)

        # In test mode, only keep records whose id is in the test whitelist.
        if self.test_object.using_test_data() is True:
            is_it_test_entry = self.test_object.check_for_test_id_entry(datasetId)
            if is_it_test_entry is False:
                counter = counter - 1
                continue

        if 'crossReference' in dataset:
            crossRefO = dataset.get('crossReference')
            if crossRefO is not None:
                crossRefId = crossRefO.get('id')
                local_crossref_id = crossRefId.split(":")[1]
                prefix = crossRefId.split(":")[0]
                pages = crossRefO.get('pages')

                # some pages collection have 0 elements
                if pages is not None and len(pages) > 0:
                    for page in pages:
                        mod_global_cross_ref_url = \
                            self.etlh.rdh2.return_url_from_key_value(
                                prefix, local_crossref_id, page)
                        xref = ETLHelper.get_xref_dict(
                            local_crossref_id,
                            prefix,
                            page,
                            page,
                            crossRefId,
                            mod_global_cross_ref_url,
                            crossRefId + page)
                        xref['dataId'] = datasetId
                        cross_reference_list.append(xref)

        category_tags = dataset_record.get('categoryTags')
        if category_tags is not None:
            for tag in category_tags:
                dataset_category_tag = {"datasetId": datasetId, "tag": tag}
                dataset_tags.append(dataset_category_tag)

        publicationNew = dataset_record.get('publications')
        if publicationNew is not None:
            for pub in publicationNew:
                pid = pub.get('publicationId')
                publication_mod_id = ""
                pub_med_id = ""
                pub_mod_url = ""
                pub_med_url = ""
                if pid is not None and pid.startswith('PMID:'):
                    pub_med_id = pid
                    local_pub_med_id = pub_med_id.split(":")[1]
                    # was a duplicated "x = x = ..." assignment; same value, assigned once
                    pub_med_url = self.etlh.get_no_page_complete_url(
                        local_pub_med_id, 'PMID', pub_med_id)
                    # A PMID publication may also carry a MOD cross reference.
                    if 'crossReference' in pub:
                        pub_xref = pub.get('crossReference')
                        publication_mod_id = pub_xref.get('id')
                        pub_mod_url = self.etlh.rdh2.return_url_from_identifier(
                            publication_mod_id)
                elif pid is not None and not pid.startswith('PMID:'):
                    publication_mod_id = pub.get('publicationId')
                    pub_mod_url = self.etlh.rdh2.return_url_from_identifier(
                        publication_mod_id)

                publication = {
                    "datasetId": datasetId,
                    "pubPrimaryKey": publication_mod_id + pub_med_id,
                    "pubModId": publication_mod_id,
                    "pubMedId": pub_med_id,
                    "pubMedUrl": pub_med_url,
                    "pubModUrl": pub_mod_url
                }
                publications.append(publication)

        htp_dataset = {
            "datasetId": datasetId,
            "dateAssigned": dataset_record.get('dateAssigned'),
            "title": dataset_record.get('title'),
            "summary": dataset_record.get('summary'),
            "numChannels": dataset_record.get('numChannels'),
            "subSeries": dataset_record.get('subSeries')
        }
        htp_datasets.append(htp_dataset)

        if counter == batch_size:
            yield [
                htp_datasets, dataset_tags, publications,
                cross_reference_list, secondaryIds
            ]
            counter = 0
            htp_datasets = []
            dataset_tags = []
            publications = []
            cross_reference_list = []
            secondaryIds = []

    if counter > 0:
        yield [
            htp_datasets, dataset_tags, publications,
            cross_reference_list, secondaryIds
        ]
def get_generators(self, disease_data, batch_size, data_provider):
    """Creating generators.

    Parses disease-annotation records from a DQM submission and yields
    batches of parallel lists for Neo4j loading.

    :param disease_data: parsed submission dict; reads ``metaData.dataProvider``
        and iterates ``data``.
    :param batch_size: records per yielded batch.
    :param data_provider: incoming value is immediately overwritten below by
        ``metaData.dataProvider.crossReference.id``.
    :yields: ``[allele_list, gene_list, agm_list, pge_list, pge_list,
        pge_list, withs, evidence_codes, xrefs]`` — note the pge list is
        deliberately repeated three times (one copy per consuming query).
    """
    counter = 0
    disease_association_type = None
    gene_list_to_yield = []
    allele_list_to_yield = []
    agm_list_to_yield = []
    evidence_code_list_to_yield = []
    withs = []
    pge_list_to_yield = []
    xrefs = []

    # The data_provider parameter is shadowed by the submission's own metadata.
    data_provider_object = disease_data['metaData']['dataProvider']
    data_provider_cross_ref = data_provider_object.get('crossReference')
    data_provider = data_provider_cross_ref.get('id')

    for disease_record in disease_data['data']:
        # Per-record publication fields; reset each iteration.
        publication_mod_id = ""
        pub_med_id = ""
        pub_mod_url = None
        pub_med_url = None
        pge_key = ''

        # In test mode, only keep whitelisted object ids.
        if self.test_object.using_test_data() is True:
            is_it_test_entry = self.test_object.check_for_test_id_entry(
                disease_record.get('objectId'))
            if is_it_test_entry is False:
                continue

        # Unique key: objectId + DOid + association type; extended below with
        # negation and 'with' ids as they are discovered.
        disease_unique_key = disease_record.get('objectId') + disease_record.get('DOid') + \
            disease_record['objectRelation'].get("associationType").upper()

        counter = counter + 1
        disease_object_type = disease_record['objectRelation'].get(
            "objectType")
        primary_id = disease_record.get('objectId')
        do_id = disease_record.get('DOid')

        if 'evidence' in disease_record:
            # NOTE(review): pecj_primary_key is only bound inside this branch
            # but is read unconditionally further down — presumably every
            # record carries 'evidence'; confirm against the submission schema.
            pecj_primary_key = str(uuid.uuid4())
            evidence = disease_record.get('evidence')
            if 'publication' in evidence:
                publication = evidence.get('publication')
                if publication.get('publicationId').startswith('PMID:'):
                    pub_med_id = publication.get('publicationId')
                    local_pub_med_id = pub_med_id.split(":")[1]
                    pub_med_url = ETLHelper.get_complete_pub_url(
                        local_pub_med_id, pub_med_id)
                    # A PMID publication may also carry a MOD cross reference.
                    if 'crossReference' in evidence:
                        pub_xref = evidence.get('crossReference')
                        publication_mod_id = pub_xref.get('id')
                        local_pub_mod_id = publication_mod_id.split(":")[1]
                        pub_mod_url = ETLHelper.get_complete_pub_url(
                            local_pub_mod_id, publication_mod_id)
                else:
                    # Non-PMID publication id is treated as a MOD id.
                    publication_mod_id = publication.get('publicationId')
                    local_pub_mod_id = publication_mod_id.split(":")[1]
                    pub_mod_url = ETLHelper.get_complete_pub_url(
                        local_pub_mod_id, publication_mod_id)

            # Each evidence code is linked to this record's PECJ key.
            if 'evidenceCodes' in disease_record['evidence']:
                for ecode in disease_record['evidence'].get('evidenceCodes'):
                    ecode_map = {
                        "pecjPrimaryKey": pecj_primary_key,
                        "ecode": ecode
                    }
                    evidence_code_list_to_yield.append(ecode_map)

        negation = ''
        if 'objectRelation' in disease_record:
            disease_association_type = disease_record[
                'objectRelation'].get("associationType").upper()
            if 'negation' in disease_record:
                # this capitalization is purposeful
                if disease_association_type == 'IS_IMPLICATED_IN':
                    disease_association_type = 'IS_NOT_IMPLICATED_IN'
                if disease_association_type == 'IS_MODEL_OF':
                    disease_association_type = 'IS_NOT_MODEL_OF'
                if disease_association_type == 'IS_MARKER_FOR':
                    disease_association_type = 'IS_NOT_MARKER_FOR'
                negation = 'NOT'
                disease_unique_key = disease_unique_key + negation

            # Collected but not emitted in any yielded list below.
            additional_genetic_components = []
            if 'additionalGeneticComponents' in disease_record['objectRelation']:
                for component in disease_record['objectRelation'][
                        'additionalGeneticComponents']:
                    component_symbol = component.get('componentSymbol')
                    component_id = component.get('componentId')
                    component_url = component.get(
                        'componentUrl') + component_id
                    additional_genetic_components.append({
                        "id": component_id,
                        "componentUrl": component_url,
                        "componentSymbol": component_symbol
                    })

        if 'with' in disease_record:
            with_record = disease_record.get('with')
            # First pass extends the unique key with every 'with' id so all
            # with_map entries below share the final (fully extended) key.
            for rec in with_record:
                disease_unique_key = disease_unique_key + rec
            for rec in with_record:
                with_map = {
                    "diseaseUniqueKey": disease_unique_key,
                    "withD": rec
                }
                withs.append(with_map)

        if 'primaryGeneticEntityIDs' in disease_record:
            pge_ids = disease_record.get('primaryGeneticEntityIDs')
            for pge in pge_ids:
                pge_key = pge_key + pge
                pge_map = {
                    "pecjPrimaryKey": pecj_primary_key,
                    "pgeId": pge
                }
                pge_list_to_yield.append(pge_map)

        if 'dataProvider' in disease_record:
            for dp in disease_record['dataProvider']:
                annotation_type = dp.get('type')
                xref = dp.get('crossReference')
                cross_ref_id = xref.get('id')
                pages = xref.get('pages')
                if ":" in cross_ref_id:
                    local_crossref_id = cross_ref_id.split(":")[1]
                    prefix = cross_ref_id.split(":")[0]
                else:
                    local_crossref_id = ""
                    prefix = cross_ref_id
                if annotation_type is None:
                    annotation_type = 'curated'
                if pages is not None and len(pages) > 0:
                    for page in pages:
                        # Display-name overrides for RGD/HUMAN-sourced DOID and
                        # OMIM cross references.
                        if (data_provider == 'RGD' or data_provider == 'HUMAN') \
                                and prefix == 'DOID':
                            display_name = 'RGD'
                        elif (data_provider == 'RGD' or data_provider == 'HUMAN') \
                                and prefix == 'OMIM':
                            display_name = 'OMIM'
                        else:
                            display_name = cross_ref_id.split(":")[0]
                            if display_name == 'DOID':
                                display_name = data_provider
                        mod_global_cross_ref_id = ETLHelper.get_page_complete_url(
                            local_crossref_id, self.xref_url_map, prefix, page)
                        passing_xref = ETLHelper.get_xref_dict(
                            local_crossref_id, prefix, page, page,
                            display_name, mod_global_cross_ref_id,
                            cross_ref_id + page + annotation_type)
                        passing_xref['dataId'] = disease_unique_key
                        # Mark whether the annotation was bulk-loaded or curated.
                        if 'loaded' in annotation_type:
                            passing_xref['loadedDB'] = 'true'
                            passing_xref['curatedDB'] = 'false'
                        else:
                            passing_xref['curatedDB'] = 'true'
                            passing_xref['loadedDB'] = 'false'
                        xrefs.append(passing_xref)

        # NOTE(review): this rebinds the loop variable; all reads of the raw
        # record must happen above this point.
        disease_record = {
            "diseaseUniqueKey": disease_unique_key,
            "doId": do_id,
            "primaryId": primary_id,
            "pecjPrimaryKey": pecj_primary_key,
            "relationshipType": disease_association_type.upper(),
            "dataProvider": data_provider,
            "dateAssigned": disease_record.get("dateAssigned"),
            "pubPrimaryKey": publication_mod_id + pub_med_id,
            "pubModId": publication_mod_id,
            "pubMedId": pub_med_id,
            "pubMedUrl": pub_med_url,
            "pubModUrl": pub_mod_url,
            "negation": negation
        }

        # Route the record to the list matching its object type.
        if disease_object_type == 'gene':
            gene_list_to_yield.append(disease_record)
        elif disease_object_type == 'allele':
            allele_list_to_yield.append(disease_record)
        else:
            agm_list_to_yield.append(disease_record)

        if counter == batch_size:
            # pge_list_to_yield appears three times on purpose: three
            # downstream queries each consume one copy.
            yield [
                allele_list_to_yield, gene_list_to_yield,
                agm_list_to_yield, pge_list_to_yield, pge_list_to_yield,
                pge_list_to_yield, withs, evidence_code_list_to_yield,
                xrefs
            ]
            agm_list_to_yield = []
            allele_list_to_yield = []
            gene_list_to_yield = []
            evidence_code_list_to_yield = []
            pge_list_to_yield = []
            xrefs = []
            withs = []
            counter = 0

    if counter > 0:
        yield [
            allele_list_to_yield, gene_list_to_yield,
            agm_list_to_yield, pge_list_to_yield, pge_list_to_yield,
            pge_list_to_yield, withs, evidence_code_list_to_yield,
            xrefs
        ]
def get_generators(self, agm_data, data_provider, batch_size):
    """Get Generators.

    Parses affected-genomic-model (AGM) records and yields batches of six
    parallel lists: ``[agms, agm_secondary_ids, agm_synonyms, components,
    sqtrs, backgrounds]``.

    :param agm_data: parsed submission dict (``metaData`` + ``data``).
    :param data_provider: incoming value is overwritten below by
        ``metaData.dataProvider.crossReference.id``.
    :param batch_size: records per yielded batch.
    """
    data_providers = []
    agms = []
    agm_synonyms = []
    agm_secondary_ids = []
    # NOTE(review): initialized once outside the record loop — if one record
    # sets it and a later record has no qualifying crossReference page, the
    # later record inherits the earlier record's URL; confirm this is intended.
    mod_global_cross_ref_url = ""
    components = []
    backgrounds = []
    sqtrs = []
    counter = 0
    date_produced = agm_data['metaData']['dateProduced']
    data_provider_object = agm_data['metaData']['dataProvider']
    data_provider_cross_ref = data_provider_object.get('crossReference')
    data_provider = data_provider_cross_ref.get('id')
    data_provider_pages = data_provider_cross_ref.get('pages')
    data_provider_cross_ref_set = []
    load_key = date_produced + data_provider + "_agm"

    # Build one provider-level cross reference per declared page.
    if data_provider_pages is not None:
        for data_provider_page in data_provider_pages:
            cross_ref_complete_url = ETLHelper.get_page_complete_url(
                data_provider, self.xref_url_map, data_provider,
                data_provider_page)
            data_provider_cross_ref_set.append(ETLHelper.get_xref_dict(\
                data_provider, data_provider, data_provider_page,
                data_provider_page, data_provider, cross_ref_complete_url,
                data_provider + data_provider_page))
            data_providers.append(data_provider)
            self.logger.info("data provider: %s", data_provider)

    for agm_record in agm_data['data']:
        counter = counter + 1
        global_id = agm_record['primaryID']
        local_id = global_id.split(":")[1]

        # In test mode, only keep whitelisted ids.
        if self.test_object.using_test_data() is True:
            is_it_test_entry = self.test_object.check_for_test_id_entry(
                global_id)
            if is_it_test_entry is False:
                counter = counter - 1
                continue

        if agm_record.get('secondaryIds') is not None:
            for sid in agm_record.get('secondaryIds'):
                agm_secondary_id_dataset = {
                    "primaryId": agm_record.get('primaryID'),
                    "secondaryId": sid
                }
                agm_secondary_ids.append(agm_secondary_id_dataset)

        if agm_record.get('synonyms') is not None:
            for syn in agm_record.get('synonyms'):
                syn_dataset = {
                    "primaryId": agm_record.get('primaryID'),
                    "synonym": syn
                }
                agm_synonyms.append(syn_dataset)

        if 'crossReference' in agm_record:
            cross_ref = agm_record.get('crossReference')
            cross_ref_id = cross_ref.get('id')
            local_crossref_id = cross_ref_id.split(":")[1]
            prefix = cross_ref.get('id').split(":")[0]
            pages = cross_ref.get('pages')

            # some pages collection have 0 elements
            if pages is not None and len(pages) > 0:
                for page in pages:
                    # Only these page kinds produce a MOD-global URL.
                    if page in ['Fish', 'genotype', 'strain']:
                        mod_global_cross_ref_url = ETLHelper.get_page_complete_url(
                            local_crossref_id, self.xref_url_map, prefix, page)

        short_species_abbreviation = ETLHelper.get_short_species_abbreviation(
            agm_record.get('taxonId'))
        name_text = TextProcessingHelper.cleanhtml(agm_record.get('name'))

        # TODO: make subtype required in submission file.
        subtype = agm_record.get('subtype')
        if subtype is None and data_provider == 'WB':
            subtype = 'strain'
        if subtype is None:
            subtype = 'affected_genomic_model'

        # TODO: name_text
        agm_dataset = {
            "primaryId": agm_record.get('primaryID'),
            "name": agm_record.get('name'),
            "globalId": global_id,
            "localId": local_id,
            "taxonId": agm_record.get('taxonId'),
            "dataProviders": data_providers,
            "dateProduced": date_produced,
            "loadKey": load_key,
            "subtype": subtype,
            "modGlobalCrossRefUrl": mod_global_cross_ref_url,
            "dataProvider": data_provider,
            "nameText": name_text,
            "nameWithSpecies": agm_record.get('name') + " (" + short_species_abbreviation + ")",
            "nameTextWithSpecies": name_text + " (" + short_species_abbreviation + ")",
        }
        agms.append(agm_dataset)

        if agm_record.get('affectedGenomicModelComponents') is not None:
            for component in agm_record.get(
                    'affectedGenomicModelComponents'):
                component_dataset = {
                    "primaryId": agm_record.get('primaryID'),
                    "componentId": component.get('alleleID'),
                    "zygosityId": component.get('zygosity')
                }
                components.append(component_dataset)

        if agm_record.get('sequenceTargetingReagentIDs') is not None:
            for sqtr in agm_record.get('sequenceTargetingReagentIDs'):
                sqtr_dataset = {
                    "primaryId": agm_record.get('primaryID'),
                    "sqtrId": sqtr
                }
                sqtrs.append(sqtr_dataset)

        if agm_record.get('parentalPopulationIDs') is not None:
            for background in agm_record.get('parentalPopulationIDs'):
                background_dataset = {
                    "primaryId": agm_record.get('primaryID'),
                    "backgroundId": background
                }
                backgrounds.append(background_dataset)

        if counter == batch_size:
            yield [
                agms, agm_secondary_ids, agm_synonyms, components, sqtrs,
                backgrounds
            ]
            # NOTE(review): sqtrs is not reset here, so entries repeat in every
            # subsequent batch — confirm whether downstream dedupes (MERGE) or
            # whether this is a latent bug.
            agms = []
            agm_secondary_ids = []
            agm_synonyms = []
            components = []
            backgrounds = []
            counter = 0

    if counter > 0:
        yield [
            agms, agm_secondary_ids, agm_synonyms, components, sqtrs,
            backgrounds
        ]
def get_generators(self, allele_data, batch_size):
    """Yield batches of allele records parsed from a DQM submission.

    Each allele is routed to one of four lists depending on whether its
    object relations reference a gene, a construct, both, or neither.
    Yields ``[alleles_no_construct, alleles_construct_gene, alleles_no_gene,
    alleles_no_constrcut_no_gene, allele_secondary_ids, allele_synonyms,
    cross_reference_list]`` every ``batch_size`` records, plus a final
    partial batch.

    :param allele_data: parsed submission dict (``metaData`` + ``data``).
    :param batch_size: records per yielded batch.
    """
    data_providers = []
    release = ""
    # NOTE(review): "constrcut" is a long-standing typo in this local name;
    # kept as-is here since this is a documentation-only pass.
    alleles_no_constrcut_no_gene = []
    alleles_construct_gene = []
    alleles_no_construct = []
    alleles_no_gene = []
    allele_synonyms = []
    allele_secondary_ids = []
    cross_reference_list = []
    counter = 0
    date_produced = allele_data['metaData']['dateProduced']
    data_provider_object = allele_data['metaData']['dataProvider']
    data_provider_cross_ref = data_provider_object.get('crossReference')
    data_provider = data_provider_cross_ref.get('id')
    data_provider_pages = data_provider_cross_ref.get('pages')
    data_provider_cross_ref_set = []
    loadKey = date_produced + data_provider + "_ALLELE"

    # TODO: get SGD to fix their files.

    # Build one provider-level cross reference per declared page.
    if data_provider_pages is not None:
        for data_provider_page in data_provider_pages:
            cross_ref_complete_url = ETLHelper.get_page_complete_url(data_provider,
                                                                     self.xref_url_map,
                                                                     data_provider,
                                                                     data_provider_page)

            data_provider_cross_ref_set.append(ETLHelper.get_xref_dict(data_provider,
                                                                       data_provider,
                                                                       data_provider_page,
                                                                       data_provider_page,
                                                                       data_provider,
                                                                       cross_ref_complete_url,
                                                                       data_provider + data_provider_page))

            data_providers.append(data_provider)
            logger.info("data provider: " + data_provider)

    if 'release' in allele_data['metaData']:
        release = allele_data['metaData']['release']

    for allele_record in allele_data['data']:
        counter = counter + 1
        global_id = allele_record['primaryId']
        # fixing parsing error on this end while MGI fixes on their end.
        if global_id == 'MGI:3826848':
            description = allele_record.get('description')[:-2]
        else:
            description = allele_record.get('description')

        local_id = global_id.split(":")[1]
        mod_global_cross_ref_id = ""

        # In test mode, only keep whitelisted ids.
        if self.test_object.using_test_data() is True:
            is_it_test_entry = self.test_object.check_for_test_id_entry(global_id)
            if is_it_test_entry is False:
                counter = counter - 1
                continue

        gene_id = ''
        construct_id = ''
        association_type = ''

        short_species_abbreviation = ETLHelper.get_short_species_abbreviation(allele_record.get('taxonId'))
        symbol_text = TextProcessingHelper.cleanhtml(allele_record.get('symbol'))

        if allele_record.get('alleleObjectRelations') is not None:
            # NOTE(review): gene_id/construct_id accumulate across relations,
            # so a record is classified per-relation using whatever has been
            # seen so far — confirm multi-relation records behave as intended.
            for relation in allele_record.get('alleleObjectRelations'):
                association_type = relation.get('objectRelation').get('associationType')
                if relation.get('objectRelation').get('gene') is not None:
                    gene_id = relation.get('objectRelation').get('gene')
                if relation.get('objectRelation').get('construct') is not None:
                    construct_id = relation.get('objectRelation').get('construct')

                # Route to one of four lists by gene/construct presence.
                if gene_id != '' and construct_id != '':
                    allele_construct_gene_dataset = {
                        "symbol": allele_record.get('symbol'),
                        "geneId": gene_id,
                        "primaryId": allele_record.get('primaryId'),
                        "globalId": global_id,
                        "localId": local_id,
                        "taxonId": allele_record.get('taxonId'),
                        "dataProviders": data_providers,
                        "dateProduced": date_produced,
                        "loadKey": loadKey,
                        "release": release,
                        "modGlobalCrossRefId": mod_global_cross_ref_id,
                        "uuid": str(uuid.uuid4()),
                        "dataProvider": data_provider,
                        "symbolWithSpecies": allele_record.get('symbol') + " (" + short_species_abbreviation + ")",
                        "symbolTextWithSpecies": symbol_text + " (" + short_species_abbreviation + ")",
                        "symbolText": symbol_text,
                        "alleleDescription": description,
                        "constructId": construct_id,
                        "associationType": association_type
                    }
                    alleles_construct_gene.append(allele_construct_gene_dataset)
                elif construct_id != '' and gene_id == '':
                    allele_construct_no_gene_dataset = {
                        "symbol": allele_record.get('symbol'),
                        "primaryId": allele_record.get('primaryId'),
                        "globalId": global_id,
                        "localId": local_id,
                        "taxonId": allele_record.get('taxonId'),
                        "dataProviders": data_providers,
                        "dateProduced": date_produced,
                        "loadKey": loadKey,
                        "release": release,
                        "modGlobalCrossRefId": mod_global_cross_ref_id,
                        "uuid": str(uuid.uuid4()),
                        "dataProvider": data_provider,
                        "symbolWithSpecies": allele_record.get('symbol') + " (" + short_species_abbreviation + ")",
                        "symbolTextWithSpecies": symbol_text + " (" + short_species_abbreviation + ")",
                        "symbolText": symbol_text,
                        "alleleDescription": description,
                        "constructId": construct_id,
                        "associationType": association_type
                    }
                    alleles_no_gene.append(allele_construct_no_gene_dataset)
                elif gene_id != '' and construct_id == '':
                    allele_gene_no_construct_dataset = {
                        "symbol": allele_record.get('symbol'),
                        "geneId": gene_id,
                        "primaryId": allele_record.get('primaryId'),
                        "globalId": global_id,
                        "localId": local_id,
                        "taxonId": allele_record.get('taxonId'),
                        "dataProviders": data_providers,
                        "dateProduced": date_produced,
                        "loadKey": loadKey,
                        "release": release,
                        "modGlobalCrossRefId": mod_global_cross_ref_id,
                        "uuid": str(uuid.uuid4()),
                        "dataProvider": data_provider,
                        "symbolWithSpecies": allele_record.get('symbol') + " (" + short_species_abbreviation + ")",
                        "symbolTextWithSpecies": symbol_text + " (" + short_species_abbreviation + ")",
                        "symbolText": symbol_text,
                        "alleleDescription": description,
                        "associationType": association_type
                    }
                    alleles_no_construct.append(allele_gene_no_construct_dataset)
                elif gene_id == '' and construct_id == '':
                    allele_no_gene_no_construct_dataset = {
                        "symbol": allele_record.get('symbol'),
                        "primaryId": allele_record.get('primaryId'),
                        "globalId": global_id,
                        "localId": local_id,
                        "taxonId": allele_record.get('taxonId'),
                        "dataProviders": data_providers,
                        "dateProduced": date_produced,
                        "loadKey": loadKey,
                        "release": release,
                        "modGlobalCrossRefId": mod_global_cross_ref_id,
                        "uuid": str(uuid.uuid4()),
                        "dataProvider": data_provider,
                        "symbolWithSpecies": allele_record.get('symbol') + " (" + short_species_abbreviation + ")",
                        "symbolTextWithSpecies": symbol_text + " (" + short_species_abbreviation + ")",
                        "symbolText": symbol_text,
                        "alleleDescription": description,
                        "associationType": association_type
                    }
                    alleles_no_constrcut_no_gene.append(allele_no_gene_no_construct_dataset)
        else:
            # Record has no object relations at all: same shape as the
            # no-gene/no-construct case above.
            allele_no_gene_no_construct_dataset = {
                "symbol": allele_record.get('symbol'),
                "primaryId": allele_record.get('primaryId'),
                "globalId": global_id,
                "localId": local_id,
                "taxonId": allele_record.get('taxonId'),
                "dataProviders": data_providers,
                "dateProduced": date_produced,
                "loadKey": loadKey,
                "release": release,
                "modGlobalCrossRefId": mod_global_cross_ref_id,
                "uuid": str(uuid.uuid4()),
                "dataProvider": data_provider,
                "symbolWithSpecies": allele_record.get('symbol') + " (" + short_species_abbreviation + ")",
                "symbolTextWithSpecies": symbol_text + " (" + short_species_abbreviation + ")",
                "symbolText": symbol_text,
                "alleleDescription": description,
                "associationType": association_type
            }
            alleles_no_constrcut_no_gene.append(allele_no_gene_no_construct_dataset)

        if 'crossReferences' in allele_record:
            for crossRef in allele_record['crossReferences']:
                crossRefId = crossRef.get('id')
                local_crossref_id = crossRefId.split(":")[1]
                prefix = crossRef.get('id').split(":")[0]
                pages = crossRef.get('pages')

                # some pages collection have 0 elements
                if pages is not None and len(pages) > 0:
                    for page in pages:
                        # Only these page kinds become cross references.
                        if page == 'allele' or page == 'allele/references' or page == 'transgene' or page == 'construct' \
                                or page == 'transgene/references' or page == 'construct/references':
                            mod_global_cross_ref_id = ETLHelper.get_page_complete_url(local_crossref_id,
                                                                                      self.xref_url_map,
                                                                                      prefix,
                                                                                      page)
                            xref = ETLHelper.get_xref_dict(local_crossref_id,
                                                           prefix,
                                                           page,
                                                           page,
                                                           crossRefId,
                                                           mod_global_cross_ref_id,
                                                           crossRefId + page)
                            xref['dataId'] = global_id
                            cross_reference_list.append(xref)

        if 'synonyms' in allele_record:
            for syn in allele_record.get('synonyms'):
                allele_synonym = {
                    "data_id": allele_record.get('primaryId'),
                    "synonym": syn.strip()
                }
                allele_synonyms.append(allele_synonym)

        if 'secondaryIds' in allele_record:
            for secondary_id in allele_record.get('secondaryIds'):
                allele_secondary_id = {
                    "data_id": allele_record.get('primaryId'),
                    "secondary_id": secondary_id
                }
                allele_secondary_ids.append(allele_secondary_id)

        if counter == batch_size:
            yield [alleles_no_construct, alleles_construct_gene, alleles_no_gene,
                   alleles_no_constrcut_no_gene, allele_secondary_ids,
                   allele_synonyms, cross_reference_list]
            alleles_no_construct = []
            alleles_construct_gene = []
            alleles_no_gene = []
            alleles_no_constrcut_no_gene = []
            allele_secondary_ids = []
            allele_synonyms = []
            cross_reference_list = []
            counter = 0

    if counter > 0:
        yield [alleles_no_construct, alleles_construct_gene, alleles_no_gene,
               alleles_no_constrcut_no_gene, allele_secondary_ids,
               allele_synonyms, cross_reference_list]
def get_generators(self, expression_file, batch_size):
    """Get Generators.

    Streams expression annotations from ``expression_file`` with ijson and
    yields batches of 17 parallel lists keyed by two composite keys:
    ``ei_uuid`` (expression instance: gene + assay + stage + entity) and
    ``ebe_uuid`` (expression bio-entity: where-expressed terms + statement).

    :param expression_file: path to the JSON submission; records are read
        lazily from ``data.item`` so the whole file is never held in memory.
    :param batch_size: records per yielded batch.
    """
    self.logger.debug("made it to the expression generator")

    counter = 0
    cross_references = []
    bio_entities = []
    bio_join_entities = []
    bio_entity_gene_aos = []
    pubs = []
    ao_expressions = []
    cc_expressions = []
    ao_qualifiers = []
    ao_substructures = []
    ao_ss_qualifiers = []
    cc_qualifiers = []
    ao_cc_expressions = []
    stage_list = []
    stage_uberon_data = []
    uberon_ao_data = []
    uberon_ao_other_data = []
    uberon_stage_other_data = []

    self.logger.debug("streaming json data from %s ...", expression_file)
    with codecs.open(expression_file, 'r', 'utf-8') as file_handle:
        for xpat in ijson.items(file_handle, 'data.item'):
            counter = counter + 1

            # Per-record publication/stage fields; reset each iteration.
            pub_med_url = None
            pub_mod_url = None
            pub_med_id = ""
            publication_mod_id = ""
            stage_term_id = ""
            stage_name = ""
            stage_uberon_term_id = ""

            gene_id = xpat.get('geneId')

            # In test mode, only keep whitelisted gene ids.
            if self.test_object.using_test_data() is True:
                is_it_test_entry = self.test_object.check_for_test_id_entry(
                    gene_id)
                if is_it_test_entry is False:
                    counter = counter - 1
                    continue

            evidence = xpat.get('evidence')

            if 'publicationId' in evidence:
                if evidence.get('publicationId').startswith('PMID:'):
                    pub_med_id = evidence.get('publicationId')
                    local_pub_med_id = pub_med_id.split(":")[1]
                    pub_med_prefix = pub_med_id.split(":")[0]
                    # NOTE(review): the last argument here is gene_id where the
                    # other callers pass the publication id — confirm intended.
                    pub_med_url = ETLHelper.get_no_page_complete_url(
                        local_pub_med_id, self.xref_url_map, pub_med_prefix,
                        gene_id)
                    if pub_med_id is None:
                        pub_med_id = ""
                    # A PMID publication may also carry a MOD cross reference.
                    if 'crossReference' in evidence:
                        pub_xref = evidence.get('crossReference')
                        publication_mod_id = pub_xref.get('id')
                        if publication_mod_id is not None:
                            pub_mod_url = ETLHelper.get_expression_pub_annotation_xref( \
                                publication_mod_id)
                else:
                    publication_mod_id = evidence['publicationId']
                    if publication_mod_id is not None:
                        pub_mod_url = ETLHelper.get_expression_pub_annotation_xref(\
                            publication_mod_id)
                if publication_mod_id is None:
                    publication_mod_id = ""

            assay = xpat.get('assay')

            if 'whereExpressed' in xpat:
                # NOTE(review): the variables bound here are read below even
                # when 'whereExpressed' is absent — presumably every record
                # carries it; confirm against the submission schema.
                where_expressed = xpat.get('whereExpressed')
                cellular_component_qualifier_term_id = \
                    where_expressed.get('cellularComponentQualifierTermId')
                cellular_component_term_id = where_expressed.get(
                    'cellularComponentTermId')
                anatomical_structure_term_id = where_expressed.get(
                    'anatomicalStructureTermId')
                anatomical_structure_qualifier_term_id = where_expressed.get(
                    'anatomicalStructureQualifierTermId')
                anatomical_sub_structure_term_id = \
                    where_expressed.get('anatomicalSubStructureTermId')
                anatomical_sub_structure_qualifier_term_id = where_expressed.get(
                    'anatomicalSubStructureQualifierTermId')
                where_expressed_statement = where_expressed.get(
                    'whereExpressedStatement')

            when_expressed_stage = xpat.get('whenExpressed')

            if 'stageTermId' in when_expressed_stage:
                stage_term_id = when_expressed_stage.get('stageTermId')
            if 'stageName' in when_expressed_stage:
                stage_name = when_expressed_stage.get('stageName')

            # TODO: making unique BioEntityGeneExpressionJoin nodes
            # and ExpressionBioEntity nodes is tedious.
            # TODO: Lets get the DQMs to fix this.

            # Build the two composite keys by concatenating whichever term ids
            # are present; anatomical_sub_structure_term_id joins ei only.
            expression_unique_key = gene_id + assay + stage_name
            expression_entity_unique_key = ""

            if anatomical_structure_term_id is not None:
                expression_unique_key += anatomical_structure_term_id
                expression_entity_unique_key = anatomical_structure_term_id

            if anatomical_structure_qualifier_term_id is not None:
                expression_unique_key += anatomical_structure_qualifier_term_id
                expression_entity_unique_key += anatomical_structure_qualifier_term_id

            if cellular_component_term_id is not None:
                expression_unique_key += cellular_component_term_id
                expression_entity_unique_key += cellular_component_term_id

            if cellular_component_qualifier_term_id is not None:
                expression_unique_key += cellular_component_qualifier_term_id
                expression_entity_unique_key += cellular_component_qualifier_term_id

            if anatomical_sub_structure_term_id is not None:
                expression_unique_key += anatomical_sub_structure_term_id

            if anatomical_sub_structure_qualifier_term_id is not None:
                expression_unique_key += anatomical_sub_structure_qualifier_term_id
                expression_entity_unique_key \
                    += anatomical_sub_structure_qualifier_term_id

            expression_entity_unique_key += where_expressed_statement
            expression_unique_key += where_expressed_statement

            # AO Uberon slim terms: 'Other' is routed to a separate list.
            if where_expressed.get(
                    'anatomicalStructureUberonSlimTermIds') is not None:
                for uberon_structure_term_object in \
                        where_expressed.get('anatomicalStructureUberonSlimTermIds'):
                    structure_uberon_term_id = \
                        uberon_structure_term_object.get('uberonTerm')
                    if structure_uberon_term_id is not None \
                            and structure_uberon_term_id != 'Other':
                        structure_uberon_term = {
                            "ebe_uuid": expression_entity_unique_key,
                            "aoUberonId": structure_uberon_term_id
                        }
                        uberon_ao_data.append(structure_uberon_term)
                    elif structure_uberon_term_id is not None \
                            and structure_uberon_term_id == 'Other':
                        other_structure_uberon_term = {
                            "ebe_uuid": expression_entity_unique_key
                        }
                        uberon_ao_other_data.append(
                            other_structure_uberon_term)

            if where_expressed.get(
                    'anatomicalSubStructureUberonSlimTermIds') is not None:
                for uberon_sub_structure_term_object in \
                        where_expressed.get('anatomicalSubStructureUberonSlimTermIds'):
                    sub_structure_uberon_term_id = \
                        uberon_sub_structure_term_object.get('uberonTerm')
                    if sub_structure_uberon_term_id is not None \
                            and sub_structure_uberon_term_id != 'Other':
                        sub_structure_uberon_term = {
                            "ebe_uuid": expression_entity_unique_key,
                            "aoUberonId": sub_structure_uberon_term_id
                        }
                        uberon_ao_data.append(
                            sub_structure_uberon_term)
                    elif sub_structure_uberon_term_id is not None \
                            and sub_structure_uberon_term_id == 'Other':
                        other_structure_uberon_term = {
                            "ebe_uuid": expression_entity_unique_key
                        }
                        uberon_ao_other_data.append(
                            other_structure_uberon_term)

            if cellular_component_term_id is None:
                cellular_component_term_id = ""

            # Stage Uberon slim term; the "post embryonic, pre-adult"
            # sentinel goes to the 'other' list instead.
            if when_expressed_stage.get(
                    'stageUberonSlimTerm') is not None:
                stage_uberon_term_object = when_expressed_stage.get(
                    'stageUberonSlimTerm')
                stage_uberon_term_id = stage_uberon_term_object.get(
                    "uberonTerm")
                if stage_uberon_term_id is not None \
                        and stage_uberon_term_id != "post embryonic, pre-adult":
                    stage_uberon = {
                        "uberonStageId": stage_uberon_term_id,
                        "ei_uuid": expression_unique_key
                    }
                    stage_uberon_data.append(stage_uberon)
                if stage_uberon_term_id == "post embryonic, pre-adult":
                    stage_uberon_other = {
                        "ei_uuid": expression_unique_key
                    }
                    uberon_stage_other_data.append(stage_uberon_other)

            if stage_term_id is None or stage_name == 'N/A':
                stage_term_id = ""
                stage_name = ""
                stage_uberon_term_id = ""

            if stage_name is not None:
                stage = {
                    "stageTermId": stage_term_id,
                    "stageName": stage_name,
                    "ei_uuid": expression_unique_key
                }
                stage_list.append(stage)
            else:
                stage_uberon_term_id = ""

            if 'crossReference' in xpat:
                cross_ref = xpat.get('crossReference')
                cross_ref_id = cross_ref.get('id')
                local_cross_ref_id = cross_ref_id.split(":")[1]
                prefix = cross_ref.get('id').split(":")[0]
                pages = cross_ref.get('pages')

                # some pages collection have 0 elements
                if pages is not None and len(pages) > 0:
                    for page in pages:
                        # Only the annotation-detail page becomes an xref.
                        if page == 'gene/expression/annotation/detail':
                            mod_global_cross_ref_id = ETLHelper.get_page_complete_url(\
                                local_cross_ref_id, self.xref_url_map, prefix, page)
                            xref = ETLHelper.get_xref_dict(
                                local_cross_ref_id, prefix, page, page,
                                cross_ref_id, mod_global_cross_ref_id,
                                cross_ref_id + page)
                            xref['ei_uuid'] = expression_unique_key
                            cross_references.append(xref)

            bio_entity = {
                "ebe_uuid": expression_entity_unique_key,
                "whereExpressedStatement": where_expressed_statement
            }
            bio_entities.append(bio_entity)

            bio_join_entity = {
                "ei_uuid": expression_unique_key,
                "assay": assay
            }
            bio_join_entities.append(bio_join_entity)

            bio_entity_gene_ao = {
                "geneId": gene_id,
                "ebe_uuid": expression_entity_unique_key,
                "anatomicalStructureTermId": anatomical_structure_term_id,
                "ei_uuid": expression_unique_key
            }
            bio_entity_gene_aos.append(bio_entity_gene_ao)

            pub = {
                "ei_uuid": expression_unique_key,
                "pubPrimaryKey": pub_med_id + publication_mod_id,
                "pubMedId": pub_med_id,
                "pubMedUrl": pub_med_url,
                "pubModId": publication_mod_id,
                "pubModUrl": pub_mod_url
            }
            pubs.append(pub)

            ao_expression = {
                "geneId": gene_id,
                "whenExpressedStage": when_expressed_stage,
                "pubMedId": pub_med_id,
                "pubMedUrl": pub_med_url,
                "pubModId": publication_mod_id,
                "pubModUrl": pub_mod_url,
                "pubPrimaryKey": pub_med_id + publication_mod_id,
                "uuid": str(uuid.uuid4()),
                "assay": assay,
                "anatomicalStructureTermId": anatomical_structure_term_id,
                "whereExpressedStatement": where_expressed_statement,
                "ei_uuid": expression_unique_key,
                "ebe_uuid": expression_entity_unique_key
            }
            ao_expressions.append(ao_expression)

            if cellular_component_qualifier_term_id is not None:
                cc_qualifier = {
                    "ebe_uuid": expression_entity_unique_key,
                    "cellularComponentQualifierTermId":
                        cellular_component_qualifier_term_id
                }
                cc_qualifiers.append(cc_qualifier)

            if anatomical_structure_term_id is None:
                anatomical_structure_term_id = ""

            cc_expression = {
                "geneId": gene_id,
                "whenExpressedStage": when_expressed_stage,
                "pubMedId": pub_med_id,
                "pubMedUrl": pub_med_url,
                "pubModId": publication_mod_id,
                "pubModUrl": pub_mod_url,
                "pubPrimaryKey": pub_med_id + publication_mod_id,
                "assay": assay,
                "whereExpressedStatement": where_expressed_statement,
                "cellularComponentTermId": cellular_component_term_id,
                "ei_uuid": expression_unique_key,
                "ebe_uuid": expression_entity_unique_key
            }
            cc_expressions.append(cc_expression)

            if anatomical_structure_qualifier_term_id is not None:
                ao_qualifier = {
                    "ebe_uuid": expression_entity_unique_key,
                    "anatomicalStructureQualifierTermId":
                        anatomical_structure_qualifier_term_id
                }
                ao_qualifiers.append(ao_qualifier)

            if anatomical_sub_structure_term_id is not None:
                ao_substructure = {
                    "ebe_uuid": expression_entity_unique_key,
                    "anatomicalSubStructureTermId":
                        anatomical_sub_structure_term_id
                }
                ao_substructures.append(ao_substructure)

            if anatomical_sub_structure_qualifier_term_id is not None:
                ao_ss_qualifier = {
                    "ebe_uuid": expression_entity_unique_key,
                    "anatomicalSubStructureQualifierTermId":
                        anatomical_sub_structure_qualifier_term_id
                }
                ao_ss_qualifiers.append(ao_ss_qualifier)

            if where_expressed_statement is None:
                where_expressed_statement = ""

            # Combined AO+CC expression is only emitted when both term ids
            # are present and non-empty.
            if anatomical_structure_term_id is not None \
                    and anatomical_structure_term_id != "" \
                    and cellular_component_term_id is not None \
                    and cellular_component_term_id != "":
                ao_cc_expression = {
                    "geneId": gene_id,
                    "whenExpressedStage": when_expressed_stage,
                    "pubMedId": pub_med_id,
                    "pubMedUrl": pub_med_url,
                    "pubModId": publication_mod_id,
                    "pubModUrl": pub_mod_url,
                    "pubPrimaryKey": pub_med_id + publication_mod_id,
                    "uuid": str(uuid.uuid4()),
                    "stageTermId": stage_term_id,
                    "stageName": stage_name,
                    "stageUberonTermId": stage_uberon_term_id,
                    "assay": assay,
                    "cellularComponentTermId": cellular_component_term_id,
                    "anatomicalStructureTermId": anatomical_structure_term_id,
                    "whereExpressedStatement": where_expressed_statement,
                    "ei_uuid": expression_unique_key,
                    "ebe_uuid": expression_entity_unique_key
                }
                ao_cc_expressions.append(ao_cc_expression)

            if counter == batch_size:
                yield [
                    bio_entities,
                    bio_entity_gene_aos,
                    bio_join_entities,
                    ao_expressions,
                    cc_expressions,
                    ao_cc_expressions,
                    ao_qualifiers,
                    ao_substructures,
                    ao_ss_qualifiers,
                    cc_qualifiers,
                    stage_list,
                    stage_uberon_data,
                    uberon_ao_data,
                    uberon_ao_other_data,
                    uberon_stage_other_data,
                    cross_references,
                    pubs
                ]
                bio_entities = []
                bio_join_entities = []
                ao_expressions = []
                cc_expressions = []
                ao_qualifiers = []
                ao_substructures = []
                ao_ss_qualifiers = []
                cc_qualifiers = []
                ao_cc_expressions = []
                stage_list = []
                uberon_stage_other_data = []
                stage_uberon_data = []
                uberon_ao_other_data = []
                uberon_ao_data = []
                cross_references = []
                bio_entity_gene_aos = []
                pubs = []
                counter = 0

    if counter > 0:
        yield [
            bio_entities,
            bio_entity_gene_aos,
            bio_join_entities,
            ao_expressions,
            cc_expressions,
            ao_cc_expressions,
            ao_qualifiers,
            ao_substructures,
            ao_ss_qualifiers,
            cc_qualifiers,
            stage_list,
            stage_uberon_data,
            uberon_ao_data,
            uberon_ao_other_data,
            uberon_stage_other_data,
            cross_references,
            pubs
        ]