class GeoXrefETL(ETL):
    """GEO XREF ETL.

    Reads an NCBI efetch XML file of GEO entrez gene ids and attaches
    GEO cross references to the matching Gene nodes in Neo4j.
    """

    logger = logging.getLogger(__name__)

    # Bulk-load template; the shared xref fragment from ETLHelper creates
    # the CrossReference node/edge the same way as the other ETLs.
    geo_xref_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (o:Gene) where o.primaryKey = row.genePrimaryKey """ + ETLHelper.get_cypher_xref_text()

    # Lookup query: maps NCBI_Gene xref ids back to the genes carrying them.
    gene_crossref_query_template = """
        MATCH (g:Gene)-[crr:CROSS_REFERENCE]-(cr:CrossReference)
        WHERE cr.globalCrossRefId IN {parameter}
        RETURN g.primaryKey, g.modLocalId, cr.name, cr.globalCrossRefId"""

    def __init__(self, config):
        """Store the data type config that drives the load."""
        super().__init__()
        self.data_type_config = config

    def _load_and_process_data(self):
        """Run the GEO xref load for every configured sub type."""
        for sub_type in self.data_type_config.get_sub_type_objects():
            species_encoded = urllib.parse.quote_plus(
                ETLHelper.species_lookup_by_data_provider(sub_type.get_data_provider()))

            commit_size = self.data_type_config.get_neo4j_commit_size()
            # NOTE(review): batch size is hard-coded, overriding the configured
            # generator batch size (commented-out line below) -- confirm this
            # override is intentional.
            #batch_size = self.data_type_config.get_generator_batch_size()
            batch_size = 100000

            generators = self.get_generators(sub_type, batch_size, species_encoded)

            query_template_list = [
                [self.geo_xref_query_template, commit_size,
                 "geo_xref_data_" + sub_type.get_data_provider() + ".csv"],
            ]

            query_and_file_list = self.process_query_params(query_template_list)
            CSVTransactor.save_file_static(generators, query_and_file_list)
            Neo4jTransactor.execute_query_batch(query_and_file_list)

    def get_generators(self, sub_type, batch_size, species_encoded):
        """Get Generators.

        Parses the efetch XML for this sub type, collects entrez ids from
        its IdList, looks up the matching genes in Neo4j and yields a single
        list of GEO xref dicts.  batch_size and species_encoded are currently
        unused here.
        """
        entrez_ids = []

        geo_data_file_contents = Path(sub_type.get_filepath()).read_text()
        # xmltodict -> dumps -> loads round trip normalises the parsed
        # OrderedDicts into plain dicts/lists.
        geo_data = json.loads(
            json.dumps(xmltodict.parse(geo_data_file_contents)))
        for efetch_value in dict(geo_data.items()).values():
            # IdList is a value returned from efetch XML spec,
            # within IdList, there is another map with "Id"
            # as the key and the entrez local ids a list value.
            for sub_map_key, sub_map_value in efetch_value.items():
                if sub_map_key == 'IdList':
                    for id_list in dict(sub_map_value.items()).values():
                        for entrez_id in id_list:
                            self.logger.debug("here is the entrez id: %s",
                                              entrez_id)
                            entrez_ids.append("NCBI_Gene:" + entrez_id)

        geo_data_list = []
        return_set = Neo4jHelper.run_single_parameter_query(
            self.gene_crossref_query_template, entrez_ids)

        for record in return_set:
            gene_primary_key = record["g.primaryKey"]
            mod_local_id = record["g.modLocalId"]
            global_cross_ref_id = record["cr.globalCrossRefId"]
            # Build the GEO profiles link from the entrez local id
            # (the part after the "NCBI_Gene:" prefix).
            geo_xref = ETLHelper.get_xref_dict(
                global_cross_ref_id.split(":")[1],
                "NCBI_Gene",
                "gene/other_expression",
                "gene/other_expression",
                "GEO",
                "https://www.ncbi.nlm.nih.gov/sites/entrez?" \
                + "Db=geoprofiles" \
                + "&DbFrom=gene" \
                + "&Cmd=Link" \
                + "&LinkName=gene_geoprofiles" \
                + "&LinkReadableName=GEO%20Profiles" \
                + "&IdsFromResult=" \
                + global_cross_ref_id.split(":")[1],
                global_cross_ref_id + "gene/other_expression")
            geo_xref["genePrimaryKey"] = gene_primary_key
            geo_xref["modLocalId"] = mod_local_id
            geo_data_list.append(geo_xref)

        # Single batch: everything is yielded at once.
        yield [geo_data_list]
class ConstructETL(ETL):
    """Construct ETL"""

    logger = logging.getLogger(__name__)
    xref_url_map = ResourceDescriptorHelper().get_data()

    # Query templates which take params and will be processed later

    construct_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

        //Create the Construct node and set properties. primaryKey is required.
        MERGE (o:Construct {primaryKey:row.primaryId})
            ON CREATE SET o.name = row.name,
                o.dateProduced = row.dateProduced,
                o.release = row.release,
                o.localId = row.localId,
                o.globalId = row.globalId,
                o.uuid = row.uuid,
                o.nameText = row.nameText,
                o.modCrossRefCompleteUrl = row.modGlobalCrossRefId,
                o.dataProviders = row.dataProviders,
                o.dataProvider = row.dataProvider,
                o.symbol = row.symbol """

    # Secondary ids hang off the construct via ALSO_KNOWN_AS.
    construct_secondary_ids_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (f:Construct {primaryKey:row.data_id})
            MERGE (second:SecondaryId {primaryKey:row.secondary_id})
                SET second.name = row.secondary_id
            MERGE (f)-[aka1:ALSO_KNOWN_AS]->(second) """

    construct_synonyms_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (a:Construct {primaryKey:row.data_id})
            MERGE(syn:Synonym {primaryKey:row.synonym})
                SET syn.name = row.synonym
            MERGE (a)-[aka2:ALSO_KNOWN_AS]->(syn) """

    # Shared xref fragment keeps CrossReference creation consistent
    # with the other ETLs.
    construct_xrefs_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (o:Construct {primaryKey:row.dataId}) """ + ETLHelper.get_cypher_xref_text()

    # The relationship type comes from the data (componentRelation), so
    # apoc.create.relationship is used to create it dynamically; the
    # "REMOVE rel.noOp" is a no-op that consumes the yielded rel.
    construct_gene_component_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (o:Construct {primaryKey:row.constructID}),
                  (g:Gene {primaryKey:row.componentID})
            CALL apoc.create.relationship(g, row.componentRelation, {}, o) yield rel
            REMOVE rel.noOp"""

    # Same dynamic-relationship trick, for components with no gene id.
    construct_no_gene_component_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (o:Construct {primaryKey:row.constructID}),
                  (g:NonBGIConstructComponent {primaryKey:row.componentSymbol})
            CALL apoc.create.relationship(g, row.componentRelation, {}, o) yield rel
            REMOVE rel.noOp"""

    non_bgi_component_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MERGE (o:NonBGIConstructComponent {primaryKey:row.componentSymbol})"""

    def __init__(self, config):
        """Store the data type config that drives the load."""
        super().__init__()
        self.data_type_config = config

    def _load_and_process_data(self):
        # One worker process per sub type; ETL.wait_for_threads joins them.
        thread_pool = []

        for sub_type in self.data_type_config.get_sub_type_objects():
            process = multiprocessing.Process(target=self._process_sub_type,
                                              args=(sub_type, ))
            process.start()
            thread_pool.append(process)

        ETL.wait_for_threads(thread_pool)

    def _process_sub_type(self, sub_type):
        """Load one sub type's construct JSON and run the Cypher loads."""
        self.logger.info("Loading Construct Data: %s",
                         sub_type.get_data_provider())
        filepath = sub_type.get_filepath()
        data = JSONFile().get_data(filepath)
        self.logger.info("Finished Loading Construct Data: %s",
                         sub_type.get_data_provider())

        if data is None:
            self.logger.warning("No Data found for %s skipping",
                                sub_type.get_data_provider())
            return

        # This order is the same as the lists yielded from the get_generators function.
        # A list of tuples.
        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # This needs to be in this format (template, param1, params2) others will be ignored
        query_template_list = [
            [ConstructETL.construct_query_template, commit_size,
             "Construct_data_" + sub_type.get_data_provider() + ".csv"],
            [ConstructETL.construct_secondary_ids_query_template, commit_size,
             "Construct_secondary_ids_" + sub_type.get_data_provider() + ".csv"],
            [ConstructETL.construct_synonyms_query_template, commit_size,
             "Construct_synonyms_" + sub_type.get_data_provider() + ".csv"],
            [ConstructETL.construct_xrefs_query_template, commit_size,
             "Construct_xrefs_" + sub_type.get_data_provider() + ".csv"],
            [ConstructETL.non_bgi_component_query_template, commit_size,
             "Construct_non_bgi_component_" + sub_type.get_data_provider() + ".csv"],
            [ConstructETL.construct_gene_component_query_template, commit_size,
             "Construct_components_gene" + sub_type.get_data_provider() + ".csv"],
            [ConstructETL.construct_no_gene_component_query_template, commit_size,
             "Construct_components_no_gene" + sub_type.get_data_provider() + ".csv"]
        ]

        # Obtain the generator
        generators = self.get_generators(data,
                                         sub_type.get_data_provider(),
                                         batch_size)

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)

    def get_generators(self, construct_data, data_provider, batch_size):
        """Create Generators.

        Yields lists in the same order as the query templates assembled in
        _process_sub_type, in batches of batch_size records.
        """
        data_providers = []
        release = ""
        constructs = []
        construct_synonyms = []
        construct_secondary_ids = []
        cross_reference_list = []
        component_details = []
        component_no_gene_details = []
        non_bgi_components = []
        counter = 0
        date_produced = construct_data['metaData']['dateProduced']

        data_provider_object = construct_data['metaData']['dataProvider']
        data_provider_cross_ref = data_provider_object.get('crossReference')
        # NOTE(review): this overwrites the data_provider argument with the
        # value from the file's metaData -- confirm the parameter is meant
        # to be ignored.
        data_provider = data_provider_cross_ref.get('id')
        self.logger.info("DataProvider: " + data_provider)
        data_provider_pages = data_provider_cross_ref.get('pages')
        data_provider_cross_ref_set = []
        load_key = date_produced + data_provider + "_construct"

        # TODO: get SGD to fix their files.
        if data_provider_pages is not None:
            for data_provider_page in data_provider_pages:
                cross_ref_complete_url = ETLHelper.get_page_complete_url(
                    data_provider, self.xref_url_map, data_provider,
                    data_provider_page)
                data_provider_cross_ref_set.append(
                    ETLHelper.get_xref_dict(
                        data_provider, data_provider, data_provider_page,
                        data_provider_page, data_provider,
                        cross_ref_complete_url,
                        data_provider + data_provider_page))
                data_providers.append(data_provider)
                self.logger.info("data provider: %s", data_provider)

        if 'release' in construct_data['metaData']:
            release = construct_data['metaData']['release']

        for construct_record in construct_data['data']:
            counter = counter + 1
            global_id = construct_record['primaryId']
            local_id = global_id.split(":")[1]
            mod_global_cross_ref_id = ""

            # In test mode, only keep records whose id is a known test id;
            # skipped records do not count against the batch.
            if self.test_object.using_test_data() is True:
                is_it_test_entry = self.test_object.check_for_test_id_entry(
                    global_id)
                if is_it_test_entry is False:
                    counter = counter - 1
                    continue

            name_text = TextProcessingHelper.cleanhtml(
                construct_record.get('name'))

            construct_dataset = {
                "symbol": construct_record.get('name'),
                "primaryId": construct_record.get('primaryId'),
                "globalId": global_id,
                "localId": local_id,
                "dataProviders": data_providers,
                "dateProduced": date_produced,
                "loadKey": load_key,
                "release": release,
                "modGlobalCrossRefId": mod_global_cross_ref_id,
                "uuid": str(uuid.uuid4()),
                "dataProvider": data_provider,
                "nameText": name_text,
                "name": construct_record.get('name')
            }
            constructs.append(construct_dataset)

            if 'crossReferences' in construct_record:
                for cross_ref in construct_record.get('crossReferences'):
                    cross_ref_id = cross_ref.get('id')
                    local_crossref_id = cross_ref_id.split(":")[1]
                    prefix = cross_ref.get('id').split(":")[0]
                    pages = cross_ref.get('pages')

                    # some pages collection have 0 elements
                    if pages is not None and len(pages) > 0:
                        for page in pages:
                            if page == 'construct':
                                mod_global_cross_ref_id = ETLHelper.get_page_complete_url(
                                    local_crossref_id, self.xref_url_map,
                                    prefix, page)
                                xref = ETLHelper.get_xref_dict(
                                    local_crossref_id, prefix, page, page,
                                    cross_ref_id, mod_global_cross_ref_id,
                                    cross_ref_id + page)
                                xref['dataId'] = global_id
                                cross_reference_list.append(xref)

            if 'constructComponents' in construct_record:
                for component in construct_record.get('constructComponents'):
                    component_relation = component.get(
                        'componentRelation').upper()
                    component_symbol = component.get('componentSymbol')
                    component_id = component.get('componentID')

                    # Components with a gene id link to Gene nodes; the rest
                    # become NonBGIConstructComponent placeholders.
                    if component_id is not None:
                        component_detail = {
                            "componentRelation": component_relation.upper(),
                            "componentSymbol": component_symbol,
                            "componentID": component_id,
                            "constructID": construct_record.get('primaryId')
                        }
                        component_details.append(component_detail)
                    else:
                        component_detail = {
                            "componentRelation": component_relation.upper(),
                            "componentSymbol": component_symbol,
                            "constructID": construct_record.get('primaryId')
                        }
                        non_bgi_component = {
                            "componentSymbol": component_symbol
                        }
                        non_bgi_components.append(non_bgi_component)
                        component_no_gene_details.append(component_detail)

            if 'synonyms' in construct_record:
                for syn in construct_record.get('synonyms'):
                    construct_synonym = {
                        "data_id": construct_record.get('primaryId'),
                        "synonym": syn.strip()
                    }
                    construct_synonyms.append(construct_synonym)

            if 'secondaryIds' in construct_record:
                for secondary_id in construct_record.get('secondaryIds'):
                    construct_secondary_id = {
                        "data_id": construct_record.get('primaryId'),
                        "secondary_id": secondary_id
                    }
                    construct_secondary_ids.append(construct_secondary_id)

            # Flush a full batch and reset all accumulators.
            if counter == batch_size:
                yield [constructs,
                       construct_secondary_ids,
                       construct_synonyms,
                       cross_reference_list,
                       non_bgi_components,
                       component_details,
                       component_no_gene_details]
                constructs = []
                construct_secondary_ids = []
                construct_synonyms = []
                cross_reference_list = []
                non_bgi_components = []
                component_details = []
                component_no_gene_details = []
                counter = 0

        # Flush the final partial batch.
        if counter > 0:
            yield [constructs,
                   construct_secondary_ids,
                   construct_synonyms,
                   cross_reference_list,
                   non_bgi_components,
                   component_details,
                   component_no_gene_details]
class HTPMetaDatasetSampleETL(ETL):
    """HTP metadata dataset-sample ETL.

    Loads HTPDatasetSample nodes and their related expression bio entities,
    stages, biosamples, assemblies and dataset associations into Neo4j.
    """

    # Duplicate "ds.dateAssigned" SET entry removed (it was listed twice).
    htp_dataset_sample_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (o:OBITerm {primaryKey:row.sampleType})
            MATCH (s:Species {primaryKey: row.taxonId})
            MATCH (a:MMOTerm {primaryKey: row.assayType})

            MERGE (ds:HTPDatasetSample {primaryKey:row.datasetSampleId})
                ON CREATE SET ds.dateAssigned = row.dateAssigned,
                    ds.abundance = row.abundance,
                    ds.sex = row.sex,
                    ds.notes = row.notes,
                    //ds.biosampleText = row.biosampleText,
                    ds.sequencingFormat = row.sequencingFormat,
                    ds.title = row.sampleTitle,
                    ds.sampleAge = row.sampleAge

            MERGE (ds)-[dssp:FROM_SPECIES]-(s)
            //MERGE (ds)-[dsat:ASSAY_TYPE]-(a)
            //MERGE (ds)-[dsst:SAMPLE_TYPE]-(o)
    """

    htp_dataset_sample_agm_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (ds:HTPDatasetSample {primaryKey:row.datasetSampleId})
            MATCH (agm:AffectedGenomicModel {primaryKey:row.biosampleId})

            MERGE (agm)-[agmds:ASSOCIATION]-(ds)
    """

    # Free-text biosamples create a placeholder AGM keyed by the text.
    htp_dataset_sample_agmtext_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (ds:HTPDatasetSample {primaryKey:row.datasetSampleId})
            MERGE (agm:AffectedGenomicModel {primaryKey:row.biosampleText})

            MERGE (agm)-[agmds:ASSOCIATION]-(ds)
    """

    htp_bio_entity_expression_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (dss:HTPDatasetSample {primaryKey:row.datasetSampleId})

            MERGE (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})
                ON CREATE SET e.whereExpressedStatement = row.whereExpressedStatement

            MERGE (dss)-[dsdss:STRUCTURE_SAMPLED]-(e)
    """

    # BUG FIX: the MERGE previously targeted the unbound variable (s),
    # which made Cypher create a fresh anonymous node per row instead of
    # linking to the matched Stage (st).
    htp_stages_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (dss:HTPDatasetSample {primaryKey:row.datasetSampleId})
            MATCH (st:Stage {primaryKey:row.stageName})

            MERGE (dss)-[eotcctq:SAMPLED_DURING]-(st)
    """

    htp_dataset_join_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (ds:HTPDataset {primaryKey:row.datasetId})
            MATCH (dss:HTPDatasetSample {primaryKey:row.datasetSampleId})

            MERGE (ds)-[dsdss:ASSOCIATION]-(dss)
    """

    htp_secondaryIds_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (dss:HTPDatasetSample {primaryKey: row.datasetSampleId})

            MERGE (sec:SecondaryId:Identifier {primaryKey:row.secondaryId})
                ON CREATE SET sec.name = row.secondaryId

            MERGE (dss)<-[aka:ALSO_KNOWN_AS]-(sec)
    """

    ao_substructures_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (otasst:Ontology {primaryKey:row.anatomicalSubStructureTermId})
                WHERE NOT 'UBERONTerm' in LABELS(otasst)
                AND NOT 'FBCVTerm' in LABELS(otasst)
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})

            MERGE (e)-[eotasst:ANATOMICAL_SUBSTRUCTURE]->(otasst)
    """

    # NOTE(review): this and ao_ss_qualifiers reuse the
    # ANATOMICAL_SUBSTRUCTURE relationship type for qualifier edges --
    # looks like copy/paste; left as-is to preserve the graph shape.
    ao_qualifiers_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (otasst:Ontology {primaryKey:row.anatomicalStructureQualifierTermId})
                WHERE NOT 'UBERONTerm' in LABELS(otasst)
                AND NOT 'FBCVTerm' in LABELS(otasst)
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})

            MERGE (e)-[eotasst:ANATOMICAL_SUBSTRUCTURE]->(otasst)
    """

    ao_ss_qualifiers_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (otasst:Ontology {primaryKey:row.anatomicalSubStructureQualifierTermId})
                WHERE NOT 'UBERONTerm' in LABELS(otasst)
                AND NOT 'FBCVTerm' in LABELS(otasst)
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})

            MERGE (e)-[eotasst:ANATOMICAL_SUBSTRUCTURE]->(otasst)
    """

    ao_terms_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (otasst:Ontology {primaryKey:row.anatomicalStructureTermId})
                WHERE NOT 'FBCVTerm' in LABELS(otasst)
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})

            MERGE (e)-[eotasst:ANATOMICAL_SUBSTRUCTURE]->(otasst)
    """

    cc_term_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (otasst:Ontology {primaryKey:row.cellularComponentTermId})
                WHERE NOT 'FBCVTerm' in LABELS(otasst)
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})

            MERGE (e)-[eotasst:ANATOMICAL_SUBSTRUCTURE]->(otasst)
    """

    # NOTE(review): the following templates are defined but not referenced
    # by _process_sub_type -- confirm whether they are still needed.
    eas_substructure_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (otasst:Ontology {primaryKey:row.anatomicalSubStructureTermId})
                WHERE NOT 'FBCVTerm' in LABELS(otasst)
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})

            MERGE (e)-[eotasst:ANATOMICAL_SUB_SUBSTRUCTURE]->(otasst)
    """

    eas_qualified_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (otastq:Ontology {primaryKey:row.anatomicalStructureQualifierTermId})
                WHERE NOT 'FBCVTerm' in LABELS(otastq)
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})

            MERGE (e)-[eotastq:ANATOMICAL_STRUCTURE_QUALIFIER]-(otastq)
    """

    eass_qualified_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (otasstq:Ontology {primaryKey:row.anatomicalSubStructureQualifierTermId})
                WHERE NOT 'UBERONTerm' in LABELS(otasstq)
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})

            MERGE (e)-[eotasstq:ANATOMICAL_SUB_STRUCTURE_QUALIFIER]-(otasstq)
    """

    ccq_expression_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (otcctq:Ontology {primaryKey:row.cellularComponentQualifierTermId})
                WHERE NOT 'UBERONTerm' in LABELS(otcctq)
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})

            MERGE (e)-[eotcctq:CELLULAR_COMPONENT_QUALIFIER]-(otcctq)
    """

    stage_expression_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (ei:BioEntityGeneExpressionJoin {primaryKey:row.ei_uuid})

            MERGE (s:Stage {primaryKey:row.stageName})
                ON CREATE SET s.name = row.stageName

            MERGE (ei)-[eotcctq:DURING]-(s)
    """

    uberon_ao_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (ebe:ExpressionBioEntity {primaryKey:row.ebe_uuid})
            MATCH (o:Ontology:UBERONTerm {primaryKey:row.aoUberonId})

            MERGE (ebe)-[ebeo:ANATOMICAL_RIBBON_TERM]-(o)
    """

    uberon_stage_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (ei:BioEntityGeneExpressionJoin {primaryKey:row.ei_uuid})
            MATCH (o:Ontology:UBERONTerm {primaryKey:row.uberonStageId})

            MERGE (ei)-[eio:STAGE_RIBBON_TERM]-(o)
    """

    uberon_ao_other_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (ebe:ExpressionBioEntity {primaryKey:row.ebe_uuid})
            MATCH (u:Ontology:UBERONTerm:Ontology {primaryKey:'UBERON:AnatomyOtherLocation'})

            MERGE (ebe)-[ebeu:ANATOMICAL_RIBBON_TERM]-(u)
    """

    uberon_stage_other_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (ei:BioEntityGeneExpressionJoin {primaryKey:row.ei_uuid})
            MATCH (u:Ontology:UBERONTerm:Ontology {primaryKey:'UBERON:PostEmbryonicPreAdult'})

            MERGE (ei)-[eiu:STAGE_RIBBON_TERM]-(u)
    """

    htp_dataset_sample_assemblies_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (ds:HTPDatasetSample {primaryKey:row.datasetSampleId})
            MATCH (u:Assembly {primaryKey:row.assembly})

            MERGE (ds)-[dsu:ASSEMBLY]-(u)
    """

    # NOTE(review): matches HTPDatasetSample by row.datasetId (not
    # row.datasetSampleId) -- confirm against the CSV this template is fed.
    htpdatasetsample_xrefs_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (o:HTPDatasetSample {primaryKey:row.datasetId}) """ + ETLHelper.get_cypher_xref_text()

    def __init__(self, config):
        """Store the data type config that drives the load."""
        super().__init__()
        self.data_type_config = config

    def _load_and_process_data(self):
        # One worker process per sub type; ETL.wait_for_threads joins them.
        thread_pool = []

        for sub_type in self.data_type_config.get_sub_type_objects():
            p = multiprocessing.Process(target=self._process_sub_type,
                                        args=(sub_type, ))
            p.start()
            thread_pool.append(p)

        ETL.wait_for_threads(thread_pool)

    def _process_sub_type(self, sub_type):
        """Load one sub type's sample JSON and run the Cypher loads."""
        logger.info("Loading HTP metadata sample data: %s",
                    sub_type.get_data_provider())
        filepath = sub_type.get_filepath()
        logger.info(filepath)
        data = JSONFile().get_data(filepath)
        logger.info("Finished Loading HTP metadata sample data: %s",
                    sub_type.get_data_provider())

        if data is None:
            # logger.warn is deprecated; use warning.
            logger.warning("No Data found for %s skipping",
                           sub_type.get_data_provider())
            return

        # This order is the same as the lists yielded from the get_generators function.
        # A list of tuples.
        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # This needs to be in this format (template, param1, params2) others will be ignored
        query_list = [
            [HTPMetaDatasetSampleETL.htp_dataset_sample_query_template,
             commit_size,
             "htp_metadataset_sample_samples_" + sub_type.get_data_provider() + ".csv"],
            [HTPMetaDatasetSampleETL.htp_bio_entity_expression_query_template,
             commit_size,
             "htp_metadataset_sample_bioentities_" + sub_type.get_data_provider() + ".csv"],
            [HTPMetaDatasetSampleETL.htp_secondaryIds_query_template,
             commit_size,
             "htp_metadataset_sample_secondaryIds_" + sub_type.get_data_provider() + ".csv"],
            [HTPMetaDatasetSampleETL.htp_dataset_join_query_template,
             commit_size,
             "htp_metadataset_sample_datasets_" + sub_type.get_data_provider() + ".csv"],
            [HTPMetaDatasetSampleETL.htp_stages_query_template,
             commit_size,
             "htp_metadataset_sample_stages_" + sub_type.get_data_provider() + ".csv"],
            [HTPMetaDatasetSampleETL.ao_terms_query_template,
             commit_size,
             "htp_metadataset_sample_aoterms_" + sub_type.get_data_provider() + ".csv"],
            [HTPMetaDatasetSampleETL.ao_substructures_query_template,
             commit_size,
             "htp_metadataset_sample_ao_substructures_" + sub_type.get_data_provider() + ".csv"],
            [HTPMetaDatasetSampleETL.ao_qualifiers_query_template,
             commit_size,
             "htp_metadataset_sample_ao_qualifiers_" + sub_type.get_data_provider() + ".csv"],
            [HTPMetaDatasetSampleETL.ao_ss_qualifiers_query_template,
             commit_size,
             "htp_metadataset_sample_ao_ss_qualifiers_" + sub_type.get_data_provider() + ".csv"],
            [HTPMetaDatasetSampleETL.cc_term_query_template,
             commit_size,
             "htp_metadataset_sample_ccterms" + sub_type.get_data_provider() + ".csv"],
            [HTPMetaDatasetSampleETL.ccq_expression_query_template,
             commit_size,
             "htp_metadataset_sample_ccqterms_" + sub_type.get_data_provider() + ".csv"],
            [HTPMetaDatasetSampleETL.uberon_ao_query_template,
             commit_size,
             "htp_metadataset_sample_uberon_ao_" + sub_type.get_data_provider() + ".csv"],
            [HTPMetaDatasetSampleETL.uberon_ao_other_query_template,
             commit_size,
             "htp_metadataset_sample_uberon_ao_other_" + sub_type.get_data_provider() + ".csv"],
            [HTPMetaDatasetSampleETL.htp_dataset_sample_agm_query_template,
             commit_size,
             "htp_metadataset_sample_agms_" + sub_type.get_data_provider() + ".csv"],
            [HTPMetaDatasetSampleETL.htp_dataset_sample_agmtext_query_template,
             commit_size,
             "htp_metadataset_sample_agmstext_" + sub_type.get_data_provider() + ".csv"],
            [HTPMetaDatasetSampleETL.htp_dataset_sample_assemblies_query_template,
             commit_size,
             "htp_metadataset_sample_assemblies_" + sub_type.get_data_provider() + ".csv"]
        ]

        # Obtain the generator
        generators = self.get_generators(data, batch_size)

        query_and_file_list = self.process_query_params(query_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)

    def get_generators(self, htp_datasetsample_data, batch_size):
        """Yield batches of row lists for the CSV/Cypher templates.

        The yield order must match the query_list order assembled in
        _process_sub_type: samples, bio entities, secondary ids, dataset
        joins, stages, AO terms/substructures/qualifiers, CC terms/qualifiers,
        UBERON ribbon terms, biosamples (AGM ids, then free text), assemblies.
        """
        htp_datasetsamples = []
        secondaryIds = []
        datasetIds = []
        assemblies = []
        uberon_ao_data = []
        ao_qualifiers = []
        bio_entities = []
        ao_ss_qualifiers = []
        ao_substructures = []
        ao_terms = []
        uberon_ao_other_data = []
        stages = []
        ccq_components = []
        cc_components = []
        biosamples = []
        biosamplesTexts = []
        counter = 0

        data_provider_object = htp_datasetsample_data['metaData']['dataProvider']
        # NOTE(review): looked up but not used in this method.
        data_provider_cross_ref = data_provider_object.get('crossReference')

        for datasample_record in htp_datasetsample_data['data']:
            counter = counter + 1
            biosampleId = ''
            biosampleText = ''
            sampleId = ''
            sampleTitle = ''

            if 'sampleId' in datasample_record:
                sampleIdObj = datasample_record.get('sampleId')
                sampleId = sampleIdObj.get('primaryId')
                if 'secondaryIds' in sampleIdObj:
                    for secId in sampleIdObj.get('secondaryIds'):
                        secid = {
                            "datasetSampleId": sampleId,
                            "secondaryId": secId
                        }
                        secondaryIds.append(secid)

            # BUG FIX: previously tested `'sampleTitle' in sampleIds`, where
            # sampleIds was always the empty string, so the title was never
            # picked up; look in the record itself.
            if 'sampleTitle' in datasample_record:
                sampleTitle = datasample_record.get('sampleTitle')

            # Sample primary key: id plus title (title disambiguates samples
            # sharing an id).
            datasetSampleId = sampleId + sampleTitle

            if 'datasetIds' in datasample_record:
                datasetIdSet = datasample_record.get('datasetIds')
                for datasetID in datasetIdSet:
                    datasetsample = {
                        "datasetSampleId": datasetSampleId,
                        "datasetId": datasetID
                    }
                    datasetIds.append(datasetsample)

                    # In test mode, skip dataset ids that are not test ids.
                    if self.test_object.using_test_data() is True:
                        is_it_test_entry = self.test_object.check_for_test_id_entry(
                            datasetID)
                        if is_it_test_entry is False:
                            counter = counter - 1
                            continue

            if 'genomicInformation' in datasample_record:
                genomicInformation = datasample_record.get('genomicInformation')
                if 'biosampleId' in genomicInformation:
                    biosampleId = genomicInformation.get('biosampleId')
                if 'bioSampleText' in genomicInformation:
                    biosampleText = genomicInformation.get('bioSampleText')

            if biosampleId is not None and biosampleId != '':
                biosample = {
                    "biosampleId": biosampleId,
                    "datasetSampleId": datasetSampleId
                }
                biosamples.append(biosample)

            # BUG FIX: the biosamplesTexts accumulator was reset to '' (a
            # string) on every record, so this append raised AttributeError
            # whenever a record carried text but no biosample id; also stop
            # shadowing the biosampleText scalar with the row dict.
            if biosampleText is not None and biosampleText != '' \
                    and biosampleId == '':
                biosample_text_row = {
                    "biosampleText": biosampleText,
                    "datasetSampleId": datasetSampleId
                }
                biosamplesTexts.append(biosample_text_row)

            if 'assemblyVersions' in datasample_record:
                for assembly in datasample_record.get('assemblyVersions'):
                    datasetsample = {
                        "datasetSampleId": datasetSampleId,
                        "assembly": assembly
                    }
                    assemblies.append(datasetsample)

            age = ''
            if 'sampleAge' in datasample_record:
                sampleAge = datasample_record.get('sampleAge')

                stageId = ""
                if 'age' in sampleAge:
                    age = sampleAge.get('age')
                    stageId = stageId + age
                if 'stage' in sampleAge:
                    stage_obj = sampleAge.get('stage')
                    stageId = stageId + stage_obj.get('stageName')
                    stage = {
                        "stageId": stageId,
                        "stageTermId": stage_obj.get('stageTermId'),
                        "stageName": stage_obj.get('stageName'),
                        "stageUberonSlimTerm": stage_obj.get('stageUberonSlimTerm'),
                        "sampleAge": age,
                        "datasetSampleId": datasetSampleId
                    }
                    stages.append(stage)
                else:
                    # No stage object: record the age-only stage row
                    # (stageName absent, so the stage MATCH will not fire).
                    stage = {"stageId": stageId, "sampleAge": age}
                    stages.append(stage)

            if 'sampleLocations' in datasample_record:
                sampleLocations = datasample_record.get('sampleLocations')

                for location in sampleLocations:
                    cellular_component_qualifier_term_id = location.get(
                        'cellularComponentQualifierTermId')
                    cellular_component_term_id = location.get(
                        'cellularComponentTermId')
                    anatomical_structure_term_id = location.get(
                        'anatomicalStructureTermId')
                    anatomical_structure_qualifier_term_id = location.get(
                        'anatomicalStructureQualifierTermId')
                    anatomical_sub_structure_term_id = location.get(
                        'anatomicalSubStructureTermId')
                    anatomical_sub_structure_qualifier_term_id = location.get(
                        'anatomicalSubStructureQualifierTermId')
                    where_expressed_statement = location.get(
                        'whereExpressedStatement')

                    # BUG FIX: normalise None before it is concatenated into
                    # the entity key below (previously the None check came
                    # after the concatenation and could raise TypeError).
                    if where_expressed_statement is None:
                        where_expressed_statement = ""

                    expression_unique_key = datasetSampleId
                    expression_entity_unique_key = ''

                    if anatomical_structure_term_id is not None:
                        expression_unique_key += anatomical_structure_term_id
                        expression_entity_unique_key = anatomical_structure_term_id

                    if anatomical_structure_qualifier_term_id is not None:
                        expression_unique_key += anatomical_structure_qualifier_term_id
                        expression_entity_unique_key += anatomical_structure_qualifier_term_id

                    if cellular_component_term_id is not None:
                        expression_unique_key += cellular_component_term_id
                        expression_entity_unique_key += cellular_component_term_id

                    if cellular_component_qualifier_term_id is not None:
                        expression_unique_key += cellular_component_qualifier_term_id
                        expression_entity_unique_key += cellular_component_qualifier_term_id

                    # NOTE(review): the sub-structure term id is folded into
                    # the sample key but not the entity key -- preserved as
                    # found; confirm intentional.
                    if anatomical_sub_structure_term_id is not None:
                        expression_unique_key += anatomical_sub_structure_term_id

                    if anatomical_sub_structure_qualifier_term_id is not None:
                        expression_unique_key += anatomical_sub_structure_qualifier_term_id
                        expression_entity_unique_key += anatomical_sub_structure_qualifier_term_id

                    expression_entity_unique_key += where_expressed_statement

                    if location.get('anatomicalStructureUberonSlimTermIds') is not None:
                        for uberon_structure_term_object in location.get(
                                'anatomicalStructureUberonSlimTermIds'):
                            structure_uberon_term_id = uberon_structure_term_object.get(
                                'uberonTerm')
                            if structure_uberon_term_id is not None \
                                    and structure_uberon_term_id != 'Other':
                                structure_uberon_term = {
                                    "ebe_uuid": expression_entity_unique_key,
                                    "aoUberonId": structure_uberon_term_id
                                }
                                uberon_ao_data.append(structure_uberon_term)
                            elif structure_uberon_term_id is not None \
                                    and structure_uberon_term_id == 'Other':
                                other_structure_uberon_term = {
                                    "ebe_uuid": expression_entity_unique_key
                                }
                                uberon_ao_other_data.append(
                                    other_structure_uberon_term)

                    if location.get('anatomicalSubStructureUberonSlimTermIds') is not None:
                        for uberon_sub_structure_term_object in location.get(
                                'anatomicalSubStructureUberonSlimTermIds'):
                            sub_structure_uberon_term_id = uberon_sub_structure_term_object.get(
                                'uberonTerm')
                            if sub_structure_uberon_term_id is not None \
                                    and sub_structure_uberon_term_id != 'Other':
                                sub_structure_uberon_term = {
                                    "ebe_uuid": expression_entity_unique_key,
                                    "aoUberonId": sub_structure_uberon_term_id
                                }
                                uberon_ao_data.append(sub_structure_uberon_term)
                            elif sub_structure_uberon_term_id is not None \
                                    and sub_structure_uberon_term_id == 'Other':
                                other_structure_uberon_term = {
                                    "ebe_uuid": expression_entity_unique_key
                                }
                                uberon_ao_other_data.append(
                                    other_structure_uberon_term)

                    if cellular_component_term_id is not None:
                        cc_term = {
                            "ebe_uuid": expression_entity_unique_key,
                            "cellularComponentTermId": cellular_component_term_id
                        }
                        cc_components.append(cc_term)

                    if cellular_component_qualifier_term_id is not None:
                        ccq_term = {
                            "ebe_uuid": expression_entity_unique_key,
                            "cellularComponentQualifierTermId":
                                cellular_component_qualifier_term_id
                        }
                        ccq_components.append(ccq_term)

                    if anatomical_structure_term_id is not None:
                        ao_term = {
                            "ebe_uuid": expression_entity_unique_key,
                            "anatomicalStructureTermId": anatomical_structure_term_id
                        }
                        ao_terms.append(ao_term)

                    if anatomical_structure_qualifier_term_id is not None:
                        ao_qualifier = {
                            "ebe_uuid": expression_entity_unique_key,
                            "anatomicalStructureQualifierTermId":
                                anatomical_structure_qualifier_term_id
                        }
                        ao_qualifiers.append(ao_qualifier)

                    if anatomical_sub_structure_term_id is not None:
                        ao_substructure = {
                            "ebe_uuid": expression_entity_unique_key,
                            "anatomicalSubStructureTermId":
                                anatomical_sub_structure_term_id
                        }
                        ao_substructures.append(ao_substructure)

                    if anatomical_sub_structure_qualifier_term_id is not None:
                        ao_ss_qualifier = {
                            "ebe_uuid": expression_entity_unique_key,
                            "anatomicalSubStructureQualifierTermId":
                                anatomical_sub_structure_qualifier_term_id
                        }
                        ao_ss_qualifiers.append(ao_ss_qualifier)

                    bio_entity = {
                        "ebe_uuid": expression_entity_unique_key,
                        "whereExpressedStatement": where_expressed_statement,
                        "datasetSampleId": datasetSampleId
                    }
                    bio_entities.append(bio_entity)

            htp_dataset_sample = {
                "datasetSampleId": datasetSampleId,
                "abundance": datasample_record.get('abundance'),
                "sampleType": datasample_record.get('sampleType'),
                "taxonId": datasample_record.get('taxonId'),
                "sex": datasample_record.get('sex'),
                "assayType": datasample_record.get('assayType'),
                "notes": datasample_record.get('notes'),
                "dateAssigned": datasample_record.get('dateAssigned'),
                "sequencingFormat": datasample_record.get('sequencingFormat'),
                "sampleTitle": sampleTitle,
                "sampleAge": age
            }
            htp_datasetsamples.append(htp_dataset_sample)

            # Flush a full batch and reset every accumulator.
            if counter == batch_size:
                yield [
                    htp_datasetsamples, bio_entities, secondaryIds, datasetIds,
                    stages, ao_terms, ao_substructures, ao_qualifiers,
                    ao_ss_qualifiers, cc_components, ccq_components,
                    uberon_ao_data, uberon_ao_other_data, biosamples,
                    biosamplesTexts, assemblies,
                ]
                counter = 0
                htp_datasetsamples = []
                datasetIds = []
                uberon_ao_data = []
                ao_qualifiers = []
                bio_entities = []
                ao_ss_qualifiers = []
                ao_substructures = []
                ao_terms = []
                uberon_ao_other_data = []
                stages = []
                ccq_components = []
                cc_components = []
                biosamples = []
                assemblies = []
                # BUG FIX: these two were not reset, so earlier batches'
                # rows were re-emitted in every later batch.
                secondaryIds = []
                biosamplesTexts = []

        # Flush the final partial batch.
        if counter > 0:
            yield [
                htp_datasetsamples, bio_entities, secondaryIds, datasetIds,
                stages, ao_terms, ao_substructures, ao_qualifiers,
                ao_ss_qualifiers, cc_components, ccq_components,
                uberon_ao_data, uberon_ao_other_data, biosamples,
                biosamplesTexts, assemblies
            ]
class HTPMetaDatasetETL(ETL):
    """HTP dataset metadata ETL.

    Loads high-throughput dataset metadata (HTPDataset nodes) together with
    their category tags, publications, secondary ids and cross references
    from MOD JSON submissions into Neo4j.
    """

    # Class-level logger, consistent with the sibling ETL classes in this
    # module (the original used the module-level ``logger`` name).
    logger = logging.getLogger(__name__)

    # Query templates which take params and will be processed later.
    htp_dataset_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            CREATE (ds:HTPDataset {primaryKey:row.datasetId})
            SET ds.dateAssigned = row.dateAssigned,
                ds.summary = row.summary,
                ds.numChannels = row.numChannels,
                ds.subSeries = row.subSeries """

    htp_dataset_pub_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (ds:HTPDataset {primaryKey: row.datasetId})
            MERGE (p:Publication {primaryKey: row.pubPrimaryKey})
                ON CREATE SET p.pubModId = row.pubModId,
                    p.pubMedId = row.pubMedId,
                    p.pubModUrl = row.pubModUrl,
                    p.pubMedUrl = row.pubMedUrl
            MERGE (p)-[:ASSOCIATION]-(ds) """

    htp_category_tags_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (ds:HTPDataset {primaryKey:row.datasetId})
            MERGE (ct:CategoryTag {primaryKey:row.tag})
            MERGE (ds)-[:CATEGORY_TAG]-(ct) """

    htp_secondaryIds_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (ds:HTPDataset {primaryKey: row.datasetId})
            MERGE (s:SecondaryId:Identifier {primaryKey:row.secondaryId})
                ON CREATE SET s.name = row.secondaryId
            MERGE (ds)-[aka:ALSO_KNOWN_AS]-(s) """

    htpdataset_xrefs_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (o:HTPDataset {primaryKey:row.datasetId}) """ + ETLHelper.get_cypher_xref_text()

    def __init__(self, config):
        """Initialise object."""
        super().__init__()
        self.data_type_config = config

    def _load_and_process_data(self):
        """Spawn one worker process per sub type and wait for them all."""
        thread_pool = []

        for sub_type in self.data_type_config.get_sub_type_objects():
            process = multiprocessing.Process(target=self._process_sub_type,
                                              args=(sub_type, ))
            process.start()
            thread_pool.append(process)

        ETL.wait_for_threads(thread_pool)

    def _process_sub_type(self, sub_type):
        """Load one sub type's JSON file and run its queries."""
        # Lazy %-style logger args (was eager "..." % value interpolation).
        self.logger.info("Loading HTP metadata Data: %s",
                         sub_type.get_data_provider())
        filepath = sub_type.get_filepath()
        self.logger.info(filepath)
        data = JSONFile().get_data(filepath)
        self.logger.info("Finished Loading HTP metadata Data: %s",
                         sub_type.get_data_provider())

        if data is None:
            # warning() replaces the deprecated warn() alias.
            self.logger.warning("No Data found for %s skipping",
                                sub_type.get_data_provider())
            return

        # This order is the same as the lists yielded from the get_generators function.
        # A list of tuples.
        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # This needs to be in this format (template, param1, params2) others will be ignored
        query_list = [
            [
                HTPMetaDatasetETL.htp_dataset_query_template, commit_size,
                "htp_metadataset_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetETL.htp_category_tags_query_template, commit_size,
                "htp_metadataset_tags_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetETL.htp_dataset_pub_query_template, commit_size,
                "htp_metadataset_publications_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetETL.htpdataset_xrefs_template, commit_size,
                "htp_metadataset_xrefs_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                HTPMetaDatasetETL.htp_secondaryIds_query_template, commit_size,
                "htp_metadataset_secondaryIds_" + sub_type.get_data_provider() + ".csv"
            ],
        ]

        # Obtain the generator
        generators = self.get_generators(data, batch_size)

        query_and_file_list = self.process_query_params(query_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)

    def get_generators(self, htp_dataset_data, batch_size):
        """Yield batched row lists for the CSV writer.

        Each yield is [datasets, tags, publications, cross references,
        secondary ids] — the same order as the templates in query_list.
        """
        dataset_tags = []
        htp_datasets = []
        publications = []
        secondaryIds = []
        cross_reference_list = []
        counter = 0

        data_provider_object = htp_dataset_data['metaData']['dataProvider']
        data_provider_cross_ref = data_provider_object.get('crossReference')
        data_provider = data_provider_cross_ref.get('id')

        for dataset_record in htp_dataset_data['data']:
            counter = counter + 1
            dataset = dataset_record.get('datasetId')
            datasetId = dataset.get('primaryId')

            # spoke to RGD and they wish to remove these datasets as they
            # overlap with SGD.
            if (datasetId == 'GEO:GSE18157' or datasetId == 'GEO:GSE33497') \
                    and data_provider == 'RGD':
                continue

            if 'secondaryIds' in dataset:
                for secId in dataset.get('secondaryIds'):
                    secid = {"datasetId": datasetId, "secondaryId": secId}
                    secondaryIds.append(secid)

            # In test mode, drop (and un-count) records that are not in the
            # configured test id set.
            if self.test_object.using_test_data() is True:
                is_it_test_entry = self.test_object.check_for_test_id_entry(datasetId)
                if is_it_test_entry is False:
                    counter = counter - 1
                    continue

            if 'crossReference' in dataset:
                crossRefO = dataset.get('crossReference')
                if crossRefO is not None:
                    crossRefId = crossRefO.get('id')
                    local_crossref_id = crossRefId.split(":")[1]
                    prefix = crossRefId.split(":")[0]
                    pages = crossRefO.get('pages')

                    # some pages collection have 0 elements
                    if pages is not None and len(pages) > 0:
                        for page in pages:
                            mod_global_cross_ref_url = \
                                self.etlh.rdh2.return_url_from_key_value(
                                    prefix, local_crossref_id, page)
                            xref = ETLHelper.get_xref_dict(
                                local_crossref_id, prefix, page, page,
                                crossRefId, mod_global_cross_ref_url,
                                crossRefId + page)
                            xref['dataId'] = datasetId
                            cross_reference_list.append(xref)

            category_tags = dataset_record.get('categoryTags')
            if category_tags is not None:
                for tag in category_tags:
                    dataset_category_tag = {"datasetId": datasetId, "tag": tag}
                    dataset_tags.append(dataset_category_tag)

            publicationNew = dataset_record.get('publications')
            if publicationNew is not None:
                for pub in publicationNew:
                    pid = pub.get('publicationId')
                    publication_mod_id = ""
                    pub_med_id = ""
                    pub_mod_url = ""
                    pub_med_url = ""

                    if pid is not None and pid.startswith('PMID:'):
                        pub_med_id = pid
                        local_pub_med_id = pub_med_id.split(":")[1]
                        # Single assignment (the original had a duplicated
                        # "pub_med_url = pub_med_url = ...").
                        pub_med_url = self.etlh.get_no_page_complete_url(
                            local_pub_med_id, 'PMID', pub_med_id)
                        if 'crossReference' in pub:
                            pub_xref = pub.get('crossReference')
                            publication_mod_id = pub_xref.get('id')
                            pub_mod_url = self.etlh.rdh2.return_url_from_identifier(
                                publication_mod_id)
                    elif pid is not None and not pid.startswith('PMID:'):
                        publication_mod_id = pub.get('publicationId')
                        pub_mod_url = self.etlh.rdh2.return_url_from_identifier(
                            publication_mod_id)

                    publication = {
                        "datasetId": datasetId,
                        "pubPrimaryKey": publication_mod_id + pub_med_id,
                        "pubModId": publication_mod_id,
                        "pubMedId": pub_med_id,
                        "pubMedUrl": pub_med_url,
                        "pubModUrl": pub_mod_url
                    }
                    publications.append(publication)

            htp_dataset = {
                "datasetId": datasetId,
                "dateAssigned": dataset_record.get('dateAssigned'),
                "title": dataset_record.get('title'),
                "summary": dataset_record.get('summary'),
                "numChannels": dataset_record.get('numChannels'),
                "subSeries": dataset_record.get('subSeries')
            }
            htp_datasets.append(htp_dataset)

            if counter == batch_size:
                yield [
                    htp_datasets, dataset_tags, publications,
                    cross_reference_list, secondaryIds
                ]
                counter = 0
                htp_datasets = []
                dataset_tags = []
                publications = []
                cross_reference_list = []
                secondaryIds = []

        if counter > 0:
            yield [
                htp_datasets, dataset_tags, publications,
                cross_reference_list, secondaryIds
            ]
class VariationETL(ETL):
    """Variation ETL.

    Loads Variant nodes (keyed on the HGVS nomenclature computed here),
    their genomic locations, SO-term links and cross references from MOD
    JSON submissions.
    """

    logger = logging.getLogger(__name__)

    # Query templates which take params and will be processed later
    variation_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (a:Allele {primaryKey:row.alleleId})
            MATCH (g:Gene)-[:IS_ALLELE_OF]-(a)

            //Create the variant node and set properties. primaryKey is required.
            MERGE (o:Variant {primaryKey:row.hgvs_nomenclature})
                ON CREATE SET
                 o.name = row.variantHGVSSynonym,
                 o.hgvsNomenclature = row.hgvs_nomenclature,
                 o.genomicReferenceSequence = row.genomicReferenceSequence,
                 o.paddingLeft = row.paddingLeft,
                 o.paddingRight = row.paddingRight,
                 o.genomicVariantSequence = row.genomicVariantSequence,
                 o.dateProduced = row.dateProduced,
                 o.release = row.release,
                 o.dataProviders = row.dataProviders,
                 o.dataProvider = row.dataProvider
            MERGE (s:Synonym:Identifier {primaryKey:row.hgvs_nomenclature})
                SET s.name = row.hgvs_nomenclature
            MERGE (o)-[aka2:ALSO_KNOWN_AS]->(s)
            MERGE (o)-[:VARIATION]->(a)
            MERGE (g)-[:COMPUTED_GENE]->(o) """

    so_terms_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (o:Variant {primaryKey:row.variantId})
            MATCH (s:SOTerm {primaryKey:row.soTermId})
            CREATE (o)-[:VARIATION_TYPE]->(s)"""

    genomic_locations_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (o:Variant {primaryKey:row.variantId})
            MATCH (chrm:Chromosome {primaryKey:row.chromosome})
            MERGE (a:Assembly {primaryKey:row.assembly})
                ON CREATE SET a.dataProvider = row.dataProvider
            CREATE (o)-[gchrm:LOCATED_ON]->(chrm)
            CREATE (gchrmn:GenomicLocation {primaryKey:row.uuid})
                SET gchrmn.start = apoc.number.parseInt(row.start),
                    gchrmn.end = apoc.number.parseInt(row.end),
                    gchrmn.assembly = row.assembly,
                    gchrmn.strand = row.strand,
                    gchrmn.chromosome = row.chromosome
            CREATE (o)-[of:ASSOCIATION]->(gchrmn)
            CREATE (gchrmn)-[ofc:ASSOCIATION]->(chrm)
            CREATE (gchrmn)-[ao:ASSOCIATION]->(a) """

    xrefs_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (o:Variant {primaryKey:row.dataId}) """ + ETLHelper.get_cypher_xref_text()

    def __init__(self, config):
        """Initialise object."""
        super().__init__()
        self.data_type_config = config

    def _load_and_process_data(self):
        """Spawn one worker process per sub type and wait for them all."""
        thread_pool = []

        for sub_type in self.data_type_config.get_sub_type_objects():
            process = multiprocessing.Process(target=self._process_sub_type,
                                              args=(sub_type, ))
            process.start()
            thread_pool.append(process)

        ETL.wait_for_threads(thread_pool)

    def _process_sub_type(self, sub_type):
        """Load one sub type's JSON file and run its queries."""
        self.logger.info("Loading Variation Data: %s",
                         sub_type.get_data_provider())
        filepath = sub_type.get_filepath()
        data = JSONFile().get_data(filepath)
        self.logger.info("Finished Loading Variation Data: %s",
                         sub_type.get_data_provider())

        if data is None:
            self.logger.warning("No Data found for %s skipping",
                                sub_type.get_data_provider())
            return

        # This order is the same as the lists yielded from the get_generators function.
        # A list of tuples.
        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # This needs to be in this format (template, param1, params2) others will be ignored
        query_template_list = [
            [
                self.variation_query_template, commit_size,
                "variation_data_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                self.genomic_locations_query_template, commit_size,
                "variant_genomiclocations_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                self.so_terms_query_template, commit_size,
                "variant_so_terms_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                self.xrefs_query_template, commit_size,
                "variant_xrefs_" + sub_type.get_data_provider() + ".csv"
            ]
        ]

        generators = self.get_generators(data, batch_size)

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
        self.error_messages("Var-{}: ".format(sub_type.get_data_provider()))

    def get_hgvs_nomenclature(self, refseq_id, variant_type, start_position,
                              end_position, reference_sequence,
                              variant_sequence, assembly, chromosome):
        """Get HGVS nomenclature.

        Builds a genomic ("g.") HGVS string plus an "(assembly)chromosome:"
        synonym for the SO variant types handled below; any other variant
        type yields ('', '').
        """
        # Normalise optional pieces to strings so concatenation is safe.
        if start_position is None:
            start_position_str = ""
        else:
            start_position_str = str(start_position)

        if end_position is None:
            end_position_str = ""
        else:
            end_position_str = str(end_position)

        if variant_sequence is None:
            variant_sequence_str = ""
        else:
            variant_sequence_str = variant_sequence

        if reference_sequence is None:
            reference_sequence_str = ""
        else:
            reference_sequence_str = reference_sequence

        # e.g. "NC_000001.11:g.123" — the refseq accession without its prefix.
        hgvs_nomenclature = refseq_id.split(":")[1] + ':g.' + start_position_str
        hgvs_synonym = '(' + assembly + ')' + chromosome + ':' + start_position_str

        if variant_type in ['SO:1000002', 'SO:1000008']:  # point mutation/substitution
            hgvs_nomenclature += reference_sequence_str + ">" + variant_sequence_str
            hgvs_synonym += reference_sequence_str + ">" + variant_sequence_str
        elif variant_type == "SO:0000667":  # insertion
            hgvs_nomenclature += '_' + end_position_str + 'ins' + variant_sequence_str
            hgvs_synonym += '_' + end_position_str + 'ins' + variant_sequence_str
        elif variant_type == "SO:0000159":  # deletion
            hgvs_nomenclature += '_' + end_position_str + 'del'
            hgvs_synonym += '_' + end_position_str + 'del'
        elif variant_type == "SO:0002007":  # MNV
            hgvs_nomenclature += '_' + end_position_str + 'delins' + variant_sequence_str
            hgvs_synonym += '_' + end_position_str + 'delins' + variant_sequence_str
        elif variant_type == "SO:1000032":  # DELIN
            hgvs_nomenclature += '_' + end_position_str + 'delins' + variant_sequence_str
            hgvs_synonym += '_' + end_position_str + 'delins' + variant_sequence_str
        else:
            hgvs_nomenclature = ''
            hgvs_synonym = ''

        return hgvs_nomenclature, hgvs_synonym

    def get_generators(self, variant_data, batch_size):  # noqa
        """Get Generators.

        Yields [variants, genomic locations, SO terms, cross references]
        batches in the same order as query_template_list.
        """
        data_providers = []
        release = ""
        variants = []
        variant_genomic_locations = []
        variant_so_terms = []
        cross_references = []
        counter = 0
        date_produced = variant_data['metaData']['dateProduced']
        self.data_providers_process(variant_data)
        load_key = date_produced + self.data_provider + "_VARIATION"

        if 'release' in variant_data['metaData']:
            release = variant_data['metaData']['release']

        assemblies = {}
        for allele_record in variant_data['data']:
            chromosome = allele_record["chromosome"]
            if chromosome.startswith("chr"):
                chromosome_str = chromosome[3:]
            else:
                chromosome_str = chromosome

            assembly = allele_record["assembly"]

            # Lazily build one AssemblySequenceHelper per assembly seen.
            if assembly not in assemblies:
                self.logger.info(assembly)
                context_info = ContextInfo()
                data_manager = DataFileManager(context_info.config_file_location)
                assemblies[assembly] = AssemblySequenceHelper(assembly, data_manager)

            so_term_id = allele_record.get('type')
            genomic_reference_sequence = allele_record.get('genomicReferenceSequence')
            genomic_variant_sequence = allele_record.get('genomicVariantSequence')

            if genomic_reference_sequence == 'N/A':
                genomic_reference_sequence = ""
            if genomic_variant_sequence == 'N/A':
                genomic_variant_sequence = ""

            padding_left = ""
            padding_right = ""
            if allele_record.get('start') != "" and allele_record.get('end') != "":

                # not insertion
                if so_term_id != "SO:0000667" \
                        and chromosome_str != "Unmapped_Scaffold_8_D1580_D1567":
                    genomic_reference_sequence = assemblies[assembly].get_sequence(
                        chromosome_str,
                        allele_record.get('start'),
                        allele_record.get('end'))

                if allele_record.get('start') < allele_record.get('end'):
                    start = allele_record.get('start')
                    end = allele_record.get('end')
                else:
                    start = allele_record.get('end')
                    end = allele_record.get('start')

                padding_width = 500
                if so_term_id != "SO:0000667":  # not insertion
                    start = start - 1
                    end = end + 1

                left_padding_start = start - padding_width
                if left_padding_start < 1:
                    left_padding_start = 1

                padding_left = assemblies[assembly].get_sequence(
                    chromosome_str, left_padding_start, start)
                right_padding_end = end + padding_width
                padding_right = assemblies[assembly].get_sequence(
                    chromosome_str, end, right_padding_end)

            counter = counter + 1
            global_id = allele_record.get('alleleId')
            mod_global_cross_ref_id = ""

            if self.test_object.using_test_data() is True:
                is_it_test_entry = self.test_object.check_for_test_id_entry(global_id)
                if is_it_test_entry is False:
                    counter = counter - 1
                    continue

            cross_ref_primary_id = allele_record.get('sequenceOfReferenceAccessionNumber')
            local_cross_ref_id = cross_ref_primary_id.split(":")[1]
            prefix = cross_ref_primary_id.split(":")[0]

            cross_ref_complete_url = self.etlh.get_no_page_complete_url(
                local_cross_ref_id, prefix, global_id)
            xref_map = ETLHelper.get_xref_dict(
                local_cross_ref_id, prefix,
                "variant_sequence_of_reference",
                "sequence_of_reference_accession_number",
                global_id,
                cross_ref_complete_url,
                cross_ref_primary_id + "variant_sequence_of_reference")
            xref_map['dataId'] = global_id

            # BUGFIX: cross_references used to be re-cleared on every record,
            # so each yielded batch carried only the final record's xref.
            # It now accumulates per batch like the other lists.
            if cross_ref_primary_id is not None:
                cross_references.append(xref_map)

            if genomic_reference_sequence is not None:
                if len(genomic_reference_sequence) > 1000 and (
                        allele_record.get('type') == 'SO:1000002'
                        or allele_record.get('type') == 'SO:1000008'):
                    self.logger.debug("%s genomicReferenceSequence",
                                      allele_record.get('alleleId'))

            if genomic_variant_sequence is not None:
                if len(genomic_variant_sequence) > 1000 and (
                        allele_record.get('type') in ['SO:1000002', 'SO:1000008']):
                    self.logger.debug("%s genomicVariantSequence",
                                      allele_record.get('alleleId'))

            hgvs_nomenclature, hgvs_synonym = self.get_hgvs_nomenclature(
                allele_record.get('sequenceOfReferenceAccessionNumber'),
                allele_record.get('type'),
                allele_record.get('start'),
                allele_record.get('end'),
                genomic_reference_sequence,
                genomic_variant_sequence,
                allele_record.get('assembly'),
                chromosome_str)

            # BUGFIX: the second "> 30000" used to sit outside the
            # parentheses, comparing the boolean-and expression to 30000
            # instead of the sequence length.
            if (genomic_reference_sequence is not None
                    and len(genomic_reference_sequence) > 30000) \
                    or (genomic_variant_sequence is not None
                        and len(genomic_variant_sequence) > 30000):
                self.logger.debug("%s has too long of a sequence potentionally",
                                  allele_record.get('alleleId'))

            # TODO: fix typo in MGI Submission for this variant so
            # that it doesn't list a 40K bp point mutation.
            if allele_record.get('alleleId') != 'MGI:6113870':
                variant_dataset = {
                    "hgvs_nomenclature": hgvs_nomenclature,
                    "genomicReferenceSequence": genomic_reference_sequence,
                    "genomicVariantSequence": genomic_variant_sequence,
                    "paddingLeft": padding_left,
                    "paddingRight": padding_right,
                    "alleleId": allele_record.get('alleleId'),
                    "dataProviders": data_providers,
                    "dateProduced": date_produced,
                    "loadKey": load_key,
                    "release": release,
                    "modGlobalCrossRefId": mod_global_cross_ref_id,
                    "dataProvider": self.data_provider,
                    "variantHGVSSynonym": hgvs_synonym
                }

                variant_genomic_location_dataset = {
                    "variantId": hgvs_nomenclature,
                    "assembly": allele_record.get('assembly'),
                    "chromosome": chromosome_str,
                    "start": allele_record.get('start'),
                    "end": allele_record.get('end'),
                    "uuid": str(uuid.uuid4()),
                    "dataProvider": self.data_provider
                }

                variant_so_term = {
                    "variantId": hgvs_nomenclature,
                    "soTermId": allele_record.get('type')
                }

                variant_so_terms.append(variant_so_term)
                variant_genomic_locations.append(variant_genomic_location_dataset)
                variants.append(variant_dataset)

            if counter == batch_size:
                yield [
                    variants, variant_genomic_locations, variant_so_terms,
                    cross_references
                ]
                # BUGFIX: counter was never reset here, so after the first
                # batch "counter == batch_size" never matched again and all
                # remaining records accumulated until the final yield.
                counter = 0
                variants = []
                variant_genomic_locations = []
                variant_so_terms = []
                cross_references = []

        if counter > 0:
            yield [
                variants, variant_genomic_locations, variant_so_terms,
                cross_references
            ]
class DOETL(ETL):
    """DO ETL.

    Loads the Disease Ontology (DOTerm nodes) plus synonyms, is_a edges,
    cross references and secondary ids from an OBO/JSON ontology file.
    """

    logger = logging.getLogger(__name__)

    # Query templates which take params and will be processed later

    do_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

            //Create the DOTerm node and set properties. primaryKey is required.
            MERGE (doterm:DOTerm:Ontology {primaryKey:row.oid})
                SET doterm.name = row.name,
                 doterm.nameKey = row.name_key,
                 doterm.definition = row.definition,
                 doterm.defLinks = apoc.convert.fromJsonList(row.defLinksProcessed),
                 doterm.isObsolete = row.is_obsolete,
                 doterm.subset = row.subset,
                 doterm.doDisplayId = row.oid,
                 doterm.doUrl = row.oUrl,
                 doterm.doPrefix = "DOID",
                 doterm.doId = row.oid,
                 doterm.rgdLink = row.rgd_link,
                 doterm.ratOnlyRgdLink = row.rat_only_rgd_link,
                 doterm.humanOnlyRgdLink = row.human_only_rgd_link,
                 doterm.mgiLink = row.mgi_link,
                 doterm.zfinLink = row.zfin_link,
                 doterm.flybaseLink = row.flybase_link,
                 doterm.wormbaseLink = row.wormbase_link,
                 doterm.sgdLink = row.sgd_link
            MERGE (doterm)-[ggcg:IS_A_PART_OF_CLOSURE]->(doterm)"""

    doterm_synonyms_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (d:DOTerm {primaryKey:row.primary_id})
            MERGE (syn:Synonym:Identifier {primaryKey:row.synonym})
                SET syn.name = row.synonym
            MERGE (d)-[aka2:ALSO_KNOWN_AS]->(syn) """

    doterm_isas_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (d1:DOTerm:Ontology {primaryKey:row.primary_id})
            MATCH (d2:DOTerm:Ontology {primaryKey:row.primary_id2})
            MERGE (d1)-[aka:IS_A]->(d2) """

    xrefs_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (o:DOTerm {primaryKey:row.oid}) """ + ETLHelper.get_cypher_xref_text()

    doterm_alt_ids_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (d:DOTerm {primaryKey:row.primary_id})
            MERGE (sec:SecondaryId:Identifier {primaryKey:row.secondary_id})
            MERGE (d)-[aka2:ALSO_KNOWN_AS]->(sec) """

    def __init__(self, config):
        """Initialise object."""
        super().__init__()
        self.data_type_config = config

    def _load_and_process_data(self):
        """Run the DO queries over batches produced from the ontology file."""
        filepath = self.data_type_config.get_single_filepath()

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        generators = self.get_generators(filepath, batch_size)

        # Order must match the lists yielded from get_generators.
        query_template_list = [
            [self.do_query_template, commit_size, "do_term_data.csv"],
            [self.doterm_isas_query_template, commit_size, "do_isas_data.csv"],
            [
                self.doterm_synonyms_query_template, commit_size,
                "do_synonyms_data.csv"
            ],
            [self.xrefs_query_template, commit_size, "do_xrefs_data.csv"],
            [
                self.doterm_alt_ids_query_template, commit_size,
                "do_alt_ids_data.csv"
            ]
        ]

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
        self.error_messages("DO-?: ")

    def get_generators(self, filepath, batch_size):  # noqa TODO:Needs splitting up really
        """Get Generators."""
        ont = OntologyFactory().create(filepath)
        # NOTE(review): ".node" (not ".nodes") on a copied graph looks like
        # the pre-2.4 networkx node-attribute dict API — confirm the pinned
        # ontobio/networkx versions still expose it.
        parsed_line = ont.graph.copy().node

        do_term_list = []
        do_isas_list = []
        do_synonyms_list = []
        do_alt_ids_list = []
        xrefs = []
        counter = 0

        # Convert parsed obo term into a schema-friendly AGR dictionary.
        # (Only the key is used; the dict value from parsed_line is ignored.)
        for key, line in parsed_line.items():
            counter = counter + 1
            node = ont.graph.node[key]
            if len(node) == 0:
                continue

            # Switching id to curie form and saving URI in "uri"
            # - might wildly break things later on???
            node["uri"] = node["id"]
            node["id"] = key

            # Per-term accumulators / defaults.
            syns = []
            def_links_unprocessed = []
            def_links_processed = []
            subset = []
            definition = ""
            is_obsolete = "false"
            ident = key

            if "meta" in node:
                if "synonyms" in node["meta"]:
                    syns = [s["val"] for s in node["meta"]["synonyms"]]
                    for synonym in syns:
                        do_synonym = {"primary_id": key, "synonym": synonym}
                        do_synonyms_list.append(do_synonym)
                if "basicPropertyValues" in node["meta"]:
                    # Secondary ids are the basicPropertyValues that look
                    # like DOID curies.
                    alt_ids = [
                        s["val"] for s in node["meta"]["basicPropertyValues"]
                    ]
                    for alt_id in alt_ids:
                        if "DOID:" in alt_id:
                            secondary_id = {
                                "primary_id": key,
                                "secondary_id": alt_id
                            }
                            do_alt_ids_list.append(secondary_id)
                if "xrefs" in node["meta"]:
                    o_xrefs = node["meta"].get('xrefs')
                    self.ortho_xrefs(o_xrefs, ident, xrefs)
                if node["meta"].get('is_obsolete'):
                    is_obsolete = "true"
                elif node["meta"].get('deprecated'):
                    is_obsolete = "true"
                if "definition" in node["meta"]:
                    definition = node["meta"]["definition"]["val"]
                    def_links_unprocessed = node["meta"]["definition"]["xrefs"]
                if "subsets" in node["meta"]:
                    new_subset = node['meta'].get('subsets')
                    if isinstance(new_subset, (list, tuple)):
                        subset = new_subset
                    else:
                        if new_subset is not None:
                            subset.append(new_subset)
                if len(subset) > 1:
                    # Strip any "...#" prefix from subset names.
                    converted_subsets = []
                    for subset_str in subset:
                        if "#" in subset_str:
                            subset_str = subset_str.split("#")[-1]
                        converted_subsets.append(subset_str)
                    subset = converted_subsets

            all_parents = ont.parents(key)
            all_parents.append(key)

            # Improves performance when traversing relations
            all_parents_subont = ont.subontology(all_parents)

            isas_without_names = all_parents_subont.parents(
                key, relations=['subClassOf'])
            for item in isas_without_names:
                dictionary = {"primary_id": key, "primary_id2": item}
                do_isas_list.append(dictionary)

            def_links_processed = []
            def_links = ""
            if definition is None:
                definition = ""
            else:
                # Remove new lines that cause this to split across two lines in the file
                # definition = definition.replace('\n', ' ')

                # Remove any extra double space that might have been
                # introduces in the last replace
                # definition = definition.replace('  ', ' ')

                # Pull "[...]" link blocks out of quoted definitions;
                # the split ignores escaped quotes (\").
                if definition is not None and "\"" in definition:
                    split_definition = re.split(r'(?<!\\)"', definition)
                    if len(split_definition) > 1:
                        if len(split_definition
                               ) > 2 and "[" in split_definition[2].strip():
                            def_links = split_definition[2].strip()
                            def_links = def_links.rstrip("]").replace("[", "")
                            def_links_unprocessed.append(def_links)

            # Normalise the raw definition xref strings into plain http urls.
            for def_link in def_links_unprocessed:
                def_link = def_link.replace("url:www", "http://www")
                def_link = def_link.replace("url:", "")
                def_link = def_link.replace("URL:", "")
                def_link = def_link.replace("\\:", ":")
                def_link = def_link.replace('\\', '')

                if "," in def_link:
                    def_link = def_link.split(",")
                    for link in def_link:
                        if link.strip().startswith("http"):
                            def_links_processed.append(link)
                else:
                    if def_link.strip().startswith("http"):
                        def_links_processed.append(def_link)

            # TODO: make this a generic section based on the resourceDescriptor.yaml file.
            # need to have MODs add disease pages to their yaml stanzas

            # NU: alt_ids = node.get('alt_id')
            # if alt_ids:
            #     if not isinstance(alt_ids, (list, tuple)):
            #         alt_ids = [alt_ids]
            # else:
            #     alt_ids = []

            # TODO: Need to add urls to resource Descriptis for SGD and MGI.
            # NOTE: MGI had one but has 'MGI:' at the end of the url not required here.

            dict_to_append = {
                'oid':
                    node['id'],
                'name':
                    node.get('label'),
                'name_key':
                    node.get('label'),
                'definition':
                    definition,
                'defLinksProcessed':
                    def_links_processed,
                'is_obsolete':
                    is_obsolete,
                'subset':
                    subset,
                'oUrl':
                    self.etlh.rdh2.return_url_from_key_value('DOID', node['id']),
                'rgd_link':
                    self.etlh.rdh2.return_url_from_key_value(
                        'RGD', node['id'], 'disease/all'),
                'rat_only_rgd_link':
                    self.etlh.rdh2.return_url_from_key_value(
                        'RGD', node['id'], 'disease/rat'),
                'human_only_rgd_link':
                    self.etlh.rdh2.return_url_from_key_value(
                        'RGD', node['id'], 'disease/human'),
                'mgi_link':
                    'http://www.informatics.jax.org/disease/' + node['id'],
                'zfin_link':
                    self.etlh.rdh2.return_url_from_key_value(
                        'ZFIN', node['id'], 'disease'),
                'flybase_link':
                    self.etlh.rdh2.return_url_from_key_value(
                        'FB', node['id'], 'disease'),
                'wormbase_link':
                    self.etlh.rdh2.return_url_from_key_value(
                        'WB', node['id'], 'disease'),
                'sgd_link':
                    'https://yeastgenome.org/disease/' + node['id']
            }
            do_term_list.append(dict_to_append)

            # Emit a full batch and reset the accumulators.
            if counter == batch_size:
                yield [
                    do_term_list, do_isas_list, do_synonyms_list, xrefs,
                    do_alt_ids_list
                ]
                do_term_list = []
                do_isas_list = []
                do_synonyms_list = []
                do_alt_ids_list = []
                xrefs = []
                counter = 0

        # Final partial batch.
        if counter > 0:
            yield [
                do_term_list, do_isas_list, do_synonyms_list, xrefs,
                do_alt_ids_list
            ]
class ExpressionAtlasETL(ETL):
    """Expression Atlas ETL.

    Reads an Expression Atlas sitemap per data provider and attaches
    "gene/expression-atlas" cross references to matching Alliance genes.
    """

    logger = logging.getLogger(__name__)

    # Querys which do not take params and can be used as is
    get_all_gene_primary_to_ensmbl_ids_query = """
        MATCH (g:Gene)-[:CROSS_REFERENCE]-(c:CrossReference)
        WHERE c.prefix = 'ENSEMBL'
        RETURN g.primaryKey, c.localId"""

    get_mod_gene_symbol_to_primary_ids_query = """
        MATCH (g:Gene)
        WHERE g.dataProvider = {parameter}
        RETURN g.primaryKey, g.symbol"""

    get_genes_with_expression_atlas_links_query = """
        MATCH (g:Gene)
        WHERE LOWER(g.primaryKey) IN {parameter}
        RETURN g.primaryKey, g.modLocalId"""

    # Query templates which take params and will be processed later
    add_expression_atlas_crossreferences_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (o:Gene)
            WHERE o.primaryKey = row.genePrimaryKey """ + ETLHelper.get_cypher_xref_text()

    def __init__(self, config):
        """Initialise object."""
        super().__init__()
        self.data_type_config = config

    def _load_and_process_data(self):
        """Spawn one worker per sub type, sharing the ENSEMBL -> gene map."""
        thread_pool = []

        # Built once so every worker process reuses the same lookup.
        ensg_to_gene_primary_id_map = self._get_primary_gene_ids_to_ensembl_ids()

        for sub_type in self.data_type_config.get_sub_type_objects():
            process = multiprocessing.Process(
                target=self._process_sub_type,
                args=(sub_type, ensg_to_gene_primary_id_map))
            process.start()
            thread_pool.append(process)

        ETL.wait_for_threads(thread_pool)

    @staticmethod
    def _get_primary_gene_ids_to_ensembl_ids():
        """Map lower-cased ENSEMBL local ids to gene primary keys."""
        return_set = Neo4jHelper.run_single_query(
            ExpressionAtlasETL.get_all_gene_primary_to_ensmbl_ids_query)
        return {
            record["c.localId"].lower(): record["g.primaryKey"]
            for record in return_set
        }

    @staticmethod
    def _get_mod_gene_symbol_to_primary_ids(data_provider):
        """Map lower-cased MOD gene symbols to gene primary keys."""
        return_set = Neo4jHelper.run_single_parameter_query(
            ExpressionAtlasETL.get_mod_gene_symbol_to_primary_ids_query,
            data_provider)
        return {
            record["g.symbol"].lower(): record["g.primaryKey"]
            for record in return_set
        }

    # Returns only pages for genes that we have in the Alliance
    def _get_expression_atlas_gene_pages(self, sub_type, data_provider,
                                         ensg_to_gene_primary_id_map):
        """Map lower-cased Alliance gene ids to Expression Atlas page urls.

        Each sitemap <loc> url ends in a gene identifier; it is resolved via
        the ENSEMBL map, then the MOD symbol map, else assumed to already be
        a MOD-local id and prefixed with the data provider.
        """
        filepath = sub_type.get_filepath()

        gene_symbol_to_primary_id_map = self._get_mod_gene_symbol_to_primary_ids(
            data_provider)

        expression_atlas_gene_pages = {}
        with open(filepath) as file_handle:
            doc = xmltodict.parse(file_handle.read())["urlset"]
            for value in doc.values():
                if isinstance(value, (list, )):
                    for element in value:
                        url = element['loc']
                        expression_atlas_gene = url.split("/")[-1]
                        expression_atlas_gene = expression_atlas_gene.lower()
                        if expression_atlas_gene in ensg_to_gene_primary_id_map:
                            expression_atlas_gene_pages[
                                ensg_to_gene_primary_id_map[
                                    expression_atlas_gene].lower()] = url
                        elif expression_atlas_gene in gene_symbol_to_primary_id_map:
                            expression_atlas_gene_pages[
                                gene_symbol_to_primary_id_map[
                                    expression_atlas_gene].lower()] = url
                        else:
                            alliance_gene = data_provider + ":" + expression_atlas_gene
                            expression_atlas_gene_pages[
                                alliance_gene.lower()] = url

        return expression_atlas_gene_pages

    def _process_sub_type(self, sub_type, ensg_to_gene_primary_id_map):
        """Generate and load cross references for one data provider."""
        data_provider = sub_type.get_data_provider()
        expression_atlas_gene_pages = self._get_expression_atlas_gene_pages(
            sub_type, data_provider, ensg_to_gene_primary_id_map)

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        generators = self.get_generators(expression_atlas_gene_pages,
                                         data_provider, batch_size)

        query_template_list = [
            [
                self.add_expression_atlas_crossreferences_query_template,
                commit_size,
                "expression_atlas_" + data_provider + "_data.csv"
            ],
        ]

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
        self.error_messages("ExpAtlas-{}: ".format(
            sub_type.get_data_provider()))

    def get_generators(self, expression_atlas_gene_pages, data_provider,
                       batch_size):
        """Get Generators.

        Yields [cross_reference_list] batches for genes that actually exist
        in the Alliance graph.
        """
        return_set = Neo4jHelper.run_single_parameter_query(
            ExpressionAtlasETL.get_genes_with_expression_atlas_links_query,
            list(expression_atlas_gene_pages.keys()))

        counter = 0
        cross_reference_list = []
        for record in return_set:
            counter += 1
            cross_reference = ETLHelper.get_xref_dict(
                record["g.primaryKey"].split(":")[1],
                "ExpressionAtlas_gene",
                "gene/expression-atlas",
                "gene/expressionAtlas",
                record["g.modLocalId"],
                expression_atlas_gene_pages[record["g.primaryKey"].lower()],
                data_provider + ":" + record["g.modLocalId"] + "gene/expression-atlas")
            cross_reference["genePrimaryKey"] = record["g.primaryKey"]
            cross_reference_list.append(cross_reference)
            # BUGFIX: was "counter > batch_size", which emitted batches of
            # batch_size + 1 — every sibling ETL yields at exactly batch_size.
            if counter == batch_size:
                yield [cross_reference_list]
                counter = 0
                cross_reference_list = []

        if counter > 0:
            yield [cross_reference_list]
class ExpressionETL(ETL):
    """Expression ETL.

    Streams gene-expression annotation JSON (via ijson), writes batched CSV
    files through CSVTransactor, and queues Cypher LOAD CSV queries that build
    ExpressionBioEntity / BioEntityGeneExpressionJoin nodes and their
    relationships in Neo4j. Sub types are processed in parallel with
    multiprocessing; the collected queries are executed at the end.
    """

    logger = logging.getLogger(__name__)

    # Query templates which take params and will be processed later

    # Attach cross references to existing BioEntityGeneExpressionJoin nodes.
    xrefs_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (o:BioEntityGeneExpressionJoin:Association {primaryKey:row.ei_uuid})
        """ + ETLHelper.get_cypher_xref_text()

    # Create ExpressionBioEntity nodes keyed by the composite ebe_uuid.
    bio_entity_expression_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MERGE (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})
                ON CREATE SET e.whereExpressedStatement = row.whereExpressedStatement"""

    # Create the gene<->expression join node and link it to its assay term.
    bio_entity_gene_expression_join_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (assay:MMOTerm:Ontology {primaryKey:row.assay})
            MERGE (gej:BioEntityGeneExpressionJoin:Association {primaryKey:row.ei_uuid})
                ON CREATE SET gej.joinType = 'expression'
            MERGE (gej)-[geja:ASSAY]->(assay)"""

    # Link genes to expression entities and anatomical structure terms
    # (UBERON/FBCV slim terms are excluded here and handled by the ribbon queries).
    bio_entity_gene_ao_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (g:Gene {primaryKey:row.geneId})
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})
            MATCH (otast:Ontology {primaryKey:row.anatomicalStructureTermId})
                WHERE NOT 'UBERONTerm' in LABELS(otast)
                    AND NOT 'FBCVTerm' in LABELS(otast)
            MERGE (g)-[gex:EXPRESSED_IN]->(e)
                ON CREATE SET gex.uuid = row.ei_uuid
            MERGE (e)-[gejotast:ANATOMICAL_STRUCTURE]->(otast)"""

    # Attach publication evidence to join nodes. NOTE(review): the EVIDENCE
    # edge uses CREATE, not MERGE, so re-running the load duplicates edges.
    add_pubs_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (gej:BioEntityGeneExpressionJoin:Association {primaryKey:row.ei_uuid})
            MERGE (pubf:Publication {primaryKey:row.pubPrimaryKey})
                ON CREATE SET pubf.pubModId = row.pubModId,
                    pubf.pubMedId = row.pubMedId,
                    pubf.pubModUrl = row.pubModUrl,
                    pubf.pubMedUrl = row.pubMedUrl
            CREATE (gej)-[gejpubf:EVIDENCE]->(pubf) """

    ao_expression_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            // GET PRIMARY DATA OBJECTS
            // LOAD NODES
            MATCH (g:Gene {primaryKey:row.geneId})
            MATCH (gej:BioEntityGeneExpressionJoin:Association {primaryKey:row.ei_uuid})
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})
            MERGE (g)-[ggej:ASSOCIATION]->(gej)
            MERGE (e)-[egej:ASSOCIATION]->(gej)"""

    # NOTE(review): sgd_cc_expression_query_template and
    # cc_expression_query_template are textually identical; they exist as
    # separate attributes only so SGD vs non-SGD loads write separate CSVs.
    sgd_cc_expression_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            // GET PRIMARY DATA OBJECTS
            // LOAD NODES
            MATCH (g:Gene {primaryKey:row.geneId})
            MATCH (assay:MMOTerm:Ontology {primaryKey:row.assay})
            MATCH (otcct:GOTerm:Ontology {primaryKey:row.cellularComponentTermId})
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})
            MATCH (gej:BioEntityGeneExpressionJoin:Association {primaryKey:row.ei_uuid})
            MERGE (g)-[gex:EXPRESSED_IN]->(e)
                ON CREATE SET gex.uuid = row.ei_uuid
            MERGE (gej)-[geja:ASSAY]->(assay)
            MERGE (g)-[ggej:ASSOCIATION]->(gej)
            MERGE (e)-[egej:ASSOCIATION]->(gej)
            MERGE (e)-[eotcct:CELLULAR_COMPONENT]->(otcct)"""

    cc_expression_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            // GET PRIMARY DATA OBJECTS
            // LOAD NODES
            MATCH (g:Gene {primaryKey:row.geneId})
            MATCH (assay:MMOTerm:Ontology {primaryKey:row.assay})
            MATCH (otcct:GOTerm:Ontology {primaryKey:row.cellularComponentTermId})
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})
            MATCH (gej:BioEntityGeneExpressionJoin:Association {primaryKey:row.ei_uuid})
            MERGE (g)-[gex:EXPRESSED_IN]->(e)
                ON CREATE SET gex.uuid = row.ei_uuid
            MERGE (gej)-[geja:ASSAY]->(assay)
            MERGE (g)-[ggej:ASSOCIATION]->(gej)
            MERGE (e)-[egej:ASSOCIATION]->(gej)
            MERGE (e)-[eotcct:CELLULAR_COMPONENT]->(otcct)"""

    # Combined anatomical-structure + cellular-component expression rows.
    ao_cc_expression_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            // GET PRIMARY DATA OBJECTS
            // LOAD NODES
            MATCH (g:Gene {primaryKey:row.geneId})
            MATCH (assay:MMOTerm:Ontology {primaryKey:row.assay})
            MATCH (otcct:GOTerm:Ontology {primaryKey:row.cellularComponentTermId})
            MATCH (otast:Ontology {primaryKey:row.anatomicalStructureTermId})
                WHERE NOT 'UBERONTerm' in LABELS(otast)
                    AND NOT 'FBCVTerm' in LABELS(otast)
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})
            MATCH (gej:BioEntityGeneExpressionJoin:Association {primaryKey:row.ei_uuid})
            WITH g, e, gej, assay, otcct, otast, row
                WHERE NOT otast IS NULL AND NOT otcct IS NULL
            MERGE (g)-[gex:EXPRESSED_IN]->(e)
                ON CREATE SET gex.uuid = row.ei_uuid
            MERGE (gej)-[geja:ASSAY]->(assay)
            MERGE (g)-[ggej:ASSOCIATION]->(gej)
            MERGE (e)-[egej:ASSOCIATION]->(gej)
            MERGE (e)-[eotcct:CELLULAR_COMPONENT]->(otcct)
            MERGE (e)-[gejotast:ANATOMICAL_STRUCTURE]-(otast)"""

    eas_substructure_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (otasst:Ontology {primaryKey:row.anatomicalSubStructureTermId})
                WHERE NOT 'UBERONTerm' in LABELS(otasst)
                    AND NOT 'FBCVTerm' in LABELS(otasst)
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})
            MERGE (e)-[eotasst:ANATOMICAL_SUB_SUBSTRUCTURE]->(otasst) """

    eas_qualified_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (otastq:Ontology {primaryKey:row.anatomicalStructureQualifierTermId})
                WHERE NOT 'UBERONTerm' in LABELS(otastq)
                    AND NOT 'FBCVTerm' in LABELS(otastq)
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})
            MERGE (e)-[eotastq:ANATOMICAL_STRUCTURE_QUALIFIER]-(otastq) """

    eass_qualified_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (otasstq:Ontology {primaryKey:row.anatomicalSubStructureQualifierTermId})
                WHERE NOT 'UBERONTerm' in LABELS(otasstq)
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})
            MERGE (e)-[eotasstq:ANATOMICAL_SUB_STRUCTURE_QUALIFIER]-(otasstq) """

    ccq_expression_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (otcctq:Ontology {primaryKey:row.cellularComponentQualifierTermId})
                WHERE NOT 'UBERONTerm' in LABELS(otcctq)
            MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})
            MERGE (e)-[eotcctq:CELLULAR_COMPONENT_QUALIFIER]-(otcctq) """

    # Stage nodes are keyed by stage NAME, not a stage ontology term id.
    stage_expression_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (ei:BioEntityGeneExpressionJoin {primaryKey:row.ei_uuid})
            MERGE (s:Stage {primaryKey:row.stageName})
                ON CREATE SET s.name = row.stageName
            MERGE (ei)-[eotcctq:DURING]-(s) """

    uberon_ao_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (ebe:ExpressionBioEntity {primaryKey:row.ebe_uuid})
            MATCH (o:Ontology:UBERONTerm {primaryKey:row.aoUberonId})
            MERGE (ebe)-[ebeo:ANATOMICAL_RIBBON_TERM]-(o) """

    uberon_stage_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (ei:BioEntityGeneExpressionJoin {primaryKey:row.ei_uuid})
            MATCH (o:Ontology:UBERONTerm {primaryKey:row.uberonStageId})
            MERGE (ei)-[eio:STAGE_RIBBON_TERM]-(o) """

    # 'Other' ribbon terms are the placeholder nodes created by add_other().
    uberon_ao_other_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (ebe:ExpressionBioEntity {primaryKey:row.ebe_uuid})
            MATCH (u:Ontology:UBERONTerm:Ontology {primaryKey:'UBERON:AnatomyOtherLocation'})
            MERGE (ebe)-[ebeu:ANATOMICAL_RIBBON_TERM]-(u) """

    uberon_stage_other_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (ei:BioEntityGeneExpressionJoin {primaryKey:row.ei_uuid})
            MATCH (u:Ontology:UBERONTerm:Ontology {primaryKey:'UBERON:PostEmbryonicPreAdult'})
            MERGE (ei)-[eiu:STAGE_RIBBON_TERM]-(u) """

    def __init__(self, config):
        """Initialize with the data-type config that drives sub-type iteration."""
        super().__init__()
        self.data_type_config = config

    def _load_and_process_data(self):
        """Fan sub types out to worker processes, then run all queued queries."""
        # add the 'other' nodes to support the expression ribbon components.
        self.add_other()

        thread_pool = []
        # Manager list so worker processes can append their query/file pairs
        # back to the parent for a single batched execution afterwards.
        query_tracking_list = multiprocessing.Manager().list()
        for sub_type in self.data_type_config.get_sub_type_objects():
            process = multiprocessing.Process(target=self._process_sub_type,
                                              args=(sub_type, query_tracking_list))
            process.start()
            thread_pool.append(process)

        ETL.wait_for_threads(thread_pool)

        queries = []
        for item in query_tracking_list:
            queries.append(item)

        Neo4jTransactor.execute_query_batch(queries)

    def _process_sub_type(self, sub_type, query_tracking_list):
        """Generate CSVs for one sub type and queue its load queries.

        Runs in a child process. The order of query_template_list must match
        the order of the lists yielded by get_generators — CSVTransactor pairs
        them positionally.
        """
        self.logger.info("Loading Expression Data: %s",
                         sub_type.get_data_provider())
        data_file = sub_type.get_filepath()
        data_provider = sub_type.get_data_provider()

        if data_file is None:
            self.logger.warning("No Data found for %s skipping",
                                sub_type.get_data_provider())
            return

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # This needs to be in this format (template, param1, params2) others will be ignored
        query_template_list = [
            [
                self.bio_entity_expression_query_template, commit_size,
                "expression_entities_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                self.bio_entity_gene_ao_query_template, commit_size,
                "expression_gene_ao_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                self.bio_entity_gene_expression_join_query_template, commit_size,
                "expression_entity_joins_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                self.ao_expression_query_template, commit_size,
                "expression_ao_expression_" + sub_type.get_data_provider() + ".csv"
            ]
        ]

        # SGD gets its own (identical) CC query so its CSV is kept separate.
        if data_provider == 'SGD':
            query_template_list += [[
                self.sgd_cc_expression_query_template, commit_size,
                "expression_SGD_cc_expression_" + sub_type.get_data_provider() + ".csv"
            ]]
        else:
            query_template_list += [[
                self.cc_expression_query_template, commit_size,
                "expression_cc_expression_" + sub_type.get_data_provider() + ".csv"
            ]]

        query_template_list += [
            [
                self.ao_cc_expression_query_template, commit_size,
                "expression_ao_cc_expression_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                self.eas_qualified_query_template, commit_size,
                "expression_eas_qualified_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                self.eas_substructure_query_template, commit_size,
                "expression_eas_substructure_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                self.eass_qualified_query_template, commit_size,
                "expression_eass_qualified_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                self.ccq_expression_query_template, commit_size,
                "expression_ccq_expression_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                self.stage_expression_query_template, commit_size,
                "expression_stage_expression_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                self.uberon_stage_query_template, commit_size,
                "expression_uberon_stage_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                self.uberon_ao_query_template, commit_size,
                "expression_uberon_ao_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                self.uberon_ao_other_query_template, commit_size,
                "expression_uberon_ao_other_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                self.uberon_stage_other_query_template, commit_size,
                "expression_uberon_stage_other_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                self.xrefs_query_template, commit_size,
                "expression_cross_references_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                self.add_pubs_query_template, commit_size,
                "expression_add_pubs_" + sub_type.get_data_provider() + ".csv"
            ]
        ]

        # Obtain the generator
        generators = self.get_generators(data_file, batch_size)

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)

        # Queries are only queued here; the parent process executes them.
        for item in query_and_file_list:
            query_tracking_list.append(item)

        self.logger.info("Finished Loading Expression Data: %s",
                         sub_type.get_data_provider())

    def add_other(self):
        """Create placeholder 'other' ribbon nodes referenced by the
        uberon_*_other query templates and the expression ribbon."""
        self.logger.debug("made it to the addOther statement")

        add_other_query = """
            MERGE(other:UBERONTerm:Ontology {primaryKey:'UBERON:AnatomyOtherLocation'})
                ON CREATE SET other.name = 'other'
            MERGE(otherstage:UBERONTerm:Ontology {primaryKey:'UBERON:PostEmbryonicPreAdult'})
                ON CREATE SET otherstage.name = 'post embryonic, pre-adult'
            MERGE(othergo:GOTerm:Ontology {primaryKey:'GO:otherLocations'})
                ON CREATE SET othergo.name = 'other locations'
                ON CREATE SET othergo.definition = 'temporary node to group expression entities up to ribbon terms'
                ON CREATE SET othergo.type = 'other'
                ON CREATE SET othergo.subset = 'goslim_agr'
        """

        Neo4jHelper.run_single_query(add_other_query)

    def get_generators(self, expression_file, batch_size):
        """Get Generators.

        Streams expression records from the JSON file ('data' array) with
        ijson and yields batches of 17 parallel lists, one per CSV/query in
        _process_sub_type (same positional order).
        """
        self.logger.debug("made it to the expression generator")

        counter = 0
        cross_references = []
        bio_entities = []
        bio_join_entities = []
        bio_entity_gene_aos = []
        pubs = []
        ao_expressions = []
        cc_expressions = []
        ao_qualifiers = []
        ao_substructures = []
        ao_ss_qualifiers = []
        cc_qualifiers = []
        ao_cc_expressions = []
        stage_list = []
        stage_uberon_data = []
        uberon_ao_data = []
        uberon_ao_other_data = []
        uberon_stage_other_data = []

        self.logger.debug("streaming json data from %s ...", expression_file)
        with codecs.open(expression_file, 'r', 'utf-8') as file_handle:
            for xpat in ijson.items(file_handle, 'data.item'):
                counter = counter + 1

                pub_med_url = None
                pub_mod_url = None
                pub_med_id = ""
                publication_mod_id = ""
                stage_term_id = ""
                stage_name = ""
                stage_uberon_term_id = ""
                gene_id = xpat.get('geneId')

                # In test mode, skip records whose gene is not in the test set.
                if self.test_object.using_test_data() is True:
                    is_it_test_entry = self.test_object.check_for_test_id_entry(
                        gene_id)
                    if is_it_test_entry is False:
                        counter = counter - 1
                        continue

                evidence = xpat.get('evidence')

                if 'publicationId' in evidence:
                    if evidence.get('publicationId').startswith('PMID:'):
                        pub_med_id = evidence.get('publicationId')
                        local_pub_med_id = pub_med_id.split(":")[1]
                        pub_med_prefix = pub_med_id.split(":")[0]
                        pub_med_url = ETLHelper.get_no_page_complete_url(
                            local_pub_med_id, self.xref_url_map,
                            pub_med_prefix, gene_id)
                        # NOTE(review): dead check — pub_med_id was just
                        # assigned a string starting with 'PMID:'.
                        if pub_med_id is None:
                            pub_med_id = ""

                        if 'crossReference' in evidence:
                            pub_xref = evidence.get('crossReference')
                            publication_mod_id = pub_xref.get('id')
                            if publication_mod_id is not None:
                                pub_mod_url = ETLHelper.get_expression_pub_annotation_xref(
                                    publication_mod_id)
                    else:
                        # Non-PMID publication id is treated as a MOD id.
                        publication_mod_id = evidence['publicationId']
                        if publication_mod_id is not None:
                            pub_mod_url = ETLHelper.get_expression_pub_annotation_xref(
                                publication_mod_id)

                    if publication_mod_id is None:
                        publication_mod_id = ""

                assay = xpat.get('assay')

                # NOTE(review): the where_expressed-derived locals below are
                # only bound when 'whereExpressed' is present, yet they are
                # read unconditionally later — a record without it would raise
                # NameError. Presumably the DQM schema guarantees the key.
                if 'whereExpressed' in xpat:
                    where_expressed = xpat.get('whereExpressed')
                    cellular_component_qualifier_term_id = \
                        where_expressed.get('cellularComponentQualifierTermId')
                    cellular_component_term_id = where_expressed.get(
                        'cellularComponentTermId')
                    anatomical_structure_term_id = where_expressed.get(
                        'anatomicalStructureTermId')
                    anatomical_structure_qualifier_term_id = where_expressed.get(
                        'anatomicalStructureQualifierTermId')
                    anatomical_sub_structure_term_id = \
                        where_expressed.get('anatomicalSubStructureTermId')
                    anatomical_sub_structure_qualifier_term_id = where_expressed.get(
                        'anatomicalSubStructureQualifierTermId')
                    where_expressed_statement = where_expressed.get(
                        'whereExpressedStatement')

                # NOTE(review): no None guard — a record without
                # 'whenExpressed' would make the 'in' test raise TypeError.
                when_expressed_stage = xpat.get('whenExpressed')

                if 'stageTermId' in when_expressed_stage:
                    stage_term_id = when_expressed_stage.get('stageTermId')
                if 'stageName' in when_expressed_stage:
                    stage_name = when_expressed_stage.get('stageName')

                # TODO: making unique BioEntityGeneExpressionJoin nodes
                # and ExpressionBioEntity nodes is tedious.
                # TODO: Lets get the DQMs to fix this.
                # ei_uuid: unique per annotation; ebe_uuid: unique per
                # where-expressed entity. Built by concatenation, so the
                # append order below is part of the node identity — do not
                # reorder.
                expression_unique_key = gene_id + assay + stage_name
                expression_entity_unique_key = ""

                if anatomical_structure_term_id is not None:
                    expression_unique_key += anatomical_structure_term_id
                    expression_entity_unique_key = anatomical_structure_term_id

                if anatomical_structure_qualifier_term_id is not None:
                    expression_unique_key += anatomical_structure_qualifier_term_id
                    expression_entity_unique_key += anatomical_structure_qualifier_term_id

                if cellular_component_term_id is not None:
                    expression_unique_key += cellular_component_term_id
                    expression_entity_unique_key += cellular_component_term_id

                if cellular_component_qualifier_term_id is not None:
                    expression_unique_key += cellular_component_qualifier_term_id
                    expression_entity_unique_key += cellular_component_qualifier_term_id

                if anatomical_sub_structure_term_id is not None:
                    expression_unique_key += anatomical_sub_structure_term_id

                if anatomical_sub_structure_qualifier_term_id is not None:
                    expression_unique_key += anatomical_sub_structure_qualifier_term_id
                    expression_entity_unique_key \
                        += anatomical_sub_structure_qualifier_term_id

                expression_entity_unique_key += where_expressed_statement
                expression_unique_key += where_expressed_statement

                # Map anatomical structure UBERON slim terms to ribbon rows;
                # the literal 'Other' routes to the placeholder node instead.
                if where_expressed.get(
                        'anatomicalStructureUberonSlimTermIds') is not None:
                    for uberon_structure_term_object in \
                            where_expressed.get('anatomicalStructureUberonSlimTermIds'):
                        structure_uberon_term_id = \
                            uberon_structure_term_object.get('uberonTerm')
                        if structure_uberon_term_id is not None \
                                and structure_uberon_term_id != 'Other':
                            structure_uberon_term = {
                                "ebe_uuid": expression_entity_unique_key,
                                "aoUberonId": structure_uberon_term_id
                            }
                            uberon_ao_data.append(structure_uberon_term)
                        elif structure_uberon_term_id is not None \
                                and structure_uberon_term_id == 'Other':
                            other_structure_uberon_term = {
                                "ebe_uuid": expression_entity_unique_key
                            }
                            uberon_ao_other_data.append(
                                other_structure_uberon_term)

                # Sub-structure slim terms feed the same two ribbon lists.
                if where_expressed.get(
                        'anatomicalSubStructureUberonSlimTermIds') is not None:
                    for uberon_sub_structure_term_object in \
                            where_expressed.get('anatomicalSubStructureUberonSlimTermIds'):
                        sub_structure_uberon_term_id = \
                            uberon_sub_structure_term_object.get('uberonTerm')
                        if sub_structure_uberon_term_id is not None \
                                and sub_structure_uberon_term_id != 'Other':
                            sub_structure_uberon_term = {
                                "ebe_uuid": expression_entity_unique_key,
                                "aoUberonId": sub_structure_uberon_term_id
                            }
                            uberon_ao_data.append(
                                sub_structure_uberon_term)
                        elif sub_structure_uberon_term_id is not None \
                                and sub_structure_uberon_term_id == 'Other':
                            other_structure_uberon_term = {
                                "ebe_uuid": expression_entity_unique_key
                            }
                            uberon_ao_other_data.append(
                                other_structure_uberon_term)

                if cellular_component_term_id is None:
                    cellular_component_term_id = ""

                if when_expressed_stage.get(
                        'stageUberonSlimTerm') is not None:
                    stage_uberon_term_object = when_expressed_stage.get(
                        'stageUberonSlimTerm')
                    stage_uberon_term_id = stage_uberon_term_object.get(
                        "uberonTerm")
                    if stage_uberon_term_id is not None \
                            and stage_uberon_term_id != "post embryonic, pre-adult":
                        stage_uberon = {
                            "uberonStageId": stage_uberon_term_id,
                            "ei_uuid": expression_unique_key
                        }
                        stage_uberon_data.append(stage_uberon)
                    if stage_uberon_term_id == "post embryonic, pre-adult":
                        stage_uberon_other = {
                            "ei_uuid": expression_unique_key
                        }
                        uberon_stage_other_data.append(stage_uberon_other)

                if stage_term_id is None or stage_name == 'N/A':
                    stage_term_id = ""
                    stage_name = ""
                    stage_uberon_term_id = ""

                if stage_name is not None:
                    stage = {
                        "stageTermId": stage_term_id,
                        "stageName": stage_name,
                        "ei_uuid": expression_unique_key
                    }
                    stage_list.append(stage)
                else:
                    stage_uberon_term_id = ""

                if 'crossReference' in xpat:
                    cross_ref = xpat.get('crossReference')
                    cross_ref_id = cross_ref.get('id')
                    local_cross_ref_id = cross_ref_id.split(":")[1]
                    prefix = cross_ref.get('id').split(":")[0]
                    pages = cross_ref.get('pages')

                    # some pages collection have 0 elements
                    if pages is not None and len(pages) > 0:
                        for page in pages:
                            # Only the annotation-detail page becomes an xref.
                            if page == 'gene/expression/annotation/detail':
                                mod_global_cross_ref_id = ETLHelper.get_page_complete_url(
                                    local_cross_ref_id, self.xref_url_map,
                                    prefix, page)
                                xref = ETLHelper.get_xref_dict(
                                    local_cross_ref_id, prefix, page, page,
                                    cross_ref_id, mod_global_cross_ref_id,
                                    cross_ref_id + page)
                                xref['ei_uuid'] = expression_unique_key
                                cross_references.append(xref)

                bio_entity = {
                    "ebe_uuid": expression_entity_unique_key,
                    "whereExpressedStatement": where_expressed_statement
                }
                bio_entities.append(bio_entity)

                bio_join_entity = {
                    "ei_uuid": expression_unique_key,
                    "assay": assay
                }
                bio_join_entities.append(bio_join_entity)

                bio_entity_gene_ao = {
                    "geneId": gene_id,
                    "ebe_uuid": expression_entity_unique_key,
                    "anatomicalStructureTermId": anatomical_structure_term_id,
                    "ei_uuid": expression_unique_key
                }
                bio_entity_gene_aos.append(bio_entity_gene_ao)

                pub = {
                    "ei_uuid": expression_unique_key,
                    "pubPrimaryKey": pub_med_id + publication_mod_id,
                    "pubMedId": pub_med_id,
                    "pubMedUrl": pub_med_url,
                    "pubModId": publication_mod_id,
                    "pubModUrl": pub_mod_url
                }
                pubs.append(pub)

                ao_expression = {
                    "geneId": gene_id,
                    "whenExpressedStage": when_expressed_stage,
                    "pubMedId": pub_med_id,
                    "pubMedUrl": pub_med_url,
                    "pubModId": publication_mod_id,
                    "pubModUrl": pub_mod_url,
                    "pubPrimaryKey": pub_med_id + publication_mod_id,
                    "uuid": str(uuid.uuid4()),
                    "assay": assay,
                    "anatomicalStructureTermId": anatomical_structure_term_id,
                    "whereExpressedStatement": where_expressed_statement,
                    "ei_uuid": expression_unique_key,
                    "ebe_uuid": expression_entity_unique_key
                }
                ao_expressions.append(ao_expression)

                if cellular_component_qualifier_term_id is not None:
                    cc_qualifier = {
                        "ebe_uuid": expression_entity_unique_key,
                        "cellularComponentQualifierTermId":
                            cellular_component_qualifier_term_id
                    }
                    cc_qualifiers.append(cc_qualifier)

                if anatomical_structure_term_id is None:
                    anatomical_structure_term_id = ""

                cc_expression = {
                    "geneId": gene_id,
                    "whenExpressedStage": when_expressed_stage,
                    "pubMedId": pub_med_id,
                    "pubMedUrl": pub_med_url,
                    "pubModId": publication_mod_id,
                    "pubModUrl": pub_mod_url,
                    "pubPrimaryKey": pub_med_id + publication_mod_id,
                    "assay": assay,
                    "whereExpressedStatement": where_expressed_statement,
                    "cellularComponentTermId": cellular_component_term_id,
                    "ei_uuid": expression_unique_key,
                    "ebe_uuid": expression_entity_unique_key
                }
                cc_expressions.append(cc_expression)

                if anatomical_structure_qualifier_term_id is not None:
                    ao_qualifier = {
                        "ebe_uuid": expression_entity_unique_key,
                        "anatomicalStructureQualifierTermId":
                            anatomical_structure_qualifier_term_id
                    }
                    ao_qualifiers.append(ao_qualifier)

                if anatomical_sub_structure_term_id is not None:
                    ao_substructure = {
                        "ebe_uuid": expression_entity_unique_key,
                        "anatomicalSubStructureTermId":
                            anatomical_sub_structure_term_id
                    }
                    ao_substructures.append(ao_substructure)

                if anatomical_sub_structure_qualifier_term_id is not None:
                    ao_ss_qualifier = {
                        "ebe_uuid": expression_entity_unique_key,
                        "anatomicalSubStructureQualifierTermId":
                            anatomical_sub_structure_qualifier_term_id
                    }
                    ao_ss_qualifiers.append(ao_ss_qualifier)

                if where_expressed_statement is None:
                    where_expressed_statement = ""

                # Row for the combined AO+CC query only when both term ids
                # are present and non-empty.
                if anatomical_structure_term_id is not None \
                        and anatomical_structure_term_id != "" \
                        and cellular_component_term_id is not None \
                        and cellular_component_term_id != "":
                    ao_cc_expression = {
                        "geneId": gene_id,
                        "whenExpressedStage": when_expressed_stage,
                        "pubMedId": pub_med_id,
                        "pubMedUrl": pub_med_url,
                        "pubModId": publication_mod_id,
                        "pubModUrl": pub_mod_url,
                        "pubPrimaryKey": pub_med_id + publication_mod_id,
                        "uuid": str(uuid.uuid4()),
                        "stageTermId": stage_term_id,
                        "stageName": stage_name,
                        "stageUberonTermId": stage_uberon_term_id,
                        "assay": assay,
                        "cellularComponentTermId": cellular_component_term_id,
                        "anatomicalStructureTermId": anatomical_structure_term_id,
                        "whereExpressedStatement": where_expressed_statement,
                        "ei_uuid": expression_unique_key,
                        "ebe_uuid": expression_entity_unique_key
                    }
                    ao_cc_expressions.append(ao_cc_expression)

                # Flush a full batch; list order must match
                # _process_sub_type's query_template_list.
                if counter == batch_size:
                    yield [
                        bio_entities,
                        bio_entity_gene_aos,
                        bio_join_entities,
                        ao_expressions,
                        cc_expressions,
                        ao_cc_expressions,
                        ao_qualifiers,
                        ao_substructures,
                        ao_ss_qualifiers,
                        cc_qualifiers,
                        stage_list,
                        stage_uberon_data,
                        uberon_ao_data,
                        uberon_ao_other_data,
                        uberon_stage_other_data,
                        cross_references,
                        pubs
                    ]
                    bio_entities = []
                    bio_join_entities = []
                    ao_expressions = []
                    cc_expressions = []
                    ao_qualifiers = []
                    ao_substructures = []
                    ao_ss_qualifiers = []
                    cc_qualifiers = []
                    ao_cc_expressions = []
                    stage_list = []
                    uberon_stage_other_data = []
                    stage_uberon_data = []
                    uberon_ao_other_data = []
                    uberon_ao_data = []
                    cross_references = []
                    bio_entity_gene_aos = []
                    pubs = []
                    counter = 0

        # Flush the final partial batch, if any.
        if counter > 0:
            yield [
                bio_entities,
                bio_entity_gene_aos,
                bio_join_entities,
                ao_expressions,
                cc_expressions,
                ao_cc_expressions,
                ao_qualifiers,
                ao_substructures,
                ao_ss_qualifiers,
                cc_qualifiers,
                stage_list,
                stage_uberon_data,
                uberon_ao_data,
                uberon_ao_other_data,
                uberon_stage_other_data,
                cross_references,
                pubs
            ]
class AlleleETL(ETL): allele_construct_no_gene_query_template = """ USING PERIODIC COMMIT %s LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row MATCH (c:Construct {primaryKey: row.constructId}) MATCH (s:Species {primaryKey: row.taxonId}) //Create the Allele node and set properties. primaryKey is required. MERGE (o:Allele:Feature {primaryKey:row.primaryId}) ON CREATE SET o.symbol = row.symbol, o.taxonId = row.taxonId, o.dateProduced = row.dateProduced, o.release = row.release, o.localId = row.localId, o.globalId = row.globalId, o.uuid = row.uuid, o.symbolText = row.symbolText, o.modCrossRefCompleteUrl = row.modGlobalCrossRefId, o.dataProviders = row.dataProviders, o.dataProvider = row.dataProvider, o.symbolWithSpecies = row.symbolWithSpecies, o.symbolTextWithSpecies = row.symbolTextWithSpecies, o.description = row.alleleDescription MERGE (o)-[:FROM_SPECIES]-(s) MERGE (o)-[:CONTAINS]-(c) """ allele_construct_gene_query_template = """ USING PERIODIC COMMIT %s LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row MATCH (g:Gene {primaryKey: row.geneId}) MATCH (c:Construct {primaryKey: row.constructId}) MATCH (s:Species {primaryKey: row.taxonId}) //Create the Allele node and set properties. primaryKey is required. 
MERGE (o:Allele:Feature {primaryKey:row.primaryId}) ON CREATE SET o.symbol = row.symbol, o.taxonId = row.taxonId, o.dateProduced = row.dateProduced, o.release = row.release, o.localId = row.localId, o.globalId = row.globalId, o.uuid = row.uuid, o.symbolText = row.symbolText, o.modCrossRefCompleteUrl = row.modGlobalCrossRefId, o.dataProviders = row.dataProviders, o.dataProvider = row.dataProvider, o.symbolWithSpecies = row.symbolWithSpecies, o.symbolTextWithSpecies = row.symbolTextWithSpecies, o.description = row.alleleDescription MERGE (o)-[:FROM_SPECIES]-(s) MERGE (o)-[:IS_ALLELE_OF]-(g) MERGE (o)-[:CONTAINS]-(c) """ allele_gene_no_construct_query_template = """ USING PERIODIC COMMIT %s LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row MATCH (g:Gene {primaryKey: row.geneId}) MATCH (s:Species {primaryKey: row.taxonId}) //Create the Allele node and set properties. primaryKey is required. MERGE (o:Allele:Feature {primaryKey:row.primaryId}) ON CREATE SET o.symbol = row.symbol, o.taxonId = row.taxonId, o.dateProduced = row.dateProduced, o.release = row.release, o.localId = row.localId, o.globalId = row.globalId, o.uuid = row.uuid, o.symbolText = row.symbolText, o.modCrossRefCompleteUrl = row.modGlobalCrossRefId, o.dataProviders = row.dataProviders, o.dataProvider = row.dataProvider, o.symbolWithSpecies = row.symbolWithSpecies, o.symbolTextWithSpecies = row.symbolTextWithSpecies, o.description = row.alleleDescription MERGE (o)-[:FROM_SPECIES]-(s) MERGE (o)-[:IS_ALLELE_OF]->(g) """ allele_no_gene_no_construct_query_template = """ USING PERIODIC COMMIT %s LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row MATCH (s:Species {primaryKey: row.taxonId}) //Create the Allele node and set properties. primaryKey is required. 
MERGE (o:Allele:Feature {primaryKey:row.primaryId}) ON CREATE SET o.symbol = row.symbol, o.taxonId = row.taxonId, o.dateProduced = row.dateProduced, o.release = row.release, o.localId = row.localId, o.globalId = row.globalId, o.uuid = row.uuid, o.symbolText = row.symbolText, o.modCrossRefCompleteUrl = row.modGlobalCrossRefId, o.dataProviders = row.dataProviders, o.dataProvider = row.dataProvider, o.symbolWithSpecies = row.symbolWithSpecies, o.symbolTextWithSpecies = row.symbolTextWithSpecies, o.description = row.alleleDescription MERGE (o)-[:FROM_SPECIES]-(s) """ allele_secondaryids_template = """ USING PERIODIC COMMIT %s LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row MATCH (f:Allele:Feature {primaryKey:row.data_id}) MERGE (second:SecondaryId {primaryKey:row.secondary_id}) SET second.name = row.secondary_id MERGE (f)-[aka1:ALSO_KNOWN_AS]->(second) """ allele_synonyms_template = """ USING PERIODIC COMMIT %s LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row MATCH (a:Allele:Feature {primaryKey:row.data_id}) MERGE(syn:Synonym {primaryKey:row.synonym}) SET syn.name = row.synonym MERGE (a)-[aka2:ALSO_KNOWN_AS]->(syn) """ allele_xrefs_template = """ USING PERIODIC COMMIT %s LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row MATCH (o:Allele {primaryKey:row.dataId}) """ + ETLHelper.get_cypher_xref_text() def __init__(self, config): super().__init__() self.data_type_config = config def _load_and_process_data(self): thread_pool = [] for sub_type in self.data_type_config.get_sub_type_objects(): p = multiprocessing.Process(target=self._process_sub_type, args=(sub_type,)) p.start() thread_pool.append(p) ETL.wait_for_threads(thread_pool) def _process_sub_type(self, sub_type): logger.info("Loading Allele Data: %s" % sub_type.get_data_provider()) filepath = sub_type.get_filepath() logger.info(filepath) data = JSONFile().get_data(filepath) logger.info("Finished Loading Allele Data: %s" % sub_type.get_data_provider()) if data is None: logger.warn("No Data found for %s skipping" % 
# NOTE(review): the first two lines below are the tail of _load_and_process_data,
# whose beginning lies outside this chunk; tokens preserved verbatim, indentation
# reconstructed (the chunk arrived with all line structure collapsed).
                             sub_type.get_data_provider())
                return

        # This order is the same as the lists yielded from the get_generators function.
        # A list of tuples.
        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # This needs to be in this format (template, param1, params2) others will be ignored
        query_list = [
            [AlleleETL.allele_gene_no_construct_query_template, commit_size,
             "allele_gene_no_construct_data_" + sub_type.get_data_provider() + ".csv"],
            [AlleleETL.allele_construct_gene_query_template, commit_size,
             "allele_construct_gene_data_" + sub_type.get_data_provider() + ".csv"],
            [AlleleETL.allele_construct_no_gene_query_template, commit_size,
             "allele_construct_no_gene_data_" + sub_type.get_data_provider() + ".csv"],
            [AlleleETL.allele_no_gene_no_construct_query_template, commit_size,
             "allele_no_gene_no_construct_data_" + sub_type.get_data_provider() + ".csv"],
            [AlleleETL.allele_secondaryids_template, commit_size,
             "allele_secondaryids_" + sub_type.get_data_provider() + ".csv"],
            [AlleleETL.allele_synonyms_template, commit_size,
             "allele_synonyms_" + sub_type.get_data_provider() + ".csv"],
            [AlleleETL.allele_xrefs_template, commit_size,
             "allele_xrefs_" + sub_type.get_data_provider() + ".csv"],
        ]

        # Obtain the generator
        generators = self.get_generators(data, batch_size)

        query_and_file_list = self.process_query_params(query_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)

    def get_generators(self, allele_data, batch_size):
        """Yield batches of allele rows shaped for the CSV/Cypher loaders.

        Walks every record in allele_data['data'] and buckets it into one of
        four dataset lists depending on whether its object relation carries a
        gene id, a construct id, both, or neither.  Each yield produces seven
        lists whose POSITIONS match the query_list built in
        _load_and_process_data:
          [gene-no-construct, construct+gene, construct-no-gene,
           no-gene-no-construct, secondary ids, synonyms, cross references]
        NOTE(review): the list NAMES are swapped relative to their contents
        (alleles_no_construct holds the gene-no-construct datasets and
        alleles_no_gene holds the construct-no-gene datasets), but the yield
        positions line up with the correct query templates, so behavior is
        correct; only the naming is confusing.
        """
        data_providers = []
        release = ""
        # NOTE(review): "constrcut" typo is preserved — the name is purely local.
        alleles_no_constrcut_no_gene = []
        alleles_construct_gene = []
        alleles_no_construct = []
        alleles_no_gene = []
        allele_synonyms = []
        allele_secondary_ids = []
        cross_reference_list = []

        # counter counts records accepted into the current batch; it is
        # decremented again when a record is filtered out in test mode.
        counter = 0
        date_produced = allele_data['metaData']['dateProduced']
        data_provider_object = allele_data['metaData']['dataProvider']
        data_provider_cross_ref = data_provider_object.get('crossReference')
        data_provider = data_provider_cross_ref.get('id')
        data_provider_pages = data_provider_cross_ref.get('pages')
        # NOTE(review): data_provider_cross_ref_set is filled below but never
        # yielded or read afterwards — presumably vestigial; confirm before removing.
        data_provider_cross_ref_set = []

        loadKey = date_produced + data_provider + "_ALLELE"

        # TODO: get SGD to fix their files.

        if data_provider_pages is not None:
            for data_provider_page in data_provider_pages:
                cross_ref_complete_url = ETLHelper.get_page_complete_url(data_provider,
                                                                         self.xref_url_map,
                                                                         data_provider,
                                                                         data_provider_page)

                data_provider_cross_ref_set.append(ETLHelper.get_xref_dict(data_provider,
                                                                           data_provider,
                                                                           data_provider_page,
                                                                           data_provider_page,
                                                                           data_provider,
                                                                           cross_ref_complete_url,
                                                                           data_provider + data_provider_page))

                data_providers.append(data_provider)
                logger.info("data provider: " + data_provider)

        if 'release' in allele_data['metaData']:
            release = allele_data['metaData']['release']

        for allele_record in allele_data['data']:
            counter = counter + 1
            global_id = allele_record['primaryId']
            # fixing parsing error on this end while MGI fixes on their end.
            if global_id == 'MGI:3826848':
                description = allele_record.get('description')[:-2]
            else:
                description = allele_record.get('description')

            local_id = global_id.split(":")[1]
            # NOTE(review): this is always "" when the dataset dicts below are
            # built — the crossReferences loop that assigns it runs AFTER the
            # dicts are appended.  Confirm whether modGlobalCrossRefId is meant
            # to be populated per record.
            mod_global_cross_ref_id = ""

            # In test mode, skip (and un-count) records that are not in the
            # curated test-id list.
            if self.test_object.using_test_data() is True:
                is_it_test_entry = self.test_object.check_for_test_id_entry(global_id)
                if is_it_test_entry is False:
                    counter = counter - 1
                    continue

            gene_id = ''
            construct_id = ''
            association_type = ''

            short_species_abbreviation = ETLHelper.get_short_species_abbreviation(allele_record.get('taxonId'))
            symbol_text = TextProcessingHelper.cleanhtml(allele_record.get('symbol'))

            if allele_record.get('alleleObjectRelations') is not None:
                for relation in allele_record.get('alleleObjectRelations'):
                    association_type = relation.get('objectRelation').get('associationType')
                    if relation.get('objectRelation').get('gene') is not None:
                        gene_id = relation.get('objectRelation').get('gene')
                    if relation.get('objectRelation').get('construct') is not None:
                        construct_id = relation.get('objectRelation').get('construct')

                    # Bucket the record by which ids the relation provided.
                    if gene_id != '' and construct_id != '':
                        allele_construct_gene_dataset = {
                            "symbol": allele_record.get('symbol'),
                            "geneId": gene_id,
                            "primaryId": allele_record.get('primaryId'),
                            "globalId": global_id,
                            "localId": local_id,
                            "taxonId": allele_record.get('taxonId'),
                            "dataProviders": data_providers,
                            "dateProduced": date_produced,
                            "loadKey": loadKey,
                            "release": release,
                            "modGlobalCrossRefId": mod_global_cross_ref_id,
                            "uuid": str(uuid.uuid4()),
                            "dataProvider": data_provider,
                            "symbolWithSpecies": allele_record.get('symbol') + " (" + short_species_abbreviation + ")",
                            "symbolTextWithSpecies": symbol_text + " (" + short_species_abbreviation + ")",
                            "symbolText": symbol_text,
                            "alleleDescription": description,
                            "constructId": construct_id,
                            "associationType": association_type
                        }
                        alleles_construct_gene.append(allele_construct_gene_dataset)
                    elif construct_id != '' and gene_id == '':
                        allele_construct_no_gene_dataset = {
                            "symbol": allele_record.get('symbol'),
                            "primaryId": allele_record.get('primaryId'),
                            "globalId": global_id,
                            "localId": local_id,
                            "taxonId": allele_record.get('taxonId'),
                            "dataProviders": data_providers,
                            "dateProduced": date_produced,
                            "loadKey": loadKey,
                            "release": release,
                            "modGlobalCrossRefId": mod_global_cross_ref_id,
                            "uuid": str(uuid.uuid4()),
                            "dataProvider": data_provider,
                            "symbolWithSpecies": allele_record.get('symbol') + " (" + short_species_abbreviation + ")",
                            "symbolTextWithSpecies": symbol_text + " (" + short_species_abbreviation + ")",
                            "symbolText": symbol_text,
                            "alleleDescription": description,
                            "constructId": construct_id,
                            "associationType": association_type
                        }
                        alleles_no_gene.append(allele_construct_no_gene_dataset)
                    elif gene_id != '' and construct_id == '':
                        allele_gene_no_construct_dataset = {
                            "symbol": allele_record.get('symbol'),
                            "geneId": gene_id,
                            "primaryId": allele_record.get('primaryId'),
                            "globalId": global_id,
                            "localId": local_id,
                            "taxonId": allele_record.get('taxonId'),
                            "dataProviders": data_providers,
                            "dateProduced": date_produced,
                            "loadKey": loadKey,
                            "release": release,
                            "modGlobalCrossRefId": mod_global_cross_ref_id,
                            "uuid": str(uuid.uuid4()),
                            "dataProvider": data_provider,
                            "symbolWithSpecies": allele_record.get('symbol') + " (" + short_species_abbreviation + ")",
                            "symbolTextWithSpecies": symbol_text + " (" + short_species_abbreviation + ")",
                            "symbolText": symbol_text,
                            "alleleDescription": description,
                            "associationType": association_type
                        }
                        alleles_no_construct.append(allele_gene_no_construct_dataset)
                    elif gene_id == '' and construct_id == '':
                        allele_no_gene_no_construct_dataset = {
                            "symbol": allele_record.get('symbol'),
                            "primaryId": allele_record.get('primaryId'),
                            "globalId": global_id,
                            "localId": local_id,
                            "taxonId": allele_record.get('taxonId'),
                            "dataProviders": data_providers,
                            "dateProduced": date_produced,
                            "loadKey": loadKey,
                            "release": release,
                            "modGlobalCrossRefId": mod_global_cross_ref_id,
                            "uuid": str(uuid.uuid4()),
                            "dataProvider": data_provider,
                            "symbolWithSpecies": allele_record.get('symbol') + " (" + short_species_abbreviation + ")",
                            "symbolTextWithSpecies": symbol_text + " (" + short_species_abbreviation + ")",
                            "symbolText": symbol_text,
                            "alleleDescription": description,
                            "associationType": association_type
                        }
                        alleles_no_constrcut_no_gene.append(allele_no_gene_no_construct_dataset)
            else:
                # Records with no alleleObjectRelations at all also land in the
                # no-gene/no-construct bucket.
                allele_no_gene_no_construct_dataset = {
                    "symbol": allele_record.get('symbol'),
                    "primaryId": allele_record.get('primaryId'),
                    "globalId": global_id,
                    "localId": local_id,
                    "taxonId": allele_record.get('taxonId'),
                    "dataProviders": data_providers,
                    "dateProduced": date_produced,
                    "loadKey": loadKey,
                    "release": release,
                    "modGlobalCrossRefId": mod_global_cross_ref_id,
                    "uuid": str(uuid.uuid4()),
                    "dataProvider": data_provider,
                    "symbolWithSpecies": allele_record.get('symbol') + " (" + short_species_abbreviation + ")",
                    "symbolTextWithSpecies": symbol_text + " (" + short_species_abbreviation + ")",
                    "symbolText": symbol_text,
                    "alleleDescription": description,
                    "associationType": association_type
                }
                alleles_no_constrcut_no_gene.append(allele_no_gene_no_construct_dataset)

            if 'crossReferences' in allele_record:
                for crossRef in allele_record['crossReferences']:
                    crossRefId = crossRef.get('id')
                    local_crossref_id = crossRefId.split(":")[1]
                    prefix = crossRef.get('id').split(":")[0]
                    pages = crossRef.get('pages')

                    # some pages collection have 0 elements
                    if pages is not None and len(pages) > 0:
                        for page in pages:
                            # Only allele/transgene/construct page types are loaded.
                            if page == 'allele' or page == 'allele/references' or page == 'transgene' or page == 'construct' \
                                    or page == 'transgene/references' or page == 'construct/references':
                                mod_global_cross_ref_id = ETLHelper.get_page_complete_url(local_crossref_id,
                                                                                         self.xref_url_map,
                                                                                         prefix,
                                                                                         page)
                                xref = ETLHelper.get_xref_dict(local_crossref_id,
                                                               prefix,
                                                               page,
                                                               page,
                                                               crossRefId,
                                                               mod_global_cross_ref_id,
                                                               crossRefId + page)
                                xref['dataId'] = global_id
                                cross_reference_list.append(xref)

            if 'synonyms' in allele_record:
                for syn in allele_record.get('synonyms'):
                    allele_synonym = {
                        "data_id": allele_record.get('primaryId'),
                        "synonym": syn.strip()
                    }
                    allele_synonyms.append(allele_synonym)

            if 'secondaryIds' in allele_record:
                for secondary_id in allele_record.get('secondaryIds'):
                    allele_secondary_id = {
                        "data_id": allele_record.get('primaryId'),
                        "secondary_id": secondary_id
                    }
                    allele_secondary_ids.append(allele_secondary_id)

            # Flush a full batch; list order must stay in sync with query_list.
            if counter == batch_size:
                yield [alleles_no_construct, alleles_construct_gene, alleles_no_gene,
                       alleles_no_constrcut_no_gene, allele_secondary_ids, allele_synonyms,
                       cross_reference_list]
                alleles_no_construct = []
                alleles_construct_gene = []
                alleles_no_gene = []
                alleles_no_constrcut_no_gene = []
                allele_secondary_ids = []
                allele_synonyms = []
                cross_reference_list = []
                counter = 0

        # Flush the final, partial batch.
        if counter > 0:
            yield [alleles_no_construct, alleles_construct_gene, alleles_no_gene,
                   alleles_no_constrcut_no_gene, allele_secondary_ids, allele_synonyms,
                   cross_reference_list]
class MolecularInteractionETL(ETL):
    """Molecular Interaction ETL.

    Loads PSI-MITAB molecular interaction rows into Neo4j: gene-gene
    INTERACTS_WITH edges plus an InteractionGeneJoin association node with
    links to MI-ontology terms, publications, and cross references.
    """

    logger = logging.getLogger(__name__)

    # Query templates which take params and will be processed later
    # (%s placeholders: commit size, then CSV filename).
    main_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

        MATCH (g1:Gene {primaryKey:row.interactor_A})
        MATCH (g2:Gene {primaryKey:row.interactor_B})

        MATCH (mi:MITerm) WHERE mi.primaryKey = row.detection_method
        MATCH (sdb:MITerm) WHERE sdb.primaryKey = row.source_database
        MATCH (adb:MITerm) WHERE adb.primaryKey = row.aggregation_database
        MATCH (ita:MITerm) WHERE ita.primaryKey = row.interactor_A_type
        MATCH (itb:MITerm) WHERE itb.primaryKey = row.interactor_B_type
        MATCH (ira:MITerm) WHERE ira.primaryKey = row.interactor_A_role
        MATCH (irb:MITerm) WHERE irb.primaryKey = row.interactor_B_role
        MATCH (it:MITerm) WHERE it.primaryKey = row.interaction_type

        //Create the relationship between the two genes.
        CREATE (g1)-[iw:INTERACTS_WITH {uuid:row.uuid}]->(g2)

        //Create the Association node to be used for the object.
        CREATE (oa:Association {primaryKey:row.uuid})
            SET oa :InteractionGeneJoin
            SET oa.joinType = 'molecular_interaction'
        CREATE (g1)-[a1:ASSOCIATION]->(oa)
        CREATE (oa)-[a2:ASSOCIATION]->(g2)

        //Create the publication nodes and link them to the Association node.
        MERGE (pn:Publication {primaryKey:row.pub_med_id})
            ON CREATE SET pn.pubMedUrl = row.pub_med_url,
            pn.pubMedId = row.pub_med_id
        CREATE (oa)-[ev:EVIDENCE]->(pn)

        //Link detection method to the MI ontology.
        CREATE (oa)-[dm:DETECTION_METHOD]->(mi)

        //Link source database to the MI ontology.
        CREATE (oa)-[sd:SOURCE_DATABASE]->(sdb)

        //Link aggregation database to the MI ontology.
        CREATE (oa)-[ad:AGGREGATION_DATABASE]->(adb)

        //Link interactor roles and types to the MI ontology.
        CREATE (oa)-[ita1:INTERACTOR_A_TYPE]->(ita)
        CREATE (oa)-[itb1:INTERACTOR_B_TYPE]->(itb)
        CREATE (oa)-[ira1:INTERACTOR_A_ROLE]->(ira)
        CREATE (oa)-[irb1:INTERACTOR_B_ROLE]->(irb)

        //Link interaction type to the MI ontology.
        CREATE (oa)-[it1:INTERACTION_TYPE]->(it)
        """

    xref_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

        // This needs to be a MERGE below.
        MATCH (o:InteractionGeneJoin :Association) WHERE o.primaryKey = row.reference_uuid
        """ + ETLHelper.get_cypher_xref_text()

    mod_xref_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row

        MATCH (o:Gene {primaryKey:row.dataId}) """ + ETLHelper.get_cypher_xref_text()

    def __init__(self, config):
        """Initialise object.

        :param config: data-type configuration providing file paths,
            commit size, and batch size for this ETL run.
        """
        super().__init__()
        self.data_type_config = config

        # Initialize an instance of ResourceDescriptor for processing external links.
        # self.resource_descriptor_dict = ResourceDescriptorHelper2()

        # Tracking sets/lists for link-out bookkeeping across the whole run.
        self.missed_database_linkouts = set()
        self.successful_database_linkouts = set()
        self.ignored_database_linkouts = set()
        self.successful_mod_interaction_xrefs = []

    def _load_and_process_data(self):
        # filepath = self.data_type_config.get_single_filepath()

        # Temporary fix for 3.0 release.
filepath = 'tmp/alliance_molecular_interactions.tsv' commit_size = self.data_type_config.get_neo4j_commit_size() batch_size = self.data_type_config.get_generator_batch_size() generators = self.get_generators(filepath, batch_size) query_template_list = [ [self.main_query_template, commit_size, "mol_int_data.csv"], [self.xref_query_template, commit_size, "mol_int_xref.csv"], [self.mod_xref_query_template, commit_size, "mol_int_mod_xref.csv"] ] query_and_file_list = self.process_query_params(query_template_list) CSVTransactor.save_file_static(generators, query_and_file_list) Neo4jTransactor.execute_query_batch(query_and_file_list) @staticmethod def populate_genes(): """Populate Genes.""" master_gene_set = set() query = "MATCH (g:Gene) RETURN g.primaryKey" result = Neo4jHelper().run_single_query(query) for record in result: master_gene_set.add(record['g.primaryKey']) return master_gene_set @staticmethod def query_crossreferences(crossref_prefix): """Query Cross References.""" query = """MATCH (g:Gene)-[C:CROSS_REFERENCE]-(cr:CrossReference) WHERE cr.prefix = {parameter} RETURN g.primaryKey, cr.globalCrossRefId""" return Neo4jHelper().run_single_parameter_query(query, crossref_prefix) def populate_crossreference_dictionary(self): """Populate the crossreference dictionary. We're populating a rather large dictionary to use for looking up Alliance genes by their crossreferences. Edit the list below if you'd like to add more crossreferences to the dictionary. The key of the dictionary is the crossreference and the value is the Alliance gene to which it resolves. """ master_crossreference_dictionary = dict() # If additional crossreferences need to be used to find interactors, they can be added here. # Use the crossreference prefix as the dictionary name. # Also add a regex entry to the resolve_identifier function. 
master_crossreference_dictionary['UniProtKB'] = dict() master_crossreference_dictionary['ENSEMBL'] = dict() master_crossreference_dictionary['NCBI_Gene'] = dict() master_crossreference_dictionary['RefSeq'] = dict() for key in master_crossreference_dictionary: self.logger.info('Querying for %s cross references.', key) result = self.query_crossreferences(key) for record in result: cross_ref_record = None # Modify the cross reference ID to match the PSI MITAB format if necessary. # So far, this is just converting 'NCBI_Gene' to 'entrez gene/locuslink'. if record['cr.globalCrossRefId'].startswith('NCBI_Gene'): cross_ref_record_split = record['cr.globalCrossRefId'].split(':')[1] cross_ref_record = 'entrez gene/locuslink:' + cross_ref_record_split else: cross_ref_record = record['cr.globalCrossRefId'] # The crossreference dictionary is a list of genes # linked to a single crossreference. # Append the gene if the crossref dict entry exists. # Otherwise, create a list and append the entry. if cross_ref_record.lower() in master_crossreference_dictionary[key]: master_crossreference_dictionary[key][cross_ref_record.lower()].append(record['g.primaryKey']) else: master_crossreference_dictionary[key][cross_ref_record.lower()] = [] master_crossreference_dictionary[key][cross_ref_record.lower()].append(record['g.primaryKey']) # The ids in PSI-MITAB files are lower case, hence the .lower() used above. return master_crossreference_dictionary def process_interaction_identifier(self, entry, additional_row): """Create cross references for all the external identifiers.""" xref_main_list = [] entries = None # Identifier types on this list DO NOT receive a # cross_ref_complete_url field for external linking. ignored_identifier_database_list = [ # The following entries are not currently required. 
# NOTE(review): continuation of process_interaction_identifier from the previous span.
            'brenda', 'bmrb', 'cell ontology', 'chebi', 'chembl compound', 'efo',
            'flannotator', 'intenz', 'interpro', 'mpidb', 'omim', 'pdbj', 'pmc',
            'pride', 'prints', 'proteomexchange', 'psi-mi', 'pubmed', 'go',
            'reactome', 'refseq', 'tissue list', 'uniprotkb'
        ]

        if '|' in entry:
            entries = entry.split('|')
        else:
            entries = [entry]

        for individual in entries:
            """These links are for the individual interaction identifiers
            and link to the respective database."""
            xref_dict = {}
            page = 'gene/interactions'

            individual_prefix, individual_body, _ = self.etlh.rdh2.split_identifier(individual)
            # Capitalize the prefix to match the YAML
            # and change the prefix if necessary to match the YAML.
            xref_dict['prefix'] = individual_prefix
            xref_dict['localId'] = individual_body

            # Special case for dealing with FlyBase.
            # The identifier link needs to use row 25 from the psi-mitab file.
            # TODO Regex to check for FBig in additional_row?
            if individual.startswith('flybase:FBrf'):
                if '|' in additional_row:
                    individual = additional_row.split('|')[0]
                else:
                    individual = additional_row
                regex_check = re.match('^flybase:FBig\\d{10}$', individual)
                # A missing FBig id is treated as fatal and aborts the loader.
                if regex_check is None:
                    self.logger.critical(
                        """Fatal Error: During special handling of FlyBase molecular interaction links, an FBig ID was not found.""")
                    self.logger.critical('Failed identifier: %s', individual)
                    self.logger.critical('PSI-MITAB row entry: %s', additional_row)
                    sys.exit(-1)

            # TODO Optimize and re-add this error tracking.
            # Ignored databases get no crossRefCompleteUrl; unknown prefixes
            # fall through silently on KeyError.
            if not individual.startswith(tuple(ignored_identifier_database_list)):
                try:
                    individual_url = self.etlh.rdh2.return_url_from_key_value(individual_prefix,
                                                                              individual_body,
                                                                              page)
                    xref_dict['crossRefCompleteUrl'] = individual_url
                except KeyError:
                    pass

            xref_dict['uuid'] = str(uuid.uuid4())
            xref_dict['globalCrossRefId'] = individual
            xref_dict['id'] = individual  # Used for name.
            xref_dict['displayName'] = individual_body
            xref_dict['primaryKey'] = individual
            xref_dict['crossRefType'] = 'interaction'
            xref_dict['page'] = page
            xref_dict['reference_uuid'] = None  # For association interactions (later).

            # Special case for FlyBase as "individual" is not unique in their case.
            # Individual_body needs to be used instead.
            if individual.startswith('flybase'):
                xref_dict['primaryKey'] = individual_body

            xref_main_list.append(xref_dict)

        return xref_main_list

    def add_mod_interaction_links(self, gene_id):
        """Create an XREF linking back to interaction pages at each MOD for a particular gene.

        These links appear at the top of the molecular interactions table
        once per gene page.
        :param gene_id: Alliance gene primary key (e.g. 'MGI:...', 'FB:...').
        :return: a single xref dict carrying dataId for the Gene match.
        """
        xref_dict = {}
        page = 'gene/MODinteractions'

        individual_prefix, individual_body, _ = self.etlh.rdh2.split_identifier(gene_id)
        individual_url = self.etlh.rdh2.return_url_from_identifier(gene_id, page)

        # Exception for MGI: MGI display/global ids keep the full prefixed form.
        if individual_prefix == 'MGI':
            xref_dict['displayName'] = gene_id
            xref_dict['id'] = gene_id
            xref_dict['globalCrossRefId'] = gene_id
            xref_dict['primaryKey'] = gene_id + page
        else:
            xref_dict['displayName'] = individual_body
            xref_dict['id'] = individual_body
            xref_dict['globalCrossRefId'] = individual_body
            xref_dict['primaryKey'] = individual_body + page

        xref_dict['prefix'] = individual_prefix
        xref_dict['localId'] = individual_body
        xref_dict['crossRefCompleteUrl'] = individual_url
        xref_dict['uuid'] = str(uuid.uuid4())
        xref_dict['crossRefType'] = page
        xref_dict['page'] = page
        xref_dict['reference_uuid'] = str(uuid.uuid4())
        # For matching to the gene when creating the xref relationship in Neo.
        xref_dict['dataId'] = gene_id

        # Add the gene_id of the identifier to a global list so we don't create unnecessary xrefs.
# NOTE(review): the first two lines below finish add_mod_interaction_links from
# the previous span.
        self.successful_mod_interaction_xrefs.append(gene_id)

        return xref_dict

    def resolve_identifiers_by_row(self, row, master_gene_set, master_crossreference_dictionary):
        """Resolve Identifiers by Row.

        Try each candidate PSI-MITAB column for interactor A (0, 2, 4, 22)
        and interactor B (1, 3, 5, 23) until one resolves to Alliance
        gene(s).  Returns a (list-or-None, list-or-None) pair.
        """
        interactor_a_rows = [0, 2, 4, 22]
        interactor_b_rows = [1, 3, 5, 23]

        interactor_a_resolved = None
        interactor_b_resolved = None

        for row_entry in interactor_a_rows:
            try:
                # We need to change uniprot/swiss-prot to uniprotkb for interactor a and b.
                # This is the only current prefix adjustment.
                # If we need to do more, we should break this out into a function
                # or small piece of code.
                interactor_a = row[row_entry].replace("uniprot/swiss-prot:", "uniprotkb:")
                interactor_a_resolved = self.resolve_identifier(interactor_a,
                                                                master_gene_set,
                                                                master_crossreference_dictionary)
                if interactor_a_resolved is not None:
                    break
            except IndexError:
                # Biogrid has less rows than other files, continue on IndexErrors.
                continue

        for row_entry in interactor_b_rows:
            try:
                interactor_b = row[row_entry].replace("uniprot/swiss-prot:", "uniprotkb:")
                interactor_b_resolved = self.resolve_identifier(interactor_b,
                                                                master_gene_set,
                                                                master_crossreference_dictionary)
                if interactor_b_resolved is not None:
                    break
            except IndexError:
                # Biogrid has less rows than other files, continue on IndexErrors.
                continue

        return interactor_a_resolved, interactor_b_resolved

    def resolve_identifier(self, row_entry, master_gene_set, master_crossreference_dictionary):  # noqa
        """Resolve Identifier.

        Map one PSI-MITAB identifier field (possibly pipe-separated) to a
        list of Alliance gene primary keys, or None when nothing matches.
        WB/FB ids are checked against master_gene_set directly; everything
        else goes through the crossreference regex list below.
        """
        list_of_crossref_regex_to_search = [
            'uniprotkb:[\\w\\d_-]*$',
            'ensembl:[\\w\\d_-]*$',
            'entrez gene/locuslink:.*',
            'refseq:[\\w\\d_-]*$'
        ]

        # If we're dealing with multiple identifiers separated by a pipe.
        if '|' in row_entry:
            row_entries = row_entry.split('|')
        else:
            row_entries = [row_entry]

        for individual_entry in row_entries:
            # For use in wormbase / flybase lookups.
            # If we run into an IndexError, there's no identifier to resolve and we return False.
            # All valid identifiers in the PSI-MI TAB file should be "splittable".
            try:
                entry_stripped = individual_entry.split(':')[1]
            except IndexError:
                return None

            # Strip uniprot isoform suffixes ('-1' etc.) before matching.
            if individual_entry.startswith('uniprotkb:'):
                individual_entry = individual_entry.split('-')[0]

            prefixed_identifier = None

            # TODO implement regex for WB / FB gene identifiers.
            if entry_stripped.startswith('WB'):
                prefixed_identifier = 'WB:' + entry_stripped
                if prefixed_identifier in master_gene_set:
                    return [prefixed_identifier]  # Always return a list for later processing.
                return None
            # TODO implement regex for WB / FB gene identifiers.
            elif entry_stripped.startswith('FB'):
                prefixed_identifier = 'FB:' + entry_stripped
                if prefixed_identifier in master_gene_set:
                    return [prefixed_identifier]  # Always return a list for later processing.
                return None

            for regex_entry in list_of_crossref_regex_to_search:
                regex_output = re.findall(regex_entry, individual_entry)
                # NOTE(review): re.findall never returns None (it returns a
                # possibly-empty list), so this guard is always true; the inner
                # loop simply does nothing on an empty match list.
                if regex_output is not None:
                    # We might have multiple regex matches.
                    # Search them all against our crossreferences.
                    for regex_match in regex_output:
                        identifier = regex_match
                        for crossreference_type in master_crossreference_dictionary.keys():
                            # Using lowercase in the identifier to be consistent
                            # with Alliance lowercase identifiers.
                            if identifier.lower() in \
                                    master_crossreference_dictionary[crossreference_type]:
                                # Return the corresponding Alliance gene(s).
                                return master_crossreference_dictionary[crossreference_type][identifier.lower()]

        # If we can't resolve any of the crossReferences, return None
        # print('Could not resolve identifiers.')
        # print(row_entries)
        return None

    def get_generators(self, filepath, batch_size):  # noqa
        """Get Generators.

        Stream the PSI-MITAB TSV at filepath and yield
        (interaction rows, interaction xrefs, MOD-link xrefs) triples
        in batches of batch_size — order matches query_template_list in
        _load_and_process_data.
        """
        list_to_yield = []
        xref_list_to_yield = []
        mod_xref_list_to_yield = []

        # TODO Taxon species needs to be pulled out into a standalone
        # module to be used by other scripts.
        # TODO External configuration script for these types of filters?
        # Not a fan of hard-coding.

        # Populate our master dictionary for resolving cross references.
# NOTE(review): continuation of MolecularInteractionETL.get_generators from the
# previous span.
        master_crossreference_dictionary = self.populate_crossreference_dictionary()
        self.logger.info('Obtained the following number of cross references from Neo4j:')
        for entry in master_crossreference_dictionary:
            self.logger.info('%s: %s', entry, len(master_crossreference_dictionary[entry]))

        # Populate our master gene set for filtering Alliance genes.
        master_gene_set = self.populate_genes()
        self.logger.info('Obtained %s gene primary ids from Neo4j.', len(master_gene_set))

        resolved_a_b_count = 0
        unresolved_a_b_count = 0
        total_interactions_loaded_count = 0
        unresolved_publication_count = 0

        # Used for debugging.
        # unresolved_entries = []
        # unresolved_crossref_set = set()

        self.logger.info('Attempting to open %s', filepath)
        with open(filepath, 'r', encoding='utf-8') as tsvin:
            tsvin = csv.reader(tsvin, delimiter='\t')
            counter = 0
            total_counter = 0
            for row in tsvin:
                counter += 1
                total_counter += 1
                if total_counter % 100000 == 0:
                    self.logger.info('Processing row %s.', total_counter)

                # Skip commented rows.
                if row[0].startswith('#'):
                    continue

                taxon_id_1 = row[9]
                taxon_id_2 = row[10]

                # After we pass all our filtering / continue opportunities,
                # we start working with the variables.
                taxon_id_1_re = re.search(r'\d+', taxon_id_1)
                taxon_id_1_to_load = 'NCBITaxon:' + taxon_id_1_re.group(0)
                taxon_id_2_to_load = None
                if taxon_id_2 != '-':
                    taxon_id_2_re = re.search(r'\d+', taxon_id_2)
                    taxon_id_2_to_load = 'NCBITaxon:' + taxon_id_2_re.group(0)
                else:
                    taxon_id_2_to_load = taxon_id_1_to_load  # self interaction

                # Column 25 (row[24]) exists only in some source files.
                try:
                    # Interactor ID for the UI table
                    identifier_linkout_list = self.process_interaction_identifier(row[13], row[24])
                except IndexError:
                    # Interactor ID for the UI table
                    identifier_linkout_list = self.process_interaction_identifier(row[13], None)

                source_database = None

                # grab the MI identifier between two quotes ""
                source_database = re.findall(r'"([^"]*)"', row[12])[0]
                # database_linkout_set.add(source_database)

                aggregation_database = 'MI:0670'  # IMEx

                if source_database == 'MI:0478':  # FlyBase
                    aggregation_database = 'MI:0478'
                elif source_database == 'MI:0487':  # WormBase
                    aggregation_database = 'MI:0487'
                elif source_database == 'MI:0463':  # BioGRID
                    aggregation_database = 'MI:0463'

                detection_method = 'MI:0686'  # Default to unspecified.
                try:
                    # grab the MI identifier between two quotes ""
                    detection_method = re.findall(r'"([^"]*)"', row[6])[0]
                except IndexError:
                    pass  # Default to unspecified, see above.

                # TODO Replace this publication work with a service.
                # Re-think publication implementation in Neo4j.
                publication = None
                publication_url = None

                # Rows with no resolvable pubmed/DOI publication are skipped
                # entirely (counted in unresolved_publication_count).
                if row[8] != '-':
                    # Check for pubmed publication.
                    publication_re = re.search(r'pubmed:\d+', row[8])
                    if publication_re is not None:
                        publication = publication_re.group(0)  # matching bit
                        publication = publication.replace('pubmed', 'PMID')
                        publication_url = self.etlh.rdh2.return_url_from_identifier(publication)
                    elif publication_re is None:
                        # If we can't find a pubmed publication, check for DOI.
                        # e.g. DOI:10.1101/2020.03.31.019216
                        publication_re = re.search(r'^(DOI\:)?\d{2}\.\d{4}.*$', row[8])
                        if publication_re is not None:
                            publication = publication_re.group(0)
                            publication = publication.replace('DOI', 'doi')
                            publication_url = self.etlh.rdh2.return_url_from_identifier(publication)
                        else:
                            unresolved_publication_count += 1
                            continue
                else:
                    unresolved_publication_count += 1
                    continue

                # Other hardcoded values to be used for now.
                interactor_a_role = 'MI:0499'  # Default to unspecified.
                interactor_b_role = 'MI:0499'  # Default to unspecified.
                interactor_a_type = 'MI:0499'  # Default to unspecified.
                interactor_b_type = 'MI:0499'  # Default to unspecified.

                try:
                    interactor_a_role = re.findall(r'"([^"]*)"', row[18])[0]
                except IndexError:
                    pass  # Default to unspecified, see above.

                try:
                    interactor_b_role = re.findall(r'"([^"]*)"', row[19])[0]
                except IndexError:
                    pass  # Default to unspecified, see above.

                try:
                    interactor_a_type = re.findall(r'"([^"]*)"', row[20])[0]
                except IndexError:
                    pass  # Default to unspecified, see above.

                try:
                    interactor_b_type = re.findall(r'"([^"]*)"', row[21])[0]
                except IndexError:
                    pass  # Default to unspecified, see above.

                interaction_type = None
                interaction_type = re.findall(r'"([^"]*)"', row[11])[0]

                interactor_a_resolved = None
                interactor_b_resolved = None

                interactor_a_resolved, interactor_b_resolved = self.resolve_identifiers_by_row(
                    row,
                    master_gene_set,
                    master_crossreference_dictionary)

                if interactor_a_resolved is None or interactor_b_resolved is None:
                    unresolved_a_b_count += 1  # Tracking unresolved identifiers.
                    # Uncomment the line below for debugging.
                    # unresolved_entries.append([row[0], interactor_a_resolved, row[1], interactor_b_resolved, row[8]])
                    # if interactor_a_resolved is None:
                    #     unresolved_crossref_set.add(row[0])
                    # if interactor_b_resolved is None:
                    #     unresolved_crossref_set.add(row[1])
                    continue  # Skip this entry.

                mol_int_dataset = {
                    'interactor_A': None,
                    'interactor_B': None,
                    'interactor_A_type': interactor_a_type,
                    'interactor_B_type': interactor_b_type,
                    'interactor_A_role': interactor_a_role,
                    'interactor_B_role': interactor_b_role,
                    'interaction_type': interaction_type,
                    'taxon_id_1': taxon_id_1_to_load,
                    'taxon_id_2': taxon_id_2_to_load,
                    'detection_method': detection_method,
                    'pub_med_id': publication,
                    'pub_med_url': publication_url,
                    'uuid': None,
                    'source_database': source_database,
                    'aggregation_database': aggregation_database
                }

                # Remove possible duplicates from interactor lists.
                interactor_a_resolved_no_dupes = list(set(interactor_a_resolved))
                interactor_b_resolved_no_dupes = list(set(interactor_b_resolved))

                # Get every possible combination of interactor A x interactor B
                # (if multiple ids resulted from resolving the identifier.)
                int_combos = list(itertools.product(interactor_a_resolved_no_dupes,
                                                    interactor_b_resolved_no_dupes))

                # Update the dictionary with every possible combination of
                # interactor A x interactor B.
                list_of_mol_int_dataset = [dict(mol_int_dataset,
                                                interactor_A=x,
                                                interactor_B=y,
                                                uuid=str(uuid.uuid4())) for x, y in int_combos]

                # Tracking successfully loaded identifiers.
                total_interactions_loaded_count += len(list_of_mol_int_dataset)

                # Tracking successfully resolved identifiers.
                resolved_a_b_count += 1

                # We need to also create new crossreference dicts for every
                # new possible interaction combination.
                new_identifier_linkout_list = []
                for dataset_entry in list_of_mol_int_dataset:
                    for identifier_linkout in identifier_linkout_list:
                        new_identifier_linkout_list.append(
                            dict(identifier_linkout, reference_uuid=dataset_entry['uuid']))

                # Create dictionaries for xrefs from Alliance genes
                # to MOD interaction sections of gene reports.
                for primary_gene_to_link in interactor_a_resolved_no_dupes:
                    # We have the potential for numerous duplicate xrefs.
                    # Check whether we've made this xref previously by looking in a list.
                    # Should cut down loading time for Neo4j significantly.
                    # Hopefully the lookup is not too long -- this should be refined if it's slow.
                    # Ignore ZFIN interaction pages and REFSEQ.
                    if not primary_gene_to_link.startswith('ZFIN') and not primary_gene_to_link.startswith('RefSeq'):
                        if primary_gene_to_link not in self.successful_mod_interaction_xrefs:
                            mod_xref_dataset = self.add_mod_interaction_links(primary_gene_to_link)
                            mod_xref_list_to_yield.append(mod_xref_dataset)

                # Establishes the number of entries to yield (return) at a time.
                xref_list_to_yield.extend(new_identifier_linkout_list)
                list_to_yield.extend(list_of_mol_int_dataset)

                if counter == batch_size:
                    counter = 0
                    yield list_to_yield, xref_list_to_yield, mod_xref_list_to_yield
                    list_to_yield = []
                    xref_list_to_yield = []
                    mod_xref_list_to_yield = []

            # Flush the final, partial batch.
            if counter > 0:
                yield list_to_yield, xref_list_to_yield, mod_xref_list_to_yield

        # TODO Clean up the set output.
        # for entry in unresolved_entries:
        #     self.logger.info(*entry)
        # self.logger.info('A set of unique unresolvable cross references:')
        # for unique_entry in unresolved_crossref_set:
        #     self.logger.info(unique_entry)

        self.logger.info('Resolved identifiers for %s PSI-MITAB interactions.', resolved_a_b_count)
        self.logger.info('Prepared to load %s total interactions %s.',
                         total_interactions_loaded_count,
                         '(accounting for multiple possible identifier resolutions)')
        self.logger.info('Note: Interactions missing valid publications will be skipped, even if their identifiers'
                         ' resolve correctly.')
        self.logger.info('Could not resolve [and subsequently will not load] '
                         '{} interactions due to missing publications.'.format(unresolved_publication_count))
        self.logger.info('Could not resolve [and subsequently will not load] %s interactions due to unresolved'
                         ' identifiers.', unresolved_a_b_count)