def _process_sub_type(self, sub_type, query_tracking_list):
    self.logger.info("Loading GOAnnot Data: %s", sub_type.get_data_provider())
    filepath = sub_type.get_file_to_download()
    filepath = os.path.join('tmp/', filepath)
    self.logger.info("goannot path: %s", filepath)
    file = open(filepath, "r")
    self.logger.info("Finished Loading GOAnnot Data: %s", sub_type.get_data_provider())

    # This order is the same as the lists yielded from the get_generators function.
    # A list of tuples.
    commit_size = self.data_type_config.get_neo4j_commit_size()
    batch_size = self.data_type_config.get_generator_batch_size()

    generators = self.get_generators(
        file,
        ETLHelper.go_annot_prefix_lookup(sub_type.get_data_provider()),
        batch_size)

    query_template_list = [
        [self.main_query_template, commit_size,
         "go_annot_" + sub_type.get_data_provider() + ".csv"],
    ]

    query_and_file_list = self.process_query_params(query_template_list)
    CSVTransactor.save_file_static(generators, query_and_file_list)

    for item in query_and_file_list:
        query_tracking_list.append(item)
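# Throughout this codebase each query_template_list entry is a
# [template, commit_size, csv_filename] triple; a comment further below notes
# "(template, param1, params2) others will be ignored". A minimal,
# illustrative entry (the filename is made up):
#
#   [self.main_query_template, 10000, "go_annot_FB.csv"]
#
# The template's two %s slots are filled with the commit size and the CSV
# file name before the query is handed to Neo4j.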
def get_generators(self, expression_atlas_gene_pages, data_provider, batch_size):
    """Get Generators."""
    return_set = Neo4jHelper.run_single_parameter_query(
        ExpressionAtlasETL.get_genes_with_expression_atlas_links_query,
        list(expression_atlas_gene_pages.keys()))

    counter = 0
    cross_reference_list = []
    for record in return_set:
        counter += 1
        cross_reference = ETLHelper.get_xref_dict(
            record["g.primaryKey"].split(":")[1],
            "ExpressionAtlas_gene",
            "gene/expression-atlas",
            "gene/expressionAtlas",
            record["g.modLocalId"],
            expression_atlas_gene_pages[record["g.primaryKey"].lower()],
            data_provider + ":" + record["g.modLocalId"] + "gene/expression-atlas")
        cross_reference["genePrimaryKey"] = record["g.primaryKey"]
        cross_reference_list.append(cross_reference)
        if counter > batch_size:
            yield [cross_reference_list]
            counter = 0
            cross_reference_list = []

    if counter > 0:
        yield [cross_reference_list]
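# The accumulate/yield/reset pattern above recurs in every get_generators in
# this repo: collect rows, yield a list-of-lists once the batch fills, then
# flush the final partial batch. A minimal self-contained sketch of the same
# pattern (the names `records` and `batch` are illustrative, not from the
# codebase):
def batched(records, batch_size):
    """Yield lists of at most batch_size items from any iterable."""
    batch = []
    for item in records:
        batch.append(item)
        if len(batch) == batch_size:
            yield [batch]  # wrapped in a list to mirror the ETL contract
            batch = []
    if batch:  # flush the final partial batch
        yield [batch]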
def crossref_process(self, record, global_id, cross_reference_list):
    """Get xref."""
    valid_pages = ['allele', 'allele/references', 'transgene', 'construct',
                   'transgene/references', 'construct/references']
    if 'crossReferences' not in record:
        return
    for cross_ref in record['crossReferences']:
        cross_ref_id = cross_ref.get('id')
        local_crossref_id = cross_ref_id.split(":")[1]
        prefix = cross_ref_id.split(":")[0]
        pages = cross_ref.get('pages')

        # some pages collections have 0 elements
        if pages is not None and len(pages) > 0:
            for page in pages:
                if page in valid_pages:
                    mod_global_cross_ref_id = self.etlh.rdh2.return_url_from_key_value(
                        prefix, local_crossref_id, page)
                    xref = ETLHelper.get_xref_dict(
                        local_crossref_id, prefix, page, page, cross_ref_id,
                        mod_global_cross_ref_id, cross_ref_id + page)
                    xref['dataId'] = global_id
                    cross_reference_list.append(xref)
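# All of these xref helpers split a CURIE on ':' to separate the prefix from
# the local id. A tiny standalone illustration (the identifier is made up):
curie = "FB:FBtp0000074"
prefix = curie.split(":")[0]    # "FB"
local_id = curie.split(":")[1]  # "FBtp0000074"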
def data_providers_process(self, data):
    """Get data providers.

    Creates 4 attributes.
    data_provider: provider name/symbol
    data_providers: list of providers
    data_provider_pages: pages
    data_provider_cross_ref_set: list of xref dicts
    """
    data_provider_object = data['metaData']['dataProvider']
    data_provider_cross_ref = data_provider_object.get('crossReference')
    self.data_provider = data_provider_cross_ref.get('id')
    self.data_provider_pages = data_provider_cross_ref.get('pages')
    self.data_providers = []
    self.data_provider_cross_ref_set = []

    if self.data_provider_pages is None:
        return
    for data_provider_page in self.data_provider_pages:
        cross_ref_complete_url = self.etlh.rdh2.return_url_from_key_value(
            self.data_provider, self.data_provider, alt_page=data_provider_page)
        self.data_provider_cross_ref_set.append(
            ETLHelper.get_xref_dict(
                self.data_provider, self.data_provider, data_provider_page,
                data_provider_page, self.data_provider, cross_ref_complete_url,
                self.data_provider + data_provider_page))
        self.data_providers.append(self.data_provider)
    self.logger.info("data provider: %s", self.data_provider)
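# ETLHelper.get_xref_dict is always called in this module with the same seven
# positional arguments. A hedged reading of that signature, inferred purely
# from the call sites above (the parameter names here are assumptions, not
# the helper's actual definition):
#
#   get_xref_dict(local_id, prefix, cross_ref_type, page,
#                 display_name, cross_ref_complete_url, primary_key)
#
# It returns a plain dict that callers then extend with extra keys such as
# 'dataId', 'oid', or 'genePrimaryKey' before the rows are written to CSV.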
def _load_and_process_data(self):
    for sub_type in self.data_type_config.get_sub_type_objects():
        species_encoded = urllib.parse.quote_plus(
            ETLHelper.species_lookup_by_data_provider(sub_type.get_data_provider()))

        commit_size = self.data_type_config.get_neo4j_commit_size()
        # batch_size = self.data_type_config.get_generator_batch_size()
        batch_size = 100000

        generators = self.get_generators(sub_type, batch_size, species_encoded)

        query_template_list = [
            [self.geo_xref_query_template, commit_size,
             "geo_xref_data_" + sub_type.get_data_provider() + ".csv"],
        ]

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
def ortho_xrefs(self, o_xrefs, ident, xrefs):
    """Generate xrefs for orthos."""
    if o_xrefs is None:
        return

    if isinstance(o_xrefs, list):
        for xref_id_dict in o_xrefs:
            xref_id = xref_id_dict["val"]
            if ":" in xref_id:
                local_id = xref_id.split(":")[1].strip()
                prefix = xref_id.split(":")[0].strip()
                complete_url = self.etlh.get_complete_url_ont(local_id, xref_id)
                generated_xref = ETLHelper.get_xref_dict(
                    local_id, prefix,
                    "ontology_provided_cross_reference",
                    "ontology_provided_cross_reference",
                    xref_id, complete_url,
                    xref_id + "ontology_provided_cross_reference")
                generated_xref["oid"] = ident
                xrefs.append(generated_xref)
    else:
        self.logger.critical("o_xrefs is not a list but a '%s'", type(o_xrefs))
        if ":" in o_xrefs:  # o_xrefs is a str with ":" in it.
            local_id = o_xrefs.split(":")[1].strip()
            prefix = o_xrefs.split(":")[0].strip()
            complete_url = self.etlh.get_complete_url_ont(local_id, o_xrefs)
            generated_xref = ETLHelper.get_xref_dict(
                local_id, prefix,
                "ontology_provided_cross_reference",
                "ontology_provided_cross_reference",
                o_xrefs, complete_url, o_xrefs)
            generated_xref["oid"] = ident
            xrefs.append(generated_xref)
def get_generators(self, sub_type, batch_size, species_encoded):
    """Get Generators."""
    entrez_ids = []

    geo_data_file_contents = Path(sub_type.get_filepath()).read_text()
    # Round-trip through JSON to turn xmltodict's OrderedDicts into plain dicts.
    geo_data = json.loads(json.dumps(xmltodict.parse(geo_data_file_contents)))
    for efetch_value in dict(geo_data.items()).values():
        # IdList is a value returned from the efetch XML spec;
        # within IdList there is another map with "Id" as the key
        # and the entrez local ids as a list value.
        for sub_map_key, sub_map_value in efetch_value.items():
            if sub_map_key == 'IdList':
                for id_list in dict(sub_map_value.items()).values():
                    for entrez_id in id_list:
                        self.logger.debug("here is the entrez id: %s", entrez_id)
                        entrez_ids.append("NCBI_Gene:" + entrez_id)

    geo_data_list = []
    return_set = Neo4jHelper.run_single_parameter_query(
        self.gene_crossref_query_template, entrez_ids)

    for record in return_set:
        gene_primary_key = record["g.primaryKey"]
        mod_local_id = record["g.modLocalId"]
        global_cross_ref_id = record["cr.globalCrossRefId"]
        geo_xref = ETLHelper.get_xref_dict(
            global_cross_ref_id.split(":")[1],
            "NCBI_Gene",
            "gene/other_expression",
            "gene/other_expression",
            "GEO",
            "https://www.ncbi.nlm.nih.gov/sites/entrez?"
            + "Db=geoprofiles"
            + "&DbFrom=gene"
            + "&Cmd=Link"
            + "&LinkName=gene_geoprofiles"
            + "&LinkReadableName=GEO%20Profiles"
            + "&IdsFromResult="
            + global_cross_ref_id.split(":")[1],
            global_cross_ref_id + "gene/other_expression")

        geo_xref["genePrimaryKey"] = gene_primary_key
        geo_xref["modLocalId"] = mod_local_id
        geo_data_list.append(geo_xref)

    yield [geo_data_list]
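# xmltodict.parse turns the NCBI efetch/esearch XML into nested dicts, and
# the loop above then digs out IdList -> Id. A small self-contained
# illustration with a made-up two-record payload (the real file comes from
# NCBI GEO):
import xmltodict

sample_xml = """
<eSearchResult>
  <IdList>
    <Id>100</Id>
    <Id>101</Id>
  </IdList>
</eSearchResult>
"""
parsed = xmltodict.parse(sample_xml)
# With more than one <Id> element, "Id" maps to a list of strings;
# with exactly one, it would be a single string instead.
ids = parsed["eSearchResult"]["IdList"]["Id"]
entrez_ids = ["NCBI_Gene:" + entrez_id for entrez_id in ids]
# entrez_ids == ["NCBI_Gene:100", "NCBI_Gene:101"]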
def save_descriptions_report_files(self, data_provider, json_desc_writer,
                                   context_info, gd_data_manager):
    """Save Descriptions Report Files."""
    release_version = ".".join(context_info.env["ALLIANCE_RELEASE"].split(".")[0:2])
    json_desc_writer.overall_properties.species = data_provider
    json_desc_writer.overall_properties.release_version = release_version
    json_desc_writer.overall_properties.date = self.cur_date
    file_name = self.cur_date + "_" + data_provider
    file_path = os.path.join("tmp", file_name)
    json_desc_writer.write_json(file_path=file_path + ".json",
                                pretty=True,
                                include_single_gene_stats=True,
                                data_manager=gd_data_manager)
    json_desc_writer.write_plain_text(file_path=file_path + ".txt")
    readme = ("This file contains the following fields: gene ID, gene name, and gene "
              "description. The gene descriptions are generated by an algorithm developed "
              "by the Alliance that uses highly structured gene data such as associations "
              "to various ontology terms (e.g., Gene Ontology terms) and the Alliance "
              "strict orthology set. The original set of ontology terms that a gene is "
              "annotated to may have been trimmed to an ancestor term in the ontology, in "
              "order to balance readability with the amount of information in the "
              "description. The complete set of annotations to any gene in this file may "
              "be found in the relevant data tables on the Alliance gene page.")
    species = ETLHelper.species_lookup_by_data_provider(data_provider)
    taxon_id = ETLHelper.get_taxon_from_mod(data_provider)
    header = create_header(file_type='Gene Descriptions',
                           database_version=context_info.env["ALLIANCE_RELEASE"],
                           data_format='txt',
                           readme=readme,
                           species=species,
                           taxon_ids='# TaxonIDs:NCBITaxon:' + taxon_id)
    header = "\n".join([line.strip() for line in header.splitlines()
                        if len(line.strip()) != 0])
    self.add_header_to_file(file_path=file_path + ".txt", header=header)
    json_desc_writer.write_tsv(file_path=file_path + ".tsv")
    header = create_header(file_type='Gene Descriptions',
                           database_version=context_info.env["ALLIANCE_RELEASE"],
                           data_format='tsv',
                           readme=readme,
                           species=species,
                           taxon_ids='# TaxonIDs:NCBITaxon:' + taxon_id)
    header = "\n".join([line.strip() for line in header.splitlines()
                        if len(line.strip()) != 0])
    self.add_header_to_file(file_path=file_path + ".tsv", header=header)

    if context_info.env["GENERATE_REPORTS"]:
        self.upload_files_to_fms(file_path, context_info, data_provider, self.logger)
def process_pages(self, dp, xrefs, pages):
    """Process pages to get xrefs."""
    annotation_type = dp.get('type')
    xref = dp.get('crossReference')
    cross_ref_id = xref.get('id')
    if ":" in cross_ref_id:
        local_crossref_id = cross_ref_id.split(":")[1]
        prefix = cross_ref_id.split(":")[0]
    else:
        local_crossref_id = ""
        prefix = cross_ref_id

    if annotation_type is None:
        annotation_type = 'curated'

    for page in pages:
        if (self.data_provider == 'RGD' or self.data_provider == 'HUMAN') and prefix == 'DOID':
            display_name = 'RGD'
        elif (self.data_provider == 'RGD' or self.data_provider == 'HUMAN') and prefix == 'OMIM':
            display_name = 'OMIM'
        else:
            display_name = cross_ref_id.split(":")[0]
            if display_name == 'DOID':
                display_name = self.data_provider

        mod_global_cross_ref_url = self.etlh.rdh2.return_url_from_key_value(
            prefix, local_crossref_id, page)
        passing_xref = ETLHelper.get_xref_dict(
            local_crossref_id, prefix, page, page, display_name,
            mod_global_cross_ref_url, cross_ref_id + page + annotation_type)
        passing_xref['dataId'] = self.disease_unique_key

        if 'loaded' in annotation_type:
            passing_xref['loadedDB'] = 'true'
            passing_xref['curatedDB'] = 'false'
        else:
            passing_xref['curatedDB'] = 'true'
            passing_xref['loadedDB'] = 'false'

        xrefs.append(passing_xref)
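# The display-name branches above, extracted into a small pure function for
# readability. This is illustrative only; it mirrors the logic in
# process_pages but is not part of the codebase:
def display_name_for(data_provider, prefix, cross_ref_id):
    """Return the display name process_pages would pick for one xref."""
    if data_provider in ('RGD', 'HUMAN') and prefix == 'DOID':
        return 'RGD'
    if data_provider in ('RGD', 'HUMAN') and prefix == 'OMIM':
        return 'OMIM'
    name = cross_ref_id.split(":")[0]
    return data_provider if name == 'DOID' else name

# display_name_for('HUMAN', 'OMIM', 'OMIM:168600') -> 'OMIM'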
def get_generators(self, sub_type, batch_size, species_encoded):
    """Get Generators."""
    entrez_ids = []

    geo_data_file_contents = Path(sub_type.get_filepath()).read_text()
    geo_data = json.loads(json.dumps(xmltodict.parse(geo_data_file_contents)))
    for efetch_value in dict(geo_data.items()).values():
        # IdList is a value returned from the efetch XML spec;
        # within IdList there is another map with "Id" as the key
        # and the entrez local ids as a list value.
        for sub_map_key, sub_map_value in efetch_value.items():
            if sub_map_key == 'IdList':
                for id_list in dict(sub_map_value.items()).values():
                    for entrez_id in id_list:
                        self.logger.debug("here is the entrez id: %s", entrez_id)
                        entrez_ids.append("NCBI_Gene:" + entrez_id)

    geo_data_list = []
    return_set = Neo4jHelper.run_single_parameter_query(
        self.gene_crossref_query_template, entrez_ids)

    for record in return_set:
        gene_primary_key = record["g.primaryKey"]
        mod_local_id = record["g.modLocalId"]
        global_cross_ref_id = record["cr.globalCrossRefId"]
        url = self.etlh.rdh2.return_url_from_key_value(
            'GEO', global_cross_ref_id.split(":")[1], 'entrezgene')
        geo_xref = ETLHelper.get_xref_dict(
            global_cross_ref_id.split(":")[1],
            "NCBI_Gene",
            "gene/other_expression",
            "gene/other_expression",
            "GEO",
            url,
            global_cross_ref_id + "gene/other_expression")

        geo_xref["genePrimaryKey"] = gene_primary_key
        geo_xref["modLocalId"] = mod_local_id
        geo_data_list.append(geo_xref)

    yield [geo_data_list]
def xref_process(self, construct_record, cross_reference_list):
    """Process the xrefs."""
    global_id = construct_record['primaryId']
    if 'crossReferences' not in construct_record:
        return
    for cross_ref in construct_record.get('crossReferences'):
        cross_ref_id = cross_ref.get('id')
        local_crossref_id = cross_ref_id.split(":")[1]
        prefix = cross_ref_id.split(":")[0]
        pages = cross_ref.get('pages')

        # some pages collections have 0 elements
        if pages is None or len(pages) == 0:
            continue
        for page in pages:
            if page == 'construct':
                mod_global_cross_ref_id = self.etlh.rdh2.return_url_from_key_value(
                    prefix, local_crossref_id, page)
                xref = ETLHelper.get_xref_dict(
                    local_crossref_id, prefix, page, page, cross_ref_id,
                    mod_global_cross_ref_id, cross_ref_id + page)
                xref['dataId'] = global_id
                cross_reference_list.append(xref)
def xref_process(self, basic_genetic_entity, cross_references, urls):  # noqa
    """Process xrefs."""
    primary_id = basic_genetic_entity.get('primaryId')
    global_id = basic_genetic_entity.get('primaryId')
    local_id = global_id.split(":")[1]
    taxon_id = basic_genetic_entity.get("taxonId")
    if 'crossReferences' not in basic_genetic_entity:
        return
    for cross_ref in basic_genetic_entity.get('crossReferences'):
        if ':' not in cross_ref.get('id'):
            continue
        cross_ref_id = cross_ref.get('id')
        local_cross_ref_id = cross_ref_id.split(":")[1]
        prefix = cross_ref_id.split(":")[0]
        pages = cross_ref.get('pages')
        global_xref_id = cross_ref.get('id')
        display_name = global_xref_id

        # some pages collections have 0 elements
        if pages is not None and len(pages) > 0:
            for page in pages:
                display_name = ""
                cross_ref_complete_url = self.etlh.rdh2.return_url_from_key_value(
                    prefix, local_cross_ref_id, page)

                if page == 'gene/expression_images':
                    cross_ref_complete_url = self.etlh.rdh2.return_url_from_key_value(
                        prefix, local_cross_ref_id, page)
                elif page == 'gene':
                    urls['mod_cross_reference_complete_url'] = \
                        self.etlh.rdh2.return_url_from_key_value(
                            prefix, local_cross_ref_id, page)
                    urls['genetic_entity_external_url'] = \
                        self.etlh.rdh2.return_url_from_key_value(
                            prefix, local_cross_ref_id, page)

                if page == 'gene/references':
                    urls['gene_literature_url'] = \
                        self.etlh.rdh2.return_url_from_key_value(
                            prefix, local_cross_ref_id, page)

                if page == 'gene/spell':
                    display_name = 'Serial Patterns of Expression Levels Locator (SPELL)'

                # TODO: fix generic_cross_reference in SGD, RGD
                if page == 'generic_cross_reference':
                    cross_ref_complete_url = self.etlh.get_no_page_complete_url(
                        local_cross_ref_id, prefix, primary_id)

                # TODO: fix gene/disease xrefs for SGD once
                # resourceDescriptor change in develop
                # makes its way to the release branch.
                if page == 'gene/disease' and taxon_id == 'NCBITaxon:559292':
                    cross_ref_complete_url = self.etlh.rdh2.return_url_from_key_value(
                        'SGD', local_id, page)

                xref_map = ETLHelper.get_xref_dict(
                    local_cross_ref_id, prefix, page, page, display_name,
                    cross_ref_complete_url, global_xref_id + page)
                xref_map['dataId'] = primary_id
                cross_references.append(xref_map)
        else:
            if prefix == 'PANTHER':
                cross_ref_primary_id = cross_ref.get('id') + '_' + primary_id
                cross_ref_complete_url = self.etlh.get_no_page_complete_url(
                    local_cross_ref_id, prefix, primary_id)
                page = "gene/panther"
            elif prefix == 'RGD':
                cross_ref_primary_id = cross_ref.get('id')
                cross_ref_complete_url = self.etlh.rdh2.return_url_from_key_value(
                    'RGD', local_cross_ref_id)
                page = "generic_cross_reference"
            else:
                cross_ref_primary_id = cross_ref.get('id')
                cross_ref_complete_url = self.etlh.get_no_page_complete_url(
                    local_cross_ref_id, prefix, primary_id)
                page = "generic_cross_reference"

            xref_map = ETLHelper.get_xref_dict(
                local_cross_ref_id, prefix, page, page, display_name,
                cross_ref_complete_url, cross_ref_primary_id + page)
            xref_map['dataId'] = primary_id
            cross_references.append(xref_map)
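# For orientation, a minimal crossReferences entry as consumed above. The
# values are made up; the keys match what the code reads:
#
#   {"id": "ZFIN:ZDB-GENE-990415-8",            # CURIE: prefix ':' local id
#    "pages": ["gene", "gene/expression_images"]}
#
# An entry whose "pages" is absent or empty falls through to the else branch,
# where PANTHER and RGD get special handling and everything else becomes a
# generic_cross_reference.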
class HTPMetaDatasetSampleETL(ETL):

    htp_dataset_sample_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM 'file:///%s' AS row
        MATCH (o:OBITerm {primaryKey:row.sampleType})
        MATCH (s:Species {primaryKey: row.taxonId})
        MATCH (a:MMOTerm {primaryKey: row.assayType})
        MERGE (ds:HTPDatasetSample {primaryKey:row.datasetSampleId})
            ON CREATE SET ds.dateAssigned = row.dateAssigned,
                ds.abundance = row.abundance,
                ds.sex = row.sex,
                ds.notes = row.notes,
                //ds.biosampleText = row.biosampleText,
                ds.sequencingFormat = row.sequencingFormat,
                ds.title = row.sampleTitle,
                ds.sampleAge = row.sampleAge
        MERGE (ds)-[dssp:FROM_SPECIES]-(s)
        //MERGE (ds)-[dsat:ASSAY_TYPE]-(a)
        //MERGE (ds)-[dsst:SAMPLE_TYPE]-(o)
    """

    htp_dataset_sample_agm_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM 'file:///%s' AS row
        MATCH (ds:HTPDatasetSample {primaryKey:row.datasetSampleId})
        MATCH (agm:AffectedGenomicModel {primaryKey:row.biosampleId})
        MERGE (agm)-[agmds:ASSOCIATION]-(ds)
    """

    htp_dataset_sample_agmtext_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM 'file:///%s' AS row
        MATCH (ds:HTPDatasetSample {primaryKey:row.datasetSampleId})
        MERGE (agm:AffectedGenomicModel {primaryKey:row.biosampleText})
        MERGE (agm)-[agmds:ASSOCIATION]-(ds)
    """

    htp_bio_entity_expression_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM 'file:///%s' AS row
        MATCH (dss:HTPDatasetSample {primaryKey:row.datasetSampleId})
        MERGE (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})
            ON CREATE SET e.whereExpressedStatement = row.whereExpressedStatement
        MERGE (dss)-[dsdss:STRUCTURE_SAMPLED]-(e)
    """

    htp_stages_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM 'file:///%s' AS row
        MATCH (dss:HTPDatasetSample {primaryKey:row.datasetSampleId})
        MATCH (st:Stage {primaryKey:row.stageName})
        MERGE (dss)-[eotcctq:SAMPLED_DURING]-(st)
    """

    htp_dataset_join_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM 'file:///%s' AS row
        MATCH (ds:HTPDataset {primaryKey:row.datasetId})
        MATCH (dss:HTPDatasetSample {primaryKey:row.datasetSampleId})
        MERGE (ds)-[dsdss:ASSOCIATION]-(dss)
    """

    htp_secondaryIds_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM 'file:///%s' AS row
        MATCH (dss:HTPDatasetSample {primaryKey: row.datasetSampleId})
        MERGE (sec:SecondaryId:Identifier {primaryKey:row.secondaryId})
            ON CREATE SET sec.name = row.secondaryId
        MERGE (dss)<-[aka:ALSO_KNOWN_AS]-(sec)
    """

    ao_substructures_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM 'file:///%s' AS row
        MATCH (otasst:Ontology {primaryKey:row.anatomicalSubStructureTermId})
            WHERE NOT 'UBERONTerm' in LABELS(otasst)
              AND NOT 'FBCVTerm' in LABELS(otasst)
        MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})
        MERGE (e)-[eotasst:ANATOMICAL_SUBSTRUCTURE]->(otasst)
    """

    ao_qualifiers_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM 'file:///%s' AS row
        MATCH (otasst:Ontology {primaryKey:row.anatomicalStructureQualifierTermId})
            WHERE NOT 'UBERONTerm' in LABELS(otasst)
              AND NOT 'FBCVTerm' in LABELS(otasst)
        MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})
        MERGE (e)-[eotasst:ANATOMICAL_SUBSTRUCTURE]->(otasst)
    """

    ao_ss_qualifiers_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM 'file:///%s' AS row
        MATCH (otasst:Ontology {primaryKey:row.anatomicalSubStructureQualifierTermId})
            WHERE NOT 'UBERONTerm' in LABELS(otasst)
              AND NOT 'FBCVTerm' in LABELS(otasst)
        MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})
        MERGE (e)-[eotasst:ANATOMICAL_SUBSTRUCTURE]->(otasst)
    """

    ao_terms_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM 'file:///%s' AS row
        MATCH (otasst:Ontology {primaryKey:row.anatomicalStructureTermId})
            WHERE NOT 'FBCVTerm' in LABELS(otasst)
        MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})
        MERGE (e)-[eotasst:ANATOMICAL_SUBSTRUCTURE]->(otasst)
    """

    cc_term_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM 'file:///%s' AS row
        MATCH (otasst:Ontology {primaryKey:row.cellularComponentTermId})
            WHERE NOT 'FBCVTerm' in LABELS(otasst)
        MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})
        MERGE (e)-[eotasst:ANATOMICAL_SUBSTRUCTURE]->(otasst)
    """

    eas_substructure_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM 'file:///%s' AS row
        MATCH (otasst:Ontology {primaryKey:row.anatomicalSubStructureTermId})
            WHERE NOT 'FBCVTerm' in LABELS(otasst)
        MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})
        MERGE (e)-[eotasst:ANATOMICAL_SUB_SUBSTRUCTURE]->(otasst)
    """

    eas_qualified_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM 'file:///%s' AS row
        MATCH (otastq:Ontology {primaryKey:row.anatomicalStructureQualifierTermId})
            WHERE NOT 'FBCVTerm' in LABELS(otastq)
        MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})
        MERGE (e)-[eotastq:ANATOMICAL_STRUCTURE_QUALIFIER]-(otastq)
    """

    eass_qualified_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM 'file:///%s' AS row
        MATCH (otasstq:Ontology {primaryKey:row.anatomicalSubStructureQualifierTermId})
            WHERE NOT 'UBERONTerm' in LABELS(otasstq)
        MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})
        MERGE (e)-[eotasstq:ANATOMICAL_SUB_STRUCTURE_QUALIFIER]-(otasstq)
    """

    ccq_expression_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM 'file:///%s' AS row
        MATCH (otcctq:Ontology {primaryKey:row.cellularComponentQualifierTermId})
            WHERE NOT 'UBERONTerm' in LABELS(otcctq)
        MATCH (e:ExpressionBioEntity {primaryKey:row.ebe_uuid})
        MERGE (e)-[eotcctq:CELLULAR_COMPONENT_QUALIFIER]-(otcctq)
    """

    stage_expression_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM 'file:///%s' AS row
        MATCH (ei:BioEntityGeneExpressionJoin {primaryKey:row.ei_uuid})
        MERGE (s:Stage {primaryKey:row.stageName})
            ON CREATE SET s.name = row.stageName
        MERGE (ei)-[eotcctq:DURING]-(s)
    """

    uberon_ao_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM 'file:///%s' AS row
        MATCH (ebe:ExpressionBioEntity {primaryKey:row.ebe_uuid})
        MATCH (o:Ontology:UBERONTerm {primaryKey:row.aoUberonId})
        MERGE (ebe)-[ebeo:ANATOMICAL_RIBBON_TERM]-(o)
    """

    uberon_stage_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM 'file:///%s' AS row
        MATCH (ei:BioEntityGeneExpressionJoin {primaryKey:row.ei_uuid})
        MATCH (o:Ontology:UBERONTerm {primaryKey:row.uberonStageId})
        MERGE (ei)-[eio:STAGE_RIBBON_TERM]-(o)
    """

    uberon_ao_other_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM 'file:///%s' AS row
        MATCH (ebe:ExpressionBioEntity {primaryKey:row.ebe_uuid})
        MATCH (u:Ontology:UBERONTerm {primaryKey:'UBERON:AnatomyOtherLocation'})
        MERGE (ebe)-[ebeu:ANATOMICAL_RIBBON_TERM]-(u)
    """

    uberon_stage_other_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM 'file:///%s' AS row
        MATCH (ei:BioEntityGeneExpressionJoin {primaryKey:row.ei_uuid})
        MATCH (u:Ontology:UBERONTerm {primaryKey:'UBERON:PostEmbryonicPreAdult'})
        MERGE (ei)-[eiu:STAGE_RIBBON_TERM]-(u)
    """

    htp_dataset_sample_assemblies_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM 'file:///%s' AS row
        MATCH (ds:HTPDatasetSample {primaryKey:row.datasetSampleId})
        MATCH (u:Assembly {primaryKey:row.assembly})
        MERGE (ds)-[dsu:ASSEMBLY]-(u)
    """

    htpdatasetsample_xrefs_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM 'file:///%s' AS row
        MATCH (o:HTPDatasetSample {primaryKey:row.datasetId}) """ + ETLHelper.get_cypher_xref_text()

    def __init__(self, config):
        super().__init__()
        self.data_type_config = config

    def _load_and_process_data(self):
        thread_pool = []
        for sub_type in self.data_type_config.get_sub_type_objects():
            p = multiprocessing.Process(target=self._process_sub_type,
                                        args=(sub_type, ))
            p.start()
            thread_pool.append(p)
        ETL.wait_for_threads(thread_pool)

    def _process_sub_type(self, sub_type):
        logger.info("Loading HTP metadata sample data: %s", sub_type.get_data_provider())
        filepath = sub_type.get_filepath()
        logger.info(filepath)
        data = JSONFile().get_data(filepath)
        logger.info("Finished Loading HTP metadata sample data: %s", sub_type.get_data_provider())

        if data is None:
            logger.warning("No Data found for %s skipping", sub_type.get_data_provider())
            return

        # This order is the same as the lists yielded from the get_generators function.
        # A list of tuples.
        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # This needs to be in this format (template, param1, params2) others will be ignored
        query_list = [
            [HTPMetaDatasetSampleETL.htp_dataset_sample_query_template, commit_size,
             "htp_metadataset_sample_samples_" + sub_type.get_data_provider() + ".csv"],
            [HTPMetaDatasetSampleETL.htp_bio_entity_expression_query_template, commit_size,
             "htp_metadataset_sample_bioentities_" + sub_type.get_data_provider() + ".csv"],
            [HTPMetaDatasetSampleETL.htp_secondaryIds_query_template, commit_size,
             "htp_metadataset_sample_secondaryIds_" + sub_type.get_data_provider() + ".csv"],
            [HTPMetaDatasetSampleETL.htp_dataset_join_query_template, commit_size,
             "htp_metadataset_sample_datasets_" + sub_type.get_data_provider() + ".csv"],
            [HTPMetaDatasetSampleETL.htp_stages_query_template, commit_size,
             "htp_metadataset_sample_stages_" + sub_type.get_data_provider() + ".csv"],
            [HTPMetaDatasetSampleETL.ao_terms_query_template, commit_size,
             "htp_metadataset_sample_aoterms_" + sub_type.get_data_provider() + ".csv"],
            [HTPMetaDatasetSampleETL.ao_substructures_query_template, commit_size,
             "htp_metadataset_sample_ao_substructures_" + sub_type.get_data_provider() + ".csv"],
            [HTPMetaDatasetSampleETL.ao_qualifiers_query_template, commit_size,
             "htp_metadataset_sample_ao_qualifiers_" + sub_type.get_data_provider() + ".csv"],
            [HTPMetaDatasetSampleETL.ao_ss_qualifiers_query_template, commit_size,
             "htp_metadataset_sample_ao_ss_qualifiers_" + sub_type.get_data_provider() + ".csv"],
            [HTPMetaDatasetSampleETL.cc_term_query_template, commit_size,
             "htp_metadataset_sample_ccterms" + sub_type.get_data_provider() + ".csv"],
            [HTPMetaDatasetSampleETL.ccq_expression_query_template, commit_size,
             "htp_metadataset_sample_ccqterms_" + sub_type.get_data_provider() + ".csv"],
            [HTPMetaDatasetSampleETL.uberon_ao_query_template, commit_size,
             "htp_metadataset_sample_uberon_ao_" + sub_type.get_data_provider() + ".csv"],
            [HTPMetaDatasetSampleETL.uberon_ao_other_query_template, commit_size,
             "htp_metadataset_sample_uberon_ao_other_" + sub_type.get_data_provider() + ".csv"],
            [HTPMetaDatasetSampleETL.htp_dataset_sample_agm_query_template, commit_size,
             "htp_metadataset_sample_agms_" + sub_type.get_data_provider() + ".csv"],
            [HTPMetaDatasetSampleETL.htp_dataset_sample_agmtext_query_template, commit_size,
             "htp_metadataset_sample_agmstext_" + sub_type.get_data_provider() + ".csv"],
            [HTPMetaDatasetSampleETL.htp_dataset_sample_assemblies_query_template, commit_size,
             "htp_metadataset_sample_assemblies_" + sub_type.get_data_provider() + ".csv"]
        ]

        # Obtain the generator
        generators = self.get_generators(data, batch_size)

        query_and_file_list = self.process_query_params(query_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)

    def get_generators(self, htp_datasetsample_data, batch_size):
        htp_datasetsamples = []
        secondaryIds = []
        datasetIds = []
        assemblies = []
        uberon_ao_data = []
        ao_qualifiers = []
        bio_entities = []
        ao_ss_qualifiers = []
        ao_substructures = []
        ao_terms = []
        uberon_ao_other_data = []
        stages = []
        ccq_components = []
        cc_components = []
        biosamples = []
        biosamplesTexts = []
        counter = 0

        data_provider_object = htp_datasetsample_data['metaData']['dataProvider']
        data_provider_cross_ref = data_provider_object.get('crossReference')

        for datasample_record in htp_datasetsample_data['data']:
            counter = counter + 1
            biosampleId = ''
            biosampleText = ''
            sampleId = ''
            sampleTitle = ''

            if 'sampleId' in datasample_record:
                sampleIdObj = datasample_record.get('sampleId')
                sampleId = sampleIdObj.get('primaryId')
                if 'secondaryIds' in sampleIdObj:
                    for secId in sampleIdObj.get('secondaryIds'):
                        secid = {"datasetSampleId": sampleId, "secondaryId": secId}
                        secondaryIds.append(secid)

            if 'sampleTitle' in datasample_record:
                sampleTitle = datasample_record.get('sampleTitle')

            datasetSampleId = sampleId + sampleTitle

            if 'datasetIds' in datasample_record:
                datasetIdSet = datasample_record.get('datasetIds')
                for datasetID in datasetIdSet:
                    datasetsample = {"datasetSampleId": datasetSampleId,
                                     "datasetId": datasetID}
                    datasetIds.append(datasetsample)
                    if self.test_object.using_test_data() is True:
                        is_it_test_entry = self.test_object.check_for_test_id_entry(datasetID)
                        if is_it_test_entry is False:
                            counter = counter - 1
                            continue

            if 'genomicInformation' in datasample_record:
                genomicInformation = datasample_record.get('genomicInformation')
                if 'biosampleId' in genomicInformation:
                    biosampleId = genomicInformation.get('biosampleId')
                if 'bioSampleText' in genomicInformation:
                    biosampleText = genomicInformation.get('bioSampleText')
                if biosampleId is not None and biosampleId != '':
                    biosample = {"biosampleId": biosampleId,
                                 "datasetSampleId": datasetSampleId}
                    biosamples.append(biosample)
                if biosampleText is not None and biosampleText != '' and biosampleId == '':
                    biosample_text_row = {"biosampleText": biosampleText,
                                          "datasetSampleId": datasetSampleId}
                    biosamplesTexts.append(biosample_text_row)

            if 'assemblyVersions' in datasample_record:
                for assembly in datasample_record.get('assemblyVersions'):
                    datasetsample = {"datasetSampleId": datasetSampleId,
                                     "assembly": assembly}
                    assemblies.append(datasetsample)

            age = ''
            if 'sampleAge' in datasample_record:
                sampleAge = datasample_record.get('sampleAge')
                stageId = ""
                if 'age' in sampleAge:
                    age = sampleAge.get('age')
                    stageId = stageId + age
                if 'stage' in sampleAge:
                    stage = sampleAge.get('stage')
                    stageId = stageId + stage.get('stageName')
                    stage = {"stageId": stageId,
                             "stageTermId": stage.get('stageTermId'),
                             "stageName": stage.get('stageName'),
                             "stageUberonSlimTerm": stage.get('stageUberonSlimTerm'),
                             "sampleAge": age,
                             "datasetSampleId": datasetSampleId}
                    stages.append(stage)
                else:
                    stage = {"stageId": stageId, "sampleAge": age}
                    stages.append(stage)

            if 'sampleLocations' in datasample_record:
                sampleLocations = datasample_record.get('sampleLocations')
                for location in sampleLocations:
                    cellular_component_qualifier_term_id = location.get(
                        'cellularComponentQualifierTermId')
                    cellular_component_term_id = location.get('cellularComponentTermId')
                    anatomical_structure_term_id = location.get('anatomicalStructureTermId')
                    anatomical_structure_qualifier_term_id = location.get(
                        'anatomicalStructureQualifierTermId')
                    anatomical_sub_structure_term_id = location.get(
                        'anatomicalSubStructureTermId')
                    anatomical_sub_structure_qualifier_term_id = location.get(
                        'anatomicalSubStructureQualifierTermId')
                    where_expressed_statement = location.get('whereExpressedStatement')
                    if where_expressed_statement is None:
                        where_expressed_statement = ""

                    expression_unique_key = datasetSampleId
                    expression_entity_unique_key = ''

                    if anatomical_structure_term_id is not None:
                        expression_unique_key += anatomical_structure_term_id
                        expression_entity_unique_key = anatomical_structure_term_id

                    if anatomical_structure_qualifier_term_id is not None:
                        expression_unique_key += anatomical_structure_qualifier_term_id
                        expression_entity_unique_key += anatomical_structure_qualifier_term_id

                    if cellular_component_term_id is not None:
                        expression_unique_key += cellular_component_term_id
                        expression_entity_unique_key += cellular_component_term_id

                    if cellular_component_qualifier_term_id is not None:
                        expression_unique_key += cellular_component_qualifier_term_id
                        expression_entity_unique_key += cellular_component_qualifier_term_id

                    if anatomical_sub_structure_term_id is not None:
                        expression_unique_key += anatomical_sub_structure_term_id

                    if anatomical_sub_structure_qualifier_term_id is not None:
                        expression_unique_key += anatomical_sub_structure_qualifier_term_id
                        expression_entity_unique_key += anatomical_sub_structure_qualifier_term_id

                    expression_entity_unique_key += where_expressed_statement

                    if location.get('anatomicalStructureUberonSlimTermIds') is not None:
                        for uberon_structure_term_object in location.get(
                                'anatomicalStructureUberonSlimTermIds'):
                            structure_uberon_term_id = uberon_structure_term_object.get(
                                'uberonTerm')
                            if structure_uberon_term_id is not None \
                                    and structure_uberon_term_id != 'Other':
                                structure_uberon_term = {
                                    "ebe_uuid": expression_entity_unique_key,
                                    "aoUberonId": structure_uberon_term_id}
                                uberon_ao_data.append(structure_uberon_term)
                            elif structure_uberon_term_id is not None \
                                    and structure_uberon_term_id == 'Other':
                                other_structure_uberon_term = {
                                    "ebe_uuid": expression_entity_unique_key}
                                uberon_ao_other_data.append(other_structure_uberon_term)

                    if location.get('anatomicalSubStructureUberonSlimTermIds') is not None:
                        for uberon_sub_structure_term_object in location.get(
                                'anatomicalSubStructureUberonSlimTermIds'):
                            sub_structure_uberon_term_id = \
                                uberon_sub_structure_term_object.get('uberonTerm')
                            if sub_structure_uberon_term_id is not None \
                                    and sub_structure_uberon_term_id != 'Other':
                                sub_structure_uberon_term = {
                                    "ebe_uuid": expression_entity_unique_key,
                                    "aoUberonId": sub_structure_uberon_term_id}
                                uberon_ao_data.append(sub_structure_uberon_term)
                            elif sub_structure_uberon_term_id is not None \
                                    and sub_structure_uberon_term_id == 'Other':
                                other_structure_uberon_term = {
                                    "ebe_uuid": expression_entity_unique_key}
                                uberon_ao_other_data.append(other_structure_uberon_term)

                    if cellular_component_term_id is not None:
                        cc_term = {"ebe_uuid": expression_entity_unique_key,
                                   "cellularComponentTermId": cellular_component_term_id}
                        cc_components.append(cc_term)

                    if cellular_component_qualifier_term_id is not None:
                        ccq_term = {"ebe_uuid": expression_entity_unique_key,
                                    "cellularComponentQualifierTermId":
                                        cellular_component_qualifier_term_id}
                        ccq_components.append(ccq_term)

                    if anatomical_structure_term_id is not None:
                        ao_term = {"ebe_uuid": expression_entity_unique_key,
                                   "anatomicalStructureTermId": anatomical_structure_term_id}
                        ao_terms.append(ao_term)

                    if anatomical_structure_qualifier_term_id is not None:
                        ao_qualifier = {"ebe_uuid": expression_entity_unique_key,
                                        "anatomicalStructureQualifierTermId":
                                            anatomical_structure_qualifier_term_id}
                        ao_qualifiers.append(ao_qualifier)

                    if anatomical_sub_structure_term_id is not None:
                        ao_substructure = {"ebe_uuid": expression_entity_unique_key,
                                           "anatomicalSubStructureTermId":
                                               anatomical_sub_structure_term_id}
                        ao_substructures.append(ao_substructure)

                    if anatomical_sub_structure_qualifier_term_id is not None:
                        ao_ss_qualifier = {"ebe_uuid": expression_entity_unique_key,
                                           "anatomicalSubStructureQualifierTermId":
                                               anatomical_sub_structure_qualifier_term_id}
                        ao_ss_qualifiers.append(ao_ss_qualifier)

                    bio_entity = {"ebe_uuid": expression_entity_unique_key,
                                  "whereExpressedStatement": where_expressed_statement,
                                  "datasetSampleId": datasetSampleId}
                    bio_entities.append(bio_entity)

            htp_dataset_sample = {
                "datasetSampleId": datasetSampleId,
                "abundance": datasample_record.get('abundance'),
                "sampleType": datasample_record.get('sampleType'),
                "taxonId": datasample_record.get('taxonId'),
                "sex": datasample_record.get('sex'),
                "assayType": datasample_record.get('assayType'),
                "notes": datasample_record.get('notes'),
                "dateAssigned": datasample_record.get('dateAssigned'),
                "sequencingFormat": datasample_record.get('sequencingFormat'),
                "sampleTitle": sampleTitle,
                "sampleAge": age
            }
            htp_datasetsamples.append(htp_dataset_sample)

            # if self.test_object.using_test_data() is True:
            #     is_it_test_entry = self.test_object.check_for_test_id_entry(datasetID)
            #     if is_it_test_entry is True:
            #         self.logger.info(htp_dataset_sample)

            if counter == batch_size:
                yield [htp_datasetsamples, bio_entities, secondaryIds, datasetIds,
                       stages, ao_terms, ao_substructures, ao_qualifiers,
                       ao_ss_qualifiers, cc_components, ccq_components,
                       uberon_ao_data, uberon_ao_other_data, biosamples,
                       biosamplesTexts, assemblies]
                counter = 0
                htp_datasetsamples = []
                secondaryIds = []
                datasetIds = []
                uberon_ao_data = []
                ao_qualifiers = []
                bio_entities = []
                ao_ss_qualifiers = []
                ao_substructures = []
                ao_terms = []
                uberon_ao_other_data = []
                stages = []
                ccq_components = []
                cc_components = []
                biosamples = []
                biosamplesTexts = []
                assemblies = []

        if counter > 0:
            yield [htp_datasetsamples, bio_entities, secondaryIds, datasetIds,
                   stages, ao_terms, ao_substructures, ao_qualifiers,
                   ao_ss_qualifiers, cc_components, ccq_components,
                   uberon_ao_data, uberon_ao_other_data, biosamples,
                   biosamplesTexts, assemblies]
def get_generators(self, construct_data, data_provider, batch_size):
    """Create Generators."""
    data_providers = []
    release = ""
    constructs = []
    construct_synonyms = []
    construct_secondary_ids = []
    cross_reference_list = []
    component_details = []
    component_no_gene_details = []
    non_bgi_components = []
    counter = 0
    date_produced = construct_data['metaData']['dateProduced']

    data_provider_object = construct_data['metaData']['dataProvider']
    data_provider_cross_ref = data_provider_object.get('crossReference')
    data_provider = data_provider_cross_ref.get('id')
    self.logger.info("DataProvider: " + data_provider)
    data_provider_pages = data_provider_cross_ref.get('pages')
    data_provider_cross_ref_set = []
    load_key = date_produced + data_provider + "_construct"

    # TODO: get SGD to fix their files.
    if data_provider_pages is not None:
        for data_provider_page in data_provider_pages:
            cross_ref_complete_url = ETLHelper.get_page_complete_url(
                data_provider, self.xref_url_map, data_provider, data_provider_page)
            data_provider_cross_ref_set.append(
                ETLHelper.get_xref_dict(
                    data_provider, data_provider, data_provider_page,
                    data_provider_page, data_provider, cross_ref_complete_url,
                    data_provider + data_provider_page))
            data_providers.append(data_provider)
            self.logger.info("data provider: %s", data_provider)

    if 'release' in construct_data['metaData']:
        release = construct_data['metaData']['release']

    for construct_record in construct_data['data']:
        counter = counter + 1
        global_id = construct_record['primaryId']
        local_id = global_id.split(":")[1]
        mod_global_cross_ref_id = ""

        if self.test_object.using_test_data() is True:
            is_it_test_entry = self.test_object.check_for_test_id_entry(global_id)
            if is_it_test_entry is False:
                counter = counter - 1
                continue

        name_text = TextProcessingHelper.cleanhtml(construct_record.get('name'))

        construct_dataset = {
            "symbol": construct_record.get('name'),
            "primaryId": construct_record.get('primaryId'),
            "globalId": global_id,
            "localId": local_id,
            "dataProviders": data_providers,
            "dateProduced": date_produced,
            "loadKey": load_key,
            "release": release,
            "modGlobalCrossRefId": mod_global_cross_ref_id,
            "uuid": str(uuid.uuid4()),
            "dataProvider": data_provider,
            "nameText": name_text,
            "name": construct_record.get('name')
        }
        constructs.append(construct_dataset)

        if 'crossReferences' in construct_record:
            for cross_ref in construct_record.get('crossReferences'):
                cross_ref_id = cross_ref.get('id')
                local_crossref_id = cross_ref_id.split(":")[1]
                prefix = cross_ref_id.split(":")[0]
                pages = cross_ref.get('pages')

                # some pages collections have 0 elements
                if pages is not None and len(pages) > 0:
                    for page in pages:
                        if page == 'construct':
                            mod_global_cross_ref_id = ETLHelper.get_page_complete_url(
                                local_crossref_id, self.xref_url_map, prefix, page)
                            xref = ETLHelper.get_xref_dict(
                                local_crossref_id, prefix, page, page,
                                cross_ref_id, mod_global_cross_ref_id,
                                cross_ref_id + page)
                            xref['dataId'] = global_id
                            cross_reference_list.append(xref)

        if 'constructComponents' in construct_record:
            for component in construct_record.get('constructComponents'):
                component_relation = component.get('componentRelation').upper()
                component_symbol = component.get('componentSymbol')
                component_id = component.get('componentID')

                if component_id is not None:
                    component_detail = {
                        "componentRelation": component_relation,
                        "componentSymbol": component_symbol,
                        "componentID": component_id,
                        "constructID": construct_record.get('primaryId')
                    }
                    component_details.append(component_detail)
                else:
                    component_detail = {
                        "componentRelation": component_relation,
                        "componentSymbol": component_symbol,
                        "constructID": construct_record.get('primaryId')
                    }
                    non_bgi_component = {"componentSymbol": component_symbol}
                    non_bgi_components.append(non_bgi_component)
                    component_no_gene_details.append(component_detail)

        if 'synonyms' in construct_record:
            for syn in construct_record.get('synonyms'):
                construct_synonym = {
                    "data_id": construct_record.get('primaryId'),
                    "synonym": syn.strip()
                }
                construct_synonyms.append(construct_synonym)

        if 'secondaryIds' in construct_record:
            for secondary_id in construct_record.get('secondaryIds'):
                construct_secondary_id = {
                    "data_id": construct_record.get('primaryId'),
                    "secondary_id": secondary_id
                }
                construct_secondary_ids.append(construct_secondary_id)

        if counter == batch_size:
            yield [constructs, construct_secondary_ids, construct_synonyms,
                   cross_reference_list, non_bgi_components, component_details,
                   component_no_gene_details]
            constructs = []
            construct_secondary_ids = []
            construct_synonyms = []
            cross_reference_list = []
            non_bgi_components = []
            component_details = []
            component_no_gene_details = []
            counter = 0

    if counter > 0:
        yield [constructs, construct_secondary_ids, construct_synonyms,
               cross_reference_list, non_bgi_components, component_details,
               component_no_gene_details]
class BGIETL(ETL):
    """BGI ETL."""

    logger = logging.getLogger(__name__)

    # Query templates which take params and will be processed later

    so_terms_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM 'file:///%s' AS row
        MATCH (o:Gene {primaryKey:row.primaryKey})
        MATCH (s:SOTerm {primaryKey:row.soTermId})
        MERGE (o)-[:ANNOTATED_TO]->(s)"""

    chromosomes_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM 'file:///%s' AS row
        MERGE (chrm:Chromosome {primaryKey: row.primaryKey}) """

    genomic_locations_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM 'file:///%s' AS row
        MATCH (o:Gene {primaryKey:row.primaryId})
        MATCH (chrm:Chromosome {primaryKey:row.chromosome})
        MERGE (o)-[ochrm:LOCATED_ON]->(chrm)
        MERGE (a:Assembly {primaryKey:row.assembly})
            ON CREATE SET a.dataProvider = row.dataProvider
        MERGE (gchrm:GenomicLocation {primaryKey:row.uuid})
            ON CREATE SET gchrm.start = apoc.number.parseInt(row.start),
                gchrm.end = apoc.number.parseInt(row.end),
                gchrm.assembly = row.assembly,
                gchrm.strand = row.strand,
                gchrm.chromosome = row.chromosome
        MERGE (o)-[of:ASSOCIATION]-(gchrm)
        MERGE (gchrm)-[ofc:ASSOCIATION]-(chrm)
        MERGE (gchrm)-[ao:ASSOCIATION]->(a) """

    genomic_locations_bins_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM 'file:///%s' AS row
        MATCH (o:Gene {primaryKey:row.genePrimaryId})
        MATCH (chrm:Chromosome {primaryKey:row.chromosome})
        MERGE (bin:GenomicLocationBin {primaryKey:row.binPrimaryKey})
            ON CREATE SET bin.number = toInt(row.number),
                bin.assembly = row.assembly
        MERGE (o)-[:LOCATED_IN]->(bin)
        MERGE (bin)-[:LOCATED_ON]->(chrm) """

    gene_secondary_ids_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM 'file:///%s' AS row
        MATCH (g:Gene {primaryKey:row.primary_id})
        MERGE (second:SecondaryId:Identifier {primaryKey:row.secondary_id})
            ON CREATE SET second.name = row.secondary_id
        MERGE (g)-[aka1:ALSO_KNOWN_AS]->(second) """

    gene_synonyms_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM 'file:///%s' AS row
        MATCH (g:Gene {primaryKey:row.primary_id})
        MERGE (syn:Synonym:Identifier {primaryKey:row.synonym})
            SET syn.name = row.synonym
        MERGE (g)-[aka2:ALSO_KNOWN_AS]->(syn) """

    gene_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM 'file:///%s' AS row
        MATCH (l:Load {primaryKey:row.loadKey})
        //Create the Gene node and set properties. primaryKey is required.
        MERGE (o:Gene {primaryKey:row.primaryId})
            ON CREATE SET o.symbol = row.symbol,
                o.taxonId = row.taxonId,
                o.name = row.name,
                o.description = row.description,
                o.geneSynopsisUrl = row.geneSynopsisUrl,
                o.geneSynopsis = row.geneSynopsis,
                o.geneLiteratureUrl = row.geneLiteratureUrl,
                o.geneticEntityExternalUrl = row.geneticEntityExternalUrl,
                o.dateProduced = row.dateProduced,
                o.modGlobalCrossRefId = row.modGlobalCrossRefId,
                o.modCrossRefCompleteUrl = row.modCrossRefCompleteUrl,
                o.modLocalId = row.localId,
                o.modGlobalId = row.modGlobalId,
                o.uuid = row.uuid,
                o.dataProvider = row.dataProvider,
                o.symbolWithSpecies = row.symbolWithSpecies """

    basic_gene_load_relations_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM 'file:///%s' AS row
        MATCH (l:Load {primaryKey:row.loadKey})
        MATCH (g:Gene {primaryKey:row.primaryId})
        MERGE (g)-[:LOADED_FROM]->(l) """

    basic_gene_species_relations_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM 'file:///%s' AS row
        MATCH (spec:Species {primaryKey: row.taxonId})
        MATCH (g:Gene {primaryKey: row.primaryId})
        MERGE (g)-[:FROM_SPECIES]->(spec) """

    xrefs_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM 'file:///%s' AS row
        MATCH (o:Gene {primaryKey:row.dataId}) """ + ETLHelper.get_cypher_xref_tuned_text()

    xrefs_relationships_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM 'file:///%s' AS row
        MATCH (o:Gene {primaryKey:row.dataId})
        MATCH (c:CrossReference {primaryKey:row.primaryKey})
        MERGE (o)-[oc:CROSS_REFERENCE]-(c) """ + ETLHelper.merge_crossref_relationships()

    gene_metadata_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM 'file:///%s' AS row
        //Create the load node(s)
        CREATE (l:Load:Entity {primaryKey:row.loadKey})
            SET l.dateProduced = row.dateProduced,
                l.loadName = "BGI",
                l.release = row.release,
                l.dataProviders = row.dataProviders,
                l.dataProvider = row.dataProvider """

    def __init__(self, config):
        """Initialise object."""
        self.metadata_is_loaded = {}  # Dictionary for optimizing metadata loading.
        super().__init__()
        self.data_type_config = config

    def _load_and_process_data(self):
        thread_pool = []
        query_tracking_list = multiprocessing.Manager().list()
        for sub_type in self.data_type_config.get_sub_type_objects():
            process = multiprocessing.Process(target=self._process_sub_type,
                                              args=(sub_type, query_tracking_list))
            process.start()
            thread_pool.append(process)
        ETL.wait_for_threads(thread_pool)

        queries = []
        for item in query_tracking_list:
            queries.append(item)
        Neo4jTransactor.execute_query_batch(queries)

    def _process_sub_type(self, sub_type, query_tracking_list):
        self.logger.info("Loading BGI Data: %s", sub_type.get_data_provider())
        filepath = sub_type.get_filepath()
        if filepath is None:
            self.logger.error("Can't find input file for %s", sub_type)
            sys.exit()

        data = JSONFile().get_data(filepath)

        # This order is the same as the lists yielded from the get_generators function.
        # A list of tuples.
        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # gene_metadata, gene_dataset, secondary_ids, genomic_locations,
        # cross_references, synonyms
        # This needs to be in this format (template, param1, params2) others will be ignored
        query_template_list = [
            [self.gene_metadata_query_template, commit_size,
             "gene_metadata_" + sub_type.get_data_provider() + ".csv"],
            [self.gene_query_template, commit_size,
             "gene_data_" + sub_type.get_data_provider() + ".csv"],
            [self.basic_gene_load_relations_query_template, commit_size,
             "gene_data_load_" + sub_type.get_data_provider() + ".csv"],
            [self.basic_gene_species_relations_query_template, commit_size,
             "gene_data_species_" + sub_type.get_data_provider() + ".csv"],
            [self.so_terms_query_template, commit_size,
             "gene_so_terms_" + sub_type.get_data_provider() + ".csv"],
            [self.chromosomes_query_template, commit_size,
             "gene_chromosomes_" + sub_type.get_data_provider() + ".csv"],
            [self.gene_secondary_ids_query_template, commit_size,
             "gene_secondary_ids_" + sub_type.get_data_provider() + ".csv"],
            [self.genomic_locations_query_template, commit_size,
             "gene_genomic_locations_" + sub_type.get_data_provider() + ".csv"],
            [self.xrefs_query_template, commit_size,
             "gene_cross_references_" + sub_type.get_data_provider() + ".csv"],
            [self.xrefs_relationships_query_template, commit_size,
             "gene_cross_references_relationships_" + sub_type.get_data_provider() + ".csv"],
            [self.gene_synonyms_query_template, 600000,
             "gene_synonyms_" + sub_type.get_data_provider() + ".csv"]
        ]

        # Obtain the generator
        generators = self.get_generators(data, sub_type.get_data_provider(), batch_size)

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)

        for item in query_and_file_list:
            query_tracking_list.append(item)

        self.error_messages("BGI-{}: ".format(sub_type.get_data_provider()))
        self.logger.info("Finished Loading BGI Data: %s", sub_type.get_data_provider())

    def secondary_process(self, secondarys, data_record):
        """Get secondary ids.

        secondarys: list of dataset items.
        data_record: record to process.
        """
        if data_record.get('secondaryIds') is None:
            return
        for sid in data_record.get('secondaryIds'):
            secondary_id_dataset = {
                "primary_id": data_record.get('primaryId'),
                "secondary_id": sid
            }
            secondarys.append(secondary_id_dataset)

    def synonyms_process(self, synonyms, data_record):
        """Get synonyms."""
        if data_record.get('synonyms') is None:
            return
        for syn in data_record.get('synonyms'):
            syn_dataset = {
                "primary_id": data_record.get('primaryId'),
                "synonym": syn.strip()
            }
            synonyms.append(syn_dataset)

    def xref_process(self, basic_genetic_entity, cross_references, urls):  # noqa
        """Process xrefs."""
        primary_id = basic_genetic_entity.get('primaryId')
        global_id = basic_genetic_entity.get('primaryId')
        local_id = global_id.split(":")[1]
        taxon_id = basic_genetic_entity.get("taxonId")
        if 'crossReferences' not in basic_genetic_entity:
            return
        for cross_ref in basic_genetic_entity.get('crossReferences'):
            if ':' not in cross_ref.get('id'):
                continue
            cross_ref_id = cross_ref.get('id')
            local_cross_ref_id = cross_ref_id.split(":")[1]
            prefix = cross_ref_id.split(":")[0]
            pages = cross_ref.get('pages')
            global_xref_id = cross_ref.get('id')
            display_name = global_xref_id

            # some pages collections have 0 elements
            if pages is not None and len(pages) > 0:
                for page in pages:
                    display_name = ""
                    cross_ref_complete_url = self.etlh.rdh2.return_url_from_key_value(
                        prefix, local_cross_ref_id, page)

                    if page == 'gene/expression_images':
                        cross_ref_complete_url = self.etlh.rdh2.return_url_from_key_value(
                            prefix, local_cross_ref_id, page)
                    elif page == 'gene':
                        urls['mod_cross_reference_complete_url'] = \
                            self.etlh.rdh2.return_url_from_key_value(
                                prefix, local_cross_ref_id, page)
                        urls['genetic_entity_external_url'] = \
                            self.etlh.rdh2.return_url_from_key_value(
                                prefix, local_cross_ref_id, page)

                    if page == 'gene/references':
                        urls['gene_literature_url'] = \
                            self.etlh.rdh2.return_url_from_key_value(
                                prefix, local_cross_ref_id, page)

                    if page == 'gene/spell':
                        display_name = 'Serial Patterns of Expression Levels Locator (SPELL)'

                    # TODO: fix generic_cross_reference in SGD, RGD
                    if page == 'generic_cross_reference':
                        cross_ref_complete_url = self.etlh.get_no_page_complete_url(
                            local_cross_ref_id, prefix, primary_id)

                    # TODO: fix gene/disease xrefs for SGD once
                    # resourceDescriptor change in develop
                    # makes its way to the release branch.
                    if page == 'gene/disease' and taxon_id == 'NCBITaxon:559292':
                        cross_ref_complete_url = self.etlh.rdh2.return_url_from_key_value(
                            'SGD', local_id, page)

                    xref_map = ETLHelper.get_xref_dict(
                        local_cross_ref_id, prefix, page, page, display_name,
                        cross_ref_complete_url, global_xref_id + page)
                    xref_map['dataId'] = primary_id
                    cross_references.append(xref_map)
            else:
                if prefix == 'PANTHER':
                    cross_ref_primary_id = cross_ref.get('id') + '_' + primary_id
                    cross_ref_complete_url = self.etlh.get_no_page_complete_url(
                        local_cross_ref_id, prefix, primary_id)
                    page = "gene/panther"
                elif prefix == 'RGD':
                    cross_ref_primary_id = cross_ref.get('id')
                    cross_ref_complete_url = self.etlh.rdh2.return_url_from_key_value(
                        'RGD', local_cross_ref_id)
                    page = "generic_cross_reference"
                else:
                    cross_ref_primary_id = cross_ref.get('id')
                    cross_ref_complete_url = self.etlh.get_no_page_complete_url(
                        local_cross_ref_id, prefix, primary_id)
                    page = "generic_cross_reference"

                xref_map = ETLHelper.get_xref_dict(
                    local_cross_ref_id, prefix, page, page, display_name,
                    cross_ref_complete_url, cross_ref_primary_id + page)
                xref_map['dataId'] = primary_id
                cross_references.append(xref_map)

    def locations_process(self, basic_genetic_entity, chromosomes, genomic_locations):
        """Get chromosome and genomic location info."""
        primary_id = basic_genetic_entity.get('primaryId')
        if 'genomeLocations' not in basic_genetic_entity:
            return
        for genome_location in basic_genetic_entity.get('genomeLocations'):
            chromosome = genome_location.get('chromosome')
            if chromosome is not None:
                if chromosome.startswith("chr"):
                    chromosome = chromosome[3:]
                if chromosome not in chromosomes:
                    chromosomes[chromosome] = {"primaryKey": chromosome}

            if 'startPosition' in genome_location:
                start = genome_location['startPosition']
            else:
                start = None

            if 'endPosition' in genome_location:
                end = genome_location['endPosition']
            else:
                end = None

            assembly = genome_location.get('assembly')

            if 'strand' in genome_location:
                strand = genome_location['strand']
            else:
                strand = None

            genomic_locations.append({
                "primaryId": primary_id,
                "chromosome": chromosome,
                "start": start,
                "end": end,
                "strand": strand,
                "assembly": assembly,
                "uuid": str(uuid.uuid4()),
                "dataProvider": self.data_provider
            })

    def get_generators(self, gene_data, data_provider, batch_size):
        """Create Generators."""
        date_produced = gene_data['metaData']['dateProduced']
        synonyms = []
        secondary_ids = []
        cross_references = []
        genomic_locations = []
        gene_dataset = []
        gene_metadata = []
        gene_to_so_terms = []
        chromosomes = {}
        release = None
        counter = 0

        self.data_providers_process(gene_data)
        load_key = date_produced + data_provider + "_BGI"

        # If we're not tracking the metadata, create the entry in our tracker.
        if load_key not in self.metadata_is_loaded:
            self.metadata_is_loaded[load_key] = False

        if 'release' in gene_data['metaData']:
            release = gene_data['metaData']['release']

        if self.metadata_is_loaded[load_key] is False:
            gene_metadata = []
            metadata_dict = {
                'loadKey': load_key,
                'loadName': 'BGI',
                'release': release,
                'dataProviders': None,
                'dataProvider': data_provider
            }
            gene_metadata.append(metadata_dict)

        for gene_record in gene_data['data']:
            counter = counter + 1
            urls = {
                'gene_literature_url': "",
                'genetic_entity_external_url': "",
                'mod_cross_reference_complete_url': ""
            }
            basic_genetic_entity = gene_record['basicGeneticEntity']
            primary_id = basic_genetic_entity.get('primaryId')
            global_id = basic_genetic_entity.get('primaryId')
            local_id = global_id.split(":")[1]
            taxon_id = basic_genetic_entity.get("taxonId")
            short_species_abbreviation = self.etlh.get_short_species_abbreviation(taxon_id)

            if basic_genetic_entity.get('taxonId') in ["NCBITaxon:9606", "NCBITaxon:10090"]:
                local_id = basic_genetic_entity.get('primaryId')

            if self.test_object.using_test_data() is True:
                is_it_test_entry = self.test_object.check_for_test_id_entry(primary_id)
                if is_it_test_entry is False:
                    counter = counter - 1
                    continue

            self.xref_process(basic_genetic_entity, cross_references, urls)

            # TODO Metadata can be safely removed from this dictionary. Needs to be tested.
            gene_to_so_terms.append({
                "primaryKey": primary_id,
                "soTermId": gene_record['soTermId']
            })

            gene = {
                "symbol": gene_record.get('symbol'),
                # globallyUniqueSymbolWithSpecies requested by search group
                "symbolWithSpecies": gene_record.get('symbol')
                                     + " (" + short_species_abbreviation + ")",
                "name": gene_record.get('name'),
                "geneticEntityExternalUrl": urls['genetic_entity_external_url'],
                "description": gene_record.get('description'),
                "geneSynopsis": gene_record.get('geneSynopsis'),
                "geneSynopsisUrl": gene_record.get('geneSynopsisUrl'),
                "taxonId": basic_genetic_entity.get('taxonId'),
                "geneLiteratureUrl": urls['gene_literature_url'],
                "name_key": gene_record.get('symbol'),
                "primaryId": primary_id,
                "category": "gene",
                "href": None,
                "uuid": str(uuid.uuid4()),
                "modCrossRefCompleteUrl": urls['mod_cross_reference_complete_url'],
                "localId": local_id,
                "modGlobalCrossRefId": global_id,
                "modGlobalId": global_id,
                "loadKey": load_key,
                "dataProvider": data_provider,
                "dateProduced": date_produced
            }
            gene_dataset.append(gene)

            self.locations_process(basic_genetic_entity, chromosomes, genomic_locations)
            self.synonyms_process(synonyms, basic_genetic_entity)
            self.secondary_process(secondary_ids, basic_genetic_entity)

            # We should have the metadata ready to go after the first loop of the generator.
            self.metadata_is_loaded[load_key] = True

            # Establishes the number of genes to yield (return) at a time.
            if counter == batch_size:
                # only sending unique chromosomes, hence empty list here.
                counter = 0
                yield [gene_metadata, gene_dataset, gene_dataset, gene_dataset,
                       gene_to_so_terms, [], secondary_ids, genomic_locations,
                       cross_references, cross_references, synonyms]
                gene_metadata = []
                gene_dataset = []
                synonyms = []
                secondary_ids = []
                genomic_locations = []
                cross_references = []
                gene_to_so_terms = []
                # xref_relations = []

        if counter > 0:
            yield [gene_metadata, gene_dataset, gene_dataset, gene_dataset,
                   gene_to_so_terms, chromosomes.values(), secondary_ids,
                   genomic_locations, cross_references, cross_references, synonyms]
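# Chromosome rows above are deduplicated across the whole file by keying a
# dict on primaryKey, and are emitted only once, in the final yield (hence
# the empty list in the intermediate batches). The same idea in isolation
# (names are illustrative):
def unique_rows(keys):
    """Collect one {'primaryKey': ...} row per distinct key."""
    seen = {}
    for key in keys:
        if key not in seen:
            seen[key] = {"primaryKey": key}
    return list(seen.values())

# unique_rows(["1", "2", "1"]) -> [{'primaryKey': '1'}, {'primaryKey': '2'}]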
    def get_generators(self, sqtr_data, data_provider, batch_size):
        """Get Generators."""
        data_providers = []
        sqtrs = []
        sqtr_synonyms = []
        sqtr_secondary_ids = []
        mod_global_cross_ref_url = ""
        tgs = []
        counter = 0
        date_produced = sqtr_data['metaData']['dateProduced']

        data_provider_object = sqtr_data['metaData']['dataProvider']
        data_provider_cross_ref = data_provider_object.get('crossReference')
        data_provider = data_provider_cross_ref.get('id')
        data_provider_pages = data_provider_cross_ref.get('pages')
        data_provider_cross_ref_set = []

        load_key = date_produced + data_provider + "_SqTR"

        if data_provider_pages is not None:
            for data_provider_page in data_provider_pages:
                cross_ref_complete_url = ETLHelper.get_page_complete_url(
                    data_provider, self.xref_url_map, data_provider,
                    data_provider_page)
                data_provider_cross_ref_set.append(ETLHelper.get_xref_dict(
                    data_provider, data_provider, data_provider_page,
                    data_provider_page, data_provider,
                    cross_ref_complete_url,
                    data_provider + data_provider_page))
                data_providers.append(data_provider)
                self.logger.info("data provider: %s", data_provider)

        for sqtr_record in sqtr_data['data']:
            counter = counter + 1
            global_id = sqtr_record['primaryId']
            local_id = global_id.split(":")[1]

            if self.test_object.using_test_data() is True:
                is_it_test_entry = self.test_object.check_for_test_id_entry(
                    global_id)
                if is_it_test_entry is False:
                    counter = counter - 1
                    continue

            if sqtr_record.get('secondaryIds') is not None:
                for sid in sqtr_record.get('secondaryIds'):
                    sqtr_secondary_id_dataset = {
                        "primaryId": sqtr_record.get('primaryId'),
                        "secondaryId": sid
                    }
                    sqtr_secondary_ids.append(sqtr_secondary_id_dataset)

            if sqtr_record.get('synonyms') is not None:
                for syn in sqtr_record.get('synonyms'):
                    syn_dataset = {
                        "primaryId": sqtr_record.get('primaryId'),
                        "synonym": syn
                    }
                    sqtr_synonyms.append(syn_dataset)

            if sqtr_record.get('targetGeneIds') is not None:
                for target_gene_id in sqtr_record.get('targetGeneIds'):
                    tg_dataset = {
                        "primaryId": sqtr_record.get('primaryId'),
                        "geneId": target_gene_id
                    }
                    tgs.append(tg_dataset)

            if 'crossReferences' in sqtr_record:
                for cross_ref in sqtr_record['crossReferences']:
                    cross_ref_id = cross_ref.get('id')
                    local_crossref_id = cross_ref_id.split(":")[1]
                    prefix = cross_ref.get('id').split(":")[0]
                    pages = cross_ref.get('pages')

                    # some pages collection have 0 elements
                    if pages is None or len(pages) == 0:
                        continue
                    if 'sequence_targeting_reagent' in pages:
                        page = 'sequence_targeting_reagent'
                        mod_global_cross_ref_url = ETLHelper.get_page_complete_url(
                            local_crossref_id, self.xref_url_map, prefix,
                            page)

            sqtr_dataset = {
                "primaryId": sqtr_record.get('primaryId'),
                "name": sqtr_record.get('name'),
                "globalId": global_id,
                "localId": local_id,
                "soTerm": sqtr_record.get('soTermId'),
                "taxonId": sqtr_record.get('taxonId'),
                "dataProviders": data_providers,
                "dateProduced": date_produced,
                "loadKey": load_key,
                "modGlobalCrossRefUrl": mod_global_cross_ref_url,
                "dataProvider": data_provider
            }
            sqtrs.append(sqtr_dataset)

            if counter == batch_size:
                yield [sqtrs, sqtr_secondary_ids, sqtr_synonyms, tgs]
                sqtrs = []
                sqtr_secondary_ids = []
                sqtr_synonyms = []
                tgs = []
                counter = 0

        if counter > 0:
            yield [sqtrs, sqtr_secondary_ids, sqtr_synonyms, tgs]
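# NOTE (illustrative): the lists yielded by get_generators above are paired
# positionally with query_template_list in _process_sub_type -- the n-th
# yielded list is written to the n-th CSV and consumed by the n-th Cypher
# template. A cheap sanity check for that invariant (check_batch is a
# hypothetical helper, not part of the loader):
def check_batch(batch, query_template_list):
    assert len(batch) == len(query_template_list), (
        "generator yielded %d lists for %d query templates"
        % (len(batch), len(query_template_list)))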
class ConstructETL(ETL): """Construct ETL""" logger = logging.getLogger(__name__) xref_url_map = ResourceDescriptorHelper().get_data() # Query templates which take params and will be processed later construct_query_template = """ USING PERIODIC COMMIT %s LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row //Create the Construct node and set properties. primaryKey is required. MERGE (o:Construct {primaryKey:row.primaryId}) ON CREATE SET o.name = row.name, o.dateProduced = row.dateProduced, o.release = row.release, o.localId = row.localId, o.globalId = row.globalId, o.uuid = row.uuid, o.nameText = row.nameText, o.modCrossRefCompleteUrl = row.modGlobalCrossRefId, o.dataProviders = row.dataProviders, o.dataProvider = row.dataProvider, o.symbol = row.symbol """ construct_secondary_ids_query_template = """ USING PERIODIC COMMIT %s LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row MATCH (f:Construct {primaryKey:row.data_id}) MERGE (second:SecondaryId {primaryKey:row.secondary_id}) SET second.name = row.secondary_id MERGE (f)-[aka1:ALSO_KNOWN_AS]->(second) """ construct_synonyms_query_template = """ USING PERIODIC COMMIT %s LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row MATCH (a:Construct {primaryKey:row.data_id}) MERGE(syn:Synonym {primaryKey:row.synonym}) SET syn.name = row.synonym MERGE (a)-[aka2:ALSO_KNOWN_AS]->(syn) """ construct_xrefs_query_template = """ USING PERIODIC COMMIT %s LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row MATCH (o:Construct {primaryKey:row.dataId}) """ + ETLHelper.get_cypher_xref_text( ) construct_gene_component_query_template = """ USING PERIODIC COMMIT %s LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row MATCH (o:Construct {primaryKey:row.constructID}), (g:Gene {primaryKey:row.componentID}) CALL apoc.create.relationship(g, row.componentRelation, {}, o) yield rel REMOVE rel.noOp""" construct_no_gene_component_query_template = """ USING PERIODIC COMMIT %s LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row MATCH (o:Construct {primaryKey:row.constructID}), (g:NonBGIConstructComponent {primaryKey:row.componentSymbol}) CALL apoc.create.relationship(g, row.componentRelation, {}, o) yield rel REMOVE rel.noOp""" non_bgi_component_query_template = """ USING PERIODIC COMMIT %s LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row MERGE (o:NonBGIConstructComponent {primaryKey:row.componentSymbol})""" def __init__(self, config): super().__init__() self.data_type_config = config def _load_and_process_data(self): thread_pool = [] for sub_type in self.data_type_config.get_sub_type_objects(): process = multiprocessing.Process(target=self._process_sub_type, args=(sub_type, )) process.start() thread_pool.append(process) ETL.wait_for_threads(thread_pool) def _process_sub_type(self, sub_type): self.logger.info("Loading Construct Data: %s", sub_type.get_data_provider()) filepath = sub_type.get_filepath() data = JSONFile().get_data(filepath) self.logger.info("Finished Loading Construct Data: %s", sub_type.get_data_provider()) if data is None: self.logger.warning("No Data found for %s skipping", sub_type.get_data_provider()) return # This order is the same as the lists yielded from the get_generators function. # A list of tuples. 
        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        # This needs to be in this format (template, param1, param2); others will be ignored
        query_template_list = [
            [
                ConstructETL.construct_query_template, commit_size,
                "Construct_data_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                ConstructETL.construct_secondary_ids_query_template,
                commit_size,
                "Construct_secondary_ids_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                ConstructETL.construct_synonyms_query_template, commit_size,
                "Construct_synonyms_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                ConstructETL.construct_xrefs_query_template, commit_size,
                "Construct_xrefs_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                ConstructETL.non_bgi_component_query_template, commit_size,
                "Construct_non_bgi_component_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                ConstructETL.construct_gene_component_query_template,
                commit_size,
                "Construct_components_gene_" + sub_type.get_data_provider() + ".csv"
            ],
            [
                ConstructETL.construct_no_gene_component_query_template,
                commit_size,
                "Construct_components_no_gene_" + sub_type.get_data_provider() + ".csv"
            ]
        ]

        # Obtain the generator
        generators = self.get_generators(data, sub_type.get_data_provider(),
                                         batch_size)

        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)

    def get_generators(self, construct_data, data_provider, batch_size):
        """Create Generators."""
        data_providers = []
        release = ""
        constructs = []
        construct_synonyms = []
        construct_secondary_ids = []
        cross_reference_list = []
        component_details = []
        component_no_gene_details = []
        non_bgi_components = []
        counter = 0
        date_produced = construct_data['metaData']['dateProduced']

        data_provider_object = construct_data['metaData']['dataProvider']
        data_provider_cross_ref = data_provider_object.get('crossReference')
        data_provider = data_provider_cross_ref.get('id')
        self.logger.info("DataProvider: %s", data_provider)
        data_provider_pages = data_provider_cross_ref.get('pages')
        data_provider_cross_ref_set = []
        load_key = date_produced + data_provider + "_construct"

        # TODO: get SGD to fix their files.
if data_provider_pages is not None: for data_provider_page in data_provider_pages: cross_ref_complete_url = ETLHelper.get_page_complete_url( data_provider, self.xref_url_map, data_provider, data_provider_page) data_provider_cross_ref_set.append( ETLHelper.get_xref_dict( data_provider, data_provider, data_provider_page, data_provider_page, data_provider, cross_ref_complete_url, data_provider + data_provider_page)) data_providers.append(data_provider) self.logger.info("data provider: %s", data_provider) if 'release' in construct_data['metaData']: release = construct_data['metaData']['release'] for construct_record in construct_data['data']: counter = counter + 1 global_id = construct_record['primaryId'] local_id = global_id.split(":")[1] mod_global_cross_ref_id = "" if self.test_object.using_test_data() is True: is_it_test_entry = self.test_object.check_for_test_id_entry( global_id) if is_it_test_entry is False: counter = counter - 1 continue name_text = TextProcessingHelper.cleanhtml( construct_record.get('name')) construct_dataset = { "symbol": construct_record.get('name'), "primaryId": construct_record.get('primaryId'), "globalId": global_id, "localId": local_id, "dataProviders": data_providers, "dateProduced": date_produced, "loadKey": load_key, "release": release, "modGlobalCrossRefId": mod_global_cross_ref_id, "uuid": str(uuid.uuid4()), "dataProvider": data_provider, "nameText": name_text, "name": construct_record.get('name') } constructs.append(construct_dataset) if 'crossReferences' in construct_record: for cross_ref in construct_record.get('crossReferences'): cross_ref_id = cross_ref.get('id') local_crossref_id = cross_ref_id.split(":")[1] prefix = cross_ref.get('id').split(":")[0] pages = cross_ref.get('pages') # some pages collection have 0 elements if pages is not None and len(pages) > 0: for page in pages: if page == 'construct': mod_global_cross_ref_id = ETLHelper.get_page_complete_url( local_crossref_id, self.xref_url_map, prefix, page) xref = ETLHelper.get_xref_dict( local_crossref_id, prefix, page, page, cross_ref_id, mod_global_cross_ref_id, cross_ref_id + page) xref['dataId'] = global_id cross_reference_list.append(xref) if 'constructComponents' in construct_record: for component in construct_record.get('constructComponents'): component_relation = component.get( 'componentRelation').upper() component_symbol = component.get('componentSymbol') component_id = component.get('componentID') if component_id is not None: component_detail = { "componentRelation": component_relation.upper(), "componentSymbol": component_symbol, "componentID": component_id, "constructID": construct_record.get('primaryId') } component_details.append(component_detail) else: component_detail = { "componentRelation": component_relation.upper(), "componentSymbol": component_symbol, "constructID": construct_record.get('primaryId') } non_bgi_component = { "componentSymbol": component_symbol } non_bgi_components.append(non_bgi_component) component_no_gene_details.append(component_detail) if 'synonyms' in construct_record: for syn in construct_record.get('synonyms'): construct_synonym = { "data_id": construct_record.get('primaryId'), "synonym": syn.strip() } construct_synonyms.append(construct_synonym) if 'secondaryIds' in construct_record: for secondary_id in construct_record.get('secondaryIds'): construct_secondary_id = { "data_id": construct_record.get('primaryId'), "secondary_id": secondary_id } construct_secondary_ids.append(construct_secondary_id) if counter == batch_size: yield [ constructs, 
construct_secondary_ids, construct_synonyms, cross_reference_list, non_bgi_components, component_details, component_no_gene_details ] constructs = [] construct_secondary_ids = [] construct_synonyms = [] cross_reference_list = [] non_bgi_components = [] component_details = [] component_no_gene_details = [] counter = 0 if counter > 0: yield [ constructs, construct_secondary_ids, construct_synonyms, cross_reference_list, non_bgi_components, component_details, component_no_gene_details ]
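# NOTE (illustrative): the two component templates above rely on
# apoc.create.relationship because plain Cypher cannot take a relationship
# *type* from a CSV column (componentRelation is uppercased into a type such
# as EXPRESSES); "REMOVE rel.noOp" exists only to consume the value the APOC
# call yields. The mechanism, stripped to its essentials (an assumed example
# template, not an additional loader query):
example_dynamic_relationship_template = """
    USING PERIODIC COMMIT %s
    LOAD CSV WITH HEADERS FROM 'file:///%s' AS row
        MATCH (a {primaryKey:row.fromKey}), (b {primaryKey:row.toKey})
        CALL apoc.create.relationship(a, row.relType, {}, b) yield rel
        REMOVE rel.noOp"""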
class ExpressionAtlasETL(ETL):
    """Expression Atlas ETL."""

    logger = logging.getLogger(__name__)

    # Queries which do not take params and can be used as is
    get_all_gene_primary_to_ensmbl_ids_query = """
        MATCH (g:Gene)-[:CROSS_REFERENCE]-(c:CrossReference)
        WHERE c.prefix = 'ENSEMBL'
        RETURN g.primaryKey, c.localId"""

    get_mod_gene_symbol_to_primary_ids_query = """
        MATCH (g:Gene)
        WHERE g.dataProvider = {parameter}
        RETURN g.primaryKey, g.symbol"""

    get_genes_with_expression_atlas_links_query = """
        MATCH (g:Gene)
        WHERE LOWER(g.primaryKey) IN {parameter}
        RETURN g.primaryKey, g.modLocalId"""

    # Query templates which take params and will be processed later
    add_expression_atlas_crossreferences_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (o:Gene)
            WHERE o.primaryKey = row.genePrimaryKey """ + ETLHelper.get_cypher_xref_text()

    def __init__(self, config):
        """Initialise object."""
        super().__init__()
        self.data_type_config = config

    def _load_and_process_data(self):
        thread_pool = []
        ensg_to_gene_primary_id_map = self._get_primary_gene_ids_to_ensembl_ids()

        for sub_type in self.data_type_config.get_sub_type_objects():
            process = multiprocessing.Process(
                target=self._process_sub_type,
                args=(sub_type, ensg_to_gene_primary_id_map))
            process.start()
            thread_pool.append(process)

        ETL.wait_for_threads(thread_pool)

    @staticmethod
    def _get_primary_gene_ids_to_ensembl_ids():
        return_set = Neo4jHelper.run_single_query(
            ExpressionAtlasETL.get_all_gene_primary_to_ensmbl_ids_query)
        return {
            record["c.localId"].lower(): record["g.primaryKey"]
            for record in return_set
        }

    @staticmethod
    def _get_mod_gene_symbol_to_primary_ids(data_provider):
        return_set = Neo4jHelper.run_single_parameter_query(
            ExpressionAtlasETL.get_mod_gene_symbol_to_primary_ids_query,
            data_provider)
        return {
            record["g.symbol"].lower(): record["g.primaryKey"]
            for record in return_set
        }

    # Returns only pages for genes that we have in the Alliance
    def _get_expression_atlas_gene_pages(self, sub_type, data_provider,
                                         ensg_to_gene_primary_id_map):
        filepath = sub_type.get_filepath()
        gene_symbol_to_primary_id_map = self._get_mod_gene_symbol_to_primary_ids(
            data_provider)

        expression_atlas_gene_pages = {}
        with open(filepath) as file_handle:
            doc = xmltodict.parse(file_handle.read())["urlset"]
            for value in doc.values():
                if isinstance(value, (list, )):
                    for element in value:
                        url = element['loc']
                        expression_atlas_gene = url.split("/")[-1]
                        expression_atlas_gene = expression_atlas_gene.lower()
                        if expression_atlas_gene in ensg_to_gene_primary_id_map:
                            expression_atlas_gene_pages[
                                ensg_to_gene_primary_id_map[
                                    expression_atlas_gene].lower()] = url
                        elif expression_atlas_gene in gene_symbol_to_primary_id_map:
                            expression_atlas_gene_pages[
                                gene_symbol_to_primary_id_map[
                                    expression_atlas_gene].lower()] = url
                        else:
                            alliance_gene = data_provider + ":" + expression_atlas_gene
                            expression_atlas_gene_pages[
                                alliance_gene.lower()] = url

        return expression_atlas_gene_pages

    def _process_sub_type(self, sub_type, ensg_to_gene_primary_id_map):
        data_provider = sub_type.get_data_provider()
        expression_atlas_gene_pages = self._get_expression_atlas_gene_pages(
            sub_type, data_provider, ensg_to_gene_primary_id_map)

        commit_size = self.data_type_config.get_neo4j_commit_size()
        batch_size = self.data_type_config.get_generator_batch_size()

        generators = self.get_generators(expression_atlas_gene_pages,
                                         data_provider, batch_size)

        query_template_list = [
            [
                self.add_expression_atlas_crossreferences_query_template,
                commit_size, "expression_atlas_" + data_provider +
"_data.csv" ], ] query_and_file_list = self.process_query_params(query_template_list) CSVTransactor.save_file_static(generators, query_and_file_list) Neo4jTransactor.execute_query_batch(query_and_file_list) self.error_messages("ExpAtlas-{}: ".format( sub_type.get_data_provider())) def get_generators(self, expression_atlas_gene_pages, data_provider, batch_size): """Get Generators.""" return_set = Neo4jHelper.run_single_parameter_query( ExpressionAtlasETL.get_genes_with_expression_atlas_links_query, list(expression_atlas_gene_pages.keys())) counter = 0 cross_reference_list = [] for record in return_set: counter += 1 cross_reference = ETLHelper.get_xref_dict( record["g.primaryKey"].split(":")[1], "ExpressionAtlas_gene", "gene/expression-atlas", "gene/expressionAtlas", record["g.modLocalId"], expression_atlas_gene_pages[record["g.primaryKey"].lower()], data_provider + ":" + record["g.modLocalId"] + "gene/expression-atlas") cross_reference["genePrimaryKey"] = record["g.primaryKey"] cross_reference_list.append(cross_reference) if counter > batch_size: yield [cross_reference_list] counter = 0 cross_reference_list = [] if counter > 0: yield [cross_reference_list]
def get_generators(self, disease_data, batch_size, data_provider): """Creating generators""" counter = 0 disease_association_type = None gene_list_to_yield = [] allele_list_to_yield = [] agm_list_to_yield = [] evidence_code_list_to_yield = [] withs = [] pge_list_to_yield = [] xrefs = [] data_provider_object = disease_data['metaData']['dataProvider'] data_provider_cross_ref = data_provider_object.get('crossReference') data_provider = data_provider_cross_ref.get('id') for disease_record in disease_data['data']: publication_mod_id = "" pub_med_id = "" pub_mod_url = None pub_med_url = None pge_key = '' if self.test_object.using_test_data() is True: is_it_test_entry = self.test_object.check_for_test_id_entry( disease_record.get('objectId')) if is_it_test_entry is False: continue disease_unique_key = disease_record.get('objectId') + disease_record.get('DOid') + \ disease_record['objectRelation'].get("associationType").upper() counter = counter + 1 disease_object_type = disease_record['objectRelation'].get( "objectType") primary_id = disease_record.get('objectId') do_id = disease_record.get('DOid') if 'evidence' in disease_record: pecj_primary_key = str(uuid.uuid4()) evidence = disease_record.get('evidence') if 'publication' in evidence: publication = evidence.get('publication') if publication.get('publicationId').startswith('PMID:'): pub_med_id = publication.get('publicationId') local_pub_med_id = pub_med_id.split(":")[1] pub_med_url = ETLHelper.get_complete_pub_url( local_pub_med_id, pub_med_id) if 'crossReference' in evidence: pub_xref = evidence.get('crossReference') publication_mod_id = pub_xref.get('id') local_pub_mod_id = publication_mod_id.split(":")[1] pub_mod_url = ETLHelper.get_complete_pub_url( local_pub_mod_id, publication_mod_id) else: publication_mod_id = publication.get('publicationId') local_pub_mod_id = publication_mod_id.split(":")[1] pub_mod_url = ETLHelper.get_complete_pub_url( local_pub_mod_id, publication_mod_id) if 'evidenceCodes' in disease_record['evidence']: for ecode in disease_record['evidence'].get( 'evidenceCodes'): ecode_map = { "pecjPrimaryKey": pecj_primary_key, "ecode": ecode } evidence_code_list_to_yield.append(ecode_map) negation = '' if 'objectRelation' in disease_record: disease_association_type = disease_record[ 'objectRelation'].get("associationType").upper() if 'negation' in disease_record: # this capitalization is purposeful if disease_association_type == 'IS_IMPLICATED_IN': disease_association_type = 'IS_NOT_IMPLICATED_IN' if disease_association_type == 'IS_MODEL_OF': disease_association_type = 'IS_NOT_MODEL_OF' if disease_association_type == 'IS_MARKER_FOR': disease_association_type = 'IS_NOT_MARKER_FOR' negation = 'NOT' disease_unique_key = disease_unique_key + negation additional_genetic_components = [] if 'additionalGeneticComponents' in disease_record[ 'objectRelation']: for component in disease_record['objectRelation'][ 'additionalGeneticComponents']: component_symbol = component.get('componentSymbol') component_id = component.get('componentId') component_url = component.get( 'componentUrl') + component_id additional_genetic_components.append({ "id": component_id, "componentUrl": component_url, "componentSymbol": component_symbol }) if 'with' in disease_record: with_record = disease_record.get('with') for rec in with_record: disease_unique_key = disease_unique_key + rec for rec in with_record: with_map = { "diseaseUniqueKey": disease_unique_key, "withD": rec } withs.append(with_map) if 'primaryGeneticEntityIDs' in disease_record: pge_ids = 
disease_record.get('primaryGeneticEntityIDs') for pge in pge_ids: pge_key = pge_key + pge pge_map = { "pecjPrimaryKey": pecj_primary_key, "pgeId": pge } pge_list_to_yield.append(pge_map) if 'dataProvider' in disease_record: for dp in disease_record['dataProvider']: annotation_type = dp.get('type') xref = dp.get('crossReference') cross_ref_id = xref.get('id') pages = xref.get('pages') if ":" in cross_ref_id: local_crossref_id = cross_ref_id.split(":")[1] prefix = cross_ref_id.split(":")[0] else: local_crossref_id = "" prefix = cross_ref_id if annotation_type is None: annotation_type = 'curated' if pages is not None and len(pages) > 0: for page in pages: if (data_provider == 'RGD' or data_provider == 'HUMAN') and prefix == 'DOID': display_name = 'RGD' elif (data_provider == 'RGD' or data_provider == 'HUMAN') and prefix == 'OMIM': display_name = 'OMIM' else: display_name = cross_ref_id.split(":")[0] if display_name == 'DOID': display_name = data_provider mod_global_cross_ref_id = ETLHelper.get_page_complete_url( local_crossref_id, self.xref_url_map, prefix, page) passing_xref = ETLHelper.get_xref_dict( local_crossref_id, prefix, page, page, display_name, mod_global_cross_ref_id, cross_ref_id + page + annotation_type) passing_xref['dataId'] = disease_unique_key if 'loaded' in annotation_type: passing_xref['loadedDB'] = 'true' passing_xref['curatedDB'] = 'false' else: passing_xref['curatedDB'] = 'true' passing_xref['loadedDB'] = 'false' xrefs.append(passing_xref) disease_record = { "diseaseUniqueKey": disease_unique_key, "doId": do_id, "primaryId": primary_id, "pecjPrimaryKey": pecj_primary_key, "relationshipType": disease_association_type.upper(), "dataProvider": data_provider, "dateAssigned": disease_record.get("dateAssigned"), "pubPrimaryKey": publication_mod_id + pub_med_id, "pubModId": publication_mod_id, "pubMedId": pub_med_id, "pubMedUrl": pub_med_url, "pubModUrl": pub_mod_url, "negation": negation } if disease_object_type == 'gene': gene_list_to_yield.append(disease_record) elif disease_object_type == 'allele': allele_list_to_yield.append(disease_record) else: agm_list_to_yield.append(disease_record) if counter == batch_size: yield [ allele_list_to_yield, gene_list_to_yield, agm_list_to_yield, pge_list_to_yield, pge_list_to_yield, pge_list_to_yield, withs, evidence_code_list_to_yield, xrefs ] agm_list_to_yield = [] allele_list_to_yield = [] gene_list_to_yield = [] evidence_code_list_to_yield = [] pge_list_to_yield = [] xrefs = [] withs = [] counter = 0 if counter > 0: yield [ allele_list_to_yield, gene_list_to_yield, agm_list_to_yield, pge_list_to_yield, pge_list_to_yield, pge_list_to_yield, withs, evidence_code_list_to_yield, xrefs ]
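# NOTE (illustrative): the negation handling above flips each association
# type to its NOT_ counterpart and appends 'NOT' to the unique key. The same
# rule as a lookup table (a sketch of the logic; the loader uses the explicit
# if-branches shown above):
NEGATED_ASSOCIATION = {
    'IS_IMPLICATED_IN': 'IS_NOT_IMPLICATED_IN',
    'IS_MODEL_OF': 'IS_NOT_MODEL_OF',
    'IS_MARKER_FOR': 'IS_NOT_MARKER_FOR',
}


def apply_negation(association_type, disease_record):
    """Return (possibly negated association type, negation flag)."""
    if 'negation' in disease_record:
        return NEGATED_ASSOCIATION.get(association_type,
                                       association_type), 'NOT'
    return association_type, ''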
    def get_generators(self, variant_data, batch_size):  # noqa
        """Get Generators."""
        data_providers = []
        release = ""
        variants = []
        variant_genomic_locations = []
        variant_so_terms = []
        cross_references = []
        counter = 0
        date_produced = variant_data['metaData']['dateProduced']

        self.data_providers_process(variant_data)
        load_key = date_produced + self.data_provider + "_VARIATION"

        if 'release' in variant_data['metaData']:
            release = variant_data['metaData']['release']

        assemblies = {}
        for allele_record in variant_data['data']:
            chromosome = allele_record["chromosome"]
            if chromosome.startswith("chr"):
                chromosome_str = chromosome[3:]
            else:
                chromosome_str = chromosome

            assembly = allele_record["assembly"]

            if assembly not in assemblies:
                self.logger.info(assembly)
                context_info = ContextInfo()
                data_manager = DataFileManager(
                    context_info.config_file_location)
                assemblies[assembly] = AssemblySequenceHelper(
                    assembly, data_manager)

            so_term_id = allele_record.get('type')
            genomic_reference_sequence = allele_record.get(
                'genomicReferenceSequence')
            genomic_variant_sequence = allele_record.get(
                'genomicVariantSequence')

            if genomic_reference_sequence == 'N/A':
                genomic_reference_sequence = ""
            if genomic_variant_sequence == 'N/A':
                genomic_variant_sequence = ""

            padding_left = ""
            padding_right = ""
            if allele_record.get('start') != "" and allele_record.get('end') != "":
                # not insertion
                if so_term_id != "SO:0000667" and chromosome_str != "Unmapped_Scaffold_8_D1580_D1567":
                    genomic_reference_sequence = assemblies[assembly].get_sequence(
                        chromosome_str,
                        allele_record.get('start'),
                        allele_record.get('end'))

                if allele_record.get('start') < allele_record.get('end'):
                    start = allele_record.get('start')
                    end = allele_record.get('end')
                else:
                    start = allele_record.get('end')
                    end = allele_record.get('start')

                padding_width = 500
                if so_term_id != "SO:0000667":  # not insertion
                    start = start - 1
                    end = end + 1

                left_padding_start = start - padding_width
                if left_padding_start < 1:
                    left_padding_start = 1

                padding_left = assemblies[assembly].get_sequence(
                    chromosome_str, left_padding_start, start)
                right_padding_end = end + padding_width
                padding_right = assemblies[assembly].get_sequence(
                    chromosome_str, end, right_padding_end)

            counter = counter + 1
            global_id = allele_record.get('alleleId')
            mod_global_cross_ref_id = ""

            if self.test_object.using_test_data() is True:
                is_it_test_entry = self.test_object.check_for_test_id_entry(
                    global_id)
                if is_it_test_entry is False:
                    counter = counter - 1
                    continue

            cross_ref_primary_id = allele_record.get(
                'sequenceOfReferenceAccessionNumber')
            if cross_ref_primary_id is not None:
                local_cross_ref_id = cross_ref_primary_id.split(":")[1]
                prefix = cross_ref_primary_id.split(":")[0]

                cross_ref_complete_url = self.etlh.get_no_page_complete_url(
                    local_cross_ref_id, prefix, global_id)
                xref_map = ETLHelper.get_xref_dict(
                    local_cross_ref_id, prefix,
                    "variant_sequence_of_reference",
                    "sequence_of_reference_accession_number", global_id,
                    cross_ref_complete_url,
                    cross_ref_primary_id + "variant_sequence_of_reference")
                xref_map['dataId'] = global_id
                cross_references.append(xref_map)

            if genomic_reference_sequence is not None:
                if len(genomic_reference_sequence) > 1000 and (
                        allele_record.get('type') == 'SO:1000002'
                        or allele_record.get('type') == 'SO:1000008'):
                    self.logger.debug("%s genomicReferenceSequence",
                                      allele_record.get('alleleId'))

            if genomic_variant_sequence is not None:
                if len(genomic_variant_sequence) > 1000 and (
                        allele_record.get('type') in ['SO:1000002', 'SO:1000008']):
                    self.logger.debug("%s genomicVariantSequence",
                                      allele_record.get('alleleId'))

            hgvs_nomenclature, hgvs_synonym = self.get_hgvs_nomenclature(
                allele_record.get('sequenceOfReferenceAccessionNumber'),
                allele_record.get('type'),
                allele_record.get('start'),
                allele_record.get('end'),
                genomic_reference_sequence,
                genomic_variant_sequence,
                allele_record.get('assembly'),
                chromosome_str)

            if (genomic_reference_sequence is not None and len(genomic_reference_sequence) > 30000) \
                    or (genomic_variant_sequence is not None and len(genomic_variant_sequence) > 30000):
                self.logger.debug("%s potentially has too long a sequence",
                                  allele_record.get('alleleId'))

            # TODO: fix typo in MGI Submission for this variant so
            # that it doesn't list a 40K bp point mutation.
            if allele_record.get('alleleId') != 'MGI:6113870':
                variant_dataset = {
                    "hgvs_nomenclature": hgvs_nomenclature,
                    "genomicReferenceSequence": genomic_reference_sequence,
                    "genomicVariantSequence": genomic_variant_sequence,
                    "paddingLeft": padding_left,
                    "paddingRight": padding_right,
                    "alleleId": allele_record.get('alleleId'),
                    "dataProviders": data_providers,
                    "dateProduced": date_produced,
                    "loadKey": load_key,
                    "release": release,
                    "modGlobalCrossRefId": mod_global_cross_ref_id,
                    "dataProvider": self.data_provider,
                    "variantHGVSSynonym": hgvs_synonym
                }

                variant_genomic_location_dataset = {
                    "variantId": hgvs_nomenclature,
                    "assembly": allele_record.get('assembly'),
                    "chromosome": chromosome_str,
                    "start": allele_record.get('start'),
                    "end": allele_record.get('end'),
                    "uuid": str(uuid.uuid4()),
                    "dataProvider": self.data_provider
                }

                variant_so_term = {
                    "variantId": hgvs_nomenclature,
                    "soTermId": allele_record.get('type')
                }

                variant_so_terms.append(variant_so_term)
                variant_genomic_locations.append(
                    variant_genomic_location_dataset)
                variants.append(variant_dataset)

            if counter == batch_size:
                yield [
                    variants, variant_genomic_locations, variant_so_terms,
                    cross_references
                ]
                variants = []
                variant_genomic_locations = []
                variant_so_terms = []
                cross_references = []
                counter = 0

        if counter > 0:
            yield [
                variants, variant_genomic_locations, variant_so_terms,
                cross_references
            ]
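# NOTE (illustrative): the padding logic above fetches up to 500 bp of
# flanking sequence on each side of a variant, clamping the left edge at
# position 1. The window arithmetic in isolation (a plain function standing
# in for the coordinate inputs to AssemblySequenceHelper.get_sequence):
def flanking_windows(start, end, padding_width=500):
    """Return ((left_start, left_end), (right_start, right_end)), 1-based."""
    left_padding_start = max(start - padding_width, 1)
    return (left_padding_start, start), (end, end + padding_width)


# flanking_windows(300, 320) -> ((1, 300), (320, 820))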
class DiseaseETL(ETL): """Disease ETL.""" logger = logging.getLogger(__name__) # Query templates which take params and will be processed later execute_annotation_xrefs_query_template = """ USING PERIODIC COMMIT %s LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row MATCH (o:DiseaseEntityJoin:Association {primaryKey:row.dataId}) """ + ETLHelper.get_cypher_xref_text_annotation_level() execute_agms_query_template = """ USING PERIODIC COMMIT %s LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row // GET PRIMARY DATA OBJECTS MATCH (d:DOTerm:Ontology {primaryKey:row.doId}) MATCH (agm:AffectedGenomicModel {primaryKey:row.primaryId}) CALL apoc.create.relationship(d, row.relationshipType, {}, agm) yield rel SET rel.uuid = row.diseaseUniqueKey REMOVE rel.noOp //This is an intentional MERGE, please leave as is MERGE (dfa:Association:DiseaseEntityJoin {primaryKey:row.diseaseUniqueKey}) ON CREATE SET dfa.dataProvider = row.dataProvider, dfa.sortOrder = 1, dfa.joinType = row.relationshipType, dfa.negation = row.negation MERGE (agm)-[fdaf:ASSOCIATION]->(dfa) MERGE (dfa)-[dadf:ASSOCIATION]->(d) // PUBLICATIONS FOR FEATURE MERGE (pubf:Publication {primaryKey:row.pubPrimaryKey}) ON CREATE SET pubf.pubModId = row.pubModId, pubf.pubMedId = row.pubMedId, pubf.pubModUrl = row.pubModUrl, pubf.pubMedUrl = row.pubMedUrl MERGE (pubEJ:PublicationJoin:Association {primaryKey:row.pecjPrimaryKey}) ON CREATE SET pubEJ.joinType = 'pub_evidence_code_join', pubEJ.dateAssigned = row.dateAssigned MERGE (dfa)-[dapug:EVIDENCE {uuid:row.pecjPrimaryKey}]->(pubEJ) MERGE (pubf)-[pubfpubEJ:ASSOCIATION {uuid:row.pecjPrimaryKey}]->(pubEJ) """ execute_allele_query_template = """ USING PERIODIC COMMIT %s LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row // GET PRIMARY DATA OBJECTS MATCH (d:DOTerm:Ontology {primaryKey:row.doId}) MATCH (allele:Allele:Feature {primaryKey:row.primaryId}) CALL apoc.create.relationship(d, row.relationshipType, {}, allele) yield rel SET rel.uuid = row.diseaseUniqueKey REMOVE rel.noOp //This is an intentional MERGE, please leave as is MERGE (dfa:Association:DiseaseEntityJoin {primaryKey:row.diseaseUniqueKey}) ON CREATE SET dfa.dataProvider = row.dataProvider, dfa.sortOrder = 1, dfa.joinType = row.relationshipType, dfa.negation = row.negation MERGE (allele)-[fdaf:ASSOCIATION]->(dfa) MERGE (dfa)-[dadf:ASSOCIATION]->(d) // PUBLICATIONS FOR FEATURE MERGE (pubf:Publication {primaryKey:row.pubPrimaryKey}) ON CREATE SET pubf.pubModId = row.pubModId, pubf.pubMedId = row.pubMedId, pubf.pubModUrl = row.pubModUrl, pubf.pubMedUrl = row.pubMedUrl MERGE (pubEJ:PublicationJoin:Association {primaryKey:row.pecjPrimaryKey}) ON CREATE SET pubEJ.joinType = 'pub_evidence_code_join', pubEJ.dateAssigned = row.dateAssigned MERGE (dfa)-[dapug:EVIDENCE {uuid:row.pecjPrimaryKey}]->(pubEJ) MERGE (pubf)-[pubfpubEJ:ASSOCIATION {uuid:row.pecjPrimaryKey}]->(pubEJ)""" execute_gene_query_template = """ USING PERIODIC COMMIT %s LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row MATCH (d:DOTerm:Ontology {primaryKey:row.doId}) MATCH (gene:Gene {primaryKey:row.primaryId}) CALL apoc.create.relationship(d, row.relationshipType, {}, gene) yield rel SET rel.uuid = row.diseaseUniqueKey REMOVE rel.noOp MERGE (dga:Association:DiseaseEntityJoin {primaryKey:row.diseaseUniqueKey}) SET dga.dataProvider = row.dataProvider, dga.sortOrder = 1, dga.joinType = row.relationshipType, dga.negation = row.negation MERGE (gene)-[fdag:ASSOCIATION]->(dga) MERGE (dga)-[dadg:ASSOCIATION]->(d) // PUBLICATIONS FOR GENE MERGE (pubg:Publication {primaryKey:row.pubPrimaryKey}) ON CREATE 
SET pubg.pubModId = row.pubModId, pubg.pubMedId = row.pubMedId, pubg.pubModUrl = row.pubModUrl, pubg.pubMedUrl = row.pubMedUrl MERGE (pubEJ:PublicationJoin:Association {primaryKey:row.pecjPrimaryKey}) ON CREATE SET pubEJ.joinType = 'pub_evidence_code_join', pubEJ.dateAssigned = row.dateAssigned MERGE (dga)-[dapug:EVIDENCE {uuid:row.pecjPrimaryKey}]->(pubEJ) MERGE (pubg)-[pubgpubEJ:ASSOCIATION {uuid:row.pecjPrimaryKey}]->(pubEJ)""" execute_ecode_query_template = """ USING PERIODIC COMMIT %s LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row MATCH (o:Ontology:ECOTerm {primaryKey:row.ecode}) MATCH (pubjk:PublicationJoin:Association {primaryKey:row.pecjPrimaryKey}) MERGE (pubjk)-[daecode1g:ASSOCIATION {uuid:row.pecjPrimaryKey}]->(o)""" execute_withs_query_template = """ USING PERIODIC COMMIT %s LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row MATCH (dga:Association:DiseaseEntityJoin {primaryKey:row.diseaseUniqueKey}) MATCH (diseaseWith:Gene {primaryKey:row.withD}) MERGE (dga)-[dgaw:FROM_ORTHOLOGOUS_GENE]-(diseaseWith) """ execute_pges_gene_query_template = """ USING PERIODIC COMMIT %s LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row MATCH (n:Gene {primaryKey:row.pgeId}) MATCH (d:PublicationJoin:Association {primaryKey:row.pecjPrimaryKey}) MERGE (d)-[dgaw:PRIMARY_GENETIC_ENTITY]-(n)""" execute_pges_allele_query_template = """ USING PERIODIC COMMIT %s LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row MATCH (n:Allele {primaryKey:row.pgeId}) MATCH (d:PublicationJoin:Association {primaryKey:row.pecjPrimaryKey}) MERGE (d)-[dgaw:PRIMARY_GENETIC_ENTITY]-(n)""" execute_pges_agm_query_template = """ USING PERIODIC COMMIT %s LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row MATCH (n:AffectedGenomicModel {primaryKey:row.pgeId}) MATCH (d:PublicationJoin:Association {primaryKey:row.pecjPrimaryKey}) MERGE (d)-[dgaw:PRIMARY_GENETIC_ENTITY]-(n)""" def __init__(self, config): """Initialise object.""" super().__init__() self.data_type_config = config self.disease_unique_key = None self.disease_association_type = None def _load_and_process_data(self): thread_pool = [] for sub_type in self.data_type_config.get_sub_type_objects(): process = multiprocessing.Process(target=self._process_sub_type, args=(sub_type,)) process.start() thread_pool.append(process) ETL.wait_for_threads(thread_pool) self.delete_empty_nodes() def delete_empty_nodes(self): """Delete Empty Nodes.""" self.logger.debug("delete empty nodes") delete_empty_do_nodes_query = """ MATCH (dd:DOTerm) WHERE keys(dd)[0] = 'primaryKey' AND size(keys(dd)) = 1 DETACH DELETE (dd)""" Neo4jHelper.run_single_query(delete_empty_do_nodes_query) def _process_sub_type(self, sub_type): self.logger.info("Loading Disease Data: %s", sub_type.get_data_provider()) filepath = sub_type.get_filepath() data = JSONFile().get_data(filepath) self.logger.info("Finished Loading Disease Data: %s", sub_type.get_data_provider()) if data is None: self.logger.warning("No Data found for %s skipping", sub_type.get_data_provider()) return commit_size = self.data_type_config.get_neo4j_commit_size() batch_size = self.data_type_config.get_generator_batch_size() # This needs to be in this format (template, param1, params2) others will be ignored query_template_list = [ [self.execute_allele_query_template, commit_size, "disease_allele_data_" + sub_type.get_data_provider() + ".csv"], [self.execute_gene_query_template, commit_size, "disease_gene_data_" + sub_type.get_data_provider() + ".csv"], [self.execute_agms_query_template, commit_size, "disease_agms_data_" + sub_type.get_data_provider() + ".csv"], 
[self.execute_pges_gene_query_template, commit_size, "disease_pges_gene_data_" + sub_type.get_data_provider() + ".csv"], [self.execute_pges_allele_query_template, commit_size, "disease_pges_allele_data_" + sub_type.get_data_provider() + ".csv"], [self.execute_pges_agm_query_template, commit_size, "disease_pges_agms_data_" + sub_type.get_data_provider() + ".csv"], [self.execute_withs_query_template, commit_size, "disease_withs_data_" + sub_type.get_data_provider() + ".csv"], [self.execute_ecode_query_template, commit_size, "disease_evidence_code_data_" + sub_type.get_data_provider() + ".csv"], [self.execute_annotation_xrefs_query_template, commit_size, "disease_annotation_xrefs_data_" + sub_type.get_data_provider() + ".csv"] ] # Obtain the generator generators = self.get_generators(data, batch_size, sub_type.get_data_provider()) query_and_file_list = self.process_query_params(query_template_list) CSVTransactor.save_file_static(generators, query_and_file_list) Neo4jTransactor.execute_query_batch(query_and_file_list) self.error_messages("Disease-{}: ".format(sub_type.get_data_provider())) self.logger.info("Finished Loading Disease Data: %s", sub_type.get_data_provider()) def process_pages(self, dp, xrefs, pages): """Process pages to get xrefs.""" annotation_type = dp.get('type') xref = dp.get('crossReference') cross_ref_id = xref.get('id') if ":" in cross_ref_id: local_crossref_id = cross_ref_id.split(":")[1] prefix = cross_ref_id.split(":")[0] else: local_crossref_id = "" prefix = cross_ref_id if annotation_type is None: annotation_type = 'curated' for page in pages: if (self.data_provider == 'RGD' or self.data_provider == 'HUMAN') and prefix == 'DOID': display_name = 'RGD' elif (self.data_provider == 'RGD' or self.data_provider == 'HUMAN') and prefix == 'OMIM': display_name = 'OMIM' else: display_name = cross_ref_id.split(":")[0] if display_name == 'DOID': display_name = self.data_provider mod_global_cross_ref_url = self.etlh.rdh2.return_url_from_key_value( prefix, local_crossref_id, page) passing_xref = ETLHelper.get_xref_dict( local_crossref_id, prefix, page, page, display_name, mod_global_cross_ref_url, cross_ref_id + page + annotation_type) passing_xref['dataId'] = self.disease_unique_key if 'loaded' in annotation_type: passing_xref['loadedDB'] = 'true' passing_xref['curatedDB'] = 'false' else: passing_xref['curatedDB'] = 'true' passing_xref['loadedDB'] = 'false' xrefs.append(passing_xref) def xrefs_process(self, disease_record, xrefs): """Process the xrefs.""" if 'dataProvider' not in disease_record: return for dp in disease_record['dataProvider']: xref = dp.get('crossReference') pages = xref.get('pages') if pages is None or len(pages) == 0: continue self.process_pages(dp, xrefs, pages) def evidence_process(self, disease_record, pubs, evidence_code_list_to_yield): """Process evidence.""" pecj_primary_key = str(uuid.uuid4()) if 'evidence' not in disease_record: self.logger.critical("No evidence but creating new pecj_primary_key anyway") return pecj_primary_key evidence = disease_record.get('evidence') if 'publication' in evidence: publication = evidence.get('publication') if publication.get('publicationId').startswith('PMID:'): pubs['pub_med_id'] = publication.get('publicationId') pubs['pub_med_url'] = self.etlh.return_url_from_identifier(pubs['pub_med_id']) if 'crossReference' in evidence: pub_xref = evidence.get('crossReference') pubs['publication_mod_id'] = pub_xref.get('id') pubs['pub_mod_url'] = self.etlh.return_url_from_identifier(pubs['publication_mod_id']) else: 
                pubs['publication_mod_id'] = publication.get('publicationId')
                pubs['pub_mod_url'] = self.etlh.return_url_from_identifier(
                    pubs['publication_mod_id'])

        if 'evidenceCodes' in disease_record['evidence']:
            for ecode in disease_record['evidence'].get('evidenceCodes'):
                ecode_map = {"pecjPrimaryKey": pecj_primary_key,
                             "ecode": ecode}
                evidence_code_list_to_yield.append(ecode_map)

        return pecj_primary_key

    def objectrelation_process(self, disease_record):
        """Object Relation processing."""
        negation = ''
        if 'objectRelation' not in disease_record:
            self.logger.critical(
                "objectRelation not in record so disease_annotation_type is the last one seen")
            return negation
        if 'negation' in disease_record:
            # this capitalization is purposeful
            if self.disease_association_type == 'IS_IMPLICATED_IN':
                self.disease_association_type = 'IS_NOT_IMPLICATED_IN'
            elif self.disease_association_type == 'IS_MODEL_OF':
                self.disease_association_type = 'IS_NOT_MODEL_OF'
            elif self.disease_association_type == 'IS_MARKER_FOR':
                self.disease_association_type = 'IS_NOT_MARKER_FOR'
            negation = 'NOT'
            self.disease_unique_key = self.disease_unique_key + negation
        return negation

        # Not used anywhere so commented out for now?
        # additional_genetic_components = []
        # if 'additionalGeneticComponents' in disease_record['objectRelation']:
        #     for component in disease_record['objectRelation']['additionalGeneticComponents']:
        #         component_symbol = component.get('componentSymbol')
        #         component_id = component.get('componentId')
        #         component_url = component.get('componentUrl') + component_id
        #         additional_genetic_components.append(
        #             {"id": component_id,
        #              "componentUrl": component_url,
        #              "componentSymbol": component_symbol}
        #         )

    def withs_process(self, disease_record, withs):
        """Process withs."""
        if 'with' not in disease_record:
            return
        with_record = disease_record.get('with')
        for rec in with_record:
            self.disease_unique_key = self.disease_unique_key + rec
        for rec in with_record:
            with_map = {
                "diseaseUniqueKey": self.disease_unique_key,
                "withD": rec
            }
            withs.append(with_map)

    def primgenent_process(self, disease_record, pge_list_to_yield, pecj_primary_key):
        """Primary Genetic Entity ID process."""
        if 'primaryGeneticEntityIDs' not in disease_record:
            return
        pge_ids = disease_record.get('primaryGeneticEntityIDs')
        for pge in pge_ids:
            pge_map = {"pecjPrimaryKey": pecj_primary_key, "pgeId": pge}
            pge_list_to_yield.append(pge_map)

    def get_generators(self, disease_data, batch_size, data_provider):
        """Create generators."""
        counter = 0
        gene_list_to_yield = []
        allele_list_to_yield = []
        agm_list_to_yield = []
        evidence_code_list_to_yield = []
        withs = []
        pge_list_to_yield = []
        xrefs = []

        self.data_providers_process(disease_data)

        for disease_record in disease_data['data']:
            pubs = {'pub_med_url': None,
                    'pub_med_id': "",
                    'pub_mod_url': None,
                    'publication_mod_id': ""}
            # pge_key = ''
            if self.test_object.using_test_data() is True:
                is_it_test_entry = self.test_object.check_for_test_id_entry(
                    disease_record.get('objectId'))
                if is_it_test_entry is False:
                    continue

            self.disease_unique_key = disease_record.get('objectId') + disease_record.get('DOid') + \
                disease_record['objectRelation'].get("associationType").upper()
            self.disease_association_type = \
                disease_record['objectRelation'].get("associationType").upper()

            counter = counter + 1
            disease_object_type = disease_record['objectRelation'].get("objectType")
            primary_id = disease_record.get('objectId')
            do_id = disease_record.get('DOid')

            pecj_primary_key = self.evidence_process(
                disease_record, pubs, evidence_code_list_to_yield)
            negation = self.objectrelation_process(disease_record)
            self.withs_process(disease_record, withs)
            self.primgenent_process(disease_record, pge_list_to_yield,
                                    pecj_primary_key)
            self.xrefs_process(disease_record, xrefs)

            disease_record = {
                "diseaseUniqueKey": self.disease_unique_key,
                "doId": do_id,
                "primaryId": primary_id,
                "pecjPrimaryKey": pecj_primary_key,
                "relationshipType": self.disease_association_type,
                "dataProvider": data_provider,
                "dateAssigned": disease_record.get("dateAssigned"),
                "pubPrimaryKey": pubs['publication_mod_id'] + pubs['pub_med_id'],
                "pubModId": pubs['publication_mod_id'],
                "pubMedId": pubs['pub_med_id'],
                "pubMedUrl": pubs['pub_med_url'],
                "pubModUrl": pubs['pub_mod_url'],
                "negation": negation}

            if disease_object_type == 'gene':
                gene_list_to_yield.append(disease_record)
            elif disease_object_type == 'allele':
                allele_list_to_yield.append(disease_record)
            else:
                agm_list_to_yield.append(disease_record)

            if counter == batch_size:
                yield [allele_list_to_yield, gene_list_to_yield,
                       agm_list_to_yield, pge_list_to_yield,
                       pge_list_to_yield, pge_list_to_yield, withs,
                       evidence_code_list_to_yield, xrefs]
                agm_list_to_yield = []
                allele_list_to_yield = []
                gene_list_to_yield = []
                evidence_code_list_to_yield = []
                pge_list_to_yield = []
                xrefs = []
                withs = []
                counter = 0

        if counter > 0:
            yield [allele_list_to_yield, gene_list_to_yield,
                   agm_list_to_yield, pge_list_to_yield, pge_list_to_yield,
                   pge_list_to_yield, withs, evidence_code_list_to_yield,
                   xrefs]
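# NOTE (illustrative): pge_list_to_yield appears three times in the yields
# above because the same primary-genetic-entity rows feed three templates
# (the gene, allele and AGM PRIMARY_GENETIC_ENTITY queries); each template's
# MATCH simply fails to bind for rows of the other two kinds. The positional
# pairing, assuming the template order from _process_sub_type above:
DISEASE_YIELD_ORDER = [
    ("execute_allele_query_template", "allele_list_to_yield"),
    ("execute_gene_query_template", "gene_list_to_yield"),
    ("execute_agms_query_template", "agm_list_to_yield"),
    ("execute_pges_gene_query_template", "pge_list_to_yield"),
    ("execute_pges_allele_query_template", "pge_list_to_yield"),
    ("execute_pges_agm_query_template", "pge_list_to_yield"),
    ("execute_withs_query_template", "withs"),
    ("execute_ecode_query_template", "evidence_code_list_to_yield"),
    ("execute_annotation_xrefs_query_template", "xrefs"),
]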
    def get_generators(self, phenotype_data, batch_size):
        """Get Generators."""
        list_to_yield = []
        pge_list_to_yield = []
        date_produced = phenotype_data['metaData']['dateProduced']
        data_providers = []
        data_provider_object = phenotype_data['metaData']['dataProvider']
        counter = 0
        data_provider_cross_ref = data_provider_object.get('crossReference')
        data_provider = data_provider_cross_ref.get('id')
        data_provider_pages = data_provider_cross_ref.get('pages')
        data_provider_cross_ref_set = []
        pge_key = ''

        load_key = date_produced + data_provider + "_phenotype"

        if data_provider_pages is not None:
            for data_provider_page in data_provider_pages:
                cross_ref_complete_url = ETLHelper.get_page_complete_url(
                    data_provider, ETL.xref_url_map, data_provider,
                    data_provider_page)
                data_provider_cross_ref_set.append(ETLHelper.get_xref_dict(
                    data_provider, data_provider, data_provider_page,
                    data_provider_page, data_provider,
                    cross_ref_complete_url,
                    data_provider + data_provider_page))
                data_providers.append(data_provider)
                self.logger.debug("data provider: %s", data_provider)

        for pheno in phenotype_data['data']:
            pecj_primary_key = str(uuid.uuid4())
            counter = counter + 1
            pub_med_id = None
            pub_mod_id = None
            pub_med_url = None
            pub_mod_url = None
            primary_id = pheno.get('objectId')
            phenotype_statement = pheno.get('phenotypeStatement')

            if self.test_object.using_test_data() is True:
                is_it_test_entry = self.test_object.check_for_test_id_entry(
                    primary_id)
                if is_it_test_entry is False:
                    counter = counter - 1
                    continue

            evidence = pheno.get('evidence')
            if 'publicationId' in evidence:
                if evidence.get('publicationId').startswith('PMID:'):
                    pub_med_id = evidence['publicationId']
                    local_pub_med_id = pub_med_id.split(":")[1]
                    pub_med_prefix = pub_med_id.split(":")[0]
                    pub_med_url = ETLHelper.get_no_page_complete_url(
                        local_pub_med_id, self.xref_url_map, pub_med_prefix,
                        primary_id)
                    if 'crossReference' in evidence:
                        pub_xref = evidence.get('crossReference')
                        pub_mod_id = pub_xref.get('id')
                        if pub_mod_id is not None:
                            pub_mod_local_id = pub_mod_id.split(":")[1]
                            pub_mod_url = ETLHelper.get_complete_pub_url(
                                pub_mod_local_id, pub_mod_id)
                else:
                    pub_mod_id = evidence.get('publicationId')
                    if pub_mod_id is not None:
                        pub_mod_local_id = pub_mod_id.split(":")[1]
                        pub_mod_url = ETLHelper.get_complete_pub_url(
                            pub_mod_local_id, pub_mod_id)

            date_assigned = pheno.get('dateAssigned')

            if pub_mod_id is None and pub_med_id is None:
                self.logger.info("%s is missing pubMed and pubMod id",
                                 primary_id)

            if pub_med_id is None:
                pub_med_id = ""
            if pub_mod_id is None:
                pub_mod_id = ""

            if 'primaryGeneticEntityIDs' in pheno:
                pge_ids = pheno.get('primaryGeneticEntityIDs')
                for pge in pge_ids:
                    pge_key = pge_key + pge
                    pge_map = {
                        "pecjPrimaryKey": pecj_primary_key,
                        "pgeId": pge
                    }
                    pge_list_to_yield.append(pge_map)

            phenotype = {
                "primaryId": primary_id,
                "phenotypeUniqueKey": primary_id + phenotype_statement.strip(),
                "phenotypeStatement": phenotype_statement.strip(),
                "dateAssigned": date_assigned,
                "loadKey": load_key,
                "type": "gene",
                "dataProviders": data_providers,
                "dataProvider": data_provider,
                "dateProduced": date_produced,
                "pubMedId": pub_med_id,
                "pubMedUrl": pub_med_url,
                "pubModId": pub_mod_id,
                "pubModUrl": pub_mod_url,
                "pubPrimaryKey": pub_med_id + pub_mod_id,
                "pecjPrimaryKey": pecj_primary_key
            }
            list_to_yield.append(phenotype)

            if counter == batch_size:
                yield [
                    list_to_yield, list_to_yield, list_to_yield,
                    pge_list_to_yield, pge_list_to_yield
                ]
                list_to_yield = []
                pge_list_to_yield = []
                counter = 0

        if counter > 0:
            yield [
                list_to_yield, list_to_yield, list_to_yield,
                pge_list_to_yield, pge_list_to_yield
            ]
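# NOTE (illustrative): the publication handling above prefers a PMID (with an
# optional MOD cross-reference supplying the MOD publication id) and
# otherwise treats publicationId itself as the MOD id. The branching,
# condensed into a sketch (resolve_publication is a hypothetical helper, not
# part of the loader):
def resolve_publication(evidence):
    """Return (pub_med_id, pub_mod_id); either may be an empty string."""
    pub_id = evidence.get('publicationId') or ''
    if pub_id.startswith('PMID:'):
        pub_xref = evidence.get('crossReference') or {}
        return pub_id, pub_xref.get('id') or ''
    return '', pub_id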
def get_generators(self, agm_data, data_provider, batch_size): """Get Generators""" data_providers = [] agms = [] agm_synonyms = [] agm_secondary_ids = [] mod_global_cross_ref_url = "" components = [] backgrounds = [] sqtrs = [] counter = 0 date_produced = agm_data['metaData']['dateProduced'] data_provider_object = agm_data['metaData']['dataProvider'] data_provider_cross_ref = data_provider_object.get('crossReference') data_provider = data_provider_cross_ref.get('id') data_provider_pages = data_provider_cross_ref.get('pages') data_provider_cross_ref_set = [] load_key = date_produced + data_provider + "_agm" if data_provider_pages is not None: for data_provider_page in data_provider_pages: cross_ref_complete_url = ETLHelper.get_page_complete_url( data_provider, self.xref_url_map, data_provider, data_provider_page) data_provider_cross_ref_set.append(ETLHelper.get_xref_dict(\ data_provider, data_provider, data_provider_page, data_provider_page, data_provider, cross_ref_complete_url, data_provider + data_provider_page)) data_providers.append(data_provider) self.logger.info("data provider: %s", data_provider) for agm_record in agm_data['data']: counter = counter + 1 global_id = agm_record['primaryID'] local_id = global_id.split(":")[1] if self.test_object.using_test_data() is True: is_it_test_entry = self.test_object.check_for_test_id_entry( global_id) if is_it_test_entry is False: counter = counter - 1 continue if agm_record.get('secondaryIds') is not None: for sid in agm_record.get('secondaryIds'): agm_secondary_id_dataset = { "primaryId": agm_record.get('primaryID'), "secondaryId": sid } agm_secondary_ids.append(agm_secondary_id_dataset) if agm_record.get('synonyms') is not None: for syn in agm_record.get('synonyms'): syn_dataset = { "primaryId": agm_record.get('primaryID'), "synonym": syn } agm_synonyms.append(syn_dataset) if 'crossReference' in agm_record: cross_ref = agm_record.get('crossReference') cross_ref_id = cross_ref.get('id') local_crossref_id = cross_ref_id.split(":")[1] prefix = cross_ref.get('id').split(":")[0] pages = cross_ref.get('pages') # some pages collection have 0 elements if pages is not None and len(pages) > 0: for page in pages: if page in ['Fish', 'genotype', 'strain']: mod_global_cross_ref_url = ETLHelper.get_page_complete_url( local_crossref_id, self.xref_url_map, prefix, page) short_species_abbreviation = ETLHelper.get_short_species_abbreviation( agm_record.get('taxonId')) name_text = TextProcessingHelper.cleanhtml(agm_record.get('name')) # TODO: make subtype required in submission file. 
            subtype = agm_record.get('subtype')
            if subtype is None and data_provider == 'WB':
                subtype = 'strain'
            if subtype is None:
                subtype = 'affected_genomic_model'

            # TODO: name_text
            agm_dataset = {
                "primaryId": agm_record.get('primaryID'),
                "name": agm_record.get('name'),
                "globalId": global_id,
                "localId": local_id,
                "taxonId": agm_record.get('taxonId'),
                "dataProviders": data_providers,
                "dateProduced": date_produced,
                "loadKey": load_key,
                "subtype": subtype,
                "modGlobalCrossRefUrl": mod_global_cross_ref_url,
                "dataProvider": data_provider,
                "nameText": name_text,
                "nameWithSpecies": agm_record.get('name') + " (" + short_species_abbreviation + ")",
                "nameTextWithSpecies": name_text + " (" + short_species_abbreviation + ")",
            }
            agms.append(agm_dataset)

            if agm_record.get('affectedGenomicModelComponents') is not None:
                for component in agm_record.get('affectedGenomicModelComponents'):
                    component_dataset = {
                        "primaryId": agm_record.get('primaryID'),
                        "componentId": component.get('alleleID'),
                        "zygosityId": component.get('zygosity')
                    }
                    components.append(component_dataset)

            if agm_record.get('sequenceTargetingReagentIDs') is not None:
                for sqtr in agm_record.get('sequenceTargetingReagentIDs'):
                    sqtr_dataset = {
                        "primaryId": agm_record.get('primaryID'),
                        "sqtrId": sqtr
                    }
                    sqtrs.append(sqtr_dataset)

            if agm_record.get('parentalPopulationIDs') is not None:
                for background in agm_record.get('parentalPopulationIDs'):
                    background_dataset = {
                        "primaryId": agm_record.get('primaryID'),
                        "backgroundId": background
                    }
                    backgrounds.append(background_dataset)

            if counter == batch_size:
                yield [
                    agms, agm_secondary_ids, agm_synonyms, components, sqtrs,
                    backgrounds
                ]
                agms = []
                agm_secondary_ids = []
                agm_synonyms = []
                components = []
                sqtrs = []
                backgrounds = []
                counter = 0

        if counter > 0:
            yield [
                agms, agm_secondary_ids, agm_synonyms, components, sqtrs,
                backgrounds
            ]
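# NOTE (illustrative): subtype defaulting above, condensed -- WB models with
# no subtype are treated as strains, everything else falls back to
# 'affected_genomic_model' (default_subtype is a sketch, not a loader helper):
def default_subtype(subtype, data_provider):
    if subtype is None:
        subtype = 'strain' if data_provider == 'WB' else 'affected_genomic_model'
    return subtype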
class HTPMetaDatasetETL(ETL):
    """HTP Meta Dataset ETL."""

    logger = logging.getLogger(__name__)

    htp_dataset_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            CREATE (ds:HTPDataset {primaryKey:row.datasetId})
            SET ds.dateAssigned = row.dateAssigned,
                ds.summary = row.summary,
                ds.numChannels = row.numChannels,
                ds.subSeries = row.subSeries """

    htp_dataset_pub_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (ds:HTPDataset {primaryKey: row.datasetId})
            MERGE (p:Publication {primaryKey: row.pubPrimaryKey})
                ON CREATE SET p.pubModId = row.pubModId,
                    p.pubMedId = row.pubMedId,
                    p.pubModUrl = row.pubModUrl,
                    p.pubMedUrl = row.pubMedUrl
            MERGE (p)-[:ASSOCIATION]-(ds) """

    htp_category_tags_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (ds:HTPDataset {primaryKey:row.datasetId})
            MERGE (ct:CategoryTag {primaryKey:row.tag})
            MERGE (ds)-[:CATEGORY_TAG]-(ct) """

    htp_secondaryIds_query_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (ds:HTPDataset {primaryKey: row.datasetId})
            MERGE (s:SecondaryId:Identifier {primaryKey:row.secondaryId})
                ON CREATE SET s.name = row.secondaryId
            MERGE (ds)-[aka:ALSO_KNOWN_AS]-(s) """

    htpdataset_xrefs_template = """
        USING PERIODIC COMMIT %s
        LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row
            MATCH (o:HTPDataset {primaryKey:row.datasetId}) """ + ETLHelper.get_cypher_xref_text()

    def __init__(self, config):
        super().__init__()
        self.data_type_config = config

    def _load_and_process_data(self):
        thread_pool = []

        for sub_type in self.data_type_config.get_sub_type_objects():
            process = multiprocessing.Process(target=self._process_sub_type,
                                              args=(sub_type, ))
            process.start()
            thread_pool.append(process)

        ETL.wait_for_threads(thread_pool)

    def _process_sub_type(self, sub_type):
        self.logger.info("Loading HTP metadata Data: %s",
                         sub_type.get_data_provider())
        filepath = sub_type.get_filepath()
        self.logger.info(filepath)
        data = JSONFile().get_data(filepath)
        self.logger.info("Finished Loading HTP metadata Data: %s",
                         sub_type.get_data_provider())

        if data is None:
            self.logger.warning("No Data found for %s skipping",
                                sub_type.get_data_provider())
            return

        # This order is the same as the lists yielded from the get_generators function.
        # A list of tuples.
commit_size = self.data_type_config.get_neo4j_commit_size() batch_size = self.data_type_config.get_generator_batch_size() # This needs to be in this format (template, param1, params2) others will be ignored query_list = [ [ HTPMetaDatasetETL.htp_dataset_query_template, commit_size, "htp_metadataset_" + sub_type.get_data_provider() + ".csv" ], [ HTPMetaDatasetETL.htp_category_tags_query_template, commit_size, "htp_metadataset_tags_" + sub_type.get_data_provider() + ".csv" ], [ HTPMetaDatasetETL.htp_dataset_pub_query_template, commit_size, "htp_metadataset_publications_" + sub_type.get_data_provider() + ".csv" ], [ HTPMetaDatasetETL.htpdataset_xrefs_template, commit_size, "htp_metadataset_xrefs_" + sub_type.get_data_provider() + ".csv" ], [ HTPMetaDatasetETL.htp_secondaryIds_query_template, commit_size, "htp_metadataset_secondaryIds_" + sub_type.get_data_provider() + ".csv" ], ] # Obtain the generator generators = self.get_generators(data, batch_size) query_and_file_list = self.process_query_params(query_list) CSVTransactor.save_file_static(generators, query_and_file_list) Neo4jTransactor.execute_query_batch(query_and_file_list) def get_generators(self, htp_dataset_data, batch_size): dataset_tags = [] data_providers = [] htp_datasets = [] publications = [] secondaryIds = [] cross_reference_list = [] counter = 0 date_produced = htp_dataset_data['metaData']['dateProduced'] data_provider_object = htp_dataset_data['metaData']['dataProvider'] data_provider_cross_ref = data_provider_object.get('crossReference') data_provider = data_provider_cross_ref.get('id') data_provider_pages = data_provider_cross_ref.get('pages') data_provider_cross_ref_set = [] for dataset_record in htp_dataset_data['data']: counter = counter + 1 dataset = dataset_record.get('datasetId') datasetId = dataset.get('primaryId') # spoke to RGD and they wish to remove these datasets as they overlap with SGD. 
if (datasetId == 'GEO:GSE18157' or datasetId == 'GEO:GSE33497') and data_provider == 'RGD': continue if 'secondaryIds' in dataset: for secId in dataset.get('secondaryIds'): secid = {"datasetId": datasetId, "secondaryId": secId} secondaryIds.append(secid) if self.test_object.using_test_data() is True: is_it_test_entry = self.test_object.check_for_test_id_entry( datasetId) if is_it_test_entry is False: counter = counter - 1 continue if 'crossReference' in dataset: crossRefO = dataset.get('crossReference') if crossRefO is not None: crossRefId = crossRefO.get('id') local_crossref_id = crossRefId.split(":")[1] prefix = crossRefId.split(":")[0] pages = crossRefO.get('pages') # some pages collection have 0 elements if pages is not None and len(pages) > 0: for page in pages: mod_global_cross_ref_url = self.etlh.rdh2.return_url_from_key_value( prefix, local_crossref_id, page) xref = ETLHelper.get_xref_dict( local_crossref_id, prefix, page, page, crossRefId, mod_global_cross_ref_url, crossRefId + page) xref['dataId'] = datasetId cross_reference_list.append(xref) category_tags = dataset_record.get('categoryTags') if category_tags is not None: for tag in category_tags: dataset_category_tag = {"datasetId": datasetId, "tag": tag} dataset_tags.append(dataset_category_tag) publicationNew = dataset_record.get('publications') if publicationNew is not None: for pub in publicationNew: pid = pub.get('publicationId') publication_mod_id = "" pub_med_id = "" pub_mod_url = "" pub_med_url = "" if pid is not None and pid.startswith('PMID:'): pub_med_id = pid local_pub_med_id = pub_med_id.split(":")[1] pub_med_url = self.etlh.get_no_page_complete_url( local_pub_med_id, 'PMID', pub_med_id) if 'crossReference' in pub: pub_xref = pub.get('crossReference') publication_mod_id = pub_xref.get('id') pub_mod_url = self.etlh.rdh2.return_url_from_identifier( publication_mod_id) elif pid is not None and not pid.startswith('PMID:'): publication_mod_id = pub.get('publicationId') pub_mod_url = self.etlh.rdh2.return_url_from_identifier( publication_mod_id) publication = { "datasetId": datasetId, "pubPrimaryKey": publication_mod_id + pub_med_id, "pubModId": publication_mod_id, "pubMedId": pub_med_id, "pubMedUrl": pub_med_url, "pubModUrl": pub_mod_url } publications.append(publication) htp_dataset = { "datasetId": datasetId, "dateAssigned": dataset_record.get('dateAssigned'), "title": dataset_record.get('title'), "summary": dataset_record.get('summary'), "numChannels": dataset_record.get('numChannels'), "subSeries": dataset_record.get('subSeries') } htp_datasets.append(htp_dataset) if counter == batch_size: yield [ htp_datasets, dataset_tags, publications, cross_reference_list, secondaryIds ] counter = 0 htp_datasets = [] dataset_tags = [] publications = [] cross_reference_list = [] secondaryIds = [] if counter > 0: yield [ htp_datasets, dataset_tags, publications, cross_reference_list, secondaryIds ]
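# The counter/batch_size bookkeeping above (increment per record, yield and reset at
# batch_size, flush any remainder at the end) is the batching idiom shared by most
# get_generators implementations in this module. A minimal self-contained sketch of
# the same pattern, with illustrative names only:
def _batched_sketch(records, batch_size):
    """Yield lists of at most batch_size records (illustrative sketch only)."""
    batch = []
    for record in records:
        batch.append(record)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        # Flush the final, possibly partial, batch.
        yield batch
# e.g. list(_batched_sketch(range(5), 2)) == [[0, 1], [2, 3], [4]]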
class GeoXrefETL(ETL): """GEO XREF ETL""" logger = logging.getLogger(__name__) geo_xref_query_template = """ USING PERIODIC COMMIT %s LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row MATCH (o:Gene) where o.primaryKey = row.genePrimaryKey """ + ETLHelper.get_cypher_xref_text() gene_crossref_query_template = """ MATCH (g:Gene)-[crr:CROSS_REFERENCE]-(cr:CrossReference) WHERE cr.globalCrossRefId IN {parameter} RETURN g.primaryKey, g.modLocalId, cr.name, cr.globalCrossRefId""" def __init__(self, config): super().__init__() self.data_type_config = config def _load_and_process_data(self): for sub_type in self.data_type_config.get_sub_type_objects(): species_encoded = urllib.parse.quote_plus(\ ETLHelper.species_lookup_by_data_provider(sub_type.get_data_provider())) commit_size = self.data_type_config.get_neo4j_commit_size() #batch_size = self.data_type_config.get_generator_batch_size() batch_size = 100000 generators = self.get_generators(sub_type, batch_size, species_encoded) query_template_list = [ [ self.geo_xref_query_template, commit_size, "geo_xref_data_" + sub_type.get_data_provider() + ".csv" ], ] query_and_file_list = self.process_query_params( query_template_list) CSVTransactor.save_file_static(generators, query_and_file_list) Neo4jTransactor.execute_query_batch(query_and_file_list) def get_generators(self, sub_type, batch_size, species_encoded): """Get Generators""" entrez_ids = [] geo_data_file_contents = Path(sub_type.get_filepath()).read_text() geo_data = json.loads( json.dumps(xmltodict.parse(geo_data_file_contents))) for efetch_value in dict(geo_data.items()).values(): # IdList is a value returned from efetch XML spec, # within IdList, there is another map with "Id" # as the key and the entrez local ids a list value. for sub_map_key, sub_map_value in efetch_value.items(): if sub_map_key == 'IdList': for id_list in dict(sub_map_value.items()).values(): for entrez_id in id_list: self.logger.debug("here is the entrez id: %s", entrez_id) entrez_ids.append("NCBI_Gene:" + entrez_id) geo_data_list = [] return_set = Neo4jHelper.run_single_parameter_query( self.gene_crossref_query_template, entrez_ids) for record in return_set: gene_primary_key = record["g.primaryKey"] mod_local_id = record["g.modLocalId"] global_cross_ref_id = record["cr.globalCrossRefId"] geo_xref = ETLHelper.get_xref_dict(global_cross_ref_id.split(":")[1], "NCBI_Gene", "gene/other_expression", "gene/other_expression", "GEO", "https://www.ncbi.nlm.nih.gov/sites/entrez?" \ + "Db=geoprofiles"\ + "&DbFrom=gene"\ + "&Cmd=Link"\ + "&LinkName=gene_geoprofiles"\ + "&LinkReadableName=GEO%20Profiles"\ + "&IdsFromResult="\ + global_cross_ref_id.split(":")[1], global_cross_ref_id+"gene/other_expression") geo_xref["genePrimaryKey"] = gene_primary_key geo_xref["modLocalId"] = mod_local_id geo_data_list.append(geo_xref) yield [geo_data_list]
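# GeoXrefETL.get_generators depends on the dict shape xmltodict gives the efetch
# response: an IdList element whose Id children parse to a list of strings, or to a
# bare string when there is only one hit. A sketch against a made-up XML snippet
# (xmltodict is already imported by this module):
def _entrez_ids_sketch():
    """Illustrate the IdList shape walked above (sample XML is made up)."""
    sample = "<eSearchResult><IdList><Id>101</Id><Id>102</Id></IdList></eSearchResult>"
    id_list = xmltodict.parse(sample)["eSearchResult"]["IdList"]["Id"]
    if isinstance(id_list, str):
        # A single <Id> collapses to a plain string, so normalise to a list.
        id_list = [id_list]
    return ["NCBI_Gene:" + entrez_id for entrez_id in id_list]
# _entrez_ids_sketch() == ['NCBI_Gene:101', 'NCBI_Gene:102']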
def get_generators(self, expression_file, batch_size): """Get Generators""" self.logger.debug("made it to the expression generator") counter = 0 cross_references = [] bio_entities = [] bio_join_entities = [] bio_entity_gene_aos = [] pubs = [] ao_expressions = [] cc_expressions = [] ao_qualifiers = [] ao_substructures = [] ao_ss_qualifiers = [] cc_qualifiers = [] ao_cc_expressions = [] stage_list = [] stage_uberon_data = [] uberon_ao_data = [] uberon_ao_other_data = [] uberon_stage_other_data = [] self.logger.debug("streaming json data from %s ...", expression_file) with codecs.open(expression_file, 'r', 'utf-8') as file_handle: for xpat in ijson.items(file_handle, 'data.item'): counter = counter + 1 pub_med_url = None pub_mod_url = None pub_med_id = "" publication_mod_id = "" stage_term_id = "" stage_name = "" stage_uberon_term_id = "" gene_id = xpat.get('geneId') if self.test_object.using_test_data() is True: is_it_test_entry = self.test_object.check_for_test_id_entry( gene_id) if is_it_test_entry is False: counter = counter - 1 continue evidence = xpat.get('evidence') if 'publicationId' in evidence: if evidence.get('publicationId').startswith('PMID:'): pub_med_id = evidence.get('publicationId') local_pub_med_id = pub_med_id.split(":")[1] pub_med_prefix = pub_med_id.split(":")[0] pub_med_url = ETLHelper.get_no_page_complete_url( local_pub_med_id, self.xref_url_map, pub_med_prefix, gene_id) if pub_med_id is None: pub_med_id = "" if 'crossReference' in evidence: pub_xref = evidence.get('crossReference') publication_mod_id = pub_xref.get('id') if publication_mod_id is not None: pub_mod_url = ETLHelper.get_expression_pub_annotation_xref( \ publication_mod_id) else: publication_mod_id = evidence['publicationId'] if publication_mod_id is not None: pub_mod_url = ETLHelper.get_expression_pub_annotation_xref(\ publication_mod_id) if publication_mod_id is None: publication_mod_id = "" assay = xpat.get('assay') if 'whereExpressed' in xpat: where_expressed = xpat.get('whereExpressed') cellular_component_qualifier_term_id = \ where_expressed.get('cellularComponentQualifierTermId') cellular_component_term_id = where_expressed.get( 'cellularComponentTermId') anatomical_structure_term_id = where_expressed.get( 'anatomicalStructureTermId') anatomical_structure_qualifier_term_id = where_expressed.get( 'anatomicalStructureQualifierTermId') anatomical_sub_structure_term_id = \ where_expressed.get('anatomicalSubStructureTermId') anatomical_sub_structure_qualifier_term_id = where_expressed.get( 'anatomicalSubStructureQualifierTermId') where_expressed_statement = where_expressed.get( 'whereExpressedStatement') when_expressed_stage = xpat.get('whenExpressed') if 'stageTermId' in when_expressed_stage: stage_term_id = when_expressed_stage.get('stageTermId') if 'stageName' in when_expressed_stage: stage_name = when_expressed_stage.get('stageName') # TODO: making unique BioEntityGeneExpressionJoin nodes # and ExpressionBioEntity nodes is tedious. # TODO: Lets get the DQMs to fix this. 
expression_unique_key = gene_id + assay + stage_name expression_entity_unique_key = "" if anatomical_structure_term_id is not None: expression_unique_key += anatomical_structure_term_id expression_entity_unique_key = anatomical_structure_term_id if anatomical_structure_qualifier_term_id is not None: expression_unique_key += anatomical_structure_qualifier_term_id expression_entity_unique_key += anatomical_structure_qualifier_term_id if cellular_component_term_id is not None: expression_unique_key += cellular_component_term_id expression_entity_unique_key += cellular_component_term_id if cellular_component_qualifier_term_id is not None: expression_unique_key += cellular_component_qualifier_term_id expression_entity_unique_key += cellular_component_qualifier_term_id if anatomical_sub_structure_term_id is not None: expression_unique_key += anatomical_sub_structure_term_id if anatomical_sub_structure_qualifier_term_id is not None: expression_unique_key += anatomical_sub_structure_qualifier_term_id expression_entity_unique_key \ += anatomical_sub_structure_qualifier_term_id expression_entity_unique_key += where_expressed_statement expression_unique_key += where_expressed_statement if where_expressed.get( 'anatomicalStructureUberonSlimTermIds' ) is not None: for uberon_structure_term_object in \ where_expressed.get('anatomicalStructureUberonSlimTermIds'): structure_uberon_term_id = \ uberon_structure_term_object.get('uberonTerm') if structure_uberon_term_id is not None \ and structure_uberon_term_id != 'Other': structure_uberon_term = { "ebe_uuid": expression_entity_unique_key, "aoUberonId": structure_uberon_term_id } uberon_ao_data.append(structure_uberon_term) elif structure_uberon_term_id is not None \ and structure_uberon_term_id == 'Other': other_structure_uberon_term = { "ebe_uuid": expression_entity_unique_key } uberon_ao_other_data.append( other_structure_uberon_term) if where_expressed.get( 'anatomicalSubStructureUberonSlimTermIds' ) is not None: for uberon_sub_structure_term_object in \ where_expressed.get('anatomicalSubStructureUberonSlimTermIds'): sub_structure_uberon_term_id = \ uberon_sub_structure_term_object.get('uberonTerm') if sub_structure_uberon_term_id is not None \ and sub_structure_uberon_term_id != 'Other': sub_structure_uberon_term = { "ebe_uuid": expression_entity_unique_key, "aoUberonId": sub_structure_uberon_term_id } uberon_ao_data.append( sub_structure_uberon_term) elif sub_structure_uberon_term_id is not None \ and sub_structure_uberon_term_id == 'Other': other_structure_uberon_term = { "ebe_uuid": expression_entity_unique_key } uberon_ao_other_data.append( other_structure_uberon_term) if cellular_component_term_id is None: cellular_component_term_id = "" if when_expressed_stage.get( 'stageUberonSlimTerm') is not None: stage_uberon_term_object = when_expressed_stage.get( 'stageUberonSlimTerm') stage_uberon_term_id = stage_uberon_term_object.get( "uberonTerm") if stage_uberon_term_id is not None \ and stage_uberon_term_id != "post embryonic, pre-adult": stage_uberon = { "uberonStageId": stage_uberon_term_id, "ei_uuid": expression_unique_key } stage_uberon_data.append(stage_uberon) if stage_uberon_term_id == "post embryonic, pre-adult": stage_uberon_other = { "ei_uuid": expression_unique_key } uberon_stage_other_data.append(stage_uberon_other) if stage_term_id is None or stage_name == 'N/A': stage_term_id = "" stage_name = "" stage_uberon_term_id = "" if stage_name is not None: stage = { "stageTermId": stage_term_id, "stageName": stage_name, "ei_uuid": 
expression_unique_key } stage_list.append(stage) else: stage_uberon_term_id = "" if 'crossReference' in xpat: cross_ref = xpat.get('crossReference') cross_ref_id = cross_ref.get('id') local_cross_ref_id = cross_ref_id.split(":")[1] prefix = cross_ref.get('id').split(":")[0] pages = cross_ref.get('pages') # some pages collection have 0 elements if pages is not None and len(pages) > 0: for page in pages: if page == 'gene/expression/annotation/detail': mod_global_cross_ref_id = ETLHelper.get_page_complete_url(\ local_cross_ref_id, self.xref_url_map, prefix, page) xref = ETLHelper.get_xref_dict( local_cross_ref_id, prefix, page, page, cross_ref_id, mod_global_cross_ref_id, cross_ref_id + page) xref['ei_uuid'] = expression_unique_key cross_references.append(xref) bio_entity = { "ebe_uuid": expression_entity_unique_key, "whereExpressedStatement": where_expressed_statement } bio_entities.append(bio_entity) bio_join_entity = { "ei_uuid": expression_unique_key, "assay": assay } bio_join_entities.append(bio_join_entity) bio_entity_gene_ao = { "geneId": gene_id, "ebe_uuid": expression_entity_unique_key, "anatomicalStructureTermId": anatomical_structure_term_id, "ei_uuid": expression_unique_key } bio_entity_gene_aos.append(bio_entity_gene_ao) pub = { "ei_uuid": expression_unique_key, "pubPrimaryKey": pub_med_id + publication_mod_id, "pubMedId": pub_med_id, "pubMedUrl": pub_med_url, "pubModId": publication_mod_id, "pubModUrl": pub_mod_url } pubs.append(pub) ao_expression = { "geneId": gene_id, "whenExpressedStage": when_expressed_stage, "pubMedId": pub_med_id, "pubMedUrl": pub_med_url, "pubModId": publication_mod_id, "pubModUrl": pub_mod_url, "pubPrimaryKey": pub_med_id + publication_mod_id, "uuid": str(uuid.uuid4()), "assay": assay, "anatomicalStructureTermId": anatomical_structure_term_id, "whereExpressedStatement": where_expressed_statement, "ei_uuid": expression_unique_key, "ebe_uuid": expression_entity_unique_key } ao_expressions.append(ao_expression) if cellular_component_qualifier_term_id is not None: cc_qualifier = { "ebe_uuid": expression_entity_unique_key, "cellularComponentQualifierTermId": cellular_component_qualifier_term_id } cc_qualifiers.append(cc_qualifier) if anatomical_structure_term_id is None: anatomical_structure_term_id = "" cc_expression = { "geneId": gene_id, "whenExpressedStage": when_expressed_stage, "pubMedId": pub_med_id, "pubMedUrl": pub_med_url, "pubModId": publication_mod_id, "pubModUrl": pub_mod_url, "pubPrimaryKey": pub_med_id + publication_mod_id, "assay": assay, "whereExpressedStatement": where_expressed_statement, "cellularComponentTermId": cellular_component_term_id, "ei_uuid": expression_unique_key, "ebe_uuid": expression_entity_unique_key } cc_expressions.append(cc_expression) if anatomical_structure_qualifier_term_id is not None: ao_qualifier = { "ebe_uuid": expression_entity_unique_key, "anatomicalStructureQualifierTermId": anatomical_structure_qualifier_term_id } ao_qualifiers.append(ao_qualifier) if anatomical_sub_structure_term_id is not None: ao_substructure = { "ebe_uuid": expression_entity_unique_key, "anatomicalSubStructureTermId": anatomical_sub_structure_term_id } ao_substructures.append(ao_substructure) if anatomical_sub_structure_qualifier_term_id is not None: ao_ss_qualifier = { "ebe_uuid": expression_entity_unique_key, "anatomicalSubStructureQualifierTermId": anatomical_sub_structure_qualifier_term_id } ao_ss_qualifiers.append(ao_ss_qualifier) if where_expressed_statement is None: where_expressed_statement = "" if anatomical_structure_term_id is 
not None \ and anatomical_structure_term_id != "" \ and cellular_component_term_id is not None \ and cellular_component_term_id != "": ao_cc_expression = { "geneId": gene_id, "whenExpressedStage": when_expressed_stage, "pubMedId": pub_med_id, "pubMedUrl": pub_med_url, "pubModId": publication_mod_id, "pubModUrl": pub_mod_url, "pubPrimaryKey": pub_med_id + publication_mod_id, "uuid": str(uuid.uuid4()), "stageTermId": stage_term_id, "stageName": stage_name, "stageUberonTermId": stage_uberon_term_id, "assay": assay, "cellularComponentTermId": cellular_component_term_id, "anatomicalStructureTermId": anatomical_structure_term_id, "whereExpressedStatement": where_expressed_statement, "ei_uuid": expression_unique_key, "ebe_uuid": expression_entity_unique_key } ao_cc_expressions.append(ao_cc_expression) if counter == batch_size: yield [ bio_entities, bio_entity_gene_aos, bio_join_entities, ao_expressions, cc_expressions, ao_cc_expressions, ao_qualifiers, ao_substructures, ao_ss_qualifiers, cc_qualifiers, stage_list, stage_uberon_data, uberon_ao_data, uberon_ao_other_data, uberon_stage_other_data, cross_references, pubs ] bio_entities = [] bio_join_entities = [] ao_expressions = [] cc_expressions = [] ao_qualifiers = [] ao_substructures = [] ao_ss_qualifiers = [] cc_qualifiers = [] ao_cc_expressions = [] stage_list = [] uberon_stage_other_data = [] stage_uberon_data = [] uberon_ao_other_data = [] uberon_ao_data = [] cross_references = [] bio_entity_gene_aos = [] pubs = [] counter = 0 if counter > 0: yield [ bio_entities, bio_entity_gene_aos, bio_join_entities, ao_expressions, cc_expressions, ao_cc_expressions, ao_qualifiers, ao_substructures, ao_ss_qualifiers, cc_qualifiers, stage_list, stage_uberon_data, uberon_ao_data, uberon_ao_other_data, uberon_stage_other_data, cross_references, pubs ]
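# The generator above derives its two uniqueness keys by concatenating whichever
# identifiers are present: ei_uuid keys a BioEntityGeneExpressionJoin (gene + assay +
# stage + entity terms + statement) and ebe_uuid an ExpressionBioEntity. A condensed
# approximation of that keying scheme (it glosses over exactly which optional term
# feeds which key; names are illustrative):
def _expression_keys_sketch(gene_id, assay, stage_name, where_expressed):
    """Return (ei_uuid, ebe_uuid)-style composite keys (illustrative sketch only)."""
    term_fields = [
        "anatomicalStructureTermId",
        "anatomicalStructureQualifierTermId",
        "cellularComponentTermId",
        "cellularComponentQualifierTermId",
        "anatomicalSubStructureTermId",
        "anatomicalSubStructureQualifierTermId",
    ]
    terms = "".join(where_expressed.get(field) or "" for field in term_fields)
    statement = where_expressed.get("whereExpressedStatement") or ""
    return gene_id + assay + stage_name + terms + statement, terms + statement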
class DOETL(ETL): """DO ETL.""" logger = logging.getLogger(__name__) # Query templates which take params and will be processed later do_query_template = """ USING PERIODIC COMMIT %s LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row //Create the DOTerm node and set properties. primaryKey is required. MERGE (doterm:DOTerm:Ontology {primaryKey:row.oid}) SET doterm.name = row.name, doterm.nameKey = row.name_key, doterm.definition = row.definition, doterm.defLinks = apoc.convert.fromJsonList(row.defLinksProcessed), doterm.isObsolete = row.is_obsolete, doterm.subset = row.subset, doterm.doDisplayId = row.oid, doterm.doUrl = row.oUrl, doterm.doPrefix = "DOID", doterm.doId = row.oid, doterm.rgdLink = row.rgd_link, doterm.ratOnlyRgdLink = row.rat_only_rgd_link, doterm.humanOnlyRgdLink = row.human_only_rgd_link, doterm.mgiLink = row.mgi_link, doterm.zfinLink = row.zfin_link, doterm.flybaseLink = row.flybase_link, doterm.wormbaseLink = row.wormbase_link, doterm.sgdLink = row.sgd_link MERGE (doterm)-[ggcg:IS_A_PART_OF_CLOSURE]->(doterm)""" doterm_synonyms_query_template = """ USING PERIODIC COMMIT %s LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row MATCH (d:DOTerm {primaryKey:row.primary_id}) MERGE (syn:Synonym:Identifier {primaryKey:row.synonym}) SET syn.name = row.synonym MERGE (d)-[aka2:ALSO_KNOWN_AS]->(syn) """ doterm_isas_query_template = """ USING PERIODIC COMMIT %s LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row MATCH (d1:DOTerm:Ontology {primaryKey:row.primary_id}) MATCH (d2:DOTerm:Ontology {primaryKey:row.primary_id2}) MERGE (d1)-[aka:IS_A]->(d2) """ xrefs_query_template = """ USING PERIODIC COMMIT %s LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row MATCH (o:DOTerm {primaryKey:row.oid}) """ + ETLHelper.get_cypher_xref_text( ) doterm_alt_ids_query_template = """ USING PERIODIC COMMIT %s LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row MATCH (d:DOTerm {primaryKey:row.primary_id}) MERGE (sec:SecondaryId:Identifier {primaryKey:row.secondary_id}) MERGE (d)-[aka2:ALSO_KNOWN_AS]->(sec) """ def __init__(self, config): """Initialise object.""" super().__init__() self.data_type_config = config def _load_and_process_data(self): filepath = self.data_type_config.get_single_filepath() commit_size = self.data_type_config.get_neo4j_commit_size() batch_size = self.data_type_config.get_generator_batch_size() generators = self.get_generators(filepath, batch_size) query_template_list = [ [self.do_query_template, commit_size, "do_term_data.csv"], [self.doterm_isas_query_template, commit_size, "do_isas_data.csv"], [ self.doterm_synonyms_query_template, commit_size, "do_synonyms_data.csv" ], [self.xrefs_query_template, commit_size, "do_xrefs_data.csv"], [ self.doterm_alt_ids_query_template, commit_size, "do_alt_ids_data.csv" ] ] query_and_file_list = self.process_query_params(query_template_list) CSVTransactor.save_file_static(generators, query_and_file_list) Neo4jTransactor.execute_query_batch(query_and_file_list) self.error_messages("DO-?: ") def get_generators(self, filepath, batch_size): # noqa TODO:Needs splitting up really """Get Generators.""" ont = OntologyFactory().create(filepath) parsed_line = ont.graph.copy().node do_term_list = [] do_isas_list = [] do_synonyms_list = [] do_alt_ids_list = [] xrefs = [] counter = 0 # Convert parsed obo term into a schema-friendly AGR dictionary. for key, line in parsed_line.items(): counter = counter + 1 node = ont.graph.node[key] if len(node) == 0: continue # Switching id to curie form and saving URI in "uri" # - might wildly break things later on??? 
node["uri"] = node["id"] node["id"] = key syns = [] def_links_unprocessed = [] def_links_processed = [] subset = [] definition = "" is_obsolete = "false" ident = key if "meta" in node: if "synonyms" in node["meta"]: syns = [s["val"] for s in node["meta"]["synonyms"]] for synonym in syns: do_synonym = {"primary_id": key, "synonym": synonym} do_synonyms_list.append(do_synonym) if "basicPropertyValues" in node["meta"]: alt_ids = [ s["val"] for s in node["meta"]["basicPropertyValues"] ] for alt_id in alt_ids: if "DOID:" in alt_id: secondary_id = { "primary_id": key, "secondary_id": alt_id } do_alt_ids_list.append(secondary_id) if "xrefs" in node["meta"]: o_xrefs = node["meta"].get('xrefs') self.ortho_xrefs(o_xrefs, ident, xrefs) if node["meta"].get('is_obsolete'): is_obsolete = "true" elif node["meta"].get('deprecated'): is_obsolete = "true" if "definition" in node["meta"]: definition = node["meta"]["definition"]["val"] def_links_unprocessed = node["meta"]["definition"]["xrefs"] if "subsets" in node["meta"]: new_subset = node['meta'].get('subsets') if isinstance(new_subset, (list, tuple)): subset = new_subset else: if new_subset is not None: subset.append(new_subset) if len(subset) > 1: converted_subsets = [] for subset_str in subset: if "#" in subset_str: subset_str = subset_str.split("#")[-1] converted_subsets.append(subset_str) subset = converted_subsets all_parents = ont.parents(key) all_parents.append(key) # Improves performance when traversing relations all_parents_subont = ont.subontology(all_parents) isas_without_names = all_parents_subont.parents( key, relations=['subClassOf']) for item in isas_without_names: dictionary = {"primary_id": key, "primary_id2": item} do_isas_list.append(dictionary) def_links_processed = [] def_links = "" if definition is None: definition = "" else: # Remove new lines that cause this to split across two lines in the file # definition = definition.replace('\n', ' ') # Remove any extra double space that might have been introduced in the last replace # definition = definition.replace('  ', ' ') if definition is not None and "\"" in definition: split_definition = re.split(r'(?<!\\)"', definition) if len(split_definition) > 1: if len(split_definition) > 2 and "[" in split_definition[2].strip(): def_links = split_definition[2].strip() def_links = def_links.rstrip("]").replace("[", "") def_links_unprocessed.append(def_links) for def_link in def_links_unprocessed: def_link = def_link.replace("url:www", "http://www") def_link = def_link.replace("url:", "") def_link = def_link.replace("URL:", "") def_link = def_link.replace("\\:", ":") def_link = def_link.replace('\\', '') if "," in def_link: def_link = def_link.split(",") for link in def_link: if link.strip().startswith("http"): def_links_processed.append(link) else: if def_link.strip().startswith("http"): def_links_processed.append(def_link) # TODO: make this a generic section based on the resourceDescriptor.yaml file. # need to have MODs add disease pages to their yaml stanzas # NU: alt_ids = node.get('alt_id') # if alt_ids: # if not isinstance(alt_ids, (list, tuple)): # alt_ids = [alt_ids] # else: # alt_ids = [] # TODO: Need to add urls to resource Descriptors for SGD and MGI. # NOTE: MGI had one but has 'MGI:' at the end of the url not required here.
dict_to_append = { 'oid': node['id'], 'name': node.get('label'), 'name_key': node.get('label'), 'definition': definition, 'defLinksProcessed': def_links_processed, 'is_obsolete': is_obsolete, 'subset': subset, 'oUrl': self.etlh.rdh2.return_url_from_key_value('DOID', node['id']), 'rgd_link': self.etlh.rdh2.return_url_from_key_value( 'RGD', node['id'], 'disease/all'), 'rat_only_rgd_link': self.etlh.rdh2.return_url_from_key_value( 'RGD', node['id'], 'disease/rat'), 'human_only_rgd_link': self.etlh.rdh2.return_url_from_key_value( 'RGD', node['id'], 'disease/human'), 'mgi_link': 'http://www.informatics.jax.org/disease/' + node['id'], 'zfin_link': self.etlh.rdh2.return_url_from_key_value( 'ZFIN', node['id'], 'disease'), 'flybase_link': self.etlh.rdh2.return_url_from_key_value( 'FB', node['id'], 'disease'), 'wormbase_link': self.etlh.rdh2.return_url_from_key_value( 'WB', node['id'], 'disease'), 'sgd_link': 'https://yeastgenome.org/disease/' + node['id'] } do_term_list.append(dict_to_append) if counter == batch_size: yield [ do_term_list, do_isas_list, do_synonyms_list, xrefs, do_alt_ids_list ] do_term_list = [] do_isas_list = [] do_synonyms_list = [] do_alt_ids_list = [] xrefs = [] counter = 0 if counter > 0: yield [ do_term_list, do_isas_list, do_synonyms_list, xrefs, do_alt_ids_list ]
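# DOETL.get_generators pulls definition xrefs out of quoted OBO-style definition
# strings and normalises their 'url:' prefixes, as above. A worked example of that
# cleanup on a made-up definition value (re is already imported by this module):
def _def_links_sketch():
    """Illustrate the definition-xref cleanup (sample value is made up)."""
    definition = '"A hypothetical disease." [url:www.example.org, url:http\\://example.net]'
    split_definition = re.split(r'(?<!\\)"', definition)  # split on unescaped quotes
    raw_links = split_definition[2].strip().rstrip("]").replace("[", "")
    links = []
    for link in raw_links.split(","):
        link = link.strip().replace("url:www", "http://www").replace("url:", "")
        link = link.replace("\\:", ":")
        if link.startswith("http"):
            links.append(link)
    return links
# _def_links_sketch() == ['http://www.example.org', 'http://example.net']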
class TestClass(): """Test Class.""" etlh = ETLHelper() def test_get_species_name_from_various_keys(self): """Test getting valid species names from DB.""" lookups = {'RGD': 'Rattus norvegicus', 'NCBITaxon:10116': 'Rattus norvegicus', '10116': 'Rattus norvegicus', 'Cel': 'Caenorhabditis elegans', 'worm': 'Caenorhabditis elegans', 'Dme': 'Drosophila melanogaster', 'bad': None} # Bad lookup returns None for key in lookups.keys(): name = self.etlh.species_name_lookup(key) assert name == lookups[key] def test_get_species_order(self): """Test getting order.""" lookups = {'RGD': 20, 'NCBITaxon:10116': 20, '10116': 20, 'Cel': 60, 'worm': 60, 'Dme': 50, 'bad': None} # Bad lookup returns None for key in lookups.keys(): name = self.etlh.get_species_order(key) assert name == lookups[key] def test_data_provider_lookup(self): """Test provider lookup.""" lookups = {'RGD': 'RGD', 'NCBITaxon:10116': 'RGD', 'worm': 'WB', 'Dme': 'FB', 'Saccharomyces cerevisiae': 'SGD', 'Homo sapiens': 'RGD', # Weird one 'bad': None} # Bad lookup returns None for key in lookups.keys(): name = self.etlh.data_provider_lookup(key) assert name == lookups[key] def test_url_lookup_key_value(self): """Test url lookups.""" # reset critical error self.etlh.rdh2.missing_keys = {} self.etlh.rdh2.missing_pages = {} self.etlh.rdh2.bad_pages = {} lookups = [{'key': 'RGD', 'value': '123456', 'page': None, 'result': 'https://rgd.mcw.edu/rgdweb/elasticResults.html?term=RGD:123456'}, {'key': 'RGD', 'value': '234567', 'page': 'allele', 'result': 'https://rgd.mcw.edu/rgdweb/report/gene/main.html?id=RGD:234567'}, {'key': 'FB', 'value': 'something', 'page': None, 'result': 'https://flybase.org/reports/something.html'}, {'key': 'FB', 'value': 'FBsomething', 'page': 'badpage', 'result': None}, {'key': 'BADKEY', 'value': 'something', 'page': None, 'result': None}] for item in lookups: url = self.etlh.rdh2.return_url_from_key_value(item['key'], item['value'], alt_page=item['page']) assert url == item['result'] for item_name in self.etlh.rdh2.missing_keys.keys(): assert 1 == self.etlh.rdh2.missing_keys[item_name] assert 'BADKEY-None' in self.etlh.rdh2.missing_keys.keys() assert 'BADKEY' in self.etlh.rdh2.missing_keys.keys() for item_name in self.etlh.rdh2.missing_pages.keys(): assert 1 == self.etlh.rdh2.missing_pages[item_name] assert item_name == 'FB-badpage' def test_url_lookup(self): """Get url tests for ETLHelper.""" self.etlh.rdh2.missing_keys = {} self.etlh.rdh2.missing_pages = {} self.etlh.rdh2.bad_pages = {} self.etlh.rdh2.bad_regex = {} lookups = [{'local_id': 'C5604', 'global_id': 'NCI:C5604', 'result': 'https://ncit.nci.nih.gov/ncitbrowser/ConceptReport.jsp?dictionary=NCI_Thesaurus&code=C5604'}, {'local_id': 'badregexdoesnotmatch', 'global_id': 'MESH:badregexdoesnotmatch', 'result': 'https://www.ncbi.nlm.nih.gov/mesh/badregexdoesnotmatch'}, {'local_id': 'Cdiff', 'global_id': 'OMIM:1111', 'result': 'https://www.omim.org/entry/1111'}] for item in lookups: url = self.etlh.get_complete_url_ont(item['local_id'], item['global_id']) assert url == item['result'] assert not self.etlh.rdh2.missing_keys, "Should be no missing keys" assert not self.etlh.rdh2.missing_pages, "Should be no missing pages" for item_name in self.etlh.rdh2.bad_pages.keys(): # Due to local_id and global_id not matching we will get a bad page. assert item_name == "OMIM-None" # MESH fails the regex, so make sure an error was logged # (we still get a url, but the error is recorded).
for item_name in self.etlh.rdh2.bad_regex.keys(): assert 1 == self.etlh.rdh2.bad_regex[item_name] assert item_name == 'MESH'
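# The table-driven loops above could also be written with pytest's parametrize, which
# reports each lookup as its own test case. A sketch over two mappings the class
# already asserts (pytest is assumed to be the runner; this adds no new coverage):
import pytest

@pytest.mark.parametrize("key,expected", [
    ("RGD", "Rattus norvegicus"),
    ("worm", "Caenorhabditis elegans"),
    ("bad", None),  # a bad lookup returns None
])
def test_species_name_lookup_parametrized(key, expected):
    """Parametrized variant of test_get_species_name_from_various_keys (sketch)."""
    assert ETLHelper().species_name_lookup(key) == expected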
class VariationETL(ETL): """Variation ETL.""" logger = logging.getLogger(__name__) # Query templates which take params and will be processed later variation_query_template = """ USING PERIODIC COMMIT %s LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row MATCH (a:Allele {primaryKey:row.alleleId}) MATCH (g:Gene)-[:IS_ALLELE_OF]-(a) //Create the variant node and set properties. primaryKey is required. MERGE (o:Variant {primaryKey:row.hgvs_nomenclature}) ON CREATE SET o.name = row.variantHGVSSynonym, o.hgvsNomenclature = row.hgvs_nomenclature, o.genomicReferenceSequence = row.genomicReferenceSequence, o.paddingLeft = row.paddingLeft, o.paddingRight = row.paddingRight, o.genomicVariantSequence = row.genomicVariantSequence, o.dateProduced = row.dateProduced, o.release = row.release, o.dataProviders = row.dataProviders, o.dataProvider = row.dataProvider MERGE (s:Synonym:Identifier {primaryKey:row.hgvs_nomenclature}) SET s.name = row.hgvs_nomenclature MERGE (o)-[aka2:ALSO_KNOWN_AS]->(s) MERGE (o)-[:VARIATION]->(a) MERGE (g)-[:COMPUTED_GENE]->(o) """ so_terms_query_template = """ USING PERIODIC COMMIT %s LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row MATCH (o:Variant {primaryKey:row.variantId}) MATCH (s:SOTerm {primaryKey:row.soTermId}) CREATE (o)-[:VARIATION_TYPE]->(s)""" genomic_locations_query_template = """ USING PERIODIC COMMIT %s LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row MATCH (o:Variant {primaryKey:row.variantId}) MATCH (chrm:Chromosome {primaryKey:row.chromosome}) MERGE (a:Assembly {primaryKey:row.assembly}) ON CREATE SET a.dataProvider = row.dataProvider CREATE (o)-[gchrm:LOCATED_ON]->(chrm) CREATE (gchrmn:GenomicLocation {primaryKey:row.uuid}) SET gchrmn.start = apoc.number.parseInt(row.start), gchrmn.end = apoc.number.parseInt(row.end), gchrmn.assembly = row.assembly, gchrmn.strand = row.strand, gchrmn.chromosome = row.chromosome CREATE (o)-[of:ASSOCIATION]->(gchrmn) CREATE (gchrmn)-[ofc:ASSOCIATION]->(chrm) CREATE (gchrmn)-[ao:ASSOCIATION]->(a) """ xrefs_query_template = """ USING PERIODIC COMMIT %s LOAD CSV WITH HEADERS FROM \'file:///%s\' AS row MATCH (o:Variant {primaryKey:row.dataId}) """ + ETLHelper.get_cypher_xref_text() def __init__(self, config): """Initialise object.""" super().__init__() self.data_type_config = config def _load_and_process_data(self): thread_pool = [] for sub_type in self.data_type_config.get_sub_type_objects(): process = multiprocessing.Process(target=self._process_sub_type, args=(sub_type, )) process.start() thread_pool.append(process) ETL.wait_for_threads(thread_pool) def _process_sub_type(self, sub_type): self.logger.info("Loading Variation Data: %s", sub_type.get_data_provider()) filepath = sub_type.get_filepath() data = JSONFile().get_data(filepath) self.logger.info("Finished Loading Variation Data: %s", sub_type.get_data_provider()) if data is None: self.logger.warning("No Data found for %s skipping", sub_type.get_data_provider()) return # This order is the same as the lists yielded from the get_generators function. # A list of tuples. 
commit_size = self.data_type_config.get_neo4j_commit_size() batch_size = self.data_type_config.get_generator_batch_size() # This needs to be in this format (template, param1, params2) others will be ignored query_template_list = [ [ self.variation_query_template, commit_size, "variation_data_" + sub_type.get_data_provider() + ".csv" ], [ self.genomic_locations_query_template, commit_size, "variant_genomiclocations_" + sub_type.get_data_provider() + ".csv" ], [ self.so_terms_query_template, commit_size, "variant_so_terms_" + sub_type.get_data_provider() + ".csv" ], [ self.xrefs_query_template, commit_size, "variant_xrefs_" + sub_type.get_data_provider() + ".csv" ] ] generators = self.get_generators(data, batch_size) query_and_file_list = self.process_query_params(query_template_list) CSVTransactor.save_file_static(generators, query_and_file_list) Neo4jTransactor.execute_query_batch(query_and_file_list) self.error_messages("Var-{}: ".format(sub_type.get_data_provider())) def get_hgvs_nomenclature(self, refseq_id, variant_type, start_position, end_position, reference_sequence, variant_sequence, assembly, chromosome): """Get HGVS nomenclature.""" if start_position is None: start_position_str = "" else: start_position_str = str(start_position) if end_position is None: end_position_str = "" else: end_position_str = str(end_position) if variant_sequence is None: variant_sequence_str = "" else: variant_sequence_str = variant_sequence if reference_sequence is None: reference_sequence_str = "" else: reference_sequence_str = reference_sequence hgvs_nomenclature = refseq_id.split( ":")[1] + ':g.' + start_position_str hgvs_synonym = '(' + assembly + ')' + chromosome + ':' + start_position_str if variant_type in ['SO:1000002', 'SO:1000008']: # point mutation/substitution hgvs_nomenclature += reference_sequence_str + ">" + variant_sequence_str hgvs_synonym += reference_sequence_str + ">" + variant_sequence_str elif variant_type == "SO:0000667": # insertion hgvs_nomenclature += '_' + end_position_str + 'ins' + variant_sequence_str hgvs_synonym += '_' + end_position_str + 'ins' + variant_sequence_str elif variant_type == "SO:0000159": # deletion hgvs_nomenclature += '_' + end_position_str + 'del' hgvs_synonym += '_' + end_position_str + 'del' elif variant_type == "SO:0002007": # MNV hgvs_nomenclature += '_' + end_position_str + 'delins' + variant_sequence_str hgvs_synonym += '_' + end_position_str + 'delins' + variant_sequence_str elif variant_type == "SO:1000032": # DELIN hgvs_nomenclature += '_' + end_position_str + 'delins' + variant_sequence_str hgvs_synonym += '_' + end_position_str + 'delins' + variant_sequence_str else: hgvs_nomenclature = '' hgvs_synonym = '' return hgvs_nomenclature, hgvs_synonym def get_generators(self, variant_data, batch_size): # noqa """Get Generators.""" data_providers = [] release = "" variants = [] variant_genomic_locations = [] variant_so_terms = [] cross_references = [] counter = 0 date_produced = variant_data['metaData']['dateProduced'] self.data_providers_process(variant_data) load_key = date_produced + self.data_provider + "_VARIATION" if 'release' in variant_data['metaData']: release = variant_data['metaData']['release'] assemblies = {} for allele_record in variant_data['data']: chromosome = allele_record["chromosome"] if chromosome.startswith("chr"): chromosome_str = chromosome[3:] else: chromosome_str = chromosome assembly = allele_record["assembly"] if assembly not in assemblies: self.logger.info(assembly) context_info = ContextInfo() data_manager = 
DataFileManager( context_info.config_file_location) assemblies[assembly] = AssemblySequenceHelper( assembly, data_manager) so_term_id = allele_record.get('type') genomic_reference_sequence = allele_record.get( 'genomicReferenceSequence') genomic_variant_sequence = allele_record.get( 'genomicVariantSequence') if genomic_reference_sequence == 'N/A': genomic_reference_sequence = "" if genomic_variant_sequence == 'N/A': genomic_variant_sequence = "" padding_left = "" padding_right = "" if allele_record.get('start') != "" and allele_record.get( 'end') != "": # not insertion if so_term_id != "SO:0000667" and chromosome_str != "Unmapped_Scaffold_8_D1580_D1567": genomic_reference_sequence = assemblies[ assembly].get_sequence(chromosome_str, allele_record.get('start'), allele_record.get('end')) if allele_record.get('start') < allele_record.get('end'): start = allele_record.get('start') end = allele_record.get('end') else: start = allele_record.get('end') end = allele_record.get('start') padding_width = 500 if so_term_id != "SO:0000667": # not insertion start = start - 1 end = end + 1 left_padding_start = start - padding_width if left_padding_start < 1: left_padding_start = 1 padding_left = assemblies[assembly].get_sequence( chromosome_str, left_padding_start, start) right_padding_end = end + padding_width padding_right = assemblies[assembly].get_sequence( chromosome_str, end, right_padding_end) counter = counter + 1 global_id = allele_record.get('alleleId') mod_global_cross_ref_id = "" if self.test_object.using_test_data() is True: is_it_test_entry = self.test_object.check_for_test_id_entry( global_id) if is_it_test_entry is False: counter = counter - 1 continue cross_ref_primary_id = allele_record.get( 'sequenceOfReferenceAccessionNumber') local_cross_ref_id = cross_ref_primary_id.split(":")[1] prefix = cross_ref_primary_id.split(":")[0] cross_ref_complete_url = self.etlh.get_no_page_complete_url( local_cross_ref_id, prefix, global_id) xref_map = ETLHelper.get_xref_dict( local_cross_ref_id, prefix, "variant_sequence_of_reference", "sequence_of_reference_accession_number", global_id, cross_ref_complete_url, cross_ref_primary_id + "variant_sequence_of_reference") xref_map['dataId'] = global_id if cross_ref_primary_id is not None: cross_references.append(xref_map) if genomic_reference_sequence is not None: if len(genomic_reference_sequence) > 1000 and ( allele_record.get('type') == 'SO:1000002' or allele_record.get('type') == 'SO:1000008'): self.logger.debug("%s genomicReferenceSequence", allele_record.get('alleleId')) if genomic_variant_sequence is not None: if len(genomic_variant_sequence) > 1000 and ( allele_record.get('type') in ['SO:1000002', 'SO:1000008']): self.logger.debug("%s genomicVariantSequence", allele_record.get('alleleId')) hgvs_nomenclature, hgvs_synonym = self.get_hgvs_nomenclature( allele_record.get('sequenceOfReferenceAccessionNumber'), allele_record.get('type'), allele_record.get('start'), allele_record.get('end'), genomic_reference_sequence, genomic_variant_sequence, allele_record.get('assembly'), chromosome_str) if (genomic_reference_sequence is not None and len(genomic_reference_sequence) > 30000) \ or (genomic_variant_sequence is not None and len(genomic_variant_sequence) > 30000): self.logger.debug( "%s potentially has too long a sequence", allele_record.get('alleleId')) # TODO: fix typo in MGI Submission for this variant so # that it doesn't list a 40K bp point mutation.
if allele_record.get('alleleId') != 'MGI:6113870': variant_dataset = { "hgvs_nomenclature": hgvs_nomenclature, "genomicReferenceSequence": genomic_reference_sequence, "genomicVariantSequence": genomic_variant_sequence, "paddingLeft": padding_left, "paddingRight": padding_right, "alleleId": allele_record.get('alleleId'), "dataProviders": data_providers, "dateProduced": date_produced, "loadKey": load_key, "release": release, "modGlobalCrossRefId": mod_global_cross_ref_id, "dataProvider": self.data_provider, "variantHGVSSynonym": hgvs_synonym } variant_genomic_location_dataset = { "variantId": hgvs_nomenclature, "assembly": allele_record.get('assembly'), "chromosome": chromosome_str, "start": allele_record.get('start'), "end": allele_record.get('end'), "uuid": str(uuid.uuid4()), "dataProvider": self.data_provider } variant_so_term = { "variantId": hgvs_nomenclature, "soTermId": allele_record.get('type') } variant_so_terms.append(variant_so_term) variant_genomic_locations.append( variant_genomic_location_dataset) variants.append(variant_dataset) if counter == batch_size: yield [ variants, variant_genomic_locations, variant_so_terms, cross_references ] counter = 0 variants = [] variant_genomic_locations = [] variant_so_terms = [] cross_references = [] if counter > 0: yield [ variants, variant_genomic_locations, variant_so_terms, cross_references ]
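# get_hgvs_nomenclature never reads self, so it can be exercised unbound. A quick
# sanity check of the substitution branch (SO:1000002) using made-up coordinates:
def _hgvs_substitution_sketch():
    """Show the expected HGVS strings for a point mutation (illustrative only)."""
    hgvs, synonym = VariationETL.get_hgvs_nomenclature(
        None,                   # self is never referenced
        "RefSeq:NC_005100.4",   # sequenceOfReferenceAccessionNumber (made up)
        "SO:1000002",           # point mutation / substitution
        1234, 1234,             # start / end positions
        "A", "T",               # reference / variant sequences
        "Rnor_6.0", "1")        # assembly / chromosome
    assert hgvs == "NC_005100.4:g.1234A>T"
    assert synonym == "(Rnor_6.0)1:1234A>T"
    return hgvs, synonym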