def get_generators(self, expression_file, batch_size):
    """Stream expression annotations from a JSON file and yield them in batches.

    Records are read lazily from the ``data`` array of ``expression_file``
    with ijson. Every accepted record that has a ``whereExpressed`` section
    contributes row dicts to the per-batch lists. A batch is emitted once
    ``batch_size`` records have been accepted, and a final partial batch is
    emitted at end of input.

    Args:
        expression_file: Path of the UTF-8 JSON file to stream.
        batch_size: Number of accepted records per yielded batch.

    Yields:
        list: Seventeen lists of row dicts, always in this order:
        bio_entities, bio_entity_gene_aos, bio_join_entities,
        ao_expressions, cc_expressions, ao_cc_expressions, ao_qualifiers,
        ao_substructures, ao_ss_qualifiers, cc_qualifiers, stage_list,
        stage_uberon_data, uberon_ao_data, uberon_ao_other_data,
        uberon_stage_other_data, cross_references, pubs.
    """
    self.logger.debug("made it to the expression generator")

    counter = 0

    cross_references = []
    bio_entities = []
    bio_join_entities = []
    bio_entity_gene_aos = []
    pubs = []
    ao_expressions = []
    cc_expressions = []
    ao_qualifiers = []
    ao_substructures = []
    ao_ss_qualifiers = []
    cc_qualifiers = []
    ao_cc_expressions = []
    stage_list = []
    stage_uberon_data = []
    uberon_ao_data = []
    uberon_ao_other_data = []
    uberon_stage_other_data = []

    def current_batch():
        # Single definition of the yield order; reads whichever lists the
        # enclosing names are bound to at call time.
        return [
            bio_entities,
            bio_entity_gene_aos,
            bio_join_entities,
            ao_expressions,
            cc_expressions,
            ao_cc_expressions,
            ao_qualifiers,
            ao_substructures,
            ao_ss_qualifiers,
            cc_qualifiers,
            stage_list,
            stage_uberon_data,
            uberon_ao_data,
            uberon_ao_other_data,
            uberon_stage_other_data,
            cross_references,
            pubs,
        ]

    self.logger.debug("streaming json data from %s ...", expression_file)
    with codecs.open(expression_file, 'r', 'utf-8') as file_handle:
        for xpat in ijson.items(file_handle, 'data.item'):
            gene_id = xpat.get('geneId')

            # Records outside the test set are skipped entirely and do not
            # count towards the batch size.
            if self.test_object.using_test_data() is True:
                if self.test_object.check_for_test_id_entry(gene_id) is False:
                    continue

            counter += 1

            pub_med_url = None
            pub_mod_url = None
            pub_med_id = ""
            publication_mod_id = ""
            stage_term_id = ""
            stage_name = ""
            stage_uberon_term_id = ""

            evidence = xpat.get('evidence')

            if 'publicationId' in evidence:
                publication_id = evidence.get('publicationId')
                if publication_id.startswith('PMID:'):
                    # PubMed publication: the MOD id, if any, comes from the
                    # evidence cross reference.
                    pub_med_id = publication_id
                    id_parts = pub_med_id.split(":")
                    pub_med_url = ETLHelper.get_no_page_complete_url(
                        id_parts[1], self.xref_url_map, id_parts[0], gene_id)

                    if 'crossReference' in evidence:
                        publication_mod_id = evidence.get('crossReference').get('id')
                        if publication_mod_id is not None:
                            pub_mod_url = ETLHelper.get_expression_pub_annotation_xref(
                                publication_mod_id)
                else:
                    # Non-PubMed publication: the publication id is the MOD id.
                    publication_mod_id = publication_id
                    pub_mod_url = ETLHelper.get_expression_pub_annotation_xref(
                        publication_mod_id)

                if publication_mod_id is None:
                    publication_mod_id = ""

            pub_primary_key = pub_med_id + publication_mod_id
            assay = xpat.get('assay')

            if 'whereExpressed' in xpat:
                where_expressed = xpat.get('whereExpressed')
                cellular_component_qualifier_term_id = where_expressed.get(
                    'cellularComponentQualifierTermId')
                cellular_component_term_id = where_expressed.get(
                    'cellularComponentTermId')
                anatomical_structure_term_id = where_expressed.get(
                    'anatomicalStructureTermId')
                anatomical_structure_qualifier_term_id = where_expressed.get(
                    'anatomicalStructureQualifierTermId')
                anatomical_sub_structure_term_id = where_expressed.get(
                    'anatomicalSubStructureTermId')
                anatomical_sub_structure_qualifier_term_id = where_expressed.get(
                    'anatomicalSubStructureQualifierTermId')

                # Normalise before the statement is concatenated into the
                # unique keys below; a null statement would otherwise raise
                # TypeError on the string concatenation.
                where_expressed_statement = where_expressed.get(
                    'whereExpressedStatement')
                if where_expressed_statement is None:
                    where_expressed_statement = ""

                when_expressed_stage = xpat.get('whenExpressed')

                if 'stageTermId' in when_expressed_stage:
                    stage_term_id = when_expressed_stage.get('stageTermId')
                if 'stageName' in when_expressed_stage:
                    stage_name = when_expressed_stage.get('stageName')

                # TODO: making unique BioEntityGeneExpressionJoin nodes
                # and ExpressionBioEntity nodes is tedious.
                # TODO: Lets get the DQMs to fix this.
                expression_unique_key = gene_id + assay + stage_name
                expression_entity_unique_key = ""

                if anatomical_structure_term_id is not None:
                    expression_unique_key += anatomical_structure_term_id
                    expression_entity_unique_key = anatomical_structure_term_id

                    if anatomical_structure_qualifier_term_id is not None:
                        expression_unique_key += anatomical_structure_qualifier_term_id
                        expression_entity_unique_key += \
                            anatomical_structure_qualifier_term_id

                if cellular_component_term_id is not None:
                    expression_unique_key += cellular_component_term_id
                    expression_entity_unique_key += cellular_component_term_id

                    if cellular_component_qualifier_term_id is not None:
                        expression_unique_key += cellular_component_qualifier_term_id
                        expression_entity_unique_key += \
                            cellular_component_qualifier_term_id

                if anatomical_sub_structure_term_id is not None:
                    # NOTE(review): the sub-structure term id is added to the
                    # join key only, not to the entity key - kept as found.
                    expression_unique_key += anatomical_sub_structure_term_id

                    if anatomical_sub_structure_qualifier_term_id is not None:
                        expression_unique_key += \
                            anatomical_sub_structure_qualifier_term_id
                        expression_entity_unique_key += \
                            anatomical_sub_structure_qualifier_term_id

                expression_entity_unique_key += where_expressed_statement
                expression_unique_key += where_expressed_statement

                # Structure and sub-structure slim terms feed the same two
                # lists; 'Other' is recorded without an Uberon id.
                for slim_key in ('anatomicalStructureUberonSlimTermIds',
                                 'anatomicalSubStructureUberonSlimTermIds'):
                    slim_terms = where_expressed.get(slim_key)
                    if slim_terms is None:
                        continue
                    for slim_term in slim_terms:
                        uberon_term_id = slim_term.get('uberonTerm')
                        if uberon_term_id is None:
                            continue
                        if uberon_term_id == 'Other':
                            uberon_ao_other_data.append({
                                "ebe_uuid": expression_entity_unique_key
                            })
                        else:
                            uberon_ao_data.append({
                                "ebe_uuid": expression_entity_unique_key,
                                "aoUberonId": uberon_term_id
                            })

                if cellular_component_term_id is None:
                    cellular_component_term_id = ""

                stage_uberon_slim_term = when_expressed_stage.get(
                    'stageUberonSlimTerm')
                if stage_uberon_slim_term is not None:
                    stage_uberon_term_id = stage_uberon_slim_term.get("uberonTerm")
                    if stage_uberon_term_id == "post embryonic, pre-adult":
                        uberon_stage_other_data.append({
                            "ei_uuid": expression_unique_key
                        })
                    elif stage_uberon_term_id is not None:
                        stage_uberon_data.append({
                            "uberonStageId": stage_uberon_term_id,
                            "ei_uuid": expression_unique_key
                        })

                if stage_term_id is None or stage_name == 'N/A':
                    stage_term_id = ""
                    stage_name = ""
                    stage_uberon_term_id = ""

                if stage_name is not None:
                    stage_list.append({
                        "stageTermId": stage_term_id,
                        "stageName": stage_name,
                        "ei_uuid": expression_unique_key
                    })
                else:
                    stage_uberon_term_id = ""

                if 'crossReference' in xpat:
                    cross_ref = xpat.get('crossReference')
                    cross_ref_id = cross_ref.get('id')
                    cross_ref_parts = cross_ref_id.split(":")
                    local_cross_ref_id = cross_ref_parts[1]
                    prefix = cross_ref_parts[0]
                    pages = cross_ref.get('pages')

                    # some pages collection have 0 elements
                    if pages:
                        for page in pages:
                            if page == 'gene/expression/annotation/detail':
                                mod_global_cross_ref_id = \
                                    ETLHelper.get_page_complete_url(
                                        local_cross_ref_id,
                                        self.xref_url_map,
                                        prefix,
                                        page)

                                xref = ETLHelper.get_xref_dict(
                                    local_cross_ref_id,
                                    prefix,
                                    page,
                                    page,
                                    cross_ref_id,
                                    mod_global_cross_ref_id,
                                    cross_ref_id + page)
                                xref['ei_uuid'] = expression_unique_key
                                cross_references.append(xref)

                bio_entities.append({
                    "ebe_uuid": expression_entity_unique_key,
                    "whereExpressedStatement": where_expressed_statement
                })

                bio_join_entities.append({
                    "ei_uuid": expression_unique_key,
                    "assay": assay
                })

                bio_entity_gene_aos.append({
                    "geneId": gene_id,
                    "ebe_uuid": expression_entity_unique_key,
                    "anatomicalStructureTermId": anatomical_structure_term_id,
                    "ei_uuid": expression_unique_key
                })

                pubs.append({
                    "ei_uuid": expression_unique_key,
                    "pubPrimaryKey": pub_primary_key,
                    "pubMedId": pub_med_id,
                    "pubMedUrl": pub_med_url,
                    "pubModId": publication_mod_id,
                    "pubModUrl": pub_mod_url
                })

                ao_expressions.append({
                    "geneId": gene_id,
                    "whenExpressedStage": when_expressed_stage,
                    "pubMedId": pub_med_id,
                    "pubMedUrl": pub_med_url,
                    "pubModId": publication_mod_id,
                    "pubModUrl": pub_mod_url,
                    "pubPrimaryKey": pub_primary_key,
                    "uuid": str(uuid.uuid4()),
                    "assay": assay,
                    "anatomicalStructureTermId": anatomical_structure_term_id,
                    "whereExpressedStatement": where_expressed_statement,
                    "ei_uuid": expression_unique_key,
                    "ebe_uuid": expression_entity_unique_key
                })

                if cellular_component_qualifier_term_id is not None:
                    cc_qualifiers.append({
                        "ebe_uuid": expression_entity_unique_key,
                        "cellularComponentQualifierTermId":
                            cellular_component_qualifier_term_id
                    })

                if anatomical_structure_term_id is None:
                    anatomical_structure_term_id = ""

                    # NOTE(review): cellular-component-only rows are emitted
                    # only when there is no anatomical structure term -
                    # confirm against the consuming query.
                    cc_expressions.append({
                        "geneId": gene_id,
                        "whenExpressedStage": when_expressed_stage,
                        "pubMedId": pub_med_id,
                        "pubMedUrl": pub_med_url,
                        "pubModId": publication_mod_id,
                        "pubModUrl": pub_mod_url,
                        "pubPrimaryKey": pub_primary_key,
                        "assay": assay,
                        "whereExpressedStatement": where_expressed_statement,
                        "cellularComponentTermId": cellular_component_term_id,
                        "ei_uuid": expression_unique_key,
                        "ebe_uuid": expression_entity_unique_key
                    })

                if anatomical_structure_qualifier_term_id is not None:
                    ao_qualifiers.append({
                        "ebe_uuid": expression_entity_unique_key,
                        "anatomicalStructureQualifierTermId":
                            anatomical_structure_qualifier_term_id
                    })

                if anatomical_sub_structure_term_id is not None:
                    ao_substructures.append({
                        "ebe_uuid": expression_entity_unique_key,
                        "anatomicalSubStructureTermId":
                            anatomical_sub_structure_term_id
                    })

                if anatomical_sub_structure_qualifier_term_id is not None:
                    ao_ss_qualifiers.append({
                        "ebe_uuid": expression_entity_unique_key,
                        "anatomicalSubStructureQualifierTermId":
                            anatomical_sub_structure_qualifier_term_id
                    })

                # Both ids are strings here (None was mapped to "" above),
                # so truthiness means "present and non-empty".
                if anatomical_structure_term_id and cellular_component_term_id:
                    ao_cc_expressions.append({
                        "geneId": gene_id,
                        "whenExpressedStage": when_expressed_stage,
                        "pubMedId": pub_med_id,
                        "pubMedUrl": pub_med_url,
                        "pubModId": publication_mod_id,
                        "pubModUrl": pub_mod_url,
                        "pubPrimaryKey": pub_primary_key,
                        "uuid": str(uuid.uuid4()),
                        "stageTermId": stage_term_id,
                        "stageName": stage_name,
                        "stageUberonTermId": stage_uberon_term_id,
                        "assay": assay,
                        "cellularComponentTermId": cellular_component_term_id,
                        "anatomicalStructureTermId": anatomical_structure_term_id,
                        "whereExpressedStatement": where_expressed_statement,
                        "ei_uuid": expression_unique_key,
                        "ebe_uuid": expression_entity_unique_key
                    })

            if counter == batch_size:
                yield current_batch()
                # Rebind (do not clear) so the consumer keeps the lists it
                # was just handed.
                bio_entities = []
                bio_join_entities = []
                ao_expressions = []
                cc_expressions = []
                ao_qualifiers = []
                ao_substructures = []
                ao_ss_qualifiers = []
                cc_qualifiers = []
                ao_cc_expressions = []
                stage_list = []
                uberon_stage_other_data = []
                stage_uberon_data = []
                uberon_ao_other_data = []
                uberon_ao_data = []
                cross_references = []
                bio_entity_gene_aos = []
                pubs = []
                counter = 0

        if counter > 0:
            yield current_batch()
def get_generators(self, phenotype_data, batch_size):
    """Yield batches of phenotype annotation rows from a parsed submission.

    Args:
        phenotype_data: Parsed JSON document. ``metaData`` supplies
            ``dateProduced`` and ``dataProvider``; ``data`` is the list of
            phenotype annotation records.
        batch_size: Number of accepted records per yielded batch.

    Yields:
        list: ``[phenotypes, phenotypes, phenotypes, pges, pges]`` - the
        same phenotype list three times followed by the same
        primary-genetic-entity list twice (presumably one slot per
        consumer; confirm against the caller).
    """
    list_to_yield = []
    pge_list_to_yield = []
    counter = 0

    metadata = phenotype_data['metaData']
    date_produced = metadata['dateProduced']
    data_provider_object = metadata['dataProvider']
    data_provider_cross_ref = data_provider_object.get('crossReference')
    data_provider = data_provider_cross_ref.get('id')
    data_provider_pages = data_provider_cross_ref.get('pages')
    load_key = date_produced + data_provider + "_phenotype"

    data_providers = []
    # NOTE(review): built but never read again in this method; kept because
    # the helper calls may matter to the caller - confirm before removing.
    data_provider_cross_ref_set = []

    if data_provider_pages is not None:
        for data_provider_page in data_provider_pages:
            cross_ref_complete_url = ETLHelper.get_page_complete_url(
                data_provider,
                ETL.xref_url_map,
                data_provider,
                data_provider_page)

            data_provider_cross_ref_set.append(ETLHelper.get_xref_dict(
                data_provider,
                data_provider,
                data_provider_page,
                data_provider_page,
                data_provider,
                cross_ref_complete_url,
                data_provider + data_provider_page))

            data_providers.append(data_provider)
            self.logger.debug("data provider: %s", data_provider)

    for pheno in phenotype_data['data']:
        primary_id = pheno.get('objectId')

        # Records outside the test set are skipped and do not count towards
        # the batch size.
        if self.test_object.using_test_data() is True:
            if self.test_object.check_for_test_id_entry(primary_id) is False:
                continue

        counter += 1
        pecj_primary_key = str(uuid.uuid4())
        phenotype_statement = pheno.get('phenotypeStatement')

        pub_med_id = None
        pub_mod_id = None
        pub_med_url = None
        pub_mod_url = None

        evidence = pheno.get('evidence')

        if 'publicationId' in evidence:
            publication_id = evidence.get('publicationId')
            if publication_id.startswith('PMID:'):
                # PubMed publication: the MOD id, if any, comes from the
                # evidence cross reference.
                pub_med_id = publication_id
                id_parts = pub_med_id.split(":")
                pub_med_url = ETLHelper.get_no_page_complete_url(
                    id_parts[1], self.xref_url_map, id_parts[0], primary_id)

                if 'crossReference' in evidence:
                    pub_mod_id = evidence.get('crossReference').get('id')
                    # Split only after the None check; a cross reference
                    # without an id must not raise.
                    if pub_mod_id is not None:
                        pub_mod_local_id = pub_mod_id.split(":")[1]
                        pub_mod_url = ETLHelper.get_complete_pub_url(
                            pub_mod_local_id, pub_mod_id)
            else:
                # Non-PubMed publication: the publication id is the MOD id.
                pub_mod_id = publication_id
                pub_mod_local_id = pub_mod_id.split(":")[1]
                pub_mod_url = ETLHelper.get_complete_pub_url(
                    pub_mod_local_id, pub_mod_id)

        if pub_med_id is None:
            pub_med_id = ""
        if pub_mod_id is None:
            pub_mod_id = ""

        date_assigned = pheno.get('dateAssigned')

        # Both ids are normalised to "" above, so test for emptiness; an
        # ``is None`` test here could never fire.
        if not pub_mod_id and not pub_med_id:
            self.logger.info("%s is missing pubMed and pubMod id", primary_id)

        if 'primaryGeneticEntityIDs' in pheno:
            for pge in pheno.get('primaryGeneticEntityIDs'):
                pge_list_to_yield.append({
                    "pecjPrimaryKey": pecj_primary_key,
                    "pgeId": pge
                })

        stripped_statement = phenotype_statement.strip()
        list_to_yield.append({
            "primaryId": primary_id,
            "phenotypeUniqueKey": primary_id + stripped_statement,
            "phenotypeStatement": stripped_statement,
            "dateAssigned": date_assigned,
            "loadKey": load_key,
            "type": "gene",
            "dataProviders": data_providers,
            "dataProvider": data_provider,
            "dateProduced": date_produced,
            "pubMedId": pub_med_id,
            "pubMedUrl": pub_med_url,
            "pubModId": pub_mod_id,
            "pubModUrl": pub_mod_url,
            "pubPrimaryKey": pub_med_id + pub_mod_id,
            "pecjPrimaryKey": pecj_primary_key
        })

        if counter == batch_size:
            yield [
                list_to_yield,
                list_to_yield,
                list_to_yield,
                pge_list_to_yield,
                pge_list_to_yield
            ]
            list_to_yield = []
            pge_list_to_yield = []
            counter = 0

    if counter > 0:
        yield [
            list_to_yield,
            list_to_yield,
            list_to_yield,
            pge_list_to_yield,
            pge_list_to_yield
        ]
def get_generators(self, variant_data, batch_size):
    """Yield batches of variant rows built from a parsed variant submission.

    For every accepted allele record the reference sequence and up to 500
    bases of padding on each side are fetched through an
    ``AssemblySequenceHelper`` (one per assembly, created on first use),
    the HGVS name is computed via ``self.get_hgvs_nomenclature``, and row
    dicts are appended to the per-batch lists.

    Args:
        variant_data: Parsed JSON document. ``metaData`` supplies
            ``dateProduced``, ``dataProvider`` and optionally ``release``;
            ``data`` is the list of allele records.
        batch_size: Number of accepted records per yielded batch.

    Yields:
        list: ``[variants, variant_genomic_locations, variant_so_terms,
        cross_references]`` for each batch.
    """
    data_providers = []
    release = ""
    variants = []
    variant_genomic_locations = []
    variant_so_terms = []
    cross_references = []
    counter = 0

    metadata = variant_data['metaData']
    date_produced = metadata['dateProduced']
    data_provider_object = metadata['dataProvider']
    data_provider_cross_ref = data_provider_object.get('crossReference')
    data_provider = data_provider_cross_ref.get('id')
    data_provider_pages = data_provider_cross_ref.get('pages')
    # NOTE(review): built but never read again in this method.
    data_provider_cross_ref_set = []

    load_key = date_produced + data_provider + "_VARIATION"

    if data_provider_pages is not None:
        for data_provider_page in data_provider_pages:
            cross_ref_complete_url = ETLHelper.get_page_complete_url(
                data_provider,
                self.xref_url_map,
                data_provider,
                data_provider_page)

            data_provider_cross_ref_set.append(ETLHelper.get_xref_dict(
                data_provider,
                data_provider,
                data_provider_page,
                data_provider_page,
                data_provider,
                cross_ref_complete_url,
                data_provider + data_provider_page))

            data_providers.append(data_provider)
            self.logger.debug("data provider: %s", data_provider)

    if 'release' in metadata:
        release = metadata['release']

    # One sequence helper per assembly, created lazily.
    assemblies = {}
    padding_width = 500
    large_sequence_types = ('SO:1000002', 'SO:1000008')

    for allele_record in variant_data['data']:
        global_id = allele_record.get('alleleId')

        # Filter first so skipped records never trigger assembly loading or
        # sequence lookups, and never count towards the batch size.
        if self.test_object.using_test_data() is True:
            if self.test_object.check_for_test_id_entry(global_id) is False:
                continue

        counter += 1

        chromosome = allele_record["chromosome"]
        if chromosome.startswith("chr"):
            chromosome_str = chromosome[3:]
        else:
            chromosome_str = chromosome

        assembly = allele_record["assembly"]

        if assembly not in assemblies:
            self.logger.info(assembly)
            context_info = ContextInfo()
            data_manager = DataFileManager(context_info.config_file_location)
            assemblies[assembly] = AssemblySequenceHelper(assembly, data_manager)

        so_term_id = allele_record.get('type')
        allele_start = allele_record.get('start')
        allele_end = allele_record.get('end')
        genomic_reference_sequence = allele_record.get('genomicReferenceSequence')
        genomic_variant_sequence = allele_record.get('genomicVariantSequence')

        if genomic_reference_sequence == 'N/A':
            genomic_reference_sequence = ""
        if genomic_variant_sequence == 'N/A':
            genomic_variant_sequence = ""

        padding_left = ""
        padding_right = ""
        if allele_start != "" and allele_end != "":

            # not insertion
            if so_term_id != "SO:0000667" \
                    and chromosome_str != "Unmapped_Scaffold_8_D1580_D1567":
                genomic_reference_sequence = assemblies[assembly].get_sequence(
                    chromosome_str, allele_start, allele_end)

            # Order the coordinates so start <= end before padding.
            if allele_start < allele_end:
                start = allele_start
                end = allele_end
            else:
                start = allele_end
                end = allele_start

            if so_term_id != "SO:0000667":  # not insertion
                start = start - 1
                end = end + 1

            left_padding_start = start - padding_width
            if left_padding_start < 1:
                left_padding_start = 1

            padding_left = assemblies[assembly].get_sequence(
                chromosome_str, left_padding_start, start)
            right_padding_end = end + padding_width
            padding_right = assemblies[assembly].get_sequence(
                chromosome_str, end, right_padding_end)

        mod_global_cross_ref_id = ""

        cross_ref_primary_id = allele_record.get(
            'sequenceOfReferenceAccessionNumber')

        # Build the xref only when an accession exists; the None check has
        # to come before the id is split. The list accumulates across the
        # whole batch and is reset only after a yield.
        if cross_ref_primary_id is not None:
            accession_parts = cross_ref_primary_id.split(":")
            local_cross_ref_id = accession_parts[1]
            prefix = accession_parts[0]

            cross_ref_complete_url = ETLHelper.get_no_page_complete_url(
                local_cross_ref_id, ETL.xref_url_map, prefix, global_id)
            xref_map = ETLHelper.get_xref_dict(
                local_cross_ref_id,
                prefix,
                "variant_sequence_of_reference",
                "sequence_of_reference_accession_number",
                global_id,
                cross_ref_complete_url,
                cross_ref_primary_id + "variant_sequence_of_reference")
            xref_map['dataId'] = global_id
            cross_references.append(xref_map)

        if genomic_reference_sequence is not None:
            if len(genomic_reference_sequence) > 1000 \
                    and so_term_id in large_sequence_types:
                self.logger.debug("%s genomicReferenceSequence", global_id)

        if genomic_variant_sequence is not None:
            if len(genomic_variant_sequence) > 1000 \
                    and so_term_id in large_sequence_types:
                self.logger.debug("%s genomicVariantSequence", global_id)

        hgvs_nomenclature, hgvs_synonym = self.get_hgvs_nomenclature(
            cross_ref_primary_id,
            so_term_id,
            allele_start,
            allele_end,
            genomic_reference_sequence,
            genomic_variant_sequence,
            assembly,
            chromosome_str)

        if (genomic_reference_sequence is not None
                and len(genomic_reference_sequence) > 30000) \
                or (genomic_variant_sequence is not None
                    and len(genomic_variant_sequence) > 30000):
            self.logger.debug("%s has too long of a sequence potentionally",
                              global_id)

        # TODO: fix typo in MGI Submission for this variant so
        # that it doesn't list a 40K bp point mutation.
        if global_id != 'MGI:6113870':
            variant_dataset = {
                "hgvs_nomenclature": hgvs_nomenclature,
                "genomicReferenceSequence": genomic_reference_sequence,
                "genomicVariantSequence": genomic_variant_sequence,
                "paddingLeft": padding_left,
                "paddingRight": padding_right,
                "alleleId": global_id,
                "dataProviders": data_providers,
                "dateProduced": date_produced,
                "loadKey": load_key,
                "release": release,
                "modGlobalCrossRefId": mod_global_cross_ref_id,
                "dataProvider": data_provider,
                "variantHGVSSynonym": hgvs_synonym
            }

            variant_genomic_location_dataset = {
                "variantId": hgvs_nomenclature,
                "assembly": assembly,
                "chromosome": chromosome_str,
                "start": allele_start,
                "end": allele_end,
                "uuid": str(uuid.uuid4()),
                "dataProvider": data_provider
            }

            variant_so_term = {
                "variantId": hgvs_nomenclature,
                "soTermId": so_term_id
            }

            variant_so_terms.append(variant_so_term)
            variant_genomic_locations.append(variant_genomic_location_dataset)
            variants.append(variant_dataset)

        if counter == batch_size:
            yield [
                variants,
                variant_genomic_locations,
                variant_so_terms,
                cross_references
            ]
            variants = []
            variant_genomic_locations = []
            variant_so_terms = []
            cross_references = []
            # Start counting the next batch; without this only the first
            # batch would respect batch_size.
            counter = 0

    if counter > 0:
        yield [
            variants,
            variant_genomic_locations,
            variant_so_terms,
            cross_references
        ]