def scan_data_from_es(self, query=None, include_metadata=False):
    """Scan this handler's index, handing every document to ``self.on_doc_for_scan``.

    :param query: optional query dict forwarded to ``es_util.scan_index``. The
        caller's dict is never modified: when the metadata exclusion has to be
        added, it is added to a private deep copy.
    :param include_metadata: when False (the default), ``['_metadata.*']`` is
        written at the ``_source.excludes`` path of the query before scanning.
        When True the query is forwarded untouched (``None`` included).
    """
    if not include_metadata:
        # Function-scope import keeps this block self-contained (stdlib only).
        import copy

        # put_js_path_in_dict writes into the dict it receives, so work on a
        # copy; otherwise the exclusion would leak into the caller's query and
        # stick to any later reuse of that same dict.
        query = {} if query is None else copy.deepcopy(query)
        util.put_js_path_in_dict(query, '_source.excludes', ['_metadata.*'])
    es_util.scan_index(self.RESOURCE.idx_name, self.on_doc_for_scan, query=query)
class CompoundStructuralAlertDenormalizationHandler(DenormalizationHandler):
    """Collects structural alert documents per molecule and saves them on the MOLECULE resource."""

    RESOURCE = DenormalizationHandler.AVAILABLE_RESOURCES.COMPOUND_STRUCTURAL_ALERT

    # Mapping of a single alert entry: built from ``mappings`` with the
    # 'properties.cpd_str_alert_id' path set to DefaultMappings.ID_REF.
    ALERTS_MAPPINGS = SummableDict(mappings)
    put_js_path_in_dict(ALERTS_MAPPINGS, 'properties.cpd_str_alert_id', DefaultMappings.ID_REF)

    # Mapping passed as ``new_mappings`` when the alerts are saved.
    COMPOUND_MAPPING = {
        'properties': {
            '_metadata': {
                'properties': {
                    'compound_structural_alerts': {
                        'properties': {
                            'alert_count': DefaultMappings.LONG,
                            'alerts': ALERTS_MAPPINGS
                        }
                    }
                }
            }
        }
    }

    def __init__(self):
        super().__init__()
        # molecule_chembl_id -> list of alert documents seen for that molecule
        self.compound_dict = {}

    def handle_doc(self, es_doc: dict, total_docs: int, index: int, first: bool, last: bool):
        """Store ``es_doc`` under its 'molecule_chembl_id'; documents without one are ignored."""
        molecule_id = es_doc.get('molecule_chembl_id', None)
        if not molecule_id:
            return
        self.compound_dict.setdefault(molecule_id, []).append(es_doc)

    def save_denormalization(self):
        """Save the collected alerts through ``save_denormalization_dict`` on the MOLECULE resource."""

        def get_update_script_and_size(compound_chembl_id, compound_alerts):
            # Returns (update document, update size) for one molecule; the size
            # is twice the number of alerts.
            alerts = list(compound_alerts)
            size = len(compound_alerts) * 2
            alerts_metadata = {
                'alert_count': len(alerts),
                'alerts': alerts
            }
            document = {'_metadata': {'compound_structural_alerts': alerts_metadata}}
            return document, size

        target_resource = DenormalizationHandler.AVAILABLE_RESOURCES.MOLECULE
        self.save_denormalization_dict(
            target_resource,
            self.compound_dict,
            get_update_script_and_size,
            new_mappings=self.COMPOUND_MAPPING
        )
def get_compound_dn_mapping(cls):
    """Build and return the mapping dict for the '_metadata.hierarchy' fields.

    Every (path, mapping) pair below is registered, in order, through
    ``put_js_path_in_dict`` with ``es_properties_style=True``; the exact nested
    layout of the result is whatever that helper produces for each path.
    """
    src_mapping = {
        'src_id': DefaultMappings.SHORT,
        'src_description': DefaultMappings.KEYWORD,
        'src_short_name': DefaultMappings.KEYWORD
    }
    path_mappings = (
        ('._metadata.hierarchy.family_inchi_connectivity_layer', DefaultMappings.KEYWORD),
        ('._metadata.hierarchy.is_approved_drug', DefaultMappings.BOOLEAN),
        ('._metadata.hierarchy.is_usan', DefaultMappings.BOOLEAN),
        ('._metadata.hierarchy.all_family.inchi', DefaultMappings.KEYWORD),
        ('._metadata.hierarchy.all_family.inchi_connectivity_layer', DefaultMappings.KEYWORD),
        ('._metadata.hierarchy.all_family.inchi_key', DefaultMappings.KEYWORD),
        ('._metadata.hierarchy.all_family.chembl_id', DefaultMappings.CHEMBL_ID_REF),
        # Parent properties
        ('._metadata.hierarchy.parent.chembl_id', DefaultMappings.CHEMBL_ID_REF),
        ('._metadata.hierarchy.parent.sources.', src_mapping),
        ('._metadata.hierarchy.parent.synonyms', molecule_n_drug_mapping.molecule_synonyms),
        # Children properties
        ('._metadata.hierarchy.children.chembl_id', DefaultMappings.CHEMBL_ID_REF),
        ('._metadata.hierarchy.children.sources.', src_mapping),
        ('._metadata.hierarchy.children.synonyms', molecule_n_drug_mapping.molecule_synonyms),
    )

    mappings_dict = {}
    for js_path, field_mapping in path_mappings:
        put_js_path_in_dict(mappings_dict, js_path, field_mapping, es_properties_style=True)
    return mappings_dict
def get_denormalization_dict(self):
    """Recursively collect the hierarchy denormalization data of this node and its children.

    :return: a 3-tuple ``(dn_dict, shared_family_data, node_data)`` where
        ``dn_dict`` maps chembl ids (this node plus everything gathered from
        the children) to their update data, ``shared_family_data`` is the list
        of family member entries gathered so far, and ``node_data`` is the
        result of ``self.get_node_data()``.
    """
    result = SummableDict()
    own_data = self.compound_data

    drug_src = own_data['is_drug_src']
    usan_src = own_data['is_usan_src']
    db_drug = own_data['is_db_drug']
    top_phase = own_data['max_phase']

    family_members = [{
        'chembl_id': self.chembl_id,
        'inchi': own_data['inchi'],
        'inchi_connectivity_layer': get_inchi_connectivity_layer(own_data['inchi_key']),
        'inchi_key': own_data['inchi_key']
    }]

    node_data = self.get_node_data()

    children_data = []
    for child in self.children.values():
        child_data = child.compound_data
        # Source flags and max phase are aggregated over the direct children.
        drug_src |= child_data['is_drug_src']
        usan_src |= child_data['is_usan_src']
        top_phase = max(top_phase, child_data['max_phase'])

        child_dn, child_family, child_node_data = child.get_denormalization_dict()
        children_data.append(child_node_data)
        # Every child entry points back at this node as its parent.
        put_js_path_in_dict(child_dn, child.chembl_id + '._metadata.hierarchy.parent', node_data)
        result += child_dn
        family_members += child_family

    # Warning checks!
    any_drug_source = usan_src or drug_src
    if db_drug and db_drug != (any_drug_source and self.is_family_parent()):
        print(
            'WARNING! {0} has db_drug {1} and sources_drug {2}'.format(
                self.chembl_id, db_drug, any_drug_source),
            file=sys.stderr
        )
    db_phase = own_data['max_phase']
    if top_phase != db_phase:
        print(
            'WARNING! {0} has db_max_phase of {1} and children max_phase of {2}'.format(
                self.chembl_id, db_phase, top_phase),
            file=sys.stderr
        )

    result[self.chembl_id] = {}
    own_hierarchy = {
        'is_approved_drug': (drug_src and top_phase == 4),
        'is_usan': usan_src,
        'children': children_data
    }
    put_js_path_in_dict(result[self.chembl_id], '_metadata.hierarchy', own_hierarchy)

    # If root collect the shared family data
    if self.is_family_parent():
        family_layer = get_inchi_connectivity_layer(own_data['inchi_key'])
        for member_dn in result.values():
            put_js_path_in_dict(member_dn, '_metadata.hierarchy.all_family', family_members)
            put_js_path_in_dict(
                member_dn, '_metadata.hierarchy.family_inchi_connectivity_layer', family_layer)

    return result, family_members, node_data
class DocumentDenormalizationHandler(DenormalizationHandler):
    """Keeps per-document field subsets for assays/activities and completes documents with sources."""

    RESOURCE = DenormalizationHandler.AVAILABLE_RESOURCES.DOCUMENT

    # Document fields copied for activities, and the mapping registered for
    # them under '._metadata.document_data.<field>'.
    FIELDS_FOR_ACTIVITY = ['pubmed_id', 'volume', 'year', 'first_page']

    FIELDS_FOR_ACTIVITY_MAPPING = {}
    for field_i in FIELDS_FOR_ACTIVITY:
        put_js_path_in_dict(FIELDS_FOR_ACTIVITY_MAPPING, '._metadata.document_data.{0}'.format(field_i),
                            DefaultMappings.NO_INDEX_KEYWORD, es_properties_style=True)

    # Document fields copied for assays; the same field list also drives the
    # mapping registered under '._metadata.similar_documents.<field>'.
    FIELDS_FOR_ASSAY = [
        'journal', 'year', 'volume', 'first_page', 'last_page', 'title', 'pubmed_id', 'doi'
    ]

    FIELDS_FOR_ASSAY_MAPPING = {}
    FIELDS_FOR_DOC_SIM_MAPPING = {}
    for field_i in FIELDS_FOR_ASSAY:
        put_js_path_in_dict(FIELDS_FOR_ASSAY_MAPPING, '._metadata.document_data.{0}'.format(field_i),
                            DefaultMappings.NO_INDEX_KEYWORD, es_properties_style=True)
        put_js_path_in_dict(FIELDS_FOR_DOC_SIM_MAPPING, '._metadata.similar_documents.{0}'.format(field_i),
                            DefaultMappings.NO_INDEX_KEYWORD, es_properties_style=True)

    def __init__(self, assay_dh: AssayDenormalizationHandler = None,
                 source_dh: SourceDenormalizationHandler = None):
        """
        :param assay_dh: optional assay handler; its ``document_2_src_id`` is read
            when completing documents
        :param source_dh: optional source handler; its ``sources_by_id`` is read
            when completing documents
        """
        # The base handler receives True as soon as either helper handler is given.
        super().__init__(source_dh is not None or assay_dh is not None)
        self.assay_dh = assay_dh
        self.source_dh = source_dh
        # document_chembl_id -> {field: value}, one dict per consumer
        self.docs_for_assay_by_chembl_id = {}
        self.docs_for_activity_by_chembl_id = {}

    def handle_doc(self, doc: dict, total_docs: int, index: int, first: bool, last: bool):
        """Store the assay and activity field subsets of ``doc`` under its 'document_chembl_id'.

        :raises KeyError: if ``doc`` lacks 'document_chembl_id' or any listed field
        """
        doc_chembl_id = doc['document_chembl_id']
        self.docs_for_assay_by_chembl_id[doc_chembl_id] = {
            field: doc[field] for field in self.FIELDS_FOR_ASSAY
        }
        self.docs_for_activity_by_chembl_id[doc_chembl_id] = {
            field: doc[field] for field in self.FIELDS_FOR_ACTIVITY
        }

    def get_custom_mappings_for_complete_data(self):
        """Return a new SummableDict holding ``SourceDenormalizationHandler.METADATA_MAPPING``."""
        mappings = SummableDict()
        mappings += SourceDenormalizationHandler.METADATA_MAPPING
        return mappings

    def get_doc_for_complete_data(self, doc: dict):
        """Build the ``_metadata`` update for ``doc`` with the sources it is linked to.

        Source ids come from ``assay_dh.document_2_src_id`` (when an assay
        handler was given) plus the document's own 'src_id' (when present).
        Ids unknown to ``source_dh.sources_by_id`` are skipped.

        :param doc: document being completed
        :return: ``{'_metadata': {'source': [...]}}``; the list is empty when no
            source could be resolved
        """
        known_src_ids = ()
        # assay_dh is optional in the constructor, so it may be None here.
        if self.assay_dh is not None:
            known_src_ids = self.assay_dh.document_2_src_id.get(doc['document_chembl_id'], ())
        # Copy into a fresh set: adding the document's own src_id must not
        # modify the collection owned by the assay handler.
        src_ids = set(known_src_ids)
        if 'src_id' in doc:
            src_ids.add(doc['src_id'])

        sources = []
        if self.source_dh:
            sources_by_id = self.source_dh.sources_by_id
            sources = [sources_by_id[src_id] for src_id in src_ids if src_id in sources_by_id]

        # 'source' is always emitted, even when empty, as before.
        return {'_metadata': {'source': sources}}