Beispiel #1
0
 def scan_data_from_es(self, query=None, include_metadata=False):
     """Scan this handler's index, passing ``self.on_doc_for_scan`` as callback.

     :param query: optional query dict forwarded to ``es_util.scan_index``.
         When metadata is excluded and no query is given, an empty dict is
         used as the starting point.
     :param include_metadata: when false (the default), ``_source.excludes``
         is set to ``['_metadata.*']`` on the query before scanning.

     NOTE(review): a caller-supplied ``query`` is handed directly to
     ``util.put_js_path_in_dict``, which presumably modifies it in place --
     confirm whether callers rely on or are surprised by that.
     """
     if not include_metadata:
         if query is None:
             query = {}
         util.put_js_path_in_dict(
             query, '_source.excludes', ['_metadata.*'])

     es_util.scan_index(
         self.RESOURCE.idx_name, self.on_doc_for_scan, query=query)
class CompoundStructuralAlertDenormalizationHandler(DenormalizationHandler):
    """Denormalizes compound structural alerts into the MOLECULE resource.

    Scanned docs are grouped by their ``molecule_chembl_id``;
    ``save_denormalization`` then stores each group, together with its
    count, under ``_metadata.compound_structural_alerts``.
    """

    RESOURCE = DenormalizationHandler.AVAILABLE_RESOURCES.COMPOUND_STRUCTURAL_ALERT

    # Mapping for a single alert entry: built from the externally defined
    # ``mappings`` with ``cpd_str_alert_id`` set to DefaultMappings.ID_REF.
    ALERTS_MAPPINGS = SummableDict(mappings)
    put_js_path_in_dict(ALERTS_MAPPINGS, 'properties.cpd_str_alert_id',
                        DefaultMappings.ID_REF)

    # Mapping passed as ``new_mappings`` when the denormalization is saved.
    COMPOUND_MAPPING = {
        'properties': {
            '_metadata': {
                'properties': {
                    'compound_structural_alerts': {
                        'properties': {
                            'alert_count': DefaultMappings.LONG,
                            'alerts': ALERTS_MAPPINGS
                        }
                    }
                }
            }
        }
    }

    def __init__(self):
        super().__init__()
        # molecule_chembl_id -> list of alert docs collected by handle_doc.
        self.compound_dict = {}

    def handle_doc(self, es_doc: dict, total_docs: int, index: int,
                   first: bool, last: bool):
        """Store ``es_doc`` under its ``molecule_chembl_id``.

        Docs with a missing or falsy ``molecule_chembl_id`` are ignored.
        """
        molecule_c_id = es_doc.get('molecule_chembl_id')
        if not molecule_c_id:
            return
        self.compound_dict.setdefault(molecule_c_id, []).append(es_doc)

    def save_denormalization(self):
        """Hand the collected alerts to ``save_denormalization_dict``.

        The target resource is MOLECULE and ``COMPOUND_MAPPING`` is passed
        as ``new_mappings``.
        """

        def get_update_script_and_size(compound_chembl_id, compound_alerts):
            # Returns (update document, size value) for one compound.
            alerts = list(compound_alerts)
            alerts_metadata = {
                'alert_count': len(alerts),
                'alerts': alerts
            }
            update_doc = {
                '_metadata': {
                    'compound_structural_alerts': alerts_metadata
                }
            }
            # The size is twice the number of alerts; the rationale for the
            # factor is not visible in this class.
            return update_doc, len(compound_alerts) * 2

        self.save_denormalization_dict(
            DenormalizationHandler.AVAILABLE_RESOURCES.MOLECULE,
            self.compound_dict,
            get_update_script_and_size,
            new_mappings=self.COMPOUND_MAPPING)
 def get_compound_dn_mapping(cls):
     """Build and return the mappings dict for ``_metadata.hierarchy``.

     Each (path, mapping) pair below is registered, in order, through
     ``put_js_path_in_dict`` with ``es_properties_style=True``.
     """
     # Mapping shared by the parent and children ``sources`` entries.
     src_mapping = {
         'src_id': DefaultMappings.SHORT,
         'src_description': DefaultMappings.KEYWORD,
         'src_short_name': DefaultMappings.KEYWORD
     }
     path_mapping_pairs = [
         ('._metadata.hierarchy.family_inchi_connectivity_layer',
          DefaultMappings.KEYWORD),
         ('._metadata.hierarchy.is_approved_drug',
          DefaultMappings.BOOLEAN),
         ('._metadata.hierarchy.is_usan',
          DefaultMappings.BOOLEAN),
         ('._metadata.hierarchy.all_family.inchi',
          DefaultMappings.KEYWORD),
         ('._metadata.hierarchy.all_family.inchi_connectivity_layer',
          DefaultMappings.KEYWORD),
         ('._metadata.hierarchy.all_family.inchi_key',
          DefaultMappings.KEYWORD),
         ('._metadata.hierarchy.all_family.chembl_id',
          DefaultMappings.CHEMBL_ID_REF),
         # Parent properties
         ('._metadata.hierarchy.parent.chembl_id',
          DefaultMappings.CHEMBL_ID_REF),
         ('._metadata.hierarchy.parent.sources.',
          src_mapping),
         ('._metadata.hierarchy.parent.synonyms',
          molecule_n_drug_mapping.molecule_synonyms),
         # Children properties
         ('._metadata.hierarchy.children.chembl_id',
          DefaultMappings.CHEMBL_ID_REF),
         ('._metadata.hierarchy.children.sources.',
          src_mapping),
         ('._metadata.hierarchy.children.synonyms',
          molecule_n_drug_mapping.molecule_synonyms),
     ]

     mappings_dict = {}
     for js_path, field_mapping in path_mapping_pairs:
         put_js_path_in_dict(mappings_dict,
                             js_path,
                             field_mapping,
                             es_properties_style=True)
     return mappings_dict
    def get_denormalization_dict(self):
        """Collect hierarchy denormalization data for this node and its children.

        Recurses into every node in ``self.children`` and merges the results.
        Two consistency warnings are printed to ``sys.stderr``: one about
        the ``is_db_drug`` flag versus the source flags, one about
        ``max_phase`` versus the children's ``max_phase``.

        :return: a 3-tuple ``(dn_dict, shared_family_data, node_data)``:
            ``dn_dict`` is a SummableDict keyed by chembl id, holding the
            ``_metadata.hierarchy`` data for this node and the entries
            merged in from the children;
            ``shared_family_data`` is a list with one identifier dict
            (chembl_id / inchi / inchi_connectivity_layer / inchi_key) for
            this node plus the entries returned by the children;
            ``node_data`` is the value of ``self.get_node_data()``.
        """
        own_data = self.compound_data
        is_drug_src = own_data['is_drug_src']
        is_usan_src = own_data['is_usan_src']
        is_db_drug = own_data['is_db_drug']
        max_phase = own_data['max_phase']

        own_family_entry = {
            'chembl_id': self.chembl_id,
            'inchi': own_data['inchi'],
            'inchi_connectivity_layer':
            get_inchi_connectivity_layer(own_data['inchi_key']),
            'inchi_key': own_data['inchi_key']
        }
        shared_family_data = [own_family_entry]

        dn_dict = SummableDict()
        node_data = self.get_node_data()
        children_data = []

        for child in self.children.values():
            child_data = child.compound_data
            is_drug_src |= child_data['is_drug_src']
            is_usan_src |= child_data['is_usan_src']
            max_phase = max(max_phase, child_data['max_phase'])

            child_dn, child_family, child_node_data = \
                child.get_denormalization_dict()
            children_data.append(child_node_data)
            # Each child's own entry receives this node's data as its parent.
            put_js_path_in_dict(
                child_dn,
                child.chembl_id + '._metadata.hierarchy.parent',
                node_data)
            dn_dict += child_dn
            shared_family_data += child_family

        # Warning checks!
        sources_drug = is_usan_src or is_drug_src
        if is_db_drug and is_db_drug != (sources_drug
                                         and self.is_family_parent()):
            warning = 'WARNING! {0} has db_drug {1} and sources_drug {2}'.format(
                self.chembl_id, is_db_drug, sources_drug)
            print(warning, file=sys.stderr)
        db_max_phase = own_data['max_phase']
        if max_phase != db_max_phase:
            warning = (
                'WARNING! {0} has db_max_phase of {1} and children max_phase of {2}'
                .format(self.chembl_id, db_max_phase, max_phase))
            print(warning, file=sys.stderr)

        hierarchy_data = {
            'is_approved_drug': (is_drug_src and max_phase == 4),
            'is_usan': is_usan_src,
            'children': children_data
        }
        dn_dict[self.chembl_id] = {}
        put_js_path_in_dict(dn_dict[self.chembl_id], '_metadata.hierarchy',
                            hierarchy_data)

        if not self.is_family_parent():
            return dn_dict, shared_family_data, node_data

        # If root collect the shared family data into every entry gathered.
        family_inchi_connectivity_layer = get_inchi_connectivity_layer(
            own_data['inchi_key'])
        for family_member_dn in dn_dict.values():
            put_js_path_in_dict(family_member_dn,
                                '_metadata.hierarchy.all_family',
                                shared_family_data)
            put_js_path_in_dict(
                family_member_dn,
                '_metadata.hierarchy.family_inchi_connectivity_layer',
                family_inchi_connectivity_layer)
        return dn_dict, shared_family_data, node_data
Beispiel #5
0
class DocumentDenormalizationHandler(DenormalizationHandler):
    """Collects document fields for denormalization and completes document data.

    ``handle_doc`` caches two field subsets per ``document_chembl_id`` (one
    for assays, one for activities). ``get_doc_for_complete_data`` builds
    the ``_metadata`` update holding the sources linked to a document.
    """

    RESOURCE = DenormalizationHandler.AVAILABLE_RESOURCES.DOCUMENT

    # Document fields that handle_doc copies into
    # ``docs_for_activity_by_chembl_id``.
    FIELDS_FOR_ACTIVITY = ['pubmed_id', 'volume', 'year', 'first_page']

    FIELDS_FOR_ACTIVITY_MAPPING = {}

    for field_i in FIELDS_FOR_ACTIVITY:
        put_js_path_in_dict(FIELDS_FOR_ACTIVITY_MAPPING,
                            '._metadata.document_data.{0}'.format(field_i),
                            DefaultMappings.NO_INDEX_KEYWORD,
                            es_properties_style=True)

    # Document fields that handle_doc copies into
    # ``docs_for_assay_by_chembl_id``.
    FIELDS_FOR_ASSAY = [
        'journal', 'year', 'volume', 'first_page', 'last_page', 'title',
        'pubmed_id', 'doi'
    ]

    FIELDS_FOR_ASSAY_MAPPING = {}

    # Same field list as FIELDS_FOR_ASSAY, mapped under
    # ``_metadata.similar_documents``.
    FIELDS_FOR_DOC_SIM_MAPPING = {}

    for field_i in FIELDS_FOR_ASSAY:
        put_js_path_in_dict(FIELDS_FOR_ASSAY_MAPPING,
                            '._metadata.document_data.{0}'.format(field_i),
                            DefaultMappings.NO_INDEX_KEYWORD,
                            es_properties_style=True)
        put_js_path_in_dict(FIELDS_FOR_DOC_SIM_MAPPING,
                            '._metadata.similar_documents.{0}'.format(field_i),
                            DefaultMappings.NO_INDEX_KEYWORD,
                            es_properties_style=True)

    def __init__(self,
                 assay_dh: AssayDenormalizationHandler = None,
                 source_dh: SourceDenormalizationHandler = None):
        """Create the handler.

        :param assay_dh: optional assay handler; its ``document_2_src_id``
            is read by ``get_doc_for_complete_data``.
        :param source_dh: optional source handler; its ``sources_by_id`` is
            read by ``get_doc_for_complete_data``.

        The base class is initialised with ``True`` when at least one of
        the two handlers is supplied.
        """
        super().__init__(source_dh is not None or assay_dh is not None)
        self.assay_dh = assay_dh
        self.source_dh = source_dh
        self.docs_for_assay_by_chembl_id = {}
        self.docs_for_activity_by_chembl_id = {}

    def handle_doc(self, doc: dict, total_docs: int, index: int, first: bool,
                   last: bool):
        """Cache the assay and activity field subsets of ``doc``.

        :raises KeyError: if ``doc`` lacks ``document_chembl_id`` or any of
            the listed fields.
        """
        doc_chembl_id = doc['document_chembl_id']
        self.docs_for_assay_by_chembl_id[doc_chembl_id] = {
            field_i: doc[field_i] for field_i in self.FIELDS_FOR_ASSAY
        }
        self.docs_for_activity_by_chembl_id[doc_chembl_id] = {
            field_i: doc[field_i] for field_i in self.FIELDS_FOR_ACTIVITY
        }

    def get_custom_mappings_for_complete_data(self):
        """Return a SummableDict holding the source handler's METADATA_MAPPING."""
        mappings = SummableDict()
        mappings += SourceDenormalizationHandler.METADATA_MAPPING
        return mappings

    def get_doc_for_complete_data(self, doc: dict):
        """Build the ``_metadata`` update with the sources linked to ``doc``.

        Source ids come from the assay handler's ``document_2_src_id`` entry
        for the document (when an assay handler was supplied) plus the
        document's own ``src_id`` when present. Ids unknown to the source
        handler are skipped.

        :return: ``{'_metadata': {'source': [...]}}``; the list is empty
            when no source could be resolved.
        """
        # Collect into a fresh set: adding the document's own src_id to the
        # set returned by ``.get`` would modify the assay handler's stored
        # data. Also tolerate a missing assay handler, as is already done
        # for the source handler.
        src_ids = set()
        if self.assay_dh is not None:
            src_ids.update(
                self.assay_dh.document_2_src_id.get(
                    doc['document_chembl_id'], ()))
        if 'src_id' in doc:
            src_ids.add(doc['src_id'])

        sources = []
        if self.source_dh:
            sources_by_id = self.source_dh.sources_by_id
            sources = [
                sources_by_id[src_id_i] for src_id_i in src_ids
                if src_id_i in sources_by_id
            ]

        # 'source' is always written, exactly as before: the former
        # ``sources is not None`` check could never be false.
        return {'_metadata': {'source': sources}}