Esempio n. 1
0
 def get_custom_mappings_for_complete_data(self):
     """Assemble the custom mappings by summing a fixed list of fragments.

     The fragments are added, in order, to an empty ``SummableDict``: the
     mapping constants exposed by the denormalization handlers listed
     below, followed by a keyword mapping for
     ``_metadata.activity_generated.short_data_validity_comment``.

     :return: the accumulated mappings.
     """
     activity_generated = {
         'properties': {
             'short_data_validity_comment': DefaultMappings.KEYWORD
         }
     }
     fragments = (
         AssayDenormalizationHandler.ACTIVITY_DATA_MAPPING,
         CompoundDenormalizationHandler.ACTIVITY_DATA_MAPPING,
         SourceDenormalizationHandler.METADATA_MAPPING,
         OrganismDenormalizationHandler.METADATA_MAPPING,
         CompoundRecordDenormalizationHandler.ACTIVITY_DATA_MAPPING,
         TargetDenormalizationHandler.ACTIVITY_DATA_MAPPING,
         ProteinClassDenormalizationHandler.METADATA_MAPPING,
         DocumentDenormalizationHandler.FIELDS_FOR_ACTIVITY_MAPPING,
         {
             'properties': {
                 '_metadata': {
                     'properties': {
                         'activity_generated': activity_generated
                     }
                 }
             }
         },
     )
     mappings = SummableDict()
     for fragment in fragments:
         mappings += fragment
     return mappings
Esempio n. 2
0
    def get_custom_mappings_for_complete_data(self):
        """Assemble the custom mappings, including the assay label fields.

        Sums the source and organism metadata mappings with keyword
        mappings for the three ``_metadata.assay_generated`` label fields,
        then adds ``self.document_dh_mappings`` when that attribute is
        truthy.

        :return: the accumulated mappings.
        """
        label_fields = ('confidence_label', 'relationship_label',
                        'type_label')
        assay_generated = {
            'properties': {
                field: DefaultMappings.KEYWORD
                for field in label_fields
            }
        }

        mappings = SummableDict()
        for fragment in (
                SourceDenormalizationHandler.METADATA_MAPPING,
                OrganismDenormalizationHandler.METADATA_MAPPING,
                {
                    'properties': {
                        '_metadata': {
                            'properties': {
                                'assay_generated': assay_generated
                            }
                        }
                    }
                },
        ):
            mappings += fragment

        # The document mappings are optional: only add them when present.
        document_mappings = self.document_dh_mappings
        if document_mappings:
            mappings += document_mappings
        return mappings
Esempio n. 3
0
 def scan_index(self, es_index, on_doc=None, query=None):
     """Scan ``es_index`` and hand every hit to ``on_doc``.

     A first search with ``track_total_hits`` enabled reads the total
     number of hits, which sizes the progress bar; the documents are then
     streamed with ``helpers.scan`` using the same body.

     :param es_index: name of the index to scan.
     :param on_doc: optional callable invoked for each hit as
         ``on_doc(source, doc_id, total_docs, position, is_first, is_last)``.
         A truthy return value stops the scan immediately.
     :param query: optional search body. It is copied before use, so the
         caller's dict is never modified.
     :return: ``None``. The process exits with status 1 when no
         connection is configured.
     """
     if self.es_conn is None:
         print(
             "FATAL ERROR: there is not an elastic search connection defined.",
             file=sys.stderr)
         # No exception is being handled at this point, so print_exc would
         # only emit "NoneType: None"; show the call stack instead.
         traceback.print_stack(file=sys.stderr)
         sys.exit(1)

     # Shallow copy: adding track_total_hits must not leak into the
     # dictionary owned by the caller.
     search_body = dict(query) if query else {}
     search_body['track_total_hits'] = True

     search_res = self.es_conn.search(index=es_index, body=search_body)
     total_docs = search_res['hits']['total']['value']
     # Refresh the bar roughly every 0.1% of the documents, at most every
     # 1000, and never with a step of 0 (it is used as a modulo divisor).
     update_every = max(1, min(math.ceil(total_docs * 0.001), 1000))

     scanner = helpers.scan(self.es_conn,
                            index=es_index,
                            scroll='10m',
                            query=search_body,
                            size=1000)
     count = 0
     p_bar = progress_bar_handler.get_new_progressbar(
         '{0}_es-index-scan'.format(es_index), total_docs)
     for doc_n in scanner:
         if callable(on_doc):
             should_stop = on_doc(doc_n['_source'], doc_n['_id'],
                                  total_docs, count, count == 0,
                                  count == total_docs - 1)
             # self.stop_scan is only consulted when a callback is given.
             if should_stop or self.stop_scan:
                 return
         count += 1
         if count % update_every == 0:
             p_bar.update(count)
     p_bar.finish()
Esempio n. 4
0
 def get_custom_mappings_for_complete_data(self):
     """Return mappings declaring ``_metadata.all_molecule_chembl_ids``.

     The field is mapped with ``DefaultMappings.CHEMBL_ID_REF``.

     :return: a ``SummableDict`` holding that single property mapping.
     """
     metadata_properties = {
         'all_molecule_chembl_ids': DefaultMappings.CHEMBL_ID_REF
     }
     mappings = SummableDict()
     mappings += {
         'properties': {
             '_metadata': {
                 'properties': metadata_properties
             }
         }
     }
     return mappings
 def get_all_dn_dicts(self):
     """Merge the denormalization dicts of every child into one dict.

     Each child's ``get_denormalization_dict()`` returns a 3-tuple; only
     its first element (a dict keyed by ChEMBL id) is used here. Later
     children overwrite earlier entries stored under the same key. A
     progress bar is advanced once per child.

     :return: a ``SummableDict`` with the entries of all children.
     """
     merged = SummableDict()
     progress = get_new_progressbar('built-dn-hierarchy-dict',
                                    len(self.children))
     for done, child in enumerate(self.children.values(), start=1):
         # The family data and node data are not needed at this level.
         child_dn_dict, _family_data, _node_data = \
             child.get_denormalization_dict()
         for chembl_id, dn_data in child_dn_dict.items():
             merged[chembl_id] = dn_data
         progress.update(done)
     progress.finish()
     return merged
    def save_similarity_data(self, chembl_id_1, chembl_id_2, mol_tani,
                             tid_tani):
        """Record one similarity entry between two documents.

        The entry is appended to
        ``self.document_dict[chembl_id_1][chembl_id_2]``; missing levels are
        created on the way. It combines the two similarity values with the
        data stored for ``chembl_id_2`` in
        ``self.document_dh.docs_for_assay_by_chembl_id``.

        :param chembl_id_1: key of the document owning the entry.
        :param chembl_id_2: key of the similar document.
        :param mol_tani: value stored under ``'mol_tani'``.
        :param tid_tani: value stored under ``'tid_tani'``.
        """
        per_document = self.document_dict.setdefault(chembl_id_1, {})
        entries = per_document.setdefault(chembl_id_2, [])

        similarity = SummableDict()
        similarity += {
            'document_chembl_id': chembl_id_2,
            'mol_tani': mol_tani,
            'tid_tani': tid_tani
        }
        similarity += self.document_dh.docs_for_assay_by_chembl_id[
            chembl_id_2]

        entries.append(similarity)
Esempio n. 7
0
 def get_custom_mappings_for_complete_data(self):
     """Return mappings for three ``_metadata`` fields.

     ``all_molecule_chembl_ids`` and ``parent_molecule_chembl_id`` use
     ``DefaultMappings.CHEMBL_ID_REF``; ``should_appear_in_browser`` uses
     ``DefaultMappings.BOOLEAN``.

     :return: a ``SummableDict`` holding those property mappings.
     """
     metadata_properties = {
         'all_molecule_chembl_ids': DefaultMappings.CHEMBL_ID_REF,
         'parent_molecule_chembl_id': DefaultMappings.CHEMBL_ID_REF,
         'should_appear_in_browser': DefaultMappings.BOOLEAN
     }
     mappings = SummableDict()
     mappings += {
         'properties': {
             '_metadata': {
                 'properties': metadata_properties
             }
         }
     }
     return mappings
    def save_denormalization(self):
        """Persist the collected similar-documents data.

        First warns on stderr about every document pair that does not hold
        exactly one similarity entry, then delegates to
        ``save_denormalization_dict`` with an update builder and the extra
        mappings required by ``_metadata.similar_documents``.
        """
        for doc_id, related in self.document_dict.items():
            for other_id, entries in related.items():
                if len(entries) == 1:
                    continue
                print('WARNING FOUND DUPLICATE DATA FOR:',
                      doc_id,
                      other_id,
                      entries,
                      file=sys.stderr)

        def get_update_script_and_size(es_doc_id, es_doc):
            # Only the first entry of each pair is written.
            similar_docs = [entries[0] for entries in es_doc.values()]
            update_doc = {'_metadata': {'similar_documents': similar_docs}}
            return update_doc, len(similar_docs) * 10

        similar_documents_properties = {
            'document_chembl_id': DefaultMappings.CHEMBL_ID_REF,
            'mol_tani': DefaultMappings.FLOAT,
            'tid_tani': DefaultMappings.FLOAT
        }
        new_mappings = SummableDict()
        new_mappings += {
            'properties': {
                '_metadata': {
                    'properties': {
                        'similar_documents': {
                            'properties': similar_documents_properties
                        }
                    }
                }
            }
        }
        new_mappings += DocumentDenormalizationHandler.FIELDS_FOR_DOC_SIM_MAPPING

        self.save_denormalization_dict(
            DenormalizationHandler.AVAILABLE_RESOURCES.DOCUMENT,
            self.document_dict,
            get_update_script_and_size,
            new_mappings=new_mappings)
Esempio n. 9
0
def _recursive_simplify_es_properties(cur_dict: dict, cur_prefix: str):
    """Flatten a nested properties dict into dotted-path keys.

    For every ``key -> value`` pair of ``cur_dict`` the dotted path is
    ``cur_prefix.key`` (or just ``key`` when the prefix is empty):

    * a dict containing ``'es_mapping_leaf'`` is stored as
      ``simplify_single_mapping(value)``;
    * any other dict is stored as a non-aggregatable, non-sortable
      ``'object'`` entry and its content is flattened recursively;
    * a truthy non-dict value is stored unchanged; falsy ones are skipped.

    :param cur_dict: the (sub)tree of properties to flatten.
    :param cur_prefix: dotted path of ``cur_dict`` itself, possibly empty.
    :return: a ``SummableDict`` mapping dotted paths to their descriptions.
    """
    flattened = SummableDict()
    for name, spec in cur_dict.items():
        path = name if not cur_prefix else '{0}.{1}'.format(cur_prefix, name)

        if not isinstance(spec, dict):
            if spec:
                flattened[path] = spec
            continue

        if 'es_mapping_leaf' in spec:
            flattened[path] = simplify_single_mapping(spec)
            continue

        flattened[path] = {
            'type': 'object',
            'aggregatable': False,
            'sortable': False
        }
        flattened += _recursive_simplify_es_properties(spec, path)

    return flattened
Esempio n. 10
0
 def get_custom_mappings_for_complete_data(self):
     """Assemble the custom mappings, including ``compound_generated``.

     Sums ``unichem_helper.UNICHEM_MAPPING``, the ATC class metadata
     mapping and keyword mappings for the three
     ``_metadata.compound_generated`` fields.

     :return: the accumulated mappings.
     """
     compound_generated = {
         'properties': {
             'availability_type_label': DefaultMappings.KEYWORD,
             'chirality_label': DefaultMappings.KEYWORD,
             'image_file': DefaultMappings.KEYWORD
         }
     }
     fragments = (
         unichem_helper.UNICHEM_MAPPING,
         ATCClassDenormalizationHandler.METADATA_MAPPING,
         {
             'properties': {
                 '_metadata': {
                     'properties': {
                         'compound_generated': compound_generated
                     }
                 }
             }
         },
     )
     mappings = SummableDict()
     for fragment in fragments:
         mappings += fragment
     return mappings
    def save_denormalization(self):
        """Persist ``self.target_dict`` through ``save_denormalization_dict``.

        Each update writes the stored document under
        ``_metadata.target_component`` together with the result of
        ``self.get_protein_classifications(doc_id)`` under
        ``_metadata.protein_classification``. The mappings passed along are
        this handler's ``METADATA_MAPPING`` plus the protein class one.
        """
        extra_mappings = SummableDict()
        for fragment in (self.METADATA_MAPPING,
                         ProteinClassDenormalizationHandler.METADATA_MAPPING):
            extra_mappings += fragment

        def get_update_script_and_size(doc_id, doc):
            update_size = len(doc) * 20
            metadata = {
                'target_component': doc,
                'protein_classification':
                self.get_protein_classifications(doc_id)
            }
            return {'_metadata': metadata}, update_size

        self.save_denormalization_dict(
            DenormalizationHandler.AVAILABLE_RESOURCES.TARGET,
            self.target_dict, get_update_script_and_size, extra_mappings)
Esempio n. 12
0
 def get_new_index_mappings():
     """Build the mappings of the generated index.

     ``parent_molecule`` reuses the molecule mapping read from ES.
     ``drug_indication`` reuses the drug indication mapping with
     ``efo_term`` and ``efo_id`` removed and an ``efo`` object
     (``term`` / ``id``) added in their place.

     :return: a dict with a top-level ``'properties'`` key.
     """
     # The molecule mapping is fetched first, as in the nested literal form.
     molecule_properties = MOLECULE.get_resource_mapping_from_es()

     indication_properties = SummableDict(
         **DRUG_INDICATION.get_resource_mapping_from_es())
     indication_properties = indication_properties - ['efo_term', 'efo_id']
     indication_properties = indication_properties + {
         'efo': {
             'properties': {
                 'term':
                 DefaultMappings.LOWER_CASE_KEYWORD +
                 DefaultMappings.TEXT_STD,
                 'id':
                 DefaultMappings.ID
             }
         }
     }

     return {
         'properties': {
             'parent_molecule': {
                 'properties': molecule_properties
             },
             'drug_indication': {
                 'properties': indication_properties
             }
         }
     }
Esempio n. 13
0
 def get_custom_mappings_for_complete_data(self):
     """Return keyword mappings for the ``_metadata.compound_data`` fields.

     Covers the image file and preferred name of the drug, the metabolite
     and the substrate, all mapped with ``DefaultMappings.KEYWORD``.

     :return: a ``SummableDict`` holding those property mappings.
     """
     compound_data = {
         'properties': {
             'drug_image_file': DefaultMappings.KEYWORD,
             'metabolite_image_file': DefaultMappings.KEYWORD,
             'substrate_image_file': DefaultMappings.KEYWORD,
             'drug_pref_name': DefaultMappings.KEYWORD,
             'metabolite_pref_name': DefaultMappings.KEYWORD,
             'substrate_pref_name': DefaultMappings.KEYWORD
         }
     }
     mappings = SummableDict()
     mappings += {
         'properties': {
             '_metadata': {
                 'properties': {
                     'compound_data': compound_data
                 }
             }
         }
     }
     return mappings
Esempio n. 14
0
def get_config_for_prop(index_name, prop_id):
    """Return the configuration of property ``prop_id`` of ``index_name``.

    The result is looked up in ``cache`` first. On a miss it is built from
    the simplified ES mapping of the index and from the YAML override file
    (``settings.PROPERTIES_CONFIG_OVERRIDE_FILE``), stored in the cache for
    ``CACHE_TIME`` and returned. Four cases are distinguished:

    * in ES only: the ES description;
    * in the override only (virtual property): either based on another ES
      property (``based_on``) or contextual, in which case the override
      must define ``type``, ``aggregatable`` and ``sortable``;
    * in both: the ES description with the override applied on top;
    * in neither: an error.

    :param index_name: name of the index owning the property.
    :param prop_id: identifier of the property.
    :return: the configuration (a ``SummableDict`` when freshly built,
        otherwise whatever the cache returns).
    :raises ESPropsConfigurationManagerError: if the index is unknown, the
        property is found nowhere, a virtual property is based on a missing
        property, or a contextual property is incompletely described.
    """
    cache_key = 'property_config-{index_name}-{prop_id}'.format(
        index_name=index_name, prop_id=prop_id)
    cache_response = cache.get(cache_key)
    if cache_response is not None:
        return cache_response

    index_mapping = resources_description.RESOURCES_BY_IDX_NAME.get(index_name)
    if index_mapping is None:
        raise ESPropsConfigurationManagerError(
            "The index {} does not exist!".format(index_name))

    simplified_mapping = index_mapping.get_simplified_mapping_from_es()
    es_property_description = simplified_mapping.get(prop_id)

    found_in_es = es_property_description is not None
    if not found_in_es:
        es_property_description = {}

    # Search for description in override.
    # The file is read inside a ``with`` block so the handle is always
    # closed, even when parsing fails.
    # NOTE(review): FullLoader can construct more than plain data types;
    # yaml.safe_load would be the conservative choice if this file could
    # ever come from an untrusted source.
    with open(settings.PROPERTIES_CONFIG_OVERRIDE_FILE, 'r') as override_file:
        config_override = yaml.load(override_file, Loader=yaml.FullLoader)

    property_override_description = None
    found_in_override = False
    if config_override is not None:
        index_override = config_override.get(index_name)
        if index_override is not None:
            property_override_description = index_override.get(prop_id)
            found_in_override = property_override_description is not None

    config = {}
    if not found_in_es and not found_in_override:
        raise ESPropsConfigurationManagerError(
            "The property {} does not exist in elasticsearch or as virtual property"
            .format(prop_id))

    elif found_in_es and not found_in_override:
        # this is a normal property WITHOUT override

        config = SummableDict({
            'index_name': index_name,
            'prop_id': prop_id,
        })
        config += SummableDict(es_property_description)

    elif not found_in_es and found_in_override:
        # this is a virtual property
        config = SummableDict({
            'index_name': index_name,
            'prop_id': prop_id,
            'is_virtual': True
        })

        based_on = property_override_description.get('based_on')
        if based_on is not None:
            # Virtual property derived from a real ES property: start from
            # the description of the property it is based on.
            config['is_contextual'] = False
            base_description = simplified_mapping.get(based_on)
            if base_description is None:
                raise ESPropsConfigurationManagerError(
                    'The virtual property {prop_id} is based on {based_on} which does not exist in elasticsearch '
                    'index {index_name}'.format(prop_id=prop_id,
                                                based_on=based_on,
                                                index_name=index_name))
            config += SummableDict(base_description)

        else:
            # Contextual property: nothing in ES describes it, so the
            # override has to provide the basic description itself.
            config['is_contextual'] = True
            if property_override_description.get('aggregatable') is None or \
                            property_override_description.get('type') is None or \
                            property_override_description.get('sortable') is None:
                raise ESPropsConfigurationManagerError(
                    'A contextual property must define the type and if it is '
                    'aggregatable and sortable. index => {} : prop => {}'.
                    format(index_name, prop_id))

        config += property_override_description

    elif found_in_es and found_in_override:
        # this is a normal overridden property
        config = SummableDict({
            'index_name': index_name,
            'prop_id': prop_id,
        })
        config += SummableDict(es_property_description)
        config += property_override_description

    cache.set(cache_key, config, CACHE_TIME)
    return config
 def get_custom_mappings_for_complete_data(self):
     """Return an empty ``SummableDict``: no custom mappings are added."""
     return SummableDict()
Esempio n. 16
0
 def get_custom_mappings_for_complete_data(self):
     """Sum the related-entities mapping and the organism metadata mapping.

     :return: ``self.RELATED_ENTITIES_DYNAMIC_MAPPING`` followed by
         ``OrganismDenormalizationHandler.METADATA_MAPPING``, accumulated
         into a fresh ``SummableDict``.
     """
     fragments = (
         self.RELATED_ENTITIES_DYNAMIC_MAPPING,
         OrganismDenormalizationHandler.METADATA_MAPPING,
     )
     mappings = SummableDict()
     for fragment in fragments:
         mappings += fragment
     return mappings
Esempio n. 17
0
class DefaultMappings(object):
    """Reusable Elasticsearch analysis settings and field mapping constants.

    The class is used as a namespace: everything is a class attribute
    built from ``SummableDict`` pieces combined with ``+``. Attributes whose
    name starts with two underscores are private building blocks (their
    names are mangled by Python); the public constants are the composed
    mappings such as ``KEYWORD``, ``ID``, ``TEXT_STD`` or ``FLOAT``.
    """

    # Character filters referenced by name from the analyzers below.
    COMMON_CHAR_FILTERS = SummableDict(
        alphanumeric_and_space_char_filter={
            'type': 'pattern_replace',
            # Raw string so that ``\s`` reaches the regex engine untouched
            # instead of being an invalid Python escape sequence.
            'pattern': r'[^a-zA-Z0-9\s]',
            'replacement': ''
        },
        vitamin_char_filter={
            'type': 'pattern_replace',
            'pattern': r'(?:\b(?:vitamin|vit))\s+([a-z])(?:\s*(\d+))?',
            'flags': 'CASE_INSENSITIVE',
            'replacement': 'vitamin_$1$2'
        })

    # Token filters referenced by name from the analyzers below.
    COMMON_FILTERS = SummableDict(
        greek_synonym_filter={
            'type': 'synonym',
            'synonyms_path': 'synonyms/greek_letters_synonyms.txt'
        },
        english_stop={
            'type': 'stop',
            'stopwords': '_english_'
        },
        english_keywords={
            'type': 'keyword_marker',
            'keywords': ['example']
        },
        english_stemmer={
            'type': 'stemmer',
            'language': 'english'
        },
        english_possessive_stemmer={
            'type': 'stemmer',
            'language': 'possessive_english'
        },
        large_id_ref_filter={
            'type': 'limit',
            'max_token_count': 10**6
        })

    # Custom analyzers; they refer to the filters above by name.
    COMMON_ANALYZERS = SummableDict(
        greek_syn_std_analyzer={
            'type': 'custom',
            'tokenizer': 'standard',
            'filter': ['greek_synonym_filter', 'lowercase'],
            'char_filter': 'vitamin_char_filter'
        },
        greek_syn_eng_analyzer={
            'type': 'custom',
            'tokenizer': 'standard',
            'filter': [
                'greek_synonym_filter',
                # English analyzer based on:
                # https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-lang-analyzer.html#english-analyzer
                'english_possessive_stemmer',
                'lowercase',
                'english_stop',
                'english_keywords',
                'english_stemmer'
            ],
            'char_filter': 'vitamin_char_filter'
        },
        lowercase_keyword={
            'type': 'custom',
            'tokenizer': 'keyword',
            'filter': 'lowercase'
        },
        alphanumeric_lowercase_keyword={
            'type': 'custom',
            'tokenizer': 'keyword',
            'filter': 'lowercase',
            'char_filter': 'alphanumeric_and_space_char_filter'
        },
        whitespace_alphanumeric_lowercase_std_analyzer={
            'type': 'custom',
            'tokenizer': 'whitespace',
            'filter': ['lowercase'],
            'char_filter':
            ['vitamin_char_filter', 'alphanumeric_and_space_char_filter']
        },
        whitespace_alphanumeric_std_no_limit_analyzer={
            'type': 'custom',
            'tokenizer': 'whitespace',
            'filter': ['large_id_ref_filter'],
            'char_filter': ['alphanumeric_and_space_char_filter']
        })

    # The three groups above bundled under the keys 'char_filter',
    # 'analyzer' and 'filter'.
    COMMON_ANALYSIS = SummableDict(char_filter=COMMON_CHAR_FILTERS,
                                   analyzer=COMMON_ANALYZERS,
                                   filter=COMMON_FILTERS)

    # Indexing mappings
    __NO_INDEX = SummableDict(index=False)
    __DO_INDEX = SummableDict(index=True)
    __IGNORE_ABOVE = SummableDict(ignore_above=250)

    # Basic Types
    __TEXT_TYPE_NO_OFFSETS = SummableDict(type='text')
    __TEXT_TYPE = SummableDict(type='text',
                               term_vector='with_positions_offsets')
    __KEYWORD_TYPE = SummableDict(type='keyword')
    COMPLETION_TYPE = SummableDict(type='completion',
                                   analyzer='greek_syn_std_analyzer')
    BYTE = SummableDict(type='byte')
    SHORT = SummableDict(type='short')
    INTEGER = SummableDict(type='integer')
    LONG = SummableDict(type='long')
    FLOAT = SummableDict(type='float')
    DOUBLE = SummableDict(type='double')
    BOOLEAN = SummableDict(type='boolean')
    DATE_Y_M_D = SummableDict(type='date', format='yyyy-MM-dd')

    # Text Field Types
    # Each block below contributes one entry to the 'fields' key of a
    # mapping (a named sub-field).

    __KEYWORD_FIELD = SummableDict(
        fields={'keyword': __KEYWORD_TYPE + __DO_INDEX})

    __ALPHANUMERIC_LOWERCASE_KEYWORD = SummableDict(
        fields={
            'alphanumeric_lowercase_keyword': __TEXT_TYPE + __DO_INDEX + {
                'analyzer': 'alphanumeric_lowercase_keyword'
            }
        })

    __LOWER_CASE_KEYWORD_FIELD = SummableDict(
        fields={
            'lower_case_keyword': __TEXT_TYPE + __DO_INDEX + {
                'analyzer': 'lowercase_keyword'
            }
        })

    # The id-like sub-fields reuse the 'keyword' sub-field definition under
    # a different sub-field name.
    __ID_FIELD = SummableDict(
        fields={'entity_id': __KEYWORD_FIELD['fields']['keyword']})

    __ID_REF_FIELD = SummableDict(
        fields={'id_reference': __KEYWORD_FIELD['fields']['keyword']})

    __CHEMBL_ID_FIELD = SummableDict(
        fields={'chembl_id': __KEYWORD_FIELD['fields']['keyword']})

    __CHEMBL_ID_REF_FIELD = SummableDict(
        fields={
            'chembl_id_reference': __CHEMBL_ID_FIELD['fields']['chembl_id']
        })

    __STD_ANALYZED_FIELD = SummableDict(
        fields={
            'std_analyzed': __TEXT_TYPE + __DO_INDEX + {
                'analyzer': 'greek_syn_std_analyzer'
            }
        })
    __ENG_ANALYZED_FIELD = SummableDict(
        fields={
            'eng_analyzed': __TEXT_TYPE + __DO_INDEX + {
                'analyzer': 'greek_syn_eng_analyzer'
            }
        })
    __WS_ANALYZED_FIELD = SummableDict(
        fields={
            'ws_analyzed': __TEXT_TYPE + __DO_INDEX + {
                'analyzer': 'whitespace_alphanumeric_lowercase_std_analyzer'
            }
        })
    __ALT_NAME_ANALYZED_FIELD = SummableDict(
        fields={
            'alt_name_analyzed': __TEXT_TYPE + __DO_INDEX + {
                'analyzer': 'greek_syn_eng_analyzer'
            }
        })
    # The pref-name and title sub-fields share the alt-name definition.
    __PREF_NAME_ANALYZED_FIELD = SummableDict(
        fields={
            'pref_name_analyzed':
            __ALT_NAME_ANALYZED_FIELD['fields']['alt_name_analyzed']
        })

    __TITLE_ANALYZED_FIELD = SummableDict(fields={
        'title_analyzed':
        __ALT_NAME_ANALYZED_FIELD['fields']['alt_name_analyzed']
    })

    # Chemical structure field types

    __SUBSTRUCTURE_FIELD = SummableDict(
        fields={
            "substructure": {
                "type": "structure_fingerprint",
                "aromaticity_mode": "preserve"
            }
        })

    __SIMILARITY_FIELD = SummableDict(
        fields={
            "similarity": {
                "type": "similarity_fingerprint",
                "aromaticity_mode": "preserve"
            }
        })

    # Properties MAPPINGS

    NO_INDEX_KEYWORD = __NO_INDEX + __KEYWORD_TYPE
    NO_INDEX_TEXT_NO_OFFSETS = __NO_INDEX + __TEXT_TYPE_NO_OFFSETS

    # KEYWORD FIELDS indexation for the field itself (Aggregatable fields)

    KEYWORD = __DO_INDEX + __KEYWORD_TYPE + __KEYWORD_FIELD + __ALPHANUMERIC_LOWERCASE_KEYWORD

    LOWER_CASE_KEYWORD = __DO_INDEX + __KEYWORD_TYPE + __LOWER_CASE_KEYWORD_FIELD + __ALPHANUMERIC_LOWERCASE_KEYWORD

    ID = __DO_INDEX + __KEYWORD_TYPE + __ID_FIELD + __ALPHANUMERIC_LOWERCASE_KEYWORD

    ID_REF = __DO_INDEX + __KEYWORD_TYPE + __ID_REF_FIELD + __ALPHANUMERIC_LOWERCASE_KEYWORD

    CHEMBL_ID = __DO_INDEX + __KEYWORD_TYPE + __CHEMBL_ID_FIELD + __ALPHANUMERIC_LOWERCASE_KEYWORD

    CHEMBL_ID_REF = __DO_INDEX + __KEYWORD_TYPE + __CHEMBL_ID_REF_FIELD + __ALPHANUMERIC_LOWERCASE_KEYWORD

    CHEMBL_ID_REF_AS_WS = __DO_INDEX + __TEXT_TYPE_NO_OFFSETS + \
        {
          'analyzer': 'whitespace_alphanumeric_std_no_limit_analyzer'
        }

    # CHEMICAL FIELDS

    CHEM_STRUCT_FIELD = __DO_INDEX + __KEYWORD_TYPE + __IGNORE_ABOVE + __SUBSTRUCTURE_FIELD + __SIMILARITY_FIELD

    # TEXT FIELDS indexation for the field itself (Aggregatable)

    TEXT_STD = __DO_INDEX + __KEYWORD_TYPE + __IGNORE_ABOVE + __STD_ANALYZED_FIELD + __ENG_ANALYZED_FIELD + \
        __WS_ANALYZED_FIELD

    PREF_NAME = __DO_INDEX + __KEYWORD_TYPE + __IGNORE_ABOVE + __STD_ANALYZED_FIELD + __ENG_ANALYZED_FIELD + \
        __WS_ANALYZED_FIELD + __PREF_NAME_ANALYZED_FIELD

    ALT_NAME = __DO_INDEX + __KEYWORD_TYPE + __IGNORE_ABOVE + __STD_ANALYZED_FIELD + __ENG_ANALYZED_FIELD + \
        __WS_ANALYZED_FIELD + __ALT_NAME_ANALYZED_FIELD

    TITLE = __DO_INDEX + __KEYWORD_TYPE + __IGNORE_ABOVE + __STD_ANALYZED_FIELD + __ENG_ANALYZED_FIELD + \
        __WS_ANALYZED_FIELD + __TITLE_ANALYZED_FIELD

    # Groups of mappings, and the sets of ES type names derived from them.
    NUMERIC_MAPPINGS = [BYTE, SHORT, INTEGER, LONG, FLOAT, DOUBLE]
    INTEGER_NUMERIC_MAPPINGS = [BYTE, SHORT, INTEGER, LONG]
    AGGREGATABLE_MAPPINGS = [
        BOOLEAN,
        # Numeric Types
        BYTE,
        SHORT,
        INTEGER,
        LONG,
        FLOAT,
        DOUBLE,
        # Text Types
        KEYWORD,
        LOWER_CASE_KEYWORD,
        ID_REF,
        CHEMBL_ID_REF,
        ID,
        CHEMBL_ID,
        # Chemical types
        CHEM_STRUCT_FIELD
    ]

    NUMERIC_TYPES = {desc_i['type'] for desc_i in NUMERIC_MAPPINGS}
    INTEGER_NUMERIC_TYPES = {
        desc_i['type']
        for desc_i in INTEGER_NUMERIC_MAPPINGS
    }
    AGGREGATABLE_TYPES = {desc_i['type'] for desc_i in AGGREGATABLE_MAPPINGS}

    # Simplified type name -> set of ES type names it stands for.
    # NOTE(review): 'string-es-interal' looks like a typo for
    # 'string-es-internal'; the key is left unchanged because it is a
    # runtime value that other code may depend on.
    SIMPLE_MAPPINGS = {
        'string': {__KEYWORD_TYPE['type'], __TEXT_TYPE['type']},
        'string-es-interal': {
            COMPLETION_TYPE['type'],
        },
        'double': {FLOAT['type'], DOUBLE['type']},
        'integer': INTEGER_NUMERIC_TYPES,
        'boolean': {BOOLEAN['type']},
        'date': {
            DATE_Y_M_D['type'],
        }
    }
    # Reverse lookup: ES type name -> simplified type name. Built with a
    # comprehension so that no loop variable (in particular one named
    # ``type``) is left behind as a class attribute.
    SIMPLE_MAPPINGS_REVERSE = {
        es_type: simple_type
        for simple_type, es_types in SIMPLE_MAPPINGS.items()
        for es_type in es_types
    }

    # Enable/Disable
    ENABLE = SummableDict(enabled=True)
    DISABLE = SummableDict(enabled=False)
Esempio n. 18
0
 def get_custom_mappings_for_complete_data(self):
     """Return the organism metadata mapping summed into a new dict.

     :return: a fresh ``SummableDict`` to which
         ``OrganismDenormalizationHandler.METADATA_MAPPING`` was added.
     """
     organism_mapping = OrganismDenormalizationHandler.METADATA_MAPPING
     mappings = SummableDict()
     mappings += organism_mapping
     return mappings
Esempio n. 19
0
    def save_denormalization_for_new_index(self):
        """Recreate the generated index and index one document per group.

        The index of ``self.generated_resource`` is deleted and created
        again. Then, for every group in ``self.drug_inds_by_grouping_id``,
        a document is built from the first drug indication of the group
        (re-read from ES) with:

        * ``efo_term`` / ``efo_id`` replaced by an ``efo`` list collecting
          the id/term pairs of all group members,
        * ``max_phase_for_ind`` set to the highest value in the group,
        * ``indication_refs`` set to the concatenation of all members' refs,

        and stored next to the parent molecule document. All documents are
        finally handed to ``save_denormalization_dict`` with
        ``do_index=True``.
        """
        es_util.delete_idx(self.generated_resource.idx_name)
        es_util.create_idx(
            self.generated_resource.idx_name,
            3,
            1,
            analysis=DefaultMappings.COMMON_ANALYSIS,
            mappings=DrugIndicationDenormalizationHandler.
            get_new_index_mappings())

        generated_docs = {}
        groups = self.drug_inds_by_grouping_id

        print('{0} GROUPED RECORDS WERE FOUND'.format(len(groups)),
              file=sys.stderr)
        p_bar = progress_bar_handler.get_new_progressbar(
            'drug_inds_by_parent-dn-generation', len(groups))

        for done, members in enumerate(groups.values(), start=1):
            first_member = members[0]

            # Aggregate the values spread over the members of the group.
            efo_term_by_id = {}
            all_refs = []
            top_phase = 0
            for member in members:
                top_phase = max(top_phase,
                                member.get('max_phase_for_ind', 0))

                member_efo_id = member.get('efo_id', None)
                if member_efo_id is not None:
                    efo_term_by_id[member_efo_id] = member.get(
                        'efo_term', None)

                all_refs += member.get('indication_refs', [])

            # Only the parent molecule id is needed from the grouping id.
            parent_chembl_id, _mesh_id = self.get_drug_ind_grouping_id_parts(
                first_member)

            indication = SummableDict(
                **DRUG_INDICATION.get_doc_by_id_from_es(
                    first_member['drugind_id']))
            indication -= ['efo_term', 'efo_id']
            indication['efo'] = [{
                'id': efo_id,
                'term': term
            } for efo_id, term in efo_term_by_id.items()]
            indication['max_phase_for_ind'] = top_phase
            indication['indication_refs'] = all_refs

            generated_doc = {
                'parent_molecule':
                MOLECULE.get_doc_by_id_from_es(parent_chembl_id),
                'drug_indication': indication
            }
            generated_docs[self.generated_resource.get_doc_id(
                generated_doc)] = generated_doc

            p_bar.update(done)
        p_bar.finish()

        self.save_denormalization_dict(
            self.generated_resource,
            generated_docs,
            DenormalizationHandler.default_update_script_and_size,
            do_index=True)
Esempio n. 20
0
    def get_denormalization_dict(self):
        """Build the hierarchy denormalization data of this node's subtree.

        Recurses into ``self.children``, accumulating their denormalization
        entries, their family data and their node data, and derives the
        drug-source / USAN flags and the maximum phase over the node and its
        direct children. Two consistency warnings may be printed to stderr.
        When the node is the family parent, the shared family data and the
        family InChI connectivity layer are written into every entry.

        :return: a 3-tuple ``(dn_dict, shared_family_data, node_data)``:
            the entries keyed by ChEMBL id, the list of family member
            descriptions for this subtree, and ``self.get_node_data()``.
        """
        dn_dict = SummableDict()
        own_data = self.compound_data
        is_drug_src = own_data['is_drug_src']
        is_usan_src = own_data['is_usan_src']
        is_db_drug = own_data['is_db_drug']
        max_phase = own_data['max_phase']

        own_family_entry = {
            'chembl_id': self.chembl_id,
            'inchi': own_data['inchi'],
            'inchi_connectivity_layer':
            get_inchi_connectivity_layer(own_data['inchi_key']),
            'inchi_key': own_data['inchi_key']
        }
        shared_family_data = [own_family_entry]

        node_data = self.get_node_data()
        children_data = []

        for child in self.children.values():
            child_data = child.compound_data
            is_drug_src |= child_data['is_drug_src']
            is_usan_src |= child_data['is_usan_src']
            max_phase = max(max_phase, child_data['max_phase'])

            child_dn, child_family, child_node_data = \
                child.get_denormalization_dict()
            children_data.append(child_node_data)
            # Point the child's own entry back at this node before merging.
            put_js_path_in_dict(
                child_dn, child.chembl_id + '._metadata.hierarchy.parent',
                node_data)
            dn_dict += child_dn
            shared_family_data += child_family

        # Warning checks!
        if is_db_drug:
            expected_db_drug = ((is_usan_src or is_drug_src)
                                and self.is_family_parent())
            if is_db_drug != expected_db_drug:
                print(
                    'WARNING! {0} has db_drug {1} and sources_drug {2}'.format(
                        self.chembl_id, is_db_drug,
                        (is_usan_src or is_drug_src)),
                    file=sys.stderr)
        if max_phase != own_data['max_phase']:
            print(
                'WARNING! {0} has db_max_phase of {1} and children max_phase of {2}'
                .format(self.chembl_id, own_data['max_phase'], max_phase),
                file=sys.stderr)

        own_entry = {}
        dn_dict[self.chembl_id] = own_entry
        hierarchy = {
            'is_approved_drug': (is_drug_src and max_phase == 4),
            'is_usan': is_usan_src,
            'children': children_data
        }
        put_js_path_in_dict(dn_dict[self.chembl_id], '_metadata.hierarchy',
                            hierarchy)

        # If root collect the shared family data
        if self.is_family_parent():
            family_layer = get_inchi_connectivity_layer(
                own_data['inchi_key'])
            for entry in dn_dict.values():
                put_js_path_in_dict(entry, '_metadata.hierarchy.all_family',
                                    shared_family_data)
                put_js_path_in_dict(
                    entry,
                    '_metadata.hierarchy.family_inchi_connectivity_layer',
                    family_layer)

        return dn_dict, shared_family_data, node_data