Exemple #1
0
def term_async_comparison(indexed_term,
                          dict_terms,
                          fuzzy=False,
                          dict_terms_trie=None,
                          threshold=0.95):
    """
    Check whether a term matches any dictionary term, first by exact equality
    and then (optionally) by levenshtein based fuzzy matching.

    params:
        indexed_term, potential term to be matched with the given dictionary terms
        dict_terms, iterable of dictionary terms used for the exact comparison
        fuzzy, when True and no exact match exists, fall back to fuzzy matching
        dict_terms_trie, trie that is passed to search() to find fuzzy
            candidates; when None an empty TrieNode is created on demand
        threshold, minimum levenshtein_similarity() score (inclusive) a fuzzy
            candidate must reach to count as a match

    return indexed term, if matched otherwise empty string
    """
    # Exact matching: stop at the first equal dictionary term instead of
    # collecting every match into a list.
    if any(indexed_term == dict_term for dict_term in dict_terms):
        return indexed_term

    if not fuzzy:
        return ""

    # Build the default trie lazily. A `TrieNode()` default in the signature
    # would be evaluated once at definition time and shared by every call.
    if dict_terms_trie is None:
        dict_terms_trie = TrieNode()

    # NOTE(review): 3 is presumably the maximum edit distance accepted by
    # search(), and each result presumably a (term, distance) pair; only the
    # first element of each result is used here - confirm against search().
    similar_terms = search(indexed_term, 3, dict_terms_trie)
    if similar_terms and any(
            levenshtein_similarity(indexed_term, similar_term[0]) >= threshold
            for similar_term in similar_terms):
        return indexed_term

    return ""
Exemple #2
0
    def load_dictionary_tagging_setting(self, config):
        """
        Read the 'DICTIONARY_TAGGER' section of the config and prepare the
        dictionary terms used for dictionary tagging.

        Sets self.dict_tagging first. When tagging is disabled (or the key is
        missing) the method returns early and none of the other attributes
        below are touched. Otherwise it sets self.dictionary_file,
        self.dict_tagger_fuzzy_matching, self.dict_tagger_sim_threshold,
        self.dict_terms (loaded from the csv file and normalised through
        self.solrClient) and self.dict_terms_trie (populated only when fuzzy
        matching is enabled).

        params:
            config, mapping indexed as config[section][key]; values of the
                boolean settings are expected to be strings ("true"/"false",
                case-insensitive)

        raises Exception, if 'dict_tagging' is present but is neither "true"
            nor "false"
        """
        try:
            self.dict_tagging = config['DICTIONARY_TAGGER']['dict_tagging']
            if "true" == self.dict_tagging.lower():
                self.dict_tagging = True
            elif "false" == self.dict_tagging.lower():
                self.dict_tagging = False
            else:
                # Report the offending value itself (the original formatted
                # an unrelated attribute here).
                raise Exception(
                    "current setting [%s] for 'dict_tagging' is not supported!"
                    % self.dict_tagging)
        except KeyError:
            self._logger.exception(
                "Oops! 'dict_tagging' is set incorrectly in config file. Default to set false"
            )
            self.dict_tagging = False

        if not self.dict_tagging:
            self._logger.info(
                "dictionary tagging is set to false. Disable dictionary tagging."
            )
            return

        self._logger.info("Dictionary tagging is enabled.")

        try:
            self.dictionary_file = config['DICTIONARY_TAGGER'][
                'dictionary_file']
        except KeyError:
            self._logger.exception(
                "Oops! 'dictionary_file' is set incorrectly in config file. Default to use default csv file in config dir."
            )
            self.dictionary_file = os.path.join(
                os.path.dirname(__file__), '..', 'config',
                'Steel-Terminology-Tata-Steel.csv')

        try:
            fuzzy_setting = config['DICTIONARY_TAGGER'][
                'dict_tagger_fuzzy_matching']
        except KeyError:
            self._logger.exception(
                "Oops! 'dict_tagger_fuzzy_matching' is set incorrectly in config file. Default to False."
            )
            self.dict_tagger_fuzzy_matching = False
        else:
            if "true" == fuzzy_setting.lower():
                self.dict_tagger_fuzzy_matching = True
            elif "false" == fuzzy_setting.lower():
                self.dict_tagger_fuzzy_matching = False
            else:
                # An unrecognised value must not be kept as a raw (truthy)
                # string, which would silently enable fuzzy matching.
                self._logger.warning(
                    "current setting [%s] for 'dict_tagger_fuzzy_matching' is not supported! Default to False.",
                    fuzzy_setting)
                self.dict_tagger_fuzzy_matching = False

        try:
            self.dict_tagger_sim_threshold = float(
                config['DICTIONARY_TAGGER']['dict_tagger_sim_threshold'])
        except (KeyError, ValueError):
            # ValueError covers a value that is present but not a number.
            self._logger.exception(
                "Oops! 'dict_tagger_sim_threshold' is set incorrectly in config file. Default to 0.95."
            )
            self.dict_tagger_sim_threshold = float(0.95)

        self.dict_terms = load_terms_from_csv(self.dictionary_file)

        self._logger.info("normalising terms from dictionary...")
        self.dict_terms = [
            self.solrClient.get_industry_term_field_analysis(dict_term)
            for dict_term in self.dict_terms
        ]
        self._logger.info(
            "dictionary terms are normalised and loaded successfully. Total dictionary term size is [%s]",
            str(len(self.dict_terms)))

        # The trie always exists; it stays empty unless fuzzy matching is on.
        self.dict_terms_trie = TrieNode()
        if self.dict_tagger_fuzzy_matching:
            self._logger.info("loading into Trie nodes for fuzzy matching...")
            for normed_term in self.dict_terms:
                self.dict_terms_trie.insert(normed_term)
            self._logger.info("loaded into Trie nodes successfully.")