def term_async_comparison(indexed_term, dict_terms, fuzzy=False,
                          dict_terms_trie=None, threshold=0.95):
    """Check whether ``indexed_term`` matches any dictionary term.

    An exact (``==``) comparison against every entry of ``dict_terms`` is
    tried first. If that fails and ``fuzzy`` is true, candidates are looked
    up in ``dict_terms_trie`` via ``search`` and accepted when their
    ``levenshtein_similarity`` to ``indexed_term`` reaches ``threshold``.

    params:
        indexed_term, potential term to be matched with the dictionary terms
        dict_terms, iterable of dictionary terms used for exact matching
        fuzzy, when True fall back to trie based fuzzy matching
        dict_terms_trie, trie holding the dictionary terms for fuzzy lookup;
            when omitted a fresh empty ``TrieNode`` is used
        threshold, minimum similarity required for a fuzzy match

    return indexed term (not the dictionary term), if matched
        otherwise empty string
    """
    # Short-circuit on the first exact hit instead of collecting all of them.
    if any(indexed_term == dict_term for dict_term in dict_terms):
        return indexed_term

    if not fuzzy:
        return ""

    # Build the default trie lazily: a ``TrieNode()`` default argument would
    # be created once at definition time and shared by every call.
    if dict_terms_trie is None:
        dict_terms_trie = TrieNode()

    # NOTE(review): 3 is presumably the maximum edit distance accepted by
    # ``search`` -- confirm against its definition.
    similar_terms = search(indexed_term, 3, dict_terms_trie)
    if not similar_terms:
        return ""

    # Each search result is indexed with [0] to obtain the candidate term.
    has_close_match = any(
        levenshtein_similarity(indexed_term, similar_term[0]) >= threshold
        for similar_term in similar_terms
    )
    return indexed_term if has_close_match else ""
def load_dictionary_tagging_setting(self, config):
    """Load the dictionary tagger settings from the 'DICTIONARY_TAGGER' section.

    Sets ``self.dict_tagging`` first; when it is False the method returns
    immediately and none of the other attributes are touched. Otherwise it
    sets ``self.dictionary_file``, ``self.dict_tagger_fuzzy_matching``,
    ``self.dict_tagger_sim_threshold``, ``self.dict_terms`` (terms loaded
    with ``load_terms_from_csv`` and normalised through
    ``self.solrClient.get_industry_term_field_analysis``) and
    ``self.dict_terms_trie`` (populated only when fuzzy matching is on).

    params:
        config, mapping of section name to a mapping of option name to
            string value

    raises:
        Exception, if 'dict_tagging' is present but is neither "true" nor
            "false" (case-insensitive)
    """
    try:
        self.dict_tagging = config['DICTIONARY_TAGGER']['dict_tagging']
        dict_tagging_flag = self.dict_tagging.lower()
        if dict_tagging_flag == "true":
            self.dict_tagging = True
        elif dict_tagging_flag == "false":
            self.dict_tagging = False
        else:
            # Report the offending value itself; the attribute previously
            # formatted here (self.tagging) is never set by this method.
            raise Exception(
                "current setting [%s] for 'dict_tagging' is not supported!"
                % self.dict_tagging)
    except KeyError:
        self._logger.exception(
            "Oops! 'dict_tagging' is set incorrectly in config file. Default to set false"
        )
        self.dict_tagging = False

    if not self.dict_tagging:
        self._logger.info(
            "dictionary tagging is set to false. Disable dictionary tagging."
        )
        return

    self._logger.info("Dictionary tagging is enabled.")

    try:
        self.dictionary_file = config['DICTIONARY_TAGGER']['dictionary_file']
    except KeyError:
        self._logger.exception(
            "Oops! 'dictionary_file' is set incorrectly in config file. Default to use default csv file in config dir."
        )
        self.dictionary_file = os.path.join(
            os.path.dirname(__file__), '..', 'config',
            'Steel-Terminology-Tata-Steel.csv')

    try:
        fuzzy_setting = config['DICTIONARY_TAGGER'][
            'dict_tagger_fuzzy_matching']
    except KeyError:
        self._logger.exception(
            "Oops! 'dict_tagger_fuzzy_matching' is set incorrectly in config file. Default to False."
        )
        self.dict_tagger_fuzzy_matching = False
    else:
        fuzzy_flag = fuzzy_setting.lower()
        if fuzzy_flag == "true":
            self.dict_tagger_fuzzy_matching = True
        elif fuzzy_flag == "false":
            self.dict_tagger_fuzzy_matching = False
        else:
            # An unrecognised value used to stay a (truthy) raw string and
            # silently switch fuzzy matching on; fall back to False instead.
            self._logger.warning(
                "current setting [%s] for 'dict_tagger_fuzzy_matching' is not supported. Default to False.",
                fuzzy_setting)
            self.dict_tagger_fuzzy_matching = False

    try:
        self.dict_tagger_sim_threshold = float(
            config['DICTIONARY_TAGGER']['dict_tagger_sim_threshold'])
    except (KeyError, ValueError):
        # ValueError covers a value that is present but not a valid number.
        self._logger.exception(
            "Oops! 'dict_tagger_sim_threshold' is set incorrectly in config file. Default to 0.95."
        )
        self.dict_tagger_sim_threshold = float(0.95)

    self.dict_terms = load_terms_from_csv(self.dictionary_file)
    self._logger.info("normalising terms from dictionary...")
    self.dict_terms = [
        self.solrClient.get_industry_term_field_analysis(dict_term)
        for dict_term in self.dict_terms
    ]
    self._logger.info(
        "dictionary terms are normalised and loaded successfully. Total dictionary term size is [%s]",
        str(len(self.dict_terms)))

    # The trie always exists; it is only filled when fuzzy matching is on.
    self.dict_terms_trie = TrieNode()
    if self.dict_tagger_fuzzy_matching:
        self._logger.info("loading into Trie nodes for fuzzy matching...")
        for normed_term in self.dict_terms:
            self.dict_terms_trie.insert(normed_term)
        self._logger.info("loaded into Trie nodes successfully.")