def test_simple_string_matcher():
    """Each term loaded into the matcher is returned as the first match's text."""
    terms = ["China", "Beijing", "City"]

    matcher = StringMatcher()
    matcher.init(terms)

    for term in terms:
        found = matcher.find(term)
        # A None result is tolerated; only an actual result is inspected.
        if found is None:
            continue
        assert term == found[0].text
def test_simple_with_ids_string_matcher():
    """Terms initialised with parallel ids report that id as canonical value."""
    values = ["China", "Beijing", "City"]
    ids = ["1", "2", "3"]

    matcher = StringMatcher()
    matcher.init(values, ids)

    # Walk the terms and their ids in lockstep instead of indexing both lists.
    for term, term_id in zip(values, ids):
        found = matcher.find(term)
        # A None result is tolerated; only an actual result is inspected.
        if found is None:
            continue
        first = found[0]
        assert term == first.text
        assert term_id == first.canonical_values[0]
def build_matcher_from_lists(*collections: List[str]) -> StringMatcher:
    """Build a trie-tree ``StringMatcher`` from one or more lists of terms.

    Every entry of every collection is stripped of surrounding whitespace and
    lower-cased. The combined list is then passed through
    ``TimeZoneUtility.distinct`` (defined outside this block; presumably it
    removes duplicates) before the matcher is initialised with it.

    :param collections: any number of string lists to merge into one matcher.
    :return: a ``StringMatcher`` using ``MatchStrategy.TrieTree`` and a
        ``NumberWithUnitTokenizer``, initialised with the normalised terms.
    """
    matcher = StringMatcher(MatchStrategy.TrieTree, NumberWithUnitTokenizer())

    # Flatten and normalise in one pass, in the order the collections and
    # their entries were given.
    normalized_terms = [
        entry.strip().lower()
        for collection in collections
        for entry in collection
    ]

    matcher.init(TimeZoneUtility.distinct(normalized_terms))
    return matcher
class EnglishTimeZoneExtractorConfiguration(TimeZoneExtractorConfiguration):
    """English resources for the time zone extractor.

    All resources are built once in ``__init__`` from ``TimeZoneDefinitions``
    and exposed through read-only properties.
    """

    def __init__(self):
        super().__init__()

        self._direct_utc_regex = RegExpUtility.get_safe_reg_exp(
            TimeZoneDefinitions.DirectUtcRegex)

        # Copy the definition collections so this instance owns its own lists.
        self._abbreviations_list = list(TimeZoneDefinitions.AbbreviationsList)
        self._full_name_list = list(TimeZoneDefinitions.FullNameList)

        # The time zone matcher covers both full names and abbreviations.
        self._timezone_matcher = TimeZoneUtility.build_matcher_from_lists(
            self.full_name_list, self.abbreviations_list)

        self._location_time_suffix_regex = RegExpUtility.get_safe_reg_exp(
            TimeZoneDefinitions.LocationTimeSuffixRegex)

        self._location_matcher = StringMatcher()
        self._ambiguous_timezone_list = list(
            TimeZoneDefinitions.AmbiguousTimezoneList)

        # Locations are lower-cased and stripped of diacritics before being
        # loaded into the location matcher.
        normalized_locations = [
            QueryProcessor.remove_diacritics(location.lower())
            for location in TimeZoneDefinitions.MajorLocations
        ]
        self._location_matcher.init(normalized_locations)

    @property
    def timezone_matcher(self):
        """Matcher built from the full-name and abbreviation lists."""
        return self._timezone_matcher

    @property
    def direct_utc_regex(self) -> Pattern:
        """Pattern compiled from ``TimeZoneDefinitions.DirectUtcRegex``."""
        return self._direct_utc_regex

    @property
    def abbreviations_list(self) -> List[str]:
        """Copy of ``TimeZoneDefinitions.AbbreviationsList``."""
        return self._abbreviations_list

    @property
    def full_name_list(self) -> List[str]:
        """Copy of ``TimeZoneDefinitions.FullNameList``."""
        return self._full_name_list

    @property
    def location_time_suffix_regex(self) -> Pattern:
        """Pattern compiled from ``TimeZoneDefinitions.LocationTimeSuffixRegex``."""
        return self._location_time_suffix_regex

    @property
    def location_matcher(self) -> StringMatcher:
        """Matcher loaded with the normalised ``MajorLocations`` entries."""
        return self._location_matcher

    @property
    def ambiguous_timezone_list(self) -> List[str]:
        """Copy of ``TimeZoneDefinitions.AmbiguousTimezoneList``."""
        return self._ambiguous_timezone_list
def _build_matcher_from_set(self, definitions) -> StringMatcher:
    """Build a trie-tree ``StringMatcher`` from '|'-separated term definitions.

    Each item of ``definitions`` is converted with ``str()``, stripped, and
    split on ``'|'``. Blank pieces (empty or whitespace-only) are dropped.
    The resulting per-item term lists are passed through ``self.distinct``
    (defined outside this block) and then flattened into the single list the
    matcher is initialised with.

    :param definitions: iterable of values whose string form is a
        '|'-separated list of terms.
    :return: a ``StringMatcher`` using ``MatchStrategy.TrieTree`` and a
        ``NumberWithUnitTokenizer``, initialised with the flattened terms.
    """
    matcher = StringMatcher(match_strategy=MatchStrategy.TrieTree,
                            tokenizer=NumberWithUnitTokenizer())

    # ``word.strip()`` is falsy for both '' and whitespace-only pieces.
    # ``str.isspace`` alone returns False for '', which would let the empty
    # pieces produced by inputs like "a||b" or "a|" reach the matcher.
    term_groups = [
        [word for word in str(words).strip().split('|') if word.strip()]
        for words in definitions
    ]

    # De-duplication is applied to whole groups, as before, then flattened.
    unique_groups = self.distinct(term_groups)
    flattened_terms = [term for group in unique_groups for term in group]

    matcher.init(flattened_terms)
    return matcher
def test_string_matcher():
    """Dictionary-initialised matcher maps each phrase to its canonical UTC value."""
    utc_8_value = 'UTC+08:00'
    utc_8_words = ['beijing time', 'chongqing time', 'hong kong time', 'urumqi time']
    utc_2_value = 'UTC+02:00'
    utc_2_words = ['cairo time', 'beirut time', 'gaza time', 'amman time']

    value_dictionary = {utc_2_value: utc_2_words, utc_8_value: utc_8_words}

    string_matcher = StringMatcher()
    string_matcher.init(value_dictionary)

    template = 'please change {}, thanks'
    # Offset of the phrase inside the sentence: len('please change ') == 14.
    expected_start = 14

    # The UTC+08:00 phrases are checked first, then the UTC+02:00 ones.
    cases = ((utc_8_value, utc_8_words), (utc_2_value, utc_2_words))
    for canonical, phrases in cases:
        for phrase in phrases:
            sentence = template.format(phrase)
            matches = string_matcher.find(sentence)
            first = matches[0]
            assert phrase == first.text
            assert str(canonical) == first.canonical_values[0]
            assert expected_start == first.start