Example #1
0
 def test_simple_string_matcher():
     """Look up every initial value and compare it with the first match.

     A ``StringMatcher`` is initialised with a small list of values; each
     value is then searched for on its own. Whenever ``find`` returns
     something other than ``None``, the text of the first match must equal
     the value that was searched for.
     """
     terms = ["China", "Beijing", "City"]
     matcher = StringMatcher()
     matcher.init(terms)
     for term in terms:
         found = matcher.find(term)
         if found is None:
             # Nothing to compare against when find() yields None.
             continue
         assert term == found[0].text
Example #2
0
 def test_simple_with_ids_string_matcher():
     """Look up every value and check both its text and its paired id.

     The matcher is initialised with parallel lists of values and ids.
     Whenever ``find`` returns something other than ``None``, the first
     match must carry the searched value as its text and the id at the same
     position as its first canonical value.
     """
     terms = ["China", "Beijing", "City"]
     term_ids = ["1", "2", "3"]
     matcher = StringMatcher()
     matcher.init(terms, term_ids)
     # Walk the two parallel lists in lockstep instead of indexing by position.
     for term, expected_id in zip(terms, term_ids):
         found = matcher.find(term)
         if found is None:
             continue
         assert term == found[0].text
         assert expected_id == found[0].canonical_values[0]
Example #3
0
    def build_matcher_from_lists(*collections: List[str]) -> StringMatcher:
        """Build a ``StringMatcher`` from one or more lists of strings.

        Every entry of every collection is stripped of surrounding whitespace
        and lower-cased. The combined list is passed through
        ``TimeZoneUtility.distinct`` and the result is used to initialise a
        matcher created with ``MatchStrategy.TrieTree`` and a
        ``NumberWithUnitTokenizer``.

        :param collections: any number of string lists to merge.
        :return: the initialised matcher.
        """
        matcher = StringMatcher(MatchStrategy.TrieTree, NumberWithUnitTokenizer())

        # Flatten and normalise in one pass; entries keep the order in which
        # the collections (and their items) were supplied.
        normalized_terms = [
            entry.strip().lower()
            for collection in collections
            for entry in collection
        ]

        matcher.init(TimeZoneUtility.distinct(normalized_terms))

        return matcher
Example #4
0
class EnglishTimeZoneExtractorConfiguration(TimeZoneExtractorConfiguration):
    """Time-zone extractor configuration populated from ``TimeZoneDefinitions``.

    The constructor compiles the regexes, copies the term lists and builds
    the string matchers; the read-only properties below expose them.
    """

    def __init__(self):
        super().__init__()

        self._direct_utc_regex = RegExpUtility.get_safe_reg_exp(
            TimeZoneDefinitions.DirectUtcRegex)

        # The two term lists must exist before the time-zone matcher is
        # built, because the matcher is created from them via the properties.
        self._abbreviations_list = list(TimeZoneDefinitions.AbbreviationsList)
        self._full_name_list = list(TimeZoneDefinitions.FullNameList)
        self._timezone_matcher = TimeZoneUtility.build_matcher_from_lists(
            self.full_name_list, self.abbreviations_list)

        self._location_time_suffix_regex = RegExpUtility.get_safe_reg_exp(
            TimeZoneDefinitions.LocationTimeSuffixRegex)
        self._location_matcher = StringMatcher()
        self._ambiguous_timezone_list = list(
            TimeZoneDefinitions.AmbiguousTimezoneList)

        # Locations are lower-cased and have diacritics removed before
        # being loaded into the location matcher.
        normalized_locations = [
            QueryProcessor.remove_diacritics(location.lower())
            for location in TimeZoneDefinitions.MajorLocations
        ]
        self._location_matcher.init(normalized_locations)

    @property
    def timezone_matcher(self):
        """Matcher built from the full-name and abbreviation lists."""
        return self._timezone_matcher

    @property
    def direct_utc_regex(self) -> Pattern:
        """Regex obtained from ``TimeZoneDefinitions.DirectUtcRegex``."""
        return self._direct_utc_regex

    @property
    def abbreviations_list(self) -> List[str]:
        """Copy of ``TimeZoneDefinitions.AbbreviationsList``."""
        return self._abbreviations_list

    @property
    def full_name_list(self) -> List[str]:
        """Copy of ``TimeZoneDefinitions.FullNameList``."""
        return self._full_name_list

    @property
    def location_time_suffix_regex(self) -> Pattern:
        """Regex obtained from ``TimeZoneDefinitions.LocationTimeSuffixRegex``."""
        return self._location_time_suffix_regex

    @property
    def location_matcher(self) -> StringMatcher:
        """Matcher initialised with the normalised major locations."""
        return self._location_matcher

    @property
    def ambiguous_timezone_list(self) -> List[str]:
        """Copy of ``TimeZoneDefinitions.AmbiguousTimezoneList``."""
        return self._ambiguous_timezone_list
Example #5
0
    def _build_matcher_from_set(self, definitions) -> StringMatcher:
        """Build a ``StringMatcher`` from ``|``-separated definition entries.

        Each entry of ``definitions`` is converted to ``str``, stripped and
        split on ``'|'``. The resulting groups are passed through
        ``self.distinct``, flattened into one list and used to initialise a
        matcher created with ``MatchStrategy.TrieTree`` and a
        ``NumberWithUnitTokenizer``.

        :param definitions: iterable of entries to split into terms.
        :return: the initialised matcher.
        """
        matcher = StringMatcher(match_strategy=MatchStrategy.TrieTree, tokenizer=NumberWithUnitTokenizer())

        # str.split() only ever yields strings, so the only pieces to drop are
        # the whitespace-only ones. Empty pieces are kept, since
        # ''.isspace() is False.
        term_groups = [
            [piece for piece in str(entry).strip().split('|') if not piece.isspace()]
            for entry in definitions
        ]

        unique_groups = self.distinct(term_groups)

        # Flatten the groups into a single list of terms.
        all_terms = []
        for group in unique_groups:
            for term in group:
                all_terms.append(term)

        matcher.init(all_terms)

        return matcher
Example #6
0
    def test_string_matcher():
        """Find dictionary-initialised terms inside a longer sentence.

        The matcher is initialised with a dict mapping a canonical value to a
        list of words. Each word is embedded in a sentence after the
        14-character prefix ``'please change '``; the first match must report
        the word as its text, the dict key as its first canonical value and 14
        as its start offset.
        """
        utc_8_value = 'UTC+08:00'
        utc_8_words = ['beijing time', 'chongqing time', 'hong kong time', 'urumqi time']
        utc_2_value = 'UTC+02:00'
        utc_2_words = ['cairo time', 'beirut time', 'gaza time', 'amman time']
        string_matcher = StringMatcher()
        string_matcher.init({utc_2_value: utc_2_words, utc_8_value: utc_8_words})

        # Same checks for both groups, in the order UTC+08:00 then UTC+02:00.
        expectations = ((utc_8_value, utc_8_words), (utc_2_value, utc_2_words))
        for canonical_value, words in expectations:
            for word in words:
                sentence = 'please change {}, thanks'.format(word)
                matches = string_matcher.find(sentence)
                assert word == matches[0].text
                assert canonical_value == matches[0].canonical_values[0]
                assert 14 == matches[0].start