コード例 #1
0
    def __init__(self, config: NumberWithUnitExtractorConfiguration):
        """Set up the suffix/prefix matchers and the separate regex.

        Args:
            config: Extractor configuration. Its ``suffix_list`` and
                ``prefix_list`` mappings are read here; only their values
                are used.
        """
        self.config = config
        self.max_prefix_match_len = 0

        if self.config.suffix_list:
            self.__suffix_matcher = self._build_matcher_from_set(
                list(self.config.suffix_list.values()))
        else:
            # No suffixes configured: use a default-constructed matcher.
            self.__suffix_matcher = StringMatcher()

        if self.config.prefix_list:
            separator = self.separator[0]
            # Length of the longest single alternative found in any prefix
            # entry (each entry is split on the first configured separator).
            self.max_prefix_match_len = max(
                (len(term)
                 for entry in self.config.prefix_list.values()
                 for term in str(entry).split(separator)),
                default=0)

            # 2 is the maximum length of spaces.
            self.max_prefix_match_len += 2
            self.__prefix_matcher = self._build_matcher_from_set(
                self.config.prefix_list.values())
        else:
            # No prefixes configured: use a default-constructed matcher.
            self.__prefix_matcher = StringMatcher()

        self.separate_regex = self._build_separate_regex_from_config()
コード例 #2
0
 def test_simple_string_matcher():
     """Look up every initialised value and compare the first match's text.

     The assertion only runs when ``find`` returns something other than
     ``None``.
     """
     words = ["China", "Beijing", "City"]
     matcher = StringMatcher()
     matcher.init(words)

     for word in words:
         found = matcher.find(word)
         if found is None:
             continue
         assert word == found[0].text
コード例 #3
0
 def test_simple_with_ids_string_matcher():
     """Initialise the matcher with values plus ids and check both round-trip.

     For each value, the first match must carry the value as its text and
     the id at the same position as its first canonical value. The
     assertions only run when ``find`` returns something other than ``None``.
     """
     values = ["China", "Beijing", "City"]
     ids = ["1", "2", "3"]
     matcher = StringMatcher()
     matcher.init(values, ids)

     for value, expected_id in zip(values, ids):
         found = matcher.find(value)
         if found is None:
             continue
         assert value == found[0].text
         assert expected_id == found[0].canonical_values[0]
コード例 #4
0
    def __init__(self):
        """Populate the regexes, term lists and matchers from TimeZoneDefinitions."""
        super().__init__()

        # List copies of the definition collections.
        self._abbreviations_list = list(TimeZoneDefinitions.AbbreviationsList)
        self._full_name_list = list(TimeZoneDefinitions.FullNameList)
        self._ambiguous_timezone_list = list(TimeZoneDefinitions.AmbiguousTimezoneList)

        # Regexes built from the definition patterns.
        self._direct_utc_regex = RegExpUtility.get_safe_reg_exp(
            TimeZoneDefinitions.DirectUtcRegex)
        self._location_time_suffix_regex = RegExpUtility.get_safe_reg_exp(
            TimeZoneDefinitions.LocationTimeSuffixRegex)

        # Must come after the two lists above: it reads them back through
        # self.full_name_list / self.abbreviations_list.
        self._timezone_matcher = TimeZoneUtility.build_matcher_from_lists(
            self.full_name_list, self.abbreviations_list)

        # Locations are lower-cased and passed through remove_diacritics
        # before being handed to the matcher.
        normalized_locations = [
            QueryProcessor.remove_diacritics(location.lower())
            for location in TimeZoneDefinitions.MajorLocations
        ]
        self._location_matcher = StringMatcher()
        self._location_matcher.init(normalized_locations)
コード例 #5
0
    def build_matcher_from_lists(*collections: List[str]) -> StringMatcher:
        """Build a StringMatcher from the terms of one or more collections.

        Every term is stripped of surrounding whitespace and lower-cased,
        the combined list is passed through ``TimeZoneUtility.distinct``
        (presumably de-duplication -- confirm against that helper), and the
        result initialises a matcher created with ``MatchStrategy.TrieTree``
        and a ``NumberWithUnitTokenizer``.

        Args:
            *collections: Any number of iterables of strings.

        Returns:
            The initialised matcher.
        """
        matcher = StringMatcher(MatchStrategy.TrieTree, NumberWithUnitTokenizer())

        # Flatten and normalise in a single pass, keeping the original order.
        normalized_terms = [
            term.strip().lower()
            for collection in collections
            for term in collection
        ]

        matcher.init(TimeZoneUtility.distinct(normalized_terms))

        return matcher
コード例 #6
0
class EnglishTimeZoneExtractorConfiguration(TimeZoneExtractorConfiguration):
    """Time-zone extractor configuration populated from TimeZoneDefinitions.

    All state is built once in the constructor and exposed through
    read-only properties.
    """

    def __init__(self):
        """Build the regexes, term lists and matchers."""
        super().__init__()

        # List copies of the definition collections.
        self._abbreviations_list = list(TimeZoneDefinitions.AbbreviationsList)
        self._full_name_list = list(TimeZoneDefinitions.FullNameList)
        self._ambiguous_timezone_list = list(
            TimeZoneDefinitions.AmbiguousTimezoneList)

        # Regexes built from the definition patterns.
        self._direct_utc_regex = RegExpUtility.get_safe_reg_exp(
            TimeZoneDefinitions.DirectUtcRegex)
        self._location_time_suffix_regex = RegExpUtility.get_safe_reg_exp(
            TimeZoneDefinitions.LocationTimeSuffixRegex)

        # Depends on the two lists assigned above (read via the properties).
        self._timezone_matcher = TimeZoneUtility.build_matcher_from_lists(
            self.full_name_list, self.abbreviations_list)

        # Locations are lower-cased and passed through remove_diacritics
        # before being handed to the matcher.
        normalized_locations = [
            QueryProcessor.remove_diacritics(location.lower())
            for location in TimeZoneDefinitions.MajorLocations
        ]
        self._location_matcher = StringMatcher()
        self._location_matcher.init(normalized_locations)

    @property
    def direct_utc_regex(self) -> Pattern:
        """Regex built from ``TimeZoneDefinitions.DirectUtcRegex``."""
        return self._direct_utc_regex

    @property
    def location_time_suffix_regex(self) -> Pattern:
        """Regex built from ``TimeZoneDefinitions.LocationTimeSuffixRegex``."""
        return self._location_time_suffix_regex

    @property
    def abbreviations_list(self) -> List[str]:
        """List copy of ``TimeZoneDefinitions.AbbreviationsList``."""
        return self._abbreviations_list

    @property
    def full_name_list(self) -> List[str]:
        """List copy of ``TimeZoneDefinitions.FullNameList``."""
        return self._full_name_list

    @property
    def ambiguous_timezone_list(self) -> List[str]:
        """List copy of ``TimeZoneDefinitions.AmbiguousTimezoneList``."""
        return self._ambiguous_timezone_list

    @property
    def timezone_matcher(self):
        """Matcher built from the full-name and abbreviation lists."""
        return self._timezone_matcher

    @property
    def location_matcher(self) -> StringMatcher:
        """Matcher initialised with the normalised major locations."""
        return self._location_matcher
コード例 #7
0
    def _build_matcher_from_set(self, definitions) -> StringMatcher:
        """Build a StringMatcher from ``|``-separated definition strings.

        Each definition is converted with ``str()``, stripped, and split on
        ``'|'``. Empty and whitespace-only alternatives are discarded. The
        per-definition term lists are passed through ``self.distinct`` and
        then flattened into the single list that initialises the matcher.

        Args:
            definitions: Iterable of definition values, each holding one or
                more alternatives separated by ``'|'``.

        Returns:
            A matcher created with ``MatchStrategy.TrieTree`` and a
            ``NumberWithUnitTokenizer``, initialised with the terms.
        """
        matcher = StringMatcher(match_strategy=MatchStrategy.TrieTree, tokenizer=NumberWithUnitTokenizer())

        # ``''.isspace()`` is False, so test for emptiness explicitly;
        # otherwise "a||b" or a trailing '|' would add an empty term.
        match_term_list = [
            [word
             for word in str(words).strip().split('|')
             if word and not word.isspace()]
            for words in definitions
        ]

        match_terms = self.distinct(match_term_list)

        flatten = [item for sublist in match_terms for item in sublist]

        matcher.init(flatten)

        return matcher
コード例 #8
0
    def __init__(self, config):
        """Store ``config`` and prepare the TLD matcher and URL regexes.

        Args:
            config: Configuration object; ``ip_url_regex`` and ``url_regex``
                are read from it.
        """
        self.config = config

        self._tld_matcher = StringMatcher()
        self.tld_matcher().init(BaseURL.TldList)

        # Each pattern is wrapped in a ReVal with Constants.URL_REGEX,
        # in this order.
        url_patterns = (
            config.ip_url_regex,
            config.url_regex,
            RegExpUtility.get_safe_reg_exp(BaseURL.UrlRegex2),
        )
        self._regexes = [
            ReVal(pattern, Constants.URL_REGEX) for pattern in url_patterns
        ]

        ambiguous_time_pattern = RegExpUtility.get_safe_reg_exp(
            BaseURL.AmbiguousTimeTerm)
        self._ambiguous_time_term = ReVal(ambiguous_time_pattern,
                                          Constants.URL_REGEX)
コード例 #9
0
    def build_matcher_from_lists(self, collections: []):
        """Build a StringMatcher from the terms of several collections.

        Every term is stripped of surrounding whitespace and lower-cased,
        the combined list is passed through ``self.distinct`` (presumably
        de-duplication -- confirm against that helper), and the result
        initialises a matcher created with ``MatchStrategy.TrieTree`` and a
        ``NumberWithUnitTokenizer``.

        Args:
            collections: Iterable of iterables of strings.

        Returns:
            The initialised StringMatcher.
        """
        matcher: StringMatcher = StringMatcher(MatchStrategy.TrieTree, NumberWithUnitTokenizer())

        # Flatten and normalise in a single pass, keeping the original order.
        normalized_terms = [
            term.strip().lower()
            for collection in collections
            for term in collection
        ]

        matcher.init(self.distinct(normalized_terms))

        return matcher
コード例 #10
0
    def test_string_matcher():
        """Initialise the matcher from a {canonical value: words} dict and
        check that each word is found inside a sentence with the right text,
        canonical value and start offset.
        """
        utc_8_value = 'UTC+08:00'
        utc_8_words = ['beijing time', 'chongqing time', 'hong kong time', 'urumqi time']
        utc_2_value = 'UTC+02:00'
        utc_2_words = ['cairo time', 'beirut time', 'gaza time', 'amman time']

        string_matcher = StringMatcher()
        string_matcher.init({utc_2_value: utc_2_words, utc_8_value: utc_8_words})

        # Checked in this order: the UTC+08:00 words first, then UTC+02:00.
        expectations = (
            (utc_8_value, utc_8_words),
            (utc_2_value, utc_2_words),
        )
        for canonical_value, words in expectations:
            for word in words:
                sentence = 'please change {}, thanks'.format(word)
                first_match = string_matcher.find(sentence)[0]
                assert word == first_match.text
                assert canonical_value == first_match.canonical_values[0]
                # 'please change ' is 14 characters long.
                assert 14 == first_match.start