def __init__(self):
    """Compile the Spanish date-time utility patterns and cache them.

    Each attribute stores the result of ``RegExpUtility.get_safe_reg_exp``
    applied to the ``SpanishDateTime`` resource of the matching name.
    """
    build = RegExpUtility.get_safe_reg_exp
    resources = SpanishDateTime

    self._later_regex = build(resources.LaterRegex)
    self._ago_regex = build(resources.AgoRegex)
    self._in_connector_regex = build(resources.InConnectorRegex)
    self._range_unit_regex = build(resources.RangeUnitRegex)
    self._am_desc_regex = build(resources.AmDescRegex)
    # NOTE: the doubled underscore is part of the existing attribute name;
    # it is kept because code outside this method may read it.
    self._pm_desc__regex = build(resources.PmDescRegex)
    self._am_pm_desc_regex = build(resources.AmPmDescRegex)
    self._time_unit_regex = build(resources.TimeUnitRegex)
    self._within_next_prefix_regex = build(resources.WithinNextPrefixRegex)
    self._common_date_prefix_regex = build(resources.CommonDatePrefixRegex)
def __init__(self, mode):
    """Build the Portuguese fraction patterns for the given mode.

    :param mode: number mode; for every value except ``NumberMode.Unit``
        the preposition-based fraction pattern is appended as well.
    """
    typed_patterns = [
        (PortugueseNumeric.FractionNotationWithSpacesRegex, 'FracNum'),
        (PortugueseNumeric.FractionNotationRegex, 'FracNum'),
        (PortugueseNumeric.FractionNounRegex, 'FracPor'),
        (PortugueseNumeric.FractionNounWithArticleRegex, 'FracPor'),
    ]
    if mode != NumberMode.Unit:
        typed_patterns.append(
            (PortugueseNumeric.FractionPrepositionRegex, 'FracPor'))

    self.__regexes = [
        ReVal(re=RegExpUtility.get_safe_reg_exp(pattern), val=type_name)
        for pattern, type_name in typed_patterns
    ]
def __init__(self, culture_info=None):
    """Load the Chinese number-parsing resources.

    :param culture_info: culture to parse for; when ``None`` a
        ``CultureInfo(Culture.Chinese)`` is created.
    """
    build = RegExpUtility.get_safe_reg_exp
    res = ChineseNumeric

    self._culture_info = (
        culture_info if culture_info is not None
        else CultureInfo(Culture.Chinese)
    )

    # Plain resource values, copied as-is.
    self._lang_marker = res.LangMarker
    self._decimal_separator_char = res.DecimalSeparatorChar
    self._fraction_marker_token = res.FractionMarkerToken
    self._non_decimal_separator_char = res.NonDecimalSeparatorChar
    self._half_a_dozen_text = res.HalfADozenText
    self._word_separator_token = res.WordSeparatorToken
    self._round_number_map = res.RoundNumberMap
    self._digital_number_regex = build(res.DigitalNumberRegex)

    self.zero_to_nine_map_chs = res.ZeroToNineMap
    self.round_number_map_chs = res.RoundNumberMapChar
    self.full_to_half_map_chs = res.FullToHalfMap
    self.trato_sim_map_chs = res.TratoSimMap
    self.unit_map_chs = res.UnitMap
    self.round_direct_list_chs = res.RoundDirectList

    # These three are stored as raw resource values, not compiled here.
    self.digit_num_regex = res.DigitNumRegex
    self.dozen_regex = res.DozenRegex
    self.percentage_regex = res.PercentageRegex

    self.double_and_round_chs_regex = build(res.DoubleAndRoundRegex)
    self.frac_split_regex = build(res.FracSplitRegex)
    self._negative_number_sign_regex = build(res.NegativeNumberSignRegex)
    # Stored raw as well.
    self.point_regex_chs = res.PointRegex
    self.spe_get_number_regex = build(res.SpeGetNumberRegex)
    self.pair_regex = build(res.PairRegex)
def __init__(self, config: BaseDateParserConfiguration):
    """Copy the shared extractors/parsers from *config* and compile the
    English "set" patterns.

    :param config: common configuration whose extractor and parser
        attributes are mirrored onto this instance with a leading underscore.
    """
    # Each name is read from ``config`` and stored as ``self._<name>``.
    shared_components = (
        'duration_extractor', 'duration_parser',
        'time_extractor', 'time_parser',
        'date_extractor', 'date_parser',
        'date_time_extractor', 'date_time_parser',
        'date_period_extractor', 'date_period_parser',
        'time_period_extractor', 'time_period_parser',
        'date_time_period_extractor', 'date_time_period_parser',
    )
    for component in shared_components:
        setattr(self, '_' + component, getattr(config, component))

    self._unit_map = EnglishDateTime.UnitMap

    build = RegExpUtility.get_safe_reg_exp
    self._each_prefix_regex = build(EnglishDateTime.EachPrefixRegex)
    self._periodic_regex = build(EnglishDateTime.PeriodicRegex)
    self._each_unit_regex = build(EnglishDateTime.EachUnitRegex)
    self._each_day_regex = build(EnglishDateTime.EachDayRegex)
    self._set_week_day_regex = build(EnglishDateTime.SetWeekDayRegex)
    self._set_each_regex = build(EnglishDateTime.SetEachRegex)
def __init__(self, mode: NumberMode = NumberMode.DEFAULT):
    """Assemble the French number patterns for *mode*.

    :param mode: ``PURE_NUMBER`` builds the cardinal extractor with the
        pure-number placeholder; ``CURRENCY`` adds the currency pattern
        first; any mode other than ``Unit`` also loads the ambiguity filters.
    """
    build = RegExpUtility.get_safe_reg_exp

    self.__negative_number_terms = build(
        FrenchNumeric.NegativeNumberTermsRegex)

    collected: List[ReVal] = []
    if mode is NumberMode.CURRENCY:
        collected.append(
            ReVal(re=build(FrenchNumeric.CurrencyRegex), val='IntegerNum'))

    if mode is NumberMode.PURE_NUMBER:
        cardinal_ex = FrenchCardinalExtractor(
            FrenchNumeric.PlaceHolderPureNumber)
    else:
        cardinal_ex = FrenchCardinalExtractor()
    collected.extend(cardinal_ex.regexes)

    fraction_ex = FrenchFractionExtractor(mode)
    collected.extend(fraction_ex.regexes)
    self.__regexes = collected

    # Shared filters first, then the French-specific ones.
    filters: List[ReRe] = []
    if mode != NumberMode.Unit:
        for source_map in (BaseNumbers.AmbiguityFiltersDict,
                           FrenchNumeric.AmbiguityFiltersDict):
            filters.extend(
                ReRe(reKey=build(key), reVal=build(value))
                for key, value in source_map.items()
            )
    self.__ambiguity_filters_dict = filters
def __init__(self, config: NumberParserConfiguration):
    """Store *config* and derive the patterns and lookups built from it.

    :param config: language configuration supplying the separator token,
        number maps, culture info and separator variants read below.
    """
    self.config: NumberParserConfiguration = config
    self.supported_types: List[str] = []

    # Alternation of: word separator, " -", cardinal words, ordinal words.
    separator = self.config.word_separator_token
    cardinal_keys = self._get_key_regex(
        self.config.cardinal_number_map.keys())
    ordinal_keys = self._get_key_regex(
        self.config.ordinal_number_map.keys())
    single_int_frac = f'{separator}| -|{cardinal_keys}|{ordinal_keys}'

    self.text_number_regex: Pattern = self._get_text_number_regex(
        single_int_frac)
    self.arabic_number_regex: Pattern = RegExpUtility.get_safe_reg_exp(
        r'\d+', flags=regex.I | regex.S)
    self.round_number_set: List[str] = list(
        self.config.round_number_map.keys())

    culture_code = self.config.culture_info.code
    self.is_non_standard_separator_variant = (
        culture_code in self.config.non_standard_separator_variants)
def __init__(self):
    """Compile the French duration patterns and create the cardinal extractor."""
    build = RegExpUtility.get_safe_reg_exp
    res = FrenchDateTime

    self._all_regex: Pattern = build(res.AllRegex)
    self._half_regex: Pattern = build(res.HalfRegex)
    self._followed_unit: Pattern = build(res.DurationFollowedUnit)
    self._number_combined_with_unit: Pattern = build(
        res.NumberCombinedWithDurationUnit)
    self._an_unit_regex: Pattern = build(res.AnUnitRegex)
    self._inexact_number_unit_regex: Pattern = build(
        res.InexactNumberUnitRegex)
    self._suffix_and_regex: Pattern = build(res.SuffixAndRegex)
    self._relative_duration_unit_regex: Pattern = build(
        res.RelativeDurationUnitRegex)
    self._more_than_regex: Pattern = build(res.MoreThanRegex)
    # NOTE(review): built from ``LessThanOneHour`` while its sibling uses
    # ``MoreThanRegex`` - confirm this resource choice is intended.
    self._less_than_regex: Pattern = build(res.LessThanOneHour)

    self._cardinal_extractor: BaseNumberExtractor = FrenchCardinalExtractor()
def __skip_non_decimal_separator(self, ch: str, distance: int, culture: CultureInfo) -> bool:
    """Tell whether *ch* is a non-decimal separator that should be skipped.

    Returns ``False`` when *ch* is not the configured non-decimal separator.
    Otherwise returns ``True`` unless *distance* is at most 3 and the culture
    code starts with ``en``, ``es`` or ``fr``.

    Special cases for multi-language countries where decimal separators can
    be used interchangeably, mostly informally.
    Ex: South Africa, Namibia; Puerto Rico in ES; or in Canada for EN and FR.
    "me pidio $5.00 prestados" and "me pidio $5,00 prestados" -> currency $5
    """
    decimal_length: int = 3
    culture_regex: Pattern = RegExpUtility.get_safe_reg_exp(
        r'^(en|es|fr)(-)?\b', flags=regex.I | regex.S)

    if ch != self.config.non_decimal_separator_char:
        return False
    if distance > decimal_length:
        return True
    return not culture_regex.match(culture.code)
def __init__(self, mode: NumberMode = NumberMode.DEFAULT):
    """Assemble the English number patterns for *mode*.

    :param mode: ``PURE_NUMBER`` builds the cardinal extractor with the
        pure-number placeholder; ``CURRENCY`` adds the currency pattern
        ahead of the cardinal and fraction patterns.
    """
    build = RegExpUtility.get_safe_reg_exp

    self.__negative_number_terms = build(
        EnglishNumeric.NegativeNumberTermsRegex)

    collected: List[ReVal] = []
    if mode is NumberMode.CURRENCY:
        collected.append(
            ReVal(re=build(EnglishNumeric.CurrencyRegex), val='IntegerNum'))

    if mode is NumberMode.PURE_NUMBER:
        cardinal_ex = EnglishCardinalExtractor(
            EnglishNumeric.PlaceHolderPureNumber)
    else:
        cardinal_ex = EnglishCardinalExtractor()
    collected.extend(cardinal_ex.regexes)

    fraction_ex = EnglishFractionExtractor()
    collected.extend(fraction_ex.regexes)

    self.__regexes = collected
def __init__(self):
    """Set up the Portuguese time-period extractor configuration.

    Creates the helper extractors, compiles the ``PortugueseDateTime``
    patterns and stores the plain resource values used alongside them.
    """
    super().__init__()
    build = RegExpUtility.get_safe_reg_exp

    self._single_time_extractor = BaseTimeExtractor(
        PortugueseTimeExtractorConfiguration())
    self._integer_extractor = PortugueseIntegerExtractor()
    self.utility_configuration = PortugueseDateTimeUtilityConfiguration()

    self._simple_cases_regex: List[Pattern] = [
        build(PortugueseDateTime.PureNumFromTo),
        build(PortugueseDateTime.PureNumBetweenAnd),
        build(PortugueseDateTime.SpecificTimeFromTo),
        build(PortugueseDateTime.SpecificTimeBetweenAnd),
    ]

    self._till_regex: Pattern = build(PortugueseDateTime.TillRegex)
    self._time_of_day_regex: Pattern = build(
        PortugueseDateTime.TimeOfDayRegex)
    self._general_ending_regex: Pattern = build(
        PortugueseDateTime.GeneralEndingRegex)

    self.from_regex = build(PortugueseDateTime.FromRegex)
    self.range_connector_regex = build(
        PortugueseDateTime.RangeConnectorRegex)
    self.between_regex = build(PortugueseDateTime.BetweenRegex)

    self._token_before_date = PortugueseDateTime.TokenBeforeDate

    # The list used to contain ``PureNumFromTo`` twice, so the
    # "between ... and ..." pure-number form was never represented.  The
    # entries are kept as raw pattern strings, as before.
    self._pure_number_regex = [
        PortugueseDateTime.PureNumFromTo,
        PortugueseDateTime.PureNumBetweenAnd,
    ]

    self._options = DateTimeOptions.NONE
    self._time_zone_extractor = BaseTimeZoneExtractor(
        PortugueseTimeZoneExtractorConfiguration())
    self._check_both_before_after = PortugueseDateTime.CheckBothBeforeAfter
def __init__(self, mode=None):
    """Build the French fraction patterns.

    :param mode: optional number mode.  The French number extractor
        constructs this class as ``FrenchFractionExtractor(mode)``, which the
        former zero-argument signature rejected with ``TypeError``.  When
        omitted, all five patterns are registered exactly as before; with
        ``NumberMode.Unit`` the preposition-based pattern is left out, as the
        Portuguese fraction extractor does.
    """
    build = RegExpUtility.get_safe_reg_exp

    self.__regexes = [
        ReVal(
            re=build(FrenchNumeric.FractionNotationWithSpacesRegex),
            val='FracNum'),
        ReVal(
            re=build(FrenchNumeric.FractionNotationRegex),
            val='FracNum'),
        ReVal(
            re=build(FrenchNumeric.FractionNounRegex),
            val='FracFr'),
        ReVal(
            re=build(FrenchNumeric.FractionNounWithArticleRegex),
            val='FracFr'),
    ]

    # ``None`` (no mode given) compares unequal to ``NumberMode.Unit``, so
    # existing zero-argument callers still get the preposition pattern.
    if mode != NumberMode.Unit:
        self.__regexes.append(
            ReVal(
                re=build(FrenchNumeric.FractionPrepositionRegex),
                val='FracFr'))
def __init__(self, culture_info=None):
    """Load the Spanish number-parsing resources.

    :param culture_info: culture to parse for; when ``None`` a
        ``CultureInfo(Culture.Spanish)`` is created.

    The ordinal map is the resource map extended with every
    prefix + suffix combination not already present, valued as the
    product of the prefix and suffix values.
    """
    res = SpanishNumeric
    build = RegExpUtility.get_safe_reg_exp

    self._culture_info = (
        culture_info if culture_info is not None
        else CultureInfo(Culture.Spanish)
    )

    self._lang_marker = res.LangMarker
    self._decimal_separator_char = res.DecimalSeparatorChar
    self._fraction_marker_token = res.FractionMarkerToken
    self._non_decimal_separator_char = res.NonDecimalSeparatorChar
    self._half_a_dozen_text = res.HalfADozenText
    self._word_separator_token = res.WordSeparatorToken
    self._written_decimal_separator_texts = res.WrittenDecimalSeparatorTexts
    self._written_group_separator_texts = res.WrittenGroupSeparatorTexts
    self._written_integer_separator_texts = res.WrittenIntegerSeparatorTexts
    self._written_fraction_separator_texts = res.WrittenFractionSeparatorTexts
    self._non_standard_separator_variants = res.NonStandardSeparatorVariants
    self._is_multi_decimal_separator_culture = res.MultiDecimalSeparatorCulture

    # Work on a copy so the shared resource map is not modified.
    combined_ordinals: Dict[str, int] = dict(res.OrdinalNumberMap)
    for prefix_key in res.PrefixCardinalMap:
        for suffix_key in res.SuffixOrdinalMap:
            compound = prefix_key + suffix_key
            if compound in combined_ordinals:
                continue
            combined_ordinals[compound] = (
                res.PrefixCardinalMap[prefix_key]
                * res.SuffixOrdinalMap[suffix_key]
            )

    self._cardinal_number_map = res.CardinalNumberMap
    self._ordinal_number_map = combined_ordinals
    self._round_number_map = res.RoundNumberMap

    self._negative_number_sign_regex = build(res.NegativeNumberSignRegex)
    self._half_a_dozen_regex = build(res.HalfADozenRegex)
    self._digital_number_regex = build(res.DigitalNumberRegex)
def adjust_by_suffix(self, suffix: str, adjust: AdjustParams):
    """Shift ``adjust.hour`` and set the am/pm flags from a time suffix.

    :param suffix: text following the time; stripped and lower-cased
        before matching.
    :param adjust: mutable parameters; ``hour``, ``has_am`` and ``has_pm``
        are updated in place.

    Nothing but the final modulo is applied unless ``time_suffix_full``
    matches the whole suffix and the ``oclock`` group is empty.
    """
    suffix = suffix.strip().lower()
    delta_hour = 0
    match = regex.search(self.time_suffix_full, suffix)

    if match is not None and match.start() == 0 and match.group() == suffix:
        oclock_str = RegExpUtility.get_group(match, 'oclock')
        if not oclock_str:
            am_str = RegExpUtility.get_group(match, 'am')
            if am_str:
                if adjust.hour >= 12:
                    delta_hour -= 12
                else:
                    adjust.has_am = True

            pm_str = RegExpUtility.get_group(match, 'pm')
            if pm_str:
                if adjust.hour < 12:
                    delta_hour = 12

                if regex.search(self.lunch_regex, pm_str):
                    # for hour >= 10 and <= 12: keep the hour as written
                    if 10 <= adjust.hour <= 12:
                        delta_hour = 0
                        if adjust.hour == 12:
                            adjust.has_pm = True
                        else:
                            adjust.has_am = True
                    else:
                        adjust.has_pm = True
                elif regex.search(self.night_regex, pm_str):
                    # Small hours (and 12) after a night word are kept in
                    # the early morning instead of being shifted by 12.
                    if adjust.hour <= 3 or adjust.hour == 12:
                        if adjust.hour == 12:
                            adjust.hour = 0
                        delta_hour = 0
                        adjust.has_am = True
                    else:
                        adjust.has_pm = True
                else:
                    # A pm suffix that is neither a lunch nor a night
                    # expression.  Together with the ``else`` above this
                    # guarantees every pm path sets one of the two flags;
                    # previously one of these paths left both unset.
                    adjust.has_pm = True

    adjust.hour = (adjust.hour + delta_hour) % 24
def match_simple_cases(self, source: str) -> List[Token]:
    """Collect tokens for the simple time-period patterns found in *source*.

    :param source: text to scan with every pattern in
        ``config.simple_cases_regex``.
    :return: one ``Token(start, end)`` per accepted match.

    Token positions come from ``match.start()`` / ``match.end()``.  The
    previous ``source.index(match.group())`` returned the first occurrence
    of the matched text, which is the wrong span whenever the same text
    also appears earlier in *source*.
    """
    result = []

    for regexp in self.config.simple_cases_regex:
        for match in regex.finditer(regexp, source):
            begin, end = match.start(), match.end()

            # Cases like "from 10:30 to 11", don't necessarily need "am/pm"
            if RegExpUtility.get_group(match, Constants.MINUTE_GROUP_NAME) or \
                    RegExpUtility.get_group(match, Constants.SECOND_GROUP_NAME):
                # Cases like "from 3:30 to 4" should be supported
                # Cases like "from 3:30 to 5 on 1/1/2015" should be supported
                # Cases like "from 3:30 to 4 people" is considered not valid

                # No extra tokens after the time period
                end_with_valid_token = end == len(source)

                if not end_with_valid_token:
                    after_str = source[end:]
                    end_with_general_endings = \
                        self.config.general_ending_regex.match(after_str)
                    end_with_am_pm = RegExpUtility.get_group(
                        match, Constants.RIGHT_AM_PM_GROUP_NAME)

                    # The original also had an ENABLE_PREVIEW branch here
                    # that only re-assigned False ("When TimeZone be
                    # migrated enable it"); it had no effect and is omitted.
                    if end_with_general_endings or end_with_am_pm or \
                            after_str.lstrip().startswith(
                                self.config.token_before_date):
                        end_with_valid_token = True

                if end_with_valid_token:
                    result.append(Token(begin, end))
            else:
                # Is there "pm" or "am"?
                match_pm_str = RegExpUtility.get_group(
                    match, Constants.PM_GROUP_NAME)
                match_am_str = RegExpUtility.get_group(
                    match, Constants.AM_GROUP_NAME)
                desc_str = RegExpUtility.get_group(
                    match, Constants.DESC_GROUP_NAME)

                # Check "pm", "am"
                if match_pm_str or match_am_str or desc_str:
                    result.append(Token(begin, end))
                # When TimeZone be migrated enable it
                elif (self.config.options & DateTimeOptions.ENABLE_PREVIEW) != 0:
                    result.append(Token(begin, end))

    return result
def __init__(self, config):
    """Create the English duration parsing helpers.

    :param config: accepted for interface compatibility; not read here.
    """
    build = RegExpUtility.get_safe_reg_exp
    res = EnglishDateTime

    self._cardinal_extractor: BaseNumberExtractor = EnglishCardinalExtractor()
    self._number_parser: BaseNumberParser = BaseNumberParser(
        EnglishNumberParserConfiguration())

    self._followed_unit: Pattern = build(res.DurationFollowedUnit)
    self._suffix_and_regex: Pattern = build(res.SuffixAndRegex)
    self._number_combined_with_unit: Pattern = build(
        res.NumberCombinedWithDurationUnit)
    self._an_unit_regex: Pattern = build(res.AnUnitRegex)
    self._all_date_unit_regex: Pattern = build(res.AllRegex)
    self._half_date_unit_regex: Pattern = build(res.HalfRegex)
    self._inexact_number_unit_regex: Pattern = build(
        res.InexactNumberUnitRegex)

    self._unit_map: Dict[str, int] = res.UnitMap
    self._unit_value_map: Dict[str, int] = res.UnitValueMap
    self._double_numbers: Dict[str, float] = res.DoubleNumbers
def __init__(self, config: BaseDateParserConfiguration):
    """Compile the Spanish date-time patterns and copy helpers from *config*.

    :param config: common configuration whose extractors, parsers and maps
        are mirrored onto this instance with a leading underscore.
    """
    build = RegExpUtility.get_safe_reg_exp
    res = SpanishDateTime

    self._token_before_date = res.TokenBeforeDate
    self._token_before_time = res.TokenBeforeTime

    self._now_regex = build(res.NowRegex)
    self._am_time_regex = build(res.AmTimeRegex)
    self._pm_time_regex = build(res.PmTimeRegex)
    self._simple_time_of_today_after_regex = build(
        res.SimpleTimeOfTodayAfterRegex)
    self._simple_time_of_today_before_regex = build(
        res.SimpleTimeOfTodayBeforeRegex)
    self._specific_time_of_day_regex = build(res.SpecificTimeOfDayRegex)
    self._the_end_of_regex = build(res.TheEndOfRegex)
    self._unit_regex = build(res.UnitRegex)
    self.next_prefix_regex = build(res.NextPrefixRegex)
    self.past_prefix_regex = build(res.PastPrefixRegex)

    # Each name is read from ``config`` and stored as ``self._<name>``.
    shared_components = (
        'date_extractor', 'time_extractor',
        'date_parser', 'time_parser',
        'numbers',
        'cardinal_extractor', 'number_parser',
        'duration_extractor', 'duration_parser',
        'unit_map', 'utility_configuration',
    )
    for component in shared_components:
        setattr(self, '_' + component, getattr(config, component))
def adjust_by_prefix(self, prefix: str, adjust: AdjustParams):
    """Apply a minute offset described by *prefix* to *adjust*.

    :param prefix: text before the hour (e.g. starting with ``half`` or
        ``quarter``); stripped and lower-cased before inspection.
    :param adjust: mutable parameters; ``minute``, ``hour`` and
        ``has_minute`` are updated in place.

    A prefix ending in ``to`` makes the offset negative; a negative minute
    result borrows one hour.
    """
    delta_min = 0
    prefix = prefix.strip().lower()

    if prefix.startswith('half'):
        delta_min = 30
    elif prefix.startswith(('a quarter', 'quarter')):
        delta_min = 15
    elif prefix.startswith('three quarter'):
        delta_min = 45
    else:
        match = regex.search(self.less_than_one_hour, prefix)
        # Guard against an unrecognised prefix: without a match there is no
        # group to read, so the offset stays 0 instead of failing on the
        # missing group value below.
        if match:
            min_str = RegExpUtility.get_group(match, 'deltamin')
            if min_str:
                delta_min = int(min_str)
            else:
                min_str = RegExpUtility.get_group(
                    match, 'deltaminnum').lower()
                delta_min = self.numbers[min_str]

    if prefix.endswith('to'):
        delta_min = delta_min * -1

    adjust.minute += delta_min

    if adjust.minute < 0:
        adjust.minute += 60
        adjust.hour -= 1

    adjust.has_minute = True
def adjust_by_prefix(self, prefix: str, adjust: AdjustParams):
    """Apply a minute offset described by *prefix* to *adjust*.

    :param prefix: text before the hour.  The fixed-token patterns are
        searched in the raw prefix; the generic pattern and the ``zum``
        check use the stripped, lower-cased form.
    :param adjust: mutable parameters; ``minute``, ``hour`` and
        ``has_minute`` are updated in place.
    """
    trimmed_prefix = prefix.strip().lower()

    # @todo Move hardcoded strings to resource YAML file.
    # First matching token pattern decides the offset.
    token_offsets = (
        (self._half_token_regex, -30),
        (self._quarter_to_token_regex, -15),
        (self._quarter_past_token_regex, 15),
        (self._three_quarter_to_token_regex, -45),
        (self._three_quarter_past_token_regex, 45),
    )

    delta_min = 0
    for token_regex, offset in token_offsets:
        if regex.search(token_regex, prefix):
            delta_min = offset
            break
    else:
        match = regex.search(self.less_than_one_hour, trimmed_prefix)
        if match:
            min_str = RegExpUtility.get_group(match, 'deltamin')
            if min_str:
                delta_min = int(min_str)
            else:
                min_str = RegExpUtility.get_group(
                    match, 'deltaminnum').lower()
                # NOTE(review): ``get`` yields None for an unknown word,
                # which would fail in the arithmetic below - confirm the
                # pattern only captures known number words.
                delta_min = self.numbers.get(min_str)

    if trimmed_prefix.startswith('zum'):
        delta_min = delta_min * -1

    adjust.minute += delta_min

    if adjust.minute < 0:
        adjust.minute += 60
        adjust.hour -= 1

    adjust.has_minute = True
def adjust_by_suffix(self, suffix: str, adjust: AdjustParams):
    """Shift ``adjust.hour`` and set the am/pm flags from a time suffix.

    :param suffix: text following the time; stripped and lower-cased
        before matching.
    :param adjust: mutable parameters; ``hour``, ``has_am`` and ``has_pm``
        are updated in place.

    Only a match of ``time_suffix`` covering the whole suffix, with an empty
    ``heures`` group, changes anything besides the final modulo.
    """
    suffix = suffix.strip().lower()
    hour_shift = 0

    match = regex.match(self.time_suffix, suffix)
    covers_suffix = bool(match) and match.group() == suffix

    if covers_suffix and not RegExpUtility.get_group(match, 'heures'):
        if RegExpUtility.get_group(match, 'am'):
            if adjust.hour >= 12:
                hour_shift -= 12
            adjust.has_am = True

        if RegExpUtility.get_group(match, 'pm'):
            if adjust.hour < 12:
                hour_shift = 12
            adjust.has_pm = True

    adjust.hour = (adjust.hour + hour_shift) % 24
def _parse_week_of_month(self, source: str, reference: datetime) -> DateTimeResolutionResult:
    """Resolve a "<cardinal> week of <month>" style expression.

    :param source: text that must be matched in full by
        ``config.week_of_month_regex``.
    :param reference: date the relative parts are resolved against.
    :return: an empty result when *source* is not fully matched, otherwise
        whatever ``_get_week_of_month`` returns.
    """
    match = self.config.week_of_month_regex.search(source)
    if not match or len(match.group()) != len(source):
        return DateTimeResolutionResult()

    cardinal_str = RegExpUtility.get_group(match, 'cardinal')
    month_str = RegExpUtility.get_group(match, 'month')

    # "last" is represented as week number 5.
    if self.config.is_last_cardinal(cardinal_str):
        cardinal = 5
    else:
        cardinal = self.config.cardinal_map.get(cardinal_str)

    if month_str:
        # Explicit month name: the year is left open.
        month = self.config.month_of_year.get(month_str)
        year = reference.year
        no_year = True
    else:
        # Relative month: shift the reference by the detected swift.
        swift = self.config.get_swift_day_or_month(source)
        shifted = reference + datedelta(months=swift)
        month = shifted.month
        year = shifted.year
        no_year = False

    return self._get_week_of_month(cardinal, month, year, reference, no_year)
def __init__(self, placeholder: str = FrenchNumeric.PlaceHolderDefault):
    """Build the French integer patterns.

    :param placeholder: pattern fragment passed to the placeholder-aware
        resources and to ``_generate_format_regex``.
    """
    build = RegExpUtility.get_safe_reg_exp
    digit_type = 'IntegerNum'
    word_type = f'Integer{FrenchNumeric.LangMarker}'

    self.__regexes = [
        ReVal(
            re=build(FrenchNumeric.NumbersWithPlaceHolder(placeholder),
                     regex.I),
            val=digit_type),
        ReVal(
            re=build(FrenchNumeric.NumbersWithSuffix, regex.S),
            val=digit_type),
        ReVal(
            re=build(
                self._generate_format_regex(
                    LongFormatMode.INTEGER_DOT, placeholder),
                regex.V1),
            val=digit_type),
        ReVal(
            re=build(self._generate_format_regex(
                LongFormatMode.INTEGER_BLANK, placeholder)),
            val=digit_type),
        ReVal(
            re=build(self._generate_format_regex(
                LongFormatMode.INTEGER_NO_BREAK_SPACE, placeholder)),
            val=digit_type),
        ReVal(
            re=build(FrenchNumeric.RoundNumberIntegerRegexWithLocks),
            val=digit_type),
        ReVal(
            re=build(FrenchNumeric.NumbersWithDozenSuffix),
            val=digit_type),
        ReVal(
            re=build(FrenchNumeric.AllIntRegexWithLocks),
            val=word_type),
        ReVal(
            re=build(FrenchNumeric.AllIntRegexWithDozenSuffixLocks),
            val=word_type),
    ]
def __init__(self):
    """Create the English sub-extractors and compile the merge patterns."""
    build = RegExpUtility.get_safe_reg_exp
    res = EnglishDateTime

    # One base extractor per entity kind, each with its English config.
    self._integer_extractor = EnglishIntegerExtractor()
    self._date_extractor = BaseDateExtractor(
        EnglishDateExtractorConfiguration())
    self._time_extractor = BaseTimeExtractor(
        EnglishTimeExtractorConfiguration())
    self._duration_extractor = BaseDurationExtractor(
        EnglishDurationExtractorConfiguration())
    self._date_period_extractor = BaseDatePeriodExtractor(
        EnglishDatePeriodExtractorConfiguration())
    self._time_period_extractor = BaseTimePeriodExtractor(
        EnglishTimePeriodExtractorConfiguration())
    self._date_time_extractor = BaseDateTimeExtractor(
        EnglishDateTimeExtractorConfiguration())
    self._date_time_period_extractor = BaseDateTimePeriodExtractor(
        EnglishDateTimePeriodExtractorConfiguration())
    self._set_extractor = BaseSetExtractor(
        EnglishSetExtractorConfiguration())
    self._holiday_extractor = BaseHolidayExtractor(
        EnglishHolidayExtractorConfiguration())

    self._after_regex = build(res.AfterRegex)
    self._before_regex = build(res.BeforeRegex)
    self._since_regex = build(res.SinceRegex)
    self._from_to_regex = build(res.FromToRegex)
    self._single_ambiguous_month_regex = build(
        res.SingleAmbiguousMonthRegex)
    self._preposition_suffix_regex = build(res.PrepositionSuffixRegex)
    self._ambiguous_range_modifier_prefix = build(
        res.AmbiguousRangeModifierPrefix)
    self._number_ending_pattern = build(res.NumberEndingPattern)
    self._filter_word_regex_list = [build(res.OneOnOneRegex)]
def __init__(self, placeholder):
    """Build the French floating-point patterns.

    :param placeholder: pattern fragment passed to the placeholder-aware
        resources and to ``_generate_format_regex``.
    """
    # (pattern source, result type) in matching order.
    typed_patterns = [
        (FrenchNumeric.DoubleDecimalPointRegex(placeholder), 'DoubleNum'),
        (FrenchNumeric.DoubleWithoutIntegralRegex(placeholder), 'DoubleNum'),
        (self._generate_format_regex(
            LongFormatMode.DOUBLE_DOT_COMMA, placeholder), 'DoubleNum'),
        (self._generate_format_regex(
            LongFormatMode.DOUBLE_NO_BREAK_SPACE_COMMA, placeholder),
         'DoubleNum'),
        (FrenchNumeric.DoubleWithMultiplierRegex, 'DoubleNum'),
        (FrenchNumeric.DoubleWithRoundNumber, 'DoubleNum'),
        (FrenchNumeric.DoubleAllFloatRegex, 'DoubleFr'),
        (FrenchNumeric.DoubleExponentialNotationRegex, 'DoublePow'),
        (FrenchNumeric.DoubleCaretExponentialNotationRegex, 'DoublePow'),
    ]

    self.__regexes = [
        ReVal(re=RegExpUtility.get_safe_reg_exp(pattern), val=type_name)
        for pattern, type_name in typed_patterns
    ]
def _merge_two_times_points(self, source: str, reference: datetime) -> DateTimeResolutionResult:
    """Build a date range from two date expressions found in *source*.

    :param source: text expected to contain at least two dates.
    :param reference: date the extraction and parsing are relative to.
    :return: an unsuccessful result when fewer than two dates are extracted
        or parsed; otherwise a result whose values are the
        ``[begin, end]`` pairs of the first two parsed dates.
    """
    result = DateTimeResolutionResult()
    trimmed_source = source.strip()
    extractor = self.config.date_extractor

    ers = extractor.extract(trimmed_source, reference)
    if not ers or len(ers) < 2:
        # Retry with the "token before date" prepended, then shift the
        # reported starts back by the length of that token.
        lead_in = self.config.token_before_date
        ers = extractor.extract(lead_in + trimmed_source, reference)
        for er in ers:
            er.start -= len(self.config.token_before_date)

        if not ers or len(ers) < 2:
            return result

    match = self.config.week_with_week_day_range_regex.search(source)
    week_prefix = RegExpUtility.get_group(match, 'week') if match else None
    if week_prefix:
        # Both endpoints share the week qualifier.
        ers[0].text = f'{week_prefix} {ers[0].text}'
        ers[1].text = f'{week_prefix} {ers[1].text}'

    prs = []
    for er in ers:
        parsed = self.config.date_parser.parse(er, reference)
        if parsed:
            prs.append(parsed)

    if len(prs) < 2:
        return result

    pr_begin, pr_end = prs[0], prs[1]
    future_begin = pr_begin.value.future_value
    future_end = pr_end.value.future_value
    past_begin = pr_begin.value.past_value
    past_end = pr_end.value.past_value

    result.sub_date_time_entities = prs
    span_days = (future_end - future_begin).days
    result.timex = f'({pr_begin.timex_str},{pr_end.timex_str},P{span_days}D)'
    result.future_value = [future_begin, future_end]
    result.past_value = [past_begin, past_end]
    result.success = True

    return result
def extend_with_week_day_and_year(self, start_index: int, end_index: int, month: int, day: int, text: str, reference: datetime):
    """Widen a date span to include an adjacent year and/or weekday.

    :param start_index: start of the date span within *text*.
    :param end_index: end of the date span within *text*.
    :param month: month of the extracted date.
    :param day: day of the extracted date.
    :param text: full text being scanned.
    :param reference: reference date; its year is used to build the date.
    :return: ``(start_index, end_index)`` after any extension.
    """
    from .utilities import DateUtils
    import calendar

    year = reference.year

    # Check whether there's a year
    suffix = text[end_index:]
    prefix = text[0: start_index]
    year_index, success = self.get_year_index(suffix, year, False)
    end_index += year_index

    # Check also in prefix.  This step used to scan ``suffix`` a second time
    # with the same flag, so a year before the date was never found even
    # though the result is subtracted from ``start_index``.
    # NOTE(review): the third argument is presumed to select prefix mode -
    # confirm against get_year_index.
    if not success and self.config.check_both_before_after:
        year_index, success = self.get_year_index(prefix, year, True)
        start_index -= year_index

    date = DateUtils.safe_create_from_value(
        DateUtils.min_value, year, month, day)

    # Check whether there's a weekday: first before the date, then after it.
    is_match_in_suffix = False
    match_week_day = self.config.week_day_end.match(prefix)
    if not match_week_day:
        match_week_day = self.config.week_day_start.match(suffix)
        is_match_in_suffix = bool(match_week_day)

    if match_week_day:
        # Get weekday from context directly, compare it with the weekday
        # of the date above to see whether they reference the same weekday
        extracted_week_day_str = RegExpUtility.get_group(
            match_week_day, Constants.WEEKDAY_GROUP_NAME)
        num_week_day_str = calendar.day_name[date.weekday()].lower()

        week_day_1 = self.config.day_of_week.get(num_week_day_str)
        week_day_2 = self.config.day_of_week.get(extracted_week_day_str)

        # Presence check rather than truthiness, so a weekday that maps to
        # 0 is not treated as missing.
        if week_day_1 is not None and week_day_2 is not None:
            if not date == DateUtils.min_value and week_day_1 == week_day_2:
                if not is_match_in_suffix:
                    start_index = match_week_day.start()
                else:
                    end_index += match_week_day.end()

    return start_index, end_index
def __init__(self):
    """Compile the English date-time utility patterns and cache them.

    Each attribute stores the result of ``RegExpUtility.get_safe_reg_exp``
    applied to the ``EnglishDateTime`` resource of the matching name.
    """
    build = RegExpUtility.get_safe_reg_exp
    resources = EnglishDateTime

    self._later_regex = build(resources.LaterRegex)
    self._ago_regex = build(resources.AgoRegex)
    self._in_connector_regex = build(resources.InConnectorRegex)
    self._range_unit_regex = build(resources.RangeUnitRegex)
    self._am_desc_regex = build(resources.AmDescRegex)
    # NOTE: the doubled underscore is part of the existing attribute name;
    # it is kept because code outside this method may read it.
    self._pm_desc__regex = build(resources.PmDescRegex)
    self._am_pm_desc_regex = build(resources.AmPmDescRegex)
def __init__(self, mode: ChineseNumberExtractorMode = ChineseNumberExtractorMode.DEFAULT):
    """Build the Chinese integer patterns.

    :param mode: ``DEFAULT`` adds the allow-list pattern, ``EXTRACT_ALL``
        adds the aggressive pattern; any other value adds neither.
    """
    digit_type = 'IntegerNum'
    word_type = f'Integer{ChineseNumeric.LangMarker}'

    # (pattern source, result type) in matching order.
    typed_patterns = [
        (ChineseNumeric.NumbersSpecialsChars, digit_type),
        (ChineseNumeric.NumbersSpecialsCharsWithSuffix, digit_type),
        (ChineseNumeric.DottedNumbersSpecialsChar, digit_type),
        (ChineseNumeric.NumbersWithHalfDozen, word_type),
        (ChineseNumeric.NumbersWithDozen, word_type),
        (ChineseNumeric.HalfUnitRegex, word_type),
    ]

    if mode == ChineseNumberExtractorMode.DEFAULT:
        typed_patterns.append(
            (ChineseNumeric.NumbersWithAllowListRegex, word_type))
    elif mode == ChineseNumberExtractorMode.EXTRACT_ALL:
        typed_patterns.append(
            (ChineseNumeric.NumbersAggressiveRegex, word_type))

    self.__regexes = [
        ReVal(re=RegExpUtility.get_safe_reg_exp(pattern), val=type_name)
        for pattern, type_name in typed_patterns
    ]
def extend_with_week_day_and_year(self, start_index: int, end_index: int, month: int, day: int, text: str, reference: datetime):
    """Widen a date span to include a following year and a leading weekday.

    :param start_index: start of the date span within *text*.
    :param end_index: end of the date span within *text*.
    :param month: month of the extracted date.
    :param day: day of the extracted date.
    :param text: full text being scanned.
    :param reference: reference date; supplies the year when none follows.
    :return: ``(start_index, end_index)`` after any extension.
    """
    from .abstract_year_extractor import AbstractYearExtractor
    from .utilities import DateUtils
    import calendar

    year = reference.year

    # Check whether there's a year
    suffix = text[end_index:]
    match_year = self.config.year_suffix.match(suffix)

    if match_year and match_year.start() == 0:
        year = AbstractYearExtractor.get_year_from_text(self, match_year)

        if Constants.MIN_YEAR_NUM <= year <= Constants.MAX_YEAR_NUM:
            end_index += len(match_year.group())

    date = DateUtils.safe_create_from_value(
        DateUtils.min_value, year, month, day)

    # Check whether there's a weekday
    prefix = text[:start_index]
    match_week_day = self.config.week_day_end.match(prefix)

    if match_week_day:
        # Get weekday from context directly, compare it with the weekday
        # of the date above to see whether they reference the same weekday
        extracted_week_day_str = RegExpUtility.get_group(
            match_week_day, Constants.WEEKDAY_GROUP_NAME)
        num_week_day_str = calendar.day_name[date.weekday()].lower()

        week_day_1 = self.config.day_of_week.get(num_week_day_str)
        week_day_2 = self.config.day_of_week.get(extracted_week_day_str)

        # Presence check rather than truthiness, so a weekday that maps to
        # 0 is not treated as missing.
        if week_day_1 is not None and week_day_2 is not None:
            if not date == DateUtils.min_value and week_day_1 == week_day_2:
                # Extend the span back to where the weekday begins.  Using
                # the match end left the weekday outside the span, which
                # made this extension a no-op.
                start_index = match_week_day.start()

    return start_index, end_index
def __init__(self, config):
    """Load the Spanish holiday patterns and lookup tables.

    :param config: accepted for interface compatibility; not read here.
    """
    super().__init__()
    build = RegExpUtility.get_safe_reg_exp
    res = SpanishDateTime

    holiday_sources = (res.HolidayRegex1, res.HolidayRegex2, res.HolidayRegex3)
    self._holiday_regexes = [build(pattern) for pattern in holiday_sources]

    self._holiday_names = res.HolidayNames
    self._variable_holidays_timex_dictionary = res.VariableHolidaysTimexDictionary

    self.next_prefix_regex = build(res.NextPrefixRegex)
    self.past_prefix_regex = build(res.PastPrefixRegex)
    self.this_prefix_regex = build(res.ThisPrefixRegex)
def parse_each_unit(self, source: str) -> DateTimeResolutionResult:
    """Resolve an "each <unit>" expression into a set timex.

    :param source: text that must be matched in full by
        ``config.each_unit_regex``.
    :return: a successful result whose future and past values are
        ``'Set: ' + timex``; an empty result when the text is not fully
        matched, the unit is unknown, or no timex matches the unit.
    """
    result = DateTimeResolutionResult()

    match = self.config.each_unit_regex.search(source)
    if not match or (match.end() - match.start()) != len(source):
        return result

    source_unit = RegExpUtility.get_group(match, 'unit')
    if not source_unit or source_unit not in self.config.unit_map:
        return result

    unit_timex = self.config.get_matched_unit_timex(source_unit)
    if not unit_timex.matched:
        return result

    result.timex = unit_timex.timex
    resolution = 'Set: ' + result.timex
    result.future_value = resolution
    result.past_value = resolution
    result.success = True

    return result