def __init__(self):
    """Initialise the parser from ChineseDatePeriodParserConfiguration.

    Besides delegating to the base initialiser, this creates the integer
    extractor / number parser pair and compiles every regex the instance
    uses from the pattern strings declared on ``ChineseDateTime``.
    """
    super().__init__(ChineseDatePeriodParserConfiguration())

    # Short alias: every pattern below goes through the same safe compiler.
    compile_pattern = RegExpUtility.get_safe_reg_exp

    # Number helpers.
    self.integer_extractor = ChineseIntegerExtractor()
    self.number_parser = CJKNumberParser(ChineseNumberParserConfiguration())

    # Year / unit / month patterns.
    self.year_in_chinese_regex = compile_pattern(
        ChineseDateTime.DatePeriodYearInChineseRegex)
    self.number_combined_with_unit_regex = compile_pattern(
        ChineseDateTime.NumberCombinedWithUnit)
    self.unit_regex = compile_pattern(ChineseDateTime.UnitRegex)
    self.year_and_month_regex = compile_pattern(ChineseDateTime.YearAndMonth)
    self.pure_number_year_and_month_regex = compile_pattern(
        ChineseDateTime.PureNumYearAndMonth)

    # Year-range patterns.
    self.year_to_year_regex = compile_pattern(ChineseDateTime.YearToYear)
    self.year_to_year_suffix_required = compile_pattern(
        ChineseDateTime.YearToYearSuffixRequired)
    # Compiled from the same pattern as ``year_in_chinese_regex``.
    self.chinese_year_regex = compile_pattern(
        ChineseDateTime.DatePeriodYearInChineseRegex)

    # Season / decade patterns.
    self.season_with_year_regex = compile_pattern(
        ChineseDateTime.SeasonWithYear)
    self.decade_regex = compile_pattern(ChineseDateTime.DecadeRegex)

    # this / last / next period patterns.
    self.date_this_regex = compile_pattern(
        ChineseDateTime.DatePeriodThisRegex)
    self.date_last_regex = compile_pattern(
        ChineseDateTime.DatePeriodLastRegex)
    self.date_next_regex = compile_pattern(
        ChineseDateTime.DatePeriodNextRegex)
def __init__(self):
    """Initialise the holiday parser and its fixed-holiday lookup table.

    The base initialiser receives a fresh ChineseHolidayParserConfiguration.
    The instance then gets the lunar-holiday regex, an integer extractor,
    an integer number parser, and a mapping from holiday names to the
    ``ChineseHolidayParser`` callables that resolve them.
    """
    BaseHolidayParser.__init__(self, ChineseHolidayParserConfiguration())

    self.__lunar_holiday_regex = RegExpUtility.get_safe_reg_exp(
        ChineseDateTime.LunarHolidayRegex)
    self.__integer_extractor = ChineseIntegerExtractor()
    self.__number_parser = AgnosticNumberParserFactory.get_parser(
        AgnosticNumberParserType.INTEGER,
        ChineseNumberParserConfiguration())

    # Several names are aliases that share one resolver.
    holidays = ChineseHolidayParser
    self.__fixed_holiday_dictionary = {
        '元旦': holidays.new_year,
        '元旦节': holidays.new_year,
        '教师节': holidays.teacher_day,
        '青年节': holidays.youth_day,
        '儿童节': holidays.children_day,
        '妇女节': holidays.female_day,
        '植树节': holidays.tree_plant_day,
        '情人节': holidays.lover_day,
        '平安夜': holidays.christmas_eve,
        '圣诞节': holidays.christmas_day,
        '新年': holidays.new_year,
        '愚人节': holidays.fool_day,
        '五一': holidays.labor_day,
        '劳动节': holidays.labor_day,
        '万圣节': holidays.halloween_day,
        '中秋节': holidays.midautumn_day,
        '中秋': holidays.midautumn_day,
        '春节': holidays.spring_day,
        '除夕': holidays.new_year_eve,
        '元宵节': holidays.lantern_day,
        '清明节': holidays.qing_ming_day,
        '清明': holidays.qing_ming_day,
        '端午节': holidays.dragon_boat_day,
        '端午': holidays.dragon_boat_day,
        '国庆节': holidays.chs_national_day,
        '建军节': holidays.chs_mil_build_day,
        '女生节': holidays.girls_day,
        '光棍节': holidays.singles_day,
        '双十一': holidays.singles_day,
        '重阳节': holidays.chong_yang_day,
    }
def __init__(self):
    """Populate the configuration from the ``ChineseDateTime`` definitions.

    Compiles the ordered list of date regexes plus the individual regexes,
    copies the lookup maps, and creates the integer extractor and number
    parser.
    """
    compile_pattern = RegExpUtility.get_safe_reg_exp
    definitions = ChineseDateTime

    # Patterns 1-5 always come first, in this order.
    ordered_patterns = [
        definitions.DateRegexList1,
        definitions.DateRegexList2,
        definitions.DateRegexList3,
        definitions.DateRegexList4,
        definitions.DateRegexList5,
    ]
    # Patterns 6 and 7 swap places depending on the language fallback.
    if definitions.DefaultLanguageFallback == Constants.DEFAULT_LANGUAGE_FALLBACK_DMY:
        ordered_patterns += [definitions.DateRegexList7,
                             definitions.DateRegexList6]
    else:
        ordered_patterns += [definitions.DateRegexList6,
                             definitions.DateRegexList7]
    # Pattern 8 is always last.
    ordered_patterns.append(definitions.DateRegexList8)
    self._date_regex = [compile_pattern(pattern)
                        for pattern in ordered_patterns]

    # Lookup tables for month, day-of-month and weekday words.
    self._month_of_year = definitions.ParserConfigurationMonthOfYear
    self._day_of_month = definitions.ParserConfigurationDayOfMonth
    self._day_of_week = definitions.ParserConfigurationDayOfWeek

    # Special-day and this/next/last regexes.
    self._special_day_regex = compile_pattern(definitions.SpecialDayRegex)
    self._special_day_with_num_regex = compile_pattern(
        definitions.SpecialDayWithNumRegex)
    self._this_regex = compile_pattern(definitions.DateThisRegex)
    self._next_regex = compile_pattern(definitions.DateNextRegex)
    self._last_regex = compile_pattern(definitions.DateLastRegex)

    # Unit handling.
    self._unit_regex = compile_pattern(definitions.DateUnitRegex)
    self._unit_map = definitions.ParserConfigurationUnitMap

    # Weekday regexes.
    self._week_day_of_month_regex = compile_pattern(
        definitions.WeekDayOfMonthRegex)
    self._week_day_regex = compile_pattern(definitions.WeekDayRegex)

    # Dynasty-year support.
    self._dynasty_year_regex = compile_pattern(definitions.DynastyYearRegex)
    self._dynasty_year_map = definitions.DynastyYearMap

    # Number helpers; the date extractor starts out unset.
    self._integer_extractor = ChineseIntegerExtractor()
    self._number_parser = CJKNumberParser(ChineseNumberParserConfiguration())
    self._date_extractor = None
    self._dynasty_start_year = definitions.DynastyStartYear
class ChineseDateParser(BaseDateParser):
    """Resolve extracted Chinese date expressions into concrete dates.

    ``parse`` tries, in order: the configured full-date regexes, implicit
    dates (relative day-of-month, special days, weekdays), the inherited
    ``parse_weekday_of_month`` and finally "<duration> ago / later" forms.
    """

    # Class-level extractor shared by all instances for written integers.
    integer_extractor = ChineseIntegerExtractor()

    def __init__(self):
        super().__init__(ChineseDateParserConfiguration())
        self.lunar_regex = RegExpUtility.get_safe_reg_exp(
            ChineseDateTime.LunarRegex)
        self.special_date_regex = RegExpUtility.get_safe_reg_exp(
            ChineseDateTime.SpecialDate)
        self.token_next_regex = RegExpUtility.get_safe_reg_exp(
            ChineseDateTime.NextPrefixRegex)
        self.token_last_regex = RegExpUtility.get_safe_reg_exp(
            ChineseDateTime.LastPrefixRegex)
        # Days per month for a leap year; get_month_max_day() subtracts one
        # from February in non-leap years.
        self.month_max_days: List[int] = [
            31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31
        ]
        self.duration_extractor = ChineseDurationExtractor()

    def parse(self, source: ExtractResult, reference: datetime = None) -> Optional[DateTimeParseResult]:
        """Parse an extracted date into a ``DateTimeParseResult``.

        :param source: extraction result to resolve.
        :param reference: reference date; defaults to ``datetime.now()``.
        :return: a parse result whose ``value`` is the resolution result on
            success and ``None`` otherwise (``timex_str`` is then ``''``).
        """
        if reference is None:
            reference = datetime.now()

        result_value: Optional[DateTimeResolutionResult] = None

        # Compare by value: identity (`is`) on strings only works by
        # accident of interning.
        if source.type == self.parser_type_name:
            source_text = source.text.lower()

            inner_result = self.parse_basic_regex_match(source_text, reference)
            if not inner_result.success:
                inner_result = self.parse_implicit_date(source_text, reference)
            if not inner_result.success:
                inner_result = self.parse_weekday_of_month(
                    source_text, reference)
            if not inner_result.success:
                inner_result = self.parser_duration_with_ago_and_later(
                    source_text, reference)

            if inner_result.success:
                inner_result.future_resolution = {
                    TimeTypeConstants.DATE: DateTimeFormatUtil.format_date(
                        inner_result.future_value)
                }
                inner_result.past_resolution = {
                    TimeTypeConstants.DATE: DateTimeFormatUtil.format_date(
                        inner_result.past_value)
                }
                inner_result.is_lunar = self.__parse_lunar_calendar(
                    source_text)
                result_value = inner_result

        result = DateTimeParseResult(source)
        result.value = result_value
        result.timex_str = result_value.timex if result_value is not None else ''
        result.resolution_str = ''
        return result

    def __parse_lunar_calendar(self, source: str) -> bool:
        """Return True when the lunar regex matches at the start of the stripped text."""
        return regex.match(self.lunar_regex, source.strip()) is not None

    @staticmethod
    def _covers_whole_text(match, text: str) -> bool:
        """Return True when ``match`` starts at 0 and spans all of ``text``."""
        return bool(match) and match.start() == 0 \
            and len(match.group()) == len(text)

    def parse_basic_regex_match(self, source: str, reference: datetime) -> DateTimeResolutionResult:
        """Resolve text that one of the configured date regexes matches entirely.

        The first regex in ``self.config.date_regex`` covering the whole
        stripped text wins; an unsuccessful result is returned if none does.
        """
        trimmed_source = source.strip()
        result = DateTimeResolutionResult()

        for regexp in self.config.date_regex:
            match = regex.search(regexp, trimmed_source)
            if self._covers_whole_text(match, trimmed_source):
                result = self.match_to_date(match, reference)
                break

        return result

    def parse_implicit_date(self, source: str, reference: datetime) -> DateTimeResolutionResult:
        """Resolve dates given relative to ``reference``.

        Handles, in order: relative day-of-month expressions, special days
        (today, the day before yesterday, ...), "this/next/last <weekday>"
        and a bare weekday. Returns an unsuccessful result if nothing
        matches.
        """
        trimmed_source = source.strip()
        result = DateTimeResolutionResult()

        # handle "十二日" "明年这个月三日" "本月十一日"
        # (day of month, optionally with a relative month / year)
        match = regex.match(self.special_date_regex, trimmed_source)
        if match:
            return self._parse_special_date(match, reference)

        # handle "today", "the day before yesterday"
        match = regex.match(self.config.special_day_regex, trimmed_source)
        if self._covers_whole_text(match, trimmed_source):
            swift = self.config.get_swift_day(match.group())
            value = reference + timedelta(days=swift)
            result.timex = DateTimeFormatUtil.luis_date_from_datetime(value)
            result.future_value = result.past_value = DateUtils.safe_create_from_min_value(
                value.year, value.month, value.day)
            result.success = True
            return result

        # handle "this Friday", "next Sunday", "last Friday" — the three
        # cases differ only in the regex and the DateUtils resolver.
        relative_weekday_cases = (
            (self.config.this_regex, DateUtils.this),
            (self.config.next_regex, DateUtils.next),
            (self.config.last_regex, DateUtils.last),
        )
        for weekday_regex, resolve in relative_weekday_cases:
            match = regex.match(weekday_regex, trimmed_source)
            if self._covers_whole_text(match, trimmed_source):
                weekday_str = RegExpUtility.get_group(match, 'weekday')
                value = resolve(reference,
                                self.config.day_of_week.get(weekday_str))
                result.timex = DateTimeFormatUtil.luis_date_from_datetime(
                    value)
                result.future_value = value
                result.past_value = value
                result.success = True
                return result

        # handle "Friday"
        match = regex.match(self.config.week_day_regex, trimmed_source)
        if self._covers_whole_text(match, trimmed_source):
            weekday_str = RegExpUtility.get_group(match, 'weekday')
            weekday = self.config.day_of_week.get(weekday_str)
            value = DateUtils.this(reference, weekday)

            # Map 0 to 7 so the value is comparable with isoweekday().
            if weekday == 0:
                weekday = 7
            if weekday < reference.isoweekday():
                value = DateUtils.next(reference, weekday)

            result.timex = 'XXXX-WXX-' + str(weekday)

            future_date = value
            past_date = value
            if future_date < reference:
                future_date += timedelta(weeks=1)
            if past_date >= reference:
                past_date -= timedelta(weeks=1)

            result.future_value = future_date
            result.past_value = past_date
            result.success = True
            return result

        return result

    def _parse_special_date(self, match, reference: datetime) -> DateTimeResolutionResult:
        """Resolve a ``special_date_regex`` match (relative day of month)."""
        result = DateTimeResolutionResult()

        year_str = RegExpUtility.get_group(match, 'thisyear')
        month_str = RegExpUtility.get_group(match, 'thismonth')
        day_str = RegExpUtility.get_group(match, 'day')

        month = reference.month
        day = self.config.day_of_month[day_str]
        year = reference.year

        has_year = False
        has_month = False

        if month_str:
            has_month = True
            if regex.search(self.token_next_regex, month_str):
                month += 1
                if month == Constants.MAX_MONTH + 1:
                    month = Constants.MIN_MONTH
                    year += 1
            elif regex.search(self.token_last_regex, month_str):
                month -= 1
                if month == Constants.MIN_MONTH - 1:
                    month = Constants.MAX_MONTH
                    year -= 1

            # A relative year is only honoured together with a month.
            if year_str:
                has_year = True
                if regex.search(self.token_next_regex, year_str):
                    year += 1
                elif regex.search(self.token_last_regex, year_str):
                    year -= 1

        result.timex = DateTimeFormatUtil.luis_date(
            year if has_year else -1, month if has_month else -1, day)

        if day > self.get_month_max_day(year, month):
            # The day does not exist in the target month: fall back to the
            # neighbouring months.
            future_month = month + 1
            past_month = month - 1
            future_year = year
            past_year = year

            if future_month == Constants.MAX_MONTH + 1:
                future_month = Constants.MIN_MONTH
                future_year = year + 1
            if past_month == Constants.MIN_MONTH - 1:
                past_month = Constants.MAX_MONTH
                past_year = year - 1

            is_future_valid = DateUtils.is_valid_date(
                future_year, future_month, day)
            is_past_valid = DateUtils.is_valid_date(
                past_year, past_month, day)

            if is_future_valid and is_past_valid:
                future_date = DateUtils.safe_create_from_min_value(
                    future_year, future_month, day)
                past_date = DateUtils.safe_create_from_min_value(
                    past_year, past_month, day)
            elif is_future_valid:
                future_date = past_date = DateUtils.safe_create_from_min_value(
                    future_year, future_month, day)
            elif is_past_valid:
                # The original tested "neither valid" here, which left the
                # "only past valid" case to the generic fallback.
                future_date = past_date = DateUtils.safe_create_from_min_value(
                    past_year, past_month, day)
            else:
                # Neither neighbour works; fall back to the normal case.
                future_date = past_date = DateUtils.safe_create_from_min_value(
                    year, month, day)
        else:
            future_date = DateUtils.safe_create_from_min_value(
                year, month, day)
            past_date = DateUtils.safe_create_from_min_value(
                year, month, day)

            if not has_month:
                # Only a day was given: roll by whole months around the
                # reference date.
                if future_date < reference:
                    if self.is_valid_date(year, month + 1, day):
                        future_date += datedelta(months=1)
                if past_date >= reference:
                    if self.is_valid_date(year, month - 1, day):
                        past_date += datedelta(months=-1)
                    elif DateUtils.is_Feb_29th(year, month - 1, day):
                        past_date += datedelta(months=-2)
            elif not has_year:
                # Month given without a year: roll by whole years.
                if future_date < reference:
                    if self.is_valid_date(year + 1, month, day):
                        future_date += datedelta(years=1)
                if past_date >= reference:
                    if self.is_valid_date(year - 1, month, day):
                        past_date += datedelta(years=-1)

        result.future_value = future_date
        result.past_value = past_date
        result.success = True
        return result

    def match_to_date(self, match, reference: datetime) -> DateTimeResolutionResult:
        """Build a resolution result from a date-regex match.

        Reads the ``year``, ``yearchs``, ``month`` and ``day`` groups. When
        no year can be determined the reference year is used and the timex
        carries ``-1`` for the year.
        """
        result = DateTimeResolutionResult()

        year_str = RegExpUtility.get_group(match, 'year')
        year_chs = RegExpUtility.get_group(match, 'yearchs')
        month_str = RegExpUtility.get_group(match, 'month')
        day_str = RegExpUtility.get_group(match, 'day')

        month = 0
        day = 0

        year_tmp = self.convert_chinese_year_to_number(year_chs)
        year = 0 if year_tmp == -1 else year_tmp

        if month_str in self.config.month_of_year and day_str in self.config.day_of_month:
            month = self.get_month_of_year(month_str)
            day = self.get_day_of_month(day_str)

            if year_str.strip():
                year = int(year_str) if year_str.isnumeric() else 0
                # Expand two-digit years into the 1900s or 2000s.
                if year < 100 and year >= Constants.MIN_TWO_DIGIT_YEAR_PAST_NUM:
                    year += 1900
                elif year < 100 and year < Constants.MAX_TWO_DIGIT_YEAR_FUTURE_NUM:
                    year += 2000

        no_year = False
        if year == 0:
            year = reference.year
            result.timex = DateTimeFormatUtil.luis_date(-1, month, day)
            no_year = True
        else:
            result.timex = DateTimeFormatUtil.luis_date(year, month, day)

        future_date, past_date = DateUtils.generate_dates(
            no_year, reference, year, month, day)

        result.future_value = future_date
        result.past_value = past_date
        result.success = True
        return result

    # convert Chinese Number to Integer
    def parse_chinese_written_number_to_value(self, source: str) -> int:
        """Return the first integer found in ``source``, or -1 if none."""
        num = -1
        er: ExtractResult = next(
            iter(self.integer_extractor.extract(source)), None)
        if er and er.type == NumberConstants.SYS_NUM_INTEGER:
            num = int(self.config.number_parser.parse(er).value)
        return num

    def convert_chinese_year_to_number(self, source: str) -> int:
        """Convert a Chinese year string to an integer year.

        Dynasty years are tried first. Otherwise the text is parsed as one
        integer; if that yields less than 10, it is re-read digit by digit.
        Returns -1 when the final value is below 10.
        """
        year = 0

        dynasty_year = parse_chinese_dynasty_year(
            source, self.config.dynasty_year_regex,
            self.config.dynasty_start_year, self.config.dynasty_year_map,
            self.integer_extractor, self.config.number_parser)
        if dynasty_year is not None:
            return dynasty_year

        er: ExtractResult = next(
            iter(self.config.integer_extractor.extract(source)), None)
        if er and er.type == NumberConstants.SYS_NUM_INTEGER:
            year = int(self.config.number_parser.parse(er).value)

        if year < 10:
            # Re-read the text one character at a time, each contributing
            # one decimal digit.
            year = 0
            for char in source:
                year = year * 10
                er = next(
                    iter(self.config.integer_extractor.extract(char)), None)
                if er and er.type == NumberConstants.SYS_NUM_INTEGER:
                    year = year + \
                        int(self.config.number_parser.parse(er).value)

        return -1 if year < 10 else year

    def get_month_of_year(self, source: str) -> int:
        """Look up the month for ``source``; values above 12 wrap modulo 12."""
        month = self.config.month_of_year[source]
        return month % 12 if month > 12 else month

    def get_day_of_month(self, source: str) -> int:
        """Look up the day for ``source``; values above 31 wrap modulo 31."""
        day = self.config.day_of_month[source]
        return day % 31 if day > 31 else day

    def get_month_max_day(self, year, month) -> int:
        """Return the number of days in ``month`` of ``year``."""
        max_day = self.month_max_days[month - 1]
        if not DateUtils.is_leap_year(year) and month == 2:
            max_day -= 1
        return max_day

    def is_valid_date(self, year, month, day):
        """Validate a date, wrapping a month one step outside the valid range.

        A month below the minimum becomes the last month of the previous
        year; a month above the maximum becomes the first month of the
        next year.
        """
        if month < Constants.MIN_MONTH:
            year -= 1
            month = Constants.MAX_MONTH
        if month > Constants.MAX_MONTH:
            year += 1
            month = Constants.MIN_MONTH
        return DateUtils.is_valid_date(year, month, day)

    @staticmethod
    def _shift_by_unit(reference: datetime, unit_str: str, amount: int):
        """Shift ``reference`` by ``amount`` units; None for an unsupported unit.

        Months and years use ``datedelta`` so that year boundaries and the
        requested count are handled; the previous
        ``reference.replace(month=reference.month ± 1)`` raised ValueError
        in January / December and always moved by exactly one.
        """
        if unit_str == Constants.TIMEX_DAY:
            return reference + timedelta(days=amount)
        if unit_str == Constants.TIMEX_WEEK:
            return reference + timedelta(days=7 * amount)
        if unit_str == Constants.TIMEX_MONTH_FULL:
            return reference + datedelta(months=amount)
        if unit_str == Constants.TIMEX_YEAR:
            return reference + datedelta(years=amount)
        return None

    # Handle cases like "三天前" (three days ago)
    def parser_duration_with_ago_and_later(
            self, source: str, reference: datetime) -> DateTimeResolutionResult:
        """Resolve "<number><unit> before/after" relative to ``reference``.

        Returns an unsuccessful result when no duration, unit or direction
        suffix is found, or when the unit is not day/week/month/year.
        """
        result = DateTimeResolutionResult()

        # Extract once (the original ran the extractor twice) and keep the
        # last hit, which is what ``.pop()`` returned.
        durations = self.duration_extractor.extract(source, reference)
        if not durations:
            return result
        duration_res = durations[-1]

        # NOTE(review): reads the configuration's private ``_unit_regex``
        # directly — confirm whether a public accessor exists.
        match = self.config._unit_regex.search(source)
        if not match:
            return result

        suffix = source[duration_res.start + duration_res.length:]
        src_unit = RegExpUtility.get_group(match, 'unit')

        # The number runs from the duration start up to (and including the
        # first character of) the unit. ``match.start()`` is the unit's
        # position; the original used ``match.lastindex``, which is a group
        # index and truncated longer numbers.
        number_str = source[duration_res.start:match.start() + 1]
        number = self.parse_chinese_written_number_to_value(number_str)

        if src_unit not in self.config.unit_map:
            return result
        unit_str = self.config.unit_map.get(src_unit)

        # "before" moves backwards in time, "after" forwards.
        directions = (
            (ChineseDateExtractor.before_regex, -1),
            (ChineseDateExtractor.after_regex, 1),
        )
        for direction_regex, sign in directions:
            direction_match = RegExpUtility.get_matches(
                direction_regex, suffix)
            if direction_match and suffix.startswith(direction_match[0]):
                date = self._shift_by_unit(reference, unit_str, sign * number)
                if date is None:
                    return result

                result.timex = DateTimeFormatUtil.luis_date_from_datetime(
                    date)
                result.future_value = result.past_value = date
                result.success = True
                return result

        return result
class ChineseDatePeriodParser(BaseDatePeriodParser): def __init__(self): super().__init__(ChineseDatePeriodParserConfiguration()) self.integer_extractor = ChineseIntegerExtractor() self.number_parser = CJKNumberParser( ChineseNumberParserConfiguration()) self.year_in_chinese_regex = RegExpUtility.get_safe_reg_exp( ChineseDateTime.DatePeriodYearInChineseRegex) self.number_combined_with_unit_regex = RegExpUtility.get_safe_reg_exp( ChineseDateTime.NumberCombinedWithUnit) self.unit_regex = RegExpUtility.get_safe_reg_exp( ChineseDateTime.UnitRegex) self.year_and_month_regex = RegExpUtility.get_safe_reg_exp( ChineseDateTime.YearAndMonth) self.pure_number_year_and_month_regex = RegExpUtility.get_safe_reg_exp( ChineseDateTime.PureNumYearAndMonth) self.year_to_year_regex = RegExpUtility.get_safe_reg_exp( ChineseDateTime.YearToYear) self.year_to_year_suffix_required = RegExpUtility.get_safe_reg_exp( ChineseDateTime.YearToYearSuffixRequired) self.chinese_year_regex = RegExpUtility.get_safe_reg_exp( ChineseDateTime.DatePeriodYearInChineseRegex) self.season_with_year_regex = RegExpUtility.get_safe_reg_exp( ChineseDateTime.SeasonWithYear) self.decade_regex = RegExpUtility.get_safe_reg_exp( ChineseDateTime.DecadeRegex) self.date_this_regex = RegExpUtility.get_safe_reg_exp( ChineseDateTime.DatePeriodThisRegex) self.date_last_regex = RegExpUtility.get_safe_reg_exp( ChineseDateTime.DatePeriodLastRegex) self.date_next_regex = RegExpUtility.get_safe_reg_exp( ChineseDateTime.DatePeriodNextRegex) def parse(self, source: ExtractResult, reference: datetime = None) -> Optional[DateTimeParseResult]: result_value = None if not reference: reference = datetime.now() if source.type == self.parser_type_name: source_text = source.text.strip().lower() inner_result = self._parse_simple_cases(source_text, reference) if not inner_result.success: inner_result = self._parse_one_word_period( source_text, reference) if not inner_result.success: inner_result = self._merge_two_times_points( source_text, 
reference) if not inner_result.success: inner_result = self._parse_number_with_unit( source_text, reference) if not inner_result.success: inner_result = self._parse_duration(source_text, reference) if not inner_result.success: inner_result = self._parse_year_and_month( source_text, reference) if not inner_result.success: inner_result = self._parse_year_to_year(source_text, reference) if not inner_result.success: inner_result = self._parse_year(source_text, reference) if not inner_result.success: inner_result = self._parse_week_of_month( source_text, reference) if not inner_result.success: inner_result = self._parse_season(source_text, reference) if not inner_result.success: inner_result = self._parse_quarter(source_text, reference) if not inner_result.success: inner_result = self._parse_decade(source_text, reference) if inner_result.success: if inner_result.future_value and inner_result.past_value: inner_result.future_resolution = { TimeTypeConstants.START_DATE: DateTimeFormatUtil.format_date( inner_result.future_value[0]), TimeTypeConstants.END_DATE: DateTimeFormatUtil.format_date( inner_result.future_value[1]) } inner_result.past_resolution = { TimeTypeConstants.START_DATE: DateTimeFormatUtil.format_date( inner_result.past_value[0]), TimeTypeConstants.END_DATE: DateTimeFormatUtil.format_date( inner_result.past_value[1]) } else: inner_result.future_resolution = {} inner_result.past_resolution = {} result_value = inner_result result = DateTimeParseResult(source) result.value = result_value result.timex_str = result_value.timex if result_value else '' result.resolution_str = '' return result def _parse_simple_cases(self, source: str, reference: datetime) -> DateTimeResolutionResult: result = DateTimeResolutionResult() year = reference.year month = reference.month no_year = False input_year = False match = regex.search(self.config.simple_cases_regex, source) if not match or match.start() != 0 or len( match.group()) != len(source): return result days = 
RegExpUtility.get_group_list(match, Constants.DAY_GROUP_NAME) begin_day = self.config.day_of_month[days[0]] end_day = self.config.day_of_month[days[1]] month_str = RegExpUtility.get_group(match, Constants.MONTH_GROUP_NAME) if month_str.strip() != '': month = self.config.month_of_year[month_str] else: month_str = RegExpUtility.get_group(match, Constants.REL_MONTH) month += self.config.get_swift_day_or_month(month_str) if month < 0: month = 0 year -= 1 elif month > 11: month = 11 year += 1 year_str = RegExpUtility.get_group(match, Constants.YEAR_GROUP_NAME) if year_str.strip() != '': year = int(year_str) input_year = True else: no_year = True begin_date_luis = DateTimeFormatUtil.luis_date( year if input_year or self.config.is_future(month_str) else -1, month, begin_day) end_date_luis = DateTimeFormatUtil.luis_date( year if input_year or self.config.is_future(month_str) else -1, month, end_day) future_past_begin_date = DateUtils.generate_dates( no_year, reference, year, month, begin_day) future_past_end_date = DateUtils.generate_dates( no_year, reference, year, month, end_day) result.timex = f'({begin_date_luis},{end_date_luis},P{end_day - begin_day}D)' result.future_value = [ future_past_begin_date[0], future_past_end_date[0] ] result.past_value = [ future_past_begin_date[1], future_past_end_date[1] ] result.success = True return result def _parse_number_with_unit( self, source: str, reference: datetime) -> DateTimeResolutionResult: result = DateTimeResolutionResult() # if there are NO spaces between number and unit match = regex.search(self.number_combined_with_unit_regex, source) if not match: return result source_unit = RegExpUtility.get_group(match, Constants.UNIT).strip().lower() if source_unit not in self.config.unit_map: return result num_str = RegExpUtility.get_group(match, Constants.NUM) before_str = source[:match.start()].strip().lower() return self.__parse_common_duration_with_unit(before_str, source_unit, num_str, reference) def _parse_duration(self, 
source: str, reference: datetime) -> DateTimeResolutionResult: result = DateTimeResolutionResult() # for case "前两年" "后三年" duration_result = next( iter(self.config.duration_extractor.extract(source, reference)), None) if not duration_result: return result match = regex.search(self.unit_regex, duration_result.text) if not match: return result source_unit = RegExpUtility.get_group(match, Constants.UNIT).strip().lower() if source_unit not in self.config.unit_map: return result before_str = source[:duration_result.start].strip().lower() number_str = duration_result.text[:match.start()].strip().lower() number_val = self.__convert_chinese_to_number(number_str) num_str = str(number_val) return self.__parse_common_duration_with_unit(before_str, source_unit, num_str, reference) def __parse_common_duration_with_unit( self, before: str, unit: str, num: str, reference: datetime) -> DateTimeResolutionResult: result = DateTimeResolutionResult() unit_str = self.config.unit_map[unit] past_match = regex.search(self.config.past_regex, before) has_past = past_match and len(past_match.group()) == len(before) future_match = regex.search(self.config.future_regex, before) has_future = future_match and len(future_match.group()) == len(before) if not has_future and not has_past: return result begin_date = reference end_date = reference difference = float(num) if unit_str == Constants.UNIT_D: if has_past: begin_date += timedelta(days=-difference) if has_future: end_date += timedelta(days=difference) elif unit_str == Constants.UNIT_W: if has_past: begin_date += timedelta(days=-7 * difference) if has_future: end_date += timedelta(days=7 * difference) elif unit_str == Constants.UNIT_MON: if has_past: begin_date += datedelta(months=int(-difference)) if has_future: end_date += datedelta(months=int(difference)) elif unit_str == Constants.UNIT_Y: if has_past: begin_date += datedelta(years=int(-difference)) if has_future: end_date += datedelta(years=int(difference)) else: return result if 
has_future: begin_date += timedelta(days=1) end_date += timedelta(days=1) begin_timex = DateTimeFormatUtil.luis_date_from_datetime(begin_date) end_timex = DateTimeFormatUtil.luis_date_from_datetime(end_date) result.timex = f'({begin_timex},{end_timex},P{num}{unit_str[0]})' result.future_value = [begin_date, end_date] result.past_value = [begin_date, end_date] result.success = True return result def __convert_chinese_to_number(self, source: str) -> int: num = -1 er = next(iter(self.integer_extractor.extract(source)), None) if er and er.type == NumberConstants.SYS_NUM_INTEGER: num = int(self.number_parser.parse(er).value) return num def _parse_year_and_month(self, source: str, reference: datetime) -> DateTimeResolutionResult: result = DateTimeResolutionResult() match = regex.search(self.year_and_month_regex, source) if not match or len(match.group()) != len(source): match = regex.search(self.pure_number_year_and_month_regex, source) if not match or len(match.group()) != len(source): return result year = reference.year year_num = RegExpUtility.get_group(match, Constants.YEAR_GROUP_NAME) year_chinese = RegExpUtility.get_group(match, Constants.YEAR_CHINESE) year_relative = RegExpUtility.get_group(match, Constants.YEAR_RELATIVE) if year_num.strip() != '': if self.config.is_year_only(year_num): year_num = year_num[:-1] year = self._convert_year(year_num, False) elif year_chinese.strip() != '': if self.config.is_year_only(year_chinese): year_chinese = year_chinese[:-1] year = self._convert_year(year_chinese, True) elif year_relative.strip() != '': year += self.config.get_swift_day_or_month(year_relative) if 100 > year >= 90: year += 1900 elif year < 100 and year < 20: year += 2000 month_str = RegExpUtility.get_group(match, Constants.MONTH_GROUP_NAME) month = self.config.month_of_year.get(month_str, 0) % 12 if month == 0: month = 12 begin_date = DateUtils.safe_create_from_min_value(year, month, 1) end_date = DateUtils.safe_create_from_min_value( year, month, 1) + 
datedelta(months=1) result.future_value = [begin_date, end_date] result.past_value = [begin_date, end_date] result.timex = f'{year:04d}-{month:02d}' result.success = True return result def _parse_year_to_year(self, source: str, reference: datetime) -> DateTimeResolutionResult: result = DateTimeResolutionResult() match = regex.search(self.year_to_year_regex, source) if not match: match = regex.search(self.year_to_year_suffix_required, source) if not match: return result year_matches = list(regex.finditer(self.config.year_regex, source)) chinese_year_matches = list( regex.finditer(self.chinese_year_regex, source)) begin_year = 0 end_year = 0 if len(year_matches) == 2: begin_year = self.__convert_chinese_to_number( RegExpUtility.get_group(year_matches[0], Constants.YEAR_GROUP_NAME)) end_year = self.__convert_chinese_to_number( RegExpUtility.get_group(year_matches[1], Constants.YEAR_GROUP_NAME)) elif len(chinese_year_matches) == 2: begin_year = self._convert_year( RegExpUtility.get_group(chinese_year_matches[0], Constants.YEAR_CHINESE), True) end_year = self._convert_year( RegExpUtility.get_group(chinese_year_matches[1], Constants.YEAR_CHINESE), True) elif len(year_matches) == 1 and len(chinese_year_matches) == 1: if year_matches[0].start() < chinese_year_matches[0].start(): begin_year = self.__convert_chinese_to_number( RegExpUtility.get_group(year_matches[0], Constants.YEAR_GROUP_NAME)) end_year = self.__convert_chinese_to_number( RegExpUtility.get_group(chinese_year_matches[0], Constants.YEAR_CHINESE)) else: begin_year = self.__convert_chinese_to_number( RegExpUtility.get_group(chinese_year_matches[0], Constants.YEAR_CHINESE)) end_year = self.__convert_chinese_to_number( RegExpUtility.get_group(year_matches[0], Constants.YEAR_GROUP_NAME)) begin_year = self.__sanitize_year(begin_year) end_year = self.__sanitize_year(end_year) begin_date = DateUtils.safe_create_from_min_value(begin_year, 1, 1) end_date = DateUtils.safe_create_from_min_value(end_year, 1, 1) 
result.future_value = [begin_date, end_date] result.past_value = [begin_date, end_date] begin_timex = DateTimeFormatUtil.luis_date_from_datetime(begin_date) end_timex = DateTimeFormatUtil.luis_date_from_datetime(end_date) result.timex = f'({begin_timex},{end_timex},P{end_year - begin_year}Y)' result.success = True return result @staticmethod def __sanitize_year(year: int) -> int: result = year if 100 > year >= 90: result += 1900 elif year < 100 and year < 20: result += 2000 return result def _parse_year(self, source: str, reference: datetime) -> DateTimeResolutionResult: source = source.strip().lower() result = DateTimeResolutionResult() is_chinese = False match = regex.search(self.config.year_regex, source) if not match or len(match.group()) != len(source): match = regex.search(self.year_in_chinese_regex, source) is_chinese = match and len(match.group()) == len(source) if not match or len(match.group()) != len(source): return result year_str = match.group() if self.config.is_year_only(year_str): year_str = year_str[:-1].strip() year = self._convert_year(year_str, is_chinese) if len(year_str) == 2: if 100 > year >= 30: year += 1900 elif year < 30: year += 2000 begin_day = DateUtils.safe_create_from_min_value(year, 1, 1) end_day = DateUtils.safe_create_from_min_value(year + 1, 1, 1) result.timex = f'{year:04d}' result.future_value = [begin_day, end_day] result.past_value = [begin_day, end_day] result.success = True return result def _convert_year(self, year_str: str, is_chinese: bool) -> int: year = -1 if is_chinese: dynasty_year = parse_chinese_dynasty_year( year_str, self.config.dynasty_year_regex, self.config.dynasty_start_year, self.config.dynasty_year_map, self.integer_extractor, self.number_parser) if dynasty_year is not None: return dynasty_year year_num = 0 er = next(iter(self.integer_extractor.extract(year_str)), None) if er and er.type == NumberConstants.SYS_NUM_INTEGER: year_num = int(self.number_parser.parse(er).value) if year_num < 10: year_num = 0 for 
char in year_str: year_num *= 10 er = next(iter(self.integer_extractor.extract(char)), None) if er and er.type == NumberConstants.SYS_NUM_INTEGER: year_num += int(self.number_parser.parse(er).value) year = year_num else: year = year_num else: year = int(year_str) return -1 if year == 0 else year def _get_week_of_month(self, cardinal, month, year, reference, no_year) -> DateTimeResolutionResult: result = DateTimeResolutionResult() seed_date = self._compute_date(cardinal, DayOfWeek.MONDAY, month, year) future_date = seed_date past_date = seed_date if no_year and future_date < reference: future_date = self._compute_date(cardinal, DayOfWeek.MONDAY, month, year + 1) if not future_date.month == month: future_date = future_date + timedelta(days=-7) if no_year and past_date >= reference: past_date = self._compute_date(cardinal, DayOfWeek.MONDAY, month, year - 1) if not past_date.month == month: past_date = past_date + timedelta(days=-7) result.timex = ('XXXX' if no_year else f'{year:04d}') + f'-{month:02d}-W{cardinal:02d}' days_to_add = 6 if self._inclusive_end_period else 7 result.future_value = [ future_date, future_date + timedelta(days=days_to_add) ] result.past_value = [ past_date, past_date + timedelta(days=days_to_add) ] result.success = True return result def _compute_date(self, cardinal: int, weekday: DayOfWeek, month: int, year: int): first_day = datetime(year, month, 1) first_week_day = DateUtils.this(first_day, weekday) if weekday == 0: weekday = 7 first_day_of_week = first_day.isoweekday() if first_day_of_week == 7: first_day_of_week = 0 if weekday < first_day_of_week: first_week_day = DateUtils.next(first_day, weekday) first_week_day = first_week_day + timedelta(days=7 * (cardinal - 1)) return first_week_day def _parse_season(self, source: str, reference: datetime) -> DateTimeResolutionResult: result = DateTimeResolutionResult() match = regex.search(self.season_with_year_regex, source) if not match or len(match.group()) != len(source): return result year = 
reference.year year_num = RegExpUtility.get_group(match, Constants.YEAR_GROUP_NAME) year_chinese = RegExpUtility.get_group(match, Constants.YEAR_CHINESE) year_relative = RegExpUtility.get_group(match, Constants.YEAR_RELATIVE) has_year = False if year_num.strip() != '': has_year = True if self.config.is_year_only(year_num): year_num = year_num[:-1] year = self._convert_year(year_num, False) elif year_chinese.strip() != '': has_year = True if self.config.is_year_only(year_chinese): year_chinese = year_chinese[:-1] year = self._convert_year(year_chinese, True) elif year_relative.strip() != '': has_year = True year += self.config.get_swift_day_or_month(year_relative) if 100 > year >= 90: year += 1900 elif year < 100 and year < 20: year += 2000 season_str = RegExpUtility.get_group(match, Constants.SEASON) season = self.config.season_map.get(season_str, None) if has_year: result.timex = f'{year:02d}-{season}' result.success = True return result def _parse_quarter(self, source: str, reference: datetime) -> DateTimeResolutionResult: result = DateTimeResolutionResult() match = regex.search(self.config.quarter_regex, source) if not match or len(match.group()) != len(source): return result year = reference.year year_num = RegExpUtility.get_group(match, Constants.YEAR_GROUP_NAME) year_chinese = RegExpUtility.get_group(match, Constants.YEAR_CHINESE) year_relative = RegExpUtility.get_group(match, Constants.YEAR_RELATIVE) has_year = False if year_num.strip() != '': has_year = True if self.config.is_year_only(year_num): year_num = year_num[:-1] year = self._convert_year(year_num, False) elif year_chinese.strip() != '': has_year = True if self.config.is_year_only(year_chinese): year_chinese = year_chinese[:-1] year = self._convert_year(year_chinese, True) elif year_relative.strip() != '': has_year = True year += self.config.get_swift_day_or_month(year_relative) if 100 > year >= 90: year += 1900 elif year < 100 and year < 20: year += 2000 cardinal_str = 
RegExpUtility.get_group(match, Constants.CARDINAL) quarter_num = self.config.cardinal_map.get(cardinal_str, None) begin_date = DateUtils.safe_create_from_min_value( year, quarter_num * 3 - 2, 1) end_date = DateUtils.safe_create_from_min_value( year, quarter_num * 3 + 1, 1) result.future_value = [begin_date, end_date] result.past_value = [begin_date, end_date] begin_luis = DateTimeFormatUtil.luis_date_from_datetime(begin_date) end_luis = DateTimeFormatUtil.luis_date_from_datetime(end_date) result.timex = f'({begin_luis},{end_luis},P3M)' result.success = True return result def _parse_decade(self, source: str, reference: datetime) -> DateTimeResolutionResult: result = DateTimeResolutionResult() century = int(reference.year / 100) + 1 decade_last_year = 10 input_century = False match = regex.search(self.decade_regex, source) if not match or len(match.group()) != len(source): return result decade_str = RegExpUtility.get_group(match, Constants.DECADE) decade = self.__convert_chinese_to_number(decade_str) century_str = RegExpUtility.get_group(match, Constants.CENTURY) if century_str != "": century = self.__convert_chinese_to_number(century_str) input_century = True else: century_str = RegExpUtility.get_group(match, Constants.REL_CENTURY) if century_str != "": century_str = century_str.strip().lower() this_match = regex.search(self.date_this_regex, century_str) next_match = regex.search(self.date_next_regex, century_str) last_match = regex.search(self.date_last_regex, century_str) if next_match: century += 1 elif last_match: century -= 1 input_century = True begin_year = ((century - 1) * 100) + decade end_year = begin_year + decade_last_year if input_century: begin_luis_str = DateTimeFormatUtil.luis_date(begin_year, 1, 1) end_luis_str = DateTimeFormatUtil.luis_date(end_year, 1, 1) else: begin_year_str = "XX{:02d}".format(decade) begin_luis_str = DateTimeFormatUtil.luis_date(-1, 1, 1) begin_luis_str = begin_luis_str.replace("XXXX", begin_year_str) end_year_str = 
"XX{:02d}".format(end_year % 100) end_luis_str = DateTimeFormatUtil.luis_date(-1, 1, 1) end_luis_str = end_luis_str.replace("XXXX", end_year_str) result.timex = f"({begin_luis_str},{end_luis_str},P10Y)" future_year, past_year = begin_year, begin_year start_date = DateUtils.safe_create_from_min_value(begin_year, 1, 1) if not input_century and start_date < reference: future_year += 100 if not input_century and start_date >= reference: past_year -= 100 result.future_value = [ DateUtils.safe_create_from_min_value(future_year, 1, 1), DateUtils.safe_create_from_min_value( future_year + decade_last_year, 1, 1) ] result.past_value = [ DateUtils.safe_create_from_min_value(past_year, 1, 1), DateUtils.safe_create_from_min_value(past_year + decade_last_year, 1, 1) ] result.success = True return result
class ChineseHolidayParser(BaseHolidayParser):
    """Holiday parser for Chinese text.

    Holidays with a fixed month/day are resolved through a local table of
    static factory methods; everything else is delegated to the dictionaries
    exposed by the parser configuration.
    """

    def __init__(self):
        config = ChineseHolidayParserConfiguration()
        BaseHolidayParser.__init__(self, config)
        self.__lunar_holiday_regex = RegExpUtility.get_safe_reg_exp(
            ChineseDateTime.LunarHolidayRegex)
        self.__integer_extractor = ChineseIntegerExtractor()
        self.__number_parser = AgnosticNumberParserFactory.get_parser(
            AgnosticNumberParserType.INTEGER, ChineseNumberParserConfiguration())
        # Holiday name -> callable(year) returning the holiday's date.
        self.__fixed_holiday_dictionary = {
            '元旦': ChineseHolidayParser.new_year,
            '元旦节': ChineseHolidayParser.new_year,
            '教师节': ChineseHolidayParser.teacher_day,
            '青年节': ChineseHolidayParser.youth_day,
            '儿童节': ChineseHolidayParser.children_day,
            '妇女节': ChineseHolidayParser.female_day,
            '植树节': ChineseHolidayParser.tree_plant_day,
            '情人节': ChineseHolidayParser.lover_day,
            '平安夜': ChineseHolidayParser.christmas_eve,
            '圣诞节': ChineseHolidayParser.christmas_day,
            '新年': ChineseHolidayParser.new_year,
            '愚人节': ChineseHolidayParser.fool_day,
            '五一': ChineseHolidayParser.labor_day,
            '劳动节': ChineseHolidayParser.labor_day,
            '万圣节': ChineseHolidayParser.halloween_day,
            '中秋节': ChineseHolidayParser.midautumn_day,
            '中秋': ChineseHolidayParser.midautumn_day,
            '春节': ChineseHolidayParser.spring_day,
            '除夕': ChineseHolidayParser.new_year_eve,
            '元宵节': ChineseHolidayParser.lantern_day,
            '清明节': ChineseHolidayParser.qing_ming_day,
            '清明': ChineseHolidayParser.qing_ming_day,
            '端午节': ChineseHolidayParser.dragon_boat_day,
            '端午': ChineseHolidayParser.dragon_boat_day,
            '国庆节': ChineseHolidayParser.chs_national_day,
            '建军节': ChineseHolidayParser.chs_mil_build_day,
            '女生节': ChineseHolidayParser.girls_day,
            '光棍节': ChineseHolidayParser.singles_day,
            '双十一': ChineseHolidayParser.singles_day,
            '重阳节': ChineseHolidayParser.chong_yang_day,
        }

    @staticmethod
    def new_year(year: int) -> datetime:
        """January 1st of ``year``."""
        return datetime(year, 1, 1)

    @staticmethod
    def chs_national_day(year: int) -> datetime:
        """October 1st of ``year``."""
        return datetime(year, 10, 1)

    @staticmethod
    def labor_day(year: int) -> datetime:
        """May 1st of ``year``."""
        return datetime(year, 5, 1)

    @staticmethod
    def christmas_eve(year: int) -> datetime:
        """December 24th of ``year``."""
        return datetime(year, 12, 24)

    @staticmethod
    def christmas_day(year: int) -> datetime:
        """December 25th of ``year``."""
        return datetime(year, 12, 25)

    @staticmethod
    def lover_day(year: int) -> datetime:
        """February 14th of ``year``."""
        return datetime(year, 2, 14)

    @staticmethod
    def chs_mil_build_day(year: int) -> datetime:
        """August 1st of ``year``."""
        return datetime(year, 8, 1)

    @staticmethod
    def fool_day(year: int) -> datetime:
        """April 1st of ``year``."""
        return datetime(year, 4, 1)

    @staticmethod
    def girls_day(year: int) -> datetime:
        """March 7th of ``year``."""
        return datetime(year, 3, 7)

    @staticmethod
    def tree_plant_day(year: int) -> datetime:
        """March 12th of ``year``."""
        return datetime(year, 3, 12)

    @staticmethod
    def female_day(year: int) -> datetime:
        """March 8th of ``year``."""
        return datetime(year, 3, 8)

    @staticmethod
    def children_day(year: int) -> datetime:
        """June 1st of ``year``."""
        return datetime(year, 6, 1)

    @staticmethod
    def youth_day(year: int) -> datetime:
        """May 4th of ``year``."""
        return datetime(year, 5, 4)

    @staticmethod
    def teacher_day(year: int) -> datetime:
        """September 10th of ``year``."""
        return datetime(year, 9, 10)

    @staticmethod
    def singles_day(year: int) -> datetime:
        """November 11th of ``year``."""
        return datetime(year, 11, 11)

    @staticmethod
    def halloween_day(year: int) -> datetime:
        """October 31st of ``year``."""
        return datetime(year, 10, 31)

    # NOTE(review): the factories below return plain month/day values for
    # holidays that `parse` may flag as lunar via LunarHolidayRegex; the
    # dates are presumably meant as lunar-calendar month/day -- confirm.
    @staticmethod
    def midautumn_day(year: int) -> datetime:
        """Month 8, day 15 of ``year``."""
        return datetime(year, 8, 15)

    @staticmethod
    def spring_day(year: int) -> datetime:
        """Month 1, day 1 of ``year``."""
        return datetime(year, 1, 1)

    @staticmethod
    def new_year_eve(year: int) -> datetime:
        """The day before month 1, day 1 of ``year``."""
        return datetime(year, 1, 1) + timedelta(days=-1)

    @staticmethod
    def lantern_day(year: int) -> datetime:
        """Month 1, day 15 of ``year``."""
        return datetime(year, 1, 15)

    @staticmethod
    def qing_ming_day(year: int) -> datetime:
        """Month 4, day 4 of ``year``."""
        return datetime(year, 4, 4)

    @staticmethod
    def dragon_boat_day(year: int) -> datetime:
        """Month 5, day 5 of ``year``."""
        return datetime(year, 5, 5)

    @staticmethod
    def chong_yang_day(year: int) -> datetime:
        """Month 9, day 9 of ``year``."""
        return datetime(year, 9, 9)

    def parse(self, source: ExtractResult, reference: datetime = None) -> Optional[DateTimeParseResult]:
        """Parse an extracted holiday into a ``DateTimeParseResult``.

        :param source: extraction result; only handled when its ``type``
            equals ``self.parser_type_name``.
        :param reference: reference date, defaults to ``datetime.now()``.
        :return: parse result whose ``value`` is ``None`` (and ``timex_str``
            empty) when nothing could be resolved.
        """
        if not reference:
            reference = datetime.now()

        value = None
        if source.type == self.parser_type_name:
            inner_result = self._parse_holiday_regex_match(
                source.text, reference)
            if inner_result.success:
                inner_result.future_resolution = {
                    TimeTypeConstants.DATE:
                        DateTimeFormatUtil.format_date(inner_result.future_value)
                }
                inner_result.past_resolution = {
                    TimeTypeConstants.DATE:
                        DateTimeFormatUtil.format_date(inner_result.past_value)
                }
                inner_result.is_lunar = self.__is_lunar(source.text)
                value = inner_result

        result = DateTimeParseResult(source)
        result.value = value
        result.timex_str = value.timex if value else ''
        result.resolution_str = ''
        return result

    def __is_lunar(self, source: str) -> bool:
        """Return True when the lunar-holiday regex matches anywhere in ``source``."""
        return self.__lunar_holiday_regex.search(source) is not None

    def _match2date(self, match: Match, reference: datetime) -> DateTimeResolutionResult:
        """Turn a holiday regex match into a resolved date.

        Reads the ``holiday``, ``year``, ``yearchs`` and ``yearrel`` groups.
        With an explicit year the future and past values are the same date;
        without one they are the next/previous occurrence relative to
        ``reference``.
        """
        result = DateTimeResolutionResult()
        holiday_str = self.config.sanitize_holiday_token(
            match.group('holiday').lower())

        if not holiday_str:
            return result

        year = reference.year
        year_num = match.group('year')
        year_chinese = match.group('yearchs')
        year_relative = match.group('yearrel')
        has_year = False

        if year_num:
            has_year = True
            if self.config.get_swift_year(year_num) == 0:
                # Drop the trailing year marker character.
                year_num = year_num[0:len(year_num) - 1]
            year = self.__convert_year(year_num, False)
        elif year_chinese:
            has_year = True
            if self.config.get_swift_year(year_chinese) == 0:
                year_chinese = year_chinese[0:len(year_chinese) - 1]
            year = self.__convert_year(year_chinese, True)
        elif year_relative:
            has_year = True
            # The offset comes from the matched relative-year text, not from
            # the numeric reference year.
            year += self.config.get_swift_year(year_relative)

        # Expand two-digit years.
        if year < 100 and year >= 90:
            year += 1900
        elif year < 100 and year < 20:
            year += 2000

        timex = ''
        date = reference
        if holiday_str in self.__fixed_holiday_dictionary:
            date = self.__fixed_holiday_dictionary[holiday_str](year)
            timex = f'-{DateTimeFormatUtil.to_str(date.month, 2)}-{DateTimeFormatUtil.to_str(date.day, 2)}'
        elif holiday_str in self.config.holiday_func_dictionary:
            date = self.config.holiday_func_dictionary[holiday_str](year)
            timex = self.config.variable_holidays_timex_dictionary[holiday_str]
        else:
            return result

        if has_year:
            result.timex = DateTimeFormatUtil.to_str(year, 4) + timex
            result.future_value = datetime(year, date.month, date.day)
            result.past_value = datetime(year, date.month, date.day)
        else:
            result.timex = 'XXXX' + timex
            result.future_value = self.__get_date_value(
                date, reference, holiday_str, 1, lambda d, r: d < r)
            result.past_value = self.__get_date_value(
                date, reference, holiday_str, -1, lambda d, r: d >= r)

        result.success = True
        return result

    def __convert_year(self, year_str: str, is_chinese: bool) -> int:
        """Convert a year string into an int; returns -1 when the value is 0.

        Chinese text is parsed as a whole number first; if that yields less
        than 10 the text is re-read one character at a time as a sequence of
        digits.
        """
        year = -1
        if is_chinese:
            year_num = 0
            ers = self.__integer_extractor.extract(year_str)
            if ers and ers[-1].type == NumberConstants.SYS_NUM_INTEGER:
                year_num = int(self.__number_parser.parse(ers[-1]).value)

            if year_num < 10:
                year_num = 0
                for char in year_str:
                    year_num *= 10
                    ers = self.__integer_extractor.extract(char)
                    if ers and ers[-1].type == NumberConstants.SYS_NUM_INTEGER:
                        year_num += int(
                            self.__number_parser.parse(ers[-1]).value)

            # Use the parsed value on both paths; previously the
            # digit-by-digit result was computed and then discarded.
            year = year_num
        else:
            year = int(year_str)

        return -1 if year == 0 else year

    def __get_date_value(self, date: datetime, reference: datetime, holiday: str, swift: int, comparer) -> datetime:
        """Shift ``date`` by ``swift`` years when ``comparer(date, reference)`` holds.

        Fixed holidays are shifted arithmetically; configured holidays are
        recomputed for ``reference.year + swift``. Otherwise ``date`` is
        returned unchanged.
        """
        result = date
        if comparer(date, reference):
            if holiday in self.__fixed_holiday_dictionary:
                return date + datedelta(years=swift)
            if holiday in self.config.holiday_func_dictionary:
                result = self.config.holiday_func_dictionary[holiday](
                    reference.year + swift)
        return result
def __init__(self):
    """Create the helper objects held by this instance."""
    # Time parser instance kept on the private `_time_parser` attribute.
    self._time_parser = ChineseTimeParser()
    # Integer extractor instance kept on the private `_integer_extractor` attribute.
    self._integer_extractor = ChineseIntegerExtractor()
def __init__(self):
    """Compile the date regexes and load the lookup tables from ChineseDateTime.

    The order of ``_date_regex`` is significant: the first four patterns are
    fixed, the remaining four depend on ``DefaultLanguageFallback``.
    """
    compile_pattern = RegExpUtility.get_safe_reg_exp
    fallback = ChineseDateTime.DefaultLanguageFallback

    leading_patterns = [
        ChineseDateTime.DateRegexList1,
        ChineseDateTime.DateRegexList2,
        ChineseDateTime.DateRegexList3,
        # 2015-12-23 - This regex represents the standard format in Chinese
        # dates (YMD) and has precedence over other orderings
        ChineseDateTime.DateRegexList8,
    ]

    # Regex precedence where the order between D and M varies is controlled
    # by DefaultLanguageFallback
    if fallback == Constants.DEFAULT_LANGUAGE_FALLBACK_DMY:
        day_month_patterns = [
            ChineseDateTime.DateRegexList5,
            ChineseDateTime.DateRegexList4,
        ]
    else:
        day_month_patterns = [
            ChineseDateTime.DateRegexList4,
            ChineseDateTime.DateRegexList5,
        ]

    if fallback in [
            Constants.DEFAULT_LANGUAGE_FALLBACK_DMY,
            Constants.DEFAULT_LANGUAGE_FALLBACK_YMD]:
        trailing_patterns = [
            ChineseDateTime.DateRegexList7,
            ChineseDateTime.DateRegexList6,
        ]
    else:
        trailing_patterns = [
            ChineseDateTime.DateRegexList6,
            ChineseDateTime.DateRegexList7,
        ]

    self._date_regex = [
        compile_pattern(pattern)
        for pattern in leading_patterns + day_month_patterns + trailing_patterns
    ]

    # Lookup tables.
    self._month_of_year = ChineseDateTime.ParserConfigurationMonthOfYear
    self._day_of_month = ChineseDateTime.ParserConfigurationDayOfMonth
    self._day_of_week = ChineseDateTime.ParserConfigurationDayOfWeek

    # Single-purpose regexes.
    self._special_day_regex = compile_pattern(ChineseDateTime.SpecialDayRegex)
    self._special_day_with_num_regex = compile_pattern(
        ChineseDateTime.SpecialDayWithNumRegex)
    self._this_regex = compile_pattern(ChineseDateTime.DateThisRegex)
    self._next_regex = compile_pattern(ChineseDateTime.DateNextRegex)
    self._last_regex = compile_pattern(ChineseDateTime.DateLastRegex)
    self._unit_regex = compile_pattern(ChineseDateTime.DateUnitRegex)
    self._unit_map = ChineseDateTime.ParserConfigurationUnitMap
    self._week_day_of_month_regex = compile_pattern(
        ChineseDateTime.WeekDayOfMonthRegex)
    self._week_day_regex = compile_pattern(ChineseDateTime.WeekDayRegex)
    self._dynasty_year_regex = compile_pattern(ChineseDateTime.DynastyYearRegex)
    self._dynasty_year_map = ChineseDateTime.DynastyYearMap

    # Number handling helpers.
    self._integer_extractor = ChineseIntegerExtractor()
    self._number_parser = CJKNumberParser(ChineseNumberParserConfiguration())

    self._date_extractor = None
    self._dynasty_start_year = ChineseDateTime.DynastyStartYear
class ChineseDatePeriodParser(BaseDatePeriodParser):
    """Date-period parser for Chinese text.

    ``parse`` tries a fixed sequence of strategies (simple day ranges,
    one-word periods, merged time points, number-with-unit, durations,
    year-and-month, year-to-year, year, week of month, season, quarter) and
    keeps the first one that succeeds.
    """

    def __init__(self):
        super().__init__(ChineseDatePeriodParserConfiguration())
        self.integer_extractor = ChineseIntegerExtractor()
        self.number_parser = CJKNumberParser(
            ChineseNumberParserConfiguration())
        self.year_in_chinese_regex = RegExpUtility.get_safe_reg_exp(
            ChineseDateTime.DatePeriodYearInChineseRegex)
        self.number_combined_with_unit_regex = RegExpUtility.get_safe_reg_exp(
            ChineseDateTime.NumberCombinedWithUnit)
        self.unit_regex = RegExpUtility.get_safe_reg_exp(
            ChineseDateTime.UnitRegex)
        self.year_and_month_regex = RegExpUtility.get_safe_reg_exp(
            ChineseDateTime.YearAndMonth)
        self.pure_number_year_and_month_regex = RegExpUtility.get_safe_reg_exp(
            ChineseDateTime.PureNumYearAndMonth)
        self.year_to_year_regex = RegExpUtility.get_safe_reg_exp(
            ChineseDateTime.YearToYear)
        # Same pattern as year_in_chinese_regex; both attribute names are kept.
        self.chinese_year_regex = RegExpUtility.get_safe_reg_exp(
            ChineseDateTime.DatePeriodYearInChineseRegex)
        self.season_with_year_regex = RegExpUtility.get_safe_reg_exp(
            ChineseDateTime.SeasonWithYear)

    def parse(self, source: ExtractResult, reference: datetime = None) -> Optional[DateTimeParseResult]:
        """Parse an extracted date period into a ``DateTimeParseResult``.

        :param source: extraction result; only handled when its ``type``
            equals ``self.parser_type_name``.
        :param reference: reference date, defaults to ``datetime.now()``.
        :return: parse result whose ``value`` is ``None`` (and ``timex_str``
            empty) when the type does not match or no strategy succeeds.
        """
        if not reference:
            reference = datetime.now()

        # Must exist even when the type does not match or nothing parses;
        # it is read unconditionally below.
        result_value = None

        if source.type == self.parser_type_name:
            source_text = source.text.strip().lower()

            inner_result = self._parse_simple_cases(source_text, reference)
            if not inner_result.success:
                inner_result = self._parse_one_word_period(
                    source_text, reference)
            if not inner_result.success:
                inner_result = self._merge_two_times_points(
                    source_text, reference)
            if not inner_result.success:
                inner_result = self._parse_number_with_unit(
                    source_text, reference)
            if not inner_result.success:
                inner_result = self._parse_duration(source_text, reference)
            if not inner_result.success:
                inner_result = self._parse_year_and_month(
                    source_text, reference)
            if not inner_result.success:
                inner_result = self._parse_year_to_year(source_text, reference)
            if not inner_result.success:
                inner_result = self._parse_year(source_text, reference)
            if not inner_result.success:
                inner_result = self._parse_week_of_month(
                    source_text, reference)
            if not inner_result.success:
                inner_result = self._parse_season(source_text, reference)
            if not inner_result.success:
                inner_result = self._parse_quarter(source_text, reference)

            if inner_result.success:
                if inner_result.future_value and inner_result.past_value:
                    inner_result.future_resolution = {
                        TimeTypeConstants.START_DATE:
                            FormatUtil.format_date(inner_result.future_value[0]),
                        TimeTypeConstants.END_DATE:
                            FormatUtil.format_date(inner_result.future_value[1])
                    }
                    inner_result.past_resolution = {
                        TimeTypeConstants.START_DATE:
                            FormatUtil.format_date(inner_result.past_value[0]),
                        TimeTypeConstants.END_DATE:
                            FormatUtil.format_date(inner_result.past_value[1])
                    }
                else:
                    inner_result.future_resolution = {}
                    inner_result.past_resolution = {}
                result_value = inner_result

        result = DateTimeParseResult(source)
        result.value = result_value
        result.timex_str = result_value.timex if result_value else ''
        result.resolution_str = ''
        return result

    def _parse_simple_cases(self, source: str, reference: datetime) -> DateTimeResolutionResult:
        """Resolve a day-to-day range inside one month.

        The month is either named explicitly or given relative to the
        reference month; the year is optional. Without a year the future and
        past values are moved to the next/previous year around ``reference``.
        """
        result = DateTimeResolutionResult()
        year = reference.year
        month = reference.month
        no_year = False
        input_year = False

        match = regex.search(self.config.simple_cases_regex, source)
        if not match or match.start() != 0 or len(
                match.group()) != len(source):
            return result

        days = RegExpUtility.get_group_list(match, 'day')
        begin_day = self.config.day_of_month[days[0]]
        end_day = self.config.day_of_month[days[1]]

        month_str = RegExpUtility.get_group(match, 'month')
        if month_str.strip() != '':
            month = self.config.month_of_year[month_str]
        else:
            month_str = RegExpUtility.get_group(match, 'relmonth')
            month += self.config.get_swift_day_or_month(month_str)
            # datetime months are 1..12: wrap across the year boundary.
            if month < 1:
                month = 12
                year -= 1
            elif month > 12:
                month = 1
                year += 1

        year_str = RegExpUtility.get_group(match, 'year')
        if year_str.strip() != '':
            year = int(year_str)
            input_year = True
        else:
            no_year = True

        # The year is only written into the timex when it was given or the
        # month expression is a future one; otherwise -1 is passed instead.
        luis_year = year if input_year or self.config.is_future(month_str) else -1
        begin_date_luis = FormatUtil.luis_date(luis_year, month, begin_day)
        end_date_luis = FormatUtil.luis_date(luis_year, month, end_day)

        future_year = year
        past_year = year
        start_date = DateUtils.safe_create_from_min_value(
            year, month, begin_day)
        if no_year and start_date < reference:
            future_year += 1
        if no_year and start_date >= reference:
            past_year -= 1

        result.timex = f'({begin_date_luis},{end_date_luis},P{end_day - begin_day}D)'
        result.future_value = [
            DateUtils.safe_create_from_min_value(future_year, month, begin_day),
            DateUtils.safe_create_from_min_value(future_year, month, end_day)
        ]
        result.past_value = [
            DateUtils.safe_create_from_min_value(past_year, month, begin_day),
            DateUtils.safe_create_from_min_value(past_year, month, end_day)
        ]
        result.success = True
        return result

    def _parse_number_with_unit(
            self, source: str, reference: datetime) -> DateTimeResolutionResult:
        """Resolve "<prefix><number><unit>" where number and unit are adjacent."""
        result = DateTimeResolutionResult()

        # if there are NO spaces between number and unit
        match = regex.search(self.number_combined_with_unit_regex, source)
        if not match:
            return result

        source_unit = RegExpUtility.get_group(match, 'unit').strip().lower()
        if source_unit not in self.config.unit_map:
            return result

        num_str = RegExpUtility.get_group(match, 'num')
        before_str = source[:match.start()].strip().lower()
        return self.__parse_common_duration_with_unit(before_str, source_unit,
                                                      num_str, reference)

    def _parse_duration(self, source: str, reference: datetime) -> DateTimeResolutionResult:
        """Resolve a prefixed duration found by the configured duration extractor."""
        result = DateTimeResolutionResult()

        # for case "前两年" "后三年"
        duration_result = next(
            iter(self.config.duration_extractor.extract(source, reference)),
            None)
        if not duration_result:
            return result

        match = regex.search(self.unit_regex, duration_result.text)
        if not match:
            return result

        source_unit = RegExpUtility.get_group(match, 'unit').strip().lower()
        if source_unit not in self.config.unit_map:
            return result

        before_str = source[:duration_result.start].strip().lower()
        # Everything in the duration text before the unit is the number.
        number_str = duration_result.text[:match.start()].strip().lower()
        number_val = self.__convert_chinese_to_number(number_str)
        num_str = str(number_val)
        return self.__parse_common_duration_with_unit(before_str, source_unit,
                                                      num_str, reference)

    def __parse_common_duration_with_unit(
            self, before: str, unit: str, num: str,
            reference: datetime) -> DateTimeResolutionResult:
        """Build a range of ``num`` units before or after ``reference``.

        ``before`` must match the configured past or future regex in full,
        otherwise the result is unsuccessful. Future ranges are moved forward
        by one day.
        """
        result = DateTimeResolutionResult()
        unit_str = self.config.unit_map[unit]

        past_match = regex.search(self.config.past_regex, before)
        has_past = past_match and len(past_match.group()) == len(before)
        future_match = regex.search(self.config.future_regex, before)
        has_future = future_match and len(future_match.group()) == len(before)

        if not has_future and not has_past:
            return result

        begin_date = reference
        end_date = reference
        difference = float(num)

        if unit_str == 'D':
            if has_past:
                begin_date += timedelta(days=-difference)
            if has_future:
                end_date += timedelta(days=difference)
        elif unit_str == 'W':
            if has_past:
                begin_date += timedelta(days=-7 * difference)
            if has_future:
                end_date += timedelta(days=7 * difference)
        elif unit_str == 'MON':
            if has_past:
                begin_date += datedelta(months=int(-difference))
            if has_future:
                end_date += datedelta(months=int(difference))
        elif unit_str == 'Y':
            if has_past:
                begin_date += datedelta(years=int(-difference))
            if has_future:
                end_date += datedelta(years=int(difference))
        else:
            return result

        if has_future:
            begin_date += timedelta(days=1)
            end_date += timedelta(days=1)

        begin_timex = FormatUtil.luis_date_from_datetime(begin_date)
        end_timex = FormatUtil.luis_date_from_datetime(end_date)
        result.timex = f'({begin_timex},{end_timex},P{num}{unit_str[0]})'
        result.future_value = [begin_date, end_date]
        result.past_value = [begin_date, end_date]
        result.success = True
        return result

    def __convert_chinese_to_number(self, source: str) -> int:
        """Return the first integer extracted from ``source``, or -1 if there is none."""
        num = -1
        er = next(iter(self.integer_extractor.extract(source)), None)
        if er and er.type == NumberConstants.SYS_NUM_INTEGER:
            num = int(self.number_parser.parse(er).value)
        return num

    def _parse_year_and_month(self, source: str, reference: datetime) -> DateTimeResolutionResult:
        """Resolve a "<year><month>" expression to a one-month range."""
        result = DateTimeResolutionResult()

        match = regex.search(self.year_and_month_regex, source)
        if not match or len(match.group()) != len(source):
            match = regex.search(self.pure_number_year_and_month_regex, source)
        if not match or len(match.group()) != len(source):
            return result

        year = reference.year
        year_num = RegExpUtility.get_group(match, 'year')
        year_chinese = RegExpUtility.get_group(match, 'yearchs')
        year_relative = RegExpUtility.get_group(match, 'yearrel')

        if year_num.strip() != '':
            if self.config.is_year_only(year_num):
                year_num = year_num[:-1]
            year = self._convert_year(year_num, False)
        elif year_chinese.strip() != '':
            if self.config.is_year_only(year_chinese):
                year_chinese = year_chinese[:-1]
            year = self._convert_year(year_chinese, True)
        elif year_relative.strip() != '':
            year += self.config.get_swift_day_or_month(year_relative)

        # Expand two-digit years.
        if year < 100 and year >= 90:
            year += 1900
        elif year < 100 and year < 20:
            year += 2000

        month_str = RegExpUtility.get_group(match, 'month')
        month = self.config.month_of_year.get(month_str, 0)
        if month > 12:
            # Fold values above 12 back into 1..12. A plain `% 12` would
            # also turn December (12) into the invalid month 0.
            month = (month - 1) % 12 + 1

        begin_date = DateUtils.safe_create_from_min_value(year, month, 1)
        end_date = DateUtils.safe_create_from_min_value(
            year, month, 1) + datedelta(months=1)

        result.future_value = [begin_date, end_date]
        result.past_value = [begin_date, end_date]
        result.timex = f'{year:04d}-{month:02d}'
        result.success = True
        return result

    def _parse_year_to_year(self, source: str, reference: datetime) -> DateTimeResolutionResult:
        """Resolve a "<year> to <year>" expression to a multi-year range.

        Both years may be numeric, both Chinese, or one of each (ordered by
        their position in the text).
        """
        result = DateTimeResolutionResult()

        match = regex.search(self.year_to_year_regex, source)
        if not match:
            return result

        year_matches = list(regex.finditer(self.config.year_regex, source))
        chinese_year_matches = list(
            regex.finditer(self.chinese_year_regex, source))

        begin_year = 0
        end_year = 0

        if len(year_matches) == 2:
            begin_year = self.__convert_chinese_to_number(
                RegExpUtility.get_group(year_matches[0], 'year'))
            end_year = self.__convert_chinese_to_number(
                RegExpUtility.get_group(year_matches[1], 'year'))
        elif len(chinese_year_matches) == 2:
            begin_year = self.__convert_chinese_to_number(
                RegExpUtility.get_group(chinese_year_matches[0], 'yearchs'))
            end_year = self.__convert_chinese_to_number(
                RegExpUtility.get_group(chinese_year_matches[1], 'yearchs'))
        elif len(year_matches) == 1 and len(chinese_year_matches) == 1:
            if year_matches[0].start() < chinese_year_matches[0].start():
                begin_year = self.__convert_chinese_to_number(
                    RegExpUtility.get_group(year_matches[0], 'year'))
                end_year = self.__convert_chinese_to_number(
                    RegExpUtility.get_group(chinese_year_matches[0], 'yearchs'))
            else:
                begin_year = self.__convert_chinese_to_number(
                    RegExpUtility.get_group(chinese_year_matches[0], 'yearchs'))
                end_year = self.__convert_chinese_to_number(
                    RegExpUtility.get_group(year_matches[0], 'year'))

        begin_year = self.__sanitize_year(begin_year)
        end_year = self.__sanitize_year(end_year)

        begin_date = DateUtils.safe_create_from_min_value(begin_year, 1, 1)
        end_date = DateUtils.safe_create_from_min_value(end_year, 1, 1)

        result.future_value = [begin_date, end_date]
        result.past_value = [begin_date, end_date]

        begin_timex = FormatUtil.luis_date_from_datetime(begin_date)
        end_timex = FormatUtil.luis_date_from_datetime(end_date)
        result.timex = f'({begin_timex},{end_timex},P{end_year - begin_year}Y)'
        result.success = True
        return result

    def __sanitize_year(self, year: int) -> int:
        """Expand a two-digit year: [90, 100) -> +1900, below 20 -> +2000."""
        result = year
        if year < 100 and year >= 90:
            result += 1900
        elif year < 100 and year < 20:
            result += 2000
        return result

    def _parse_year(self, source: str, reference: datetime) -> DateTimeResolutionResult:
        """Resolve a text that consists solely of a year to a one-year range."""
        source = source.strip().lower()
        result = DateTimeResolutionResult()
        is_chinese = False

        match = regex.search(self.config.year_regex, source)
        if not match or len(match.group()) != len(source):
            match = regex.search(self.year_in_chinese_regex, source)
            is_chinese = match and len(match.group()) == len(source)

        if not match or len(match.group()) != len(source):
            return result

        year_str = match.group()
        if self.config.is_year_only(year_str):
            # Drop the trailing year marker character before converting.
            year_str = year_str[:-1].strip()

        year = self._convert_year(year_str, is_chinese)

        # Two-character years are expanded with a pivot at 30.
        if len(year_str) == 2:
            if year < 100 and year >= 30:
                year += 1900
            elif year < 30:
                year += 2000

        begin_day = DateUtils.safe_create_from_min_value(year, 1, 1)
        end_day = DateUtils.safe_create_from_min_value(year + 1, 1, 1)

        result.timex = f'{year:04d}'
        result.future_value = [begin_day, end_day]
        result.past_value = [begin_day, end_day]
        result.success = True
        return result

    def _convert_year(self, year_str: str, is_chinese: bool) -> int:
        """Convert a year string into an int; returns -1 when the value is 0.

        Chinese text is parsed as a whole number first; if that yields less
        than 10 the text is re-read one character at a time as a sequence of
        digits.
        """
        year = -1
        if is_chinese:
            year_num = 0
            er = next(iter(self.integer_extractor.extract(year_str)), None)
            if er and er.type == NumberConstants.SYS_NUM_INTEGER:
                year_num = int(self.number_parser.parse(er).value)

            if year_num < 10:
                year_num = 0
                for char in year_str:
                    year_num *= 10
                    er = next(iter(self.integer_extractor.extract(char)), None)
                    if er and er.type == NumberConstants.SYS_NUM_INTEGER:
                        year_num += int(self.number_parser.parse(er).value)

            # Use the parsed value on both paths; previously the
            # digit-by-digit result was computed and then discarded.
            year = year_num
        else:
            year = int(year_str)

        return -1 if year == 0 else year

    def _get_week_of_month(self, cardinal, month, year, reference, no_year) -> DateTimeResolutionResult:
        """Resolve "the <cardinal>-th week of <month>" to a week range.

        When no year was given, the future/past values are shifted by one
        year so that they lie after/before ``reference`` respectively.
        """
        result = DateTimeResolutionResult()
        seed_date = self._compute_date(cardinal, 1, month, year)

        future_date = seed_date
        past_date = seed_date

        if no_year and future_date < reference:
            future_date = self._compute_date(cardinal, 1, month, year + 1)
            if not future_date.month == month:
                # Overshot into the following month: step back one week.
                future_date = future_date + timedelta(days=-7)

        if no_year and past_date >= reference:
            past_date = self._compute_date(cardinal, 1, month, year - 1)
            if not past_date.month == month:
                past_date = past_date + timedelta(days=-7)

        result.timex = ('XXXX' if no_year else f'{year:04d}') + f'-{month:02d}-W{cardinal:02d}'

        days_to_add = 6 if self._inclusive_end_period else 7
        result.future_value = [
            future_date, future_date + timedelta(days=days_to_add)
        ]
        result.past_value = [
            past_date, past_date + timedelta(days=days_to_add)
        ]
        result.success = True
        return result

    def _compute_date(self, cardinal: int, weekday: int, month: int, year: int):
        """Return the date of the ``cardinal``-th ``weekday`` relative to the first of the month."""
        first_day = datetime(year, month, 1)
        first_week_day = DateUtils.this(first_day, weekday)

        # Normalise both weekday encodings before comparing them:
        # the requested weekday 0 becomes 7, ISO weekday 7 becomes 0.
        if weekday == 0:
            weekday = 7
        first_day_of_week = first_day.isoweekday()
        if first_day_of_week == 7:
            first_day_of_week = 0

        if weekday < first_day_of_week:
            first_week_day = DateUtils.next(first_day, weekday)

        first_week_day = first_week_day + timedelta(days=7 * (cardinal - 1))
        return first_week_day

    def _parse_season(self, source: str, reference: datetime) -> DateTimeResolutionResult:
        """Resolve a season, optionally qualified by a year.

        The timex is only filled in when a year is part of the match; the
        result is marked successful whenever the regex spans the whole text.
        """
        result = DateTimeResolutionResult()
        match = regex.search(self.season_with_year_regex, source)
        if not match or len(match.group()) != len(source):
            return result

        year = reference.year
        year_num = RegExpUtility.get_group(match, 'year')
        year_chinese = RegExpUtility.get_group(match, 'yearchs')
        year_relative = RegExpUtility.get_group(match, 'yearrel')
        has_year = False

        if year_num.strip() != '':
            has_year = True
            if self.config.is_year_only(year_num):
                year_num = year_num[:-1]
            year = self._convert_year(year_num, False)
        elif year_chinese.strip() != '':
            has_year = True
            if self.config.is_year_only(year_chinese):
                year_chinese = year_chinese[:-1]
            year = self._convert_year(year_chinese, True)
        elif year_relative.strip() != '':
            has_year = True
            year += self.config.get_swift_day_or_month(year_relative)

        # Expand two-digit years.
        if year < 100 and year >= 90:
            year += 1900
        elif year < 100 and year < 20:
            year += 2000

        season_str = RegExpUtility.get_group(match, 'season')
        season = self.config.season_map.get(season_str, None)

        if has_year:
            result.timex = f'{year:02d}-{season}'

        result.success = True
        return result

    def _parse_quarter(self, source: str, reference: datetime) -> DateTimeResolutionResult:
        """Resolve a quarter, optionally qualified by a year, to a 3-month range.

        Returns an unsuccessful result when the regex does not span the whole
        text or when the matched cardinal is not present in
        ``config.cardinal_map``.
        """
        result = DateTimeResolutionResult()
        match = regex.search(self.config.quarter_regex, source)
        if not match or len(match.group()) != len(source):
            return result

        year = reference.year
        year_num = RegExpUtility.get_group(match, 'year')
        year_chinese = RegExpUtility.get_group(match, 'yearchs')
        year_relative = RegExpUtility.get_group(match, 'yearrel')
        has_year = False

        if year_num.strip() != '':
            has_year = True
            if self.config.is_year_only(year_num):
                year_num = year_num[:-1]
            year = self._convert_year(year_num, False)
        elif year_chinese.strip() != '':
            has_year = True
            if self.config.is_year_only(year_chinese):
                year_chinese = year_chinese[:-1]
            year = self._convert_year(year_chinese, True)
        elif year_relative.strip() != '':
            has_year = True
            year += self.config.get_swift_day_or_month(year_relative)

        # Expand two-digit years.
        if year < 100 and year >= 90:
            year += 1900
        elif year < 100 and year < 20:
            year += 2000

        cardinal_str = RegExpUtility.get_group(match, 'cardinal')
        quarter_num = self.config.cardinal_map.get(cardinal_str, None)
        if quarter_num is None:
            # Unknown cardinal: report "not parsed" instead of failing on
            # arithmetic with None.
            return result

        begin_date = DateUtils.safe_create_from_min_value(
            year, quarter_num * 3 - 2, 1)

        # The exclusive end is the first day of the month after the quarter.
        # For the fourth quarter that is January of the following year;
        # month 13 is not a valid calendar month.
        end_year = year
        end_month = quarter_num * 3 + 1
        if end_month > 12:
            end_month -= 12
            end_year += 1
        end_date = DateUtils.safe_create_from_min_value(end_year, end_month, 1)

        result.future_value = [begin_date, end_date]
        result.past_value = [begin_date, end_date]

        begin_luis = FormatUtil.luis_date_from_datetime(begin_date)
        end_luis = FormatUtil.luis_date_from_datetime(end_date)
        result.timex = f'({begin_luis},{end_luis},P3M)'
        result.success = True
        return result