def basic_regex_match(self, source: str) -> []:
    """Collect a token for every configured date regex match in ``source``.

    Every pattern in ``self.config.date_regex_list`` is scanned over
    ``source``.  Matches for which ``self.validate_match`` returns a falsy
    value are skipped.  When the text in front of a match ends with a strict
    relative term (cases like "this 5/12", "next friday 5/12"), the token is
    widened so that it starts at that term.

    :param source: text to scan.
    :return: list of ``Token`` objects, ordered by pattern and then by match
        position within each pattern.
    """
    from .utilities import Token
    from .utilities import RegExpUtility
    ret: List[Token] = []

    for regexp in self.config.date_regex_list:
        for match in regexp.finditer(source):
            # some match might be part of the date range entity, and might be
            # split in a wrong way
            if not self.validate_match(match, source):
                continue

            # Take the offsets from the match itself.  Looking the matched
            # text up again with ``source.index`` returns its FIRST occurrence,
            # which is wrong when the same date text appears more than once.
            start = match.start()
            end = match.end()

            # Cases that the relative term is before the detected date
            # entity, like "this 5/12", "next friday 5/12"
            pre_text = source[:start]
            relative_match = RegExpUtility.match_end(
                self.config.strict_relative_regex, pre_text, True)
            if relative_match and relative_match.success:
                start = relative_match.index

            ret.append(Token(start, end))

    return ret
def extract_relative_duration_date_with_in_prefix(self, text: str, duration_er: [ExtractResult], reference: datetime):
    """Find duration expressions that are attached to an "in" connector.

    Only duration extractions whose text matches
    ``self.config.date_unit_regex`` are considered.  For each of them the
    text before the duration is checked for a trailing connector; when that
    fails and ``self.config.check_both_before_after`` is set, the text after
    the duration is checked for a leading connector instead.

    :param text: the full input text the extractions refer to.
    :param duration_er: duration extraction results (``text``, ``start`` and
        ``length`` attributes are read).
    :param reference: reference date; not used by this method.
    :return: ``flatten`` applied to the per-duration token lists.
    """
    from .utilities import Token
    result: [Token] = []
    durations: [Token] = []

    for duration_extraction in duration_er:
        if self.config.date_unit_regex.search(duration_extraction.text):
            start = duration_extraction.start or 0
            # Parenthesised on purpose: ``a + b or 0`` parses as
            # ``(a + b) or 0`` and would raise on a missing length.
            end = start + (duration_extraction.length or 0)
            durations.append(Token(start, end))

    for duration in durations:
        before_str = text[:duration.start]
        after_str = text[duration.start + duration.length:]

        # Nothing but the duration itself (empty or whitespace-only context
        # on both sides): there is no room for a connector.
        if not before_str.strip() and not after_str.strip():
            continue

        # Connector in front of the duration: search the end of the text that
        # precedes it; the text that follows is the candidate suffix.
        ers, success = self.extract_in_connector(
            text, before_str, after_str, duration, True)
        result.append(ers)

        if not success and self.config.check_both_before_after:
            # Fall back to a connector right after the duration.
            ers, success = self.extract_in_connector(
                text, after_str, before_str, duration, False)
            result.append(ers)

    return flatten(result)
def extract_in_connector(self, text, first_str, second_str, duration, in_prefix):
    """Build the token for a duration that sits next to an "in" connector.

    :param text: the full input text; the duration span is sliced from it.
    :param first_str: string searched for ``self.config.in_connector_regex``
        (at its end when ``in_prefix`` is true, at its beginning otherwise).
    :param second_str: string tested against
        ``self.config.since_year_suffix_regex``.
    :param duration: span of the duration (``start``, ``length`` and ``end``
        attributes are read).
    :param in_prefix: selects ``RegExpUtility.match_end`` (true) or
        ``RegExpUtility.match_begin`` (false) for the connector search.
    :return: ``(tokens, success)`` where ``tokens`` holds at most one
        ``Token`` and ``success`` reflects whether the connector was found.
    """
    from .utilities import Token
    result = []

    if in_prefix:
        match = RegExpUtility.match_end(
            self.config.in_connector_regex, first_str, True)
    else:
        match = RegExpUtility.match_begin(
            self.config.in_connector_regex, first_str, True)

    success = match.success if match else False
    if not success:
        return result, success

    # NOTE(review): ``match.index`` is relative to ``first_str`` but is used
    # as an offset into ``text``; that is only the same thing when
    # ``first_str`` is a prefix of ``text`` - confirm for in_prefix=False.
    start_token = match.index

    duration_text = text[duration.start: duration.start + duration.length]
    # NOTE(review): ``.match`` only matches at position 0 of the duration
    # text - confirm that is intended for ``range_unit_regex``.
    if self.config.range_unit_regex.match(duration_text):
        end_token = duration.end
        since_year_match = self.config.since_year_suffix_regex.match(second_str)
        if since_year_match:
            # Match objects do not support ``len()``; measure the matched text.
            end_token += len(since_year_match.group())
        result.append(Token(start_token, end_token))

    return result, success
def number_with_month(self, source: str, reference: datetime) -> []:
    """Extract date tokens that are built around a day-of-month number.

    Ordinal and integer extractions in the range 1..31 are examined against
    several patterns, in this order:

    1. a month expression directly before the number (``month_end``);
    2. 'for the 25th' (``for_the_regex``);
    3. 'Thursday the 21st', accepted only when the weekday agrees with that
       day in the reference month (``week_day_and_day_of_month_regex``);
    4. 'Monday 21' (``week_day_and_day_regex``);
    5. '20th of next month' (``relative_month_regex``);
    6. 'second Sunday' (``week_day_regex``, ordinals 1..5 only);
    7. 'twenty second of June' (``of_month``).

    Patterns 1-4 stop the processing of the current number once they
    produce a token; 5-7 are all evaluated.

    :param source: text to scan.
    :param reference: reference date used for month/year defaults.
    :return: list of ``Token`` spans found in ``source``.
    """
    from .utilities import Token
    from .utilities import DateUtils
    ret: List[Token] = []

    extract_results = self.config.ordinal_extractor.extract(source)
    extract_results.extend(self.config.integer_extractor.extract(source))

    for result in extract_results:
        num = int(self.config.number_parser.parse(result).value)
        if num < 1 or num > 31:
            continue

        # Normalise once so a missing start/length cannot crash the
        # comparisons and slices below.
        res_start = result.start or 0
        res_length = result.length or 0
        res_end = res_start + res_length

        if res_start >= 0:
            front_string = source[:res_start]
            match = regex.search(self.config.month_end, front_string)
            if match is not None:
                month_name = str(RegExpUtility.get_group(
                    match, Constants.MONTH_GROUP_NAME)).lower()
                start_index, end_index = self.extend_with_week_day_and_year(
                    match.start(), match.end() + res_length,
                    self.config.month_of_year[month_name],
                    num, source, reference)
                # Use the (possibly extended) start as well as the end.
                ret.append(Token(start_index, end_index))
                continue

            # handling cases like 'for the 25th'
            is_found = False
            for match_case in regex.finditer(self.config.for_the_regex, source):
                ordinal_num = RegExpUtility.get_group(
                    match_case, Constants.DAY_OF_MONTH)
                if ordinal_num == result.text:
                    length = len(RegExpUtility.get_group(
                        match_case, TimeTypeConstants.END))
                    ret.append(
                        Token(match_case.start(), match_case.end() - length))
                    is_found = True
            if is_found:
                continue

            # handling cases like 'Thursday the 21st', which both 'Thursday'
            # and '21st' refer to a same date
            for match_case in regex.finditer(
                    self.config.week_day_and_day_of_month_regex, source):
                ordinal_num = RegExpUtility.get_group(
                    match_case, Constants.DAY_OF_MONTH)
                if ordinal_num != result.text:
                    continue

                # get week of day for the ordinal number which is regarded
                # as a date of reference month
                date = DateUtils.safe_create_from_min_value(
                    reference.year, reference.month, num)
                num_week_day_str: str = calendar.day_name[date.weekday()].lower()

                # get week day from text directly, compare it with the
                # weekday generated above to see whether they refer to a
                # same week day
                extracted_week_day_str = RegExpUtility.get_group(
                    match_case, 'weekday').lower()
                if (date != DateUtils.min_value
                        and self.config.day_of_week[num_week_day_str]
                        == self.config.day_of_week[extracted_week_day_str]):
                    ret.append(Token(match_case.start(), match_case.end()))
                    is_found = True
            if is_found:
                continue

            # Handling cases like 'Monday 21', which both 'Monday' and '21'
            # refer to the same date.  The year of expected date can be
            # different to the year of referenceDate.
            for match_case in regex.finditer(
                    self.config.week_day_and_day_regex, source):
                # The number must close the weekday+day match: the distance
                # from the match start to the number's end has to equal the
                # match length (not the match offset).
                match_length = res_end - match_case.start()
                if match_length == match_case.end() - match_case.start():
                    ret.append(Token(match_case.start(), match_case.end()))
                    is_found = True
            if is_found:
                continue

            # handling cases like '20th of next month'
            suffix_str: str = source[res_end:].lower()
            # Only the whitespace BEFORE the matched text belongs to the
            # token; trailing whitespace must not extend its end.
            space_len = len(suffix_str) - len(suffix_str.lstrip())
            match = regex.match(
                self.config.relative_month_regex, suffix_str.strip())
            if match is not None and match.start() == 0:
                token_start = res_start
                token_end = res_end + space_len + len(match.group())

                # Check if prefix contains 'the', include it if any
                prefix_match = self.config.prefix_article_regex.match(
                    source[:res_start])
                if prefix_match:
                    token_start = prefix_match.start()

                ret.append(Token(token_start, token_end))

            # handling cases like 'second Sunday'
            suffix_str = source[res_end:]
            match = regex.match(self.config.week_day_regex, suffix_str.strip())
            if (match is not None and match.start() == 0
                    and 1 <= num <= 5
                    and result.type == NumberConstants.SYS_NUM_ORDINAL):
                week_day_str = RegExpUtility.get_group(
                    match, Constants.WEEKDAY_GROUP_NAME).lower()
                if week_day_str in self.config.day_of_week:
                    ret.append(Token(
                        res_start, res_end + space_len + len(match.group())))

        # For cases like "I'll go back twenty second of June"
        if res_end < len(source):
            after_string = source[res_end:]
            match = regex.match(self.config.of_month, after_string)
            if match is not None:
                month_key = RegExpUtility.get_group(
                    match, Constants.MONTH_GROUP_NAME).lower() or str(
                    reference.month)
                # Keep the extended span; it is returned, not applied in place.
                start_index, end_index = self.extend_with_week_day_and_year(
                    res_start, res_end + len(match.group()),
                    self.config.month_of_year[month_key],
                    num, source, reference)
                ret.append(Token(start_index, end_index))

    return ret