def get_year_from_text(self, match: Match) -> int:
    """Resolve a year value from the groups captured by ``match``.

    Resolution order:
      1. A numeric ``year`` group.  Two-digit values are pivoted into the
         1900s or 2000s using the configured two-digit-year constants.
      2. The spoken-number groups ``FIRST_TWO_YEAR_NUM`` /
         ``LAST_TWO_YEAR_NUM`` (e.g. "nineteen eighty-four"), parsed with
         the configured number parser and recombined.

    :param match: regex match that may contain year-related groups.
    :return: the resolved year, or ``Constants.INVALID_YEAR`` when no
        usable year was captured.
    """
    year = Constants.INVALID_YEAR

    year_str = RegExpUtility.get_group(match, 'year')
    # Truthiness already rules out None and '', so only a whitespace-only
    # capture needs the extra isspace() check.  (The original also tested
    # `year_str is None` AFTER str.isspace(year_str), which could never
    # help: isspace would have raised on None first.)
    if year_str and not year_str.isspace():
        year = int(year_str)
        if 100 > year >= Constants.MIN_TWO_DIGIT_YEAR_PAST_NUM:
            # e.g. "95" -> 1995
            year += 1900
        elif 0 <= year < Constants.MAX_TWO_DIGIT_YEAR_FUTURE_NUM:
            # e.g. "07" -> 2007
            year += 2000
    else:
        first_two_year_num_str = RegExpUtility.get_group(
            match, Constants.FIRST_TWO_YEAR_NUM)
        if first_two_year_num_str and not first_two_year_num_str.isspace():
            first_two_year_num = self._parse_num_group(
                match, first_two_year_num_str)

            last_two_year_num = 0
            last_two_year_num_str = RegExpUtility.get_group(
                match, Constants.LAST_TWO_YEAR_NUM)
            # Guard emptiness the same way as the first group, so an
            # empty capture no longer produces a bogus ExtractResult.
            if last_two_year_num_str and not last_two_year_num_str.isspace():
                last_two_year_num = self._parse_num_group(
                    match, last_two_year_num_str)

            # Reject partial captures: a first part under 100 with no
            # second part (e.g. "nineteen" alone), or a bare round-tens
            # word ("twenty") with a single-token remainder.
            if (first_two_year_num < 100 and last_two_year_num == 0) \
                    or (first_two_year_num < 100
                        and first_two_year_num % 10 == 0
                        and len(last_two_year_num_str.strip().split(' ')) == 1):
                return Constants.INVALID_YEAR

            if first_two_year_num >= 100:
                # e.g. "nineteen hundred" (1900) + "eighty four" (84)
                year = first_two_year_num + last_two_year_num
            else:
                # e.g. "nineteen" (19) * 100 + "eighty four" (84)
                year = (first_two_year_num * 100) + last_two_year_num

    return year

def _parse_num_group(self, match: Match, group_str: str) -> int:
    """Parse one captured spoken-number group into an int (0 on failure).

    Builds an ExtractResult positioned at the group's first occurrence in
    the matched string and runs it through the configured number parser.
    The original code invoked ``parse`` twice for the same result and
    would raise AttributeError if ``parse`` returned None; here we parse
    once and guard the None case.
    """
    er = ExtractResult()
    er.text = group_str
    er.start = match.string.index(group_str)
    er.length = len(group_str)
    parsed = self.config.number_parser.parse(er)
    value = parsed.value if parsed else None
    return value if value else 0
def try_merge_modifier_token(self, extract_result: ExtractResult,
                             token_regex: Pattern, text: str) -> bool:
    """Try to absorb a leading modifier token into ``extract_result``.

    Looks in the text immediately before the extraction for a token
    matching ``token_regex`` (e.g. "early", "late").  When found, the
    extraction's start is pulled back to the token and its length and
    text are updated accordingly.

    :param extract_result: extraction to expand in place (mutated).
    :param token_regex: pattern identifying the modifier token.
    :param text: the full input text the extraction came from.
    :return: True if a modifier token was merged, otherwise False.
    """
    start = extract_result.start if extract_result.start else 0
    before_str = text[0:start]

    # The original called has_token_index() twice (once for .matched,
    # once to unpack); call it once and reuse the result.
    token_result = self.has_token_index(before_str.rstrip(), token_regex)
    if token_result.matched:
        _, token_index = token_result
        mod_length = len(before_str) - token_index
        extract_result.length += mod_length
        extract_result.start -= mod_length
        # Re-read start/length defensively (None is treated as 0).
        start = extract_result.start if extract_result.start else 0
        length = extract_result.length if extract_result.length else 0
        extract_result.text = text[start: start + length]
        return True

    return False