Esempio n. 1
0
    def __get_year_from_text(self, match) -> int:
        """Resolve a spelled-out year from the named groups of *match*.

        The ``firsttwoyearnum`` and ``lasttwoyearnum`` groups are converted
        with ``self.config.number_parser`` and combined into one number.

        Returns ``-1`` when the first group is missing or empty, when the
        first number is below 100 and the second is zero, or when the first
        number is a multiple of ten below 100 and the second group consists
        of a single space-separated word.
        """
        leading_text = match.group('firsttwoyearnum')
        if not leading_text:
            return -1

        # One ExtractResult is reused for both number-parser calls.
        number_er = ExtractResult()

        def parse_group(group_name, group_text):
            number_er.text = group_text
            number_er.start = match.start(group_name)
            number_er.length = match.end(group_name) - number_er.start
            return self.config.number_parser.parse(number_er).value

        leading_num = parse_group('firsttwoyearnum', leading_text)

        trailing_text = match.group('lasttwoyearnum')
        trailing_num = 0
        if trailing_text:
            trailing_num = parse_group('lasttwoyearnum', trailing_text)

        # trailing_text can only be None here when trailing_num == 0, and
        # that case short-circuits before the string is touched.
        if leading_num < 100 and (
                trailing_num == 0
                or (leading_num % 10 == 0
                    and len(trailing_text.strip().split(' ')) == 1)):
            return -1

        if leading_num >= 100:
            return leading_num + trailing_num

        return leading_num * 100 + trailing_num
Esempio n. 2
0
    def get_year_from_text(self, match: Match) -> int:
        """Extract a year from *match*, written as digits or as number words.

        If the ``year`` group holds text it is converted with ``int``; values
        in ``[Constants.MIN_TWO_DIGIT_YEAR_PAST_NUM, 100)`` get 1900 added and
        values in ``[0, Constants.MAX_TWO_DIGIT_YEAR_FUTURE_NUM)`` get 2000
        added. Otherwise the ``Constants.FIRST_TWO_YEAR_NUM`` and
        ``Constants.LAST_TWO_YEAR_NUM`` groups are converted with
        ``self.config.number_parser`` and combined.

        Returns ``Constants.INVALID_YEAR`` when no usable group is present or
        the two parsed numbers are rejected by the plausibility check below.
        """
        year_str = RegExpUtility.get_group(match, 'year')
        if year_str and not year_str.isspace():
            year = int(year_str)
            if 100 > year >= Constants.MIN_TWO_DIGIT_YEAR_PAST_NUM:
                year += 1900
            elif 0 <= year < Constants.MAX_TWO_DIGIT_YEAR_FUTURE_NUM:
                year += 2000
            return year

        first_text = RegExpUtility.get_group(match,
                                             Constants.FIRST_TWO_YEAR_NUM)
        if not first_text or first_text.isspace():
            return Constants.INVALID_YEAR

        def parse_number(group_name, group_text):
            er = ExtractResult()
            er.text = group_text
            # Use the group's own offset: str.index() would return the first
            # occurrence of the text anywhere in the input, which is wrong
            # whenever the same words appear earlier in the sentence.
            er.start = match.start(group_name)
            er.length = len(group_text)
            # Parse once; a missing/falsy value counts as 0.
            value = self.config.number_parser.parse(er).value
            return value if value else 0

        first_two_year_num = parse_number(Constants.FIRST_TWO_YEAR_NUM,
                                          first_text)

        last_text = RegExpUtility.get_group(match,
                                            Constants.LAST_TWO_YEAR_NUM)
        last_two_year_num = 0
        # Nothing to parse when the group is missing, empty or blank.
        if last_text and not last_text.isspace():
            last_two_year_num = parse_number(Constants.LAST_TWO_YEAR_NUM,
                                             last_text)

        # last_text is necessarily non-empty when last_two_year_num != 0, so
        # the strip() below is only reached with a real string.
        if first_two_year_num < 100 and (
                last_two_year_num == 0
                or (first_two_year_num % 10 == 0
                    and len(last_text.strip().split(' ')) == 1)):
            return Constants.INVALID_YEAR

        if first_two_year_num >= 100:
            return first_two_year_num + last_two_year_num

        return (first_two_year_num * 100) + last_two_year_num
Esempio n. 3
0
    def add_mod_item(self, er: ExtractResult, source: str) -> ExtractResult:
        """Widen *er* to the left over modifier tokens found in front of it.

        The text preceding the extraction (taken once, from the incoming
        ``er.start``) is checked in turn against the configured
        ``before_regex``, ``after_regex`` and ``since_regex``. Whenever
        ``has_token_index`` reports a match, ``er.start``, ``er.length`` and
        ``er.text`` are extended so the result begins at the reported index.

        Returns the same, mutated, *er*.
        """
        prefix = source[0:er.start]
        trimmed_prefix = prefix.strip()

        for regex_attr in ('before_regex', 'after_regex', 'since_regex'):
            token = self.has_token_index(trimmed_prefix,
                                         getattr(self.config, regex_attr))
            if not token.matched:
                continue

            shift = len(prefix) - token.index
            er.length += shift
            er.start -= shift
            er.text = source[er.start:er.start + er.length]

        return er
Esempio n. 4
0
    def try_merge_modifier_token(self,
                                 extract_result: ExtractResult,
                                 pattern: Pattern,
                                 source: str,
                                 potential_ambiguity: bool = False) -> bool:
        """Try to absorb a modifier token adjacent to *extract_result*.

        The text before the entity is checked with ``has_token_index``; on a
        match the result is extended leftwards to the token. If that fails and
        ``self.config.check_both_before_after`` is set, a token immediately
        following the entity is looked for and, if found, the result is
        extended rightwards and ``data`` is set to ``Constants.HAS_MOD``.

        When *potential_ambiguity* is set and the configured ambiguous prefix
        occurs before the entity, the call instead returns whether any
        ``potential_ambiguous_range_regex`` match overlaps the entity, without
        modifying it.

        Returns True when the result was extended (or the ambiguity check
        found an overlap), False otherwise.
        """
        before_str = source[0:extract_result.start]

        # Avoid adding mod for ambiguity cases, such as "from" in "from ... to ..." should not add mod
        if potential_ambiguity and self.config.ambiguous_range_modifier_prefix and \
                regex.search(self.config.ambiguous_range_modifier_prefix, before_str):
            matches = list(
                regex.finditer(self.config.potential_ambiguous_range_regex,
                               source))
            if matches:
                entity_end = extract_result.start + extract_result.length
                return any(match.start() < entity_end
                           and match.end() > extract_result.start
                           for match in matches)

        token = self.has_token_index(before_str.strip(), pattern)
        if token.matched:
            mod_len = len(before_str) - token.index
            extract_result.length += mod_len
            extract_result.start -= mod_len
            extract_result.text = source[extract_result.start:
                                         extract_result.start +
                                         extract_result.length]

            extract_result.meta_data = self.assign_mod_metadata(
                extract_result.meta_data)
            return True

        if self.config.check_both_before_after:
            # The text AFTER the entity starts at start + length; slicing
            # source[start:length] (as before) yields an unrelated fragment.
            after_str = source[extract_result.start + extract_result.length:]
            # Only leading blanks separate the entity from a suffix token, so
            # only those may be counted in the extension length.
            trimmed_after = after_str.lstrip()
            # The token has to begin right after the entity, hence an
            # anchored match instead of a search.
            suffix = regex.match(pattern, trimmed_after)
            if suffix is not None:
                mod_len = suffix.end() + len(after_str) - len(trimmed_after)
                extract_result.length += mod_len
                extract_result.text = source[extract_result.start:
                                             extract_result.start +
                                             extract_result.length]
                extract_result.data = Constants.HAS_MOD
                extract_result.meta_data = self.assign_mod_metadata(
                    extract_result.meta_data)
                return True

        return False
Esempio n. 5
0
    def _extract_separate_units(self, source: str, num_depend_source: List[ExtractResult], non_unit_matches) -> List[ExtractResult]:
        """Append stand-alone unit matches to *num_depend_source*.

        Every non-empty ``self.separate_regex`` match in *source* that does
        not overlap an existing result is added as a new ``ExtractResult`` of
        type ``self.config.extract_type``. A match equal to
        ``Constants.AMBIGUOUS_TIME_TERM`` is skipped when
        ``self._dimension_inside_time`` is true for any entry of
        *non_unit_matches*.

        *num_depend_source* is extended in place, as before; the same list is
        now also returned so the result matches the declared return type.
        """
        # Characters already covered by an existing result.
        covered: List[bool] = [False] * len(source)
        for ex_result in num_depend_source:
            for offset in range(ex_result.length):
                covered[ex_result.start + offset] = True

        for match in regex.finditer(self.separate_regex, source):
            text = match.group()
            if not text:
                continue

            span = range(match.start(), match.end())
            if any(covered[pos] for pos in span):
                continue

            # Claim the characters of THIS match. The previous code marked
            # indices 0..len-1 regardless of where the match was located.
            for pos in span:
                covered[pos] = True

            if text == Constants.AMBIGUOUS_TIME_TERM and any(
                    self._dimension_inside_time(match, time)
                    for time in non_unit_matches):
                continue

            to_add = ExtractResult()
            to_add.start = match.start()
            to_add.length = len(text)
            to_add.text = text
            to_add.type = self.config.extract_type
            num_depend_source.append(to_add)

        return num_depend_source
Esempio n. 6
0
    def extract(self, source: str) -> List[ExtractResult]:
        """Extract the spans of *source* matched by ``self.regexes``.

        All regex matches are painted onto a per-character mask; each maximal
        run of matched characters becomes a candidate. A candidate is emitted
        only when one single regex match covers exactly that run, and its
        ``data`` is the value associated with that regex.

        Candidates that start (or end) with ``Constants.IPV6_ELLIPSIS`` are
        dropped when the character directly before (or after) them is a digit
        or a non-CJK letter.

        Returns an empty list when ``self._pre_check_str(source)`` is false.
        """
        result: List[ExtractResult] = list()
        if not self._pre_check_str(source):
            return result

        matched: List[bool] = [False] * len(source)
        match_source: Dict[Match, str] = dict()

        for entry in self.regexes:
            for m in re.finditer(entry.re, source):
                for j in range(len(m.group())):
                    matched[m.start() + j] = True
                # Keep Source Data for extra information
                match_source[m] = entry.val

        simple_tokenizer = SimpleTokenizer()

        def is_digit_or_non_cjk_letter(ch: str) -> bool:
            return ch.isdigit() or (ch.isalpha()
                                    and not simple_tokenizer.is_cjk(c=ch))

        last = -1
        for i in range(len(source)):
            if not matched[i]:
                last = i
                continue
            # Act only on the final character of a matched run.
            if i + 1 != len(source) and matched[i + 1]:
                continue

            start = last + 1
            length = i - last
            substring = source[start:start + length].strip()

            if substring.startswith(Constants.IPV6_ELLIPSIS) and start > 0 \
                    and is_digit_or_non_cjk_letter(source[start - 1]):
                continue

            # The CJK exemption must look at the character that FOLLOWS the
            # run (source[i + 1]); it previously inspected source[start - 1],
            # which for start == 0 silently wrapped to the last character.
            if substring.endswith(Constants.IPV6_ELLIPSIS) \
                    and i + 1 < len(source) \
                    and is_digit_or_non_cjk_letter(source[i + 1]):
                continue

            src_match = next(
                (x for x in match_source
                 if x.start() == start and x.end() - x.start() == length),
                None)
            if src_match is None:
                continue

            value = ExtractResult()
            value.start = start
            value.length = length
            value.text = substring
            value.type = self._extract_type
            value.data = match_source[src_match]
            result.append(value)

        return result
Esempio n. 7
0
    def try_merge_modifier_token(self,
                                 er: ExtractResult,
                                 pattern: Pattern,
                                 source: str,
                                 potentialAmbiguity: bool = False) -> bool:
        """Try to extend *er* leftwards over a modifier token.

        When *potentialAmbiguity* is set, the configured ambiguous prefix
        occurs before the entity and ``potential_ambiguous_range_regex`` has
        matches in *source*, the outcome of ``self._filter_item`` is returned.

        Otherwise the text before the entity is checked with
        ``has_token_index``; on a match ``er.start``, ``er.length``,
        ``er.text`` and ``er.meta_data`` are updated and True is returned.
        Returns False when no token is found.
        """
        prefix = source[0:er.start]

        # Avoid adding mod for ambiguity cases, such as "from" in "from ... to ..." should not add mod
        if potentialAmbiguity:
            ambiguous_prefix = self.config.ambiguous_range_modifier_prefix
            if ambiguous_prefix and regex.search(ambiguous_prefix, prefix):
                range_matches = list(
                    regex.finditer(
                        self.config.potential_ambiguous_range_regex, source))
                if range_matches:
                    return self._filter_item(er, range_matches)

        token = self.has_token_index(prefix.strip(), pattern)
        if not token.matched:
            return False

        shift = len(prefix) - token.index
        er.length += shift
        er.start -= shift
        er.text = source[er.start:er.start + er.length]
        er.meta_data = self.assign_mod_metadata(er.meta_data)
        return True
Esempio n. 8
0
 def _extract_separate_units(
         self, source: str,
         num_depend_source: List[ExtractResult]) -> List[ExtractResult]:
     """Return a copy of *num_depend_source* plus stand-alone unit matches.

     Every non-empty ``self.separate_regex`` match in *source* that does not
     overlap one of the given results is appended to a deep copy of the
     input list as a new ``ExtractResult`` of type
     ``self.config.extract_type``. The input list itself is not modified.
     """
     result = deepcopy(num_depend_source)

     # Characters already covered by an existing result.
     covered: List[bool] = [False] * len(source)
     for ex_result in num_depend_source:
         for pos in range(ex_result.start, ex_result.end + 1):
             covered[pos] = True

     for match in regex.finditer(self.separate_regex, source):
         text = match.group()
         if not text:
             continue

         span = range(match.start(), match.end())
         if any(covered[pos] for pos in span):
             continue

         # Claim the characters of THIS match. The previous code marked
         # indices 0..len-1 regardless of where the match was located.
         for pos in span:
             covered[pos] = True

         to_add = ExtractResult()
         to_add.start = match.start()
         to_add.length = len(text)
         to_add.text = text
         to_add.type = self.config.extract_type
         result.append(to_add)

     return result
Esempio n. 9
0
    def parse(self, source: ExtractResult) -> Optional[ParseResult]:
        """Resolve the number and unit carried by *source*.

        ``source.data`` is used as the number part when it is an
        ``ExtractResult``; otherwise the text is treated as unit-only. The
        characters of ``source.text`` outside the number span are collected
        into unit keys, and the last key (optionally with
        ``self.config.connector_token`` removed from its front) is looked up
        in ``self.config.unit_map``, first verbatim and then lower-cased.

        On a successful lookup ``value`` becomes a ``UnitValue`` and
        ``resolution_str`` is ``"<number> <unit>"``. The result text is always
        lower-cased. When no unit text could be collected, the result is
        returned without a value instead of raising ``IndexError``.
        """
        ret = ParseResult(source)

        if source.data and isinstance(source.data, ExtractResult):
            number_result = source.data
        else:  # if there is no unitResult, means there is just unit
            number_result = ExtractResult()
            number_result.start = -1
            number_result.length = 0
            number_result.text = None
            number_result.type = None

        # key contains units
        key = source.text
        unit_key_build = ''
        unit_keys = []
        i = 0
        # Runs one step past the last character so the pending key is flushed.
        while i <= len(key):
            if i == len(key):
                if unit_key_build:
                    self.__add_if_not_contained(
                        unit_keys, unit_key_build.strip())
            # number_result.start is a relative position
            elif i == number_result.start:
                if unit_key_build:
                    self.__add_if_not_contained(
                        unit_keys, unit_key_build.strip())
                    unit_key_build = ''
                if number_result.length:
                    # Jump to the last character of the number; the shared
                    # increment below then moves past it.
                    i = number_result.start + number_result.length - 1
            else:
                unit_key_build += key[i]
            i += 1

        # Unit type depends on last unit in suffix. Guarded: with no collected
        # unit text there is nothing to look up.
        if unit_keys:
            last_unit = unit_keys[-1]
            normalized_last_unit = last_unit.lower()
            connector = self.config.connector_token
            if connector and normalized_last_unit.startswith(connector):
                normalized_last_unit = normalized_last_unit[
                    len(connector):].strip()
                last_unit = last_unit[len(connector):].strip()

            if key and self.config.unit_map:
                unit_value = None
                if last_unit in self.config.unit_map:
                    unit_value = self.config.unit_map[last_unit]
                elif normalized_last_unit in self.config.unit_map:
                    unit_value = self.config.unit_map[normalized_last_unit]

                if unit_value:
                    num_value = self.config.internal_number_parser.parse(
                        number_result) if number_result.text else None
                    resolution_str = num_value.resolution_str if num_value else None

                    ret.value = UnitValue(
                        number=resolution_str,
                        unit=unit_value)
                    ret.resolution_str = f'{resolution_str} {unit_value}'.strip()

        ret.text = ret.text.lower()

        return ret
Esempio n. 10
0
    def strip_inequality(extract_result: ExtractResult, regexp: Pattern, in_prefix: bool):
        """Remove the inequality token matched by *regexp* from the result text.

        Does nothing when *regexp* does not occur in ``extract_result.text``.
        Otherwise every occurrence is deleted, the remainder is stripped of
        surrounding whitespace, ``length`` is set to the new text length and
        ``data`` is reset to ``''``. When *in_prefix* is true, ``start`` is
        advanced by the number of characters removed.

        The result is modified in place; nothing is returned.
        """
        original_text = extract_result.text
        if not regex.search(regexp, original_text):
            return

        # Substitute inside the text. The previous str(regexp).replace(...)
        # worked on the pattern's repr and never removed the token.
        extract_result.text = regex.sub(regexp, '', original_text).strip()
        if in_prefix:
            extract_result.start += len(original_text) - len(extract_result.text)

        extract_result.length = len(extract_result.text)
        extract_result.data = ''
Esempio n. 11
0
    def extract(self, source: str):
        """Extract choice matches from *source*.

        Each regex of ``self.config.regexes_map`` is run on the lower-cased
        input. Every match is tokenized and scored with ``self.match_value``
        against each token position of the input; matches whose best score is
        above zero become ``ExtractResult`` objects with a
        ``ChoiceExtractDataResult`` as ``data``.

        Returns all results ordered by ``start``, or only the highest-scoring
        one when ``self.config.only_top_match`` is set. Returns an empty list
        for ``None``, empty or blank input.
        """
        results: List[ExtractResult] = list()

        # Validate before touching the string: calling lower() first made the
        # None guard unreachable.
        if source is None or source.strip() == '':
            return results

        partial_results: List[ExtractResult] = list()
        trimmed_source = source.lower()
        source_tokens = self.__tokenize(trimmed_source)

        for (regexp, type_extracted) in self.config.regexes_map.items():
            for match in RegExpUtility.get_matches(regexp, trimmed_source):
                match_tokens = self.__tokenize(match)

                top_score = 0.0
                for i in range(len(source_tokens)):
                    top_score = max(
                        top_score,
                        self.match_value(source_tokens, match_tokens, i))

                if top_score <= 0.0:
                    continue

                start = trimmed_source.index(match)
                length = len(match)

                value = ExtractResult()
                value.start = start
                value.length = length
                value.text = source[start:start + length].strip()
                value.type = type_extracted
                value.data = ChoiceExtractDataResult(source, top_score)
                partial_results.append(value)

        if not partial_results:
            return results

        partial_results = sorted(partial_results, key=lambda res: res.start)

        if not self.config.only_top_match:
            return partial_results

        # First result with the strictly highest score wins.
        best_index = 0
        best_score = 0.0
        for index, candidate in enumerate(partial_results):
            if candidate.data.score > best_score:
                best_score = candidate.data.score
                best_index = index

        # NOTE(review): the previous code also built a separate
        # ChoiceExtractDataResult carrying ``other_matches`` and then threw it
        # away, so the other matches never reached the returned result. That
        # observable behaviour is kept; confirm whether they should be
        # attached to the returned result's data.
        results.append(partial_results[best_index])
        return results
Esempio n. 12
0
    def extract(self, source: str) -> List[ExtractResult]:
        """Extract the spans of *source* matched by ``self.regexes``.

        All regex matches are painted onto a per-character mask; each maximal
        run of matched characters becomes a candidate. A candidate is emitted
        only when one single regex match covers exactly that run, and its
        ``data`` is the value associated with that regex.

        When ``self._negative_number_terms`` is set and matches in the text
        before the run, the emitted result starts at that match and its length
        grows by the match length. The collected results are passed through
        ``self._filter_ambiguity`` before being returned.

        Returns an empty list for ``None``, empty or blank input.
        """
        # Truthiness test instead of ``len(...) is 0``: identity comparison
        # with an int literal is implementation-dependent.
        if source is None or not source.strip():
            return list()

        result: List[ExtractResult] = list()
        match_source = dict()
        matched: List[bool] = [False] * len(source)

        for entry in self.regexes:
            for m in regex.finditer(entry.re, source):
                for j in range(len(m.group())):
                    matched[m.start() + j] = True
                # Keep Source Data for extra information
                match_source[m] = entry.val

        last = -1
        for i in range(len(source)):
            if not matched[i]:
                last = i
                continue
            # Act only on the final character of a matched run.
            if i + 1 != len(source) and matched[i + 1]:
                continue

            start = last + 1
            length = i - last

            # The covering match is looked up with the run's own bounds,
            # before any negative-sign extension.
            src_match = next(
                (x for x in match_source
                 if x.start() == start and x.end() - x.start() == length),
                None)
            if src_match is None:
                continue

            substr = source[start:start + length].strip()

            # extract negative numbers
            if self._negative_number_terms is not None:
                match = regex.search(self._negative_number_terms,
                                     source[0:start])
                if match is not None:
                    start = match.start()
                    length = length + match.end() - match.start()
                    substr = source[start:start + length].strip()

            value = ExtractResult()
            value.start = start
            value.length = length
            value.text = substr
            value.type = self._extract_type
            value.data = match_source[src_match]
            result.append(value)

        result = self._filter_ambiguity(result, source)
        return result
Esempio n. 13
0
    def extract(self, source: str) -> List[ExtractResult]:
        """Extract the spans of *source* matched by ``self.regexes``.

        Matches accepted by ``self._is_valid_match`` are painted onto a
        per-character mask; each maximal run of matched characters becomes a
        candidate. A candidate is emitted only when one single accepted match
        covers exactly that run, and its ``data`` is the value associated with
        that regex.

        Returns an empty list when ``self._pre_check_str(source)`` is false.
        """
        found: List[ExtractResult] = list()
        if not self._pre_check_str(source):
            return found

        covered: List[bool] = [False] * len(source)
        origin_of: Dict[Match, str] = dict()

        per_regex = [
            MatchesVal(matches=list(re.finditer(entry.re, source)),
                       val=entry.val)
            for entry in self.regexes
        ]

        for group in per_regex:
            if len(group.matches) == 0:
                continue
            for hit in group.matches:
                if not self._is_valid_match(hit):
                    continue
                for offset in range(len(hit.group())):
                    covered[hit.start() + offset] = True
                # Keep Source Data for extra information
                origin_of[hit] = group.val

        previous_gap = -1
        total = len(source)
        for pos in range(total):
            if not covered[pos]:
                previous_gap = pos
                continue
            # Act only on the final character of a covered run.
            if pos + 1 != total and covered[pos + 1]:
                continue

            run_start = previous_gap + 1
            run_length = pos - previous_gap
            run_text = source[run_start:run_start + run_length].strip()

            exact = None
            for candidate in origin_of:
                if candidate.start() == run_start and \
                        candidate.end() - candidate.start() == run_length:
                    exact = candidate
                    break
            if exact is None:
                continue

            item = ExtractResult()
            item.start = run_start
            item.length = run_length
            item.text = run_text
            item.type = self._extract_type
            item.data = origin_of.get(exact, None)
            found.append(item)

        return found
Esempio n. 14
0
    def try_merge_modifier_token(self, extract_result: ExtractResult, token_regex: Pattern, text: str):
        """Try to extend *extract_result* leftwards over a modifier token.

        The text before the entity (right-stripped) is checked once with
        ``has_token_index``. On a match, ``start``, ``length`` and ``text`` of
        *extract_result* are updated so the result begins at the reported
        token index, and True is returned. Otherwise the result is left
        untouched and False is returned.

        A missing (``None``) ``start`` or ``length`` is treated as 0.
        """
        start = extract_result.start if extract_result.start else 0
        before_str = text[0:start]

        # Single lookup; the helper used to be called twice per invocation.
        token = self.has_token_index(before_str.rstrip(), token_regex)
        if not token.matched:
            return False

        _, token_index = token
        mod_length = len(before_str) - token_index

        # Work from the normalised values so the None fallbacks actually
        # apply; in-place += / -= on a None attribute would raise.
        length = (extract_result.length if extract_result.length else 0) + mod_length
        start -= mod_length

        extract_result.length = length
        extract_result.start = start
        extract_result.text = text[start: start + length]
        return True
Esempio n. 15
0
    def parse(self, source: ExtractResult) -> Optional[ParseResult]:
        """Parse *source* with the base parser and force a ``%`` suffix.

        When ``source.data`` is a list, ``source.text`` is replaced by its
        first element and ``source.data`` by the ``data`` attribute of its
        second element before delegating to the parent ``parse``.

        On the returned result, a non-empty ``resolution_str`` that does not
        end in ``'%'`` after stripping is replaced by its stripped form plus
        ``'%'``; ``data`` is set to the (possibly replaced) source text and
        ``text`` to the source text as it was on entry.

        Returns ``None`` when the parent parser returns ``None``.
        """
        original = source.text

        # do replace text & data from extended info
        if isinstance(source.data, list):
            source.text = source.data[0]
            source.data = source.data[1].data

        result: ParseResult = super().parse(source)
        # The parent is declared Optional; pass a missing result through
        # instead of failing on attribute access.
        if result is None:
            return None

        if result.resolution_str:
            trimmed = result.resolution_str.strip()
            if not trimmed.endswith('%'):
                result.resolution_str = trimmed + '%'

        result.data = source.text
        result.text = original

        return result
Esempio n. 16
0
    def parse(self, source: ExtractResult) -> Optional[ParseResult]:
        """Parse the number described by *source*.

        Returns ``None`` when ``self.supported_types`` is set and does not
        contain ``source.type``. Otherwise a sub-parser is chosen from the
        marker in ``source.data`` (or, if that is not a non-empty string, from
        whether ``self.arabic_number_regex`` finds a match in the text). A
        leading sign matched by ``negative_number_sign_regex`` is removed from
        ``source.text`` before parsing and re-applied to the result.

        For a result with a value, ``resolution_str`` is filled in and the
        text is lower-cased. The sub-parser's result is returned, which is
        ``None`` when no marker applied.
        """
        # Check if the parser is configured to support specific types
        if self.supported_types and source.type not in self.supported_types:
            return None

        marker = source.data if isinstance(source.data, str) else None
        if not marker:
            if self.arabic_number_regex.search(source.text):
                marker = 'Num'
            else:
                marker = self.config.lang_marker

        # Resolve symbol prefix
        sign_match = regex.search(self.config.negative_number_sign_regex,
                                  source.text)
        negative = bool(sign_match)
        if negative:
            source.text = source.text[len(sign_match[1]):]

        parsed: Optional[ParseResult] = None
        if 'Num' in marker:
            parsed = self._digit_number_parse(source)
        elif regex.search(fr'Frac{self.config.lang_marker}', marker):
            # Frac is a special number, parse via another method
            parsed = self._frac_like_number_parse(source)
        elif self.config.lang_marker in marker:
            parsed = self._text_number_parse(source)
        elif 'Pow' in marker:
            parsed = self._power_number_parse(source)

        if not parsed or parsed.value is None:
            return parsed

        if negative:
            # Recover to the original extracted Text
            parsed.text = sign_match[1] + source.text
            parsed.value = parsed.value * -1

        # Use culture_info to format values
        if self.config.culture_info is not None:
            parsed.resolution_str = self.config.culture_info.format(
                parsed.value)
        else:
            parsed.resolution_str = repr(parsed.value)
        parsed.text = parsed.text.lower()

        return parsed
Esempio n. 17
0
def merge_all_tokens(tokens: List[Token], source: str,
                     extractor_name: str) -> List[ExtractResult]:
    """Drop overlapping tokens and turn the survivors into extract results.

    Falsy entries are discarded and the rest are visited in ascending
    ``start`` order. A token is rejected when it lies inside, or starts
    strictly inside, a token that was already kept. When it spans a kept
    token completely it takes that token's slot instead of being appended.

    :param tokens: candidate spans; falsy items are ignored.
    :param source: the text the spans index into.
    :param extractor_name: value stored in each result's ``type``.
    :return: one ``ExtractResult`` per kept token, with ``data`` set to
        ``None`` and ``meta_data`` copied from the token's ``metadata``.
    """
    kept: List[Token] = []

    for candidate in sorted(filter(None, tokens), key=lambda t: t.start):
        conflicts = False

        for position, existing in enumerate(kept):
            inside = (candidate.start >= existing.start
                      and candidate.end <= existing.end)
            starts_within = existing.start < candidate.start < existing.end
            covers = (candidate.start <= existing.start
                      and candidate.end >= existing.end)

            if covers:
                # The wider (or identical) span replaces the kept one in place.
                kept[position] = candidate

            if inside or starts_within or covers:
                conflicts = True
                break

        if not conflicts:
            kept.append(candidate)

    extracted = []
    for token in kept:
        begin = token.start
        size = token.length

        item = ExtractResult()
        item.start = begin
        item.length = size
        item.text = source[begin:begin + size]
        item.type = extractor_name
        item.data = None
        item.meta_data = token.metadata
        extracted.append(item)

    return extracted
Esempio n. 18
0
    def extract(self, source: str) -> List[ExtractResult]:
        """Extract every maximal run of characters covered by ``self.regexes``.

        The sentence is first passed through the number-preprocessing helper,
        the patterns are run on the preprocessed text, and each contiguous
        stretch of matched characters becomes one result. The results are
        then handed to the post-processing helper together with the original
        sentence and the preprocessing bookkeeping.

        :param source: the sentence to scan.
        :return: whatever the post-processing helper returns for the runs.
        """
        origin = source

        # preprocess the source sentence via extracting and replacing the
        # numbers in it
        prepared = self.__preprocess_with_number_extracted(origin)
        source = prepared.source
        positionmap = prepared.position
        extractresults = prepared.results

        per_pattern = [
            list(regex.finditer(pattern, source)) for pattern in self.regexes
        ]

        # covered[k] is True when any pattern matched character k
        covered: List[bool] = [False] * len(source)
        for hits in per_pattern:
            for hit in hits:
                for offset in range(len(hit.group())):
                    covered[hit.start() + offset] = True

        results = []
        size = len(source)
        run_start = 0
        for idx in range(size):
            if not covered[idx]:
                # the next run, if any, can start right after this gap
                run_start = idx + 1
                continue

            if idx + 1 != size and covered[idx + 1]:
                # still in the middle of a run
                continue

            run_length = idx + 1 - run_start
            entry = ExtractResult()
            entry.start = run_start
            entry.length = run_length
            # text is trimmed, while start/length describe the untrimmed run
            entry.text = source[run_start:run_start + run_length].strip()
            entry.type = self._extract_type
            results.append(entry)

        # post-processing, restoring the extracted numbers
        return self.__post_processing(results, origin, positionmap,
                                      extractresults)
Esempio n. 19
0
    def merge_date_and_time(self, source: str,
                            reference: datetime) -> List[Token]:
        """Locate spans in which a date and a time together form one datetime.

        Date points and time points are extracted from ``source``. When the
        CALENDAR option is set, every ``number_as_time_regex`` hit is added
        as an integer candidate as well. Neighbouring, non-overlapping
        candidates are paired as date+time, time+date or date+number, and a
        pair is accepted when the text between them is a valid connector.
        Accepted spans are finally widened by a trailing ``suffix_regex``
        hit and a leading ``common_date_prefix_regex`` hit.

        :param source: the text to scan.
        :param reference: reference datetime forwarded to the extractors.
        :return: the merged spans; empty when no date is found or when there
            is neither a time nor a number candidate.
        """
        tokens: List[Token] = list()
        date_ers: List[
            ExtractResult] = self.config.date_point_extractor.extract(
                source, reference)

        if not date_ers:
            return tokens

        time_ers = self.config.time_point_extractor.extract(source, reference)
        # Every occurrence is needed here, not only one anchored at the start
        # of the text, so collect them with finditer().
        time_num_matches = list(
            self.config.number_as_time_regex.finditer(source))

        if not time_ers and not time_num_matches:
            return tokens

        extract_results = list(date_ers)
        extract_results.extend(time_ers)

        # handle cases which use numbers as time points
        # only enabled in CalendarMode
        if (self.config.options & DateTimeOptions.CALENDAR) != 0:
            for num_match in time_num_matches:
                node = ExtractResult()
                # Use the match position itself; searching for the matched
                # text again could land on an earlier, unrelated occurrence.
                node.start = num_match.start()
                node.length = len(num_match.group())
                node.text = num_match.group()
                node.type = NumConstants.SYS_NUM_INTEGER
                extract_results.append(node)

        extract_results = sorted(extract_results, key=lambda x: x.start)

        i = 0

        while i < len(extract_results) - 1:

            j = i + 1

            # skip candidates that overlap the current one
            while j < len(extract_results) and extract_results[i].overlap(
                    extract_results[j]):
                j += 1

            if j >= len(extract_results):
                break

            first = extract_results[i]
            second = extract_results[j]
            # Types are strings: compare by value, not identity.
            second_is_number = second.type == NumConstants.SYS_NUM_INTEGER

            if ((first.type == Constants.SYS_DATETIME_DATE
                 and second.type == Constants.SYS_DATETIME_TIME) or
                (first.type == Constants.SYS_DATETIME_TIME
                 and second.type == Constants.SYS_DATETIME_DATE) or
                (first.type == Constants.SYS_DATETIME_DATE
                 and second_is_number)):
                middle_begin = first.start + (first.length or 0)
                middle_end = second.start or 0

                if middle_begin > middle_end:
                    i = j + 1
                    continue

                middle_str = source[middle_begin:middle_end].strip()
                valid = False

                # for cases like "tomorrow 3", "tomorrow at 3"
                if second_is_number:
                    match = self.config.date_number_connector_regex.search(
                        middle_str)
                    if not middle_str or match:
                        valid = True
                else:
                    # for case like "3 pm or later on monday"
                    match = self.config.suffix_after_regex.search(middle_str)
                    if match:
                        # keep only what follows the "or later" style suffix
                        middle_str = middle_str[match.end():].strip()

                    if not (match and len(middle_str) == 0):
                        if self.config.is_connector_token(middle_str):
                            valid = True

                if valid:
                    begin = first.start or 0
                    end = (second.start or 0) + (second.length or 0)

                    # NOTE(review): the helper's return order is not visible
                    # here; the (end, start) unpacking is kept as found -
                    # confirm it against extend_with_date_time_and_year.
                    end_index, start_index = self.extend_with_date_time_and_year(
                        begin, end, source, reference)

                    tokens.append(Token(start_index, end_index))
                    i = j + 1
                    continue
            i = j

        # handle "in the afternoon" at the end of entity
        for idx, token in enumerate(tokens):
            after_str = source[token.end:]
            match = self.config.suffix_regex.search(after_str)
            if match:
                tokens[idx] = Token(token.start,
                                    token.end + len(match.group()))

        # handle "day" prefixes
        for idx, token in enumerate(tokens):
            before_str = source[0:token.start]
            match = self.config.utility_configuration.common_date_prefix_regex.search(
                before_str)
            if match:
                tokens[idx] = Token(token.start - len(match.group()),
                                    token.end)

        return tokens
    def parse_specific_time(self, source: str,
                            reference: datetime) -> DateTimeResolutionResult:
        """Resolve a time range such as "from 4 to 5pm" / "between 4 and 5pm".

        The stripped, lower-cased text must match
        ``specific_time_from_to_regex`` or ``specific_time_between_and_regex``
        at position 0. Both end points are placed on the reference date,
        shifted by 12 or 24 hours according to the am/pm descriptions that
        are present, and reported as a ``(begin,end,PT..)`` timex.

        :param source: the text of the time range.
        :param reference: supplies the year, month and day of both end points.
        :return: a result with ``success`` set to ``True`` when the range was
            resolved; otherwise the freshly created, unpopulated result (no
            match at position 0, or an hour/minute that ``datetime`` rejects).
        """
        result = DateTimeResolutionResult()
        year = reference.year
        month = reference.month
        day = reference.day

        source = source.strip().lower()

        match = regex.search(self.config.specific_time_from_to_regex, source)
        if not match:
            match = regex.search(self.config.specific_time_between_and_regex,
                                 source)

        if not match or match.start() != 0:
            return result

        time1 = RegExpUtility.get_group(match, "time1")
        time2 = RegExpUtility.get_group(match, "time2")

        def to_number(text: str) -> int:
            # Test for None explicitly: a word that maps to 0 is a valid
            # lookup and must not fall through to int(), which would raise
            # on non-digit text.
            value = self.config.numbers.get(text, None)
            return int(text) if value is None else value

        # get hours
        hour_group_list = RegExpUtility.get_group_list(
            match, Constants.HOUR_GROUP_NAME)
        begin_hour = to_number(hour_group_list[0])
        end_hour = to_number(hour_group_list[1])

        # get minutes; -1 marks "no minute given" for that end point
        minute_group_list = RegExpUtility.get_group_list(
            match, Constants.MINUTE_GROUP_NAME)

        begin_minute = end_minute = -1
        if len(minute_group_list) > 1:
            begin_minute = to_number(minute_group_list[0])
            end_minute = to_number(minute_group_list[1])
        elif len(minute_group_list) == 1:
            # a single minute belongs to whichever end point contains it
            minute_str = minute_group_list[0]
            if minute_str in time1:
                begin_minute = to_number(minute_str)
            elif minute_str in time2:
                end_minute = to_number(minute_str)

        # parse AM/PM
        left_desc: str = RegExpUtility.get_group(
            match, Constants.LEFT_DESC_GROUP_NAME)
        right_desc: str = RegExpUtility.get_group(
            match, Constants.RIGHT_DESC_GROUP_NAME)

        desc_capture_list = RegExpUtility.get_group_list(
            match, Constants.DESC_GROUP_NAME)
        for desc_capture in desc_capture_list:
            if desc_capture in time1 and not left_desc:
                left_desc = desc_capture
            elif desc_capture in time2 and not right_desc:
                right_desc = desc_capture

        try:
            begin_date_time = datetime(
                year,
                month,
                day,
                hour=begin_hour,
                minute=begin_minute if begin_minute > 0 else 0)
            end_date_time = datetime(
                year,
                month,
                day,
                hour=end_hour,
                minute=end_minute if end_minute > 0 else 0)
        except ValueError:
            # e.g. an hour of 24: report "not resolved" instead of raising
            return result

        has_left_am = bool(left_desc) and left_desc.startswith('a')
        has_left_pm = bool(left_desc) and left_desc.startswith('p')
        has_right_am = bool(right_desc) and right_desc.startswith('a')
        has_right_pm = bool(right_desc) and right_desc.startswith('p')
        has_left = has_left_am or has_left_pm
        has_right = has_right_am or has_right_pm

        half_day = timedelta(hours=12)
        full_day = timedelta(hours=24)

        # both time point has description like 'am' or 'pm'
        if has_left and has_right:
            if has_left_am:
                if begin_hour >= 12:
                    begin_date_time -= half_day
            else:
                if begin_hour < 12:
                    begin_date_time += half_day
            if has_right_am:
                # NOTE(review): this uses "> 12" while every other am check
                # uses ">= 12", so "12am" stays at 12:00 here - confirm.
                if end_hour > 12:
                    end_date_time -= half_day
            else:
                if end_hour < 12:
                    end_date_time += half_day
        # one of the time point has description like 'am' or 'pm'
        elif has_left or has_right:
            if has_left_am:
                if begin_hour >= 12:
                    begin_date_time -= half_day
                if end_hour < 12:
                    if end_date_time < begin_date_time:
                        end_date_time += half_day
            elif has_left_pm:
                if begin_hour < 12:
                    begin_date_time += half_day
                if end_hour < 12:
                    if end_date_time < begin_date_time:
                        span: timedelta = begin_date_time - end_date_time
                        end_date_time += full_day if span >= half_day else half_day
            if has_right_am:
                if end_hour >= 12:
                    end_date_time -= half_day
                if begin_hour < 12:
                    if end_date_time < begin_date_time:
                        begin_date_time -= half_day
            elif has_right_pm:
                if end_hour < 12:
                    end_date_time += half_day
                if begin_hour < 12:
                    if end_date_time < begin_date_time:
                        begin_date_time -= half_day
                    else:
                        span = end_date_time - begin_date_time
                        if span >= half_day:
                            begin_date_time += half_day
        # no 'am' or 'pm' indicator
        elif begin_hour <= 12 and end_hour <= 12:
            if begin_date_time > end_date_time:
                if begin_hour == 12:
                    begin_date_time -= half_day
                else:
                    end_date_time += half_day
            result.comment = Constants.AM_PM_GROUP_NAME

        # a range never runs backwards: roll the end into the next day
        if end_date_time < begin_date_time:
            end_date_time += full_day

        if begin_minute >= 0:
            begin = f'T{begin_date_time.hour:02d}:{begin_date_time.minute:02d}'
        else:
            begin = f'T{begin_date_time.hour:02d}'
        if end_minute >= 0:
            end = f'T{end_date_time.hour:02d}:{end_date_time.minute:02d}'
        else:
            end = f'T{end_date_time.hour:02d}'

        # express the duration as hour/minute fields of a datetime
        difference = datetime(year, month,
                              day) + (end_date_time - begin_date_time)
        if difference.minute != 0 and difference.hour != 0:
            result.timex = f'({begin},{end},PT{difference.hour}H{difference.minute}M)'
        elif difference.minute != 0 and difference.hour == 0:
            result.timex = f'({begin},{end},PT{difference.minute}M)'
        else:
            result.timex = f'({begin},{end},PT{difference.hour}H)'

        result.future_value = ResolutionStartEnd()
        result.past_value = ResolutionStartEnd()
        result.future_value.start = begin_date_time
        result.future_value.end = end_date_time
        result.past_value.start = result.future_value.start
        result.past_value.end = result.future_value.end
        result.success = True

        result.sub_date_time_entities = []

        # in SplitDateAndTime mode, time points will be get from these sub_date_time_entities
        # cases like "from 4 to 5pm", "4" should not be treated as sub_date_time_entities
        if has_left or begin_minute >= 0:
            er = ExtractResult()
            er.start = match.start("time1")
            er.length = match.end("time1") - match.start("time1")
            er.text = time1
            er.type = Constants.SYS_DATETIME_TIME
            pr = self.config.time_parser.parse(er, reference)
            result.sub_date_time_entities.append(pr)

        # cases like "from 4am to 5" "5" should not treated as sub_date_time_entities
        if has_right or end_minute >= 0:
            er = ExtractResult()
            er.start = match.start("time2")
            er.length = match.end("time2") - match.start("time2")
            er.text = time2
            er.type = Constants.SYS_DATETIME_TIME
            pr = self.config.time_parser.parse(er, reference)
            result.sub_date_time_entities.append(pr)

        return result
Esempio n. 21
0
    def parse(self, source: ExtractResult, reference: datetime = None) -> Optional[DateTimeParseResult]:
        """Parse an extraction, handling before/after/since/around/equal modifiers.

        When ``source.meta_data.has_mod`` is set, a modifier is looked for at
        the beginning of the text and, failing that, at its end. The modifier
        is cut out of ``source`` ("push"), the remainder is parsed with
        ``parse_result``, and the modifier text is then re-attached to the
        result on the side it was taken from ("pop") while the matching MOD
        value is recorded on ``result.value.mod``.

        Note that ``source.start``, ``source.length`` and ``source.text`` are
        modified in place during the push phase.

        :param source: the extraction to parse.
        :param reference: reference datetime; defaults to ``datetime.now()``.
        :return: the parse result, or ``None`` when ``parse_result`` yields
            nothing.
        """
        if not reference:
            reference = datetime.now()

        # Push, save the MOD string
        has_before = False
        has_after = False
        has_since = False
        has_around = False
        has_equal = False
        has_date_after = False
        match_is_after = False

        # "inclusive_mod" means MOD should include the start/end time
        # For example, cases like "on or later than", "earlier than or in" have inclusive modifier
        has_inclusive_mod = False
        mod_str = ''
        if source.meta_data and source.meta_data.has_mod:
            before_match = RegExpUtility.match_begin(self.config.before_regex, source.text, True)
            after_match = RegExpUtility.match_begin(self.config.after_regex, source.text, True)
            since_match = RegExpUtility.match_begin(self.config.since_regex, source.text, True)

            # "around" may follow a leading before/after/since modifier
            pre_length = 0
            if before_match and before_match.success:
                pre_length = before_match.index + before_match.length
            elif after_match and after_match.success:
                pre_length = after_match.index + after_match.length
            elif since_match and since_match.success:
                pre_length = since_match.index + since_match.length
            around_text = source.text[pre_length:]
            around_match = RegExpUtility.match_begin(self.config.around_regex, around_text, True)
            equal_match = RegExpUtility.match_begin(self.config.equal_regex, source.text, True)

            # Nothing at the beginning: retry each modifier at the end of the text
            if before_match and not before_match.success:
                before_match = RegExpUtility.match_end(self.config.before_regex, source.text, True)
                match_is_after = match_is_after or before_match.success

            if after_match and not after_match.success:
                after_match = RegExpUtility.match_end(self.config.after_regex, source.text, True)
                match_is_after = match_is_after or after_match.success

            if since_match and not since_match.success:
                since_match = RegExpUtility.match_end(self.config.since_regex, source.text, True)
                match_is_after = match_is_after or since_match.success

            if around_match and not around_match.success:
                around_match = RegExpUtility.match_end(self.config.around_regex, source.text, True)
                match_is_after = match_is_after or around_match.success

            if equal_match and not equal_match.success:
                equal_match = RegExpUtility.match_end(self.config.equal_regex, source.text, True)
                match_is_after = match_is_after or equal_match.success

            if around_match and around_match.success:
                has_around = True
                around_end = around_match.index + around_match.length
                source.start += 0 if match_is_after else pre_length + around_end
                source.length -= around_match.length if match_is_after else pre_length + around_end
                source.text = source.text[0:source.length] if match_is_after else source.text[pre_length + around_end:]
                mod_str = around_match.group() if match_is_after else around_text[0:around_end]
            if before_match and before_match.success:
                has_before = True
                if not (around_match and around_match.success):
                    source.start += 0 if match_is_after else before_match.length
                    source.length -= before_match.length
                    source.text = source.text[0:source.length] if match_is_after else source.text[before_match.length:]
                mod_str = before_match.group() + mod_str
                if RegExpUtility.get_group(before_match.match[0], "include"):
                    has_inclusive_mod = True
            elif after_match and after_match.success:
                has_after = True
                if not (around_match and around_match.success):
                    source.start += 0 if match_is_after else after_match.length
                    source.length -= after_match.length
                    source.text = source.text[0:source.length] if match_is_after else source.text[after_match.length:]
                mod_str = after_match.group() + mod_str
                if RegExpUtility.get_group(after_match.match[0], "include"):
                    has_inclusive_mod = True
            elif since_match and since_match.success:
                has_since = True
                if not (around_match and around_match.success):
                    source.start += 0 if match_is_after else since_match.length
                    source.length -= since_match.length
                    source.text = source.text[0:source.length] if match_is_after else source.text[since_match.length:]
                mod_str = since_match.group() + mod_str
            elif equal_match and equal_match.success:
                has_equal = True
                source.start += 0 if match_is_after else equal_match.length
                source.length -= equal_match.length
                source.text = source.text[0:source.length] if match_is_after else source.text[equal_match.length:]
                mod_str = equal_match.group()
            elif source.type == Constants.SYS_DATETIME_DATEPERIOD and \
                    regex.search(self.config.year_regex, source.text) or source.type == Constants.SYS_DATETIME_DATE or \
                    source.type == Constants.SYS_DATETIME_TIME:
                # This has to be put at the end of the if, or cases like "before 2012" and "after 2012"
                # would fall into this
                # 2012 or after/above
                # 3 pm or later
                match = RegExpUtility.match_end(self.config.suffix_after, source.text, True)
                if match and match.success:
                    has_date_after = True
                    source.length -= match.length
                    source.text = source.text[0:source.length]
                    mod_str = match.group()

        result = self.parse_result(source, reference)
        if not result:
            return None

        def restore_mod(parsed) -> None:
            # Re-attach the modifier on the side it was stripped from: a
            # trailing modifier is appended and leaves start untouched, a
            # leading one is prepended and moves start back.
            parsed.length += len(mod_str)
            if match_is_after:
                parsed.text = parsed.text + mod_str
            else:
                parsed.start -= len(mod_str)
                parsed.text = mod_str + parsed.text

        # Pop, restore the MOD string
        if has_before and result.value:
            restore_mod(result)
            val = result.value

            val.mod = self.combine_mod(val.mod, TimeTypeConstants.BEFORE_MOD if not has_inclusive_mod else
                                       TimeTypeConstants.UNTIL_MOD)
            if has_around:
                val.mod = self.combine_mod(TimeTypeConstants.APPROX_MOD, val.mod)
                has_around = False
            result.value = val

        if has_after and result.value:
            restore_mod(result)
            val = result.value

            val.mod = self.combine_mod(val.mod, TimeTypeConstants.AFTER_MOD if not has_inclusive_mod else
                                       TimeTypeConstants.SINCE_MOD)
            if has_around:
                val.mod = self.combine_mod(TimeTypeConstants.APPROX_MOD, val.mod)
                has_around = False
            result.value = val

        if has_since and result.value:
            restore_mod(result)
            val = result.value
            val.mod = TimeTypeConstants.SINCE_MOD
            if has_around:
                val.mod = self.combine_mod(TimeTypeConstants.APPROX_MOD, val.mod)
                has_around = False
            result.value = val

        # Only reached with has_around still set when no before/after/since
        # block above consumed it
        if has_around and result.value:
            restore_mod(result)
            val = result.value
            val.mod = TimeTypeConstants.APPROX_MOD
            result.value = val

        if has_equal and result.value:
            restore_mod(result)

        if has_date_after and result.value:
            # this suffix is always taken from the end of the text
            result.length += len(mod_str)
            result.text = result.text + mod_str
            val = result.value
            val.mod = self.combine_mod(val.mod, TimeTypeConstants.SINCE_MOD)
            result.value = val
            has_since = True

        # For cases like "3 pm or later on monday"
        # search() rather than match(): the suffix of interest sits inside
        # the text, and an anchored match could only ever start at 0.
        match = self.config.suffix_after.search(result.text)
        if result.value and match and match.start() != 0 and \
                result.type == Constants.SYS_DATETIME_DATETIME:
            val = result.value
            val.mod = self.combine_mod(val.mod, TimeTypeConstants.SINCE_MOD)
            result.value = val
            has_since = True

        if self.options & DateTimeOptions.SPLIT_DATE_AND_TIME and result.value and result.value.sub_date_time_entities:
            result.value = self._date_time_resolution_for_split(result)
        else:
            result = self.set_parse_result(
                result, has_before, has_after, has_since)

        return result
Esempio n. 22
0
    def extract(self, source: str) -> List[ExtractResult]:
        """Extract numbers together with the unit text attached to them.

        Numbers come from ``config.unit_num_extractor``. For each number the
        text just before it is checked against ``prefix_regex`` (limited to
        ``max_prefix_match_len`` characters) and the text after it against
        ``suffix_regex``. A suffix counts only when nothing, or just the
        configured connector token, separates it from the number. The
        number's own extract result is stored in ``data`` with its ``start``
        rewritten to be relative to the unit result.

        :param source: the text to scan.
        :return: the unit results, passed through ``_extract_separate_units``
            when ``separate_regex`` is set; empty when ``_pre_check_str``
            rejects the text.
        """
        if not self._pre_check_str(source):
            return list()

        mapping_prefix: Dict[float, PrefixUnitResult] = dict()
        numbers: List[ExtractResult] = self.config.unit_num_extractor.extract(
            source)
        result: List[ExtractResult] = list()
        source_len = len(source)

        # Pass 1: remember, per number start, the prefix unit in front of it
        if self.max_prefix_match_len != 0:
            for num in numbers:
                if num.start is None or num.length is None:
                    continue
                max_find_prefix = min(self.max_prefix_match_len, num.start)
                if max_find_prefix == 0:
                    continue

                left: str = source[num.start - max_find_prefix:num.start]
                last_index = len(left)
                best_match = None
                for pattern in self.prefix_regex:
                    for match in regex.finditer(pattern, left):
                        if not match.group():
                            continue
                        # the unit must run right up to the number
                        if left[match.start():last_index].strip(
                        ) != match.group():
                            continue
                        # prefer the candidate that starts furthest left
                        if best_match is None or best_match.start(
                        ) >= match.start():
                            best_match = match
                if best_match:
                    mapping_prefix[num.start] = PrefixUnitResult(
                        offset=last_index - best_match.start(),
                        unit=left[best_match.start():last_index])

        # Pass 2: look for a suffix unit and build the results
        for num in numbers:
            if num.start is None or num.length is None:
                continue
            start = num.start
            length = num.length

            prefix_unit: PrefixUnitResult = mapping_prefix.get(start, None)

            if source_len - start - length > 0:
                right = source[start + length:]

                # furthest end of any suffix directly attached to the number
                max_len = 0
                for pattern in self.suffix_regex:
                    for match in regex.finditer(pattern, right):
                        if not match.group():
                            continue
                        end_pos = match.end()
                        middle = right[:match.start()].strip()
                        if max_len < end_pos and (
                                not middle
                                or middle == self.config.connector_token):
                            max_len = end_pos

                if max_len != 0:
                    ex_result = ExtractResult()
                    ex_result.start = start
                    ex_result.length = length + max_len
                    ex_result.text = source[start:start + length + max_len]
                    ex_result.type = self.config.extract_type

                    if prefix_unit:
                        ex_result.start -= prefix_unit.offset
                        ex_result.length += prefix_unit.offset
                        ex_result.text = prefix_unit.unit + ex_result.text

                    # number position becomes relative to the unit result
                    num.start = start - ex_result.start
                    ex_result.data = num

                    # A dimension lying inside a pm_non_unit_regex hit is
                    # not reported as a unit
                    is_not_unit = (
                        ex_result.type == Constants.SYS_UNIT_DIMENSION
                        and any(
                            ex_result.start >= match.start()
                            and ex_result.end <= match.end()
                            for match in
                            self.config.pm_non_unit_regex.finditer(source)))

                    if not is_not_unit:
                        result.append(ex_result)
                    continue

            # No suffix found: a prefix alone still yields a result
            if prefix_unit:
                ex_result = ExtractResult()
                ex_result.start = num.start - prefix_unit.offset
                ex_result.length = num.length + prefix_unit.offset
                ex_result.text = prefix_unit.unit + num.text
                ex_result.type = self.config.extract_type

                num.start = start - ex_result.start
                ex_result.data = num
                result.append(ex_result)

        if self.separate_regex:
            result = self._extract_separate_units(source, result)

        return result
Esempio n. 23
0
    def extract(self, source: str) -> List[ExtractResult]:
        """Extract number-with-unit mentions from ``source``.

        Numbers returned by ``self.config.unit_num_extractor`` are combined
        with unit prefixes (``self.prefix_matcher``) that sit directly before
        them and unit suffixes (``self.suffix_matcher``) that follow them.
        When ``self.separate_regex`` is set, the results are additionally
        handed to ``_extract_separate_units`` and ``_filter_ambiguity``.
        Finally ``self.config.expand_half_suffix`` is called with the source,
        the results and the extracted numbers.

        Args:
            source: Text to scan.

        Returns:
            The extraction results. For every result built in this method,
            ``data`` is the number's ``ExtractResult`` whose ``start`` has
            been rewritten to be relative to the enclosing result. An empty
            list is returned when ``_pre_check_str`` rejects ``source``.
        """
        if not self._pre_check_str(source):
            return []

        non_unit_match = None
        numbers = None

        mapping_prefix: Dict[float, PrefixUnitResult] = dict()
        result = []
        # NOTE(review): this flag is never reset between numbers, so once one
        # number has both a prefix and a suffix, later prefix-only numbers are
        # no longer emitted. Confirm this is intended before changing it.
        prefix_matched = False
        prefix_match: List[MatchResult] = sorted(self.prefix_matcher.find(source), key=lambda o: o.start)
        suffix_match: List[MatchResult] = sorted(self.suffix_matcher.find(source), key=lambda o: o.start)

        if len(prefix_match) > 0 or len(suffix_match) > 0:

            numbers = sorted(self.config.unit_num_extractor.extract(source), key=lambda o: o.start)

            # Compare type constants by value, not identity.
            if len(numbers) > 0 and self.config.extract_type == Constants.SYS_UNIT_CURRENCY \
                    and len(prefix_match) > 0 and len(suffix_match) > 0:

                for number in numbers:
                    start = number.start
                    length = number.length
                    has_prefix = any((mr.start + mr.length) == start for mr in prefix_match)
                    has_suffix = any(mr.start == (start + length) for mr in suffix_match)
                    if has_prefix and has_suffix and "," in number.text:
                        # Replace the first comma with a space (same length, so
                        # previously computed offsets stay valid) and let the
                        # number extractor run again on the modified text.
                        comma_index = number.start + number.text.index(",")
                        source = source[:comma_index] + " " + source[comma_index + 1:]

                numbers = sorted(self.config.unit_num_extractor.extract(source), key=lambda o: o.start)

            # Special case for cases where number multipliers clash with unit
            ambiguous_multiplier_regex = self.config.ambiguous_unit_number_multiplier_regex
            if ambiguous_multiplier_regex is not None:

                for num in numbers:
                    multiplier_matches = list(filter(lambda x: x.group(), regex.finditer(
                        ambiguous_multiplier_regex, num.text)))
                    if len(multiplier_matches) == 1:
                        # Shorten the number by the length of the single match.
                        span_begin, span_end = multiplier_matches[0].span()
                        new_length = num.length - (span_end - span_begin)
                        num.text = num.text[0:new_length]
                        num.length = new_length

            for number in numbers:
                if number.start is None or number.length is None:
                    continue
                start = int(number.start)
                length = int(number.length)
                max_find_pref = min(self.max_prefix_match_len, number.start)
                max_find_suff = len(source) - start - length

                if max_find_pref != 0:
                    last_index = start
                    best_match = None

                    # prefix_match is sorted by start: stop at the first prefix
                    # ending after the number begins, and accept the first one
                    # whose text equals the (stripped) text up to the number.
                    for m in prefix_match:
                        if m.length > 0 and m.end > start:
                            break

                        if m.length > 0 and source[m.start:last_index].strip() == m.text:
                            best_match = m
                            break

                    if best_match is not None:
                        off_set = last_index - best_match.start
                        unit_str = source[best_match.start:last_index]
                        self.add_element(mapping_prefix, number.start, (PrefixUnitResult(off_set, unit_str)))
                prefix_unit = mapping_prefix.get(start, None)
                if max_find_suff > 0:

                    max_len = 0
                    first_index = start + length

                    # Keep the furthest-reaching suffix that is separated from
                    # the number only by nothing, whitespace or the connector.
                    for m in suffix_match:

                        if m.length > 0 and m.start >= first_index:

                            end_pos = m.start + m.length - first_index
                            if max_len < end_pos:
                                mid_str = source[first_index:m.start]
                                if not mid_str or mid_str.isspace() \
                                        or mid_str.strip() == self.config.connector_token:
                                    max_len = end_pos

                    if max_len != 0:
                        substr = source[start: start + length + max_len]
                        er = ExtractResult()

                        er.start = start
                        er.length = length + max_len
                        er.text = substr
                        er.type = self.config.extract_type

                        if prefix_unit is not None:
                            prefix_matched = True
                            er.start -= prefix_unit[0].offset
                            er.length += prefix_unit[0].offset
                            er.text = prefix_unit[0].unit + er.text

                        # Relative position will be used in Parser
                        number.start = start - er.start
                        er.data = number

                        # Special treatment, handle cases like '2:00 pm', '00 pm' is not dimension
                        is_not_unit = False

                        if er.type == Constants.SYS_UNIT_DIMENSION:
                            if non_unit_match is None:
                                non_unit_match = list(self.config.non_unit_regex.finditer(source))
                            # The candidate is rejected when it lies entirely
                            # inside one of the non-unit matches.
                            is_not_unit = any(
                                er.start >= time.start()
                                and er.start + er.length <= time.start() + len(time.group())
                                for time in non_unit_match)

                        if is_not_unit:
                            continue

                        result.append(er)

                if prefix_unit and not prefix_matched:
                    er = ExtractResult()
                    er.start = number.start - prefix_unit[0].offset
                    er.length = number.length + prefix_unit[0].offset
                    er.text = prefix_unit[0].unit + number.text
                    er.type = self.config.extract_type

                    # Relative position will be used in Parser
                    number.start = start - er.start
                    er.data = number
                    result.append(er)

        # Extract Separate unit
        if self.separate_regex:
            if non_unit_match is None:
                # Collect every non-unit match with finditer, as done above.
                # A missing pattern simply yields no matches.
                non_unit_regex = getattr(self.config, 'non_unit_regex', None)
                if non_unit_regex is not None:
                    non_unit_match = list(non_unit_regex.finditer(source))
                else:
                    non_unit_match = []

            # Return value is not used; presumably extends `result` in place
            # (verify against _extract_separate_units).
            self._extract_separate_units(source, result, non_unit_match)

            # Remove common ambiguous cases
            result = self._filter_ambiguity(result, source)

        # Expand Chinese phrase to the `half` patterns when it follows closely origin phrase.
        self.config.expand_half_suffix(source, result, numbers)

        return result
Esempio n. 24
0
    def __merged_compound_units(self, source: str):
        """Merge neighbouring extraction results into compound units.

        The text is extracted with ``NumberWithUnitExtractor`` and passed
        through ``__merge_pure_number``. Each result is then assigned a group
        id: a result joins its predecessor's group when the text between them
        is empty/whitespace or passes the compound-unit connector check.
        Results sharing a group are folded into the first member, which
        becomes a ``SYS_UNIT_CURRENCY`` result spanning the whole group with
        the members listed in ``data``. Results of type ``SYS_NUM`` are
        dropped from the returned list.
        """
        ers = NumberWithUnitExtractor(self.config).extract(source)
        ers = self.__merge_pure_number(source, ers)

        total = len(ers)
        groups = [0] * total

        # Pass 1: decide, for each adjacent pair, whether they share a group.
        for pos in range(total - 1):
            left, right = ers[pos], ers[pos + 1]

            # Pairs of two different non-number types are not examined; the
            # right-hand group id keeps its initial value.
            if left.type != right.type and Constants.SYS_NUM not in (left.type, right.type):
                continue

            # A left item wrapping a number whose data does not start with
            # "Integer" always opens a new group after it.
            if isinstance(left.data, ExtractResult) and not str(left.data.data).startswith("Integer"):
                groups[pos + 1] = groups[pos] + 1
                continue

            gap_begin = left.start + left.length
            gap = source[gap_begin:right.start].strip().lower()

            # Separated by whitespace
            if not gap:
                groups[pos + 1] = groups[pos]
                continue

            # Separated by connector
            connector = self.config.compound_unit_connector_regex.match(gap)
            joined = (bool(connector)
                      and connector.pos == 0
                      and len(connector.string.split(" ")[0]) == len(gap))
            groups[pos + 1] = groups[pos] if joined else groups[pos] + 1

        # Pass 2: open one result per group and fold later members into it.
        result = []
        for pos, current in enumerate(ers):
            if pos == 0 or groups[pos] != groups[pos - 1]:
                # Keep a copy of the group head, since the head itself is
                # turned into the container for the whole group.
                head_copy = ExtractResult()
                head_copy.data = current.data
                head_copy.length = current.length
                head_copy.start = current.start
                head_copy.text = current.text
                head_copy.type = current.type
                current.data = [head_copy]

                result.append(current)

            # reduce extract results in same group
            if pos + 1 < total and groups[pos + 1] == groups[pos]:
                # NOTE(review): the group id is used directly as an index
                # into `result` - confirm ids always match result positions.
                container = result[groups[pos]]
                follower = ers[pos + 1]

                span_begin = container.start
                span_end = follower.start + follower.length

                container.length = span_end - span_begin
                container.text = source[span_begin:span_end]
                container.type = Constants.SYS_UNIT_CURRENCY
                if isinstance(container.data, list):
                    container.data.append(follower)

        # Groups with a single member collapse back to that member.
        unwrapped = [item.data[0] if len(item.data) == 1 else item for item in result]

        return [item for item in unwrapped if not item.type == Constants.SYS_NUM]
Esempio n. 25
0
    def parse(self,
              source: ExtractResult,
              reference: datetime = None) -> Optional[DateTimeParseResult]:
        """Parse an extracted date/time entity, honouring a leading modifier.

        A leading before/after/since modifier (first matching regex wins, in
        that order) is stripped from ``source`` in place. The remaining text
        is dispatched to the parser configured for ``source.type``, and the
        modifier is re-attached to the parsed result when it has a value.

        Args:
            source: Extraction result to parse. Its ``start``, ``length`` and
                ``text`` are modified when a modifier is stripped.
            reference: Reference date-time; ``datetime.now()`` when falsy.

        Returns:
            The parse result, or ``None`` when ``source.type`` is not one of
            the supported date/time types.
        """
        if not reference:
            reference = datetime.now()

        text = source.text
        before_match = self.config.before_regex.match(text)
        after_match = self.config.after_regex.match(text)
        since_match = self.config.since_regex.match(text)

        has_before = has_after = has_since = False
        leading_modifier = None
        if before_match:
            has_before, leading_modifier = True, before_match
        elif after_match:
            has_after, leading_modifier = True, after_match
        elif since_match:
            has_since, leading_modifier = True, since_match

        mod_str = ''
        if leading_modifier is not None:
            # Cut the modifier off the front so the sub-parser sees only the
            # date/time expression itself.
            consumed = leading_modifier.end()
            source.start += consumed
            source.length -= consumed
            source.text = source.text[consumed:]
            mod_str = leading_modifier.group()

        # Ordered (type, config attribute) pairs; the first equal type wins.
        parser_by_type = (
            (Constants.SYS_DATETIME_DATE, 'date_parser'),
            (Constants.SYS_DATETIME_TIME, 'time_parser'),
            (Constants.SYS_DATETIME_DATETIME, 'date_time_parser'),
            (Constants.SYS_DATETIME_DATEPERIOD, 'date_period_parser'),
            (Constants.SYS_DATETIME_TIMEPERIOD, 'time_period_parser'),
            (Constants.SYS_DATETIME_DATETIMEPERIOD, 'date_time_period_parser'),
            (Constants.SYS_DATETIME_DURATION, 'duration_parser'),
            (Constants.SYS_DATETIME_SET, 'set_parser'),
        )
        parser_name = next(
            (name for kind, name in parser_by_type if source.type == kind),
            None)
        if parser_name is None:
            return None

        result = getattr(self.config, parser_name).parse(source, reference)
        if parser_name == 'date_parser' and not result.value:
            # A date without a value gets a second chance as a holiday.
            result = self.config.holiday_parser.parse(source, reference)

        if (has_before or has_after or has_since) and result.value:
            if has_before:
                modifier = TimeTypeConstants.BEFORE_MOD
            elif has_after:
                modifier = TimeTypeConstants.AFTER_MOD
            else:
                modifier = TimeTypeConstants.SINCE_MOD

            # Put the stripped modifier text back in front of the result.
            mod_len = len(mod_str)
            result.length += mod_len
            result.start -= mod_len
            result.text = mod_str + result.text
            resolved = result.value
            resolved.mod = modifier
            result.value = resolved

        if self.options & DateTimeOptions.SPLIT_DATE_AND_TIME and result.value and result.value.sub_date_time_entities:
            result.value = self._date_time_resolution_for_split(result)
            return result

        return self.set_parse_result(result, has_before, has_after, has_since)