Example #1
0
    def __get_year_from_text(self, match) -> int:
        """Resolve a spelled-out year from the named groups of ``match``.

        The ``firsttwoyearnum`` and ``lasttwoyearnum`` groups are each turned
        into a number by ``self.config.number_parser`` and then combined.

        :param match: regex match exposing both named groups.
        :return: the combined year, or ``-1`` when the leading group is empty
            or the two parsed parts do not form an acceptable year.
        """
        leading_text = match.group('firsttwoyearnum')
        if not leading_text:
            return -1

        # One ExtractResult instance is reused for both parser calls.
        number_er = ExtractResult()
        number_er.text = leading_text
        number_er.start = match.start('firsttwoyearnum')
        number_er.length = match.end('firsttwoyearnum') - number_er.start
        leading_value = self.config.number_parser.parse(number_er).value

        trailing_value = 0
        trailing_text = match.group('lasttwoyearnum')
        if trailing_text:
            number_er.text = trailing_text
            number_er.start = match.start('lasttwoyearnum')
            number_er.length = match.end('lasttwoyearnum') - number_er.start
            trailing_value = self.config.number_parser.parse(number_er).value

        # A leading part below 100 is rejected when nothing follows it, or
        # when it is a multiple of ten followed by a single-word trailing
        # part. ``trailing_text`` is only dereferenced when
        # ``trailing_value`` is non-zero, i.e. when the group was present.
        if leading_value < 100 and (
                trailing_value == 0
                or (leading_value % 10 == 0
                    and len(trailing_text.strip().split(' ')) == 1)):
            return -1

        if leading_value >= 100:
            # The leading part already carries the hundreds.
            return leading_value + trailing_value

        return leading_value * 100 + trailing_value
Example #2
0
    def get_year_from_text(self, match: Match) -> int:
        """Resolve the year described by ``match``.

        A numeric ``year`` group wins; two-digit values are expanded to
        19xx/20xx according to ``Constants.MIN_TWO_DIGIT_YEAR_PAST_NUM`` and
        ``Constants.MAX_TWO_DIGIT_YEAR_FUTURE_NUM``. Otherwise the
        ``Constants.FIRST_TWO_YEAR_NUM`` / ``Constants.LAST_TWO_YEAR_NUM``
        groups are parsed with ``self.config.number_parser`` and combined.

        :param match: regex match exposing the groups named above.
        :return: the resolved year, or ``Constants.INVALID_YEAR`` when no
            usable year can be derived.
        """
        year = Constants.INVALID_YEAR

        year_str = RegExpUtility.get_group(match, 'year')
        if year_str and not year_str.isspace():
            year = int(year_str)
            if 100 > year >= Constants.MIN_TWO_DIGIT_YEAR_PAST_NUM:
                year += 1900
            elif 0 <= year < Constants.MAX_TWO_DIGIT_YEAR_FUTURE_NUM:
                year += 2000
            return year

        def parse_group(group_name: str, text: str):
            """Parse one captured year part; a falsy parser value becomes 0."""
            er = ExtractResult()
            er.text = text
            # Use the group's real position: searching the whole input for
            # the captured text would find an earlier identical substring.
            er.start = match.start(group_name)
            er.length = len(text)
            # Parse once (the parser used to be invoked twice per part).
            return self.config.number_parser.parse(er).value or 0

        first_two_year_num_str = RegExpUtility.get_group(
            match, Constants.FIRST_TWO_YEAR_NUM)
        if not first_two_year_num_str or first_two_year_num_str.isspace():
            return year

        first_two_year_num = parse_group(Constants.FIRST_TWO_YEAR_NUM,
                                         first_two_year_num_str)

        last_two_year_num = 0
        last_two_year_num_str = RegExpUtility.get_group(
            match, Constants.LAST_TWO_YEAR_NUM)
        # Truthiness is tested first so a None/empty group is skipped instead
        # of being handed to ``str.isspace`` (TypeError on None).
        if last_two_year_num_str and not last_two_year_num_str.isspace():
            last_two_year_num = parse_group(Constants.LAST_TWO_YEAR_NUM,
                                            last_two_year_num_str)

        # ``last_two_year_num_str`` is only dereferenced when the trailing
        # value is non-zero, which implies the group was present.
        if first_two_year_num < 100 and (
                last_two_year_num == 0
                or (first_two_year_num % 10 == 0
                    and len(last_two_year_num_str.strip().split(' ')) == 1)):
            return Constants.INVALID_YEAR

        if first_two_year_num >= 100:
            return first_two_year_num + last_two_year_num

        return (first_two_year_num * 100) + last_two_year_num
Example #3
0
    def extract(self, source: str) -> List[ExtractResult]:
        """Extract the spans of ``source`` matched by ``self.regexes``.

        Every regex match marks its characters; each maximal run of marked
        characters that coincides exactly with one match becomes an
        ``ExtractResult`` whose ``data`` is the ``val`` of the first regex
        that produced that span. Runs that start or end with
        ``Constants.IPV6_ELLIPSIS`` are dropped when the neighbouring
        character on that side is a digit or a non-CJK letter.

        :param source: text to scan.
        :return: extraction results in order of appearance; empty when
            ``self._pre_check_str(source)`` is falsy or nothing matches.
        """
        result: List[ExtractResult] = list()
        if not self._pre_check_str(source):
            return result

        source_len = len(source)
        matched: List[bool] = [False] * source_len

        # (start, end) of every match -> val of the first regex yielding it.
        span_values = dict()
        for entry in self.regexes:
            for m in re.finditer(entry.re, source):
                for pos in range(m.start(), m.end()):
                    matched[pos] = True
                # Keep source data for extra information
                span_values.setdefault((m.start(), m.end()), entry.val)

        if not span_values:
            return result

        # Built once instead of once per matched run.
        tokenizer = SimpleTokenizer()

        def is_word_char(char: str) -> bool:
            """True for a digit or a non-CJK letter."""
            return char.isdigit() or (char.isalpha()
                                      and not tokenizer.is_cjk(c=char))

        last = -1
        for i in range(source_len):
            if not matched[i]:
                last = i
                continue
            if i + 1 < source_len and matched[i + 1]:
                continue  # still inside the current run

            start = last + 1
            length = i - last
            substring = source[start:start + length].strip()

            # Leading ellipsis glued to a preceding word character.
            if (substring.startswith(Constants.IPV6_ELLIPSIS) and start > 0
                    and is_word_char(source[start - 1])):
                continue

            # Trailing ellipsis glued to a following word character. The CJK
            # test must look at the character after the run; it used to
            # inspect source[start - 1].
            if (substring.endswith(Constants.IPV6_ELLIPSIS)
                    and i + 1 < source_len
                    and is_word_char(source[i + 1])):
                continue

            span = (start, start + length)
            if span in span_values:
                value = ExtractResult()
                value.start = start
                value.length = length
                value.text = substring
                value.type = self._extract_type
                value.data = span_values[span]
                result.append(value)
        return result
Example #4
0
    def _extract_separate_units(self, source: str, num_depend_source: List[ExtractResult], non_unit_matches) -> List[ExtractResult]:
        """Append stand-alone unit mentions to ``num_depend_source``.

        Matches of ``self.separate_regex`` that do not overlap any span
        already present in ``num_depend_source`` are appended to that list
        in place. A match equal to ``Constants.AMBIGUOUS_TIME_TERM`` is
        skipped when ``self._dimension_inside_time`` reports it inside one
        of ``non_unit_matches``.

        :param source: text being scanned.
        :param num_depend_source: existing results; extended in place.
        :param non_unit_matches: matches used for the ambiguous-term check;
            a falsy value is treated as empty.
        :return: ``num_depend_source`` (the same list object).
        """
        match_result: List[bool] = [False] * len(source)
        for ex_result in num_depend_source:
            for pos in range(ex_result.start,
                             ex_result.start + ex_result.length):
                match_result[pos] = True

        for match in regex.finditer(self.separate_regex, source):
            text = match.group()
            if not text:
                continue

            start, end = match.start(), match.end()
            if any(match_result[start:end]):
                continue  # overlaps something already extracted

            # Mark the match's own span as extracted. The flags used to be
            # set at indices 0..len-1 instead of at the match position.
            match_result[start:end] = [True] * (end - start)

            if text == Constants.AMBIGUOUS_TIME_TERM and any(
                    self._dimension_inside_time(match, time)
                    for time in (non_unit_matches or ())):
                continue

            to_add = ExtractResult()
            to_add.start = start
            to_add.length = len(text)
            to_add.text = text
            to_add.type = self.config.extract_type
            num_depend_source.append(to_add)

        return num_depend_source
Example #5
0
 def _extract_separate_units(
         self, source: str,
         num_depend_source: List[ExtractResult]) -> List[ExtractResult]:
     """Return a copy of ``num_depend_source`` plus stand-alone units.

     Matches of ``self.separate_regex`` that do not overlap any span in
     ``num_depend_source`` are appended to a deep copy of that list; the
     input list itself is left untouched.

     :param source: text being scanned.
     :param num_depend_source: results already extracted from ``source``.
     :return: the deep-copied list extended with the new unit results.
     """
     result = deepcopy(num_depend_source)
     match_result: List[bool] = [False] * len(source)
     for ex_result in num_depend_source:
         for i in range(ex_result.start, ex_result.end + 1):
             match_result[i] = True

     for match in regex.finditer(self.separate_regex, source):
         text = match.group()
         if not text:
             continue

         start, end = match.start(), match.end()
         if any(match_result[start:end]):
             continue  # overlaps something already extracted

         # Mark the match's own span as extracted. The flags used to be
         # set at indices 0..len-1 instead of at the match position.
         match_result[start:end] = [True] * (end - start)

         to_add = ExtractResult()
         to_add.start = start
         to_add.length = len(text)
         to_add.text = text
         to_add.type = self.config.extract_type
         result.append(to_add)
     return result
Example #6
0
    def parse(self, source: ExtractResult) -> Optional[ParseResult]:
        """Resolve an extracted number-with-unit span.

        ``source.text`` is split into unit keys by cutting out the number
        span described by ``source.data`` (when that is an
        ``ExtractResult``). The last unit key, optionally stripped of a
        leading ``self.config.connector_token``, is looked up in
        ``self.config.unit_map``.

        :param source: extraction result; ``source.data`` may carry the
            number part, whose ``start`` is relative to ``source.text``.
        :return: a ``ParseResult`` built from ``source`` with lower-cased
            text. ``value`` and ``resolution_str`` are only assigned when
            the unit is found in the map.
        """
        ret = ParseResult(source)
        if source.data and isinstance(source.data, ExtractResult):
            number_result = source.data
        else:  # no number part: the span consists of the unit only
            number_result = ExtractResult()
            number_result.start = -1
            number_result.length = 0
            number_result.text = None
            number_result.type = None

        # key contains units
        key = source.text
        unit_key_build = ''
        unit_keys = []
        i = 0
        while i <= len(key):
            if i == len(key):
                if unit_key_build:
                    self.__add_if_not_contained(
                        unit_keys, unit_key_build.strip())
            # number_result.start is a relative position
            elif i == number_result.start:
                if unit_key_build:
                    self.__add_if_not_contained(
                        unit_keys, unit_key_build.strip())
                    unit_key_build = ''
                if number_result.length:
                    # Jump to the last character of the number; the
                    # increment below then steps past it.
                    i = number_result.start + number_result.length - 1
            else:
                unit_key_build += key[i]
            i += 1

        # Unit type depends on last unit in suffix. Guarded: when no unit key
        # was collected, ``unit_keys[-1]`` used to raise IndexError; the
        # result is now returned unresolved instead.
        if unit_keys:
            last_unit = unit_keys[-1]
            normalized_last_unit = last_unit.lower()
            connector = self.config.connector_token
            if connector and normalized_last_unit.startswith(connector):
                normalized_last_unit = normalized_last_unit[
                    len(connector):].strip()
                last_unit = last_unit[len(connector):].strip()

            if key and self.config.unit_map:
                unit_value = None
                if last_unit in self.config.unit_map:
                    unit_value = self.config.unit_map[last_unit]
                elif normalized_last_unit in self.config.unit_map:
                    unit_value = self.config.unit_map[normalized_last_unit]

                if unit_value:
                    num_value = self.config.internal_number_parser.parse(
                        number_result) if number_result.text else None
                    resolution_str = num_value.resolution_str if num_value else None

                    ret.value = UnitValue(
                        number=resolution_str,
                        unit=unit_value)
                    ret.resolution_str = f'{resolution_str} {unit_value}'.strip()

        ret.text = ret.text.lower()

        return ret
Example #7
0
    def extract(self, source: str):
        """Extract choice matches from ``source``.

        Each regex in ``self.config.regexes_map`` is run against the
        lower-cased text. A match is kept when ``self.match_value`` gives it
        a positive score at some token position. Results are sorted by
        ``start``; when ``self.config.only_top_match`` is set, only the
        first highest-scoring result is returned.

        :param source: text to scan; ``None`` and blank strings yield ``[]``.
        :return: list of ``ExtractResult`` whose ``data`` is a
            ``ChoiceExtractDataResult`` holding the source text and score.
        """
        results: List[ExtractResult] = list()

        # Validate before calling str methods; ``source.lower()`` used to run
        # first and raised AttributeError for None.
        if source is None or source.strip() == '':
            return results

        partial_results: List[ExtractResult] = list()
        trimmed_source = source.lower()
        source_tokens = self.__tokenize(trimmed_source)

        for (regexp, type_extracted) in self.config.regexes_map.items():
            for match in RegExpUtility.get_matches(regexp, trimmed_source):
                match_tokens = self.__tokenize(match)
                top_score = 0.0

                for i in range(len(source_tokens)):
                    score = self.match_value(source_tokens, match_tokens, i)
                    top_score = max(top_score, score)

                if top_score > 0.0:
                    value = ExtractResult()
                    # NOTE(review): ``index`` locates the first occurrence of
                    # the matched text, so repeated matches share one start.
                    start = trimmed_source.index(match)
                    length = len(match)
                    value.start = start
                    value.length = length
                    value.text = source[start:start + length].strip()
                    value.type = type_extracted
                    value.data = ChoiceExtractDataResult(source, top_score)

                    partial_results.append(value)

        if not partial_results:
            return results

        partial_results = sorted(partial_results, key=lambda res: res.start)

        if not self.config.only_top_match:
            return partial_results

        # First result with the strictly highest score wins.
        top_score = 0.0
        top_result_index = 0
        for index, candidate in enumerate(partial_results):
            if candidate.data.score > top_score:
                top_score = candidate.data.score
                top_result_index = index

        # NOTE(review): a throw-away ChoiceExtractDataResult carrying
        # ``other_matches`` used to be built here and discarded. The returned
        # result's data never received ``other_matches``; that behaviour is
        # kept - confirm whether the other matches should be attached.
        results.append(partial_results[top_result_index])
        return results
Example #8
0
    def extract(self, source: str) -> List[ExtractResult]:
        """Extract the spans of ``source`` matched by ``self.regexes``.

        Every regex match marks its characters; each maximal run of marked
        characters that coincides exactly with one match becomes an
        ``ExtractResult`` whose ``data`` is the ``val`` of the first regex
        that produced that span. When ``self._negative_number_terms`` is set
        and matches the text before the run, the result is widened to start
        at that match. The list is finally passed through
        ``self._filter_ambiguity``.

        :param source: text to scan; ``None`` and blank strings yield ``[]``.
        :return: the filtered extraction results.
        """
        # ``len(...) is 0`` compared identity with an int literal; use plain
        # truthiness instead.
        if source is None or not source.strip():
            return list()

        result: List[ExtractResult] = list()
        source_len = len(source)
        matched: List[bool] = [False] * source_len

        # (start, end) of every match -> val of the first regex yielding it.
        span_values = dict()
        for entry in self.regexes:
            for m in regex.finditer(entry.re, source):
                for pos in range(m.start(), m.end()):
                    matched[pos] = True
                # Keep source data for extra information
                span_values.setdefault((m.start(), m.end()), entry.val)

        last = -1
        for i in range(source_len):
            if not matched[i]:
                last = i
                continue
            if i + 1 < source_len and matched[i + 1]:
                continue  # still inside the current run

            start = last + 1
            length = i - last
            span = (start, start + length)
            if span not in span_values:
                continue

            # extract negative numbers
            if self._negative_number_terms is not None:
                match = regex.search(self._negative_number_terms,
                                     source[0:start])
                if match is not None:
                    start = match.start()
                    length = length + match.end() - match.start()

            value = ExtractResult()
            value.start = start
            value.length = length
            value.text = source[start:start + length].strip()
            value.type = self._extract_type
            value.data = span_values[span]
            result.append(value)

        result = self._filter_ambiguity(result, source)
        return result
Example #9
0
    def extract(self, source: str) -> List[ExtractResult]:
        """Extract the spans of ``source`` matched by ``self.regexes``.

        Only matches accepted by ``self._is_valid_match`` are considered.
        Each maximal run of covered characters that coincides exactly with
        one accepted match becomes an ``ExtractResult`` carrying the ``val``
        of the regex that produced it.

        :param source: text to scan.
        :return: extraction results in order of appearance; empty when
            ``self._pre_check_str(source)`` is falsy.
        """
        extracted: List[ExtractResult] = list()
        if not self._pre_check_str(source):
            return extracted

        text_len = len(source)
        covered: List[bool] = [False] * text_len
        origin_of: Dict[Match, str] = dict()

        per_regex = [
            MatchesVal(matches=list(re.finditer(entry.re, source)),
                       val=entry.val)
            for entry in self.regexes
        ]

        for bundle in per_regex:
            for hit in bundle.matches:
                if not self._is_valid_match(hit):
                    continue
                for offset in range(len(hit.group())):
                    covered[hit.start() + offset] = True
                # Remember which regex value produced this match.
                origin_of[hit] = bundle.val

        run_start = 0
        for index, is_covered in enumerate(covered):
            if not is_covered:
                # The next run can start right after this character at best.
                run_start = index + 1
                continue
            if index + 1 != text_len and covered[index + 1]:
                continue  # the run goes on

            run_length = index - run_start + 1
            snippet = source[run_start:run_start + run_length].strip()

            # First recorded match whose span equals the run exactly.
            owner = None
            for candidate in origin_of:
                if (candidate.start() == run_start
                        and candidate.end() - candidate.start() == run_length):
                    owner = candidate
                    break
            if owner is None:
                continue

            entity = ExtractResult()
            entity.start = run_start
            entity.length = run_length
            entity.text = snippet
            entity.type = self._extract_type
            entity.data = origin_of.get(owner, None)
            extracted.append(entity)

        return extracted
def merge_all_tokens(tokens: List[Token], source: str,
                     extractor_name: str) -> List[ExtractResult]:
    """Collapse overlapping tokens and convert the survivors to results.

    Falsy tokens are dropped and the rest are visited in order of ``start``.
    A token is discarded when it lies inside, or starts strictly inside, a
    token that was already kept; when it covers a kept token it takes that
    token's place.

    :param tokens: candidate tokens (falsy entries are ignored).
    :param source: text the token offsets refer to.
    :param extractor_name: value stored in each result's ``type``.
    :return: one ``ExtractResult`` per kept token.
    """
    kept: List[Token] = list()
    ordered = sorted((tok for tok in tokens if tok), key=lambda x: x.start)

    for candidate in ordered:
        absorbed = False

        for slot, holder in enumerate(kept):
            is_inside = (candidate.start >= holder.start
                         and candidate.end <= holder.end)
            starts_within = holder.start < candidate.start < holder.end
            is_cover = (candidate.start <= holder.start
                        and candidate.end >= holder.end)

            if is_cover:
                # The wider token takes over the kept slot.
                kept[slot] = candidate

            if is_inside or starts_within or is_cover:
                absorbed = True
                break

        if not absorbed:
            kept.append(candidate)

    converted = []
    for tok in kept:
        begin = tok.start
        size = tok.length

        item = ExtractResult()
        item.start = begin
        item.length = size
        item.text = source[begin:begin + size]
        item.type = extractor_name
        item.data = None
        item.meta_data = tok.metadata
        converted.append(item)

    return converted
Example #11
0
    def extract(self, source: str) -> List[ExtractResult]:
        """Extract the spans covered by ``self.regexes`` after preprocessing.

        The input is first rewritten by
        ``__preprocess_with_number_extracted``; every maximal run of
        characters covered by a regex match in the rewritten text becomes an
        ``ExtractResult``. The raw results are then handed to
        ``__post_processing`` together with the original text and the
        preprocessing bookkeeping.

        :param source: text to scan.
        :return: whatever ``__post_processing`` returns for the raw results.
        """
        original_text = source

        # Preprocess the sentence by extracting and replacing its numbers.
        prepared = self.__preprocess_with_number_extracted(original_text)
        source = prepared.source
        position_map = prepared.position
        number_results = prepared.results

        hits_per_regex = [list(regex.finditer(pattern, source))
                          for pattern in self.regexes]

        text_len = len(source)
        covered: List[bool] = [False] * text_len
        for hits in hits_per_regex:
            for hit in hits:
                for offset in range(len(hit.group())):
                    covered[hit.start() + offset] = True

        spans = list()

        # Turn each maximal covered run into one result.
        run_start = 0
        for index, is_covered in enumerate(covered):
            if not is_covered:
                run_start = index + 1
            elif index + 1 == text_len or not covered[index + 1]:
                run_length = index - run_start + 1
                span = ExtractResult()
                span.start = run_start
                span.length = run_length
                span.text = source[run_start:run_start + run_length].strip()
                span.type = self._extract_type
                spans.append(span)

        # Post-processing restores the extracted numbers.
        return self.__post_processing(spans, original_text, position_map,
                                      number_results)
Example #12
0
    def merge_date_and_time(self, source: str,
                            reference: datetime) -> List[Token]:
        """Build tokens that join an adjacent date and time (or number).

        Date and time extractions, plus bare numbers matched by
        ``self.config.number_as_time_regex`` when the CALENDAR option is on,
        are sorted by position. Neighbouring non-overlapping date/time,
        time/date or date/number pairs are merged when the text between them
        is acceptable. Each token is afterwards widened by a following
        ``suffix_regex`` match and a preceding ``common_date_prefix_regex``
        match.

        :param source: text to scan.
        :param reference: reference datetime passed on to the extractors.
        :return: merged tokens; empty when there is no date, or neither a
            time nor a number-as-time match.
        """
        tokens: List[Token] = list()
        date_ers: List[
            ExtractResult] = self.config.date_point_extractor.extract(
                source, reference)

        if not date_ers:
            return tokens

        time_ers = self.config.time_point_extractor.extract(source, reference)
        # All matches are needed; ``.match()`` returned a single match (or
        # None), which was then measured with len() and indexed.
        time_num_matches = list(
            self.config.number_as_time_regex.finditer(source))

        if not time_ers and not time_num_matches:
            return tokens

        # Copy so the list returned by the date extractor is not mutated.
        extract_results = list(date_ers)
        extract_results.extend(time_ers)

        # handle cases which use numbers as time points
        # only enabled in CalendarMode
        if (self.config.options & DateTimeOptions.CALENDAR) != 0:
            for match in time_num_matches:
                node = ExtractResult()
                node.start = match.start()
                node.length = len(match.group())
                # Match objects have no ``text`` attribute.
                node.text = match.group()
                node.type = NumConstants.SYS_NUM_INTEGER
                extract_results.append(node)

        extract_results = sorted(extract_results, key=lambda x: x.start)

        i = 0
        while i < len(extract_results) - 1:
            j = i + 1
            while j < len(extract_results) and extract_results[i].overlap(
                    extract_results[j]):
                j += 1

            if j >= len(extract_results):
                break

            current = extract_results[i]
            following = extract_results[j]

            # Types are strings: compare by value, and compare the
            # following result's ``type`` rather than the object itself.
            is_mergeable_pair = (
                (current.type == Constants.SYS_DATETIME_DATE
                 and following.type == Constants.SYS_DATETIME_TIME) or
                (current.type == Constants.SYS_DATETIME_TIME
                 and following.type == Constants.SYS_DATETIME_DATE) or
                (current.type == Constants.SYS_DATETIME_DATE
                 and following.type == NumConstants.SYS_NUM_INTEGER))

            if is_mergeable_pair:
                middle_begin = current.start + (current.length or 0)
                middle_end = following.start or 0

                if middle_begin > middle_end:
                    i = j + 1
                    continue

                middle_str = source[middle_begin:middle_end].strip()
                valid = False

                # for cases like "tomorrow 3", "tomorrow at 3"
                if following.type == NumConstants.SYS_NUM_INTEGER:
                    match = self.config.date_number_connector_regex.search(
                        middle_str)
                    if not middle_str or match:
                        valid = True
                else:
                    # for case like "3 pm or later on monday"
                    match = self.config.suffix_after_regex.search(middle_str)
                    if match:
                        # Keep what follows the suffix; the slice end used to
                        # be ``len(middle_end)``, i.e. len() of an int.
                        middle_str = middle_str[match.end():].strip()

                    if not (match and len(middle_str) == 0):
                        if self.config.is_connector_token(middle_str):
                            valid = True

                if valid:
                    begin = current.start or 0
                    end = (following.start or 0) + (following.length or 0)

                    # NOTE(review): unpacking order (end, start) is kept as
                    # found - confirm against the return order of
                    # extend_with_date_time_and_year.
                    end_index, start_index = self.extend_with_date_time_and_year(
                        begin, end, source, reference)

                    tokens.append(Token(start_index, end_index))
                    i = j + 1
                    continue
            i = j

        # handle "in the afternoon" at the end of entity
        for idx, token in enumerate(tokens):
            after_str = source[token.end:]
            match = self.config.suffix_regex.search(after_str)
            if match:
                tokens[idx] = Token(token.start,
                                    token.end + len(match.group()))

        # handle "day" prefixes
        for idx, token in enumerate(tokens):
            before_str = source[0:token.start]
            match = self.config.utility_configuration.common_date_prefix_regex.search(
                before_str)
            if match:
                tokens[idx] = Token(token.start - len(match.group()),
                                    token.end)

        return tokens
Example #13
0
    def __merged_compound_units(self, source: str):
        """Merge neighbouring number-with-unit results into compound units.

        Results from ``NumberWithUnitExtractor`` (after
        ``__merge_pure_number``) are assigned group numbers: consecutive
        results share a group when only whitespace, or exactly one match of
        ``self.config.compound_unit_connector_regex``, lies between them.
        Each group is reduced to one result typed
        ``Constants.SYS_UNIT_CURRENCY`` whose ``data`` lists its members;
        single-member groups are unwrapped again and results typed
        ``Constants.SYS_NUM`` are dropped.

        :param source: text to scan.
        :return: list of merged ``ExtractResult`` objects.
        """
        ers = NumberWithUnitExtractor(self.config).extract(source)
        ers = self.__merge_pure_number(source, ers)

        result = []
        groups = [0] * len(ers)

        for idx in range(len(ers) - 1):
            current = ers[idx]
            following = ers[idx + 1]

            if (current.type != following.type
                    and current.type != Constants.SYS_NUM
                    and following.type != Constants.SYS_NUM):
                continue

            if isinstance(current.data, ExtractResult) and not str(
                    current.data.data).startswith("Integer"):
                groups[idx + 1] = groups[idx] + 1
                continue

            middle_begin = current.start + current.length
            middle_end = following.start
            middle_str = source[middle_begin:middle_end].strip().lower()

            # Separated by whitespace
            if not middle_str:
                groups[idx + 1] = groups[idx]
                continue

            # Separated by connector: the connector has to span the whole
            # middle text. ``match.pos`` and ``match.string`` describe the
            # search start and the full input, not the matched span, so the
            # comparison uses ``start()``/``end()`` instead.
            match = self.config.compound_unit_connector_regex.match(middle_str)
            if match and match.start() == 0 and match.end() == len(middle_str):
                groups[idx + 1] = groups[idx]
            else:
                groups[idx + 1] = groups[idx] + 1

        for idx, er in enumerate(ers):
            if idx == 0 or groups[idx] != groups[idx - 1]:
                # Snapshot the first member, then turn it into the group
                # holder whose data lists the members.
                tmp = ExtractResult()
                tmp.data = er.data
                tmp.length = er.length
                tmp.start = er.start
                tmp.text = er.text
                tmp.type = er.type
                er.data = [tmp]

                result.append(er)

            # reduce extract results in same group
            if idx + 1 < len(ers) and groups[idx + 1] == groups[idx]:
                group = groups[idx]

                period_begin = result[group].start
                period_end = ers[idx + 1].start + ers[idx + 1].length

                result[group].length = period_end - period_begin
                result[group].text = source[period_begin:period_end]
                result[group].type = Constants.SYS_UNIT_CURRENCY
                if isinstance(result[group].data, list):
                    result[group].data.append(ers[idx + 1])

        # Unwrap groups that ended up with a single member.
        for idx, merged in enumerate(result):
            inner_data = merged.data
            if len(inner_data) == 1:
                result[idx] = inner_data[0]

        result = [x for x in result if x.type != Constants.SYS_NUM]

        return result
Example #14
0
    def extract(self, source: str) -> List[ExtractResult]:
        if not self._pre_check_str(source):
            return []

        non_unit_match = None
        numbers = None

        mapping_prefix: Dict[float, PrefixUnitResult] = dict()
        matched = [False] * len(source)
        result = []
        prefix_matched = False
        prefix_match: List[MatchResult] = sorted(self.prefix_matcher.find(source), key=lambda o: o.start)
        suffix_match: List[MatchResult] = sorted(self.suffix_matcher.find(source), key=lambda o: o.start)

        if len(prefix_match) > 0 or len(suffix_match) > 0:

            numbers: List[ExtractResult] = sorted(self.config.unit_num_extractor.extract(source), key=lambda o: o.start)

            if len(numbers) > 0 and self.config.extract_type is Constants.SYS_UNIT_CURRENCY and len(prefix_match) > 0 and len(suffix_match) > 0:

                for number in numbers:
                    start = number.start
                    length = number.length
                    number_prefix = [(mr.start + mr.length) == start for mr in prefix_match]
                    number_suffix = [mr.start == (start + length) for mr in suffix_match]
                    if True in number_prefix and True in number_suffix and "," in number.text:
                        comma_index = number.start + number.text.index(",")
                        source = source[:comma_index] + " " + source[comma_index + 1:]

                numbers: List[ExtractResult] = sorted(self.config.unit_num_extractor.extract(source), key=lambda o: o.start)

            # Special case for cases where number multipliers clash with unit
            ambiguous_multiplier_regex = self.config.ambiguous_unit_number_multiplier_regex
            if ambiguous_multiplier_regex is not None:

                for num in numbers:
                    match = list(filter(lambda x: x.group(), regex.finditer(
                        ambiguous_multiplier_regex, num.text)))
                    if match and len(match) == 1:
                        new_length = num.length - \
                            (match[0].span()[1] - match[0].span()[0])
                        num.text = num.text[0:new_length]
                        num.length = new_length

            for number in numbers:
                if number.start is None or number.length is None:
                    continue
                start = int(number.start)
                length = int(number.length)
                max_find_pref = min(self.max_prefix_match_len, number.start)
                max_find_suff = len(source) - start - length

                if max_find_pref != 0:
                    last_index = start
                    best_match = None

                    for m in prefix_match:
                        if m.length > 0 and m.end > start:
                            break

                        if m.length > 0 and source[m.start: m.start + (last_index - m.start)].strip() == m.text:
                            best_match = m
                            break

                    if best_match is not None:
                        off_set = last_index - best_match.start
                        unit_str = source[best_match.start:best_match.start + off_set]
                        self.add_element(mapping_prefix, number.start, (PrefixUnitResult(off_set, unit_str)))
                prefix_unit = mapping_prefix.get(start, None)
                if max_find_suff > 0:

                    max_len = 0
                    first_index = start + length

                    for m in suffix_match:

                        if m.length > 0 and m.start >= first_index:

                            end_pos = m.start + m.length - first_index
                            if max_len < end_pos:
                                mid_str = source[first_index: first_index + (m.start - first_index)]
                                if mid_str is None or not mid_str or str.isspace(mid_str) \
                                        or mid_str.strip() == self.config.connector_token:
                                    max_len = end_pos

                    if max_len != 0:
                        substr = source[start: start + length + max_len]
                        er = ExtractResult()

                        er.start = start
                        er.length = length + max_len
                        er.text = substr
                        er.type = self.config.extract_type

                        if prefix_unit is not None:
                            prefix_matched = True
                            er.start -= prefix_unit[0].offset
                            er.length += prefix_unit[0].offset
                            er.text = prefix_unit[0].unit + er.text

                        # Relative position will be used in Parser
                        number.start = start - er.start
                        er.data = number

                        # Special treatment, handle cases like '2:00 pm', '00 pm' is not dimension
                        is_not_unit = False

                        if er.type is Constants.SYS_UNIT_DIMENSION:
                            if non_unit_match is None:
                                non_unit_match = list(self.config.non_unit_regex.finditer(source))
                            for time in non_unit_match:
                                trimmed_source = source.lower()
                                index = trimmed_source.index(time.group())
                                if er.start >= time.start() and er.start + er.length <= \
                                        time.start() + len(time.group()):
                                    is_not_unit = True
                                    break

                        if is_not_unit:
                            continue

                        result.append(er)

                if prefix_unit and prefix_unit is not None and not prefix_matched:
                    er = ExtractResult()
                    er.start = number.start - prefix_unit[0].offset
                    er.length = number.length + prefix_unit[0].offset
                    er.text = prefix_unit[0].unit + number.text
                    er.type = self.config.extract_type

                    # Relative position will be used in Parser
                    number.start = start - er.start
                    er.data = number
                    result.append(er)

        # Extract Separate unit
        if self.separate_regex:
            if non_unit_match is None:
                try:
                    non_unit_match = list(self.config.non_unit_regex.match(source))
                except:
                    non_unit_match = []

            self._extract_separate_units(source, result, non_unit_match)

            # Remove common ambiguous cases
            result = self._filter_ambiguity(result, source)

        # Expand Chinese phrase to the `half` patterns when it follows closely origin phrase.
        self.config.expand_half_suffix(source, result, numbers)

        return result
Example #15
0
    def extract(self, source: str) -> List[ExtractResult]:
        """Extract number-with-unit entities from ``source``.

        Each number returned by ``self.config.unit_num_extractor`` is paired
        with a unit found immediately after it (``self.suffix_regex``) and/or
        immediately before it (``self.prefix_regex``). Only whitespace or
        ``self.config.connector_token`` may sit between a number and its
        suffix unit.

        Args:
            source: The text to scan.

        Returns:
            The extracted results. Each result stores the originating number
            in ``data``; that number's ``start`` is rewritten to be relative
            to the start of the result.
        """
        if not self._pre_check_str(source):
            return []

        numbers: List[ExtractResult] = self.config.unit_num_extractor.extract(
            source)
        result: List[ExtractResult] = []
        source_len = len(source)

        # Pass 1: for every number, remember the unit sitting right before it.
        mapping_prefix: Dict[int, PrefixUnitResult] = {}
        if self.max_prefix_match_len != 0:
            for num in numbers:
                if num.start is None or num.length is None:
                    continue
                max_find_prefix = min(self.max_prefix_match_len, num.start)
                if max_find_prefix == 0:
                    continue

                left: str = source[num.start - max_find_prefix:num.start]
                last_index = len(left)
                best_match: Match = None
                for pattern in self.prefix_regex:
                    for match in regex.finditer(pattern, left):
                        unit_text = match.group()
                        if not unit_text:
                            continue
                        # The unit must run up to the number, ignoring
                        # surrounding whitespace.
                        if left[match.start():last_index].strip() != unit_text:
                            continue
                        # Prefer the match that starts furthest to the left.
                        if (best_match is None
                                or best_match.start() >= match.start()):
                            best_match = match
                if best_match:
                    mapping_prefix[num.start] = PrefixUnitResult(
                        offset=last_index - best_match.start(),
                        unit=left[best_match.start():last_index])

        # Spans of the "not a unit" pattern; computed at most once, and only
        # if a dimension result actually needs the check.
        non_unit_spans = None

        # Pass 2: look for a unit right after each number and build results.
        for num in numbers:
            if num.start is None or num.length is None:
                continue
            start = num.start
            length = num.length
            max_find_len = source_len - start - length

            prefix_unit: PrefixUnitResult = mapping_prefix.get(start, None)

            if max_find_len > 0:
                right = source[start + length:]

                # Longest suffix unit separated from the number only by
                # whitespace or the connector token.
                max_len = 0
                for pattern in self.suffix_regex:
                    for match in regex.finditer(pattern, right):
                        if not match.group():
                            continue
                        end_pos = match.end()
                        if max_len >= end_pos:
                            continue
                        middle = right[:match.start()].strip()
                        if not middle or middle == self.config.connector_token:
                            max_len = end_pos

                if max_len != 0:
                    ex_result = ExtractResult()
                    ex_result.start = start
                    ex_result.length = length + max_len
                    ex_result.text = source[start:start + length + max_len]
                    ex_result.type = self.config.extract_type

                    if prefix_unit:
                        ex_result.start -= prefix_unit.offset
                        ex_result.length += prefix_unit.offset
                        ex_result.text = prefix_unit.unit + ex_result.text

                    # The number position becomes relative to the result.
                    num.start = start - ex_result.start
                    ex_result.data = num

                    if ex_result.type == Constants.SYS_UNIT_DIMENSION:
                        if non_unit_spans is None:
                            non_unit_spans = [
                                m.span() for m in
                                self.config.pm_non_unit_regex.finditer(source)
                            ]
                        # Drop results lying entirely inside a non-unit match.
                        if any(ex_result.start >= span_start
                               and ex_result.end <= span_end
                               for span_start, span_end in non_unit_spans):
                            continue

                    result.append(ex_result)
                    continue

            # No usable suffix unit: fall back to the prefix unit alone.
            if prefix_unit:
                ex_result = ExtractResult()
                ex_result.start = num.start - prefix_unit.offset
                ex_result.length = num.length + prefix_unit.offset
                ex_result.text = prefix_unit.unit + num.text
                ex_result.type = self.config.extract_type

                num.start = start - ex_result.start
                ex_result.data = num
                result.append(ex_result)

        if self.separate_regex:
            result = self._extract_separate_units(source, result)

        return result
    def parse_specific_time(self, source: str,
                            reference: datetime) -> DateTimeResolutionResult:
        """Parse a "from X to Y" / "between X and Y" time range.

        The text is stripped and lower-cased, then matched against
        ``self.config.specific_time_from_to_regex`` and, failing that,
        ``self.config.specific_time_between_and_regex``. Hours and minutes are
        resolved through ``self.config.numbers`` first and ``int()`` second,
        then adjusted according to any am/pm descriptors found on either side.

        Args:
            source: The text to parse.
            reference: Supplies the year, month and day for the produced
                date-times.

        Returns:
            A ``DateTimeResolutionResult``. When no pattern matches at index 0
            the freshly constructed result is returned untouched. Otherwise
            ``timex``, ``future_value``, ``past_value``, ``success`` and
            ``sub_date_time_entities`` are filled in, and ``comment`` is set
            to ``Constants.AM_PM_GROUP_NAME`` when neither side has an am/pm
            descriptor and both hours are at most 12.

        Raises:
            ValueError: If an hour/minute capture is neither a key of
                ``self.config.numbers`` nor parseable by ``int()``, or if
                ``datetime`` rejects the resulting hour/minute.
        """
        result = DateTimeResolutionResult()
        year = reference.year
        month = reference.month
        day = reference.day

        source = source.strip().lower()

        match = regex.search(self.config.specific_time_from_to_regex, source)
        if not match:
            match = regex.search(self.config.specific_time_between_and_regex,
                                 source)

        if not match or match.start() != 0:
            return result

        def to_number(text: str) -> int:
            # Check for None explicitly: a mapped value of 0 is a valid
            # result and must not fall through to int(), which would fail on
            # a word such as "zero".
            value = self.config.numbers.get(text, None)
            return int(text) if value is None else value

        time1 = RegExpUtility.get_group(match, "time1")
        time2 = RegExpUtility.get_group(match, "time2")

        # get hours
        hour_group_list = RegExpUtility.get_group_list(
            match, Constants.HOUR_GROUP_NAME)
        begin_hour = to_number(hour_group_list[0])
        end_hour = to_number(hour_group_list[1])

        # get minutes; -1 means "no minute was given for this side"
        minute_group_list = RegExpUtility.get_group_list(
            match, Constants.MINUTE_GROUP_NAME)

        begin_minute = end_minute = -1
        if len(minute_group_list) > 1:
            begin_minute = to_number(minute_group_list[0])
            end_minute = to_number(minute_group_list[1])
        elif len(minute_group_list) == 1:
            # A single minute capture belongs to whichever side contains it.
            minute_str = minute_group_list[0]
            if minute_str in time1:
                begin_minute = to_number(minute_str)
            elif minute_str in time2:
                end_minute = to_number(minute_str)

        # parse AM/PM
        left_desc: str = RegExpUtility.get_group(
            match, Constants.LEFT_DESC_GROUP_NAME)
        right_desc: str = RegExpUtility.get_group(
            match, Constants.RIGHT_DESC_GROUP_NAME)

        # Fall back to generic descriptor captures for a side that has none.
        desc_capture_list = RegExpUtility.get_group_list(
            match, Constants.DESC_GROUP_NAME)
        for desc_capture in desc_capture_list:
            if desc_capture in time1 and not left_desc:
                left_desc = desc_capture
            elif desc_capture in time2 and not right_desc:
                right_desc = desc_capture

        begin_date_time = datetime(
            year,
            month,
            day,
            hour=begin_hour,
            minute=max(begin_minute, 0))
        end_date_time = datetime(
            year,
            month,
            day,
            hour=end_hour,
            minute=max(end_minute, 0))

        has_left_am = left_desc.startswith('a')
        has_left_pm = left_desc.startswith('p')
        has_right_am = right_desc.startswith('a')
        has_right_pm = right_desc.startswith('p')
        has_left = has_left_am or has_left_pm
        has_right = has_right_am or has_right_pm

        half_day = timedelta(hours=12)

        # both time point has description like 'am' or 'pm'
        if has_left and has_right:
            if has_left_am:
                if begin_hour >= 12:
                    begin_date_time -= half_day
            else:
                if begin_hour < 12:
                    begin_date_time += half_day
            if has_right_am:
                # NOTE(review): this uses `>` while the begin side uses `>=`,
                # so an end hour of exactly 12 with 'am' is left at 12 here.
                # Confirm this asymmetry is intended.
                if end_hour > 12:
                    end_date_time -= half_day
            else:
                if end_hour < 12:
                    end_date_time += half_day
        # one of the time point has description like 'am' or 'pm'
        elif has_left or has_right:
            if has_left_am:
                if begin_hour >= 12:
                    begin_date_time -= half_day
                if end_hour < 12:
                    if end_date_time < begin_date_time:
                        end_date_time += half_day
            elif has_left_pm:
                if begin_hour < 12:
                    begin_date_time += half_day
                if end_hour < 12:
                    if end_date_time < begin_date_time:
                        span = begin_date_time - end_date_time
                        if span >= half_day:
                            end_date_time += timedelta(hours=24)
                        else:
                            end_date_time += half_day
            if has_right_am:
                if end_hour >= 12:
                    end_date_time -= half_day
                if begin_hour < 12:
                    if end_date_time < begin_date_time:
                        begin_date_time -= half_day
            elif has_right_pm:
                if end_hour < 12:
                    end_date_time += half_day
                if begin_hour < 12:
                    if end_date_time < begin_date_time:
                        begin_date_time -= half_day
                    else:
                        span = end_date_time - begin_date_time
                        if span >= half_day:
                            begin_date_time += half_day
        # no 'am' or 'pm' indicator
        elif begin_hour <= 12 and end_hour <= 12:
            if begin_date_time > end_date_time:
                if begin_hour == 12:
                    begin_date_time -= half_day
                else:
                    end_date_time += half_day
            result.comment = Constants.AM_PM_GROUP_NAME

        # An end before the begin is taken to fall on the following day.
        if end_date_time < begin_date_time:
            end_date_time += timedelta(hours=24)

        # Minutes are only emitted for a side that actually specified them.
        if begin_minute >= 0:
            begin = f'T{begin_date_time.hour:02d}:{begin_date_time.minute:02d}'
        else:
            begin = f'T{begin_date_time.hour:02d}'
        if end_minute >= 0:
            end = f'T{end_date_time.hour:02d}:{end_date_time.minute:02d}'
        else:
            end = f'T{end_date_time.hour:02d}'

        # Add the duration to midnight so that its hour/minute parts can be
        # read off a datetime.
        difference = datetime(year, month,
                              day) + (end_date_time - begin_date_time)
        if difference.minute != 0 and difference.hour != 0:
            result.timex = f'({begin},{end},PT{difference.hour}H{difference.minute}M)'
        elif difference.minute != 0 and difference.hour == 0:
            result.timex = f'({begin},{end},PT{difference.minute}M)'
        else:
            result.timex = f'({begin},{end},PT{difference.hour}H)'

        result.future_value = ResolutionStartEnd()
        result.past_value = ResolutionStartEnd()
        result.future_value.start = begin_date_time
        result.future_value.end = end_date_time
        result.past_value.start = result.future_value.start
        result.past_value.end = result.future_value.end
        result.success = True

        result.sub_date_time_entities = []

        # In SplitDateAndTime mode, time points are taken from these
        # sub_date_time_entities.
        # For cases like "from 4 to 5pm", "4" should not be treated as a
        # sub_date_time_entity.
        if has_left or begin_minute >= 0:
            er = ExtractResult()
            er.start = match.start("time1")
            er.length = match.end("time1") - match.start("time1")
            er.text = time1
            er.type = Constants.SYS_DATETIME_TIME
            pr = self.config.time_parser.parse(er, reference)
            result.sub_date_time_entities.append(pr)

        # For cases like "from 4am to 5", "5" should not be treated as a
        # sub_date_time_entity.
        if has_right or end_minute >= 0:
            er = ExtractResult()
            er.start = match.start("time2")
            er.length = match.end("time2") - match.start("time2")
            er.text = time2
            er.type = Constants.SYS_DATETIME_TIME
            pr = self.config.time_parser.parse(er, reference)
            result.sub_date_time_entities.append(pr)

        return result