Ejemplo n.º 1
0
def _regex_match(sentence, regex_list):
    """
    Run every regex in regex_list against the sentence and return the
    matches that survive overlap resolution.

    Each match becomes an overlap.Candidate spanning [start, end), where the
    matched text has its trailing whitespace removed. If the regex defines
    the named group _GROUP_HOUSING, the text captured by that group (None if
    the group did not participate in the match) is stored in the candidate's
    'other' field; otherwise 'other' is None.

    Returns the result of overlap.remove_overlap (called with
    keep_longest=True) on the candidates sorted longest-first.
    """

    candidates = []
    for regex in regex_list:
        for match in regex.finditer(sentence):
            # strip any trailing whitespace (invalidates match.end(), so the
            # end offset is recomputed from the trimmed text)
            match_text = match.group().rstrip()
            start = match.start()
            end = start + len(match_text)

            # groupdict() maps a non-participating named group to None and
            # has no entry at all if the regex lacks the group
            housing = match.groupdict().get(_GROUP_HOUSING)

            candidates.append(
                overlap.Candidate(start, end, match_text, regex,
                                  other=housing))

    # sort the candidates in DECREASING order of length (longest first);
    # reverse=True is required, a plain sorted() call would be ascending
    candidates = sorted(candidates,
                        key=lambda x: x.end - x.start,
                        reverse=True)

    if _TRACE:
        DISPLAY('\tCandidate matches: ')
        for index, c in enumerate(candidates):
            regex_index = regex_list.index(c.regex)
            DISPLAY('\t[{0:2}] R{1:2}\t[{2},{3}): ->{4}<-'.format(
                index, regex_index, c.start, c.end, c.match_text))
        DISPLAY()

    # keep the longest of any overlapping matches
    pruned_candidates = overlap.remove_overlap(candidates,
                                               False,
                                               keep_longest=True)

    if _TRACE:
        DISPLAY('\tCandidate matches after overlap resolution: ')
        for index, c in enumerate(pruned_candidates):
            regex_index = regex_list.index(c.regex)
            DISPLAY('\t[{0:2}] R{1:2}\t[{2},{3}): ->{4}<-'.format(
                index, regex_index, c.start, c.end, c.match_text))
        DISPLAY()

    return pruned_candidates
Ejemplo n.º 2
0
def run(sentence):
    """
    Find dates in the sentence by attempting to match all regexes. Avoid
    matching sub-expressions of already-matched strings.

    The sentence is first passed through _clean_sentence; all reported
    offsets refer to that cleaned sentence.

    Returns a JSON string (indent=4) encoding an array with one object per
    date found, ordered by start offset. Each object has the DateValue
    fields: text, start, end, year, month, day. A component that the match
    does not supply is left as EMPTY_FIELD.

    Raises AssertionError if an ISO datetime match contains no 'T' or if a
    pruned candidate's text cannot be re-matched by its regex.
    """

    results = []  # DateValue namedtuple results
    candidates = []  # potential matches, need overlap resolution to confirm

    original_sentence = sentence
    sentence = _clean_sentence(sentence)

    if _TRACE:
        log('(DF) original: {0}'.format(original_sentence))
        log('(DF)  cleaned: {0}'.format(sentence))

    for regex_index, regex in enumerate(_regexes):
        for match in regex.finditer(sentence):
            raw_text = match.group()
            match_text = raw_text.strip()

            # strip() may remove LEADING whitespace as well; advance the
            # start offset by that amount so [start, end) still brackets
            # match_text in the sentence
            start = match.start() + len(raw_text) - len(raw_text.lstrip())

            if _ISO_DATETIME_REGEX_INDEX == regex_index:
                # extract only the date portion
                t_pos = match_text.find('T')
                assert -1 != t_pos
                match_text = match_text[:t_pos]

            end = start + len(match_text)
            candidates.append(overlap.Candidate(start, end, match_text, regex))

            if _TRACE:
                log('\t[{0:2}]: MATCH TEXT: ->{1}<-'.format(
                    regex_index, match_text))

    # sort the candidates in descending order of length, which is needed for
    # one-pass overlap resolution later on
    candidates = sorted(candidates,
                        key=lambda x: x.end - x.start,
                        reverse=True)

    if _TRACE:
        log('\tCandidate matches: ')
        for index, c in enumerate(candidates):
            log('\t[{0:2}]\t[{1},{2}): {3}'.format(index, c.start, c.end,
                                                   c.match_text))
        log()

    pruned_candidates = overlap.remove_overlap(candidates, _TRACE)

    if _TRACE:
        log('\tcandidates count after overlap removal: {0}'.format(
            len(pruned_candidates)))
        log('\tPruned candidates: ')
        for c in pruned_candidates:
            log('\t\t[{0},{1}): {2}'.format(c.start, c.end, c.match_text))
        log()
        log('Extracting data from pruned candidates...')

    for pc in pruned_candidates:

        # use the saved regex to match the saved text again
        if _regex_iso_datetime == pc.regex:
            # the saved text holds only the date portion, which the full
            # datetime regex is not used for; match with _regex_iso_2
            match = _regex_iso_2.match(pc.match_text)
        else:
            match = pc.regex.match(pc.match_text)
        assert match

        int_year = EMPTY_FIELD
        int_month = EMPTY_FIELD
        int_day = EMPTY_FIELD

        if _TRACE:
            log('\t     matched: "{0}"'.format(match.group()))
            log('\t\t groupdict: {0}'.format(match.groupdict()))
            log('\t\tmatch_text: "{0}"'.format(pc.match_text))

        for k, v in match.groupdict().items():
            if v is None:
                # named group did not participate in the match
                continue
            if 'year' == k:
                int_year = int(v)
            elif 'month' == k:
                # convert textual months to int
                if re.search(r'\D', v):
                    int_month = month_dict[v.strip().lower()]
                else:
                    int_month = int(v)
            elif 'day' == k:
                # strip text from 1st, 3rd, etc.
                if re.search(r'\D', v):
                    int_day = int(re.search(r'\d+', v).group())
                else:
                    int_day = int(v)

        results.append(
            DateValue(text=pc.match_text,
                      start=pc.start,
                      end=pc.end,
                      year=int_year,
                      month=int_month,
                      day=int_day))

    # sort results to match order in sentence
    results = sorted(results, key=lambda x: x.start)

    # convert to list of dicts to preserve field names in JSON output
    return json.dumps([r._asdict() for r in results], indent=4)
Ejemplo n.º 3
0
def run(text_in):
    """
    Scan the input text for lab values.

    The text is normalized with _cleanup_text, then each regex list in
    _all_regex_lists is handled independently: its matches are filtered,
    ordered longest-first and pruned with overlap.remove_overlap. The
    survivors from all lists are merged, ordered by start offset and passed
    through _resolve_overlap, whose result is returned (a list of
    finder_overlap.Candidate items).
    """

    cleaned = _cleanup_text(text_in)

    if _TRACE:
        log('\n*****\n TEXT: "{0}"\n*****\n'.format(cleaned))

    merged = []
    for regex_list in _all_regex_lists:
        found = []
        for regex in regex_list:
            for m in regex.finditer(cleaned):
                matched = m.group()
                if not matched:
                    # zero-length match, nothing to record
                    continue

                # a valid match starts with a dash or an alphanumeric char
                lead = matched[0]
                if not ('-' == lead or lead.isalnum()):
                    if _TRACE:
                        log('\tDiscarding (not isalnum): "{0}"'.format(
                            matched))
                    continue

                # reject matches whose first space-delimited word is a
                # stopword (single-word matches are never rejected here)
                first_word, space, _ = matched.partition(' ')
                if space and first_word in _stopwords:
                    if _TRACE:
                        log('\tDiscarding (stopword) "{0}"'.format(matched))
                    continue

                if _TRACE:
                    log('\tMATCH: "{0}"'.format(matched))
                    header = m.groupdict().get('header')
                    if header is not None:
                        log('\t\tHEADER: "{0}"'.format(header))

                # the regex is deliberately not stored with the candidate
                found.append(
                    overlap.Candidate(m.start(), m.end(), matched, None))

        if not found:
            continue

        # longest candidates first
        found.sort(key=lambda cand: cand.end - cand.start, reverse=True)

        if _TRACE:
            log('\tCandidate matches: ')
            for position, cand in enumerate(found):
                log('\t[{0:2}]\t[{1},{2}): {3}'.format(
                    position, cand.start, cand.end, cand.match_text,
                    cand.regex))
            log()

        pruned = overlap.remove_overlap(found, _TRACE)

        if _TRACE:
            log('\tCandidate count after overlap removal: {0}'.format(
                len(pruned)))
            log('\tPruned candidates: ')
            for cand in pruned:
                log('\t\t[{0},{1}): {2}'.format(cand.start, cand.end,
                                                cand.match_text))
            log()

        merged.extend(pruned)

    # order by occurrence in the text, then clear up whatever overlap
    # remains between results that came from different regex lists
    merged.sort(key=lambda cand: cand.start)
    return _resolve_overlap(merged)
Ejemplo n.º 4
0
def _resolve_overlap(result_list):
    """
    Remove any remaining overlap among the items in the list. The items are
    of type finder_overlap.Candidate. Assumes the list items are sorted in
    order of occurrence in the sentence.

    The list is processed in a single pass. Each item 'r' is compared only
    with 'f', the most recently accepted result, and is then kept, dropped,
    merged into f, or used to truncate/replace f.

    Returns a new list of Candidates; an empty input gives an empty list.
    Raises AssertionError if an overlapping r starts after f ends.
    """

    if 0 == len(result_list):
        return []

    if _TRACE:
        log('Called _resolve_overlap...')
        log('Candidates: ')
        for r in result_list:
            log('[{0:3}, {1:3}): {2}'.format(r.start, r.end, r.match_text))
        log()

    # the first item is always accepted; the rest are judged against it
    final_results = [result_list[0]]

    for i in range(1, len(result_list)):
        r = result_list[i]
        f = final_results[-1]
        # check for overlap with previous final result
        if not overlap.has_overlap(r.start, r.end, f.start, f.end):

            # if r begins with 'mean', 'avg', etc., append to f
            # (the merged text is a plain concatenation of the two match
            # texts, and the new end offset is derived from its length)
            match2 = _regex_avg.match(r.match_text)
            if match2:
                match_text = f.match_text + r.match_text
                new_f = overlap.Candidate(start=f.start,
                                          end=f.start + len(match_text),
                                          match_text=match_text,
                                          regex=f.regex,
                                          other=f.other)
                final_results[-1] = new_f
                if _TRACE:
                    log('\tAppending r to f: ')
                    log('\t\tf:[{0:3},{1:3}): {2}'.format(
                        f.start, f.end, f.match_text))
                    log('\t\tr:[{0:3},{1:3}): {2}'.format(
                        r.start, r.end, r.match_text))
                continue

            # r consists solely of digits, '.', '%' and at most one
            # trailing whitespace char (pattern anchored at both ends)
            match2 = re.match(r'\A[\d.%]+\s?\Z', r.match_text)
            if match2:
                # discard, value only
                if _TRACE:
                    log('\t\tvalue only, discarding "{0}"'.format(
                        r.match_text))
                continue

            if _TRACE: log('\tkeeping result {0}'.format(r.match_text))
            final_results.append(r)
            continue

        else:
            # has overlap with previous result

            if _TRACE:
                log('\tOverlap: ')
                log('\t\tf:[{0:3},{1:3}): {2}'.format(f.start, f.end,
                                                      f.match_text))
                log('\t\tr:[{0:3},{1:3}): {2}'.format(r.start, r.end,
                                                      r.match_text))

            # check if duplicate
            if r.start == f.start and r.end == f.end:
                # ignore duplicate
                if _TRACE: log('\t\tduplicate')
                continue

            # if r is a subset of f, ignore r
            # NOTE(review): this tests whether r's TEXT occurs anywhere in
            # f's text, not whether r's span lies inside f's span -- confirm
            # that is the intent
            if -1 != f.match_text.find(r.match_text):
                if _TRACE: log('\t\tr is a subset of f, ignoring r...')
                continue

            # partial overlap:
            #     |f.start---f.end|
            #              |r.start---r.end|
            #
            #     |f.start---f.end|
            #     |r.start--------------r.end|
            assert r.start <= f.end
            # number of chars at the tail of f that r also covers
            diff = f.end - r.start
            if _TRACE:
                log('\t\tdiff: {0}'.format(diff))
            assert diff >= 0
            # r is dropped when diff is zero; this also guards the slices
            # below, since [:-0] would yield an empty string
            if 0 == diff:
                continue

            # subtract 'diff' chars from the end of f
            # keep r intact
            match_text = f.match_text[:-diff]
            match = _regex_num.search(match_text)
            if not match:
                # remove final_result[-1] (only text, no numeric value remains)
                # replace with r
                if _TRACE:
                    log('\t\tignoring f, text only: "{0}"'.format(match_text))

                final_results[-1] = r
                continue

            if match_text.endswith('/'):
                # discard r, f would be left as a fragment
                # (f itself stays in final_results untruncated)
                if _TRACE:
                    log('\t\tavoiding fragment, discarding "{0}"'.format(
                        r.match_text))
                continue

            # NOTE(review): unlike the anchored \A...\Z check in the
            # non-overlap branch, this pattern is only anchored at the start,
            # so it fires for any truncated text that BEGINS with digits,
            # '.' or '%'. When it fires, r is dropped and f is left
            # untruncated -- confirm this is intended.
            match = re.match(r'[\d.%]+\s?', match_text)
            if match:
                # discard, value only
                if _TRACE:
                    log('\t\tvalue only, discarding "{0}"'.format(match_text))
                continue

            # f truncated so that it ends where r begins
            new_f = overlap.Candidate(start=f.start,
                                      end=f.end - diff,
                                      match_text=f.match_text[:-diff],
                                      regex=f.regex,
                                      other=f.other)

            if new_f.start == new_f.end:
                if _TRACE:
                    log('\t\tzero span')
                continue

            # if identical to prior elt, ignore
            # (compares the truncated f with the result accepted before f)
            if len(final_results) >= 2:
                f2 = final_results[-2]
                if new_f.start == f2.start and \
                   new_f.end   == f2.end   and \
                   new_f.match_text == f2.match_text:
                    if _TRACE:
                        log('\t\tidentical to prior elt, ignoring...')
                    continue

            if _TRACE:
                log('\t\tOverwriting f with new_f: ')
                log('\t\tnew_f:[{0:3},{1:3}): {2}'.format(
                    new_f.start, new_f.end, new_f.match_text))

            # keep both: the truncated f followed by the intact r
            final_results[-1] = new_f
            final_results.append(r)

    return final_results
Ejemplo n.º 5
0
def _regex_match(sentence, regex_list):
    """
    Match every regex in regex_list against the sentence and return a list
    of nonoverlapping overlap.Candidate items. The regex match object for
    each candidate is stored in its 'other' field.

    Processing steps:
        1. Collect all matches, skipping any whose text appears to span a
           sentence boundary.
        2. Sort longest-first.
        3. For the first adjacent pair with identical spans where both have
           a 'device' group value, delete the one with the shorter device
           string.
        4. Discard candidates contained within an earlier (longer or equal)
           candidate.
        5. If any 'complete' candidates exist, keep only those and select a
           maximal set of nonoverlapping ones; otherwise run
           overlap.remove_overlap.

    Returns an empty list if nothing matched.
    """

    candidates = []
    for i, regex in enumerate(regex_list):
        for match in regex.finditer(sentence):
            raw_text = match.group()
            match_text = raw_text.strip()

            # Special case for regex index 6. Prevent a match on the bracketed
            # portion of something like this:
            #       "<RA. Pt was initially satting 95%> on NRB.".
            #
            # In other words, the sentence segmentation should have started
            # a new sentence at "Pt", in which case the match would be correct.
            # (The check itself is applied to matches from every regex.)
            if re.search(r'\.\s[A-Z][a-z]+', match_text):
                continue

            # strip() may remove LEADING whitespace as well; advance the
            # start offset by that amount so [start, end) still brackets
            # match_text in the sentence
            start = match.start() + len(raw_text) - len(raw_text.lstrip())
            end = start + len(match_text)
            candidates.append(
                overlap.Candidate(start, end, match_text, regex, other=match))
            if _TRACE:
                print('[{0:2}]: [{1:3}, {2:3})\tMATCH TEXT: ->{3}<-'.format(
                    i, start, end, match_text))
                print('\tmatch.groupdict entries: ')
                for k, v in match.groupdict().items():
                    print('\t\t{0} => {1}'.format(k, v))

    if not candidates:
        return []

    # sort the candidates in descending order of length, which is needed for
    # one-pass overlap resolution later on
    candidates = sorted(candidates,
                        key=lambda x: x.end - x.start,
                        reverse=True)

    if _TRACE:
        print('\tCandidate matches: ')
        for index, c in enumerate(candidates):
            print('\t[{0:2}]\t[{1},{2}): {3}'.format(index, c.start, c.end,
                                                     c.match_text))
        print()

    # if two overlap exactly, keep candidate with longer device string;
    # only the first qualifying adjacent pair is handled
    delete_index = None
    for i in range(1, len(candidates)):
        prev = candidates[i - 1]
        c = candidates[i]
        if c.start != prev.start or c.end != prev.end:
            continue

        if _TRACE:
            print('\tCandidates at indices {0} and {1} have '
                  'identical overlap'.format(i - 1, i))

        # the regex match object is stored in the 'other' field; get()
        # yields None both for a missing group and a non-participating one
        device = c.other.groupdict().get('device')
        device_prev = prev.other.groupdict().get('device')
        if device is None or device_prev is None:
            continue

        if _TRACE:
            print('\t\tdevice string for index {0}: {1}'.format(
                i - 1, device_prev))
            print('\t\tdevice string for index {0}: {1}'.format(
                i, device))

        # on a tie the later candidate is the one removed
        delete_index = i - 1 if len(device) > len(device_prev) else i
        if _TRACE:
            print('\t\t\tdelete_index: {0}'.format(delete_index))
        break

    if delete_index is not None:
        del candidates[delete_index]
        if _TRACE:
            print(
                '\tRemoved candidate at index {0} with shorter device string'.
                format(delete_index))

    # remove any that are proper substrings of another, exploiting the fact
    # that the candidate list is sorted in decreasing order of length
    survivors = []
    for i, c in enumerate(candidates):
        for j in range(i):
            earlier = candidates[j]
            if c.start >= earlier.start and c.end <= earlier.end:
                if _TRACE:
                    print('\t[{0:2}] is a substring of [{1}], discarding...'.
                          format(i, j))
                break
        else:
            # not contained in any earlier candidate
            survivors.append(c)

    candidates = survivors

    # Check for any 'complete' candidates. A complete candidate has an O2
    # saturation value, a device, and a flow rate. If one or more of these,
    # restrict consideration to these only.

    counted_groups = ('val', 'flow_rate', 'flow_rate2', 'flow_rate3', 'device')
    complete_candidates = []
    for c in candidates:
        # match object stored in the 'other' field
        gd = c.other.groupdict()
        count = sum(1 for name in counted_groups if gd.get(name) is not None)

        # room air will not have a flow rate, but it is nonetheless complete
        device_str = gd.get('device')
        if device_str is not None and ' air' in device_str:
            count += 1

        if count >= 3:
            complete_candidates.append(c)
            if _TRACE:
                print('\tFound complete candidate "{0}"'.format(c.match_text))

    if complete_candidates:
        # Now find the maximum number of non-overlapping candidates. This is an
        # instance of the equal-weight interval scheduling problem, which has an
        # optimal greedy solution. See the book "Algorithm Design" by Kleinberg and
        # Tardos, ch. 4.

        # sort candidates in increasing order of their END points
        candidates = sorted(complete_candidates, key=lambda x: x.end)

        pruned_candidates = [candidates[0]]
        prev_end = candidates[0].end
        for c in candidates[1:]:
            # accept c only if it begins at or after the last accepted end
            if c.start >= prev_end:
                pruned_candidates.append(c)
                prev_end = c.end

    else:
        # run the usual overlap resolution
        pruned_candidates = overlap.remove_overlap(candidates, _TRACE)

    if _TRACE:
        print('\tcandidate count after overlap removal: {0}'.format(
            len(pruned_candidates)))
        print('\tPruned candidates: ')
        for c in pruned_candidates:
            print('\t\t[{0},{1}): {2}'.format(c.start, c.end, c.match_text))
        print()

    return pruned_candidates
Ejemplo n.º 6
0
def _regex_match(sentence, regex_list):
    """
    Run every regex in regex_list against the sentence and return the
    matches that survive overlap resolution.

    Each match becomes an overlap.Candidate spanning [start, end), where the
    matched text has its trailing whitespace removed. The candidate's 'other'
    field is a dict with two entries:

        _KEY_SCHOOL: the NAME of the first school group that captured text,
                     checked in the order _SCHOOL_COLLEGE, _SCHOOL_HS,
                     _SCHOOL_ELEM, or None if none did
        _KEY_DEGREE: the NAME of the first degree group that captured text,
                     checked in the order _DEG_DOCTORAL, _DEG_MASTERS,
                     _DEG_BATCHELORS, _DEG_GED, or None if none did

    Returns the result of overlap.remove_overlap (called with
    keep_longest=True) on the candidates sorted longest-first.
    """

    # group names in priority order; the first one with captured text wins
    school_groups = (_SCHOOL_COLLEGE, _SCHOOL_HS, _SCHOOL_ELEM)
    degree_groups = (_DEG_DOCTORAL, _DEG_MASTERS, _DEG_BATCHELORS, _DEG_GED)

    candidates = []
    for regex in regex_list:
        for match in regex.finditer(sentence):
            # strip any trailing whitespace (invalidates match.end(), so the
            # end offset is recomputed from the trimmed text)
            match_text = match.group().rstrip()
            start = match.start()
            end = start + len(match_text)

            # groupdict() maps a non-participating named group to None and
            # has no entry for a group the regex does not define
            groups = match.groupdict()

            # isolate the school, if any (the group name is recorded, not
            # the captured text)
            school_text = next(
                (name for name in school_groups
                 if groups.get(name) is not None), None)

            # likewise for the degree
            degree_text = next(
                (name for name in degree_groups
                 if groups.get(name) is not None), None)

            info_dict = {
                _KEY_SCHOOL: school_text,
                _KEY_DEGREE: degree_text,
            }

            candidates.append(
                overlap.Candidate(start,
                                  end,
                                  match_text,
                                  regex,
                                  other=info_dict))

    # sort the candidates in DECREASING order of length (longest first);
    # reverse=True is required, a plain sorted() call would be ascending
    candidates = sorted(candidates,
                        key=lambda x: x.end - x.start,
                        reverse=True)

    if _TRACE:
        DISPLAY('\tCandidate matches: ')
        for index, c in enumerate(candidates):
            regex_index = regex_list.index(c.regex)
            DISPLAY('\t[{0:2}] R{1:2}\t[{2},{3}): ->{4}<-'.format(
                index, regex_index, c.start, c.end, c.match_text))
        DISPLAY()

    # keep the longest of any overlapping matches
    pruned_candidates = overlap.remove_overlap(candidates,
                                               False,
                                               keep_longest=True)

    if _TRACE:
        DISPLAY('\tCandidate matches after overlap resolution: ')
        for index, c in enumerate(pruned_candidates):
            regex_index = regex_list.index(c.regex)
            DISPLAY('\t[{0:2}] R{1:2}\t[{2},{3}): ->{4}<-'.format(
                index, regex_index, c.start, c.end, c.match_text))
        DISPLAY()

    return pruned_candidates
Ejemplo n.º 7
0
def _regex_match(sentence, regex_list):
    """
    Run every regex in regex_list against the sentence and return the
    matches that survive overlap resolution.

    Each match is classified by the first of the named groups 'unemployed',
    'disabled', 'retired', 'employed' (checked in that order) that captured
    text; the corresponding EMPLOYMENT_STATUS_* constant, or None if no such
    group captured text, is stored in the candidate's 'other' field. Only
    the first match found for each status value is kept.

    Returns the result of overlap.remove_overlap (called with
    keep_longest=True) on the candidates sorted longest-first.
    """

    # (group name, status) pairs in priority order
    status_by_group = (
        ('unemployed', EMPLOYMENT_STATUS_UNEMPLOYED),
        ('disabled', EMPLOYMENT_STATUS_DISABLED),
        ('retired', EMPLOYMENT_STATUS_RETIRED),
        ('employed', EMPLOYMENT_STATUS_EMPLOYED),
    )

    candidates = []
    for regex in regex_list:
        # expect only a single match
        for match in regex.finditer(sentence):
            # strip any trailing whitespace (invalidates match.end(), so the
            # end offset is recomputed from the trimmed text)
            match_text = match.group().rstrip()
            start = match.start()
            end = start + len(match_text)

            if _TRACE:
                DISPLAY('\t' + match_text)

            # get group name to determine employment status; groupdict()
            # maps a non-participating named group to None
            groups = match.groupdict()
            status = next(
                (value for name, value in status_by_group
                 if groups.get(name) is not None), None)

            # append the new match if no other candidates of the same type
            if any(c.other == status for c in candidates):
                continue

            candidates.append(
                overlap.Candidate(start,
                                  end,
                                  match_text,
                                  regex,
                                  other=status))

    # sort candidates in DECREASING order of length (longest first);
    # reverse=True is required, a plain sorted() call would be ascending
    candidates = sorted(candidates,
                        key=lambda x: x.end - x.start,
                        reverse=True)

    if _TRACE:
        DISPLAY('\tCandidate matches: ')
        for index, c in enumerate(candidates):
            regex_index = regex_list.index(c.regex)
            DISPLAY('\t[{0:2}] R{1:2}\t[{2},{3}): ->{4}<-'.format(
                index, regex_index, c.start, c.end, c.match_text))
        DISPLAY()

    # keep the longest of any overlapping matches
    pruned_candidates = overlap.remove_overlap(candidates,
                                               False,
                                               keep_longest=True)

    if _TRACE:
        DISPLAY('\tCandidate matches after overlap resolution: ')
        for index, c in enumerate(pruned_candidates):
            regex_index = regex_list.index(c.regex)
            DISPLAY('\t[{0:2}] R{1:2}\t[{2},{3}): ->{4}<-'.format(
                index, regex_index, c.start, c.end, c.match_text))
        DISPLAY()

    return pruned_candidates
Ejemplo n.º 8
0
def run(sentence):
    """
    Find time expressions in the sentence by attempting to match all regexes.
    Avoid matching sub-expressions of already-matched strings.

    The sentence is first passed through _clean_sentence; all reported
    offsets refer to that cleaned sentence. For ISO datetime matches only
    the portion after the 'T' is reported.

    Returns a JSON string (indent=4) encoding an array with one object per
    time found, ordered by start offset. Each object has the TimeValue
    fields: text, start, end, hours, minutes, seconds, fractional_seconds,
    am_pm, timezone, gmt_delta_sign, gmt_delta_hours, gmt_delta_minutes.
    A component that the match does not supply is left as EMPTY_FIELD.

    Raises AssertionError if an ISO datetime match contains no 'T' or if a
    pruned candidate's text cannot be re-matched by its regex.
    """

    results = []  # TimeValue namedtuple results
    candidates = []  # potential matches, need overlap resolution to confirm

    original_sentence = sentence
    sentence = _clean_sentence(sentence)

    if _TRACE:
        log('original: {0}'.format(original_sentence))
        log(' cleaned: {0}'.format(sentence))

    for regex_index, regex in enumerate(_regexes):
        for match in regex.finditer(sentence):
            raw_text = match.group()
            match_text = raw_text.strip()

            # strip() may remove LEADING whitespace as well; advance the
            # start offset by that amount so [start, end) still brackets
            # match_text in the sentence
            start = match.start() + len(raw_text) - len(raw_text.lstrip())

            if _ISO_DATETIME_REGEX_INDEX == regex_index:
                # extract only the time portion, i.e. what follows the 'T',
                # and move the start offset past the date and the 'T'
                t_pos = match_text.find('T')
                assert -1 != t_pos
                match_text = match_text[t_pos + 1:]
                start += t_pos + 1

            end = start + len(match_text)
            candidates.append(overlap.Candidate(start, end, match_text, regex))
            if _TRACE:
                log('[{0:2}]: [{1:3}, {2:3})\tMATCH TEXT: ->{3}<-'.format(
                    regex_index, start, end, match_text))

    # sort the candidates in descending order of length, which is needed for
    # one-pass overlap resolution later on
    candidates = sorted(candidates,
                        key=lambda x: x.end - x.start,
                        reverse=True)

    if _TRACE:
        log('\tCandidate matches: ')
        for index, c in enumerate(candidates):
            log('\t[{0:2}]\t[{1},{2}): {3}'.format(index, c.start, c.end,
                                                   c.match_text))
        log()

    pruned_candidates = overlap.remove_overlap(candidates, _TRACE)

    if _TRACE:
        log('\tcandidates count after overlap removal: {0}'.format(
            len(pruned_candidates)))
        log('\tPruned candidates: ')
        for c in pruned_candidates:
            log('\t\t[{0},{1}): {2}'.format(c.start, c.end, c.match_text))
        log()
        log('Extracting data from pruned candidates...')

    for pc in pruned_candidates:

        # use the saved regex to match the saved text again
        if _regex_iso_datetime == pc.regex:
            # the saved text holds only the time portion, so match it with
            # the time-only regex
            match = _regex_iso_time.match(pc.match_text)
        else:
            match = pc.regex.match(pc.match_text)
        assert match

        int_hours = EMPTY_FIELD
        int_minutes = EMPTY_FIELD
        int_seconds = EMPTY_FIELD
        frac_seconds = EMPTY_FIELD
        am_pm = EMPTY_FIELD
        timezone = EMPTY_FIELD
        gmt_delta_sign = EMPTY_FIELD
        gmt_delta_hours = EMPTY_FIELD
        gmt_delta_minutes = EMPTY_FIELD

        for k, v in match.groupdict().items():
            if v is None:
                # named group did not participate in the match
                continue
            if 'hours' == k:
                int_hours = int(v)
            elif 'minutes' == k:
                int_minutes = int(v)
            elif 'seconds' == k:
                int_seconds = int(v)
            elif 'frac' == k:
                # leave as a string; conversion needs to handle leading zeros
                # (the first char of the captured text is dropped)
                frac_seconds = v[1:]
            elif 'am_pm' == k:
                # anything containing an 'a' or 'A' is AM, all else is PM
                am_pm = STR_AM if ('a' in v or 'A' in v) else STR_PM
            elif 'timezone' == k:
                timezone = 'UTC' if 'Z' == v else v
            elif 'gmt_delta' == k:
                # break the offset text into sign, hours and minutes
                match_gmt = _regex_gmt.search(v)
                if match_gmt:
                    for k2, v2 in match_gmt.groupdict().items():
                        if v2 is None:
                            continue
                        if 'gmt_sign' == k2:
                            gmt_delta_sign = v2
                        elif 'gmt_hours' == k2:
                            gmt_delta_hours = int(v2)
                        elif 'gmt_minutes' == k2:
                            gmt_delta_minutes = int(v2)

        results.append(
            TimeValue(text=pc.match_text,
                      start=pc.start,
                      end=pc.end,
                      hours=int_hours,
                      minutes=int_minutes,
                      seconds=int_seconds,
                      fractional_seconds=frac_seconds,
                      am_pm=am_pm,
                      timezone=timezone,
                      gmt_delta_sign=gmt_delta_sign,
                      gmt_delta_hours=gmt_delta_hours,
                      gmt_delta_minutes=gmt_delta_minutes))

    # sort results to match order of occurrence in sentence
    results = sorted(results, key=lambda x: x.start)

    # convert to list of dicts to preserve field names in JSON output
    return json.dumps([r._asdict() for r in results], indent=4)
Ejemplo n.º 9
0
def _regex_match(sentence, regex_list):
    """
    Collect language matches in `sentence` and resolve overlapping ones.

    If `_regex_neg_language` matches, the first such span is cut out of the
    sentence before any regex in `regex_list` is applied, so negated
    languages cannot produce candidates.

    Every regex in `regex_list` is run with finditer(). Each match becomes
    an overlap.Candidate holding the start offset, the end offset (computed
    after trailing whitespace is removed from the matched text), the matched
    text, the regex that produced it, and, in `other`, the stripped text of
    the named group 'languages'. Each regex is therefore expected to define
    a 'languages' group that participates in every match.

    NOTE: when a negated span was removed, the candidate offsets index into
    the shortened sentence, not into the sentence passed by the caller.

    Returns whatever overlap.remove_overlap() returns for the candidates,
    called with keep_longest=True.
    """

    def _trace_candidates(title, items):
        # dump one line per candidate: position, regex index, span, text
        DISPLAY(title)
        for index, c in enumerate(items):
            regex_index = regex_list.index(c.regex)
            DISPLAY('\t[{0:2}] R{1:2}\t[{2},{3}): ->{4}<-'.format(
                index, regex_index, c.start, c.end, c.match_text))
        DISPLAY()

    # erase any negated languages from the sentence, then attempt regexes
    neg_match = _regex_neg_language.search(sentence)
    if neg_match:
        if _TRACE:
            DISPLAY('NEG LANGUAGE MATCH: "{0}"'.format(neg_match.group()))
        sentence = sentence[:neg_match.start()] + sentence[neg_match.end():]

    candidates = []
    for regex in regex_list:
        for match in regex.finditer(sentence):
            # strip any trailing whitespace (invalidates match.end())
            match_text = match.group().rstrip()
            start = match.start()
            end = start + len(match_text)

            # isolate the matching language(s)
            language_text = match.group('languages').strip()

            candidates.append(
                overlap.Candidate(start,
                                  end,
                                  match_text,
                                  regex,
                                  other=language_text))

    # sort the candidates in DECREASING order of length; reverse=True is
    # required for that (a plain sorted() call orders shortest-first)
    candidates = sorted(candidates,
                        key=lambda x: x.end - x.start,
                        reverse=True)

    if _TRACE:
        _trace_candidates('\tCandidate matches: ', candidates)

    # keep the longest of any overlapping matches
    pruned_candidates = overlap.remove_overlap(candidates,
                                               False,
                                               keep_longest=True)

    if _TRACE:
        _trace_candidates('\tCandidate matches after overlap resolution: ',
                          pruned_candidates)

    return pruned_candidates
Ejemplo n.º 10
0
def _regex_match(sentence, regex_list):
    """
    Collect religion matches in `sentence` and resolve overlapping ones.

    Every regex in `regex_list` is run with finditer(). Each match becomes
    an overlap.Candidate holding the start offset, the end offset (computed
    after trailing whitespace is removed from the matched text), the matched
    text, the regex that produced it, and, in `other`, the stripped text of
    the named group 'religion'. Each regex is therefore expected to define
    a 'religion' group that participates in every match.

    If none of the regexes match, `_regex_official` is searched once: a
    match containing 'imam' yields a candidate with other='islam', and one
    containing 'rabbi' or 'rabi' yields other='judaism' (compared without
    regard to case). Any other match is ignored.

    Returns whatever overlap.remove_overlap() returns for the candidates,
    called with keep_longest=True.
    """

    def _trace_candidates(title, items):
        # dump one line per candidate: position, regex index, span, text
        DISPLAY(title)
        for index, c in enumerate(items):
            # an inferred candidate carries _regex_official when regex_list
            # is empty; report -1 instead of failing the index lookup
            if c.regex in regex_list:
                regex_index = regex_list.index(c.regex)
            else:
                regex_index = -1
            DISPLAY('\t[{0:2}] R{1:2}\t[{2},{3}): ->{4}<-'.format(
                index, regex_index, c.start, c.end, c.match_text))
        DISPLAY()

    candidates = []
    # remembered explicitly so the fallback below does not depend on a
    # loop variable that is unbound when regex_list is empty
    last_regex = None
    for regex in regex_list:
        last_regex = regex
        for match in regex.finditer(sentence):
            # strip any trailing whitespace (invalidates match.end())
            match_text = match.group().rstrip()
            start = match.start()
            end = start + len(match_text)

            religion_text = match.group('religion').strip()
            candidates.append(
                overlap.Candidate(start,
                                  end,
                                  match_text,
                                  regex,
                                  other=religion_text))

    # if no matches, try to infer religion from presence of religious official(s)
    if not candidates:
        match = _regex_official.search(sentence)
        if match:
            raw_text = match.group()
            match_text = raw_text.strip()
            # strip() also drops leading whitespace, so move start past it
            # to keep [start, end) aligned with match_text
            start = match.start() + len(raw_text) - len(raw_text.lstrip())
            end = start + len(match_text)

            # compare case-insensitively so 'Imam' / 'Rabbi' are recognized
            lowered = match_text.lower()
            religion_text = None
            if 'imam' in lowered:
                religion_text = 'islam'
            elif 'rabbi' in lowered or 'rabi' in lowered:
                religion_text = 'judaism'

            if religion_text is not None:
                # keep tagging the candidate with the last regex of
                # regex_list, as before, so lookups in regex_list still
                # succeed; use _regex_official only for an empty list
                if last_regex is not None:
                    source_regex = last_regex
                else:
                    source_regex = _regex_official
                candidates.append(
                    overlap.Candidate(start,
                                      end,
                                      match_text,
                                      source_regex,
                                      other=religion_text))

    # sort candidates in DECREASING order of length; reverse=True is
    # required for that (a plain sorted() call orders shortest-first)
    candidates = sorted(candidates,
                        key=lambda x: x.end - x.start,
                        reverse=True)

    if _TRACE:
        _trace_candidates('\tCandidate matches: ', candidates)

    # keep the longest of any overlapping matches
    pruned_candidates = overlap.remove_overlap(candidates,
                                               False,
                                               keep_longest=True)

    if _TRACE:
        _trace_candidates('\tCandidate matches after overlap resolution: ',
                          pruned_candidates)

    return pruned_candidates