def _regex_match(sentence, regex_list):
    """
    Run every regex in regex_list against the sentence and return the
    matches that survive overlap resolution.

    Each match becomes an overlap.Candidate spanning [start, end), where the
    end excludes any trailing whitespace in the matched text. The candidate's
    'other' field holds the text captured by the _GROUP_HOUSING named group,
    or None if the regex has no such group or the group did not participate.

    Returns the list produced by overlap.remove_overlap.
    """

    candidates = []
    for regex in regex_list:
        for match in regex.finditer(sentence):
            # strip any trailing whitespace (invalidates match.end())
            match_text = match.group().rstrip()
            start = match.start()
            end = start + len(match_text)

            housing = None
            if _GROUP_HOUSING in match.groupdict():
                housing = match.group(_GROUP_HOUSING)

            candidates.append(
                overlap.Candidate(start, end, match_text, regex,
                                  other=housing))

    # Sort the candidates in DECREASING order of length. Without reverse=True
    # the sort is ascending, which contradicts the stated intent.
    # NOTE(review): longest-first is presumably what overlap.remove_overlap
    # expects -- confirm against that helper.
    candidates = sorted(candidates,
                        key=lambda x: x.end - x.start,
                        reverse=True)

    if _TRACE:
        DISPLAY('\tCandidate matches: ')
        for index, c in enumerate(candidates):
            regex_index = regex_list.index(c.regex)
            DISPLAY('\t[{0:2}] R{1:2}\t[{2},{3}): ->{4}<-'.format(
                index, regex_index, c.start, c.end, c.match_text))
        DISPLAY()

    # keep the longest of any overlapping matches
    pruned_candidates = overlap.remove_overlap(candidates,
                                               False,
                                               keep_longest=True)

    if _TRACE:
        DISPLAY('\tCandidate matches after overlap resolution: ')
        for index, c in enumerate(pruned_candidates):
            regex_index = regex_list.index(c.regex)
            DISPLAY('\t[{0:2}] R{1:2}\t[{2},{3}): ->{4}<-'.format(
                index, regex_index, c.start, c.end, c.match_text))
        DISPLAY()

    return pruned_candidates
def run(sentence):
    """
    Scan a sentence for date expressions using every regex in _regexes.

    Overlapping matches are resolved with overlap.remove_overlap so that a
    sub-expression of an already-matched string is not reported separately.
    Each surviving match is matched again with its own regex (or, for ISO
    datetimes, with the date-only regex) to extract the year, month and day
    fields. Fields that are absent keep the value EMPTY_FIELD.

    Returns a JSON array (as a string) with one object per date found,
    ordered by start offset in the cleaned sentence.
    """

    cleaned = _clean_sentence(sentence)

    if _TRACE:
        log('(DF) original: {0}'.format(sentence))
        log('(DF) cleaned: {0}'.format(cleaned))

    # every regex hit is only a potential match until overlap is resolved
    hits = []
    for regex_index, regex in enumerate(_regexes):
        for match in regex.finditer(cleaned):
            text = match.group().strip()
            if _ISO_DATETIME_REGEX_INDEX == regex_index:
                # keep only the date portion that precedes the 'T'
                t_pos = text.find('T')
                assert -1 != t_pos
                text = text[:t_pos]
            begin = match.start()
            hits.append(
                overlap.Candidate(begin, begin + len(text), text, regex))
            if _TRACE:
                log('\t[{0:2}]: MATCH TEXT: ->{1}<-'.format(
                    regex_index, text))

    # longest first, which the one-pass overlap resolution below relies on
    hits.sort(key=lambda h: h.end - h.start, reverse=True)

    if _TRACE:
        log('\tCandidate matches: ')
        for index, c in enumerate(hits):
            log('\t[{0:2}]\t[{1},{2}): {3}'.format(index, c.start, c.end,
                                                   c.match_text, c.regex))
        log()

    survivors = overlap.remove_overlap(hits, _TRACE)

    if _TRACE:
        log('\tcandidates count after overlap removal: {0}'.format(
            len(survivors)))
        log('\tPruned candidates: ')
        for c in survivors:
            log('\t\t[{0},{1}): {2}'.format(c.start, c.end, c.match_text))
        log()
        log('Extracting data from pruned candidates...')

    date_values = []
    for pc in survivors:
        # re-match the saved text; ISO datetimes were truncated to the date
        # portion above, so they need the date-only regex
        if _regex_iso_datetime == pc.regex:
            rematch = _regex_iso_2.match(pc.match_text)
        else:
            rematch = pc.regex.match(pc.match_text)
        assert rematch

        year = month = day = EMPTY_FIELD

        if _TRACE:
            log('\t matched: "{0}"'.format(rematch.group()))
            log('\t\t groupdict: {0}'.format(rematch.groupdict()))
            log('\t\tmatch_text: "{0}"'.format(pc.match_text))

        for name, value in rematch.groupdict().items():
            if value is None:
                continue
            if 'year' == name:
                year = int(value)
            elif 'month' == name:
                if re.search(r'\D', value):
                    # textual month, look up its number
                    month = month_dict[value.strip().lower()]
                else:
                    month = int(value)
            elif 'day' == name:
                if re.search(r'\D', value):
                    # ordinal such as 1st or 3rd, keep only the digits
                    day = int(re.search(r'\d+', value).group())
                else:
                    day = int(value)

        date_values.append(
            DateValue(text=pc.match_text,
                      start=pc.start,
                      end=pc.end,
                      year=year,
                      month=month,
                      day=day))

    # report in order of occurrence; dicts preserve field names in the JSON
    date_values.sort(key=lambda dv: dv.start)
    return json.dumps([dv._asdict() for dv in date_values], indent=4)
def run(text_in):
    """
    Search the input text for lab values.

    Every regex list in _all_regex_lists is processed independently: its
    matches are filtered, sorted longest-first and passed through
    overlap.remove_overlap. The survivors from all lists are then merged,
    ordered by start offset and passed through _resolve_overlap.

    Returns a list of finder_overlap.Candidate results.
    """

    text = _cleanup_text(text_in)

    if _TRACE:
        log('\n*****\n TEXT: "{0}"\n*****\n'.format(text))

    merged = []
    for regex_list in _all_regex_lists:

        found = []
        for regex in regex_list:
            for match in regex.finditer(text):
                match_text = match.group()
                if not match_text:
                    continue

                # valid matches must begin with an alphanumeric char or a dash
                lead = match_text[0]
                if not ('-' == lead or lead.isalnum()):
                    if _TRACE:
                        log('\tDiscarding (not isalnum): "{0}"'.format(
                            match_text))
                    continue

                # discard if the first space-delimited word is a stopword
                first_word, space, _ = match_text.partition(' ')
                if space and first_word in _stopwords:
                    if _TRACE:
                        log('\tDiscarding (stopword) "{0}"'.format(match_text))
                    continue

                if _TRACE:
                    log('\tMATCH: "{0}"'.format(match_text))
                    header = match.groupdict().get('header')
                    if header is not None:
                        log('\t\tHEADER: "{0}"'.format(header))

                found.append(
                    overlap.Candidate(match.start(), match.end(),
                                      match_text, None))

        # longest first, for the overlap resolution below
        found.sort(key=lambda c: c.end - c.start, reverse=True)

        if found:
            if _TRACE:
                log('\tCandidate matches: ')
                for index, c in enumerate(found):
                    log('\t[{0:2}]\t[{1},{2}): {3}'.format(
                        index, c.start, c.end, c.match_text, c.regex))
                log()

            pruned = overlap.remove_overlap(found, _TRACE)

            if _TRACE:
                log('\tCandidate count after overlap removal: {0}'.format(
                    len(pruned)))
                log('\tPruned candidates: ')
                for c in pruned:
                    log('\t\t[{0},{1}): {2}'.format(c.start, c.end,
                                                   c.match_text))
                log()

            merged.extend(pruned)

    # order by occurrence in the text, then resolve any overlap that remains
    # between results that came from different regex lists
    merged.sort(key=lambda c: c.start)
    return _resolve_overlap(merged)
def _resolve_overlap(result_list):
    """
    Remove any remaining overlap among the items in the list. The items are
    of type finder_overlap.Candidate. Assumes the list items are sorted in
    order of occurrence in the sentence.

    The list is processed in a single pass. Each item 'r' is compared only
    with 'f', the most recently accepted result, and is appended, merged
    into f, used to replace f, or dropped.

    Returns a new list of candidates; returns [] for an empty input.
    """

    if 0 == len(result_list):
        return []

    if _TRACE:
        log('Called _resolve_overlap...')
        log('Candidates: ')
        for r in result_list:
            log('[{0:3}, {1:3}): {2}'.format(r.start, r.end, r.match_text))
        log()

    # the first item is always accepted; all others are compared against
    # the last accepted item
    final_results = [result_list[0]]

    for i in range(1, len(result_list)):
        r = result_list[i]
        f = final_results[-1]
        # check for overlap with previous final result
        if not overlap.has_overlap(r.start, r.end, f.start, f.end):
            # if r begins with 'mean', 'avg', etc., append to f
            match2 = _regex_avg.match(r.match_text)
            if match2:
                # The merged text is a plain concatenation, and the new end
                # is derived from its length, so any gap between f.end and
                # r.start in the sentence is not reflected in the span.
                match_text = f.match_text + r.match_text
                new_f = overlap.Candidate(start=f.start,
                                          end=f.start + len(match_text),
                                          match_text=match_text,
                                          regex=f.regex,
                                          other=f.other)
                final_results[-1] = new_f
                if _TRACE:
                    log('\tAppending r to f: ')
                    log('\t\tf:[{0:3},{1:3}): {2}'.format(
                        f.start, f.end, f.match_text))
                    log('\t\tr:[{0:3},{1:3}): {2}'.format(
                        r.start, r.end, r.match_text))
                continue

            # r consists solely of digits, '.', '%' and one optional
            # trailing whitespace char
            match2 = re.match(r'\A[\d.%]+\s?\Z', r.match_text)
            if match2:
                # discard, value only
                if _TRACE:
                    log('\t\tvalue only, discarding "{0}"'.format(
                        r.match_text))
                continue

            if _TRACE:
                log('\tkeeping result {0}'.format(r.match_text))
            final_results.append(r)
            continue
        else:
            # has overlap with previous result
            if _TRACE:
                log('\tOverlap: ')
                log('\t\tf:[{0:3},{1:3}): {2}'.format(f.start, f.end,
                                                      f.match_text))
                log('\t\tr:[{0:3},{1:3}): {2}'.format(r.start, r.end,
                                                      r.match_text))

            # check if duplicate
            if r.start == f.start and r.end == f.end:
                # ignore duplicate
                if _TRACE:
                    log('\t\tduplicate')
                continue

            # if r is a subset of f, ignore r
            # (this is a substring test on the matched text, not a
            # comparison of the spans)
            if -1 != f.match_text.find(r.match_text):
                if _TRACE:
                    log('\t\tr is a subset of f, ignoring r...')
                continue

            # partial overlap:
            #
            #     |f.start---f.end|
            #            |r.start---r.end|
            #
            #     |f.start---f.end|
            #            |r.start--------------r.end|
            assert r.start <= f.end
            # number of chars at the end of f that are also covered by r
            diff = f.end - r.start
            if _TRACE:
                log('\t\tdiff: {0}'.format(diff))
            assert diff >= 0
            if 0 == diff:
                # nothing to trim; r is dropped
                continue

            # subtract 'diff' chars from the end of f
            # keep r intact
            match_text = f.match_text[:-diff]
            match = _regex_num.search(match_text)
            if not match:
                # remove final_result[-1] (only text, no numeric value remains)
                # replace with r
                if _TRACE:
                    log('\t\tignoring f, text only: "{0}"'.format(match_text))
                final_results[-1] = r
                continue

            if match_text.endswith('/'):
                # discard r, f would be left as a fragment
                if _TRACE:
                    log('\t\tavoiding fragment, discarding "{0}"'.format(
                        r.match_text))
                continue

            # NOTE(review): this pattern is anchored only at the start, so it
            # matches any trimmed text that begins with a digit, '.' or '%'.
            # On a match r is dropped and f is left untrimmed, although the
            # trace message reports the trimmed f text. Confirm the intent.
            match = re.match(r'[\d.%]+\s?', match_text)
            if match:
                # discard, value only
                if _TRACE:
                    log('\t\tvalue only, discarding "{0}"'.format(match_text))
                continue

            # f trimmed so that it ends where r begins
            new_f = overlap.Candidate(start=f.start,
                                      end=f.end - diff,
                                      match_text=f.match_text[:-diff],
                                      regex=f.regex,
                                      other=f.other)
            if new_f.start == new_f.end:
                if _TRACE:
                    log('\t\tzero span')
                continue

            # if identical to prior elt, ignore
            if len(final_results) >= 2:
                f2 = final_results[-2]
                if new_f.start == f2.start and \
                   new_f.end == f2.end and \
                   new_f.match_text == f2.match_text:
                    if _TRACE:
                        log('\t\tidentical to prior elt, ignoring...')
                    continue

            if _TRACE:
                log('\t\tOverwriting f with new_f: ')
                log('\t\tnew_f:[{0:3},{1:3}): {2}'.format(
                    new_f.start, new_f.end, new_f.match_text))
            # replace f with its trimmed version, then accept r after it
            final_results[-1] = new_f
            final_results.append(r)

    return final_results
def _regex_match(sentence, regex_list):
    """
    Run every regex in regex_list against the sentence and reduce the
    matches to a list of non-overlapping overlap.Candidate objects.

    Each candidate stores the regex match object in its 'other' field so
    that later stages can inspect the named groups ('val', 'device',
    'flow_rate', 'flow_rate2', 'flow_rate3').

    Pruning stages, in order:
        1. among candidates with identical spans, delete the one having the
           shorter 'device' string (at most one deletion is performed)
        2. discard candidates whose span lies within that of a longer
           candidate
        3. if any 'complete' candidates exist, keep only those and greedily
           select non-overlapping ones by increasing end point; otherwise
           run overlap.remove_overlap on what remains

    Returns the pruned candidate list, or [] if nothing matched.
    """

    candidates = []
    for i, regex in enumerate(regex_list):
        iterator = regex.finditer(sentence)
        for match in iterator:
            match_text = match.group().strip()

            # Special case for regex index 6. Prevent a match on the bracketed
            # portion of something like this:
            #     "<RA. Pt was initially satting 95%> on NRB.".
            #
            # In other words, the sentence segmentation should have started
            # a new sentence at "Pt", in which case the match would be correct.
            #
            # Note that the check below is applied to the matches of every
            # regex in the list, not only those of index 6.
            special_match = re.search(r'\.\s[A-Z][a-z]+', match_text)
            if special_match:
                continue

            start = match.start()
            end = start + len(match_text)
            candidates.append(
                overlap.Candidate(start, end, match_text, regex, other=match))
            if _TRACE:
                print('[{0:2}]: [{1:3}, {2:3})\tMATCH TEXT: ->{3}<-'.format(
                    i, start, end, match_text))
                print('\tmatch.groupdict entries: ')
                for k, v in match.groupdict().items():
                    print('\t\t{0} => {1}'.format(k, v))

    # nothing matched; the code below indexes candidates[0]
    if 0 == len(candidates):
        return []

    # sort the candidates in descending order of length, which is needed for
    # one-pass overlap resolution later on
    candidates = sorted(candidates, key=lambda x: x.end - x.start, reverse=True)

    if _TRACE:
        print('\tCandidate matches: ')
        index = 0
        for c in candidates:
            print('\t[{0:2}]\t[{1},{2}): {3}'.format(index, c.start, c.end,
                                                     c.match_text, c.regex))
            index += 1
        print()

    # if two overlap exactly, keep candidate with longer device string
    # (only adjacent candidates in the sorted list are compared, and the
    # scan stops at the first pair for which a deletion is chosen)
    prev_start = candidates[0].start
    prev_end = candidates[0].end
    delete_index = None
    for i in range(1, len(candidates)):
        c = candidates[i]
        if c.start == prev_start and c.end == prev_end:
            if _TRACE:
                print('\tCandidates at indices {0} and {1} have ' \
                      'identical overlap'.format(i-1, i))
            # the regex match object is stored in the 'other' field
            matchobj = c.other
            matchobj_prev = candidates[i - 1].other
            if 'device' in matchobj.groupdict(
            ) and 'device' in matchobj_prev.groupdict():
                device = matchobj.group('device')
                device_prev = matchobj_prev.group('device')
                if device is not None and device_prev is not None:
                    len_device = len(device)
                    len_device_prev = len(device_prev)
                    if _TRACE:
                        print('\t\tdevice string for index {0}: {1}'.format(
                            i - 1, device_prev))
                        print('\t\tdevice string for index {0}: {1}'.format(
                            i, device))
                    # on a tie the later candidate (index i) is deleted
                    if len_device > len_device_prev:
                        delete_index = i - 1
                    else:
                        delete_index = i
                    if _TRACE:
                        print('\t\t\tdelete_index: {0}'.format(delete_index))
                    break
        prev_start = c.start
        prev_end = c.end

    if delete_index is not None:
        del candidates[delete_index]
        if _TRACE:
            print(
                '\tRemoved candidate at index {0} with shorter device string'.
                format(delete_index))

    # remove any that are proper substrings of another, exploiting the fact
    # that the candidate list is sorted in decreasing order of length
    # (the containment test is inclusive, so a candidate whose span equals
    # that of an earlier candidate is discarded as well)
    discard_set = set()
    for i in range(1, len(candidates)):
        start = candidates[i].start
        end = candidates[i].end
        for j in range(0, i):
            prev_start = candidates[j].start
            prev_end = candidates[j].end
            if start >= prev_start and end <= prev_end:
                discard_set.add(i)
                if _TRACE:
                    print('\t[{0:2}] is a substring of [{1}], discarding...'.
                          format(i, j))
                break

    # index 0 is never discarded, so at least one candidate survives
    survivors = []
    for i in range(len(candidates)):
        if i not in discard_set:
            survivors.append(candidates[i])

    candidates = survivors

    # Check for any 'complete' candidates. A complete candidate has an O2
    # saturation value, a device, and a flow rate. If one or more of these,
    # restrict consideration to these only.
    complete_candidates = []
    for c in candidates:
        # match object stored in the 'other' field
        gd = c.other.groupdict()

        # number of relevant named groups that participated in the match
        count = 0
        if 'val' in gd and gd['val'] is not None:
            count += 1
        if 'flow_rate' in gd and gd['flow_rate'] is not None:
            count += 1
        if 'flow_rate2' in gd and gd['flow_rate2'] is not None:
            count += 1
        if 'flow_rate3' in gd and gd['flow_rate3'] is not None:
            count += 1
        if 'device' in gd and gd['device'] is not None:
            count += 1
            # room air will not have a flow rate, but it is nonetheless complete
            device_str = gd['device']
            if -1 != device_str.find(' air'):
                count += 1

        if count >= 3:
            complete_candidates.append(c)
            if _TRACE:
                print('\tFound complete candidate "{0}"'.format(c.match_text))

    if len(complete_candidates) > 0:
        candidates = complete_candidates

        # Now find the maximum number of non-overlapping candidates. This is an
        # instance of the equal-weight interval scheduling problem, which has an
        # optimal greedy solution. See the book "Algorithm Design" by Kleinberg and
        # Tardos, ch. 4.

        # sort candidates in increasing order of their END points
        candidates = sorted(candidates, key=lambda x: x.end)

        # accept the candidate that ends first, then each later candidate
        # that starts at or after the end of the last accepted one
        pruned_candidates = [candidates[0]]
        prev_end = pruned_candidates[0].end
        for i in range(1, len(candidates)):
            c = candidates[i]
            if c.start >= prev_end:
                pruned_candidates.append(c)
                prev_end = c.end
    else:
        # run the usual overlap resolution
        pruned_candidates = overlap.remove_overlap(candidates, _TRACE)

    if _TRACE:
        print('\tcandidate count after overlap removal: {0}'.format(
            len(pruned_candidates)))
        print('\tPruned candidates: ')
        for c in pruned_candidates:
            print('\t\t[{0},{1}): {2}'.format(c.start, c.end, c.match_text))
        print()

    return pruned_candidates
def _regex_match(sentence, regex_list):
    """
    Run every regex in regex_list against the sentence and return the
    matches that survive overlap resolution.

    Each match becomes an overlap.Candidate spanning [start, end), where the
    end excludes any trailing whitespace in the matched text. The candidate's
    'other' field is a dict with two entries:

        _KEY_SCHOOL: the NAME of the first school group that participated
                     in the match (college, then high school, then
                     elementary), or None
        _KEY_DEGREE: the NAME of the first degree group that participated
                     in the match (doctoral, masters, bachelors, GED),
                     or None

    Note that the group names are stored, not the captured text.

    Returns the list produced by overlap.remove_overlap.
    """

    # group names in priority order; the first one present wins
    school_groups = (_SCHOOL_COLLEGE, _SCHOOL_HS, _SCHOOL_ELEM)
    degree_groups = (_DEG_DOCTORAL, _DEG_MASTERS, _DEG_BATCHELORS, _DEG_GED)

    candidates = []
    for regex in regex_list:
        for match in regex.finditer(sentence):
            # strip any trailing whitespace (invalidates match.end())
            match_text = match.group().rstrip()
            start = match.start()
            end = start + len(match_text)

            # groupdict() maps absent and non-participating groups alike to
            # "no value", so a single .get() covers both checks
            groups = match.groupdict()

            # isolate the school, if any
            school_text = next(
                (name for name in school_groups
                 if groups.get(name) is not None), None)

            # isolate the degree, if any
            degree_text = next(
                (name for name in degree_groups
                 if groups.get(name) is not None), None)

            info_dict = {
                _KEY_SCHOOL: school_text,
                _KEY_DEGREE: degree_text,
            }

            candidates.append(
                overlap.Candidate(start, end, match_text, regex,
                                  other=info_dict))

    # Sort the candidates in DECREASING order of length. Without reverse=True
    # the sort is ascending, which contradicts the stated intent.
    # NOTE(review): longest-first is presumably what overlap.remove_overlap
    # expects -- confirm against that helper.
    candidates = sorted(candidates,
                        key=lambda x: x.end - x.start,
                        reverse=True)

    if _TRACE:
        DISPLAY('\tCandidate matches: ')
        for index, c in enumerate(candidates):
            regex_index = regex_list.index(c.regex)
            DISPLAY('\t[{0:2}] R{1:2}\t[{2},{3}): ->{4}<-'.format(
                index, regex_index, c.start, c.end, c.match_text))
        DISPLAY()

    # keep the longest of any overlapping matches
    pruned_candidates = overlap.remove_overlap(candidates,
                                               False,
                                               keep_longest=True)

    if _TRACE:
        DISPLAY('\tCandidate matches after overlap resolution: ')
        for index, c in enumerate(pruned_candidates):
            regex_index = regex_list.index(c.regex)
            DISPLAY('\t[{0:2}] R{1:2}\t[{2},{3}): ->{4}<-'.format(
                index, regex_index, c.start, c.end, c.match_text))
        DISPLAY()

    return pruned_candidates
def _regex_match(sentence, regex_list):
    """
    Run every regex in regex_list against the sentence and return the
    matches that survive overlap resolution.

    Each match becomes an overlap.Candidate spanning [start, end), where the
    end excludes any trailing whitespace in the matched text. The candidate's
    'other' field holds the employment status implied by the first of the
    named groups 'unemployed', 'disabled', 'retired', 'employed' that
    participated in the match, or None if none of them did.

    Only the first match encountered for each status (including None) is
    kept as a candidate; later matches with the same status are dropped
    before overlap resolution.

    Returns the list produced by overlap.remove_overlap.
    """

    # (group name, status) pairs in priority order
    status_by_group = (
        ('unemployed', EMPLOYMENT_STATUS_UNEMPLOYED),
        ('disabled', EMPLOYMENT_STATUS_DISABLED),
        ('retired', EMPLOYMENT_STATUS_RETIRED),
        ('employed', EMPLOYMENT_STATUS_EMPLOYED),
    )

    candidates = []
    for regex in regex_list:
        # expect only a single match
        for match in regex.finditer(sentence):
            # strip any trailing whitespace (invalidates match.end())
            match_text = match.group().rstrip()
            start = match.start()
            end = start + len(match_text)

            if _TRACE:
                DISPLAY('\t' + match_text)

            # get group name to determine employment status
            groups = match.groupdict()
            status = next(
                (value for name, value in status_by_group
                 if groups.get(name) is not None), None)

            # append the new match if no other candidates of the same type
            if any(c.other == status for c in candidates):
                continue

            candidates.append(
                overlap.Candidate(start, end, match_text, regex,
                                  other=status))

    # Sort the candidates in DECREASING order of length. Without reverse=True
    # the sort is ascending, which contradicts the stated intent.
    # NOTE(review): longest-first is presumably what overlap.remove_overlap
    # expects -- confirm against that helper.
    candidates = sorted(candidates,
                        key=lambda x: x.end - x.start,
                        reverse=True)

    if _TRACE:
        DISPLAY('\tCandidate matches: ')
        for index, c in enumerate(candidates):
            regex_index = regex_list.index(c.regex)
            DISPLAY('\t[{0:2}] R{1:2}\t[{2},{3}): ->{4}<-'.format(
                index, regex_index, c.start, c.end, c.match_text))
        DISPLAY()

    # keep the longest of any overlapping matches
    pruned_candidates = overlap.remove_overlap(candidates,
                                               False,
                                               keep_longest=True)

    if _TRACE:
        DISPLAY('\tCandidate matches after overlap resolution: ')
        for index, c in enumerate(pruned_candidates):
            regex_index = regex_list.index(c.regex)
            DISPLAY('\t[{0:2}] R{1:2}\t[{2},{3}): ->{4}<-'.format(
                index, regex_index, c.start, c.end, c.match_text))
        DISPLAY()

    return pruned_candidates
def run(sentence):
    """
    Scan a sentence for time expressions using every regex in _regexes.

    Overlapping matches are resolved with overlap.remove_overlap so that a
    sub-expression of an already-matched string is not reported separately.
    Each surviving match is matched again with its own regex (or, for ISO
    datetimes, with the time-only regex) to extract the individual time
    fields. Fields that are absent keep the value EMPTY_FIELD.

    Returns a JSON array (as a string) with one object per time expression
    found, ordered by start offset in the cleaned sentence.
    """

    cleaned = _clean_sentence(sentence)

    if _TRACE:
        log('original: {0}'.format(sentence))
        log(' cleaned: {0}'.format(cleaned))

    # every regex hit is only a potential match until overlap is resolved
    hits = []
    for regex_index, regex in enumerate(_regexes):
        for match in regex.finditer(cleaned):
            text = match.group().strip()
            offset = 0
            if _ISO_DATETIME_REGEX_INDEX == regex_index:
                # keep only the time portion that follows the 'T', and
                # shift the start offset past the date and the 'T'
                t_pos = text.find('T')
                assert -1 != t_pos
                offset = t_pos + 1
                text = text[offset:]
            begin = match.start() + offset
            finish = begin + len(text)
            hits.append(overlap.Candidate(begin, finish, text, regex))
            if _TRACE:
                log('[{0:2}]: [{1:3}, {2:3})\tMATCH TEXT: ->{3}<-'.format(
                    regex_index, begin, finish, text))

    # longest first, which the one-pass overlap resolution below relies on
    hits.sort(key=lambda h: h.end - h.start, reverse=True)

    if _TRACE:
        log('\tCandidate matches: ')
        for index, c in enumerate(hits):
            log('\t[{0:2}]\t[{1},{2}): {3}'.format(index, c.start, c.end,
                                                   c.match_text, c.regex))
        log()

    survivors = overlap.remove_overlap(hits, _TRACE)

    if _TRACE:
        log('\tcandidates count after overlap removal: {0}'.format(
            len(survivors)))
        log('\tPruned candidates: ')
        for c in survivors:
            log('\t\t[{0},{1}): {2}'.format(c.start, c.end, c.match_text))
        log()
        log('Extracting data from pruned candidates...')

    time_values = []
    for pc in survivors:
        # re-match the saved text; ISO datetimes were reduced to the time
        # portion above, so they need the time-only regex
        if _regex_iso_datetime == pc.regex:
            rematch = _regex_iso_time.match(pc.match_text)
        else:
            rematch = pc.regex.match(pc.match_text)
        assert rematch

        hours = minutes = seconds = EMPTY_FIELD
        fraction = EMPTY_FIELD
        meridiem = EMPTY_FIELD
        tz = EMPTY_FIELD
        delta_sign = delta_hours = delta_minutes = EMPTY_FIELD

        for name, value in rematch.groupdict().items():
            if value is None:
                continue
            if 'hours' == name:
                hours = int(value)
            elif 'minutes' == name:
                minutes = int(value)
            elif 'seconds' == name:
                seconds = int(value)
            elif 'frac' == name:
                # drop the first char and leave the rest as a string;
                # conversion needs to handle leading zeros
                fraction = value[1:]
            elif 'am_pm' == name:
                meridiem = STR_AM if ('a' in value or 'A' in value) else STR_PM
            elif 'timezone' == name:
                tz = 'UTC' if 'Z' == value else value
            elif 'gmt_delta' == name:
                delta_match = _regex_gmt.search(value)
                if delta_match:
                    for part, part_value in delta_match.groupdict().items():
                        if part_value is None:
                            continue
                        if 'gmt_sign' == part:
                            delta_sign = part_value
                        elif 'gmt_hours' == part:
                            delta_hours = int(part_value)
                        elif 'gmt_minutes' == part:
                            delta_minutes = int(part_value)

        time_values.append(
            TimeValue(text=pc.match_text,
                      start=pc.start,
                      end=pc.end,
                      hours=hours,
                      minutes=minutes,
                      seconds=seconds,
                      fractional_seconds=fraction,
                      am_pm=meridiem,
                      timezone=tz,
                      gmt_delta_sign=delta_sign,
                      gmt_delta_hours=delta_hours,
                      gmt_delta_minutes=delta_minutes))

    # report in order of occurrence; dicts preserve field names in the JSON
    time_values.sort(key=lambda tv: tv.start)
    return json.dumps([tv._asdict() for tv in time_values], indent=4)
def _regex_match(sentence, regex_list):
    """
    Run every regex in regex_list against the sentence and return the
    matches that survive overlap resolution.

    Before matching, the first span matched by _regex_neg_language (if any)
    is cut out of the sentence so that negated languages are not reported.

    Each match becomes an overlap.Candidate spanning [start, end), where the
    end excludes any trailing whitespace in the matched text. The candidate's
    'other' field holds the stripped text of the 'languages' named group,
    which every regex in regex_list is required to define and match.

    Returns the list produced by overlap.remove_overlap.
    """

    # erase any negated languages from the sentence, then attempt regexes
    neg_match = _regex_neg_language.search(sentence)
    if neg_match:
        if _TRACE:
            DISPLAY('NEG LANGUAGE MATCH: "{0}"'.format(neg_match.group()))
        # NOTE(review): the negated text is removed rather than blanked out,
        # so candidate offsets found after this point index into the
        # shortened sentence, not the sentence the caller passed in.
        sentence = sentence[:neg_match.start()] + sentence[neg_match.end():]

    candidates = []
    for regex in regex_list:
        for match in regex.finditer(sentence):
            # strip any trailing whitespace (invalidates match.end())
            match_text = match.group().rstrip()
            start = match.start()
            end = start + len(match_text)

            # isolate the matching language(s)
            language_text = match.group('languages').strip()

            candidates.append(
                overlap.Candidate(start, end, match_text, regex,
                                  other=language_text))

    # Sort the candidates in DECREASING order of length. Without reverse=True
    # the sort is ascending, which contradicts the stated intent.
    # NOTE(review): longest-first is presumably what overlap.remove_overlap
    # expects -- confirm against that helper.
    candidates = sorted(candidates,
                        key=lambda x: x.end - x.start,
                        reverse=True)

    if _TRACE:
        DISPLAY('\tCandidate matches: ')
        for index, c in enumerate(candidates):
            regex_index = regex_list.index(c.regex)
            DISPLAY('\t[{0:2}] R{1:2}\t[{2},{3}): ->{4}<-'.format(
                index, regex_index, c.start, c.end, c.match_text))
        DISPLAY()

    # keep the longest of any overlapping matches
    pruned_candidates = overlap.remove_overlap(candidates,
                                               False,
                                               keep_longest=True)

    if _TRACE:
        DISPLAY('\tCandidate matches after overlap resolution: ')
        for index, c in enumerate(pruned_candidates):
            regex_index = regex_list.index(c.regex)
            DISPLAY('\t[{0:2}] R{1:2}\t[{2},{3}): ->{4}<-'.format(
                index, regex_index, c.start, c.end, c.match_text))
        DISPLAY()

    return pruned_candidates
def _regex_match(sentence, regex_list):
    """
    Run every regex in regex_list against the sentence and return the
    matches that survive overlap resolution.

    Each match becomes an overlap.Candidate spanning [start, end), where the
    end excludes any trailing whitespace in the matched text. The candidate's
    'other' field holds the stripped text of the 'religion' named group,
    which every regex in regex_list is required to define and match.

    If no regex matches, the sentence is searched with _regex_official and
    a religion is inferred from the official mentioned: 'imam' gives
    'islam'; 'rabbi' or 'rabi' gives 'judaism'.

    Returns the list produced by overlap.remove_overlap.
    """

    candidates = []
    for regex in regex_list:
        for match in regex.finditer(sentence):
            # strip any trailing whitespace (invalidates match.end())
            match_text = match.group().rstrip()
            start = match.start()
            end = start + len(match_text)

            religion_text = match.group('religion').strip()

            candidates.append(
                overlap.Candidate(start, end, match_text, regex,
                                  other=religion_text))

    # if no matches, try to infer religion from presence of religious official(s)
    if 0 == len(candidates):
        match = _regex_official.search(sentence)
        if match:
            match_text = match.group().strip()
            start = match.start()
            end = start + len(match_text)

            religion_text = None
            if 'imam' in match_text:
                religion_text = 'islam'
            elif 'rabbi' in match_text or 'rabi' in match_text:
                religion_text = 'judaism'

            if religion_text is not None:
                # NOTE(review): 'regex' is the variable left over from the
                # loop above, i.e. the last entry of regex_list, not
                # _regex_official. That keeps the regex_list.index() lookups
                # in the trace output working, but it is undefined when
                # regex_list is empty. Confirm that callers never pass an
                # empty list.
                candidates.append(
                    overlap.Candidate(start, end, match_text, regex,
                                      other=religion_text))

    # Sort the candidates in DECREASING order of length. Without reverse=True
    # the sort is ascending, which contradicts the stated intent.
    # NOTE(review): longest-first is presumably what overlap.remove_overlap
    # expects -- confirm against that helper.
    candidates = sorted(candidates,
                        key=lambda x: x.end - x.start,
                        reverse=True)

    if _TRACE:
        DISPLAY('\tCandidate matches: ')
        for index, c in enumerate(candidates):
            regex_index = regex_list.index(c.regex)
            DISPLAY('\t[{0:2}] R{1:2}\t[{2},{3}): ->{4}<-'.format(
                index, regex_index, c.start, c.end, c.match_text))
        DISPLAY()

    # keep the longest of any overlapping matches
    pruned_candidates = overlap.remove_overlap(candidates,
                                               False,
                                               keep_longest=True)

    if _TRACE:
        DISPLAY('\tCandidate matches after overlap resolution: ')
        for index, c in enumerate(pruned_candidates):
            regex_index = regex_list.index(c.regex)
            DISPLAY('\t[{0:2}] R{1:2}\t[{2},{3}): ->{4}<-'.format(
                index, regex_index, c.start, c.end, c.match_text))
        DISPLAY()

    return pruned_candidates