def test_regex_from_group_spans(self):
    """Check the regex rebuilt from two nested group spans of a sample line.

    The outer span covers ``comp21`` and the inner one covers ``21``; the
    rebuilt regex is expected to equal ``Error on (comp(\\d\\d))``.
    """
    line = r'Error on comp21'
    expected_regex = r'Error on (comp(\d\d))'

    # Outer span first, inner span second -- same construction order as the
    # list handed to SpanList.
    outer_span = Span(9, 15, pattern=r'comp(\d\d)')
    inner_span = Span(13, 15, pattern=r'\d\d')

    rebuilt_regex = regex_from_group_spans(
        SpanList([outer_span, inner_span]), line
    )

    assert expected_regex == rebuilt_regex
def test_find_date_spans_by_force(self):
    """Check that three date-like spans are found in a mixed sample line.

    After sorting the spans by start and end, the covered substrings are
    expected to be the ISO date, the day/month/year part and the time with
    its UTC offset, in that order.
    """
    text = r'2015-12-03 or [10/Oct/1999:21:15:05 +0500] "GET /index.html HTTP/1.0" 200 1043'
    expected_dates = ('2015-12-03', '10/Oct/1999', '21:15:05 +0500')

    found = _find_date_spans_by_force(text)
    assert len(found) == 3

    ordered = SpanList(found).sort_by_start_and_end()
    dates = [text[span.start:span.end] for span in ordered]

    # Index explicitly so a too-short result still fails on the lookup.
    for position, expected in enumerate(expected_dates):
        assert dates[position] == expected
def test_find_spans_by_regex(self):
    """Check that both patterns are located in a sample line.

    ``find_spans_by_regex`` receives a mapping from compiled pattern to its
    source string; three spans are expected (one date, two host names).
    """
    patterns = (r"\d+-\d+-\d\d", r"comp\d\d")
    regexes = {re.compile(pattern): pattern for pattern in patterns}
    text = r"2015-12-03 Data migration from comp36 to comp21 failed"
    expected_groups = ('2015-12-03', 'comp36', 'comp21')

    found = find_spans_by_regex(regexes, text)
    assert len(found) == 3

    ordered = SpanList(found).sort_by_start_and_end()
    groups = [text[span.start:span.end] for span in ordered]

    # Index explicitly so a too-short result still fails on the lookup.
    for position, expected in enumerate(expected_groups):
        assert groups[position] == expected
def find_date_spans(text, regexes=None):
    """Collect date spans in *text* from both available finders.

    The results of ``_find_date_spans_by_force`` and
    ``_find_date_spans_by_regex`` are merged with ``union``, wrapped in a
    ``SpanList``, ordered with ``sort_reversed_by_length`` and finally
    filtered through ``SpanList.not_overlapping_spans``.

    Args:
        text: The line to search.
        regexes: Regexes passed on to ``_find_date_spans_by_regex``; any
            falsy value (including the default ``None``) is replaced by an
            empty dict.

    Returns:
        Whatever ``SpanList.not_overlapping_spans`` returns for the ordered
        candidates (presumably the longest non-overlapping spans -- confirm
        against ``SpanList``).
    """
    if not regexes:
        regexes = {}

    forced = _find_date_spans_by_force(text)
    from_regexes = _find_date_spans_by_regex(regexes, text)

    candidates = SpanList(forced.union(from_regexes))
    return SpanList.not_overlapping_spans(candidates.sort_reversed_by_length())
def regex_from_group_spans(group_spans, line_text):
    """Assemble a regex string for *line_text* out of its group spans.

    The group spans are sorted, filtered through
    ``SpanList.not_overlapping_spans`` and complemented over the range
    ``0 .. len(line_text)`` (using ``create_obvious_regex``) so that the
    whole line is covered. Every resulting span gets ``update_pattern``
    called with the line, and the patterns are concatenated in start/end
    order; patterns of spans flagged ``is_param`` are wrapped in parentheses.

    Args:
        group_spans: Span collection providing ``sort_by_start_and_end``.
        line_text: The line the spans refer to.

    Returns:
        The concatenated regex string.
    """
    ordered_groups = group_spans.sort_by_start_and_end()
    # TODO:
    # [(1,5), (2, 3)] -> [(1,5)]
    # [(1,5), (3, 7)] -> Error somewhere
    kept_groups = SpanList.not_overlapping_spans(ordered_groups)
    gap_spans = kept_groups.complementary_spans(
        0, len(line_text), create_obvious_regex
    )

    all_spans = gap_spans + kept_groups
    ordered_spans = all_spans.sort_by_start_and_end()

    assembled = r""
    for piece in ordered_spans:
        # Refresh the pattern before reading it.
        piece.update_pattern(line_text)
        fragment = piece.pattern
        # Parameter spans become capturing groups.
        fragment = "(" + fragment + ")" if piece.is_param else fragment
        assembled = assembled + fragment
    return assembled