Exemple #1
0
 def test_regex_from_group_spans(self):
     """Check the regex rebuilt from two nested group spans of a line.

     The outer span (9-15) covers ``comp21`` and the inner span (13-15)
     covers ``21``; the rebuilt regex is expected to keep the literal
     prefix and wrap the outer span's pattern in a capturing group.
     """
     line = r'Error on comp21'
     expected = r'Error on (comp(\d\d))'
     nested_groups = SpanList([
         Span(9, 15, pattern=r'comp(\d\d)'),
         Span(13, 15, pattern=r'\d\d'),
     ])
     rebuilt = regex_from_group_spans(nested_groups, line)
     assert expected == rebuilt
Exemple #2
0
 def test_find_date_spans_by_force(self):
     """Check that three date-like spans are found in a mixed log line."""
     text = r'2015-12-03 or [10/Oct/1999:21:15:05 +0500] "GET /index.html HTTP/1.0" 200 1043'
     expected_dates = ['2015-12-03', '10/Oct/1999', '21:15:05 +0500']

     found = _find_date_spans_by_force(text)
     assert len(found) == 3

     # Order the spans before slicing so positions line up with the expected list.
     ordered = SpanList(found).sort_by_start_and_end()
     dates = [text[span.start:span.end] for span in ordered]
     for position, expected in enumerate(expected_dates):
         assert dates[position] == expected
Exemple #3
0
 def test_find_spans_by_regex(self):
     """Check that every match of the supplied regexes yields a span."""
     # Mapping of compiled pattern -> its source string, as the helper is called here.
     regexes = {
         re.compile(pattern): pattern
         for pattern in (r"\d+-\d+-\d\d", r"comp\d\d")
     }
     text = r"2015-12-03 Data migration from comp36 to comp21 failed"
     expected_groups = ['2015-12-03', 'comp36', 'comp21']

     found = find_spans_by_regex(regexes, text)
     assert len(found) == 3

     ordered = SpanList(found).sort_by_start_and_end()
     groups = [text[span.start:span.end] for span in ordered]
     for position, expected in enumerate(expected_groups):
         assert groups[position] == expected
Exemple #4
0
def find_date_spans(text, regexes=None):
    """Collect date spans in ``text`` and keep a non-overlapping selection.

    Spans from ``_find_date_spans_by_force`` and from
    ``_find_date_spans_by_regex`` are merged with ``union``, ordered with
    ``SpanList.sort_reversed_by_length`` (presumably longest first -- confirm
    against ``SpanList``) and filtered through
    ``SpanList.not_overlapping_spans``.

    :param text: the text to search.
    :param regexes: regexes handed to ``_find_date_spans_by_regex``; any
        falsy value (including ``None``) is replaced by an empty dict.
    :return: whatever ``SpanList.not_overlapping_spans`` returns for the
        ordered candidates.
    """
    if not regexes:
        regexes = {}

    forced = _find_date_spans_by_force(text)
    from_regexes = _find_date_spans_by_regex(regexes, text)

    # ``union`` drops spans reported by both finders.
    candidates = SpanList(forced.union(from_regexes))
    by_length = candidates.sort_reversed_by_length()
    return SpanList.not_overlapping_spans(by_length)
Exemple #5
0
def regex_from_group_spans(group_spans, line_text):
    """Assemble a single regex string for ``line_text`` from its group spans.

    The group spans are sorted and reduced with
    ``SpanList.not_overlapping_spans``; the rest of the line (from 0 to
    ``len(line_text)``) is covered by complementary spans created with
    ``create_obvious_regex``. Every resulting span has its pattern refreshed
    through ``update_pattern(line_text)``, and the patterns are concatenated
    in sorted order, with spans flagged ``is_param`` wrapped in parentheses.

    :param group_spans: span collection offering ``sort_by_start_and_end``.
    :param line_text: the line the spans refer to.
    :return: the concatenated regex string.
    """
    ordered_groups = group_spans.sort_by_start_and_end()
    # TODO:
    # [(1,5), (2, 3)] -> [(1,5)]
    # [(1,5), (3, 7)] -> Error somewhere
    kept_groups = SpanList.not_overlapping_spans(ordered_groups)
    gap_spans = kept_groups.complementary_spans(
        0, len(line_text), create_obvious_regex
    )
    all_spans = (gap_spans + kept_groups).sort_by_start_and_end()

    pieces = []
    for span in all_spans:
        # Refresh the pattern against the line before reading it.
        span.update_pattern(line_text)
        pattern = span.pattern
        pieces.append("(" + pattern + ")" if span.is_param else pattern)
    return "".join(pieces)