Ejemplo n.º 1
0
def adapt(markup):
    """Return a new Markup over the same text with reworked spans.

    The spans of ``markup`` are copied into a list and then passed through
    these stages in order, the output of each stage being materialized into
    a list before the next stage runs:

    1. ``split_overlapping_spans``
    2. ``strip_spans`` with ``QUOTES + BRACKETS + DASHES + SPACES``
    3. ``filter_empty_spans``
    4. ``adapt_spans`` with ``TYPES``
    """
    text = markup.text
    stages = (
        split_overlapping_spans,
        lambda items: strip_spans(
            items, text, QUOTES + BRACKETS + DASHES + SPACES),
        filter_empty_spans,
        lambda items: adapt_spans(items, text, TYPES),
    )

    current = list(markup.spans)
    for stage in stages:
        # Each stage gets a concrete list, never a half-consumed iterator.
        current = list(stage(current))
    return Markup(text, current)
Ejemplo n.º 2
0
def adapt(markup):
    """Return a new Markup over the same text with adapted spans.

    The spans are first passed to ``strip_spans`` with
    ``QUOTES + DOT + SPACES`` and then to ``adapt_spans`` with ``TYPES``;
    both results are materialized into lists.
    """
    # Examples of annotated spans (dashes mark the span) that carry extra
    # characters at their edges.
    #
    # Trailing closing quote and period:
    #   Чувашской Республики".
    #   ----------------------
    #
    # Trailing closing quote:
    #   год Чарльза Дарвина»
    #       ----------------
    text = markup.text
    edge_chars = QUOTES + DOT + SPACES
    trimmed = list(strip_spans(markup.spans, text, edge_chars))
    adapted = list(adapt_spans(trimmed, text, TYPES))
    return Markup(text, adapted)
Ejemplo n.º 3
0
def adapt(markup):
    """Return a new Markup over the same text with adapted spans.

    Pipeline, in order: ``strip_spans`` with ``DOT + SPACES``,
    ``strip_spans_bounds`` with ``QUOTES + SPACES``, then ``adapt_spans``
    with ``TYPES``.  Only the final result is materialized into a list.
    """
    # The annotated spans contain extra spaces and dots (dashes mark the
    # span), for example:
    #
    #   News Corp .
    #   -----------
    #
    #   « Русал »
    #   ---------
    text = markup.text
    stripped = strip_spans(markup.spans, text, DOT + SPACES)
    bounded = strip_spans_bounds(stripped, text, QUOTES + SPACES)
    adapted = adapt_spans(bounded, text, TYPES)
    return Markup(text, list(adapted))
Ejemplo n.º 4
0
def adapt_spans(spans, text, types):
    """Select, convert and clean up ``spans`` of ``text`` for ``types``.

    Steps, in order:

    1. ``select_type_spans`` and ``convert_span_types``, both given ``types``
    2. ``strip_spans`` with ``QUOTES``
    3. ``filter_misaligned_spans`` against the tokens of ``text``
    4. ``filter_overlapping``

    Returns the resulting spans as a list.
    """
    typed = convert_span_types(select_type_spans(spans, types), types)

    # Quotes at span edges: seen in mitie and sometimes in deeppavlov.
    unquoted = list(strip_spans(typed, text, QUOTES))

    # Spans are checked against the tokenization of the text
    # (presumably misaligned ones are dropped -- confirm against
    # filter_misaligned_spans).  Known sources of misalignment:
    #   ne5 typos in span.stop
    #     Magna Internationa -> Magna International
    #     Горсове -> Горсовет
    #   tokenizer errors
    #     поезд Москва-Баку
    #     Yahoo!.
    text_tokens = list(tokenize(text))
    aligned = list(filter_misaligned_spans(unquoted, text_tokens))

    # ne5 bug: two annotated spans overlap each other, e.g.
    #   Бражский район Подмосковья
    #   --------------
    #            -----------------
    return list(filter_overlapping(aligned))
Ejemplo n.º 5
0
def adapt(markup):
    """Return a new Markup over the same text with adapted spans.

    The spans go through ``strip_spans`` with ``QUOTES + SPACES + DOT`` and
    then ``adapt_spans`` with ``TYPES``; the final result is stored as a
    list.
    """
    text = markup.text
    stripped = strip_spans(markup.spans, text, QUOTES + SPACES + DOT)
    return Markup(text, list(adapt_spans(stripped, text, TYPES)))
Ejemplo n.º 6
0
def adapt(markup):
    """Return a new Markup over the same text with adapted spans.

    The spans go through ``strip_spans`` with ``QUOTES + SPACES`` and then
    ``adapt_spans`` with ``TYPES``; each result is materialized into a list.
    """
    text = markup.text
    trimmed = list(strip_spans(markup.spans, text, QUOTES + SPACES))
    adapted = list(adapt_spans(trimmed, text, TYPES))
    return Markup(text, adapted)
Ejemplo n.º 7
0
def adapt_spans(spans, text):
    """Pre-process ``spans`` and delegate to ``adapt_spans_`` with ``TYPES``.

    Pre-processing, in order: ``adapt_overlapping_spans``, ``strip_spans``
    with ``QUOTES + BRACKETS + DASHES``, and ``filter_empty_spans``.  Each
    result is materialized into a list, so ``adapt_spans_`` receives a list.

    Returns whatever ``adapt_spans_`` returns.
    """
    separated = list(adapt_overlapping_spans(spans, text))
    edge_chars = QUOTES + BRACKETS + DASHES
    trimmed = list(strip_spans(separated, text, edge_chars))
    remaining = list(filter_empty_spans(trimmed))
    return adapt_spans_(remaining, text, TYPES)
Ejemplo n.º 8
0
def adapt_overlapping_spans(spans, text):
    """Split overlapping spans, strip ``SPACES`` and filter empty spans.

    Chains ``split_overlapping_spans``, ``strip_spans`` (with ``SPACES``)
    and ``filter_empty_spans``, returning the result of the last call
    without materializing it.
    """
    return filter_empty_spans(
        strip_spans(split_overlapping_spans(spans), text, SPACES)
    )