Beispiel #1
0
def parse_text(text):
  """Annotates a text with its outermost number and date spans.

  Candidates are gathered in three passes: matches of `_NUMBER_PATTERN`, the
  spans produced by `text_utils.get_all_spans` with `max_ngram_length=1`
  (tried as a number, a number word and an ordinal word), and the spans
  produced with `max_ngram_length=_MAX_DATE_NGRAM_SIZE` (tried as a date).
  Candidates are then visited in descending `_get_span_length_key` order and
  one is kept only if no previously kept span fully contains it.

  Args:
    text: text to annotate.

  Returns:
    List of `interaction_pb2.NumericValueSpan` messages for the kept spans,
    ordered by begin index.
  """
  candidates = collections.defaultdict(list)

  # Pass 1: spans located by the number regex.
  for number_match in _NUMBER_PATTERN.finditer(text):
    key = number_match.span()
    parsed = _parse_number(text[key[0]:key[1]])
    if parsed is not None:
      candidates[key].append(_get_numeric_value_from_float(parsed))

  # Pass 2: unigram spans; anything the regex pass already recorded is skipped.
  for start, stop in text_utils.get_all_spans(text, max_ngram_length=1):
    key = (start, stop)
    if key in candidates:
      continue
    token = text[start:stop]

    parsed = _parse_number(token)
    if parsed is not None:
      candidates[key].append(_get_numeric_value_from_float(parsed))

    # The position of a word inside its vocabulary is the value it stands for;
    # each vocabulary contributes at most one value (first match wins).
    for vocabulary in (_NUMBER_WORDS, _ORDINAL_WORDS):
      for position, word in enumerate(vocabulary):
        if token == word:
          candidates[key].append(
              _get_numeric_value_from_float(float(position)))
          break

  # Pass 3: date candidates over longer n-grams.
  date_spans = text_utils.get_all_spans(
      text, max_ngram_length=_MAX_DATE_NGRAM_SIZE)
  for start, stop in date_spans:
    parsed_date = _parse_date(text[start:stop])
    if parsed_date is not None:
      candidates[start, stop].append(parsed_date)

  ranked = list(candidates.items())
  ranked.sort(key=lambda item: _get_span_length_key(item[0]), reverse=True)

  # Keep a candidate only when no already kept span encloses it.
  kept = []
  for span, values in ranked:
    is_nested = any(
        outer[0] <= span[0] and span[1] <= outer[1] for outer, _ in kept)
    if not is_nested:
      kept.append((span, values))

  kept.sort(key=lambda item: item[0][0])

  return [
      interaction_pb2.NumericValueSpan(
          begin_index=span[0], end_index=span[1], values=values)
      for span, values in kept
  ]
Beispiel #2
0
def _get_all_spans_as_text(text, max_ngram_length):
    """Returns the substring of `text` for each span from `get_all_spans`.

    Args:
        text: text to slice.
        max_ngram_length: forwarded unchanged to `text_utils.get_all_spans`.

    Returns:
        List of substrings, in the order the spans are produced.
    """
    return [
        text[begin:end]
        for begin, end in text_utils.get_all_spans(text, max_ngram_length)
    ]