Example #1
0
def tokens(doc):
    """Yield character-level or word-level tokens for a Document.

    The document content is stripped of surrounding whitespace and, when
    FLAGS.lowercase is set, lowercased. If FLAGS.output_char is set, each
    character of the content is yielded and the word-level flags are ignored.
    Otherwise the content is split with data_utils.split_by_punct and, for
    each word in order:

      * the word itself is yielded if FLAGS.output_unigrams is set;
      * if FLAGS.output_bigrams is set, '<previous>_<word>' is yielded, where
        the first word is paired with data_utils.EOS_TOKEN on its left; the
        last word is additionally followed by '<word>_<EOS_TOKEN>'.

    Args:
      doc: Document to produce tokens from; its `content` attribute is read.

    Yields:
      token

    Raises:
      ValueError: if all FLAGS.{output_unigrams, output_bigrams, output_char}
        are False. As this is a generator, the check runs when iteration
        starts rather than when the function is called.
    """
    nothing_requested = (not FLAGS.output_unigrams
                         and not FLAGS.output_bigrams
                         and not FLAGS.output_char)
    if nothing_requested:
        raise ValueError(
            'At least one of {FLAGS.output_unigrams, FLAGS.output_bigrams, '
            'FLAGS.output_char} must be true')

    text = doc.content.strip()
    text = text.lower() if FLAGS.lowercase else text

    # Character mode takes precedence over the word-level flags.
    if FLAGS.output_char:
        for character in text:
            yield character
        return

    words = data_utils.split_by_punct(text)
    for index, word in enumerate(words):
        if FLAGS.output_unigrams:
            yield word

        if not FLAGS.output_bigrams:
            continue

        # The first word has no predecessor, so EOS_TOKEN stands in for it.
        left = data_utils.EOS_TOKEN if index == 0 else words[index - 1]
        yield '_'.join((left, word))

        # Close the sequence with a trailing bigram after the last word.
        if index + 1 == len(words):
            yield '_'.join((word, data_utils.EOS_TOKEN))
Example #2
0
def tokens(doc):
  """Generates tokens for a Document at character or word granularity.

  The content is first stripped and, if FLAGS.lowercase is set, lowercased.
  With FLAGS.output_char set, the characters of the content are emitted one by
  one. Otherwise the content is split by data_utils.split_by_punct and each
  word contributes a unigram (FLAGS.output_unigrams) and/or a bigram joined
  with '_' to the preceding word (FLAGS.output_bigrams). The first word uses
  data_utils.EOS_TOKEN as its preceding word, and the last word also emits a
  bigram with data_utils.EOS_TOKEN on its right.

  Args:
    doc: Document to produce tokens from.

  Yields:
    token

  Raises:
    ValueError: if all FLAGS.{output_unigrams, output_bigrams, output_char}
      are False. Raised on first iteration, since this is a generator.
  """
  any_output_enabled = (
      FLAGS.output_unigrams or FLAGS.output_bigrams or FLAGS.output_char)
  if not any_output_enabled:
    raise ValueError(
        'At least one of {FLAGS.output_unigrams, FLAGS.output_bigrams, '
        'FLAGS.output_char} must be true')

  normalized = doc.content.strip()
  if FLAGS.lowercase:
    normalized = normalized.lower()

  if not FLAGS.output_char:
    # Word-level output: unigrams and/or bigrams, interleaved per word.
    pieces = data_utils.split_by_punct(normalized)
    for position, piece in enumerate(pieces):
      if FLAGS.output_unigrams:
        yield piece

      if FLAGS.output_bigrams:
        # EOS_TOKEN acts as the left neighbour of the very first word.
        if position == 0:
          leading = data_utils.EOS_TOKEN
        else:
          leading = pieces[position - 1]
        yield '_'.join([leading, piece])

        # After the final word, also emit the closing word/EOS bigram.
        if position == len(pieces) - 1:
          yield '_'.join([piece, data_utils.EOS_TOKEN])
  else:
    # Character-level output ignores the unigram/bigram flags.
    for symbol in normalized:
      yield symbol