Example 1
def preproc_doc(document_tuple):
  """Convert document to list of TF Examples for binary order classification.

  Args:
      document_tuple: a chapter from one book as a list of lines, together
        with a collection of other docs to sample from

  Returns:
      A list of TF Examples encoding binary orderings of pairs of sentences in
      the document. The examples are serialized to strings so they can be
      written directly to a TFRecord file.
  """
  document, other_docs = document_tuple

  # Each document is a list of lines
  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

  # Set a random seed for reproducibility.
  # Since this function is run in parallel, hardcoding a seed would give all
  # documents the same permutations. Instead we use a hash of the first line
  # as the seed, so it differs per document and is still reproducible.
  # Note: hashlib.md5 expects bytes, so document lines are assumed to be raw
  # bytes here; convert_to_unicode is only applied further below.
  hash_object = hashlib.md5(document[0])
  rng = random.Random(int(hash_object.hexdigest(), 16) % (10**8))

  # Each line may contain several sentences. We build paragraphs by keeping
  # the sentences from one line together and accumulating adjacent lines until
  # a paragraph holds more than 10 sentences.
  # The utility functions below expect the document to be split by paragraphs.
  list_of_paragraphs = []
  paragraph = []
  for line in document:
    line = tokenization.convert_to_unicode(line)
    line = line.replace(u"\u2018", "'").replace(u"\u2019", "'")
    sents = split_line_by_sentences(line)
    for sent in sents:
      tokens = tokenizer.tokenize(sent)
      if tokens:
        paragraph.append(tokens)
    if len(paragraph) > 10:
      list_of_paragraphs.append(paragraph)
      paragraph = []

  # In case of any empty paragraphs, remove them.
  list_of_paragraphs = [x for x in list_of_paragraphs if x]

  # Repeat the preprocessing above for the other documents (other_docs).
  list_of_para_other_docs = []
  paragraph = []
  for doc in other_docs:
    for line in doc:
      line = tokenization.convert_to_unicode(line)
      line = line.replace(u"\u2018", "'").replace(u"\u2019", "'")
      sents = split_line_by_sentences(line)
      for sent in sents:
        tokens = tokenizer.tokenize(sent)
        if tokens:
          paragraph.append(tokens)
      if len(paragraph) > 10:
        list_of_para_other_docs.append(paragraph)
        paragraph = []

  # In case of any empty paragraphs, remove them.
  list_of_para_other_docs = [x for x in list_of_para_other_docs if x]

  # We need to be able to sample from multiple paragraphs, so bail out if the
  # other docs yield only one paragraph or too few sentences overall.
  if (len(list_of_para_other_docs) == 1 or
      sum(len(x) for x in list_of_para_other_docs) < 35):
    return []

  # Convert the list of paragraphs into TrainingInstance objects.
  # See preprocessing_utils.py for the definition.
  instances = create_instances_from_document(list_of_paragraphs,
                                             FLAGS.max_seq_length, rng,
                                             list_of_para_other_docs,
                                             FLAGS.format)

  # Convert token lists into ids and add any needed tokens and padding for BERT
  tf_examples = [
      convert_instance_to_tf_example(tokenizer, instance,
                                     FLAGS.max_seq_length)[0]
      for instance in instances
  ]

  # Serialize TFExample for writing to file.
  tf_examples = [example.SerializeToString() for example in tf_examples]

  return tf_examples
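
The per-document seeding trick described in the comments above (hash the first line, use it as the RNG seed) is worth isolating. The sketch below is a minimal, standalone illustration using only the standard library; the helper name seeded_rng_for and the sample document are hypothetical, not part of the original module.

import hashlib
import random

def seeded_rng_for(document):
  """Return a random.Random seeded by the document's first line."""
  first_line = document[0]
  if isinstance(first_line, str):
    first_line = first_line.encode("utf-8")  # md5 needs bytes in Python 3
  seed = int(hashlib.md5(first_line).hexdigest(), 16) % (10**8)
  return random.Random(seed)

doc = [b"Call me Ishmael.", b"Some years ago, never mind how long..."]
order_a = seeded_rng_for(doc).sample(range(10), 10)
order_b = seeded_rng_for(doc).sample(range(10), 10)
assert order_a == order_b  # same document -> same permutation on every worker

Because the seed depends only on the document itself, every worker that happens to process the same document produces identical permutations, while different documents still get different orderings.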
Example 2
def preproc_doc(document_tuple):
  """Convert document to list of TF Examples for binary order classification.

  Args:
      document_tuple: a Wikipedia article as a list of lines, together with a
        tuple of 3 random docs

  Returns:
      A list of TF Examples encoding binary orderings of pairs of sentences in
      the document. The examples are serialized to strings so they can be
      written directly to a TFRecord file.
  """
  document, other_docs = document_tuple
  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

  # Set a random seed for reproducibility: hashing the first line gives each
  # document a different but deterministic seed.
  hash_object = hashlib.md5(document[0])
  rng = random.Random(int(hash_object.hexdigest(), 16) % (10**8))

  # Each document is composed of a list of text lines. Each text line is a
  # paragraph. We split the line into sentences but keep the paragraph grouping.
  # The utility functions below expect the document to be split by paragraphs.
  list_of_paragraphs = []
  special_chars = {"@,@": ",", "@.@": ".", "@-@": "-"}
  for line in document:
    line = tokenization.convert_to_unicode(line)
    line = line.replace(u"\u2018", "'").replace(u"\u2019", "'")
    sents = split_line_by_sentences(line)

    # These special tokens in the wiki dataset stand in for raw punctuation
    # and should be replaced with the actual characters.
    for escaped, raw in special_chars.items():
      sents = [sent.replace(escaped, raw) for sent in sents]

    if len(sents) < (5 if FLAGS.format == ONE_SENT_CTX else 8):
      continue
    sent_tokens = [tokenizer.tokenize(sent) for sent in sents if sent]
    sent_tokens = [sent for sent in sent_tokens if len(sent) > 1]
    if len(sent_tokens) < (5 if FLAGS.format == ONE_SENT_CTX else 8):
      continue
    list_of_paragraphs.append(sent_tokens)

  # In case of any empty paragraphs, remove them.
  list_of_paragraphs = [x for x in list_of_paragraphs if x]

  # Repeat the preprocessing above for the other documents (other_docs).
  list_of_para_other_docs = []
  for doc in other_docs:
    for line in doc:
      line = tokenization.convert_to_unicode(line)
      line = line.replace(u"\u2018", "'").replace(u"\u2019", "'")
      sents = split_line_by_sentences(line)
      sent_tokens = [tokenizer.tokenize(sent) for sent in sents if sent]
      list_of_para_other_docs.append(sent_tokens)

  # In case of any empty paragraphs, remove them.
  list_of_para_other_docs = [x for x in list_of_para_other_docs if x]

  # We need to be able to sample from multiple paragraphs, so bail out if the
  # other docs yield only one paragraph or too few sentences overall.
  if (len(list_of_para_other_docs) == 1 or
      sum(len(x) for x in list_of_para_other_docs) < 35):
    return []

  # Convert the list of paragraphs into TrainingInstance objects.
  # See preprocessing_utils.py for the definition.
  instances = create_instances_from_document(list_of_paragraphs,
                                             FLAGS.max_seq_length, rng,
                                             list_of_para_other_docs,
                                             FLAGS.format)

  # Convert token lists into ids and add any needed tokens and padding for BERT
  tf_examples = [
      convert_instance_to_tf_example(tokenizer, instance,
                                     FLAGS.max_seq_length)[0]
      for instance in instances
  ]

  # Serialize TFExample for writing to file.
  tf_examples = [example.SerializeToString() for example in tf_examples]

  return tf_examples
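
Both variants return already-serialized examples and are meant to be mapped over many documents in parallel. Below is a minimal driver sketch, not the original pipeline, under stated assumptions: document_tuples is an iterable of (document, other_docs) pairs prepared elsewhere, FLAGS and the tokenization module are configured as in the examples above, and the output path is hypothetical. It uses multiprocessing.Pool and tf.io.TFRecordWriter.

import multiprocessing
import tensorflow as tf

def write_examples(document_tuples, output_path, num_workers=8):
  """Run preproc_doc over documents in parallel and write a TFRecord file."""
  with multiprocessing.Pool(num_workers) as pool:
    with tf.io.TFRecordWriter(output_path) as writer:
      # Each call to preproc_doc returns a (possibly empty) list of
      # already-serialized tf.train.Example protos.
      for serialized_examples in pool.imap_unordered(
          preproc_doc, document_tuples):
        for serialized in serialized_examples:
          writer.write(serialized)

# e.g. write_examples(document_tuples, "/tmp/order_classification.tfrecord")

Because each document carries its own hash-derived seed, the output is reproducible regardless of how the work is distributed across workers.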