Example #1
 def get_dev_examples(self, data_dir):
     """Gets a collection of `InputExample`s for the dev set."""
     # Note that `data_dir` here is used directly as the path to the dev TSV file.
     lines = self._read_tsv(data_dir)
     examples = []
     for (i, line) in enumerate(lines):
         guid = line[0]
         text_a = tokenization.convert_to_unicode(line[1])
         label = tokenization.convert_to_unicode(line[2])
         examples.append(InputExample(guid=guid, text_a=text_a,
                                      label=label))
     return examples
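Note: all of the snippets on this page assume the helper types from run_classifier.py in google-research/bert. For reference, `InputExample` there is essentially the following container (a sketch, trimmed to the fields these examples use):

 class InputExample(object):
   """A single training/test example for simple sequence classification."""

   def __init__(self, guid, text_a, text_b=None, label=None):
     self.guid = guid      # unique id for the example
     self.text_a = text_a  # first (or only) sentence
     self.text_b = text_b  # optional second sentence for pair tasks
     self.label = label    # string label; may be None at prediction time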
Example #2
 def _create_examples(self, lines, set_type):
   """Creates examples for the training and dev sets."""
   examples = []
   for (i, line) in enumerate(lines):
     # The first row of the TSV is a header; skip it.
     if i == 0:
       continue
     guid = "%s-%s" % (set_type, i)
     text_a = tokenization.convert_to_unicode(line[3])
     text_b = tokenization.convert_to_unicode(line[4])
     if set_type == "test":
       # The test set is unlabeled, so use a dummy placeholder label.
       label = "0"
     else:
       label = tokenization.convert_to_unicode(line[0])
     examples.append(
         InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
   return examples
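In the run_classifier.py pattern, a private `_create_examples` helper like this one is called from the processor's public entry points, roughly as follows (the TSV file names vary by task):

 def get_train_examples(self, data_dir):
   """See base class."""
   return self._create_examples(
       self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

 def get_dev_examples(self, data_dir):
   """See base class."""
   return self._create_examples(
       self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")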
Example #3
 def _create_examples(self, lines, set_type):
   """Creates examples for the training and dev sets."""
   examples = []
   for (i, line) in enumerate(lines):
     # Only the test set has a header
     if set_type == "test" and i == 0:
       continue
     guid = "%s-%s" % (set_type, i)
     if set_type == "test":
       text_a = tokenization.convert_to_unicode(line[1])
       label = "0"
     else:
       text_a = tokenization.convert_to_unicode(line[3])
       label = tokenization.convert_to_unicode(line[1])
     examples.append(
         InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
   return examples
Example #4
 def get_dev_examples(self, data_dir):
   """See base class."""
   lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))
   examples = []
   for (i, line) in enumerate(lines):
     if i == 0:
       continue  # skip the header row
     guid = "dev-%d" % (i)
     language = tokenization.convert_to_unicode(line[0])
     # xnli.dev.tsv mixes all languages; keep only rows in self.language.
     if language != tokenization.convert_to_unicode(self.language):
       continue
     text_a = tokenization.convert_to_unicode(line[6])
     text_b = tokenization.convert_to_unicode(line[7])
     label = tokenization.convert_to_unicode(line[1])
     examples.append(
         InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
   return examples
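Several of these examples call `self._read_tsv`. In the BERT reference code it is a small helper shared via the base DataProcessor that loads the whole file into a list of column lists, roughly:

 import csv
 import tensorflow as tf  # TF1-style tf.gfile API, as in the original repo

 @classmethod
 def _read_tsv(cls, input_file, quotechar=None):
   """Reads a tab separated value file."""
   with tf.gfile.Open(input_file, "r") as f:
     reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
     return [line for line in reader]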
Example #5
 def get_train_examples(self, data_dir):
   """See base class."""
   lines = self._read_tsv(
       os.path.join(data_dir, "multinli",
                    "multinli.train.%s.tsv" % self.language))
   examples = []
   for (i, line) in enumerate(lines):
     if i == 0:
       continue  # skip the header row
     guid = "train-%d" % (i)
     text_a = tokenization.convert_to_unicode(line[0])
     text_b = tokenization.convert_to_unicode(line[1])
     label = tokenization.convert_to_unicode(line[2])
     # The machine-translated MultiNLI training data labels contradictions
     # "contradictory"; normalize to the canonical "contradiction".
     if label == tokenization.convert_to_unicode("contradictory"):
       label = tokenization.convert_to_unicode("contradiction")
     examples.append(
         InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
   return examples
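The "contradictory" to "contradiction" rewrite above only lines up with the rest of training because the processor's label list uses the canonical NLI names; in the reference XnliProcessor it is:

 def get_labels(self):
   """See base class."""
   return ["contradiction", "entailment", "neutral"]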
Example #6
 def get_test_examples(self, data_dir):
     """Gets a collection of `InputExample`s for prediction."""
     lines = self._read_tsv(data_dir)
     examples = []
     for (i, line) in enumerate(lines):
         text_a = tokenization.convert_to_unicode(line[-1])
         # `change_label_to_id` is a project-specific helper that maps the
         # row's label columns to a numeric id (definition not shown here).
         label = self.change_label_to_id(line[:-1])
         examples.append(InputExample(guid=i, text_a=text_a, label=label))
     return examples
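`change_label_to_id` here is project-specific and its definition is not shown. A plausible sketch, assuming the row's first column holds the label name and the processor defines `get_labels()` (both assumptions, not confirmed by the source):

 def change_label_to_id(self, label_columns):
   # Hypothetical helper: map the label column(s) of a row to an integer
   # id via the processor's label list. The real project may differ.
   label_map = {label: i for i, label in enumerate(self.get_labels())}
   return label_map[label_columns[0]]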
Example #7
    # Snippet begins mid-function: `processor` and `label_list` are defined
    # earlier in this function (not shown).
    tokenizer = tokenization.FullTokenizer(
        vocab_file=settings.bert_model_vocab_path, do_lower_case=True)
    if not os.path.exists(settings.train_tfrecord_path):
        train_examples = processor.get_train_examples(settings.train_data_path)
        file_based_convert_examples_to_features(train_examples, label_list,
                                                model_params.max_seq_length,
                                                tokenizer,
                                                settings.train_tfrecord_path)

    if not os.path.exists(settings.dev_tfrecord_path):
        dev_examples = processor.get_dev_examples(settings.dev_data_path)
        file_based_convert_examples_to_features(dev_examples, label_list,
                                                model_params.max_seq_length,
                                                tokenizer,
                                                settings.dev_tfrecord_path)

    if not os.path.exists(settings.test_tfrecord_path):
        test_examples = processor.get_test_examples(settings.test_data_path)
        file_based_convert_examples_to_features(test_examples, label_list,
                                                model_params.max_seq_length,
                                                tokenizer,
                                                settings.test_tfrecord_path)


if __name__ == '__main__':
    create_tfrecorf_file()
    # Quick tokenizer smoke test on a Chinese hashtag string.
    test_string = "##武汉加油##"
    print(tokenization.convert_to_unicode(test_string))
    tokenizer = tokenization.FullTokenizer(
        vocab_file=settings.bert_model_vocab_path, do_lower_case=True)
    print(tokenizer.tokenize(test_string))
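Once the three TFRecord files exist, the usual next step in the run_classifier.py workflow is to build an input function over them with `file_based_input_fn_builder`. A sketch reusing the settings/model_params names from above (`estimator` and `num_train_steps` are assumed to be configured elsewhere):

 # Sketch: consuming the training TFRecord written by create_tfrecorf_file().
 train_input_fn = file_based_input_fn_builder(
     input_file=settings.train_tfrecord_path,
     seq_length=model_params.max_seq_length,
     is_training=True,
     drop_remainder=True)
 estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)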