Example no. 1
    def trans_to_index(self, text_as, text_bs):
        """
        将输入转化为索引表示
        :param text_as: 输入
        :param text_bs:
        :return:
        """
        tokenizer = tokenization.FullTokenizer(vocab_file=self.__vocab_path,
                                               do_lower_case=True)
        input_ids = []
        input_masks = []
        segment_ids = []
        for text_a, text_b in zip(text_as, text_bs):
            text_a = tokenization.convert_to_unicode(text_a)
            text_b = tokenization.convert_to_unicode(text_b)
            tokens_a = tokenizer.tokenize(text_a)
            tokens_b = tokenizer.tokenize(text_b)

            # Check whether the combined length of the two sequences exceeds the maximum length
            self._truncate_seq_pair(tokens_a, tokens_b,
                                    self._sequence_length - 3)

            tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
            input_id = tokenizer.convert_tokens_to_ids(tokens)
            input_ids.append(input_id)
            input_masks.append([1] * len(input_id))
            segment_ids.append([0] * (len(tokens_a) + 2) + [1] *
                               (len(tokens_b) + 1))

        return input_ids, input_masks, segment_ids
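As a quick illustration of the pair layout built above (a minimal sketch in which whitespace splitting stands in for tokenizer.tokenize, so no vocab file is needed): the "- 3" in self._sequence_length - 3 reserves room for one [CLS] and two [SEP] tokens, and the segment ids mark which sentence each position belongs to.

tokens_a = "how old are you".split()     # stand-in for tokenizer.tokenize(text_a)
tokens_b = "what is your age".split()    # stand-in for tokenizer.tokenize(text_b)

tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
input_mask = [1] * len(tokens)
segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)

print(tokens)       # ['[CLS]', 'how', 'old', 'are', 'you', '[SEP]', 'what', 'is', 'your', 'age', '[SEP]']
print(segment_ids)  # [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]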
Example no. 2
def create_single_example(index, line, set_type):
    """Creates examples for the training and dev sets."""
    guid = "%s-%s" % (set_type, index)
    text_a = tokenization.convert_to_unicode(line[1])
    label = tokenization.convert_to_unicode(line[0])
    example = InputExample(guid=guid, text_a=text_a, text_b=None, label=label)
    return example
Example no. 3
def input_from_line(line, max_seq_length, tag_to_id):
    """
    Take sentence data and return an input for
    the training or the evaluation function.
    """
    string = [w[0].strip() for w in line]
    # chars = [char_to_id[f(w) if f(w) in char_to_id else '<UNK>']
    #         for w in string]
    char_line = ' '.join(string)  # join the characters with spaces
    text = tokenization.convert_to_unicode(char_line)

    tags = ['O' for _ in string]

    labels = ' '.join(tags)  # join the labels with spaces
    labels = tokenization.convert_to_unicode(labels)

    ids, mask, segment_ids, label_ids = convert_single_example(
        char_line=text,
        tag_to_id=tag_to_id,
        max_seq_length=max_seq_length,
        tokenizer=tokenizer,
        label_line=labels)
    import numpy as np
    segment_ids = np.reshape(segment_ids, (1, max_seq_length))
    ids = np.reshape(ids, (1, max_seq_length))
    mask = np.reshape(mask, (1, max_seq_length))
    label_ids = np.reshape(label_ids, (1, max_seq_length))
    return [string, segment_ids, ids, mask, label_ids]
Example no. 4
    def sentence_to_idx(self, text_a, text_b):
        """
        将分词后的句子转换成idx表示
        :return:
        """
        tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_path,
                                               do_lower_case=True)

        text_a = tokenization.convert_to_unicode(text_a)
        text_b = tokenization.convert_to_unicode(text_b)
        tokens_a = tokenizer.tokenize(text_a)
        tokens_b = tokenizer.tokenize(text_b)

        # Check whether the combined length of the two sequences exceeds the maximum length
        self._truncate_seq_pair(tokens_a, tokens_b, self.sequence_length - 3)

        tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
        input_id = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_id)
        segment_id = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)

        input_id, input_mask, segment_id = self.padding(
            input_id, input_mask, segment_id)

        return [input_id], [input_mask], [segment_id]
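self.padding is not shown in this snippet. A minimal sketch of what such a helper typically does (an assumption, not the original implementation): pad the three lists with zeros up to the fixed sequence length, or truncate them if they are longer.

def padding(input_id, input_mask, segment_id, sequence_length=128):
    # Sketch of a padding helper like self.padding (assumed, not the original):
    # pad with zeros up to sequence_length, or truncate if longer.
    if len(input_id) < sequence_length:
        pad_len = sequence_length - len(input_id)
        input_id = input_id + [0] * pad_len      # 0 is usually the [PAD] id
        input_mask = input_mask + [0] * pad_len  # padded positions get mask 0
        segment_id = segment_id + [0] * pad_len
    else:
        input_id = input_id[:sequence_length]
        input_mask = input_mask[:sequence_length]
        segment_id = segment_id[:sequence_length]
    return input_id, input_mask, segment_id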
Example no. 5
 def _create_example(self, lines, set_type):
     examples = []
     for (i, line) in enumerate(lines):
         guid = "%s-%s" % (set_type, i)
         text = tokenization.convert_to_unicode(line[1])
         label = tokenization.convert_to_unicode(line[0])
         examples.append(InputExample(guid=guid, text=text, label=label))
     return examples
Example no. 6
 def _create_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         guid = "%s-%s" % (set_type, i)
         text_a = tokenization.convert_to_unicode(line[1])
         text_b = None
         label = tokenization.convert_to_unicode(line[0])
         examples.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=text_b,
                          label=label))
     return examples
Example no. 7
    def get_train_examples(self, data_dir):
        """See base class."""
        train_dir = os.path.join(data_dir, "train_data.csv")
        train_data = pd.read_csv(train_dir,
                                 header=None,
                                 names=['location', 'result', 'fw', 'label'])
        print(train_data.shape)
        set_type = "train"
        examples = []
        for (i, line) in train_data.iterrows():
            if i == 0:
                continue
            guid = "%s-%s" % (set_type, tokenization.convert_to_unicode(
                str(i)))

            text_a = self.process_text(str(line["result"]))
            text_b = self.process_text(line["fw"])
            if set_type == "test":
                label = "contradiction"
            else:
                label = self.process_text(str(int(float(line["label"]))))
            examples.append(
                InputExample(guid=guid,
                             text_a=text_a,
                             text_b=text_b,
                             label=label))

        return examples
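The reader above loads train_data.csv with header=None and the column names ['location', 'result', 'fw', 'label']; note that with header=None the i == 0 row is an ordinary data row, so the loop skips the first record. A tiny self-contained stand-in for that file and read (the rows are invented):

import pandas as pd

# Write a made-up stand-in for train_data.csv (the real file is not shown here).
with open("train_data.csv", "w", encoding="utf-8") as f:
    f.write("loc_1,some result text,some fw text,1.0\n")
    f.write("loc_2,another result,another fw,0.0\n")

train_data = pd.read_csv("train_data.csv",
                         header=None,
                         names=['location', 'result', 'fw', 'label'])
print(train_data.shape)  # (2, 4)
# The label column is converted the same way the reader does it.
print([str(int(float(x))) for x in train_data["label"]])  # ['1', '0']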
Example no. 8
 def get_dev_examples(self, data_dir):
   """See base class."""
   lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))
   examples = []
   for (i, line) in enumerate(lines):
     if i == 0:
       continue
     guid = "dev-%d" % (i)
     language = tokenization.convert_to_unicode(line[0])
     if language != tokenization.convert_to_unicode(self.language):
       continue
     text_a = tokenization.convert_to_unicode(line[6])
     text_b = tokenization.convert_to_unicode(line[7])
     label = tokenization.convert_to_unicode(line[1])
     examples.append(
         InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
   return examples
Example no. 9
 def get_train_examples(self, data_dir):
   """See base class."""
   lines = self._read_tsv(
       os.path.join(data_dir, "multinli",
                    "multinli.train.%s.tsv" % self.language))
   examples = []
   for (i, line) in enumerate(lines):
     if i == 0:
       continue
     guid = "train-%d" % (i)
     text_a = tokenization.convert_to_unicode(line[0])
     text_b = tokenization.convert_to_unicode(line[1])
     label = tokenization.convert_to_unicode(line[2])
     if label == tokenization.convert_to_unicode("contradictory"):
       label = tokenization.convert_to_unicode("contradiction")
     examples.append(
         InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
   return examples
Example no. 10
 def _create_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         if i == 0:
             continue
         guid = "%s-%s" % (set_type, tokenization.convert_to_unicode(
             line[0]))
         text_a = tokenization.convert_to_unicode(line[1])
         text_b = tokenization.convert_to_unicode(line[2])
         if set_type == "test":
             label = "not_entailment"
         else:
             label = tokenization.convert_to_unicode(line[-1])
         examples.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=text_b,
                          label=label))
     return examples
Example no. 11
def create_training_instances(input_files, tokenizer, max_seq_length,
                              dupe_factor, short_seq_prob, masked_lm_prob,
                              max_predictions_per_seq, rng):
    """Create `TrainingInstance`s from raw text."""
    all_documents = [[]]

    # Input file format:
    # (1) One sentence per line. These should ideally be actual sentences, not
    # entire paragraphs or arbitrary spans of text. (Because we use the
    # sentence boundaries for the "next sentence prediction" task).
    # (2) Blank lines between documents. Document boundaries are needed so
    # that the "next sentence prediction" task doesn't span between documents.
    for input_file in input_files:
        with tf.gfile.GFile(input_file, "r") as reader:
            while True:
                line = reader.readline()
                if not FLAGS.spm_model_file:
                    line = tokenization.convert_to_unicode(line)
                if not line:
                    break
                if FLAGS.spm_model_file:
                    line = tokenization.preprocess_text(
                        line, lower=FLAGS.do_lower_case)
                else:
                    line = line.strip()

                # Empty lines are used as document delimiters
                if not line:
                    all_documents.append([])

                tokens = tokenizer.tokenize(line)
                if tokens:
                    all_documents[-1].append(tokens)

    # Remove empty documents
    all_documents = [x for x in all_documents if x]
    rng.shuffle(all_documents)

    vocab_words = list(tokenizer.vocab.keys())
    instances = []
    for _ in range(dupe_factor):
        for document_index in range(len(all_documents)):
            instances.extend(
                create_instances_from_document(all_documents, document_index,
                                               max_seq_length, short_seq_prob,
                                               masked_lm_prob,
                                               max_predictions_per_seq,
                                               vocab_words, rng))

    rng.shuffle(instances)
    return instances
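The comments in create_training_instances describe the expected corpus layout: one sentence per line, with a blank line separating documents so that next sentence prediction never crosses a document boundary. A made-up file in that format:

# A made-up corpus file in the layout create_training_instances expects.
sample_corpus = (
    "The first document starts with this sentence.\n"
    "It continues with a second sentence.\n"
    "\n"                                   # blank line = document boundary
    "The second document is a single sentence.\n"
)
with open("sample_corpus.txt", "w", encoding="utf-8") as f:
    f.write(sample_corpus)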
Example no. 12
 def _create_example(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         #skip header
         if i == 0 and set_type == 'test':
             continue
         guid = line[0]
         text_a = tokenization.convert_to_unicode(line[1])
         if set_type == "test":
             label = self.get_labels()[-1]
         else:
             try:
                 label = tokenization.convert_to_unicode(line[2])
             except IndexError:
                 logging.exception(line)
                 exit(1)
         examples.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=None,
                          label=label))
     return examples
Example no. 13
def prepare_dataset(sentences,
                    max_seq_length,
                    tag_to_id,
                    lower=False,
                    train=True):
    """
    Prepare the dataset. Return a list of lists of dictionaries containing:
        - word indexes
        - word char indexes
        - tag indexes
    """
    def f(x):
        return x.lower() if lower else x

    data = []
    for s in sentences:
        string = [w[0].strip() for w in s]
        char_line = ' '.join(string)  # join the characters with spaces
        text = tokenization.convert_to_unicode(char_line)

        if train:
            tags = [w[-1] for w in s]
        else:
            tags = ['O' for _ in string]

        labels = ' '.join(tags)  # join the labels with spaces
        labels = tokenization.convert_to_unicode(labels)

        ids, mask, segment_ids, label_ids = convert_single_example(
            char_line=text,
            tag_to_id=tag_to_id,
            max_seq_length=max_seq_length,
            tokenizer=tokenizer,
            label_line=labels)
        data.append([string, segment_ids, ids, mask, label_ids])

    return data
Example no. 14
  def _create_examples(self, lines, set_type):
    examples = []
    for line in lines:
      qid = line['id']
      question = tokenization.convert_to_unicode(line['question']['stem'])
      answers = np.array([
        tokenization.convert_to_unicode(choice['text'])
        for choice in sorted(
            line['question']['choices'],
            key=lambda c: c['label'])
      ])

      #TODO process_text
      # the test set has no answer key so use 'A' as a dummy label
      label = self.LABELS.index(line.get('answerKey', 'A'))

      examples.append(
        InputExample(
          qid=qid,
          question=question,
          answers=answers,
          label=label))

    return examples
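Each line handled by this reader is a dict with an 'id', a 'question' holding a 'stem' and lettered 'choices', and, outside the test set, an 'answerKey' (a CommonsenseQA-style record). A made-up record in that shape, showing how sorting by the choice label restores A/B/C order:

# Made-up record; field values are invented for illustration only.
line = {
    "id": "example-0",
    "question": {
        "stem": "Where would you put a book you are reading?",
        "choices": [
            {"label": "B", "text": "shelf"},
            {"label": "A", "text": "nightstand"},
            {"label": "C", "text": "backpack"},
        ],
    },
    "answerKey": "A",
}
answers = [c["text"] for c in sorted(line["question"]["choices"],
                                     key=lambda c: c["label"])]
print(answers)  # ['nightstand', 'shelf', 'backpack']
# Assuming self.LABELS is something like ["A", "B", "C", "D", "E"]:
LABELS = ["A", "B", "C", "D", "E"]
print(LABELS.index(line.get("answerKey", "A")))  # 0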
Example no. 15
    def sentence_to_idx(self, text):
        """
        将分词后的句子转换成idx表示
        :return:
        """
        tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_path,
                                               do_lower_case=True)

        text = tokenization.convert_to_unicode(text)
        tokens = tokenizer.tokenize(text)
        tokens = ["[CLS]"] + tokens + ["[SEP]"]
        input_id = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_id)
        segment_id = [0] * len(input_id)

        input_id, input_mask, segment_id = self.padding(
            input_id, input_mask, segment_id)

        return [input_id], [input_mask], [segment_id]
Example no. 16
    def trans_to_index(self, inputs):
        """
        将输入转化为索引表示
        :param inputs: 输入
        :return:
        """
        tokenizer = tokenization.FullTokenizer(vocab_file=self.__vocab_path,
                                               do_lower_case=True)
        input_ids = []
        input_masks = []
        segment_ids = []
        for text in inputs:
            text = tokenization.convert_to_unicode(text)
            tokens = tokenizer.tokenize(text)
            tokens = ["[CLS]"] + tokens + ["[SEP]"]
            input_id = tokenizer.convert_tokens_to_ids(tokens)
            input_ids.append(input_id)
            input_masks.append([1] * len(input_id))
            segment_ids.append([0] * len(input_id))

        return input_ids, input_masks, segment_ids
Example no. 17
 def _to_example(sentences):
     """
     sentences to InputExample
     :param sentences: list of strings
     :return: generator of InputExample
     """
     import re
     guid = 0
     for ss in sentences:
         line = tokenization.convert_to_unicode(ss)
         if not line:
             continue
         line = line.strip()
         text_a = None
         text_b = None
         m = re.match(r"^(.*) \|\|\| (.*)$", line)
         if m is None:
             text_a = line
         else:
             text_a = m.group(1)
             text_b = m.group(2)
         yield InputExample(guid=guid, text_a=text_a, text_b=text_b)
         guid += 1
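_to_example accepts either a single sentence or a pair joined by " ||| "; the regex splits a pair into text_a and text_b. A small demonstration with made-up strings:

import re

# Made-up inputs: a single sentence and a " ||| "-separated pair.
for line in ["Just one sentence.",
             "This is text_a ||| and this is text_b."]:
    m = re.match(r"^(.*) \|\|\| (.*)$", line)
    if m is None:
        text_a, text_b = line, None
    else:
        text_a, text_b = m.group(1), m.group(2)
    print(text_a, "|", text_b)
# Just one sentence. | None
# This is text_a | and this is text_b.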
Example no. 18
 def process_text(self, text):
   if self.use_spm:
     return tokenization.preprocess_text(text, lower=self.do_lower_case)
   else:
     return tokenization.convert_to_unicode(text)