def _create_example(self, lines, set_type):
     examples = []
     for (i, line) in enumerate(lines):
         guid = "%s-%s" % (set_type, i)
         text = tokenization.convert_to_unicode(line[1])
         label = tokenization.convert_to_unicode(line[0])
         if i == 0:
             print(label)
         examples.append(InputExample(guid=guid, text=text, label=label))
     return examples
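
Every snippet on this page leans on `tokenization.convert_to_unicode` and an `InputExample` container from the BERT codebase, neither of which is shown here. For reference, a minimal Python 3 sketch along the lines of the google-research/bert definitions (treat the exact signatures as assumptions) is:

def convert_to_unicode(text):
    """Converts `text` to a unicode string, assuming UTF-8 for byte input."""
    if isinstance(text, str):
        return text
    if isinstance(text, bytes):
        return text.decode("utf-8", "ignore")
    raise ValueError("Unsupported string type: %s" % (type(text)))


class InputExample(object):
    """A single training/test example for (pair) sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        self.guid = guid      # unique id for the example
        self.text_a = text_a  # first (or only) sequence
        self.text_b = text_b  # optional second sequence for pair tasks
        self.label = label    # string label; often a dummy value for test data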
Example #2
 def _create_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         if i == 0:
             continue
         guid = "%s-%s" % (set_type, tokenization.convert_to_unicode(
             line[0]))
         text_a = tokenization.convert_to_unicode(line[8])
         text_b = tokenization.convert_to_unicode(line[9])
         label = tokenization.convert_to_unicode(line[-1])
         examples.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=text_b,
                          label=label))
     return examples
Example #3
 def _create_examples(self, lines, set_type):
   """Creates examples for the training and dev sets."""
   examples = []
   for (i, line) in enumerate(lines):
     # Only the test set has a header
     if set_type == "test" and i == 0:
       continue
     guid = "%s-%s" % (set_type, i)
     if set_type == "test":
       text_a = tokenization.convert_to_unicode(line[0])
       label = "0"
     else:
       text_a = tokenization.convert_to_unicode(line[0])
       label = tokenization.convert_to_unicode(line[1])
     examples.append(
         InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
   return examples
Example #4
 def get_dev_examples(self, data_dir):
   """See base class."""
   lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))
   examples = []
   for (i, line) in enumerate(lines):
     if i == 0:
       continue
     guid = "dev-%d" % (i)
     language = tokenization.convert_to_unicode(line[0])
     if language != tokenization.convert_to_unicode(self.language):
       continue
     text_a = tokenization.convert_to_unicode(line[6])
     text_b = tokenization.convert_to_unicode(line[7])
     label = tokenization.convert_to_unicode(line[1])
     examples.append(
         InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
   return examples
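
The `_read_tsv` helper used by several of these processors is also not shown; in the BERT reference code it is a classmethod on the `DataProcessor` base class and is roughly a thin wrapper around `csv.reader` with a tab delimiter, sketched here as a plain function:

import csv

import tensorflow as tf


def _read_tsv(input_file, quotechar=None):
    """Reads a tab-separated-value file into a list of rows."""
    with tf.gfile.Open(input_file, "r") as f:
        reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
        return list(reader)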
Example #5
 def _create_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         if len(line) != 4:
             print(line)
         guid = "%s-%s-%s" % (
             set_type, tokenization.convert_to_unicode(line[0]), str(i))
         target = tokenization.convert_to_unicode(line[1])
         text = tokenization.convert_to_unicode(line[2])
         label = tokenization.convert_to_unicode(line[3])
         examples.append(
             InputExample(guid=guid,
                          text_a=target,
                          text_b=text,
                          label=label))
     return examples
Example #6
 def _create_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         # skip header
         if i == 0:
             continue
         guid = int(line[0])
         group = line[1]
         text_left = tokenization.convert_to_unicode(line[2])
         text_right = tokenization.convert_to_unicode(line[3])
         label = tokenization.convert_to_unicode(line[4])
         negs = [tokenization.convert_to_unicode(neg) for neg in line[5:] if neg.strip()]
         examples.append(
             InputExample(guid=guid, group=group, text_left=text_left, text_right=text_right, label=label,
                          negs=negs))
     return examples
Example #7
def create_examples(data, label_list, set_type, labels_available=True):
    """Creates examples for the training, dev and test sets.
       Data:list
    """
    examples = []
    for (i, line) in enumerate(data):        
        guid = "%s" % (i)
        text_a = tokenization.convert_to_unicode(line[0].strip("\n"))
        if labels_available:
            # NOTE: assumption - the label column is line[1]; the original code read
            # line[0] here, i.e. the same column that text_a is built from above.
            labels = tokenization.convert_to_unicode(line[1])
            labels = labels.split(", ")  # format: a list such as ['a'] or ['a', 'b']
            labels = Multi_hot_label(labels, label_list)
        else:
            labels = [0]*len(label_list)
        examples.append(
            InputExample(guid=guid, text_a=text_a, labels=labels))
    return examples
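
`Multi_hot_label` is not defined in this snippet; presumably it expands the list of label strings into a multi-hot vector over `label_list`. A hypothetical sketch of such a helper:

def Multi_hot_label(labels, label_list):
    """Hypothetical helper: map e.g. ['a', 'c'] onto [1, 0, 1, ...] over label_list."""
    label_map = {label: idx for idx, label in enumerate(label_list)}
    multi_hot = [0] * len(label_list)
    for label in labels:
        multi_hot[label_map[label.strip()]] = 1  # unknown labels would raise a KeyError
    return multi_hot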
Example #8
 def get_dev_examples(self, data_dir):
     """See base class."""
     lines = self._read_tsv(os.path.join(curr_path, data_dir))
     examples = []
     for (i, line) in enumerate(lines):
         if i == 0:
             continue
         guid = "dev-%d" % (i)
         text_a = tokenization.convert_to_unicode(line[2])
         text_b = tokenization.convert_to_unicode(line[3])
         label = tokenization.convert_to_unicode(line[1])
         examples.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=text_b,
                          label=label))
     return examples
Example #9
 def _create_examples(self, lines, set_type):
   """Creates examples for the training and dev sets."""
   examples = []
   print("length of lines:", len(lines))
   for (i, line) in enumerate(lines):
     if i == 0:
       continue
     guid = "%s-%s" % (set_type, i)
     try:
       label = tokenization.convert_to_unicode(line[2])
       text_a = tokenization.convert_to_unicode(line[0])
       text_b = tokenization.convert_to_unicode(line[1])
       examples.append(
           InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
     except Exception:  # pylint: disable=broad-except
       print("###error.i:", i, line)
   return examples
Example #10
    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        sentence1_index = 0
        sentence2_index = 0
        label0_index = None
        label1_index = None
        label2_index = None
        for (i, line) in enumerate(lines):

            if i == 0:
                # Identify the sentence index
                for j, token in enumerate(line):
                    if token.strip() == "sentence1":
                        sentence1_index = j
                    elif token.strip() == "sentence2":
                        sentence2_index = j
                    elif token.strip() == "label0_prob":
                        label0_index = j
                    elif token.strip() == "label1_prob":
                        label1_index = j
                    elif token.strip() == "label2_prob":
                        label2_index = j
                continue

            guid = "%s-%s" % (set_type, tokenization.convert_to_unicode(
                line[0]))
            text_a = tokenization.convert_to_unicode(line[sentence1_index])
            text_b = tokenization.convert_to_unicode(line[sentence2_index])
            # Explicit None checks: a valid column index of 0 would be falsy.
            if (label0_index is not None and label1_index is not None
                    and label2_index is not None):
                # The three indices correspond to probabilities of contradiction,
                # entailment and neutral.
                label = [
                    float(line[label0_index]),
                    float(line[label1_index]),
                    float(line[label2_index])
                ]
            else:
                label = [1.0, 0, 0]

            examples.append(
                InputExample(guid=guid,
                             text_a=text_a,
                             text_b=text_b,
                             label=label))
        return examples
Example #11
 def get_train_examples(self, data_dir):
     file_path = os.path.join(data_dir, 'train.txt')
     train_df = pd.read_csv(file_path,
                            encoding='utf-8',
                            sep='\t',
                            header=None)
     train_data = []
     for index, train in enumerate(train_df.values):
         guid = 'train-%d' % index
         text_a = tokenization.convert_to_unicode(str(train[0]))
         text_b = tokenization.convert_to_unicode(str(train[1]))
         label = str(train[2])
         train_data.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=text_b,
                          label=label))
     return train_data
Example #12
 def load_test_data(self, path):
     x1, x2, y = [], [], []
     for data in self.read_csv(path):
         x1.append(convert_to_unicode(data[1]))
         x2.append(None)
         y.append(None)  # placeholder: the test data may carry no ground-truth label, so this can be anything
     return x1, x2, y
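
`self.read_csv` is not shown either; a plausible minimal reading (the delimiter and column layout are assumptions) is a generator over parsed rows, sketched as:

import csv


def read_csv(self, path):
    """Hypothetical sketch of the read_csv method: yield each row as a list of fields."""
    with open(path, 'r', encoding='utf-8') as f:
        for row in csv.reader(f):
            yield row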
Example #13
    def _create_examples(self, lines):
        """Create examples data set"""
        examples = []
        for index, line in enumerate(lines):
            guid = 'train-%d' % index
            split_line = line.strip().split(',')
            if len(split_line) == 3 and (split_line[0] == '1'
                                         or split_line[0] == '0'):
                text_a = tokenization.convert_to_unicode(split_line[1])
                text_b = tokenization.convert_to_unicode(split_line[2])
                label = split_line[0]
                examples.append(
                    InputExample(guid=guid,
                                 text_a=text_a,
                                 text_b=text_b,
                                 label=label))

        return examples
Example #14
 def _create_example(self, lines, labels, set_type):
     examples = []
     idx = 0
     for line, label in zip(lines, labels):
         guid = "{}-{}".format(set_type, idx)
         text_a = tokenization.convert_to_unicode(line)
         examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
         idx += 1
     return examples
Example #15
def convert_to_examples(texts):
    examples = []
    for _id, text in enumerate(texts):
        line = tokenization.convert_to_unicode(text)
        if not line:
            break
        line = line.strip()
        examples.append(InputExample(unique_id=_id, text_a=line, text_b=None))
    return examples
Example #16
    def get_example(self, index):
        data = self.get_data(index)
        # left, pron, right, candidates, selected
        src_left = self.tokenizer.tokenize(convert_to_unicode(data.left))
        src_right = self.tokenizer.tokenize(convert_to_unicode(data.right))
        pronoun_tokens = self.tokenizer.tokenize(convert_to_unicode(data.pron))

        candidates = [
            self.tokenizer.tokenize(convert_to_unicode(c))
            for c in data.candidates.split(',')
        ]
        selected_idx = int(data.selected)
        assert len(candidates) <= self.max_candidates, data
        candidates.extend([None] * (self.max_candidates - len(candidates)))
        selected = [0] * len(candidates)
        selected[selected_idx] = 1
        return self._make_inputs(src_left, src_right, pronoun_tokens,
                                 candidates, selected)
Example #17
 def get_dev_examples(self, data_dir):
     file_path = os.path.join(data_dir, 'dev.txt')
     dev_df = pd.read_csv(file_path,
                          encoding='utf-8',
                          sep='\t',
                          header=None)
     dev_data = []
     for index, dev in enumerate(dev_df.values):
         guid = 'dev-%d' % index
         text_a = tokenization.convert_to_unicode(str(dev[1]))
         text_b = tokenization.convert_to_unicode(str(dev[2]))
         label = str(dev[3])
         dev_data.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=text_b,
                          label=label))
     return dev_data
Example #18
 def get_train_examples(self, data_dir):
   """See base class."""
   lines = self._read_tsv(
       os.path.join(data_dir, "multinli",
                    "multinli.train.%s.tsv" % self.language))
   examples = []
   for (i, line) in enumerate(lines):
     if i == 0:
       continue
     guid = "train-%d" % (i)
     text_a = tokenization.convert_to_unicode(line[0])
     text_b = tokenization.convert_to_unicode(line[1])
     label = tokenization.convert_to_unicode(line[2])
     if label == tokenization.convert_to_unicode("contradictory"):
       label = tokenization.convert_to_unicode("contradiction")
     examples.append(
         InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
   return examples
Example #19
def bert_tokenize(vocab_fname, corpus_fname, output_fname):
    tokenizer = FullTokenizer(vocab_file=vocab_fname, do_lower_case=False)
    with open(corpus_fname, 'r', encoding='utf-8') as f1, \
            open(output_fname, 'w', encoding='utf-8') as f2:
        for line in f1:
            sentence = line.replace('\n', '').strip()
            tokens = tokenizer.tokenize(convert_to_unicode(sentence))
            tokenized_sent = ' '.join(tokens)
            f2.write(tokenized_sent + '\n')
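
A hypothetical invocation of `bert_tokenize` (all three file paths are placeholders):

# Hypothetical usage; vocab.txt, corpus.txt and corpus_tokenized.txt are placeholders.
bert_tokenize(vocab_fname='vocab.txt',
              corpus_fname='corpus.txt',
              output_fname='corpus_tokenized.txt')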
Example #20
 def get_train_examples(self, data_dir):
     file_path = os.path.join(data_dir, 'train.txt')
     f = open(file_path, 'r', encoding='utf-8')
     train_data = []
     index = 0
     for line in f.readlines():
         guid = 'train-%d' % index  # guid distinguishes each example
         line = line.replace("\n", "").split("\t")
         text_a = tokenization.convert_to_unicode(line[1])  # sentence a
         text_b = tokenization.convert_to_unicode(line[2])  # sentence b
         label = line[0]  # the class label for this text pair
         train_data.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=text_b,
                          label=label))
         index += 1
     return train_data
Example #21
 def get_test_examples(self, data_dir):
     file_path = os.path.join(data_dir, 'test.txt')
     f = open(file_path, 'r', encoding='utf-8')
     test_data = []
     index = 0
     for line in f.readlines():
         guid = 'test-%d' % index
         line = line.replace("\n", "").split("\t")
         text_a = tokenization.convert_to_unicode(line[1])
         text_b = tokenization.convert_to_unicode(line[2])
         label = line[0]
         test_data.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=text_b,
                          label=label))
         index += 1
     return test_data
Example #22
    def get_dev_examples(self, data_dir):
        """
        获取验证数据
        :param data_dir: 文件文件夹路径
        :return:
        """
        # 1. 加载训练数据
        lines = self._read_txt(data_dir, file_name=FLAGS.dev_file_names)

        # 2. 数据遍历处理并转换
        examples = []
        for idx, line in enumerate(lines):
            guid = "dev-%d" % idx
            text_a = tokenization.convert_to_unicode(line[0])
            label = tokenization.convert_to_unicode(line[1])
            examples.append(InputExample(guid=guid, text_a=text_a,
                                         label=label))
        return examples
Example #23
 def _create_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         if i == 0:
             continue
         guid = line[1]
         text_a = tokenization.convert_to_unicode(line[2])
         text_b = tokenization.convert_to_unicode(line[3])
         if set_type == "test":
             label = self.get_labels()[-1]
         else:
             label = tokenization.convert_to_unicode(line[0])
         examples.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=text_b,
                          label=label))
     return examples
Example #24
    def convert_line(line, label_list, max_seq_length, tokenizer):
        """Function to convert a line that should be predicted into BERT
    input features.
    """
        label = tokenization.convert_to_unicode("0")  # Mock label
        text_a = tokenization.convert_to_unicode(line)
        example = rc.InputExample(guid=0,
                                  text_a=text_a,
                                  text_b=None,
                                  label=label)
        feature = rc.convert_single_example(0, example, label_list,
                                            max_seq_length, tokenizer)

        input_ids = np.reshape([feature.input_ids], (1, max_seq_length))
        input_mask = np.reshape([feature.input_mask], (1, max_seq_length))
        segment_ids = np.reshape([feature.segment_ids], (1, max_seq_length))  # keep the batch dimension, like input_ids/input_mask
        label_ids = [feature.label_id]

        return input_ids, input_mask, segment_ids, label_ids
Example #25
 def _create_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         # skip header
         if i == 0:
             continue
         guid = line[0]
         text_a = tokenization.convert_to_unicode(line[1])
         if set_type == "test":
             label = self.get_labels()[-1]
         else:
             try:
                 label = tokenization.convert_to_unicode(line[2])
             except IndexError:
                 logging.exception(line)
                 exit(1)
         examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
     return examples
Example #26
    def get_train_examples(self, data_dir):
        file_path = os.path.join(data_dir, 'train.csv')
        with open(file_path, 'r', encoding="utf-8") as f:
            reader = f.readlines()

        examples = []
        for index, line in enumerate(reader):
            guid = 'train-%d' % index
            split_line = line.strip().split("\t")
            # print(split_line)
            text_a = tokenization.convert_to_unicode(split_line[1])
            text_b = tokenization.convert_to_unicode(split_line[2])
            label = split_line[3]
            examples.append(
                InputExample(guid=guid,
                             text_a=text_a,
                             text_b=text_b,
                             label=label))
        return examples
Example #27
 def _create_examples(self, lines, set_type):
     '''create examples for training and dev'''
     examples = []
     for (i, line) in enumerate(lines):
         guid = '%s-%s' % (set_type, i)
         # title + content
         try:
             text_a = tokenization.convert_to_unicode(line[1] + line[2])
             if set_type == "test":
                 label = u"美食"
             else:
                 label = tokenization.convert_to_unicode(line[0])
                 if label not in self.labels:
                     continue
         except Exception as ex:
             continue
         examples.append(InputExample(guid=guid, text_a=text_a,
                                      label=label))
     return examples
Example #28
def preproc_doc(document):
    """Convert document to list of TF Examples for binary order classification.

  Args:
      document: a wikipedia article as a list of lines

  Returns:
      A list of tfexamples of binary orderings of pairs of sentences in the
      document. The tfexamples are serialized to string to be written directly
      to TFRecord.
  """

    # Each document is a list of lines
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file,
        do_lower_case=FLAGS.do_lower_case)  # just use lower case?

    # set a random seed for reproducibility
    hash_object = hashlib.md5(document[0])
    rng = random.Random(int(hash_object.hexdigest(), 16) % (10**8))

    # Each document is composed of a list of text lines. Each text line is a
    # paragraph. We split the line into sentences but keep the paragraph grouping.
    # The utility functions below expect the document to be split by paragraphs.
    list_of_paragraphs = []
    for line in document:  # each line is a story
        line = tokenization.convert_to_unicode(line)
        line = line.replace(u"\u2018", "'").replace(u"\u2019", "'")
        sents = split_line_by_sentences(line)
        sent_tokens = [tokenizer.tokenize(sent) for sent in sents if sent]
        list_of_paragraphs.append(sent_tokens)

    # In case of any empty paragraphs, remove them.
    list_of_paragraphs = [x for x in list_of_paragraphs if x]

    # Convert the list of paragraphs into TrainingInstance object
    # See preprocessing_utils.py for definition
    if FLAGS.format == FORMAT_BINARY:
        instances = create_instances_from_document(list_of_paragraphs,
                                                   FLAGS.max_seq_length, rng)
    elif FLAGS.format == FORMAT_PARAGRAPH:
        instances = create_paragraph_order_from_document(
            list_of_paragraphs, FLAGS.max_seq_length, rng)

    # Convert token lists into ids and add any needed tokens and padding for BERT
    tf_examples = [
        convert_instance_to_tf_example(tokenizer, instance,
                                       FLAGS.max_seq_length)[0]
        for instance in instances
    ]

    # Serialize TFExample for writing to file.
    tf_examples = [example.SerializeToString() for example in tf_examples]

    return tf_examples
Example #29
    def get_train_examples(self, data_dir):
        """See base class."""
        lines = self._read_tsv(os.path.join(data_dir, "sct_v2.train.tsv"))
        examples = []
        for (i, line) in enumerate(lines):
            if i == 0:
                continue
            guid = "train-%s" % (line[0])
            label = int(tokenization.convert_to_unicode(line[1]))

            text_a = tokenization.convert_to_unicode(line[2])

            text_b_pos = tokenization.convert_to_unicode(line[3])
            text_b_neg = tokenization.convert_to_unicode(line[4])

            vs_sent1 = self._string_to_array(line[5][1:-1])
            vs_sent2 = self._string_to_array(line[6][1:-1])
            vs_sent3 = self._string_to_array(line[7][1:-1])
            vs_sent4 = self._string_to_array(line[8][1:-1])

            vs_sent5_pos = self._string_to_array(line[9][1:-1])
            vs_sent5_neg = self._string_to_array(line[10][1:-1])

            cs_dist_pos = self._string_to_array(line[11][1:-1])
            cs_dist_neg = self._string_to_array(line[12][1:-1])

            examples.append(
                InputExample(guid=guid,
                             label=label,
                             text_a=text_a,
                             text_b_pos=text_b_pos,
                             text_b_neg=text_b_neg,
                             vs_sent1=vs_sent1,
                             vs_sent2=vs_sent2,
                             vs_sent3=vs_sent3,
                             vs_sent4=vs_sent4,
                             vs_sent5_pos=vs_sent5_pos,
                             vs_sent5_neg=vs_sent5_neg,
                             cs_dist_pos=cs_dist_pos,
                             cs_dist_neg=cs_dist_neg))

        return examples
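
`self._string_to_array` is not shown; since the surrounding brackets are already stripped with `[1:-1]`, it presumably just splits the remaining text into a float vector. A hypothetical sketch, assuming comma- or whitespace-separated values:

import numpy as np


def _string_to_array(self, text):
    """Hypothetical helper: parse '0.1, 0.2, 0.3' (brackets already removed) into a float array."""
    values = [float(v) for v in text.replace(',', ' ').split()]
    return np.array(values, dtype=np.float32)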
Example #30
def read_examples(input_file):
    """Read a list of `InputExample`s from an input file."""
    examples = []
    with tf.gfile.GFile(input_file, "r") as reader:
        while True:
            line = tokenization.convert_to_unicode(reader.readline())
            if not line:
                break
            line = line.strip()
            examples.append(line)
    return examples
Example #31
 def _create_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         if i == 0:
             continue
         guid = "%s-%s" % (set_type, i)
         split_line = line.strip().split("\t")
         text_a = tokenization.convert_to_unicode(split_line[1])
         text_b = None
         if set_type == "test":
             label = "体育"
         else:
             label = tokenization.convert_to_unicode(split_line[0])
         examples.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=text_b,
                          label=label))
     return examples