Example 1
    def prepare_dataset(self, sentences, max_seq_length, lower=False, train=True):
        """
        Prepare the dataset. Return a list with one entry per sentence, each
        containing:
            - the original words
            - segment indexes
            - word indexes
            - the input mask
            - tag indexes
        """
        data = []
        for s in sentences:
            # Each sentence is a sequence of rows whose first element is the
            # word and whose last element is the tag.
            if lower:
                string = [w[0].strip().lower() for w in s]
            else:
                string = [w[0].strip() for w in s]
            char_line = ' '.join(string)
            text = tokenization.convert_to_unicode(char_line)

            if train:
                tags = [w[-1] for w in s]
            else:
                # No gold tags at inference time: fall back to the 'O' label.
                tags = ['O' for _ in string]

            labels = ' '.join(tags)
            labels = tokenization.convert_to_unicode(labels)

            ids, mask, segment_ids, label_ids = self.convert_single_example(char_line=text,
                                                                            max_seq_length=max_seq_length,
                                                                            tokenizer=self.tokenizer,
                                                                            label_line=labels)
            data.append([string, segment_ids, ids, mask, label_ids])

        return data
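The shape of `sentences` is implied by the indexing above: w[0] is the word and w[-1] the tag. A minimal usage sketch, where `proc` is a hypothetical instance of the surrounding class with its tokenizer already loaded:

    # Hypothetical input: two sentences of (word, tag) pairs.
    sentences = [
        [("John", "B-PER"), ("lives", "O"), ("in", "O"), ("Paris", "B-LOC")],
        [("OK", "O")],
    ]
    data = proc.prepare_dataset(sentences, max_seq_length=128, lower=True, train=True)
    words, segment_ids, ids, mask, label_ids = data[0]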
Example 2
    def get_train_examples(self, data_dir):
        file_path = os.path.join(data_dir, 'train.csv')
        # Note: on pandas >= 1.3, use on_bad_lines='skip' instead of the
        # deprecated error_bad_lines=False.
        reader = pd.read_csv(file_path, encoding='utf-8', error_bad_lines=False)
        # If the data is not already in random order, remember to shuffle it.
        # The full dataset is large, so take a subset for a quick run.
        reader = reader.head(50000)
        # Shuffle the rows themselves: the original code shuffled a temporary
        # list (random.shuffle(reader.values.tolist())), which has no effect
        # on the DataFrame.
        reader = reader.sample(frac=1).reset_index(drop=True)
        print("train length:", len(reader))

        examples = []
        for _, row in reader.iterrows():
            line = row[0]
            split_line = line.strip().split("\t")
            if len(split_line) != 4:
                continue

            guid = split_line[0]
            text_a = tokenization.convert_to_unicode(split_line[1])
            text_b = tokenization.convert_to_unicode(split_line[2])
            label = split_line[3]
            examples.append(InputExample(guid=guid, text_a=text_a,
                                         text_b=text_b, label=label))
        return examples
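Each CSV row here is a single field holding a tab-separated record of the form guid, text_a, text_b, label, which is why rows that do not split into exactly four parts are skipped. A hypothetical record that would pass the filter:

    # Hypothetical cell content: one CSV field holding a tab-separated record.
    line = "q-0001\tHow do I open an account\tSteps to open an account\t1"
    guid, text_a, text_b, label = line.strip().split("\t")
    assert label == "1"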
Example 3
    def _create_examples(self, lines, set_type):
        """Creates examples from (label, text) lines; text_b is unused."""
        examples = []
        for (i, line) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)
            text_a = tokenization.convert_to_unicode(line[1])
            label = tokenization.convert_to_unicode(line[0])
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
        return examples
Example 4
    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            # Only the test set has a header row.
            if set_type == "test" and i == 0:
                continue
            guid = "%s-%s" % (set_type, i)
            if set_type == "test":
                text_a = tokenization.convert_to_unicode(line[1])
                label = "0"
            else:
                text_a = tokenization.convert_to_unicode(line[3])
                label = tokenization.convert_to_unicode(line[1])
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
        return examples
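Both _create_examples variants build InputExample objects. For reference, in BERT's run_classifier.py this is just a plain data holder; a sketch from memory:

    class InputExample(object):
        """A single training/test example for sequence classification."""

        def __init__(self, guid, text_a, text_b=None, label=None):
            self.guid = guid        # Unique id for the example.
            self.text_a = text_a    # Untokenized text of the first sequence.
            self.text_b = text_b    # Optional second sequence, for pair tasks.
            self.label = label      # Label; None (or a dummy) for test data.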
Example 5
    def get_train_examples(self, data_dir):
        import json
        import random
        file_path = os.path.join(data_dir, 'data_1017.txt')
        # Use a context manager so the file handle is closed after reading.
        with open(file_path, 'r', encoding='utf8') as f:
            data = f.readlines()
        random.shuffle(data)
        print("train length:", len(data))
        examples = []
        for i, row in enumerate(data):
            # One JSON object per line, with 'question', 'similar' and
            # 'label' keys.
            line = json.loads(row)
            guid = i
            text_a = tokenization.convert_to_unicode(line['question'])
            text_b = tokenization.convert_to_unicode(line['similar'])
            label = line['label']
            examples.append(InputExample(guid=guid, text_a=text_a,
                                         text_b=text_b, label=label))
        return examples
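The JSON-lines format this reader expects, shown with made-up values:

    import json

    # Hypothetical line from data_1017.txt: one JSON object per line.
    row = '{"question": "How do I reset my password?", "similar": "Password reset steps", "label": "1"}'
    line = json.loads(row)
    assert line['label'] == "1"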
Example 6
    def get_test_examples(self, data_dir):
        file_path = os.path.join(data_dir, 'test.csv')
        reader = pd.read_csv(file_path, encoding='utf-8', error_bad_lines=False)
        # The full dataset is large, so take a subset for a quick run, kept
        # separate from the dev-set data.
        reader = reader.head(10000)

        examples = []
        for _, row in reader.iterrows():
            line = row[0]
            split_line = line.strip().split("\t")
            if len(split_line) != 4:
                continue

            guid = split_line[0]
            text_a = tokenization.convert_to_unicode(split_line[1])
            text_b = tokenization.convert_to_unicode(split_line[2])
            label = split_line[3]
            examples.append(InputExample(guid=guid, text_a=text_a,
                                         text_b=text_b, label=label))
        return examples
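After loading, a quick sanity check over the first few examples is common. A self-contained sketch, where FakeExample is a stand-in for InputExample so the snippet runs on its own:

    from collections import namedtuple

    # FakeExample mirrors InputExample's fields for demonstration only.
    FakeExample = namedtuple("FakeExample", ["guid", "text_a", "text_b", "label"])
    examples = [FakeExample("q-0001", "How do I open an account",
                            "Steps to open an account", "1")]
    for ex in examples[:3]:
        print(ex.guid, ex.label, ex.text_a[:30], ex.text_b[:30])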