Example #1
import codecs

import numpy as np

# Assumes `dict_path`, `Tokenizer`, `seq_padding`, and `get_data_bio` are
# defined/imported elsewhere in the original project.
class data_generator:
    def __init__(self, data, batch_size=64):
        self.data = data
        self.batch_size = batch_size
        self.steps = len(self.data) // self.batch_size
        if len(self.data) % self.batch_size != 0:
            self.steps += 1
        self.token_dict = {}
        with codecs.open(dict_path, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                self.token_dict[token] = len(self.token_dict)
        self.tokenizer = Tokenizer(self.token_dict)
        self.cache_data = []
        # The vocab file is read a second time here; for a standard BERT
        # vocab this set equals set(self.token_dict).
        self.vocabs = set()
        with open(dict_path, encoding='utf8') as f:
            for l in f:
                self.vocabs.add(l.replace('\n', ''))

    def init_cache_data(self):
        # The original called a nonexistent `get_next`; `__iter__` is the
        # iterator this class actually defines.
        for i, t in enumerate(self.__iter__()):
            if i >= self.steps:
                break
            self.cache_data.append(t)

    def __len__(self):
        return self.steps

    def encode(self, text):
        # Character-level tokenization: map OOV characters to [UNK] and wrap
        # the sequence with BERT's [CLS]/[SEP] markers.
        tokens = ['[CLS]'] + [ch if ch in self.vocabs else '[UNK]' for ch in text] + ['[SEP]']
        return self.tokenizer._convert_tokens_to_ids(tokens), [0] * len(tokens)

    def __iter__(self):
        # Endless generator; meant to be consumed with a steps-per-epoch
        # limit (e.g. by Keras' fit_generator).
        while True:
            idxs = list(range(len(self.data)))
            np.random.shuffle(idxs)
            BERT_INPUT0, BERT_INPUT1, BIO = [], [], []
            for i in idxs:
                d = self.data[i]
                text = d['text']
                or_text = text
                indices, segments = self.encode(or_text)
                entity = d['entity']
                # '^' pads the text on both sides so the BIO labels line up
                # with the [CLS]/[SEP] tokens added by encode().
                text = '^' + text + '^'
                bio = get_data_bio(text, entity)
                BERT_INPUT0.append(indices)
                BERT_INPUT1.append(segments)
                BIO.append(bio)
                if len(BERT_INPUT1) == self.batch_size or i == idxs[-1]:
                    BERT_INPUT0 = np.array(seq_padding(BERT_INPUT0))
                    BERT_INPUT1 = np.array(seq_padding(BERT_INPUT1))
                    BIO = np.array(seq_padding(BIO))
                    yield [BERT_INPUT0, BERT_INPUT1, BIO], None
                    BERT_INPUT0, BERT_INPUT1, BIO = [], [], []
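
A minimal usage sketch, not part of the original source: the `([inputs], None)` batches match a Keras model whose loss is added inside the graph, so the generator can feed `fit_generator` directly. `train_data` and `model` are placeholders.

# Hypothetical usage; train_data and model are assumed to exist.
train_gen = data_generator(train_data, batch_size=64)
model.fit_generator(iter(train_gen),
                    steps_per_epoch=len(train_gen),
                    epochs=5)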
Example #2
# Assumes `codecs`, `np` (NumPy), `dict_path`, `Tokenizer`, `seq_padding`,
# and `deepcopy` (from copy) are imported/defined elsewhere.
class test_data_generator:
    def __init__(self, data, batch_size=64):
        self.data = data
        self.batch_size = batch_size
        self.steps = len(self.data) // self.batch_size
        if len(self.data) % self.batch_size != 0:
            self.steps += 1
        self.token_dict = {}
        with codecs.open(dict_path, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                self.token_dict[token] = len(self.token_dict)
        self.tokenizer = Tokenizer(self.token_dict)
        self.cache_data = []
        self.vocabs = set()
        with open(dict_path, encoding='utf8') as f:
            for l in f:
                self.vocabs.add(l.replace('\n', ''))

    def __len__(self):
        return self.steps

    def encode(self, text):
        # Character-level tokenization with [CLS]/[SEP] wrapping; OOV
        # characters become [UNK].
        tokens = ['[CLS]'] + [ch if ch in self.vocabs else '[UNK]' for ch in text] + ['[SEP]']
        return self.tokenizer._convert_tokens_to_ids(tokens), [0] * len(tokens)

    def __iter__(self):
        while True:
            # No shuffling for the test split: order is preserved.
            idxs = list(range(len(self.data)))
            BERT_INPUT0, BERT_INPUT1, S, ORDATA = [], [], [], []
            for i in idxs:
                d = deepcopy(self.data[i])
                text = d['text']
                or_text = text
                d['text'] = '^' + text + '^'
                indices, segments = self.encode(or_text)
                BERT_INPUT0.append(indices)
                BERT_INPUT1.append(segments)
                s = [0] * len(text)
                S.append(s)
                ORDATA.append(d)
                if len(S) == self.batch_size or i == idxs[-1]:
                    BERT_INPUT0 = np.array(seq_padding(BERT_INPUT0))
                    BERT_INPUT1 = np.array(seq_padding(BERT_INPUT1))
                    S = np.array(seq_padding(S))
                    yield [BERT_INPUT0, BERT_INPUT1, S, ORDATA]
                    BERT_INPUT0, BERT_INPUT1, S, ORDATA = [], [], [], []
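
A hedged sketch of pulling a single test batch; `test_data` is a placeholder list of dicts with a 'text' key.

# Hypothetical usage; test_data is assumed.
test_gen = test_data_generator(test_data, batch_size=64)
x0, x1, s, ordata = next(iter(test_gen))  # one padded batch plus raw records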
Example #3
# Standalone variant of the encoder above. Assumes `codecs`, `dict_path`,
# and `Tokenizer` are available; note that it re-reads the vocab file on
# every call, which is wasteful if encode() is called in a loop.
def encode(text):
    vocabs = set()
    with open(dict_path, encoding='utf8') as f:
        for l in f:
            vocabs.add(l.replace('\n', ''))

    token_dict = {}
    with codecs.open(dict_path, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            token_dict[token] = len(token_dict)
    tokenizer = Tokenizer(token_dict)
    tokens = ['[CLS]'] + [ch if ch in vocabs else '[UNK]' for ch in text] + ['[SEP]']
    return tokenizer._convert_tokens_to_ids(tokens), [0] * len(tokens)
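
A hypothetical call, assuming `dict_path` points at a character-level BERT vocab (the function tokenizes character by character, so it suits Chinese text):

# Illustrative input only.
token_ids, segment_ids = encode('今天天气不错')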
Example #4
# Assumes `codecs`, `np` (NumPy), `dict_path`, `Tokenizer`, `seq_padding`,
# `predicate2id`, and `choice` (presumably random.choice) are
# imported/defined elsewhere.
class data_generator:
    def __init__(self, data, batch_size=64):
        self.data = data
        self.batch_size = batch_size
        self.steps = len(self.data) // self.batch_size
        if len(self.data) % self.batch_size != 0:
            self.steps += 1
        self.token_dict = {}
        with codecs.open(dict_path, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                self.token_dict[token] = len(self.token_dict)
        self.tokenizer = Tokenizer(self.token_dict)
        self.cache_data = []
        self.vocabs = set()
        with open(dict_path, encoding='utf8') as f:
            for l in f:
                self.vocabs.add(l.replace('\n', ''))

    def init_cache_data(self):
        # The original called a nonexistent `get_next`; `__iter__` is the
        # iterator this class actually defines.
        for i, t in enumerate(self.__iter__()):
            if i >= self.steps:
                break
            self.cache_data.append(t)

    def __len__(self):
        return self.steps

    def encode(self, text):
        tokens = ['[CLS]'] + [ch if ch in self.vocabs else '[UNK]' for ch in text] + ['[SEP]']
        return self.tokenizer._convert_tokens_to_ids(tokens), [0] * len(tokens)

    def __iter__(self):
        while True:
            idxs = list(range(len(self.data)))
            np.random.shuffle(idxs)
            BERT_INPUT0, BERT_INPUT1, S, K1, K2, O1, O2 = [], [], [], [], [], [], []
            for i in idxs:
                d = self.data[i]
                text = d['text']
                or_text = text
                # '^' pads the text so span indices line up with the
                # [CLS]/[SEP] tokens added by encode().
                text = '^' + text + '^'
                items = {}
                for sp in d['spo_list']:
                    subjectid = text.find(sp[0])
                    objectid = text.find(sp[2])
                    if subjectid != -1 and objectid != -1:
                        key = (subjectid, subjectid + len(sp[0]))
                        if key not in items:
                            items[key] = []
                        items[key].append((objectid, objectid + len(sp[2]),
                                           predicate2id[sp[1]]))
                if items:
                    indices, segments = self.encode(or_text)
                    BERT_INPUT0.append(indices)
                    BERT_INPUT1.append(segments)
                    # s marks subject boundaries: 1 at a start, 2 at an end.
                    s1, s2 = [0] * len(text), [0] * len(text)
                    for j in items:
                        s1[j[0]] = 1
                        s2[j[1] - 1] = 1
                    s = [0] * len(text)
                    for idx in range(len(s1)):
                        if s1[idx] == 1:
                            s[idx] = 1
                        if s2[idx] == 1:
                            s[idx] = 2
                    # Sample one subject span and label its objects.
                    k1, k2 = choice(list(items.keys()))
                    # 0 is the UNK class (49 + 1 classes in total).
                    o1, o2 = [0] * len(text), [0] * len(text)
                    for j in items[(k1, k2)]:
                        o1[j[0]] = j[2]
                        o2[j[1] - 1] = j[2]
                    S.append(s)
                    K1.append([k1])
                    K2.append([k2 - 1])
                    O1.append(o1)
                    O2.append(o2)
                    if len(S) == self.batch_size or i == idxs[-1]:
                        BERT_INPUT0 = np.array(seq_padding(BERT_INPUT0))
                        BERT_INPUT1 = np.array(seq_padding(BERT_INPUT1))
                        S = np.array(seq_padding(S))
                        O1 = np.array(seq_padding(O1))
                        O2 = np.array(seq_padding(O2))
                        K1, K2 = np.array(K1), np.array(K2)
                        yield [
                            BERT_INPUT0, BERT_INPUT1, S, K1, K2, O1, O2
                        ], None
                        BERT_INPUT0, BERT_INPUT1, S, K1, K2, O1, O2 = [], [], [], [], [], [], []
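
A sketch of what one batch looks like, under the same assumptions as Example #1 (`train_data` is a placeholder):

# Hypothetical usage.
gen = data_generator(train_data, batch_size=64)
batch_inputs, _ = next(iter(gen))
x0, x1, s, k1, k2, o1, o2 = batch_inputs  # all padded to the batch max length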
Example #5
# Assumes `codecs`, `np` (NumPy), `dict_path`, `Tokenizer`, `seq_padding`,
# and `BIO2id` are imported/defined elsewhere.
class data_generator:
    def __init__(self, data, batch_size=32):
        self.data = data
        self.batch_size = batch_size

        self.steps = len(self.data) // self.batch_size
        if len(self.data) % self.batch_size != 0:
            self.steps += 1
        self.token_dict = {}
        with codecs.open(dict_path, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                self.token_dict[token] = len(self.token_dict)
        self.tokenizer = Tokenizer(self.token_dict)
        self.cache_data = []
        self.vocabs = set()
        with open(dict_path, encoding='utf8') as f:
            for l in f:
                self.vocabs.add(l.replace('\n', ''))

    def init_cache_data(self):
        # As in Example #1: `get_next` does not exist; use `__iter__`.
        for i, t in enumerate(self.__iter__()):
            if i >= self.steps:
                break
            self.cache_data.append(t)

    def __len__(self):
        return self.steps

    def encode(self, text):
        tokens = ['[CLS]'] + [ch if ch in self.vocabs else '[UNK]' for ch in text] + ['[SEP]']
        return self.tokenizer._convert_tokens_to_ids(tokens), [0] * len(tokens)

    def __iter__(self):
        while True:
            idxs = list(range(len(self.data)))
            np.random.shuffle(idxs)
            BERT_INPUT0, BERT_INPUT1, BIO = [], [], []
            for i in idxs:
                _data = self.data[i]
                text = _data['text']
                or_text = text
                indices, segments = self.encode(or_text)

                # The BIO labels need two extra padding positions, front and
                # back, to line up with the [CLS]/[SEP] tokens added by
                # encode().
                _bio = [BIO2id.get(bio, 0) for bio in _data['bio']]
                _bio.insert(0, 0)
                _bio.append(0)

                BERT_INPUT0.append(indices)
                BERT_INPUT1.append(segments)
                BIO.append(_bio)

                if len(BERT_INPUT1) == self.batch_size or i == idxs[-1]:
                    BERT_INPUT0 = np.array(seq_padding(BERT_INPUT0))
                    BERT_INPUT1 = np.array(seq_padding(BERT_INPUT1))
                    BIO = np.array(seq_padding(BIO))

                    yield [BERT_INPUT0, BERT_INPUT1, BIO], None
                    BERT_INPUT0, BERT_INPUT1, BIO = [], [], []
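
As before, a hedged sketch only; the `(inputs, None)` shape suggests a Keras model with an in-graph loss:

# Hypothetical usage; train_data and model are assumed.
gen = data_generator(train_data, batch_size=32)
model.fit_generator(iter(gen), steps_per_epoch=len(gen), epochs=3)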
Example #6
# The Tokenizer here appears to be keras-bert's; that is an assumption,
# since the original file's imports are not shown.
import codecs
import csv
import os

import numpy as np
import six

class data_parser:
    def __init__(self,
                 VOCAB_PATH=None,
                 TASK=None,
                 SEQ_LEN=None,
                 DATA_DIR=None):

        self.TASK = TASK
        self.SEQ_LEN = SEQ_LEN
        self.DATA_DIR = DATA_DIR

        self.token_dict = {}
        with codecs.open(VOCAB_PATH, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                self.token_dict[token] = len(self.token_dict)

        self.tokenizer = Tokenizer(self.token_dict, cased=False)

    def _read_tsv(self, input_file, quotechar=None):
        """Reads a tab-separated values file."""
        # 'rU' mode is deprecated (removed in Python 3.11); default text
        # mode already handles universal newlines.
        with open(input_file, "r", encoding="utf-8") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            return list(reader)

    def convert_to_unicode(self, text):
        """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
        if six.PY3:
            if isinstance(text, str):
                return text
            elif isinstance(text, bytes):
                return text.decode("utf-8", "ignore")
            else:
                raise ValueError("Unsupported string type: %s" % (type(text)))
        elif six.PY2:
            if isinstance(text, str):
                return text.decode("utf-8", "ignore")
            elif isinstance(text, unicode):
                return text
            else:
                raise ValueError("Unsupported string type: %s" % (type(text)))
        else:
            raise ValueError("Not running on Python2 or Python 3?")

    def encode(self, first, second=None, max_len=None):

        first_tokens = self.tokenizer._tokenize(first)
        second_tokens = self.tokenizer._tokenize(
            second) if second is not None else None
        self.tokenizer._truncate(first_tokens, second_tokens, max_len)
        tokens, first_len, second_len = self.tokenizer._pack(
            first_tokens, second_tokens)
        token_ids = self.tokenizer._convert_tokens_to_ids(tokens)
        segment_ids = [0] * first_len + [1] * second_len
        token_len = first_len + second_len
        pad_len = 0
        if max_len is not None:
            pad_len = max_len - first_len - second_len
            token_ids += [self.tokenizer._pad_index] * pad_len
            segment_ids += [0] * pad_len
        input_mask = [1] * token_len + [0] * pad_len
        return token_ids, segment_ids, input_mask

    def get_train_data(self):

        data_path = os.path.join(self.DATA_DIR, "train.tsv")
        train_x, train_y = self.load_data(data_path, set_type='train')
        return train_x, train_y

    def get_dev_data(self):

        data_path = os.path.join(self.DATA_DIR, "dev.tsv")
        dev_x, dev_y = self.load_data(data_path, set_type='dev')
        return dev_x, dev_y

    def get_test_data(self):

        data_path = os.path.join(self.DATA_DIR, "test.tsv")
        test_x, test_y = self.load_data(data_path, set_type='test')
        return test_x, test_y

    def load_data(self, data_path, set_type=None):

        if self.TASK == 'qqp':
            data_x, data_y = self.load_data_qqp(data_path, set_type=set_type)
        elif self.TASK == 'sst-2':
            data_x, data_y = self.load_data_sst(data_path, set_type=set_type)
        elif self.TASK == 'qnli':
            data_x, data_y = self.load_data_qnli(data_path, set_type=set_type)
        elif self.TASK == 'cola':
            data_x, data_y = self.load_data_cola(data_path, set_type=set_type)
        elif self.TASK == 'rte':
            data_x, data_y = self.load_data_rte(data_path, set_type=set_type)
        elif self.TASK == 'mrpc':
            data_x, data_y = self.load_data_mrpc(data_path, set_type=set_type)
        elif self.TASK == 'mnli-m' or self.TASK == 'mnli-mm':
            data_x, data_y = self.load_data_mnli(data_path, set_type=set_type)
        elif self.TASK == 'sts-b':
            data_x, data_y = self.load_data_stsb(data_path, set_type=set_type)
        else:
            raise ValueError('No data loader for the given TASK.')

        return data_x, data_y

    def load_data_qqp(self, path, set_type='train'):

        indices, sentiments, masks, final_segments = [], [], [], []
        lines = self._read_tsv(path)
        for (i, line) in enumerate(lines):
            if i == 0:
                continue
            if (set_type == 'train' or set_type == "dev") and len(line) < 6:
                continue
            if set_type == "test":
                text_a = self.convert_to_unicode(line[1])
                text_b = self.convert_to_unicode(line[2])
                label = self.convert_to_unicode(line[0])
            else:
                text_a = self.convert_to_unicode(line[3])
                text_b = self.convert_to_unicode(line[4])
                label = self.convert_to_unicode(line[5])
            ids, segments, mask = self.encode(text_a,
                                              text_b,
                                              max_len=self.SEQ_LEN)
            indices.append(ids)
            final_segments.append(segments)
            sentiments.append(label)
            masks.append(mask)
        items = list(zip(indices, masks, final_segments, sentiments))
        if set_type != "test":
            np.random.shuffle(items)
        indices, masks, final_segments, sentiments = zip(*items)
        indices = np.array(indices)
        masks = np.array(masks)
        final_segments = np.array(final_segments)
        sentiments = np.array(sentiments)
        return [indices, final_segments, masks], sentiments

    def load_data_sst(self, path, set_type='train'):

        indices, sentiments, masks = [], [], []
        lines = self._read_tsv(path)
        for (i, line) in enumerate(lines):
            if i == 0:
                continue
            if (set_type == 'train' or set_type == 'dev') and len(line) < 2:
                continue
            if set_type == "test":
                text_a = self.convert_to_unicode(line[1])
                label = self.convert_to_unicode(line[0])
            else:
                text_a = self.convert_to_unicode(line[0])
                label = self.convert_to_unicode(line[1])
            ids, segments, mask = self.encode(text_a, max_len=self.SEQ_LEN)
            indices.append(ids)
            sentiments.append(label)
            masks.append(mask)
        items = list(zip(indices, masks, sentiments))
        if set_type != "test":
            np.random.shuffle(items)
        indices, masks, sentiments = zip(*items)
        indices = np.array(indices)
        masks = np.array(masks)
        return [indices, np.zeros_like(indices), masks], np.array(sentiments)

    def load_data_mrpc(self, path, set_type='train'):

        indices, sentiments, masks, final_segments = [], [], [], []
        lines = self._read_tsv(path)
        for (i, line) in enumerate(lines):
            if i == 0:
                continue
            if (set_type == 'train' or set_type == 'dev') and len(line) < 5:
                continue
            if set_type == "test":
                text_a = self.convert_to_unicode(line[3])
                text_b = self.convert_to_unicode(line[4])
                label = self.convert_to_unicode(line[0])
            else:
                text_a = self.convert_to_unicode(line[3])
                text_b = self.convert_to_unicode(line[4])
                label = self.convert_to_unicode(line[0])
            ids, segments, mask = self.encode(text_a,
                                              text_b,
                                              max_len=self.SEQ_LEN)
            indices.append(ids)
            final_segments.append(segments)
            sentiments.append(label)
            masks.append(mask)
        items = list(zip(indices, masks, final_segments, sentiments))
        if set_type != "test":
            np.random.shuffle(items)
        indices, masks, final_segments, sentiments = zip(*items)
        indices = np.array(indices)
        masks = np.array(masks)
        final_segments = np.array(final_segments)
        return [indices, final_segments, masks], np.array(sentiments)

    def load_data_qnli(self, path, set_type='train'):

        indices, sentiments, masks, final_segments = [], [], [], []
        lines = self._read_tsv(path)
        data_labels = {'entailment': '1', 'not_entailment': '0'}
        for (i, line) in enumerate(lines):
            if i == 0:
                continue
            if (set_type == 'train' or set_type == 'dev') and len(line) < 4:
                continue
            if set_type == "test":
                text_a = self.convert_to_unicode(line[1])
                text_b = self.convert_to_unicode(line[2])
                label = self.convert_to_unicode(line[0])
            else:
                text_a = self.convert_to_unicode(line[1])
                text_b = self.convert_to_unicode(line[2])
                label = self.convert_to_unicode(data_labels[line[3]])
            ids, segments, mask = self.encode(text_a,
                                              text_b,
                                              max_len=self.SEQ_LEN)
            indices.append(ids)
            final_segments.append(segments)
            sentiments.append(label)
            masks.append(mask)
        items = list(zip(indices, masks, final_segments, sentiments))
        if set_type != "test":
            np.random.shuffle(items)
        indices, masks, final_segments, sentiments = zip(*items)
        indices = np.array(indices)
        masks = np.array(masks)
        final_segments = np.array(final_segments)
        return [indices, final_segments, masks], np.array(sentiments)

    def load_data_rte(self, path, set_type='train'):

        indices, sentiments, masks, final_segments = [], [], [], []
        lines = self._read_tsv(path)
        data_labels = {'entailment': '1', 'not_entailment': '0'}
        for (i, line) in enumerate(lines):
            if i == 0:
                continue
            if (set_type == 'train' or set_type == 'dev') and len(line) < 4:
                continue
            if set_type == "test":
                text_a = self.convert_to_unicode(line[1])
                text_b = self.convert_to_unicode(line[2])
                label = self.convert_to_unicode(line[0])
            else:
                text_a = self.convert_to_unicode(line[1])
                text_b = self.convert_to_unicode(line[2])
                label = self.convert_to_unicode(data_labels[line[3]])
            ids, segments, mask = self.encode(text_a,
                                              text_b,
                                              max_len=self.SEQ_LEN)
            indices.append(ids)
            final_segments.append(segments)
            sentiments.append(label)
            masks.append(mask)
        items = list(zip(indices, masks, final_segments, sentiments))
        if set_type != "test":
            np.random.shuffle(items)
        indices, masks, final_segments, sentiments = zip(*items)
        indices = np.array(indices)
        masks = np.array(masks)
        final_segments = np.array(final_segments)
        return [indices, final_segments, masks], np.array(sentiments)

    def load_data_cola(self, path, set_type='train'):

        indices, sentiments, masks = [], [], []
        lines = self._read_tsv(path)
        for (i, line) in enumerate(lines):
            if i == 0 and set_type == 'test':
                continue
            if (set_type == 'train' or set_type == 'dev') and len(line) < 4:
                continue
            if set_type == "test":
                text_a = self.convert_to_unicode(line[1])
                label = self.convert_to_unicode(line[0])
            else:
                text_a = self.convert_to_unicode(line[3])
                label = self.convert_to_unicode(line[1])
            ids, segments, mask = self.encode(text_a, max_len=self.SEQ_LEN)
            indices.append(ids)
            sentiments.append(label)
            masks.append(mask)
        items = list(zip(indices, masks, sentiments))
        if set_type != "test":
            np.random.shuffle(items)
        indices, masks, sentiments = zip(*items)
        indices = np.array(indices)
        masks = np.array(masks)
        return [indices, np.zeros_like(indices), masks], np.array(sentiments)

    def load_data_mnli(self, path, set_type='train'):

        data_labels = {'contradiction': '0', 'entailment': '1', 'neutral': '2'}
        indices, sentiments, masks, final_segments = [], [], [], []
        lines = self._read_tsv(path)
        for (i, line) in enumerate(lines):
            if i == 0:
                continue
            if set_type == "test":
                text_a = self.convert_to_unicode(line[8])
                text_b = self.convert_to_unicode(line[9])
                label = self.convert_to_unicode(line[0])
            else:
                text_a = self.convert_to_unicode(line[8])
                text_b = self.convert_to_unicode(line[9])
                label = self.convert_to_unicode(data_labels[line[-1]])
            ids, segments, mask = self.encode(text_a,
                                              text_b,
                                              max_len=self.SEQ_LEN)
            indices.append(ids)
            final_segments.append(segments)
            sentiments.append(label)
            masks.append(mask)
        items = list(zip(indices, masks, final_segments, sentiments))
        if set_type != "test":
            np.random.shuffle(items)
        indices, masks, final_segments, sentiments = zip(*items)
        indices = np.array(indices)
        masks = np.array(masks)
        final_segments = np.array(final_segments)
        return [indices, final_segments, masks], np.array(sentiments)

    def load_data_stsb(self, path, set_type='train'):

        indices, sentiments, masks, final_segments = [], [], [], []
        lines = self._read_tsv(path)
        for (i, line) in enumerate(lines):
            if i == 0:
                continue
            if set_type == "test":
                text_a = self.convert_to_unicode(line[7])
                text_b = self.convert_to_unicode(line[8])
                label = self.convert_to_unicode(line[0])
            else:
                text_a = self.convert_to_unicode(line[7])
                text_b = self.convert_to_unicode(line[8])
                label = float(line[-1])
            ids, segments, mask = self.encode(text_a,
                                              text_b,
                                              max_len=self.SEQ_LEN)
            indices.append(ids)
            final_segments.append(segments)
            sentiments.append(label)
            masks.append(mask)
        items = list(zip(indices, masks, final_segments, sentiments))
        if set_type != "test":
            np.random.shuffle(items)
        indices, masks, final_segments, sentiments = zip(*items)
        indices = np.array(indices)
        final_segments = np.array(final_segments)
        masks = np.array(masks)
        return [indices, final_segments, masks], np.array(sentiments)
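
A hypothetical end-to-end call; all paths and the task name below are illustrative, not from the original source:

# Placeholder paths and task; adjust to the actual GLUE layout.
parser = data_parser(VOCAB_PATH='bert/vocab.txt',
                     TASK='sst-2',
                     SEQ_LEN=128,
                     DATA_DIR='glue_data/SST-2')
train_x, train_y = parser.get_train_data()  # [ids, segments, masks], labels
dev_x, dev_y = parser.get_dev_data()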