Code example #1
File: utils.py  Project: Hongqin-Li/poetry-generator
def get_vocab(paths):

    vocab = Vocabulary(min_freq=10)

    for path in paths:
        poems = get_poems(path)
        update_vocab(vocab, poems)

    vocab.build_vocab()

    return vocab
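A minimal usage sketch for the function above; the file paths are placeholders, and it assumes Vocabulary supports len(), as in the later examples on this page:

# Hypothetical usage (the poem file paths are placeholders):
paths = ['data/poems_part1.txt', 'data/poems_part2.txt']
vocab = get_vocab(paths)   # min_freq=10 in get_vocab presumably drops rare words
print(len(vocab))          # vocabulary size after build_vocab()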
Code example #2
def read_instances_from_file(files, max_len=400, keep_case=False):
    ''' Collect instances and construct vocab '''

    vocab = Vocabulary()
    lb_vocab = Vocabulary(need_default=False)
    sets = []

    for file in files:
        sents, labels = [], []
        trimmed_sent = 0
        with open(file) as f:
            lines = f.readlines()
            for l in lines:
                l = l.strip().split('\t')
                if len(l) < 2:
                    continue
                label = l[0]
                sent = l[1]
                if not keep_case:
                    sent = sent.lower()
                word_lst = sent.split()
                if len(word_lst) > max_len:
                    word_lst = word_lst[:max_len]
                    trimmed_sent += 1
                if word_lst:
                    sents.append(word_lst)
                    labels.append(label)
                    vocab.add_word_lst(word_lst)
                    lb_vocab.add_word(label)

        assert len(sents) == len(labels)

        sets.append({'sents': sents, 'labels': labels})

        logger.info('Get {} instances from file {}'.format(len(sents), file))
        if trimmed_sent:
            logger.info(
                '{} sentences are trimmed. Max sentence length: {}.'.format(
                    trimmed_sent, max_len))

    logger.info('Building vocabulary...')
    vocab.add_word_lst(['<cls>'] * 6)
    vocab.build_vocab()
    lb_vocab.build_vocab()
    logger.info('Finished. Size of vocab: {}. # Class: {}.'.format(
        len(vocab), len(lb_vocab)))
    logger.info('<pad>: {}'.format(vocab.to_index('<pad>')))
    logger.info('<unk>: {}'.format(vocab.to_index('<unk>')))
    logger.info('<cls>: {}'.format(vocab.to_index('<cls>')))

    return sets, vocab, lb_vocab
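A sketch of how this function might be called; the file names are placeholders, and the input format is the tab-separated label/sentence layout the loop above expects:

# Hypothetical call (file names are placeholders):
sets, vocab, lb_vocab = read_instances_from_file(
    ['train.tsv', 'dev.tsv', 'test.tsv'], max_len=400, keep_case=False)
train_set = sets[0]                       # {'sents': [...], 'labels': [...]}
pad_idx = vocab.to_index('<pad>')         # padding index, as logged above
num_classes = len(lb_vocab)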
Code example #3
def load_data_from_file(data_file,
                        build_vocab=True,
                        min_freq=1,
                        max_vocab_size=5000):
    with open(data_file) as fp:
        # src_lang_model (a tokenizer pipeline) and remove_punc are
        # module-level helpers defined elsewhere in this project.
        data = [src_lang_model.tokenizer(text.strip()).text for text in fp]
        data = [remove_punc(tok.split()) for tok in data]
        if build_vocab:
            vocab = Vocabulary()
            vocab.build_vocab(data,
                              lower=True,
                              min_freq=min_freq,
                              max_vocab_size=max_vocab_size)
            return data, vocab
        else:
            return data
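A hedged usage sketch; the corpus path is a placeholder, and the returned vocab is the project's own Vocabulary built over the tokenized data:

# Hypothetical call (the corpus path is a placeholder):
data, vocab = load_data_from_file('corpus/train.txt',
                                  build_vocab=True,
                                  min_freq=2,
                                  max_vocab_size=5000)
# data: list of token lists with punctuation removed; vocab: Vocabulary built from them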
Code example #4
class Conversation:
    def __init__(self):
        self.dial_info = Dialogue_Info()
        self.vocab = Vocabulary()

        self.dial_pair = []
        self.train_dial_pair = []
        self.valid_dial_pair = []
        self.test_dial_pair = []

        self.encoded_train_dial_pair = []
        self.encoded_valid_dial_pair = []
        self.encoded_test_dial_pair = []

    def split_data_set(self, train_ratio=0.8, valid_ratio=0.1, test_ratio=0.1):

        train_size = int(len(self.dial_pair) * train_ratio)
        valid_size = int(len(self.dial_pair) * valid_ratio)
        test_size = int(len(self.dial_pair) * test_ratio)

        # Sample disjoint index sets so no pair lands in more than one split.
        valid_indices, test_indices = set(), set()

        while len(valid_indices) < valid_size:
            index = random.randint(0, len(self.dial_pair) - 1)
            if index not in valid_indices:
                valid_indices.add(index)
                self.valid_dial_pair.append(self.dial_pair[index])

        while len(test_indices) < test_size:
            index = random.randint(0, len(self.dial_pair) - 1)
            if index not in test_indices and index not in valid_indices:
                test_indices.add(index)
                self.test_dial_pair.append(self.dial_pair[index])

        # The remaining pairs form the training set, capped at train_size.
        for index in range(len(self.dial_pair)):
            if index not in valid_indices and index not in test_indices:
                self.train_dial_pair.append(self.dial_pair[index])
                if len(self.train_dial_pair) >= train_size:
                    break

    def build_dialogue_pair(self):

        for item in self.dial_info.dial_info:
            item_len = len(item)
            if item_len % 2 != 0:
                item = item[0:int(item_len / 2) * 2]

            for index in range(0, len(item), 2):
                self.dial_pair.append(list((item[index], item[index + 1])))

    def encode_dialogue_pair(self):
        # Encode each (query, response) pair and keep only pairs whose encoded
        # lengths lie strictly between 1 and max_len (a module-level constant
        # defined elsewhere in the project).

        def encode_pairs(pairs):
            encoded = []
            for item in pairs:
                encode_sent = [
                    self.vocab.encode_sent(item[0]),
                    self.vocab.encode_sent(item[1])
                ]
                if (1 < len(encode_sent[0]) < max_len
                        and 1 < len(encode_sent[1]) < max_len):
                    encoded.append(encode_sent)
            return encoded

        self.encoded_train_dial_pair = encode_pairs(self.train_dial_pair)
        self.encoded_valid_dial_pair = encode_pairs(self.valid_dial_pair)
        self.encoded_test_dial_pair = encode_pairs(self.test_dial_pair)

    def save_dialogue_pair(self, exp_data_dir):
        dial_pair_path = os.path.join(exp_data_dir, 'dialogue_pair.json')
        train_dial_pair_path = os.path.join(exp_data_dir,
                                            'train_dialogue_pair.json')
        valid_dial_pair_path = os.path.join(exp_data_dir,
                                            'valid_dialogue_pair.json')
        test_dial_pair_path = os.path.join(exp_data_dir,
                                           'test_dialogue_pair.json')

        encoded_train_dial_pair_path = os.path.join(
            exp_data_dir, 'encoded_train_dialogue_pair.json')
        encoded_valid_dial_pair_path = os.path.join(
            exp_data_dir, 'encoded_valid_dialogue_pair.json')
        encoded_test_dial_pair_path = os.path.join(
            exp_data_dir, 'encoded_test_dialogue_pair.json')

        with open(dial_pair_path, 'w') as f:
            json.dump(self.dial_pair, f)

        with open(train_dial_pair_path, 'w') as f:
            json.dump(self.train_dial_pair, f)

        with open(valid_dial_pair_path, 'w') as f:
            json.dump(self.valid_dial_pair, f)

        with open(test_dial_pair_path, 'w') as f:
            json.dump(self.test_dial_pair, f)

        with open(encoded_train_dial_pair_path, 'w') as f:
            json.dump(self.encoded_train_dial_pair, f)

        with open(encoded_valid_dial_pair_path, 'w') as f:
            json.dump(self.encoded_valid_dial_pair, f)

        with open(encoded_test_dial_pair_path, 'w') as f:
            json.dump(self.encoded_test_dial_pair, f)

    def load_dialogue_pair(self, exp_data_dir):
        train_dial_pair_path = os.path.join(exp_data_dir,
                                            'train_dialogue_pair.json')
        valid_dial_pair_path = os.path.join(exp_data_dir,
                                            'valid_dialogue_pair.json')
        test_dial_pair_path = os.path.join(exp_data_dir,
                                           'test_dialogue_pair.json')

        with open(train_dial_pair_path, 'r') as f:
            self.train_dial_pair = json.load(f)

        with open(valid_dial_pair_path, 'r') as f:
            self.valid_dial_pair = json.load(f)

        with open(test_dial_pair_path, 'r') as f:
            self.test_dial_pair = json.load(f)

    def create_conversation(self, raw_data_dir, exp_data_dir):
        threshold = 5

        self.dial_info.build_dialogue_info(raw_data_dir)
        self.dial_info.save_dialogue_info(exp_data_dir)

        # Build and split the dialogue pairs before building the vocabulary,
        # so the vocabulary is constructed from the training split.
        self.build_dialogue_pair()
        self.split_data_set()

        self.vocab.build_vocab(threshold, self.train_dial_pair)
        self.vocab.save_vocab(exp_data_dir)

        self.encode_dialogue_pair()
        self.save_dialogue_pair(exp_data_dir)

    def re_encode_dialogue_pair(self, exp_data_dir):
        vocab_path = os.path.join(exp_data_dir, 'vocabulary.json')
        self.vocab.load_vocab(vocab_path)
        self.load_dialogue_pair(exp_data_dir)
        self.encode_dialogue_pair()
        self.save_dialogue_pair(exp_data_dir)
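A sketch of the driver code implied by the methods above; the directory names are placeholders:

# Hypothetical end-to-end usage (directory names are placeholders):
conv = Conversation()
conv.create_conversation('raw_data/', 'exp_data/')   # build, split, encode and save everything

# Later, re-encode the saved splits with the saved vocabulary:
# conv2 = Conversation()
# conv2.re_encode_dialogue_pair('exp_data/')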
Code example #5
class CluenerProcessor:
    """Processor for the chinese ner data set."""
    def __init__(self,data_dir):
        self.vocab = Vocabulary()
        self.data_dir = data_dir

    def get_vocab(self):
        vocab_path = self.data_dir / 'vocab.pkl'
        if vocab_path.exists():
            self.vocab.load_from_file(str(vocab_path))
        else:
            files = ["train.txt", "dev.txt", "test.txt"]
            for file in files:
                with open(str(self.data_dir / file), 'r') as fr:
                    for line in fr:
                        text = line.strip().split(" ")[0]
                        self.vocab.update(list(text))
            self.vocab.build_vocab()
            self.vocab.save(vocab_path)

    def get_train_examples(self):
        """See base class."""
        return self._create_examples(str(self.data_dir / "train.txt"), "train")

    def get_dev_examples(self):
        """See base class."""
        return self._create_examples(str(self.data_dir / "dev.txt"), "dev")

    def get_test_examples(self):
        """See base class."""
        return self._create_examples(str(self.data_dir / "test.txt"), "test")

    def _create_examples1(self,input_path,mode):
        examples = []
        with open(input_path, 'r') as f:
            idx = 0
            for line in f:
                json_d = {}
                line = json.loads(line.strip())
                text = line['text']
                label_entities = line.get('label', None)
                words = list(text)
                labels = ['O'] * len(words)
                if label_entities is not None:
                    for key, value in label_entities.items():
                        for sub_name, sub_index in value.items():
                            for start_index, end_index in sub_index:
                                assert ''.join(words[start_index:end_index + 1]) == sub_name
                                if start_index == end_index:
                                    labels[start_index] = 'S-' + key
                                else:
                                    labels[start_index] = 'B-' + key
                                    labels[start_index + 1:end_index + 1] = ['I-' + key] * (len(sub_name) - 1)
                json_d['id'] = f"{mode}_{idx}"
                json_d['context'] = " ".join(words)
                json_d['tag'] = " ".join(labels)
                json_d['raw_context'] = "".join(words)
                idx += 1
                examples.append(json_d)
        return examples

    # Read the data set
    def _create_examples(self, input_path, mode):
        with open(input_path, "r", encoding="utf-8") as f:
            content = [_.strip() for _ in f.readlines()]

        # Collect the original sentences and their label sequences.
        # Record the line numbers of the blank (separator) lines.
        index = [-1]
        index.extend([i for i, _ in enumerate(content) if ' ' not in _])
        index.append(len(content))

        # Split on the blank lines and read each sentence and its tag sequence.
        sentences, tags = [], []
        examples = []
        idx = 0
        for j in range(len(index)-1):
            json_d = {}
            sent, tag = [], []
            segment = content[index[j]+1: index[j+1]]
            for line in segment:
                sent.append(line.strip().split(" ")[0])
                tag.append(line.strip().split(" ")[-1])

            sentences.append(' '.join(sent))
            tags.append(tag)

            json_d['id'] = f"{mode}_{idx}"
            json_d['context'] = " ".join(sent)
            json_d['tag'] = " ".join(tag)
            json_d['raw_context'] = "".join(sent)
            idx += 1
            examples.append(json_d)

        return examples
Code example #6
def read_instances_from_file(files, max_len, keep_case):
    ''' Collect instances and construct vocab '''

    vocab = Vocabulary()
    pos_vocab = Vocabulary(need_default=False)
    ner_vocab = Vocabulary(need_default=False)
    srl_vocab = Vocabulary(need_default=False)
    chunk_vocab = Vocabulary(need_default=False)
    sets = []

    for file in files:
        sents = []
        pos_labels, ner_labels, srl_labels, chunk_labels = [], [], [], []
        trimmed_sent = 0
        with open(file) as f:
            lines = f.readlines()
            sent = []
            pos_label, ner_label, srl_label, chunk_label = [], [], [], []
            for l in lines:
                l = l.strip()
                if l == '':
                    if len(sent) > 0:
                        if len(sent) > max_len:
                            trimmed_sent += 1
                            pos_labels.append(pos_label[:max_len])
                            ner_labels.append(ner_label[:max_len])
                            srl_labels.append(srl_label[:max_len])
                            chunk_labels.append(chunk_label[:max_len])
                            sents.append(sent[:max_len])
                        else:
                            pos_labels.append(pos_label)
                            ner_labels.append(ner_label)
                            srl_labels.append(srl_label)
                            chunk_labels.append(chunk_label)
                            sents.append(sent)
                        sent = []
                        pos_label, ner_label, srl_label, chunk_label = [], [], [], []
                else:
                    l = l.split()
                    word = l[0]

                    if not keep_case:
                        word = word.lower()

                    sent.append(word)
                    pos_label.append(l[2])
                    ner_label.append(l[3])
                    srl_label.append(l[4])
                    chunk_label.append(l[5])

                    vocab.add_word(word)
                    pos_vocab.add_word(l[2])
                    ner_vocab.add_word(l[3])
                    srl_vocab.add_word(l[4])
                    chunk_vocab.add_word(l[5])

        sets.append({
            'sents': sents,
            'pos_labels': pos_labels,
            'ner_labels': ner_labels,
            'srl_labels': srl_labels,
            'chunk_labels': chunk_labels
        })

        logger.info('Get {} instances from file {}'.format(len(sents), file))
        if trimmed_sent:
            logger.warning(
                '{} sentences are trimmed. Max sentence length: {}.'.format(
                    trimmed_sent, max_len))

    logger.info('Building vocabulary...')
    vocab.build_vocab()
    logger.info('Finished. Size of vocab: {}'.format(len(vocab)))

    pos_vocab.build_vocab()
    ner_vocab.build_vocab()
    srl_vocab.build_vocab()
    chunk_vocab.build_vocab()
    logger.info('# class in POS Tagging: {}'.format(len(pos_vocab)))
    logger.info('# class in NER Tagging: {}'.format(len(ner_vocab)))
    logger.info('# class in SRL Tagging: {}'.format(len(srl_vocab)))
    logger.info('# class in Chunking: {}'.format(len(chunk_vocab)))

    return sets, vocab, [pos_vocab, ner_vocab, srl_vocab, chunk_vocab]
Code example #7
class CluenerProcessor:
    """Processor for the chinese ner data set."""
    def __init__(self,data_dir):
        self.vocab = Vocabulary()
        self.data_dir = data_dir

    def get_vocab(self):
        vocab_path = self.data_dir / 'vocab.pkl'
        if vocab_path.exists():
            self.vocab.load_from_file(str(vocab_path))
        else:
            files = ["train.json", "dev.json", "test.json"]
            for file in files:
                with open(str(self.data_dir / file), 'r') as fr:
                    for line in fr:
                        line = json.loads(line.strip())
                        text = line['text']
                        self.vocab.update(list(text))
            self.vocab.build_vocab()
            self.vocab.save(vocab_path)

    def get_train_examples(self):
        """See base class."""
        return self._create_examples(str(self.data_dir / "train.json"), "train")

    def get_dev_examples(self):
        """See base class."""
        return self._create_examples(str(self.data_dir / "dev.json"), "dev")

    def get_test_examples(self):
        """See base class."""
        return self._create_examples(str(self.data_dir / "test.json"), "test")

    def _create_examples(self,input_path,mode):
        examples = []
        with open(input_path, 'r') as f:
            idx = 0
            for line in f:
                json_d = {}
                line = json.loads(line.strip())
                text = line['text']
                label_entities = line.get('label', None)
                words = list(text)
                labels = ['O'] * len(words)
                if label_entities is not None:
                    for key, value in label_entities.items():
                        for sub_name, sub_index in value.items():
                            for start_index, end_index in sub_index:
                                assert ''.join(words[start_index:end_index + 1]) == sub_name
                                if start_index == end_index:
                                    labels[start_index] = 'S-' + key
                                else:
                                    labels[start_index] = 'B-' + key
                                    labels[start_index + 1:end_index + 1] = ['I-' + key] * (len(sub_name) - 1)
                json_d['id'] = f"{mode}_{idx}"
                json_d['context'] = " ".join(words)
                json_d['tag'] = " ".join(labels)
                json_d['raw_context'] = "".join(words)
                idx += 1
                examples.append(json_d)
        return examples
Code example #8
File: data_processor.py  Project: rongruosong/NER
class CluenerProcessor:
    """Processor for the chinese ner data set."""
    def __init__(self,data_dir):
        self.vocab = Vocabulary()
        self.data_dir = data_dir

    def get_vocab(self):
        """
        If the vocab file already exists, load it directly; otherwise build it and save it.
        """
        vocab_path = self.data_dir / 'vocab.pkl'
        if vocab_path.exists():
            self.vocab.load_from_file(str(vocab_path))
        else:
            # Building the vocab from the train split alone would suffice: without a
            # pretrained model, characters that only appear outside train are never trained anyway.
            files = ["train.txt", "dev.txt", "test.txt"]
            for file in files:
                with open(str(self.data_dir / file), 'r',encoding='utf-8') as fr:
                    for line in fr:
                        #line = json.loads(line.strip())
                        line = line.strip().split(' ')
                        text = line[0]
                        self.vocab.update(list(text))
            self.vocab.build_vocab()
            self.vocab.save(vocab_path)

    def get_train_examples(self):
        """See base class."""
        return self._create_examples(str(self.data_dir / "train.txt"), "train")

    def get_dev_examples(self):
        """See base class."""
        return self._create_examples(str(self.data_dir / "dev.txt"), "dev")

    def get_test_examples(self):
        """See base class."""
        return self._create_examples(str(self.data_dir / "test.txt"), "test")

    def _create_examples(self, input_path, mode):
        """
        Returns: List[Dict], e.g.

        [{'context': '中 国 人', 'tag': 'B-name I-name I-name'}, ...]
        """

        examples = []
        with open(input_path, 'r',encoding='utf-8') as f:

            words,labels = [],[]
            flag = False
            for line in f:
                json_d = {}
                content = line.strip()
                tokens = content.split(' ') #[word,label]

                if len(tokens) == 2:
                    words.append(tokens[0])
                    labels.append(tokens[-1])
                    if tokens[-1] != 'O':
                        flag = True

                else:
                    if len(content) == 0 and len(words)>0:
                        if flag:
                            json_d['context'] = " ".join(words)
                            json_d['tag'] = " ".join(labels)
                            words, labels = [], []
                            examples.append(json_d)
                            flag = False
                        else:
                            words = []
                            labels = []

        return examples
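A usage sketch for the processor; it assumes data_dir is a pathlib.Path (the code applies the / operator and .exists() to it), and the dataset directory is a placeholder:

# Hypothetical usage (the dataset directory is a placeholder):
from pathlib import Path

processor = CluenerProcessor(Path('datasets/cluener'))
processor.get_vocab()                            # load vocab.pkl or build and save it
train_examples = processor.get_train_examples()
print(train_examples[0]['context'])
print(train_examples[0]['tag'])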