Example No. 1
from flyai.dataset import Dataset

from model import Model

data = Dataset()
model = Model(data)
# p = model.predict(age=53, sex=1, cp=3, trestbps=130, chol=246, fbs=1, restecg=2, thalach=173, exang=0, oldpeak=0.0,
#                   slope=1, ca=3, thal=3)
# print(p)

tData = data.get_all_data()
preds = model.predict_all(tData[0])

y_test = []
for label in tData[1]:
    y_test.append(label['label'])

rCount = 0.0
for i in range(0, len(preds)):
    if preds[i] == y_test[i]:
        rCount += 1.

test_acc = rCount / len(preds)

print('accuracy %g' % test_acc)
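# Aside (minimal sketch): the counting loop above can also be written as a single
# vectorised comparison with numpy, assuming preds and y_test are the equal-length
# lists built above.
import numpy as np
test_acc = float(np.mean(np.asarray(preds) == np.asarray(y_test)))
print('accuracy %g' % test_acc)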
Example No. 2
class Instructor(object):
    """
    Approach: fetch everything with flyai's get_all_data and split it into batches ourselves
    (a standalone sketch of this pattern follows this example).
    """
    def __init__(self, arguments):
        self.arguments = arguments

    def train(self,
              train_category,
              dev_category,
              train_news,
              dev_news,
              tokenizer,
              Net=None,
              model=None):
        if os.path.exists(self.arguments.output_config_file) is True:
            os.remove(self.arguments.output_config_file)

        logger.info('>>train.shape: {} | dev.shape: {}'.format(
            train_category.shape, dev_category.shape))
        train_dataloader, train_examples_len = Util.load_data(
            news=train_news,
            category=train_category,
            data_type='train',
            label_list=self.arguments.label_list,
            max_length=self.arguments.max_seq_length,
            tokenizer=tokenizer,
            batch_size=self.arguments.BATCH)
        dev_dataloader, dev_examples_len = Util.load_data(
            news=dev_news,
            category=dev_category,
            data_type='dev',
            label_list=self.arguments.label_list,
            max_length=self.arguments.max_seq_length,
            tokenizer=tokenizer,
            batch_size=self.arguments.BATCH)

        num_train_optimization_steps = int(
            train_examples_len / self.arguments.BATCH /
            self.arguments.gradient_accumulation_steps) * self.arguments.EPOCHS

        # Prepare the model
        logger.info("model name is {}".format(self.arguments.model_name))

        if model is None:
            if self.arguments.model_name == "BertOrigin":
                model = Net.from_pretrained(
                    pretrained_model_name_or_path=self.arguments.
                    bert_model_dir,
                    num_labels=self.arguments.num_labels,
                    cache_dir=self.arguments.cache_dir)

            elif self.arguments.model_name == 'BertHAN':
                model = Net.from_pretrained(
                    pretrained_model_name_or_path=self.arguments.
                    bert_model_dir,
                    num_labels=self.arguments.num_labels,
                    cache_dir=self.arguments.cache_dir)

            elif self.arguments.model_name == "BertCNN":
                filter_sizes = [
                    int(val) for val in self.arguments.filter_sizes.split()
                ]
                model = Net.from_pretrained(
                    pretrained_model_name_or_path=self.arguments.
                    bert_model_dir,
                    num_labels=self.arguments.num_labels,
                    n_filters=self.arguments.filter_num,
                    filter_sizes=filter_sizes,
                    cache_dir=self.arguments.cache_dir)

            elif self.arguments.model_name == "BertATT":
                model = Net.from_pretrained(
                    pretrained_model_name_or_path=self.arguments.
                    bert_model_dir,
                    num_labels=self.arguments.num_labels,
                    cache_dir=self.arguments.cache_dir)

            elif self.arguments.model_name == "BertRCNN":
                model = Net.from_pretrained(
                    pretrained_model_name_or_path=self.arguments.
                    bert_model_dir,
                    num_labels=self.arguments.num_labels,
                    cache_dir=self.arguments.cache_dir,
                    rnn_hidden_size=self.arguments.rnn_hidden_size,
                    num_layers=self.arguments.num_layers,
                    bidirectional=self.arguments.bidirectional,
                    dropout=self.arguments.dropout)

            elif self.arguments.model_name == "BertCNNPlus":
                filter_sizes = [
                    int(val) for val in self.arguments.filter_sizes.split()
                ]
                model = Net.from_pretrained(
                    pretrained_model_name_or_path=self.arguments.
                    bert_model_dir,
                    num_labels=self.arguments.num_labels,
                    cache_dir=self.arguments.cache_dir,
                    n_filters=self.arguments.filter_num,
                    filter_sizes=filter_sizes)

        model.to(DEVICE)
        """ 优化器准备 """
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]

        # To reproduce BertAdam specific behavior set correct_bias=False
        optimizer = AdamW(params=optimizer_grouped_parameters,
                          lr=self.arguments.learning_rate,
                          correct_bias=False)
        # PyTorch scheduler
        scheduler = WarmupLinearSchedule(
            optimizer=optimizer,
            warmup_steps=self.arguments.warmup_proportion,
            t_total=num_train_optimization_steps)
        """ 损失函数准备 """
        if self.arguments.use_label_smoothing:
            criterion = NMTCriterion(
                label_smoothing=self.arguments.label_smoothing)
        else:
            criterion = nn.CrossEntropyLoss()
        criterion = criterion.to(DEVICE)

        best_auc, best_acc, global_step, early_stop_times = 0, 0, 0, 0
        for epoch in range(int(self.arguments.EPOCHS)):
            if early_stop_times >= self.arguments.early_stop * (
                    train_examples_len // self.arguments.BATCH):
                break

            logger.info(f'---------------- Epoch: {epoch + 1:02} ----------')

            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                model.train()
                if self.arguments.label_smoothing:
                    criterion.train()

                batch = tuple(t.to(DEVICE) for t in batch)
                _, input_ids, input_mask, segment_ids, label_ids = batch

                logits = model(input_ids, segment_ids, input_mask, labels=None)
                loss = criterion(inputs=logits,
                                 labels=label_ids,
                                 normalization=1.0,
                                 reduce=False)

                # Scale the loss when accumulating gradients
                if self.arguments.gradient_accumulation_steps > 1:
                    loss = loss / self.arguments.gradient_accumulation_steps

                loss.backward(torch.ones_like(loss))
                scheduler.step()

                if (step +
                        1) % self.arguments.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                    if global_step % self.arguments.print_step == 0 and global_step != 0:
                        dev_loss, dev_acc, dev_report, dev_auc = Util.evaluate(
                            model,
                            dev_dataloader,
                            criterion,
                            DEVICE,
                            self.arguments.label_list,
                            args=self.arguments)
                        logger.info('\n>>>dev report: \n{}'.format(dev_report))
                        # Keep the best model by dev accuracy
                        if dev_acc > best_acc:
                            best_acc = dev_acc
                            # Also track the best dev AUC
                            if dev_auc > best_auc:
                                best_auc = dev_auc

                            # Save the model
                            model_to_save = model.module if hasattr(
                                model, 'module') else model
                            torch.save(model_to_save.state_dict(),
                                       self.arguments.output_model_file)
                            with open(self.arguments.output_config_file,
                                      'w') as f:
                                f.write(model_to_save.config.to_json_string())

                            early_stop_times = 0
                        else:
                            early_stop_times += 1

        if os.path.exists(self.arguments.output_config_file) is False:
            model_to_save = model.module if hasattr(model, 'module') else model
            torch.save(model_to_save.state_dict(),
                       self.arguments.output_model_file)
            with open(self.arguments.output_config_file, 'w') as f:
                f.write(model_to_save.config.to_json_string())

    def generate(self):
        self.dataset = Dataset(epochs=self.arguments.EPOCHS,
                               batch=self.arguments.BATCH,
                               val_batch=self.arguments.BATCH)
        news, category, _, _ = self.dataset.get_all_data()
        news = np.asarray([i['news'] for i in news])
        category = np.asarray([i['category'] for i in category])

        # Shuffle the indices in place; np.random.shuffle(np.asarray(index)) only shuffles a temporary copy.
        index = np.random.permutation(len(news))
        split = int(len(index) * 0.9)
        train_news, dev_news = news[index[:split]], news[index[split:]]
        train_category, dev_category = category[index[:split]], category[index[split:]]

        return train_news, train_category, dev_news, dev_category

    def run(self):
        remote_helper.get_remote_date(
            "https://www.flyai.com/m/chinese_base.zip")
        before_vocab_dir = os.path.join(os.getcwd(), 'vocab.txt')
        after_vocab_dir = os.path.join(args.bert_model_dir, 'vocab.txt')
        logger.info('>before_vocab_dir:{}'.format(before_vocab_dir))
        logger.info('>after_vocab_dir:{}'.format(after_vocab_dir))

        shutil.copyfile(before_vocab_dir, after_vocab_dir)

        if not os.path.exists(self.arguments.output_dir):
            os.mkdir(self.arguments.output_dir)

        self.arguments.BATCH = self.arguments.BATCH // self.arguments.gradient_accumulation_steps

        # Data preparation: choose the tokenizer
        tokenizer = BertTokenizer(
            self.arguments.bert_vocab_file).from_pretrained(
                self.arguments.bert_model_dir,
                do_lower_case=self.arguments.do_lower_case)
        # Fetch the data: news/keywords
        train_news, train_category, dev_news, dev_category = self.generate()

        self.train(Net=Net,
                   train_category=train_category,
                   dev_category=dev_category,
                   train_news=train_news,
                   dev_news=dev_news,
                   tokenizer=tokenizer)
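# Minimal sketch of the pattern described in the class docstring above ("fetch
# everything with get_all_data, then split and batch it yourself"), written
# independently of flyai. The helper name split_and_batch and its arguments are
# illustrative only; x and y are assumed to be parallel numpy arrays.
import numpy as np

def split_and_batch(x, y, batch_size, dev_ratio=0.1, seed=0):
    """Shuffle parallel arrays, hold out dev_ratio for validation, and yield training mini-batches."""
    rng = np.random.RandomState(seed)
    order = rng.permutation(len(x))
    n_dev = int(len(x) * dev_ratio)
    dev_idx, train_idx = order[:n_dev], order[n_dev:]

    def train_batches():
        for start in range(0, len(train_idx), batch_size):
            sel = train_idx[start:start + batch_size]
            yield x[sel], y[sel]

    return train_batches, (x[dev_idx], y[dev_idx])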
Example No. 3
import argparse
import re
import jieba
import gensim
import json

from flyai.dataset import Dataset
from model import Model

# --------- Hyperparameters
parser = argparse.ArgumentParser()
parser.add_argument("-e", "--EPOCHS", default=5, type=int, help="train epochs")
parser.add_argument("-b", "--BATCH", default=2, type=int, help="batch size")
args = parser.parse_args()
# --------- Data access helper
dataset = Dataset(epochs=args.EPOCHS, batch=args.BATCH)
# --------- Model helper
modelpp = Model(dataset)
# Count the label distribution of the data (a counting sketch follows this snippet)

train_x, train_label, val_x, val_label = dataset.get_all_data()

topic_a = {
    'topic': '深圳禁摩限电',
    'total_number': 0,
    'None': 0,
    'Favor': 0,
    'Agan': 0,
    'text': []
}
topic_b = {
    'topic': '春节放鞭炮',
    'total_number': 0,
    'None': 0,
    'Favor': 0,
    'Agan': 0,
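# Rough sketch of one way the distribution count started above might be
# completed. The field names TARGET/STANCE are assumptions borrowed from the
# other stance-detection examples in this collection, and topic_b is assumed to
# be defined like topic_a.
for x, y in zip(train_x, train_label):
    topic = topic_a if x['TARGET'] == topic_a['topic'] else topic_b
    topic['total_number'] += 1
    if y['STANCE'] in topic:
        topic[y['STANCE']] += 1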
Example No. 4
# -*- coding: utf-8 -*
from flyai.source.source import Source
from flyai.utils.yaml_helper import Yaml
from flyai.dataset import Dataset
from model import Model

print('predict was called')

data = Dataset()
model = Model(data)

p = model.predict_all(data.get_all_data()[0])
print(p)
Example No. 5
    The parameter is one row of input-x data from the csv; this method is called repeatedly by
    dataset.next_train_batch() and dataset.next_validation_batch(), and is also used to process
    data during evaluation. Its arguments correspond to input: -> columns: in app.yaml.
    '''

    def output_x(self, TARGET, TEXT):
        text2vec = self.input_x(TARGET, TEXT)
        return text2vec

    '''
    The output result; it is consumed by dataset.to_categorys(data).
    '''

    def output_y(self, data):
        index = np.argmax(data)
        return index


if __name__ == '__main__':
    from flyai.dataset import Dataset
    dataset = Dataset(10, 32)
    train_x, train_y, val_x, val_y = dataset.get_all_data()
    preTrainedEmbedding = PreTrainedEmbedding()
    contents = [x['TEXT'] for x in train_x]
    unfounds = []
    for words in contents:
        print(words)
        vector, unfound = preTrainedEmbedding.turnToVectors(words)
        unfounds.append(unfound)
    print("unfound probability is: %f", np.mean(unfounds))
Example No. 6
class Instructor(object):
    """
    Approach: fetch everything with flyai's get_all_data and split it into batches ourselves.
    """

    def __init__(self, args):
        self.args = args
        self.tag_map = {label: i for i, label in enumerate(self.args.labels)}

    def train(self, train_source, train_target, dev_source, dev_target):
        if os.path.exists(self.args.output_dir) is True:
            shutil.rmtree(self.args.output_dir)

        train_dataloader = create_batch_iter(mode='train', X=train_source, y=train_target, batch_size=self.args.BATCH)
        dev_dataloader = create_batch_iter(mode='dev', X=dev_source, y=dev_target, batch_size=self.args.BATCH)

        self.model.to(DEVICE)

        # Prepare the optimizer
        param_optimizer = list(self.model.named_parameters())
        no_decay = list(['bias', 'LayerNorm.bias', 'LayerNorm.weight'])
        optimizer_grouped_parameters = list([{'params': [p for n, p in param_optimizer if not any(
            nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(
            nd in n for nd in no_decay)], 'weight_decay': 0.0}])

        optimizer = AdamW(params=optimizer_grouped_parameters, lr=self.args.learning_rate)

        total_size = math.ceil(len(train_source) / self.args.BATCH)

        best_acc = 0
        for epoch in range(self.args.EPOCHS):
            for train_step, train_batch in enumerate(tqdm(train_dataloader, desc='Train_Iteration')):
                self.model.train()
                self.model.zero_grad()

                train_batch = tuple(t.to(DEVICE) for t in train_batch)
                t_input_ids, t_input_mask, t_labels, t_out_masks = train_batch

                t_bert_encode = self.model(t_input_ids, t_input_mask)
                loss = self.model.loss_fn(bert_encode=t_bert_encode, tags=t_labels, output_mask=t_out_masks)
                loss.backward()

                # Gradient clipping
                # torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1)
                optimizer.step()

                if train_step % 10 == 0:
                    self.model.eval()
                    eval_loss = 0

                    for dev_step, dev_batch in enumerate(dev_dataloader):
                        dev_batch = tuple(t.to(DEVICE) for t in dev_batch)
                        d_input_ids, d_input_mask, d_label_ids, d_output_mask = dev_batch

                        with torch.no_grad():
                            d_bert_encode = self.model(d_input_ids, d_input_mask)
                        eval_loss += self.model.loss_fn(bert_encode=d_bert_encode, tags=d_label_ids,
                                                        output_mask=d_output_mask)
                        predicts = self.model.predict(d_bert_encode, d_output_mask)

                        d_label_ids = d_label_ids.view(1, -1)
                        d_label_ids = d_label_ids[d_label_ids != -1]

                        eval_acc, eval_f1 = self.model.acc_f1(predicts, d_label_ids)

                        if eval_acc > best_acc:
                            best_acc = eval_acc
                            save_model(self.model, self.args.output_dir)

                        self.model.class_report(predicts, d_label_ids)

                    logger.info("\n>step {}".format(train_step))
                    logger.info("\n>epoch [{}] {}/{}\n\tloss {:.2f}".format(epoch, train_step, total_size, loss.item()))
        if not os.path.exists(self.args.output_dir):
            save_model(self.model, self.args.output_dir)

    def generate(self):
        self.dataset = Dataset(epochs=self.args.EPOCHS, batch=self.args.BATCH, val_batch=self.args.BATCH)
        source, target, _, _ = self.dataset.get_all_data()
        source = np.asarray([i['source'].split(' ') for i in source])
        target = np.asarray([i['target'].split(' ') for i in target])

        # Shuffle the indices in place; np.random.shuffle(np.asarray(index)) only shuffles a temporary copy.
        index = np.random.permutation(len(source))
        split = int(len(index) * 0.9)
        train_source, dev_source = source[index[:split]], source[index[split:]]
        train_target, dev_target = target[index[:split]], target[index[split:]]

        return train_source, train_target, dev_source, dev_target

    def run(self):
        # ## albert-base
        # remote_helper.get_remote_date('https://www.flyai.com/m/albert_base_zh_tensorflow.zip')
        # convert_tf_checkpoint_to_pytorch(
        #     tf_checkpoint_path="./data/input/model",
        #     bert_config_file="./data/input/model/albert_config_base.json",
        #     pytorch_dump_path="./data/input/model/pytorch_model.bin",
        #     share_type="all")

        # ## albert-large
        remote_helper.get_remote_date('https://www.flyai.com/m/albert_large_zh.zip')
        convert_tf_checkpoint_to_pytorch(
            tf_checkpoint_path="./data/input/model",
            bert_config_file="./data/input/model/albert_config_large.json",
            pytorch_dump_path="./data/input/model/pytorch_model.bin",
            share_type="all")

        # ## albert-xlarge
        # remote_helper.get_remote_date('https://www.flyai.com/m/albert_xlarge_zh_183k.zip')
        # convert_tf_checkpoint_to_pytorch(tf_checkpoint_path="./data/input/model",
        #                                  bert_config_file="./data/input/model/albert_config_xlarge.json",
        #                                  pytorch_dump_path="./data/input/model/pytorch_model.bin",
        #                                  share_type="all")

        self.model = Net(
            tag_map=self.tag_map,
            batch_size=self.args.BATCH,
            dropout=self.args.dropout,
            embedding_dim=self.args.embedding_size,
            hidden_dim=self.args.hidden_size,
        )

        train_source, train_target, dev_source, dev_target = self.generate()

        self.train(train_source, train_target, dev_source, dev_target)
Example No. 7
# -*- coding: utf-8 -*
from flyai.dataset import Dataset
from model import Model

from processor import Processor

# Data access helper
dataset = Dataset()
# Model helper
model = Model(dataset)

result = model.predict(
    text="您好!我们这边是施华洛世奇鄞州万达店!您是我们尊贵的会员,特意邀请您参加我们x.x-x.x的三八女人节活动!满xxxx元享晶璨花漾丝巾")
print(result)

tData = dataset.get_all_data()
preds = model.predict_all(tData[0])

y_test = []
for label in tData[1]:
    y_test.append(label['label'])

rCount = 0.0
for i in range(0, len(preds)):
    if preds[i] == y_test[i]:
        rCount += 1.

test_acc = rCount / len(preds)

print('accuracy %g' % test_acc)
Example No. 8
        target["labels"] = labels
        target["image_id"] = image_id
        target["area"] = areas
        target["iscrowd"] = iscrowd
        return torchvision.transforms.ToTensor()(img), target

    def __len__(self):
        return len(self.img_path_list)


def collate_fn(batch):
    return tuple(zip(*batch))


# Fetch all the raw data
# e.g. x: [{'img_path': 'img/019646.jpg'}, ...]   y: [{'label_path': 'label/019646.jpg.txt'}, ...]
x_train, y_train, x_val, y_val = dataset.get_all_data()
# Build our own data loaders
train_dataset = MaskDataset(x_train, y_train)
valid_dataset = MaskDataset(x_val, y_val)
# Batch sizes
train_batch_size = args.BATCH
valid_batch_size = 1
train_data_loader = torch.utils.data.DataLoader(train_dataset,
                                                batch_size=train_batch_size,
                                                shuffle=True,
                                                num_workers=0,
                                                collate_fn=collate_fn)
valid_data_loader = torch.utils.data.DataLoader(valid_dataset,
                                                batch_size=valid_batch_size,
                                                shuffle=False,
                                                num_workers=0,
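
# Usage sketch only: with the collate_fn above, each batch from these loaders is
# a pair of tuples (images, targets), which is the input format torchvision
# detection models expect. detection_model below is an assumed name (e.g. a
# torchvision.models.detection.fasterrcnn_resnet50_fpn instance defined elsewhere).
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
for images, targets in train_data_loader:
    images = [img.to(device) for img in images]
    targets = [{k: (v.to(device) if torch.is_tensor(v) else v) for k, v in t.items()}
               for t in targets]
    # loss_dict = detection_model(images, targets)
    break  # sketch only: look at a single batch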
Example No. 9
class Run(object):
    def __init__(self):
        self.args = args
        self.dataset = Dataset(epochs=self.args.total_epochs,
                               batch=self.args.batch_size,
                               val_batch=self.args.batch_size)

    def train(self):
        self.audio_paths, self.labels, _, _ = self.dataset.get_all_data()

        # unit2idx
        unit2idx = {}
        with open(self.args.vocab_txt_path, 'r', encoding='utf-8') as fr:
            for line in fr:
                unit, idx = line.strip().split()
                unit2idx[unit] = int(idx)

        # Define the model
        model = Transformer(
            input_size=self.args.input_size,
            vocab_size=self.args.vocab_size,
            d_model=self.args.model_size,
            n_heads=self.args.n_heads,
            d_ff=self.args.model_size * 4,
            num_enc_blocks=self.args.num_enc_blocks,
            num_dec_blocks=self.args.num_dec_blocks,
            residual_dropout_rate=self.args.residual_dropout_rate,
            share_embedding=self.args.share_embedding)
        if torch.cuda.is_available():
            model.cuda()  # move the model to the GPU

        # Vocab size derived from the generated vocabulary
        vocab_size = len(unit2idx)
        print('Set the size of vocab: %d' % vocab_size)

        # Build the dataset and dataloader
        dataset = AudioDataset(
            audios_list=[i['audio_path'] for i in self.audio_paths],
            labels_list=[i['label'] for i in self.labels],
            unit2idx=unit2idx)
        dataloader = DataLoader(dataset,
                                batch_size=self.args.batch_size,
                                shuffle=True,
                                num_workers=0,
                                pin_memory=False,
                                collate_fn=Util.collate_fn)

        # lr = Util.get_learning_rate(step=1)
        lr = self.args.lr_factor
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=lr,
                                     betas=(0.9, 0.98),
                                     eps=1e-9)

        if not os.path.exists(self.args.data_model_dir):
            os.makedirs(self.args.data_model_dir)

        global_step = 1
        step_loss = 0
        print('Begin to Train...')
        for epoch in range(self.args.total_epochs):
            print('***** epoch: %d *****' % epoch)
            for step, (inputs, targets) in enumerate(dataloader):
                # Move the inputs to the GPU
                if torch.cuda.is_available():
                    inputs = inputs.cuda()
                    targets = targets.cuda()

                loss = model(inputs, targets)
                loss.backward()
                step_loss += loss.item()

                if (step + 1) % self.args.accu_grads_steps == 0:
                    # Gradient clipping
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)

                    optimizer.step()
                    optimizer.zero_grad()
                    if global_step % 10 == 0:
                        print(
                            '-Training-Epoch-%d, Global Step:%d, lr:%.8f, Loss:%.5f'
                            % (epoch, global_step, lr,
                               step_loss / self.args.accu_grads_steps))
                    global_step += 1
                    step_loss = 0

                    # Update the learning rate
                    # lr = Util.get_learning_rate(global_step)
                    lr = self.args.lr_factor
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr

            # Save the model
            checkpoint = model.state_dict()
            torch.save(
                checkpoint,
                os.path.join(self.args.data_model_dir,
                             'model.epoch.%d.pt' % epoch))
        print('Done!')
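# Aside (minimal sketch): the accumulation loop above steps the optimizer every
# accu_grads_steps micro-batches but does not rescale the loss. Dividing by the
# accumulation factor, as below, makes the summed gradient match a single large
# batch. model, dataloader and optimizer as defined above are assumed.
accu_steps = 4  # illustrative accumulation factor
for step, (inputs, targets) in enumerate(dataloader):
    loss = model(inputs, targets) / accu_steps
    loss.backward()
    if (step + 1) % accu_steps == 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
        optimizer.step()
        optimizer.zero_grad()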
Example No. 10
        transforms.RandomVerticalFlip(),
        transforms.RandomAffine(degrees=30, translate=(0.1, 0.1), scale=(0.9, 1.1)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        transforms.RandomErasing()
    ]),
    'val': transforms.Compose([
        transforms.Resize(crop_size),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

# Load the data
dataset = Dataset()
x_train, y_train, x_val, y_val = dataset.get_all_data()

# Shuffle the data
all_x = x_train + x_val
all_y = y_train + y_val
length = len(all_x)
split = int(length * 0.1)
random.seed(0)
samples = random.sample(range(length), length)

# Plain lists cannot be selected with an index array, so convert to numpy arrays
all_x, all_y = np.array(all_x), np.array(all_y)
all_x = all_x[samples]
all_y = all_y[samples]
x_train, y_train, x_val, y_val = all_x[:-split], all_y[:-split], all_x[-split:], all_y[-split:]
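# Aside on the "use numpy" comment above (illustration only): plain Python lists
# cannot be selected with an index array, numpy arrays can.
import numpy as np
xs = ['a', 'b', 'c']
order = [2, 0, 1]
print(np.array(xs)[order])  # ['c' 'a' 'b'];  xs[order] would raise TypeError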
Example No. 11
class StanceDetection(object):
    def __init__(self, exec_type='train'):
        # Project hyperparameters
        parser = argparse.ArgumentParser()
        parser.add_argument("-e", "--EPOCHS", default=50, type=int, help="train epochs")
        parser.add_argument("-b", "--BATCH", default=2, type=int, help="batch size")
        self.args = parser.parse_args()
        self.dataset = Dataset(epochs=self.args.EPOCHS, batch=self.args.BATCH)
        self.model_dir = os.path.join(os.getcwd(), arguments.model_dir)

        # 1. Split the data, read into defined format
        label2idx = dict((arguments.labels[i], i) for i in range(len(arguments.labels)))
        target_text, stance, _, _ = self.dataset.get_all_data()

        indexes = [" ".join(jieba.cut(i['TARGET'].lower(), cut_all=False)) for i in target_text]
        questions = [" ".join(jieba.cut(i['TEXT'].lower(), cut_all=False)) for i in target_text]
        labels = [i['STANCE'] for i in stance]
        data = [indexes, questions, labels]
        assert len(data[0]) == len(data[1]) == len(data[2])

        # 2. Data follows this order: train, test
        train_num = int(len(data[0]) * arguments.portion)
        train_data = [d[:train_num] for d in data]
        dev_data = [d[train_num:] for d in data]

        # 3. Read the vocab text file and get VOCAB dictionary
        vocab = read_emb(filename=os.path.join(os.getcwd(), arguments.sgns_dir), stat_lines=1)

        # 4. Transform text into indexes
        self.datasets, word2idx, embeddings = make_datasets(vocab=vocab,
                                                            raw_data={'training': train_data, 'validation': dev_data},
                                                            label2idx=label2idx,
                                                            big_voc=arguments.big_voc,
                                                            feat_names=arguments.feat_names)
        self.datasets_train = load_tvt(tvt_set=self.datasets['training'],
                                       max_lens=[arguments.ans_len, arguments.ask_len],
                                       feat_names=arguments.feat_names)
        self.datasets_dev = load_tvt(tvt_set=self.datasets['validation'],
                                     max_lens=[arguments.ans_len, arguments.ask_len],
                                     feat_names=arguments.feat_names)

        idx2word = dict((v, k) for k, v in word2idx.items())
        self.datasets["word2idx"] = word2idx
        self.datasets["idx2word"] = idx2word

        self.embeddings = torch.from_numpy(np.asarray(embeddings, dtype=np.float32))

        if exec_type == 'train':
            self.main()
        else:
            model = load_torch_model(self.model_dir)
            test(model=model, dataset=self.datasets, test_set=None)

    def main(self):
        """ continue training or not """
        if arguments.proceed:
            if os.path.exists(self.model_dir):
                with open(self.model_dir, "rb") as saved_model:
                    model = torch.load(saved_model)
        else:
            models = {"Net": Net}
            model = models[arguments.model](embeddings=self.embeddings,
                                            input_dim=self.embeddings.size(1),
                                            hidden_dim=arguments.nhid,
                                            num_layers=arguments.nlayers,
                                            output_dim=arguments.nclass,
                                            max_step=[arguments.ans_len, arguments.ask_len],
                                            dropout=arguments.dropout)
            if arguments.model in ["Net"]:
                model.nhops = arguments.nhops

        # train
        model.to(device=DEVICE)
        # Optimizer
        optimizer = optim.Adam(model.parameters(), lr=arguments.lr, weight_decay=5e-5)
        # Loss function
        criterion = nn.CrossEntropyLoss()

        best_f1_test, best_p_valid, best_f1_valid = -np.inf, -np.inf, -np.inf
        epoch_f1_test, epoch_f1_valid, epoch_f1_cur = 0, 0, 0
        batches_per_epoch = len(self.datasets_train) // self.args.BATCH
        max_train_steps = int(self.args.EPOCHS * batches_per_epoch)

        print("--------------\nEpoch 0 begins!")
        bar = Bar("  Processing", max=max_train_steps)
        print(max_train_steps, self.args.EPOCHS, len(self.datasets_train), self.args.BATCH)

        for step in range(max_train_steps):
            bar.next()
            training_batch = self.datasets_train.next_batch(self.args.BATCH)
            features, seq_lens, mask_matrice, labels = training_batch
            (answers, answers_seqlen, answers_mask), (questions, questions_seqlen, questions_mask) \
                = zip(features, seq_lens, mask_matrice)

            assert self.args.BATCH == len(labels) == len(questions) == len(answers)

            # Prepare data and prediction
            labels_ = Variable(torch.LongTensor(labels)).to(DEVICE)

            # necessary for Room model
            torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=0.25)

            # zero grad
            model.train()
            model.zero_grad()
            outputs = classify_batch(model=model,
                                     features=[answers, answers_seqlen, answers_mask, questions, questions_seqlen,
                                               questions_mask],
                                     max_lens=(arguments.ans_len, arguments.ask_len))
            loss = criterion(outputs[0].view(len(labels_), -1), labels_)

            loss.backward()
            optimizer.step()

            # Test after each epoch
            if (step + 1) % batches_per_epoch == 0:
                tic = time.time()
                f1_score, p_score = test(model=model,
                                         log_result=False,
                                         dataset=self.datasets,
                                         test_set=self.datasets_dev,
                                         batch_size=self.args.BATCH)

                print("\n  Begin to predict the results on Valid")
                print("  using %.5fs" % (time.time() - tic))
                print("  ----Old best F1 on Valid is %f on epoch %d" % (best_f1_valid, epoch_f1_valid))
                print("  ----Old best F1 on Test is %f on epoch %d" % (best_f1_test, epoch_f1_test))

                if f1_score > best_f1_valid:
                    with open(self.model_dir, 'wb') as to_save:
                        torch.save(model, to_save)

                    best_f1_valid = f1_score
                    print("  ----New best F1 on Valid is %f" % f1_score)
                    epoch_f1_valid = self.datasets_train.epochs_completed
                print("--------------\nEpoch %d begins!" % (self.datasets_train.epochs_completed + 1))

        bar.finish()
Example No. 12
class Instructor(object):
    """
    Approach: fetch everything with flyai's get_all_data, split batches ourselves, and train a separate model per topic.
    """

    def __init__(self, arguments):
        # Project hyperparameters
        parser = argparse.ArgumentParser()
        parser.add_argument("-e", "--EPOCHS", default=5, type=int, help="train epochs")
        parser.add_argument("-b", "--BATCH", default=2, type=int, help="batch size")
        self.args = parser.parse_args()
        self.arguments = arguments
        self.dataset = Dataset(epochs=self.args.EPOCHS, batch=self.args.BATCH, val_batch=self.args.BATCH)

        if 'bert' in self.arguments.model_name:
            self.tokenizer = Tokenizer4Bert(max_seq_len=self.arguments.max_seq_len,
                                            pretrained_bert_name=os.path.join(os.getcwd(),
                                                                              self.arguments.pretrained_bert_name))
            bert = BertModel.from_pretrained(pretrained_model_name_or_path=self.arguments.pretrained_bert_name)
            self.model = self.arguments.model_class(bert, self.arguments).to(self.arguments.device)
        else:
            self.tokenizer = Util.bulid_tokenizer(
                fnames=[self.arguments.dataset_file['train'], self.arguments.dataset_file['test']],
                max_seq_len=self.arguments.max_seq_len,
                dat_fname='{0}_tokenizer.dat'.format(self.arguments.dataset)
            )
            embedding_matrix = Util.build_embedding_matrix(
                word2idx=self.tokenizer.word2idx,
                embed_dim=self.arguments.embed_dim,
                dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(self.arguments.embed_dim), self.arguments.dataset)
            )
            self.model = self.arguments.model_class(embedding_matrix, self.arguments).to(self.arguments.device)

        if self.arguments.device.type == 'cuda':
            logger.info(
                'cuda memory allocated: {}'.format(torch.cuda.memory_allocated(device=self.arguments.device.index)))

        Util.print_args(model=self.model, logger=logger, args=self.arguments)

        target_text, stance, _, _ = self.dataset.get_all_data()
        self.target = np.asarray([i['TARGET'].lower() for i in target_text])
        text = np.asarray([i['TEXT'].lower() for i in target_text])
        self.stance = np.asarray([i['STANCE'] for i in stance])
        self.target_set = set()
        for tar in self.target:
            self.target_set.add(tar)
        self.text = PreProcessing(text).get_file_text()

    def run(self):
        # loss and optimizer
        criterion = nn.CrossEntropyLoss()
        _params = filter(lambda x: x.requires_grad, self.model.parameters())
        optimizer = self.arguments.optimizer(_params, lr=self.arguments.learning_rate,
                                             weight_decay=self.arguments.l2reg)

        for topic in self.arguments.topics:
            logger.info('>' * 100)
            logger.info('topic: {}'.format(topic))
            index = np.where(self.target == topic.lower())

            self.trainset = ABSADataset(data_type=None,
                                        fname=(self.target[index], self.text[index], self.stance[index]),
                                        tokenizer=self.tokenizer)

            self.valset_len = int(len(self.trainset) * self.arguments.valset_ratio)
            self.trainset, self.valset = random_split(self.trainset,
                                                      (len(self.trainset) - self.valset_len, self.valset_len))
            train_data_loader = DataLoader(dataset=self.trainset, batch_size=self.args.BATCH, shuffle=True)
            val_data_loader = DataLoader(dataset=self.valset, batch_size=self.args.BATCH, shuffle=False)

            # Training
            max_val_acc = 0
            max_val_f1 = 0
            global_step = 0
            best_model_path = None
            Util.reset_params(model=self.model, args=self.arguments)

            for epoch in range(self.args.EPOCHS):
                logger.info('>>')
                logger.info('epoch: {}'.format(epoch))
                n_correct, n_total, loss_total = 0, 0, 0
                self.model.train()
                for i_batch, sample_batched in enumerate(train_data_loader):
                    global_step += 1
                    optimizer.zero_grad()

                    inputs = [sample_batched[col].to(self.arguments.device) for col in self.arguments.inputs_cols]
                    outputs = self.model(inputs)
                    targets = torch.tensor(sample_batched['polarity']).to(self.arguments.device)

                    loss = criterion(outputs, targets)
                    loss.backward()
                    optimizer.step()

                    n_correct += (torch.argmax(outputs, -1) == targets).sum().item()
                    n_total += len(outputs)
                    loss_total += loss.item() * len(outputs)
                    if global_step % self.arguments.log_step == 0:
                        train_acc = n_correct / n_total
                        train_loss = loss_total / n_total
                        logger.info('loss: {:.4f}, acc: {:.4f}'.format(train_loss, train_acc))

                val_acc, val_f1 = Util.evaluate_acc_f1(model=self.model, args=self.arguments,
                                                       data_loader=val_data_loader)
                logger.info('> val_acc: {:.4f}, val_f1: {:.4f}'.format(val_acc, val_f1))
                if val_acc > max_val_acc:
                    max_val_acc = val_acc
                    best_model_path = os.path.join(os.getcwd(), self.arguments.best_model_path, topic)
                    if os.path.exists(best_model_path) is False:
                        os.mkdir(best_model_path)
                    Util.save_model(model=self.model, output_dir=best_model_path)
                    logger.info('>> saved: {}'.format(best_model_path))
                if val_f1 > max_val_f1:
                    max_val_f1 = val_f1

            Util.save_model(model=self.model, output_dir=best_model_path)

            logger.info('>>> target: {}'.format(self.target_set))
            logger.info('> max_val_acc: {0} max_val_f1: {1}'.format(max_val_acc, max_val_f1))
            logger.info('> train save model path: {}'.format(best_model_path))
Example No. 13
class Instructor(object):
    """
    Approach: fetch everything with flyai's get_all_data and split it into batches ourselves.
    """
    def __init__(self, args):
        self.args = args
        self.sortedDict = SortedByCountsDict(dump_dir=self.args.vocab_dump_dir)
        self.acoustic_vocab_size, self.acoustic_vocab = Util.get_acoustic_vocab_list()
        self.language_vocab_size, self.language_vocab = Util.get_language_vocab_list()

    def generate(self):
        self.data = Dataset(epochs=self.args.EPOCHS,
                            batch=self.args.BATCH,
                            val_batch=self.args.BATCH)
        audio_paths, labels, _, _ = self.data.get_all_data()

        # wav file paths
        audio_paths = [i['audio_path'] for i in audio_paths]
        # wav transcript text. TODO: currently contains spaces; test whether removing them improves the model
        audio_labels = []
        # wav transcript pinyin
        audio_pinyins = []
        for label in labels:
            label = label['label'].split(' ')
            audio_labels.append(''.join(label))
            audio_pinyins.append(' '.join([
                ' '.join([
                    ' '.join(j)
                    for j in pinyin(i, style=Style.TONE3, heteronym=False)
                ]) for i in label
            ]))

        # Build the vocabulary
        for label in labels:
            self.sortedDict.append_tokens(label)
        self.sortedDict.dump_pkl()

        # Split into train/dev
        audio_paths = np.asarray(audio_paths)
        audio_labels = np.asarray(audio_labels)
        audio_pinyins = np.asarray(audio_pinyins)

        # Shuffle the indices in place; np.random.shuffle(np.asarray(index)) only shuffles a temporary copy.
        index = np.random.permutation(len(audio_paths))
        split = int(len(index) * 0.9)
        train_audio_paths, dev_audio_paths = audio_paths[index[:split]], audio_paths[index[split:]]
        train_labels, dev_labels = audio_labels[index[:split]], audio_labels[index[split:]]
        train_pinyins, dev_pinyins = audio_pinyins[index[:split]], audio_pinyins[index[split:]]

        return (train_audio_paths.tolist(), train_labels.tolist(), train_pinyins.tolist(),
                dev_audio_paths.tolist(), dev_labels.tolist(), dev_pinyins.tolist())

    def train_am(self, train_audio_paths, train_labels, train_pinyins,
                 dev_audio_paths, dev_labels, dev_pinyins):
        """
        Train the acoustic model.
        :param train_audio_paths:
        :param train_labels:
        :param train_pinyins:
        :param dev_audio_paths:
        :param dev_labels:
        :param dev_pinyins:
        :return:
        """
        model = CNNCTCModel(args=self.args,
                            vocab_size=self.acoustic_vocab_size)
        # model = CNNRNNCTCModel(args=self.args, vocab_size=self.acoustic_vocab_size)

        hp = self.args
        hp.batch_size = self.args.am_batch_size
        hp.epochs = self.args.am_epochs
        hp.data_path = self.args.wav_dir
        hp.data_type = 'train'
        hp.feature_max_length = hp.am_feature_max_length
        train_generator = DataGenerator(audio_paths=train_audio_paths,
                                        labels=train_labels,
                                        pinyins=train_pinyins,
                                        hp=hp,
                                        acoustic_vocab=self.acoustic_vocab)
        hp.data_type = 'dev'
        dev_generator = DataGenerator(audio_paths=dev_audio_paths,
                                      labels=dev_labels,
                                      pinyins=dev_pinyins,
                                      hp=hp,
                                      acoustic_vocab=self.acoustic_vocab)
        cpCallBack = ModelCheckpoint(os.path.join(self.args.AmModelFolder,
                                                  hp.am_ckpt),
                                     verbose=1,
                                     save_best_only=True)
        tbCallBack = keras.callbacks.TensorBoard(
            log_dir=self.args.AmModelTensorBoard,
            histogram_freq=0,
            write_graph=True,
            write_images=True,
            update_freq='epoch')

        select_model = '0'
        if os.path.exists(hp.AmModelFolder + select_model + '.hdf5'):
            print('load acoustic model...')
            model.load_model(select_model)

        model.ctc_model.fit_generator(train_generator,
                                      steps_per_epoch=len(train_pinyins) //
                                      hp.batch_size,
                                      validation_data=dev_generator,
                                      validation_steps=20,
                                      epochs=hp.epochs,
                                      workers=10,
                                      use_multiprocessing=True,
                                      callbacks=[cpCallBack, tbCallBack])

    def train_lm(self, train_labels, train_pinyins):
        """
        Train the language model.
        :param train_labels:
        :param train_pinyins:
        :return:
        """
        hp = self.args
        hp.batch_size = self.args.lm_batch_size
        hp.epochs = self.args.lm_epochs
        hp.data_type = 'train'
        hp.max_len = self.args.lm_max_len
        hp.hidden_units = self.args.lm_hidden_units
        hp.is_training = self.args.lm_is_training
        hp.feature_dim = self.args.lm_feature_dim
        hp.num_heads = self.args.lm_num_heads
        hp.num_blocks = self.args.lm_num_blocks
        hp.position_max_length = self.args.lm_position_max_length
        hp.lr = self.args.lm_lr
        hp.dropout_rate = self.args.lm_dropout_rate

        epochs = hp.epochs
        lm_model = TransformerModel(
            arg=hp,
            acoustic_vocab_size=self.acoustic_vocab_size,
            language_vocab_size=self.language_vocab_size)

        batch_num = len(train_pinyins) // hp.batch_size
        with lm_model.graph.as_default():
            saver = tf.train.Saver(max_to_keep=50)
            config = tf.ConfigProto()
            # Use up to 90% of GPU memory
            config.gpu_options.per_process_gpu_memory_fraction = 0.9
        with tf.Session(graph=lm_model.graph, config=config) as sess:
            merged = tf.summary.merge_all()
            sess.run(tf.global_variables_initializer())
            if os.path.exists(hp.LmModelFolder):
                print('loading language model...')
                latest = tf.train.latest_checkpoint(hp.LmModelFolder)
                if latest is not None:
                    saver.restore(sess, latest)
            writer = tf.summary.FileWriter(hp.LmModelTensorboard,
                                           tf.get_default_graph())
            for k in range(epochs):
                total_loss = 0
                batch = Util.get_lm_batch(args=hp,
                                          pny_lst=train_pinyins,
                                          han_lst=train_labels,
                                          acoustic_vocab=self.acoustic_vocab,
                                          language_vocab=self.language_vocab)
                for i in range(batch_num):
                    input_batch, label_batch = next(batch)
                    feed = {lm_model.x: input_batch, lm_model.y: label_batch}
                    cost, _ = sess.run([lm_model.mean_loss, lm_model.train_op],
                                       feed_dict=feed)
                    total_loss += cost
                    if i % 10 == 0:
                        print("epoch: %d step: %d/%d  train loss=6%f" %
                              (k + 1, i, batch_num, cost))
                        if i % 5000 == 0:
                            rs = sess.run(merged, feed_dict=feed)
                            writer.add_summary(rs, k * batch_num + i)
                print('epochs', k + 1, ': average loss = ',
                      total_loss / batch_num)
                saver.save(sess, hp.LmModelFolder + hp.lm_ckpt)
            writer.close()
        pass

    def run(self):
        # Copy attachment files into place
        for name, after_dir in zip(
            ['dict.txt', 'hanzi.txt', 'mixdict.txt'],
            [self.args.dict_dir, self.args.hanzi_dir, self.args.mixdict_dir]):
            before_dir = os.path.join(os.getcwd(), 'attach_data', name)
            logger.info('>>>name:{}'.format(name))
            logger.info('>before_dir:{}'.format(before_dir))
            logger.info('>after_dir:{}'.format(after_dir))
            shutil.copyfile(before_dir, after_dir)

        train_audio_paths, train_labels, train_pinyins, dev_audio_paths, dev_labels, dev_pinyins = self.generate(
        )
        logger.info('start train am model!')
        self.train_am(train_audio_paths, train_labels, train_pinyins,
                      dev_audio_paths, dev_labels, dev_pinyins)
        logger.info('end train am model!')

        logger.info('start train lm model!')
        self.train_lm(train_labels=train_labels, train_pinyins=train_pinyins)
        logger.info('end train lm model!')
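# Aside on the pinyin conversion in generate() above (illustration only): with
# Style.TONE3, pypinyin returns one single-element list per character with the
# tone number appended, e.g.:
from pypinyin import pinyin, Style
print(pinyin('中国', style=Style.TONE3, heteronym=False))  # [['zhong1'], ['guo2']]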