Beispiel #1
0
    def val(self, x, prefix="val"):
        """Evaluate the model on the node subset ``x``.

        Runs a full forward pass over the graph in eval mode (no grad) and
        scores the rows selected by ``x`` against ``self.target``.

        Returns a dict with ``{prefix}_loss``, accuracy, macro-F1,
        precision and recall.
        """
        self.model.eval()
        with th.no_grad():
            logits = self.model.forward(self.features, self.adj)
            preds, labels = logits[x], self.target[x]
            eval_loss = self.criterion(preds, labels)
            eval_acc = accuracy(preds, labels)
            f1, prec, rec = macro_f1(preds, labels, num_classes=self.nclass)
            metrics = {
                f"{prefix}_loss": eval_loss.item(),
                "acc": eval_acc,
                "macro_f1": f1,
                "precision": prec,
                "recall": rec,
            }
        return metrics
def test(test_dataloader, model, device, dev_batch_size):
    """Run inference over ``test_dataloader`` and score it with macro-F1.

    Returns ``(outputs, test_f1_score)`` where ``outputs`` is the stacked
    raw model output for every example and the score is rounded to 4
    decimals. ``dev_batch_size`` is unused but kept for interface
    compatibility with existing callers.
    """
    model.eval()
    pred_chunks = []
    gold_chunks = []
    with torch.no_grad():
        for batch_tensor, batch_sent_len, batch_labels in test_dataloader:
            batch_tensor = batch_tensor.to(device)
            batch_sent_len = batch_sent_len.to(device)
            batch_labels = batch_labels.to(device)

            preds = model([batch_tensor, batch_sent_len])

            pred_chunks.append(preds.to('cpu').numpy())
            gold_chunks.append(batch_labels.to('cpu').numpy())

    outputs = np.concatenate(pred_chunks, 0)
    gold_labels = np.concatenate(gold_chunks, 0)
    test_f1_score = round(macro_f1(outputs, gold_labels), 4)

    return outputs, test_f1_score
Beispiel #3
0
def valid(flags, valid_dataset, root_path=''):
    """Evaluate the saved model on ``valid_dataset``.

    Rebuilds the network from ``flags``, restores the weights stored at
    ``flags.weight_save_path``, then prints average loss, precision,
    recall and F1 over the whole validation set.
    """
    model = get_model(flags.max_len, flags.vocab_size, flags.embedding_dim,
                      flags.lstm_unit, flags.dropout_loss_rate,
                      flags.label_num)
    model.load_weights(root_path + flags.weight_save_path)

    valid_losses = 0
    avg_prec = avg_recall = avg_f1 = 0

    batches = valid_dataset.shuffle(flags.shuffle_size).batch(flags.batch_size)
    for batch in batches:
        x, y = batch
        # One target slice per output head.
        y_true = [y[:, idx, :] for idx in range(y.shape[1])]
        pred = model.predict(x)
        loss = [
            tf.keras.losses.categorical_crossentropy(t, p)
            for t, p in zip(y_true, pred)
        ]
        valid_losses += sum(sum(loss) / x.shape[0])
        # Accumulate per-sample metrics (normalized after the loop).
        for idx in range(x.shape[0]):
            prec, recall, f1 = macro_f1(
                4, list(map(np.argmax,
                            np.array(pred)[:, idx, :])),
                list(map(np.argmax, y[idx])))
            avg_prec += prec
            avg_recall += recall
            avg_f1 += f1

    valid_losses = valid_losses / (flags.num_valid_sample / flags.batch_size)
    avg_prec /= flags.num_valid_sample
    avg_recall /= flags.num_valid_sample
    avg_f1 /= flags.num_valid_sample

    print(
        f'Valid loss={valid_losses:.4f}, precision={avg_prec:.4f}, recall={avg_recall:.4f}, f1={avg_f1:.4f}'
    )
Beispiel #4
0
def _build_model(args, embeddings):
    """Build the network selected by ``args.model_name`` (helper for main).

    Shared by the train and test branches so the two stay consistent: the
    original test branch read ``args.max_len`` for TextRCNN while training
    used ``args.max_seq_len``, and it omitted the TextCNN_Highway case,
    which left ``model`` unbound at load time.
    """
    if args.model_name == "TextRNN":
        return TextRNN(TextRNN_Config(embeddings, args.num_label))
    if args.model_name == "TextCNN":
        return TextCNN(TextCNN_Config(embeddings, args.num_label))
    if args.model_name == "TextRCNN":
        return TextRCNN(
            TextRCNN_Config(embeddings, args.max_seq_len, args.num_label))
    if args.model_name == "TextCNN_Highway":
        return TextCNN_Highway(
            TextCNN_Highway_Config(embeddings, args.num_label))
    if args.model_name == "TextRNN_Attention":
        return TextRNN_Attention(
            TextRNN_Attention_Config(embeddings, args.num_label))
    raise ValueError("Unknown model_name: {}".format(args.model_name))


def main(args):
    """Train (optionally with 5-fold CV) and/or test a text classifier.

    Checkpoints go to ``args.ckpt_path/args.model_name``. With
    ``args.do_test``, the per-fold test outputs are ensembled by summing
    the raw model outputs before the final macro-F1.
    """
    # Optionally log to a file instead:
    # logging.basicConfig(format='%(asctime)s - %(levelname)s: %(message)s',
    #                     filename=args.model_name + '.log',
    #                     filemode='w',
    #                     level=logging.INFO)
    logging.basicConfig(format='%(asctime)s - %(levelname)s: %(message)s',
                        level=logging.INFO)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    set_seed(args.seed, n_gpu)

    logging.info("The active device is: {}, gpu_num: {}".format(device, n_gpu))

    args.ckpt_path = os.path.join(args.ckpt_path, args.model_name)
    # FIX: was a bare ``try: os.makedirs(...) except: pass``, which also
    # swallowed real failures such as permission errors.
    os.makedirs(args.ckpt_path, exist_ok=True)

    extra_token_dict = {"unk_token": "<UNK>", "pad_token": "<PAD>"}

    train_dev_sentences, test_sentences, train_dev_labels, test_labels, word2id, id2word, embeddings = \
        load_data(args.raw_path, args.embedding_path, args.train_path, args.test_path, args.max_seq_len, extra_token_dict)

    f1_list = [0] * 5
    if args.do_train:
        kf = StratifiedKFold(n_splits=5,
                             shuffle=True).split(train_dev_sentences,
                                                 train_dev_labels)
        for cv_i, (train_index, dev_index) in enumerate(kf):
            logging.info(
                "******************Train CV_{}******************".format(cv_i))
            # A fresh model is built per fold.
            model = _build_model(args, embeddings)
            logging.info("Already load the model: {},".format(args.model_name))
            model.to(device)

            train_sentences = [train_dev_sentences[i] for i in train_index]
            train_labels = [train_dev_labels[i] for i in train_index]
            dev_sentences = [train_dev_sentences[i] for i in dev_index]
            dev_labels = [train_dev_labels[i] for i in dev_index]

            logging.info("Prepare dataloader...")
            train_tensor, train_sent_len, train_labels_tensor = convert2feature(
                train_sentences, train_labels, word2id, args.max_seq_len)
            train_data = TensorDataset(train_tensor, train_sent_len,
                                       train_labels_tensor)
            train_sampler = RandomSampler(train_data)
            train_dataloader = DataLoader(train_data,
                                          sampler=train_sampler,
                                          batch_size=args.batch_size)
            # cycle() lets train_eval draw a fixed number of steps rather
            # than iterating whole epochs.
            train_dataloader = cycle(train_dataloader)

            dev_tensor, dev_sent_len, dev_labels_tensor = convert2feature(
                dev_sentences, dev_labels, word2id, args.max_seq_len)
            dev_data = TensorDataset(dev_tensor, dev_sent_len,
                                     dev_labels_tensor)
            dev_sampler = SequentialSampler(dev_data)
            dev_dataloader = DataLoader(dev_data,
                                        sampler=dev_sampler,
                                        batch_size=args.dev_batch_size)

            logging.info("Begin to train...")
            f1_list[cv_i] = train_eval(train_dataloader, dev_dataloader, model,
                                       args.ckpt_path, args.train_steps,
                                       args.check_step, args.eval_step,
                                       args.lr, args.warmup_steps, cv_i)
            if not args.do_cv:
                break
        if args.do_cv:
            cv_f1 = np.mean(np.array(f1_list))
            logging.info("CV F1_list: {}, Mean_F1: {:.4f}".format(
                f1_list, cv_f1))

    if args.do_test:
        logging.info("******************Test******************")
        logging.info("Begin to test {}...".format(args.model_name))
        test_tensor, test_sent_len, test_labels_tensor = convert2feature(
            test_sentences, test_labels, word2id, args.max_seq_len)
        test_data = TensorDataset(test_tensor, test_sent_len,
                                  test_labels_tensor)
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data,
                                     sampler=test_sampler,
                                     batch_size=args.dev_batch_size)

        final_results = np.zeros((len(test_labels), args.num_label))
        test_labels = test_labels_tensor.to('cpu').numpy()
        for cv_i in range(5):
            ckpt_path = os.path.join(args.ckpt_path,
                                     "pytorch_model_{}.pkl".format(cv_i))
            # FIX: the original duplicated the construction chain here with
            # args.max_len for TextRCNN and no TextCNN_Highway branch.
            model = _build_model(args, embeddings)
            model.load_state_dict(torch.load(ckpt_path))
            model.to(device)
            output_labels, test_f1_score = test(test_dataloader, model, device,
                                                args.dev_batch_size)
            # Ensemble the folds by summing their raw outputs.
            final_results = final_results + output_labels
            logging.info(
                "The cv_{} result of {} on test data: F1: {:.4f}".format(
                    cv_i, args.model_name, test_f1_score))
            if not args.do_cv:
                break

        test_f1_score = round(macro_f1(final_results, test_labels), 4)
        logging.info("The final result of {} on test data: F1: {:.4f}".format(
            args.model_name, test_f1_score))
Beispiel #5
0
            for core_value in arff.attributes[arff.attr_position["class"]][1]:
                if core_value != classification:
                    confusion_matrices[core_value]["tn"] += 1
        else:
            # This is a false positive for this classification
            confusion_matrices[classification]["fp"] += 1
            # A FP for this one is a FN for the correct one
            confusion_matrices[record[arff.attr_position["class"]]]["fn"] += 1
    
    results_dict = {}
    results_dict["mp"] = utils.micro_precision(arff.attributes[arff.attr_position["class"]][1], confusion_matrices)
    results_dict["mr"] = utils.micro_recall(arff.attributes[arff.attr_position["class"]][1], confusion_matrices)
    results_dict["mf1"] = utils.micfo_f1(arff.attributes[arff.attr_position["class"]][1], confusion_matrices)
    results_dict["Mp"] = utils.macro_precision(arff.attributes[arff.attr_position["class"]][1], confusion_matrices)
    results_dict["Mr"] = utils.macro_recall(arff.attributes[arff.attr_position["class"]][1], confusion_matrices)
    results_dict["Mf1"] = utils.macro_f1(arff.attributes[arff.attr_position["class"]][1], confusion_matrices)
    results_dict["ac"] = utils.accuracy(arff.attributes[arff.attr_position["class"]][1], confusion_matrices)
    
    print("Micro Precision  " + str(run_num) + ": " + str(results_dict["mp"]))
    print("Micro Recall     " + str(run_num) + ": " + str(results_dict["mr"]))
    print("Micro F1         " + str(run_num) + ": " + str(results_dict["mf1"]))
    print("Macro Precision  " + str(run_num) + ": " + str(results_dict["Mp"]))
    print("Macro Recall     " + str(run_num) + ": " + str(results_dict["Mr"]))
    print("Macro F1         " + str(run_num) + ": " + str(results_dict["Mf1"]))
    print("Accuracy         " + str(run_num) + ": " + str(results_dict["ac"]))

    validation_results.append(results_dict)

    # Push the test data back into the training data
    arff.data.extend(training_records)
Beispiel #6
0
def train_val(load_path=None):
    """Train AstroNet on chunked CSV data, validate after every epoch, and
    checkpoint the model.

    Parameters
    ----------
    load_path : str, optional
        Path of a saved ``state_dict`` to warm-start the model from.

    Side effects: writes per-epoch prediction CSVs under ./results/,
    macro-F1 JSON logs under ./logs/, and checkpoints under
    ./checkpoints/; prints progress throughout.
    """
    path = './data/'
    train_path, test_path = path + 'new_columns_trains_sets.csv', path + 'val_sets_v1.csv'
    model = AstroNet()
    print(model)
    if load_path:
        state_dict = torch.load(load_path)  # print(state_dict)
        model.load_state_dict(state_dict)
    print("data and model is ready!!!")
    ''' loss_function ------ optimizer ------->  train '''
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001,
                          momentum=0.9)  # 0.0001?
    scheduler = lr_scheduler.MultiStepLR(optimizer,
                                         milestones=[6, 8],
                                         gamma=0.1)  # to adjust LR
    epoches = 12
    device = torch.device(
        "cpu"
    )  # torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = model.to(device)  # CPU is deliberate here; GPU not necessary
    model_dir = "./checkpoints/"
    # training---------------------------------------------------------------------------------------------------------
    print("ready to train!!", "Current Time : " + time.ctime())
    best_model_wts = copy.deepcopy(model.state_dict())
    best_f1 = 0.0
    for epoch in range(1, epoches + 1):
        print('Epoch {}/{}'.format(epoch, epoches), '--' * 10)
        running_loss, running_corrects, data_num = 0.0, 0.0, 0.0
        # The training CSV is streamed in 256-row chunks because it is too
        # large to load whole; each chunk becomes one Astronomy dataset.
        train_reader = pandas.read_csv(train_path,
                                       chunksize=256,
                                       low_memory=False)
        '''one thing to be clarified here is that since the csv file is too large to load, we have to read it chunk by chunk, 
            each chunk (whose type is pandas.DataFrame) is used to initialize a Astronomy Dataset defined in nn_ModelSet.py'''
        scheduler.step(
        )  # adjust this to end of each epoch if using pytorch>1.1...
        model.train()  # Set model to training mode
        for batch_idx, train_df in enumerate(train_reader, 1):
            try:
                train_set = Astronomy(
                    train_df
                )  # each chunk is used to initialize a Astronomy Dataset
                train_loader = DataLoader(train_set,
                                          batch_size=256,
                                          shuffle=True,
                                          num_workers=8)
                # actually one batch though 'for' is used:
                # NOTE(review): batch_size equals the chunk size, so this
                # loader yields a single batch; only the last (inputs,
                # labels) pair survives the loop for the training step below.
                for inputs, labels, ids in train_loader:
                    inputs = inputs.to(device)
                    labels = labels.to(device)
            except ValueError:  # some chunks have dtype problems; drop the whole chunk
                continue
            # zero the parameter gradients
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Per-batch progress: average loss and accuracy.
            _value, predictions = torch.max(outputs,
                                            dim=1)  # common trick here!!!
            batch_num, batch_corrects = len(outputs), torch.sum(
                predictions == labels.data)
            print(
                '->batch NO.#%d\tbatch_size:%d\tavg_loss:%.4f\tacc:%.4f ' %
                (batch_idx, batch_num, loss.item(),
                 batch_corrects.double() / batch_num), time.ctime())
            running_corrects, running_loss, data_num = running_corrects + batch_corrects.item(
            ), running_loss + loss.item() * batch_num, data_num + batch_num

        epoch_loss, epoch_acc = running_loss / data_num, running_corrects / data_num  # averaged!
        print('Epoch {}/{}'.format(epoch, epoches),
              'Loss: {:.4f} Acc: {:.4f}'.format(epoch_loss, epoch_acc),
              'data_num: ', data_num)

        # evaluating------------------------------------------------------------------------------------------------------------
        print('evaluating', '---' * 20)
        model.eval()
        test_reader = pandas.read_csv(test_path, chunksize=4096)
        result = pandas.DataFrame({'id': [], 'label': []})
        label2cls = {0.0: 'star', 1.0: 'galaxy', 2.0: 'qso'}
        for bat_idx, test_df in enumerate(test_reader, 1):
            test_set = Astronomy(
                test_df,
                'test')  # each chunk is used to initialize a Astronomy Dataset
            test_loader = DataLoader(test_set,
                                     batch_size=4096,
                                     shuffle=False,
                                     num_workers=8)
            # NOTE(review): `inputs.to(device)` discards its return value,
            # and the forward pass below sits OUTSIDE this loop — this only
            # works because device is CPU and batch_size equals the chunk
            # size (single batch per loader). Confirm before changing either.
            for inputs, ids in test_loader:
                inputs.to(device)
            outputs = model(inputs)
            _idx, predictions = torch.max(outputs, dim=1)
            _result = pandas.DataFrame({
                'id': list(ids),
                'label_': list(predictions.float())
            })
            _result['label'] = _result['label_'].map(
                label2cls
            )  # some map problem happens, i dont know why but i will fix it later
            result = pandas.concat([result, _result], axis=0)
        # NOTE(review): predictions are written with a 'Feb16-' prefix, but
        # fix_map and macro_f1 below read the UN-prefixed name — likely a
        # stale filename; verify which file is intended.
        result.to_csv('./results/Feb16-evaluate_epoch%d.csv' % epoch,
                      index=False,
                      encoding='utf-8')
        fix_map('./results/evaluate_epoch%d.csv' %
                epoch)  # fixing some map problem...
        # eval and deep copy the model
        print('Answer written to disk & Start Macro-F1 evaluating!')
        epoch_f1 = macro_f1('./results/evaluate_epoch%d.csv' % epoch,
                            './logs/Feb16-evaluate_epoch%d.json' % epoch)
        if epoch_f1 > 0:  # saves every epoch; change to 'if epoch_f1 > best_f1:' to keep only the best model on the validation set
            best_f1 = epoch_f1
            best_model_wts = copy.deepcopy(model.state_dict())
            torch.save(
                best_model_wts, model_dir + "Feb16-epoch%d-macroF1-%.4f.pth" %
                (epoch, best_f1))

    print('Best macro f1 : {:4f}'.format(best_f1),
          "Current Time : " + time.ctime())
def train_eval(train_dataloader, dev_dataloader, model, ckpt_path, train_steps,
               check_step, eval_step, lr, warmup_steps, cv_i):
    """Train ``model`` for ``train_steps`` batches with periodic dev
    evaluation; save the best checkpoint and return its dev macro-F1.

    Parameters
    ----------
    train_dataloader : infinite iterator of (tensor, sent_len, labels)
        batches — the caller wraps a DataLoader in itertools.cycle, so
        ``next()`` never exhausts it.
    dev_dataloader : finite DataLoader over the dev split.
    ckpt_path : checkpoint directory; the model is saved to
        ``pytorch_model_{cv_i}.pkl`` inside it.
    check_step / eval_step : intervals (in steps) for loss logging and
        dev evaluation respectively.
    """

    ckpt_path = os.path.join(ckpt_path, "pytorch_model_{}.pkl".format(cv_i))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # print(model.named_parameters)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps, train_steps)
    loss_func = nn.CrossEntropyLoss()

    model.train()
    best_step = 0       # step at which the best dev F1 was seen
    best_f1 = 0         # best dev macro-F1 so far
    train_loss = 0      # loss accumulated since the last check_step report

    bar = range(train_steps)
    for step in bar:
        batch = next(train_dataloader)
        batch = tuple(t.to(device) for t in batch)
        batch_tensor, batch_sent_len, batch_labels = batch

        output = model([batch_tensor, batch_sent_len])
        loss = loss_func(output, batch_labels)
        optimizer.zero_grad()

        loss.backward()
        train_loss += loss.item()

        optimizer.step()
        scheduler.step()

        # Report the mean training loss over the last check_step batches.
        if (step + 1) % check_step == 0:
            logging.info("Step: {}, Train_batch_loss: {}".format(
                step + 1, train_loss / check_step))
            train_loss = 0

        # Periodic evaluation on the dev set.
        if (step + 1) % eval_step == 0:
            model.eval()
            output_labels = []
            gold_labels = []
            with torch.no_grad():
                for dev_step, (dev_tensor, dev_sent_len,
                               dev_labels) in enumerate(dev_dataloader):
                    dev_tensor = dev_tensor.to(device)
                    dev_sent_len = dev_sent_len.to(device)
                    dev_labels = dev_labels.to(device)

                    output = model([dev_tensor, dev_sent_len])

                    output = output.to('cpu').numpy()
                    output_labels.append(output)
                    gold = dev_labels.to('cpu').numpy()
                    gold_labels.append(gold)

            output_labels = np.concatenate(output_labels, 0)
            gold_labels = np.concatenate(gold_labels, 0)
            dev_f1_score = round(macro_f1(output_labels, gold_labels), 4)

            # Checkpoint only when the dev F1 improves.
            if dev_f1_score > best_f1:
                best_step = step
                best_f1 = dev_f1_score
                torch.save(model.state_dict(), ckpt_path)

            logging.info("Dev_f1_score: {}, Best_dev_f1: {}\n".format(
                dev_f1_score, best_f1))

        # Early stopping: no dev improvement within a 3000-step window.
        if step + 1 - best_step > 3000:
            logging.info("Early stopped at Step: {}, Best_dev_f1: {}\n".format(
                step + 1, best_f1))
            break
        # Restores train mode after an eval pass (runs every step; the
        # extra calls on non-eval steps are harmless).
        model.train()
    return best_f1
Beispiel #8
0
def train(flags, logger, train_dataset, valid_dataset, root_path=''):
    """Train the model with checkpointing on validation-loss improvement,
    patience-based LR decay, and early stopping.

    Resumes from ``flags.ckpt_params_path`` when it exists. Each epoch runs
    validation on the last training batch step. Training ends after
    patience and the final LR decay are exhausted; the last weights are
    then written to ``flags.weight_save_path``.
    """
    # Load checkpoint parameters, or initialize fresh training state.
    logger.info('Loading checkpoint params...')
    if os.path.exists(root_path + flags.ckpt_params_path):
        with open(root_path + flags.ckpt_params_path, 'r') as f:
            params = json.loads(f.readline())
    else:
        params = {
            'epoch': 0,
            'patience': 1,          # epochs of rising loss tolerated
            'final_learn': 1,       # remaining LR decays before stopping
            'lr': 1e-3,
            'pre_best_loss': 10000000,
            'pre_best_metrics': (0.0, 0.0, 0.0),
            'pre_best_ckpt_path': ''
        }

    # Build the model and restore the previous best weights, if any.
    logger.info('Initialize model...')
    model = get_model(flags.max_len, flags.vocab_size, flags.embedding_dim,
                      flags.lstm_unit, flags.dropout_loss_rate,
                      flags.label_num)
    if params['pre_best_ckpt_path']:
        model.load_weights(root_path + params['pre_best_ckpt_path'])
    # Optimizer is recreated with the (possibly decayed) learning rate.
    logger.info(f'Setting learning rate as {params["lr"]}')
    optimizer = tf.keras.optimizers.Adam(params['lr'])

    train_batch_nums = math.ceil(flags.num_train_sample / flags.batch_size)
    while True:
        params['epoch'] += 1

        # Reset per-epoch accumulators.
        train_losses = 0
        valid_losses = 0
        avg_prec, avg_recall, avg_f1 = 0, 0, 0

        # ---- training ----
        with tqdm(enumerate(
                train_dataset.shuffle(flags.shuffle_size).batch(
                    flags.batch_size)),
                  total=train_batch_nums) as pbar:
            for train_step, batch in pbar:
                x, y = batch
                # One target slice per output head.
                y_true = [y[:, i, :] for i in range(y.shape[1])]
                with tf.GradientTape() as tape:
                    logits = model(x, training=True)
                    loss = [
                        tf.keras.losses.categorical_crossentropy(y_i, l_i)
                        for y_i, l_i in zip(y_true, logits)
                    ]
                grads = tape.gradient(loss, model.trainable_weights)
                optimizer.apply_gradients(zip(grads, model.trainable_weights))

                # Normalize by the actual batch size: when num_sample /
                # batch_size is not an integer the last batch is smaller.
                train_losses += sum(sum(loss) / x.shape[0])

                if train_step == train_batch_nums - 1:
                    # ---- validation at the end of the epoch ----
                    logger.info(f'Validating at epoch{params["epoch"]}')
                    for _, batch in enumerate(
                            valid_dataset.shuffle(flags.shuffle_size).batch(
                                flags.batch_size)):
                        x, y = batch
                        y_true = [y[:, i, :] for i in range(y.shape[1])]
                        pred = model.predict(x)
                        loss = [
                            tf.keras.losses.categorical_crossentropy(y_i, p_i)
                            for y_i, p_i in zip(y_true, pred)
                        ]
                        valid_losses += sum(sum(loss) / x.shape[0])
                        for i in range(x.shape[0]):
                            prec, recall, f1 = macro_f1(
                                4, list(map(np.argmax,
                                            np.array(pred)[:, i, :])),
                                list(map(np.argmax, y[i])))
                            avg_prec += prec
                            avg_recall += recall
                            avg_f1 += f1

                    valid_losses = valid_losses / (flags.num_valid_sample /
                                                   flags.batch_size)
                    avg_prec /= flags.num_valid_sample
                    avg_recall /= flags.num_valid_sample
                    avg_f1 /= flags.num_valid_sample

                # BUG FIX: was `train_losses / train_step + 1`, which
                # divides before adding 1 (and divides by zero at step 0);
                # the running mean needs the parenthesized batch count.
                pbar.set_description(
                    f'Epoch{params["epoch"]}: train loss={train_losses / (train_step + 1):.4f}, '
                    + f'valid loss={valid_losses:.4f}, ' +
                    f'prec={avg_prec:.4f}, recall={avg_recall:.4f}, f1={avg_f1:.4f}'
                )
        logger.info(
            f'At epoch{params["epoch"]}, training loss={train_losses:.4f}')

        # ---- checkpointing / patience bookkeeping ----
        if valid_losses < params['pre_best_loss']:
            logger.info(f'Saving best checkpoint...')
            params['pre_best_loss'] = float(valid_losses)
            params['pre_best_metrics'] = (float(avg_prec), float(avg_recall),
                                          float(avg_f1))
            params['pre_best_ckpt_path'] = 'model/ckpt/best_ckpt'
            model.save_weights(root_path + params['pre_best_ckpt_path'])
            # Overwrite the previous best checkpoint params.
            with open(root_path + flags.ckpt_params_path, 'w') as f:
                json.dump(params, f)
            # Append a log line for every loss improvement.
            # FIX: added the '=' missing after 'valid loss' in the message.
            with open(root_path + flags.train_log, 'a') as f:
                f.write(
                    f'At epoch{params["epoch"]}, lr={params["lr"]}, train loss={train_losses / train_batch_nums:.4f}, valid loss={valid_losses:.4f}, precison={avg_prec:.4f}, recall={avg_recall:.4f}, f1={avg_f1:.4f}\n'
                )

            params['patience'] = 1
        else:
            logger.info(f'Loss increased at epoch{params["epoch"]}!')
            if params['patience'] > 0:
                params['patience'] -= 1
            else:
                if params['final_learn'] > 0:
                    # One last attempt: restore the best weights and retry
                    # with a 10x smaller learning rate.
                    logger.info(f'Restore previous best checkpoint...')
                    model.load_weights(root_path +
                                       params['pre_best_ckpt_path'])
                    params['final_learn'] -= 1
                    params['lr'] /= 10
                    logger.info(f'Decrease learning rate to {params["lr"]}')
                    optimizer = tf.keras.optimizers.Adam(params['lr'])
                    params['patience'] = 1
                else:
                    model.save_weights(root_path + flags.weight_save_path)
                    logger.info('End of Train.')
                    logger.info(
                        f'Best valid loss: {params["pre_best_loss"]:.4f}, precsion: {params["pre_best_metrics"][0]:.4f}, recall: {params["pre_best_metrics"][1]:.4f}, f1: {params["pre_best_metrics"][2]:.4f}'
                    )
                    break