def main():
    parser = argparse.ArgumentParser(description="-----[RNN-Attention-classifier]-----")
    parser.add_argument("--mode", default="train", help="train: train (with test) a model / test: test saved models")
    parser.add_argument("--model", default="non-static",
                        help="available models: rand, static, non-static, multichannel")
    parser.add_argument("--dataset", default="data", help="available datasets: MR, TREC")
    parser.add_argument("--save_model", default=True, action='store_true', help="whether saving model or not")
    parser.add_argument("--early_stopping", default=False, action='store_true', help="whether to apply early stopping")
    parser.add_argument("--epoch", default=20, type=int, help="number of max epoch")
    parser.add_argument("--learning_rate", default=0.001, type=float, help="learning rate")
    parser.add_argument("--gpu", default=0, type=int, help="the index of gpu to be used")

    options = parser.parse_args()
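    # `task_name` (and, further down, `model_name`) are assumed to be module-level
    # globals set elsewhere in the original script (e.g. from sys.argv); they are not
    # derived from the parsed options here.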
    if options.mode == "train":
        data, label_cnt = read_data(dir="raw_data/{}".format(task_name), train="train", dev="dev", test="test")
    else:
        data, label_cnt = read_data(dir="raw_data/{}".format(task_name), test="test")
    print(label_cnt)

    data["classes"] = sorted(list(set(data["train_y"])))

    params = {
        "MODEL": options.model,
        "DATASET": options.dataset,
        "SAVE_MODEL": options.save_model,
        "EARLY_STOPPING": options.early_stopping,
        "EPOCH": options.epoch,
        "LEARNING_RATE": options.learning_rate,
        # "MAX_SENT_LEN": max([len(sent) for sent in data["train_x"] + data["dev_x"] + data["test_x"]]),
        "MAX_SENT_LEN": 32,
        "BATCH_SIZE": 50,
        "CLASS_SIZE": len(data["classes"]),
        "DROPOUT_PROB": 0.5,
        "NORM_LIMIT": 3,
        "GPU": options.gpu,
        "H_DIM":32
    }

    print("=" * 20 + "INFORMATION" + "=" * 20)
    print("MODEL:", params["MODEL"])
    print("DATASET:", params["DATASET"])
    print("EPOCH:", params["EPOCH"])
    print("LEARNING_RATE:", params["LEARNING_RATE"])
    print("EARLY_STOPPING:", params["EARLY_STOPPING"])
    print("SAVE_MODEL:", params["SAVE_MODEL"])
    print("=" * 20 + "INFORMATION" + "=" * 20)

    if options.mode == "train":
        print("=" * 20 + "TRAINING STARTED" + "=" * 20)
        model = train(data, params)
        if params["SAVE_MODEL"]:
            save_cls(model, task_name, "attn.{}".format(model_name))
            # save_vocab(data["vocab"], task_name, model_name)
        print("=" * 20 + "TRAINING FINISHED" + "=" * 20)
    else:
        model = load_cls(task_name, model_name).cuda(params["GPU"])

        test_acc = test(data, model, params)
        print("test acc:", test_acc)
                idx = mask.cpu().numpy()
                idx = [int(ix) for ix in idx]
                contents = [tok for i, tok in enumerate(x) if i not in idx]
                wl = {
                    "content": ' '.join(contents),
                    "line": line.strip(),
                    "masks": list(idx),
                    "label": str(label)
                }
                #print(wl)
                wl_str = json.dumps(wl)
                fw.write(wl_str)
                fw.write("\n")
    fw.close()
    print("processed over!")


if __name__ == "__main__":
    cls = load_cls("{}".format(task_name), "attn.{}".format(model_name)).cuda()
    for i in cls.parameters():
        i.requires_grad = False
    cls.eval()
    cls_tf_idf(cls, 1, "train")
    cls_tf_idf(cls, 0, "train")
    cls_tf_idf(cls, 1, "dev")
    cls_tf_idf(cls, 0, "dev")
    #cls_tf_idf(cls, 1, "test")
    #cls_tf_idf(cls, 0, "test")
Example #3
        lines = line.strip().decode('utf-8').encode('gb18030').split('\t')
    except Exception:
        continue
    if len(lines) != 2:
        continue
    if float(lines[1]) > dict_thre and num < dict_num:
        word_dict[lines[0]] = 1
        num += 1
f.close()

frname = os.path.join(save_path, sys.argv[1])
f = open(sys.argv[1], 'r')
fwname = os.path.join(save_path, sys.argv[6] + '.data.' + operation)
fw = open(fwname, 'w')

cls = load_cls("{}".format(sys.argv[7]), "attn.cbert").cuda()
for i in cls.parameters():
    i.requires_grad = False


def cmp(a, b):
    return (a > b) - (a < b)
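# Drop-in replacement for the built-in cmp() removed in Python 3,
# e.g. cmp(3, 5) == -1, cmp(5, 5) == 0, cmp(7, 5) == 1.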


def cls_tf_idf(batch_lines):
    batch_x = [clean_str(sent) for sent in batch_lines]
    pred, attn = cls(batch_x)
    pred = np.argmax(pred.cpu().data.numpy(), axis=1)
    ret = []
    for line, x, pre, att in zip(batch_lines, batch_x, pred, attn):
        if len(x) > 0:
def run_aug(args, save_every_epoch=False):

    processors = {
        "yelp": biLabelProcessor,
        "amazon": biLabelProcessor,
        "imagecaption": biLabelProcessor,
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    os.makedirs(args.output_dir, exist_ok=True)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels(task_name)

    def load_model(model_name):
        weights_path = os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, model_name)
        model = torch.load(weights_path)
        return model

    cbert_name = "{}/CBertForMaskedLM_{}_epoch_{}{}".format(
        task_name.lower(), task_name.lower(), args.test_epoch, modified)
    model = load_model(cbert_name)
    model.to(device)

    cls_model = load_cls(task_name, model_name).cuda()
    for i in cls_model.parameters():
        i.requires_grad = False
    cls_model.eval()

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']

    if args.do_eval:

        # eval_bleu parameters
        generate_file_0 = "evaluation/outputs/{}/sentiment.test.0.{}".format(
            task_name, model_name)
        dev_file_0 = "evaluation/outputs/{}/sentiment.dev.0.{}".format(
            task_name, model_name)
        orgin_file_0 = "evaluation/outputs/{}/sentiment.test.0.human".format(
            task_name)
        generate_file_1 = "evaluation/outputs/{}/sentiment.test.1.{}".format(
            task_name, model_name)
        dev_file_1 = "evaluation/outputs/{}/sentiment.dev.1.{}".format(
            task_name, model_name)
        orgin_file_1 = "evaluation/outputs/{}/sentiment.test.1.human".format(
            task_name)
        save_file_path = "evaluation/outputs/{}/{}_ft_wc{}".format(
            task_name, model_name, modified)
        if not os.path.exists(save_file_path):
            os.mkdir(save_file_path)
        # eval_acc parameters
        dict_file = 'test_tools/li_test_tool/classify_Bilstm/data/style_transfer/zhi.dict.{}'.format(
            task_name)
        if task_name == 'yelp':
            train_rate = 0.9984
            valid_rate = 0.0008
            test_rate = 0.0008
        elif task_name == 'amazon':
            train_rate = 0.9989
            valid_rate = 0.00055
            test_rate = 0.00055
        else:
            train_rate = 0.9984
            valid_rate = 0.0008
            test_rate = 0.0008

        run_transfer(model,
                     tokenizer,
                     task_name,
                     model_name=model_name,
                     modified=modified,
                     set="dev")
        dev_acc_0 = 1 - eval_acc(dict_file=dict_file,
                                 train_rate=train_rate,
                                 valid_rate=valid_rate,
                                 test_rate=test_rate,
                                 input_file=dev_file_0)
        dev_acc_1 = 1 - eval_acc(dict_file=dict_file,
                                 train_rate=train_rate,
                                 valid_rate=valid_rate,
                                 test_rate=test_rate,
                                 input_file=dev_file_1)
        dev_acc_avg = (dev_acc_0 + dev_acc_1) / 2
        dev_acc_avg = round(dev_acc_avg * 1000) / 10.0
        print('{{"dev acc":{}}}'.format(dev_acc_avg))
        avg_loss = 0
        run_transfer(model,
                     tokenizer,
                     task_name,
                     model_name=model_name,
                     modified=modified)
        bleu_0 = eval_bleu(generate_file=generate_file_0,
                           orgin_file=orgin_file_0) * 100
        bleu_1 = eval_bleu(generate_file=generate_file_1,
                           orgin_file=orgin_file_1) * 100
        bleu_avg = (bleu_0 + bleu_1) / 2
        print('{{"bleu_0": {}, "bleu_1": {}, "bleu_avg": {}}}'.format(
            bleu_0, bleu_1,
            round(bleu_avg * 10) / 10.0))
        acc_0 = (1 - eval_acc(dict_file=dict_file,
                              train_rate=train_rate,
                              valid_rate=valid_rate,
                              test_rate=test_rate,
                              input_file=generate_file_0)) * 100
        acc_1 = (1 - eval_acc(dict_file=dict_file,
                              train_rate=train_rate,
                              valid_rate=valid_rate,
                              test_rate=test_rate,
                              input_file=generate_file_1)) * 100
        acc_avg = (acc_0 + acc_1) / 2
        print('{{"acc_0": {}, "acc_1": {}, "acc_avg": {}}}'.format(
            acc_0, acc_1,
            round(acc_avg * 10) / 10.0))
        _acc = cls_test(cls_model, task_name) * 100
        run_split(generate_file_0)
        run_split(generate_file_1)
        _bleu = eval_multi_bleu(model_name, task_name)
        print('{{"_ACCU": {}, "_BLEU": {}}}'.format(
            round(_acc * 10) / 10.0,
            round(_bleu * 10) / 10.0))
def main():
    save_every_epoch = False

    args, train_dataloader, t_total, device, n_gpu = load_data()
    print("**********************************************************")
    print(args)

    def load_model(model_name):
        weights_path = os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, model_name)
        model = torch.load(weights_path)
        return model

    cbert_name = "{}/CBertForMaskedLM_{}_epoch_10{}".format(
        task_name.lower(), task_name.lower(), modified)
    model = load_model(cbert_name)
    model.to(device)

    cls_model = load_cls(task_name, model_name).cuda()
    for i in cls_model.parameters():
        i.requires_grad = False
    cls_model.eval()

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.0
    }]
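    # Parameters whose names contain 'bias', 'gamma' or 'beta' (biases and LayerNorm
    # weights) get no weight decay; all other parameters decay at rate 0.01.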

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=t_total)

    global_step = 0

    model.train()

    save_model_dir = os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, task_name)
    if not os.path.exists(save_model_dir):
        os.mkdir(save_model_dir)

    cls_criterion = nn.CrossEntropyLoss()

    # eval_bleu parameters
    generate_file_0 = "evaluation/outputs/{}/sentiment.test.0.{}".format(
        task_name, model_name)
    dev_file_0 = "evaluation/outputs/{}/sentiment.dev.0.{}".format(
        task_name, model_name)
    orgin_file_0 = "evaluation/outputs/{}/sentiment.test.0.human".format(
        task_name)
    generate_file_1 = "evaluation/outputs/{}/sentiment.test.1.{}".format(
        task_name, model_name)
    dev_file_1 = "evaluation/outputs/{}/sentiment.dev.1.{}".format(
        task_name, model_name)
    orgin_file_1 = "evaluation/outputs/{}/sentiment.test.1.human".format(
        task_name)
    save_file_path = "evaluation/outputs/{}/{}_ft_wc{}".format(
        task_name, model_name, modified)
    if not os.path.exists(save_file_path):
        os.mkdir(save_file_path)
    # eval_acc parameters
    dict_file = 'test_tools/li_test_tool/classify_Bilstm/data/style_transfer/zhi.dict.{}'.format(
        task_name)
    if task_name == 'yelp':
        train_rate = 0.9984
        valid_rate = 0.0008
        test_rate = 0.0008
    elif task_name == 'amazon':
        train_rate = 0.9989
        valid_rate = 0.00055
        test_rate = 0.00055
    else:
        train_rate = 0.9984
        valid_rate = 0.0008
        test_rate = 0.0008

    acc_save_dict = {}
    bleu_save_dict = {}
    _acc_save_dict = {}
    _bleu_save_dict = {}
    count_dict = {}
    dev_acc_best = 0
    for e in trange(int(args.num_train_epochs), desc="Epoch"):
        tr_loss, avg_loss, avg_acc = 0, 0, 0.
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(tqdm(train_dataloader)):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            lm_loss, _ = model(input_ids, segment_ids, input_mask, label_ids)
            segment_ids = 1 - segment_ids
            prediction_scores = model(input_ids, segment_ids, input_mask)
            prediction_scores = F.softmax(prediction_scores, dim=2)
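            # "Soft" re-embedding: weight the word-embedding matrix by the predicted token
            # distribution so the classifier input below stays differentiable w.r.t. the MLM
            # output. `bert_embeddings` is assumed to be the model's word-embedding module,
            # defined outside this snippet.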
            predicted_ids = prediction_scores @ bert_embeddings.weight
            batch_y = torch.stack([1 - b[0] for b in batch[2]])
            pred, _ = cls(cls_model, predicted_ids, batch_y)
            # pred = F.softmax(pred, dim=1)
            cls_loss = cls_criterion(pred, batch_y)
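            # Heuristic loss weighting below: while the LM loss is still large, keep a small
            # fraction of it in the objective; otherwise optimize the classifier loss alone.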

            if lm_loss.item() > 1.5:
                loss = lm_loss / 100000 + cls_loss
            else:
                loss = cls_loss  # + lm_loss
            loss.backward()
            #tr_loss += cls_loss.item()
            avg_loss += cls_loss.item()
            #avg_acc += cls_acc
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                model.zero_grad()
                global_step += 1

            if (step + 1) % 250 == 0:
                print("-------avg_loss: {}, lm_loss: {}--------".format(
                    avg_loss / 250, lm_loss))
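            # The block below runs a periodic dev/test evaluation; it is currently disabled
            # by the trailing `and False`.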
            if (step + 1) % 250 == 0 and False:
                run_transfer(model,
                             tokenizer,
                             task_name,
                             model_name=model_name,
                             modified=modified,
                             set="dev")
                dev_acc_0 = 1 - eval_acc(dict_file=dict_file,
                                         train_rate=train_rate,
                                         valid_rate=valid_rate,
                                         test_rate=test_rate,
                                         input_file=dev_file_0)
                dev_acc_1 = 1 - eval_acc(dict_file=dict_file,
                                         train_rate=train_rate,
                                         valid_rate=valid_rate,
                                         test_rate=test_rate,
                                         input_file=dev_file_1)
                dev_acc_avg = (dev_acc_0 + dev_acc_1) / 2
                dev_acc_avg = round(dev_acc_avg * 1000) / 10.0
                print('{{"dev acc":{}}}'.format(dev_acc_avg))
                avg_loss = 0
                run_transfer(model,
                             tokenizer,
                             task_name,
                             model_name=model_name,
                             modified=modified)
                bleu_0 = eval_bleu(generate_file=generate_file_0,
                                   orgin_file=orgin_file_0) * 100
                bleu_1 = eval_bleu(generate_file=generate_file_1,
                                   orgin_file=orgin_file_1) * 100
                bleu_avg = (bleu_0 + bleu_1) / 2
                print('{{"bleu_0": {}, "bleu_1": {}, "bleu_avg": {}}}'.format(
                    bleu_0, bleu_1,
                    round(bleu_avg * 10) / 10.0))
                acc_0 = (1 - eval_acc(dict_file=dict_file,
                                      train_rate=train_rate,
                                      valid_rate=valid_rate,
                                      test_rate=test_rate,
                                      input_file=generate_file_0)) * 100
                acc_1 = (1 - eval_acc(dict_file=dict_file,
                                      train_rate=train_rate,
                                      valid_rate=valid_rate,
                                      test_rate=test_rate,
                                      input_file=generate_file_1)) * 100
                acc_avg = (acc_0 + acc_1) / 2
                print('{{"acc_0": {}, "acc_1": {}, "acc_avg": {}}}'.format(
                    acc_0, acc_1,
                    round(acc_avg * 10) / 10.0))
                _acc = cls_test(cls_model, task_name) * 100
                run_split(generate_file_0)
                run_split(generate_file_1)
                _bleu = eval_multi_bleu(model_name, task_name)
                print('{{"_ACCU": {}, "_BLEU": {}}}'.format(
                    round(_acc * 10) / 10.0,
                    round(_bleu * 10) / 10.0))
                if acc_avg > acc_threshold and dev_acc_avg > acc_threshold:
                    if not (dev_acc_avg < dev_acc_best):
                        # save_model_name = "BertForMaskedLM_" + task_name + "_acc_" + str(acc) + "w_cls"
                        # save_model_path = os.path.join(save_model_dir, save_model_name)
                        # torch.save(model, save_model_path)
                        save_file_name_0 = os.path.join(
                            save_file_path, "sentiment.test.0.{}.{}.{}".format(
                                model_name,
                                round(acc_avg * 10) / 10.0,
                                round(bleu_avg * 10) / 10.0))
                        shutil.copy(generate_file_0, save_file_name_0)
                        save_file_name_1 = os.path.join(
                            save_file_path, "sentiment.test.1.{}.{}.{}".format(
                                model_name,
                                round(acc_avg * 10) / 10.0,
                                round(bleu_avg * 10) / 10.0))
                        shutil.copy(generate_file_1, save_file_name_1)
                    if dev_acc_avg > dev_acc_best:
                        dev_acc_best = dev_acc_avg
                        acc_save_dict[dev_acc_avg] = acc_avg
                        bleu_save_dict[dev_acc_avg] = bleu_avg
                        _acc_save_dict[dev_acc_avg] = _acc
                        _bleu_save_dict[dev_acc_avg] = _bleu
                        count_dict[dev_acc_avg] = 1
                    elif dev_acc_avg == dev_acc_best:
                        acc_save_dict[dev_acc_avg] += acc_avg
                        bleu_save_dict[dev_acc_avg] += bleu_avg
                        _acc_save_dict[dev_acc_avg] += _acc
                        _bleu_save_dict[dev_acc_avg] += _bleu
                        count_dict[dev_acc_avg] += 1

        if save_every_epoch:
            save_model_name = "CBertForMaskedLM_" + task_name + "_w_cls_epoch" + str(
                e + 1) + modified
            save_model_path = os.path.join(save_model_dir, save_model_name)
            torch.save(model, save_model_path)
        else:
            if (e + 1) % 10 == 0:
                save_model_name = "CBertForMaskedLM_" + task_name + "_w_cls_epoch" + str(
                    e + 1) + modified
                save_model_path = os.path.join(save_model_dir, save_model_name)
                torch.save(model, save_model_path)
        if False:
            cnt_best = count_dict[dev_acc_best]
            acc_best = round(
                acc_save_dict[dev_acc_best] * 10.0 / cnt_best) / 10.0
            bleu_best = round(
                bleu_save_dict[dev_acc_best] * 10.0 / cnt_best) / 10.0
            _acc_best = round(
                _acc_save_dict[dev_acc_best] * 10.0 / cnt_best) / 10.0
            _bleu_best = round(
                _bleu_save_dict[dev_acc_best] * 10.0 / cnt_best) / 10.0
            print("Best result: dev_acc {} acc {} bleu {} _acc {} _bleu {}".
                  format(dev_acc_best, acc_best, bleu_best, _acc_best,
                         _bleu_best))
Example #6
from torch.autograd import Variable
import torch
import numpy as np
from utils import read_data, read_test_data, load_cls, load_vocab


def test_acc(model):
    data = read_test_data(dir="evaluation/outputs/yelp")

    x = data["test_x"]
    y = data["test_y"]

    model.eval()

    x = [sent for sent in x]

    pred = np.argmax(model(x).cpu().data.numpy(), axis=1)
    acc = sum(1 if p == t else 0 for p, t in zip(pred, y)) / len(pred)

    return acc


if __name__ == "__main__":
    cls = load_cls("yelp").cuda()
    print(test_acc(cls))