Esempio n. 1
0
def wrapper_ngram(data=TREC, resplit=True, validate_ratio=0.2):
    """Train the n-gram network on `data` with a fixed hyper-parameter set.

    The datasets are obtained from `prepare_datasets`, the training examples
    are shuffled with a random permutation, and everything is handed to
    `train_ngram_net`.

    Args:
        data: dataset identifier forwarded to `prepare_datasets`.
        resplit: forwarded to `prepare_datasets` as `resplit`.
        validate_ratio: forwarded to `prepare_datasets` as `validation_ratio`.

    Returns:
        Whatever `train_ngram_net` returns (treated as the test accuracy).
    """
    # The last returned value (a mask) is not needed by this network.
    (train_x, train_y, validate_x, validate_y, test_x, test_y,
     W, _mask) = prepare_datasets(data, resplit=resplit, validation_ratio=validate_ratio)

    # Input shape = (first dimension of one training example, second dimension of W).
    input_shape = (train_x[0].shape[0], W.shape[1])
    # Single-argument call form: prints the same text on Python 2 and Python 3.
    print("input data shape %s" % (input_shape,))

    # NOTE(review): the number of classes is derived from the test labels only;
    # confirm that every class is guaranteed to appear in the test split.
    n_out = len(np.unique(test_y))

    # Shuffle training inputs and labels with the same permutation.
    shuffle_indices = np.random.permutation(train_x.shape[0])
    datasets = (
        train_x[shuffle_indices],
        train_y[shuffle_indices],
        validate_x,
        validate_y,
        test_x,
        test_y,
    )

    test_accuracy = train_ngram_net(
        U=W,
        datasets=datasets,
        n_epochs=10,
        ngrams=(3, 2),
        ngram_out=(150, 50),
        non_static=False,
        input_shape=input_shape,
        concat_out=True,
        n_kernels=(8, 16),
        use_bias=False,
        lr_rate=0.02,
        dropout=True,
        dropout_rate=0.5,
        n_hidden=600,
        n_out=n_out,
        ngram_activation=leaky_relu,
        activation=leaky_relu,
        batch_size=50,
        l2_ratio=1e-5,
        update_rule='adagrad',
        skip_gram=False,
    )
    return test_accuracy
Esempio n. 2
0
def error_analysis(data=SST_SENT_POL):
    """Train the n-gram network and print the examples it gets wrong.

    `train_ngram_net` is called with `predict=True`; its return value is
    compared element by element against the validation labels (presumably it
    holds the predictions for the validation split -- confirm against
    `train_ngram_net`). For every mismatch the gold label, the predicted label
    and the raw tokens joined by spaces are printed. Finally the ten most
    common "gold & predicted" pairs are printed.

    Args:
        data: dataset identifier forwarded to `prepare_datasets` and
            `load_raw_datasets`.

    Returns:
        None. All results are written to standard output.
    """
    # Function-scope import, as in the original, so the block does not rely
    # on the module's import section.
    from collections import Counter

    # The last returned value (a mask) is not needed by this network.
    (train_x, train_y, validate_x, validate_y, test_x, test_y,
     W, _mask) = prepare_datasets(data, resplit=False, validation_ratio=0.0)

    # Input shape = (first dimension of one training example, second dimension of W).
    input_shape = (train_x[0].shape[0], W.shape[1])
    # Single-argument call form: prints the same text on Python 2 and Python 3.
    print("input data shape %s" % (input_shape,))

    n_out = len(np.unique(test_y))

    # Shuffle training inputs and labels with the same permutation.
    shuffle_indices = np.random.permutation(train_x.shape[0])
    datasets = (
        train_x[shuffle_indices],
        train_y[shuffle_indices],
        validate_x,
        validate_y,
        test_x,
        test_y,
    )

    best_prediction = train_ngram_net(
        U=W,
        datasets=datasets,
        n_epochs=10,
        ngrams=(1, 2),
        ngram_out=(300, 250),
        non_static=False,
        input_shape=input_shape,
        concat_out=False,
        n_kernels=(4, 4),
        use_bias=False,
        lr_rate=0.02,
        dropout=True,
        dropout_rate=0.2,
        n_hidden=250,
        n_out=n_out,
        ngram_activation=leaky_relu,
        activation=leaky_relu,
        batch_size=50,
        l2_ratio=1e-5,
        update_rule='adagrad',
        skip_gram=False,
        predict=True
    )

    # Only the third element (raw validation sentences) is used.
    raw_datasets = load_raw_datasets(datasets=data)
    _, _, validate_raw, _, _, _ = raw_datasets

    # Tally "gold & predicted" pairs while reporting each misclassified example.
    errors = Counter()
    for i, predicted in enumerate(best_prediction):
        gold = validate_y[i]
        if predicted != gold:
            errors["%d & %d" % (gold, predicted)] += 1
            print("%s %s %s" % (gold, predicted, " ".join(validate_raw[i])))
    print(errors.most_common(10))
Esempio n. 3
0
def wrapper_reversed_rec(data=SST_SENT_POL, resplit=True, validate_ratio=0.2, rec_type='lstm'):
    """Train the n-gram + recurrent network on `data` with fixed hyper-parameters.

    The datasets are obtained from `prepare_datasets` (called with
    `google=False`), the training examples are shuffled with a random
    permutation, and everything is handed to `train_ngram_rec_net` with
    `reverse=True` and `bidirection=True`.

    Args:
        data: dataset identifier forwarded to `prepare_datasets`.
        resplit: forwarded to `prepare_datasets` as `resplit`.
        validate_ratio: forwarded to `prepare_datasets` as `validation_ratio`.
        rec_type: forwarded to `train_ngram_rec_net` as `rec_type`.

    Returns:
        Whatever `train_ngram_rec_net` returns (treated as the test accuracy).
    """
    (train_x, train_y, validate_x, validate_y, test_x, test_y,
     W, mask) = prepare_datasets(
        data, resplit=resplit, validation_ratio=validate_ratio, google=False)

    # Input shape = (first dimension of one training example, second dimension of W).
    input_shape = (train_x[0].shape[0], W.shape[1])
    # Single-argument call form: prints the same text on Python 2 and Python 3.
    print("input data shape %s" % (input_shape,))

    # NOTE(review): the number of classes is derived from the test labels only;
    # confirm that every class is guaranteed to appear in the test split.
    n_out = len(np.unique(test_y))

    # Shuffle training inputs and labels with the same permutation.
    shuffle_indices = np.random.permutation(train_x.shape[0])
    datasets = (
        train_x[shuffle_indices],
        train_y[shuffle_indices],
        validate_x,
        validate_y,
        test_x,
        test_y,
    )

    test_accuracy = train_ngram_rec_net(
        reverse=True,
        U=W,
        non_static=False,
        datasets=datasets,
        n_epochs=20,
        use_bias=True,
        ngrams=(2, 2),
        input_shape=input_shape,
        n_kernels=(4, 4),
        ngram_out=(300, 250),
        lr_rate=0.02,
        dropout_rate=0.3,
        concat_out=False,
        rec_hidden=300,
        mlp_hidden=300,
        n_out=n_out,
        ngram_activation=tanh,
        mlp_activation=leaky_relu,
        rec_activation=tanh,
        batch_size=50,
        update_rule='adagrad',
        rec_type=rec_type,
        clipping=1,
        l2_ratio=1e-5,
        mask=mask,
        mlp=True,
        skip_gram=False,
        bidirection=True
    )
    return test_accuracy
Esempio n. 4
0
def train(opt):
    """Run distillation and/or structured pruning, as selected by `opt`.

    Teacher and student configs are loaded from `opt.teacher_config` and
    `opt.config`; the train/valid loaders are built from the teacher config.
    When `opt.mpl_data_path` is set, an additional loader is built from that
    path and passed to `distill` as `mpl_loader`. The distillation stage runs
    when `opt.do_distill` is truthy and the pruning stage when `opt.do_prune`
    is truthy; both may run in the same call.

    Args:
        opt: parsed options object. Attributes read here and in the helpers:
            teacher_config, config, mpl_data_path, do_distill, do_prune,
            teacher_bert_model_name_or_path, teacher_model_path,
            bert_model_name_or_path, bert_output_dir, save_path,
            save_path_pruned, bert_output_dir_pruned, device.

    Returns:
        None.
    """
    if torch.cuda.is_available():
        logger.info("%s", torch.cuda.get_device_name(0))

    # Autograd anomaly detection is switched on for the whole run.
    torch.autograd.set_detect_anomaly(True)

    # Each config keeps a reference to the options object under 'opt'.
    teacher_config = load_config(opt, config_path=opt.teacher_config)
    teacher_config['opt'] = opt
    logger.info("[teacher config] :\n%s", teacher_config)

    student_config = load_config(opt, config_path=opt.config)
    student_config['opt'] = opt
    logger.info("[student config] :\n%s", student_config)

    set_path(teacher_config)

    # Train / valid loaders come from the teacher config.
    train_loader, valid_loader = prepare_datasets(teacher_config)

    # Optional labeled dataset for meta pseudo labels.
    if opt.mpl_data_path:
        mpl_loader, _ = prepare_datasets(teacher_config, train_path=opt.mpl_data_path)
    else:
        mpl_loader = None

    if opt.do_distill:
        _distill_student(opt, teacher_config, student_config,
                         train_loader, valid_loader, mpl_loader)

    if opt.do_prune:
        _prune_student(opt, student_config, valid_loader)


def _distill_student(opt, teacher_config, student_config, train_loader, valid_loader, mpl_loader):
    """Load the teacher checkpoint, build the student and run `distill`."""
    teacher_model = prepare_model(
        teacher_config,
        bert_model_name_or_path=opt.teacher_bert_model_name_or_path,
    )
    teacher_state = load_checkpoint(opt.teacher_model_path, device=opt.device)
    teacher_model.load_state_dict(teacher_state)
    teacher_model = teacher_model.to(opt.device)
    logger.info("[prepare teacher model and loading done]")

    student_model = prepare_model(
        student_config,
        bert_model_name_or_path=opt.bert_model_name_or_path,
    )
    logger.info("[prepare student model done]")

    # No previous best metric exists yet, so None is passed in.
    global_step, tr_loss, best_eval_metric = distill(
        teacher_config,
        teacher_model,
        student_config,
        student_model,
        train_loader,
        valid_loader,
        best_eval_metric=None,
        mpl_loader=mpl_loader,
    )
    logger.info(f"[distillation done] global steps: {global_step}, total loss: {tr_loss}, best metric: {best_eval_metric}")


def _prune_student(opt, student_config, valid_loader):
    """Restore the saved student, evaluate it, prune it and save the result."""
    # Restore model from '--save_path', '--bert_output_dir'.
    model = prepare_model(student_config, bert_model_name_or_path=opt.bert_output_dir)
    student_state = load_checkpoint(opt.save_path, device=opt.device)
    model.load_state_dict(student_state)
    model = model.to(opt.device)
    logger.info("[Restore best student model] : {}, {}".format(opt.bert_output_dir, opt.save_path))

    # Log the evaluation result before pruning.
    eval_loss, eval_acc = evaluate(model, student_config, valid_loader)
    logger.info("[before pruning] :")
    logger.info(json.dumps({'eval_loss': eval_loss, 'eval_acc': eval_acc}))

    prune_rewire(student_config, model, valid_loader, use_tqdm=True)

    # Save pruned model to '--save_path_pruned', '--bert_output_dir_pruned'.
    save_model(student_config, model, save_path=opt.save_path_pruned)
    model.bert_tokenizer.save_pretrained(opt.bert_output_dir_pruned)
    model.bert_model.save_pretrained(opt.bert_output_dir_pruned)
    logger.info("[Pruned model saved] : {}, {}".format(opt.save_path_pruned, opt.bert_output_dir_pruned))