def inference_random():
    # Load the trained model and validate it on the held-out set
    model = ClassificationModel(len(cfg.char2idx))
    model = load_custom_model(model, cfg.save_model_path).to(cfg.device)
    model.eval()  # disable dropout so inference is deterministic

    tokenizer = Tokenizer(cfg.char2idx)
    error = 0
    with open(cfg.test_data_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    for line in lines:
        pairs = line.rstrip('\n').split('\t')
        label, text = pairs[0], pairs[1]
        input_index, _ = tokenizer.encode(text, max_length=cfg.max_seq_len)
        inputs = torch.tensor(input_index).unsqueeze(0).to(cfg.device)
        inputs_mask = (inputs > 0).to(torch.float32)
        with torch.no_grad():
            scores = model(inputs, inputs_mask)
            prediction = scores.argmax(-1).item()
        if prediction != int(label):
            # show the model's score for the true class next to the sample
            print(scores[:, int(label)].item())
            print(label)
            print(text)
            print('-' * 50)
            error += 1
    print('misclassified: {}'.format(error))
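This example leans on a project-level cfg object plus helpers (ClassificationModel, load_custom_model, Tokenizer) that the listing does not include. A minimal stand-in for the configuration, with the field names taken from their usage above but every value a placeholder assumption, might look like:

from types import SimpleNamespace
import torch

# Hypothetical cfg; field names come from the listing, values are placeholders.
cfg = SimpleNamespace(
    char2idx={'[PAD]': 0, '[UNK]': 1},   # vocabulary mapping
    save_model_path='model.bin',         # trained weights on disk
    test_data_path='test.tsv',           # lines of "label\ttext"
    max_seq_len=128,
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
)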
Example #2
def train():
    # Load the data
    char2idx, keep_tokens = load_chinese_base_vocab(cfg.vocab_path)
    tokenizer = Tokenizer(char2idx)
    # train_data = glob(cfg.train_data_path + '*')[16 * 1000 * 35:16 * 1000 * 40]
    # take a fixed slice of the globbed data shards for this training run
    train_data = glob(cfg.train_data_path + '*')[8 * 5000 * 5:8 * 5000 * 10]
    train_dataset = CustomDataset(train_data, tokenizer, cfg.max_seq_len)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=cfg.batch_size,
                                  collate_fn=padding,
                                  shuffle=True,
                                  num_workers=4,
                                  pin_memory=True)

    # # debug
    # train_data = glob(cfg.test_data_path + '*')[:8 * 5000 * 5]
    # train_dataset = CustomDataset(train_data, tokenizer, cfg.max_seq_len)
    # train_dataloader = DataLoader(train_dataset, batch_size=cfg.batch_size, collate_fn=padding)
    # # debug
    # Load the model
    model = CustomUnilmModel(len(char2idx))
    # model = load_pretrained_bert(model, cfg.pretrained_model_path, keep_tokens=keep_tokens).to(cfg.device)
    model = load_custom_model(model, cfg.save_model_path).to(cfg.device)

    loss_function = nn.CrossEntropyLoss(ignore_index=0).to(cfg.device)
    optimizer = torch.optim.Adam(model.parameters(), lr=cfg.learn_rate)
    # Training loop
    iteration, train_loss = 0, 0
    model.train()
    for inputs, token_type, targets in tqdm(train_dataloader,
                                            position=0,
                                            leave=True):
        attention_mask = unilm_mask(inputs, token_type).to(cfg.device)
        inputs, token_type, targets = inputs.to(cfg.device), token_type.to(
            cfg.device), targets.to(cfg.device)
        prediction = model(inputs, token_type, attention_mask)
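        # drop the last time step below so each position is scored against the
        # next token; targets equal to 0 (padding) are skipped via ignore_index=0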
        loss = loss_function(
            prediction[:, :-1, :].reshape(-1, prediction.shape[-1]),
            targets.reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        iteration += 1

        if iteration % cfg.print_loss_steps == 0:
            eval_loss = evaluate(model, tokenizer, loss_function)
            print()
            print('train_loss: {}'.format(train_loss / cfg.print_loss_steps))
            print('eval_loss: {}'.format(eval_loss))
            test_string(s1, tokenizer, model)
            test_string(s2, tokenizer, model)
            model.train()
            train_loss = 0

        if iteration % cfg.save_model_steps == 0:
            torch.save(model.state_dict(), cfg.save_model_path)
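The call unilm_mask(inputs, token_type) above builds the UniLM-style seq2seq attention mask: source tokens (token_type 0) attend bidirectionally, while each target token (token_type 1) sees the full source plus only earlier target positions. The helper itself is not shown; a sketch of the idea, assuming 0 is the pad id and the model expects a (batch, seq_len, seq_len) mask, could be:

import torch

def unilm_mask_sketch(inputs, token_type):
    # cumulative sum is 0 across the source segment and 1, 2, 3, ... on the target
    idxs = token_type.cumsum(dim=-1)
    # position i may attend to position j iff j's running count <= i's
    mask = (idxs.unsqueeze(1) <= idxs.unsqueeze(2)).to(torch.float32)
    # additionally block attention to padding positions (token id 0)
    mask = mask * (inputs > 0).unsqueeze(1).to(torch.float32)
    return mask  # (batch, seq_len, seq_len)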
Example #3
def train():
    # Load the data
    tokenizer = Tokenizer(cfg.char2idx)
    train_dataset = CustomDataset(cfg.train_data_path, tokenizer, cfg)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=cfg.batch_size,
                                  collate_fn=padding,
                                  shuffle=True,
                                  num_workers=4,
                                  pin_memory=True)
    model = ClassificationModel(len(cfg.char2idx))
    # model = load_pretrained_bert(model, cfg.pretrained_model_path, keep_tokens=cfg.keep_tokens).to(cfg.device)
    model = load_custom_model(model, cfg.save_model_path).to(cfg.device)

    loss_function = nn.CrossEntropyLoss().to(cfg.device)
    optimizer = torch.optim.Adam(model.parameters(), lr=cfg.learn_rate)
    # Training loop
    iteration, train_loss = 0, 0
    model.train()
    for inputs, mask, targets in tqdm(train_dataloader, position=0,
                                      leave=True):
        inputs, mask, targets = inputs.to(cfg.device), mask.to(
            cfg.device), targets.to(cfg.device)
        prediction = model(inputs, mask)
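        # CrossEntropyLoss applies log-softmax internally, so the model should
        # return raw logits of shape (batch, num_classes) here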
        loss = loss_function(prediction, targets.reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        iteration += 1

        if iteration % cfg.print_loss_steps == 0:
            eval_loss = evaluate(model, tokenizer, loss_function)
            print()
            print('train_loss: {}'.format(train_loss / cfg.print_loss_steps))
            print('eval_loss: {}'.format(eval_loss))
            accuracy(model, tokenizer, cfg.valid_data_path)
            accuracy(model, tokenizer, cfg.test_data_path)
            model.train()
            train_loss = 0

        if iteration % cfg.save_model_steps == 0:
            torch.save(model.state_dict(), cfg.save_model_path)
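Both training examples hand collate_fn=padding to the DataLoader without showing the helper. A minimal sketch for the classification case, assuming each dataset item is a (token_ids, label) pair and 0 is the pad id:

import torch

def padding_sketch(batch):
    # pad variable-length token-id sequences with 0 up to the batch maximum
    seqs, labels = zip(*batch)
    max_len = max(len(s) for s in seqs)
    inputs = torch.zeros(len(seqs), max_len, dtype=torch.long)
    for i, s in enumerate(seqs):
        inputs[i, :len(s)] = torch.tensor(s, dtype=torch.long)
    mask = (inputs > 0).to(torch.float32)  # 1 for real tokens, 0 for padding
    return inputs, mask, torch.tensor(labels, dtype=torch.long)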
Example #4
    if G <= 0:  # reconstructed guard; the original condition is truncated in the listing
        # hide all GPUs from TensorFlow and force CPU execution
        device_config = ConfigProto(
            device_count = {'CPU' : 1, 'GPU' : 0}
        )
        session = Session(config=device_config)
        K.set_session(session)
    else:
        os.environ["CUDA_VISIBLE_DEVICES"] = str(G)

    from keras.models import Model
    from keras.models import model_from_json, load_model

    from utils import init_predictor, DecodeCTCPred, Readf, edit_distance, normalized_edit_distance, \
                        BilinearInterpolation, get_lexicon, load_custom_model, open_img, norm, parse_mjsynth

    prng = RandomState(random_state)
    model = load_custom_model(model_path,
                              model_name='/model.json',
                              weights="/final_weights.h5")
    model = init_predictor(model)
    classes = {j: i for i, j in enumerate(get_lexicon())}
    inverse_classes = {v: k for k, v in classes.items()}

    decoder = DecodeCTCPred(top_paths=1,
                            beam_width=10,
                            inverse_classes=inverse_classes)

    img_size = (imgh, imgW) + (1, )  # single grayscale channel

    if validate:
        if mjsynth:
            with open(os.path.join(image_path, val_fname), "r") as f:
                fnames = f.readlines()
            fnames = np.array(parse_mjsynth(image_path, fnames))
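The DecodeCTCPred(top_paths=1, beam_width=10, inverse_classes=...) helper above presumably wraps Keras's CTC beam-search decoding; a hypothetical equivalent built on the public keras.backend.ctc_decode API could look like:

import numpy as np
from keras import backend as K

def decode_ctc_sketch(y_pred, inverse_classes, beam_width=10, top_paths=1):
    # y_pred: softmax output of shape (batch, time_steps, n_classes)
    input_len = np.full((y_pred.shape[0],), y_pred.shape[1])
    decoded, _ = K.ctc_decode(y_pred, input_length=input_len, greedy=False,
                              beam_width=beam_width, top_paths=top_paths)
    best = K.get_value(decoded[0])  # best sequence per sample, padded with -1
    return [''.join(inverse_classes[int(i)] for i in seq if i >= 0)
            for seq in best]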