Example #1
def eval_split(models,
               crits,
               loader,
               json_path,
               eval_kwargs={},
               flag_eval_what='tap',
               debug=False):
    split = eval_kwargs.get('split', 'val')
    lang_eval = eval_kwargs.get('language_eval', 1)
    val_score_thres = eval_kwargs.get('val_score_thres', 0)
    nms_threshold = eval_kwargs.get('nms_threshold', 0)
    is_reranking = eval_kwargs.get('reranking', False)
    print('is_reranking', is_reranking)
    topN = eval_kwargs.get('topN', 1000)
    get_eval_loss = eval_kwargs.get('get_eval_loss', 1)

    tap_model, cg_model = models
    tap_crit, cg_crit = crits

    for model in models:
        model.eval()
    loader.reset_iterator(split)

    n = 0
    loss_sum = [0, 0, 0, 0, 0]
    loss_evals = 1e-8
    predictions = []
    tap_cg_pred = {}
    iter = 0
    bad_vid_num = 0

    time_consumption = {}
    with torch.set_grad_enabled(False):
        while True:
            data = loader.get_batch(split)
            n = n + 1
            # guard against modulo-by-zero when the loader has fewer than 100 batches
            if iter % max(1, len(loader) // 100) == 0:
                print('generating result.json:{:.3f}%'.format(100 * iter /
                                                              len(loader)))

            if data.get('proposal_num',
                        1) == 0 or data['fc_feats'].shape[0] <= 1:
                continue

            tmp = [data['fc_feats'], data['att_feats'], data['lda_feats']]

            tmp = [Variable(torch.from_numpy(_)).cuda() for _ in tmp]
            c3d_feats, att_feats, lda_feats = tmp

            torch.cuda.synchronize()
            t0 = time.time()
            tap_feats, pred_proposals = tap_model(c3d_feats)

            torch.cuda.synchronize()
            t1 = time.time()
            # select top score 1000 proposals
            cg_gts = data['cg_gts'] if data.get('cg_labels',
                                                None) is not None else []

            if flag_eval_what == 'cg':
                ind_select_list = data['gts_ind_select_list']
                soi_select_list = data['gts_soi_select_list']
                cg_select_list = data['gts_cg_select_list']
                #good_time_stamps = [loader.featstamp_to_time(s, e, len(data['fc_feats']), data['duration']) for (s, e) in soi_select_list]
                good_time_stamps = data['gt_timestamps']
                tap_prob = [1] * len(ind_select_list)

            elif flag_eval_what == 'cg_extend':
                ind_select_list, soi_select_list, cg_select_list, sampled_ids, = data[
                    'ind_select_list'], data['soi_select_list'], data[
                        'cg_select_list'], data['sampled_ids']
                good_time_stamps = [
                    loader.featstamp_to_time(s, e, len(data['fc_feats']),
                                             data['duration'])
                    for (s, e) in soi_select_list
                ]
                tap_prob = [1] * len(ind_select_list)

            elif flag_eval_what == 'SOTA_TEP':
                if data['SOTA_Prop_score'] is None:
                    print('bad video for SOTA_TEP, vid:{}'.format(data['vid']))
                    bad_vid_num += 1
                    continue
                _ind_select_list, _soi_select_list, _cg_select_list = data[
                    'SOTA_ind_select_list'], data[
                        'SOTA_soi_select_list'], data['SOTA_cg_select_list']
                #_good_time_stamps = [loader.featstamp_to_time(s, e, len(data['fc_feats']), data['duration']) for (s, e)                                     in _soi_select_list]
                _good_time_stamps = data['SOTA_timestamps']
                _tap_prob = data['SOTA_Prop_score']
                ind_select_list, soi_select_list, cg_select_list, good_time_stamps, tap_prob = [], [], [], [], []

                if nms_threshold > 0:
                    _, _, pick = gettopN_nms(_good_time_stamps,
                                             _tap_prob,
                                             _tap_prob,
                                             nms_overlap=nms_threshold,
                                             topN=1000)
                else:
                    pick = list(range(len(_tap_prob)))

                for i, p_score in enumerate(_tap_prob):
                    if i not in pick:
                        continue
                    if p_score >= val_score_thres:
                        ind_select_list.append(_ind_select_list[i])
                        soi_select_list.append(_soi_select_list[i])
                        if len(_cg_select_list):
                            cg_select_list.append(_cg_select_list[i])
                        good_time_stamps.append(_good_time_stamps[i])
                        tap_prob.append(_tap_prob[i])
                    if len(ind_select_list) >= topN:
                        break

            elif flag_eval_what == 'cg' or flag_eval_what == 'tap_cg' or flag_eval_what == 'tap':
                if nms_threshold != 0:
                    ind_select_list, soi_select_list, cg_select_list, good_time_stamps, tap_prob = \
                        gettop1000_nms(pred_proposals.data, data['tap_masks_for_loss'], cg_gts, data['duration'],
                                       loader.featstamp_to_time, overlap=nms_threshold, topN=topN)
                else:
                    ind_select_list, soi_select_list, cg_select_list, good_time_stamps, tap_prob = \
                        gettop1000(pred_proposals.data, data['tap_masks_for_loss'], cg_gts, data['duration'],
                                   loader.featstamp_to_time, val_score_thres=val_score_thres, topN=topN)

            else:
                assert False, 'unknown flag_eval_what: {}'.format(flag_eval_what)

            t2 = time.time()
            t3 = t2  # keep t3 defined even when the captioning stage below is skipped

            if (len(cg_select_list) == 0) and (split != 'test'):
                sents = []
            else:
                if flag_eval_what == 'tap':
                    sents = [0] * len(ind_select_list)
                    cg_prob = [0] * len(ind_select_list)
                    cg_score = [0] * len(ind_select_list)
                else:
                    seq, cg_prob = cg_model(tap_feats,
                                            c3d_feats,
                                            lda_feats, [],
                                            ind_select_list,
                                            soi_select_list,
                                            mode='eval')
                    if len(seq) == 0:
                        sents = []
                    else:
                        cg_score = cg_prob.sum(1).cpu().numpy().astype('float')
                        # cg_prob = np.round(cg_prob, 3).tolist()
                        sents = utils.decode_sequence(
                            loader.get_vocab(),
                            seq)  # [proposal_num , max_sent_len]
                    torch.cuda.synchronize()
                    t3 = time.time()

            # get val_loss
            if get_eval_loss and tap_crit and (
                    data.get('cg_labels', None)
                    is not None) and len(cg_select_list) and (split != 'test'):
                tmp = [
                    data['tap_labels'], data['tap_masks_for_loss'],
                    data['cg_labels'][cg_select_list],
                    data['cg_masks'][cg_select_list], data['w1']
                ]
                tmp = [
                    Variable(torch.from_numpy(_), requires_grad=False).cuda()
                    for _ in tmp
                ]
                tap_labels, tap_masks_for_loss, cg_labels, cg_masks, w1 = tmp
                tap_loss = tap_crit(pred_proposals, tap_masks_for_loss,
                                    tap_labels, w1)

                loss_sum[0] = loss_sum[0] + tap_loss.item()
                if flag_eval_what != 'tap':
                    pred_captions = cg_model(tap_feats,
                                             c3d_feats,
                                             lda_feats,
                                             cg_labels,
                                             ind_select_list,
                                             soi_select_list,
                                             mode='train')

                    cg_loss = cg_crit(pred_captions, cg_labels[:, 1:],
                                      cg_masks[:, 1:])
                    loss_sum[1] = loss_sum[1] + cg_loss.item()
                    total_loss = eval_kwargs[
                        'lambda1'] * tap_loss + eval_kwargs['lambda2'] * cg_loss
                    loss_sum[2] = loss_sum[2] + total_loss.item()

            vid_info = []
            for i, sent in enumerate(sents):
                proposal_info = {}
                proposal_info['sentence'] = sent
                proposal_info['timestamp'] = good_time_stamps[i]
                # proposal_info['cg_prob'] = cg_prob[i]
                proposal_info['sentence_confidence'] = cg_score[i]
                proposal_info['proposal_score'] = tap_prob[i]
                proposal_info['re_score'] = 10 * tap_prob[i] + cg_score[i]
                proposal_info['num'] = [i, len(sents)]
                vid_info.append(proposal_info)

            if len(vid_info) != 0:
                if is_reranking:
                    vid_info = reranking(vid_info)
                tap_cg_pred[data['vid']] = vid_info

            if data['bounds']['wrapped']:
                loader.reset_iterator(split)
                break

            if iter == eval_kwargs['num_vids_eval']:
                loader.reset_iterator(split)
                break
            '''
            if iter%500==0:
                pred2json = {'results': tap_lm_pred,
                             'version': "VERSION 1.0",
                             "external_data":
                                 {
                                     "used": True,
                                     "details": "First fully-connected layer from VGG-16 pre-trained on ILSVRC-2012 training set"
                                 }
                             }
                with open(json_path+'iter{}'.format(iter), 'w') as f:
                    json.dump(pred2json, f)
            '''
            time_consumption[iter] = {
                'tep': t1 - t0,
                'cg': t3 - t2,
                'postprocess': t2 - t1
            }
            iter += 1
            #relation_analyse(data['vid'], vid_info)
            # torch.cuda.empty_cache()

    pred2json = {
        'results': tap_cg_pred,
        'version': "VERSION 1.0",
        "external_data": {
            "used":
            True,
            "details":
            "First fully-connected layer from VGG-16 pre-trained on ILSVRC-2012 training set"
        }
    }

    with open(json_path, 'w') as f:
        json.dump(pred2json, f)

    json.dump(time_consumption, open(json_path + '.time_consumption.json',
                                     'w'))

    sys.path.append('external_tool/densevid_eval')
    sys.path.append('external_tool/densevid_eval/coco-caption')

    score = {'ARAN': 0}
    if lang_eval:
        from evaluate import eval_score

        sample_score = eval_score(json_path, flag_eval_what == 'tap',
                                  eval_kwargs['val_all_metrics'])
        for key in sample_score.keys():
            score[key] = np.array(sample_score[key])
        print('valid vid num:{}, bad_num:{}'.format(
            (eval_kwargs['num_vids_eval'] - bad_vid_num), bad_vid_num))

    if flag_eval_what == 'tap':
        import external_tool.eval_ARAN.get_proposal_performance as eval_score_tap
        eval_tap_opt = {}
        eval_tap_opt[
            'ground_truth_filename'] = '/data/huichengzheng/wangteng/dvc2_pytorch04/data/captiondata/val_forARAN.json'
        eval_tap_opt['proposal_filename'] = json_path
        score['ARAN'] = eval_score_tap.main(**eval_tap_opt)

    # Switch back to training mode
    for model in models:
        model.train()

    return tap_cg_pred, score, np.array(loss_sum) / iter
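
# Note: the proposal-selection helpers used above (gettopN_nms, gettop1000_nms)
# are not included in this excerpt. The function below is only an illustrative
# sketch of score-sorted temporal NMS over (start, end) timestamps, not the
# repository's actual implementation.
def temporal_nms(timestamps, scores, overlap=0.8, topN=1000):
    """Keep at most topN proposals, dropping any whose temporal IoU with an
    already kept, higher-scoring proposal exceeds `overlap`."""
    order = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
    pick = []
    for i in order:
        s1, e1 = timestamps[i]
        keep = True
        for j in pick:
            s2, e2 = timestamps[j]
            inter = max(0.0, min(e1, e2) - max(s1, s2))
            union = max(e1, e2) - min(s1, s2)
            if union > 0 and inter / union > overlap:
                keep = False
                break
        if keep:
            pick.append(i)
        if len(pick) >= topN:
            break
    return pick
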
Example #2
    def fit(self, train_loader, test_loader, vaild_loader=False,
            loop_conf=False):
        # initialize a dictionary to store all the TensorBoard writers
        ts_writer = {}

        # create working directory if necessary
        if not os.path.exists(self.params["working_dir"]):
            os.makedirs(self.params["working_dir"])

        date_time_now = str(
            datetime.datetime.now()).replace(" ", "_").replace(":", "_")

        # Create sub_working_dir
        sub_working_dir = os.path.join(self.params["working_dir"],
                                       self.params['sub_name'] + date_time_now)

        if not os.path.exists(sub_working_dir):
            os.makedirs(sub_working_dir)
        self.params["sub_working_dir"] = sub_working_dir
        logging.info("sub working dir: %s" % sub_working_dir)

        # Create the TensorBoard summary writer
        ts_writer["tensorboard_writer"] = SummaryWriter(sub_working_dir)
        logging.info("Please using 'python -m tensorboard.main --logdir={} \
                     '".format(sub_working_dir))

        # optimizer
        optimizer_dic = {'sgd': torch.optim.SGD(
                            self.module_list.parameters(),
                            lr=self.params["learning_rate"],
                            momentum=self.params["momentum"],
                            weight_decay=self.params["decay"]),
                         'adam': torch.optim.Adam(
                            self.module_list.parameters(),
                            lr=self.params["learning_rate"],
                            weight_decay=self.params["decay"])}
        optimizer = optimizer_dic[self.params['optimizer'].lower()]

        # initiate global step
        self.params["global_step"] = 0

        # initiate learning rate scheduler
        lr_scheduler = optim.lr_scheduler.StepLR(
            optimizer,
            step_size=self.params["steps"],
            gamma=self.params["scales"])

        self.train()
        map_results_names = ["best_map", "best_ap", "best_conf",
                             "specific_conf_map", "specific_conf_ap"]
        # Start the training loop
        logging.info("Start training.")
        for epoch in range(self.params["epochs"]):
            save = 1
            eva = 1
            if self.params['loop_epoch'] and epoch > self.params['loop_epoch']:
                loop_conf = True
            for step, samples in enumerate(train_loader):
                if self.params['cuda']:
                    images, labels = (samples["image"].to('cuda'),
                                      samples["label"])
                else:
                    images, labels = samples["image"], samples["label"]

                start_time = time.time()
                self.params["global_step"] += 1

                # Forward and backward
                optimizer.zero_grad()
                batch_size = images.size(0)
                losses = self(images, is_training=True, labels=labels)
                loss = losses[0]
                if torch.isnan(loss):
                    continue
                loss.backward()

                optimizer.step()

                if step > 0 and step % self.params['loss_step'] == 0:
                    _loss = loss.item()
                    duration = float(time.time() - start_time)
                    example_per_second = batch_size / duration
                    lr = optimizer.param_groups[0]['lr']
                    logging.info(
                        "epoch [%.3d] iter = %d loss = %.2f "
                        "example/sec = %.3f lr = %.5f" %
                        (epoch, step, _loss, example_per_second, lr)
                    )
                    ts_writer["tensorboard_writer"].add_scalar(
                            "lr", lr, self.params["global_step"])
                    ts_writer["tensorboard_writer"].add_scalar(
                            "example/sec", example_per_second,
                            self.params["global_step"])
                    for i, name in enumerate(self.losses_name):
                        value = _loss if i == 0 else losses[i]
                        ts_writer["tensorboard_writer"].add_scalar(
                                name, value, self.params["global_step"])

                if eva and (epoch+1) % self.params['eva_epoch'] == 0:
                    self.train(False)
                    logging.info(f"test epoch number {epoch+1}")
                    # results consist of best_map, best_ap, best_conf,
                    # specific_conf_map, specific_conf_ap
                    map_results = get_map(self, test_loader, train=True,
                                          loop_conf=loop_conf)
                    self.params['best_map'] = map_results[0]
                    self.params['confidence'] = map_results[2]
                    for index, mr_name in enumerate(map_results_names):
                        try:
                            ts_writer["tensorboard_writer"].add_scalar(
                                    mr_name, map_results[index],
                                    self.params["global_step"])
                        except AttributeError:
                            continue

                    evaluate_running_loss = eval_score(self, test_loader)
                    logging.info(f"evaluate_running_loss:\
                                 {evaluate_running_loss[0]}")
                    for i, name in enumerate(self.losses_name):
                        ts_writer["tensorboard_writer"].add_scalar(
                                "evel_" + name, evaluate_running_loss[i],
                                self.params["global_step"])
                    if vaild_loader:
                        self.params['test_best_map'] = get_map(
                                self, vaild_loader,
                                confidence=[self.params['confidence']])[0]
                        ts_writer["tensorboard_writer"].add_scalar(
                                    "test_best_map",
                                    self.params['test_best_map'],
                                    self.params["global_step"])
                    self.train(True)
                    eva = 0
                if save and (epoch+1) % self.params['save_epoch'] == 0:
                    _save_checkpoint(self)
                    save = 0
            lr_scheduler.step()
#        best_map, best_ap, best_conf, specific_conf_map, specific_conf_ap, \
#            map_frame = get_map(self, test_loader, train=False, loop_conf=True)
        map_results = get_map(self, test_loader, train=False, loop_conf=True)
        self.params['best_map'] = map_results[0]
        self.params['confidence'] = map_results[2]
        if vaild_loader:
            self.params['test_best_map'] = get_map(
                    self, vaild_loader,
                    confidence=[self.params['confidence']])[0]
            ts_writer["tensorboard_writer"].add_scalar(
                        "test_best_map", self.params['test_best_map'],
                        self.params["global_step"])
        _save_checkpoint(self)
        for index, mr_name in enumerate(map_results_names):
            try:
                ts_writer["tensorboard_writer"].add_scalar(
                        mr_name, map_results[index],
                        self.params["global_step"])
            except AttributeError:
                continue
        # model.train(True)
        logging.info("Bye~")
        if self.params['return_csv']:
            map_results[5].to_csv(
                    f"{self.params['sub_working_dir']}/final_performance.csv",
                    index=True)
        return tuple(map_results)
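
# _save_checkpoint and get_map are called above but are not part of this
# excerpt. The helper below is only a hypothetical minimal sketch of the
# checkpoint routine (it assumes the model exposes state_dict() and
# params['sub_working_dir']); it is not the original implementation.
import logging
import os

import torch


def _save_checkpoint(model):
    # Persist the current weights into the run's working directory.
    path = os.path.join(model.params["sub_working_dir"], "model.pth")
    torch.save(model.state_dict(), path)
    logging.info("Model checkpoint saved to %s", path)
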
def main():
    print("preparing data...")
    SRC = data.Field(tokenize=tokenizer,
                     init_token='<sos>',
                     eos_token='<eos>',
                     lower=True)
    TRG = data.Field(tokenize=tokenizer,
                     init_token='<sos>',
                     eos_token='<eos>',
                     lower=True)
    train, val, test, filename = choose_dataset(False, SRC, TRG)
    SRC.build_vocab(train)
    TRG.build_vocab(train)

    train_batch_size = 128
    test_batch_size = 32
    eval_batch_size = 128
    train_iter, val_iter, test_iter = data.BucketIterator.splits((train, val, test), sort=False,  batch_sizes=(
        train_batch_size, eval_batch_size, test_batch_size), device=device)

    print("building model...")
    in_tokens = len(SRC.vocab.stoi)  # the size of vocabulary
    out_tokens = len(TRG.vocab.stoi)
    emsize = 768 # embedding dimension
    nhid = 1024 # the dimension of the feedforward network model in nn.TransformerEncoder and nn.TransformerDecoder
    nlayers = 3  # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder and nn.TransformerDecoder
    nhead = 2  # the number of heads in the multiheadattention models
    dropout = 0.3  # the dropout value
    model = TransformerModel(in_tokens, out_tokens, emsize, nhead, nhid, nlayers, dropout).to(device)

    print(model)

    criterion = nn.CrossEntropyLoss(ignore_index=TRG.vocab.stoi["<unk>"])
    lr = 0.0001  # learning rate
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

    best_val_loss = float("inf")
    epochs = 100  # The number of epochs
    best_model = None
    # model.init_weights()

    print("training...")

    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()
        t_loss = train_model(model, train_iter, optimizer, criterion, SRC)
        val_loss = evaluate_model(model, val_iter, criterion)
        print('-' * 65)
        print('| epoch {:3d} | time: {:3d}m {:3d}s | train loss {:5.2f} | valid loss {:5.2f}'
              .format(epoch, int((time.time() - epoch_start_time)/60), int((time.time() - epoch_start_time)%60), t_loss, val_loss))

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model = model

        scheduler.step()
        model.eval()
        sentence = "今日はいい日ですね"
        output = []
        sentence = SRC.preprocess(sentence)
        # print(sentence)
        index = [SRC.vocab.stoi[SRC.init_token]] + [SRC.vocab.stoi[i] for i in sentence] + [SRC.vocab.stoi[SRC.eos_token]]
        src_tensor = torch.LongTensor([index]).T.to(device)
        trg = torch.LongTensor([[TRG.vocab.stoi[TRG.init_token]]]).to(device)
        for i in range(25):
            pred = model(src_tensor, trg)

            pred_index = pred.argmax(2)[-1].item()
            # print(pred_index)
            output.append(pred_index)
            if pred_index == TRG.vocab.stoi[TRG.eos_token]:
                break

            pred_index = torch.LongTensor([[pred_index]]).to(device)
            # print(pred_index.size())
            trg = torch.cat((trg, pred_index))

        print("source sentence: ", sentence)
        print("output sentence: ", [TRG.vocab.itos[i] for i in output])

    test_loss = evaluate_model(best_model, test_iter, criterion)
    print('=' * 89)
    print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
        test_loss, math.exp(test_loss)))
    print('=' * 89)
    torch.save(model.state_dict(), "../model/transformer.pth")

    model.state_dict(torch.load("../model/transformer.pth", map_location=device))
    # print(model.state_dict())
    # valid data is not used for the interim presentation
    print("generating sentence from text..")
    path = "../data/test.tsv"
    test_input, test_output, test_pred = [], [], []
    test_input, test_output, test_pred = gen_sentence_list(model, path, SRC, TRG)
    path = "../data/train.tsv"
    train_input, train_output, train_pred = [], [], []
    train_input, train_output, train_pred = gen_sentence_list(model, path, SRC, TRG)

    train_df = convert_list_to_df(train_input, train_output, train_pred)
    test_df = convert_list_to_df(test_input, test_output, test_pred)


    test_df = prepare_df(test_df)
    test_percentage, test_kinds, test_bleu = eval_score(test_df)
    train_df = prepare_df(train_df)
    train_percentage, train_kinds, train_bleu = eval_score(train_df)
    train_df.to_csv("../csv/train/result_transformer.csv")
    test_df.to_csv("../csv/test/result_transformer.csv")
    print(f"TEST DATA: 一致率: {test_percentage}, 種類数: {test_kinds}, BLEU: {test_bleu}")
    print(f"TRAIN DATA: 一致率: {train_percentage}, 種類数: {train_kinds}, BLEU: {train_bleu}")
    with open("./score/score_transformer.txt", mode="w") as f:
        f.write(f"TEST DATA: 一致率: {test_percentage}, 種類数: {test_kinds}, BLEU: {test_bleu}\n")
        f.write(f"TRAIN DATA: 一致率: {train_percentage}, 種類数: {train_kinds}, BLEU: {train_bleu}")
    print("done!")
Example #4
writerb2 = csv.writer(output_bleu2, delimiter=',')

for i in range(len(los) - 1):
    print(i)
    prob_list = []
    probability = model.predict(
        [x_val[los[i]:los[i + 1]], sp_val[los[i]:los[i + 1]]])

    #print(probability)
    for j in range(len(probability)):
        prob_list.append(probability[j][0])

    #print(prob_list, y_val[los[i]:los[i+1]])
    #calc_test_result(prob_list, y_val[los[i]:los[i+1]])

    b01, b11, b21, b31, p11, r11, f11, p21, r21, f21, p31, r31, f31 = eval_score(
        prob_list, sent[los[i]:los[i + 1]], summ[i])
    writer1.writerow([i, p11, r11, f11, p21, r21, f21, p31, r31, f31])
    writerb1.writerow([i, b01, b11, b21, b31])

    divstr = mmr(sent[los[i]:los[i + 1]], prob_list)
    b02, b12, b22, b32, p12, r12, f12, p22, r22, f22, p32, r32, f32 = eval_str(
        divstr, summ[i])
    writer2.writerow([i, p12, r12, f12, p22, r22, f22, p32, r32, f32])
    writerb2.writerow([i, b02, b12, b22, b32])

output1.close()
output_bleu1.close()
output2.close()
output_bleu2.close()
"""
scores_dir = "test_scores/"
def main():
    os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

    SEED = 1234

    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True

    print("preparing data..")
    paths = ["../data/train.tsv", "../data/val.tsv"]
    src, trg, tmp = [], [], []
    for path in paths:
        with open(path, mode='r', encoding="utf-8") as f:
            for file in f:
                sentence = file.split("\t")
                tmp.append(sentence)

    # random.shuffle(tmp)
    for sentence in tmp:
        src.append(sentence[0])
        trg.append(sentence[1].replace("\n", ""))

    src_tensors = tok(text=src,
                      padding=True,
                      return_tensors='pt',
                      return_attention_mask=False)
    trg_tensors = tok(text=trg,
                      padding=True,
                      return_tensors='pt',
                      return_attention_mask=False)

    dataset = torch.utils.data.TensorDataset(src_tensors['input_ids'],
                                             trg_tensors['input_ids'])

    train_size = int(len(dataset) * 0.8)
    valid_size = len(dataset) - train_size
    train_data, valid_data = torch.utils.data.random_split(
        dataset, [train_size, valid_size])

    batch_size = 128
    # batch_size = 8
    train_data_loader = torch.utils.data.DataLoader(train_data, batch_size)
    valid_data_loader = torch.utils.data.DataLoader(valid_data, batch_size)

    print("building model...")
    emsize = 768  # embedding dimension
    nhid = 1024  # the dimension of the feedforward network model in nn.TransformerEncoder
    nlayers = 1  # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
    nhead = 2  # the number of heads in the multiheadattention models
    dropout = 0.3  # the dropout value
    model = TransformerModel(emsize, nhead, nhid, nlayers, dropout).to(device)

    print(model)
    criterion = nn.CrossEntropyLoss(
        ignore_index=tok.convert_tokens_to_ids("[UNK]"))
    lr = 0.0001  # learning rate
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

    best_val_loss = float("inf")
    epochs = 100  # The number of epochs
    best_model = None
    model.init_weights()
    train_loss_list, eval_loss_list = [], []

    print("training model...")
    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()
        t_loss = train(model, train_data_loader, optimizer, criterion)
        val_loss = evaluate(model, valid_data_loader, criterion)
        print('-' * 89)
        print(
            '| epoch {:3d} | time: {:3d}m {:3d}s | train loss {:5.2f} | valid loss {:5.2f} | '
            .format(epoch, int((time.time() - epoch_start_time) / 60),
                    int((time.time() - epoch_start_time) % 60), t_loss,
                    val_loss))

        train_loss_list.append(t_loss)
        eval_loss_list.append(val_loss)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model = model
        model.eval()
        sentence = "今日は良い日ですね"
        sentence = tok.tokenize(sentence)
        # src = [tok.convert_tokens_to_ids("[CLS]")] + tok.convert_tokens_to_ids(sentence) + [tok.convert_tokens_to_ids("[SEP]")]
        src = tok.convert_tokens_to_ids(
            sentence)  # + [tok.convert_tokens_to_ids("[SEP]")]
        src = torch.LongTensor([src])
        src = torch.t(src)
        src = src.to(device)
        trg = tok.convert_tokens_to_ids("[CLS]")
        trg = torch.LongTensor([[trg]]).to(device)
        output = []
        for i in range(25):
            with torch.no_grad():
                pred = model(src, trg)
            pred_word_index = pred.argmax(2)[-1]
            output.append(pred_word_index)
            if pred_word_index == 3:  # assumed to be the tokenizer's [SEP] / end-of-sentence id
                break

            last_index = torch.LongTensor([[pred_word_index.item()]
                                           ]).to(device)
            trg = torch.cat((trg, last_index))
        predict = tok.convert_ids_to_tokens(output)
        print("source sentence: ", sentence)
        print("predicted sentence: ", predict)
        scheduler.step()

    torch.save(best_model.state_dict(),
               "../model/bert_embedded_transformer.pth")

    # model.init_weights()

    # model.state_dict(torch.load("../model/bert_embedded_transformer.pth"))

    print("generating sentence from text..")
    path = "../data/test.tsv"
    test_input, test_output, test_pred = [], [], []
    test_input, test_output, test_pred = gen_sentence_list(model, path)
    path = "../data/train.tsv"
    train_input, train_output, train_pred = [], [], []
    train_input, train_output, train_pred = gen_sentence_list(model, path)

    print("converting list to dataframe")
    train_df = convert_list_to_df(train_input, train_output, train_pred)
    test_df = convert_list_to_df(test_input, test_output, test_pred)

    test_df = prepare_df(test_df)
    test_percentage, test_kinds, test_bleu = eval_score(test_df)
    train_df = prepare_df(train_df)
    train_percentage, train_kinds, train_bleu = eval_score(train_df)
    train_df.to_csv("../csv/train/result_bert_embedded_transformer.csv")
    test_df.to_csv("../csv/test/result_bert_embedded_transformer.csv")
    print(
        f"TEST DATA: 一致率: {test_percentage}, 種類数: {test_kinds}, BLEU: {test_bleu}"
    )
    print(
        f"TRAIN DATA: 一致率: {train_percentage}, 種類数: {train_kinds}, BLEU: {train_bleu}"
    )
    with open("./score/score_bert_embedded_transformer.txt", mode="w") as f:
        f.write(
            f"TEST DATA: 一致率: {test_percentage}, 種類数: {test_kinds}, BLEU: {test_bleu}\n"
        )
        f.write(
            f"TRAIN DATA: 一致率: {train_percentage}, 種類数: {train_kinds}, BLEU: {train_bleu}"
        )
    print("done!")
def main():
    # Define the PyTorch data feeds (important!!)
    print("preparing data...")
    SRC = data.Field(sequential=True,
                     tokenize=tokenizer,
                     init_token='<sos>',
                     eos_token='<eos>',
                     lower=True)
    TRG = data.Field(sequential=True,
                     tokenize=tokenizer,
                     init_token='<sos>',
                     eos_token='<eos>',
                     lower=True)

    train, val, test, filename = choose_dataset(False, SRC, TRG)

    # Build the vocabularies
    SRC.build_vocab(train)
    TRG.build_vocab(train)

    # Batch each dataset
    train_batch_size = 128
    test_batch_size = 32
    eval_batch_size = 32
    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train, val, test),
        sort=False,
        batch_sizes=(train_batch_size, eval_batch_size, test_batch_size),
        device=device)
    # Hyperparameter settings
    INPUT_DIM = len(SRC.vocab)
    OUTPUT_DIM = len(TRG.vocab)
    ENC_EMB_DIM = 768
    DEC_EMB_DIM = 768
    ENC_HID_DIM = 1024
    DEC_HID_DIM = 1024
    N_LAYERS = 1
    ENC_DROPOUT = 0.3
    DEC_DROPOUT = 0.3

    enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, N_LAYERS, ENC_DROPOUT)
    dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, DEC_HID_DIM, N_LAYERS, DEC_DROPOUT)

    model = Seq2Seq(enc, dec, device).to(device)

    print(model)
    # Initialize the model weights
    model.apply(init_weights)
    # Configure the optimizer
    optimizer = optim.Adam(model.parameters(), lr=0.0001)

    SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
    # Criterion used for computing the loss
    criterion = nn.CrossEntropyLoss(ignore_index=SRC_PAD_IDX)
    # Learning-rate scheduling
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

    epochs = 100
    clip = 1
    best_model = None

    print("training...")
    best_valid_loss = float('inf')
    for epoch in range(epochs):

        start_time = time.time()

        train_loss = train_model(model, train_iter, optimizer, criterion, clip,
                                 TRG)
        valid_loss = evaluate_model(model, val_iter, criterion)

        scheduler.step()
        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            best_model = model
            # torch.save(model.state_dict(), 'tut1-model.pt')

        print("-" * 65)
        print(
            f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s | Train Loss: {train_loss:.3f} | Val. Loss: {valid_loss:.3f}'
        )
    # Save the weights
    torch.save(best_model.state_dict(), '../model/seq2seq.pth')

    model.load_state_dict(torch.load("../model/seq2seq.pth"))
    print("generating sentence...")
    # Generate sentences for the test data
    path = "../data/test.tsv"
    test_input, test_output, test_pred = gen_sentence_list(
        model, path, SRC, TRG)
    test_df = convert_list_to_df(test_input, test_output, test_pred)
    # Generate sentences for the training data
    path = "../data/train.tsv"
    train_input, train_output, train_pred = gen_sentence_list(
        model, path, SRC, TRG)
    train_df = convert_list_to_df(train_input, train_output, train_pred)
    # Compute the scores
    test_df = prepare_df(test_df)
    test_percentage, test_kinds, test_bleu = eval_score(test_df)
    train_df = prepare_df(train_df)
    train_percentage, train_kinds, train_bleu = eval_score(train_df)
    # Save the results
    train_df.to_csv("../csv/train/result_Seq2seq.csv")
    test_df.to_csv("../csv/test/result_Seq2seq.csv")
    print(
        f"TEST DATA: 一致率: {test_percentage}, 種類数: {test_kinds}, BLEU: {test_bleu}"
    )
    print(
        f"TRAIN DATA: 一致率: {train_percentage}, 種類数: {train_kinds}, BLEU: {train_bleu}"
    )
    with open("./score/score_seq2seq.txt", mode="w") as f:
        f.write(
            f"TEST DATA: 一致率: {test_percentage}, 種類数: {test_kinds}, BLEU: {test_bleu}"
        )
        f.write(
            f"TRAIN DATA: 一致率: {train_percentage}, 種類数: {train_kinds}, BLEU: {train_bleu}"
        )
    print("done!")
def main():

    print("preparing data...")
    paths = ["../data/train.tsv", "../data/val.tsv"]
    src, trg, tmp = [], [], []
    for path in paths:
        with open(path, mode='r', encoding="utf-8") as f:
            for file in f:
                sentence = file.split("\t")
                tmp.append(sentence)

    # random.shuffle(tmp)

    for sentence in tmp:
        src.append(sentence[0])
        trg.append(sentence[1].replace("\n", ""))

    src_tensors = tok(text=src,
                      padding=True,
                      return_tensors='pt',
                      return_attention_mask=False)
    trg_tensors = tok(text=trg,
                      padding=True,
                      return_tensors='pt',
                      return_attention_mask=False)

    dataset = torch.utils.data.TensorDataset(src_tensors['input_ids'],
                                             trg_tensors['input_ids'])
    train_size = int(len(dataset) * 0.8)
    valid_size = len(dataset) - train_size
    train_data, valid_data = torch.utils.data.random_split(
        dataset, [train_size, valid_size])

    batch_size = 128
    train_data_loader = torch.utils.data.DataLoader(train_data, batch_size)
    valid_data_loader = torch.utils.data.DataLoader(valid_data, batch_size)

    print("building model...")
    OUTPUT_DIM = tok.vocab_size
    # OUTPUT_DIM = 3454
    ENC_EMB_DIM = 768
    DEC_EMB_DIM = 768
    ENC_HID_DIM = 1024
    DEC_HID_DIM = 1024
    N_LAYERS = 1
    ENC_DROPOUT = 0.3
    DEC_DROPOUT = 0.3

    enc = Encoder(ENC_EMB_DIM, ENC_HID_DIM, N_LAYERS, ENC_DROPOUT)
    dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, DEC_HID_DIM, N_LAYERS, DEC_DROPOUT)

    model = Seq2Seq(enc, dec, device).to(device)

    print(model)
    # model.apply(init_weights)

    optimizer = optim.Adam(model.parameters())
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    epochs = 100
    clip = 1
    best_valid_loss = float('inf')
    best_model = None

    print("training...")

    for epoch in range(epochs):

        start_time = time.time()

        train_loss = train(model, train_data_loader, optimizer, criterion,
                           clip)
        valid_loss = evaluate(model, valid_data_loader, criterion)

        scheduler.step()
        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            best_model = model

        print("-" * 65)
        print(
            f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s | Train Loss: {train_loss:.3f} | Val. Loss: {valid_loss:.3f} |'
        )

    torch.save(best_model.state_dict(), '../model/bert_embedded_seq2seq.pth')
    # model.apply(init_weights)

    # model.state_dict(torch.load("../model/bert_embedded_seq2seq.pth"))

    print("generating sentences...")
    path = "../data/test.tsv"
    test_input, test_output, test_pred = gen_sentence_list(model, path, tok)
    # print(test_pred)

    path = "../data/train.tsv"
    train_input, train_output, train_pred = gen_sentence_list(model, path, tok)

    train_df = convert_list_to_df(train_input, train_output, train_pred)
    test_df = convert_list_to_df(test_input, test_output, test_pred)

    test_df = prepare_df(test_df)
    test_percentage, test_kinds, test_bleu = eval_score(test_df)
    train_df = prepare_df(train_df)
    train_percentage, train_kinds, train_bleu = eval_score(train_df)
    train_df.to_csv("../csv/train/result_bert_embedded_Seq2seq.csv")
    test_df.to_csv("../csv/test/result_bert_embedded_Seq2seq.csv")
    print(
        f"TEST DATA: 一致率: {test_percentage}, 種類数: {test_kinds}, BLEU: {test_bleu}"
    )
    print(
        f"TRAIN DATA: 一致率: {train_percentage}, 種類数: {train_kinds}, BLEU: {train_bleu}"
    )
    with open("./score/bert_embedded_score_seq2seq.txt", mode="w") as f:
        f.write(
            f"TEST DATA: 一致率: {test_percentage}, 種類数: {test_kinds}, BLEU: {test_bleu}"
        )
        f.write(
            f"TRAIN DATA: 一致率: {train_percentage}, 種類数: {train_kinds}, BLEU: {train_bleu}"
        )
    print("done!")