def test():
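    """Evaluate a saved character-level seq2seq model on the time-format dataset
    and print source / target / prediction triples for the validation split."""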
    parser = ArgumentParser()
    parser.add_argument(
        "--dataset_path",
        type=str,
        default="../data/Time Dataset.json",
        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument(
        "--check_point",
        type=str,
        default='../checkpoint/Oct30_21-01-28/checkpoint_mymodel_8.pth',
        help="Path or url of the dataset cache")
    parser.add_argument("--batch_size",
                        type=int,
                        default=100,
                        help="Batch size for validation")
    parser.add_argument("--embedding_dim",
                        type=int,
                        default=100,
                        help="Batch size for validation")
    parser.add_argument("--hidden_dim",
                        type=int,
                        default=100,
                        help="Batch size for validation")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--train_precent",
                        type=float,
                        default=0.7,
                        help="Batch size for validation")
    args = parser.parse_args()
    device = torch.device(args.device)

    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    logdir = os.path.join('../logs', current_time + '_' + socket.gethostname())

    train_data_loader, valid_data_loader, input_lengths, target_lengths = get_data_loaders(
        args.dataset_path, args.batch_size, args.train_percent)

    encoder = Encoder(input_lengths + 1, args.embedding_dim, args.hidden_dim)
    decoder = Decoder(target_lengths + 1, args.embedding_dim, args.hidden_dim)
    model = Seq2Seq(encoder, decoder, device).to(device)

    check_point = torch.load(args.check_point, map_location=device)
    model.load_state_dict(check_point)
    model.eval()

    pairs = json.load(open(args.dataset_path, 'rt', encoding='utf-8'))
    data = array(pairs)
    src_texts = data[:, 0]
    trg_texts = data[:, 1]
    src_c2ix, src_ix2c = build_vocab(src_texts)
    trg_c2ix, trg_ix2c = build_vocab(trg_texts)
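    # '^' and '$' appear to be the start- and end-of-sequence markers in the
    # encoded data; the decode helpers below strip everything outside them.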

    def get_decode(src):
        result = []
        for t in src:
            result.append(src_ix2c[t])
        sndx = 0
        if '^' in result:
            sndx = result.index('^') + 1
        endx = result.index('$')
        return ''.join(result[sndx:endx])

    def get_decode_target(target):
        result = []
        for t in target:
            result.append(trg_ix2c[int(t)])
        sndx = 0
        if '^' == result[0]:
            sndx = result.index('^') + 1
        endx = result.index('$')
        return ''.join(result[sndx:endx])

    max_src_len = max(list(map(len, src_texts))) + 2
    max_trg_len = max(list(map(len, trg_texts))) + 2

    for batch in valid_data_loader:
        src_seqs = batch[0].transpose(0, 1).to(device)
        src_lengths = batch[1].to(device)
        trg_seqs = batch[2].transpose(0, 1).to(device)
        outputs, attn_weights = model.predict(src_seqs=src_seqs,
                                              src_lengths=src_lengths)
        # print(outputs.cpu().detach().numpy())
        outputs_index = torch.argmax(outputs.cpu(), dim=2)
        outputs_index_mat = outputs_index.permute(1, 0)

        for i in range(outputs_index_mat.shape[0]):
            print('src:    \t',
                  get_decode(src_seqs.cpu().permute(1, 0)[i].numpy()))
            print('target :\t',
                  get_decode_target(trg_seqs.cpu().permute(1, 0)[i].numpy()))
            print('predict:\t',
                  get_decode_target(outputs_index_mat[i].detach().numpy()[1:]))
            print('=' * 64)
dim_y = len(w2i)
dim_tag = len(t2i)
num_sents = batch_size

print "#features = ", dim_x, "#labels = ", dim_y
print "#tag len = ", dim_tag


print "load test data..."
test_batch = 1
test_data_x_y = get_data.test_processing_long(r"data/post-test.txt", r"data/post-tag-test.txt", i2w, w2i, i2t, t2i, 100, test_batch)
reference_dic = cPickle.load(open(r'print_bleu_score/reference_dic.pkl', 'rb'))
print "done."

print "compiling..."
model = Seq2Seq(dim_x + dim_tag, dim_y + dim_tag, dim_y, dim_tag, hidden_size_encoder, hidden_size_decoder, cell, optimizer, drop_rate, num_sents)
# # load_error_model("GRU-200_best.model", model)

print "training..."


start = time.time()
g_error = 0.5
for i in xrange(10000):
    error = 0.0
    in_start = time.time()
    for get_num_start in xrange((full_data_len/read_data_batch)+1):
        read_data_batch_error = 0.0
        in_b_start = time.time()
        get_num_end = get_num_start*read_data_batch + read_data_batch
        if get_num_end > full_data_len:
Example #3
    sort_within_batch=True,
    repeat=False)

### Encoder
### Decoder
### Seq2Seq

INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)

enc = Model.Encoder(INPUT_DIM, Paras.ENC_EMB_DIM, Paras.HID_DIM,
                    Paras.N_LAYERS, Paras.ENC_DROPOUT)
dec = Model.Decoder(OUTPUT_DIM, Paras.DEC_EMB_DIM, Paras.HID_DIM,
                    Paras.N_LAYERS, Paras.DEC_DROPOUT)

model = Seq2Seq.Seq2Seq(enc, dec, device).to(device)

### init_weights

model.apply(Model.init_weights)

### count_parameters

print(f'The model has {Model.count_parameters(model):,} trainable parameters')

optimizer = optim.Adam(model.parameters())

# calculate the loss, ignoring the padding token
PAD_IDX = TRG.vocab.stoi['<pad>']

criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
Example #4

def make_distance_matrix(vectors):
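    """Return the matrix of pairwise Euclidean distances between vectors."""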
    nb_vector = len(vectors)
    out = np.zeros([nb_vector, nb_vector])
    for i, v1 in enumerate(vectors):
        for j, v2 in enumerate(vectors):
            out[i][j] = np.linalg.norm(v1 - v2)

    return out
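
# A vectorized equivalent (a sketch, assuming `vectors` can be stacked into a
# 2-D NumPy float array of shape [nb_vector, dim]):
def make_distance_matrix_vectorized(vectors):
    arr = np.asarray(vectors, dtype=np.float64)
    # Broadcasting yields all pairwise differences in one shot.
    return np.linalg.norm(arr[:, None, :] - arr[None, :, :], axis=-1)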


if __name__ == "__main__":
    batch, vocab_size = convert_to_batch(SENTENCES)

    seq2seq = Seq2Seq(vocab_size, NB_HIDDEN, NB_HIDDEN, LEARNING_RATE)
    init_global = tf.global_variables_initializer()
    init_local = tf.local_variables_initializer()

    loss_placeholder = tf.placeholder(tf.float32, [])
    loss_summary = tf.summary.scalar("Loss", loss_placeholder)
    accuracy_placeholder = tf.placeholder(tf.float32, [])
    accuracy_summary = tf.summary.scalar("Accuracy", accuracy_placeholder)
    summary_writer = tf.summary.FileWriter(LOG_DIR, tf.get_default_graph())

    with tf.Session() as sess:
        sess.run([init_global, init_local])

        accuracy = 0.0
        epoch = 0
        while accuracy < STOP_TRESHOLD:
Example #5
def train():
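    """Train the character-level seq2seq model with pytorch-ignite: NLL loss,
    gradient clipping and accumulation, per-epoch validation, early stopping,
    and periodic checkpointing."""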
    parser = ArgumentParser()
    parser.add_argument(
        "--dataset_path",
        type=str,
        default="../data/time_transfor/Time Dataset.json",
        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache",
                        type=str,
                        default='../cache/',
                        help="Path or url of the dataset cache")
    parser.add_argument("--batch_size",
                        type=int,
                        default=32,
                        help="Batch size for validation")
    parser.add_argument("--embedding_dim",
                        type=int,
                        default=100,
                        help="Batch size for validation")
    parser.add_argument("--hidden_dim",
                        type=int,
                        default=100,
                        help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps",
                        type=int,
                        default=1,
                        help="Accumulate gradients on several steps")
    parser.add_argument("--lr",
                        type=float,
                        default=6.25e-5,
                        help="Learning rate")
    parser.add_argument("--train_precent",
                        type=float,
                        default=0.7,
                        help="Batch size for validation")
    parser.add_argument("--n_epochs",
                        type=int,
                        default=30,
                        help="Number of training epochs")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--max_norm",
                        type=float,
                        default=1.0,
                        help="Clipping gradient norm")
    parser.add_argument("--log_step",
                        type=int,
                        default=10,
                        help="Multiple-choice loss coefficient")
    parser.add_argument("--raw_data",
                        action='store_true',
                        default=True,
                        help="If true read data by raw function")
    args = parser.parse_args()
    device = torch.device(args.device)

    train_data_loader, valid_data_loader, input_lengths, target_lengths = get_data_loaders(
        args.dataset_path, args.batch_size, args.train_percent)

    encoder = Encoder(input_lengths + 1, args.embedding_dim, args.hidden_dim)
    decoder = Decoder(target_lengths + 1, args.embedding_dim, args.hidden_dim)
    model = Seq2Seq(encoder, decoder, device).to(device)
    optimizer = optim.Adam(model.parameters())
    criterion = nn.NLLLoss(ignore_index=0).to(device)

    def update(engine, batch):
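        """One training step: forward pass, NLL loss over all time steps,
        gradient clipping, and (optional) gradient accumulation."""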
        model.train()
        src_seqs = batch[0].transpose(0, 1).to(device)
        src_lengths = batch[1].to(device)
        trg_seqs = batch[2].transpose(0, 1).to(device)
        output = model(src_seqs, src_lengths, trg_seqs)
        loss = criterion(output.contiguous().view(-1, output.shape[2]),
                         trg_seqs.contiguous().view(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)

        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            src_seqs = batch[0].transpose(0, 1).to(device)
            src_lengths = batch[1].to(device)
            trg_seqs = batch[2].transpose(0, 1).to(device)
            output = model(src_seqs, src_lengths, trg_seqs)
            return output.contiguous().view(
                -1, output.shape[2]), trg_seqs.contiguous().view(-1)

    evaluator = Engine(inference)
    metrics = {
        "nll": Loss(criterion, output_transform=lambda x: (x[0], x[1])),
        "accuracy": Accuracy(output_transform=lambda x: (x[0], x[1]))
    }
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(trainer):
        evaluator.run(valid_data_loader)
        ms = evaluator.state.metrics
        logger.info(
            "Validation Results - Epoch: [{}/{}]  Avg accuracy: {:.6f} Avg loss: {:.6f}"
            .format(trainer.state.epoch, trainer.state.max_epochs,
                    ms['accuracy'], ms['nll']))

    '''======================early stopping =========================='''

    def score_function(engine):
        val_loss = engine.state.metrics['nll']
        return -val_loss

    handler = EarlyStopping(patience=5,
                            score_function=score_function,
                            trainer=trainer)
    evaluator.add_event_handler(Events.COMPLETED, handler)
    '''==================print information per iteration========================='''
    steps = len(train_data_loader.dataset) // train_data_loader.batch_size
    steps = steps if steps > 0 else 1
    logger.info('steps:%d' % steps)

    @trainer.on(Events.ITERATION_COMPLETED)
    def log_training_loss(trainer):
        if trainer.state.iteration % args.log_step == 0:
            logger.info("Epoch[{}/{}] Step[{}/{}] Loss: {:.6f}".format(
                trainer.state.epoch, trainer.state.max_epochs,
                trainer.state.iteration % steps, steps,
                trainer.state.output * args.gradient_accumulation_steps))

    '''================add check point========================'''
    checkpoint_handler = ModelCheckpoint(checkpoint_dir,
                                         'checkpoint',
                                         save_interval=1,
                                         n_saved=3)
    trainer.add_event_handler(
        Events.EPOCH_COMPLETED, checkpoint_handler,
        {'mymodel': getattr(model, 'module', model)
         })  # "getattr" take care of distributed encapsulation
    '''==============run trainer============================='''
    trainer.run(train_data_loader, max_epochs=args.n_epochs)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--resume_ret', action='store_true')
    parser.add_argument('--fourthofdata', action='store_true')
    parser.add_argument('--halfdata', action='store_true')

    opt = parser.parse_args()

    ############################### RETRIEVER #################################

    ret_enc = RNNEncoder(ret_INPUT_DIM, ret_ENC_EMB_DIM, ret_HID_DIM,
                         ret_N_LAYERS, ret_ENC_DROPOUT)
    ret_dec = RNNDecoder(ret_OUTPUT_DIM, ret_DEC_EMB_DIM, ret_HID_DIM,
                         ret_N_LAYERS, ret_DEC_DROPOUT)

    ret_model = Seq2Seq(ret_enc, ret_dec, cuda_device).to(cuda_device)

    print('The model has {0:9d} trainable parameters'.format(
        count_parameters(ret_model)))

    ret_optimizer = optim.Adam(ret_model.parameters())
    ret_criterion = nn.CrossEntropyLoss()

    if not os.path.isdir('models'):
        os.makedirs('models')

    if opt.resume_ret:
        with open("results/" + "ret" + "_data.pickle", "rb") as k:
            data = pickle.load(k)

        train_data = data["train"]
        valid_data = data["valid"]
        test_data = data["test"]
        train_data_adj = data["train_adj"]
        valid_data_adj = data["valid_adj"]
        test_data_adj = data["test_adj"]

        print("valid data", valid_data)

        MODEL_SAVE_PATH = os.path.join(SAVE_DIR, "ret" + '_model.pt')
        ret_model.load_state_dict(torch.load(MODEL_SAVE_PATH))

        with open("results/" + "ret" + "_latent_space_vect.pickle", "rb") as j:
            latent_space_vects = pickle.load(j)
            enc_train_vect = latent_space_vects["train"]
            enc_valid_vect = latent_space_vects["valid"]
    else:
        train_data, valid_data, test_data, train_data_adj, valid_data_adj, test_data_adj = split_data(
            opt)
        data = {}
        data["train"] = train_data
        data["valid"] = valid_data
        data["test"] = test_data
        data["train_adj"] = train_data_adj
        data["valid_adj"] = valid_data_adj
        data["test_adj"] = test_data_adj

        with open("results/" + "ret" + "_data.pickle", "wb") as k:
            pickle.dump(data, k)
        enc_train_vect, enc_valid_vect = train_valid_model(
            filename="ret",
            which_train=ret_train,
            which_evaluate=ret_evaluate,
            model=ret_model,
            train_data=train_data,
            valid_data=valid_data,
            train_data_adj=train_data_adj,
            valid_data_adj=valid_data_adj,
            optimizer=ret_optimizer,
            criterion=ret_criterion)

    enc_test_vect = test_model(filename="ret",
                               which_evaluate=ret_evaluate,
                               model=ret_model,
                               test_data=test_data,
                               test_data_adj=test_data_adj,
                               criterion=ret_criterion)

    ######################## NEAREST NEIGHBOUR #################################

    train_ann = create_annoy_index("AttnEncAttnDecTrain", enc_train_vect)
    valid_ann = create_annoy_index("AttnEncAttnDecValid", enc_valid_vect)
    test_ann = create_annoy_index("AttnEncAttnDecTest", enc_test_vect)

    wordlist2comment_dict = pickle.load(open("wordlist2comment.pickle", "rb"))
    word2idcommentvocab_dict = pickle.load(
        open("word2idcommentvocab.pickle", "rb"))

    sim_train_data = torch.zeros_like(train_data)
    sim_valid_data = torch.zeros_like(valid_data)
    sim_test_data = torch.zeros_like(test_data)
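
    # For every sample below, look up its nearest training example in the
    # encoded space via the Annoy index and pair it with the original sample
    # (the retrieve step feeding the editor further down).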

    for training_sample_id in range(train_data.shape[0]):
        training_sample_comment = train_data[
            training_sample_id][:max_comment_len]
        training_sample_code = train_data[training_sample_id][max_comment_len +
                                                              1:]

        annoy_vect = train_ann.get_item_vector(training_sample_id)
        sim_vect_id = train_ann.get_nns_by_vector(annoy_vect, 1)[0]

        if sim_vect_id == training_sample_id:
            print("Same id for training vect and similar vect")
            exit(0)

        sim_train_data[training_sample_id] = train_data[sim_vect_id]

    new_train_data = torch.cat((train_data, sim_train_data), dim=1)
    #print("new_train_data ", new_train_data.shape)

    for valid_sample_id in range(valid_data.shape[0]):
        valid_sample_comment = valid_data[valid_sample_id][:max_comment_len]
        valid_sample_code = valid_data[valid_sample_id][max_comment_len + 1:]

        annoy_vect = valid_ann.get_item_vector(valid_sample_id)
        sim_vect_id = train_ann.get_nns_by_vector(annoy_vect, 1)[0]

        if sim_vect_id == valid_sample_id:
            print("Same id for validation vect and similar vect")
            exit(0)

        sim_valid_data[valid_sample_id] = train_data[sim_vect_id]

    new_valid_data = torch.cat((valid_data, sim_valid_data), dim=1)

    for test_sample_id in range(test_data.shape[0]):
        test_sample_comment = test_data[test_sample_id][:max_comment_len]
        test_sample_code = test_data[test_sample_id][max_comment_len + 1:]

        annoy_vect = test_ann.get_item_vector(test_sample_id)
        sim_vect_id = train_ann.get_nns_by_vector(annoy_vect, 1)[0]

        if sim_vect_id == test_sample_id:
            print("Same id for test vect and similar vect")
            exit(0)

        sim_test_data[test_sample_id] = train_data[sim_vect_id]

    new_test_data = torch.cat((test_data, sim_test_data), dim=1)

    ############################### TSNE #################################

    #tsne_test_sample = enc_test_vect[0]
    num_tsne_train_data = 100
    which_tsne_test_sample = random.randint(0, enc_test_vect.shape[0] - 1)

    annoy_tsne_test_vect = test_ann.get_item_vector(which_tsne_test_sample)
    tsne_data = enc_train_vect[:num_tsne_train_data]
    tsne_data_add = torch.zeros(11, enc_test_vect.shape[1], device=cuda_device)
    tsne_data_add[0] = enc_test_vect[which_tsne_test_sample]

    nr = 1
    for id in train_ann.get_nns_by_vector(annoy_tsne_test_vect, 10):
        tsne_data_add[nr] = enc_train_vect[id]
        nr += 1

    tsne_data = torch.cat((tsne_data, tsne_data_add), dim=0)

    colour_labels = []
    for i in range(num_tsne_train_data):
        colour_labels += ["#0099cc"]  #train
    colour_labels += ["#e60b42"]  #test
    for i in range(10):
        colour_labels += ["#f09a00"]  #nearest neighbours

    vis_tsne(data=tsne_data, labels=colour_labels, name="10nearest")

    ############################### EDITOR #################################

    ed_enc = GraphCondAttnEncoder(src_vocab_size, trg_vocab_size, ed_hid_dim,
                                  ed_n_layers, ed_n_heads, ed_pf_dim,
                                  AttnEncoderLayer, SelfAttention,
                                  PositionwiseFeedforward, ed_dropout,
                                  cuda_device)
    ed_dec = AttnDecoder(ed_output_dim, ed_hid_dim, ed_n_layers, ed_n_heads,
                         ed_pf_dim, AttnDecoderLayer, SelfAttention,
                         PositionwiseFeedforward, ed_dropout, cuda_device)

    ed_pad_idx = 0
    ed_model = Editor(ed_enc, ed_dec, ed_pad_idx, cuda_device).to(cuda_device)

    for p in ed_model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

    print('The model has {0:9d} trainable parameters'.format(
        count_parameters(ed_model)))

    ed_optimizer = optim.Adam(ed_model.parameters())
    ed_criterion = nn.CrossEntropyLoss()

    output_train_vect, output_valid_vect_candidates = train_valid_model(
        filename="ed",
        which_train=ed_train,
        which_evaluate=ed_evaluate,
        model=ed_model,
        train_data=new_train_data,
        valid_data=new_valid_data,
        train_data_adj=train_data_adj,
        valid_data_adj=valid_data_adj,
        optimizer=ed_optimizer,
        criterion=ed_criterion)
    #print("Test model")
    output_test_vect_candidates = test_model(filename="ed",
                                             which_evaluate=ed_evaluate,
                                             model=ed_model,
                                             test_data=new_test_data,
                                             test_data_adj=test_data_adj,
                                             criterion=ed_criterion)
    output_test_vect_reference = test_data[:, max_comment_len:]

    token_dict = pickle.load(open("codevocab.pickle", "rb"))

    all_refs = []
    all_cands = []
    all_bleu_scores = []
    for j in range(test_data.shape[0]):
        ref = []
        cand = []
        for i in range(max_code_len):
            ref_el = output_test_vect_reference[j][i].item()
            cand_el = output_test_vect_candidates[j][i].item()
            if ref_el > 0:
                if ref_el in token_dict:
                    ref += [token_dict[ref_el]]
                if cand_el in token_dict:
                    cand += [token_dict[cand_el]]
        bleu = sentence_bleu([ref], cand)
        all_bleu_scores += [bleu]
        all_refs += [ref]
        all_cands += [cand]

    bleu_eval = {}
    bleu_eval["scores"] = all_bleu_scores
    bleu_eval["references"] = all_refs
    bleu_eval["candidates"] = all_cands

    print("Average BLEU score is ",
          sum(all_bleu_scores) / len(all_bleu_scores))
    pickle.dump(bleu_eval, open("results/bleu_evaluation_results.pickle",
                                "wb"))
Example #7
def main(N_EPOCHS, learning_rate, batch_size, device, save_dir):
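    """Build the summarisation seq2seq model, load the checkpoint saved under
    `save_dir`, generate summaries for the test split, and write them to CSV."""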

    dataset = util.load_jsonl(train_path)
    dataset = Dataset.Dataset(dataset)

    train_length = int(len(dataset) * 0.8)
    valid_length = len(dataset) - train_length
    train_set, val_set = torch.utils.data.random_split(
        dataset, (train_length, valid_length))
    # check the case where the remaining chunk is smaller than the default batch size
    #train_set, val_set,_ = torch.utils.data.random_split(dataset, (32, 32,len(dataset)-64))

    test_set = util.load_jsonl(test_path)
    test_set = Dataset.Dataset(test_set)
    # check the dimensions of the generated sentences
    #test_set,_ = torch.utils.data.random_split(test_set, (2,len(test_set)-2))

    train_dataloader = torch.utils.data.DataLoader(train_set,
                                                   batch_size=batch_size,
                                                   shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(
        val_set, batch_size=batch_size)  # shuffle=True

    test_dataloader = torch.utils.data.DataLoader(test_set,
                                                  batch_size=1)  # generate summaries one example at a time

    # adjust input dim and output dim
    INPUT_DIM = 49990  #80000
    OUTPUT_DIM = 49990  #80000
    ENC_EMB_DIM = 32
    DEC_EMB_DIM = 32
    HID_DIM = 512
    N_LAYERS = 2
    ENC_DROPOUT = 0.5
    DEC_DROPOUT = 0.5

    vocab = Vocab.Vocab()

    enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT,
                  device)
    dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

    model = Seq2Seq(enc, dec, device).to(device)
    model.apply(init_weights)

    torch.autograd.set_detect_anomaly(True)

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss(ignore_index=0).to(device)  # padding

    #N_EPOCHS = 1
    CLIP = 1

    best_valid_loss = 100  #float('inf')
    '''
    for epoch in range(N_EPOCHS):

        #start_time = time.time()

        train_loss = train(model, train_dataloader, optimizer, criterion, CLIP, vocab, device)
        valid_loss = evaluate(model, val_dataloader, criterion, vocab, device) # , valid_sents

        #end_time = time.time()
        #epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model, f'{save_dir}/seq2seq.pt')

        #print(f'Epoch: {epoch+1:2} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')
    '''
    model = torch.load(f'{save_dir}/seq2seq.pt')
    test_sents = test(model, test_dataloader, criterion, vocab, device)

    save_csv(out_path, test_sents)
Example #8
    decoder = Seq2SeqBA.Decoder(o_dim, e_dim, enc_h_dim, dec_h_dim, dropout,
                                attention)
    model = Seq2SeqBA.BahdanauS2S(encoder, decoder, device).to(device)
    model_name = "S2SBA.pt"
else:
    encoder = Seq2Seq.Encoder(i_dim,
                              e_dim,
                              enc_h_dim,
                              n_layers=2,
                              dropout=dropout)
    decoder = Seq2Seq.Decoder(o_dim,
                              e_dim,
                              dec_h_dim,
                              n_layers=2,
                              dropout=dropout)
    model = Seq2Seq.Seq2Seq(encoder, decoder, device).to(device)
    model_name = "S2S.pt"

print("Initialize weights")
model.apply(initialize_weights)

optimizer = optim.Adam(model.parameters(), lr=lr)
target_pad_idx = en_field.vocab.stoi[en_field.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index=target_pad_idx)

best_val_loss = float('inf')
writer = SummaryWriter(log_dir)
for epoch in range(num_epochs):
    s = time.time()
    train_loss = train(model, train_loader, optimizer, criterion, clip=1)
    val_loss = evaluate(model, val_loader, criterion)
Example #9
def main():
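    """Retrieval-only baseline: encode comment/code pairs with a seq2seq model,
    look up nearest training neighbours with Annoy, and score the retrieved
    code against the test references with BLEU."""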
    parser = argparse.ArgumentParser()
    parser.add_argument('--resume_ret', action='store_true')
    parser.add_argument('--fourthofdata', action='store_true')
    parser.add_argument('--halfdata', action='store_true')
    parser.add_argument('--threefourthsofdata', action='store_true')
    opt = parser.parse_args()

    train_data, valid_data, test_data = split_data(opt)

    ret_enc = RNNEncoder(ret_INPUT_DIM, ret_ENC_EMB_DIM, ret_HID_DIM,
                         ret_N_LAYERS, ret_ENC_DROPOUT)
    ret_dec = RNNDecoder(ret_OUTPUT_DIM, ret_DEC_EMB_DIM, ret_HID_DIM,
                         ret_N_LAYERS, ret_DEC_DROPOUT)

    model = Seq2Seq(ret_enc, ret_dec, cuda_device).to(cuda_device)

    print('The model has {0:9d} trainable parameters'.format(
        count_parameters(model)))

    optimizer = optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss()

    if not os.path.isdir('models'):
        os.makedirs('models')

    enc_train_vect, enc_valid_vect = train_valid_model(model=model,
                                                       train_data=train_data,
                                                       valid_data=valid_data,
                                                       optimizer=optimizer,
                                                       criterion=criterion)
    enc_test_vect = test_model(model=model,
                               test_data=test_data,
                               criterion=criterion)

    train_ann = create_annoy_index("AttnEncAttnDecTrain", enc_train_vect)
    valid_ann = create_annoy_index("AttnEncAttnDecValid", enc_valid_vect)
    test_ann = create_annoy_index("AttnEncAttnDecTest", enc_test_vect)

    wordlist2comment_dict = pickle.load(open("wordlist2comment.pickle", "rb"))
    word2idcommentvocab_dict = pickle.load(
        open("word2idcommentvocab.pickle", "rb"))

    sim_train_data = torch.zeros_like(train_data)
    sim_valid_data = torch.zeros_like(valid_data)
    sim_test_data = torch.zeros_like(test_data)
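
    # For every sample below, retrieve its nearest training example in the
    # encoded space via the Annoy index; for the test split, the code half of
    # the retrieved sample later serves as the BLEU candidate.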

    for training_sample_id in range(train_data.shape[0]):
        training_sample_comment = train_data[
            training_sample_id][:max_comment_len]
        training_sample_code = train_data[training_sample_id][max_comment_len +
                                                              1:]

        annoy_vect = train_ann.get_item_vector(training_sample_id)
        sim_vect_id = train_ann.get_nns_by_vector(annoy_vect, 1)[0]

        if sim_vect_id == training_sample_id:
            print("Same id for training vect and similar vect")
            exit(0)

        sim_train_data[training_sample_id] = train_data[sim_vect_id]

    for valid_sample_id in range(valid_data.shape[0]):
        valid_sample_comment = valid_data[valid_sample_id][:max_comment_len]
        valid_sample_code = valid_data[valid_sample_id][max_comment_len + 1:]

        annoy_vect = valid_ann.get_item_vector(valid_sample_id)
        sim_vect_id = train_ann.get_nns_by_vector(annoy_vect, 1)[0]

        if sim_vect_id == valid_sample_id:
            print("Same id for validation vect and similar vect")
            exit(0)

        sim_valid_data[valid_sample_id] = train_data[sim_vect_id]

    for test_sample_id in range(test_data.shape[0]):
        test_sample_comment = test_data[test_sample_id][:max_comment_len]
        test_sample_code = test_data[test_sample_id][max_comment_len + 1:]

        annoy_vect = test_ann.get_item_vector(test_sample_id)
        sim_vect_id = train_ann.get_nns_by_vector(annoy_vect, 1)[0]

        if sim_vect_id == test_sample_id:
            print("Same id for test vect and similar vect")
            exit(0)

        sim_test_data[test_sample_id] = train_data[sim_vect_id]

    output_test_vect_reference = test_data[:, max_comment_len:]
    output_test_vect_candidates = sim_test_data[:, max_comment_len:]

    token_dict = pickle.load(open("codevocab.pickle", "rb"))

    all_refs = []
    all_cands = []
    all_bleu_scores = []
    for j in range(test_data.shape[0]):
        ref = []
        cand = []
        for i in range(max_code_len):
            ref_el = output_test_vect_reference[j][i].item()
            cand_el = output_test_vect_candidates[j][i].item()
            if ref_el > 0:
                if ref_el in token_dict:
                    ref += [token_dict[ref_el]]
                if cand_el in token_dict:
                    cand += [token_dict[cand_el]]
        bleu = sentence_bleu([ref], cand)
        all_bleu_scores += [bleu]
        all_refs += [ref]
        all_cands += [cand]

    bleu_eval = {}
    bleu_eval["scores"] = all_bleu_scores
    bleu_eval["references"] = all_refs
    bleu_eval["candidates"] = all_cands

    print("Average BLEU score is ",
          sum(all_bleu_scores) / len(all_bleu_scores))
    pickle.dump(bleu_eval, open("results/bleu_evaluation_results.pickle",
                                "wb"))
Example #10
NB_HIDDEN = 250
BATCH_SIZE = 50
NB_STEP = 15
NB_FEATURES = 26 # Alphabet
LEARNING_RATE = 1e-2
STOP_TRESHOLD = 1.
LOG_DIR = "logs/" + str(LEARNING_RATE) + "_learning_rate"

# Random one hot batch generator tensor
random_one_hot_batch_generator = tf.one_hot(tf.random_uniform([NB_STEP], minval=0, \
                                                              maxval=NB_FEATURES - 1, \
                                                              dtype=tf.int32), NB_FEATURES)

if __name__ == "__main__":
    seq2seq = Seq2Seq(NB_FEATURES, NB_HIDDEN, NB_HIDDEN, LEARNING_RATE)
    init_global = tf.global_variables_initializer()
    init_local = tf.local_variables_initializer()

    loss_placeholder = tf.placeholder(tf.float32, [])
    loss_summary = tf.summary.scalar("Loss", loss_placeholder)
    accuracy_placeholder = tf.placeholder(tf.float32, [])
    accuracy_summary = tf.summary.scalar("Accuracy", accuracy_placeholder)
    summary_writer = tf.summary.FileWriter(LOG_DIR, tf.get_default_graph())

    with tf.Session() as sess:
        sess.run([init_global, init_local])

        accuracy = 0.0
        epoch = 0
        while accuracy < STOP_TRESHOLD:
Example #11
def train_model():
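    """Pre-train the answer LSTM and the TF Seq2Seq model on the noisy/clean
    data, periodically printing the eval minibatch loss, and save both
    checkpoints under ../Seq_ckpt/."""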
    train_noisy_Id, train_noisy_len, train_clean_Id, train_clean_len, train_answer_Id, train_answer_len, test_noisy_Id, test_noisy_len, test_clean_Id, test_clean_len, test_answer_Id, test_answer_len, eval_noisy_Id, eval_noisy_len, eval_clean_Id, eval_clean_len, eval_answer_Id, eval_answer_len, vocab_size = load_data(
    )

    max_answer_length = np.asarray(train_answer_Id).shape[1]
    max_target_length = np.asarray(train_clean_Id).shape[1]
    max_source_length = np.asarray(train_noisy_Id).shape[1]

    print "trian answer Lstm model"
    an_Lstm = Answer_LSTM.answer_lstm(batch_size, max_answer_length,
                                      vocab_size, embedding_size, num_units,
                                      None, None, None, None)
    an_Lstm.build_graph()

    saver = tf.train.Saver(sharded=False)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # saver.restore(sess, "../Seq_ckpt/pretrain-model")
        for batch in range(max_batches):
            answer_shuffle, ans_len = next_batch_Lstm(train_answer_Id,
                                                      train_answer_len,
                                                      batch_size)
            fd = {
                an_Lstm.answer_inputs: answer_shuffle,
                an_Lstm.answer_inputs_length: ans_len
            }
            l, _ = sess.run([an_Lstm.loss_answer, an_Lstm.train_lstm], fd)
            if batch == 0 or batch % batches_in_epoch == 0:
                print('batch {}'.format(batch))
                answer_shuffle, ans_len = next_batch_Lstm(
                    eval_answer_Id, eval_answer_len, batch_size)
                fd_eval = {
                    an_Lstm.answer_inputs: answer_shuffle,
                    an_Lstm.answer_inputs_length: ans_len
                }
                print('  minibatch loss: {}'.format(
                    sess.run(an_Lstm.loss_answer, fd_eval)))
        saver.save(sess, "../Seq_ckpt/pretrain-lstm")

    print "trian Seq2seq model"
    Seq2Seq_model = Seq2Seq.Seq2Seq(batch_size, max_source_length,
                                    max_target_length, vocab_size,
                                    embedding_size, num_units, None, None,
                                    None, None, None, None, None)
    Seq2Seq_model.build_graph()

    saver = tf.train.Saver(sharded=False)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # saver.restore(sess, "../Seq_ckpt/pretrain-model")
        for batch in range(max_batches):
            source_shuffle, source_len, target_shuffle, target_len = next_batch_Seq2seq(
                train_noisy_Id, train_noisy_len, train_clean_Id,
                train_clean_len, batch_size)
            fd = {
                Seq2Seq_model.encoder_inputs: source_shuffle,
                Seq2Seq_model.encoder_inputs_length: source_len,
                Seq2Seq_model.decoder_targets: target_shuffle,
                Seq2Seq_model.decoder_length: target_len
            }
            l, _ = sess.run(
                [Seq2Seq_model.loss_seq2seq, Seq2Seq_model.train_op], fd)
            if batch == 0 or batch % batches_in_epoch == 0:
                print('batch {}'.format(batch))
                source_shuffle, source_len, target_shuffle, target_len = next_batch_Seq2seq(
                    eval_noisy_Id, eval_noisy_len, eval_clean_Id,
                    eval_clean_len, batch_size)
                fd_eval = {
                    Seq2Seq_model.encoder_inputs: source_shuffle,
                    Seq2Seq_model.encoder_inputs_length: source_len,
                    Seq2Seq_model.decoder_targets: target_shuffle,
                    Seq2Seq_model.decoder_length: target_len
                }
                print('  minibatch loss: {}'.format(
                    sess.run(Seq2Seq_model.loss_seq2seq, fd_eval)))
        saver.save(sess, "../Seq_ckpt/pretrain-seq2seq")