Example #1
def calc_score():
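    # Score a trained Seq2seq checkpoint on the test set: perplexity, BLEU, ROUGE, DIST-n and mean length.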
    seed_everything()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = Seq2seq().to(device)
    model.load(
        torch.load('{}/{}_{}.pth'.format(OUTPUT_DIR, FN, CKPT_NUM))['model'])

    sp = spm.SentencePieceProcessor()
    sp.Load(SP_PATH)

    test_data_txt = open(TEST_DATA_TXT_PATH, 'r', encoding='utf8')
    with open(TEST_DATA_PKL_PATH, 'rb') as f:
        test_data_pkl = pickle.load(f)
    dataset = DialogDataset(test_data_pkl)
    data_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

    if LOSS == 'SCE':
        criterion = SCELoss()
    elif LOSS == 'ITF':
        criterion = ITFLoss(device, _lambda=LAMBDA)
    else:
        criterion = INFLoss(device, _lambda=LAMBDA)

    count = 0
    test_ref, test_hyp = [], []
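    # Odd lines of the test file are decoded by the model; even lines are tokenized as-is for comparison.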
    for line in test_data_txt:
        count += 1
        if count % 2 != 0:
            if LANGUAGE == 'JP':
                test_ref.append(
                    sp.EncodeAsPieces(
                        evaluate(line, sp, model, device).replace('▁', '')))
            else:
                test_ref.append(
                    sp.EncodeAsPieces(
                        evaluate(line, sp, model, device).replace('▁', ' ')))
        else:
            if LANGUAGE == 'JP':
                test_hyp.append(sp.EncodeAsPieces(line.replace('▁', '')))
            else:
                test_hyp.append(sp.EncodeAsPieces(line.replace('▁', ' ')))

    print("---------- RESULTS ---------")
    test_ppl = get_perplexity(model, criterion, data_loader, device)
    print("PPL: {}".format(test_ppl))

    test_bleu_1 = get_bleu_score(test_ref, test_hyp, 1) * 100
    test_bleu_2 = get_bleu_score(test_ref, test_hyp, 2) * 100
    print("BLEU-1:{}, 2:{}".format(test_bleu_1, test_bleu_2))

    test_rouge_1 = get_rouge_score(test_ref, test_hyp, 1) * 100
    test_rouge_2 = get_rouge_score(test_ref, test_hyp, 2) * 100
    print("ROUGE-1:{}, 2:{}".format(test_rouge_1, test_rouge_2))

    test_dist_1 = get_dist_n(test_ref, 1) * 100
    test_dist_2 = get_dist_n(test_ref, 2) * 100
    print("DIST-1:{}, 2:{}".format(test_dist_1, test_dist_2))

    test_len = get_length(test_ref) / count
    print("LENGTH:{}".format(test_len))
Example #2
    def __init__(self, model_path, output_dir):
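        # Evaluation setup: test loader, BERT tokenizer/config, and the trained Seq2seq checkpoint.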
        self.logger = logging.getLogger('paragraph-level')

        self.output_dir = output_dir
        with open(config.test_trg_file, "r") as f:
            self.test_data = f.readlines()
        self.data_loader = get_loader(config.test_src_file,
                                      config.test_trg_file,
                                      config.test_ans_file,
                                      batch_size=1,
                                      use_tag=False,
                                      shuffle=False)

        self.tokenizer = BertTokenizer.from_pretrained(r'MTBERT/vocab.txt')
        self.model_config = BertConfig.from_pretrained('MTBERT')
        self.model = Seq2seq()
        if config.use_gpu:
            state_dict = torch.load(model_path, map_location=config.device)
        else:
            state_dict = torch.load(model_path, map_location='cpu')

        self.model.load_state_dict(state_dict)
        self.model.eval()
        if config.use_gpu:
            self.model = self.model.to(config.device)
        self.pred_dir = 'result/pointer_maxout_ans/generated.txt'
        self.golden_dir = 'result/pointer_maxout_ans/golden.txt'
        self.src_file = 'result/pointer_maxout_ans/src.txt'

        # dummy file for evaluation
        with open(self.src_file, "w") as f:
            for i in range(len(self.data_loader)):
                f.write(str(i) + "\n")
Example #3
    def __init__(self, model_path, output_dir):
        with open(config.word2idx_file, "rb") as f:
            word2idx = pickle.load(f)

        self.output_dir = output_dir
        with open(config.test_trg_file, "r") as f:
            self.test_data = f.readlines()
        self.data_loader = get_loader(config.test_src_file,
                                      config.test_trg_file,
                                      word2idx,
                                      batch_size=1,
                                      use_tag=True,
                                      shuffle=False)

        self.tok2idx = word2idx
        self.idx2tok = {idx: tok for tok, idx in self.tok2idx.items()}
        self.model = Seq2seq()
        state_dict = torch.load(model_path)
        self.model.load_state_dict(state_dict)
        self.model.eval()
        self.model = self.model.to(config.device)
        self.pred_dir = os.path.join(output_dir, "generated.txt")
        self.golden_dir = os.path.join(output_dir, "golden.txt")
        self.src_file = os.path.join(output_dir, "src.txt")

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        # dummy file for evaluation
        with open(self.src_file, "w") as f:
            for i in range(len(self.data_loader)):
                f.write(str(i) + "\n")
Example #4
    def __init__(self, model_path=None):
        # load dictionary and embedding file
        with open(config.embedding, "rb") as f:
            embedding = pickle.load(f)
            embedding = torch.Tensor(embedding).to(config.device)
        with open(config.word2idx_file, "rb") as f:
            word2idx = pickle.load(f)

        # train, dev loader
        print("load train data")
        self.train_loader = get_loader(config.train_src_file,
                                       config.train_trg_file,
                                       word2idx,
                                       use_tag=config.use_tag,
                                       batch_size=config.batch_size,
                                       debug=config.debug)
        self.dev_loader = get_loader(config.dev_src_file,
                                     config.dev_trg_file,
                                     word2idx,
                                     use_tag=config.use_tag,
                                     batch_size=128,
                                     debug=config.debug)

        train_dir = os.path.join("./save", "seq2seq")
        self.model_dir = os.path.join(train_dir, "train_%d" % int(time.strftime("%m%d%H%M%S")))
        if not os.path.exists(self.model_dir):
            os.makedirs(self.model_dir)

        self.model = Seq2seq(embedding, config.use_tag, model_path=model_path)
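        # Optimize only the encoder and decoder parameters; PAD (index 0) is ignored by the loss.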
        params = list(self.model.encoder.parameters()) \
                 + list(self.model.decoder.parameters())

        self.lr = config.lr
        self.optim = optim.SGD(params, self.lr, momentum=0.8)
        self.criterion = nn.CrossEntropyLoss(ignore_index=0)
Example #5
File: main.py Project: l-xin/hms
def create_model(args):
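    # Build a shared embedding, encoder and decoder from the loader's vocabulary
    # (words occurring fewer than 5 times are trimmed) and wrap them in Seq2seq.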
    trim_min_count = 5
    data_loader = DataLoader(args, trim_min_count=trim_min_count)

    embed_model = nn.Embedding(data_loader.vocab_len, args.embed)
    embed_model.weight.data.copy_(data_loader.embed_vectors)
    encode_model = Encoder(
        embed_model=embed_model,
        hidden_size=args.hidden,
        span_size=data_loader.span_size,
        dropout=args.dropout,
    )

    decode_model = Decoder(
        embed_model=embed_model,
        op_set=data_loader.op_set,
        vocab_dict=data_loader.vocab_dict,
        class_list=data_loader.class_list,
        hidden_size=args.hidden,
        dropout=args.dropout,
        use_cuda=args.use_cuda
    )
    
    seq2seq = Seq2seq(encode_model, decode_model)
    return seq2seq, data_loader
Example #6
    def predict(self, seq2seq):
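        # Decode the dataset batch by batch and hand the collected predictions to evaluation.get_result.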
        predicts = []
        ids = []
        sentences = []
        lengths = []
        # gold = []
        data = prepare.load_data(self.mode)
        if self.mode == 'test':
            data = prepare.test_process(data)
        else:
            data = prepare.process(data)
        data = data_prepare.Data(data, self.config.batch_size, self.config)
        for batch_i in range(data.batch_number):
            batch_data = data.next_batch(is_random=False)
            pred_action_list, pred_logits_list = self.test_step(
                batch_data, seq2seq)
            pred_action_list = pred_action_list.cpu().numpy()

            sentences.extend(batch_data.sentence_fw)
            predicts.extend([
                pred_action_list[:, i]
                for i in range(pred_action_list.shape[1])
            ])
            # print(len(predicts))
            ids.extend(batch_data.standard_outputs)
            lengths.extend(batch_data.input_sentence_length)

        evaluation.get_result(ids, sentences, lengths, predicts, self.config)
        # gold.extend(batch_data.all_triples)

        # (r_f1, r_precision, r_recall), (e_f1, e_precision, e_recall) = evaluation.rel_entity_compare(predicts, gold, self.config)
        data.reset()
Example #7
def train(epoch_num):
    args.train = True
    model = Seq2seq(args, ch_words_dict)
    batch_index, min_loss = 0, 100
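    # TF1 training loop: batches with sentences longer than 90 tokens are skipped,
    # and a checkpoint is saved whenever the loss hits a new minimum.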
    with tf.Session() as sess:
        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())
        for epoch in range(epoch_num):
            train_generator = batch_genetator(mode='train')
            try:
                while True:
                    sou_sentences_list, sou_length_list, tag_sentences_list, tag_length_list = next(train_generator)
                    print('sentence length: {}'.format(len(sou_sentences_list[0])))

                    if len(sou_sentences_list[0]) > 90:
                        continue
                    feed_dict = {model.sequence_input: sou_sentences_list,
                                 model.sequence_length: sou_length_list,
                                 model.target_input: tag_sentences_list,
                                 model.target_length: tag_length_list}

                    loss, _ = sess.run([model.loss, model.train_op], feed_dict=feed_dict)
                    print('epoch: {}, batch index: {}, loss: {}, current min loss: {}'.format(epoch, batch_index, loss, min_loss))
                    if loss < min_loss:
                        min_loss = loss
                        print('save at epoch: {}, batch {} the loss is {}'.format(epoch, batch_index, min_loss))
                        saver.save(sess, '../model/model.ckpt')

                    batch_index = batch_index + 1
            except StopIteration:
                print('finish training')
Example #8
    def __init__(self):
        logging.info("load data......")
        self.data = datasets.Lang8v1()
        self.data.process()
        self.data.show()

        self.config = Config()
        self.config.source_vocab_size = self.data.src_vocab_size
        self.config.target_vocab_size = self.data.tgt_vocab_size
        self.config.batch_size = 1

        logging.info("build model......")
        self.model = Seq2seq(config=self.config,
                             src_embedding=self.data.src_embedding_matrix,
                             tgt_embedding=self.data.tgt_embedding_matrix,
                             useTeacherForcing=False,
                             useAttention=True,
                             useBeamSearch=8)

        logging.info("init model......")
        # with tf.Session() as sess:
        sess = tf.Session()
        self.model.init(sess)
        checkpoint_path = tf.train.latest_checkpoint(
            self.config.checkpoint_dir)
        assert checkpoint_path, 'No checkpoint found'
        logging.info('Restore model from %s' % checkpoint_path)
        self.model.saver.restore(sess, checkpoint_path)
Example #9
    def __init__(self, args):
        # load dictionary and embedding file
        with open(config.embedding, "rb") as f0:
            embedding = pickle.load(f0)
            embedding = torch.tensor(embedding,
                                     dtype=torch.float).to(config.device)
        with open(config.entity_embedding, "rb") as f1:
            ent_embedding = pickle.load(f1)
            ent_embedding = torch.tensor(ent_embedding,
                                         dtype=torch.float).to(config.device)
        with open(config.relation_embedding, "rb") as f2:
            rel_embedding = pickle.load(f2)
            rel_embedding = torch.tensor(rel_embedding,
                                         dtype=torch.float).to(config.device)
        with open(config.word2idx_file, "rb") as f:
            word2idx = pickle.load(f)
        with open(config.ent2idx_file, "rb") as g:
            ent2idx = pickle.load(g)
        with open(config.rel2idx_file, "rb") as h:
            rel2idx = pickle.load(h)

        # train, dev loader
        print("load train data")
        self.train_loader = get_loader(config.train_src_file,
                                       config.train_trg_file,
                                       config.train_csfile,
                                       word2idx,
                                       use_tag=True,
                                       batch_size=config.batch_size,
                                       debug=config.debug)
        self.dev_loader = get_loader(config.dev_src_file,
                                     config.dev_trg_file,
                                     config.dev_csfile,
                                     word2idx,
                                     use_tag=True,
                                     batch_size=128,
                                     debug=config.debug)

        train_dir = "./save"
        self.model_dir = os.path.join(
            train_dir, "train_%d" % int(time.strftime("%m%d%H%M%S")))
        if not os.path.exists(self.model_dir):
            os.makedirs(self.model_dir)

        self.model = Seq2seq(embedding, ent_embedding, rel_embedding)
        # self.model = nn.DataParallel(self.model)
        self.model = self.model.to(config.device)

        if len(args.model_path) > 0:
            print("load check point from: {}".format(args.model_path))
            state_dict = torch.load(args.model_path, map_location="cpu")
            self.model.load_state_dict(state_dict)

        params = self.model.parameters()

        self.lr = config.lr
        self.optim = optim.SGD(params, self.lr, momentum=0.8)
        # self.optim = optim.Adam(params)
        self.criterion = nn.CrossEntropyLoss(ignore_index=0)
Example #10
    def __init__(self, config: const.Config, mode: str, device: torch.device) -> None:
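        # Evaluation wrapper: build the model and preprocess the requested data split into batches.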

        self.config = config

        self.device = device

        self.seq2seq = Seq2seq(config, device=device)

        data = prepare.load_data(mode)
        data = prepare.process(data)
        self.data = data_prepare.Data(data, config.batch_size, config)
Example #11
def test(dataset, args, test_id):
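    # Restore the latest checkpoint and caption every test clip; beamsearch turns per-step probabilities into words.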
    if args.model == 'seq2seq':
        model = Seq2seq(32, args.lr, dataset.vocabsize, args.embed_dim, args.fs, args.feat_dim, dataset.pretrain_wordemb, False, pred_batch_size=1)
    else:
        print('choose a model to train! ')
        parser.print_help()
        return

    model.build_model()
    print(model.x)
    saver = tf.train.Saver()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    latest_ckpt = tf.train.latest_checkpoint(args.loadpath)
    saver.restore(sess, latest_ckpt)
    print('restore from', latest_ckpt)
    
    fwrite = open(args.output, 'w')

    for step in range(0, dataset.iters):
        batch_x = dataset.next_batch_test()
        hist = np.zeros((1, dataset.vocabsize), dtype=np.float32)
        hist = list(hist)
        indices = np.ones((1, 1), dtype=np.float32)
        indices = list(indices)

        _, probs = sess.run([model.generated_words, model.probs], 
            feed_dict={ model.x: batch_x, model.hist: hist, model.indices:indices})
        

        words = beamsearch(probs, dataset.wtoi)

        sentence = ''
        for idx in words:
            if idx != 0 and dataset.itow[idx] == '<eos>':
                break
            if idx != 0:
                sentence += dataset.itow[idx] + ' '

        # spot-check a few specific test clips
        if test_id[step] in ('klteYv1Uv9A_27_33.avi', '5YJaS2Eswg0_22_26.avi',
                             'UbmZAe5u5FI_132_141.avi', 'JntMAcTlOF0_50_70.avi',
                             'tJHUH9tpqPg_113_118.avi'):
            print(test_id[step], sentence)

    fwrite.write('%s,%s\n' % (test_id[step], sentence))

    fwrite.close()
    print('save test result file as', args.output)
Example #12
    def __init__(self, config: const.Config, device: torch.device) -> None:

        self.config = config

        self.device = device

        self.seq2seq = Seq2seq(config, device=device, load_emb=True)
        self.loss = nn.NLLLoss()
        self.optimizer = torch.optim.Adam(self.seq2seq.parameters())

        data = prepare.load_data('train')
        data = prepare.process(data)
        self.data = data_prepare.Data(data, config.batch_size, config)

        self.epoch_number = config.epoch_number + 1
Example #13
    def __init__(self):
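        # Build the vocabulary and stop-word id list from the pair dataset, then load the trained weights.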
        self.DEVICE = torch.device("cuda" if config.is_cuda else "cpu")
        dataset = PairDataset(config.data_path,
                              max_src_len=config.max_src_len,
                              max_tgt_len=config.max_tgt_len,
                              truncate_src=config.truncate_src,
                              truncate_tgt=config.truncate_tgt)

        self.vocab = dataset.build_vocab(embed_file=config.embed_file)
        self.model = Seq2seq(self.vocab)
        self.stop_word = list(
            set([
                self.vocab[x.strip()]
                for x in open(config.stop_word_file).readlines()
            ]))
        self.model.load_model()
        self.model.to(self.DEVICE)
Example #14
def test_conv():
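    # Decode each source line three times and write src/result_1..3/tgt rows to a CSV.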
    seed_everything()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = Seq2seq().to(device)
    model.load(torch.load('{}/{}_{}.pth'.format(OUTPUT_DIR, FN, CKPT_NUM))['model'])

    sp = spm.SentencePieceProcessor()
    sp.Load(SP_PATH)

    test_data_txt = open(TEST_DATA_TXT_PATH, 'r', encoding='utf8')
    csv_name = './result_{}_{}_epoch{}.csv'.format(LOSS, LAMBDA, EPOCH_NUM)

    convs, tgts = [], []
    count = 0
    for line in test_data_txt:
        count += 1
        if count % 2 != 0:
            if LANGUAGE == 'JP':
                underline_replace = ''
            else:
                underline_replace = ' '
            convs.append({
                "src": line,
                "result_1": evaluate(line, sp, model, device).replace('▁', underline_replace),
                "result_2": evaluate(line, sp, model, device).replace('▁', underline_replace),
                "result_3": evaluate(line, sp, model, device).replace('▁', underline_replace),
            })

        else:
            tgts.append(line)

    col_name = ['src', 'result_1', 'result_2', 'result_3', 'tgt']
    try:
        with open(csv_name, 'w', newline='', encoding='utf8') as output_csv:
            csv_writer = csv.writer(output_csv)
            csv_writer.writerow(col_name)

            for conv, tgt in zip(convs, tgts):
                row_items = [conv['src'], conv['result_1'], conv['result_2'], conv['result_3'], tgt]
                csv_writer.writerow(row_items)

    except OSError:
        print('---------- OS Error ----------')
Example #15
def run_evaluate():
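    # Interactive chat with the trained model; type 'q' to quit.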
    seed_everything()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    sp_model = spm.SentencePieceProcessor()
    sp_model.Load(SP_PATH)
    model = Seq2seq().to(device)
    model.load(
        torch.load('{}/{}_{}.pth'.format(OUTPUT_DIR, FN, CKPT_NUM))['model'])

    while True:
        s = input('You > ')
        if s == 'q':
            break
        print('BOT > ', end='')
        if LANGUAGE == 'JP':
            print(evaluate(s, sp_model, model, device).replace('▁', ''))
        else:
            print(evaluate(s, sp_model, model, device).replace('▁', ' '))
Example #16
    def __init__(self, args):
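        # Trainer setup: data loaders, optional checkpoint restore, and SGD over everything except the frozen BERT encoder.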
        self.logger = logging.getLogger('paragraph-level')

        # train, dev loader
        print("load train data")
        self.train_loader = get_loader(config.train_src_file,
                                       config.train_trg_file,
                                       config.train_ans_file,
                                       batch_size=config.batch_size,
                                       debug=config.debug,
                                       shuffle=True)
        self.dev_loader = get_loader(config.dev_src_file,
                                     config.dev_trg_file,
                                     config.dev_ans_file,
                                     batch_size=128,
                                     debug=config.debug)

        train_dir = os.path.join(config.file_path + "save", "seq2seq")
        self.model_dir = os.path.join(
            train_dir, "train_%d" % int(time.strftime("%m%d%H%M%S")))
        if not os.path.exists(self.model_dir):
            os.makedirs(self.model_dir)

        self.model = Seq2seq()
        if config.use_gpu:
            self.model = self.model.to(config.device)

        if len(args.model_path) > 0:
            print("load check point from: {}".format(args.model_path))
            state_dict = torch.load(args.model_path, map_location="cpu")
            self.model.load_state_dict(state_dict)

        params = self.model.parameters()
        bert_params = self.model.bert_encoder.named_parameters()
        for name, param in bert_params:
            param.requires_grad = False
        base_params = filter(lambda p: p.requires_grad,
                             self.model.parameters())
        self.lr = config.lr
        self.optim = optim.SGD(base_params, self.lr, momentum=0.8)
        # self.optim = optim.Adam(params)
        self.criterion = nn.CrossEntropyLoss(ignore_index=0)
Example #17
    def __init__(self, model_path, output_dir):
        with open(config.word2idx_file, "rb") as f:
            word2idx = pickle.load(f)

        self.output_dir = output_dir
        with open(config.test_trg_file, "r") as f:
            self.test_data = f.readlines()
        self.data_loader = get_loader(config.test_src_file,
                                      config.test_trg_file,
                                      word2idx,
                                      batch_size=1,
                                      use_tag=config.use_tag,
                                      shuffle=False)

        self.tok2idx = word2idx
        self.idx2tok = {idx: tok for tok, idx in self.tok2idx.items()}
        self.model = Seq2seq(model_path=model_path)
        self.pred_dir = output_dir + "/generated.txt"
        self.golden_dir = output_dir + "/golden.txt"
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
Example #18
    def __init__(self):
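        # Train a context-to-question (c2q) model on SQuAD with SGD; PAD (index 0) is ignored by the loss.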
        # load Bert Tokenizer and pre-trained word embedding
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        embeddings = None
        self.model = Seq2seq(config.dropout, embeddings, use_tag=config.use_tag)

        train_dir = os.path.join("./save", "c2q")

        self.train_loader = self.get_data_loader("./squad/train-v1.1.json")
        self.dev_loader = self.get_data_loader("./squad/new_dev-v1.1.json")

        self.model_dir = os.path.join(train_dir, "train_%d" % int(time.strftime("%m%d%H%M%S")))
        if not os.path.exists(self.model_dir):
            os.makedirs(self.model_dir)

        params = list(self.model.encoder.parameters()) \
                 + list(self.model.decoder.parameters())

        self.lr = 0.1
        self.optim = optim.SGD(params, lr=self.lr)
        self.criterion = nn.CrossEntropyLoss(ignore_index=0)
Example #19
    def __init__(self, model_path, output_dir):
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.output_dir = output_dir
        self.golden_q_ids = None
        self.all_c_tokens = None
        self.all_answer_text = None
        self.data_loader = self.get_data_loader("./squad/new_test-v1.1.json")

        self.tok2idx = self.tokenizer.vocab
        self.idx2tok = {idx: tok for tok, idx in self.tok2idx.items()}
        self.model = Seq2seq(dropout=0.0,
                             model_path=model_path,
                             use_tag=config.use_tag)
        for param in self.model.parameters():
            param.requires_grad = False
        self.model.eval_mode()
        self.src_file = output_dir + "/src.txt"
        self.pred_file = output_dir + "/generated.txt"
        self.golden_file = output_dir + "/golden.txt"
        self.ans_file = output_dir + "/answer.txt"
        self.total_file = output_dir + "/all_files.csv"
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
Example #20
def evaluate():
    args.beam_search_num = -1
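    # Decode the evaluation split and print source, prediction and target sentences side by side.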
    en_id2word_path = '../Dataset/en_id2word_dict.pkl'
    ch_id2word_path = '../Dataset/ch_id2word_dict.pkl'

    with open(en_id2word_path, 'rb') as f:
        en_id2word_dict = pkl.load(f)

    with open(ch_id2word_path, 'rb') as f:
        ch_id2word_dict = pkl.load(f)

    model = Seq2seq(args, ch_words_dict)
    evaluate_generator = batch_genetator(mode='eva')
    batch_index = 0

    with tf.Session() as sess:
        saver = tf.train.Saver()
        saver.restore(sess, '../model/model.ckpt')
        try:
            while True:
                sou_sentences_list, sou_length_list, tag_sentences_list, tag_length_list = next(evaluate_generator)
                if len(sou_sentences_list[0]) > 90:
                    continue
                feed_dict = {model.sequence_input: sou_sentences_list,
                             model.sequence_length: sou_length_list,
                             model.target_input: tag_sentences_list,
                             model.target_length: tag_length_list}
                predict_ids = sess.run(model.out, feed_dict=feed_dict)
                for sentence_index in range(len(sou_sentences_list)):
                    sou_sentence = [en_id2word_dict[i] for i in sou_sentences_list[sentence_index]]

                    predict_sentence = [ch_id2word_dict[i] for i in predict_ids[sentence_index]]
                    tag_sentence = [ch_id2word_dict[i] for i in tag_sentences_list[sentence_index]]
                    print('sou_sentence: {}'.format(sou_sentence))
                    print('predict_sentence: {}'.format(predict_sentence))
                    print('tag_sentence: {}'.format(tag_sentence))
                batch_index = batch_index + 1
        except StopIteration:
            print('finish evaluating')
Example #21
model_path = "checkpoint/model.ckpt"

if __name__ == "__main__":
    print("(1)load data......")
    docs_source, docs_target = load_data(10)
    w2i_source, i2w_source = make_vocab(docs_source)
    w2i_target, i2w_target = make_vocab(docs_target)

    print("(2) build model......")
    config = Config()
    config.source_vocab_size = len(w2i_source)
    config.target_vocab_size = len(w2i_target)
    model = Seq2seq(config=config,
                    w2i_target=w2i_target,
                    useTeacherForcing=False,
                    useAttention=True,
                    useBeamSearch=3)

    print("(3) run model......")
    print_every = 100
    max_target_len = 20

    with tf.Session(config=tf_config) as sess:
        saver = tf.train.Saver()
        saver.restore(sess, model_path)

        source_batch, source_lens, target_batch, target_lens = get_batch(
            docs_source, w2i_source, docs_target, w2i_target,
            config.batch_size)
Example #22
        source_batch.append(source_seq)
        target_batch.append(target_seq)
    return source_batch, source_lens, target_batch, target_lens


if __name__ == '__main__':
    print('loading data ...')
    doc_source = helper.load_file('./data/small_vocab_en.txt')
    doc_target = helper.load_file('./data/small_vocab_fr.txt')
    s_token2idx, s_idx2token = helper.load_vocab('./data/small_vocab_en.txt', helper.SOURCE_CODES)
    t_token2idx, t_idx2token = helper.load_vocab('./data/small_vocab_fr.txt', helper.TARGET_CODES)
    print('building model...')
    config = config()
    config.source_vocab_size = len(s_token2idx)
    config.target_vocab_size = len(t_token2idx)
    model = Seq2seq(config, t_token2idx, useTeacherForcing=True)
    batches = 10000
    print_every = 100
    print('run model...')
    with tf.Session() as sess:
        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())
        losses = []
        total_loss = 0
        for batch in range(batches):
            source_batch, source_lens, target_batch, target_lens = get_batch(doc_source, s_token2idx, doc_target,
                                                                             t_token2idx, config.batch_size)

            feed_dict = {
                model.seq_inputs: source_batch,
                model.seq_inputs_len: source_lens,
Example #23
def step_one():
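    # args.mode picks the encoder/decoder cell pair: 0=lstm+lstm, 1=gru+gru, 2=gru+lstm, otherwise lstm+gru.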

    if args.mode == 0:
        encoder_cell = 'lstm'
        decoder_cell = 'lstm'
    elif args.mode == 1:
        encoder_cell = 'gru'
        decoder_cell = 'gru'
    elif args.mode == 2:
        encoder_cell = 'gru'
        decoder_cell = 'lstm'
    else:
        encoder_cell = 'lstm'
        decoder_cell = 'gru'

    data_loader = DataLoader(args)
    embed_model = nn.Embedding(data_loader.vocab_len, 128)
    #embed_model.weight.data.copy_(torch.from_numpy(data_loader.word2vec.emb_vectors))
    encode_model = EncoderRNN(vocab_size=data_loader.vocab_len,
                              embed_model=embed_model,
                              emb_size=128,
                              hidden_size=256,
                              input_dropout_p=0.3,
                              dropout_p=0.4,
                              n_layers=2,
                              bidirectional=True,
                              rnn_cell=None,
                              rnn_cell_name=encoder_cell,
                              variable_lengths=True)
    decode_model = DecoderRNN_3(vocab_size=data_loader.vocab_len,
                                class_size=data_loader.classes_len,
                                embed_model=embed_model,
                                emb_size=128,
                                hidden_size=512,
                                n_layers=2,
                                rnn_cell=None,
                                rnn_cell_name=decoder_cell,
                                sos_id=data_loader.vocab_dict['END_token'],
                                eos_id=data_loader.vocab_dict['END_token'],
                                input_dropout_p=0.3,
                                dropout_p=0.4)
    seq2seq = Seq2seq(encode_model, decode_model)

    if args.cuda_use:
        seq2seq = seq2seq.cuda()

    weight = torch.ones(data_loader.classes_len)
    pad = data_loader.decode_classes_dict['PAD_token']
    loss = NLLLoss(weight, pad)

    st = SupervisedTrainer(vocab_dict=data_loader.vocab_dict,
                           vocab_list=data_loader.vocab_list,
                           decode_classes_dict=data_loader.decode_classes_dict,
                           decode_classes_list=data_loader.decode_classes_list,
                           cuda_use=args.cuda_use,
                           loss=loss,
                           print_every=10,
                           teacher_schedule=False,
                           checkpoint_dir_name=args.checkpoint_dir_name)

    print('start training')
    st.train(model=seq2seq,
             data_loader=data_loader,
             batch_size=128,
             n_epoch=300,
             template_flag=True,
             resume=args.resume,
             optimizer=None,
             mode=args.mode,
             teacher_forcing_ratio=args.teacher_forcing_ratio,
             post_flag=args.post_flag)
Example #24
from torch.utils.data import DataLoader
from config import *
from model import Seq2seq, SCELoss, ITFLoss, INFLoss
from utils import DialogDataset, get_optimizer, seed_everything, one_cycle

logging.basicConfig(level=logging.INFO)

if __name__ == '__main__':
    logging.info('---------- Initializing  ----------')
    seed_everything()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    start_epoch = CKPT_NUM

    logging.info('---------- Define Models ----------')
    model = Seq2seq().to(device)
    if MULTI:
        model = torch.nn.DataParallel(model)

    sp = spm.SentencePieceProcessor()
    sp.Load(SP_PATH)

    logging.info('---------- Define Loss and Optimizer ----------')
    if LOSS == 'SCE':
        criterion = SCELoss()
    elif LOSS == 'ITF':
        criterion = ITFLoss(device, _lambda=LAMBDA)
    else:
        criterion = INFLoss(device, _lambda=LAMBDA)
    _opt = optim.Adam(model.parameters(), lr=1e-3, betas=(0.9, 0.98), eps=1e-9)
    optimizer = get_optimizer(_opt)
Example #25
def train(dataset, args, retrain):
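    # Scheduled sampling: the sample length grows once the loss drops below 0.65, roll_sch decays
    # after epoch 50, and training stops early when the test loss falls below 0.55.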
    if args.model == 'seq2seq':
        model = Seq2seq(args.bs, args.lr, dataset.vocabsize, args.embed_dim, args.fs, args.feat_dim, dataset.pretrain_wordemb)
    else:
        print('choose a model to train! ')
        parser.print_help()
        return
    
    model.build_model()

    saver = tf.train.Saver()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())

    if retrain == True:
        latest_ckpt = tf.train.latest_checkpoint(args.loadpath)
        saver.restore(sess, latest_ckpt)
        print('restore from', latest_ckpt)

    print('Start training, method=%s, lr=%f, epoch=%d, comment=%s' % (model.name, args.lr, args.ep, args.comment))

    path_args = args.savepath.split('/')[0] + '/output.log'
    fwrite = open(path_args, 'w')

    preloss = 1000.
    earlystop = 0
    cutting_len = 8

    pre_loss = 10.0
    roll_sch = 1.0
    for ep in range(1, args.ep+1):
        correct = []
        train_total_loss = 0

        start_time = time.time()

        if pre_loss < 0.65 and cutting_len < 45: # prevent overfitting
            cutting_len += 10
            dataset.random_sample(4, int(cutting_len))
            print('re sample, size:', dataset.size, ', iters:', dataset.iters, 'cutting_size:', cutting_len)

        else:
            dataset.random_sample(4, int(cutting_len))
        
        for step in range(0, dataset.iters):          # total run 21120 samples
            batch_x, batch_y = dataset.next_batch()
            hist = np.zeros((batch_x.shape[0], dataset.vocabsize), dtype=np.float32)
            hist = list(hist)
            indices = np.ones((batch_x.shape[0], 1), dtype=np.float32)
            indices = list(indices)

            batch_y_mask = np.zeros( (batch_y.shape[0], batch_y.shape[1]) )
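            # mark the non-PAD (id 0) token positions of each caption in the loss mask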
            nonzeros = list( map(lambda x: (x != 0).sum() , batch_y ) )
            for ind, row in enumerate(batch_y_mask):
                row[:nonzeros[ind]] = 1
            
            if step == 0 and ep == 1:
                print(batch_x.shape, batch_y.shape)
            roll = np.random.rand()
            _, loss = sess.run([model.train_op, model.loss_op], 
                feed_dict={ model.x: batch_x, model.caption: batch_y, model.caption_mask: batch_y_mask, 
                            model.prob_sch:roll_sch, model.roll:roll, model.hist:hist, model.indices:indices})
            # print(logit_words)
            train_total_loss += loss

            if step % 10 == 0 or step == 1:
                # pred, current_embed = sess.run([model.pred, model.current_embed], 
                #     feed_dict={ model.x: batch_x, model.caption: batch_y}),
                # print(len(pred))
                # print(pred[0])
                # print('================================\n')
                # print(len(current_embed))
                # print(current_embed[0])
                print("Epoch: %2d, Step: %7d/%7d, Train_loss: %.4f, roll: %2.3f, roll_sch: %2.3f         " % 
                    (ep, step, dataset.iters, loss, roll, roll_sch), end='\r')

        train_total_loss /= dataset.iters
        pre_loss = train_total_loss

        print("Epoch: %2d, Step: %7d/%7d, Train_loss: %2.4f        " % 
                    (ep, step, dataset.iters, train_total_loss), end='\r')

        test_total_loss = 0
        # total_iters = 75
        # totalsample = dataset.random_sample(2400)
        # total_iters = np.ceil(totalsample/args.bs).astype(np.int32)
        dataset.random_sample(1)
        for step in range(0, dataset.iters): 
            batch_x, batch_y = dataset.next_batch() #dataset.next_batch_val()
            hist = np.zeros((batch_x.shape[0], dataset.vocabsize), dtype=np.float32)
            hist = list(hist)
            indices = np.ones((batch_x.shape[0], 1), dtype=np.float32)
            indices = list(indices)
            # batch_y = np.column_stack((batch_y, np.zeros( [len(batch_y), 1] ))).astype(int)
            batch_y_mask = np.zeros( (batch_y.shape[0], batch_y.shape[1]) )
            nonzeros = list( map(lambda x: (x != 0).sum() , batch_y ) )
            for ind, row in enumerate(batch_y_mask):
                row[:nonzeros[ind]] = 1
            roll = 0.0
            loss = sess.run(model.loss_op, 
                feed_dict={ model.x: batch_x, model.caption: batch_y, model.caption_mask: batch_y_mask, 
                            model.prob_sch:roll_sch, model.roll:roll, model.hist:hist, model.indices:indices})

            test_total_loss += loss

        test_total_loss /= dataset.iters
        end_time = time.time()
        print("Epoch: %2d, take_time: %4.1fs, Train_loss: %2.4f, Test_loss: %2.4f          " % 
                    (ep, (end_time-start_time), train_total_loss, test_total_loss))
        fwirte.write("Epoch: %2d, take_time: %4.1fs, Train_loss: %2.4f, Test_loss: %2.4f\n" % 
                    (ep, (end_time-start_time), train_total_loss, test_total_loss))

        saver.save(sess, args.savepath, global_step=ep)
        if ep > 50:
            roll_sch *= 0.99

        if test_total_loss < 0.55:
            print('earlystop at epoch %d' %(ep))
            break


    print('Done')
    print("Model saved in file: %s\n" % args.savepath)
    fwrite.write('Done')
    fwrite.close()
Example #26
            optimizer.apply_gradients(grads_and_vars=zip(
                grads, model.variables))  # update the parameters
            if (batch + 1) % 50 == 0:
                print('[Epoch{} Batch{}] loss:{:.3f}'.format(
                    epoch + 1, batch + 1, loss.numpy()))
        manager.save()  # save a checkpoint after every epoch
        print('Epoch{} Loss: {:.5f}'.format(epoch + 1, np.mean(epoch_loss)))
        print('***************')


if __name__ == '__main__':
    train_X = np.loadtxt('/data/train_X.txt', dtype='int')
    train_Y = np.loadtxt('/data/train_Y.txt', dtype='int')
    test_X = np.loadtxt('/data/test_X.txt', dtype='int')

    index2word, word2index, embedding_matrix = load_vocab_embedding_matrix()

    config = Configurations()

    train_dataset = tf.data.Dataset.from_tensor_slices(
        (train_X, train_Y)).batch(config.batch_size)

    model = Seq2seq(vocab_size=embedding_matrix.shape[0],
                    embedding_dim=embedding_matrix.shape[1],
                    embedding_matrix=embedding_matrix,
                    gru_units=config.hid_dim,
                    dropout_rate=config.dropout)

    training(model, train_dataset, config.epochs, config.learning_rate,
             word2index['<PAD>'])
Example #27
tf_config.gpu_options.allow_growth = True

model_path = "checkpoint/model.ckpt"

if __name__ == "__main__":
    print("(1)load data......")
    docs_source = ['new jersey is usually hot during autumn , and it is never quiet in winter .\n']
    docs_target = ["new jersey est généralement chaud pendant l' automne , et il est jamais calme en hiver .\n"]
    w2i_source, i2w_source = helper.load_vocab('./data/small_vocab_en.txt', helper.SOURCE_CODES)
    w2i_target, i2w_target = helper.load_vocab('./data/small_vocab_fr.txt', helper.TARGET_CODES)

    print("(2) build model......")
    config = config()
    config.source_vocab_size = len(w2i_source)
    config.target_vocab_size = len(w2i_target)
    model = Seq2seq(config, w2i_target, useTeacherForcing=False)

    print("(3) run model......")
    print_every = 100
    max_target_len = 20

    with tf.Session(config=tf_config) as sess:
        saver = tf.train.Saver()
        saver.restore(sess, model_path)

        source_batch, source_lens, target_batch, target_lens = get_batch(docs_source, w2i_source, docs_target,
                                                                         w2i_target, config.batch_size)

        feed_dict = {
            model.seq_inputs: source_batch,
            model.seq_inputs_len: source_lens,
Example #28
import sys
sys.path.append('/home/demolwang/demolwang/math_word_problem/critical-based/seq2seq_v2/src')
from model import EncoderRNN, DecoderRNN_1, Seq2seq
import torch
from torch.autograd import Variable
import torch.nn as nn
import pdb


embed_model = nn.Embedding(1000, 100)

encode_model = EncoderRNN(1000, embed_model, 100, 128, 0, 0, 4, True, None, 'lstm', True)

decode_model = DecoderRNN_1(1000, 10, embed_model, 100, 256, 3, None, 'gru', 1, 0, 0, 0)

seq2seq = Seq2seq(encode_model, decode_model)

input = Variable(torch.LongTensor([[1,2,4,5],[4,3,2,9]]))
target = Variable(torch.LongTensor([[4,3,2], [11,3,4]]))

lengths = [4,4]

dol, dh, ssl = seq2seq(input, lengths, target, 0, 3)
pdb.set_trace()
pass



Example #29
def run(user_question, seq2seq=Seq2seq(), proc=DataProcess()):
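    # End-to-end inference: preprocess the question, map it to indices, decode, and re-space the output.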
    pro_sent = proc.prepocess_sentence(user_question)
    txt_to_idx = seq2seq.convert_text_to_index(pro_sent, proc.word2idx)
    pred = seq2seq.predict_model()
    sentence = seq2seq.idx_to_sentence(txt_to_idx, pred)
    return sentence_spacing(sentence)
Example #30
def main():
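    # Resume training from the latest checkpoint, validating after every epoch and logging the dev loss.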
    logging.info("(1) load data......")
    data = datasets.Lang8v1()
    data.process()
    data.show()
    # docs_source, docs_target = load_data("")
    # w2i_source, i2w_source = make_vocab(docs_source)
    # w2i_target, i2w_target = make_vocab(docs_target)

    config = Config()
    config.source_vocab_size = data.src_vocab_size
    config.target_vocab_size = data.tgt_vocab_size

    logging.info("(2) build model......")
    model = Seq2seq(config=config,
                    src_embedding=data.src_embedding_matrix,
                    tgt_embedding=data.tgt_embedding_matrix,
                    useTeacherForcing=config.useTeacherForcing,
                    useAttention=config.useAttention)

    logging.info("(3) run model......")
    with tf.Session(config=tf_config) as sess:
        tf.summary.FileWriter('graph', sess.graph)
        model.init(sess)
        best_epoch = 0
        previous_losses = []
        exp_loss = None
        exp_length = None
        exp_norm = None
        total_iters = 0
        start_time = time.time()
        batches_per_epoch = data.nb_train / config.batch_size
        time_per_iter = None

        checkpoint_path = tf.train.latest_checkpoint(config.checkpoint_dir)
        # last_epoch = -int(checkpoint_path[checkpoint_path.rfind('-'):])
        last_epoch = findLatestCheckpointBatch(config.checkpoint_dir)
        # checkpoint_path = os.path.join('checkpoint', "best.ckpt-2")
        logging.info('last epoch: %s' % last_epoch)
        if debug: exit()
        if os.path.exists('checkpoint/checkpoint'):
            logging.info('Restore model from %s' % checkpoint_path)
            model.saver.restore(sess, checkpoint_path)
        else:
            logging.info("Created model with fresh parameters.")
            exit()
        if debug: exit()
        for epoch in range(last_epoch + 1, config.epochs):
            epoch_tic = time.time()
            current_step = 0
            # for source_tokens, source_mask, target_tokens, target_mask in pair_iter(x_train, y_train, FLAGS.batch_size, FLAGS.num_layers):
            for batch_vars in data.get_batch(config.batch_size, 'train'):
                src_batch, tgt_batch, src_lens, tgt_lens = batch_vars
                # Get a batch and make a step.
                tic = time.time()
                loss, grad_norm, param_norm = model.train(*batch_vars)
                toc = time.time()
                iter_time = toc - tic
                # total_iters += np.sum(target_mask)
                # tps = total_iters / (time.time() - start_time)
                current_step += 1
                # if current_step>5: break
                time_per_iter = (time.time() - epoch_tic) / current_step

                # lengths = np.sum(target_mask, axis=0)
                mean_length = np.mean(src_lens)
                std_length = np.std(src_lens)

                if not exp_loss:
                    exp_loss = loss
                    exp_length = mean_length
                    exp_norm = grad_norm
                else:
                    exp_loss = 0.99 * exp_loss + 0.01 * loss
                    exp_length = 0.99 * exp_length + 0.01 * mean_length
                    exp_norm = 0.99 * exp_norm + 0.01 * grad_norm

                loss = loss / mean_length

                if current_step == 1 or current_step % config.print_every == 0:
                    logging.info(
                        time.strftime('%Y-%m-%d %H:%M:%S',
                                      time.localtime(time.time())))
                    logging.info(
                        'epoch %d/%d, batch %d/%.0f\n  loss %f, exp_loss %f, grad norm %f, param norm %f, length mean/std %f/%f'
                        % (epoch, config.epochs, current_step,
                           batches_per_epoch, loss, exp_loss / exp_length,
                           grad_norm, param_norm, mean_length, std_length))
                    logging.info(
                        'Cost Time: {}, ETA: {}, iter time: {:.3f} sec\n'.
                        format(
                            sec2str(toc - start_time),
                            sec2str(time_per_iter *
                                    (batches_per_epoch - current_step)),
                            (time_per_iter)))

                    predict_batch = model.predict(*batch_vars)
                    logging.info('-' * 80)
                    for i in range(3):
                        logging.info("[src]: " + ' '.join([
                            data.src_i2w[num]
                            for num in src_batch[i] if data.src_i2w[num] != PAD
                        ]))
                        logging.info("[tgt]: " + ' '.join([
                            data.tgt_i2w[num]
                            for num in tgt_batch[i] if data.tgt_i2w[num] != PAD
                        ]))
                        logging.info("[prd]: " + ' '.join([
                            data.tgt_i2w[num] for num in predict_batch[i]
                            if data.tgt_i2w[num] != PAD
                        ]))
                        logging.info('-')
                    logging.info('-' * 80)
                    logging.info("")

                if current_step % config.save_every == 0:
                    logging.info('Saving model to {}'.format(checkpoint_path))
                    model.saver.save(sess, checkpoint_path)

            epoch_toc = time.time()
            logging.info('Cost Time: {}, Total ETA: {}\n'.format(
                sec2str(epoch_toc - start_time),
                sec2str((epoch_toc - epoch_tic) * (config.epochs - epoch))))

            ## Validate
            # valid_cost = validate(model, sess, x_dev, y_dev)
            logging.info('validation ...')
            loss_dev = []
            tot_iter = data.nb_dev // config.batch_size
            # nb_dev = config.batch_size*tot_iter
            for i, dev_batch in enumerate(
                    data.get_batch(config.batch_size, 'dev')):
                t = model.test(*dev_batch)
                loss_dev.append(t)
                if i % max(1, tot_iter // 20) == 0:
                    logging.info('  {:.2f}%  loss: {:.2f}'.format(
                        (i + 1) * 100 / tot_iter, t))
                if i + 1 == tot_iter: break
            valid_loss = np.mean(loss_dev)

            logging.info("Epoch %d Validation cost: %.2f time: %s" %
                         (epoch, valid_loss, sec2str(epoch_toc - epoch_tic)))

            ## Checkpoint
            checkpoint_path = os.path.join(config.checkpoint_dir, "best.ckpt")
            if len(previous_losses) > 2 and valid_loss > previous_losses[-1]:
                pass
                # logging.info("Annealing learning rate by %f" % FLAGS.learning_rate_decay_factor)
                # sess.run(model.learning_rate_decay_op)
                # model.saver.restore(sess, checkpoint_path + ("-%d" % best_epoch))
            # else:
            logging.info('Saving checkpoint to {}'.format(checkpoint_path))
            previous_losses.append(valid_loss)
            # best_epoch = epoch
            model.saver.save(sess, checkpoint_path, global_step=epoch)
            with open('checkpoint/log', 'a') as f:
                f.write('{:02d}: {:.6f}\n'.format(epoch, valid_loss))