Esempio n. 1
0
def test_process(opt):
    """Prepare the data and model described by ``opt`` and run ``test``.

    ``opt`` is modified in place: ``cuda``, ``d_word_vec``, ``batch_size``,
    ``n_position``, the three vocabulary-size fields and the three padding
    indices are written onto it before the model is built.

    Args:
        opt: Options object. This function reads ``no_cuda``, ``d_model``,
            ``b``, ``datapath`` and ``restore`` from it.
    """
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model
    opt.batch_size = opt.b

    device = torch.device('cuda' if opt.cuda else 'cpu')

    data_class = SingleTurnDialog.load_class('OpenSubtitles')
    data_arg = Storage()
    data_arg.file_id = opt.datapath
    data_arg.min_vocab_times = 20

    opt.n_position = 100
    dm = data_class(**data_arg)

    # Source, target and shared vocabulary sizes all come from the same
    # dataset attribute; index 0 is used as the padding index for each.
    opt.n_src_vocab = dm.valid_vocab_len
    opt.n_trg_vocab = dm.valid_vocab_len
    opt.n_vocab_size = dm.valid_vocab_len
    opt.src_pad_idx = 0
    opt.trg_pad_idx = 0
    opt.pad_idx = 0

    model = transformer_model(opt, device).to(device)

    if opt.restore is not None:
        # map_location remaps the stored tensors onto the selected device, so
        # a checkpoint written on a GPU can still be restored on a CPU run.
        checkpoint = torch.load(opt.restore, map_location=device)
        model.load_state_dict(checkpoint['net'])

    dl = cotk.dataloader.OpenSubtitles(
        opt.datapath, min_vocab_times=data_arg.min_vocab_times)
    test(model, dm, device, opt, dl)
Esempio n. 2
0
def main(args):
    """Load the dataset and word vectors, then train or test inside a TF session.

    Args:
        args: Run configuration. Reads ``debug``, ``cuda``, ``dataset``,
            ``wvclass``, ``cache``, ``datapath``, ``cache_dir``, ``wvpath``,
            ``embedding_size`` and ``mode``.

    Note:
        Only ``mode == "train"`` selects training; every other value runs
        ``test_process``.
    """
    if args.debug:
        debug()

    if args.cuda:
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
    else:
        # Hide every GPU from TensorFlow and from CUDA itself.
        config = tf.ConfigProto(device_count={'GPU': 0})
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

    data_class = SingleTurnDialog.load_class(args.dataset)
    wordvec_class = WordVector.load_class(args.wvclass)
    if wordvec_class is None:
        # Fall back to Glove when no word-vector class was resolved.
        wordvec_class = Glove
    if args.cache:
        data = try_cache(data_class, (args.datapath, ), args.cache_dir)
        vocab = data.vocab_list
        embed = try_cache(lambda wv, ez, vl: wordvec_class(wv).load(ez, vl),
                          (args.wvpath, args.embedding_size, vocab),
                          args.cache_dir, wordvec_class.__name__)
    else:
        data = data_class(args.datapath)
        wv = wordvec_class(args.wvpath)
        vocab = data.vocab_list
        embed = wv.load(args.embedding_size, vocab)

    embed = np.array(embed, dtype=np.float32)

    with tf.Session(config=config) as sess:
        model = create_model(sess, data, args, embed)
        if args.mode == "train":
            model.train_process(sess, data, args)
        else:
            model.test_process(sess, data, args)
Esempio n. 3
0
File: main.py Progetto: altale/cotk
def main(args, load_exclude_set, restoreCallback):
    """Build a ``Seq2seq`` model from ``args`` and run training or testing.

    In test mode the result mapping returned by ``test_process`` is written
    to ``./result.json``; ``bytes`` values are converted with ``str`` first so
    that they can be serialised.

    Args:
        args: Run configuration. Reads ``debug``, ``cuda``, ``dataset``,
            ``datapath``, ``wvclass``, ``wvpath``, ``embedding_size``,
            ``cache``, ``cache_dir`` and ``mode``.
        load_exclude_set: Stored on the volatile storage handed to the model.
        restoreCallback: Stored on the volatile storage handed to the model.

    Raises:
        ValueError: If ``args.mode`` is neither ``"train"`` nor ``"test"``.
    """
    # NOTE(review): filename=0 is falsy, so this presumably falls through to
    # the default stream handler rather than opening a log file -- confirm.
    logging.basicConfig(
        filename=0,
        level=logging.DEBUG,
        format='%(asctime)s %(filename)s[line:%(lineno)d] %(message)s',
        datefmt='%H:%M:%S')

    if args.debug:
        debug()
    logging.info(json.dumps(args, indent=2))

    cuda_init(0, args.cuda)

    volatile = Storage()
    volatile.load_exclude_set = load_exclude_set
    volatile.restoreCallback = restoreCallback

    data_class = SingleTurnDialog.load_class(args.dataset)
    data_arg = Storage()
    data_arg.file_id = args.datapath
    wordvec_class = WordVector.load_class(args.wvclass)
    if wordvec_class is None:
        wordvec_class = Glove

    def load_dataset(data_arg, wvpath, embedding_size):
        """Return the dataset and the word vectors loaded for its vocabulary."""
        wv = wordvec_class(wvpath)
        dm = data_class(**data_arg)
        return dm, wv.load(embedding_size, dm.vocab_list)

    if args.cache:
        dm, volatile.wordvec = try_cache(
            load_dataset, (data_arg, args.wvpath, args.embedding_size),
            args.cache_dir, data_class.__name__ + "_" + wordvec_class.__name__)
    else:
        dm, volatile.wordvec = load_dataset(data_arg, args.wvpath,
                                            args.embedding_size)

    volatile.dm = dm

    param = Storage()
    param.args = args
    param.volatile = volatile

    model = Seq2seq(param)
    if args.mode == "train":
        model.train_process()
    elif args.mode == "test":
        test_res = model.test_process()

        # bytes are not JSON serialisable; only existing keys are reassigned,
        # so mutating the mapping while iterating over it is safe.
        for key, val in test_res.items():
            if isinstance(val, bytes):
                test_res[key] = str(val)
        # Close the file deterministically so the result is flushed to disk.
        with open("./result.json", "w") as result_file:
            json.dump(test_res, result_file)
    else:
        raise ValueError("Unknown mode")
Esempio n. 4
0
def main(args, load_exclude_set, restoreCallback):
    """Set up a BERT-tokenized dataset and a ``Seq2seq`` model, then run it.

    Args:
        args: Run configuration. Reads ``debug``, ``cuda``, ``dataset``,
            ``datapath``, ``bert_vocab``, ``wvclass``, ``wvpath``,
            ``embedding_size``, ``cache``, ``cache_dir`` and ``mode``.
        load_exclude_set: Stored on the volatile storage handed to the model.
        restoreCallback: Stored on the volatile storage handed to the model.

    Raises:
        ValueError: If ``args.mode`` is neither ``"train"`` nor ``"test"``.
    """
    logging.basicConfig(
        filename=0,
        level=logging.DEBUG,
        format='%(asctime)s %(filename)s[line:%(lineno)d] %(message)s',
        datefmt='%H:%M:%S',
    )

    if args.debug:
        debug()
    logging.info(json.dumps(args, indent=2))

    cuda_init(0, args.cuda)

    shared = Storage()
    shared.load_exclude_set = load_exclude_set
    shared.restoreCallback = restoreCallback

    data_class = SingleTurnDialog.load_class(args.dataset)

    # Dataset constructor arguments: the file id carries an "#OpenSubtitles"
    # suffix and the tokenizer wraps a pretrained BERT tokenizer.
    data_arg = Storage()
    data_arg.file_id = args.datapath + "#OpenSubtitles"
    bert_tokenizer = BertTokenizer.from_pretrained(args.bert_vocab)
    data_arg.tokenizer = PretrainedTokenizer(bert_tokenizer)
    data_arg.pretrained = "bert"

    wordvec_class = WordVector.load_class(args.wvclass)
    if wordvec_class is None:
        wordvec_class = Glove

    def load_dataset(data_arg, wvpath, embedding_size):
        """Return the dataset and the embedding matrix for its frequent vocab."""
        vectors = wordvec_class(wvpath)
        dataset = data_class(**data_arg)
        matrix = vectors.load_matrix(embedding_size,
                                     dataset.frequent_vocab_list)
        return dataset, matrix

    if args.cache:
        cache_name = data_class.__name__ + "_" + wordvec_class.__name__
        loaded = try_cache(
            load_dataset,
            (data_arg, args.wvpath, args.embedding_size),
            args.cache_dir,
            cache_name,
        )
    else:
        loaded = load_dataset(data_arg, args.wvpath, args.embedding_size)

    dataset, shared.wordvec = loaded
    shared.dm = dataset

    param = Storage()
    param.args = args
    param.volatile = shared

    model = Seq2seq(param)

    # Dispatch on the requested mode; anything unexpected is an error.
    if args.mode == "train":
        model.train_process()
        return
    if args.mode == "test":
        model.test_process()
        return
    raise ValueError("Unknown mode")
Esempio n. 5
0
def train_process(opt):
    """Prepare data, model and optimizer from ``opt`` and run ``train``.

    ``opt`` is modified in place: ``cuda``, ``d_word_vec``, ``batch_size``,
    ``n_position``, the three vocabulary-size fields and the three padding
    indices are written onto it before the model is built.

    When ``opt.restore`` is set, the model weights, the optimizer state and
    the step counter are taken from that checkpoint (keys ``'net'``,
    ``'opt'`` and ``'n_steps'``).

    Args:
        opt: Options object. This function reads ``no_cuda``, ``d_model``,
            ``b``, ``datapath``, ``restore``, ``lr`` and ``n_warmup_steps``.
    """
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model
    opt.batch_size = opt.b

    device = torch.device('cuda' if opt.cuda else 'cpu')

    data_class = SingleTurnDialog.load_class('OpenSubtitles')
    data_arg = Storage()
    data_arg.file_id = opt.datapath
    data_arg.min_vocab_times = 20

    opt.n_position = 100
    dm = data_class(**data_arg)

    # Source, target and shared vocabulary sizes all come from the same
    # dataset attribute; index 0 is used as the padding index for each.
    opt.n_src_vocab = dm.valid_vocab_len
    opt.n_trg_vocab = dm.valid_vocab_len
    opt.n_vocab_size = dm.valid_vocab_len
    opt.src_pad_idx = 0
    opt.trg_pad_idx = 0
    opt.pad_idx = 0

    model = transformer_model(opt, device).to(device)

    n_steps = 0
    optimizer_ = optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09)

    if opt.restore is not None:
        # map_location remaps the stored tensors onto the selected device, so
        # a checkpoint written on a GPU can still be restored on a CPU run.
        checkpoint = torch.load(opt.restore, map_location=device)
        model.load_state_dict(checkpoint['net'])
        n_steps = checkpoint['n_steps']
        optimizer_.load_state_dict(checkpoint['opt'])

    # The scheduler wrapper resumes from the restored step count (0 if fresh).
    optimizer = ScheduledOptim(optimizer_, opt.lr, opt.d_model,
                               opt.n_warmup_steps, n_steps)

    dl = cotk.dataloader.OpenSubtitles(
        opt.datapath, min_vocab_times=data_arg.min_vocab_times)
    train(model, dm, optimizer, device, opt, dl)
Esempio n. 6
0
def main(args):
    """Load the dataset and word vectors, build ``Seq2seq`` and run it.

    Args:
        args: Run configuration. Reads ``debug``, ``cuda``, ``dataset``,
            ``wvclass``, ``cache``, ``datapath``, ``cache_dir``, ``wvpath``,
            ``embedding_size`` and ``mode``.

    Raises:
        ValueError: If ``args.mode`` is neither ``"train"`` nor ``"test"``.
    """
    logging.basicConfig(
        filename=0,
        level=logging.DEBUG,
        format='%(asctime)s %(filename)s[line:%(lineno)d] %(message)s',
        datefmt='%H:%M:%S',
    )

    if args.debug:
        debug()
    logging.info(json.dumps(args, indent=2))

    cuda_init(0, args.cuda)

    shared = Storage()
    data_class = SingleTurnDialog.load_class(args.dataset)
    wordvec_class = WordVector.load_class(args.wvclass)
    if wordvec_class is None:
        wordvec_class = Glove

    if not args.cache:
        # Direct path: construct the dataset, then load vectors for its vocab.
        dataset = data_class(args.datapath)
        vectors = wordvec_class(args.wvpath)
        shared.wordvec = vectors.load(args.embedding_size, dataset.vocab_list)
    else:
        # Cached path: both the dataset and the loaded vectors go through
        # try_cache, the latter under the word-vector class name.
        dataset = try_cache(data_class, (args.datapath, ), args.cache_dir)
        wordvec_args = (args.wvpath, args.embedding_size, dataset.vocab_list)
        shared.wordvec = try_cache(
            lambda wv, ez, vl: wordvec_class(wv).load(ez, vl),
            wordvec_args,
            args.cache_dir,
            wordvec_class.__name__,
        )

    shared.dm = dataset

    param = Storage()
    param.args = args
    param.volatile = shared

    model = Seq2seq(param)

    # Dispatch on the requested mode; anything unexpected is an error.
    if args.mode == "train":
        model.train_process()
        return
    if args.mode == "test":
        model.test_process()
        return
    raise ValueError("Unknown mode")
Esempio n. 7
0
def main(args):
    """Load data and an embedding matrix, then train or test in a TF session.

    In the non-training branch the mapping returned by ``test_process`` is
    written to ``./result.json``; ``bytes`` values are converted with ``str``
    first so that they can be serialised.

    Args:
        args: Run configuration. Reads ``debug``, ``cuda``, ``dataset``,
            ``wvclass``, ``cache``, ``datapath``, ``cache_dir``, ``wvpath``,
            ``embedding_size`` and ``mode``.

    Note:
        Only ``mode == "train"`` selects training; every other value runs
        the test branch.
    """
    if args.debug:
        debug()

    if args.cuda:
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
    else:
        # Hide every GPU from TensorFlow and from CUDA itself.
        config = tf.ConfigProto(device_count={'GPU': 0})
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

    data_class = SingleTurnDialog.load_class(args.dataset)
    wordvec_class = WordVector.load_class(args.wvclass)
    if wordvec_class is None:
        # Fall back to Glove when no word-vector class was resolved.
        wordvec_class = Glove
    if args.cache:
        data = try_cache(data_class, (args.datapath, ), args.cache_dir)
        vocab = data.frequent_vocab_list
        embed = try_cache(
            lambda wv, ez, vl: wordvec_class(wv).load_matrix(ez, vl),
            (args.wvpath, args.embedding_size, vocab), args.cache_dir,
            wordvec_class.__name__)
    else:
        data = data_class(args.datapath)
        wv = wordvec_class(args.wvpath)
        vocab = data.frequent_vocab_list
        embed = wv.load_matrix(args.embedding_size, vocab)

    embed = np.array(embed, dtype=np.float32)

    with tf.Session(config=config) as sess:
        model = create_model(sess, data, args, embed)
        if args.mode == "train":
            model.train_process(sess, data, args)
        else:
            test_res = model.test_process(sess, data, args)
            # bytes are not JSON serialisable; only existing keys are
            # reassigned, so mutating the mapping while iterating is safe.
            for key, val in test_res.items():
                if isinstance(val, bytes):
                    test_res[key] = str(val)
            # Close the file deterministically so the result is flushed.
            with open("./result.json", "w") as result_file:
                json.dump(test_res, result_file)
def main(args, load_exclude_set, restoreCallback):
    """Build the Seq2seq variant selected by ``args.model`` and run it.

    In test mode the value returned by ``test_process`` is written to
    ``./result.json``.

    Args:
        args: Run configuration. Reads ``debug``, ``device``, ``cuda``,
            ``dataset``, ``datapath``, ``model``, ``wvclass``, ``wvpath``,
            ``embedding_size``, ``cache``, ``cache_dir`` and ``mode``.
        load_exclude_set: Stored on the volatile storage handed to the model.
        restoreCallback: Stored on the volatile storage handed to the model.

    Raises:
        ValueError: If ``args.model`` is not one of ``"basic"``, ``"raml"``,
            ``"scheduled-sampling"`` or ``"policy-gradient"``, or if
            ``args.mode`` is neither ``"train"`` nor ``"test"``.
    """
    logging.basicConfig(
        filename=0,
        level=logging.DEBUG,
        format='%(asctime)s %(filename)s[line:%(lineno)d] %(message)s',
        datefmt='%H:%M:%S')

    if args.debug:
        debug()
    logging.info(json.dumps(args, indent=2))

    cuda_init(args.device, args.cuda)

    volatile = Storage()
    volatile.load_exclude_set = load_exclude_set
    volatile.restoreCallback = restoreCallback

    data_class = SingleTurnDialog.load_class(args.dataset)
    data_arg = Storage()
    data_arg.file_id = args.datapath

    # RAML parameters
    if args.model == "raml":
        data_arg.raml_file = "samples_iwslt14.txt"
        # NOTE(review): `10 or args.n_samples` always evaluates to 10, so
        # args.n_samples is never read. Left unchanged to keep existing runs
        # reproducible -- confirm whether `args.n_samples or 10` was intended.
        data_arg.num_samples = 10 or args.n_samples
        data_arg.tau = 0.4

    # NOTE(review): unlike similar entry points, no fallback is applied when
    # load_class returns None -- confirm that cannot happen here.
    wordvec_class = WordVector.load_class(args.wvclass)

    def load_dataset(data_arg, wvpath, embedding_size):
        """Return the dataset and the embedding matrix for its vocabulary."""
        wv = wordvec_class(wvpath)
        dm = data_class(**data_arg)
        return dm, wv.load_matrix(embedding_size, dm.vocab_list)

    if args.cache:
        dm, volatile.wordvec = try_cache(
            load_dataset, (data_arg, args.wvpath, args.embedding_size),
            args.cache_dir, data_class.__name__ + "_" + wordvec_class.__name__)
    else:
        dm, volatile.wordvec = load_dataset(data_arg, args.wvpath,
                                            args.embedding_size)

    volatile.dm = dm

    param = Storage()
    param.args = args
    param.volatile = volatile

    if args.model == "basic":
        model = Seq2seq(param)
    elif args.model == "raml":
        model = Seq2seqRAML(param)
    elif args.model == "scheduled-sampling":
        model = Seq2seqSS(param)
    elif args.model == "policy-gradient":
        model = Seq2seqPG(param)
    else:
        # Without this branch `model` stays unbound and the failure surfaces
        # later as an UnboundLocalError with no hint about the cause.
        raise ValueError("Unknown model")

    if args.mode == "train":
        model.train_process()
    elif args.mode == "test":
        test_res = model.test_process()

        # Close the file deterministically so the result is flushed to disk.
        with open("./result.json", "w") as result_file:
            json.dump(test_res, result_file)
    else:
        raise ValueError("Unknown mode")
Esempio n. 9
0
    def base_test_init(self, dl):
        """Check constructor errors and the initial state of a loaded dataloader.

        Verifies that ``SingleTurnDialog`` rejects construction with only a
        ``pretrained`` value or with no arguments, then inspects the types and
        sizes of the attributes exposed by ``dl`` and the first/last token ids
        of the first post and first response in every data set.

        Args:
            dl: An already constructed ``SingleTurnDialog`` instance.
        """
        # Passing only `pretrained` (whatever its value) must be rejected.
        for pretrained in ('none', 'gpt2', 'bert'):
            with pytest.raises(ValueError):
                SingleTurnDialog(
                    "./tests/dataloader/dummy_opensubtitles#OpenSubtitles",
                    pretrained=pretrained)

        assert isinstance(dl, SingleTurnDialog)
        assert isinstance(dl.file_id, str)
        assert isinstance(dl.file_path, str)
        for set_name, fields in dl.fields.items():
            assert isinstance(set_name, str)
            assert isinstance(fields, dict)
            for field_name, field in fields.items():
                assert isinstance(field_name, str)
                assert isinstance(field, Field)

        assert isinstance(dl.vocabs, list)
        for vocab in dl.vocabs:
            assert isinstance(vocab, Vocab)
        assert isinstance(dl.tokenizers, list)
        for toker in dl.tokenizers:
            assert isinstance(toker, Tokenizer)

        # Every stored list must have one entry per index of its set.
        for data, index in zip(dl.data.values(), dl.index.values()):
            assert isinstance(data, dict)
            assert isinstance(index, list)
            for content in data.values():
                assert isinstance(content, dict)
                for each_content in content.values():
                    assert isinstance(each_content, list)
                    assert len(index) == len(each_content)
        for batch_id in dl.batch_id.values():
            assert batch_id == 0
        for batch_size in dl.batch_size.values():
            assert batch_size is None

        assert isinstance(dl.frequent_vocab_list, list)
        assert dl.frequent_vocab_size == len(dl.frequent_vocab_list)
        assert isinstance(dl.all_vocab_list, list)
        assert dl.all_vocab_size == len(dl.all_vocab_list)
        assert dl.all_vocab_size >= dl.frequent_vocab_size

        def check_boundary_tokens(ids):
            """Assert the first sentence starts and ends with the expected ids."""
            if dl._pretrained is None or dl._pretrained == "gpt2":
                assert ids[0][0] == dl.go_id
                assert ids[0][-1] == dl.eos_id
            else:  # dl._pretrained == "bert"
                assert ids[0][0] == dl.get_special_tokens_id("cls")
                assert ids[0][-1] == dl.get_special_tokens_id("sep")

        for data in dl.data.values():
            post = data['post']
            post_ids = post['id']
            assert isinstance(post_ids, list)
            assert isinstance(post_ids[0], list)
            check_boundary_tokens(post_ids)
            post_strs = post['str']
            assert isinstance(post_strs, list)
            assert isinstance(post_strs[0], str)

            resp = data['resp']
            resp_ids = resp['id']
            assert isinstance(resp_ids, list)
            assert isinstance(resp_ids[0], list)
            # Check the response ids themselves; the previous version
            # re-checked post_ids here, leaving responses unverified.
            check_boundary_tokens(resp_ids)
            resp_strs = resp['str']
            assert isinstance(resp_strs, list)
            assert isinstance(resp_strs[0], str)

            # Same set of stored representations for both fields ...
            assert len(post) == len(resp)
            # ... and the same number of samples.
            assert len(post_ids) == len(resp_ids)

        with pytest.raises(TypeError):
            SingleTurnDialog()