Example #1
def predict(args, model_name, restore_path):
    intent_dict = pkl.load(open(PATH + "/intent_dict.p", 'rb'))
    args.id2intent = {v: k for k, v in intent_dict.items()}
    vocab = WordVocab.load_vocab(PATH + args.vocab_path)
    poss_vocab = pkl.load(open(PATH + "/poss_vocab.p", 'rb'))

    args.num_layers = 1
    args.vocab_size = len(vocab)
    args.class_nums = len(intent_dict)
    args.poss_num = len(poss_vocab)

    if args.use_pre_train_emb:
        vocab_emb = pkl.load(open('%s_vocab_emb.p' % args.task_name, 'rb'))
        args.vocab_emb = vocab_emb

    if model_name == 'BaseLSTM':
        model = BaseLstm(args, 'BaseLstm')
    elif model_name == 'BaseLstmStruct':
        model = BaseLstmStruct(args, 'BaseLstmStruct')
    elif model_name == 'BaseTransformerStruct':
        model = BaseTransformerStruct(args, 'BaseTransformerStruct')
    elif model_name == 'cnn':
        model = Cnn(args, 'cnn')
    elif model_name == 'TransformerCNN':
        model = TransformerCNN(args, 'TransformerCNN')
    elif model_name == 'LEAM':
        model = LEAM(args, 'LEAM')
    else:
        raise ValueError('unknown model_name: %s' % model_name)
    args.model_name = model_name
    model.build_placeholder()
    model.build_model()
    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        sess = model.restore(sess, restore_path)
        pdd = PredictDataDeal(vocab=vocab,
                              seq_len=args.seq_len,
                              poss_vocab=poss_vocab,
                              vocab_char=None)
        while True:
            sent = input("Input: ")
            t1, t1_len, poss = pdd.predict(sent)
            pre_prob, pre_label = model.predict(sess, t1, t1_len, poss)
            print(args.id2intent[pre_label[0]], np.max(pre_prob, -1))
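A hedged sketch of how predict() might be driven, reusing the config loading from Example #4 and the checkpoint naming from Example #10; the config path and the model choice here are assumptions, not paths taken from this example.

# Hypothetical driver; './Configs/BaseLstm.config' mirrors Example #4 and the
# checkpoint name mirrors the save path used by train() in Example #10.
import json

if __name__ == '__main__':
    args_dict = json.load(open('./Configs/BaseLstm.config', 'r'))
    args = dict_to_object(dict(args_dict))
    restore_path = PATH + "/output/%s_%s_2kw.ckpt" % ('BaseLSTM', args.task_name)
    predict(args, 'BaseLSTM', restore_path)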
Example #2
from dataset.dataset import BERTDatasetCreator
from dataset import WordVocab
from torch.utils.data import DataLoader
import argparse
import tqdm

parser = argparse.ArgumentParser()
parser.add_argument("-v", "--vocab_path", required=True, type=str)
parser.add_argument("-c", "--corpus_path", required=True, type=str)
parser.add_argument("-e", "--encoding", default="utf-8", type=str)
parser.add_argument("-o", "--output_path", required=True, type=str)
args = parser.parse_args()

word_vocab = WordVocab.load_vocab(args.vocab_path)
builder = BERTDatasetCreator(corpus_path=args.corpus_path,
                             vocab=word_vocab,
                             seq_len=None,
                             encoding=args.encoding)

with open(args.output_path, 'w', encoding=args.encoding) as f:
    for index in tqdm.tqdm(range(len(builder)),
                           desc="Building Dataset",
                           total=len(builder)):
        data = builder[index]
        output_form = "%s\t%s\t%s\t%s\t%d\n"
        t1_text, t2_text = [
            " ".join(t) for t in [data["t1_random"], data["t2_random"]]
        ]
        t1_label, t2_label = [
            " ".join([str(i) for i in label])
            for label in [data["t1_label"], data["t2_label"]]
        ]
        # write one tab-separated record per example; "is_next" is assumed to be
        # the next-sentence label produced by BERTDatasetCreator
        f.write(output_form %
                (t1_text, t2_text, t1_label, t2_label, data["is_next"]))
Example #3
def train():
    parser = argparse.ArgumentParser()

    parser.add_argument("-c",
                        "--train_dataset",
                        required=True,
                        type=str,
                        help="train dataset for train bert")
    parser.add_argument("-t",
                        "--valid_dataset",
                        required=True,
                        type=str,
                        help="valid set for evaluate train set")
    parser.add_argument("-v",
                        "--vocab_path",
                        required=True,
                        type=str,
                        help="built vocab model path with vocab")
    parser.add_argument("-o",
                        "--output_path",
                        required=True,
                        type=str,
                        help="output/bert.model")

    parser.add_argument("-w",
                        "--num_workers",
                        type=int,
                        default=0,
                        help="dataloader worker size")
    parser.add_argument("--with_cuda",
                        type=bool,
                        default=True,
                        help="training with CUDA: true, or false")
    parser.add_argument("--corpus_lines",
                        type=int,
                        default=None,
                        help="total number of lines in corpus")
    parser.add_argument("--cuda_devices",
                        type=int,
                        nargs='+',
                        default=[0, 1, 2, 3],
                        help="CUDA device ids")
    parser.add_argument("--on_memory",
                        type=bool,
                        default=True,
                        help="Loading on memory: true or false")

    args = parser.parse_args()
    paths = Paths(args.output_path)

    print("Loading Vocab", args.vocab_path)
    vocab = WordVocab.load_vocab(args.vocab_path)
    print("Vocab Size: ", vocab.vocab_size)
    args.char_nums = vocab.vocab_size

    print("Loading Train Dataset", args.train_dataset)
    train_dataset = BERTDataset(args.train_dataset,
                                vocab,
                                corpus_lines=args.corpus_lines,
                                on_memory=args.on_memory)

    print("Loading Valid Dataset", args.valid_dataset)
    valid_dataset = BERTDataset(args.valid_dataset, vocab, on_memory=args.on_memory) \
        if args.valid_dataset is not None else None

    print("Creating Dataloader")
    train_data_loader = DataLoader(train_dataset,
                                   batch_size=hp.batch_size,
                                   collate_fn=lambda batch: collate_mlm(batch),
                                   num_workers=args.num_workers,
                                   shuffle=False)
    valid_data_loader = DataLoader(valid_dataset,
                                   batch_size=hp.batch_size,
                                   collate_fn=lambda batch: collate_mlm(batch),
                                   num_workers=args.num_workers,
                                   shuffle=False) \
        if valid_dataset is not None else None

    print("Building BERT model")
    bert = BERT(embed_dim=hp.embed_dim, hidden=hp.hidden, args=args)

    print("Creating BERT Trainer")
    trainer = BERTTrainer(bert,
                          vocab.vocab_size,
                          train_dataloader=train_data_loader,
                          test_dataloader=valid_data_loader,
                          with_cuda=args.with_cuda,
                          cuda_devices=args.cuda_devices,
                          args=args,
                          path=paths)

    print("Training Start")

    trainer.train()
Example #4
    def __init__(self, model_name='/'):
        self.sess_dict = {}
        self.queryObj_dict = {}
        config = tf.ConfigProto(allow_soft_placement=True)

        model_name_meta = '%s.meta' % model_name
        saver = tf.train.import_meta_graph(model_name_meta)  # load the saved graph structure
        gragh = tf.get_default_graph()  # get the default graph so variables can be restored later
        tensor_name_list = [
            tensor.name for tensor in gragh.as_graph_def().node
        ]  # names of all nodes in the current graph
        for ele in tensor_name_list:
            if 'out_softmax' in str(ele):
                print(ele)
        args_dict = json.load(open('./Configs/BaseLstm.config', 'r'))
        self.args = dict_to_object(dict(args_dict))
        self.label_vocab1 = pkl.load(open("./label_vocab1.p", 'rb'))
        self.label_vocab2 = pkl.load(open("./label_vocab2.p", 'rb'))
        self.label_vocab3 = pkl.load(open("./label_vocab3.p", 'rb'))
        self.vocab = WordVocab.load_vocab(PATH + self.args.vocab_path)
        self.poss_vocab = pkl.load(open("./poss_vocab.p", 'rb'))

        with tf.device('/device:GPU:%s' % 0):
            self.sent_token = gragh.get_tensor_by_name('sent1_token:0')
            self.sent_char = gragh.get_tensor_by_name('sent1_char:0')

            self.sent_word_re = gragh.get_tensor_by_name('sent_word_re:0')
            self.sent_word_re_char = gragh.get_tensor_by_name(
                'sent_word_re_char:0')

            self.sent_len = gragh.get_tensor_by_name('sent1_len:0')
            self.sent_len_char = gragh.get_tensor_by_name('sent_len_char:0')

            self.sent_len_re = gragh.get_tensor_by_name('sent1_len_re:0')
            self.sent_len_re_char = gragh.get_tensor_by_name(
                'sent1_len_re_char:0')

            self.sent_token_neg = gragh.get_tensor_by_name('sent1_token_neg:0')
            self.sent_len_neg = gragh.get_tensor_by_name('sent1_len_neg:0')
            self.sent_char_neg = gragh.get_tensor_by_name('sent_char_neg:0')
            self.sent_char_len_neg = gragh.get_tensor_by_name(
                'sent_char_len_neg:0')

            self.key_emb = gragh.get_tensor_by_name('key_emb:0')

            self.dropout = gragh.get_tensor_by_name('dropout:0')

            name = model_name.split('/')[-1].split('_')[0].replace(
                "BaseLSTM", "BaseLstm")
            try:
                self.soft_out_1 = gragh.get_tensor_by_name(
                    '%s_enc_0/_0/softmax/Softmax:0' % name)
                self.soft_out_2 = gragh.get_tensor_by_name(
                    '%s_enc_1/_1/softmax/Softmax:0' % name)
                self.soft_out_3 = gragh.get_tensor_by_name(
                    '%s_enc_2/_2/softmax/Softmax:0' % name)
            except:
                self.soft_out_1 = gragh.get_tensor_by_name(
                    '%s_enc_0/_0/out_softmax/softmax/Softmax:0' % name)
                self.soft_out_2 = gragh.get_tensor_by_name(
                    '%s_enc_1/_1/out_softmax/softmax/Softmax:0' % name)
                self.soft_out_3 = gragh.get_tensor_by_name(
                    '%s_enc_2/_2/out_softmax/softmax/Softmax:0' % name)

            try:
                self.smentic_out_1 = gragh.get_tensor_by_name(
                    '%s_enc_0/_0/semantic_out/concat:0' % name)
                self.smentic_out_2 = gragh.get_tensor_by_name(
                    '%s_enc_1/_1/semantic_out/concat:0' % name)
                self.smentic_out_3 = gragh.get_tensor_by_name(
                    '%s_enc_2/_2/semantic_out/concat:0' % name)
            except:
                pass

            self.sess = tf.Session(config=config)
            saver.restore(self.sess, '%s' % model_name)

            self.pdd = PredictDataDeal(vocab=self.vocab,
                                       seq_len=self.args.seq_len,
                                       poss_vocab=self.poss_vocab)
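    # A hypothetical helper on the same class, sketching how the tensors
    # restored in __init__ could be evaluated. Which placeholders the model
    # actually needs to be fed is an assumption based on Example #1, as is
    # feeding dropout=1.0 (keep everything) at inference time.
    def predict_intents(self, sent):
        t1, t1_len, _ = self.pdd.predict(sent)
        return self.sess.run(
            [self.soft_out_1, self.soft_out_2, self.soft_out_3],
            feed_dict={self.sent_token: t1,
                       self.sent_len: t1_len,
                       self.dropout: 1.0})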
Example #5
def test():
    os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

    parser = argparse.ArgumentParser()

    parser.add_argument("-c",
                        "--train_dataset",
                        type=str,
                        help="train dataset for train bert",
                        default='./data/corpus_pre.txt')
    parser.add_argument("-t",
                        "--valid_dataset",
                        type=str,
                        help="valid set for evaluate train set",
                        default='./data/corpus_pre.txt')
    parser.add_argument("-v",
                        "--vocab_path",
                        type=str,
                        help="built vocab model path with vocab",
                        default='./data/vocab.test')
    parser.add_argument("-o",
                        "--output_path",
                        type=str,
                        help="output/bert.model",
                        default='./output')

    parser.add_argument("-w",
                        "--num_workers",
                        type=int,
                        default=0,
                        help="dataloader worker size")
    parser.add_argument("--with_cuda",
                        type=bool,
                        default=False,
                        help="training with CUDA: true, or false")
    parser.add_argument("--corpus_lines",
                        type=int,
                        default=None,
                        help="total number of lines in corpus")
    parser.add_argument("--cuda_devices",
                        type=int,
                        nargs='+',
                        default=[0, 1, 2, 3],
                        help="CUDA device ids")
    parser.add_argument("--on_memory",
                        type=bool,
                        default=True,
                        help="Loading on memory: true or false")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")

    args = parser.parse_args()
    set_seed(args)
    paths = Paths(args.output_path)

    print("Loading Vocab", args.vocab_path)
    vocab = WordVocab.load_vocab(args.vocab_path)
    print("Vocab Size: ", vocab.vocab_size)
    args.char_nums = vocab.vocab_size

    print("Loading Train Dataset", args.train_dataset)
    train_dataset = BERTDataset(args.train_dataset,
                                vocab,
                                corpus_lines=args.corpus_lines,
                                on_memory=args.on_memory)

    print("Loading Valid Dataset", args.valid_dataset)
    valid_dataset = BERTDataset(args.valid_dataset, vocab, on_memory=args.on_memory) \
        if args.valid_dataset is not None else None
    print("Creating Dataloader")
    train_data_loader = DataLoader(train_dataset,
                                   batch_size=hp.batch_size,
                                   collate_fn=lambda batch: collate_mlm(batch),
                                   num_workers=args.num_workers,
                                   shuffle=False)  # the training corpus is already sorted by length
    valid_data_loader = DataLoader(valid_dataset, batch_size=hp.batch_size, collate_fn=lambda batch: collate_mlm(batch),
                                   num_workers=args.num_workers, shuffle=False) \
        if valid_dataset is not None else None

    print("Load BERT model")
    bert = torch.load('./output/model_bert/bert_ep10.model')
    model = torch.load('./output/model_mlm/mlm_ep10.model')
    print("Creating BERT Trainer")
    trainer = BERTTrainer(bert,
                          vocab.vocab_size,
                          model,
                          train_dataloader=train_data_loader,
                          test_dataloader=valid_data_loader,
                          with_cuda=args.with_cuda,
                          cuda_devices=args.cuda_devices,
                          args=args,
                          path=paths)

    print("Training Start")

    trainer.evaluate_and_print(vocab)
Example #6
import argparse
from dataset import WordVocab

parser = argparse.ArgumentParser()
parser.add_argument("-c", "--corpus_path", required=True, type=str)
parser.add_argument("-o", "--output_path", required=True, type=str)
parser.add_argument("-s", "--vocab_size", type=int, default=None)
parser.add_argument("-e", "--encoding", type=str, default="utf-8")
parser.add_argument("-m", "--min_freq", type=int, default=1)
args = parser.parse_args()

with open(args.corpus_path, "r", encoding=args.encoding) as f:
    vocab = WordVocab(f, max_size=args.vocab_size, min_freq=args.min_freq)

vocab.save_vocab(args.output_path)
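A minimal round-trip sketch for the vocab written above; the file name and sample sentence are placeholders, and to_seq is the encoding call used in Example #8.

# Placeholder path; load the saved vocab back and encode a tokenized sentence.
from dataset import WordVocab

vocab = WordVocab.load_vocab('output/word_vocab.pkl')
print("Vocab Size:", len(vocab))
ids = vocab.to_seq("hello world".split())
print(ids)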
Example #7
def train():
    parser = argparse.ArgumentParser()

    parser.add_argument("-c", "--train_dataset", required=True, type=str, help="train dataset for train bert")
    parser.add_argument("-t", "--test_dataset", type=str, default=None, help="test set for evaluate train set")
    parser.add_argument("-v", "--vocab_path", required=True, type=str, help="built vocab model path with bert-vocab")
    parser.add_argument("-o", "--output_path", required=True, type=str, help="ex)output/bert.model")

    parser.add_argument("-hs", "--hidden", type=int, default=256, help="hidden size of transformer model")
    parser.add_argument("-l", "--layers", type=int, default=8, help="number of layers")
    parser.add_argument("-a", "--attn_heads", type=int, default=8, help="number of attention heads")
    parser.add_argument("-s", "--seq_len", type=int, default=20, help="maximum sequence len")

    parser.add_argument("-b", "--batch_size", type=int, default=64, help="number of batch_size")
    parser.add_argument("-e", "--epochs", type=int, default=10, help="number of epochs")
    parser.add_argument("-w", "--num_workers", type=int, default=5, help="dataloader worker size")

    parser.add_argument("--with_cuda", type=bool, default=True, help="training with CUDA: true, or false")
    parser.add_argument("--log_freq", type=int, default=10, help="printing loss every n iter: setting n")
    parser.add_argument("--corpus_lines", type=int, default=None, help="total number of lines in corpus")
    parser.add_argument("--cuda_devices", type=int, nargs='+', default=None, help="CUDA device ids")
    parser.add_argument("--on_memory", type=bool, default=True, help="Loading on memory: true or false")

    parser.add_argument("--lr", type=float, default=1e-3, help="learning rate of adam")
    parser.add_argument("--adam_weight_decay", type=float, default=0.01, help="weight_decay of adam")
    parser.add_argument("--adam_beta1", type=float, default=0.9, help="adam first beta value")
    parser.add_argument("--adam_beta2", type=float, default=0.999, help="adam first beta value")

    args = parser.parse_args()

    print("Loading Vocab", args.vocab_path)
    vocab = WordVocab.load_vocab(args.vocab_path)
    print("Vocab Size: ", len(vocab))

    print("Loading Train Dataset", args.train_dataset)
    train_dataset = BERTDataset(args.train_dataset, vocab, seq_len=args.seq_len,
                                corpus_lines=args.corpus_lines, on_memory=args.on_memory)

    print("Loading Test Dataset", args.test_dataset)
    test_dataset = BERTDataset(args.test_dataset, vocab, seq_len=args.seq_len, on_memory=args.on_memory) \
        if args.test_dataset is not None else None

    print("Creating Dataloader")
    train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size, num_workers=args.num_workers)
    test_data_loader = DataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers) \
        if test_dataset is not None else None

    print("Building BERT model")
    bert = BERT(len(vocab), hidden=args.hidden, n_layers=args.layers, attn_heads=args.attn_heads)

    print("Creating BERT Trainer")
    trainer = BERTTrainer(bert, len(vocab), train_dataloader=train_data_loader, test_dataloader=test_data_loader,
                          lr=args.lr, betas=(args.adam_beta1, args.adam_beta2), weight_decay=args.adam_weight_decay,
                          with_cuda=args.with_cuda, cuda_devices=args.cuda_devices, log_freq=args.log_freq)

    print("Training Start")
    for epoch in range(args.epochs):
        print("eee")
        trainer.train(epoch)
        trainer.save(epoch, args.output_path)

        if test_data_loader is not None:
            trainer.test(epoch)
Example #8
    corpus_path = os.path.join(args.dir_path, 'paths.csv')
    contexts_path = os.path.join(args.dir_path, 'path_contexts.csv')

    nodes_vocab = pd.read_csv(node_path)
    nodes_vocab['node_type'] = nodes_vocab.apply(
        lambda x: '_'.join(x['node_type'].split()), axis=1)
    node_dict = nodes_vocab.set_index('id').to_dict(orient='dict')
    node_dict = node_dict['node_type']
    paths = pd.read_csv(corpus_path)
    paths = paths.apply(lambda x: ' '.join(
        [node_dict.get(int(i), '<unk>') for i in x['path'].split(' ')]),
                        axis=1)
    path_list = paths.values.tolist()

    vocab = WordVocab(path_list,
                      max_size=args.vocab_size,
                      min_freq=args.min_freq)
    print("VOCAB SIZE:", len(vocab))
    vocab.save_vocab(os.path.join(args.output_dir_path, 'path_vocab.pickle'))

    f = open(os.path.join(args.output_dir_path, 'nl_vocab.pickle'), 'rb')
    nl_vocab = pickle.load(f)
    f.close()

    def process_tokens(x):
        split_list = split_camel(x['token'], x)
        split_list = nl_vocab.to_seq(split_list)
        return ' '.join([str(i) for i in split_list])

    tokens_paths = [
        os.path.join(args.dir_path, 'tokens.csv'),
Example #9
def build_vocab(corpus_path, vocab_path, mode):
    with open(corpus_path, "r", encoding='utf-8') as f:
        vocab = WordVocab(f, max_size=None, min_freq=1, mode=mode)
        print("VOCAB SIZE:", len(vocab))
        vocab.save_vocab(vocab_path)
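A one-line usage sketch for the helper above; the paths are placeholders, and mode='word_char' mirrors the call made in Example #10.

# Placeholder paths for illustration only.
build_vocab('./data/train_corpus.txt', './data/word_vocab.p', mode='word_char')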
Example #10
def train(args, model_name):

    if not os.path.exists(PATH + args.output_path):
        os.mkdir(PATH + args.output_path)

    _logger.info("new_vocab:%s" % args.new_vocab)
    _logger.info("use_tfrecord:%s" % args.use_tfrecord)
    _logger.info("train_dataset:%s" % args.train_dataset)
    _logger.info("test_dataset:%s" % args.test_dataset)
    _logger.info("task_name:%s" % args.task_name)
    _logger.info("model_name:%s" % args.model_name)
    _logger.info("new_tfrecord:%s" % args.new_tfrecord)
    _logger.info("restore_model:%s" % args.restore_model)
    _logger.info("use_pre_train_emb:%s" % args.use_pre_train_emb)

    _logger.info("build label vocab")
    if args.new_label_vocab:
        intent_dict = build_label_vocab(args.train_dataset, args.test_dataset)
        _logger.info("%s %s" % (intent_dict, len(intent_dict)))
        pkl.dump(intent_dict, open(PATH + "/intent_dict.p", 'wb'))

    intent_dict = pkl.load(open(PATH + "/intent_dict.p", 'rb'))
    args.id2intent = {v: k for k, v in intent_dict.items()}

    ### load word_vocab
    if not args.new_vocab and os.path.exists(PATH + args.vocab_path):
        _logger.info("Loading Vocab: %s" % (PATH + args.vocab_path))
        vocab = WordVocab.load_vocab(PATH + args.vocab_path)
    else:
        _logger.info("build vocab")
        build_vocab(args.train_dataset,
                    PATH + args.vocab_path,
                    mode='word_char')
        _logger.info("Loading Vocab: %s" % (PATH + args.vocab_path))
        vocab = WordVocab.load_vocab(PATH + args.vocab_path)

    _logger.info("Vocab Size:%s" % (len(vocab)))

    poss_vocab = build_poss_vocab(args.train_dataset, args.test_dataset)
    pkl.dump(poss_vocab, open(PATH + "/poss_vocab.p", 'wb'))
    poss_vocab = pkl.load(open(PATH + "/poss_vocab.p", 'rb'))

    ### load pre_train Embedding
    # print(vocab.stoi)
    args.num_layers = 1
    args.vocab_size = len(vocab)
    args.class_nums = len(intent_dict)
    args.poss_num = len(poss_vocab)

    if args.use_pre_train_emb:
        if args.new_pre_vocab:
            pre_emb_cls = PreTrainVocab(args.pre_train_emb_path,
                                        args.pre_train_emb_size)
            vocab_emb = pre_emb_cls.getEmbeddimhArray(vocab)
            pkl.dump(vocab_emb, open('%s_vocab_emb.p' % args.task_name, 'wb'))
            args.vocab_emb = vocab_emb
        else:
            vocab_emb = pkl.load(open('%s_vocab_emb.p' % args.task_name, 'rb'))
            args.vocab_emb = vocab_emb
        _logger.info('loaded pre_train_emb; emb_array size: %s' %
                     (len(vocab_emb)))

    ### build tfrecord
    if not os.path.exists(PATH +
                          args.train_tfrecord_path) or not os.path.exists(
                              PATH +
                              args.test_tfrecord_path) or args.new_tfrecord:
        _logger.info('building tfrecords')
        DataSetTfrecord(
            args.train_dataset,
            vocab,
            args.seq_len,
            intent_dict=intent_dict,
            poss_vocab=poss_vocab,
            out_path=PATH + args.train_tfrecord_path,
        )
        DataSetTfrecord(args.test_dataset,
                        vocab,
                        args.seq_len,
                        poss_vocab=poss_vocab,
                        intent_dict=intent_dict,
                        out_path=PATH + args.test_tfrecord_path)
    _read_tfRecord = read_tfRecord(args.seq_len, args.batch_size)
    _logger.info("loading tfrecords")
    train_data_loader = _read_tfRecord(PATH + args.train_tfrecord_path)
    test_data_loader = _read_tfRecord(PATH + args.test_tfrecord_path)
    with open(PATH + args.train_tfrecord_path + ".index", 'r',
              encoding='utf-8') as f:
        train_num = int(f.readline())
    with open(PATH + args.test_tfrecord_path + ".index", 'r',
              encoding='utf-8') as f:
        test_num = int(f.readline())
    _logger.info('train_num:%s  test_num:%s' % (train_num, test_num))
    args.train_num = train_num
    args.test_num = test_num

    _logger.info('%s  batch_size:%s  use_tfrecord:%s' %
                 (args.model_name, args.batch_size, args.use_tfrecord))
    for index, e in enumerate(train_data_loader):
        if index % 10:
            print(e)

    # ### model selection (e.g. BaseTransformerStruct)
    # model_name = args.model_name
    if model_name == 'BaseLSTM':
        model = BaseLstm(args, 'BaseLstm')
    elif model_name == 'BaseLstmStruct':
        model = BaseLstmStruct(args, 'BaseLstmStruct')
    elif model_name == 'BaseTransformerStruct':
        model = BaseTransformerStruct(args, 'BaseTransformerStruct')
    elif model_name == 'cnn':
        model = Cnn(args, 'cnn')
    elif model_name == 'TransformerCNN':
        model = TransformerCNN(args, 'TransformerCNN')
    elif model_name == 'LEAM':
        model = LEAM(args, 'LEAM')
    else:
        raise ValueError('unknown model_name: %s' % model_name)
    args.model_name = model_name
    model.build_placeholder()
    model.build_model()
    model.build_accuracy()
    model.build_loss()
    model.build_op()

    if args.restore_model == '':
        model.train(train_data_loader,
                    test_data_loader,
                    restore_model=None,
                    save_model=PATH + "/output/%s_%s_2kw.ckpt" %
                    (model_name, args.task_name))
    else:
        model.train(train_data_loader,
                    test_data_loader,
                    restore_model=PATH + args.restore_model,
                    save_model=PATH + "/output/%s_%s_2kw.ckpt" %
                    (model_name, args.task_name))
Example #11
    def __init__(self, bert, vocab_path):
        # load vocab for tokenization
        self.vocab = WordVocab.load_vocab(vocab_path)
        # load the pretrained BERT model
        self.bert = bert
Example #12
def main(args):

    ts = time.strftime('%Y-%b-%d-%H:%M:%S', time.gmtime())

    print("Loading Vocab", args.vocab_path)
    vocab = WordVocab.load_vocab(args.vocab_path)
    print("Vocab Size: ", len(vocab))

    print("Loading Train Dataset", args.train_dataset)
    train_dataset = BERTDataset(args.train_dataset, vocab, seq_len=args.max_sequence_length,
                                corpus_lines=args.corpus_lines, on_memory=args.on_memory)

    print("Loading Test Dataset", args.test_dataset)
    test_dataset = BERTDataset(args.test_dataset, vocab, seq_len=args.max_sequence_length, on_memory=args.on_memory) \
        if args.test_dataset is not None else None

    print("Creating Dataloader")
    train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size, num_workers=args.num_workers)
    test_data_loader = DataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers) \
        if test_dataset is not None else None

    splits = ['train', 'test']
    data_loaders = {
        'train': train_data_loader,
        'test': test_data_loader
    }

    model = SentenceVAE(
        vocab_size=len(vocab),
        sos_idx=vocab.sos_index,
        eos_idx=vocab.eos_index,
        pad_idx=vocab.pad_index,
        unk_idx=vocab.unk_index,
        max_sequence_length=args.max_sequence_length,
        embedding_size=args.embedding_size,
        rnn_type=args.rnn_type,
        hidden_size=args.hidden_size,
        word_dropout=args.word_dropout,
        embedding_dropout=args.embedding_dropout,
        latent_size=args.latent_size,
        num_layers=args.num_layers,
        bidirectional=args.bidirectional
        )

    if torch.cuda.is_available():
        model = model.cuda()

    print(model)

    if args.tensorboard_logging:
        writer = SummaryWriter(os.path.join(args.logdir, expierment_name(args,ts)))
        writer.add_text("model", str(model))
        writer.add_text("args", str(args))
        writer.add_text("ts", ts)

    save_model_path = os.path.join(args.save_model_path)
    if not os.path.exists(save_model_path):
        os.makedirs(save_model_path)

    def kl_anneal_function(anneal_function, step, k, x0):
        if anneal_function == 'logistic':
            return float(1/(1+np.exp(-k*(step-x0))))
        elif anneal_function == 'linear':
            return min(1, step/x0)

    NLL = torch.nn.NLLLoss(size_average=False, ignore_index=vocab.pad_index)
    def loss_fn(logp, target, length, mean, logv, anneal_function, step, k, x0):

        # Negative Log Likelihood
        NLL_loss = NLL(logp, target)

        # KL Divergence
        KL_loss = -0.5 * torch.sum(1 + logv - mean.pow(2) - logv.exp())
        KL_weight = kl_anneal_function(anneal_function, step, k, x0)

        return NLL_loss, KL_loss, KL_weight

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    tensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.Tensor
    step = 0
    for epoch in range(args.epochs):

        for split in splits:

            data_loader = data_loaders[split]

            tracker = defaultdict(tensor)

            # Enable/Disable Dropout
            if split == 'train':
                model.train()
            else:
                model.eval()

            correct = 0
            close = 0
            total = 0
            for iteration, batch in enumerate(data_loader):

                batch_size = batch['input'].size(0)

                for k, v in batch.items():
                    if torch.is_tensor(v):
                        batch[k] = to_var(v)

                # Forward pass
                logp, mean, logv, z = model(batch['input'], batch['raw_length'])

                # loss calculation
                NLL_loss, KL_loss, KL_weight = loss_fn(logp, batch['target'],
                    batch['raw_length'], mean, logv, args.anneal_function, step, args.k, args.x0)

                loss = (NLL_loss + KL_weight * KL_loss)/batch_size

                # backward + optimization
                if split == 'train':
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    step += 1

                correct += logp.argmax(dim=1).eq(batch['target']).sum().item()
                close += torch.mul(logp.argmax(dim=1).ge(batch["target"]-10), logp.argmax(dim=1).le(batch["target"]+10)).sum().item()
                total += batch['target'].nelement()


                # bookkeeping
                tracker['ELBO'] = torch.cat((tracker['ELBO'], loss.detach().view(1,)))

                if args.tensorboard_logging:
                    writer.add_scalar("%s/ELBO"%split.upper(), loss.data[0], epoch*len(data_loader) + iteration)
                    writer.add_scalar("%s/NLL Loss"%split.upper(), NLL_loss.data[0]/batch_size, epoch*len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Loss"%split.upper(), KL_loss.data[0]/batch_size, epoch*len(data_loader) + iteration)
                    writer.add_scalar("%s/KL Weight"%split.upper(), KL_weight, epoch*len(data_loader) + iteration)

                if iteration % args.print_every == 0 or iteration+1 == len(data_loader):
                    print("%s Batch %04d/%i, Loss %9.4f, NLL-Loss %9.4f, KL-Loss %9.4f, KL-Weight %6.3f"
                        %(split.upper(), iteration, len(data_loader)-1, loss.item(), NLL_loss.item()/batch_size, KL_loss.item()/batch_size, KL_weight))

                if split == 'valid':
                    if 'target_sents' not in tracker:
                        tracker['target_sents'] = list()
                    tracker['target_sents'] += idx2word(batch['raw'].data, i2w=datasets['train'].get_i2w(), pad_idx=datasets['train'].pad_idx)
                    tracker['z'] = torch.cat((tracker['z'], z.data), dim=0)

            print("%s Epoch %02d/%i, Mean ELBO %9.4f, acc %f, clo %f"%(split.upper(), epoch, args.epochs, torch.mean(tracker['ELBO']), correct/total, close/total))

            if args.tensorboard_logging:
                writer.add_scalar("%s-Epoch/ELBO"%split.upper(), torch.mean(tracker['ELBO']), epoch)

            # save a dump of all sentences and the encoded latent space
            if split == 'valid':
                dump = {'target_sents':tracker['target_sents'], 'z':tracker['z'].tolist()}
                if not os.path.exists(os.path.join('dumps', ts)):
                    os.makedirs('dumps/'+ts)
                with open(os.path.join('dumps/'+ts+'/valid_E%i.json'%epoch), 'w') as dump_file:
                    json.dump(dump,dump_file)

            # save checkpoint
            if split == 'train':
                checkpoint_path = os.path.join(save_model_path, "E%i.pytorch"%(epoch))
                torch.save(model.state_dict(), checkpoint_path)
                print("Model saved at %s"%checkpoint_path)