Example #1
def main(cfgpath, global_step):
    # parsing config.json
    proj_dir = Path.cwd()
    params = json.load((proj_dir / cfgpath).open())

    # create dataset
    batch_size = params['training'].get('batch_size')
    tr_filepath = params['filepath'].get('tr')
    val_filepath = params['filepath'].get('val')
    tr_ds = create_dataset(tr_filepath, batch_size, True)
    val_ds = create_dataset(val_filepath, batch_size, False)

    # create pre_processor
    vocab = pickle.load((proj_dir / params['filepath'].get('vocab')).open(mode='rb'))
    pre_processor = PreProcessor(vocab=vocab, tokenizer=MeCab().morphs, pad_idx=1)

    # create model
    model = SenCNN(num_classes=2, vocab=vocab)

    # create optimizer & loss_fn
    epochs = params['training'].get('epochs')
    learning_rate = params['training'].get('learning_rate')
    opt = tf.optimizers.Adam(learning_rate=learning_rate)
    loss_fn = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
    writer = tf.summary.create_file_writer(logdir='./runs/exp')

    # training
    for epoch in tqdm(range(epochs), desc='epochs'):

        tr_loss = 0
        tf.keras.backend.set_learning_phase(1)

        for step, mb in tqdm(enumerate(tr_ds), desc='steps'):
            x_mb, y_mb = pre_processor.convert2idx(mb)

            with tf.GradientTape() as tape:
                mb_loss = loss_fn(y_mb, model(x_mb))
                grads = tape.gradient(target=mb_loss, sources=model.trainable_variables)
            opt.apply_gradients(grads_and_vars=zip(grads, model.trainable_variables))
            tr_loss += mb_loss.numpy()

            if tf.equal(opt.iterations % global_step, 0):
                with writer.as_default():
                    val_loss = evaluate(model, val_ds, loss_fn, pre_processor.convert2idx)
                    tf.summary.scalar('tr_loss', tr_loss / (step + 1), step=opt.iterations)
                    tf.summary.scalar('val_loss', val_loss, step=opt.iterations)
                    tf.keras.backend.set_learning_phase(1)  # back to train mode after evaluation
        else:  # for-else: runs once the minibatch loop completes without break
            tr_loss /= (step + 1)

        val_loss = evaluate(model, val_ds, loss_fn, pre_processor.convert2idx)
        tqdm.write('epoch : {}, tr_loss : {:.3f}, val_loss : {:.3f}'.format(epoch + 1, tr_loss, val_loss))

    ckpt_path = proj_dir / params['filepath'].get('ckpt')
    ckpt = tf.train.Checkpoint(model=model)
    ckpt.save(ckpt_path)
Example #2
def main(cfgpath):
    # parsing config.json
    proj_dir = Path.cwd()
    params = json.load((proj_dir / cfgpath).open())

    # create dataset
    batch_size = params['training'].get('batch_size')
    tr_filepath = params['filepath'].get('tr')
    val_filepath = params['filepath'].get('val')
    tr_ds = create_dataset(tr_filepath, batch_size, True)
    val_ds = create_dataset(val_filepath, batch_size, False)

    # create pre_processor
    vocab = pickle.load(
        (proj_dir / params['filepath'].get('vocab')).open(mode='rb'))
    pre_processor = PreProcessor(vocab=vocab, tokenizer=Okt)

    # create model
    model = SenCNN(num_classes=2, vocab=vocab)

    # create optimizer & loss_fn
    epochs = params['training'].get('epochs')
    learning_rate = params['training'].get('learning_rate')
    opt = tf.optimizers.Adam(learning_rate=learning_rate)
    loss_fn = tf.losses.SparseCategoricalCrossentropy()

    # training
    for epoch in tqdm(range(epochs), desc='epochs'):
        tr_loss = 0
        tf.keras.backend.set_learning_phase(1)

        for step, mb in tqdm(enumerate(tr_ds), desc='steps'):
            x_mb, y_mb = pre_processor.convert2idx(mb)
            with tf.GradientTape() as tape:
                mb_loss = loss_fn(y_mb, model(x_mb))
            grads = tape.gradient(target=mb_loss,
                                  sources=model.trainable_variables)
            opt.apply_gradients(
                grads_and_vars=zip(grads, model.trainable_variables))
            tr_loss += mb_loss.numpy()
        else:
            tr_loss /= (step + 1)

        tf.keras.backend.set_learning_phase(0)
        val_loss = 0
        for step, mb in tqdm(enumerate(val_ds), desc='steps'):
            x_mb, y_mb = pre_processor.convert2idx(mb)
            mb_loss = loss_fn(y_mb, model(x_mb))
            val_loss += mb_loss.numpy()
        else:
            val_loss /= (step + 1)

        tqdm.write('epoch : {}, tr_loss : {:.3f}, val_loss : {:.3f}'.format(
            epoch + 1, tr_loss, val_loss))
Example #3
    def main(self):
        batch_size = self._batch_size
        tr_filepath = 'data/train.txt'
        val_filepath = 'data/val.txt'
        tr_ds = self.create_dataset(tr_filepath, batch_size, shuffle=True)
        val_ds = self.create_dataset(val_filepath, batch_size, shuffle=False)

        vocab = pd.read_pickle('data/vocab.pkl')
        pre_processor = PreProcessor(vocab=vocab, tokenizer=Mecab())

        # create model
        model = SmCnn()

        # create optimizer & loss_fn
        epochs = self._epochs
        learning_rate = self._learning_rate
        opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
        loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()

        # training
        for epoch in tqdm(range(epochs), desc='epochs'):
            tr_loss = 0
            tf.keras.backend.set_learning_phase(1)

            for step, mb in tqdm(enumerate(tr_ds), desc='steps'):
                x_mb, y_mb = pre_processor.convert2idx(mb)
                with tf.GradientTape() as tape:
                    mb_loss = loss_fn(y_mb, model(x_mb))
                grads = tape.gradient(target=mb_loss,
                                      sources=model.trainable_variables)
                opt.apply_gradients(
                    grads_and_vars=zip(grads, model.trainable_variables))
                tr_loss += mb_loss.numpy()
            else:
                tr_loss /= (step + 1)

            tf.keras.backend.set_learning_phase(0)
            val_loss = 0
            for step, mb in tqdm(enumerate(val_ds), desc='steps'):
                x_mb, y_mb = pre_processor.convert2idx(mb)
                mb_loss = loss_fn(y_mb, model(x_mb))
                val_loss += mb_loss.numpy()
            else:
                val_loss /= (step + 1)

            tqdm.write(
                'epoch : {}, tr_loss : {:.3f}, val_loss : {:.3f}'.format(
                    epoch + 1, tr_loss, val_loss))
Example #4
def get_preprocessor(ptr_config_info, model_config):
    with open(ptr_config_info.vocab, mode='rb') as io:
        vocab = pickle.load(io)

    if model_config.type == 'etri':
        ptr_tokenizer = ETRITokenizer.from_pretrained(ptr_config_info.tokenizer, do_lower_case=False)
        pad_sequence = PadSequence(length=model_config.length, pad_val=vocab.to_indices(vocab.padding_token))
        preprocessor = PreProcessor(vocab=vocab, split_fn=ptr_tokenizer.tokenize, pad_fn=pad_sequence)
    elif model_config.type == 'skt':
        ptr_tokenizer = SentencepieceTokenizer(ptr_config_info.tokenizer)
        pad_sequence = PadSequence(length=model_config.length, pad_val=vocab.to_indices(vocab.padding_token))
        preprocessor = PreProcessor(vocab=vocab, split_fn=ptr_tokenizer, pad_fn=pad_sequence)
    else:
        raise ValueError('unknown model_config.type: {}'.format(model_config.type))  # avoid returning an unbound name
    return preprocessor
Example #5
def main(cfgpath):
    # parsing config.json
    proj_dir = Path.cwd()
    params = json.load((proj_dir / cfgpath).open())

    # create dataset
    batch_size = params['training'].get('batch_size')
    tr_filepath = params['filepath'].get('tr')
    val_filepath = params['filepath'].get('val')
    tst_filepath = params['filepath'].get('tst')

    tr_ds = create_dataset(tr_filepath, batch_size, False, False)
    val_ds = create_dataset(val_filepath, batch_size, False, False)
    tst_ds = create_dataset(tst_filepath, batch_size, False, False)

    # create pre_processor
    vocab = pickle.load(
        (proj_dir / params['filepath'].get('vocab')).open(mode='rb'))
    pre_processor = PreProcessor(vocab=vocab,
                                 tokenizer=MeCab().morphs,
                                 pad_idx=1)

    # create model
    model = SenCNN(num_classes=2, vocab=vocab)
    ckpt = tf.train.Checkpoint(model=model)
    ckpt.restore(save_path=tf.train.latest_checkpoint(proj_dir / 'checkpoint'))

    # evaluation
    tr_acc = get_accuracy(model, tr_ds, pre_processor.convert2idx)
    val_acc = get_accuracy(model, val_ds, pre_processor.convert2idx)
    tst_acc = get_accuracy(model, tst_ds, pre_processor.convert2idx)

    print('tr_acc: {:.2%}, val_acc : {:.2%}, tst_acc: {:.2%}'.format(
        tr_acc, val_acc, tst_acc))
Example #6
def predict(sentence1, sentence2):
    ptr_dir = "C:/Users/aaaaa/workspace/fact-check/BERT_pairwise_text_classification/pretrained"
    data_dir = "C:/Users/aaaaa/workspace/fact-check/BERT_pairwise_text_classification/data"
    caseType = "skt"
    model_dir = "C:/Users/aaaaa/workspace/fact-check/BERT_pairwise_text_classification/experiments/base_model"
    checkpoint_model_file = "best_skt.tar"
    
    # ptr_dir = "BERT_pairwise_text_classification/pretrained"
    # data_dir = "BERT_pairwise_text_classification/data"
    # caseType = "skt"
    # model_dir = "BERT_pairwise_text_classification/experiments/base_model"
    # checkpoint_model_file = "best_skt.tar"
    
    # ptr_dir = "pretrained"
    # data_dir = "data"
    # caseType = "skt"
    # model_dir = "experiments/base_model"
    # checkpoint_model_file = "best_skt.tar"
    
    ptr_dir = Path(ptr_dir)
    data_dir = Path(data_dir)
    model_dir = Path(model_dir)
    checkpoint_model_file = Path(checkpoint_model_file)
    
    ptr_config = Config(ptr_dir / 'config_skt.json')
    data_config = Config(data_dir / 'config.json')
    model_config = Config(model_dir / 'config.json')
    
    # vocab
    with open(os.path.join(ptr_dir, ptr_config.vocab), mode='rb') as io:
        vocab = pickle.load(io)
    
    
    ptr_tokenizer = SentencepieceTokenizer(os.path.join(ptr_dir, ptr_config.tokenizer))
    pad_sequence = PadSequence(length=model_config.length, pad_val=vocab.to_indices(vocab.padding_token))
    preprocessor = PreProcessor(vocab=vocab, split_fn=ptr_tokenizer, pad_fn=pad_sequence)
    
    # model (restore)
    checkpoint_manager = CheckpointManager(model_dir)
    checkpoint = checkpoint_manager.load_checkpoint(checkpoint_model_file)
    config = BertConfig(os.path.join(ptr_dir, ptr_config.config))
    model = PairwiseClassifier(config, num_classes=model_config.num_classes, vocab=preprocessor.vocab)
    model.load_state_dict(checkpoint['model_state_dict'])
    
    device = torch.device('cpu')
    model.to(device)
    
    transform = preprocessor.preprocess
    if model.training:
        model.eval()
        
    indices, token_types = [torch.tensor([elm]) for elm in transform(sentence1, sentence2)]

    with torch.no_grad():
        label = model(indices, token_types)
    label = label.max(dim=1)[1]
    label = label.numpy()[0]

    return label
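
A minimal call sketch for the predict function above, assuming the hard-coded paths resolve on the local machine; the input strings are placeholders, and the return value is the argmax class index produced by label.max(dim=1)[1]:

if __name__ == "__main__":
    # placeholder inputs; real sentence pairs come from the surrounding fact-check pipeline
    predicted = predict("sentence one", "sentence two")
    print(predicted)  # integer class index in [0, model_config.num_classes)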
Example #7
def get_preprocessor(dataset_config, coarse_split_fn, fine_split_fn):
    with open(dataset_config.fine_vocab, mode="rb") as io:
        fine_vocab = pickle.load(io)
    with open(dataset_config.coarse_vocab, mode="rb") as io:
        coarse_vocab = pickle.load(io)

    preprocessor = PreProcessor(coarse_vocab=coarse_vocab, fine_vocab=fine_vocab,
                                coarse_split_fn=coarse_split_fn,
                                fine_split_fn=fine_split_fn)
    return preprocessor
Example #8
    data_config = Config(data_dir / 'config.json')
    model_config = Config(model_dir / 'config.json')

    # vocab
    with open(ptr_config.vocab, mode='rb') as io:
        vocab = pickle.load(io)

    # tokenizer
    if args.type == 'etri':
        ptr_tokenizer = ETRITokenizer.from_pretrained(ptr_config.tokenizer,
                                                      do_lower_case=False)
        pad_sequence = PadSequence(length=model_config.length,
                                   pad_val=vocab.to_indices(
                                       vocab.padding_token))
        preprocessor = PreProcessor(vocab=vocab,
                                    split_fn=ptr_tokenizer.tokenize,
                                    pad_fn=pad_sequence)
    elif args.type == 'skt':
        ptr_tokenizer = SentencepieceTokenizer(ptr_config.tokenizer)
        pad_sequence = PadSequence(length=model_config.length,
                                   pad_val=vocab.to_indices(
                                       vocab.padding_token))
        preprocessor = PreProcessor(vocab=vocab,
                                    split_fn=ptr_tokenizer,
                                    pad_fn=pad_sequence)

    # model (restore)
    checkpoint_manager = CheckpointManager(model_dir)
    checkpoint = checkpoint_manager.load_checkpoint('best_{}.tar'.format(
        args.type))
    config = BertConfig(ptr_config.config)
Example #9
def main():
    tr_filepath = Path.cwd() / 'data' / 'train.txt'
    val_filepath = Path.cwd() / 'data' / 'val.txt'

    with open(Path.cwd() / 'data/vocab.pkl', mode='rb') as f:
        vocab = pickle.load(f)

    tr_ds = create_dataset(str(tr_filepath), 128, shuffle=True)
    val_ds = create_dataset(str(val_filepath), 128,
                            shuffle=False)  # no shuffling for the validation data

    tokenizer = Okt()
    pre_processor = PreProcessor(vocab=vocab, tokenizer=tokenizer)

    # create model
    model = SenCNN(num_classes=2, vocab=vocab)

    # create optimizer & loss_fn
    epochs = 10
    learning_rate = 1e-3

    opt = tf.optimizers.Adam(learning_rate=learning_rate)
    loss_fn = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

    # metrics
    tr_loss_metric = tf.keras.metrics.Mean(name='train_loss')
    tr_accuracy_metric = tf.keras.metrics.SparseCategoricalAccuracy(
        name='train_accuracy')
    val_loss_metric = tf.keras.metrics.Mean(name='validation_loss')
    val_accuracy_metric = tf.keras.metrics.SparseCategoricalAccuracy(
        name='validation_accuracy')

    # training
    for epoch in tqdm(range(epochs), desc='epochs'):
        # training data
        tf.keras.backend.set_learning_phase(1)  # train mode

        for _, mb in tqdm(enumerate(tr_ds), desc='steps'):
            x_mb, y_mb = pre_processor.convert2idx(mb)
            x_mb = pre_processor.pad_sequences(x_mb, 70)
            x_mb, y_mb = pre_processor.convert_to_tensor(x_mb, y_mb)

            with tf.GradientTape() as tape:
                mb_loss = loss_fn(y_mb, model(x_mb))
            grads = tape.gradient(target=mb_loss,
                                  sources=model.trainable_variables)
            opt.apply_gradients(
                grads_and_vars=zip(grads, model.trainable_variables))

            tr_loss_metric.update_state(mb_loss)
            tr_accuracy_metric(y_mb, model(x_mb))

        tr_mean_loss = tr_loss_metric.result()
        tr_mean_accuracy = tr_accuracy_metric.result()

        # validation data
        tf.keras.backend.set_learning_phase(0)  # test mode
        for _, mb in tqdm(enumerate(val_ds), desc='steps'):
            x_mb, y_mb = pre_processor.convert2idx(mb)
            x_mb = pre_processor.pad_sequences(x_mb, 70)
            x_mb, y_mb = pre_processor.convert_to_tensor(x_mb, y_mb)
            mb_loss = loss_fn(y_mb, model(x_mb))

            val_loss_metric.update_state(mb_loss)
            val_accuracy_metric.update_state(y_mb, model(x_mb))

        val_mean_loss = val_loss_metric.result()
        val_mean_accuracy = val_accuracy_metric.result()

        # reset metric state so each epoch reports its own mean
        tr_loss_metric.reset_states()
        tr_accuracy_metric.reset_states()
        val_loss_metric.reset_states()
        val_accuracy_metric.reset_states()

        tqdm.write(
            'epoch : {}, tr_accuracy : {:.3f}, tr_loss : {:.3f}, val_accuracy : {:.3f}, val_loss : {:.3f}'
            .format(epoch + 1, tr_mean_accuracy, tr_mean_loss,
                    val_mean_accuracy, val_mean_loss))

    ckpt_path = Path.cwd() / 'checkpoint/ckpt'
    ckpt = tf.train.Checkpoint(model=model)
    ckpt.save(ckpt_path)
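
For completeness, a hedged restore sketch mirroring the save above and the restore pattern in Example #5; it assumes the same SenCNN constructor arguments and the 'checkpoint' directory written by ckpt.save:

restored_model = SenCNN(num_classes=2, vocab=vocab)
restore_ckpt = tf.train.Checkpoint(model=restored_model)
restore_ckpt.restore(tf.train.latest_checkpoint(str(Path.cwd() / 'checkpoint')))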
Example #10
    # vocab
    with open(ptr_config.vocab, mode='rb') as io:
        vocab = pickle.load(io)

    # tokenizer
    if args.tokenizer == 'ranked':
        print('[RANKED TOKENIZER]')
        ptr_tokenizer = KBertRankedTokenizer(ptr_config.tokenizer,
                                             do_lower_case=False)
    else:
        ptr_tokenizer = BertTokenizer.from_pretrained(ptr_config.tokenizer,
                                                      do_lower_case=False)
        print('[BERT TOKENIZER]')
    pad_sequence = PadSequence(length=model_config.length,
                               pad_val=vocab.to_indices(vocab.padding_token))
    preprocessor = PreProcessor(vocab=vocab,
                                split_fn=ptr_tokenizer.tokenize,
                                pad_fn=pad_sequence,
                                subchar=args.subchar)

    # model
    config = BertConfig(ptr_config.config)
    model = SentenceClassifier(config,
                               num_classes=model_config.num_classes,
                               vocab=preprocessor.vocab)
    bert_pretrained = torch.load(ptr_config.bert)
    model.load_state_dict(bert_pretrained, strict=False)

    # training
    tr_ds = Corpus(data_config.train, preprocessor.preprocess)
    tr_dl = DataLoader(tr_ds,
                       batch_size=model_config.batch_size,
                       shuffle=True,
Example #11
    model_dir = Path(args.model_dir)

    data_config = Config(data_dir / 'config.json')
    model_config = Config(model_dir / 'config.json')

    # Vocab and Tokenizer
    ptr_dir = Path("pretrained")
    vocab_filepath = ptr_dir / "{}-vocab.pkl".format(args.type)
    with open(vocab_filepath, mode='rb') as io:
        vocab = pickle.load(io)
    ptr_tokenizer = BertTokenizer.from_pretrained(args.type,
                                                  do_lower_case=("uncased" in args.type))
    ptr_tokenizer = Tokenizer(vocab, ptr_tokenizer.tokenize)

    preprocessor = PreProcessor(ptr_tokenizer, model_config.max_len)

    # Load Model
    config_filepath = ptr_dir / "{}-config.json".format(args.type)
    config = BertConfig.from_pretrained(config_filepath,
                                        output_hidden_states=False)
    model = BIIN(config,
                 vocab,
                 model_config.hidden_size,
                 enc_num_layers=len(model_config.hidden_size))

    # Data Loader
    tr_ds = Corpus(data_config.tr_path,
                   preprocessor.preprocess,
                   sep='\t',
                   doc_col='question1',
Example #12
if __name__ == "__main__":
    args = parser.parse_args()
    data_dir = Path(args.data_dir)
    model_dir = Path(args.model_dir)
    data_config = Config(data_dir / "config.json")
    model_config = Config(model_dir / "config.json")

    # tokenizer
    with open(data_config.fine_vocab, mode="rb") as io:
        fine_vocab = pickle.load(io)
    with open(data_config.coarse_vocab, mode="rb") as io:
        coarse_vocab = pickle.load(io)

    preprocessor = PreProcessor(
        coarse_vocab=coarse_vocab,
        fine_vocab=fine_vocab,
        coarse_split_fn=split_morphs,
        fine_split_fn=split_jamos,
    )

    # model
    model = SAN(model_config.num_classes, coarse_vocab, fine_vocab,
                model_config.fine_embedding_dim, model_config.hidden_dim, model_config.multi_step,
                model_config.prediction_drop_ratio)

    # training
    tr_ds = Corpus(data_config.train, preprocessor.preprocess)
    tr_dl = DataLoader(
        tr_ds,
        batch_size=model_config.batch_size,
        shuffle=True,
        num_workers=4,