def run_online(device):
    # predict labels online

    for l in sys.stdin:
        l = l.strip()
        l_lst = l.split('\t')
        if not l or len(l_lst) < 2:
            print('# blank line')
            continue

        text1 = nlp_utils.normalize_text(l_lst[0])
        text2 = nlp_utils.normalize_text(l_lst[1])

        words1 = nlp_utils.split_text(text1, char_based=setup['char_based'])
        words2 = nlp_utils.split_text(text2, char_based=setup['char_based'])

        xs = nlp_utils.transform_to_array2([[words1, words2]],
                                           vocab,
                                           with_label=False)
        xs = nlp_utils.convert_seq(xs, device=device, with_label=False)

        with chainer.using_config('train', False), chainer.no_backprop_mode():
            prob = model.predict(xs['xs1'], xs['xs2'], softmax=True)[0]
        answer = int(model.xp.argmax(prob))
        score = float(prob[answer])
        print('{}\t{:.4f}\t{}\t{}'.format(answer, score, ' '.join(words1),
                                          ' '.join(words2)))
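
A hypothetical way to drive this loop (`model`, `vocab`, and `setup` are assumed to be module-level globals set up elsewhere in the script): each stdin line carries two tab-separated sentences, and each output line is the predicted label, its softmax score, and the two tokenized texts.

# Hypothetical session; the script name is illustrative only:
#   $ printf 'A man plays guitar.\tSomeone plays an instrument.\n' | python predict.py
# which would print a tab-separated line like:
#   1    0.9212    a man plays guitar .    someone plays an instrument .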
def predict_batch(words_batch):
    xs = nlp_utils.transform_to_array(words_batch, vocab, with_label=False)
    xs = nlp_utils.convert_seq(xs, device=device, with_label=False)
    with chainer.using_config('train', False), chainer.no_backprop_mode():
        probs = model.predict(xs, softmax=True)
    answers = model.xp.argmax(probs, axis=1)
    scores = probs[model.xp.arange(answers.size), answers].tolist()
    for words, answer, score in zip(words_batch, answers, scores):
        print('{}\t{:.4f}\t{}'.format(answer, score, ' '.join(words)))
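
A minimal usage sketch (hypothetical input; as above, `model`, `vocab`, and `device` are assumed to be globals in the surrounding script):

# The function expects pre-tokenized word lists, one per example:
predict_batch([
    ['this', 'movie', 'was', 'great'],
    ['what', 'a', 'waste', 'of', 'time'],
])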
Example #4
def predict_fn(input_data, model):
    """
    This function receives a NumPy array and makes a prediction on it using the model returned
    by `model_fn`.
    
    The default predictor used by `Chainer` serializes input data to the 'npy' format:
    https://docs.scipy.org/doc/numpy-1.14.0/neps/npy-format.html

    The Chainer container provides an overridable pre-processing function `input_fn`
    that accepts the serialized input data and deserializes it into a NumPy array.
    `input_fn` is invoked before `predict_fn` and passes its return value to this function
    (as `input_data`).
    
    The Chainer container provides an overridable post-processing function `output_fn`
    that accepts this function's return value and serializes it back into `npy` format, which
    the Chainer predictor can deserialize back into a NumPy array on the client.

    Args:
        input_data: a NumPy array containing the data serialized by the Chainer predictor
        model: the return value of `model_fn`
    Returns:
        a NumPy array containing predictions which will be returned to the client


    For more on `input_fn`, `predict_fn` and `output_fn`, please visit the sagemaker-python-sdk repository:
    https://github.com/aws/sagemaker-python-sdk

    For more on the Chainer container, please visit the sagemaker-chainer-containers repository:
    https://github.com/aws/sagemaker-chainer-containers
    """
    trained_model, vocab = model

    words_batch = []
    for sentence in input_data.tolist():
        text = normalize_text(sentence)
        words = split_text(text)
        words_batch.append(words)

    xs = transform_to_array(words_batch, vocab, with_label=False)
    xs = convert_seq(xs, with_label=False)

    with chainer.using_config('train', False), chainer.no_backprop_mode():
        probs = trained_model.predict(xs, softmax=True)
    answers = trained_model.xp.argmax(probs, axis=1)
    scores = probs[trained_model.xp.arange(answers.size), answers].tolist()

    output = []
    for words, answer, score in zip(words_batch, answers, scores):
        output.append([' '.join(words), answer, score])

    return np.array(output)
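
A minimal local test sketch (hypothetical sentences; assumes `model_fn` has already been called to obtain the `(trained_model, vocab)` tuple that the container would pass in):

# Simulate the container: `input_fn` would normally deserialize the client's
# npy payload into exactly this kind of NumPy array.
sample = np.array(['this movie was great', 'what a waste of time'])
predictions = predict_fn(sample, (trained_model, vocab))
print(predictions)  # one row of [joined_tokens, predicted_label, score] per input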
def run_online(device):
    # predict labels online
    for l in sys.stdin:
        l = l.strip()
        if not l:
            print('# blank line')
            continue
        text = nlp_utils.normalize_text(l)
        words = nlp_utils.split_text(text, char_based=setup['char_based'])
        xs = nlp_utils.transform_to_array([words], vocab, with_label=False)
        xs = nlp_utils.convert_seq(xs, device=device, with_label=False)
        with chainer.using_config('train', False), chainer.no_backprop_mode():
            prob = model.predict(xs, softmax=True)[0]
        answer = int(model.xp.argmax(prob))
        score = float(prob[answer])
        print('{}\t{:.4f}\t{}'.format(answer, score, ' '.join(words)))
Example #8
def main():
    parser = argparse.ArgumentParser(
        description='I want to generate music with an RNN!')
    parser.add_argument('--batchsize', '-b', type=int, default=256,
                        help='Number of samples in each mini-batch')
    parser.add_argument('--epoch', '-e', type=int, default=1,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--interval', '-i', type=int, default=10,
                        help='Interval for the progress bar and other reports')
    parser.add_argument('--vec', '-v', type=int, default=32,
                        help='Dimension of the hidden layer')
    parser.add_argument('--layer', '-l', type=int, default=2,
                        help='Number of layers')
    parser.add_argument('--frequency', '-f', type=int, default=400,
                        help='Snapshot save frequency')
    parser.add_argument('--model', '-model', default='LSTM',
                        choices=['LSTM', 'Word2Vec'],
                        help='Name of encoder model type.')

    args = parser.parse_args()
    print(json.dumps(args.__dict__, indent=2))

    model = getattr(mymodel, args.model)(481, args.vec, args.layer)

    # Run on the GPU if one is available
    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()

    # Set up the optimizer
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)

    # Set up the dataset
    trans = Trans()
    index = trans.getindex()
    train, val = chainer.datasets.split_dataset_random(
        index, int(len(index) * 0.8), seed=0)  # hold out 20% for validation
    train = chainer.datasets.TransformDataset(train, trans)
    val = chainer.datasets.TransformDataset(val, trans)

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    val_iter = chainer.iterators.SerialIterator(val, args.batchsize,
                                                repeat=False, shuffle=False)

    # How long to run the training
    stop_trigger = (args.epoch, 'epoch')

    # Set up the updater and trainer
    updater = training.updaters.StandardUpdater(
        train_iter, optimizer, converter=(lambda x, y: tuple(convert_seq(x, y).values())),
        device=args.gpu, loss_func=model.lossfunc)
    trainer = training.Trainer(updater, stop_trigger)

    # Evaluation on the test data
    evaluator = MyEvaluator(val_iter, model, device=args.gpu, eval_func=model.lossfunc)
    evaluator.trigger = 1, 'epoch'
    # trainer.extend(evaluator)

    # Keep the pretrained part from being updated (backward still seems to run?)
    if args.model == 6 or args.model == 7:
        model.base.disable_update()

    # Save snapshots (the weights during training)
    frequency = args.frequency
    trainer.extend(extensions.snapshot(), trigger=(frequency, 'iteration'))

    # How often to report metrics on the training data
    logreport = extensions.LogReport(trigger=(args.interval, 'iteration'))
    trainer.extend(logreport)
    model.logreport = logreport

    # Save plots of the metrics for each dataset
    if extensions.PlotReport.available():
        trainer.extend(
            extensions.PlotReport(
                ['main/loss', 'val/loss'],
                'iteration', trigger=(10, 'iteration'), file_name='loss.png'))
        trainer.extend(
            extensions.PlotReport(
                ['main/acc', 'val/acc'],
                'iteration', trigger=(10, 'iteration'), file_name='accuracy.png'))

    # Which metric columns to display
    trainer.extend(extensions.PrintReport(
        ['epoch', 'iteration', 'main/loss_r', 'main/loss', 'val/loss', 'main/acc', 'elapsed_time']))

    # Progress bar settings
    trainer.extend(extensions.ProgressBar(update_interval=args.interval))

    # Loading previously saved training state
    # if args.resume:
    #     chainer.serializers.load_npz(args.resume, model, path='updater/model:main/')  # for some reason loading fails without `path`; cause unknown

    # setw(model)

    trainer.run()

    print("save resume")
    chainer.serializers.save_npz("resume.npz", model)
Example #9
def main():
    parser = argparse.ArgumentParser(
        description='Document Classification Example')
    parser.add_argument('--batchsize', '-b', type=int, default=64,
                        help='Number of documents in each mini-batch')
    parser.add_argument('--epoch', '-e', type=int, default=30,
                        help='Number of training epochs')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--unit', '-u', type=int, default=200,
                        help='Number of units')
    parser.add_argument('--vocab', '-v', type=int, default=50000,
                        help='Vocabulary size')
    parser.add_argument('--layer', '-l', type=int, default=1,
                        help='Number of LSTM layers')
    parser.add_argument('--dropout', '-d', type=float, default=0.4,
                        help='Dropout rate')
    parser.add_argument('--gradclip', type=float, default=5,
                        help='Gradient clipping threshold')
    parser.add_argument('--train_file', '-train', default='data/train.seg.csv',
                        help='Training data file.')
    parser.add_argument('--test_file', '-test', default='data/test.seg.csv',
                        help='Test data file.')
    parser.add_argument('--model', '-m', help='read model parameters from npz file')
    parser.add_argument('--vcb_file',
                        default='/mnt/gold/users/s18153/prjPyCharm/prjNLP_GPU/data/vocab_train_w_NoReplace.vocab_file',
                        help='Vocabulary data file.')
    args = parser.parse_args()

    if os.path.exists(args.vcb_file):  # check whether the vocab file exists (load it if already built)
        with open(args.vcb_file, 'rb') as f_vocab_data:
            train_val = pickle.load(f_vocab_data)
    else:
        train_val = data.DocDataset(args.train_file, vocab_size=args.vocab)  # make vocab from training data
        with open(args.vcb_file, 'wb') as f_vocab_save:
            pickle.dump(train_val, f_vocab_save)


    # train_val = data.DocDataset(args.train_file, vocab_size=args.vocab)  # make vocab from training data
    # test = [x[0] for x in data.DocDataset(args.test_file, train_val.get_vocab())]  # [ doc1[sent1[], sent2[], ...], doc2[sent1[], sent2[], ...], ... ]
    # test_iter = iterators.SerialIterator(test, args.batchsize, repeat=False, shuffle=False)

    # Fetch the documents and labels together
    # test_doc_label = [x for x in data.DocDataset(args.test_file, train_val.get_vocab())]  # [ doc1[sent1[], sent2[], ...], doc2[sent1[], sent2[], ...], ... ]
    test_doc_label = data.DocDataset(args.test_file, train_val.get_vocab())
    test_doc = [x[0] for x in test_doc_label]
    test_label = [x[1] for x in test_doc_label]
    test_iter = iterators.SerialIterator(test_doc, args.batchsize, repeat=False, shuffle=False)
    test_label_iter = iterators.SerialIterator(test_label, args.batchsize, repeat=False, shuffle=False)
    # test_doc_label_iter = iterators.SerialIterator(test_doc_label, args.batchsize, repeat=False, shuffle=False)

    model = nets.DocClassify(n_vocab=args.vocab+1, n_units=args.unit, n_layers=args.layer, n_out=4, dropout=args.dropout)
    # load_npz used to fail here -> resolved?
    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()

    if args.model:
        serializers.load_npz(args.model, model, 'updater/model:main/predictor/')

    confusion_mat = np.zeros([4, 4])  # [label, prediction]
    with chainer.using_config('train', False):
        # An attempt to run the model evaluation with Chainer's built-in Evaluator; the result was inconclusive
        # test_eval = extensions.Evaluator(test_doc_label_iter, model, converter=convert_seq, device=args.gpu)
        # test_result = test_eval()

        # while True:
        #     result = model(convert_seq(test_iter.next(), device=args.gpu, with_label=False))
        #     test_label_batch = test_label_iter.next()

        for (label_batch, each_testinput_batch) in zip(test_label_iter, test_iter):
            result = model(convert_seq(each_testinput_batch, device=args.gpu, with_label=False))
            predict = np.argmax(result.array, axis=1)

            for (each_label, each_predict) in zip(label_batch, predict):
                confusion_mat[each_label][chainer.cuda.to_cpu(each_predict)] += 1

    print(confusion_mat)

    # dummy_val = 'dummy data'

    time_now = datetime.now().strftime('%Y%m%d%H%M%S')
    save_path = '/mnt/gold/users/s18153/prjPyCharm/prjNLP_GPU/data/vocab_train_w_NoReplace.saved_'
    save_val_str = get_str_of_val_name_on_code(confusion_mat)[0]

    # for (each_val, each_val_str) in zip(save_val, save_val_str):
    #     with open(save_path + each_val_str + time_now, 'wb') as f_save:
    #         pickle.dump(each_val, f_save)
    with open(save_path + save_val_str + '_' + time_now, 'wb') as f_save:
        pickle.dump(confusion_mat, f_save)

    pass  # for breakpoint
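
A small follow-up sketch (not part of the original script) showing how overall accuracy could be read off the saved matrix, given that rows are labels and columns are predictions:

# Hypothetical post-processing of the [label, prediction] confusion matrix.
accuracy = np.trace(confusion_mat) / confusion_mat.sum()
print('accuracy: {:.4f}'.format(accuracy))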
Example #10
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--load', required=True)
    args_dir = os.path.join(parser.parse_args().load, 'args.json')
    with open(args_dir) as f:
        args = Bunch(json.load(f))

    print(json.dumps(args.__dict__, indent=2))

    # Load a dataset
    with open(args.vocab_path) as f:
        vocab = json.load(f)

    if args.dataset == 'dbpedia':
        train, test, vocab = text_datasets.get_dbpedia(
            vocab=vocab, char_based=args.char_based)
    elif args.dataset == 'sst':
        train, test, vocab = text_datasets.get_sst(char_based=args.char_based)
    elif args.dataset.startswith('imdb.'):
        train, test, vocab = text_datasets.get_imdb(
            vocab=vocab,
            fine_grained=args.dataset.endswith('.fine'),
            char_based=args.char_based)
    elif args.dataset in [
            'TREC', 'stsa.binary', 'stsa.fine', 'custrev', 'mpqa',
            'rt-polarity', 'subj'
    ]:
        train, test, vocab = text_datasets.get_other_text_dataset(
            args.dataset, vocab=vocab, char_based=args.char_based)

    print('# train data: {}'.format(len(train)))
    print('# test  data: {}'.format(len(test)))
    print('# vocab: {}'.format(len(vocab)))
    n_class = len(set([int(d[1]) for d in train]))
    print('# class: {}'.format(n_class))
    # i_to_word = {v: k for k, v in vocab.items()}

    # FIXME
    args.batchsize = 64
    max_beam_size = 5

    # train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test,
                                                 args.batchsize,
                                                 repeat=False,
                                                 shuffle=False)

    if args.dataset == 'snli':
        model = nets.DoubleMaxClassifier(n_layers=args.layer,
                                         n_vocab=len(vocab),
                                         n_units=args.unit,
                                         n_class=n_class,
                                         dropout=args.dropout)
    else:
        model = nets.SingleMaxClassifier(n_layers=args.layer,
                                         n_vocab=len(vocab),
                                         n_units=args.unit,
                                         n_class=n_class,
                                         dropout=args.dropout)
    if args.gpu >= 0:
        # Make a specified GPU current
        chainer.backends.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()  # Copy the model to the GPU
    chainer.serializers.load_npz(args.model_path, model)

    checkpoint = []
    for batch_idx, batch in enumerate(tqdm(test_iter)):
        # if batch_idx > 10:
        #     break

        batch = convert_seq(batch, device=args.gpu)
        xs = batch['xs']
        reduced_xs, removed_indices = get_rawr(model,
                                               xs,
                                               max_beam_size=max_beam_size)

        xp = cupy.get_array_module(*xs)
        n_finals = [len(r) for r in reduced_xs]
        reduced_xs = list(itertools.chain(*reduced_xs))
        removed_indices = list(itertools.chain(*removed_indices))
        reduced_xs = [xp.asarray(x) for x in reduced_xs]
        reduced_xs = convert_seq(reduced_xs, device=args.gpu, with_label=False)
        with chainer.using_config('train', False):
            # to_cpu handles both NumPy and CuPy arrays (asnumpy exists only on CuPy)
            ss_0 = chainer.backends.cuda.to_cpu(model.predict(xs, softmax=True))
            ss_1 = chainer.backends.cuda.to_cpu(model.predict(reduced_xs, softmax=True))
            ys_0 = np.argmax(ss_0, axis=1)
            ys_1 = np.argmax(ss_1, axis=1)

        start = 0
        for example_idx in range(len(xs)):
            oi = xs[example_idx].tolist()  # original input
            op = int(ys_0[example_idx])  # original prediction
            oos = ss_0[example_idx]  # original output distribution
            label = int(batch['ys'][example_idx])
            checkpoint.append([])
            for i in range(start, start + n_finals[example_idx]):
                ri = reduced_xs[i].tolist()
                rp = int(ys_1[i])
                rs = ss_1[i]
                rr = removed_indices[i]
                entry = {
                    'original_input': oi,
                    'reduced_input': ri,
                    'original_prediction': op,
                    'reduced_prediction': rp,
                    'original_scores': oos,
                    'reduced_scores': rs,
                    'removed_indices': rr,
                    'label': label
                }
                checkpoint[-1].append(entry)
    with open(os.path.join(args.out, 'rawr_dev.pkl'), 'wb') as f:
        pickle.dump(checkpoint, f)
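
A minimal inspection sketch (hypothetical; assumes `args.out` was 'result') for reading the checkpoint written above; the keys mirror the `entry` dict built in the loop:

import pickle

with open('result/rawr_dev.pkl', 'rb') as f:
    checkpoint = pickle.load(f)

first = checkpoint[0][0]  # first reduced candidate of the first test example
print(first['original_prediction'], first['reduced_prediction'], first['label'])
print('removed token indices:', first['removed_indices'])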