Example #1
0
def main():
    args = get_arguments()

    print("initializing device ... ", end="", file=sys.stderr, flush=True)
    dev = D.Naive() if args.gpu < 0 else D.CUDA(args.gpu)
    Device.set_default(dev)
    print("done.", file=sys.stderr)

    mode = args.mode
    prefix = args.model
    if mode == "train":
        encdec = EncoderDecoder(args.dropout)
        encdec.init(args.src_vocab, args.trg_vocab, args.embed, args.hidden)
        optimizer = O.Adam()
        optimizer.set_weight_decay(1e-6)
        optimizer.set_gradient_clipping(5)
        train(encdec, optimizer, args, 1e10)
    elif mode == "resume":
        print("loading model/optimizer ... ",
              end="",
              file=sys.stderr,
              flush=True)
        encdec = EncoderDecoder(args.dropout)
        encdec.load(prefix + ".model")
        optimizer = O.Adam()
        optimizer.load(prefix + ".optimizer")
        valid_ppl = load_ppl(prefix + ".valid_ppl")
        print("done.", file=sys.stderr)
        train(encdec, optimizer, args, valid_ppl)
    else:
        print("loading model ... ", end="", file=sys.stderr, flush=True)
        encdec = EncoderDecoder(args.dropout)
        encdec.load(prefix + ".model")
        print("done.", file=sys.stderr)
        test(encdec, args)
Example #2
0
def main():
    parser = ArgumentParser()
    parser.add_argument("mode", help="(train|resume|test)")
    parser.add_argument("model_prefix", help="prefix of the model files.")
    args = parser.parse_args()

    mode = args.mode
    prefix = args.model_prefix
    print("mode:", mode, file=sys.stderr)
    print("prefix:", prefix, file=sys.stderr)

    if mode not in ("train", "resume", "test"):
        print("unknown mode:", mode, file=sys.stderr)
        return

    print("initializing device ... ", end="", file=sys.stderr)
    sys.stderr.flush()
    dev = D.CUDA(0)
    Device.set_default(dev)
    print("done.", file=sys.stderr)

    if mode == "train":
        encdec = AttentionalEncoderDecoder()
        encdec.init(SRC_VOCAB_SIZE, TRG_VOCAB_SIZE, NUM_EMBED_UNITS,
                    NUM_HIDDEN_UNITS)
        optimizer = O.Adam()
        optimizer.set_weight_decay(1e-6)
        optimizer.set_gradient_clipping(5)
        train(encdec, optimizer, prefix, 1e10)
    elif mode == "resume":
        print("loading model/optimizer ... ", end="", file=sys.stderr)
        sys.stderr.flush()
        encdec = AttentionalEncoderDecoder()
        encdec.load(prefix + ".model")
        optimizer = O.Adam()
        optimizer.load(prefix + ".optimizer")
        valid_ppl = load_ppl(prefix + ".valid_ppl")
        print("done.", file=sys.stderr)
        train(encdec, optimizer, prefix, valid_ppl)
    else:
        print("loading model ... ", end="", file=sys.stderr)
        sys.stderr.flush()
        encdec = AttentionalEncoderDecoder()
        encdec.load(prefix + ".model")
        print("done.", file=sys.stderr)
        test(encdec)
Example #3
0
def main(config):
    mode = config['mode']
    if mode == 'preproc':
        preproc(config)
        return

    print('initializing device ...', end='', file=sys.stderr, flush=True)
    dev = D.Naive() if config['gpu'] < 0 else D.CUDA(config['gpu'])
    Device.set_default(dev)
    print("done.", file=sys.stderr, flush=True)

    prefix = config['model_prefix']
    if mode == 'train':
        model = Transformer(config['n_heads'], config['n_stacks'],
                            config['dropout'], config['generation_limit'])
        model.init(config['vocabulary_size'], config['d_model'],
                   config['d_ff'])
        optimizer = O.Adam(alpha=1, beta2=0.98, eps=1e-9)
        optimizer.set_gradient_clipping(5)
        train(model, optimizer, config, 1e10)
    elif mode == 'resume':
        print('loading model/optimizer ... ',
              end='',
              file=sys.stderr,
              flush=True)
        model = Transformer(config['n_heads'], config['n_stacks'],
                            config['dropout'], config['generation_limit'])
        model.load(prefix + '.model')
        optimizer = O.Adam(alpha=1, beta2=0.98, eps=1e-9)
        optimizer.set_gradient_clipping(5)
        optimizer.load(prefix + '.optimizer')
        with Path(prefix).with_suffix('.valid').open() as f:
            valid_ppl = float(f.read().strip())
        print('done.', file=sys.stderr, flush=True)
        train(model, optimizer, config, valid_ppl)
    elif mode == 'test':
        model = Transformer(config['n_heads'], config['n_stacks'],
                            config['dropout'], config['generation_limit'])
        model.load(prefix + '.model')
        test(model, config)
Example #4
0
def main():
    # Loads vocab.
    vocab = make_vocab("data/ptb.train.txt")
    print("#vocab:", len(vocab))  # maybe 10000
    eos_id = vocab["<s>"]

    # Loads all corpus.
    train_corpus = load_corpus("data/ptb.train.txt", vocab)
    valid_corpus = load_corpus("data/ptb.valid.txt", vocab)
    num_train_sents = len(train_corpus)
    num_valid_sents = len(valid_corpus)
    num_train_labels = count_labels(train_corpus)
    num_valid_labels = count_labels(valid_corpus)
    print("train:", num_train_sents, "sentences,", num_train_labels, "labels")
    print("valid:", num_valid_sents, "sentences,", num_valid_labels, "labels")

    # Device and computation graph.
    dev = D.CUDA(0)
    Device.set_default(dev)
    g = Graph()
    Graph.set_default(g)

    # Our LM.
    lm = RNNLM(len(vocab), eos_id)

    # Optimizer.
    optimizer = O.SGD(1)
    #optimizer.set_weight_decay(1e-6)
    optimizer.set_gradient_clipping(5)
    optimizer.add(lm)

    # Sentence IDs.
    train_ids = list(range(num_train_sents))
    valid_ids = list(range(num_valid_sents))

    best_valid_ppl = 1e10

    # Train/valid loop.
    for epoch in range(MAX_EPOCH):
        print("epoch", epoch + 1, "/", MAX_EPOCH, ":")
        # Shuffles train sentence IDs.
        random.shuffle(train_ids)

        # Training.
        train_loss = 0
        for ofs in range(0, num_train_sents, BATCH_SIZE):
            batch_ids = train_ids[ofs:min(ofs + BATCH_SIZE, num_train_sents)]
            batch = make_batch(train_corpus, batch_ids, eos_id)

            g.clear()

            outputs = lm.forward(batch, True)
            loss = lm.loss(outputs, batch)
            train_loss += loss.to_float() * len(batch_ids)

            optimizer.reset_gradients()
            loss.backward()
            optimizer.update()

            print("%d" % ofs, end="\r")
            sys.stdout.flush()

        train_ppl = math.exp(train_loss / num_train_labels)
        print("  train ppl =", train_ppl)

        # Validation.
        valid_loss = 0
        for ofs in range(0, num_valid_sents, BATCH_SIZE):
            batch_ids = valid_ids[ofs:min(ofs + BATCH_SIZE, num_valid_sents)]
            batch = make_batch(valid_corpus, batch_ids, eos_id)

            g.clear()

            outputs = lm.forward(batch, False)
            loss = lm.loss(outputs, batch)
            valid_loss += loss.to_float() * len(batch_ids)
            print("%d" % ofs, end="\r")
            sys.stdout.flush()

        valid_ppl = math.exp(valid_loss / num_valid_labels)
        print("  valid ppl =", valid_ppl)

        if valid_ppl < best_valid_ppl:
            best_valid_ppl = valid_ppl
            print("  BEST")
        else:
            old_lr = optimizer.get_learning_rate_scaling()
            new_lr = 0.5 * old_lr
            optimizer.set_learning_rate_scaling(new_lr)
            print("  learning rate scaled:", old_lr, "->", new_lr)
Example #5
0
def main():
    # Loads data
    train_inputs = load_images("data/train-images-idx3-ubyte", NUM_TRAIN_SAMPLES)
    train_labels = load_labels("data/train-labels-idx1-ubyte", NUM_TRAIN_SAMPLES)
    test_inputs = load_images("data/t10k-images-idx3-ubyte", NUM_TEST_SAMPLES)
    test_labels = load_labels("data/t10k-labels-idx1-ubyte", NUM_TEST_SAMPLES)

    # Initializes 2 device objects which manage different GPUs.
    dev0 = D.CUDA(0)
    dev1 = D.CUDA(1)

    # Parameters on GPU 0.
    pw1 = Parameter([NUM_HIDDEN_UNITS, NUM_INPUT_UNITS], I.XavierUniform(), dev0)
    pb1 = Parameter([NUM_HIDDEN_UNITS], I.Constant(0), dev0)

    # Parameters on GPU 1.
    pw2 = Parameter([NUM_OUTPUT_UNITS, NUM_HIDDEN_UNITS], I.XavierUniform(), dev1)
    pb2 = Parameter([NUM_OUTPUT_UNITS], I.Constant(0), dev1)

    trainer = T.SGD(.1)
    trainer.add_parameter(pw1)
    trainer.add_parameter(pb1)
    trainer.add_parameter(pw2)
    trainer.add_parameter(pb2)

    def make_graph(inputs):
        # We first store input values explicitly on GPU 0.
        x = F.input(inputs, device=dev0)
        w1 = F.parameter(pw1)
        b1 = F.parameter(pb1)
        w2 = F.parameter(pw2)
        b2 = F.parameter(pb2)
        # The hidden layer is calculated and implicitly stored on GPU 0.
        h_on_gpu0 = F.relu(w1 @ x + b1)
        # `copy()` transfers the hiddne layer to GPU 1.
        h_on_gpu1 = F.copy(h_on_gpu0, dev1)
        # The output layer is calculated and implicitly stored on GPU 1.
        return w2 @ h_on_gpu1 + b2

    ids = list(range(NUM_TRAIN_SAMPLES))

    g = Graph()
    Graph.set_default(g)

    for epoch in range(MAX_EPOCH):
        random.shuffle(ids)

        # Training loop
        for batch in range(NUM_TRAIN_BATCHES):
            print("\rTraining... %d / %d" % (batch + 1, NUM_TRAIN_BATCHES), end="")
            inputs = [train_inputs[ids[batch * BATCH_SIZE + i]] for i in range(BATCH_SIZE)]
            labels = [train_labels[ids[batch * BATCH_SIZE + i]] for i in range(BATCH_SIZE)]

            g.clear()

            y = make_graph(inputs)
            loss = F.softmax_cross_entropy(y, labels, 0)
            avg_loss = F.batch.mean(loss)

            trainer.reset_gradients()
            avg_loss.backward()
            trainer.update()

        print()

        match = 0

        # Test loop
        for batch in range(NUM_TEST_BATCHES):
            print("\rTesting... %d / %d" % (batch + 1, NUM_TEST_BATCHES), end="")
            inputs = [test_inputs[batch * BATCH_SIZE + i] for i in range(BATCH_SIZE)]

            g.clear()

            y = make_graph(inputs)
            y_val = y.to_list()
            for i in range(BATCH_SIZE):
                maxval = -1e10
                argmax = -1
                for j in range(NUM_OUTPUT_UNITS):
                    v = y_val[j + i * NUM_OUTPUT_UNITS]
                    if (v > maxval):
                        maxval = v
                        argmax = j
                if argmax == test_labels[i + batch * BATCH_SIZE]:
                    match += 1

        accuracy = 100.0 * match / NUM_TEST_SAMPLES
        print("\nepoch %d: accuracy: %.2f%%\n" % (epoch, accuracy))
Example #6
0
def main():
    # Loads data
    train_inputs = load_images("data/train-images-idx3-ubyte",
                               NUM_TRAIN_SAMPLES)
    train_labels = load_labels("data/train-labels-idx1-ubyte",
                               NUM_TRAIN_SAMPLES)
    test_inputs = load_images("data/t10k-images-idx3-ubyte", NUM_TEST_SAMPLES)
    test_labels = load_labels("data/t10k-labels-idx1-ubyte", NUM_TEST_SAMPLES)

    dev = D.CUDA(0)
    Device.set_default(dev)
    g = Graph()
    Graph.set_default(g)

    # Parameters of CNNs
    # Shape: {kernel_height, kernel_width, in_channels, out_channels}
    pw_cnn1 = Parameter(Shape([KERNEL_SIZE1, KERNEL_SIZE1, 1, NUM_CHANNELS1]),
                        I.XavierUniformConv2D())
    pw_cnn2 = Parameter(
        Shape([KERNEL_SIZE2, KERNEL_SIZE2, NUM_CHANNELS1, NUM_CHANNELS2]),
        I.XavierUniformConv2D())

    # Parameters of FC layers
    pw_fc1 = Parameter(Shape([NUM_HIDDEN_UNITS, NUM_INPUT_UNITS]),
                       I.XavierUniform())
    pw_fc2 = Parameter(Shape([NUM_OUTPUT_UNITS, NUM_HIDDEN_UNITS]),
                       I.XavierUniform())
    pb_fc1 = Parameter(Shape([NUM_HIDDEN_UNITS]), I.Constant(0))
    pb_fc2 = Parameter(Shape([NUM_OUTPUT_UNITS]), I.Constant(0))

    # Optimizer
    optimizer = O.SGD(.1)
    optimizer.add(pw_cnn1, pw_cnn2, pw_fc1, pw_fc2, pb_fc1, pb_fc2)

    # Helper lambda to construct the predictor network.
    def make_graph(inputs, train):
        # Input and parameters.
        #x = F.input(Shape([IMAGE_HEIGHT, IMAGE_WIDTH], BATCH_SIZE), inputs)
        x = F.input(inputs)
        w_cnn1 = F.parameter(pw_cnn1)
        w_cnn2 = F.parameter(pw_cnn2)
        w_fc1 = F.parameter(pw_fc1)
        w_fc2 = F.parameter(pw_fc2)
        b_fc1 = F.parameter(pb_fc1)
        b_fc2 = F.parameter(pb_fc2)
        # CNNs
        h_cnn1 = F.relu(F.conv2d(x, w_cnn1, PADDING1, PADDING1, 1, 1, 1, 1))
        h_pool1 = F.max_pool2d(h_cnn1, 2, 2, 0, 0, 2, 2)
        h_cnn2 = F.relu(
            F.conv2d(h_pool1, w_cnn2, PADDING2, PADDING2, 1, 1, 1, 1))
        h_pool2 = F.max_pool2d(h_cnn2, 2, 2, 0, 0, 2, 2)
        # FC layers
        x_fc = F.dropout(F.flatten(h_pool2), .5, train)
        h_fc = F.dropout(F.relu(F.matmul(w_fc1, x_fc) + b_fc1), .5, train)
        return F.matmul(w_fc2, h_fc) + b_fc2

    # Batch randomizer
    ids = list(range(NUM_TRAIN_SAMPLES))

    for epoch in range(MAX_EPOCH):
        # Shuffles sample IDs.
        random.shuffle(ids)

        # Training loop
        for batch in range(NUM_TRAIN_BATCHES):
            print("\rTraining... %d / %d" % (batch + 1, NUM_TRAIN_BATCHES),
                  end="")
            # Makes a minibatch for training.
            inputs = [
                train_inputs[ids[batch * BATCH_SIZE + i]]
                for i in range(BATCH_SIZE)
            ]
            labels = [
                train_labels[ids[batch * BATCH_SIZE + i]]
                for i in range(BATCH_SIZE)
            ]

            # Constructs the graph.
            g.clear()
            y = make_graph(inputs, True)
            loss = F.softmax_cross_entropy(y, labels, 0)
            avg_loss = F.batch.mean(loss)

            # Dump computation graph at the first time.
            # if epoch == 0 and batch == 0:
            #     print(g.dump("dot"))

            # Implicit forward, backward, and updates parameters.
            optimizer.reset_gradients()
            avg_loss.backward()
            optimizer.update()

        print()

        match = 0

        # Test loop
        for batch in range(NUM_TEST_BATCHES):
            print("\rTesting... %d / %d" % (batch + 1, NUM_TEST_BATCHES),
                  end="")
            # Makes a test minibatch.
            inputs = [
                test_inputs[batch * BATCH_SIZE + i] for i in range(BATCH_SIZE)
            ]

            # Constructs the graph.
            g.clear()
            y = make_graph(inputs, False)

            # Gets outputs, argmax, and compares them with the label.
            y_val = y.to_list()
            for i in range(BATCH_SIZE):
                maxval = -1e10
                argmax = -1
                for j in range(NUM_OUTPUT_UNITS):
                    v = y_val[j + i * NUM_OUTPUT_UNITS]
                    if v > maxval:
                        maxval = v
                        argmax = j

                if argmax == test_labels[i + batch * BATCH_SIZE]:
                    match += 1

        accuracy = 100.0 * match / NUM_TEST_SAMPLES
        print("epoch %d: accuracy: %.2f%%" % (epoch, accuracy))

    return 0
Example #7
0
def main():
    # Loads vocab.
    vocab = make_vocab("data/ptb.train.txt")
    print("#vocab:", len(vocab))  # maybe 10000
    eos_id = vocab["<s>"]

    # Loads all corpus.
    train_corpus = load_corpus("data/ptb.train.txt", vocab)
    valid_corpus = load_corpus("data/ptb.valid.txt", vocab)
    num_train_sents = len(train_corpus)
    num_valid_sents = len(valid_corpus)
    num_train_labels = count_labels(train_corpus)
    num_valid_labels = count_labels(valid_corpus)
    print("train:", num_train_sents, "sentences,", num_train_labels, "labels")
    print("valid:", num_valid_sents, "sentences,", num_valid_labels, "labels")

    dev = D.CUDA(0)
    Device.set_default(dev)

    # Trainer.
    trainer = T.Adam()
    trainer.set_weight_decay(1e-6)
    trainer.set_gradient_clipping(5)

    # Our LM.
    lm = RNNLM(len(vocab), eos_id, trainer)

    # Sentence IDs.
    train_ids = list(range(num_train_sents))
    valid_ids = list(range(num_valid_sents))

    g = Graph()
    Graph.set_default(g)

    # Train/valid loop.
    for epoch in range(MAX_EPOCH):
        print("epoch", (epoch + 1), "/", MAX_EPOCH, ":")
        # Shuffles train sentence IDs.
        random.shuffle(train_ids)

        # Training.
        train_loss = 0
        for ofs in range(0, num_train_sents, BATCH_SIZE):
            batch_ids = train_ids[ofs:min(ofs + BATCH_SIZE, num_train_sents)]
            batch = make_batch(train_corpus, batch_ids, eos_id)

            g.clear()

            outputs = lm.forward(batch)
            loss = lm.forward_loss(outputs, batch)
            train_loss += loss.to_float() * len(batch_ids)

            trainer.reset_gradients()
            loss.backward()
            trainer.update()

            print("\r%d" % ofs, end="")
            sys.stdout.flush()

        print()

        train_ppl = math.exp(train_loss / num_train_labels)
        print("  train ppl =", train_ppl)

        # Validation.
        valid_loss = 0
        for ofs in range(0, num_valid_sents, BATCH_SIZE):
            batch_ids = valid_ids[ofs:min(ofs + BATCH_SIZE, num_valid_sents)]
            batch = make_batch(valid_corpus, batch_ids, eos_id)

            g.clear()

            outputs = lm.forward(batch)
            loss = lm.forward_loss(outputs, batch)
            valid_loss += loss.to_float() * len(batch_ids)
            print("\r%d" % ofs, end="")
            sys.stdout.flush()

        print()

        valid_ppl = math.exp(valid_loss / num_valid_labels)
        print("  valid ppl =", valid_ppl)