Example #1
def main(loc,
         width=64,
         depth=2,
         batch_size=128,
         dropout=0.5,
         dropout_decay=1e-5,
         nb_epoch=20):
    print("Load spaCy")
    nlp = spacy.load('en',
                     parser=False,
                     entity=False,
                     matcher=False,
                     tagger=False)
    print("Construct model")
    Model.ops = CupyOps()
    with Model.define_operators({'>>': chain, '**': clone, '|': concatenate}):
        mwe_encode = ExtractWindow(nW=1) >> Maxout(width, width * 3)
        sent2vec = (
            get_word_ids
            >> flatten_add_lengths
            >> with_getitem(0, SpacyVectors(nlp, width) >> mwe_encode ** depth)
            >> Pooling(mean_pool, max_pool)
        )
        model = (
            ((Arg(0) >> sent2vec) | (Arg(1) >> sent2vec))
            >> Maxout(width, width * 4)
            >> Maxout(width, width) ** depth
            >> Softmax(2, width)
        )

    print("Read and parse quora data")
    rows = read_quora_tsv_data(pathlib.Path(loc))
    train, dev = partition(rows, 0.9)
    train_X, train_y = create_data(model.ops, nlp, train)
    dev_X, dev_y = create_data(model.ops, nlp, dev)
    print("Train")
    with model.begin_training(train_X[:20000],
                              train_y[:20000]) as (trainer, optimizer):
        trainer.batch_size = batch_size
        trainer.nb_epoch = nb_epoch
        trainer.dropout = dropout
        trainer.dropout_decay = dropout_decay

        epoch_times = [timer()]
        epoch_loss = [0.]
        n_train_words = sum(len(d0) + len(d1) for d0, d1 in train_X)
        n_dev_words = sum(len(d0) + len(d1) for d0, d1 in dev_X)

        def track_progress():
            stats = get_stats(model, optimizer.averages, dev_X, dev_y,
                              epoch_loss[-1], epoch_times[-1], n_train_words,
                              n_dev_words)
            stats.append(trainer.dropout)
            stats = tuple(stats)
            print(
                len(epoch_loss),
                "%.3f loss, %.3f (%.3f) acc, %d/%d=%d wps train, %d/%.3f=%d wps run. d.o.=%.3f"
                % stats)
            epoch_times.append(timer())
            epoch_loss.append(0.)

        trainer.each_epoch.append(track_progress)
        for X, y in trainer.iterate(train_X, train_y):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            backprop(yh - y, optimizer)
Example #2
def main(dataset='quora', width=64, depth=2, min_batch_size=1,
         max_batch_size=128, dropout=0.0, dropout_decay=0.0, pooling="mean+max",
         nb_epoch=20, pieces=3, use_gpu=False, out_loc=None, quiet=False):
    cfg = dict(locals())
    if out_loc:
        out_loc = Path(out_loc)
        if not out_loc.parent.exists():
            raise IOError("Can't open output location: %s" % out_loc)
    print(cfg)
    if pooling == 'mean+max':
        pool_layer = Pooling(mean_pool, max_pool)
    elif pooling == "mean":
        pool_layer = mean_pool
    elif pooling == "max":
        pool_layer = max_pool
    else:
        raise ValueError("Unrecognised pooling", pooling)


    print("Load spaCy")
    nlp = get_spacy('en')

    #if use_gpu:
    #    Model.ops = CupyOps()

    print("Construct model")
    # Bind operators for the scope of the block:
    # * chain (>>): Compose models in a 'feed forward' style,
    # i.e. chain(f, g)(x) -> g(f(x))
    # * clone (**): Create n copies of a model, and chain them, i.e.
    # (f ** 3)(x) -> f''(f'(f(x))), where f, f' and f'' have distinct weights.
    # * concatenate (|): Merge the outputs of two models into a single vector,
    # i.e. (f|g)(x) -> hstack(f(x), g(x))
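    # * add (+): Sum the outputs of two models element-wise,
    # i.e. (f + g)(x) -> f(x) + g(x) (bound here for the commented-out Embed mix-in below).
    # As a rough illustration with hypothetical models f and g, a residual-style
    # block could be spelled (f >> g) + f, i.e. x -> g(f(x)) + f(x).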
    with Model.define_operators({'>>': chain, '**': clone, '|': concatenate,
                                 '+': add}):
        mwe_encode = ExtractWindow(nW=1) >> Maxout(width, width*3, pieces=pieces)

        embed = StaticVectors('en', width)  # + Embed(width, width*2, 5000)
        # Comments indicate the output type and shape at each step of the pipeline.
        # * B: Number of sentences in the batch
        # * T: Total number of words in the batch
        # (i.e. sum(len(sent) for sent in batch))
        # * W: Width of the network (input hyper-parameter)
        # * ids: ID for each word (integers).
        # * lengths: Number of words in each sentence in the batch (integers)
        # * floats: Standard dense vector.
        # (Dimensions annotated in curly braces.)
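        # For example, a batch of two docs of 3 and 5 tokens gives B=2, T=8 and
        # lengths=[3, 5]; after the embedding the data is floats{8, W}, and after
        # the default mean+max pooling it is floats{2, 2*W}.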
        sent2vec = ( # List[spacy.token.Doc]{B}
            flatten_add_lengths  # : (ids{T}, lengths{B})
            >> with_getitem(0,      # : word_ids{T}
                 embed
                 >> mwe_encode ** depth
            ) # : (floats{T, W}, lengths{B})
            >> pool_layer
            >> Maxout(width, pieces=pieces)
            >> Maxout(width, pieces=pieces)
        )
        model = (
            ((Arg(0) >> sent2vec) | (Arg(1) >> sent2vec))
            >> Maxout(width, pieces=pieces)
            >> Maxout(width, pieces=pieces)
            >> Softmax(2)
        )

    print("Read and parse data: %s" % dataset)
    if dataset == 'quora':
        train, dev = datasets.quora_questions()
    elif dataset == 'snli':
        train, dev = datasets.snli()
    elif dataset == 'stackxc':
        train, dev = datasets.stack_exchange()
    elif dataset in ('quora+snli', 'snli+quora'):
        train, dev = datasets.quora_questions()
        train2, dev2 = datasets.snli()
        train.extend(train2)
        dev.extend(dev2)
    else:
        raise ValueError("Unknown dataset: %s" % dataset)
    get_ids = get_word_ids(Model.ops)
    train_X, train_y = preprocess(model.ops, nlp, train, get_ids)
    dev_X, dev_y = preprocess(model.ops, nlp, dev, get_ids)

    print("Initialize with data (LSUV)")
    print(dev_y.shape)
    with model.begin_training(train_X[:5000], train_y[:5000], **cfg) as (trainer, optimizer):
        # Pass a callback to print progress. Give it all the local scope,
        # because why not?
        trainer.each_epoch.append(track_progress(**locals()))
        trainer.batch_size = min_batch_size
        batch_size = float(min_batch_size)
        print("Accuracy before training", model.evaluate(dev_X, dev_y))
        print("Train")
        global epoch_train_acc
        for X, y in trainer.iterate(train_X, train_y, progress_bar=not quiet):
            # Slightly useful trick: Decay the dropout as training proceeds.
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            assert yh.shape == y.shape, (yh.shape, y.shape)
            # No auto-diff: Just get a callback and pass the data through.
            # Hardly a hardship, and it means we don't have to create/maintain
            # a computational graph. We just use closures.
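            # The pattern each layer follows is roughly (a hypothetical sketch):
            #     def begin_update(X, drop=0.0):
            #         Y = forward(X)
            #         def finish_update(dY, optimizer):
            #             # compute dX, accumulate weight gradients, and let the
            #             # optimizer apply them
            #             return dX
            #         return Y, finish_update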

            assert (yh >= 0.).all()
            train_acc = (yh.argmax(axis=1) == y.argmax(axis=1)).sum()
            epoch_train_acc += train_acc

            backprop(yh-y, optimizer)

            # Slightly useful trick: start with low batch size, accelerate.
            trainer.batch_size = min(int(batch_size), max_batch_size)
            batch_size *= 1.001
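            # Growing by a factor of 1.001 per batch doubles the effective batch
            # size roughly every 700 batches (1.001 ** 693 ≈ 2), until it is
            # capped at max_batch_size.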
        if out_loc:
            out_loc = Path(out_loc)
            print('Saving to', out_loc)
            with out_loc.open('wb') as file_:
                pickle.dump(model, file_, -1)
Example #3
def main(loc=None,
         width=128,
         depth=2,
         max_batch_size=128,
         dropout=0.5,
         dropout_decay=1e-5,
         nb_epoch=30,
         use_gpu=False):
    cfg = dict(locals())

    print("Load spaCy")
    nlp = spacy.load('en',
                     parser=False,
                     entity=False,
                     matcher=False,
                     tagger=False)

    if use_gpu:
        Model.ops = CupyOps()

    print("Construct model")
    # Bind operators for the scope of the block:
    # * chain (>>): Compose models in a 'feed forward' style,
    # i.e. chain(f, g)(x) -> g(f(x))
    # * clone (**): Create n copies of a model, and chain them, i.e.
    # (f ** 3)(x) -> f''(f'(f(x))), where f, f' and f'' have distinct weights.
    # * concatenate (|): Merge the outputs of two models into a single vector,
    # i.e. (f|g)(x) -> hstack(f(x), g(x))
    with Model.define_operators({'>>': chain, '**': clone, '|': concatenate}):
        # Important trick: text isn't like images, and the best way to use
        # convolution is different. Don't use pooling-over-time. Instead,
        # use the window to compute one vector per word, and do this N deep.
        # In the first layer, we adjust each word vector based on the two
        # surrounding words --- this gives us essentially trigram vectors.
        # In the next layer, we have a trigram of trigrams --- so we're
        # conditioning on information from a five word slice. The third layer
        # gives us 7 words. This is like the BiLSTM insight: we're not trying
        # to learn a vector for the whole sentence in this step. We're just
        # trying to learn better, position-sensitive word features. This simple
        # convolution step is much more efficient than BiLSTM, and can be
        # computed in parallel for every token in the batch.
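        # Concretely, for the encoder defined on the next line (shapes only):
        #     ExtractWindow(nW=1) : floats{T, W} -> floats{T, 3*W}
        #         (row i is the concatenation of the vectors for words i-1, i, i+1)
        #     Maxout(width, width * 3) : floats{T, 3*W} -> floats{T, W}
        # Cloning this block `depth` times widens the context window by two words
        # per layer: 3 -> 5 -> 7 ...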
        mwe_encode = ExtractWindow(nW=1) >> Maxout(width, width * 3)
        # Comments indicate the output type and shape at each step of the pipeline.
        # * B: Number of sentences in the batch
        # * T: Total number of words in the batch
        # (i.e. sum(len(sent) for sent in batch))
        # * W: Width of the network (input hyper-parameter)
        # * ids: ID for each word (integers).
        # * lengths: Number of words in each sentence in the batch (integers)
        # * floats: Standard dense vector.
        # (Dimensions annotated in curly braces.)
        sent2vec = (  # List[spacy.token.Doc]{B}
            #get_word_ids            # : List[ids]{B}
            flatten_add_lengths  # : (ids{T}, lengths{B})
            >> with_getitem(
                0,  # : word_ids{T}
                # This class integrates a linear projection layer, and loads
                # static embeddings (by default, GloVe common crawl).
                SpacyVectors(nlp, width)  # : floats{T, W}
                >> mwe_encode**depth  # : floats{T, W}
            )  # : (floats{T, W}, lengths{B})
            # Useful trick: Why choose between max pool and mean pool?
            # We may as well have both representations.
            >> Pooling(mean_pool, max_pool)  # : floats{B, 2*W}
        )
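        # For the Pooling step above: with lengths=[3, 5] and floats{8, W} in, the
        # mean and max are taken per sentence and concatenated, giving floats{2, 2*W}.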
        model = (
            ((Arg(0) >> sent2vec) | (Arg(1) >> sent2vec))  # : floats{B, 4*W}
            >> Maxout(width, width * 4)  # : floats{B, W}
            >> Maxout(width, width) ** depth  # : floats{B, W}
            >> Softmax(3, width)  # : floats{B, 3}
        )

    print("Read and parse SNLI data")
    train, dev = datasets.snli(loc)
    train_X, train_y = preprocess(model.ops, nlp, train)
    dev_X, dev_y = preprocess(model.ops, nlp, dev)
    assert len(dev_y.shape) == 2
    print("Initialize with data (LSUV)")
    with model.begin_training(train_X[:10000], train_y[:10000],
                              **cfg) as (trainer, optimizer):
        # Pass a callback to print progress. Give it all the local scope,
        # because why not?
        trainer.each_epoch.append(track_progress(**locals()))
        trainer.batch_size = 1
        batch_size = 1.
        print("Accuracy before training", model.evaluate(dev_X, dev_y))
        print("Train")
        global epoch_train_acc
        for X, y in trainer.iterate(train_X, train_y):
            # Slightly useful trick: Decay the dropout as training proceeds.
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            # No auto-diff: Just get a callback and pass the data through.
            # Hardly a hardship, and it means we don't have to create/maintain
            # a computational graph. We just use closures.
            backprop(yh - y, optimizer)

            epoch_train_acc += (yh.argmax(axis=1) == y.argmax(axis=1)).sum()

            # Slightly useful trick: start with low batch size, accelerate.
            trainer.batch_size = min(int(batch_size), max_batch_size)
            batch_size *= 1.001