Beispiel #1
0
def build_model(nr_class, width, **kwargs):
    """Assemble a mean-pooled hash-embedding classifier.

    The pipeline extracts the ORTH attribute, flattens the batch while
    keeping the lengths, embeds column 0 with ``HashEmbed(width, 10000)``
    (wrapped in ``uniqued``), mean-pools and ends in a ``Softmax`` layer
    with ``nr_class`` outputs.

    nr_class: size passed to the final ``Softmax`` layer.
    width: size passed to ``HashEmbed``.
    **kwargs: accepted but not read by this function.

    Returns the composed model, with ``lsuv`` set to ``False``.
    """
    operators = {'|': concatenate, '>>': chain, '**': clone}
    with Model.define_operators(operators):
        # Built one stage at a time, in the same order as a single
        # left-to-right ``>>`` expression would construct the layers.
        pipeline = FeatureExtracter([ORTH]) >> flatten_add_lengths
        pipeline = pipeline >> with_getitem(
            0, uniqued(HashEmbed(width, 10000, column=0)))
        pipeline = pipeline >> Pooling(mean_pool)
        model = pipeline >> Softmax(nr_class)
    model.lsuv = False
    return model
Beispiel #2
0
def main(use_gpu=False, nb_epoch=100):
    """Train a classifier on the IMDB data and report dev-set results.

    Steps, as performed below:
      1. Optionally switch ``Model`` to ``CupyOps``.
      2. Load the data with ``datasets.imdb(limit=2000)`` and one-hot the
         labels with ``to_categorical(..., nb_classes=2)``.
      3. Run the texts through spaCy (with a ``sentencizer`` added first)
         and apply a ``FeatureExtracter`` to each document's sentences.
      4. Hold out the last 1000 training examples as a development set.
      5. Train with a squared-error loss, printing progress every epoch,
         and finally print the dev score under the averaged parameters.

    use_gpu: when true, ``Model.ops`` / ``Model.Ops`` are set to CupyOps.
    nb_epoch: assigned to ``trainer.nb_epoch``.
    """
    if use_gpu:
        Model.ops = CupyOps()
        Model.Ops = CupyOps

    # Announce each phase *before* it starts so the message is useful while
    # the (potentially slow) step is running.
    print("Load data")
    train, test = datasets.imdb(limit=2000)
    train_X, train_y = zip(*train)
    test_X, test_y = zip(*test)
    train_y = Model.ops.asarray(to_categorical(train_y, nb_classes=2))
    test_y = Model.ops.asarray(to_categorical(test_y, nb_classes=2))

    print("Parse data")
    nlp = spacy.load('en_vectors_web_lg')
    nlp.add_pipe(nlp.create_pipe('sentencizer'), first=True)

    preprocessor = FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID])
    train_X = [preprocessor(list(doc.sents)) for doc in tqdm.tqdm(nlp.pipe(train_X))]
    # NOTE(review): the test split is preprocessed here but is not evaluated
    # anywhere below; kept so the function's observable work is unchanged.
    test_X = [preprocessor(list(doc.sents)) for doc in tqdm.tqdm(nlp.pipe(test_X))]

    # The last 1000 training examples become the development set.
    dev_X = train_X[-1000:]
    dev_y = train_y[-1000:]
    train_X = train_X[:-1000]
    train_y = train_y[:-1000]
    n_sent = sum(len(list(sents)) for sents in train_X)
    print("%d sentences" % n_sent)

    model = build_model(2, width=128, conv_depth=2, depth=2,
                        train_X=train_X, train_y=train_y)
    with model.begin_training(train_X[:100], train_y[:100]) as (trainer, optimizer):
        # One running total per epoch; the callback below opens a new slot
        # at the end of every epoch. Both lists are created before the
        # callback is defined so it can never observe a missing name.
        epoch_loss = [0.]
        epoch_var = [0.]

        def report_progress():
            """Print the current epoch's stats and start the next epoch's totals."""
            with model.use_params(optimizer.averages):
                print(epoch_loss[-1], epoch_var[-1], model.evaluate(dev_X, dev_y), trainer.dropout)
            epoch_loss.append(0.)
            epoch_var.append(0.)

        trainer.each_epoch.append(report_progress)
        batch_sizes = compounding(64, 64, 1.01)
        trainer.dropout = 0.3
        trainer.batch_size = int(next(batch_sizes))
        trainer.dropout_decay = 0.0
        trainer.nb_epoch = nb_epoch
        # Optimizer hyper-parameters (alpha, max_grad_norm, b1, b2) are not
        # overridden here.
        for X, y in trainer.iterate(train_X, train_y):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            diff = yh - y
            losses = (diff ** 2.).sum(axis=1) / y.shape[0]
            epoch_var[-1] += losses.var()
            loss = losses.mean()
            backprop(diff / yh.shape[0], optimizer)
            epoch_loss[-1] += loss
            trainer.batch_size = int(next(batch_sizes))
        with model.use_params(optimizer.averages):
            print('Avg dev.: %.3f' % model.evaluate(dev_X, dev_y))
Beispiel #3
0
def build_text_classifier(nr_class, width=64, **cfg):
    """Compose the text-classification model.

    nr_class: number of output classes.
    width: hidden width used throughout the network (default 64).
    **cfg: optional settings read here:
        ``nr_vector`` (default 5000), passed to each ``HashEmbed``;
        ``pretrained_dims`` (default 0), input size of the ``Affine``
        layers applied after ``SpacyVectors``;
        ``low_data``, which together with a truthy ``pretrained_dims``
        selects a smaller model built on ``SpacyVectors`` only.

    Returns the composed model. On the default path ``model.nO`` is set to
    ``nr_class`` and ``model.lsuv`` to ``False``; the low-data path returns
    its model without setting either attribute.
    """
    nr_vector = cfg.get('nr_vector', 5000)
    pretrained_dims = cfg.get('pretrained_dims', 0)
    operators = {
        '>>': chain,
        '+': add,
        '|': concatenate,
        '**': clone,
    }
    with Model.define_operators(operators):
        if cfg.get('low_data') and pretrained_dims:
            # Small model: static vectors -> attention -> sum pooling ->
            # two residual ReLu layers -> zero-initialised output.
            low = SpacyVectors >> flatten_add_lengths
            low = low >> with_getitem(0, Affine(width, pretrained_dims))
            low = low >> ParametricAttention(width)
            low = low >> Pooling(sum_pool)
            low = low >> Residual(ReLu(width, width))**2
            low = low >> zero_init(Affine(nr_class, width, drop_factor=0.0))
            return low >> logistic

        half_width = width // 2
        lower = HashEmbed(width, nr_vector, column=1)
        prefix = HashEmbed(half_width, nr_vector, column=2)
        suffix = HashEmbed(half_width, nr_vector, column=3)
        shape = HashEmbed(half_width, nr_vector, column=4)

        # Learned embeddings: the four hashed features are concatenated and
        # mixed down to `width` by a layer-normalised Maxout.
        extract = FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID])
        hashed = lower | prefix | suffix | shape
        mixed = uniqued(
            hashed >> LN(Maxout(width, width + half_width * 3)),
            column=0)
        trained_vectors = extract >> with_flatten(mixed)

        vectors = trained_vectors
        vectors_width = width
        if pretrained_dims:
            static_vectors = SpacyVectors >> with_flatten(
                Affine(width, pretrained_dims))
            # TODO Make concatenate support lists
            vectors = concatenate_lists(trained_vectors, static_vectors)
            vectors_width = width * 2

        # CNN branch, assembled stage by stage.
        projection = LN(Maxout(width, vectors_width))
        conv_block = Residual(
            ExtractWindow(nW=1) >> LN(Maxout(width, width * 3)))
        cnn_model = vectors >> with_flatten(projection >> conv_block**2, pad=2)
        cnn_model = cnn_model >> flatten_add_lengths
        cnn_model = cnn_model >> ParametricAttention(width)
        cnn_model = cnn_model >> Pooling(sum_pool)
        cnn_model = cnn_model >> Residual(zero_init(Maxout(width, width)))
        cnn_model = cnn_model >> zero_init(
            Affine(nr_class, width, drop_factor=0.0))

        linear_model = _preprocess_doc >> LinearModel(nr_class, drop_factor=0.)

        # Both branches emit nr_class values, hence nr_class * 2 inputs.
        combined = linear_model | cnn_model
        output = zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0))
        model = combined >> output >> logistic
    model.nO = nr_class
    model.lsuv = False
    return model
Beispiel #4
0
def Tok2Vec(width, embed_size, **kwargs):
    """Build the token-to-vector encoder.

    width: output width of every embedding, Maxout and of the result
        (stored on the returned model as ``nO``).
    embed_size: table size for the NORM ``HashEmbed``; the PREFIX, SUFFIX
        and SHAPE tables use ``embed_size // 2``.
    **kwargs: optional settings read here:
        ``pretrained_vectors`` (default None), when given a
        ``StaticVectors`` layer is concatenated in front of the hashed
        features;
        ``cnn_maxout_pieces`` (default 2), ``pieces`` of the
        convolutional Maxout.

    Returns the composed model with ``nO`` and ``embed`` attributes set.
    """
    pretrained_vectors = kwargs.get('pretrained_vectors')
    cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 2)
    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
    # Column positions of each attribute inside the extracted feature array.
    norm_col = cols.index(NORM)
    prefix_col = cols.index(PREFIX)
    suffix_col = cols.index(SUFFIX)
    shape_col = cols.index(SHAPE)
    operators = {
        '>>': chain,
        '|': concatenate,
        '**': clone,
        '+': add,
        '*': reapply,
    }
    with Model.define_operators(operators):
        subword_size = embed_size // 2
        norm = HashEmbed(width, embed_size, column=norm_col,
                         name='embed_norm')
        prefix = HashEmbed(width, subword_size, column=prefix_col,
                           name='embed_prefix')
        suffix = HashEmbed(width, subword_size, column=suffix_col,
                           name='embed_suffix')
        shape = HashEmbed(width, subword_size, column=shape_col,
                          name='embed_shape')

        if pretrained_vectors is None:
            # Four hashed features of `width` each.
            features = norm | prefix | suffix | shape
            mixer = LN(Maxout(width, width * 4, pieces=3))
        else:
            # Static vectors add a fifth `width`-sized input.
            glove = StaticVectors(pretrained_vectors, width,
                                  column=cols.index(ID))
            features = glove | norm | prefix | suffix | shape
            mixer = LN(Maxout(width, width * 5, pieces=3))
        embed = uniqued(features >> mixer, column=cols.index(ORTH))

        conv_body = ExtractWindow(nW=1) >> LN(
            Maxout(width, width * 3, pieces=cnn_maxout_pieces))
        convolution = Residual(conv_body)

        extract = FeatureExtracter(cols)
        tok2vec = extract >> with_flatten(embed >> convolution**4, pad=4)
        # Work around thinc API limitations :(. TODO: Revise in Thinc 7
        tok2vec.nO = width
        tok2vec.embed = embed
    return tok2vec
Beispiel #5
0
def build_model(nr_class, width, depth, conv_depth, **kwargs):
    """Assemble a two-level (sentence, then document) attention classifier.

    Each sentence is encoded by ``sent2vec``: hashed embeddings of columns
    1-4, ``conv_depth`` residual window layers, attention and sum pooling,
    then ``depth`` residual SELU layers. ``foreach_sentence`` applies that
    encoder to every sentence; the results go through attention, sum
    pooling, ``depth`` more residual SELU layers and a ``Softmax`` with
    ``nr_class`` outputs.

    nr_class: size passed to the final ``Softmax`` layer.
    width: hidden width used throughout.
    depth: number of residual SELU clones at sentence and document level.
    conv_depth: number of residual window-convolution clones.
    **kwargs: accepted but not read by this function.

    Returns the composed model, with ``lsuv`` set to ``False``.
    """
    operators = {'|': concatenate, '>>': chain, '**': clone}
    with Model.define_operators(operators):
        half_width = width // 2
        # Concatenate the hashed features pairwise, left to right.
        hashed = (HashEmbed(width, 5000, column=1)
                  | HashEmbed(half_width, 750, column=2))
        hashed = hashed | HashEmbed(half_width, 750, column=3)
        hashed = hashed | HashEmbed(half_width, 750, column=4)
        embed = hashed >> Maxout(width)

        # Sentence encoder.
        sent2vec = (FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE])
                    >> flatten_add_lengths)
        sent2vec = sent2vec >> with_getitem(
            0,
            uniqued(embed, column=0)
            >> Residual(ExtractWindow(nW=1) >> SELU(width))**conv_depth)
        sent2vec = sent2vec >> ParametricAttention(width)
        sent2vec = sent2vec >> Pooling(sum_pool)
        sent2vec = sent2vec >> Residual(SELU(width))**depth

        # Document-level model over the per-sentence vectors.
        model = (foreach_sentence(sent2vec, drop_factor=2.0)
                 >> flatten_add_lengths)
        model = model >> ParametricAttention(width, hard=False)
        model = model >> Pooling(sum_pool)
        model = model >> Residual(SELU(width))**depth
        model = model >> Softmax(nr_class)
    model.lsuv = False
    return model