Example #1
def _build_network(self, orig_width, hidden_width):
    with Model.define_operators({">>": chain}):
        # very simple encoder-decoder model
        self.encoder = Affine(hidden_width, orig_width)
        self.model = self.encoder >> zero_init(
            Affine(orig_width, hidden_width, drop_factor=0.0))
    self.sgd = create_default_optimizer(self.model.ops)
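Note: every example on this page relies on Model.define_operators, which binds Python operators such as ">>" (chain), "|" (concatenate) and "**" (clone) to combinators for the duration of the with-block and restores the previous bindings on exit. Below is a minimal, self-contained sketch of that scoped operator-binding pattern; it is an illustrative mock, not thinc's actual implementation.

# Illustrative mock of scoped operator binding, in the spirit of
# Model.define_operators. Not thinc's API.
from contextlib import contextmanager

class Layer:
    _operators = {}

    def __init__(self, name):
        self.name = name

    @classmethod
    @contextmanager
    def define_operators(cls, operators):
        old = cls._operators
        cls._operators = operators   # bind e.g. ">>" to a combinator
        try:
            yield
        finally:
            cls._operators = old     # restore bindings when the block exits

    def __rshift__(self, other):
        return self._operators[">>"](self, other)

def chain(first, second):
    # Stand-in combinator: just records the composition.
    return Layer("chain(%s, %s)" % (first.name, second.name))

with Layer.define_operators({">>": chain}):
    model = Layer("encoder") >> Layer("decoder")
print(model.name)  # chain(encoder, decoder)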
Example #2
def build_model(nr_class, width, depth, conv_depth, **kwargs):
    with Model.define_operators({"|": concatenate, ">>": chain, "**": clone}):
        embed = (HashEmbed(width, 5000, column=1)
                 | StaticVectors("spacy_pretrained_vectors", width, column=5)
                 | HashEmbed(width // 2, 750, column=2)
                 | HashEmbed(width // 2, 750, column=3)
                 | HashEmbed(width // 2, 750, column=4)) >> LN(Maxout(width))

        sent2vec = (flatten_add_lengths >> with_getitem(
            0,
            embed >> Residual(ExtractWindow(nW=1) >> LN(Maxout(width)))**
            conv_depth,
        ) >> ParametricAttention(width) >> Pooling(sum_pool) >> Residual(
            LN(Maxout(width)))**depth)

        model = (
            foreach(sent2vec, drop_factor=2.0) >> flatten_add_lengths
            # This block would allow the model to learn some cross-sentence
            # features. It's not useful on this problem. It might make more
            # sense to use a BiLSTM here, following Liang et al (2016).
            # >> with_getitem(0,
            #    Residual(ExtractWindow(nW=1) >> LN(Maxout(width))) ** conv_depth
            # )
            >> ParametricAttention(width, hard=False) >> Pooling(sum_pool) >>
            Residual(LN(Maxout(width)))**depth >> Softmax(nr_class))
    model.lsuv = False
    return model
Example #3
def create_pretraining_model(nlp, tok2vec, objective="basic"):
    """Define a network for the pretraining."""
    output_size = nlp.vocab.vectors.data.shape[1]
    # This is annoying, but the parser etc. have the flatten step after
    # the tok2vec. To load the weights in cleanly, we need to match
    # the shape of the models' components exactly. So what we call
    # "tok2vec" has to be the same set of processes as what the components do.
    with Model.define_operators({">>": chain, "|": concatenate}):

        l2r_model = (
            tok2vec.l2r
            >> flatten
            >> LN(Maxout(output_size, tok2vec.l2r.nO, pieces=3))
            >> zero_init(Affine(output_size, drop_factor=0.0))
        )
        r2l_model = (
            tok2vec.r2l
            >> flatten
            >> LN(Maxout(output_size, tok2vec.r2l.nO, pieces=3))
            >> zero_init(Affine(output_size, drop_factor=0.0))
        )

        model = tok2vec.embed >> (l2r_model | r2l_model)

    model.tok2vec = tok2vec
    model.begin_training([nlp.make_doc("Give it a doc to infer shapes")])
    tok2vec.begin_training([nlp.make_doc("Give it a doc to infer shapes")])
    tokvecs = tok2vec([nlp.make_doc("hello there"), nlp.make_doc("and hello")])
    print(tokvecs.shape)
    return model
Example #4
def build_model(nr_class, width, **kwargs):
    with Model.define_operators({'|': concatenate, '>>': chain, '**': clone}):
        model = (FeatureExtracter([ORTH]) >> flatten_add_lengths >>
                 with_getitem(0, uniqued(HashEmbed(width, 10000, column=0))) >>
                 Pooling(mean_pool) >> Softmax(nr_class))
    model.lsuv = False
    return model
Example #5
def main(width=100,
         depth=4,
         vector_length=64,
         min_batch_size=1,
         max_batch_size=32,
         learn_rate=0.001,
         momentum=0.9,
         dropout=0.5,
         dropout_decay=1e-4,
         nb_epoch=20,
         L2=1e-6):
    cfg = dict(locals())
    print(cfg)
    if cupy is not None:
        print("Using GPU")
        Model.ops = CupyOps()
    train_data, check_data, nr_tag = ancora_pos_tags()

    extracter = FeatureExtracter('es', attrs=[LOWER, SHAPE, PREFIX, SUFFIX])
    Model.lsuv = True
    with Model.define_operators({
            '**': clone,
            '>>': chain,
            '+': add,
            '|': concatenate
    }):
        lower_case = HashEmbed(width, 100, column=0)
        shape = HashEmbed(width // 2, 200, column=1)
        prefix = HashEmbed(width // 2, 100, column=2)
        suffix = HashEmbed(width // 2, 100, column=3)

        model = (with_flatten(
            (lower_case | shape | prefix | suffix) >> Maxout(width, pieces=3)
            >> Residual(ExtractWindow(nW=1) >> Maxout(width, pieces=3))**depth
            >> Softmax(nr_tag),
            pad=depth))

    train_X, train_y = preprocess(model.ops, extracter, train_data, nr_tag)
    dev_X, dev_y = preprocess(model.ops, extracter, check_data, nr_tag)

    n_train = float(sum(len(x) for x in train_X))
    global epoch_train_acc
    with model.begin_training(train_X[:5000], train_y[:5000],
                              **cfg) as (trainer, optimizer):
        trainer.each_epoch.append(track_progress(**locals()))
        trainer.batch_size = min_batch_size
        batch_size = float(min_batch_size)
        for X, y in trainer.iterate(train_X, train_y):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)

            gradient = [yh[i] - y[i] for i in range(len(yh))]

            backprop(gradient, optimizer)

            trainer.batch_size = min(int(batch_size), max_batch_size)
            batch_size *= 1.001
    with model.use_params(trainer.optimizer.averages):
        print(model.evaluate(dev_X, model.ops.flatten(dev_y)))
        with open('/tmp/model.pickle', 'wb') as file_:
            pickle.dump(model, file_)
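The training loop above (and the near-identical loops in later examples) grows the batch size geometrically: it starts at min_batch_size, is multiplied by 1.001 after every update, and is capped at max_batch_size. A quick sketch of how long that ramp takes with the defaults used here:

import math

# batch_size *= 1.001 per update, capped at max_batch_size (see loop above).
min_batch_size = 1.0
max_batch_size = 32.0
steps_to_cap = math.log(max_batch_size / min_batch_size) / math.log(1.001)
print(round(steps_to_cap))  # ~3467 updates before the cap is reached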
Example #6
def build_morphologizer_model(class_nums, **cfg):
    embed_size = util.env_opt("embed_size", 7000)
    if "token_vector_width" in cfg:
        token_vector_width = cfg["token_vector_width"]
    else:
        token_vector_width = util.env_opt("token_vector_width", 128)
    pretrained_vectors = cfg.get("pretrained_vectors")
    char_embed = cfg.get("char_embed", True)
    with Model.define_operators({">>": chain, "+": add, "**": clone}):
        if "tok2vec" in cfg:
            tok2vec = cfg["tok2vec"]
        else:
            tok2vec = Tok2Vec(
                token_vector_width,
                embed_size,
                char_embed=char_embed,
                pretrained_vectors=pretrained_vectors,
            )
        softmax = with_flatten(MultiSoftmax(class_nums, token_vector_width))
        softmax.out_sizes = class_nums
        model = tok2vec >> softmax
    model.nI = None
    model.tok2vec = tok2vec
    model.softmax = softmax
    return model
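The morphologizer uses a MultiSoftmax output with one block of classes per morphological field (class_nums). Conceptually, a multi-softmax splits one output vector into per-field blocks and normalises each block independently; the NumPy sketch below illustrates that idea only and is not thinc's MultiSoftmax implementation.

import numpy as np

def multi_softmax(scores, out_sizes):
    # Split one score vector into one block per field and softmax each
    # block independently (conceptual sketch, not thinc's MultiSoftmax).
    pieces = np.split(scores, np.cumsum(out_sizes)[:-1])
    return [np.exp(p) / np.exp(p).sum() for p in pieces]

class_nums = (3, 4)  # e.g. 3 values for one field, 4 for another
probs = multi_softmax(np.random.randn(sum(class_nums)), class_nums)
print([round(p.sum(), 6) for p in probs])  # each block sums to 1.0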
Example #7
def main(depth=2, width=512, nb_epoch=30):
    prefer_gpu()
    # Configuration here isn't especially good, but it's fine for a demo.
    with Model.define_operators({"**": clone, ">>": chain}):
        model = ReLu(width) >> ReLu(width) >> Softmax()

    train_data, dev_data, _ = datasets.mnist()
    train_X, train_y = model.ops.unzip(train_data)
    dev_X, dev_y = model.ops.unzip(dev_data)

    dev_y = to_categorical(dev_y)
    with model.begin_training(train_X, train_y, L2=1e-6) as (trainer, optimizer):
        epoch_loss = [0.0]

        def report_progress():
            with model.use_params(optimizer.averages):
                print(epoch_loss[-1], model.evaluate(dev_X, dev_y), trainer.dropout)
            epoch_loss.append(0.0)

        trainer.each_epoch.append(report_progress)
        trainer.nb_epoch = nb_epoch
        trainer.dropout = 0.3
        trainer.batch_size = 128
        trainer.dropout_decay = 0.0
        train_X = model.ops.asarray(train_X, dtype="float32")
        y_onehot = to_categorical(train_y)
        for X, y in trainer.iterate(train_X, y_onehot):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            loss = ((yh - y) ** 2.0).sum() / y.shape[0]
            backprop(yh - y, optimizer)
            epoch_loss[-1] += loss
        with model.use_params(optimizer.averages):
            print("Avg dev.: %.3f" % model.evaluate(dev_X, dev_y))
            with open("out.pickle", "wb") as file_:
                pickle.dump(model, file_, -1)
Example #8
def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg):
    if "entity_width" not in cfg:
        raise ValueError(Errors.E144.format(param="entity_width"))

    conv_depth = cfg.get("conv_depth", 2)
    cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3)
    pretrained_vectors = cfg.get("pretrained_vectors", None)
    context_width = cfg.get("entity_width")

    with Model.define_operators({">>": chain, "**": clone}):
        # context encoder
        tok2vec = Tok2Vec(
            width=hidden_width,
            embed_size=embed_width,
            pretrained_vectors=pretrained_vectors,
            cnn_maxout_pieces=cnn_maxout_pieces,
            subword_features=True,
            conv_depth=conv_depth,
            bilstm_depth=0,
        )

        model = (
            tok2vec >> flatten_add_lengths >> Pooling(mean_pool) >> Residual(
                zero_init(Maxout(hidden_width, hidden_width))) >> zero_init(
                    Affine(context_width, hidden_width, drop_factor=0.0)))

        model.tok2vec = tok2vec
        model.nO = context_width
    return model
Example #9
def Tok2Vec(width, embed_size, **kwargs):
    pretrained_vectors = kwargs.get("pretrained_vectors", None)
    cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3)
    subword_features = kwargs.get("subword_features", True)
    conv_depth = kwargs.get("conv_depth", 4)
    bilstm_depth = kwargs.get("bilstm_depth", 0)
    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
    with Model.define_operators(
        {">>": chain, "|": concatenate, "**": clone, "+": add, "*": reapply}
    ):
        norm = HashEmbed(width, embed_size, column=cols.index(NORM), name="embed_norm")
        if subword_features:
            prefix = HashEmbed(
                width, embed_size // 2, column=cols.index(PREFIX), name="embed_prefix"
            )
            suffix = HashEmbed(
                width, embed_size // 2, column=cols.index(SUFFIX), name="embed_suffix"
            )
            shape = HashEmbed(
                width, embed_size // 2, column=cols.index(SHAPE), name="embed_shape"
            )
        else:
            prefix, suffix, shape = (None, None, None)
        if pretrained_vectors is not None:
            glove = StaticVectors(pretrained_vectors, width, column=cols.index(ID))

            if subword_features:
                embed = uniqued(
                    (glove | norm | prefix | suffix | shape)
                    >> LN(Maxout(width, width * 5, pieces=3)),
                    column=cols.index(ORTH),
                )
            else:
                embed = uniqued(
                    (glove | norm) >> LN(Maxout(width, width * 2, pieces=3)),
                    column=cols.index(ORTH),
                )
        elif subword_features:
            embed = uniqued(
                (norm | prefix | suffix | shape)
                >> LN(Maxout(width, width * 4, pieces=3)),
                column=cols.index(ORTH),
            )
        else:
            embed = norm

        convolution = Residual(
            ExtractWindow(nW=1)
            >> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces))
        )
        tok2vec = FeatureExtracter(cols) >> with_flatten(
            embed >> convolution ** conv_depth, pad=conv_depth
        )
        if bilstm_depth >= 1:
            tok2vec = tok2vec >> PyTorchBiLSTM(width, width, bilstm_depth)
        # Work around thinc API limitations :(. TODO: Revise in Thinc 7
        tok2vec.nO = width
        tok2vec.embed = embed
    return tok2vec
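Each residual convolution block in Tok2Vec wraps ExtractWindow(nW=1), which concatenates every token with its immediate neighbours, so stacking conv_depth blocks widens the receptive field by one token per block on each side; that is presumably why the flattened inner model is padded with pad=conv_depth. A small sketch of the arithmetic, assuming a window of nW tokens per side per layer:

def receptive_field(n_layers, nW=1):
    # Each ExtractWindow(nW=1) layer adds nW tokens of context per side,
    # so n_layers stacked layers see n_layers * nW tokens on each side.
    return n_layers * nW

conv_depth = 4
print(receptive_field(conv_depth))  # 4, matching pad=conv_depth above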
Example #10
def main(
    width=100,
    depth=4,
    vector_length=64,
    min_batch_size=1,
    max_batch_size=32,
    learn_rate=0.001,
    momentum=0.9,
    dropout=0.5,
    dropout_decay=1e-4,
    nb_epoch=20,
    L2=1e-6,
):
    cfg = dict(locals())
    print(cfg)
    prefer_gpu()
    train_data, check_data, nr_tag = ancora_pos_tags()

    extracter = FeatureExtracter("es", attrs=[LOWER, SHAPE, PREFIX, SUFFIX])
    Model.lsuv = True
    with Model.define_operators({"**": clone, ">>": chain, "+": add, "|": concatenate}):
        lower_case = HashEmbed(width, 100, column=0)
        shape = HashEmbed(width // 2, 200, column=1)
        prefix = HashEmbed(width // 2, 100, column=2)
        suffix = HashEmbed(width // 2, 100, column=3)

        model = with_flatten(
            (lower_case | shape | prefix | suffix)
            >> Maxout(width, pieces=3)
            >> Residual(ExtractWindow(nW=1) >> Maxout(width, pieces=3)) ** depth
            >> Softmax(nr_tag),
            pad=depth,
        )

    train_X, train_y = preprocess(model.ops, extracter, train_data, nr_tag)
    dev_X, dev_y = preprocess(model.ops, extracter, check_data, nr_tag)

    n_train = float(sum(len(x) for x in train_X))
    global epoch_train_acc
    with model.begin_training(train_X[:5000], train_y[:5000], **cfg) as (
        trainer,
        optimizer,
    ):
        trainer.each_epoch.append(track_progress(**locals()))
        trainer.batch_size = min_batch_size
        batch_size = float(min_batch_size)
        for X, y in trainer.iterate(train_X, train_y):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)

            gradient = [yh[i] - y[i] for i in range(len(yh))]

            backprop(gradient, optimizer)

            trainer.batch_size = min(int(batch_size), max_batch_size)
            batch_size *= 1.001
    with model.use_params(trainer.optimizer.averages):
        print(model.evaluate(dev_X, model.ops.flatten(dev_y)))
        with open("/tmp/model.pickle", "wb") as file_:
            pickle.dump(model, file_)
Example #11
def main(width=128,
         depth=1,
         vector_length=128,
         min_batch_size=16,
         max_batch_size=16,
         learn_rate=0.001,
         momentum=0.9,
         dropout=0.5,
         dropout_decay=1e-4,
         nb_epoch=20,
         L2=1e-6):
    using_gpu = prefer_gpu()
    if using_gpu:
        torch.set_default_tensor_type('torch.cuda.FloatTensor')
    cfg = dict(locals())
    print(cfg)
    train_data, check_data, nr_tag = ancora_pos_tags()
    train_data = list(train_data)
    check_data = list(check_data)

    extracter = FeatureExtracter('es', attrs=[LOWER, SHAPE, PREFIX, SUFFIX])
    with Model.define_operators({
            '**': clone,
            '>>': chain,
            '+': add,
            '|': concatenate
    }):
        lower_case = HashEmbed(width, 100, column=0)
        shape = HashEmbed(width // 2, 200, column=1)
        prefix = HashEmbed(width // 2, 100, column=2)
        suffix = HashEmbed(width // 2, 100, column=3)

        model = (with_flatten(
            (lower_case | shape | prefix | suffix) >> Maxout(width, pieces=3))
                 >> PyTorchBiLSTM(width, width, depth) >> with_flatten(
                     Softmax(nr_tag)))

    train_X, train_y = preprocess(model.ops, extracter, train_data, nr_tag)
    dev_X, dev_y = preprocess(model.ops, extracter, check_data, nr_tag)

    n_train = float(sum(len(x) for x in train_X))
    global epoch_train_acc
    with model.begin_training(train_X[:10], train_y[:10],
                              **cfg) as (trainer, optimizer):
        trainer.each_epoch.append(track_progress(**locals()))
        trainer.batch_size = min_batch_size
        batch_size = float(min_batch_size)
        for X, y in trainer.iterate(train_X, train_y):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)

            gradient = [yh[i] - y[i] for i in range(len(yh))]

            backprop(gradient, optimizer)

            trainer.batch_size = min(int(batch_size), max_batch_size)
            batch_size *= 1.001
    print(model.evaluate(dev_X, model.ops.flatten(dev_y)))
    with open('/tmp/model.pickle', 'wb') as file_:
        pickle.dump(model, file_)
Example #12
def Tok2Vec(width, embed_size, **kwargs):
    pretrained_vectors = kwargs.get("pretrained_vectors", None)
    cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3)
    subword_features = kwargs.get("subword_features", True)
    conv_depth = kwargs.get("conv_depth", 4)
    bilstm_depth = kwargs.get("bilstm_depth", 0)
    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
    with Model.define_operators(
        {">>": chain, "|": concatenate, "**": clone, "+": add, "*": reapply}
    ):
        norm = HashEmbed(width, embed_size, column=cols.index(NORM), name="embed_norm")
        if subword_features:
            prefix = HashEmbed(
                width, embed_size // 2, column=cols.index(PREFIX), name="embed_prefix"
            )
            suffix = HashEmbed(
                width, embed_size // 2, column=cols.index(SUFFIX), name="embed_suffix"
            )
            shape = HashEmbed(
                width, embed_size // 2, column=cols.index(SHAPE), name="embed_shape"
            )
        else:
            prefix, suffix, shape = (None, None, None)
        if pretrained_vectors is not None:
            glove = StaticVectors(pretrained_vectors, width, column=cols.index(ID))

            if subword_features:
                embed = uniqued(
                    (glove | norm | prefix | suffix | shape)
                    >> LN(Maxout(width, width * 5, pieces=3)),
                    column=cols.index(ORTH),
                )
            else:
                embed = uniqued(
                    (glove | norm) >> LN(Maxout(width, width * 2, pieces=3)),
                    column=cols.index(ORTH),
                )
        elif subword_features:
            embed = uniqued(
                (norm | prefix | suffix | shape)
                >> LN(Maxout(width, width * 4, pieces=3)),
                column=cols.index(ORTH),
            )
        else:
            embed = norm

        convolution = Residual(
            ExtractWindow(nW=1)
            >> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces))
        )
        tok2vec = FeatureExtracter(cols) >> with_flatten(
            embed >> convolution ** conv_depth, pad=conv_depth
        )
        if bilstm_depth >= 1:
            tok2vec = tok2vec >> PyTorchBiLSTM(width, width, bilstm_depth)
        # Work around thinc API limitations :(. TODO: Revise in Thinc 7
        tok2vec.nO = width
        tok2vec.embed = embed
    return tok2vec
Example #13
def MultiHashEmbed(config):
    # For backwards compatibility with models before the architecture registry,
    # we have to be careful to get exactly the same model structure. One subtle
    # trick is that when we define concatenation with the operator, the operator
    # is actually binary associative. So when we write (a | b | c), we're actually
    # getting concatenate(concatenate(a, b), c). That's why the implementation
    # is a bit ugly here.
    cols = config["columns"]
    width = config["width"]
    rows = config["rows"]

    norm = HashEmbed(width,
                     rows,
                     column=cols.index("NORM"),
                     name="embed_norm",
                     seed=1)
    if config["use_subwords"]:
        prefix = HashEmbed(width,
                           rows // 2,
                           column=cols.index("PREFIX"),
                           name="embed_prefix",
                           seed=2)
        suffix = HashEmbed(width,
                           rows // 2,
                           column=cols.index("SUFFIX"),
                           name="embed_suffix",
                           seed=3)
        shape = HashEmbed(width,
                          rows // 2,
                          column=cols.index("SHAPE"),
                          name="embed_shape",
                          seed=4)
    if config.get("@pretrained_vectors"):
        glove = make_layer(config["@pretrained_vectors"])
    mix = make_layer(config["@mix"])

    with Model.define_operators({">>": chain, "|": concatenate}):
        if config["use_subwords"] and config["@pretrained_vectors"]:
            mix._layers[0].nI = width * 5
            layer = uniqued(
                (glove | norm | prefix | suffix | shape) >> mix,
                column=cols.index("ORTH"),
            )
        elif config["use_subwords"]:
            mix._layers[0].nI = width * 4
            layer = uniqued((norm | prefix | suffix | shape) >> mix,
                            column=cols.index("ORTH"))
        elif config["@pretrained_vectors"]:
            mix._layers[0].nI = width * 2
            layer = uniqued(
                (glove | norm) >> mix,
                column=cols.index("ORTH"),
            )
        else:
            layer = norm
    layer.cfg = config
    return layer
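As the comment at the top of MultiHashEmbed notes, the "|" operator is binary, so (a | b | c | d) builds concatenate(concatenate(concatenate(a, b), c), d), and the mix layer's input width has to equal the sum of the concatenated output widths (hence the width * 5, * 4 and * 2 settings above). A tiny sketch of both points, using plain Python stand-ins rather than thinc layers:

from functools import reduce

def concatenate(a, b):
    # Binary stand-in: record the nesting and add the output widths.
    return ("concat", a, b, a[-1] + b[-1])

# Each "layer" here is just (name, output_width), with width = 64.
layers = [(name, 64) for name in ("glove", "norm", "prefix", "suffix", "shape")]

# (glove | norm | prefix | suffix | shape) left-folds into nested calls:
merged = reduce(concatenate, layers)
print(merged[-1])  # 320 == width * 5, the nI assigned to the mix layer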
Example #14
def build_textcat_model(tok2vec, nr_class, width):
    from thinc.v2v import Model, Softmax
    from thinc.api import flatten_add_lengths, chain
    from thinc.t2v import Pooling, mean_pool

    with Model.define_operators({">>": chain}):
        model = (tok2vec >> flatten_add_lengths >> Pooling(mean_pool) >>
                 Softmax(nr_class, width))
    model.tok2vec = chain(tok2vec, flatten)
    return model
Example #15
def build_model(nr_class, width, **kwargs):
    with Model.define_operators({"|": concatenate, ">>": chain, "**": clone}):
        model = (
            FeatureExtracter([ORTH])
            >> flatten_add_lengths
            >> with_getitem(0, uniqued(HashEmbed(width, 10000, column=0)))
            >> Pooling(mean_pool)
            >> Softmax(nr_class)
        )
    model.lsuv = False
    return model
Example #16
def build_bow_text_classifier(
    nr_class, ngram_size=1, exclusive_classes=False, no_output_layer=False, **cfg
):
    with Model.define_operators({">>": chain}):
        model = with_cpu(
            Model.ops, extract_ngrams(ngram_size, attr=ORTH) >> LinearModel(nr_class)
        )
        if not no_output_layer:
            model = model >> (cpu_softmax if exclusive_classes else logistic)
    model.nO = nr_class
    return model
Example #17
def build_text_classifier(nr_class, width=64, **cfg):
    nr_vector = cfg.get('nr_vector', 5000)
    pretrained_dims = cfg.get('pretrained_dims', 0)
    with Model.define_operators({
            '>>': chain,
            '+': add,
            '|': concatenate,
            '**': clone
    }):
        if cfg.get('low_data') and pretrained_dims:
            model = (SpacyVectors >> flatten_add_lengths >> with_getitem(
                0, Affine(width, pretrained_dims)) >>
                     ParametricAttention(width) >> Pooling(sum_pool) >>
                     Residual(ReLu(width, width))**2 >> zero_init(
                         Affine(nr_class, width, drop_factor=0.0)) >> logistic)
            return model

        lower = HashEmbed(width, nr_vector, column=1)
        prefix = HashEmbed(width // 2, nr_vector, column=2)
        suffix = HashEmbed(width // 2, nr_vector, column=3)
        shape = HashEmbed(width // 2, nr_vector, column=4)

        trained_vectors = (FeatureExtracter(
            [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]) >> with_flatten(
                uniqued((lower | prefix | suffix | shape) >> LN(
                    Maxout(width, width + (width // 2) * 3)),
                        column=0)))

        if pretrained_dims:
            static_vectors = (
                SpacyVectors >> with_flatten(Affine(width, pretrained_dims)))
            # TODO Make concatenate support lists
            vectors = concatenate_lists(trained_vectors, static_vectors)
            vectors_width = width * 2
        else:
            vectors = trained_vectors
            vectors_width = width
            static_vectors = None
        cnn_model = (
            vectors >> with_flatten(
                LN(Maxout(width, vectors_width)) >> Residual(
                    (ExtractWindow(nW=1) >> LN(Maxout(width, width * 3))))**2,
                pad=2) >> flatten_add_lengths >> ParametricAttention(width) >>
            Pooling(sum_pool) >> Residual(zero_init(Maxout(width, width))) >>
            zero_init(Affine(nr_class, width, drop_factor=0.0)))

        linear_model = (
            _preprocess_doc >> LinearModel(nr_class, drop_factor=0.))

        model = ((linear_model | cnn_model) >> zero_init(
            Affine(nr_class, nr_class * 2, drop_factor=0.0)) >> logistic)
    model.nO = nr_class
    model.lsuv = False
    return model
Example #18
def build_bow_text_classifier(
    nr_class, ngram_size=1, exclusive_classes=False, no_output_layer=False, **cfg
):
    with Model.define_operators({">>": chain}):
        model = with_cpu(
            Model.ops, extract_ngrams(ngram_size, attr=ORTH) >> LinearModel(nr_class)
        )
        if not no_output_layer:
            model = model >> (cpu_softmax if exclusive_classes else logistic)
    model.nO = nr_class
    return model
Example #19
def build_textcat_model(tok2vec, nr_class, width):
    from thinc.v2v import Model, Softmax, Maxout
    from thinc.api import flatten_add_lengths, chain
    from thinc.t2v import Pooling, sum_pool, mean_pool, max_pool
    from thinc.misc import Residual, LayerNorm
    from spacy._ml import logistic, zero_init

    with Model.define_operators({">>": chain}):
        model = (tok2vec >> flatten_add_lengths >> Pooling(mean_pool) >>
                 Softmax(nr_class, width))
    model.tok2vec = tok2vec
    return model
Example #20
def Tok2Vec(width, embed_size, **kwargs):
    pretrained_vectors = kwargs.get('pretrained_vectors', None)
    cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 2)
    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
    with Model.define_operators({
            '>>': chain,
            '|': concatenate,
            '**': clone,
            '+': add,
            '*': reapply
    }):
        norm = HashEmbed(width,
                         embed_size,
                         column=cols.index(NORM),
                         name='embed_norm')
        prefix = HashEmbed(width,
                           embed_size // 2,
                           column=cols.index(PREFIX),
                           name='embed_prefix')
        suffix = HashEmbed(width,
                           embed_size // 2,
                           column=cols.index(SUFFIX),
                           name='embed_suffix')
        shape = HashEmbed(width,
                          embed_size // 2,
                          column=cols.index(SHAPE),
                          name='embed_shape')
        if pretrained_vectors is not None:
            glove = StaticVectors(pretrained_vectors,
                                  width,
                                  column=cols.index(ID))

            embed = uniqued((glove | norm | prefix | suffix | shape) >> LN(
                Maxout(width, width * 5, pieces=3)),
                            column=cols.index(ORTH))
        else:
            embed = uniqued((norm | prefix | suffix | shape) >> LN(
                Maxout(width, width * 4, pieces=3)),
                            column=cols.index(ORTH))

        convolution = Residual(
            ExtractWindow(
                nW=1) >> LN(Maxout(width, width *
                                   3, pieces=cnn_maxout_pieces)))

        tok2vec = (FeatureExtracter(cols) >> with_flatten(
            embed >> convolution**4, pad=4))
        # Work around thinc API limitations :(. TODO: Revise in Thinc 7
        tok2vec.nO = width
        tok2vec.embed = embed
    return tok2vec
Example #21
def build_textcat_model(tok2vec, nr_class, width):
    from thinc.v2v import Model, Softmax, Maxout
    from thinc.api import flatten_add_lengths, chain
    from thinc.t2v import Pooling, sum_pool, mean_pool, max_pool
    from thinc.misc import Residual, LayerNorm
    from spacy._ml import logistic, zero_init

    with Model.define_operators({">>": chain}):
        model = (
            tok2vec
            >> flatten_add_lengths
            >> Pooling(mean_pool)
            >> Softmax(nr_class, width)
        )
    model.tok2vec = tok2vec
    return model
Example #22
def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False, **cfg):
    """
    Build a simple CNN text classifier, given a token-to-vector model as inputs.
    If exclusive_classes=True, a softmax non-linearity is applied, so that the
    outputs sum to 1. If exclusive_classes=False, a logistic non-linearity
    is applied instead, so that outputs are in the range [0, 1].
    """
    with Model.define_operators({">>": chain}):
        if exclusive_classes:
            output_layer = Softmax(nr_class, tok2vec.nO)
        else:
            output_layer = zero_init(Affine(nr_class, tok2vec.nO, drop_factor=0.0)) >> logistic
        model = tok2vec >> flatten_add_lengths >> Pooling(mean_pool) >> output_layer
    model.tok2vec = chain(tok2vec, flatten)
    model.nO = nr_class
    return model
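The docstring spells out the key design choice: with exclusive_classes=True the output layer is a softmax, so the class scores are mutually exclusive and sum to 1; otherwise each class gets an independent logistic output in [0, 1], which also supports multi-label problems. A small NumPy sketch contrasting the two, independent of thinc:

import numpy as np

scores = np.array([2.0, 1.0, -1.0])  # raw class scores for one document

softmax = np.exp(scores) / np.exp(scores).sum()
logistic = 1.0 / (1.0 + np.exp(-scores))

print(softmax, softmax.sum())  # exclusive classes: probabilities sum to 1.0
print(logistic)                # independent per-class probabilities in (0, 1)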
Example #23
def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False, **cfg):
    """
    Build a simple CNN text classifier, given a token-to-vector model as inputs.
    If exclusive_classes=True, a softmax non-linearity is applied, so that the
    outputs sum to 1. If exclusive_classes=False, a logistic non-linearity
    is applied instead, so that outputs are in the range [0, 1].
    """
    with Model.define_operators({">>": chain}):
        if exclusive_classes:
            output_layer = Softmax(nr_class, tok2vec.nO)
        else:
            output_layer = (
                zero_init(Affine(nr_class, tok2vec.nO, drop_factor=0.0)) >> logistic
            )
        model = tok2vec >> flatten_add_lengths >> Pooling(mean_pool) >> output_layer
    model.tok2vec = chain(tok2vec, flatten)
    model.nO = nr_class
    return model
Example #24
def build_model(nr_class, width, depth, conv_depth, **kwargs):
    with Model.define_operators({'|': concatenate, '>>': chain, '**': clone}):
        embed = ((HashEmbed(width, 5000, column=1)
                  | StaticVectors('spacy_pretrained_vectors', width, column=5)
                  | HashEmbed(width // 2, 750, column=2)
                  | HashEmbed(width // 2, 750, column=3)
                  | HashEmbed(width // 2, 750, column=4)) >> LN(Maxout(width)))

        sent2vec = (flatten_add_lengths >> with_getitem(
            0, embed >> Residual(ExtractWindow(nW=1) >> LN(Maxout(width)))**
            conv_depth) >> ParametricAttention(width) >> Pooling(sum_pool) >>
                    Residual(LN(Maxout(width)))**depth)

        model = (foreach(sent2vec, drop_factor=2.0) >> flatten_add_lengths >>
                 ParametricAttention(width, hard=False) >> Pooling(sum_pool) >>
                 Residual(LN(Maxout(width)))**depth >> Softmax(nr_class))
    model.lsuv = False
    return model
Example #25
def build_tagger_model(nr_class, **cfg):
    embed_size = util.env_opt('embed_size', 7000)
    if 'token_vector_width' in cfg:
        token_vector_width = cfg['token_vector_width']
    else:
        token_vector_width = util.env_opt('token_vector_width', 128)
    pretrained_dims = cfg.get('pretrained_dims', 0)
    with Model.define_operators({'>>': chain, '+': add}):
        if 'tok2vec' in cfg:
            tok2vec = cfg['tok2vec']
        else:
            tok2vec = Tok2Vec(token_vector_width,
                              embed_size,
                              pretrained_dims=pretrained_dims)
        softmax = with_flatten(Softmax(nr_class, token_vector_width))
        model = (tok2vec >> softmax)
    model.nI = None
    model.tok2vec = tok2vec
    model.softmax = softmax
    return model
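The tagger wraps its per-token Softmax in with_flatten, which (roughly speaking) concatenates the per-document arrays into one batch, applies the wrapped layer once, and splits the result back into per-document pieces. The NumPy sketch below shows that flatten/apply/unflatten pattern as an illustration only, not thinc's actual with_flatten:

import numpy as np

def with_flatten_apply(layer, docs):
    # Flatten a list of (n_tokens_i, width) arrays into one batch,
    # run the layer once, then split back to the original lengths.
    lengths = [len(doc) for doc in docs]
    flat = np.concatenate(docs, axis=0)
    out = layer(flat)
    return np.split(out, np.cumsum(lengths)[:-1])

docs = [np.ones((3, 4)), np.ones((5, 4))]
pieces = with_flatten_apply(lambda X: X * 2.0, docs)
print([piece.shape for piece in pieces])  # [(3, 4), (5, 4)]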
Example #26
def Tok2Vec(width, embed_size, **kwargs):
    pretrained_vectors = kwargs.get('pretrained_vectors', None)
    cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 2)
    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
    with Model.define_operators({'>>': chain, '|': concatenate, '**': clone,
                                 '+': add, '*': reapply}):
        norm = HashEmbed(width, embed_size, column=cols.index(NORM),
                         name='embed_norm')
        prefix = HashEmbed(width, embed_size//2, column=cols.index(PREFIX),
                           name='embed_prefix')
        suffix = HashEmbed(width, embed_size//2, column=cols.index(SUFFIX),
                           name='embed_suffix')
        shape = HashEmbed(width, embed_size//2, column=cols.index(SHAPE),
                          name='embed_shape')
        if pretrained_vectors is not None:
            glove = StaticVectors(pretrained_vectors, width, column=cols.index(ID))

            embed = uniqued(
                (glove | norm | prefix | suffix | shape)
                >> LN(Maxout(width, width*5, pieces=3)), column=cols.index(ORTH))
        else:
            embed = uniqued(
                (norm | prefix | suffix | shape)
                >> LN(Maxout(width, width*4, pieces=3)), column=cols.index(ORTH))

        convolution = Residual(
            ExtractWindow(nW=1)
            >> LN(Maxout(width, width*3, pieces=cnn_maxout_pieces))
        )

        tok2vec = (
            FeatureExtracter(cols)
            >> with_flatten(
                embed
                >> convolution ** 4, pad=4
            )
        )
        # Work around thinc API limitations :(. TODO: Revise in Thinc 7
        tok2vec.nO = width
        tok2vec.embed = embed
    return tok2vec
Example #27
def build_tagger_model(nr_class, **cfg):
    embed_size = util.env_opt('embed_size', 7000)
    if 'token_vector_width' in cfg:
        token_vector_width = cfg['token_vector_width']
    else:
        token_vector_width = util.env_opt('token_vector_width', 128)
    pretrained_vectors = cfg.get('pretrained_vectors')
    with Model.define_operators({'>>': chain, '+': add}):
        if 'tok2vec' in cfg:
            tok2vec = cfg['tok2vec']
        else:
            tok2vec = Tok2Vec(token_vector_width, embed_size,
                              pretrained_vectors=pretrained_vectors)
        softmax = with_flatten(Softmax(nr_class, token_vector_width))
        model = (
            tok2vec
            >> softmax
        )
    model.nI = None
    model.tok2vec = tok2vec
    model.softmax = softmax
    return model
Example #28
def main(depth=2, width=512, nb_epoch=30):
    if CupyOps.xp is not None:
        Model.ops = CupyOps()
        Model.Ops = CupyOps
    # Configuration here isn't especially good, but it's fine for a demo.
    with Model.define_operators({'**': clone, '>>': chain}):
        model = ReLu(width) >> ReLu(width) >> Softmax()

    train_data, dev_data, _ = datasets.mnist()
    train_X, train_y = model.ops.unzip(train_data)
    dev_X, dev_y = model.ops.unzip(dev_data)

    dev_y = to_categorical(dev_y)
    with model.begin_training(train_X, train_y,
                              L2=1e-6) as (trainer, optimizer):
        epoch_loss = [0.]

        def report_progress():
            with model.use_params(optimizer.averages):
                print(epoch_loss[-1], model.evaluate(dev_X, dev_y),
                      trainer.dropout)
            epoch_loss.append(0.)

        trainer.each_epoch.append(report_progress)
        trainer.nb_epoch = nb_epoch
        trainer.dropout = 0.3
        trainer.batch_size = 128
        trainer.dropout_decay = 0.0
        train_X = model.ops.asarray(train_X, dtype='float32')
        y_onehot = to_categorical(train_y)
        for X, y in trainer.iterate(train_X, y_onehot):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            loss = ((yh - y)**2.).sum() / y.shape[0]
            backprop(yh - y, optimizer)
            epoch_loss[-1] += loss
        with model.use_params(optimizer.averages):
            print('Avg dev.: %.3f' % model.evaluate(dev_X, dev_y))
            with open('out.pickle', 'wb') as file_:
                pickle.dump(model, file_, -1)
Example #29
def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg):
    # TODO proper error
    if "entity_width" not in cfg:
        raise ValueError("entity_width not found")
    if "context_width" not in cfg:
        raise ValueError("context_width not found")

    conv_depth = cfg.get("conv_depth", 2)
    cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3)
    pretrained_vectors = cfg.get(
        "pretrained_vectors")  # self.nlp.vocab.vectors.name
    context_width = cfg.get("context_width")
    entity_width = cfg.get("entity_width")

    with Model.define_operators({">>": chain, "**": clone}):
        model = (
            Affine(entity_width, entity_width + context_width + 1 + ner_types)
            >> Affine(1, entity_width, drop_factor=0.0) >> logistic)

        # context encoder
        tok2vec = (Tok2Vec(
            width=hidden_width,
            embed_size=embed_width,
            pretrained_vectors=pretrained_vectors,
            cnn_maxout_pieces=cnn_maxout_pieces,
            subword_features=True,
            conv_depth=conv_depth,
            bilstm_depth=0,
        ) >> flatten_add_lengths >> Pooling(mean_pool) >> Residual(
            zero_init(Maxout(hidden_width, hidden_width))) >> zero_init(
                Affine(context_width, hidden_width)))

    model.tok2vec = tok2vec
    model.tok2vec.nO = context_width
    model.nO = 1
    return model
Example #30
def build_model(nr_class, width, depth, conv_depth, **kwargs):
    with Model.define_operators({'|': concatenate, '>>': chain, '**': clone}):
        embed = (
            (HashEmbed(width, 5000, column=1)
            | StaticVectors('spacy_pretrained_vectors', width, column=5)
            | HashEmbed(width//2, 750, column=2)
            | HashEmbed(width//2, 750, column=3)
            | HashEmbed(width//2, 750, column=4))
            >> LN(Maxout(width))
        )

        sent2vec = (
            flatten_add_lengths
            >> with_getitem(0,
                embed
                >> Residual(ExtractWindow(nW=1) >> LN(Maxout(width))) ** conv_depth
            )
            >> ParametricAttention(width)
            >> Pooling(sum_pool)
            >> Residual(LN(Maxout(width))) ** depth
        )

        model = (
            foreach(sent2vec, drop_factor=2.0)
            >> flatten_add_lengths
            # This block would allow the model to learn some cross-sentence
            # features. It's not useful on this problem. It might make more
            # sense to use a BiLSTM here, following Liang et al (2016).
            #>> with_getitem(0,
            #    Residual(ExtractWindow(nW=1) >> LN(Maxout(width))) ** conv_depth
            #)
            >> ParametricAttention(width, hard=False)
            >> Pooling(sum_pool)
            >> Residual(LN(Maxout(width))) ** depth
            >> Softmax(nr_class)
        )
    model.lsuv = False
    return model
Example #31
def build_model(nr_class, width, depth, conv_depth, vectors_name, **kwargs):
    with Model.define_operators({"|": concatenate, ">>": chain, "**": clone}):
        embed = (HashEmbed(width, 5000, column=1)
                 | StaticVectors(vectors_name, width, column=5)
                 | HashEmbed(width // 2, 750, column=2)
                 | HashEmbed(width // 2, 750, column=3)
                 | HashEmbed(width // 2, 750, column=4)) >> LN(Maxout(width))

        sent2vec = (with_flatten(embed) >> Residual(
            prepare_self_attention(Affine(width * 3, width), nM=width, nH=4) >>
            MultiHeadedAttention() >> with_flatten(
                Maxout(width, width, pieces=3))) >> flatten_add_lengths >>
                    ParametricAttention(width, hard=False) >>
                    Pooling(mean_pool) >> Residual(LN(Maxout(width))))

        model = (foreach(sent2vec, drop_factor=2.0) >> Residual(
            prepare_self_attention(Affine(width * 3, width), nM=width, nH=4) >>
            MultiHeadedAttention() >> with_flatten(LN(Affine(width, width))))
                 >> flatten_add_lengths >> ParametricAttention(
                     width, hard=False) >> Pooling(mean_pool) >> Residual(
                         LN(Maxout(width)))**2 >> Softmax(nr_class))
    model.lsuv = False
    return model
Example #32
def build_tagger_model(nr_class, **cfg):
    embed_size = util.env_opt("embed_size", 2000)
    if "token_vector_width" in cfg:
        token_vector_width = cfg["token_vector_width"]
    else:
        token_vector_width = util.env_opt("token_vector_width", 96)
    pretrained_vectors = cfg.get("pretrained_vectors")
    subword_features = cfg.get("subword_features", True)
    with Model.define_operators({">>": chain, "+": add}):
        if "tok2vec" in cfg:
            tok2vec = cfg["tok2vec"]
        else:
            tok2vec = Tok2Vec(
                token_vector_width,
                embed_size,
                subword_features=subword_features,
                pretrained_vectors=pretrained_vectors,
            )
        softmax = with_flatten(Softmax(nr_class, token_vector_width))
        model = tok2vec >> softmax
    model.nI = None
    model.tok2vec = tok2vec
    model.softmax = softmax
    return model
Example #33
def build_tagger_model(nr_class, **cfg):
    embed_size = util.env_opt("embed_size", 2000)
    if "token_vector_width" in cfg:
        token_vector_width = cfg["token_vector_width"]
    else:
        token_vector_width = util.env_opt("token_vector_width", 96)
    pretrained_vectors = cfg.get("pretrained_vectors")
    subword_features = cfg.get("subword_features", True)
    with Model.define_operators({">>": chain, "+": add}):
        if "tok2vec" in cfg:
            tok2vec = cfg["tok2vec"]
        else:
            tok2vec = Tok2Vec(
                token_vector_width,
                embed_size,
                subword_features=subword_features,
                pretrained_vectors=pretrained_vectors,
            )
        softmax = with_flatten(Softmax(nr_class, token_vector_width))
        model = tok2vec >> softmax
    model.nI = None
    model.tok2vec = tok2vec
    model.softmax = softmax
    return model
Example #34
def main(
    width=128,
    depth=1,
    vector_length=128,
    min_batch_size=16,
    max_batch_size=16,
    learn_rate=0.001,
    momentum=0.9,
    dropout=0.5,
    dropout_decay=1e-4,
    nb_epoch=20,
    L2=1e-6,
):
    using_gpu = prefer_gpu()
    if using_gpu:
        torch.set_default_tensor_type("torch.cuda.FloatTensor")
    cfg = dict(locals())
    print(cfg)
    train_data, check_data, nr_tag = ancora_pos_tags()
    train_data = list(train_data)
    check_data = list(check_data)

    extracter = FeatureExtracter("es", attrs=[LOWER, SHAPE, PREFIX, SUFFIX])
    with Model.define_operators({"**": clone, ">>": chain, "+": add, "|": concatenate}):
        lower_case = HashEmbed(width, 100, column=0)
        shape = HashEmbed(width // 2, 200, column=1)
        prefix = HashEmbed(width // 2, 100, column=2)
        suffix = HashEmbed(width // 2, 100, column=3)

        model = (
            with_flatten(
                (lower_case | shape | prefix | suffix) >> Maxout(width, pieces=3)
            )
            >> PyTorchBiLSTM(width, width, depth)
            >> with_flatten(Softmax(nr_tag))
        )

    train_X, train_y = preprocess(model.ops, extracter, train_data, nr_tag)
    dev_X, dev_y = preprocess(model.ops, extracter, check_data, nr_tag)

    n_train = float(sum(len(x) for x in train_X))
    global epoch_train_acc
    with model.begin_training(train_X[:10], train_y[:10], **cfg) as (
        trainer,
        optimizer,
    ):
        trainer.each_epoch.append(track_progress(**locals()))
        trainer.batch_size = min_batch_size
        batch_size = float(min_batch_size)
        for X, y in trainer.iterate(train_X, train_y):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)

            gradient = [yh[i] - y[i] for i in range(len(yh))]

            backprop(gradient, optimizer)

            trainer.batch_size = min(int(batch_size), max_batch_size)
            batch_size *= 1.001
    print(model.evaluate(dev_X, model.ops.flatten(dev_y)))
    with open("/tmp/model.pickle", "wb") as file_:
        pickle.dump(model, file_)
Example #35
def main(
    dataset="quora",
    width=200,
    depth=2,
    min_batch_size=1,
    max_batch_size=512,
    dropout=0.2,
    dropout_decay=0.0,
    pooling="mean+max",
    nb_epoch=5,
    pieces=3,
    L2=0.0,
    use_gpu=False,
    out_loc=None,
    quiet=False,
    job_id=None,
    ws_api_url=None,
    rest_api_url=None,
):
    cfg = dict(locals())

    if out_loc:
        out_loc = Path(out_loc)
        if not out_loc.parent.exists():
            raise IOError("Can't open output location: %s" % out_loc)
    print(cfg)
    if pooling == "mean+max":
        pool_layer = Pooling(mean_pool, max_pool)
    elif pooling == "mean":
        pool_layer = mean_pool
    elif pooling == "max":
        pool_layer = max_pool
    else:
        raise ValueError("Unrecognised pooling", pooling)

    print("Load spaCy")
    nlp = get_spacy("en")

    if use_gpu:
        Model.ops = CupyOps()

    print("Construct model")
    # Bind operators for the scope of the block:
    # * chain (>>): Compose models in a 'feed forward' style,
    # i.e. chain(f, g)(x) -> g(f(x))
    # * clone (**): Create n copies of a model, and chain them, i.e.
    # (f ** 3)(x) -> f''(f'(f(x))), where f, f' and f'' have distinct weights.
    # * concatenate (|): Merge the outputs of two models into a single vector,
    # i.e. (f|g)(x) -> hstack(f(x), g(x))
    Model.lsuv = True
    # Model.ops = CupyOps()
    with Model.define_operators({">>": chain, "**": clone, "|": concatenate, "+": add}):
        mwe_encode = ExtractWindow(nW=1) >> LN(
            Maxout(width, drop_factor=0.0, pieces=pieces)
        )

        sent2vec = (
            flatten_add_lengths
            >> with_getitem(
                0,
                (HashEmbed(width, 3000) | StaticVectors("en", width))
                >> LN(Maxout(width, width * 2))
                >> Residual(mwe_encode) ** depth,
            )  # : word_ids{T}
            >> Pooling(mean_pool, max_pool)
            >> Residual(LN(Maxout(width * 2, pieces=pieces), nO=width * 2)) ** 2
            >> logistic
        )
        model = Siamese(sent2vec, CauchySimilarity(width * 2))

    print("Read and parse data: %s" % dataset)
    if dataset == "quora":
        train, dev = datasets.quora_questions()
    elif dataset == "snli":
        train, dev = datasets.snli()
    elif dataset == "stackxc":
        train, dev = datasets.stack_exchange()
    elif dataset in ("quora+snli", "snli+quora"):
        train, dev = datasets.quora_questions()
        train2, dev2 = datasets.snli()
        train.extend(train2)
        dev.extend(dev2)
    else:
        raise ValueError("Unknown dataset: %s" % dataset)
    get_ids = get_word_ids(Model.ops)
    train_X, train_y = preprocess(model.ops, nlp, train, get_ids)
    dev_X, dev_y = preprocess(model.ops, nlp, dev, get_ids)

    with model.begin_training(train_X[:10000], train_y[:10000], **cfg) as (
        trainer,
        optimizer,
    ):
        # Pass a callback to print progress. Give it all the local scope,
        # because why not?
        trainer.each_epoch.append(track_progress(**locals()))
        trainer.batch_size = min_batch_size
        batch_size = float(min_batch_size)
        print("Accuracy before training", model.evaluate_logloss(dev_X, dev_y))
        print("Train")
        global epoch_train_acc
        n_iter = 0

        for X, y in trainer.iterate(train_X, train_y, progress_bar=not quiet):
            # Slightly useful trick: Decay the dropout as training proceeds.
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            assert yh.shape == y.shape, (yh.shape, y.shape)

            assert (yh >= 0.0).all(), yh
            train_acc = ((yh >= 0.5) == (y >= 0.5)).sum()
            loss = model.ops.xp.abs(yh - y).mean()
            epoch_train_acc += train_acc
            backprop(yh - y, optimizer)
            n_iter += 1

            # Slightly useful trick: start with low batch size, accelerate.
            trainer.batch_size = min(int(batch_size), max_batch_size)
            batch_size *= 1.001
        if out_loc:
            out_loc = Path(out_loc)
            print("Saving to", out_loc)
            with out_loc.open("wb") as file_:
                pickle.dump(model, file_, -1)
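The comments in Example #35 spell out the operator semantics used throughout this page: ">>" composes models (chain(f, g)(x) == g(f(x))), "**" clones a model n times and chains the copies, and "|" concatenates outputs (hstack(f(x), g(x))). The plain-NumPy sketch below makes those three behaviours concrete with ordinary functions; it is only an analogy, not thinc's combinators:

import numpy as np

def chain(f, g):
    # (f >> g)(x) == g(f(x))
    return lambda x: g(f(x))

def concatenate(f, g):
    # (f | g)(x) == hstack(f(x), g(x))
    return lambda x: np.hstack([f(x), g(x)])

def clone(make_layer, n):
    # f ** n: n copies with their own "weights", chained end to end.
    layers = [make_layer() for _ in range(n)]
    composed = layers[0]
    for layer in layers[1:]:
        composed = chain(composed, layer)
    return composed

x = np.array([1.0, 2.0])
double = lambda v: v * 2.0
add_one = lambda v: v + 1.0
print(chain(double, add_one)(x))        # [3. 5.]
print(concatenate(double, add_one)(x))  # [2. 4. 2. 3.]

# Three copies of a layer, each adding its own random bias:
stacked = clone(lambda: (lambda v, b=np.random.randn(2): v + b), 3)
print(stacked(x).shape)                 # (2,)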
Example #36
def main(
    width=100,
    depth=4,
    vector_length=64,
    min_batch_size=1,
    max_batch_size=32,
    learn_rate=0.001,
    momentum=0.9,
    dropout=0.5,
    dropout_decay=1e-4,
    nb_epoch=20,
    L2=1e-6,
):
    cfg = dict(locals())
    print(cfg)
    prefer_gpu()
    train_data, check_data, nr_tag = ancora_pos_tags()

    extracter = FeatureExtracter("es", attrs=[LOWER, SHAPE, PREFIX, SUFFIX])
    Model.lsuv = True
    with Model.define_operators({
            "**": clone,
            ">>": chain,
            "+": add,
            "|": concatenate
    }):
        lower_case = HashEmbed(width, 100, column=0)
        shape = HashEmbed(width // 2, 200, column=1)
        prefix = HashEmbed(width // 2, 100, column=2)
        suffix = HashEmbed(width // 2, 100, column=3)

        model = (with_flatten(
            (lower_case | shape | prefix | suffix) >> Maxout(width, pieces=3),
            pad=depth) >> with_pad_and_mask(
                MultiHeadedAttention(nM=width, nH=4)) >> with_flatten(
                    Softmax(nr_tag)))

    train_X, train_y = preprocess(model.ops, extracter, train_data, nr_tag)
    dev_X, dev_y = preprocess(model.ops, extracter, check_data, nr_tag)

    n_train = float(sum(len(x) for x in train_X))
    global epoch_train_acc
    with model.begin_training(train_X[:5000], train_y[:5000], **cfg) as (
            trainer,
            optimizer,
    ):
        trainer.each_epoch.append(track_progress(**locals()))
        trainer.batch_size = min_batch_size
        batch_size = float(min_batch_size)
        for X, y in trainer.iterate(train_X, train_y):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)

            gradient = [yh[i] - y[i] for i in range(len(yh))]

            backprop(gradient, optimizer)

            trainer.batch_size = min(int(batch_size), max_batch_size)
            batch_size *= 1.001
    with model.use_params(trainer.optimizer.averages):
        print(model.evaluate(dev_X, model.ops.flatten(dev_y)))
        with open("/tmp/model.pickle", "wb") as file_:
            pickle.dump(model, file_)
Example #37
def build_text_classifier(nr_class, width=64, **cfg):
    nr_vector = cfg.get('nr_vector', 5000)
    pretrained_dims = cfg.get('pretrained_dims', 0)
    with Model.define_operators({'>>': chain, '+': add, '|': concatenate,
                                 '**': clone}):
        if cfg.get('low_data') and pretrained_dims:
            model = (
                SpacyVectors
                >> flatten_add_lengths
                >> with_getitem(0, Affine(width, pretrained_dims))
                >> ParametricAttention(width)
                >> Pooling(sum_pool)
                >> Residual(ReLu(width, width)) ** 2
                >> zero_init(Affine(nr_class, width, drop_factor=0.0))
                >> logistic
            )
            return model

        lower = HashEmbed(width, nr_vector, column=1)
        prefix = HashEmbed(width//2, nr_vector, column=2)
        suffix = HashEmbed(width//2, nr_vector, column=3)
        shape = HashEmbed(width//2, nr_vector, column=4)

        trained_vectors = (
            FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID])
            >> with_flatten(
                uniqued(
                    (lower | prefix | suffix | shape)
                    >> LN(Maxout(width, width+(width//2)*3)),
                    column=0
                )
            )
        )

        if pretrained_dims:
            static_vectors = (
                SpacyVectors
                >> with_flatten(Affine(width, pretrained_dims))
            )
            # TODO Make concatenate support lists
            vectors = concatenate_lists(trained_vectors, static_vectors)
            vectors_width = width*2
        else:
            vectors = trained_vectors
            vectors_width = width
            static_vectors = None
        cnn_model = (
            vectors
            >> with_flatten(
                LN(Maxout(width, vectors_width))
                >> Residual(
                    (ExtractWindow(nW=1) >> LN(Maxout(width, width*3)))
                ) ** 2, pad=2
            )
            >> flatten_add_lengths
            >> ParametricAttention(width)
            >> Pooling(sum_pool)
            >> Residual(zero_init(Maxout(width, width)))
            >> zero_init(Affine(nr_class, width, drop_factor=0.0))
        )

        linear_model = (
            _preprocess_doc
            >> LinearModel(nr_class)
        )
        #model = linear_model >> logistic

        model = (
            (linear_model | cnn_model)
            >> zero_init(Affine(nr_class, nr_class*2, drop_factor=0.0))
            >> logistic
        )
    model.nO = nr_class
    model.lsuv = False
    return model
Example #38
def build_text_classifier(nr_class, width=64, **cfg):
    depth = cfg.get("depth", 2)
    nr_vector = cfg.get("nr_vector", 5000)
    pretrained_dims = cfg.get("pretrained_dims", 0)
    with Model.define_operators({
            ">>": chain,
            "+": add,
            "|": concatenate,
            "**": clone
    }):
        if cfg.get("low_data") and pretrained_dims:
            model = (SpacyVectors >> flatten_add_lengths >> with_getitem(
                0, Affine(width, pretrained_dims)) >>
                     ParametricAttention(width) >> Pooling(sum_pool) >>
                     Residual(ReLu(width, width))**2 >> zero_init(
                         Affine(nr_class, width, drop_factor=0.0)) >> logistic)
            return model

        lower = HashEmbed(width, nr_vector, column=1)
        prefix = HashEmbed(width // 2, nr_vector, column=2)
        suffix = HashEmbed(width // 2, nr_vector, column=3)
        shape = HashEmbed(width // 2, nr_vector, column=4)

        trained_vectors = FeatureExtracter(
            [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]) >> with_flatten(
                uniqued(
                    (lower | prefix | suffix | shape) >> LN(
                        Maxout(width, width + (width // 2) * 3)),
                    column=0,
                ))

        if pretrained_dims:
            static_vectors = SpacyVectors >> with_flatten(
                Affine(width, pretrained_dims))
            # TODO Make concatenate support lists
            vectors = concatenate_lists(trained_vectors, static_vectors)
            vectors_width = width * 2
        else:
            vectors = trained_vectors
            vectors_width = width
            static_vectors = None
        tok2vec = vectors >> with_flatten(
            LN(Maxout(width, vectors_width)) >> Residual(
                (ExtractWindow(nW=1) >> LN(Maxout(width, width * 3))))**depth,
            pad=depth,
        )
        cnn_model = (
            tok2vec >> flatten_add_lengths >> ParametricAttention(width) >>
            Pooling(sum_pool) >> Residual(zero_init(Maxout(width, width))) >>
            zero_init(Affine(nr_class, width, drop_factor=0.0)))

        linear_model = build_bow_text_classifier(nr_class,
                                                 ngram_size=cfg.get(
                                                     "ngram_size", 1),
                                                 exclusive_classes=False)
        if cfg.get("exclusive_classes"):
            output_layer = Softmax(nr_class, nr_class * 2)
        else:
            output_layer = (zero_init(
                Affine(nr_class, nr_class * 2, drop_factor=0.0)) >> logistic)
        model = (linear_model | cnn_model) >> output_layer
        model.tok2vec = chain(tok2vec, flatten)
    model.nO = nr_class
    model.lsuv = False
    return model
Example #39
def main(dataset='quora',
         width=50,
         depth=2,
         min_batch_size=1,
         max_batch_size=512,
         dropout=0.2,
         dropout_decay=0.0,
         pooling="mean+max",
         nb_epoch=5,
         pieces=3,
         L2=0.0,
         use_gpu=False,
         out_loc=None,
         quiet=False,
         job_id=None,
         ws_api_url=None,
         rest_api_url=None):
    global CTX
    if job_id is not None:
        CTX = neptune.Context()
        width = CTX.params.width
        L2 = CTX.params.L2
        nb_epoch = CTX.params.nb_epoch
        depth = CTX.params.depth
        max_batch_size = CTX.params.max_batch_size
    cfg = dict(locals())

    if out_loc:
        out_loc = Path(out_loc)
        if not out_loc.parent.exists():
            raise IOError("Can't open output location: %s" % out_loc)
    print(cfg)
    if pooling == 'mean+max':
        pool_layer = Pooling(mean_pool, max_pool)
    elif pooling == "mean":
        pool_layer = mean_pool
    elif pooling == "max":
        pool_layer = max_pool
    else:
        raise ValueError("Unrecognised pooling", pooling)

    print("Load spaCy")
    nlp = get_spacy('en')

    if use_gpu:
        Model.ops = CupyOps()

    print("Construct model")
    # Bind operators for the scope of the block:
    # * chain (>>): Compose models in a 'feed forward' style,
    # i.e. chain(f, g)(x) -> g(f(x))
    # * clone (**): Create n copies of a model, and chain them, i.e.
    # (f ** 3)(x) -> f''(f'(f(x))), where f, f' and f'' have distinct weights.
    # * concatenate (|): Merge the outputs of two models into a single vector,
    # i.e. (f|g)(x) -> hstack(f(x), g(x))
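    # * add (+): Sum the outputs of two models elementwise,
    # i.e. (f + g)(x) -> f(x) + g(x)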
    Model.lsuv = True
    #Model.ops = CupyOps()
    with Model.define_operators({
            '>>': chain,
            '**': clone,
            '|': concatenate,
            '+': add
    }):
        mwe_encode = ExtractWindow(nW=1) >> BN(
            Maxout(width, drop_factor=0.0, pieces=pieces))

        sent2vec = (  # List[spacy.token.Doc]{B}
            flatten_add_lengths  # : (ids{T}, lengths{B})
            >> with_getitem(
                0,
                #(StaticVectors('en', width)
                HashEmbed(width, 3000)
                #+ HashEmbed(width, 3000))
                #>> Residual(mwe_encode ** 2)
            )  # : word_ids{T}
            >> Pooling(mean_pool, max_pool)
            #>> Residual(BN(Maxout(width*2, pieces=pieces), nO=width*2)**2)
            >> Maxout(width * 2, pieces=pieces, drop_factor=0.0) >> logistic)
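        # Siamese encodes both sentences of a pair with the same sent2vec
        # weights; CauchySimilarity then scores the two vectors with a learned
        # Cauchy-style similarity, giving a value between 0 and 1.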
        model = Siamese(sent2vec, CauchySimilarity(width * 2))

    print("Read and parse data: %s" % dataset)
    if dataset == 'quora':
        train, dev = datasets.quora_questions()
    elif dataset == 'snli':
        train, dev = datasets.snli()
    elif dataset == 'stackxc':
        train, dev = datasets.stack_exchange()
    elif dataset in ('quora+snli', 'snli+quora'):
        train, dev = datasets.quora_questions()
        train2, dev2 = datasets.snli()
        train.extend(train2)
        dev.extend(dev2)
    else:
        raise ValueError("Unknown dataset: %s" % dataset)
    get_ids = get_word_ids(Model.ops)
    train_X, train_y = preprocess(model.ops, nlp, train, get_ids)
    dev_X, dev_y = preprocess(model.ops, nlp, dev, get_ids)

    with model.begin_training(train_X[:10000], train_y[:10000],
                              **cfg) as (trainer, optimizer):
        # Pass a callback to print progress. Give it all the local scope,
        # because why not?
        trainer.each_epoch.append(track_progress(**locals()))
        trainer.batch_size = min_batch_size
        batch_size = float(min_batch_size)
        print("Accuracy before training", model.evaluate_logloss(dev_X, dev_y))
        print("Train")
        global epoch_train_acc
        n_iter = 0

        for X, y in trainer.iterate(train_X, train_y, progress_bar=not quiet):
            # Slightly useful trick: Decay the dropout as training proceeds.
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            assert yh.shape == y.shape, (yh.shape, y.shape)

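            # yh holds similarity scores in [0, 1]; accuracy counts matching
            # binary decisions, and the gradient fed to backprop is yh - y.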
            assert (yh >= 0.).all(), yh
            train_acc = ((yh >= 0.5) == (y >= 0.5)).sum()
            loss = model.ops.xp.abs(yh - y).mean()
            epoch_train_acc += train_acc
            backprop(yh - y, optimizer)
            n_iter += 1

            # Slightly useful trick: start with low batch size, accelerate.
            trainer.batch_size = min(int(batch_size), max_batch_size)
            batch_size *= 1.001
        if out_loc:
            out_loc = Path(out_loc)
            print('Saving to', out_loc)
            with out_loc.open('wb') as file_:
                pickle.dump(model, file_, -1)
Example #40
0
def main(nH=6,
         dropout=0.1,
         nS=6,
         nB=64,
         nE=20,
         use_gpu=-1,
         lim=1000000,
         nM=300,
         mL=100,
         save=False,
         nTGT=5000,
         save_name="model.pkl"):
    if use_gpu != -1:
        spacy.require_gpu()
        device = 'cuda'
    else:
        device = 'cpu'
    ''' Read dataset '''
    nlp = spacy.load('en_core_web_sm')
    print('English model loaded')
    for control_token in ("<eos>", "<bos>", "<pad>", "<cls>", "<mask>"):
        nlp.tokenizer.add_special_case(control_token, [{ORTH: control_token}])

    train, dev, test = get_iwslt()
    print('Dataset loaded')

    train, _ = zip(*train)
    dev, _ = zip(*dev)
    test, _ = zip(*test)

    train = train[:lim]
    dev = dev[:lim]
    test = test[:lim]
    ''' Tokenize '''
    train = spacy_tokenize(nlp.tokenizer, train, mL=mL)
    dev = spacy_tokenize(nlp.tokenizer, dev, mL=mL)
    test = spacy_tokenize(nlp.tokenizer, test, mL=mL)
    print('Tokenization finished')
    ''' Set rank based on all the docs '''
    all_docs = train + dev + test
    set_rank(nlp.vocab, all_docs, nTGT=nTGT)

    train = set_numeric_ids(nlp.vocab, train)
    dev = set_numeric_ids(nlp.vocab, dev)
    test = set_numeric_ids(nlp.vocab, test)
    print('Numeric ids set')

    word2indx, indx2word = get_dicts(nlp.vocab)
    print('Vocab dictionaries grabbed')

    with Model.define_operators({">>": chain}):
        embed_cols = [ORTH, SHAPE, PREFIX, SUFFIX]
        extractor = FeatureExtracter(attrs=embed_cols)
        position_encode = PositionEncode(mL, nM)
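        # Masked-token model (a reading of the helper names): hashed
        # ORTH/SHAPE/PREFIX/SUFFIX embeddings plus a residual positional
        # encoding feed the multi-head Encoder (nH heads, nS layers), with a
        # Softmax over the nTGT vocabulary at every position.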
        model = (
            extractor
            >> with_flatten(FancyEmbed(nM, nTGT, cols=embed_cols))
            >> Residual(position_encode)
            >> create_model_input()
            >> Encoder(nM=nM, nS=nS, nH=nH, device=device)
            >> with_reshape(Softmax(nO=nTGT, nI=nM))
        )
        ''' Progress tracking '''
        losses = [0.]
        train_accuracies = [0.]
        train_totals = [0.]
        dev_accuracies = [0.]
        dev_loss = [0.]

        def track_progress():
            correct = 0.
            total = 0.
            ''' Get dev stats '''
            for X0 in minibatch(dev, size=nB):
                X1, loss_mask = random_mask(X0, nlp, indx2word, nlp.vocab, mL)
                Xh = model(X1)
                L, C, t = get_loss(Xh, X0, X1, loss_mask)
                correct += C
                total += t
                dev_loss[-1] += (L**2).sum()
            dev_accuracies[-1] = correct / total
            print(len(losses), losses[-1],
                  train_accuracies[-1] / train_totals[-1], dev_loss[-1],
                  dev_accuracies[-1])
            dev_loss.append(0.)
            losses.append(0.)
            train_accuracies.append(0.)
            dev_accuracies.append(0.)
            train_totals.append(0.)
            if save:
                model.to_disk('.models/' + save_name)

        ''' Model training '''
        with model.begin_training(batch_size=nB,
                                  nb_epoch=nE) as (trainer, optimizer):
            trainer.dropout = dropout
            trainer.dropout_decay = 1e-4
            optimizer.alpha = 0.001
            optimizer.L2 = 1e-6
            optimizer.max_grad_norm = 1.0
            trainer.each_epoch.append(track_progress)
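            # Cloze-style training (as the helper names suggest): random_mask
            # corrupts some tokens of X0 to produce X1, and the model is
            # trained to recover the original tokens at the masked positions.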
            for X0, _ in trainer.iterate(train, train):
                X1, loss_mask = random_mask(X0, nlp, indx2word, nlp.vocab, mL)
                Xh, backprop = model.begin_update(X1, drop=dropout)
                dXh, C, total = get_loss(Xh, X0, X1, loss_mask)
                backprop(dXh, sgd=optimizer)
                losses[-1] += (dXh**2).sum()
                train_accuracies[-1] += C
                train_totals[-1] += total

def main(nH=6, dropout=0.1, nS=6, nB=15, nE=20, use_gpu=-1, lim=2000):
    if use_gpu != -1:
        # TODO: Make specific to different devices, e.g. 1 vs 0
        spacy.require_gpu()
    train, dev, test = get_iwslt()
    train_X, train_Y = zip(*train)
    dev_X, dev_Y = zip(*dev)
    test_X, test_Y = zip(*test)
    ''' Read dataset '''
    nlp_en = spacy.load('en_core_web_sm')
    nlp_de = spacy.load('de_core_news_sm')
    print('Models loaded')
    for control_token in ("<eos>", "<bos>", "<pad>"):
        nlp_en.tokenizer.add_special_case(control_token, [{
            ORTH: control_token
        }])
        nlp_de.tokenizer.add_special_case(control_token, [{
            ORTH: control_token
        }])
    train_X, train_Y = spacy_tokenize(nlp_en.tokenizer, nlp_de.tokenizer,
                                      train_X[-lim:], train_Y[-lim:],
                                      MAX_LENGTH)
    dev_X, dev_Y = spacy_tokenize(nlp_en.tokenizer, nlp_de.tokenizer,
                                  dev_X[-lim:], dev_Y[-lim:], MAX_LENGTH)
    test_X, test_Y = spacy_tokenize(nlp_en.tokenizer, nlp_de.tokenizer,
                                    test_X[-lim:], test_Y[-lim:], MAX_LENGTH)
    train_X = set_numeric_ids(nlp_en.vocab, train_X, vocab_size=VOCAB_SIZE)
    train_Y = set_numeric_ids(nlp_de.vocab, train_Y, vocab_size=VOCAB_SIZE)
    nTGT = VOCAB_SIZE

    with Model.define_operators({">>": chain}):
        embed_cols = [ORTH, SHAPE, PREFIX, SUFFIX]
        extractor = FeatureExtracter(attrs=embed_cols)
        position_encode = PositionEncode(MAX_LENGTH, MODEL_SIZE)
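        # Translation model (as the helper names suggest): source and target
        # docs are embedded and position-encoded separately via apply_layers,
        # then combined into one batch for the EncoderDecoder.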
        model = (apply_layers(extractor, extractor) >> apply_layers(
            with_flatten(FancyEmbed(MODEL_SIZE, 5000, cols=embed_cols)),
            with_flatten(FancyEmbed(MODEL_SIZE, 5000, cols=embed_cols)),
        ) >> apply_layers(Residual(position_encode), Residual(position_encode))
                 >> create_batch() >> EncoderDecoder(nS=nS, nH=nH, nTGT=nTGT))

    losses = [0.]
    train_accuracies = [0.]
    train_totals = [0.]
    dev_accuracies = [0.]
    dev_loss = [0.]

    def track_progress():
        correct = 0.
        total = 0.
        for batch in minibatch(zip(dev_X, dev_Y), size=1024):
            X, Y = zip(*batch)
            Yh, Y_mask = model((X, Y))
            L, C = get_loss(model.ops, Yh, Y, Y_mask)
            correct += C
            dev_loss[-1] += (L**2).sum()
            total += len(Y)
        dev_accuracies[-1] = correct / total
        n_train = train_totals[-1]
        print(len(losses), losses[-1], train_accuracies[-1] / n_train,
              dev_loss[-1], dev_accuracies[-1])
        dev_loss.append(0.)
        losses.append(0.)
        train_accuracies.append(0.)
        dev_accuracies.append(0.)
        train_totals.append(0.)

    with model.begin_training(batch_size=nB,
                              nb_epoch=nE) as (trainer, optimizer):
        trainer.dropout = dropout
        trainer.dropout_decay = 1e-4
        trainer.each_epoch.append(track_progress)
        optimizer.alpha = 0.001
        optimizer.L2 = 1e-6
        optimizer.max_grad_norm = 1.0
        for X, Y in trainer.iterate(train_X, train_Y):
            (Yh, X_mask), backprop = model.begin_update((X, Y), drop=dropout)
            dYh, C = get_loss(model.ops, Yh, Y, X_mask)
            backprop(dYh, sgd=optimizer)
            losses[-1] += (dYh**2).sum()
            train_accuracies[-1] += C
            train_totals[-1] += sum(len(y) for y in Y)

def main(nH=6,
         dropout=0.0,
         nS=6,
         nB=32,
         nE=20,
         use_gpu=-1,
         lim=2000,
         nM=300,
         mL=20,
         nTGT=3500,
         save=False,
         load=False,
         save_name="model.pkl",
         load_name="model.pkl"):
    if use_gpu != -1:
        # TODO: Make specific to different devices, e.g. 1 vs 0
        spacy.require_gpu()
        device = 'cuda'
    else:
        device = 'cpu'
    train, dev, test = get_iwslt()
    train_X, train_Y = zip(*train)
    dev_X, dev_Y = zip(*dev)
    test_X, test_Y = zip(*test)
    ''' Read dataset '''
    nlp_en = spacy.load('en_core_web_sm')
    nlp_de = spacy.load('de_core_news_sm')
    print('Models loaded')
    for control_token in ("<eos>", "<bos>", "<pad>"):
        nlp_en.tokenizer.add_special_case(control_token, [{
            ORTH: control_token
        }])
        nlp_de.tokenizer.add_special_case(control_token, [{
            ORTH: control_token
        }])
    train_lim = min(lim, len(train_X))
    dev_lim = min(lim, len(dev_X))
    test_lim = min(lim, len(test_X))
    train_X, train_Y = spacy_tokenize(nlp_en.tokenizer, nlp_de.tokenizer,
                                      train_X[:train_lim], train_Y[:train_lim],
                                      mL)
    dev_X, dev_Y = spacy_tokenize(nlp_en.tokenizer, nlp_de.tokenizer,
                                  dev_X[:dev_lim], dev_Y[:dev_lim], mL)
    test_X, test_Y = spacy_tokenize(nlp_en.tokenizer, nlp_de.tokenizer,
                                    test_X[:test_lim], test_Y[:test_lim], mL)
    all_X_docs = train_X + dev_X + test_X
    all_y_docs = train_Y + dev_Y + test_Y
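    # set_rank / set_numeric_ids appear to rank each vocabulary by frequency
    # over all splits (keeping the nTGT most frequent types) and replace
    # tokens with their numeric ids.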
    set_rank(nlp_en.vocab, all_X_docs, nTGT=nTGT)
    set_rank(nlp_de.vocab, all_y_docs, nTGT=nTGT)
    train_X = set_numeric_ids(nlp_en.vocab, train_X)
    dev_X = set_numeric_ids(nlp_en.vocab, dev_X)
    test_X = set_numeric_ids(nlp_en.vocab, test_X)
    train_Y = set_numeric_ids(nlp_de.vocab, train_Y)
    dev_Y = set_numeric_ids(nlp_de.vocab, dev_Y)
    test_Y = set_numeric_ids(nlp_de.vocab, test_Y)

    en_word2indx, en_indx2word = get_dicts(nlp_en.vocab)
    de_word2indx, de_indx2word = get_dicts(nlp_de.vocab)
    nTGT += 1

    if not load:
        with Model.define_operators({">>": chain}):
            embed_cols = [ORTH, SHAPE, PREFIX, SUFFIX]
            extractor = FeatureExtracter(attrs=embed_cols)
            position_encode = PositionEncode(mL, nM)
            model = (apply_layers(extractor, extractor) >> apply_layers(
                with_flatten(FancyEmbed(nM, 5000, cols=embed_cols)),
                with_flatten(FancyEmbed(nM, 5000, cols=embed_cols)),
            ) >> apply_layers(Residual(position_encode),
                              Residual(position_encode)) >> create_batch() >>
                     EncoderDecoder(
                         nS=nS, nH=nH, nTGT=nTGT, nM=nM, device=device))
    else:
        model = Model.from_disk(load_name)

    losses = [0.]
    train_accuracies = [0.]
    train_totals = [0.]
    dev_accuracies = [0.]
    dev_loss = [0.]

    def track_progress():
        correct = 0.
        total = 0.
        for batch in minibatch(zip(dev_X, dev_Y), size=1024):
            X, Y = zip(*batch)
            Yh, Y_mask = model((X, Y))
            L, C, t = get_loss(model.ops, Yh, Y, Y_mask)
            correct += C
            total += t
            dev_loss[-1] += (L**2).sum()
        dev_accuracies[-1] = correct / total
        n_train = train_totals[-1]
        print(len(losses), losses[-1], train_accuracies[-1] / n_train,
              dev_loss[-1], dev_accuracies[-1])
        dev_loss.append(0.)
        losses.append(0.)
        train_accuracies.append(0.)
        dev_accuracies.append(0.)
        train_totals.append(0.)

    with model.begin_training(batch_size=nB,
                              nb_epoch=nE) as (trainer, optimizer):
        trainer.dropout = dropout
        trainer.dropout_decay = 1e-4
        optimizer.alpha = 0.001
        optimizer.L2 = 1e-6
        optimizer.max_grad_norm = 1.0
        trainer.each_epoch.append(track_progress)
        for X, Y in trainer.iterate(train_X, train_Y):
            (Yh, X_mask), backprop = model.begin_update((X, Y))
            dYh, C, total = get_loss(model.ops, Yh, Y, X_mask)
            backprop(dYh, sgd=optimizer)
            losses[-1] += (dYh**2).sum()
            train_accuracies[-1] += C
            train_totals[-1] += total
    if save:
        model.to_disk(save_name)
Example #43
0
def build_text_classifier(nr_class, width=64, **cfg):
    depth = cfg.get("depth", 2)
    nr_vector = cfg.get("nr_vector", 5000)
    pretrained_dims = cfg.get("pretrained_dims", 0)
    with Model.define_operators({">>": chain, "+": add, "|": concatenate, "**": clone}):
        if cfg.get("low_data") and pretrained_dims:
            model = (
                SpacyVectors
                >> flatten_add_lengths
                >> with_getitem(0, Affine(width, pretrained_dims))
                >> ParametricAttention(width)
                >> Pooling(sum_pool)
                >> Residual(ReLu(width, width)) ** 2
                >> zero_init(Affine(nr_class, width, drop_factor=0.0))
                >> logistic
            )
            return model

        lower = HashEmbed(width, nr_vector, column=1)
        prefix = HashEmbed(width // 2, nr_vector, column=2)
        suffix = HashEmbed(width // 2, nr_vector, column=3)
        shape = HashEmbed(width // 2, nr_vector, column=4)

        trained_vectors = FeatureExtracter(
            [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
        ) >> with_flatten(
            uniqued(
                (lower | prefix | suffix | shape)
                >> LN(Maxout(width, width + (width // 2) * 3)),
                column=0,
            )
        )

        if pretrained_dims:
            static_vectors = SpacyVectors >> with_flatten(
                Affine(width, pretrained_dims)
            )
            # TODO Make concatenate support lists
            vectors = concatenate_lists(trained_vectors, static_vectors)
            vectors_width = width * 2
        else:
            vectors = trained_vectors
            vectors_width = width
            static_vectors = None
        tok2vec = vectors >> with_flatten(
            LN(Maxout(width, vectors_width))
            >> Residual((ExtractWindow(nW=1) >> LN(Maxout(width, width * 3)))) ** depth,
            pad=depth,
        )
        cnn_model = (
            tok2vec
            >> flatten_add_lengths
            >> ParametricAttention(width)
            >> Pooling(sum_pool)
            >> Residual(zero_init(Maxout(width, width)))
            >> zero_init(Affine(nr_class, width, drop_factor=0.0))
        )

        linear_model = build_bow_text_classifier(
            nr_class, ngram_size=cfg.get("ngram_size", 1), exclusive_classes=False
        )
        if cfg.get("exclusive_classes"):
            output_layer = Softmax(nr_class, nr_class * 2)
        else:
            output_layer = (
                zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0)) >> logistic
            )
        model = (linear_model | cnn_model) >> output_layer
        model.tok2vec = chain(tok2vec, flatten)
    model.nO = nr_class
    model.lsuv = False
    return model
Example #44
0
def Tok2Vec(width, embed_size, **kwargs):
    # Circular imports :(
    from .._ml import CharacterEmbed
    from .._ml import PyTorchBiLSTM

    pretrained_vectors = kwargs.get("pretrained_vectors", None)
    cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3)
    subword_features = kwargs.get("subword_features", True)
    char_embed = kwargs.get("char_embed", False)
    if char_embed:
        subword_features = False
    conv_depth = kwargs.get("conv_depth", 4)
    bilstm_depth = kwargs.get("bilstm_depth", 0)
    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
    with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
        norm = HashEmbed(width,
                         embed_size,
                         column=cols.index(NORM),
                         name="embed_norm",
                         seed=6)
        if subword_features:
            prefix = HashEmbed(width,
                               embed_size // 2,
                               column=cols.index(PREFIX),
                               name="embed_prefix",
                               seed=7)
            suffix = HashEmbed(width,
                               embed_size // 2,
                               column=cols.index(SUFFIX),
                               name="embed_suffix",
                               seed=8)
            shape = HashEmbed(width,
                              embed_size // 2,
                              column=cols.index(SHAPE),
                              name="embed_shape",
                              seed=9)
        else:
            prefix, suffix, shape = (None, None, None)
        if pretrained_vectors is not None:
            glove = StaticVectors(pretrained_vectors,
                                  width,
                                  column=cols.index(ID))

            if subword_features:
                embed = uniqued(
                    (glove | norm | prefix | suffix | shape) >> LN(
                        Maxout(width, width * 5, pieces=3)),
                    column=cols.index(ORTH),
                )
            elif char_embed:
                embed = concatenate_lists(
                    CharacterEmbed(nM=64, nC=8),
                    FeatureExtracter(cols) >> with_flatten(glove),
                )
                reduce_dimensions = LN(
                    Maxout(width, 64 * 8 + width, pieces=cnn_maxout_pieces))
            else:
                embed = uniqued(
                    (glove | norm) >> LN(Maxout(width, width * 2, pieces=3)),
                    column=cols.index(ORTH),
                )
        elif subword_features:
            embed = uniqued(
                (norm | prefix | suffix | shape) >> LN(
                    Maxout(width, width * 4, pieces=3)),
                column=cols.index(ORTH),
            )
        elif char_embed:
            embed = concatenate_lists(
                CharacterEmbed(nM=64, nC=8),
                FeatureExtracter(cols) >> with_flatten(norm),
            )
            reduce_dimensions = LN(
                Maxout(width, 64 * 8 + width, pieces=cnn_maxout_pieces))
        else:
            embed = norm

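        # `embed` now maps each token's feature row to a width-dimensional
        # vector (static vectors, hashed subword features and/or character
        # embeddings, depending on the flags above). Each convolution block
        # concatenates a +/-1 token window, reduces it back to width with a
        # layer-normalised Maxout, and wraps that in a residual connection.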
        convolution = Residual(
            ExtractWindow(
                nW=1) >> LN(Maxout(width, width *
                                   3, pieces=cnn_maxout_pieces)))
        if char_embed:
            tok2vec = embed >> with_flatten(
                reduce_dimensions >> convolution**conv_depth, pad=conv_depth)
        else:
            tok2vec = FeatureExtracter(cols) >> with_flatten(
                embed >> convolution**conv_depth, pad=conv_depth)

        if bilstm_depth >= 1:
            tok2vec = tok2vec >> PyTorchBiLSTM(width, width, bilstm_depth)
        # Work around thinc API limitations :(. TODO: Revise in Thinc 7
        tok2vec.nO = width
        tok2vec.embed = embed
    return tok2vec
Example #45
0
def main(nH=6,
         dropout=0.0,
         nS=6,
         nB=32,
         nE=20,
         use_gpu=-1,
         lim=2000,
         nM=300,
         mL=100,
         save=False,
         save_name="model.pkl"):
    if use_gpu != -1:
        # TODO: Make specific to different devices, e.g. 1 vs 0
        spacy.require_gpu()
        device = 'cuda'
    else:
        device = 'cpu'
    ''' Read dataset '''
    nlp = spacy.load('en_core_web_sm')
    for control_token in ("<eos>", "<bos>", "<pad>", "<cls>"):
        nlp.tokenizer.add_special_case(control_token, [{ORTH: control_token}])
    train, dev = imdb(limit=lim)
    print('Loaded imdb dataset')
    train = train[:lim]
    dev = dev[:lim]
    train_X, train_Y = zip(*train)
    dev_X, dev_Y = zip(*dev)
    train_X = spacy_tokenize(nlp.tokenizer, train_X, mL=mL)
    dev_X = spacy_tokenize(nlp.tokenizer, dev_X, mL=mL)
    print('Tokenized dataset')
    train_X = set_numeric_ids(nlp.vocab, train_X)
    dev_X = set_numeric_ids(nlp.vocab, dev_X)
    print('Numeric ids ready')
    with Model.define_operators({">>": chain}):
        embed_cols = [ORTH, SHAPE, PREFIX, SUFFIX]
        extractor = FeatureExtracter(attrs=embed_cols)
        position_encode = PositionEncode(mL, nM)
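        # Sentiment classifier (a reading of the helper names): hashed feature
        # embeddings plus a residual positional encoding feed a
        # Transformer-style Categorizer that predicts the document label.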
        model = (
            extractor
            >> with_flatten(FancyEmbed(nM, 5000, cols=embed_cols))
            >> Residual(position_encode)
            >> create_model_input()
            >> Categorizer(nM=nM, nS=nS, nH=nH, device=device)
        )

    losses = [0.]
    train_accuracies = [0.]
    train_totals = [0.]
    dev_accuracies = [0.]
    dev_loss = [0.]

    def track_progress():
        correct = 0.
        total = 0.
        for batch in minibatch(zip(dev_X, dev_Y), size=1024):
            X, Y = zip(*batch)
            Yh = model(X)
            L, C = get_loss(Yh, Y)
            correct += C
            dev_loss[-1] += (L**2).sum()
            total += len(X)
        dev_accuracies[-1] = correct / total
        n_train = train_totals[-1]
        print(len(losses), losses[-1], train_accuracies[-1] / n_train,
              dev_loss[-1], dev_accuracies[-1])
        dev_loss.append(0.)
        losses.append(0.)
        train_accuracies.append(0.)
        dev_accuracies.append(0.)
        train_totals.append(0.)

    with model.begin_training(batch_size=nB,
                              nb_epoch=nE) as (trainer, optimizer):
        trainer.dropout = dropout
        trainer.dropout_decay = 1e-4
        optimizer.alpha = 0.001
        optimizer.L2 = 1e-6
        optimizer.max_grad_norm = 1.0
        trainer.each_epoch.append(track_progress)
        other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
        for X, Y in trainer.iterate(train_X, train_Y):
            Yh, backprop = model.begin_update(X)
            dYh, C = get_loss(Yh, Y)
            backprop(dYh, sgd=optimizer)
            losses[-1] += (dYh**2).sum()
            train_accuracies[-1] += C
            train_totals[-1] += len(Y)
    if save:
        model.to_disk(save_name)