def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False, **cfg):
    """
    Build a simple CNN text classifier, given a token-to-vector model as input.
    If exclusive_classes=True, a softmax non-linearity is applied, so that the
    outputs sum to 1. If exclusive_classes=False, a logistic non-linearity is
    applied instead, so that outputs are in the range [0, 1].
    """
    with Model.define_operators({">>": chain}):
        if exclusive_classes:
            output_layer = Softmax(nr_class, tok2vec.nO)
        else:
            output_layer = (
                zero_init(Affine(nr_class, tok2vec.nO, drop_factor=0.0)) >> logistic
            )
        model = tok2vec >> flatten_add_lengths >> Pooling(mean_pool) >> output_layer
    model.tok2vec = chain(tok2vec, flatten)
    model.nO = nr_class
    return model
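# The helper below is not part of the original module; it is a minimal numpy
# sketch of what the docstring above describes. With exclusive_classes=True a
# softmax makes the class scores sum to 1, while the non-exclusive path squashes
# each score independently into [0, 1] with a logistic (sigmoid) non-linearity.
def _demo_softmax_vs_logistic():
    import numpy as np

    scores = np.array([2.0, -1.0, 0.5])
    # Softmax: normalise so the scores form a distribution over exclusive classes.
    exp = np.exp(scores - scores.max())
    softmax_out = exp / exp.sum()
    # Logistic: squash each score independently into (0, 1) for multi-label output.
    logistic_out = 1.0 / (1.0 + np.exp(-scores))
    print(softmax_out.sum())  # 1.0
    print(logistic_out)       # each value lies in (0, 1), independently per label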
def main(depth=2, width=512, nb_epoch=30):
    if CupyOps.xp is not None:
        Model.ops = CupyOps()
        Model.Ops = CupyOps
    # The configuration here isn't especially good, but it's enough for a demo.
    with Model.define_operators({'**': clone, '>>': chain}):
        model = ReLu(width) >> ReLu(width) >> Softmax()

    train_data, dev_data, _ = datasets.mnist()
    train_X, train_y = model.ops.unzip(train_data)
    dev_X, dev_y = model.ops.unzip(dev_data)
    dev_y = to_categorical(dev_y)

    with model.begin_training(train_X, train_y, L2=1e-6) as (trainer, optimizer):
        epoch_loss = [0.]

        def report_progress():
            with model.use_params(optimizer.averages):
                print(epoch_loss[-1], model.evaluate(dev_X, dev_y), trainer.dropout)
            epoch_loss.append(0.)

        trainer.each_epoch.append(report_progress)
        trainer.nb_epoch = nb_epoch
        trainer.dropout = 0.3
        trainer.batch_size = 128
        trainer.dropout_decay = 0.0
        train_X = model.ops.asarray(train_X, dtype='float32')
        y_onehot = to_categorical(train_y)
        for X, y in trainer.iterate(train_X, y_onehot):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            loss = ((yh - y) ** 2.).sum() / y.shape[0]
            backprop(yh - y, optimizer)
            epoch_loss[-1] += loss
        with model.use_params(optimizer.averages):
            print('Avg dev.: %.3f' % model.evaluate(dev_X, dev_y))
            with open('out.pickle', 'wb') as file_:
                pickle.dump(model, file_, -1)
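# Not part of the original example: a small numpy check of the training loop
# above. The reported loss is the batch-averaged squared error, while the array
# handed to backprop(), `yh - y`, is the gradient of the unaveraged objective
# 0.5 * sum((yh - y) ** 2) with respect to the predictions.
def _demo_squared_error_gradient():
    import numpy as np

    yh = np.array([[0.7, 0.2, 0.1], [0.1, 0.8, 0.1]])  # predicted class scores
    y = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]])   # one-hot targets
    loss = ((yh - y) ** 2.0).sum() / y.shape[0]         # what the loop accumulates
    d_scores = yh - y                                   # what the loop backpropagates
    print(loss)
    print(d_scores)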
def build_tagger_model(nr_class, **cfg):
    embed_size = util.env_opt('embed_size', 7000)
    if 'token_vector_width' in cfg:
        token_vector_width = cfg['token_vector_width']
    else:
        token_vector_width = util.env_opt('token_vector_width', 128)
    pretrained_dims = cfg.get('pretrained_dims', 0)
    with Model.define_operators({'>>': chain, '+': add}):
        if 'tok2vec' in cfg:
            tok2vec = cfg['tok2vec']
        else:
            tok2vec = Tok2Vec(token_vector_width, embed_size,
                              pretrained_dims=pretrained_dims)
        softmax = with_flatten(Softmax(nr_class, token_vector_width))
        model = tok2vec >> softmax
    model.nI = None
    model.tok2vec = tok2vec
    model.softmax = softmax
    return model
def main(width=32, nr_vector=1000):
    train_data, check_data, nr_tag = ancora_pos_tags(encode_words=True)
    model = with_flatten(
        chain(
            HashEmbed(width, nr_vector),
            ReLu(width, width),
            ReLu(width, width),
            Softmax(nr_tag, width),
        )
    )
    train_X, train_y = zip(*train_data)
    dev_X, dev_y = zip(*check_data)
    train_y = [to_categorical(y, nb_classes=nr_tag) for y in train_y]
    dev_y = [to_categorical(y, nb_classes=nr_tag) for y in dev_y]
    with model.begin_training(train_X, train_y) as (trainer, optimizer):
        trainer.each_epoch.append(lambda: print(model.evaluate(dev_X, dev_y)))
        for X, y in trainer.iterate(train_X, train_y):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            backprop([yh[i] - y[i] for i in range(len(yh))], optimizer)
    with model.use_params(optimizer.averages):
        print(model.evaluate(dev_X, dev_y))
def build_model(nr_class, width, depth, conv_depth, **kwargs):
    with Model.define_operators({'|': concatenate, '>>': chain, '**': clone}):
        embed = (
            (HashEmbed(width, 5000, column=1)
             | StaticVectors('spacy_pretrained_vectors', width, column=5)
             | HashEmbed(width // 2, 750, column=2)
             | HashEmbed(width // 2, 750, column=3)
             | HashEmbed(width // 2, 750, column=4))
            >> LN(Maxout(width))
        )
        sent2vec = (
            flatten_add_lengths
            >> with_getitem(
                0,
                embed
                >> Residual(ExtractWindow(nW=1) >> LN(Maxout(width))) ** conv_depth
            )
            >> ParametricAttention(width)
            >> Pooling(sum_pool)
            >> Residual(LN(Maxout(width))) ** depth
        )
        model = (
            foreach(sent2vec, drop_factor=2.0)
            >> flatten_add_lengths
            # This block would allow the model to learn some cross-sentence
            # features. It's not useful on this problem. It might make more
            # sense to use a BiLSTM here, following Liang et al (2016).
            # >> with_getitem(0,
            #     Residual(ExtractWindow(nW=1) >> LN(Maxout(width))) ** conv_depth
            # )
            >> ParametricAttention(width, hard=False)
            >> Pooling(sum_pool)
            >> Residual(LN(Maxout(width))) ** depth
            >> Softmax(nr_class)
        )
    model.lsuv = False
    return model
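# Illustrative only (not from the original file): a toy numpy picture of the
# two-level scheme above. Each sentence is reduced to a fixed-width vector, and
# the document is then pooled from its sentence vectors. The real model uses
# parametric attention and Maxout blocks; plain sum pooling is shown here purely
# for shape intuition.
def _demo_sentence_then_document_pooling():
    import numpy as np

    width = 4
    doc = [np.random.rand(5, width), np.random.rand(3, width)]   # two sentences of tokens
    sent_vectors = np.stack([sent.sum(axis=0) for sent in doc])  # one vector per sentence
    doc_vector = sent_vectors.sum(axis=0)                        # one vector per document
    print(sent_vectors.shape, doc_vector.shape)                  # (2, 4) (4,)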
def build_model(nr_class, width, depth, conv_depth, vectors_name, **kwargs):
    with Model.define_operators({"|": concatenate, ">>": chain, "**": clone}):
        embed = (
            (HashEmbed(width, 5000, column=1)
             | StaticVectors(vectors_name, width, column=5)
             | HashEmbed(width // 2, 750, column=2)
             | HashEmbed(width // 2, 750, column=3)
             | HashEmbed(width // 2, 750, column=4))
            >> LN(Maxout(width))
        )
        sent2vec = (
            with_flatten(embed)
            >> Residual(
                prepare_self_attention(Affine(width * 3, width), nM=width, nH=4)
                >> MultiHeadedAttention()
                >> with_flatten(Maxout(width, width, pieces=3))
            )
            >> flatten_add_lengths
            >> ParametricAttention(width, hard=False)
            >> Pooling(mean_pool)
            >> Residual(LN(Maxout(width)))
        )
        model = (
            foreach(sent2vec, drop_factor=2.0)
            >> Residual(
                prepare_self_attention(Affine(width * 3, width), nM=width, nH=4)
                >> MultiHeadedAttention()
                >> with_flatten(LN(Affine(width, width)))
            )
            >> flatten_add_lengths
            >> ParametricAttention(width, hard=False)
            >> Pooling(mean_pool)
            >> Residual(LN(Maxout(width))) ** 2
            >> Softmax(nr_class)
        )
    model.lsuv = False
    return model
def build_tagger_model(nr_class, **cfg): embed_size = util.env_opt("embed_size", 2000) if "token_vector_width" in cfg: token_vector_width = cfg["token_vector_width"] else: token_vector_width = util.env_opt("token_vector_width", 96) pretrained_vectors = cfg.get("pretrained_vectors") subword_features = cfg.get("subword_features", True) with Model.define_operators({">>": chain, "+": add}): if "tok2vec" in cfg: tok2vec = cfg["tok2vec"] else: tok2vec = Tok2Vec( token_vector_width, embed_size, subword_features=subword_features, pretrained_vectors=pretrained_vectors, ) softmax = with_flatten(Softmax(nr_class, token_vector_width)) model = tok2vec >> softmax model.nI = None model.tok2vec = tok2vec model.softmax = softmax return model
def build_text_classifier(nr_class, width=64, **cfg):
    depth = cfg.get("depth", 2)
    nr_vector = cfg.get("nr_vector", 5000)
    pretrained_dims = cfg.get("pretrained_dims", 0)
    with Model.define_operators({">>": chain, "+": add, "|": concatenate, "**": clone}):
        if cfg.get("low_data") and pretrained_dims:
            model = (
                SpacyVectors
                >> flatten_add_lengths
                >> with_getitem(0, Affine(width, pretrained_dims))
                >> ParametricAttention(width)
                >> Pooling(sum_pool)
                >> Residual(ReLu(width, width)) ** 2
                >> zero_init(Affine(nr_class, width, drop_factor=0.0))
                >> logistic
            )
            return model

        lower = HashEmbed(width, nr_vector, column=1)
        prefix = HashEmbed(width // 2, nr_vector, column=2)
        suffix = HashEmbed(width // 2, nr_vector, column=3)
        shape = HashEmbed(width // 2, nr_vector, column=4)

        trained_vectors = FeatureExtracter(
            [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
        ) >> with_flatten(
            uniqued(
                (lower | prefix | suffix | shape)
                >> LN(Maxout(width, width + (width // 2) * 3)),
                column=0,
            )
        )

        if pretrained_dims:
            static_vectors = SpacyVectors >> with_flatten(
                Affine(width, pretrained_dims)
            )
            # TODO Make concatenate support lists
            vectors = concatenate_lists(trained_vectors, static_vectors)
            vectors_width = width * 2
        else:
            vectors = trained_vectors
            vectors_width = width
            static_vectors = None

        tok2vec = vectors >> with_flatten(
            LN(Maxout(width, vectors_width))
            >> Residual((ExtractWindow(nW=1) >> LN(Maxout(width, width * 3)))) ** depth,
            pad=depth,
        )
        cnn_model = (
            tok2vec
            >> flatten_add_lengths
            >> ParametricAttention(width)
            >> Pooling(sum_pool)
            >> Residual(zero_init(Maxout(width, width)))
            >> zero_init(Affine(nr_class, width, drop_factor=0.0))
        )
        linear_model = build_bow_text_classifier(
            nr_class, ngram_size=cfg.get("ngram_size", 1), exclusive_classes=False
        )
        if cfg.get("exclusive_classes"):
            output_layer = Softmax(nr_class, nr_class * 2)
        else:
            output_layer = (
                zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0)) >> logistic
            )
        model = (linear_model | cnn_model) >> output_layer
        model.tok2vec = chain(tok2vec, flatten)
    model.nO = nr_class
    model.lsuv = False
    return model
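# Illustrative only (not from the original file): the ensemble above feeds its
# output layer nr_class * 2 features because the bag-of-words scores and the
# CNN scores, each of width nr_class, are concatenated side by side.
def _demo_ensemble_concatenation():
    import numpy as np

    nr_class = 3
    linear_scores = np.array([0.2, 0.5, 0.3])  # stand-in for linear_model output
    cnn_scores = np.array([0.1, 0.7, 0.2])     # stand-in for cnn_model output
    combined = np.concatenate([linear_scores, cnn_scores])
    print(combined.shape)                      # (6,) == (nr_class * 2,)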
def main(width=100, depth=4, vector_length=64,
         min_batch_size=4, max_batch_size=32,
         learn_rate=0.001, momentum=0.9,
         dropout=0.0, dropout_decay=1e-4,
         nb_epoch=20, L2=1e-6):
    cfg = dict(locals())
    print(cfg)
    prefer_gpu()
    train_data, check_data, nr_tag = ancora_pos_tags()
    extracter = FeatureExtracter('es', attrs=[LOWER, SHAPE, PREFIX, SUFFIX])
    Model.lsuv = True
    with Model.define_operators({'**': clone, '>>': chain, '+': add,
                                 '|': concatenate, '&': concatenate_ragged}):
        lower_case = HashEmbed(width, 100, column=0)
        shape = HashEmbed(width // 2, 200, column=1)
        prefix = HashEmbed(width // 2, 100, column=2)
        suffix = HashEmbed(width // 2, 100, column=3)
        model = (
            flatten_add_lengths
            >> with_getitem(
                0,
                (lower_case | shape | prefix | suffix)
                >> LayerNorm(Maxout(width, pieces=3))
            )
            >> concatenate_ragged(
                SelfAttention(nK=16, nO=16, nI=width, nL=1, nR=1),
                SelfAttention(nK=16, nO=16, nI=width, nL=1, nR=1),
                SelfAttention(nK=16, nO=16, nI=width, nL=1, nR=1),
                SelfAttention(nK=16, nO=16, nI=width, nL=1, nR=1),
            )
            >> with_getitem(0, Softmax(nr_tag))
            >> unflatten
        )
    train_X, train_y = preprocess(model.ops, extracter, train_data, nr_tag)
    dev_X, dev_y = preprocess(model.ops, extracter, check_data, nr_tag)
    n_train = float(sum(len(x) for x in train_X))
    global epoch_train_acc
    with model.begin_training(train_X[:5000], train_y[:5000], **cfg) as (trainer, optimizer):
        trainer.each_epoch.append(track_progress(**locals()))
        trainer.batch_size = min_batch_size
        batch_size = float(min_batch_size)
        for X, y in trainer.iterate(train_X, train_y):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            gradient = [yh[i] - y[i] for i in range(len(yh))]
            backprop(gradient, optimizer)
            trainer.batch_size = min(int(batch_size), max_batch_size)
            batch_size *= 1.001
    with model.use_params(trainer.optimizer.averages):
        print(model.evaluate(dev_X, model.ops.flatten(dev_y)))
    with open('/tmp/model.pickle', 'wb') as file_:
        pickle.dump(model, file_)
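# A side calculation (not in the original script): the loop above grows the
# batch size by 0.1% per update and caps it at max_batch_size, so the batch
# size roughly doubles every ln(2) / ln(1.001), i.e. about 693 updates.
def _demo_batch_size_schedule():
    import math

    min_batch_size, max_batch_size = 4, 32
    updates_per_doubling = math.log(2) / math.log(1.001)
    updates_until_cap = math.log(max_batch_size / min_batch_size) / math.log(1.001)
    print(round(updates_per_doubling))  # ~693
    print(round(updates_until_cap))     # ~2080 updates before the cap of 32 is hit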
def main(
    width=100,
    depth=4,
    vector_length=64,
    min_batch_size=1,
    max_batch_size=32,
    learn_rate=0.001,
    momentum=0.9,
    dropout=0.5,
    dropout_decay=1e-4,
    nb_epoch=20,
    L2=1e-6,
):
    cfg = dict(locals())
    print(cfg)
    prefer_gpu()
    train_data, check_data, nr_tag = ancora_pos_tags()
    extracter = FeatureExtracter("es", attrs=[LOWER, SHAPE, PREFIX, SUFFIX])
    Model.lsuv = True
    with Model.define_operators({"**": clone, ">>": chain, "+": add, "|": concatenate}):
        lower_case = HashEmbed(width, 100, column=0)
        shape = HashEmbed(width // 2, 200, column=1)
        prefix = HashEmbed(width // 2, 100, column=2)
        suffix = HashEmbed(width // 2, 100, column=3)
        model = with_flatten(
            (lower_case | shape | prefix | suffix)
            >> Maxout(width, pieces=3)
            >> Residual(ExtractWindow(nW=1) >> Maxout(width, pieces=3)) ** depth
            >> Softmax(nr_tag),
            pad=depth,
        )
    train_X, train_y = preprocess(model.ops, extracter, train_data, nr_tag)
    dev_X, dev_y = preprocess(model.ops, extracter, check_data, nr_tag)
    n_train = float(sum(len(x) for x in train_X))
    global epoch_train_acc
    with model.begin_training(train_X[:5000], train_y[:5000], **cfg) as (
        trainer,
        optimizer,
    ):
        trainer.each_epoch.append(track_progress(**locals()))
        trainer.batch_size = min_batch_size
        batch_size = float(min_batch_size)
        for X, y in trainer.iterate(train_X, train_y):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            gradient = [yh[i] - y[i] for i in range(len(yh))]
            backprop(gradient, optimizer)
            trainer.batch_size = min(int(batch_size), max_batch_size)
            batch_size *= 1.001
    with model.use_params(trainer.optimizer.averages):
        print(model.evaluate(dev_X, model.ops.flatten(dev_y)))
    with open("/tmp/model.pickle", "wb") as file_:
        pickle.dump(model, file_)
def main(
    width=128,
    depth=1,
    vector_length=128,
    min_batch_size=16,
    max_batch_size=16,
    learn_rate=0.001,
    momentum=0.9,
    dropout=0.5,
    dropout_decay=1e-4,
    nb_epoch=20,
    L2=1e-6,
):
    using_gpu = prefer_gpu()
    if using_gpu:
        torch.set_default_tensor_type("torch.cuda.FloatTensor")
    cfg = dict(locals())
    print(cfg)
    train_data, check_data, nr_tag = ancora_pos_tags()
    train_data = list(train_data)
    check_data = list(check_data)
    extracter = FeatureExtracter("es", attrs=[LOWER, SHAPE, PREFIX, SUFFIX])
    with Model.define_operators({"**": clone, ">>": chain, "+": add, "|": concatenate}):
        lower_case = HashEmbed(width, 100, column=0)
        shape = HashEmbed(width // 2, 200, column=1)
        prefix = HashEmbed(width // 2, 100, column=2)
        suffix = HashEmbed(width // 2, 100, column=3)
        model = (
            with_flatten(
                (lower_case | shape | prefix | suffix) >> Maxout(width, pieces=3)
            )
            >> PyTorchBiLSTM(width, width, depth)
            >> with_flatten(Softmax(nr_tag))
        )
    train_X, train_y = preprocess(model.ops, extracter, train_data, nr_tag)
    dev_X, dev_y = preprocess(model.ops, extracter, check_data, nr_tag)
    n_train = float(sum(len(x) for x in train_X))
    global epoch_train_acc
    with model.begin_training(train_X[:10], train_y[:10], **cfg) as (
        trainer,
        optimizer,
    ):
        trainer.each_epoch.append(track_progress(**locals()))
        trainer.batch_size = min_batch_size
        batch_size = float(min_batch_size)
        for X, y in trainer.iterate(train_X, train_y):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            gradient = [yh[i] - y[i] for i in range(len(yh))]
            backprop(gradient, optimizer)
            trainer.batch_size = min(int(batch_size), max_batch_size)
            batch_size *= 1.001
    print(model.evaluate(dev_X, model.ops.flatten(dev_y)))
    with open("/tmp/model.pickle", "wb") as file_:
        pickle.dump(model, file_)