def main(width=128, depth=1, vector_length=128, min_batch_size=16,
         max_batch_size=16, learn_rate=0.001, momentum=0.9, dropout=0.5,
         dropout_decay=1e-4, nb_epoch=20, L2=1e-6):
    """Train a tagger built around ``PyTorchBiLSTM`` and pickle the result.

    The data comes from ``ancora_pos_tags()``. The configuration and the
    final dev-set evaluation are printed, and the trained model is written
    to ``/tmp/model.pickle``.

    NOTE: local variable names are part of this function's behaviour.
    ``cfg`` is a snapshot of ``locals()`` that is forwarded to
    ``model.begin_training``, and every local is passed by name to
    ``track_progress``. Do not rename locals or add new ones above those
    calls without checking both callees.
    """
    using_gpu = prefer_gpu()
    if using_gpu:
        torch.set_default_tensor_type('torch.cuda.FloatTensor')
    # Taken after ``using_gpu`` is bound, so the snapshot includes it.
    cfg = dict(locals())
    print(cfg)

    train_data, check_data, nr_tag = ancora_pos_tags()
    train_data = list(train_data)
    check_data = list(check_data)

    extracter = FeatureExtracter('es', attrs=[LOWER, SHAPE, PREFIX, SUFFIX])
    with Model.define_operators(
        {'**': clone, '>>': chain, '+': add, '|': concatenate}
    ):
        # One hashed embedding table per extracted attribute column.
        lower_case = HashEmbed(width, 100, column=0)
        shape = HashEmbed(width // 2, 200, column=1)
        prefix = HashEmbed(width // 2, 100, column=2)
        suffix = HashEmbed(width // 2, 100, column=3)

        model = (
            with_flatten(
                (lower_case | shape | prefix | suffix)
                >> Maxout(width, pieces=3)
            )
            >> PyTorchBiLSTM(width, width, depth)
            >> with_flatten(Softmax(nr_tag))
        )

    train_X, train_y = preprocess(model.ops, extracter, train_data, nr_tag)
    dev_X, dev_y = preprocess(model.ops, extracter, check_data, nr_tag)

    n_train = float(sum(map(len, train_X)))
    global epoch_train_acc
    with model.begin_training(train_X[:10], train_y[:10], **cfg) as (
        trainer,
        optimizer,
    ):
        trainer.each_epoch.append(track_progress(**locals()))
        trainer.batch_size = min_batch_size
        batch_size = float(min_batch_size)
        for X, y in trainer.iterate(train_X, train_y):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            # Gradient handed to backprop: prediction minus target, per item.
            gradient = [scores - y[i] for i, scores in enumerate(yh)]
            backprop(gradient, optimizer)
            # Grow the batch size by 0.1% per step, capped at max_batch_size.
            trainer.batch_size = min(int(batch_size), max_batch_size)
            batch_size *= 1.001

    print(model.evaluate(dev_X, model.ops.flatten(dev_y)))
    with open('/tmp/model.pickle', 'wb') as file_:
        pickle.dump(model, file_)
def main(width=128, depth=1, vector_length=128, min_batch_size=16,
         max_batch_size=16, learn_rate=0.001, momentum=0.9, dropout=0.5,
         dropout_decay=1e-4, nb_epoch=20, L2=1e-6):
    """Train a tagger with ``depth`` stacked ``BiLSTM`` layers and pickle it.

    The data comes from ``ancora_pos_tags()``. The configuration and the
    final dev-set evaluation are printed, and the trained model is written
    to ``/tmp/model.pickle``.

    NOTE: ``cfg`` is a snapshot of ``locals()`` (the arguments only) that is
    forwarded to ``model.begin_training``, and every local is passed by name
    to ``track_progress``. Do not rename locals without checking both.
    """
    prefer_gpu()
    cfg = dict(locals())
    print(cfg)

    train_data, check_data, nr_tag = ancora_pos_tags()

    extracter = FeatureExtracter("es", attrs=[LOWER, SHAPE, PREFIX, SUFFIX])
    with Model.define_operators(
        {"**": clone, ">>": chain, "+": add, "|": concatenate}
    ):
        # One hashed embedding table per extracted attribute column.
        lower_case = HashEmbed(width, 100, column=0)
        shape = HashEmbed(width // 2, 200, column=1)
        prefix = HashEmbed(width // 2, 100, column=2)
        suffix = HashEmbed(width // 2, 100, column=3)

        model = (
            with_flatten(
                (lower_case | shape | prefix | suffix)
                >> Maxout(width, pieces=3)
            )
            >> BiLSTM(width, width) ** depth
            >> with_flatten(Softmax(nr_tag))
        )

    train_X, train_y = preprocess(model.ops, extracter, train_data, nr_tag)
    dev_X, dev_y = preprocess(model.ops, extracter, check_data, nr_tag)

    n_train = float(sum(map(len, train_X)))
    global epoch_train_acc
    with model.begin_training(train_X[:10], train_y[:10], **cfg) as (
        trainer,
        optimizer,
    ):
        trainer.each_epoch.append(track_progress(**locals()))
        trainer.batch_size = min_batch_size
        batch_size = float(min_batch_size)
        for X, y in trainer.iterate(train_X, train_y):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            # Gradient handed to backprop: prediction minus target, per item.
            gradient = [scores - y[i] for i, scores in enumerate(yh)]
            backprop(gradient, optimizer)
            # Grow the batch size by 0.1% per step, capped at max_batch_size.
            trainer.batch_size = min(int(batch_size), max_batch_size)
            batch_size *= 1.001

    print(model.evaluate(dev_X, model.ops.flatten(dev_y)))
    with open("/tmp/model.pickle", "wb") as file_:
        pickle.dump(model, file_)
def build_text_classifier(nr_class, width=64, **cfg):
    """Build a text classifier with ``nr_class`` outputs.

    Options read from ``cfg``: ``nr_vector`` (default 5000),
    ``pretrained_dims`` (default 0) and ``low_data``.

    When ``low_data`` is truthy and ``pretrained_dims`` is non-zero, a small
    attention model over ``SpacyVectors`` is returned directly. Otherwise a
    ``LinearModel`` branch and a CNN branch are concatenated, passed through
    an ``Affine`` layer and ``logistic``; the returned model has ``nO`` set
    to ``nr_class`` and ``lsuv`` set to ``False``.
    """
    nr_vector = cfg.get('nr_vector', 5000)
    pretrained_dims = cfg.get('pretrained_dims', 0)
    with Model.define_operators(
        {'>>': chain, '+': add, '|': concatenate, '**': clone}
    ):
        if cfg.get('low_data') and pretrained_dims:
            model = (
                SpacyVectors
                >> flatten_add_lengths
                >> with_getitem(0, Affine(width, pretrained_dims))
                >> ParametricAttention(width)
                >> Pooling(sum_pool)
                >> Residual(ReLu(width, width)) ** 2
                >> zero_init(Affine(nr_class, width, drop_factor=0.0))
                >> logistic
            )
            return model

        # Columns index into the attribute list given to FeatureExtracter.
        lower = HashEmbed(width, nr_vector, column=1)
        prefix = HashEmbed(width // 2, nr_vector, column=2)
        suffix = HashEmbed(width // 2, nr_vector, column=3)
        shape = HashEmbed(width // 2, nr_vector, column=4)

        trained_vectors = (
            FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID])
            >> with_flatten(
                uniqued(
                    (lower | prefix | suffix | shape)
                    >> LN(Maxout(width, width + (width // 2) * 3)),
                    column=0,
                )
            )
        )

        if pretrained_dims:
            static_vectors = (
                SpacyVectors
                >> with_flatten(Affine(width, pretrained_dims))
            )
            # TODO Make concatenate support lists
            vectors = concatenate_lists(trained_vectors, static_vectors)
            vectors_width = width * 2
        else:
            vectors = trained_vectors
            vectors_width = width
            static_vectors = None

        cnn_model = (
            vectors
            >> with_flatten(
                LN(Maxout(width, vectors_width))
                >> Residual(
                    (ExtractWindow(nW=1) >> LN(Maxout(width, width * 3)))
                ) ** 2,
                pad=2,
            )
            >> flatten_add_lengths
            >> ParametricAttention(width)
            >> Pooling(sum_pool)
            >> Residual(zero_init(Maxout(width, width)))
            >> zero_init(Affine(nr_class, width, drop_factor=0.0))
        )

        linear_model = (
            _preprocess_doc
            >> LinearModel(nr_class, drop_factor=0.)
        )

        # Both branches emit nr_class values, hence nr_class * 2 inputs.
        model = (
            (linear_model | cnn_model)
            >> zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0))
            >> logistic
        )
    model.nO = nr_class
    model.lsuv = False
    return model
def main(width=100, depth=4, vector_length=64, min_batch_size=1,
         max_batch_size=32, learn_rate=0.001, momentum=0.9, dropout=0.5,
         dropout_decay=1e-4, nb_epoch=20, L2=1e-6):
    """Train a CNN tagger with ``depth`` residual window layers and pickle it.

    The data comes from ``ancora_pos_tags()``. The configuration is printed,
    the dev-set evaluation is printed using the optimizer's averaged
    parameters, and the model is written to ``/tmp/model.pickle``.

    NOTE: ``cfg`` is a snapshot of ``locals()`` (the arguments only) that is
    forwarded to ``model.begin_training``, and every local is passed by name
    to ``track_progress``. Do not rename locals without checking both.
    """
    cfg = dict(locals())
    print(cfg)
    if cupy is not None:
        print("Using GPU")
        Model.ops = CupyOps()

    train_data, check_data, nr_tag = ancora_pos_tags()

    extracter = FeatureExtracter('es', attrs=[LOWER, SHAPE, PREFIX, SUFFIX])
    Model.lsuv = True
    with Model.define_operators(
        {'**': clone, '>>': chain, '+': add, '|': concatenate}
    ):
        # One hashed embedding table per extracted attribute column.
        lower_case = HashEmbed(width, 100, column=0)
        shape = HashEmbed(width // 2, 200, column=1)
        prefix = HashEmbed(width // 2, 100, column=2)
        suffix = HashEmbed(width // 2, 100, column=3)

        model = with_flatten(
            (lower_case | shape | prefix | suffix)
            >> Maxout(width, pieces=3)
            >> Residual(ExtractWindow(nW=1) >> Maxout(width, pieces=3)) ** depth
            >> Softmax(nr_tag),
            pad=depth,
        )

    train_X, train_y = preprocess(model.ops, extracter, train_data, nr_tag)
    dev_X, dev_y = preprocess(model.ops, extracter, check_data, nr_tag)

    n_train = float(sum(map(len, train_X)))
    global epoch_train_acc
    with model.begin_training(train_X[:5000], train_y[:5000], **cfg) as (
        trainer,
        optimizer,
    ):
        trainer.each_epoch.append(track_progress(**locals()))
        trainer.batch_size = min_batch_size
        batch_size = float(min_batch_size)
        for X, y in trainer.iterate(train_X, train_y):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            # Gradient handed to backprop: prediction minus target, per item.
            gradient = [scores - y[i] for i, scores in enumerate(yh)]
            backprop(gradient, optimizer)
            # Grow the batch size by 0.1% per step, capped at max_batch_size.
            trainer.batch_size = min(int(batch_size), max_batch_size)
            batch_size *= 1.001

    # Evaluate and save with the optimizer's averaged parameters swapped in.
    with model.use_params(trainer.optimizer.averages):
        print(model.evaluate(dev_X, model.ops.flatten(dev_y)))
        with open('/tmp/model.pickle', 'wb') as file_:
            pickle.dump(model, file_)
def build_morphologizer_model(class_nums, **cfg):
    """Build a ``tok2vec >> MultiSoftmax`` model.

    ``class_nums`` is passed to ``MultiSoftmax`` and stored on the softmax
    layer as ``out_sizes``. Options read from ``cfg``:
    ``token_vector_width`` (falls back to ``util.env_opt`` with default
    128), ``pretrained_vectors``, ``char_embed`` (default ``True``) and
    ``tok2vec`` (a ready-made layer used instead of building one).

    The returned model exposes the two halves as ``model.tok2vec`` and
    ``model.softmax`` and has ``nI`` set to ``None``.
    """
    embed_size = util.env_opt("embed_size", 7000)
    # Only consult the environment option when the caller gave no width.
    token_vector_width = (
        cfg["token_vector_width"]
        if "token_vector_width" in cfg
        else util.env_opt("token_vector_width", 128)
    )
    pretrained_vectors = cfg.get("pretrained_vectors")
    char_embed = cfg.get("char_embed", True)

    with Model.define_operators({">>": chain, "+": add, "**": clone}):
        if "tok2vec" not in cfg:
            tok2vec = Tok2Vec(
                token_vector_width,
                embed_size,
                char_embed=char_embed,
                pretrained_vectors=pretrained_vectors,
            )
        else:
            tok2vec = cfg["tok2vec"]
        softmax = with_flatten(MultiSoftmax(class_nums, token_vector_width))
        softmax.out_sizes = class_nums
        model = tok2vec >> softmax

    model.nI = None
    model.tok2vec = tok2vec
    model.softmax = softmax
    return model
def Tok2Vec(width, embed_size, **kwargs):
    """Build a token-to-vector layer: embeddings, CNN and optional BiLSTM.

    Options read from ``kwargs``: ``pretrained_vectors`` (default ``None``),
    ``cnn_maxout_pieces`` (3), ``subword_features`` (``True``),
    ``conv_depth`` (4) and ``bilstm_depth`` (0).

    The embedding concatenates a NORM hash embedding with, optionally,
    static vectors and PREFIX/SUFFIX/SHAPE hash embeddings. When neither is
    enabled, the NORM embedding is used on its own. The returned model has
    ``nO`` set to ``width`` and the embedding exposed as ``embed``.
    """
    pretrained_vectors = kwargs.get("pretrained_vectors", None)
    cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3)
    subword_features = kwargs.get("subword_features", True)
    conv_depth = kwargs.get("conv_depth", 4)
    bilstm_depth = kwargs.get("bilstm_depth", 0)

    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
    operators = {
        ">>": chain,
        "|": concatenate,
        "**": clone,
        "+": add,
        "*": reapply,
    }
    with Model.define_operators(operators):
        norm = HashEmbed(
            width, embed_size, column=cols.index(NORM), name="embed_norm"
        )
        if subword_features:
            prefix = HashEmbed(
                width,
                embed_size // 2,
                column=cols.index(PREFIX),
                name="embed_prefix",
            )
            suffix = HashEmbed(
                width,
                embed_size // 2,
                column=cols.index(SUFFIX),
                name="embed_suffix",
            )
            shape = HashEmbed(
                width,
                embed_size // 2,
                column=cols.index(SHAPE),
                name="embed_shape",
            )
        else:
            prefix, suffix, shape = (None, None, None)

        # The Maxout input width is ``width`` times the number of
        # concatenated embedding tables in each branch.
        if pretrained_vectors is not None:
            glove = StaticVectors(
                pretrained_vectors, width, column=cols.index(ID)
            )
            if subword_features:
                embed = uniqued(
                    (glove | norm | prefix | suffix | shape)
                    >> LN(Maxout(width, width * 5, pieces=3)),
                    column=cols.index(ORTH),
                )
            else:
                embed = uniqued(
                    (glove | norm)
                    >> LN(Maxout(width, width * 2, pieces=3)),
                    column=cols.index(ORTH),
                )
        elif subword_features:
            embed = uniqued(
                (norm | prefix | suffix | shape)
                >> LN(Maxout(width, width * 4, pieces=3)),
                column=cols.index(ORTH),
            )
        else:
            embed = norm

        convolution = Residual(
            ExtractWindow(nW=1)
            >> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces))
        )
        tok2vec = FeatureExtracter(cols) >> with_flatten(
            embed >> convolution ** conv_depth, pad=conv_depth
        )
        if bilstm_depth >= 1:
            tok2vec = tok2vec >> PyTorchBiLSTM(width, width, bilstm_depth)

        # Work around thinc API limitations :(. TODO: Revise in Thinc 7
        tok2vec.nO = width
        tok2vec.embed = embed
    return tok2vec
def test_pickle_with_flatten(affine):
    """A ``with_flatten`` wrapper survives a pickle round-trip.

    The reloaded model is applied to two inputs with 2 and 4 rows; each
    output must keep its input's row count and have ``affine.nO`` columns.
    """
    inputs = [affine.ops.allocate((2, 3)), affine.ops.allocate((4, 3))]
    restored = pickle.loads(pickle.dumps(with_flatten(affine)))
    outputs = restored(inputs)
    assert len(outputs) == 2
    for X, Y in zip(inputs, outputs):
        assert Y.shape == (X.shape[0], affine.nO)
def Tok2Vec(config):
    """Assemble a tok2vec model from the ``@doc2feats``, ``@embed`` and
    ``@encode`` entries of ``config``.

    The padding given to ``with_flatten`` is the encoder's
    ``receptive_field`` attribute, or 0 when the encoder does not define
    one. The returned model carries ``cfg``, ``nO`` (taken from the
    encoder), ``embed`` and ``encode`` attributes.
    """
    doc2feats = make_layer(config["@doc2feats"])
    embed = make_layer(config["@embed"])
    encode = make_layer(config["@encode"])

    pad = getattr(encode, "receptive_field", 0)
    per_token = with_flatten(chain(embed, encode), pad=pad)
    tok2vec = chain(doc2feats, per_token)

    tok2vec.cfg = config
    tok2vec.nO = encode.nO
    tok2vec.embed = embed
    tok2vec.encode = encode
    return tok2vec
def Tok2Vec(config):
    """Assemble a tok2vec model from the ``@doc2feats``, ``@embed`` and
    ``@encode`` entries of ``config``.

    The padding given to ``with_flatten`` is read from
    ``config["@encode"]["config"]["depth"]``. The returned model carries
    ``cfg``, ``nO`` (taken from the encoder), ``embed`` and ``encode``
    attributes.
    """
    doc2feats = make_layer(config["@doc2feats"])
    embed = make_layer(config["@embed"])
    encode = make_layer(config["@encode"])

    pad = config["@encode"]["config"]["depth"]
    per_token = with_flatten(chain(embed, encode), pad=pad)
    tok2vec = chain(doc2feats, per_token)

    tok2vec.cfg = config
    tok2vec.nO = encode.nO
    tok2vec.embed = embed
    tok2vec.encode = encode
    return tok2vec
def Tok2Vec(width, embed_size, **kwargs):
    """Build a token-to-vector layer: hashed embeddings plus a 4-layer CNN.

    Options read from ``kwargs``: ``pretrained_vectors`` (default ``None``)
    and ``cnn_maxout_pieces`` (default 2). When pretrained vectors are
    given, a ``StaticVectors`` table is concatenated in front of the four
    hash embeddings. The returned model has ``nO`` set to ``width`` and the
    embedding exposed as ``embed``.
    """
    pretrained_vectors = kwargs.get('pretrained_vectors', None)
    cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 2)

    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
    operators = {
        '>>': chain,
        '|': concatenate,
        '**': clone,
        '+': add,
        '*': reapply,
    }
    with Model.define_operators(operators):
        norm = HashEmbed(
            width, embed_size, column=cols.index(NORM), name='embed_norm'
        )
        prefix = HashEmbed(
            width, embed_size // 2, column=cols.index(PREFIX),
            name='embed_prefix'
        )
        suffix = HashEmbed(
            width, embed_size // 2, column=cols.index(SUFFIX),
            name='embed_suffix'
        )
        shape = HashEmbed(
            width, embed_size // 2, column=cols.index(SHAPE),
            name='embed_shape'
        )

        # Maxout input width is ``width`` times the number of tables.
        if pretrained_vectors is None:
            embed = uniqued(
                (norm | prefix | suffix | shape)
                >> LN(Maxout(width, width * 4, pieces=3)),
                column=cols.index(ORTH),
            )
        else:
            glove = StaticVectors(
                pretrained_vectors, width, column=cols.index(ID)
            )
            embed = uniqued(
                (glove | norm | prefix | suffix | shape)
                >> LN(Maxout(width, width * 5, pieces=3)),
                column=cols.index(ORTH),
            )

        convolution = Residual(
            ExtractWindow(nW=1)
            >> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces))
        )
        tok2vec = (
            FeatureExtracter(cols)
            >> with_flatten(embed >> convolution ** 4, pad=4)
        )

        # Work around thinc API limitations :(. TODO: Revise in Thinc 7
        tok2vec.nO = width
        tok2vec.embed = embed
    return tok2vec
def build_model(nr_class, width, depth, conv_depth, vectors_name, **kwargs):
    """Build a two-level attention classifier ending in ``Softmax(nr_class)``.

    ``sent2vec`` embeds, self-attends over and mean-pools one unit of
    input; the outer model applies it via ``foreach`` and runs a second
    self-attention and pooling stage over the results.

    ``depth``, ``conv_depth`` and ``kwargs`` are accepted but not used by
    this body. The returned model has ``lsuv`` set to ``False``.
    """
    with Model.define_operators({"|": concatenate, ">>": chain, "**": clone}):
        embed = (
            HashEmbed(width, 5000, column=1)
            | StaticVectors(vectors_name, width, column=5)
            | HashEmbed(width // 2, 750, column=2)
            | HashEmbed(width // 2, 750, column=3)
            | HashEmbed(width // 2, 750, column=4)
        ) >> LN(Maxout(width))

        sent2vec = (
            with_flatten(embed)
            >> Residual(
                prepare_self_attention(
                    Affine(width * 3, width), nM=width, nH=4
                )
                >> MultiHeadedAttention()
                >> with_flatten(Maxout(width, width, pieces=3))
            )
            >> flatten_add_lengths
            >> ParametricAttention(width, hard=False)
            >> Pooling(mean_pool)
            >> Residual(LN(Maxout(width)))
        )

        model = (
            foreach(sent2vec, drop_factor=2.0)
            >> Residual(
                prepare_self_attention(
                    Affine(width * 3, width), nM=width, nH=4
                )
                >> MultiHeadedAttention()
                >> with_flatten(LN(Affine(width, width)))
            )
            >> flatten_add_lengths
            >> ParametricAttention(width, hard=False)
            >> Pooling(mean_pool)
            >> Residual(LN(Maxout(width))) ** 2
            >> Softmax(nr_class)
        )
    model.lsuv = False
    return model
def baseline_mwe(nO, nP, depth):
    """Build a ``with_flatten`` stack of ``depth`` residual window layers.

    A single ``Residual(ExtractWindow(nW=1) >> LayerNorm(Maxout))`` object
    is chained with itself ``depth`` times, so every position in the chain
    is the same layer instance. The ``Maxout`` and ``LayerNorm`` layers are
    exposed on the result as ``maxout`` and ``normalize``.
    """
    from thinc.neural._classes.model import Model
    from thinc.neural._classes.resnet import Residual
    from thinc.neural._classes.convolution import ExtractWindow
    from thinc.neural._classes.layernorm import LayerNorm
    from thinc.api import chain, clone, with_flatten

    maxout = Maxout(nO, nO * 3, pieces=nP)
    normalize = LayerNorm(maxout)
    with Model.define_operators({'>>': chain, '**': clone}):
        block = Residual(ExtractWindow(nW=1) >> normalize)
    # The same ``block`` object is repeated, not cloned.
    repeated = [block for _ in range(depth)]
    model = with_flatten(chain(*repeated))
    model.maxout = maxout
    model.normalize = normalize
    return model
def main(width=100, depth=4, vector_length=64, min_batch_size=1,
         max_batch_size=32, learn_rate=0.001, momentum=0.9, dropout=0.5,
         dropout_decay=1e-4, nb_epoch=20, L2=1e-6):
    """Train a CNN tagger with ``depth`` residual window layers and pickle it.

    The data comes from ``ancora_pos_tags()``. The configuration is printed,
    the dev-set evaluation is printed using the optimizer's averaged
    parameters, and the model is written to ``/tmp/model.pickle``.

    NOTE: ``cfg`` is a snapshot of ``locals()`` (the arguments only) that is
    forwarded to ``model.begin_training``, and every local is passed by name
    to ``track_progress``. Do not rename locals without checking both.
    """
    cfg = dict(locals())
    print(cfg)
    if cupy is not None:
        print("Using GPU")
        Model.ops = CupyOps()

    train_data, check_data, nr_tag = ancora_pos_tags()

    extracter = FeatureExtracter('es', attrs=[LOWER, SHAPE, PREFIX, SUFFIX])
    Model.lsuv = True
    with Model.define_operators(
        {'**': clone, '>>': chain, '+': add, '|': concatenate}
    ):
        # One hashed embedding table per extracted attribute column.
        lower_case = HashEmbed(width, 100, column=0)
        shape = HashEmbed(width // 2, 200, column=1)
        prefix = HashEmbed(width // 2, 100, column=2)
        suffix = HashEmbed(width // 2, 100, column=3)

        model = with_flatten(
            (lower_case | shape | prefix | suffix)
            >> Maxout(width, pieces=3)
            >> Residual(ExtractWindow(nW=1) >> Maxout(width, pieces=3)) ** depth
            >> Softmax(nr_tag),
            pad=depth,
        )

    train_X, train_y = preprocess(model.ops, extracter, train_data, nr_tag)
    dev_X, dev_y = preprocess(model.ops, extracter, check_data, nr_tag)

    n_train = float(sum(map(len, train_X)))
    global epoch_train_acc
    with model.begin_training(train_X[:5000], train_y[:5000], **cfg) as (
        trainer,
        optimizer,
    ):
        trainer.each_epoch.append(track_progress(**locals()))
        trainer.batch_size = min_batch_size
        batch_size = float(min_batch_size)
        for X, y in trainer.iterate(train_X, train_y):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            # Gradient handed to backprop: prediction minus target, per item.
            gradient = [scores - y[i] for i, scores in enumerate(yh)]
            backprop(gradient, optimizer)
            # Grow the batch size by 0.1% per step, capped at max_batch_size.
            trainer.batch_size = min(int(batch_size), max_batch_size)
            batch_size *= 1.001

    # Evaluate and save with the optimizer's averaged parameters swapped in.
    with model.use_params(trainer.optimizer.averages):
        print(model.evaluate(dev_X, model.ops.flatten(dev_y)))
        with open('/tmp/model.pickle', 'wb') as file_:
            pickle.dump(model, file_)
def my_tok_to_vec(width, embed_size, pretrained_vectors, **kwargs):
    """Build a tok2vec layer from a ``Vectors`` table plus a SHAPE embedding.

    Options read from ``kwargs``: ``cnn_maxout_pieces`` (default 3),
    ``conv_depth`` (4) and ``bilstm_depth`` (0).

    A shared ``storage`` list is handed to both ``SaveDoc`` at the front of
    the pipeline and ``Vectors``. The returned model has ``nO`` set to
    ``width`` and the embedding exposed as ``embed``.
    """
    # Circular imports :(
    from spacy._ml import PyTorchBiLSTM

    cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3)
    conv_depth = kwargs.get("conv_depth", 4)
    bilstm_depth = kwargs.get("bilstm_depth", 0)

    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
    storage = []
    with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
        shape = HashEmbed(
            width,
            embed_size // 2,
            column=cols.index(SHAPE),
            name="embed_shape",
        )
        glove = Vectors(
            storage, pretrained_vectors, width, column=cols.index(NORM)
        )
        # Maxout input = SHAPE embedding width + the vectors' own width.
        vec_width = glove.nV

        embed = uniqued(
            (glove | shape)
            >> LN(Maxout(width, width + vec_width, pieces=3)),
            column=cols.index(ORTH),
        )
        convolution = Residual(
            ExtractWindow(nW=1)
            >> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces))
        )
        tok2vec = (
            SaveDoc(storage)
            >> FeatureExtracter(cols)
            >> with_flatten(embed >> convolution ** conv_depth, pad=conv_depth)
        )
        if bilstm_depth >= 1:
            tok2vec = tok2vec >> PyTorchBiLSTM(width, width, bilstm_depth)

        # Work around thinc API limitations :(. TODO: Revise in Thinc 7
        tok2vec.nO = width
        tok2vec.embed = embed
    return tok2vec
def main(width=32, nr_vector=1000):
    """Train an embed -> ReLu -> ReLu -> Softmax tagger and print accuracy.

    Args:
        width: Width of the embedding and of both hidden layers.
        nr_vector: Second argument to ``HashEmbed`` (the table size).
            Previously this parameter was ignored and 1000 was hard-coded;
            the default keeps the old behaviour.

    The dev-set evaluation is printed after every epoch, and once more at
    the end with the optimizer's averaged parameters.
    """
    train_data, check_data, nr_tag = ancora_pos_tags(encode_words=True)

    model = with_flatten(
        chain(
            HashEmbed(width, nr_vector),
            ReLu(width, width),
            ReLu(width, width),
            Softmax(nr_tag, width),
        )
    )

    train_X, train_y = zip(*train_data)
    dev_X, dev_y = zip(*check_data)
    train_y = [to_categorical(y, nb_classes=nr_tag) for y in train_y]
    dev_y = [to_categorical(y, nb_classes=nr_tag) for y in dev_y]

    with model.begin_training(train_X, train_y) as (trainer, optimizer):
        trainer.each_epoch.append(lambda: print(model.evaluate(dev_X, dev_y)))
        for X, y in trainer.iterate(train_X, train_y):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            # Gradient handed to backprop: prediction minus target, per item.
            backprop([yh[i] - y[i] for i in range(len(yh))], optimizer)

    with model.use_params(optimizer.averages):
        print(model.evaluate(dev_X, dev_y))
def build_model(n_tags, n_words, word_width, tag_width, hidden_width):
    """Build the pieces of a step-wise tagger.

    Returns:
        ``(words_model, fwd_step)``. ``words_model`` embeds the words, runs
        a ``BiLSTM`` and projects to ``hidden_width``. ``fwd_step`` performs
        one tagging step.

    ``fwd_step(features, drop=0.)`` takes
    ``(word_feats, prev_tags, prev_state)`` and returns
    ``((state, scores), bwd_step)``. Previously the final line was a bare
    expression without ``return``, so ``fwd_step`` always returned ``None``.

    ``bwd_step(d_scores, d_next_state, sgd=None)`` backpropagates through
    the output, tag and state models and returns
    ``(d_state, d_prev_state)``.
    """
    with Model.define_operators({'|': concatenate, '>>': chain}):
        words_model = (
            with_flatten(Embed(word_width, word_width, n_words), pad=0)
            >> BiLSTM(word_width, word_width)
            >> flatten_add_lengths
            >> getitem(0)
            # The Affine input is twice word_width, matching the BiLSTM
            # output -- presumably forward and backward halves; confirm.
            >> Affine(hidden_width, word_width * 2)
            >> pad_and_reshape
        )
        tags_model = (
            Embed(tag_width, tag_width, n_tags)
            >> Affine(hidden_width, tag_width)
        )
        state_model = Affine(hidden_width, hidden_width)
        output_model = Softmax(n_tags, hidden_width)

    words_model.nO = hidden_width
    state_model.nO = hidden_width
    output_model.nO = n_tags

    def fwd_step(features, drop=0.):
        word_feats, prev_tags, prev_state = features
        tag_feats, bp_tags = tags_model.begin_update(prev_tags, drop=drop)
        state_feats, bp_state = state_model.begin_update(prev_state, drop=drop)

        # Inline ReLU: keep the mask so bwd_step can gate the gradient.
        preact = word_feats + tag_feats + state_feats
        nonlin = preact > 0
        state = preact * nonlin
        scores, bp_scores = output_model.begin_update(state, drop=drop)

        def bwd_step(d_scores, d_next_state, sgd=None):
            d_state = d_next_state + bp_scores(d_scores, sgd=sgd)
            d_state *= nonlin
            bp_tags(d_state, sgd=sgd)
            d_prev_state = bp_state(d_state, sgd=sgd)
            return d_state, d_prev_state

        return (state, scores), bwd_step

    return words_model, fwd_step
def build_tagger_model(nr_class, **cfg):
    """Build a ``tok2vec >> Softmax`` tagger with ``nr_class`` outputs.

    Options read from ``cfg``: ``token_vector_width`` (falls back to
    ``util.env_opt`` with default 128), ``pretrained_dims`` (default 0) and
    ``tok2vec`` (a ready-made layer used instead of building one).

    The returned model exposes the two halves as ``model.tok2vec`` and
    ``model.softmax`` and has ``nI`` set to ``None``.
    """
    embed_size = util.env_opt('embed_size', 7000)
    # Only consult the environment option when the caller gave no width.
    token_vector_width = (
        cfg['token_vector_width']
        if 'token_vector_width' in cfg
        else util.env_opt('token_vector_width', 128)
    )
    pretrained_dims = cfg.get('pretrained_dims', 0)

    with Model.define_operators({'>>': chain, '+': add}):
        if 'tok2vec' not in cfg:
            tok2vec = Tok2Vec(
                token_vector_width,
                embed_size,
                pretrained_dims=pretrained_dims,
            )
        else:
            tok2vec = cfg['tok2vec']
        softmax = with_flatten(Softmax(nr_class, token_vector_width))
        model = tok2vec >> softmax

    model.nI = None
    model.tok2vec = tok2vec
    model.softmax = softmax
    return model
def Tok2Vec(width, embed_size, **kwargs):
    """Build a token-to-vector layer: hashed embeddings plus a 4-layer CNN.

    Options read from ``kwargs``: ``pretrained_vectors`` (default ``None``)
    and ``cnn_maxout_pieces`` (default 2). When pretrained vectors are
    given, a ``StaticVectors`` table is concatenated in front of the four
    hash embeddings. The returned model has ``nO`` set to ``width`` and the
    embedding exposed as ``embed``.
    """
    pretrained_vectors = kwargs.get('pretrained_vectors', None)
    cnn_maxout_pieces = kwargs.get('cnn_maxout_pieces', 2)

    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
    operators = {
        '>>': chain,
        '|': concatenate,
        '**': clone,
        '+': add,
        '*': reapply,
    }
    with Model.define_operators(operators):
        norm = HashEmbed(
            width, embed_size, column=cols.index(NORM), name='embed_norm'
        )
        prefix = HashEmbed(
            width, embed_size // 2, column=cols.index(PREFIX),
            name='embed_prefix'
        )
        suffix = HashEmbed(
            width, embed_size // 2, column=cols.index(SUFFIX),
            name='embed_suffix'
        )
        shape = HashEmbed(
            width, embed_size // 2, column=cols.index(SHAPE),
            name='embed_shape'
        )

        # Maxout input width is ``width`` times the number of tables.
        if pretrained_vectors is None:
            embed = uniqued(
                (norm | prefix | suffix | shape)
                >> LN(Maxout(width, width * 4, pieces=3)),
                column=cols.index(ORTH),
            )
        else:
            glove = StaticVectors(
                pretrained_vectors, width, column=cols.index(ID)
            )
            embed = uniqued(
                (glove | norm | prefix | suffix | shape)
                >> LN(Maxout(width, width * 5, pieces=3)),
                column=cols.index(ORTH),
            )

        convolution = Residual(
            ExtractWindow(nW=1)
            >> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces))
        )
        tok2vec = (
            FeatureExtracter(cols)
            >> with_flatten(embed >> convolution ** 4, pad=4)
        )

        # Work around thinc API limitations :(. TODO: Revise in Thinc 7
        tok2vec.nO = width
        tok2vec.embed = embed
    return tok2vec
def main(width=32, nr_vector=1000):
    """Train an embed -> ReLu -> ReLu -> Softmax tagger and print accuracy.

    ``width`` sizes the embedding and both hidden layers; ``nr_vector`` is
    the second argument to ``HashEmbed``. The dev-set evaluation is printed
    after every epoch, and once more at the end with the optimizer's
    averaged parameters.
    """
    train_data, check_data, nr_tag = ancora_pos_tags(encode_words=True)

    layers = chain(
        HashEmbed(width, nr_vector),
        ReLu(width, width),
        ReLu(width, width),
        Softmax(nr_tag, width),
    )
    model = with_flatten(layers)

    train_X, train_y = zip(*train_data)
    dev_X, dev_y = zip(*check_data)
    train_y = [to_categorical(tags, nb_classes=nr_tag) for tags in train_y]
    dev_y = [to_categorical(tags, nb_classes=nr_tag) for tags in dev_y]

    def report():
        print(model.evaluate(dev_X, dev_y))

    with model.begin_training(train_X, train_y) as (trainer, optimizer):
        trainer.each_epoch.append(lambda: report())
        for X, y in trainer.iterate(train_X, train_y):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            # Gradient handed to backprop: prediction minus target, per item.
            gradient = [scores - y[i] for i, scores in enumerate(yh)]
            backprop(gradient, optimizer)

    with model.use_params(optimizer.averages):
        report()
def build_tagger_model(nr_class, **cfg):
    """Build a ``tok2vec >> Softmax`` tagger with ``nr_class`` outputs.

    Options read from ``cfg``: ``token_vector_width`` (falls back to
    ``util.env_opt`` with default 128), ``pretrained_vectors`` and
    ``tok2vec`` (a ready-made layer used instead of building one).

    The returned model exposes the two halves as ``model.tok2vec`` and
    ``model.softmax`` and has ``nI`` set to ``None``.
    """
    embed_size = util.env_opt('embed_size', 7000)
    # Only consult the environment option when the caller gave no width.
    token_vector_width = (
        cfg['token_vector_width']
        if 'token_vector_width' in cfg
        else util.env_opt('token_vector_width', 128)
    )
    pretrained_vectors = cfg.get('pretrained_vectors')

    with Model.define_operators({'>>': chain, '+': add}):
        if 'tok2vec' not in cfg:
            tok2vec = Tok2Vec(
                token_vector_width,
                embed_size,
                pretrained_vectors=pretrained_vectors,
            )
        else:
            tok2vec = cfg['tok2vec']
        softmax = with_flatten(Softmax(nr_class, token_vector_width))
        model = tok2vec >> softmax

    model.nI = None
    model.tok2vec = tok2vec
    model.softmax = softmax
    return model
def build_tagger_model(nr_class, **cfg):
    """Build a ``tok2vec >> Softmax`` tagger with ``nr_class`` outputs.

    Options read from ``cfg``: ``token_vector_width`` (falls back to
    ``util.env_opt`` with default 96), ``pretrained_vectors``,
    ``subword_features`` (default ``True``) and ``tok2vec`` (a ready-made
    layer used instead of building one).

    The returned model exposes the two halves as ``model.tok2vec`` and
    ``model.softmax`` and has ``nI`` set to ``None``.
    """
    embed_size = util.env_opt("embed_size", 2000)
    # Only consult the environment option when the caller gave no width.
    token_vector_width = (
        cfg["token_vector_width"]
        if "token_vector_width" in cfg
        else util.env_opt("token_vector_width", 96)
    )
    pretrained_vectors = cfg.get("pretrained_vectors")
    subword_features = cfg.get("subword_features", True)

    with Model.define_operators({">>": chain, "+": add}):
        if "tok2vec" not in cfg:
            tok2vec = Tok2Vec(
                token_vector_width,
                embed_size,
                subword_features=subword_features,
                pretrained_vectors=pretrained_vectors,
            )
        else:
            tok2vec = cfg["tok2vec"]
        softmax = with_flatten(Softmax(nr_class, token_vector_width))
        model = tok2vec >> softmax

    model.nI = None
    model.tok2vec = tok2vec
    model.softmax = softmax
    return model
def main(nH=6, dropout=0.1, nS=6, nB=64, nE=20, use_gpu=-1, lim=1000000,
         nM=300, mL=100, save=False, nTGT=5000, save_name="model.pkl"):
    """Train an ``Encoder`` model on randomly masked IWSLT source sentences.

    Args:
        nH, nS: Passed to ``Encoder`` as ``nH`` and ``nS``.
        dropout: Initial ``trainer.dropout`` and the ``drop`` value used
            for every update.
        nB: Batch size, for training and for the dev pass.
        nE: Number of epochs (``nb_epoch``).
        use_gpu: Any value other than -1 calls ``spacy.require_gpu()`` and
            selects the ``'cuda'`` device.
        lim: Maximum number of sentences kept from each split.
        nM: Model width (position encoding, embedding, encoder, softmax
            input).
        mL: Length limit passed to tokenization, position encoding and
            masking.
        save: When true, the model is written to disk after every epoch.
        nTGT: Output vocabulary size (ranking, embedding, softmax output).
        save_name: File name under ``.models/`` used when ``save`` is true.

    The optimizer hyperparameters used to be assigned twice with identical
    values; they are now set once.
    """
    if use_gpu != -1:
        spacy.require_gpu()
        device = 'cuda'
    else:
        device = 'cpu'

    # Read dataset
    nlp = spacy.load('en_core_web_sm')
    print('English model loaded')
    for control_token in ("<eos>", "<bos>", "<pad>", "<cls>", "<mask>"):
        nlp.tokenizer.add_special_case(control_token, [{ORTH: control_token}])
    train, dev, test = get_iwslt()
    print('Dataset loaded')
    # Keep only the first element of each pair in every split.
    train, _ = zip(*train)
    dev, _ = zip(*dev)
    test, _ = zip(*test)
    train = train[:lim]
    dev = dev[:lim]
    test = test[:lim]

    # Tokenize
    train = spacy_tokenize(nlp.tokenizer, train, mL=mL)
    dev = spacy_tokenize(nlp.tokenizer, dev, mL=mL)
    test = spacy_tokenize(nlp.tokenizer, test, mL=mL)
    print('Tokenization finished')

    # Set rank based on all the docs
    all_docs = train + dev + test
    set_rank(nlp.vocab, all_docs, nTGT=nTGT)
    train = set_numeric_ids(nlp.vocab, train)
    dev = set_numeric_ids(nlp.vocab, dev)
    test = set_numeric_ids(nlp.vocab, test)
    print('Numeric ids set')
    word2indx, indx2word = get_dicts(nlp.vocab)
    print('Vocab dictionaries grabbed')

    with Model.define_operators({">>": chain}):
        embed_cols = [ORTH, SHAPE, PREFIX, SUFFIX]
        # NOTE(review): ``extractor`` is not used below; the model builds
        # its own FeatureExtracter. Kept so construction order is unchanged.
        extractor = FeatureExtracter(attrs=embed_cols)
        position_encode = PositionEncode(mL, nM)
        model = (
            FeatureExtracter(attrs=embed_cols)
            >> with_flatten(FancyEmbed(nM, nTGT, cols=embed_cols))
            >> Residual(position_encode)
            >> create_model_input()
            >> Encoder(nM=nM, nS=nS, nH=nH, device=device)
            >> with_reshape(Softmax(nO=nTGT, nI=nM))
        )

    # Progress tracking: one slot per epoch, the last is the current one.
    losses = [0.]
    train_accuracies = [0.]
    train_totals = [0.]
    dev_accuracies = [0.]
    dev_loss = [0.]

    def track_progress():
        correct = 0.
        total = 0.
        # Get dev stats
        for X0 in minibatch(dev, size=nB):
            X1, loss_mask = random_mask(X0, nlp, indx2word, nlp.vocab, mL)
            Xh = model(X1)
            L, C, t = get_loss(Xh, X0, X1, loss_mask)
            correct += C
            total += t
            dev_loss[-1] += (L**2).sum()
        dev_accuracies[-1] = correct / total
        print(len(losses), losses[-1],
              train_accuracies[-1] / train_totals[-1],
              dev_loss[-1], dev_accuracies[-1])
        # Open fresh accumulator slots for the next epoch.
        dev_loss.append(0.)
        losses.append(0.)
        train_accuracies.append(0.)
        dev_accuracies.append(0.)
        train_totals.append(0.)
        if save:
            model.to_disk('.models/' + save_name)

    # Model training
    with model.begin_training(batch_size=nB, nb_epoch=nE) as (trainer, optimizer):
        trainer.dropout = dropout
        trainer.dropout_decay = 1e-4
        trainer.each_epoch.append(track_progress)
        optimizer.alpha = 0.001
        optimizer.L2 = 1e-6
        optimizer.max_grad_norm = 1.0
        for X0, _ in trainer.iterate(train, train):
            X1, loss_mask = random_mask(X0, nlp, indx2word, nlp.vocab, mL)
            Xh, backprop = model.begin_update(X1, drop=dropout)
            dXh, C, total = get_loss(Xh, X0, X1, loss_mask)
            backprop(dXh, sgd=optimizer)
            losses[-1] += (dXh**2).sum()
            train_accuracies[-1] += C
            train_totals[-1] += total
def main(width=100, depth=4, vector_length=64, min_batch_size=1,
         max_batch_size=32, learn_rate=0.001, momentum=0.9, dropout=0.5,
         dropout_decay=1e-4, nb_epoch=20, L2=1e-6):
    """Train a tagger that uses ``MultiHeadedAttention`` and pickle it.

    The data comes from ``ancora_pos_tags()``. The configuration is printed,
    the dev-set evaluation is printed using the optimizer's averaged
    parameters, and the model is written to ``/tmp/model.pickle``.

    NOTE: ``cfg`` is a snapshot of ``locals()`` (the arguments only) that is
    forwarded to ``model.begin_training``, and every local is passed by name
    to ``track_progress``. Do not rename locals without checking both.
    """
    cfg = dict(locals())
    print(cfg)
    prefer_gpu()

    train_data, check_data, nr_tag = ancora_pos_tags()

    extracter = FeatureExtracter("es", attrs=[LOWER, SHAPE, PREFIX, SUFFIX])
    Model.lsuv = True
    with Model.define_operators(
        {"**": clone, ">>": chain, "+": add, "|": concatenate}
    ):
        # One hashed embedding table per extracted attribute column.
        lower_case = HashEmbed(width, 100, column=0)
        shape = HashEmbed(width // 2, 200, column=1)
        prefix = HashEmbed(width // 2, 100, column=2)
        suffix = HashEmbed(width // 2, 100, column=3)

        model = (
            with_flatten(
                (lower_case | shape | prefix | suffix)
                >> Maxout(width, pieces=3),
                pad=depth,
            )
            >> with_pad_and_mask(MultiHeadedAttention(nM=width, nH=4))
            >> with_flatten(Softmax(nr_tag))
        )

    train_X, train_y = preprocess(model.ops, extracter, train_data, nr_tag)
    dev_X, dev_y = preprocess(model.ops, extracter, check_data, nr_tag)

    n_train = float(sum(map(len, train_X)))
    global epoch_train_acc
    with model.begin_training(train_X[:5000], train_y[:5000], **cfg) as (
        trainer,
        optimizer,
    ):
        trainer.each_epoch.append(track_progress(**locals()))
        trainer.batch_size = min_batch_size
        batch_size = float(min_batch_size)
        for X, y in trainer.iterate(train_X, train_y):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            # Gradient handed to backprop: prediction minus target, per item.
            gradient = [scores - y[i] for i, scores in enumerate(yh)]
            backprop(gradient, optimizer)
            # Grow the batch size by 0.1% per step, capped at max_batch_size.
            trainer.batch_size = min(int(batch_size), max_batch_size)
            batch_size *= 1.001

    # Evaluate and save with the optimizer's averaged parameters swapped in.
    with model.use_params(trainer.optimizer.averages):
        print(model.evaluate(dev_X, model.ops.flatten(dev_y)))
        with open("/tmp/model.pickle", "wb") as file_:
            pickle.dump(model, file_)
def main(dataset='quora', width=64, depth=1, min_batch_size=128,
         max_batch_size=128, dropout=0.0, dropout_decay=0.0,
         pooling="mean+max", nb_epoch=20, pieces=2, use_gpu=False,
         out_loc=None, quiet=False):
    """Train a Siamese sentence-similarity model on Quora or SNLI pairs.

    ``dataset`` must be ``'quora'`` or ``'snli'`` and ``pooling`` one of
    ``'mean+max'``, ``'mean'`` or ``'max'``; anything else raises
    ``ValueError``. If ``out_loc`` is given, its parent directory must
    exist (``IOError`` otherwise) and the trained model is pickled there.

    NOTE: ``cfg`` is a snapshot of ``locals()`` (the arguments only) that is
    forwarded to ``model.begin_training``, and every local is passed by name
    to ``track_progress``. Do not rename locals without checking both.
    """
    cfg = dict(locals())
    if out_loc:
        out_loc = Path(out_loc)
        if not out_loc.parent.exists():
            raise IOError("Can't open output location: %s" % out_loc)
    print(cfg)

    if pooling == 'mean+max':
        pool_layer = Pooling(mean_pool, max_pool)
    elif pooling == "mean":
        pool_layer = mean_pool
    elif pooling == "max":
        pool_layer = max_pool
    else:
        raise ValueError("Unrecognised pooling", pooling)

    print("Load spaCy")
    nlp = get_spacy('en')

    if use_gpu:
        Model.ops = CupyOps()

    print("Construct model")
    # Operators bound for the duration of the block:
    # * chain (>>): feed-forward composition, chain(f, g)(x) -> g(f(x)).
    # * clone (**): n chained copies, (f ** 3)(x) -> f''(f'(f(x))), where
    #   f, f' and f'' have distinct weights.
    # * concatenate (|): merge two outputs into one vector,
    #   (f|g)(x) -> hstack(f(x), g(x)).
    with Model.define_operators(
        {'>>': chain, '**': clone, '|': concatenate, '+': add}
    ):
        mwe_encode = (
            ExtractWindow(nW=1)
            >> Maxout(width, width * 3, pieces=pieces)
        )
        embed = StaticVectors('en', width)  # + Embed(width, width, 5000)
        sent2mat = (
            get_word_ids(Model.ops)
            >> with_flatten(embed >> mwe_encode ** depth)
        )
        model = Siamese(sent2mat, WordMoversSimilarity(Model.ops))

    print("Read and parse data: %s" % dataset)
    if dataset == 'quora':
        train, dev = datasets.quora_questions()
    elif dataset == 'snli':
        train, dev = datasets.snli()
    else:
        raise ValueError("Unknown dataset: %s" % dataset)
    train_X, train_y = preprocess(model.ops, nlp, train)
    dev_X, dev_y = preprocess(model.ops, nlp, dev)

    print("Initialize with data (LSUV)")
    with model.begin_training(train_X[:5000], train_y[:5000], **cfg) as (
        trainer,
        optimizer,
    ):
        # The progress callback is handed the whole local scope by name.
        trainer.each_epoch.append(track_progress(**locals()))
        trainer.batch_size = min_batch_size
        batch_size = float(min_batch_size)
        print("Accuracy before training", model.evaluate(dev_X, dev_y))
        print("Train")
        global epoch_train_acc
        for X, y in trainer.iterate(train_X, train_y, progress_bar=not quiet):
            # begin_update returns the prediction and a backprop callback;
            # dropout is read from the trainer on every step.
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            # Count predictions on the same side of 0.5 as the label.
            train_acc = ((yh >= 0.5) == (y >= 0.5)).sum()
            epoch_train_acc += train_acc

            backprop(yh - y, optimizer)

            # Grow the batch size by 0.1% per step, capped at max_batch_size.
            trainer.batch_size = min(int(batch_size), max_batch_size)
            batch_size *= 1.001
        if out_loc:
            out_loc = Path(out_loc)
            print('Saving to', out_loc)
            with out_loc.open('wb') as file_:
                pickle.dump(model, file_, -1)
def build_text_classifier(nr_class, width=64, **cfg):
    """Build a text classifier with ``nr_class`` outputs.

    Options read from ``cfg``: ``nr_vector`` (default 5000),
    ``pretrained_dims`` (default 0) and ``low_data``.

    When ``low_data`` is truthy and ``pretrained_dims`` is non-zero, a small
    attention model over ``SpacyVectors`` is returned directly. Otherwise a
    ``LinearModel`` branch and a CNN branch are concatenated, passed through
    an ``Affine`` layer and ``logistic``; the returned model has ``nO`` set
    to ``nr_class`` and ``lsuv`` set to ``False``.
    """
    nr_vector = cfg.get('nr_vector', 5000)
    pretrained_dims = cfg.get('pretrained_dims', 0)
    with Model.define_operators(
        {'>>': chain, '+': add, '|': concatenate, '**': clone}
    ):
        if cfg.get('low_data') and pretrained_dims:
            model = (
                SpacyVectors
                >> flatten_add_lengths
                >> with_getitem(0, Affine(width, pretrained_dims))
                >> ParametricAttention(width)
                >> Pooling(sum_pool)
                >> Residual(ReLu(width, width)) ** 2
                >> zero_init(Affine(nr_class, width, drop_factor=0.0))
                >> logistic
            )
            return model

        # Columns index into the attribute list given to FeatureExtracter.
        lower = HashEmbed(width, nr_vector, column=1)
        prefix = HashEmbed(width // 2, nr_vector, column=2)
        suffix = HashEmbed(width // 2, nr_vector, column=3)
        shape = HashEmbed(width // 2, nr_vector, column=4)

        trained_vectors = (
            FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID])
            >> with_flatten(
                uniqued(
                    (lower | prefix | suffix | shape)
                    >> LN(Maxout(width, width + (width // 2) * 3)),
                    column=0,
                )
            )
        )

        if pretrained_dims:
            static_vectors = (
                SpacyVectors
                >> with_flatten(Affine(width, pretrained_dims))
            )
            # TODO Make concatenate support lists
            vectors = concatenate_lists(trained_vectors, static_vectors)
            vectors_width = width * 2
        else:
            vectors = trained_vectors
            vectors_width = width
            static_vectors = None

        cnn_model = (
            vectors
            >> with_flatten(
                LN(Maxout(width, vectors_width))
                >> Residual(
                    (ExtractWindow(nW=1) >> LN(Maxout(width, width * 3)))
                ) ** 2,
                pad=2,
            )
            >> flatten_add_lengths
            >> ParametricAttention(width)
            >> Pooling(sum_pool)
            >> Residual(zero_init(Maxout(width, width)))
            >> zero_init(Affine(nr_class, width, drop_factor=0.0))
        )

        linear_model = _preprocess_doc >> LinearModel(nr_class)

        # Both branches emit nr_class values, hence nr_class * 2 inputs.
        model = (
            (linear_model | cnn_model)
            >> zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0))
            >> logistic
        )
    model.nO = nr_class
    model.lsuv = False
    return model
def create_embed_relu_relu_softmax(depth, width, vector_length):
    """Build an Embed -> ExtractWindow -> ReLu -> ReLu -> Softmax model.

    The per-token pipeline is wrapped in `with_flatten`.

    depth: accepted for signature compatibility; not used in this function.
    width: passed to `Embed` and to both `ReLu` layers.
    vector_length: second argument to `Embed`.

    The output layer is hard-coded as `Softmax(20)`.
    """
    with Model.define_operators({">>": chain}):
        token_pipeline = (
            Embed(width, vector_length)
            >> ExtractWindow(nW=1)
            >> ReLu(width)
            >> ReLu(width)
            >> Softmax(20)
        )
        return with_flatten(token_pipeline)
def main(nH=6, dropout=0.1, nS=6, nB=15, nE=20, use_gpu=-1, lim=2000):
    """Train an EncoderDecoder model on the sentence pairs from get_iwslt().

    nH, nS: forwarded to `EncoderDecoder` (presumably number of attention
        heads and number of stacked layers -- confirm against EncoderDecoder).
    dropout: initial dropout rate; stored on the trainer and passed to
        `model.begin_update`.
    nB: batch size given to `model.begin_training`.
    nE: number of epochs given to `model.begin_training`.
    use_gpu: any value other than -1 calls `spacy.require_gpu()`.
    lim: the last `lim` examples of each split are kept.

    After every epoch one line is printed with: epoch number, summed squared
    training gradient, training accuracy, summed squared dev gradient and
    dev accuracy.
    """
    if use_gpu != -1:
        # TODO: Make specific to different devices, e.g. 1 vs 0
        spacy.require_gpu()
    train, dev, test = get_iwslt()
    train_X, train_Y = zip(*train)
    dev_X, dev_Y = zip(*dev)
    test_X, test_Y = zip(*test)
    # Read dataset
    nlp_en = spacy.load('en_core_web_sm')
    nlp_de = spacy.load('de_core_news_sm')
    print('Models loaded')
    # Keep the control tokens as single tokens in both tokenizers.
    for control_token in ("<eos>", "<bos>", "<pad>"):
        nlp_en.tokenizer.add_special_case(control_token, [{ORTH: control_token}])
        nlp_de.tokenizer.add_special_case(control_token, [{ORTH: control_token}])
    train_X, train_Y = spacy_tokenize(
        nlp_en.tokenizer, nlp_de.tokenizer,
        train_X[-lim:], train_Y[-lim:], MAX_LENGTH)
    dev_X, dev_Y = spacy_tokenize(
        nlp_en.tokenizer, nlp_de.tokenizer,
        dev_X[-lim:], dev_Y[-lim:], MAX_LENGTH)
    test_X, test_Y = spacy_tokenize(
        nlp_en.tokenizer, nlp_de.tokenizer,
        test_X[-lim:], test_Y[-lim:], MAX_LENGTH)
    # Only the training split is passed through set_numeric_ids (as in the
    # original code); dev docs are fed to the model as tokenized above.
    train_X = set_numeric_ids(nlp_en.vocab, train_X, vocab_size=VOCAB_SIZE)
    train_Y = set_numeric_ids(nlp_de.vocab, train_Y, vocab_size=VOCAB_SIZE)
    nTGT = VOCAB_SIZE

    with Model.define_operators({">>": chain}):
        embed_cols = [ORTH, SHAPE, PREFIX, SUFFIX]
        extractor = FeatureExtracter(attrs=embed_cols)
        position_encode = PositionEncode(MAX_LENGTH, MODEL_SIZE)
        # Each stage is applied to the (source, target) pair side by side.
        model = (
            apply_layers(extractor, extractor)
            >> apply_layers(
                with_flatten(FancyEmbed(MODEL_SIZE, 5000, cols=embed_cols)),
                with_flatten(FancyEmbed(MODEL_SIZE, 5000, cols=embed_cols)),
            )
            >> apply_layers(Residual(position_encode), Residual(position_encode))
            >> create_batch()
            >> EncoderDecoder(nS=nS, nH=nH, nTGT=nTGT)
        )

    # One entry per epoch; the last entry is the epoch in progress.
    losses = [0.]
    train_accuracies = [0.]
    train_totals = [0.]
    dev_accuracies = [0.]
    dev_loss = [0.]

    def track_progress():
        """Evaluate on the dev set, print epoch statistics, start new slots."""
        correct = 0.
        total = 0.
        for batch in minibatch(zip(dev_X, dev_Y), size=1024):
            X, Y = zip(*batch)
            Yh, Y_mask = model((X, Y))
            L, C = get_loss(model.ops, Yh, Y, Y_mask)
            correct += C
            dev_loss[-1] += (L**2).sum()
            # Count tokens, matching how the training loop normalises the
            # same `C` value; counting sentences here inflated dev accuracy.
            total += sum(len(y) for y in Y)
        # Guard against an empty dev set / an epoch without training batches.
        dev_accuracies[-1] = correct / total if total else 0.0
        n_train = train_totals[-1]
        train_acc = train_accuracies[-1] / n_train if n_train else 0.0
        print(len(losses), losses[-1], train_acc,
              dev_loss[-1], dev_accuracies[-1])
        dev_loss.append(0.)
        losses.append(0.)
        train_accuracies.append(0.)
        dev_accuracies.append(0.)
        train_totals.append(0.)

    with model.begin_training(batch_size=nB, nb_epoch=nE) as (trainer, optimizer):
        trainer.dropout = dropout
        trainer.dropout_decay = 1e-4
        trainer.each_epoch.append(track_progress)
        optimizer.alpha = 0.001
        optimizer.L2 = 1e-6
        optimizer.max_grad_norm = 1.0
        for X, Y in trainer.iterate(train_X, train_Y):
            # NOTE(review): this passes the fixed `dropout`, not
            # `trainer.dropout`, so the decay configured above looks like it
            # is never applied -- confirm whether that is intended.
            (Yh, X_mask), backprop = model.begin_update((X, Y), drop=dropout)
            dYh, C = get_loss(model.ops, Yh, Y, X_mask)
            backprop(dYh, sgd=optimizer)
            losses[-1] += (dYh**2).sum()
            train_accuracies[-1] += C
            train_totals[-1] += sum(len(y) for y in Y)
def build_text_classifier(nr_class, width=64, **cfg):
    """Assemble a text classifier from a bag-of-words model and a CNN model.

    nr_class: size of the output layer.
    width: width of the embedding / hidden layers.
    cfg: optional settings read here: "depth" (default 2), "nr_vector"
        (default 5000), "pretrained_dims" (default 0), "low_data",
        "ngram_size" (default 1) and "exclusive_classes".

    When both "low_data" and "pretrained_dims" are truthy, a smaller
    attention-pooling model over SpacyVectors is returned immediately; that
    model gets none of the `tok2vec`, `nO` or `lsuv` attributes. Otherwise
    the returned model has `tok2vec`, `nO = nr_class` and `lsuv = False`.
    """
    conv_depth = cfg.get("depth", 2)
    n_rows = cfg.get("nr_vector", 5000)
    vec_dims = cfg.get("pretrained_dims", 0)
    operators = {">>": chain, "+": add, "|": concatenate, "**": clone}
    with Model.define_operators(operators):
        if cfg.get("low_data") and vec_dims:
            # Low-data shortcut: pretrained vectors only, no learned embeddings.
            return (
                SpacyVectors
                >> flatten_add_lengths
                >> with_getitem(0, Affine(width, vec_dims))
                >> ParametricAttention(width)
                >> Pooling(sum_pool)
                >> Residual(ReLu(width, width)) ** 2
                >> zero_init(Affine(nr_class, width, drop_factor=0.0))
                >> logistic
            )

        half_width = width // 2
        lower_embed = HashEmbed(width, n_rows, column=1)
        prefix_embed = HashEmbed(half_width, n_rows, column=2)
        suffix_embed = HashEmbed(half_width, n_rows, column=3)
        shape_embed = HashEmbed(half_width, n_rows, column=4)

        extract = FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID])
        learned_vectors = extract >> with_flatten(
            uniqued(
                (lower_embed | prefix_embed | suffix_embed | shape_embed)
                >> LN(Maxout(width, width + half_width * 3)),
                column=0,
            )
        )

        if vec_dims:
            pretrained = SpacyVectors >> with_flatten(Affine(width, vec_dims))
            # TODO Make concatenate support lists
            token_vectors = concatenate_lists(learned_vectors, pretrained)
            token_width = width * 2
        else:
            token_vectors = learned_vectors
            token_width = width

        # `conv_depth` residual window layers; padding matches the depth.
        tok2vec = token_vectors >> with_flatten(
            LN(Maxout(width, token_width))
            >> Residual(ExtractWindow(nW=1) >> LN(Maxout(width, width * 3)))
            ** conv_depth,
            pad=conv_depth,
        )
        cnn_model = (
            tok2vec
            >> flatten_add_lengths
            >> ParametricAttention(width)
            >> Pooling(sum_pool)
            >> Residual(zero_init(Maxout(width, width)))
            >> zero_init(Affine(nr_class, width, drop_factor=0.0))
        )

        linear_model = build_bow_text_classifier(
            nr_class,
            ngram_size=cfg.get("ngram_size", 1),
            exclusive_classes=False,
        )

        # Both sub-models emit nr_class scores, hence nr_class * 2 inputs.
        if cfg.get("exclusive_classes"):
            output_layer = Softmax(nr_class, nr_class * 2)
        else:
            output_layer = (
                zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0))
                >> logistic
            )

        model = (linear_model | cnn_model) >> output_layer
        model.tok2vec = chain(tok2vec, flatten)
    model.nO = nr_class
    model.lsuv = False
    return model
def main(nH=6, dropout=0.0, nS=6, nB=32, nE=20, use_gpu=-1, lim=2000,
         nM=300, mL=20, nTGT=3500, save=False, load=False,
         save_name="model.pkl", load_name="model.pkl"):
    """Train (or load and continue training) an EncoderDecoder model.

    nH, nS, nM: forwarded to `EncoderDecoder`; nM is also the width given to
        `FancyEmbed` and `PositionEncode`.
    dropout: stored on the trainer (see the review note in the loop).
    nB: batch size given to `model.begin_training`.
    nE: number of epochs given to `model.begin_training`.
    use_gpu: any value other than -1 calls `spacy.require_gpu()` and selects
        device 'cuda'; otherwise 'cpu'.
    lim: at most the first `lim` examples of each split are used.
    mL: passed to `spacy_tokenize` and `PositionEncode` (presumably the
        maximum sentence length -- confirm).
    nTGT: passed to `set_rank`; incremented by one before building the model.
    save / save_name: when `save` is truthy the model is written with
        `model.to_disk(save_name)` after training.
    load / load_name: when `load` is truthy the model comes from
        `Model.from_disk(load_name)` instead of being built.
    """
    if use_gpu != -1:
        # TODO: Make specific to different devices, e.g. 1 vs 0
        spacy.require_gpu()
        device = 'cuda'
    else:
        device = 'cpu'
    train, dev, test = get_iwslt()
    train_X, train_Y = zip(*train)
    dev_X, dev_Y = zip(*dev)
    test_X, test_Y = zip(*test)
    # Read dataset
    nlp_en = spacy.load('en_core_web_sm')
    nlp_de = spacy.load('de_core_news_sm')
    print('Models loaded')
    # Keep the control tokens as single tokens in both tokenizers.
    for control_token in ("<eos>", "<bos>", "<pad>"):
        nlp_en.tokenizer.add_special_case(control_token, [{ORTH: control_token}])
        nlp_de.tokenizer.add_special_case(control_token, [{ORTH: control_token}])
    train_lim = min(lim, len(train_X))
    dev_lim = min(lim, len(dev_X))
    test_lim = min(lim, len(test_X))
    train_X, train_Y = spacy_tokenize(
        nlp_en.tokenizer, nlp_de.tokenizer,
        train_X[:train_lim], train_Y[:train_lim], mL)
    dev_X, dev_Y = spacy_tokenize(
        nlp_en.tokenizer, nlp_de.tokenizer,
        dev_X[:dev_lim], dev_Y[:dev_lim], mL)
    test_X, test_Y = spacy_tokenize(
        nlp_en.tokenizer, nlp_de.tokenizer,
        test_X[:test_lim], test_Y[:test_lim], mL)
    all_X_docs = train_X + dev_X + test_X
    all_y_docs = train_Y + dev_Y + test_Y
    set_rank(nlp_en.vocab, all_X_docs, nTGT=nTGT)
    set_rank(nlp_de.vocab, all_y_docs, nTGT=nTGT)
    train_X = set_numeric_ids(nlp_en.vocab, train_X)
    dev_X = set_numeric_ids(nlp_en.vocab, dev_X)
    test_X = set_numeric_ids(nlp_en.vocab, test_X)
    train_Y = set_numeric_ids(nlp_de.vocab, train_Y)
    dev_Y = set_numeric_ids(nlp_de.vocab, dev_Y)
    test_Y = set_numeric_ids(nlp_de.vocab, test_Y)
    # The lookup tables are not used below; the calls are kept as in the
    # original in case get_dicts has side effects.
    en_word2indx, en_indx2word = get_dicts(nlp_en.vocab)
    de_word2indx, de_indx2word = get_dicts(nlp_de.vocab)
    nTGT += 1

    if not load:
        with Model.define_operators({">>": chain}):
            embed_cols = [ORTH, SHAPE, PREFIX, SUFFIX]
            extractor = FeatureExtracter(attrs=embed_cols)
            position_encode = PositionEncode(mL, nM)
            # Each stage is applied to the (source, target) pair side by side.
            model = (
                apply_layers(extractor, extractor)
                >> apply_layers(
                    with_flatten(FancyEmbed(nM, 5000, cols=embed_cols)),
                    with_flatten(FancyEmbed(nM, 5000, cols=embed_cols)),
                )
                >> apply_layers(Residual(position_encode),
                                Residual(position_encode))
                >> create_batch()
                >> EncoderDecoder(nS=nS, nH=nH, nTGT=nTGT, nM=nM, device=device)
            )
    else:
        model = Model.from_disk(load_name)

    # One entry per epoch; the last entry is the epoch in progress.
    losses = [0.]
    train_accuracies = [0.]
    train_totals = [0.]
    dev_accuracies = [0.]
    dev_loss = [0.]

    def track_progress():
        """Evaluate on the dev set, print epoch statistics, start new slots."""
        correct = 0.
        total = 0.
        for batch in minibatch(zip(dev_X, dev_Y), size=1024):
            X, Y = zip(*batch)
            Yh, Y_mask = model((X, Y))
            L, C, batch_total = get_loss(model.ops, Yh, Y, Y_mask)
            correct += C
            # Accumulate across batches: rebinding `total` to the last
            # batch's count divided the summed `correct` by one batch only.
            total += batch_total
            dev_loss[-1] += (L**2).sum()
        # Guard against an empty dev set / an epoch without training batches.
        dev_accuracies[-1] = correct / total if total else 0.0
        n_train = train_totals[-1]
        train_acc = train_accuracies[-1] / n_train if n_train else 0.0
        print(len(losses), losses[-1], train_acc,
              dev_loss[-1], dev_accuracies[-1])
        dev_loss.append(0.)
        losses.append(0.)
        train_accuracies.append(0.)
        dev_accuracies.append(0.)
        train_totals.append(0.)

    with model.begin_training(batch_size=nB, nb_epoch=nE) as (trainer, optimizer):
        trainer.dropout = dropout
        trainer.dropout_decay = 1e-4
        trainer.each_epoch.append(track_progress)
        optimizer.alpha = 0.001
        optimizer.L2 = 1e-6
        optimizer.max_grad_norm = 1.0
        for X, Y in trainer.iterate(train_X, train_Y):
            # NOTE(review): no `drop=` is passed here, so the `dropout`
            # argument looks like it has no effect on training -- confirm.
            (Yh, X_mask), backprop = model.begin_update((X, Y))
            dYh, C, total = get_loss(model.ops, Yh, Y, X_mask)
            backprop(dYh, sgd=optimizer)
            losses[-1] += (dYh**2).sum()
            train_accuracies[-1] += C
            train_totals[-1] += total
    if save:
        model.to_disk(save_name)
def build_text_classifier(nr_class, width=64, **cfg):
    """Build the ensemble text classifier (bag-of-words model | CNN model).

    nr_class: number of output units.
    width: width used for the embedding and hidden layers.
    cfg: optional keys consulted here: "depth" (default 2), "nr_vector"
        (default 5000), "pretrained_dims" (default 0), "low_data",
        "ngram_size" (default 1), "exclusive_classes".

    If "low_data" and "pretrained_dims" are both truthy the function returns
    early with a compact model over SpacyVectors, without setting `tok2vec`,
    `nO` or `lsuv`. In every other case those three attributes are set on
    the returned model.
    """
    depth = cfg.get("depth", 2)
    hash_rows = cfg.get("nr_vector", 5000)
    static_dims = cfg.get("pretrained_dims", 0)
    ops_map = {">>": chain, "+": add, "|": concatenate, "**": clone}
    with Model.define_operators(ops_map):
        if cfg.get("low_data") and static_dims:
            small_model = (
                SpacyVectors
                >> flatten_add_lengths
                >> with_getitem(0, Affine(width, static_dims))
                >> ParametricAttention(width)
                >> Pooling(sum_pool)
                >> Residual(ReLu(width, width)) ** 2
                >> zero_init(Affine(nr_class, width, drop_factor=0.0))
                >> logistic
            )
            return small_model

        narrow = width // 2
        embed_lower = HashEmbed(width, hash_rows, column=1)
        embed_prefix = HashEmbed(narrow, hash_rows, column=2)
        embed_suffix = HashEmbed(narrow, hash_rows, column=3)
        embed_shape = HashEmbed(narrow, hash_rows, column=4)

        features = FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID])
        trained = features >> with_flatten(
            uniqued(
                (embed_lower | embed_prefix | embed_suffix | embed_shape)
                >> LN(Maxout(width, width + narrow * 3)),
                column=0,
            )
        )

        # Default to the learned embeddings; widen when static vectors exist.
        inputs, inputs_width = trained, width
        if static_dims:
            static = SpacyVectors >> with_flatten(Affine(width, static_dims))
            # TODO Make concatenate support lists
            inputs = concatenate_lists(trained, static)
            inputs_width = width * 2

        tok2vec = inputs >> with_flatten(
            LN(Maxout(width, inputs_width))
            >> Residual(ExtractWindow(nW=1) >> LN(Maxout(width, width * 3)))
            ** depth,
            pad=depth,
        )
        cnn_model = (
            tok2vec
            >> flatten_add_lengths
            >> ParametricAttention(width)
            >> Pooling(sum_pool)
            >> Residual(zero_init(Maxout(width, width)))
            >> zero_init(Affine(nr_class, width, drop_factor=0.0))
        )

        bow_model = build_bow_text_classifier(
            nr_class,
            ngram_size=cfg.get("ngram_size", 1),
            exclusive_classes=False,
        )

        # The head sees the scores of both sub-models: nr_class * 2 inputs.
        if cfg.get("exclusive_classes"):
            head = Softmax(nr_class, nr_class * 2)
        else:
            head = (
                zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0))
                >> logistic
            )

        model = (bow_model | cnn_model) >> head
        model.tok2vec = chain(tok2vec, flatten)
    model.nO = nr_class
    model.lsuv = False
    return model
def Tok2Vec(width, embed_size, **kwargs):
    """Build the token-to-vector model: embeddings, then a residual CNN.

    width: output width; also the width of every embedding table.
    embed_size: number of rows for the NORM table; the PREFIX, SUFFIX and
        SHAPE tables use `embed_size // 2`.
    kwargs: optional settings read here:
        pretrained_vectors (default None): when not None, a StaticVectors
            layer on the ID column is added.
        cnn_maxout_pieces (default 3): `pieces` for the convolution Maxout
            and for the char-embed reduction layer.
        subword_features (default True): use PREFIX/SUFFIX/SHAPE tables.
        char_embed (default False): use CharacterEmbed; forces
            subword_features off.
        conv_depth (default 4): number of residual convolution layers.
        bilstm_depth (default 0): when >= 1, a PyTorchBiLSTM is appended.

    The returned model has `nO = width` and `embed` set to the embedding
    sub-model.
    """
    # Circular imports :(
    from .._ml import CharacterEmbed
    from .._ml import PyTorchBiLSTM

    pretrained_vectors = kwargs.get("pretrained_vectors", None)
    cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3)
    char_embed = kwargs.get("char_embed", False)
    # Character embeddings replace the subword features entirely.
    subword_features = (
        False if char_embed else kwargs.get("subword_features", True)
    )
    conv_depth = kwargs.get("conv_depth", 4)
    bilstm_depth = kwargs.get("bilstm_depth", 0)
    has_vectors = pretrained_vectors is not None

    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
    orth_col = cols.index(ORTH)
    with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
        norm = HashEmbed(
            width, embed_size, column=cols.index(NORM),
            name="embed_norm", seed=6,
        )
        if subword_features:
            prefix = HashEmbed(
                width, embed_size // 2, column=cols.index(PREFIX),
                name="embed_prefix", seed=7,
            )
            suffix = HashEmbed(
                width, embed_size // 2, column=cols.index(SUFFIX),
                name="embed_suffix", seed=8,
            )
            shape = HashEmbed(
                width, embed_size // 2, column=cols.index(SHAPE),
                name="embed_shape", seed=9,
            )
        else:
            prefix = suffix = shape = None
        if has_vectors:
            glove = StaticVectors(
                pretrained_vectors, width, column=cols.index(ID)
            )

        if subword_features:
            # NORM + subword tables, plus static vectors when available.
            if has_vectors:
                word_features = glove | norm | prefix | suffix | shape
                n_features = 5
            else:
                word_features = norm | prefix | suffix | shape
                n_features = 4
            embed = uniqued(
                word_features
                >> LN(Maxout(width, width * n_features, pieces=3)),
                column=orth_col,
            )
        elif char_embed:
            # Character embeddings next to one word-level table.
            word_table = glove if has_vectors else norm
            embed = concatenate_lists(
                CharacterEmbed(nM=64, nC=8),
                FeatureExtracter(cols) >> with_flatten(word_table),
            )
            reduce_dimensions = LN(
                Maxout(width, 64 * 8 + width, pieces=cnn_maxout_pieces)
            )
        elif has_vectors:
            embed = uniqued(
                (glove | norm) >> LN(Maxout(width, width * 2, pieces=3)),
                column=orth_col,
            )
        else:
            embed = norm

        convolution = Residual(
            ExtractWindow(nW=1)
            >> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces))
        )
        if char_embed:
            # `embed` already extracts its own features in this mode.
            tok2vec = embed >> with_flatten(
                reduce_dimensions >> convolution ** conv_depth,
                pad=conv_depth,
            )
        else:
            tok2vec = FeatureExtracter(cols) >> with_flatten(
                embed >> convolution ** conv_depth,
                pad=conv_depth,
            )

        if bilstm_depth >= 1:
            tok2vec = tok2vec >> PyTorchBiLSTM(width, width, bilstm_depth)
        # Work around thinc API limitations :(. TODO: Revise in Thinc 7
        tok2vec.nO = width
        tok2vec.embed = embed
    return tok2vec
def main(nH=6, dropout=0.0, nS=6, nB=32, nE=20, use_gpu=-1, lim=2000,
         nM=300, mL=100, save=False, save_name="model.pkl"):
    """Train a Categorizer model on the data returned by imdb().

    nH, nS, nM: forwarded to `Categorizer`; nM is also the width given to
        `FancyEmbed` and `PositionEncode`.
    dropout: stored on the trainer (see the review note in the loop).
    nB: batch size given to `model.begin_training`.
    nE: number of epochs given to `model.begin_training`.
    use_gpu: any value other than -1 calls `spacy.require_gpu()` and selects
        device 'cuda'; otherwise 'cpu'.
    lim: passed to `imdb(limit=...)`; both splits are also sliced to `lim`.
    mL: passed to `spacy_tokenize` and `PositionEncode` (presumably the
        maximum text length -- confirm).
    save / save_name: when `save` is truthy the model is written with
        `model.to_disk(save_name)` after training.
    """
    if use_gpu != -1:
        # TODO: Make specific to different devices, e.g. 1 vs 0
        spacy.require_gpu()
        device = 'cuda'
    else:
        device = 'cpu'
    # Read dataset
    nlp = spacy.load('en_core_web_sm')
    # Keep the control tokens as single tokens.
    for control_token in ("<eos>", "<bos>", "<pad>", "<cls>"):
        nlp.tokenizer.add_special_case(control_token, [{ORTH: control_token}])
    train, dev = imdb(limit=lim)
    print('Loaded imdb dataset')
    train = train[:lim]
    dev = dev[:lim]
    train_X, train_Y = zip(*train)
    dev_X, dev_Y = zip(*dev)
    train_X = spacy_tokenize(nlp.tokenizer, train_X, mL=mL)
    dev_X = spacy_tokenize(nlp.tokenizer, dev_X, mL=mL)
    print('Tokenized dataset')
    train_X = set_numeric_ids(nlp.vocab, train_X)
    dev_X = set_numeric_ids(nlp.vocab, dev_X)
    print('Numeric ids ready')

    with Model.define_operators({">>": chain}):
        embed_cols = [ORTH, SHAPE, PREFIX, SUFFIX]
        extractor = FeatureExtracter(attrs=embed_cols)
        position_encode = PositionEncode(mL, nM)
        # Use the extractor built above instead of constructing a second,
        # identical FeatureExtracter.
        model = (
            extractor
            >> with_flatten(FancyEmbed(nM, 5000, cols=embed_cols))
            >> Residual(position_encode)
            >> create_model_input()
            >> Categorizer(nM=nM, nS=nS, nH=nH, device=device)
        )

    # One entry per epoch; the last entry is the epoch in progress.
    losses = [0.]
    train_accuracies = [0.]
    train_totals = [0.]
    dev_accuracies = [0.]
    dev_loss = [0.]

    def track_progress():
        """Evaluate on the dev set, print epoch statistics, start new slots."""
        correct = 0.
        total = 0.
        for batch in minibatch(zip(dev_X, dev_Y), size=1024):
            X, Y = zip(*batch)
            Yh = model(X)
            L, C = get_loss(Yh, Y)
            correct += C
            dev_loss[-1] += (L**2).sum()
            total += len(X)
        # Guard against an empty dev set / an epoch without training batches.
        dev_accuracies[-1] = correct / total if total else 0.0
        n_train = train_totals[-1]
        train_acc = train_accuracies[-1] / n_train if n_train else 0.0
        print(len(losses), losses[-1], train_acc,
              dev_loss[-1], dev_accuracies[-1])
        dev_loss.append(0.)
        losses.append(0.)
        train_accuracies.append(0.)
        dev_accuracies.append(0.)
        train_totals.append(0.)

    with model.begin_training(batch_size=nB, nb_epoch=nE) as (trainer, optimizer):
        trainer.dropout = dropout
        trainer.dropout_decay = 1e-4
        trainer.each_epoch.append(track_progress)
        optimizer.alpha = 0.001
        optimizer.L2 = 1e-6
        optimizer.max_grad_norm = 1.0
        for X, Y in trainer.iterate(train_X, train_Y):
            # NOTE(review): no `drop=` is passed here, so the `dropout`
            # argument looks like it has no effect on training -- confirm.
            Yh, backprop = model.begin_update(X)
            dYh, C = get_loss(Yh, Y)
            backprop(dYh, sgd=optimizer)
            losses[-1] += (dYh**2).sum()
            train_accuracies[-1] += C
            train_totals[-1] += len(Y)
    if save:
        model.to_disk(save_name)