def build_model(nr_class, width, **kwargs):
    with Model.define_operators({'|': concatenate, '>>': chain, '**': clone}):
        model = (
            FeatureExtracter([ORTH])
            >> flatten_add_lengths
            >> with_getitem(0, uniqued(HashEmbed(width, 10000, column=0)))
            >> Pooling(mean_pool)
            >> Softmax(nr_class)
        )
        model.lsuv = False
    return model
def build_spancat_model(
    tok2vec: Model[List[Doc], List[Floats2d]],
    reducer: Model[Ragged, Floats2d],
    scorer: Model[Floats2d, Floats2d],
) -> Model[Tuple[List[Doc], Ragged], Floats2d]:
    """Build a span categorizer model, given a token-to-vector model, a
    reducer model to map the sequence of vectors for each span down to a
    single vector, and a scorer model to map the vectors to probabilities.

    tok2vec (Model[List[Doc], List[Floats2d]]): The tok2vec model.
    reducer (Model[Ragged, Floats2d]): The reducer model.
    scorer (Model[Floats2d, Floats2d]): The scorer model.
    """
    model = chain(
        cast(
            Model[Tuple[List[Doc], Ragged], Tuple[Ragged, Ragged]],
            with_getitem(
                0, chain(tok2vec, cast(Model[List[Floats2d], Ragged], list2ragged()))
            ),
        ),
        extract_spans(),
        reducer,
        scorer,
    )
    model.set_ref("tok2vec", tok2vec)
    model.set_ref("reducer", reducer)
    model.set_ref("scorer", scorer)
    return model
def build_model(nr_class, width, depth, conv_depth, **kwargs):
    with Model.define_operators({"|": concatenate, ">>": chain, "**": clone}):
        embed = (
            HashEmbed(width, 5000, column=1)
            | StaticVectors("spacy_pretrained_vectors", width, column=5)
            | HashEmbed(width // 2, 750, column=2)
            | HashEmbed(width // 2, 750, column=3)
            | HashEmbed(width // 2, 750, column=4)
        ) >> LN(Maxout(width))
        sent2vec = (
            flatten_add_lengths
            >> with_getitem(
                0,
                embed >> Residual(ExtractWindow(nW=1) >> LN(Maxout(width))) ** conv_depth,
            )
            >> ParametricAttention(width)
            >> Pooling(sum_pool)
            >> Residual(LN(Maxout(width))) ** depth
        )
        model = (
            foreach(sent2vec, drop_factor=2.0)
            >> flatten_add_lengths
            # This block would allow the model to learn some cross-sentence
            # features. It's not useful on this problem. It might make more
            # sense to use a BiLSTM here, following Liang et al (2016).
            # >> with_getitem(0,
            #     Residual(ExtractWindow(nW=1) >> LN(Maxout(width))) ** conv_depth
            # )
            >> ParametricAttention(width, hard=False)
            >> Pooling(sum_pool)
            >> Residual(LN(Maxout(width))) ** depth
            >> Softmax(nr_class)
        )
        model.lsuv = False
    return model
def softmax_tanh_class_vector(nr_class, *, exclusive_classes=True, **cfg):
    """Select features from the class-vectors from the last hidden state,
    mean-pool them, and softmax to produce one vector per document.
    The gradients of the class vectors are incremented in the backward pass,
    to allow fine-tuning.
    """
    width = cfg["token_vector_width"]
    return chain(
        get_pytt_class_tokens,
        flatten_add_lengths,
        with_getitem(0, chain(Affine(width, width), tanh)),
        Pooling(mean_pool),
        Softmax(nr_class, width),
    )
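# The classification heads here all rely on the same (values, lengths) idiom:
# flatten_add_lengths turns a list of per-document arrays into one flat array
# plus a lengths vector, with_getitem(0, ...) transforms only the array, and
# Pooling(mean_pool) collapses each document back to a single vector. The
# sketch below (not from the original snippets) shows that data flow using the
# modern thinc v8 names list2ragged / reduce_mean as stand-ins; the shapes are
# made up for illustration.
import numpy
from thinc.api import chain, list2ragged, reduce_mean

docs = [numpy.ones((3, 4), dtype="f"), numpy.zeros((2, 4), dtype="f")]
pool = chain(list2ragged(), reduce_mean())
pool.initialize(docs)
pooled = pool.predict(docs)  # one mean vector per input document
assert pooled.shape == (2, 4)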
def build_model(nr_class, width, **kwargs):
    with Model.define_operators({"|": concatenate, ">>": chain, "**": clone}):
        model = (
            FeatureExtracter([ORTH])
            >> flatten_add_lengths
            >> with_getitem(0, uniqued(HashEmbed(width, 10000, column=0)))
            >> Pooling(mean_pool)
            >> Softmax(nr_class)
        )
        model.lsuv = False
    return model
def build_text_classifier(nr_class, width=64, **cfg):
    nr_vector = cfg.get('nr_vector', 5000)
    pretrained_dims = cfg.get('pretrained_dims', 0)
    with Model.define_operators({'>>': chain, '+': add, '|': concatenate, '**': clone}):
        if cfg.get('low_data') and pretrained_dims:
            model = (
                SpacyVectors
                >> flatten_add_lengths
                >> with_getitem(0, Affine(width, pretrained_dims))
                >> ParametricAttention(width)
                >> Pooling(sum_pool)
                >> Residual(ReLu(width, width)) ** 2
                >> zero_init(Affine(nr_class, width, drop_factor=0.0))
                >> logistic
            )
            return model

        lower = HashEmbed(width, nr_vector, column=1)
        prefix = HashEmbed(width // 2, nr_vector, column=2)
        suffix = HashEmbed(width // 2, nr_vector, column=3)
        shape = HashEmbed(width // 2, nr_vector, column=4)

        trained_vectors = (
            FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID])
            >> with_flatten(
                uniqued(
                    (lower | prefix | suffix | shape)
                    >> LN(Maxout(width, width + (width // 2) * 3)),
                    column=0,
                )
            )
        )

        if pretrained_dims:
            static_vectors = SpacyVectors >> with_flatten(Affine(width, pretrained_dims))
            # TODO Make concatenate support lists
            vectors = concatenate_lists(trained_vectors, static_vectors)
            vectors_width = width * 2
        else:
            vectors = trained_vectors
            vectors_width = width
            static_vectors = None

        cnn_model = (
            vectors
            >> with_flatten(
                LN(Maxout(width, vectors_width))
                >> Residual((ExtractWindow(nW=1) >> LN(Maxout(width, width * 3)))) ** 2,
                pad=2,
            )
            >> flatten_add_lengths
            >> ParametricAttention(width)
            >> Pooling(sum_pool)
            >> Residual(zero_init(Maxout(width, width)))
            >> zero_init(Affine(nr_class, width, drop_factor=0.0))
        )

        linear_model = _preprocess_doc >> LinearModel(nr_class, drop_factor=0.0)
        model = (
            (linear_model | cnn_model)
            >> zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0))
            >> logistic
        )
        model.nO = nr_class
        model.lsuv = False
    return model
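# A hedged sketch (not from the original snippets) of the ensembling pattern
# used above, written with current thinc v8 layer names: two sub-models see the
# same input, concatenate() stacks their outputs, and a final layer maps the
# joint vector to class scores. The Linear/Relu sub-models and the dimensions
# are made-up stand-ins, not the real linear and CNN text classifiers.
import numpy
from thinc.api import Linear, Relu, Softmax, chain, concatenate

nr_class, width = 3, 16
linear_model = Linear(nO=nr_class, nI=width)
cnn_model = chain(Relu(nO=32, nI=width), Linear(nO=nr_class, nI=32))
ensemble = chain(
    concatenate(linear_model, cnn_model),   # : floats{B, nr_class * 2}
    Softmax(nO=nr_class, nI=nr_class * 2),  # : floats{B, nr_class}
)
X = numpy.random.uniform(-1, 1, (8, width)).astype("f")
ensemble.initialize(X=X)
assert ensemble.predict(X).shape == (8, nr_class)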
def softmax_pooler_output(nr_class, *, exclusive_classes=True, **cfg):
    """Select features from the pooler output, (if necessary) mean-pool them
    to produce one vector per item, and then softmax them.
    The gradients of the class vectors are incremented in the backward pass,
    to allow fine-tuning.
    """
    return chain(
        get_pooler_output,
        flatten_add_lengths,
        with_getitem(0, Softmax(nr_class, cfg["token_vector_width"])),
        Pooling(mean_pool),
    )
def fine_tune_pooler_output(nr_class, *, exclusive_classes=True, **cfg):
    """Select features from the class-vectors from the last hidden state,
    softmax them, and then mean-pool them to produce one vector per document.
    The gradients of the class vectors are incremented in the backward pass,
    to allow fine-tuning.
    """
    return chain(
        get_pytt_pooler_output,
        flatten_add_lengths,
        with_getitem(0, Softmax(nr_class, cfg["token_vector_width"])),
        Pooling(mean_pool),
    )
def test_with_getitem():
    data = (
        numpy.asarray([[1, 2, 3, 4]], dtype="f"),
        numpy.asarray([[5, 6, 7, 8]], dtype="f"),
    )
    model = with_getitem(1, Linear())
    model.initialize(data, data)
    Y, backprop = model.begin_update(data)
    assert len(Y) == len(data)
    assert numpy.array_equal(Y[0], data[0])  # the other item stayed the same
    assert not numpy.array_equal(Y[1], data[1])
    dX = backprop(Y)
    assert numpy.array_equal(dX[0], data[0])
    assert not numpy.array_equal(dX[1], data[1])
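# A minimal sketch (not part of the test suite above) of what with_getitem
# does: the wrapped layer runs only on the indexed element of the input tuple,
# and every other element passes through unchanged in both the forward and
# backward pass. Array sizes are arbitrary.
import numpy
from thinc.api import Linear, with_getitem

ids = numpy.asarray([[1.0, 2.0]], dtype="f")
vectors = numpy.asarray([[3.0, 4.0]], dtype="f")
model = with_getitem(1, Linear(nO=2, nI=2))
model.initialize((ids, vectors), (ids, vectors))
Y = model.predict((ids, vectors))
assert numpy.array_equal(Y[0], ids)  # item 0 is untouched
assert Y[1].shape == vectors.shape   # item 1 went through the Linear layer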
def build_model(nr_class, width, depth, conv_depth, **kwargs):
    with Model.define_operators({'|': concatenate, '>>': chain, '**': clone}):
        embed = (
            HashEmbed(width, 5000, column=1)
            | StaticVectors('spacy_pretrained_vectors', width, column=5)
            | HashEmbed(width // 2, 750, column=2)
            | HashEmbed(width // 2, 750, column=3)
            | HashEmbed(width // 2, 750, column=4)
        ) >> LN(Maxout(width))
        sent2vec = (
            flatten_add_lengths
            >> with_getitem(
                0,
                embed >> Residual(ExtractWindow(nW=1) >> LN(Maxout(width))) ** conv_depth,
            )
            >> ParametricAttention(width)
            >> Pooling(sum_pool)
            >> Residual(LN(Maxout(width))) ** depth
        )
        model = (
            foreach(sent2vec, drop_factor=2.0)
            >> flatten_add_lengths
            >> ParametricAttention(width, hard=False)
            >> Pooling(sum_pool)
            >> Residual(LN(Maxout(width))) ** depth
            >> Softmax(nr_class)
        )
        model.lsuv = False
    return model
def build_model(nr_class, width, depth, conv_depth, **kwargs):
    with Model.define_operators({'|': concatenate, '>>': chain, '**': clone}):
        embed = (
            HashEmbed(width, 5000, column=1)
            | HashEmbed(width // 2, 750, column=2)
            | HashEmbed(width // 2, 750, column=3)
            | HashEmbed(width // 2, 750, column=4)
        ) >> Maxout(width)
        sent2vec = (
            FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE])
            >> flatten_add_lengths
            >> with_getitem(
                0,
                uniqued(embed, column=0)
                >> Residual(ExtractWindow(nW=1) >> SELU(width)) ** conv_depth,
            )
            >> ParametricAttention(width)
            >> Pooling(sum_pool)
            >> Residual(SELU(width)) ** depth
        )
        model = (
            foreach_sentence(sent2vec, drop_factor=2.0)
            >> flatten_add_lengths
            >> ParametricAttention(width, hard=False)
            >> Pooling(sum_pool)
            >> Residual(SELU(width)) ** depth
            >> Softmax(nr_class)
        )
        model.lsuv = False
    return model
def build_text_classifier(nr_class, width=64, **cfg):
    depth = cfg.get("depth", 2)
    nr_vector = cfg.get("nr_vector", 5000)
    pretrained_dims = cfg.get("pretrained_dims", 0)
    with Model.define_operators({">>": chain, "+": add, "|": concatenate, "**": clone}):
        if cfg.get("low_data") and pretrained_dims:
            model = (
                SpacyVectors
                >> flatten_add_lengths
                >> with_getitem(0, Affine(width, pretrained_dims))
                >> ParametricAttention(width)
                >> Pooling(sum_pool)
                >> Residual(ReLu(width, width)) ** 2
                >> zero_init(Affine(nr_class, width, drop_factor=0.0))
                >> logistic
            )
            return model

        lower = HashEmbed(width, nr_vector, column=1)
        prefix = HashEmbed(width // 2, nr_vector, column=2)
        suffix = HashEmbed(width // 2, nr_vector, column=3)
        shape = HashEmbed(width // 2, nr_vector, column=4)

        trained_vectors = FeatureExtracter(
            [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
        ) >> with_flatten(
            uniqued(
                (lower | prefix | suffix | shape)
                >> LN(Maxout(width, width + (width // 2) * 3)),
                column=0,
            )
        )

        if pretrained_dims:
            static_vectors = SpacyVectors >> with_flatten(Affine(width, pretrained_dims))
            # TODO Make concatenate support lists
            vectors = concatenate_lists(trained_vectors, static_vectors)
            vectors_width = width * 2
        else:
            vectors = trained_vectors
            vectors_width = width
            static_vectors = None

        tok2vec = vectors >> with_flatten(
            LN(Maxout(width, vectors_width))
            >> Residual((ExtractWindow(nW=1) >> LN(Maxout(width, width * 3)))) ** depth,
            pad=depth,
        )
        cnn_model = (
            tok2vec
            >> flatten_add_lengths
            >> ParametricAttention(width)
            >> Pooling(sum_pool)
            >> Residual(zero_init(Maxout(width, width)))
            >> zero_init(Affine(nr_class, width, drop_factor=0.0))
        )

        linear_model = build_bow_text_classifier(
            nr_class, ngram_size=cfg.get("ngram_size", 1), exclusive_classes=False
        )
        if cfg.get("exclusive_classes"):
            output_layer = Softmax(nr_class, nr_class * 2)
        else:
            output_layer = (
                zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0)) >> logistic
            )
        model = (linear_model | cnn_model) >> output_layer
        model.tok2vec = chain(tok2vec, flatten)
        model.nO = nr_class
        model.lsuv = False
    return model
def main(width=100, depth=4, vector_length=64, min_batch_size=4,
         max_batch_size=32, learn_rate=0.001, momentum=0.9, dropout=0.0,
         dropout_decay=1e-4, nb_epoch=20, L2=1e-6):
    cfg = dict(locals())
    print(cfg)
    prefer_gpu()
    train_data, check_data, nr_tag = ancora_pos_tags()
    extracter = FeatureExtracter('es', attrs=[LOWER, SHAPE, PREFIX, SUFFIX])
    Model.lsuv = True
    with Model.define_operators({'**': clone, '>>': chain, '+': add,
                                 '|': concatenate, '&': concatenate_ragged}):
        lower_case = HashEmbed(width, 100, column=0)
        shape = HashEmbed(width // 2, 200, column=1)
        prefix = HashEmbed(width // 2, 100, column=2)
        suffix = HashEmbed(width // 2, 100, column=3)

        model = (
            flatten_add_lengths
            >> with_getitem(
                0,
                (lower_case | shape | prefix | suffix)
                >> LayerNorm(Maxout(width, pieces=3)),
            )
            >> concatenate_ragged(
                SelfAttention(nK=16, nO=16, nI=width, nL=1, nR=1),
                SelfAttention(nK=16, nO=16, nI=width, nL=1, nR=1),
                SelfAttention(nK=16, nO=16, nI=width, nL=1, nR=1),
                SelfAttention(nK=16, nO=16, nI=width, nL=1, nR=1),
            )
            >> with_getitem(0, Softmax(nr_tag))
            >> unflatten
        )

    train_X, train_y = preprocess(model.ops, extracter, train_data, nr_tag)
    dev_X, dev_y = preprocess(model.ops, extracter, check_data, nr_tag)

    n_train = float(sum(len(x) for x in train_X))
    global epoch_train_acc
    with model.begin_training(train_X[:5000], train_y[:5000], **cfg) as (trainer, optimizer):
        trainer.each_epoch.append(track_progress(**locals()))
        trainer.batch_size = min_batch_size
        batch_size = float(min_batch_size)
        for X, y in trainer.iterate(train_X, train_y):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            gradient = [yh[i] - y[i] for i in range(len(yh))]
            backprop(gradient, optimizer)
            trainer.batch_size = min(int(batch_size), max_batch_size)
            batch_size *= 1.001
        with model.use_params(trainer.optimizer.averages):
            print(model.evaluate(dev_X, model.ops.flatten(dev_y)))
            with open('/tmp/model.pickle', 'wb') as file_:
                pickle.dump(model, file_)
def main(dataset='quora', width=50, depth=2, min_batch_size=1,
         max_batch_size=512, dropout=0.2, dropout_decay=0.0,
         pooling="mean+max", nb_epoch=5, pieces=3, L2=0.0, use_gpu=False,
         out_loc=None, quiet=False, job_id=None, ws_api_url=None,
         rest_api_url=None):
    global CTX
    if job_id is not None:
        CTX = neptune.Context()
        width = CTX.params.width
        L2 = CTX.params.L2
        nb_epoch = CTX.params.nb_epoch
        depth = CTX.params.depth
        max_batch_size = CTX.params.max_batch_size
    cfg = dict(locals())

    if out_loc:
        out_loc = Path(out_loc)
        if not out_loc.parent.exists():
            raise IOError("Can't open output location: %s" % out_loc)
    print(cfg)

    if pooling == 'mean+max':
        pool_layer = Pooling(mean_pool, max_pool)
    elif pooling == "mean":
        pool_layer = mean_pool
    elif pooling == "max":
        pool_layer = max_pool
    else:
        raise ValueError("Unrecognised pooling", pooling)

    print("Load spaCy")
    nlp = get_spacy('en')
    if use_gpu:
        Model.ops = CupyOps()

    print("Construct model")
    # Bind operators for the scope of the block:
    # * chain (>>): Compose models in a 'feed forward' style,
    #   i.e. chain(f, g)(x) -> g(f(x))
    # * clone (**): Create n copies of a model, and chain them, i.e.
    #   (f ** 3)(x) -> f''(f'(f(x))), where f, f' and f'' have distinct weights.
    # * concatenate (|): Merge the outputs of two models into a single vector,
    #   i.e. (f|g)(x) -> hstack(f(x), g(x))
    Model.lsuv = True
    # Model.ops = CupyOps()
    with Model.define_operators({'>>': chain, '**': clone, '|': concatenate, '+': add}):
        mwe_encode = ExtractWindow(nW=1) >> BN(
            Maxout(width, drop_factor=0.0, pieces=pieces))

        sent2vec = (  # List[spacy.token.Doc]{B}
            flatten_add_lengths  # : (ids{T}, lengths{B})
            >> with_getitem(0,
                # (StaticVectors('en', width)
                HashEmbed(width, 3000)
                # + HashEmbed(width, 3000))
                # >> Residual(mwe_encode ** 2)
            )  # : word_ids{T}
            >> Pooling(mean_pool, max_pool)
            # >> Residual(BN(Maxout(width*2, pieces=pieces), nO=width*2)**2)
            >> Maxout(width * 2, pieces=pieces, drop_factor=0.0)
            >> logistic
        )
        model = Siamese(sent2vec, CauchySimilarity(width * 2))

    print("Read and parse data: %s" % dataset)
    if dataset == 'quora':
        train, dev = datasets.quora_questions()
    elif dataset == 'snli':
        train, dev = datasets.snli()
    elif dataset == 'stackxc':
        train, dev = datasets.stack_exchange()
    elif dataset in ('quora+snli', 'snli+quora'):
        train, dev = datasets.quora_questions()
        train2, dev2 = datasets.snli()
        train.extend(train2)
        dev.extend(dev2)
    else:
        raise ValueError("Unknown dataset: %s" % dataset)

    get_ids = get_word_ids(Model.ops)
    train_X, train_y = preprocess(model.ops, nlp, train, get_ids)
    dev_X, dev_y = preprocess(model.ops, nlp, dev, get_ids)

    with model.begin_training(train_X[:10000], train_y[:10000], **cfg) as (trainer, optimizer):
        # Pass a callback to print progress. Give it all the local scope,
        # because why not?
        trainer.each_epoch.append(track_progress(**locals()))
        trainer.batch_size = min_batch_size
        batch_size = float(min_batch_size)
        print("Accuracy before training", model.evaluate_logloss(dev_X, dev_y))
        print("Train")
        global epoch_train_acc
        n_iter = 0
        for X, y in trainer.iterate(train_X, train_y, progress_bar=not quiet):
            # Slightly useful trick: Decay the dropout as training proceeds.
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            assert yh.shape == y.shape, (yh.shape, y.shape)
            assert (yh >= 0.).all(), yh
            train_acc = ((yh >= 0.5) == (y >= 0.5)).sum()
            loss = model.ops.xp.abs(yh - y).mean()
            epoch_train_acc += train_acc
            backprop(yh - y, optimizer)
            n_iter += 1
            # Slightly useful trick: start with low batch size, accelerate.
            trainer.batch_size = min(int(batch_size), max_batch_size)
            batch_size *= 1.001

    if out_loc:
        out_loc = Path(out_loc)
        print('Saving to', out_loc)
        with out_loc.open('wb') as file_:
            pickle.dump(model, file_, -1)
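# A hedged sketch (not from the original script) of the Siamese idea above,
# using the current thinc v8 names siamese / CauchySimilarity: one shared
# encoder embeds both sides of a pair, and the similarity layer turns the two
# vectors into a single score per pair. The Linear encoder stands in for the
# real sent2vec model; sizes are illustrative.
import numpy
from thinc.api import CauchySimilarity, Linear, siamese

encoder = Linear(nO=8, nI=4)  # stand-in for sent2vec
model = siamese(encoder, CauchySimilarity(nI=8))
left = numpy.random.uniform(-1, 1, (5, 4)).astype("f")
right = numpy.random.uniform(-1, 1, (5, 4)).astype("f")
model.initialize((left, right))
scores = model.predict((left, right))  # one similarity score per pair
assert scores.shape == (5,)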
def main(dataset='quora', width=128, depth=2, min_batch_size=128,
         max_batch_size=128, dropout=0.2, dropout_decay=0.0,
         pooling="mean+max", nb_epoch=20, pieces=3, use_gpu=False,
         out_loc=None, quiet=False):
    cfg = dict(locals())
    if out_loc:
        out_loc = Path(out_loc)
        if not out_loc.parent.exists():
            raise IOError("Can't open output location: %s" % out_loc)
    print(cfg)

    if pooling == 'mean+max':
        pool_layer = Pooling(mean_pool, max_pool)
    elif pooling == "mean":
        pool_layer = mean_pool
    elif pooling == "max":
        pool_layer = max_pool
    else:
        raise ValueError("Unrecognised pooling", pooling)

    print("Load spaCy")
    nlp = get_spacy('en')
    if use_gpu:
        Model.ops = CupyOps()

    print("Construct model")
    # Bind operators for the scope of the block:
    # * chain (>>): Compose models in a 'feed forward' style,
    #   i.e. chain(f, g)(x) -> g(f(x))
    # * clone (**): Create n copies of a model, and chain them, i.e.
    #   (f ** 3)(x) -> f''(f'(f(x))), where f, f' and f'' have distinct weights.
    # * concatenate (|): Merge the outputs of two models into a single vector,
    #   i.e. (f|g)(x) -> hstack(f(x), g(x))
    with Model.define_operators({'>>': chain, '**': clone, '|': concatenate, '*': multiply}):
        # Important trick: text isn't like images, and the best way to use
        # convolution is different. Don't use pooling-over-time. Instead,
        # use the window to compute one vector per word, and do this N deep.
        # In the first layer, we adjust each word vector based on the two
        # surrounding words --- this gives us essentially trigram vectors.
        # In the next layer, we have a trigram of trigrams --- so we're
        # conditioning on information from a five word slice. The third layer
        # gives us 7 words. This is like the BiLSTM insight: we're not trying
        # to learn a vector for the whole sentence in this step. We're just
        # trying to learn better, position-sensitive word features. This simple
        # convolution step is much more efficient than BiLSTM, and can be
        # computed in parallel for every token in the batch.
        mwe_encode = ExtractWindow(nW=1) >> Maxout(width, width * 3, pieces=pieces)

        # Comments indicate the output type and shape at each step of the pipeline.
        # * B: Number of sentences in the batch
        # * T: Total number of words in the batch
        #   (i.e. sum(len(sent) for sent in batch))
        # * W: Width of the network (input hyper-parameter)
        # * ids: ID for each word (integers).
        # * lengths: Number of words in each sentence in the batch (integers)
        # * floats: Standard dense vector.
        # (Dimensions annotated in curly braces.)
        sent2vec = (  # List[spacy.token.Doc]{B}
            get_word_ids
            >> flatten_add_lengths  # : (ids{T}, lengths{B})
            >> with_getitem(0,  # : word_ids{T}
                (TokenWeights(Model.ops, nlp) * SpacyVectors('en', width)
                 >> mwe_encode ** depth)
            )  # : (floats{T, W}, lengths{B})
            # Useful trick: Why choose between max pool and mean pool?
            # We may as well have both representations.
            >> pool_layer  # : floats{B, 2*W}
        )
        model = (
            diff(sent2vec)  # : floats{B, 8*W}
            >> Maxout(width, pieces=pieces)  # : floats{B, W}
            >> Softmax()  # : floats{B, 2}
        )

    print("Read and parse data: %s" % dataset)
    if dataset == 'quora':
        train, dev = datasets.quora_questions()
    elif dataset == 'snli':
        train, dev = datasets.snli()
    else:
        raise ValueError("Unknown dataset: %s" % dataset)

    train_X, train_y = preprocess(model.ops, nlp, train)
    dev_X, dev_y = preprocess(model.ops, nlp, dev)
    assert len(dev_y.shape) == 2

    print("Initialize with data (LSUV)")
    with model.begin_training(train_X[:5000], train_y[:5000], **cfg) as (trainer, optimizer):
        # Pass a callback to print progress. Give it all the local scope,
        # because why not?
        trainer.each_epoch.append(track_progress(**locals()))
        trainer.batch_size = min_batch_size
        batch_size = float(min_batch_size)
        print("Accuracy before training", model.evaluate(dev_X, dev_y))
        print("Train")
        global epoch_train_acc
        for X, y in trainer.iterate(train_X, train_y, progress_bar=not quiet):
            # Slightly useful trick: Decay the dropout as training proceeds.
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            # No auto-diff: Just get a callback and pass the data through.
            # Hardly a hardship, and it means we don't have to create/maintain
            # a computational graph. We just use closures.
            backprop(yh - y, optimizer)
            epoch_train_acc += (yh.argmax(axis=1) == y.argmax(axis=1)).sum()
            # Slightly useful trick: start with low batch size, accelerate.
            trainer.batch_size = min(int(batch_size), max_batch_size)
            batch_size *= 1.001

    if out_loc:
        out_loc = Path(out_loc)
        print('Saving to', out_loc)
        with out_loc.open('wb') as file_:
            pickle.dump(model, file_, -1)
def main(
    dataset="quora",
    width=64,
    depth=2,
    min_batch_size=1,
    max_batch_size=128,
    dropout=0.0,
    dropout_decay=0.0,
    pooling="mean+max",
    nb_epoch=20,
    pieces=3,
    use_gpu=False,
    out_loc=None,
    quiet=False,
):
    cfg = dict(locals())
    if out_loc:
        out_loc = Path(out_loc)
        if not out_loc.parent.exists():
            raise IOError("Can't open output location: %s" % out_loc)
    print(cfg)

    if pooling == "mean+max":
        pool_layer = Pooling(mean_pool, max_pool)
    elif pooling == "mean":
        pool_layer = mean_pool
    elif pooling == "max":
        pool_layer = max_pool
    else:
        raise ValueError("Unrecognised pooling", pooling)

    print("Load spaCy")
    nlp = get_spacy("en")
    # if use_gpu:
    #     Model.ops = CupyOps()

    print("Construct model")
    # Bind operators for the scope of the block:
    # * chain (>>): Compose models in a 'feed forward' style,
    #   i.e. chain(f, g)(x) -> g(f(x))
    # * clone (**): Create n copies of a model, and chain them, i.e.
    #   (f ** 3)(x) -> f''(f'(f(x))), where f, f' and f'' have distinct weights.
    # * concatenate (|): Merge the outputs of two models into a single vector,
    #   i.e. (f|g)(x) -> hstack(f(x), g(x))
    with Model.define_operators({">>": chain, "**": clone, "|": concatenate, "+": add}):
        mwe_encode = ExtractWindow(nW=1) >> Maxout(width, width * 3, pieces=pieces)
        embed = StaticVectors("en", width)  # + Embed(width, width*2, 5000)
        # Comments indicate the output type and shape at each step of the pipeline.
        # * B: Number of sentences in the batch
        # * T: Total number of words in the batch
        #   (i.e. sum(len(sent) for sent in batch))
        # * W: Width of the network (input hyper-parameter)
        # * ids: ID for each word (integers).
        # * lengths: Number of words in each sentence in the batch (integers)
        # * floats: Standard dense vector.
        # (Dimensions annotated in curly braces.)
        sent2vec = (  # List[spacy.token.Doc]{B}
            flatten_add_lengths  # : (ids{T}, lengths{B})
            >> with_getitem(
                0, embed >> mwe_encode ** depth  # : word_ids{T}
            )  # : (floats{T, W}, lengths{B})
            >> pool_layer
            >> Maxout(width, pieces=pieces)
            >> Maxout(width, pieces=pieces)
        )
        model = (
            ((Arg(0) >> sent2vec) | (Arg(1) >> sent2vec))
            >> Maxout(width, pieces=pieces)
            >> Maxout(width, pieces=pieces)
            >> Softmax(2)
        )

    print("Read and parse data: %s" % dataset)
    if dataset == "quora":
        train, dev = datasets.quora_questions()
    elif dataset == "snli":
        train, dev = datasets.snli()
    elif dataset == "stackxc":
        train, dev = datasets.stack_exchange()
    elif dataset in ("quora+snli", "snli+quora"):
        train, dev = datasets.quora_questions()
        train2, dev2 = datasets.snli()
        train.extend(train2)
        dev.extend(dev2)
    else:
        raise ValueError("Unknown dataset: %s" % dataset)

    get_ids = get_word_ids(Model.ops)
    train_X, train_y = preprocess(model.ops, nlp, train, get_ids)
    dev_X, dev_y = preprocess(model.ops, nlp, dev, get_ids)

    print("Initialize with data (LSUV)")
    print(dev_y.shape)
    with model.begin_training(train_X[:5000], train_y[:5000], **cfg) as (
        trainer,
        optimizer,
    ):
        # Pass a callback to print progress. Give it all the local scope,
        # because why not?
        trainer.each_epoch.append(track_progress(**locals()))
        trainer.batch_size = min_batch_size
        batch_size = float(min_batch_size)
        print("Accuracy before training", model.evaluate(dev_X, dev_y))
        print("Train")
        global epoch_train_acc
        for X, y in trainer.iterate(train_X, train_y, progress_bar=not quiet):
            # Slightly useful trick: Decay the dropout as training proceeds.
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            assert yh.shape == y.shape, (yh.shape, y.shape)
            # No auto-diff: Just get a callback and pass the data through.
            # Hardly a hardship, and it means we don't have to create/maintain
            # a computational graph. We just use closures.
            assert (yh >= 0.0).all()
            train_acc = (yh.argmax(axis=1) == y.argmax(axis=1)).sum()
            epoch_train_acc += train_acc
            backprop(yh - y, optimizer)
            # Slightly useful trick: start with low batch size, accelerate.
            trainer.batch_size = min(int(batch_size), max_batch_size)
            batch_size *= 1.001

    if out_loc:
        out_loc = Path(out_loc)
        print("Saving to", out_loc)
        with out_loc.open("wb") as file_:
            pickle.dump(model, file_, -1)
def main(loc=None, width=128, depth=2, max_batch_size=128, dropout=0.5,
         dropout_decay=1e-5, nb_epoch=30, use_gpu=False):
    cfg = dict(locals())
    print("Load spaCy")
    nlp = spacy.load('en', parser=False, entity=False, matcher=False, tagger=False)
    if use_gpu:
        Model.ops = CupyOps()

    print("Construct model")
    # Bind operators for the scope of the block:
    # * chain (>>): Compose models in a 'feed forward' style,
    #   i.e. chain(f, g)(x) -> g(f(x))
    # * clone (**): Create n copies of a model, and chain them, i.e.
    #   (f ** 3)(x) -> f''(f'(f(x))), where f, f' and f'' have distinct weights.
    # * concatenate (|): Merge the outputs of two models into a single vector,
    #   i.e. (f|g)(x) -> hstack(f(x), g(x))
    with Model.define_operators({'>>': chain, '**': clone, '|': concatenate}):
        # Important trick: text isn't like images, and the best way to use
        # convolution is different. Don't use pooling-over-time. Instead,
        # use the window to compute one vector per word, and do this N deep.
        # In the first layer, we adjust each word vector based on the two
        # surrounding words --- this gives us essentially trigram vectors.
        # In the next layer, we have a trigram of trigrams --- so we're
        # conditioning on information from a five word slice. The third layer
        # gives us 7 words. This is like the BiLSTM insight: we're not trying
        # to learn a vector for the whole sentence in this step. We're just
        # trying to learn better, position-sensitive word features. This simple
        # convolution step is much more efficient than BiLSTM, and can be
        # computed in parallel for every token in the batch.
        mwe_encode = ExtractWindow(nW=1) >> Maxout(width, width * 3)

        # Comments indicate the output type and shape at each step of the pipeline.
        # * B: Number of sentences in the batch
        # * T: Total number of words in the batch
        #   (i.e. sum(len(sent) for sent in batch))
        # * W: Width of the network (input hyper-parameter)
        # * ids: ID for each word (integers).
        # * lengths: Number of words in each sentence in the batch (integers)
        # * floats: Standard dense vector.
        # (Dimensions annotated in curly braces.)
        sent2vec = (  # List[spacy.token.Doc]{B}
            # get_word_ids  # : List[ids]{B}
            flatten_add_lengths  # : (ids{T}, lengths{B})
            >> with_getitem(0,  # : word_ids{T}
                # This class integrates a linear projection layer, and loads
                # static embeddings (by default, GloVe common crawl).
                SpacyVectors(nlp, width)  # : floats{T, W}
                >> mwe_encode ** depth  # : floats{T, W}
            )  # : (floats{T, W}, lengths{B})
            # Useful trick: Why choose between max pool and mean pool?
            # We may as well have both representations.
            >> Pooling(mean_pool, max_pool)  # : floats{B, 2*W}
        )
        model = (
            ((Arg(0) >> sent2vec) | (Arg(1) >> sent2vec))  # : floats{B, 4*W}
            >> Maxout(width, width * 4)  # : floats{B, W}
            >> Maxout(width, width) ** depth  # : floats{B, W}
            >> Softmax(3, width)  # : floats{B, 3}
        )

    print("Read and parse SNLI data")
    train, dev = datasets.snli(loc)
    train_X, train_y = preprocess(model.ops, nlp, train)
    dev_X, dev_y = preprocess(model.ops, nlp, dev)
    assert len(dev_y.shape) == 2

    print("Initialize with data (LSUV)")
    with model.begin_training(train_X[:10000], train_y[:10000], **cfg) as (trainer, optimizer):
        # Pass a callback to print progress. Give it all the local scope,
        # because why not?
        trainer.each_epoch.append(track_progress(**locals()))
        trainer.batch_size = 1
        batch_size = 1.
        print("Accuracy before training", model.evaluate(dev_X, dev_y))
        print("Train")
        global epoch_train_acc
        for X, y in trainer.iterate(train_X, train_y):
            # Slightly useful trick: Decay the dropout as training proceeds.
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            # No auto-diff: Just get a callback and pass the data through.
            # Hardly a hardship, and it means we don't have to create/maintain
            # a computational graph. We just use closures.
            backprop(yh - y, optimizer)
            epoch_train_acc += (yh.argmax(axis=1) == y.argmax(axis=1)).sum()
            # Slightly useful trick: start with low batch size, accelerate.
            trainer.batch_size = min(int(batch_size), max_batch_size)
            batch_size *= 1.001
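# A hedged, modern-thinc (v8) sketch of the training pattern these example
# scripts share (not from the original source): begin_update returns the
# prediction plus a backprop callback, the gradient of the loss is pushed back
# through the callback, and the optimizer applies the accumulated updates. The
# tiny Linear model and random data are stand-ins for the real models and
# datasets above.
import numpy
from thinc.api import Adam, Linear

model = Linear(nO=2, nI=4)
optimizer = Adam(0.001)
X = numpy.random.uniform(-1, 1, (16, 4)).astype("f")
Y = numpy.zeros((16, 2), dtype="f")
Y[numpy.arange(16), numpy.random.randint(0, 2, 16)] = 1.0
model.initialize(X=X, Y=Y)
for _ in range(5):
    Yh, backprop = model.begin_update(X)
    backprop(Yh - Y)                # gradient of a squared-error style loss
    model.finish_update(optimizer)  # apply the gradients collected above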
def build_text_classifier(nr_class, width=64, **cfg):
    nr_vector = cfg.get('nr_vector', 5000)
    pretrained_dims = cfg.get('pretrained_dims', 0)
    with Model.define_operators({'>>': chain, '+': add, '|': concatenate, '**': clone}):
        if cfg.get('low_data') and pretrained_dims:
            model = (
                SpacyVectors
                >> flatten_add_lengths
                >> with_getitem(0, Affine(width, pretrained_dims))
                >> ParametricAttention(width)
                >> Pooling(sum_pool)
                >> Residual(ReLu(width, width)) ** 2
                >> zero_init(Affine(nr_class, width, drop_factor=0.0))
                >> logistic
            )
            return model

        lower = HashEmbed(width, nr_vector, column=1)
        prefix = HashEmbed(width // 2, nr_vector, column=2)
        suffix = HashEmbed(width // 2, nr_vector, column=3)
        shape = HashEmbed(width // 2, nr_vector, column=4)

        trained_vectors = (
            FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID])
            >> with_flatten(
                uniqued(
                    (lower | prefix | suffix | shape)
                    >> LN(Maxout(width, width + (width // 2) * 3)),
                    column=0
                )
            )
        )

        if pretrained_dims:
            static_vectors = (
                SpacyVectors
                >> with_flatten(Affine(width, pretrained_dims))
            )
            # TODO Make concatenate support lists
            vectors = concatenate_lists(trained_vectors, static_vectors)
            vectors_width = width * 2
        else:
            vectors = trained_vectors
            vectors_width = width
            static_vectors = None

        cnn_model = (
            vectors
            >> with_flatten(
                LN(Maxout(width, vectors_width))
                >> Residual((ExtractWindow(nW=1) >> LN(Maxout(width, width * 3)))) ** 2,
                pad=2
            )
            >> flatten_add_lengths
            >> ParametricAttention(width)
            >> Pooling(sum_pool)
            >> Residual(zero_init(Maxout(width, width)))
            >> zero_init(Affine(nr_class, width, drop_factor=0.0))
        )

        linear_model = (
            _preprocess_doc
            >> LinearModel(nr_class)
        )
        # model = linear_model >> logistic

        model = (
            (linear_model | cnn_model)
            >> zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0))
            >> logistic
        )
        model.nO = nr_class
        model.lsuv = False
    return model
def main(
    dataset="quora",
    width=200,
    depth=2,
    min_batch_size=1,
    max_batch_size=512,
    dropout=0.2,
    dropout_decay=0.0,
    pooling="mean+max",
    nb_epoch=5,
    pieces=3,
    L2=0.0,
    use_gpu=False,
    out_loc=None,
    quiet=False,
    job_id=None,
    ws_api_url=None,
    rest_api_url=None,
):
    cfg = dict(locals())
    if out_loc:
        out_loc = Path(out_loc)
        if not out_loc.parent.exists():
            raise IOError("Can't open output location: %s" % out_loc)
    print(cfg)

    if pooling == "mean+max":
        pool_layer = Pooling(mean_pool, max_pool)
    elif pooling == "mean":
        pool_layer = mean_pool
    elif pooling == "max":
        pool_layer = max_pool
    else:
        raise ValueError("Unrecognised pooling", pooling)

    print("Load spaCy")
    nlp = get_spacy("en")
    if use_gpu:
        Model.ops = CupyOps()

    print("Construct model")
    # Bind operators for the scope of the block:
    # * chain (>>): Compose models in a 'feed forward' style,
    #   i.e. chain(f, g)(x) -> g(f(x))
    # * clone (**): Create n copies of a model, and chain them, i.e.
    #   (f ** 3)(x) -> f''(f'(f(x))), where f, f' and f'' have distinct weights.
    # * concatenate (|): Merge the outputs of two models into a single vector,
    #   i.e. (f|g)(x) -> hstack(f(x), g(x))
    Model.lsuv = True
    # Model.ops = CupyOps()
    with Model.define_operators({">>": chain, "**": clone, "|": concatenate, "+": add}):
        mwe_encode = ExtractWindow(nW=1) >> LN(
            Maxout(width, drop_factor=0.0, pieces=pieces)
        )

        sent2vec = (
            flatten_add_lengths
            >> with_getitem(
                0,
                (HashEmbed(width, 3000) | StaticVectors("en", width))
                >> LN(Maxout(width, width * 2))
                >> Residual(mwe_encode) ** depth,
            )  # : word_ids{T}
            >> Pooling(mean_pool, max_pool)
            >> Residual(LN(Maxout(width * 2, pieces=pieces), nO=width * 2)) ** 2
            >> logistic
        )
        model = Siamese(sent2vec, CauchySimilarity(width * 2))

    print("Read and parse data: %s" % dataset)
    if dataset == "quora":
        train, dev = datasets.quora_questions()
    elif dataset == "snli":
        train, dev = datasets.snli()
    elif dataset == "stackxc":
        train, dev = datasets.stack_exchange()
    elif dataset in ("quora+snli", "snli+quora"):
        train, dev = datasets.quora_questions()
        train2, dev2 = datasets.snli()
        train.extend(train2)
        dev.extend(dev2)
    else:
        raise ValueError("Unknown dataset: %s" % dataset)

    get_ids = get_word_ids(Model.ops)
    train_X, train_y = preprocess(model.ops, nlp, train, get_ids)
    dev_X, dev_y = preprocess(model.ops, nlp, dev, get_ids)

    with model.begin_training(train_X[:10000], train_y[:10000], **cfg) as (
        trainer,
        optimizer,
    ):
        # Pass a callback to print progress. Give it all the local scope,
        # because why not?
        trainer.each_epoch.append(track_progress(**locals()))
        trainer.batch_size = min_batch_size
        batch_size = float(min_batch_size)
        print("Accuracy before training", model.evaluate_logloss(dev_X, dev_y))
        print("Train")
        global epoch_train_acc
        n_iter = 0
        for X, y in trainer.iterate(train_X, train_y, progress_bar=not quiet):
            # Slightly useful trick: Decay the dropout as training proceeds.
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            assert yh.shape == y.shape, (yh.shape, y.shape)
            assert (yh >= 0.0).all(), yh
            train_acc = ((yh >= 0.5) == (y >= 0.5)).sum()
            loss = model.ops.xp.abs(yh - y).mean()
            epoch_train_acc += train_acc
            backprop(yh - y, optimizer)
            n_iter += 1
            # Slightly useful trick: start with low batch size, accelerate.
            trainer.batch_size = min(int(batch_size), max_batch_size)
            batch_size *= 1.001

    if out_loc:
        out_loc = Path(out_loc)
        print("Saving to", out_loc)
        with out_loc.open("wb") as file_:
            pickle.dump(model, file_, -1)