def main(loc, width=64, depth=2, batch_size=128, dropout=0.5,
         dropout_decay=1e-5, nb_epoch=20):
    print("Load spaCy")
    nlp = spacy.load('en', parser=False, entity=False, matcher=False,
                     tagger=False)
    print("Construct model")
    Model.ops = CupyOps()
    with Model.define_operators({'>>': chain, '**': clone, '|': concatenate}):
        mwe_encode = ExtractWindow(nW=1) >> Maxout(width, width * 3)
        sent2vec = (
            get_word_ids
            >> flatten_add_lengths
            >> with_getitem(0, SpacyVectors(nlp, width) >> mwe_encode ** depth)
            >> Pooling(mean_pool, max_pool)
        )
        model = (
            ((Arg(0) >> sent2vec) | (Arg(1) >> sent2vec))
            >> Maxout(width, width * 4)
            >> Maxout(width, width) ** depth
            >> Softmax(2, width)
        )
    print("Read and parse quora data")
    rows = read_quora_tsv_data(pathlib.Path(loc))
    train, dev = partition(rows, 0.9)
    train_X, train_y = create_data(model.ops, nlp, train)
    dev_X, dev_y = create_data(model.ops, nlp, dev)
    print("Train")
    with model.begin_training(train_X[:20000], train_y[:20000]) as (trainer, optimizer):
        trainer.batch_size = batch_size
        trainer.nb_epoch = nb_epoch
        trainer.dropout = dropout
        trainer.dropout_decay = dropout_decay
        epoch_times = [timer()]
        epoch_loss = [0.]
        n_train_words = sum(len(d0) + len(d1) for d0, d1 in train_X)
        n_dev_words = sum(len(d0) + len(d1) for d0, d1 in dev_X)

        def track_progress():
            stats = get_stats(model, optimizer.averages, dev_X, dev_y,
                              epoch_loss[-1], epoch_times[-1],
                              n_train_words, n_dev_words)
            stats.append(trainer.dropout)
            stats = tuple(stats)
            print(len(epoch_loss),
                  "%.3f loss, %.3f (%.3f) acc, %d/%d=%d wps train, "
                  "%d/%.3f=%d wps run. d.o.=%.3f" % stats)
            epoch_times.append(timer())
            epoch_loss.append(0.)

        trainer.each_epoch.append(track_progress)
        for X, y in trainer.iterate(train_X, train_y):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            backprop(yh - y, optimizer)
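
# How this main() would typically be invoked from the command line. A minimal
# entry-point sketch, assuming the plac argument parser conventionally used by
# these example scripts; `loc` should point at the Quora question-pairs TSV file.
if __name__ == '__main__':
    import plac
    plac.call(main)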
def main(dataset='quora', width=64, depth=2, min_batch_size=1,
         max_batch_size=128, dropout=0.0, dropout_decay=0.0,
         pooling="mean+max", nb_epoch=20, pieces=3, use_gpu=False,
         out_loc=None, quiet=False):
    cfg = dict(locals())
    if out_loc:
        out_loc = Path(out_loc)
        if not out_loc.parent.exists():
            raise IOError("Can't open output location: %s" % out_loc)
    print(cfg)
    if pooling == 'mean+max':
        pool_layer = Pooling(mean_pool, max_pool)
    elif pooling == "mean":
        pool_layer = mean_pool
    elif pooling == "max":
        pool_layer = max_pool
    else:
        raise ValueError("Unrecognised pooling", pooling)

    print("Load spaCy")
    nlp = get_spacy('en')
    #if use_gpu:
    #    Model.ops = CupyOps()

    print("Construct model")
    # Bind operators for the scope of the block:
    # * chain (>>): Compose models in a 'feed forward' style,
    #   i.e. chain(f, g)(x) -> g(f(x))
    # * clone (**): Create n copies of a model, and chain them, i.e.
    #   (f ** 3)(x) -> f''(f'(f(x))), where f, f' and f'' have distinct weights.
    # * concatenate (|): Merge the outputs of two models into a single vector,
    #   i.e. (f|g)(x) -> hstack(f(x), g(x))
    with Model.define_operators({'>>': chain, '**': clone, '|': concatenate,
                                 '+': add}):
        mwe_encode = ExtractWindow(nW=1) >> Maxout(width, width*3, pieces=pieces)
        embed = StaticVectors('en', width)  # + Embed(width, width*2, 5000)

        # Comments indicate the output type and shape at each step of the
        # pipeline.
        # * B: Number of sentences in the batch
        # * T: Total number of words in the batch
        #   (i.e. sum(len(sent) for sent in batch))
        # * W: Width of the network (input hyper-parameter)
        # * ids: ID for each word (integers).
        # * lengths: Number of words in each sentence in the batch (integers)
        # * floats: Standard dense vector.
        # (Dimensions annotated in curly braces.)
        sent2vec = (                    # List[spacy.token.Doc]{B}
            flatten_add_lengths         # : (ids{T}, lengths{B})
            >> with_getitem(0,          # : word_ids{T}
                embed >> mwe_encode ** depth
            )                           # : (floats{T, W}, lengths{B})
            >> pool_layer
            >> Maxout(width, pieces=pieces)
            >> Maxout(width, pieces=pieces)
        )
        model = (
            ((Arg(0) >> sent2vec) | (Arg(1) >> sent2vec))
            >> Maxout(width, pieces=pieces)
            >> Maxout(width, pieces=pieces)
            >> Softmax(2)
        )

    print("Read and parse data: %s" % dataset)
    if dataset == 'quora':
        train, dev = datasets.quora_questions()
    elif dataset == 'snli':
        train, dev = datasets.snli()
    elif dataset == 'stackxc':
        train, dev = datasets.stack_exchange()
    elif dataset in ('quora+snli', 'snli+quora'):
        train, dev = datasets.quora_questions()
        train2, dev2 = datasets.snli()
        train.extend(train2)
        dev.extend(dev2)
    else:
        raise ValueError("Unknown dataset: %s" % dataset)

    get_ids = get_word_ids(Model.ops)
    train_X, train_y = preprocess(model.ops, nlp, train, get_ids)
    dev_X, dev_y = preprocess(model.ops, nlp, dev, get_ids)

    print("Initialize with data (LSUV)")
    print(dev_y.shape)
    with model.begin_training(train_X[:5000], train_y[:5000], **cfg) as (trainer, optimizer):
        # Pass a callback to print progress. Give it all the local scope,
        # because why not?
        trainer.each_epoch.append(track_progress(**locals()))
        trainer.batch_size = min_batch_size
        batch_size = float(min_batch_size)
        print("Accuracy before training", model.evaluate(dev_X, dev_y))
        print("Train")
        global epoch_train_acc
        for X, y in trainer.iterate(train_X, train_y, progress_bar=not quiet):
            # Slightly useful trick: Decay the dropout as training proceeds.
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            assert yh.shape == y.shape, (yh.shape, y.shape)
            # No auto-diff: Just get a callback and pass the data through.
            # Hardly a hardship, and it means we don't have to create/maintain
            # a computational graph. We just use closures.
            assert (yh >= 0.).all()
            train_acc = (yh.argmax(axis=1) == y.argmax(axis=1)).sum()
            epoch_train_acc += train_acc
            backprop(yh - y, optimizer)
            # Slightly useful trick: start with low batch size, accelerate.
            trainer.batch_size = min(int(batch_size), max_batch_size)
            batch_size *= 1.001
    if out_loc:
        out_loc = Path(out_loc)
        print('Saving to', out_loc)
        with out_loc.open('wb') as file_:
            pickle.dump(model, file_, -1)
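
# Sketch of reloading a model saved via the out_loc branch above, so the trained
# weights can be reused without retraining. The helper name is illustrative, not
# part of the example itself; it only relies on the pickle/Path calls used above.
def load_model(out_loc):
    with Path(out_loc).open('rb') as file_:
        return pickle.load(file_)

# Usage (assuming dev_X, dev_y were prepared with the same preprocess pipeline):
#   model = load_model('quora_model.pickle')
#   print(model.evaluate(dev_X, dev_y))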
def main(loc=None, width=128, depth=2, max_batch_size=128, dropout=0.5,
         dropout_decay=1e-5, nb_epoch=30, use_gpu=False):
    cfg = dict(locals())
    print("Load spaCy")
    nlp = spacy.load('en', parser=False, entity=False, matcher=False,
                     tagger=False)
    if use_gpu:
        Model.ops = CupyOps()
    print("Construct model")
    # Bind operators for the scope of the block:
    # * chain (>>): Compose models in a 'feed forward' style,
    #   i.e. chain(f, g)(x) -> g(f(x))
    # * clone (**): Create n copies of a model, and chain them, i.e.
    #   (f ** 3)(x) -> f''(f'(f(x))), where f, f' and f'' have distinct weights.
    # * concatenate (|): Merge the outputs of two models into a single vector,
    #   i.e. (f|g)(x) -> hstack(f(x), g(x))
    with Model.define_operators({'>>': chain, '**': clone, '|': concatenate}):
        # Important trick: text isn't like images, and the best way to use
        # convolution is different. Don't use pooling-over-time. Instead,
        # use the window to compute one vector per word, and do this N deep.
        # In the first layer, we adjust each word vector based on the two
        # surrounding words --- this gives us essentially trigram vectors.
        # In the next layer, we have a trigram of trigrams --- so we're
        # conditioning on information from a five word slice. The third layer
        # gives us 7 words. This is like the BiLSTM insight: we're not trying
        # to learn a vector for the whole sentence in this step. We're just
        # trying to learn better, position-sensitive word features. This simple
        # convolution step is much more efficient than BiLSTM, and can be
        # computed in parallel for every token in the batch.
        mwe_encode = ExtractWindow(nW=1) >> Maxout(width, width * 3)

        # Comments indicate the output type and shape at each step of the
        # pipeline.
        # * B: Number of sentences in the batch
        # * T: Total number of words in the batch
        #   (i.e. sum(len(sent) for sent in batch))
        # * W: Width of the network (input hyper-parameter)
        # * ids: ID for each word (integers).
        # * lengths: Number of words in each sentence in the batch (integers)
        # * floats: Standard dense vector.
        # (Dimensions annotated in curly braces.)
        sent2vec = (                        # List[spacy.token.Doc]{B}
            #get_word_ids                   # : List[ids]{B}
            flatten_add_lengths             # : (ids{T}, lengths{B})
            >> with_getitem(0,              # : word_ids{T}
                # This class integrates a linear projection layer, and loads
                # static embeddings (by default, GloVe common crawl).
                SpacyVectors(nlp, width)    # : floats{T, W}
                >> mwe_encode ** depth      # : floats{T, W}
            )                               # : (floats{T, W}, lengths{B})
            # Useful trick: Why choose between max pool and mean pool?
            # We may as well have both representations.
            >> Pooling(mean_pool, max_pool)                 # : floats{B, 2*W}
        )
        model = (
            ((Arg(0) >> sent2vec) | (Arg(1) >> sent2vec))   # : floats{B, 4*W}
            >> Maxout(width, width * 4)                     # : floats{B, W}
            >> Maxout(width, width) ** depth                # : floats{B, W}
            >> Softmax(3, width)                            # : floats{B, 3}
        )

    print("Read and parse SNLI data")
    train, dev = datasets.snli(loc)
    train_X, train_y = preprocess(model.ops, nlp, train)
    dev_X, dev_y = preprocess(model.ops, nlp, dev)
    assert len(dev_y.shape) == 2
    print("Initialize with data (LSUV)")
    with model.begin_training(train_X[:10000], train_y[:10000], **cfg) as (trainer, optimizer):
        # Pass a callback to print progress. Give it all the local scope,
        # because why not?
        trainer.each_epoch.append(track_progress(**locals()))
        trainer.batch_size = 1
        batch_size = 1.
        print("Accuracy before training", model.evaluate(dev_X, dev_y))
        print("Train")
        global epoch_train_acc
        for X, y in trainer.iterate(train_X, train_y):
            # Slightly useful trick: Decay the dropout as training proceeds.
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            # No auto-diff: Just get a callback and pass the data through.
            # Hardly a hardship, and it means we don't have to create/maintain
            # a computational graph. We just use closures.
            backprop(yh - y, optimizer)
            epoch_train_acc += (yh.argmax(axis=1) == y.argmax(axis=1)).sum()
            # Slightly useful trick: start with low batch size, accelerate.
            trainer.batch_size = min(int(batch_size), max_batch_size)
            batch_size *= 1.001
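
# For reference, the batch-size acceleration used in the training loops above
# follows this schedule: start small, grow by 0.1% per update, cap at
# max_batch_size. A standalone sketch of the same arithmetic (the helper name
# is illustrative only).
def batch_size_schedule(min_batch_size=1, max_batch_size=128, growth=1.001):
    size = float(min_batch_size)
    while True:
        yield min(int(size), max_batch_size)
        size *= growth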