# Linear bag-of-words classifier on the IMDB sentiment dataset (thinc trainer API).
def main():
    train, dev = datasets.imdb()
    train_X, train_y = zip(*train)
    dev_X, dev_y = zip(*dev)
    model = LinearModel(2)
    train_y = to_categorical(train_y, nb_classes=2)
    dev_y = to_categorical(dev_y, nb_classes=2)
    nlp = spacy.load("en")
    # Represent each document as an array of token hash IDs.
    train_X = [
        model.ops.asarray([tok.orth for tok in doc], dtype="uint64")
        for doc in nlp.pipe(train_X)
    ]
    dev_X = [
        model.ops.asarray([tok.orth for tok in doc], dtype="uint64")
        for doc in nlp.pipe(dev_X)
    ]
    dev_X = preprocess(model.ops, dev_X)
    with model.begin_training(train_X, train_y, L2=1e-6) as (trainer, optimizer):
        trainer.dropout = 0.0
        trainer.batch_size = 512
        trainer.nb_epoch = 3
        trainer.each_epoch.append(lambda: print(model.evaluate(dev_X, dev_y)))
        for X, y in trainer.iterate(train_X, train_y):
            keys_vals_lens = preprocess(model.ops, X)
            scores, backprop = model.begin_update(keys_vals_lens, drop=trainer.dropout)
            backprop(scores - y, optimizer)
    # Evaluate with the averaged parameters.
    with model.use_params(optimizer.averages):
        print(model.evaluate(dev_X, dev_y))
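
# Variant: Keras/gensim pipeline for the same IMDB task. The raw reviews are
# cleaned with gensim's simple_preprocess, indexed with a Keras Tokenizer, and
# fed to an LSTM classifier.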
def train(self, epochs=3, batch_size=128, val_split=0.2):
    im, db = dink.imdb()
    train = im + db
    train_raw, train_y = zip(*train)
    train_prep = [' '.join(simple_preprocess(review)) for review in train_raw]
    tokenizer = Tokenizer(num_words=self._NVOCAB)
    tokenizer.fit_on_texts(train_prep)
    train_idx = tokenizer.texts_to_sequences(train_prep)
    train_seq = pad_sequences(train_idx, maxlen=self._MAXLEN)
    model = Sequential()
    model.add(Embedding(self._NVOCAB + 1, self._EDIM,
                        input_length=self._MAXLEN, mask_zero=True))
    model.add(LSTM(100, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='rmsprop', loss='binary_crossentropy',
                  metrics=['acc'])
    # Use the method arguments rather than re-hardcoding their defaults.
    model.fit(train_seq, list(train_y), epochs=epochs,
              batch_size=batch_size, validation_split=val_split)
    self.model = model
    self.tokenizer = tokenizer
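
# Variant: thinc CNN text classifier over spaCy feature arrays, with compounding
# batch sizes and per-epoch loss/variance reporting on a held-out dev split.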
def main(use_gpu=False, nb_epoch=100):
    if use_gpu:
        Model.ops = CupyOps()
        Model.Ops = CupyOps
    train, test = datasets.imdb(limit=2000)
    print("Load data")
    train_X, train_y = zip(*train)
    test_X, test_y = zip(*test)
    train_y = Model.ops.asarray(to_categorical(train_y, nb_classes=2))
    test_y = Model.ops.asarray(to_categorical(test_y, nb_classes=2))
    nlp = spacy.load('en_vectors_web_lg')
    nlp.add_pipe(nlp.create_pipe('sentencizer'), first=True)
    preprocessor = FeatureExtracter([ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID])
    train_X = [preprocessor(list(doc.sents))
               for doc in tqdm.tqdm(nlp.pipe(train_X))]
    test_X = [preprocessor(list(doc.sents))
              for doc in tqdm.tqdm(nlp.pipe(test_X))]
    dev_X = train_X[-1000:]
    dev_y = train_y[-1000:]
    train_X = train_X[:-1000]
    train_y = train_y[:-1000]
    print("Parse data")
    n_sent = sum(len(list(sents)) for sents in train_X)
    print("%d sentences" % n_sent)
    model = build_model(2, width=128, conv_depth=2, depth=2,
                        train_X=train_X, train_y=train_y)
    with model.begin_training(train_X[:100], train_y[:100]) as (trainer, optimizer):
        # Per-epoch loss and loss variance; both are read by the
        # report_progress callback, so define them before it runs.
        epoch_loss = [0.]
        epoch_var = [0.]

        def report_progress():
            with model.use_params(optimizer.averages):
                print(epoch_loss[-1], epoch_var[-1],
                      model.evaluate(dev_X, dev_y), trainer.dropout)
            epoch_loss.append(0.)
            epoch_var.append(0.)

        trainer.each_epoch.append(report_progress)
        batch_sizes = compounding(64, 64, 1.01)
        trainer.dropout = 0.3
        trainer.batch_size = int(next(batch_sizes))
        trainer.dropout_decay = 0.0
        trainer.nb_epoch = nb_epoch
        # optimizer.alpha = 0.1
        # optimizer.max_grad_norm = 10.0
        # optimizer.b1 = 0.0
        # optimizer.b2 = 0.0
        for X, y in trainer.iterate(train_X, train_y):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            losses = ((yh - y) ** 2.).sum(axis=1) / y.shape[0]
            epoch_var[-1] += losses.var()
            loss = losses.mean()
            backprop((yh - y) / yh.shape[0], optimizer)
            epoch_loss[-1] += loss
            trainer.batch_size = int(next(batch_sizes))
    with model.use_params(optimizer.averages):
        print('Avg dev.: %.3f' % model.evaluate(dev_X, dev_y))
def main(use_gpu=False, nb_epoch=50):
    if use_gpu:
        Model.ops = CupyOps()
        Model.Ops = CupyOps
    train, test = datasets.imdb()
    print("Load data")
    train_X, train_y = zip(*train)
    test_X, test_y = zip(*test)
    train_y = to_categorical(train_y, nb_classes=2)
    test_y = to_categorical(test_y, nb_classes=2)
    nlp = Language()
    dev_X = train_X[-1000:]
    dev_y = train_y[-1000:]
    train_X = train_X[:-1000]
    train_y = train_y[:-1000]
    print("Parse data")
    train_X = [nlp.make_doc(x) for x in train_X]
    dev_X = [nlp.make_doc(x) for x in dev_X]
    model = build_model(2, 1)
    print("Begin training")
    with model.begin_training(train_X, train_y, L2=1e-6) as (trainer, optimizer):
        epoch_loss = [0.]

        def report_progress():
            with model.use_params(optimizer.averages):
                print(epoch_loss[-1], model.evaluate(dev_X, dev_y), trainer.dropout)
            epoch_loss.append(0.)

        trainer.each_epoch.append(report_progress)
        trainer.nb_epoch = nb_epoch
        trainer.dropout = 0.0
        trainer.batch_size = 128
        trainer.dropout_decay = 0.0
        for X, y in trainer.iterate(train_X[:1000], train_y[:1000]):
            yh, backprop = model.begin_update(X, drop=trainer.dropout)
            loss = ((yh - y) ** 2.).sum() / y.shape[0]
            backprop((yh - y) / y.shape[0], optimizer)
            epoch_loss[-1] += loss
    with model.use_params(optimizer.averages):
        print('Avg dev.: %.3f' % model.evaluate(dev_X, dev_y))
    with open('out.pickle', 'wb') as file_:
        pickle.dump(model, file_, -1)
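
# Variant: best-first hyper-parameter search. Each starting config is trained
# for one epoch and enqueued; the search then repeatedly pops candidates,
# trains another epoch, and re-enqueues them with their new dev accuracy.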
def main(use_gpu=False):
    if use_gpu:
        Model.ops = CupyOps()
        Model.Ops = CupyOps
    train, test = datasets.imdb()
    print("Load data")
    train_X, train_y = zip(*train)
    test_X, test_y = zip(*test)
    train_y = to_categorical(train_y, nb_classes=2)
    test_y = to_categorical(test_y, nb_classes=2)
    nlp = spacy.load('en')
    nlp.vocab.lex_attr_getters[PREFIX] = lambda string: string[:3]
    for word in nlp.vocab:
        word.prefix_ = word.orth_[:3]
    dev_X = train_X[-1000:]
    dev_y = train_y[-1000:]
    train_X = train_X[:-1000]
    train_y = train_y[:-1000]
    # train_X = train_X[:1000]
    # train_y = train_y[:1000]
    print("Parse data")
    train_X = list(nlp.pipe(train_X))
    dev_X = list(nlp.pipe(dev_X))
    n_sent = sum(len(list(doc.sents)) for doc in train_X)
    print("%d sentences" % n_sent)
    hpsearch = BestFirstFinder(nonlin=[SELU], width=[64], depth=[2],
                               conv_depth=[2], batch_size=[128],
                               learn_rate=[0.001], L2=[1e-6], beta1=[0.9],
                               beta2=[0.999], dropout=[0.2])
    for hp in hpsearch.configs:
        for _ in range(3):
            model = build_model(2, train_X=train_X, train_y=train_y, **hp)
            with model.begin_training(train_X[:100], train_y[:100]) as (_, sgd):
                pass
            _, (model_data, train_acc, dev_acc) = train_epoch(
                model, sgd, hp, train_X, train_y, dev_X, dev_y,
                device_id=-1 if not use_gpu else 0)
            print('0', dev_acc * 100, train_acc * 100, hp)
            hpsearch.enqueue(model_data, train_acc, dev_acc)
    hpsearch.temperature = 0.0
    print("Train")
    total = 0
    temperature = 0.0
    while True:
        for model, sgd, hp in hpsearch:
            _, (new_model, train_acc, dev_acc) = train_epoch(
                model, sgd, hp, train_X, train_y, dev_X, dev_y,
                device_id=-1 if not use_gpu else 0,
                temperature=hpsearch.temperature)
            hp = new_model[-1]
            print('%d,%d,%d:\t%.2f\t%.2f\t%.2f\t%d\t%.2f\t%.3f\t%d\t%d\t%.3f\t%.3f\t%.3f' % (
                total, hp['epochs'], hp['parent'],
                hpsearch.best_acc * 100, dev_acc * 100, train_acc * 100,
                int(hp['batch_size']), hp['dropout'], hp['learn_rate'],
                hp['width'], hp['depth'], hpsearch.temperature,
                hpsearch.queue[0][0], hpsearch.queue[-1][0]))
            total += 1
            hpsearch.enqueue(new_model, train_acc, dev_acc)
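
# Variant: attention-based categorizer. Token features (ORTH/SHAPE/PREFIX/SUFFIX)
# are embedded, combined with a position encoding, and passed to a
# Transformer-style Categorizer; training uses the thinc trainer API.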
def main(nH=6, dropout=0.0, nS=6, nB=32, nE=20, use_gpu=-1, lim=2000,
         nM=300, mL=100, save=False, save_name="model.pkl"):
    if use_gpu != -1:
        # TODO: Make specific to different devices, e.g. 1 vs 0
        spacy.require_gpu()
        device = 'cuda'
    else:
        device = 'cpu'
    # Read dataset
    nlp = spacy.load('en_core_web_sm')
    for control_token in ("<eos>", "<bos>", "<pad>", "<cls>"):
        nlp.tokenizer.add_special_case(control_token, [{ORTH: control_token}])
    train, dev = imdb(limit=lim)
    print('Loaded imdb dataset')
    train = train[:lim]
    dev = dev[:lim]
    train_X, train_Y = zip(*train)
    dev_X, dev_Y = zip(*dev)
    train_X = spacy_tokenize(nlp.tokenizer, train_X, mL=mL)
    dev_X = spacy_tokenize(nlp.tokenizer, dev_X, mL=mL)
    print('Tokenized dataset')
    train_X = set_numeric_ids(nlp.vocab, train_X)
    dev_X = set_numeric_ids(nlp.vocab, dev_X)
    print('Numeric ids ready')
    with Model.define_operators({">>": chain}):
        embed_cols = [ORTH, SHAPE, PREFIX, SUFFIX]
        extractor = FeatureExtracter(attrs=embed_cols)
        position_encode = PositionEncode(mL, nM)
        model = (
            extractor
            >> with_flatten(FancyEmbed(nM, 5000, cols=embed_cols))
            >> Residual(position_encode)
            >> create_model_input()
            >> Categorizer(nM=nM, nS=nS, nH=nH, device=device)
        )
    losses = [0.]
    train_accuracies = [0.]
    train_totals = [0.]
    dev_accuracies = [0.]
    dev_loss = [0.]

    def track_progress():
        correct = 0.
        total = 0.
        for batch in minibatch(zip(dev_X, dev_Y), size=1024):
            X, Y = zip(*batch)
            Yh = model(X)
            L, C = get_loss(Yh, Y)
            correct += C
            dev_loss[-1] += (L ** 2).sum()
            total += len(X)
        dev_accuracies[-1] = correct / total
        n_train = train_totals[-1]
        print(len(losses), losses[-1], train_accuracies[-1] / n_train,
              dev_loss[-1], dev_accuracies[-1])
        dev_loss.append(0.)
        losses.append(0.)
        train_accuracies.append(0.)
        dev_accuracies.append(0.)
        train_totals.append(0.)

    with model.begin_training(batch_size=nB, nb_epoch=nE) as (trainer, optimizer):
        trainer.dropout = dropout
        trainer.dropout_decay = 1e-4
        trainer.each_epoch.append(track_progress)
        optimizer.alpha = 0.001
        optimizer.L2 = 1e-6
        optimizer.max_grad_norm = 1.0
        other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]  # unused
        for X, Y in trainer.iterate(train_X, train_Y):
            Yh, backprop = model.begin_update(X)
            dYh, C = get_loss(Yh, Y)
            backprop(dYh, sgd=optimizer)
            losses[-1] += (dYh ** 2).sum()
            train_accuracies[-1] += C
            train_totals[-1] += len(Y)
    if save:
        model.to_disk(save_name)