def batch_train_increment(dataset, input_model=None, output_model=None,
                          lang='en', factor=1, dropout=0.2, n_iter=1,
                          batch_size=10, eval_id=None, eval_split=None,
                          long_text=False, silent=False, shuffle=False,
                          gpu_id=None):
    """
    Incrementally train a text classification model, one batch at a time.

    The training examples are walked through exactly once in batches of
    ``batch_size``. Every batch is fed to ``model.update`` ``n_iter`` times
    in a row and, when evaluation examples are available, the model is
    evaluated after every update and a progress line is printed. Unlike
    ``batch_train`` nothing is written to disk.

    dataset: ID of the dataset holding the training annotations.
    input_model: Model to load with ``spacy.load``. When ``None``, a
        FastText-based categorizer is built on top of ``en_core_web_lg``.
    output_model: Unused by this function.
    lang: Unused by this function.
    factor: Fraction of the training examples to keep (a leading slice).
    dropout: Dropout rate forwarded to ``model.update``.
    n_iter: Number of consecutive updates performed on each batch.
    batch_size: Number of examples per batch.
    eval_id: ID of a dataset to evaluate on. When falsy, ``split_evals``
        carves the evaluation set out of the training examples.
    eval_split: Forwarded to ``split_evals`` when ``eval_id`` is not given.
    long_text: Forwarded to ``TextClassifier``; when true the examples are
        also passed through ``split_sentences`` before training.
    silent: Forwarded to ``get_print`` (presumably mutes ``print_`` output;
        plain ``print`` calls are not affected).
    shuffle: Shuffle the training examples before training.
    gpu_id: Device id handed to ``spacy.util.use_gpu``; ``None`` skips it.

    RETURNS: The result of the last ``model.evaluate`` call, or ``None``
        when no evaluation was ever run (no evaluation examples, no
        training examples, or ``n_iter`` of 0).
    """
    # Compare against None instead of relying on truthiness: device id 0 is
    # a valid GPU and must not be skipped.
    if gpu_id is not None:
        spacy.util.use_gpu(gpu_id)
    if n_iter == 1:
        print("one pass mode")
    print("batch_size", batch_size)
    print(factor, type(factor))
    DB = connect()
    print_ = get_print(silent)
    random.seed(0)
    if input_model is not None:
        nlp = spacy.load(input_model, disable=['ner'])
        print_('\nLoaded model {}'.format(input_model))
    else:
        print("build your customized model")
        nlp = spacy.load('en_core_web_lg')
        # NOTE(review): vocab_size / emb_dim have to match the shape of
        # nlp.vocab.vectors.data, otherwise copy_() below fails.
        pt_model = FastText(vocab_size=684831, emb_dim=300)
        pt_model.embeds.weight.data.copy_(
            torch.from_numpy(nlp.vocab.vectors.data))
        textcat = Loss_TextCategorizer(nlp.vocab, PyTorchWrapper(pt_model))
        nlp.add_pipe(textcat)
    examples = DB.get_dataset(dataset)
    labels = sorted({eg['label'] for eg in examples})
    print(labels)
    model = TextClassifier(nlp, labels, long_text=long_text,
                           low_data=len(examples) < 1000)
    if shuffle:
        print("it's shuffling")
        random.shuffle(examples)
    else:
        print("it's not shuffling")
    if eval_id:
        evals = DB.get_dataset(eval_id)
        print_("Loaded {} evaluation examples from '{}'"
               .format(len(evals), eval_id))
    else:
        examples, evals, eval_split = split_evals(examples, eval_split)
        print_("Using {}% of examples ({}) for evaluation"
               .format(round(eval_split * 100), len(evals)))
    if shuffle:
        random.shuffle(examples)
    examples = examples[:int(len(examples) * factor)]
    print_(printers.trainconf(dropout, n_iter, batch_size, factor,
                              len(examples)))
    has_evals = len(evals) > 0
    if has_evals:
        print_(printers.tc_update_header())
    if long_text:
        examples = list(split_sentences(nlp, examples, min_length=False))
    # Ceiling division so a trailing partial batch is counted in the total.
    total_batches = (len(examples) + batch_size - 1) // batch_size
    # Stays None if evaluation never runs, so the final return cannot hit an
    # unbound local.
    acc = None
    start_time = datetime.now()
    batches = cytoolz.partition_all(batch_size,
                                    tqdm.tqdm(examples, leave=False))
    for batch_idx, batch in enumerate(batches):
        batch = list(batch)
        for i in range(n_iter):
            loss = model.update(batch, revise=False, drop=dropout)
            # NOTE(review): evaluation runs after every single update;
            # confirm this is the intended granularity.
            if has_evals:
                with nlp.use_params(model.optimizer.averages):
                    acc = model.evaluate(tqdm.tqdm(evals, leave=False))
                elapsed = datetime.now() - start_time
                # total_seconds() keeps counting past 24h, where .seconds
                # would wrap around.
                print('Time:[{0} seconds], Epoch: [{1}/{2}], batch: [{3}/{4}], Loss:{5}, Accuracy:{6}'.format(
                    int(elapsed.total_seconds()), i + 1, n_iter,
                    batch_idx + 1, total_batches, loss, acc['accuracy']))
    return acc
def batch_train(dataset, input_model=None, output_model=None, lang='en',
                factor=1, dropout=0.2, n_iter=10, batch_size=10,
                eval_id=None, eval_split=None, long_text=False, silent=False,
                shuffle=False):
    """
    Train a text classification model from annotations over several epochs.

    Each of the ``n_iter`` epochs reshuffles the training examples, updates
    the model batch by batch and, when evaluation examples exist, scores the
    model using the optimizer's averaged parameters. The serialized pipeline
    with the highest accuracy is remembered and, if ``output_model`` is
    given, restored and handed to ``export_model_data`` together with the
    training and evaluation examples.

    Evaluation data comes either from the dataset named by ``eval_id`` or,
    when that is falsy, from ``split_evals`` applied to the training
    examples with ``eval_split``.

    ``lang`` is accepted but not used by this function.

    RETURNS: The best accuracy seen during training (0 if no evaluation
        ever improved on the initial value).
    """
    print("batch_size", batch_size)
    print(factor, type(factor))
    database = connect()
    say = get_print(silent)
    random.seed(0)

    if input_model is None:
        # No model supplied: wrap a FastText network initialised from the
        # large English vectors and register it as the categorizer.
        print("build your customized model")
        nlp = spacy.load('en_core_web_lg')
        fasttext_net = FastText(vocab_size=684831, emb_dim=300)
        pretrained = torch.from_numpy(nlp.vocab.vectors.data)
        fasttext_net.embeds.weight.data.copy_(pretrained)
        wrapped = PyTorchWrapper(fasttext_net)
        nlp.add_pipe(TextCategorizer(nlp.vocab, wrapped))
    else:
        nlp = spacy.load(input_model, disable=['ner'])
        say('\nLoaded model {}'.format(input_model))

    examples = database.get_dataset(dataset)
    labels = sorted({example['label'] for example in examples})
    print(labels)
    model = TextClassifier(nlp, labels, long_text=long_text,
                           low_data=len(examples) < 1000)

    if not shuffle:
        print("it's not shuffling")
    else:
        print("it's shuffling")
        random.shuffle(examples)

    if not eval_id:
        examples, evals, eval_split = split_evals(examples, eval_split)
        percent = round(eval_split * 100)
        say("Using {}% of examples ({}) for evaluation"
            .format(percent, len(evals)))
    else:
        evals = database.get_dataset(eval_id)
        say("Loaded {} evaluation examples from '{}'"
            .format(len(evals), eval_id))

    if shuffle:
        random.shuffle(examples)
    keep = int(len(examples) * factor)
    examples = examples[:keep]
    say(printers.trainconf(dropout, n_iter, batch_size, factor,
                           len(examples)))
    if len(evals) > 0:
        say(printers.tc_update_header())

    top_scores = {'accuracy': 0}
    top_weights = None
    if long_text:
        examples = list(split_sentences(nlp, examples, min_length=False))

    for epoch in range(n_iter):
        loss = 0.
        random.shuffle(examples)
        progress = tqdm.tqdm(examples, leave=False)
        for chunk in cytoolz.partition_all(batch_size, progress):
            loss += model.update(list(chunk), revise=False, drop=dropout)
        if len(evals) == 0:
            continue
        # Score (and snapshot) the model while the averaged parameters are
        # swapped in.
        with nlp.use_params(model.optimizer.averages):
            acc = model.evaluate(tqdm.tqdm(evals, leave=False))
            if acc['accuracy'] > top_scores['accuracy']:
                top_scores = dict(acc)
                top_weights = nlp.to_bytes()
        say(printers.tc_update(epoch, loss, acc))

    if len(evals) > 0:
        say(printers.tc_result(top_scores))
    if output_model is not None:
        if top_weights is not None:
            nlp = nlp.from_bytes(top_weights)
        say(export_model_data(output_model, nlp, examples, evals))
    return top_scores['accuracy']