def _train_classifier(self, data_class, encoder_name, classifier_name):
    learn = text_classifier_learner(data_class, AWD_LSTM, drop_mult=0.5, pretrained=False)
    try:
        path = f'{classifier_name}'
        learn.load(path)
        self.log('Loaded pretrained classifier from {}'.format(str(path)))
        # return learn
    except FileNotFoundError:
        self.log(f'Training classifier `{classifier_name}`')
        learn.load_encoder(encoder_name)
        lr = 1e-1 / LR_DIV_FACTOR
        # train only the classifier head first
        learn.fit_one_cycle(1, lr, moms=(0.8, 0.7))
        # TODO remove
        # learn.save(classifier_name)
        # return learn
        # gradual unfreezing with discriminative learning rates
        learn.freeze_to(-2)
        lr /= 2
        learn.fit_one_cycle(1, slice(lr / (2.6**4), lr), moms=(0.8, 0.7))
        learn.freeze_to(-3)
        lr /= 2
        learn.fit_one_cycle(1, slice(lr / (2.6**4), lr), moms=(0.8, 0.7))
        learn.unfreeze()
        lr /= 5
        learn.fit_one_cycle(2, slice(lr / (2.6**4), lr), moms=(0.8, 0.7))
        self.log(f"Saving classifier `{classifier_name}`")
        learn.save(classifier_name)
    return learn
def train(bs):
    path = Path("./")
    data_lm = load_data(path, 'data_lm.pkl', bs=bs)
    print("data_lm loaded")
    data_clas = (TextList.from_csv(path, 'classifier.csv',
                                   cols=["summary", "description"],
                                   vocab=data_lm.vocab)
                 .split_from_df(col=3)
                 .label_from_df(cols=0)
                 .databunch(bs=bs))
    print("data_clas loaded")
    data_clas.show_batch()  # not sure how it will work
    data_clas.save('data_clas.pkl')
    learn = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.6)
    learn.load_encoder('fine_tuned_enc')
    lr_estimate = 1.0e-2
    learn.fit_one_cycle(1, lr_estimate, moms=(0.8, 0.7))
    learn.save('first')
    losses_fig = learn.recorder.plot_losses(return_fig=True)
    losses_fig.savefig("losses_001.jpg", dpi=600)
def _fit_class(self, df_train, df_val, data_lm):
    n_data = min(len(df_train), len(df_val))
    # Classifier model data
    data_class = TextClasDataBunch.from_df(
        path="",
        train_df=df_train,
        valid_df=df_val,
        vocab=data_lm.train_ds.vocab,
        bs=self.batch_size if self.batch_size < n_data else n_data // 2,
    )
    # train the learner object
    class_learner = text_classifier_learner(
        data_class, self.arch, drop_mult=self.dropout_lm
    )
    class_learner.load_encoder(self.path_lm.name)
    class_learner.fit_one_cycle(1, self.lr_class)
    class_learner.export(self.path_class)
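# Illustrative follow-up sketch (not part of the snippet above): loading the
# classifier exported via `class_learner.export(...)` for inference with fastai v1's
# `load_learner`. The directory and file name below are hypothetical stand-ins for
# whatever `self.path_class` pointed at.
from fastai.basic_train import load_learner

inference_learner = load_learner('models', 'classifier_export.pkl')  # hypothetical path
category, class_idx, probs = inference_learner.predict("example input text")
print(category, probs[class_idx].item())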
def _create_classifier_learner(self, data_lm, df_train, df_val, tokenizer, encoder_name):
    data_class = ftext.TextClasDataBunch.from_df(
        '', vocab=data_lm.train_ds.vocab, bs=32,
        train_df=df_train, valid_df=df_val,
        text_cols=0, label_cols=1,
        tokenizer=tokenizer, label_delim=' ')
    config = ftext.awd_lstm_clas_config.copy()
    config['n_hid'] = 1150
    learn = ftext.text_classifier_learner(data_class, ftext.AWD_LSTM,
                                          config=config, drop_mult=0.5)
    learn.load_encoder(encoder_name)
    return learn
import ujson as json
import spacy
import fastai
from pathlib import Path
from fastai.text import load_data, text_classifier_learner, AWD_LSTM
from typing import List, Tuple

bs = 48
path = Path('/app/model/sentiment')
data_clas = load_data(path, 'data_clas.pkl', bs=bs)
learn = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5)
learn.load('third')
learn.model.eval()
nlp = spacy.load('en_core_web_md')


def message_to_sentences(message: str, minimum_characters: int = 5) -> List[str]:
    text = message
    sentences = []
    for line in text.split('\n'):
        sentences += line.split('.')
    return [s for s in sentences if len(s) > minimum_characters]


def predict_sentiment(sentence: str) -> Tuple[str, float]:
    categorical, class_id, scores = learn.predict(sentence)
    score = round(scores[class_id].item(), 4)
    return "negative" if class_id == 0 else "positive", score


def model(message):
    sentence = json.loads(message)
    doc = nlp(sentence)
    entities = nlp(sentence).ents
SAMPLES_PER_CLASS = 12500

print('loading data')
texts = []
target = []
for class_index, classname in enumerate(CLASS_NAMES):
    for n, line in enumerate(open(DATA_FOLDER + classname + '.txt')):
        texts.append(preprocess_string(line, False))
        target.append(class_index)
        if n > SAMPLES_PER_CLASS:
            break

df = DataFrame({'label': target, 'text': texts})
df_train, df_val = train_test_split(df, stratify=df['label'], test_size=0.4, random_state=12)

data_lm = TextLMDataBunch.from_df(train_df=df_train, valid_df=df_val, path="")
data_clas = TextClasDataBunch.from_df(path="", train_df=df_train, valid_df=df_val,
                                      vocab=data_lm.train_ds.vocab, bs=32)

learn = language_model_learner(data_lm, pretrained_model=URLs.WT103, drop_mult=0.7)
learn.fit_one_cycle(1, 1e-2)
learn.save_encoder('ft_enc')

learn = text_classifier_learner(data_clas, drop_mult=0.7)
learn.load_encoder('ft_enc')
learn.fit_one_cycle(1, 1e-2)
lm_learner.recorder.plot(suggestion=True)
lm_learner.fit_one_cycle(1, lm_learner.recorder.min_grad_lr)
lm_learner.save_encoder(model)

text_clas = TextClasDataBunch.from_df(
    train_df=train_df,
    valid_df=valid_df,
    vocab=text_lm.train_ds.vocab,
    path="",
)
clf = text_classifier_learner(
    text_clas,
    arch=AWD_LSTM,
    drop_mult=0.2,
)
clf.load_encoder(model)

clf.lr_find()
clf.recorder.plot(suggestion=True)
clf.fit_one_cycle(1, clf.recorder.min_grad_lr)

print(lm_learner.predict("green bell"))
print(text_clas.train_ds.y.c2i)
print(clf.predict("wrap avocado beef"))
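# Illustrative only (not part of the snippet above): in fastai v1, `Learner.predict`
# returns a (category, class index, probability tensor) triple, so the raw prints
# above can be unpacked into a readable label/confidence pair like this.
category, class_idx, probs = clf.predict("wrap avocado beef")
print(f"predicted '{category}' with probability {probs[class_idx].item():.3f}")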
def new_train_clas(data_dir, lang='en', cuda_id=0, pretrain_name='wt103',
                   model_dir='models', qrnn=False, fine_tune=True, max_vocab=30000,
                   bs=20, bptt=70, name='imdb-clas', dataset='imdb', ds_pct=1.0):
    """
    :param data_dir: The path to the `data` directory
    :param lang: the language unicode
    :param cuda_id: The id of the GPU. Uses GPU 0 by default or no GPU when run on CPU.
    :param pretrain_name: name of the pretrained model
    :param model_dir: The path to the directory where the pretrained model is saved
    :param qrnn: Use a QRNN. Requires installing cupy.
    :param fine_tune: Fine-tune the pretrained language model
    :param max_vocab: The maximum size of the vocabulary.
    :param bs: The batch size.
    :param bptt: The back-propagation-through-time sequence length.
    :param name: The name used for both the model and the vocabulary.
    :param dataset: The dataset used for evaluation. Currently only IMDb and XNLI
                    are implemented. Assumes the dataset is located in the `data`
                    folder and that the folder name is the same as the dataset name.
    """
    results = {}
    if not torch.cuda.is_available():
        print('CUDA not available. Setting device=-1.')
        cuda_id = -1
    torch.cuda.set_device(cuda_id)

    print(f'Dataset: {dataset}. Language: {lang}.')
    assert dataset in DATASETS, f'Error: {dataset} processing is not implemented.'
    assert (dataset == 'imdb' and lang == 'en') or not dataset == 'imdb', \
        'Error: IMDb is only available in English.'

    data_dir = Path(data_dir)
    assert data_dir.name == 'data', \
        f'Error: Name of data directory should be data, not {data_dir.name}.'
    dataset_dir = data_dir / dataset
    model_dir = Path(model_dir)

    if qrnn:
        print('Using QRNNs...')
    model_name = 'qrnn' if qrnn else 'lstm'
    lm_name = f'{model_name}_{pretrain_name}'
    pretrained_fname = (lm_name, f'itos_{pretrain_name}')

    ensure_paths_exists(data_dir,
                        dataset_dir,
                        model_dir,
                        model_dir / f"{pretrained_fname[0]}.pth",
                        model_dir / f"{pretrained_fname[1]}.pkl")

    tmp_dir = dataset_dir / 'tmp'
    tmp_dir.mkdir(exist_ok=True)
    vocab_file = tmp_dir / f'vocab_{lang}.pkl'

    if not (tmp_dir / f'{TRN}_{lang}_ids.npy').exists():
        print('Reading the data...')
        toks, lbls = read_clas_data(dataset_dir, dataset, lang)
        # create the vocabulary
        counter = Counter(word for example in toks[TRN] for word in example)
        itos = [word for word, count in counter.most_common(n=max_vocab)]
        itos.insert(0, PAD)
        itos.insert(0, UNK)
        vocab = Vocab(itos)
        stoi = vocab.stoi
        with open(vocab_file, 'wb') as f:
            pickle.dump(vocab, f)
        ids = {}
        for split in [TRN, VAL, TST]:
            ids[split] = np.array([([stoi.get(w, stoi[UNK]) for w in s])
                                   for s in toks[split]])
            np.save(tmp_dir / f'{split}_{lang}_ids.npy', ids[split])
            np.save(tmp_dir / f'{split}_{lang}_lbl.npy', lbls[split])
    else:
        print('Loading the pickled data...')
        ids, lbls = {}, {}
        for split in [TRN, VAL, TST]:
            ids[split] = np.load(tmp_dir / f'{split}_{lang}_ids.npy')
            lbls[split] = np.load(tmp_dir / f'{split}_{lang}_lbl.npy')
        with open(vocab_file, 'rb') as f:
            vocab = pickle.load(f)

    print(f'Train size: {len(ids[TRN])}. Valid size: {len(ids[VAL])}. '
          f'Test size: {len(ids[TST])}.')

    if ds_pct < 1.0:
        print(f"Making the dataset smaller: {ds_pct}")
        for split in [TRN, VAL, TST]:
            ids[split] = ids[split][:int(len(ids[split]) * ds_pct)]

    data_lm = TextLMDataBunch.from_ids(path=tmp_dir, vocab=vocab, train_ids=ids[TRN],
                                       valid_ids=ids[VAL], bs=bs, bptt=bptt)

    # TODO TextClasDataBunch allows tst_ids as input, but not tst_lbls?
    data_clas = TextClasDataBunch.from_ids(path=tmp_dir, vocab=vocab,
                                           train_ids=ids[TRN], valid_ids=ids[VAL],
                                           train_lbls=lbls[TRN], valid_lbls=lbls[VAL],
                                           bs=bs)

    if qrnn:
        emb_sz, nh, nl = 400, 1550, 3
    else:
        emb_sz, nh, nl = 400, 1150, 3

    learn = language_model_learner(data_lm, bptt=bptt, emb_sz=emb_sz, nh=nh, nl=nl,
                                   qrnn=qrnn, pad_token=PAD_TOKEN_ID,
                                   pretrained_fnames=pretrained_fname,
                                   path=model_dir.parent, model_dir=model_dir.name)
    lm_enc_finetuned = f"{lm_name}_{dataset}_enc"
    if fine_tune and not (model_dir / f"{lm_enc_finetuned}.pth").exists():
        print('Fine-tuning the language model...')
        learn.unfreeze()
        learn.fit(2, slice(1e-4, 1e-2))
        # save encoder
        learn.save_encoder(lm_enc_finetuned)

    print("Starting classifier training")
    learn = text_classifier_learner(data_clas, bptt=bptt, pad_token=PAD_TOKEN_ID,
                                    path=model_dir.parent, model_dir=model_dir.name,
                                    qrnn=qrnn, emb_sz=emb_sz, nh=nh, nl=nl)
    learn.load_encoder(lm_enc_finetuned)

    # gradual unfreezing, following the ULMFiT training schedule
    learn.fit_one_cycle(1, 2e-2, moms=(0.8, 0.7), wd=1e-7)
    learn.freeze_to(-2)
    learn.fit_one_cycle(1, slice(1e-2 / (2.6**4), 1e-2), moms=(0.8, 0.7), wd=1e-7)
    learn.freeze_to(-3)
    learn.fit_one_cycle(1, slice(5e-3 / (2.6**4), 5e-3), moms=(0.8, 0.7), wd=1e-7)
    learn.unfreeze()
    learn.fit_one_cycle(2, slice(1e-3 / (2.6**4), 1e-3), moms=(0.8, 0.7), wd=1e-7)

    results['accuracy'] = learn.validate()[1]
    print(f"Saving models at {learn.path / learn.model_dir}")
    learn.save(f'{model_name}_{name}')
    return results
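# Hypothetical invocation sketch (not part of the original): how the training entry
# point above might be called once a `data/imdb` folder exists and the pretrained
# `lstm_wt103.pth` / `itos_wt103.pkl` files are in place under `models/`; the
# `ds_pct` value is only an example.
if __name__ == '__main__':
    results = new_train_clas('data', lang='en', model_dir='models',
                             dataset='imdb', bs=20, ds_pct=0.1)
    print(results['accuracy'])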