from pathlib import Path

from fastai.text import load_data, TextList, text_classifier_learner, AWD_LSTM


def train(bs):
    path = Path("./")
    data_lm = load_data(path, 'data_lm.pkl', bs=bs)
    print("data_lm loaded")
    # Build the classification databunch, reusing the LM vocab so that
    # token ids match the fine-tuned encoder.
    data_clas = (TextList.from_csv(path, 'classifier.csv',
                                   cols=["summary", "description"],
                                   vocab=data_lm.vocab)
                 .split_from_df(col=3)
                 .label_from_df(cols=0)
                 .databunch(bs=bs))
    print("data_clas loaded")
    data_clas.show_batch()  # not sure how it will work
    data_clas.save('data_clas.pkl')
    learn = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.6)
    learn.load_encoder('fine_tuned_enc')
    lr_estimate = 1.0e-2
    learn.fit_one_cycle(1, lr_estimate, moms=(0.8, 0.7))
    learn.save('first')
    losses_fig = learn.recorder.plot_losses(return_fig=True)
    losses_fig.savefig("losses_001.jpg", dpi=600)
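# Aside (not in the original snippet): the hard-coded lr_estimate above is
# usually picked with fastai v1's LR finder rather than guessed. A minimal
# sketch, assuming a fastai Learner like the one built in train():

def estimate_lr(learn):
    # Run the LR range test; inspect the plot and pick a rate on the
    # steepest downward slope of the loss curve.
    learn.lr_find()
    learn.recorder.plot()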
import json
import pickle
from pathlib import Path

import torch
from fastai.text import load_data, AWD_LSTM


def finetune_lm(data_dir, model_dir, dest_dir=None, cyc_len=25,
                lr=4e-3, lr_factor=1 / 2.6):
    """
    Finetunes the provided language model on the given data, using
    discriminative fine-tuning and slanted triangular learning rates.

    Attributes:
        data_dir (str): A directory from which to take input data.
        model_dir (str): A directory where the pretrained model is located.
        dest_dir (str): A directory where to store the finetuned language
            model. Defaults to `model_dir` / name of the last folder of
            `data_dir`.
        cyc_len (int): Number of epochs for the one-cycle learning rate
            scheduler. For more details refer to
            https://docs.fast.ai/callbacks.one_cycle.html#The-1cycle-policy.
        lr (float): Learning rate at the last layer.
        lr_factor (float): Learning rate at layer n is the learning rate at
            layer (n+1) times `lr_factor`.
    """
    data_dir, model_dir = Path(data_dir), Path(model_dir)
    dest_dir = Path(dest_dir) if dest_dir else model_dir / data_dir.name
    dest_dir.mkdir(parents=True, exist_ok=True)

    data_lm = load_data(data_dir, "data_finetune_lm.pkl")
    with open(model_dir / "model_hparams.json", "r") as model_hparams_file:
        model_hparams = json.load(model_hparams_file)
    # `lm_learner` is a project-local helper that builds a language model learner.
    learner = lm_learner(data_lm, AWD_LSTM, model_dir, pretrained=True,
                         config=model_hparams)
    learner.path = dest_dir

    # Calculate learning rates for each layer (discriminative fine-tuning).
    num_layers = len(learner.layer_groups)
    lrs = [lr * lr_factor**i for i in range(num_layers)][::-1]

    learner.unfreeze()
    learner.fit_one_cycle(cyc_len=cyc_len, max_lr=lrs, div_factor=32,
                          pct_start=0.1)

    # Save everything.
    learner.save_encoder("lm_finetuned_encoder")
    torch.save(learner.model.state_dict(), dest_dir / "lm_finetuned_wgts.pth")
    with open(dest_dir / "lm_finetuned_itos.pkl", "wb") as itos_file:
        pickle.dump(learner.data.vocab.itos, itos_file)
    with open(dest_dir / "model_hparams.json", "w") as model_hparams_file:
        json.dump(model_hparams, model_hparams_file, indent=2)
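# To make the discriminative fine-tuning above concrete: with the default
# lr=4e-3 and lr_factor=1/2.6, the per-layer-group rates come out
# geometrically spaced, smallest at the earliest layers. A standalone sketch
# (the layer-group count of 4 is assumed for illustration):

lr, lr_factor, num_layers = 4e-3, 1 / 2.6, 4
lrs = [lr * lr_factor**i for i in range(num_layers)][::-1]
print(lrs)  # ~[2.3e-4, 5.9e-4, 1.5e-3, 4.0e-3] -- the last layer trains fastest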
def pre_execution_hook(self, mode=ExecutionModeKeys.TEST):
    self.data_lm_name = "data_lm.pkl"
    self.data_class_name = "data_class.pkl"
    self.fwd_enc_name = "fwd_enc"
    self.bwd_enc_name = "bwd_enc"
    self.fwd_class_name = 'fwd_clas'
    self.bwd_class_name = 'bwd_clas'

    # To make sure the outputs are also logged.
    fastprogress.fastprogress.WRITER_FN = self._get_master_bar_write_fn()

    data_lm_path = os.path.join(self.experiment_dir, self.data_lm_name)
    if not os.path.exists(os.path.dirname(data_lm_path)):
        os.makedirs(os.path.dirname(data_lm_path))
    if not os.path.exists(data_lm_path):
        data_lm = TextLMDataBunch.from_df(path=self.experiment_dir,
                                          train_df=self.dataloader.get_train_input(),
                                          valid_df=self.dataloader.get_test_input(),
                                          text_cols='utterance',
                                          bs=BATCH_SIZE)
        data_lm.save(self.data_lm_name)
    self.data_lm = load_data(self.experiment_dir, self.data_lm_name,
                             bs=BATCH_SIZE, bptt=BPTT)
    self.data_bwd = load_data(self.experiment_dir, self.data_lm_name,
                              bs=BATCH_SIZE, bptt=BPTT, backwards=True)

    data_class_path = os.path.join(self.experiment_dir, self.data_class_name)
    if not os.path.exists(data_class_path):
        data_class = TextDataBunch.from_df(path=self.experiment_dir,
                                           train_df=self.dataloader.get_train_input(),
                                           valid_df=self.dataloader.get_test_input(),
                                           text_cols='utterance',
                                           label_cols='functions',
                                           # Use the already-loaded LM vocab; a local
                                           # `data_lm` only exists when the LM databunch
                                           # was rebuilt in this run.
                                           vocab=self.data_lm.train_ds.vocab,
                                           bs=BATCH_SIZE)
        data_class.save(self.data_class_name)
    self.data_class = load_data(self.experiment_dir, self.data_class_name,
                                bs=BATCH_SIZE)
    self.data_class_bwd = load_data(self.experiment_dir, self.data_class_name,
                                    bs=BATCH_SIZE, backwards=True)
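# Aside (not part of the hook): the paired forward/backwards databunches and
# the fwd_/bwd_ names suggest a bidirectional ULMFiT ensemble. A minimal
# sketch of the usual ensembling step, assuming two trained fastai v1 text
# classifiers `learn_fwd` and `learn_bwd`:

def ensemble_preds(learn_fwd, learn_bwd):
    # ordered=True restores dataset order, which the sortish sampler shuffles.
    preds_fwd, targets = learn_fwd.get_preds(ordered=True)
    preds_bwd, _ = learn_bwd.get_preds(ordered=True)
    # Average the class probabilities of the two directions.
    return (preds_fwd + preds_bwd) / 2, targets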
import json
import pickle
from math import exp
from pathlib import Path

import torch
from fastai.text import load_data, AWD_LSTM, awd_lstm_lm_config


def train_lm(data_dir, model_dir, epochs=12, lr=3e-4,
             use_pretrained_lm=False, hparam_updates=None):
    """
    Trains a new language model using the provided dataset.

    Attributes:
        data_dir (str): A directory with processed training and validation data.
        model_dir (str): A directory where to store the trained model.
        epochs (int): Number of epochs for model training.
        lr (float): Learning rate.
        use_pretrained_lm (bool): If set, a trained language model is first
            loaded from `model_dir` and then trained further on the provided
            dataset.
        hparam_updates (dict): A dictionary with updates of model
            hyper-parameters. By default, the configuration from fastai's
            AWD-LSTM model is used. For more details see
            https://github.com/fastai/fastai/blob/master/fastai/text/models/awd_lstm.py.
    """
    data_lm = load_data(data_dir, "data_lm.pkl")
    # Copy the config so fastai's global `awd_lstm_lm_config` is not mutated.
    model_hparams = dict(awd_lstm_lm_config)
    model_hparams.update(hparam_updates or {})
    # `lm_learner` is a project-local helper that builds a language model learner.
    learner = lm_learner(data_lm, AWD_LSTM, model_dir,
                         pretrained=use_pretrained_lm, config=model_hparams)
    learner.fit(epochs, lr)

    loss, acc = learner.validate(learner.data.train_dl)
    print("Training - Loss: {}, Perplexity: {}, Accuracy: {}".format(
        loss, exp(loss), acc))
    loss, acc = learner.validate()
    print("Validation - Loss: {}, Perplexity: {}, Accuracy: {}".format(
        loss, exp(loss), acc))

    model_dir = Path(model_dir)
    model_dir.mkdir(parents=True, exist_ok=True)
    torch.save(learner.model.state_dict(), model_dir / "lm_wgts.pth")
    with open(model_dir / "lm_itos.pkl", "wb") as itos_file:
        pickle.dump(learner.data.vocab.itos, itos_file)
    with open(model_dir / "model_hparams.json", "w") as model_hparams_file:
        json.dump(model_hparams, model_hparams_file, indent=2)
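# A hypothetical invocation of train_lm (directory names and the override
# values are assumptions for illustration, not from the original code):

train_lm("data/processed", "models/lm",
         epochs=12, lr=3e-4,
         hparam_updates={"emb_sz": 400, "n_hid": 1150})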
import ujson as json
import spacy
import fastai
from pathlib import Path
from fastai.text import load_data, text_classifier_learner, AWD_LSTM
from typing import List, Tuple

bs = 48
path = Path('/app/model/sentiment')
data_clas = load_data(path, 'data_clas.pkl', bs=bs)
learn = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5)
learn.load('third')
learn.model.eval()
nlp = spacy.load('en_core_web_md')


def message_to_sentences(message: str, minimum_characters: int = 5) -> List[str]:
    # Split the message into rough sentences on newlines and periods,
    # dropping fragments shorter than `minimum_characters`.
    sentences = []
    for line in message.split('\n'):
        sentences += line.split('.')
    return [s for s in sentences if len(s) > minimum_characters]


def predict_sentiment(sentence: str) -> Tuple[str, float]:
    categorical, class_id, scores = learn.predict(sentence)
    score = round(scores[class_id].item(), 4)
    return "negative" if class_id == 0 else "positive", score


def model(message):
    sentence = json.loads(message)
    doc = nlp(sentence)
    entities = doc.ents  # reuse the parsed doc instead of re-running the pipeline
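# The `model` handler above is truncated. A sketch of how the pieces might
# compose (the per-sentence aggregation and the returned payload shape are
# assumptions, not from the original code):

def model_sketch(message):
    text = json.loads(message)
    # Score each sentence separately, then report entities from the full text.
    sentiments = [predict_sentiment(s) for s in message_to_sentences(text)]
    entities = [ent.text for ent in nlp(text).ents]
    return {"sentiments": sentiments, "entities": entities}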
def __init__(self, data_path: str = 'lang_model', emb_sz: int = 800,
             qrnn: bool = False, bidir: bool = False, n_layers: int = 4,
             n_hid: int = 2500, bs: int = 104, bptt: int = 67,
             lr: float = 0.0013, wd: float = .012, one_cycle: bool = True,
             cycle_len: int = 1) -> None:
    """
    Instantiate AWD_LSTM Language Model with hyper-parameters.

    data_path: str   path where databunch is loaded from
    emb_sz: int      size of word embeddings
    qrnn: bool       whether or not to use QRNN (requires CUDA)
    bidir: bool      if RNN should be bi-directional
    n_layers: int    number of layers in lang model
    n_hid: int       number of hidden units in model
    lr: float        learning rate
    bptt: int        back-propagation-through-time; max sequence length through
                     which gradients will be accumulated
    bs: int          batch size

    The hyper-parameters are stored in a fastai dict called
    `fastai.text.models.awd_lstm_lm_config`:

        {'emb_sz': 400, 'n_hid': 1150, 'n_layers': 3, 'pad_token': 1,
         'qrnn': False, 'bidir': False, 'output_p': 0.1, 'hidden_p': 0.15,
         'input_p': 0.25, 'embed_p': 0.02, 'weight_p': 0.2,
         'tie_weights': True, 'out_bias': True}
    """
    self.lr, self.wd, self.one_cycle, self.cycle_len = lr, wd, one_cycle, cycle_len
    awd_lstm_lm_config.update(
        dict(emb_sz=emb_sz, qrnn=qrnn, bidir=bidir, n_layers=n_layers, n_hid=n_hid))

    # Log params.
    wb_handle = wandb.init(config=awd_lstm_lm_config)
    wandb.config.update({'data_path': str(data_path),
                         'bs': bs,
                         'bptt': bptt,
                         'lr': lr})
    self.csv_name = 'history_' + wb_handle.name
    wandb.config.update({'csvlog_save_path': self.csv_name})

    # Instantiate databunch.
    self.data_lm = load_data(data_path, bs=bs, bptt=bptt)

    # Instantiate language model.
    self.learn = language_model_learner(data=self.data_lm,
                                        arch=AWD_LSTM,
                                        pretrained=False,
                                        model_dir=Path('models_' + wb_handle.name),
                                        config=awd_lstm_lm_config)
    self.full_model_path = str(self.learn.path / self.learn.model_dir)
    wandb.config.update({'model_save_path': self.full_model_path})

    # Prepare callbacks.
    escb = EarlyStoppingCallback(learn=self.learn, patience=2)
    smcb = SaveModelCallback(learn=self.learn, name='best_' + wb_handle.name)
    rpcb = ReduceLROnPlateauCallback(learn=self.learn, patience=1)
    csvcb = CSVLogger(learn=self.learn, filename=self.csv_name)
    wb = wandbCallback(self.learn)
    self.callbacks = [escb, smcb, rpcb, csvcb, wb]

    self.fit()
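# Caveat worth noting (a standalone sketch, not from the original class): the
# awd_lstm_lm_config.update(...) call above mutates fastai's module-level
# config, so every learner built later in the same process inherits the
# overrides. Copying the dict first avoids the side effect:

from fastai.text import awd_lstm_lm_config

config = dict(awd_lstm_lm_config)            # private copy
config.update(dict(emb_sz=800, n_hid=2500))  # overrides stay local
print(awd_lstm_lm_config['emb_sz'])          # still the library default, 400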
import os
import argparse

import fastai.text as FT


def get_args():
    # The parser setup is reconstructed from how `args` is used below; the
    # argument types are assumptions, since the original snippet starts
    # mid-function at the '--load_model' line.
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_lm')
    parser.add_argument('--batch_size', type=int)
    parser.add_argument('--num_cycle', type=int)
    parser.add_argument('--max_lr', type=float)
    parser.add_argument('--load_model')
    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = get_args()
    data_file = args.data_lm
    batch_size = args.batch_size
    num_cycle = args.num_cycle
    max_lr = args.max_lr
    weight_file = args.load_model

    print("Loading data...")
    data_lm = FT.load_data('./', data_file)
    learner = FT.language_model_learner(data_lm, FT.AWD_LSTM)

    if weight_file is not None:
        if not os.path.isdir(weight_file):
            raise Exception("Invalid weight path")
        else:
            print("Loading weight...")
            learner = learner.load(weight_file)

    print("Start training")
    learner.fit_one_cycle(num_cycle, max_lr)
    learner.save("model")
    learner.save_encoder("encoder")
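# A hypothetical command line for the script above (the script name and the
# values are assumptions for illustration):
#
#   python train_lm.py --data_lm data_lm.pkl --batch_size 64 \
#       --num_cycle 4 --max_lr 3e-3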
import json
import pickle
from pathlib import Path

import torch
from fastai.text import load_data, AWD_LSTM
from fastai.metrics import accuracy, FBeta


def train_clas(data_dir, model_dir, dest_dir=None, cyc_len=1, lr=0.01,
               lr_factor=1 / 2.6, pretrained=1):
    """
    Trains a classifier on the given classification dataset, starting with
    the given language model.

    Attributes:
        data_dir (str): The folder where the dataset is located.
        model_dir (str): The folder where the (finetuned) language model is
            located.
        dest_dir (str): The folder where to store the trained classifier.
            Defaults to `model_dir` / name of the last folder of `data_dir`.
        cyc_len (int): Determines the number of epochs dedicated to finetuning
            each layer. That is, first the last layer group is unfrozen and
            trained for `cyc_len` epochs, then the last two groups are
            unfrozen and trained for `cyc_len` epochs, and so on. In the last
            iteration, all layer groups are unfrozen and trained for `cyc_len`
            epochs. Cyclic learning rate scheduling is used. The total number
            of epochs is thus `cyc_len` * number of layer groups.
        lr (float): Learning rate at the last layer.
        lr_factor (float): Learning rate of layer n is the learning rate at
            layer (n+1) times `lr_factor`.
        pretrained (int): If 0, starts with an untrained language model. If 1,
            loads a finetuned language model from `model_dir`. If 2, loads an
            already trained classifier from `model_dir`. [2 is CURRENTLY
            BROKEN, seems like load_pretrained does not work with
            classifiers...?]
    """
    data_dir, model_dir = Path(data_dir), Path(model_dir)
    dest_dir = Path(dest_dir) if dest_dir else model_dir / data_dir.name
    dest_dir.mkdir(parents=True, exist_ok=True)

    data_clas = load_data(data_dir, "data_clas.pkl")

    # Load config, but remove entries that do not affect the classifier.
    hparams_fname = ("model_hparams.json" if pretrained != 2
                     else "clas_hparams.json")
    with open(model_dir / hparams_fname, "r") as model_hparams_file:
        model_hparams = json.load(model_hparams_file)
    for key in ["tie_weights", "out_bias"]:
        model_hparams.pop(key, None)

    fmacro = FBeta(average="macro", beta=1)
    fmacro.name = "f1_macro"
    fweighted = FBeta(average="weighted", beta=1)
    fweighted.name = "f1_weighted"
    metrics = [accuracy, fmacro, fweighted]

    # `clas_learner` is a project-local helper that builds a text classifier.
    learner = clas_learner(data_clas, AWD_LSTM, model_dir,
                           pretrained=pretrained, config=model_hparams,
                           metrics=metrics)
    learner.path = dest_dir

    # Calculate learning rates for each layer (discriminative fine-tuning).
    num_layers = len(learner.layer_groups)
    lrs = [lr * lr_factor**i for i in range(num_layers)][::-1]

    # Gradual unfreezing, discriminative fine-tuning, and slanted
    # triangular learning rates.
    for i in reversed(range(num_layers)):
        learner.freeze_to(i)
        learner.fit_one_cycle(cyc_len=cyc_len, max_lr=lrs, div_factor=32,
                              pct_start=0.1)

    # Save everything.
    learner.save_encoder("clas_encoder")
    torch.save(learner.model.state_dict(), dest_dir / "clas_wgts.pth")
    with open(dest_dir / "clas_itos.pkl", "wb") as itos_file:
        pickle.dump(learner.data.vocab.itos, itos_file)
    with open(dest_dir / "clas_hparams.json", "w") as model_hparams_file:
        json.dump(model_hparams, model_hparams_file, indent=2)
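# To make the gradual-unfreezing loop above concrete, a standalone sketch of
# the schedule it produces (four layer groups assumed for illustration):

num_layers, cyc_len = 4, 1
for i in reversed(range(num_layers)):
    trained_groups = num_layers - i
    print(f"freeze_to({i}): train the last {trained_groups} "
          f"layer group(s) for {cyc_len} epoch(s)")
# freeze_to(3): only the classifier head trains first;
# freeze_to(0): finally, all 4 layer groups train together.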