Example #1
from pathlib import Path

from fastai.text import (AWD_LSTM, TextList, load_data,
                         text_classifier_learner)


def train(bs):
    path = Path("./")
    data_lm = load_data(path, 'data_lm.pkl', bs=bs)
    print("data_lm loaded")

    data_clas = (TextList.from_csv(path, 'classifier.csv', cols=["summary", "description"], vocab=data_lm.vocab)
                 .split_from_df(col=3)
                 .label_from_df(cols=0)
                 .databunch(bs=bs))

    print("data_clas loaded")
    data_clas.show_batch()  # not sure how it will work

    data_clas.save('data_clas.pkl')

    learn = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.6)
    learn.load_encoder('fine_tuned_enc')

    lr_estimate = 1.0e-2

    learn.fit_one_cycle(1, lr_estimate, moms=(0.8,0.7))

    learn.save('first')
    losses_fig = learn.recorder.plot_losses(return_fig=True)
    losses_fig.savefig("losses_001.jpg", dpi=600)
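Example #1 assumes that `data_lm.pkl` and the `fine_tuned_enc` encoder already exist on disk. The sketch below shows one way such artifacts could be produced with the same fastai v1 API; the file names, column choices, and hyper-parameters here are assumptions for illustration, not part of the original example.

from pathlib import Path

from fastai.text import AWD_LSTM, TextList, language_model_learner


def build_lm_artifacts(bs=48):
    # Hypothetical preparation step: builds the language-model databunch and
    # the encoder that train() above expects to find on disk.
    path = Path("./")
    data_lm = (TextList.from_csv(path, 'classifier.csv', cols=["summary", "description"])
               .split_by_rand_pct(0.1)
               .label_for_lm()
               .databunch(bs=bs))
    data_lm.save('data_lm.pkl')

    learn = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.3)
    learn.fit_one_cycle(1, 1e-2, moms=(0.8, 0.7))
    learn.save_encoder('fine_tuned_enc')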
Example #2
import json
import pickle
from pathlib import Path

import torch
from fastai.text import AWD_LSTM, load_data

# `lm_learner` is a project-local helper used below; it is defined elsewhere
# in the source repository and is not part of fastai itself.


def finetune_lm(data_dir,
                model_dir,
                dest_dir=None,
                cyc_len=25,
                lr=4e-3,
                lr_factor=1 / 2.6):
    """
    Fine-tunes the provided language model on the given data, using
    discriminative fine-tuning and slanted triangular learning rates.

    Attributes:
        data_dir (str): A directory from which to take input data
        model_dir (str): A directory where the pretrained model is located
        dest_dir (str): A directory in which to store the fine-tuned language model.
            Defaults to `model_dir` / the name of the last folder of `data_dir`.
        cyc_len (int): Number of epochs for the one-cycle learning rate scheduler. For more
            details refer to https://docs.fast.ai/callbacks.one_cycle.html#The-1cycle-policy.
        lr (float): Learning rate at the last layer.
        lr_factor (float): Learning rate at layer n is learning rate at layer (n+1) times
            `lr_factor`.
    """
    data_dir, model_dir = Path(data_dir), Path(model_dir)
    dest_dir = (Path(dest_dir) if dest_dir else model_dir / data_dir.name)
    dest_dir.mkdir(parents=True, exist_ok=True)

    data_lm = load_data(data_dir, "data_finetune_lm.pkl")

    with open(model_dir / "model_hparams.json", "r") as model_hparams_file:
        model_hparams = json.load(model_hparams_file)
    learner = lm_learner(data_lm,
                         AWD_LSTM,
                         model_dir,
                         pretrained=True,
                         config=model_hparams)
    learner.path = dest_dir

    # Calculate learning rates for each layer.
    num_layers = len(learner.layer_groups)
    lrs = [lr * lr_factor**i for i in range(num_layers)][::-1]

    learner.unfreeze()
    learner.fit_one_cycle(cyc_len=cyc_len,
                          max_lr=lrs,
                          div_factor=32,
                          pct_start=0.1)

    # Save everything.
    learner.save_encoder("lm_finetuned_encoder")
    torch.save(learner.model.state_dict(), dest_dir / "lm_finetuned_wgts.pth")
    with open(dest_dir / "lm_finetuned_itos.pkl", "wb") as itos_file:
        pickle.dump(learner.data.vocab.itos, itos_file)
    with open(dest_dir / "model_hparams.json", "w") as model_hparams_file:
        json.dump(model_hparams, model_hparams_file, indent=2)
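The `lrs` list above implements the discriminative fine-tuning mentioned in the docstring: each earlier layer group gets a learning rate that is `lr_factor` times smaller than the group after it. A quick illustration with the default values, assuming four layer groups:

# Illustration only: discriminative learning rates for 4 layer groups
# with the default lr=4e-3 and lr_factor=1/2.6.
lr, lr_factor = 4e-3, 1 / 2.6
lrs = [lr * lr_factor**i for i in range(4)][::-1]
# -> approximately [0.00023, 0.00059, 0.00154, 0.004];
# the lowest layers train with the smallest rate.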
Example #3
    def pre_execution_hook(self, mode=ExecutionModeKeys.TEST):
        self.data_lm_name = "data_lm.pkl"
        self.data_class_name = "data_class_name"
        self.fwd_enc_name = "fwd_enc"
        self.bwd_enc_name = "bwd_enc"
        self.fwd_class_name = 'fwd_clas'
        self.bwd_class_name = 'bwd_clas'

        # to make sure the outputs are also logged
        fastprogress.fastprogress.WRITER_FN = self._get_master_bar_write_fn()

        data_lm_path = os.path.join(self.experiment_dir, self.data_lm_name)
        if not os.path.exists(os.path.dirname(data_lm_path)):
            os.makedirs(os.path.dirname(data_lm_path))
        if not os.path.exists(data_lm_path):
            data_lm = TextLMDataBunch.from_df(path=self.experiment_dir,
                                              train_df=self.dataloader.get_train_input(),
                                              valid_df=self.dataloader.get_test_input(),
                                              text_cols='utterance',
                                              bs=BATCH_SIZE)
            data_lm.save(self.data_lm_name)
        self.data_lm = load_data(self.experiment_dir, self.data_lm_name, bs=BATCH_SIZE, bptt=BPTT)
        self.data_bwd = load_data(self.experiment_dir, self.data_lm_name, bs=BATCH_SIZE,
                                  bptt=BPTT, backwards=True)

        data_class_path = os.path.join(self.experiment_dir, self.data_class_name)
        if not os.path.exists(data_class_path):
            data_class = TextDataBunch.from_df(path=self.experiment_dir,
                                               train_df=self.dataloader.get_train_input(),
                                               valid_df=self.dataloader.get_test_input(),
                                               text_cols='utterance',
                                               label_cols='functions',
                                               # `data_lm` is only defined when the pickle was missing; use the loaded bunch.
                                               vocab=self.data_lm.train_ds.vocab,
                                               bs=BATCH_SIZE)
            data_class.save(self.data_class_name)
        self.data_class = load_data(self.experiment_dir, self.data_class_name, bs=BATCH_SIZE)
        self.data_class_bwd = load_data(self.experiment_dir, self.data_class_name, bs=BATCH_SIZE, backwards=True)
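The hook above only prepares the databunches and records the encoder/classifier names; nothing in this snippet consumes them. A sketch of how the forward and backward bunches are typically consumed in fastai v1 follows; the function, its arguments, and `drop_mult` are assumptions, not code from the original class.

from fastai.text import AWD_LSTM, text_classifier_learner


def build_classifiers(data_class, data_class_bwd, fwd_enc_name, bwd_enc_name):
    # Hypothetical follow-up step: one classifier per direction, each
    # initialised from the matching language-model encoder.
    learn_fwd = text_classifier_learner(data_class, AWD_LSTM, drop_mult=0.5)
    learn_fwd.load_encoder(fwd_enc_name)

    learn_bwd = text_classifier_learner(data_class_bwd, AWD_LSTM, drop_mult=0.5)
    learn_bwd.load_encoder(bwd_enc_name)
    return learn_fwd, learn_bwd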
Example #4
import json
import pickle
from math import exp
from pathlib import Path

import torch
from fastai.text import AWD_LSTM, awd_lstm_lm_config, load_data

# `lm_learner` is a project-local helper used below; it is defined elsewhere
# in the source repository and is not part of fastai itself.


def train_lm(data_dir,
             model_dir,
             epochs=12,
             lr=3e-4,
             use_pretrained_lm=False,
             hparam_updates=dict()):
    """
    Trains a new language model using the provided dataset.

    Attributes:
        data_dir (str): A directory with processed training and validation data.
        model_dir (str): A directory in which to store the trained model.
        epochs (int): Number of epochs for model training.
        lr (float): Learning rate.
        use_pretrained_lm (bool): If set, a previously trained language model is first
            loaded from `model_dir` and then further trained on the provided dataset.
        hparam_updates (dict): A dictionary with updates of model hyper-parameters. By default,
            a configuration from fastai's AWD-LSTM model is used. For more details see
            https://github.com/fastai/fastai/blob/master/fastai/text/models/awd_lstm.py.
    """
    data_lm = load_data(data_dir, "data_lm.pkl")

    # Copy the default config so that fastai's global dict is not mutated in place.
    model_hparams = dict(awd_lstm_lm_config)
    model_hparams.update(hparam_updates)
    learner = lm_learner(data_lm,
                         AWD_LSTM,
                         model_dir,
                         pretrained=use_pretrained_lm,
                         config=model_hparams)
    learner.fit(epochs, lr)

    loss, acc = learner.validate(learner.data.train_dl)
    print("Training - Loss: {}, Perplexity: {}, Accuracy: {}".format(
        loss, exp(loss), acc))
    loss, acc = learner.validate()
    print("Validation - Loss: {}, Perplexity: {}, Accuracy: {}".format(
        loss, exp(loss), acc))

    model_dir = Path(model_dir)
    model_dir.mkdir(parents=True, exist_ok=True)
    torch.save(learner.model.state_dict(), model_dir / "lm_wgts.pth")
    with open(model_dir / "lm_itos.pkl", "wb") as itos_file:
        pickle.dump(learner.data.vocab.itos, itos_file)
    with open(model_dir / "model_hparams.json", "w") as model_hparams_file:
        json.dump(model_hparams, model_hparams_file, indent=2)
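`train_lm` leaves three artifacts in `model_dir`: the model weights, the vocabulary (`itos`), and the hyper-parameter JSON. A minimal sketch of reading them back, using only PyTorch and the standard library; the directory name is a placeholder.

import json
import pickle
from pathlib import Path

import torch

model_dir = Path("models/my_lm")  # placeholder path
wgts = torch.load(model_dir / "lm_wgts.pth", map_location="cpu")
with open(model_dir / "lm_itos.pkl", "rb") as itos_file:
    itos = pickle.load(itos_file)         # index-to-token list of the vocab
with open(model_dir / "model_hparams.json") as hparams_file:
    hparams = json.load(hparams_file)     # AWD-LSTM configuration dict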
Example #5
import ujson as json
import spacy
import fastai
from pathlib import Path
from fastai.text import load_data, text_classifier_learner, AWD_LSTM
from typing import List, Tuple

bs = 48
path = Path('/app/model/sentiment')
data_clas = load_data(path, 'data_clas.pkl', bs=bs)
learn = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.5)
learn.load('third')
learn.model.eval()
nlp = spacy.load('en_core_web_md')

def message_to_sentences(message: str, minimum_characters: int = 5) -> List[str]:
    text = message
    sentences = []
    for line in text.split('\n'):
        sentences += line.split('.')
    return [s for s in sentences if len(s) > minimum_characters]

def predict_sentiment(sentence:str) -> Tuple[str, float]:
    categorical, class_id, scores = learn.predict(sentence)
    score = round(scores[class_id].item(), 4)
    return "negative" if class_id == 0 else "positive", score

def model(message):
    sentence = json.loads(message)
    doc = nlp(sentence)
    entities = doc.ents  # reuse the already-parsed doc instead of running spacy twice
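The `model` function above is cut off in the source and is left as-is. The sketch below shows one way the two helpers it builds on could be combined into a message-level score; the aggregation rule is an assumption.

def score_message(message: str) -> float:
    # Hypothetical helper: average signed sentence-level sentiment, where
    # positive predictions count as +score and negative ones as -score.
    sentences = message_to_sentences(message)
    if not sentences:
        return 0.0
    signed = []
    for sentence in sentences:
        label, score = predict_sentiment(sentence)
        signed.append(score if label == "positive" else -score)
    return sum(signed) / len(signed)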
Example #6
    def __init__(self,
                 data_path: str = 'lang_model',
                 emb_sz: int = 800,
                 qrnn: bool = False,
                 bidir: bool = False,
                 n_layers: int = 4,
                 n_hid: int = 2500,
                 bs: int = 104,
                 bptt: int = 67,
                 lr: float = 0.0013,
                 wd: float = .012,
                 one_cycle: bool = True,
                 cycle_len: int = 1) -> None:
        """ Instantiate AWD_LSTM Language Model with hyper-parameters.
        
        data_path: str
            path where databunch is loaded from
        emb_sz: int
            size of word embeddings
        qrnn: bool
            whether or not to use QRNN (requires cuDNN)
        bidir: bool
            if RNN should be bi-directional
        n_layers: int
            number of layers in lang model
        n_hid: int
            number of hidden units in model
        lr: float
            learning rate
        bptt: int
            back-propagation through time; the maximum sequence length over which gradients are accumulated.
        bs: int
            batch size
        wd: float
            weight decay
        one_cycle: bool
            whether to train with the one-cycle learning rate policy
        cycle_len: int
            number of epochs per training cycle
        
        The hyper-parameters are stored in a fastai dict called `fastai.text.models.awd_lstm_lm_config`:
           {'emb_sz': 400, 'n_hid': 1150, 'n_layers': 3, 'pad_token': 1, 'qrnn': False, 'bidir': False, 'output_p': 0.1,
            'hidden_p': 0.15, 'input_p': 0.25, 'embed_p': 0.02,'weight_p': 0.2, 'tie_weights': True, 'out_bias': True}
        """
        self.lr, self.wd, self.one_cycle, self.cycle_len = lr, wd, one_cycle, cycle_len
        awd_lstm_lm_config.update(
            dict(emb_sz=emb_sz,
                 qrnn=qrnn,
                 bidir=bidir,
                 n_layers=n_layers,
                 n_hid=n_hid))
        #log params
        wb_handle = wandb.init(config=awd_lstm_lm_config)
        wandb.config.update({
            'data_path': str(data_path),
            'bs': bs,
            'bptt': bptt,
            'lr': lr
        })
        self.csv_name = 'history_' + wb_handle.name
        wandb.config.update({'csvlog_save_path': self.csv_name})

        # instantiate databunch
        self.data_lm = load_data(data_path, bs=bs, bptt=bptt)

        # instantiate language model
        self.learn = language_model_learner(data=self.data_lm,
                                            arch=AWD_LSTM,
                                            pretrained=False,
                                            model_dir=Path('models_' +
                                                           wb_handle.name),
                                            config=awd_lstm_lm_config)
        self.full_model_path = str(self.learn.path / self.learn.model_dir)
        wandb.config.update({'model_save_path': self.full_model_path})

        # prepare callbacks
        escb = EarlyStoppingCallback(learn=self.learn, patience=2)
        smcb = SaveModelCallback(learn=self.learn,
                                 name='best_' + wb_handle.name)
        rpcb = ReduceLROnPlateauCallback(learn=self.learn, patience=1)
        csvcb = CSVLogger(learn=self.learn, filename=self.csv_name)
        wb = wandbCallback(self.learn)
        self.callbacks = [escb, smcb, rpcb, csvcb, wb]

        self.fit()
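The constructor ends by calling `self.fit()`, which is not shown in this example. Below is a sketch of a `fit` method that would be consistent with the attributes stored above; it is an assumption, not the original implementation.

    def fit(self):
        # Hypothetical body: honour the one_cycle flag stored in __init__ and
        # pass along the prepared callbacks.
        if self.one_cycle:
            self.learn.fit_one_cycle(self.cycle_len, self.lr, wd=self.wd,
                                     callbacks=self.callbacks)
        else:
            self.learn.fit(self.cycle_len, self.lr, wd=self.wd,
                           callbacks=self.callbacks)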
Example #7
import os
import argparse

import fastai.text as FT  # assumed alias; it matches the FT.* calls below


def get_args():
    # The original example is truncated here: the function header, the parser
    # construction, and every argument except --load_model are reconstructed
    # from how `args` is used in the __main__ block below.
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_lm')
    parser.add_argument('--batch_size', type=int)
    parser.add_argument('--num_cycle', type=int)
    parser.add_argument('--max_lr', type=float)
    parser.add_argument('--load_model')
    
    args = parser.parse_args()
    return args


if __name__=='__main__':
    args = get_args()
    data_file = args.data_lm
    batch_size = args.batch_size
    num_cycle = args.num_cycle
    max_lr = args.max_lr
    weight_file = args.load_model

    print("Loading data...")
    data_lm = FT.load_data('./', data_file)
    learner = FT.language_model_learner(data_lm, FT.AWD_LSTM)
    
    if weight_file is not None:
        if not os.path.isdir(weight_file):
            raise Exception("Invalid weight path")
        else:
            print("Loading weight...")
            learner = learner.load(weight_file)
        
    print("Start training")
    learner.fit_one_cycle(num_cycle, max_lr)
    learner.save("model")
    learner.save_encoder("encoder")

Example #8
import json
import pickle
from pathlib import Path

import torch
from fastai.metrics import FBeta, accuracy
from fastai.text import AWD_LSTM, load_data

# `clas_learner` is a project-local helper used below; it is defined elsewhere
# in the source repository and is not part of fastai itself.


def train_clas(data_dir, model_dir, dest_dir=None, cyc_len=1, lr=0.01, lr_factor=1/2.6,
               pretrained=1):
    """
    Trains a classifier on the given classification dataset, starting with the given language model.

    Attributes:
        data_dir (str): The folder where the dataset is located.
        model_dir (str): The folder where the (finetuned) language model is located.
        dest_dir (str): The folder in which to store the trained classifier. Defaults
            to `model_dir` / the name of the last folder of `data_dir`.
        cyc_len (int): Determines the number of epochs dedicated to finetuning each
            layer. That is, firstly the last layer group is unfrozen and trained for
            `cyc_len` epochs, then the last but one group is unfrozen and
            trained for `cyc_len` epochs, ... In the last iteration, all layer groups are
            unfrozen and trained for `cyc_len` epochs. Cyclic learning rate
            scheduling is used. The total number of epochs is thus
            `cyc_len` * number of layer groups.
        lr (float): Learning rate at the last layer.
        lr_factor (float): Learning rate of layer n is learning rate at layer (n+1) times
            `lr_factor`.
        pretrained (int): If 0, starts with an untrained language model.
            If 1, loads a finetuned language model from `model_dir`.
            If 2, loads an already trained classifier from `model_dir`.
            [2 is CURRENTLY BROKEN, seems like load_pretrained does not
            work with classifiers...?]
    """
    data_dir, model_dir = Path(data_dir), Path(model_dir)
    # Fall back to model_dir / <dataset folder name>, as the docstring states;
    # data_dir.name alone would be a plain string without mkdir().
    dest_dir = (Path(dest_dir) if dest_dir else model_dir / data_dir.name)
    dest_dir.mkdir(parents=True, exist_ok=True)

    data_lm = load_data(data_dir, "data_clas.pkl")

    # Load config, but remove entries that do not affect the classifier.
    hparams_fname = ("model_hparams.json" if pretrained != 2 else "clas_hparams.json")
    with open(model_dir / hparams_fname, "r") as model_hparams_file:
        model_hparams = json.load(model_hparams_file)
    for key in ["tie_weights", "out_bias"]:
        model_hparams.pop(key, None)

    fmacro = FBeta(average="macro", beta=1)
    fmacro.name = "f1_macro"
    fweighted = FBeta(average="weighted", beta=1)
    fweighted.name = "f1_weighted"
    metrics = [accuracy, fmacro, fweighted]

    learner = clas_learner(
        data_lm, AWD_LSTM, model_dir, pretrained=pretrained, config=model_hparams, metrics=metrics)
    learner.path = dest_dir

    # Calculate learning rates for each layer.
    num_layers = len(learner.layer_groups)
    lrs = [lr * lr_factor**i for i in range(num_layers)][::-1]

    # Gradual unfreezing, discriminative fine-tuning, and slanted
    # triangular learning rates.
    for i in range(num_layers)[::-1]:
        learner.freeze_to(i)
        learner.fit_one_cycle(cyc_len=cyc_len, max_lr=lrs, div_factor=32, pct_start=0.1)

    # Save everything.
    learner.save_encoder("clas_encoder")
    torch.save(learner.model.state_dict(), dest_dir / "clas_wgts.pth")
    with open(dest_dir / "clas_itos.pkl", "wb") as itos_file:
        pickle.dump(learner.data.vocab.itos, itos_file)
    with open(dest_dir / "clas_hparams.json", "w") as model_hparams_file:
        json.dump(model_hparams, model_hparams_file, indent=2)
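The loop above realizes the gradual unfreezing described in the docstring: with `num_layers` layer groups and `cyc_len` epochs per pass, the total number of epochs is `cyc_len * num_layers`. A small illustration of the schedule, assuming four layer groups:

# Illustration only: the gradual-unfreezing schedule for 4 layer groups
# and cyc_len=1. freeze_to(i) leaves groups i..3 trainable.
num_layers, cyc_len = 4, 1
for i in range(num_layers)[::-1]:
    print(f"freeze_to({i}): groups {i}..{num_layers - 1} trainable for {cyc_len} epoch(s)")
# freeze_to(3): groups 3..3 trainable for 1 epoch(s)
# freeze_to(2): groups 2..3 trainable for 1 epoch(s)
# freeze_to(1): groups 1..3 trainable for 1 epoch(s)
# freeze_to(0): groups 0..3 trainable for 1 epoch(s)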