def initialize_learner(
    data: TextLMDataBunch,
    pretrained_encoder: str,
    pretrained_itos: str,
    local_rank: int,
    label: str,
    kind: str,
    gpus: int,
) -> LanguageLearner:
    data.path = Path(".")
    click.echo("Training language model...")
    learn = language_model_learner(
        data,
        TransformerXL,
        pretrained_fnames=[
            "./../" + pretrained_encoder.replace(".pth", ""),
            "./../" + pretrained_itos.replace(".pkl", ""),
        ],
        drop_mult=0.1,
    )
    tboard_path = Path("logs/" + label)
    node_name = "gpu-" + str(local_rank) + "-" + kind
    learn.callback_fns.append(
        partial(LearnerTensorboardWriter, base_dir=tboard_path, gpus=gpus, name=node_name)
    )
    if gpus > 1:
        learn.to_distributed(local_rank)
    return learn
def from_model(model_path: Path, model_name: str) -> LanguageLearner:
    """Loads a trained language model for inference."""
    print("Loading model for inference...")
    data = DataBunch.load_empty(model_path, "data/empty_data")
    learn = language_model_learner(data, TransformerXL, pretrained=False)
    learn.load(model_name)
    learn.freeze()
    learn.model.eval()
    return learn
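A minimal usage sketch for the loader above: the path, model name, and prompt are placeholders rather than values from the original code, and text generation relies on fastai v1's LanguageLearner.predict.

# Hypothetical usage: load a saved language model and sample a continuation.
learn = from_model(Path("models/wiki_lm"), "best_model")   # placeholder path and model name
print(learn.predict("The history of", n_words=40, temperature=0.75))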
def _fit_lm(self, df_train, df_val):
    # Language model data
    data_lm = TextLMDataBunch.from_df(train_df=df_train, valid_df=df_val, path="")
    lm_learner = language_model_learner(
        data_lm, self.arch, drop_mult=self.dropout_class
    )
    # train the learner object
    lm_learner.fit_one_cycle(1, self.lr_class)
    # TODO: can we return lm_learner and load via memory so we don't have to save it?
    lm_learner.save_encoder(self.path_lm.name)
    return data_lm
def _create_language_model_learner(self, data_lm, pretrained_model):
    # Hack for this specific model - lm_5_ep_lr2-3_5_stlr, as it was trained on an older version of fastai.
    # For new models you shouldn't override the config.
    config = ftext.awd_lstm_lm_config.copy()
    config['n_hid'] = 1150
    learn_lm = ftext.language_model_learner(
        data_lm,
        ftext.AWD_LSTM,
        config=config,
        pretrained_fnames=pretrained_model,
        drop_mult=0.3)
    return learn_lm
def __init__(self, path):
    texts = pd.read_csv(path + '/jokes_extended_vk_anekdot_preproc.csv', index_col=0)
    texts.dropna(inplace=True)
    data = TextList.from_df(
        texts,
        processor=[TokenizeProcessor(tokenizer=Tokenizer(lang="xx")),
                   NumericalizeProcessor(min_freq=2, max_vocab=60000)])\
        .split_by_rand_pct(.1)\
        .label_for_lm()\
        .databunch(bs=64)
    self.learn = language_model_learner(data=data, arch=AWD_LSTM, pretrained=None)
    self.learn.load_pretrained(path + '/ulmfit/bestmodel_tune.pth',
                               path + '/ulmfit/bestmodel_tune_itos.pkl')
def train(
    training_config: LMTrainingConfig = LMTrainingConfig(),
    device_options: DeviceOptions = DeviceOptions(),
    tune: bool = False,
    comet: bool = True,
    save_every_epoch: bool = False,
    allow_unks: bool = False,
    return_fastai_learner: bool = False,
    output_path: Optional[str] = None,
    rewrite_output: bool = False
) -> Union[TrainedModel, Tuple[TrainedModel, Learner]]:
    experiment_run = ExperimentRun.with_config(training_config, device_options=device_options, comet=comet,
                                               output_path=output_path, rewrite_output=rewrite_output)
    experiment_run.log_experiment_input()

    if isinstance(training_config.corpus, Corpus):
        prep_corpus: api.PreprocessedCorpus = training_config.prep_function.apply_to_corpus(
            training_config.corpus, calc_vocab=True, output_path=PATH_TO_PREP_DATASETS)
    elif isinstance(training_config.corpus, PreprocessedCorpus):
        prep_corpus = training_config.corpus
    else:
        raise AssertionError(f'Unknown corpus type: {type(training_config.corpus)}')

    vocab = create_vocab_for_lm(prep_corpus)
    experiment_run.log_vocab(vocab)

    device = device_options.get_device_id()
    config = create_custom_config(training_config)
    arch_class = training_config.arch.get_module()
    dropout_multiplier = training_config.arch.drop.multiplier
    training = training_config.training

    if training_config.training.sub_epochs:
        data_bunch: DataBunch = EmptyDataBunch(vocab=vocab, path=prep_corpus.path_to_prep_dataset,
                                               device=device)
    else:
        data_bunch = create_databunch(prep_corpus.path_to_prep_dataset,
                                      get_all_files(prep_corpus.path_to_prep_dataset, None),
                                      vocab, bs=training_config.bs, bptt=training_config.bptt,
                                      device=device, verbose=True, allow_unks=allow_unks)

    learner = language_model_learner(
        data_bunch, arch_class,
        opt_func=training.optimizer.get_callable(),
        drop_mult=dropout_multiplier,
        config=config, pretrained=False,
        metrics=[accuracy, mrr, Perplexity()],
        clip=training.gradient_clip,
        alpha=training.activation_regularization.alpha,
        beta=training.activation_regularization.beta,
        path=os.path.dirname(experiment_run.path_to_trained_model),
        model_dir=os.path.basename(experiment_run.path_to_trained_model))
    learner.loss_func = binary_cross_entropy_flat()

    if training_config.training.sub_epochs:
        files_per_epoch = training_config.training.sub_epochs.n_files
        learner.callbacks.append(EpochFileLoader(learner, prep_corpus, vocab,
                                                 bs=training_config.bs, bptt=training_config.bptt,
                                                 device=device, n_files_per_epoch=files_per_epoch,
                                                 allow_unks=allow_unks))

    add_callbacks(experiment_run, learner, vocab, tune, save_every_epoch=save_every_epoch)

    load_base_model_if_needed(learner, training_config)

    logger.info(f"Starting training... Model will be saved to {experiment_run.perm_path_to_model} "
                f"(Saving config and vocab to {experiment_run.path_to_trained_model} "
                f"before getting the first trained model)")
    training_config.training.schedule.fit(learner, training.weight_decay)

    if experiment_run.comet_experiment:
        report_experiment_terminated_mormally(experiment_run.comet_experiment)

    model = load_from_path(experiment_run.path_to_trained_model, force_use_cpu=True,
                           device=device_options.non_default_device_to_use)
    return model if not return_fastai_learner else (model, learner)
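A minimal, hypothetical call sketch based only on the signature above; it assumes the default LMTrainingConfig and DeviceOptions describe a trainable setup, which may not hold in every installation.

# Hypothetical usage: train with the default config, disable Comet logging,
# and also return the underlying fastai Learner.
trained_model, fastai_learner = train(comet=False, return_fastai_learner=True)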
SAMPLES_PER_CLASS = 12500

print('loading data')
texts = []
target = []
for class_index, classname in enumerate(CLASS_NAMES):
    for n, line in enumerate(open(DATA_FOLDER + classname + '.txt')):
        texts.append(preprocess_string(line, False))
        target.append(class_index)
        if n > SAMPLES_PER_CLASS:
            break

df = DataFrame({'label': target, 'text': texts})
df_train, df_val = train_test_split(df, stratify=df['label'], test_size=0.4, random_state=12)

data_lm = TextLMDataBunch.from_df(train_df=df_train, valid_df=df_val, path="")
data_clas = TextClasDataBunch.from_df(path="", train_df=df_train, valid_df=df_val,
                                      vocab=data_lm.train_ds.vocab, bs=32)

# Note: pretrained_model=URLs.WT103 is the older fastai v1 API; newer releases take an arch argument instead.
learn = language_model_learner(data_lm, pretrained_model=URLs.WT103, drop_mult=0.7)
learn.fit_one_cycle(1, 1e-2)
learn.save_encoder('ft_enc')

learn = text_classifier_learner(data_clas, drop_mult=0.7)
learn.load_encoder('ft_enc')
learn.fit_one_cycle(1, 1e-2)
train_df, valid_df = train_test_split(
    df,
    stratify=df["cuisine"],
    test_size=0.2,
    random_state=1024,
)

text_lm = TextLMDataBunch.from_df(
    train_df=train_df,
    valid_df=valid_df,
    path="",
)

lm_learner = language_model_learner(
    text_lm,
    arch=AWD_LSTM,
    drop_mult=0.2,
)
lm_learner.lr_find()
lm_learner.recorder.plot(suggestion=True)
lm_learner.fit_one_cycle(1, lm_learner.recorder.min_grad_lr)
lm_learner.save_encoder(model)

text_clas = TextClasDataBunch.from_df(
    train_df=train_df,
    valid_df=valid_df,
    vocab=text_lm.train_ds.vocab,
    path="",
)
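The snippet ends as the classification DataBunch is built. A minimal sketch of how the classifier stage typically continues in fastai v1; the encoder name passed to load_encoder is assumed to match whatever `model` holds above, and the drop_mult and learning rate here are illustrative rather than taken from the original.

# Hypothetical continuation: fine-tune a classifier on top of the saved encoder.
clas_learner = text_classifier_learner(text_clas, arch=AWD_LSTM, drop_mult=0.2)
clas_learner.load_encoder(model)   # same name used in save_encoder above
clas_learner.fit_one_cycle(1, 1e-2)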
def new_train_clas(data_dir, lang='en', cuda_id=0, pretrain_name='wt103',
                   model_dir='models', qrnn=False, fine_tune=True, max_vocab=30000,
                   bs=20, bptt=70, name='imdb-clas', dataset='imdb', ds_pct=1.0):
    """
    :param data_dir: The path to the `data` directory
    :param lang: the language unicode
    :param cuda_id: The id of the GPU. Uses GPU 0 by default or no GPU when run on CPU.
    :param pretrain_name: name of the pretrained model
    :param model_dir: The path to the directory where the pretrained model is saved
    :param qrnn: Use a QRNN. Requires installing cupy.
    :param fine_tune: Fine-tune the pretrained language model
    :param max_vocab: The maximum size of the vocabulary.
    :param bs: The batch size.
    :param bptt: The back-propagation-through-time sequence length.
    :param name: The name used for both the model and the vocabulary.
    :param dataset: The dataset used for evaluation. Currently only IMDb and XNLI
                    are implemented. Assumes dataset is located in `data` folder and
                    that the name of the folder is the same as the dataset name.
    """
    results = {}
    if not torch.cuda.is_available():
        print('CUDA not available. Setting device=-1.')
        cuda_id = -1
    torch.cuda.set_device(cuda_id)

    print(f'Dataset: {dataset}. Language: {lang}.')
    assert dataset in DATASETS, f'Error: {dataset} processing is not implemented.'
    assert (dataset == 'imdb' and lang == 'en') or not dataset == 'imdb',\
        'Error: IMDb is only available in English.'

    data_dir = Path(data_dir)
    assert data_dir.name == 'data',\
        f'Error: Name of data directory should be data, not {data_dir.name}.'
    dataset_dir = data_dir / dataset
    model_dir = Path(model_dir)

    if qrnn:
        print('Using QRNNs...')
    model_name = 'qrnn' if qrnn else 'lstm'
    lm_name = f'{model_name}_{pretrain_name}'
    pretrained_fname = (lm_name, f'itos_{pretrain_name}')

    ensure_paths_exists(data_dir,
                        dataset_dir,
                        model_dir,
                        model_dir / f"{pretrained_fname[0]}.pth",
                        model_dir / f"{pretrained_fname[1]}.pkl")

    tmp_dir = dataset_dir / 'tmp'
    tmp_dir.mkdir(exist_ok=True)
    vocab_file = tmp_dir / f'vocab_{lang}.pkl'

    if not (tmp_dir / f'{TRN}_{lang}_ids.npy').exists():
        print('Reading the data...')
        toks, lbls = read_clas_data(dataset_dir, dataset, lang)

        # create the vocabulary
        counter = Counter(word for example in toks[TRN] for word in example)
        itos = [word for word, count in counter.most_common(n=max_vocab)]
        itos.insert(0, PAD)
        itos.insert(0, UNK)
        vocab = Vocab(itos)
        stoi = vocab.stoi
        with open(vocab_file, 'wb') as f:
            pickle.dump(vocab, f)

        ids = {}
        for split in [TRN, VAL, TST]:
            ids[split] = np.array([([stoi.get(w, stoi[UNK]) for w in s])
                                   for s in toks[split]])
            np.save(tmp_dir / f'{split}_{lang}_ids.npy', ids[split])
            np.save(tmp_dir / f'{split}_{lang}_lbl.npy', lbls[split])
    else:
        print('Loading the pickled data...')
        ids, lbls = {}, {}
        for split in [TRN, VAL, TST]:
            ids[split] = np.load(tmp_dir / f'{split}_{lang}_ids.npy')
            lbls[split] = np.load(tmp_dir / f'{split}_{lang}_lbl.npy')
        with open(vocab_file, 'rb') as f:
            vocab = pickle.load(f)

    print(f'Train size: {len(ids[TRN])}. Valid size: {len(ids[VAL])}. '
          f'Test size: {len(ids[TST])}.')

    if ds_pct < 1.0:
        print(f"Making the dataset smaller: {ds_pct}")
        for split in [TRN, VAL, TST]:
            ids[split] = ids[split][:int(len(ids[split]) * ds_pct)]

    data_lm = TextLMDataBunch.from_ids(path=tmp_dir, vocab=vocab, train_ids=ids[TRN],
                                       valid_ids=ids[VAL], bs=bs, bptt=bptt)
    # TODO: TextClasDataBunch allows tst_ids as input, but not tst_lbls?
    data_clas = TextClasDataBunch.from_ids(path=tmp_dir, vocab=vocab, train_ids=ids[TRN],
                                           valid_ids=ids[VAL], train_lbls=lbls[TRN],
                                           valid_lbls=lbls[VAL], bs=bs)

    if qrnn:
        emb_sz, nh, nl = 400, 1550, 3
    else:
        emb_sz, nh, nl = 400, 1150, 3
    learn = language_model_learner(data_lm, bptt=bptt, emb_sz=emb_sz, nh=nh, nl=nl,
                                   qrnn=qrnn, pad_token=PAD_TOKEN_ID,
                                   pretrained_fnames=pretrained_fname,
                                   path=model_dir.parent, model_dir=model_dir.name)
    lm_enc_finetuned = f"{lm_name}_{dataset}_enc"
    if fine_tune and not (model_dir / f"{lm_enc_finetuned}.pth").exists():
        print('Fine-tuning the language model...')
        learn.unfreeze()
        learn.fit(2, slice(1e-4, 1e-2))

        # save encoder
        learn.save_encoder(lm_enc_finetuned)

    print("Starting classifier training")
    learn = text_classifier_learner(data_clas, bptt=bptt, pad_token=PAD_TOKEN_ID,
                                    path=model_dir.parent, model_dir=model_dir.name,
                                    qrnn=qrnn, emb_sz=emb_sz, nh=nh, nl=nl)
    learn.load_encoder(lm_enc_finetuned)

    learn.fit_one_cycle(1, 2e-2, moms=(0.8, 0.7), wd=1e-7)

    learn.freeze_to(-2)
    learn.fit_one_cycle(1, slice(1e-2 / (2.6**4), 1e-2), moms=(0.8, 0.7), wd=1e-7)

    learn.freeze_to(-3)
    learn.fit_one_cycle(1, slice(5e-3 / (2.6**4), 5e-3), moms=(0.8, 0.7), wd=1e-7)

    learn.unfreeze()
    learn.fit_one_cycle(2, slice(1e-3 / (2.6**4), 1e-3), moms=(0.8, 0.7), wd=1e-7)

    results['accuracy'] = learn.validate()[1]
    print(f"Saving models at {learn.path / learn.model_dir}")
    learn.save(f'{model_name}_{name}')
    return results
    # (tail of the argument-parsing helper; the parser construction is not shown in this snippet)
    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = get_args()
    data_file = args.data_lm
    batch_size = args.batch_size
    num_cycle = args.num_cycle
    max_lr = args.max_lr
    weight_file = args.load_model

    print("Loading data...")
    data_lm = FT.load_data('./', data_file)
    learner = FT.language_model_learner(data_lm, FT.AWD_LSTM)

    if weight_file is not None:
        if not os.path.isdir(weight_file):
            raise Exception("Invalid weight path")
        else:
            print("Loading weight...")
            learner = learner.load(weight_file)

    print("Start training")
    learner.fit_one_cycle(num_cycle, max_lr)
    learner.save("model")
    learner.save_encoder("encoder")