Example #1
def initialize_learner(
    data: TextLMDataBunch,
    pretrained_encoder: str,
    pretrained_itos: str,
    local_rank: int,
    label: str,
    kind: str,
    gpus: int,
) -> LanguageLearner:
    data.path = Path(".")
    click.echo("Training language model...")
    learn = language_model_learner(
        data,
        TransformerXL,
        pretrained_fnames=[
            "./../" + pretrained_encoder.replace(".pth", ""),
            "./../" + pretrained_itos.replace(".pkl", ""),
        ],
        drop_mult=0.1,
    )

    tboard_path = Path("logs/" + label)
    node_name = "gpu-" + str(local_rank) + "-" + kind
    learn.callback_fns.append(
        partial(LearnerTensorboardWriter,
                base_dir=tboard_path,
                gpus=gpus,
                name=node_name))

    if gpus > 1:
        learn.to_distributed(local_rank)
    return learn
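A minimal sketch of how this helper might be called for a single-GPU run; the databunch and pretrained file names below are placeholders (the function strips the .pth/.pkl extensions itself):

# Hypothetical single-GPU call; `data` is a prepared TextLMDataBunch and the
# pretrained TransformerXL encoder/vocab files live one directory up.
learn = initialize_learner(
    data=data,
    pretrained_encoder="pretrained_encoder.pth",
    pretrained_itos="pretrained_itos.pkl",
    local_rank=0,
    label="lm-finetune",
    kind="train",
    gpus=1,
)
learn.fit_one_cycle(1, 1e-3)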
Example #2
def from_model(model_path: Path, model_name: str) -> LanguageLearner:
    """Loads a trained language model for inference."""
    print("Loading model for inference....")
    data = DataBunch.load_empty(model_path, "data/empty_data")
    learn = language_model_learner(data, TransformerXL, pretrained=False)
    learn.load(model_name)
    learn.freeze()
    learn.model.eval()
    return learn
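A short usage sketch for the learner returned here, assuming fastai v1's LanguageLearner.predict for text generation; the model path and name are placeholders:

# Hypothetical model location; predict() appends n_words generated tokens to the prompt.
learn = from_model(Path("models_dir"), "language_model")
print(learn.predict("The product is", n_words=30, temperature=0.8))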
Example #3
    def _fit_lm(self, df_train, df_val):
        # Language model data
        data_lm = TextLMDataBunch.from_df(train_df=df_train, valid_df=df_val, path="")
        lm_learner = language_model_learner(
            data_lm, self.arch, drop_mult=self.dropout_class
        )
        # train the learner object
        lm_learner.fit_one_cycle(1, self.lr_class)
        # TODO: can we return lm_learner and load via memory so we don't have to save it?
        lm_learner.save_encoder(self.path_lm.name)
        return data_lm
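The encoder saved by _fit_lm is typically reloaded into a downstream text classifier (Example #7 below shows the same pattern end to end). A minimal companion sketch, assuming the same class attributes (self.arch, self.dropout_class, self.lr_class, self.path_lm) and fastai v1's TextClasDataBunch / text_classifier_learner:

    def _fit_classifier(self, df_train, df_val, data_lm):
        # Classifier data that reuses the language model's vocabulary
        data_clas = TextClasDataBunch.from_df(
            train_df=df_train, valid_df=df_val, vocab=data_lm.train_ds.vocab, path=""
        )
        clas_learner = text_classifier_learner(
            data_clas, self.arch, drop_mult=self.dropout_class
        )
        # load the encoder fine-tuned and saved by _fit_lm
        clas_learner.load_encoder(self.path_lm.name)
        clas_learner.fit_one_cycle(1, self.lr_class)
        return clas_learner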
Example #4
    def _create_language_model_learner(self, data_lm, pretrained_model):
        # Hack for this specific model (lm_5_ep_lr2-3_5_stlr), as it was trained on an older version of fastai.
        # For new models you shouldn't override the config.
        config = ftext.awd_lstm_lm_config.copy()
        config['n_hid'] = 1150

        learn_lm = ftext.language_model_learner(
            data_lm,
            ftext.AWD_LSTM,
            config=config,
            pretrained_fnames=pretrained_model,
            drop_mult=0.3)
        return learn_lm
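Example #5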
    def __init__(self, path):
        texts = pd.read_csv(path + '/jokes_extended_vk_anekdot_preproc.csv',
                            index_col=0)
        texts.dropna(inplace=True)
        data = TextList.from_df(texts,
                        processor=[TokenizeProcessor(tokenizer=Tokenizer(lang="xx")),
                                     NumericalizeProcessor(min_freq=2, max_vocab=60000)])\
                                    .split_by_rand_pct(.1)\
                                    .label_for_lm()\
                                    .databunch(bs=64)

        self.learn = language_model_learner(data=data,
                                            arch=AWD_LSTM,
                                            pretrained=None)
        self.learn.load_pretrained(path + '/ulmfit/bestmodel_tune.pth',
                                   path + '/ulmfit/bestmodel_tune_itos.pkl')
Example #6
def train(
    training_config: LMTrainingConfig = LMTrainingConfig(),
    device_options: DeviceOptions = DeviceOptions(),
    tune: bool = False,
    comet: bool = True,
    save_every_epoch: bool = False,
    allow_unks: bool = False,
    return_fastai_learner: bool = False,
    output_path: Optional[str] = None,
    rewrite_output: bool = False
) -> Union[TrainedModel, Tuple[TrainedModel, Learner]]:
    experiment_run = ExperimentRun.with_config(training_config,
                                               device_options=device_options,
                                               comet=comet,
                                               output_path=output_path,
                                               rewrite_output=rewrite_output)
    experiment_run.log_experiment_input()

    if isinstance(training_config.corpus, Corpus):
        prep_corpus: api.PreprocessedCorpus = training_config.prep_function.apply_to_corpus(
            training_config.corpus,
            calc_vocab=True,
            output_path=PATH_TO_PREP_DATASETS)
    elif isinstance(training_config.corpus, PreprocessedCorpus):
        prep_corpus = training_config.corpus
    else:
        raise AssertionError(
            f'Unknown corpus type: {type(training_config.corpus)}')

    vocab = create_vocab_for_lm(prep_corpus)
    experiment_run.log_vocab(vocab)

    device = device_options.get_device_id()

    config = create_custom_config(training_config)
    arch_class = training_config.arch.get_module()
    dropout_multiplier = training_config.arch.drop.multiplier
    training = training_config.training

    if training_config.training.sub_epochs:
        data_bunch: DataBunch = EmptyDataBunch(
            vocab=vocab, path=prep_corpus.path_to_prep_dataset, device=device)
    else:
        data_bunch = create_databunch(prep_corpus.path_to_prep_dataset,
                                      get_all_files(
                                          prep_corpus.path_to_prep_dataset,
                                          None),
                                      vocab,
                                      bs=training_config.bs,
                                      bptt=training_config.bptt,
                                      device=device,
                                      verbose=True,
                                      allow_unks=allow_unks)

    learner = language_model_learner(
        data_bunch,
        arch_class,
        opt_func=training.optimizer.get_callable(),
        drop_mult=dropout_multiplier,
        config=config,
        pretrained=False,
        metrics=[accuracy, mrr, Perplexity()],
        clip=training.gradient_clip,
        alpha=training.activation_regularization.alpha,
        beta=training.activation_regularization.beta,
        path=os.path.dirname(experiment_run.path_to_trained_model),
        model_dir=os.path.basename(experiment_run.path_to_trained_model))
    learner.loss_func = binary_cross_entropy_flat()

    if training_config.training.sub_epochs:
        files_per_epoch = training_config.training.sub_epochs.n_files
        learner.callbacks.append(
            EpochFileLoader(learner,
                            prep_corpus,
                            vocab,
                            bs=training_config.bs,
                            bptt=training_config.bptt,
                            device=device,
                            n_files_per_epoch=files_per_epoch,
                            allow_unks=allow_unks))

    add_callbacks(experiment_run,
                  learner,
                  vocab,
                  tune,
                  save_every_epoch=save_every_epoch)

    load_base_model_if_needed(learner, training_config)

    logger.info(
        f"Starting training... Model will be saved to {experiment_run.perm_path_to_model} "
        f"(Saving config and vocab to {experiment_run.path_to_trained_model} before getting the first trained model)"
    )
    training_config.training.schedule.fit(learner, training.weight_decay)
    if experiment_run.comet_experiment:
        report_experiment_terminated_mormally(experiment_run.comet_experiment)

    model = load_from_path(experiment_run.path_to_trained_model,
                           force_use_cpu=True,
                           device=device_options.non_default_device_to_use)
    return model if not return_fastai_learner else (model, learner)
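A minimal sketch of invoking this training entry point with its default LMTrainingConfig and DeviceOptions; the output path is a placeholder and Comet logging is switched off for a local run:

# Returns a TrainedModel; pass return_fastai_learner=True to also get the fastai Learner.
trained_model = train(comet=False,
                      output_path="output/lm-run",   # placeholder path
                      rewrite_output=True)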
Example #7
SAMPLES_PER_CLASS = 12500

print('loading data')
texts = []
target = []

for class_index, classname in enumerate(CLASS_NAMES):

    for n, line in enumerate(open(DATA_FOLDER+classname+'.txt')):

        texts.append(preprocess_string(line,False))
        target.append(class_index)

        if n > SAMPLES_PER_CLASS:
            break

df = DataFrame({'label':target,'text':texts})
df_train, df_val = train_test_split(df, stratify = df['label'], test_size = 0.4, random_state = 12)

data_lm = TextLMDataBunch.from_df(train_df = df_train, valid_df = df_val, path = "")
data_clas = TextClasDataBunch.from_df(path = "", train_df = df_train, valid_df = df_val, vocab=data_lm.train_ds.vocab, bs=32)

learn = language_model_learner(data_lm, pretrained_model=URLs.WT103, drop_mult=0.7)
learn.fit_one_cycle(1, 1e-2)
learn.save_encoder('ft_enc')

learn = text_classifier_learner(data_clas, drop_mult=0.7)
learn.load_encoder('ft_enc')
learn.fit_one_cycle(1, 1e-2)

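Example #8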
    train_df, valid_df = train_test_split(
        df,
        stratify=df["cuisine"],
        test_size=0.2,
        random_state=1024,
    )

    text_lm = TextLMDataBunch.from_df(
        train_df=train_df,
        valid_df=valid_df,
        path="",
    )
    lm_learner = language_model_learner(
        text_lm,
        arch=AWD_LSTM,
        drop_mult=0.2,
    )

    lm_learner.lr_find()
    lm_learner.recorder.plot(suggestion=True)

    lm_learner.fit_one_cycle(1, lm_learner.recorder.min_grad_lr)

    lm_learner.save_encoder(model)

    text_clas = TextClasDataBunch.from_df(
        train_df=train_df,
        valid_df=valid_df,
        vocab=text_lm.train_ds.vocab,
        path="",
def new_train_clas(data_dir,
                   lang='en',
                   cuda_id=0,
                   pretrain_name='wt103',
                   model_dir='models',
                   qrnn=False,
                   fine_tune=True,
                   max_vocab=30000,
                   bs=20,
                   bptt=70,
                   name='imdb-clas',
                   dataset='imdb',
                   ds_pct=1.0):
    """
    :param data_dir: The path to the `data` directory
    :param lang: the language code
    :param cuda_id: The id of the GPU. Uses GPU 0 by default or no GPU when
                    run on CPU.
    :param pretrain_name: name of the pretrained model
    :param model_dir: The path to the directory where the pretrained model is saved
    :param qrnn: Use a QRNN. Requires installing cupy.
    :param fine_tune: Fine-tune the pretrained language model
    :param max_vocab: The maximum size of the vocabulary.
    :param bs: The batch size.
    :param bptt: The back-propagation-through-time sequence length.
    :param name: The name used for both the model and the vocabulary.
    :param dataset: The dataset used for evaluation. Currently only IMDb and
                    XNLI are implemented. Assumes the dataset is located in the
                    `data` folder and that the folder name matches the dataset name.
    :param ds_pct: Fraction of each split to use (for quick experiments).
    """
    results = {}
    if not torch.cuda.is_available():
        print('CUDA not available. Setting device=-1.')
        cuda_id = -1
    torch.cuda.set_device(cuda_id)

    print(f'Dataset: {dataset}. Language: {lang}.')
    assert dataset in DATASETS, f'Error: {dataset} processing is not implemented.'
    assert (dataset == 'imdb' and lang == 'en') or not dataset == 'imdb',\
        'Error: IMDb is only available in English.'

    data_dir = Path(data_dir)
    assert data_dir.name == 'data',\
        f'Error: Name of data directory should be data, not {data_dir.name}.'
    dataset_dir = data_dir / dataset
    model_dir = Path(model_dir)

    if qrnn:
        print('Using QRNNs...')
    model_name = 'qrnn' if qrnn else 'lstm'
    lm_name = f'{model_name}_{pretrain_name}'
    pretrained_fname = (lm_name, f'itos_{pretrain_name}')

    ensure_paths_exists(data_dir, dataset_dir, model_dir,
                        model_dir / f"{pretrained_fname[0]}.pth",
                        model_dir / f"{pretrained_fname[1]}.pkl")

    tmp_dir = dataset_dir / 'tmp'
    tmp_dir.mkdir(exist_ok=True)
    vocab_file = tmp_dir / f'vocab_{lang}.pkl'

    if not (tmp_dir / f'{TRN}_{lang}_ids.npy').exists():
        print('Reading the data...')
        toks, lbls = read_clas_data(dataset_dir, dataset, lang)

        # create the vocabulary
        counter = Counter(word for example in toks[TRN] for word in example)
        itos = [word for word, count in counter.most_common(n=max_vocab)]
        itos.insert(0, PAD)
        itos.insert(0, UNK)
        vocab = Vocab(itos)
        stoi = vocab.stoi
        with open(vocab_file, 'wb') as f:
            pickle.dump(vocab, f)

        ids = {}
        for split in [TRN, VAL, TST]:
            ids[split] = np.array([([stoi.get(w, stoi[UNK]) for w in s])
                                   for s in toks[split]])
            np.save(tmp_dir / f'{split}_{lang}_ids.npy', ids[split])
            np.save(tmp_dir / f'{split}_{lang}_lbl.npy', lbls[split])
    else:
        print('Loading the pickled data...')
        ids, lbls = {}, {}
        for split in [TRN, VAL, TST]:
            ids[split] = np.load(tmp_dir / f'{split}_{lang}_ids.npy')
            lbls[split] = np.load(tmp_dir / f'{split}_{lang}_lbl.npy')
        with open(vocab_file, 'rb') as f:
            vocab = pickle.load(f)

    print(f'Train size: {len(ids[TRN])}. Valid size: {len(ids[VAL])}. '
          f'Test size: {len(ids[TST])}.')

    if ds_pct < 1.0:
        print(f"Makeing the dataset smaller {ds_pct}")
        for split in [TRN, VAL, TST]:
            ids[split] = ids[split][:int(len(ids[split]) * ds_pct)]

    data_lm = TextLMDataBunch.from_ids(path=tmp_dir,
                                       vocab=vocab,
                                       train_ids=ids[TRN],
                                       valid_ids=ids[VAL],
                                       bs=bs,
                                       bptt=bptt)

    # TODO TextClasDataBunch allows tst_ids as input, but not tst_lbls?
    data_clas = TextClasDataBunch.from_ids(path=tmp_dir,
                                           vocab=vocab,
                                           train_ids=ids[TRN],
                                           valid_ids=ids[VAL],
                                           train_lbls=lbls[TRN],
                                           valid_lbls=lbls[VAL],
                                           bs=bs)

    if qrnn:
        emb_sz, nh, nl = 400, 1550, 3
    else:
        emb_sz, nh, nl = 400, 1150, 3
    learn = language_model_learner(data_lm,
                                   bptt=bptt,
                                   emb_sz=emb_sz,
                                   nh=nh,
                                   nl=nl,
                                   qrnn=qrnn,
                                   pad_token=PAD_TOKEN_ID,
                                   pretrained_fnames=pretrained_fname,
                                   path=model_dir.parent,
                                   model_dir=model_dir.name)
    lm_enc_finetuned = f"{lm_name}_{dataset}_enc"
    if fine_tune and not (model_dir / f"{lm_enc_finetuned}.pth").exists():
        print('Fine-tuning the language model...')
        learn.unfreeze()
        learn.fit(2, slice(1e-4, 1e-2))

        # save encoder
        learn.save_encoder(lm_enc_finetuned)

    print("Starting classifier training")
    learn = text_classifier_learner(data_clas,
                                    bptt=bptt,
                                    pad_token=PAD_TOKEN_ID,
                                    path=model_dir.parent,
                                    model_dir=model_dir.name,
                                    qrnn=qrnn,
                                    emb_sz=emb_sz,
                                    nh=nh,
                                    nl=nl)

    learn.load_encoder(lm_enc_finetuned)

    learn.fit_one_cycle(1, 2e-2, moms=(0.8, 0.7), wd=1e-7)

    learn.freeze_to(-2)
    learn.fit_one_cycle(1,
                        slice(1e-2 / (2.6**4), 1e-2),
                        moms=(0.8, 0.7),
                        wd=1e-7)

    learn.freeze_to(-3)
    learn.fit_one_cycle(1,
                        slice(5e-3 / (2.6**4), 5e-3),
                        moms=(0.8, 0.7),
                        wd=1e-7)

    learn.unfreeze()
    learn.fit_one_cycle(2,
                        slice(1e-3 / (2.6**4), 1e-3),
                        moms=(0.8, 0.7),
                        wd=1e-7)
    results['accuracy'] = learn.validate()[1]
    print(f"Saving models at {learn.path / learn.model_dir}")
    learn.save(f'{model_name}_{name}')
    return results
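A sketch of calling this routine for the IMDb setup it assumes; the `data/imdb` layout and the pretrained `lstm_wt103.pth` / `itos_wt103.pkl` files under `models/` must already be in place:

# Quick smoke test on 10% of the data; returns a dict with the validation accuracy.
results = new_train_clas('data', lang='en', cuda_id=0, pretrain_name='wt103',
                         model_dir='models', fine_tune=True, ds_pct=0.1)
print(results['accuracy'])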
Example #10
    
    args = parser.parse_args()
    return args


if __name__=='__main__':
    args = get_args()
    data_file = args.data_lm
    batch_size = args.batch_size
    num_cycle = args.num_cycle
    max_lr = args.max_lr
    weight_file = args.load_model

    print("Loading data...")
    data_lm = FT.load_data('./', data_file)
    learner = FT.language_model_learner(data_lm, FT.AWD_LSTM)
    
    if weight_file is not None:
        if not os.path.isdir(weight_file):
            raise Exception("Invalid weight path")
        else:
            print("Loading weight...")
            learner = learner.load(weight_file)
        
    print("Start training")
    learner.fit_one_cycle(num_cycle, max_lr)
    learner.save("model")
    learner.save_encoder("encoder")