Example #1
    def load_wiki_data(self, bs=70):
        trn_path = self.dataset_path / f'{self.lang}.wiki.train.tokens'
        val_path = self.dataset_path / f'{self.lang}.wiki.valid.tokens'
        tst_path = self.dataset_path / f'{self.lang}.wiki.test.tokens'
        for path_ in [trn_path, val_path, tst_path]:
            assert path_.exists(), f'Error: {path_} does not exist.'

        args = self.tokenzier_to_fastai_args(
            trn_data_loading_func=self.load_train_text, add_moses=False)
        try:
            data_lm = TextLMDataBunch.load(self.cache_dir,
                                           '.',
                                           lm_type=self.lm_type,
                                           bs=bs)
            print("Tokenized data loaded")
        except FileNotFoundError:
            print("Running tokenization")
            data_lm = TextLMDataBunch.from_df(
                path=self.cache_dir,
                train_df=read_wiki_articles(trn_path),
                valid_df=read_wiki_articles(val_path),
                classes=None,
                lm_type=self.lm_type,
                max_vocab=self.max_vocab,
                bs=bs,
                text_cols='texts',
                **args)
            data_lm.save('.')

        itos = data_lm.vocab.itos
        print('Size of vocabulary:', len(itos))
        print('First 20 words in vocab:', itos[:20])
        return data_lm
Example #2
def save(
    data: TextLMDataBunch,
    learn: LanguageLearner,
    label: str,
    suffix: str,
    accuracy: int,
):
    f = open("models/" + label + "_accuracy.metric", "w")
    f.write(str(accuracy))
    f.close()
    click.echo("Saving...")
    learn.save("model_" + label + "_" + suffix)
    learn.save_encoder("encoder_" + label + "_" + suffix)
    click.echo("Exporting...")
    data.export("models/" + label + "_empty_data")
    learn.export("models/learner_" + label + "_" + suffix + ".pkl")
Example #3
def initialize_learner(
    data: TextLMDataBunch,
    pretrained_encoder: str,
    pretrained_itos: str,
    local_rank: int,
    label: str,
    kind: str,
    gpus: int,
) -> LanguageLearner:
    data.path = Path(".")
    click.echo("Training language model...")
    learn = language_model_learner(
        data,
        TransformerXL,
        pretrained_fnames=[
            "./../" + pretrained_encoder.replace(".pth", ""),
            "./../" + pretrained_itos.replace(".pkl", ""),
        ],
        drop_mult=0.1,
    )

    tboard_path = Path("logs/" + label)
    node_name = "gpu-" + str(local_rank) + "-" + kind
    learn.callback_fns.append(
        partial(LearnerTensorboardWriter,
                base_dir=tboard_path,
                gpus=gpus,
                name=node_name))

    if gpus > 1:
        learn.to_distributed(local_rank)
    return learn
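
Note that `learn.to_distributed(local_rank)` only attaches fastai's distributed callback; the script still has to be launched once per GPU (e.g. with `python -m torch.distributed.launch --nproc_per_node=<gpus> script.py`), and each process reads its own rank from the command line. A minimal sketch of the argument parsing this assumes (the script wiring itself is hypothetical):

from argparse import ArgumentParser

# Each process started by `python -m torch.distributed.launch` receives its rank
# via the --local_rank argument, which is what initialize_learner() expects.
parser = ArgumentParser()
parser.add_argument("--local_rank", type=int, default=0)
args = parser.parse_args()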
Example #4
    def _fit_lm(self, df_train, df_val):
        # Language model data
        data_lm = TextLMDataBunch.from_df(train_df=df_train, valid_df=df_val, path="")
        lm_learner = language_model_learner(
            data_lm, self.arch, drop_mult=self.dropout_class
        )
        # train the learner object
        lm_learner.fit_one_cycle(1, self.lr_class)
        # TODO: can we return lm_learner and load via memory so we don't have to save it?
        lm_learner.save_encoder(self.path_lm.name)
        return data_lm
Example #5
def get_datasets(dataset, dataset_dir, bptt, bs, lang, max_vocab, ds_pct, lm_type):
    tmp_dir = dataset_dir / 'tmp'
    tmp_dir.mkdir(exist_ok=True)
    vocab_file = tmp_dir / f'vocab_{lang}.pkl'
    if not (tmp_dir / f'{TRN}_{lang}_ids.npy').exists():
        print('Reading the data...')
        toks, lbls = read_clas_data(dataset_dir, dataset, lang)
        # create the vocabulary
        counter = Counter(word for example in toks[TRN]+toks[TST]+toks[VAL] for word in example)
        itos = [word for word, count in counter.most_common(n=max_vocab)]
        itos.insert(0, PAD)
        itos.insert(0, UNK)
        vocab = Vocab(itos)
        stoi = vocab.stoi
        with open(vocab_file, 'wb') as f:
            pickle.dump(vocab, f)

        ids = {}
        for split in [TRN, VAL, TST]:
            ids[split] = np.array([([stoi.get(w, stoi[UNK]) for w in s])
                                   for s in toks[split]])
            np.save(tmp_dir / f'{split}_{lang}_ids.npy', ids[split])
            np.save(tmp_dir / f'{split}_{lang}_lbl.npy', lbls[split])
    else:
        print('Loading the pickled data...')
        ids, lbls = {}, {}
        for split in [TRN, VAL, TST]:
            ids[split] = np.load(tmp_dir / f'{split}_{lang}_ids.npy')
            lbls[split] = np.load(tmp_dir / f'{split}_{lang}_lbl.npy')
        with open(vocab_file, 'rb') as f:
            vocab = pickle.load(f)
    print(f'Train size: {len(ids[TRN])}. Valid size: {len(ids[VAL])}. '
          f'Test size: {len(ids[TST])}.')
    if ds_pct < 1.0:
        print(f"Reducing the dataset to {ds_pct} of its original size")
        for split in [TRN, VAL, TST]:
            ids[split] = ids[split][:int(len(ids[split]) * ds_pct)]
            lbls[split] = lbls[split][:int(len(lbls[split]) * ds_pct)]
    for split in [TRN, VAL, TST]:
        ids[split] = np.array([np.array(e, dtype=int) for e in ids[split]])
        lbls[split] = np.array([np.array(e, dtype=int) for e in lbls[split]])
    data_lm = TextLMDataBunch.from_ids(path=tmp_dir, vocab=vocab, train_ids=np.concatenate([ids[TRN],ids[TST]]),
                                       valid_ids=ids[VAL], bs=bs, bptt=bptt, lm_type=lm_type)
    #  TODO TextClasDataBunch allows tst_ids as input, but not tst_lbls?
    data_clas = TextClasDataBunch.from_ids(
        path=tmp_dir, vocab=vocab, train_ids=ids[TRN], valid_ids=ids[VAL],
        train_lbls=lbls[TRN], valid_lbls=lbls[VAL], bs=bs, classes={l:l for l in lbls[TRN]})

    print(f"Sizes of train_ds {len(data_clas.train_ds)}, valid_ds {len(data_clas.valid_ds)}")
    return data_clas, data_lm
Example #6
def prepare_clas_dataset(input_path,
                         output_dir=None,
                         valid_split=0.2,
                         tokenizer_lang="xx",
                         min_freq=2,
                         seed=42):
    """
    Reads a CSV file with texts and labels, splits it into training and validation sets,
    tokenizes texts and saves datasets for fine-tuning and for classification.

    Attributes:
        input_path (str): Path to CSV file with texts in the first and labels in second column.
        output_dir (str): Folder where to store the processed dataset.
        valid_split (float): A fraction of data used for validation.
        tokenizer_lang (str): Language setting for tokenizer.
        min_freq (int): Minimal number of occurrences of a word to be considered for adding to
            the vocabulary.
        seed (int): Random seed that determines the training-validation split.
    """
    input_path = Path(input_path)
    output_dir = Path(output_dir or input_path.parent)
    output_dir.mkdir(parents=True, exist_ok=True)

    train_df, valid_df = csv_to_train_valid_df(input_path, valid_split, seed)

    data_finetune_lm = TextLMDataBunch.from_df(
        output_dir,
        train_df,
        valid_df,
        tokenizer=Tokenizer(lang=tokenizer_lang),
        text_cols=0,
        min_freq=min_freq)
    data_clas = TextClasDataBunch.from_df(
        output_dir,
        train_df,
        valid_df,
        tokenizer=Tokenizer(lang=tokenizer_lang),
        text_cols=0,
        label_cols=1,
        vocab=data_finetune_lm.train_ds.vocab,
        bs=32,
        min_freq=min_freq)

    data_finetune_lm.save("data_finetune_lm.pkl")
    data_clas.save("data_clas.pkl")
Example #7
def main():
    train_ds = TextDataset.from_folder(IMDB_PATH, name='train', shuffle=True)
    valid_ds = TextDataset.from_folder(IMDB_PATH, name='test')
    lm_data = [train_ds, valid_ds]
    lm_bunch = TextLMDataBunch.create(lm_data, path=LM_PATH)

    learner = RNNLearner.language_model(lm_bunch)

    n = sum(len(ds) for ds in lm_data)
    num_epochs, phases = create_phases(3, n)

    callbacks = [
        EarlyStopping(learner, patience=2),
        SaveModel(learner),
        GeneralScheduler(learner, phases)
    ]

    learner.fit(num_epochs, callbacks=callbacks)
Example #8
def evaluate_lm(data_path,
                model_dir,
                tokenizer_lang="xx",
                evaluate_custom_perplexity=False):
    """
    Evaluate metrics of a trained language model using any dataset of texts from CSV file.

    Attributes:
        data_path (str): Path to CSV file with texts in the first column.
        model_dir (str): Directory with a trained language model.
        tokenizer_lang (str): Language setting for tokenizer.
        evaluate_custom_perplexity (bool): The perplexity is estimated as e^(avg. loss),
            but the average loss changes slightly with batch size. To get the perplexity computed
            in a slower but controlled fashion, set `evaluate_custom_perplexity` to True. The
            discrepancy between the two is empirically approximately 1%.
    """
    model_dir = Path(model_dir)
    with open(model_dir / "lm_itos.pkl", "rb") as f:
        itos = pickle.load(f)

    data_df = pd.read_csv(data_path, header=None)
    data = TextLMDataBunch.from_df("",
                                   data_df,
                                   data_df,
                                   text_cols=0,
                                   tokenizer=Tokenizer(lang=tokenizer_lang),
                                   vocab=Vocab(itos))

    with open(model_dir / "model_hparams.json", "r") as model_hparams_file:
        model_hparams = json.load(model_hparams_file)
    learner = lm_learner(data,
                         AWD_LSTM,
                         model_dir,
                         pretrained=True,
                         config=model_hparams)

    loss, acc = learner.validate()
    print("Loss: {}, Perplexity: {}, Accuracy: {}".format(
        loss, exp(loss), acc))
    if evaluate_custom_perplexity:
        print(
            "Custom perplexity: {}, Fraction OOV: {}, OOV perplexity contribution: {}"
            .format(*evaluate_perplexity(learner, data.valid_ds.x)))
Example #9
def prepare_lm_dataset(input_path,
                       output_dir=None,
                       valid_split=0.2,
                       tokenizer_lang="xx",
                       min_freq=2,
                       seed=42):
    """
    Reads a CSV file with texts for training a language model, splits it into training and
    validation sets, tokenizes the texts and saves the dataset.

    Attributes:
        input_path (str): Path to CSV file where there are texts in the first column.
        output_dir (str): Folder where to store the processed dataset.
        valid_split (float): A fraction of data used for validation.
        tokenizer_lang (str): Language setting for tokenizer.
        min_freq (int): Minimal number of occurrences of a word to be considered for adding to
            the vocabulary.
        seed (int): Random seed that determines the training-validation split.
    """
    input_path = Path(input_path)
    output_dir = Path(output_dir or input_path.parent)
    output_dir.mkdir(parents=True, exist_ok=True)

    train_df, valid_df = csv_to_train_valid_df(input_path, valid_split, seed)

    data_lm = TextLMDataBunch.from_df(output_dir,
                                      train_df,
                                      valid_df,
                                      text_cols=0,
                                      tokenizer=Tokenizer(lang=tokenizer_lang),
                                      min_freq=min_freq)
    data_lm.save("data_lm.pkl")

    with open(output_dir / "data_lm_tokenized_train.txt", "w") as f:
        f.write("\n".join(map(str, list(data_lm.train_ds.x))))
    with open(output_dir / "data_lm_tokenized_valid.txt", "w") as f:
        f.write("\n".join(map(str, list(data_lm.valid_ds.x))))
Example #10
    def pre_execution_hook(self, mode=ExecutionModeKeys.TEST):
        self.data_lm_name = "data_lm.pkl"
        self.data_class_name = "data_class_name"
        self.fwd_enc_name = "fwd_enc"
        self.bwd_enc_name = "bwd_enc"
        self.fwd_class_name = 'fwd_clas'
        self.bwd_class_name = 'bwd_clas'

        # to make sure the outputs are also logged
        fastprogress.fastprogress.WRITER_FN = self._get_master_bar_write_fn()

        data_lm_path = os.path.join(self.experiment_dir, self.data_lm_name)
        if not os.path.exists(os.path.dirname(data_lm_path)):
            os.makedirs(os.path.dirname(data_lm_path))
        if not os.path.exists(data_lm_path):
            data_lm = TextLMDataBunch.from_df(path=self.experiment_dir,
                                              train_df=self.dataloader.get_train_input(),
                                              valid_df=self.dataloader.get_test_input(),
                                              text_cols='utterance',
                                              bs=BATCH_SIZE)
            data_lm.save(self.data_lm_name)
        self.data_lm = load_data(self.experiment_dir, self.data_lm_name, bs=BATCH_SIZE, bptt=BPTT)
        self.data_bwd = load_data(self.experiment_dir, self.data_lm_name, bs=BATCH_SIZE,
                                  bptt=BPTT, backwards=True)

        data_class_path = os.path.join(self.experiment_dir, self.data_class_name)
        if not os.path.exists(data_class_path):
            data_class = TextDataBunch.from_df(path=self.experiment_dir,
                                               train_df=self.dataloader.get_train_input(),
                                               valid_df=self.dataloader.get_test_input(),
                                               text_cols='utterance',
                                               label_cols='functions',
                                               vocab=self.data_lm.train_ds.vocab,
                                               bs=BATCH_SIZE)
            data_class.save(self.data_class_name)
        self.data_class = load_data(self.experiment_dir, self.data_class_name, bs=BATCH_SIZE)
        self.data_class_bwd = load_data(self.experiment_dir, self.data_class_name, bs=BATCH_SIZE, backwards=True)
Example #11
    def load_cls_data_imdb(self,
                           bs,
                           force=False,
                           use_test_for_validation=False):
        trn_df = pd.read_csv(self.dataset_path / 'train.csv', header=None)
        tst_df = pd.read_csv(self.dataset_path / 'test.csv', header=None)
        unsp_df = pd.read_csv(self.dataset_path / 'unsup.csv', header=None)

        lm_trn_df = pd.concat([unsp_df, trn_df, tst_df])
        val_len = max(int(len(lm_trn_df) * 0.1), 2)
        lm_val_df = lm_trn_df[:val_len]
        lm_trn_df = lm_trn_df[val_len:]

        if use_test_for_validation:
            val_df = tst_df
            cls_cache = 'notst'
        else:
            val_len = max(int(len(trn_df) * 0.1), 2)
            trn_len = len(trn_df) - val_len
            trn_df, val_df = trn_df[:trn_len], trn_df[trn_len:]
            cls_cache = '.'

        if self.tokenizer is Tokenizers.SUBWORD:
            args = get_sentencepiece(self.dataset_path,
                                     self.dataset_path / 'train.csv',
                                     self.name,
                                     vocab_size=self.max_vocab,
                                     pre_rules=[],
                                     post_rules=[])
        elif self.tokenizer is Tokenizers.MOSES:
            args = dict(tokenizer=Tokenizer(tok_func=MosesTokenizerFunc,
                                            lang='en',
                                            pre_rules=[],
                                            post_rules=[]))
        elif self.tokenizer is Tokenizers.MOSES_FA:
            args = dict(
                tokenizer=Tokenizer(tok_func=MosesTokenizerFunc,
                                    lang='en'))  # use default pre/post rules
        elif self.tokenizer is Tokenizers.FASTAI:
            args = dict()
        else:
            raise ValueError(
                f"self.tokenizer has wrong value {self.tokenizer}, Allowed values are taken from {Tokenizers}"
            )

        try:
            if force: raise FileNotFoundError("Forcing reloading of caches")
            data_lm = TextLMDataBunch.load(self.cache_dir,
                                           'lm',
                                           lm_type=self.lm_type,
                                           bs=bs)
            print(
                f"Tokenized data loaded, lm.trn {len(data_lm.train_ds)}, lm.val {len(data_lm.valid_ds)}"
            )
        except FileNotFoundError:
            print(f"Running tokenization...")
            data_lm = TextLMDataBunch.from_df(path=self.cache_dir,
                                              train_df=lm_trn_df,
                                              valid_df=lm_val_df,
                                              max_vocab=self.max_vocab,
                                              bs=bs,
                                              lm_type=self.lm_type,
                                              **args)
            print(
                f"Saving tokenized: lm.trn {len(data_lm.train_ds)}, lm.val {len(data_lm.valid_ds)}"
            )
            data_lm.save('lm')

        try:
            if force: raise FileNotFoundError("Forcing reloading of caches")
            data_cls = TextClasDataBunch.load(self.cache_dir, cls_cache, bs=bs)
            print(
                f"Tokenized data loaded, cls.trn {len(data_cls.train_ds)}, cls.val {len(data_cls.valid_ds)}"
            )
        except FileNotFoundError:
            args['vocab'] = data_lm.vocab  # make sure we use the same vocab for classification
            print(f"Running tokenization...")
            data_cls = TextClasDataBunch.from_df(path=self.cache_dir,
                                                 train_df=trn_df,
                                                 valid_df=val_df,
                                                 test_df=tst_df,
                                                 max_vocab=self.max_vocab,
                                                 bs=bs,
                                                 **args)
            print(
                f"Saving tokenized: cls.trn {len(data_cls.train_ds)}, cls.val {len(data_cls.valid_ds)}"
            )
            data_cls.save(cls_cache)
        print('Size of vocabulary:', len(data_lm.vocab.itos))
        print('First 20 words in vocab:', data_lm.vocab.itos[:20])
        return data_cls, data_lm
    def load_cls_data(self,
                      bs,
                      force=False,
                      use_test_for_validation=False,
                      **kwargs):
        args = self.tokenzier_to_fastai_args(
            trn_data_loading_func=lambda: trn_df[1], add_moses=True)
        src_path = self.dataset_path
        csv_name = self.csv_name
        tgt_paths = [Path(tgt_path) for tgt_path in self.target_paths]
        mixed_csv = pd.read_csv(src_path / csv_name, header=None)
        for tgt_path in tgt_paths:
            mixed_csv = pd.concat(
                [mixed_csv,
                 pd.read_csv(tgt_path / csv_name, header=None)])

        xcvs_name = ('x_' + csv_name)
        mixed_csv.to_csv(src_path / xcvs_name, header=None, index=False)

        try:
            if force: raise FileNotFoundError("Forcing reloading of caches")
            data_lm = TextLMDataBunch.load(src_path,
                                           'xlm',
                                           lm_type=self.lm_type,
                                           bs=bs)
            print(
                f"Tokenized data loaded, xlm.trn {len(data_lm.train_ds)}, xlm.val {len(data_lm.valid_ds)}"
            )
        except FileNotFoundError:
            print(f"Running tokenization...")
            data_lm = TextLMDataBunch.from_csv(path=src_path,
                                               csv_name=xcvs_name,
                                               bs=bs,
                                               lm_type=self.lm_type,
                                               **kwargs,
                                               **args)
            print(
                f"Saving tokenized: xlm.trn {len(data_lm.train_ds)}, xlm.val {len(data_lm.valid_ds)}"
            )
            data_lm.save('xlm')

        try:
            if force: raise FileNotFoundError("Forcing reloading of caches")
            data_cls = TextClasDataBunch.load(src_path, 'cls', bs=bs)
            print(
                f"Tokenized data loaded, cls.trn {len(data_cls.train_ds)}, cls.val {len(data_cls.valid_ds)}"
            )
        except FileNotFoundError:
            args['vocab'] = data_lm.vocab  # make sure we use the same vocab for classification
            print(f"Running tokenization...")
            data_cls = TextClasDataBunch.from_csv(path=src_path,
                                                  csv_name=csv_name,
                                                  bs=bs,
                                                  **kwargs,
                                                  **args)

            print(
                f"Saving tokenized: cls.trn {len(data_cls.train_ds)}, cls.val {len(data_cls.valid_ds)}"
            )
            data_cls.save('cls')

        print('Size of vocabulary:', len(data_lm.vocab.itos))
        print('First 20 words in vocab:', data_lm.vocab.itos[:20])
        return data_cls, data_lm
Example #13
def main(input_file, output_dir, max_size=0):
    crate_dir(output_dir)
    count = 0
    blank_limit = 1000
    blank_count = 0
    cpus = multiprocessing.cpu_count() - 1
    p = multiprocessing.Pool(cpus)

    with open(input_file, 'r', encoding='utf-8') as input_text, open(
            os.path.join(output_dir,
                         'train.txt'), 'w', encoding='utf-8') as output_train:
        count = 0
        for text in p.imap(preprocessing, input_text):
            text_size = len(text.split(' '))

            count += 1
            print(count)

            if text_size < 2:
                continue

            if max_size:
                if text_size > max_size:
                    text = ' '.join(text.split(' ')[:max_size])

            blank_count = 0
            output_train.write(text + '\n')
        # pickle.dump(words_dict, open(os.path.join(output_dir, 'data.dict'), 'wb'))

    print('\033[1;34m', 'Creating the DataBunch', '\033[0;0m')
    phrases = []

    with open(os.path.join(output_dir, 'train.txt'), 'r',
              encoding='utf-8') as txt:
        phrases = [line.replace('\n', '').split(' ') for line in txt]

    freq = collections.Counter([w for s in phrases for w in s])

    max_vocab = 30000
    min_freq = 5

    # getting rid of the rare words
    itos = [o for o, c in freq.most_common(max_vocab) if c > min_freq]
    itos.insert(0, '_pad_')
    itos.insert(0, '_unk_')  # itos is the list of all the strings in the vocab

    stoi = collections.defaultdict(lambda: 0, {v: k for k, v in enumerate(itos)})
    max_data = int(len(phrases) * 0.9)
    # creating an index representation for our train and validation datasets

    trn_lm = np.array([[stoi[o] for o in p] for p in phrases])
    data_lm = TextLMDataBunch.from_ids(output_dir,
                                       transform.Vocab(itos),
                                       train_ids=trn_lm[:max_data],
                                       valid_ids=trn_lm[max_data:])

    #np.save(output_dir, trn_lm)
    #pickle.dump(itos, open(os.path.join(output_dir, 'itos.pkl'), 'wb'))
    #pickle.dump(dict(stoi), open(os.path.join(output_dir, 'stoi.pkl'), 'wb'))
    data_lm.save('data_save.pkl')
Example #14
SAMPLES_PER_CLASS = 12500

print('loading data')
texts = []
target = []

for class_index, classname in enumerate(CLASS_NAMES):

    for n, line in enumerate(open(DATA_FOLDER+classname+'.txt')):

        texts.append(preprocess_string(line,False))
        target.append(class_index)

        if n > SAMPLES_PER_CLASS:
            break

df = DataFrame({'label':target,'text':texts})
df_train, df_val = train_test_split(df, stratify = df['label'], test_size = 0.4, random_state = 12)

data_lm = TextLMDataBunch.from_df(train_df = df_train, valid_df = df_val, path = "")
data_clas = TextClasDataBunch.from_df(path = "", train_df = df_train, valid_df = df_val, vocab=data_lm.train_ds.vocab, bs=32)

learn = language_model_learner(data_lm, pretrained_model=URLs.WT103, drop_mult=0.7)
learn.fit_one_cycle(1, 1e-2)
learn.save_encoder('ft_enc')

learn = text_classifier_learner(data_clas, drop_mult=0.7)
learn.load_encoder('ft_enc')
learn.fit_one_cycle(1, 1e-2)

    ingredients = [" ".join(item["ingredients"]) for item in train_json]
    cuisine = [item["cuisine"] for item in train_json]

    df = pd.DataFrame({"cuisine": cuisine, "ingredients": ingredients})

    train_df, valid_df = train_test_split(
        df,
        stratify=df["cuisine"],
        test_size=0.2,
        random_state=1024,
    )

    text_lm = TextLMDataBunch.from_df(
        train_df=train_df,
        valid_df=valid_df,
        path="",
    )
    lm_learner = language_model_learner(
        text_lm,
        arch=AWD_LSTM,
        drop_mult=0.2,
    )

    lm_learner.lr_find()
    lm_learner.recorder.plot(suggestion=True)

    lm_learner.fit_one_cycle(1, lm_learner.recorder.min_grad_lr)

    lm_learner.save_encoder(model)
Example #16
    def load_cls_data_old_for_xnli(self, bs):
        tmp_dir = self.cache_dir
        tmp_dir.mkdir(exist_ok=True)
        vocab_file = tmp_dir / f'vocab_{self.lang}.pkl'
        if not (tmp_dir / f'{TRN}_{self.lang}_ids.npy').exists():
            print('Reading the data...')
            toks, lbls = read_clas_data(self.dataset_dir,
                                        self.dataset_dir.name, self.lang)
            # create the vocabulary
            counter = Counter(word
                              for example in toks[TRN] + toks[TST] + toks[VAL]
                              for word in example)
            itos = [
                word for word, count in counter.most_common(n=self.max_vocab)
            ]
            itos.insert(0, PAD)
            itos.insert(0, UNK)
            vocab = Vocab(itos)
            stoi = vocab.stoi
            with open(vocab_file, 'wb') as f:
                pickle.dump(vocab, f)
            ids = {}
            for split in [TRN, VAL, TST]:
                ids[split] = np.array([([stoi.get(w, stoi[UNK]) for w in s])
                                       for s in toks[split]])
                np.save(tmp_dir / f'{split}_{self.lang}_ids.npy', ids[split])
                np.save(tmp_dir / f'{split}_{self.lang}_lbl.npy', lbls[split])
        else:
            print('Loading the pickled data...')
            ids, lbls = {}, {}
            for split in [TRN, VAL, TST]:
                ids[split] = np.load(tmp_dir / f'{split}_{self.lang}_ids.npy')
                lbls[split] = np.load(tmp_dir / f'{split}_{self.lang}_lbl.npy')
            with open(vocab_file, 'rb') as f:
                vocab = pickle.load(f)
        print(f'Train size: {len(ids[TRN])}. Valid size: {len(ids[VAL])}. '
              f'Test size: {len(ids[TST])}.')
        for split in [TRN, VAL, TST]:
            ids[split] = np.array(
                [np.array(e, dtype=int) for e in ids[split]])
            lbls[split] = np.array(
                [np.array(e, dtype=int) for e in lbls[split]])
        data_lm = TextLMDataBunch.from_ids(path=tmp_dir,
                                           vocab=vocab,
                                           train_ids=np.concatenate(
                                               [ids[TRN], ids[TST]]),
                                           valid_ids=ids[VAL],
                                           bs=bs,
                                           bptt=self.bptt,
                                           lm_type=self.lm_type)
        #  TODO TextClasDataBunch allows tst_ids as input, but not tst_lbls?
        data_clas = TextClasDataBunch.from_ids(
            path=tmp_dir,
            vocab=vocab,
            train_ids=ids[TRN],
            valid_ids=ids[VAL],
            train_lbls=lbls[TRN],
            valid_lbls=lbls[VAL],
            bs=bs,
            classes={l: l
                     for l in lbls[TRN]})

        print(
            f"Sizes of train_ds {len(data_clas.train_ds)}, valid_ds {len(data_clas.valid_ds)}"
        )
        return data_clas, data_lm
Example #17
    def df_to_emb(self, dataframe: pd.DataFrame, bs=100) -> np.ndarray:
        """
        Retrieve document embeddings for a dataframe with the columns `title` and `body`.
        Uses batching for efficient computation, which is useful when you have many documents
        to retrieve embeddings for.

        Parameters
        ----------
        dataframe: pandas.DataFrame
            Dataframe with columns `title` and `body`, which represent the title and body of a
            GitHub issue.
        bs: int
            Batch size for inference. Set this according to your available GPU memory;
            the default is 100, and larger values (around 200) were stable on an NVIDIA Tesla V100.

        Returns
        -------
        numpy.ndarray
            An array of shape (number of dataframe rows, 2400).
            This numpy array represents the latent features of the GitHub issues.

        Example
        -------
        >>> import pandas as pd
        >>> wrapper = InferenceWrapper(model_path='/path/to/model',
                                   model_file_name='model.pkl')
        # load 200 sample GitHub issues
        >>> testdf = pd.read_csv(f'https://bit.ly/2GDY5NY').head(200)
        >>> embeddings = wrapper.df_to_emb(testdf)

        >>> embeddings.shape
        (200, 2400)
        """
        new_df = self.process_df(dataframe)
        # to get the benefit of batching similar length sequences together, have a minimum of 20 batches
        bs = min(bs, (len(new_df) // 20) + 1)

        # use the machinery of the data block to numericalize text in parallel
        data_lm = lmdb.from_df(
            path=self.path,
            train_df=new_df.head(),  # train_df gets sample data only
            valid_df=new_df,
            text_cols='text',
            tokenizer=self.model_tokenizer,
            vocab=self.vocab)

        # extract numericalized arrays and convert to pytorch
        docs = data_lm.valid_dl.x.items
        lengths = []
        numericalized_docs = []
        for arr in docs:
            numericalized_docs.append(
                tensor(arr).cuda())  # convert to torch.Tensor
            lengths.append(arr.shape[0])

        # sort the data by sequence length and assemble batches
        length_arr = np.array(lengths)
        len_mask = length_arr.argsort()
        len_mask_reversed = len_mask.argsort()
        batched_features = list(
            chunked([numericalized_docs[i] for i in len_mask], bs))
        batched_lengths = list(chunked(length_arr[len_mask], bs))

        # perform model inference
        hidden_states_batched = []
        pooled_states = []
        for i, b in tqdm(enumerate(batched_features), desc="Model inference:"):
            # pad the batch to the same length
            bp = pad_sequence(b, batch_first=True, padding_value=self.pad_idx)
            # perform inference
            hidden_states = self._forward_pass(bp)
            empty_cache()
            # fetch the summary of the hidden states as the embedding
            pooled_states.append(
                self.batch_seq_pool(hidden_states, batched_lengths[i]))

        # restore the original order of the data by unsorting
        pooled_states = cat(pooled_states)[len_mask_reversed, :]
        assert pooled_states.shape[0] == length_arr.shape[0] == len(dataframe)

        return pooled_states
Example #18
def new_train_clas(data_dir,
                   lang='en',
                   cuda_id=0,
                   pretrain_name='wt103',
                   model_dir='models',
                   qrnn=False,
                   fine_tune=True,
                   max_vocab=30000,
                   bs=20,
                   bptt=70,
                   name='imdb-clas',
                   dataset='imdb',
                   ds_pct=1.0):
    """
    :param data_dir: The path to the `data` directory
    :param lang: the language unicode
    :param cuda_id: The id of the GPU. Uses GPU 0 by default or no GPU when
                    run on CPU.
    :param pretrain_name: name of the pretrained model
    :param model_dir: The path to the directory where the pretrained model is saved
    :param qrnn: Use a QRNN. Requires installing cupy.
    :param fine_tune: Fine-tune the pretrained language model
    :param max_vocab: The maximum size of the vocabulary.
    :param bs: The batch size.
    :param bptt: The back-propagation-through-time sequence length.
    :param name: The name used for both the model and the vocabulary.
    :param dataset: The dataset used for evaluation. Currently only IMDb and
                    XNLI are implemented. Assumes dataset is located in `data`
                    folder and that name of folder is the same as dataset name.
    """
    results = {}
    if not torch.cuda.is_available():
        print('CUDA not available. Setting device=-1.')
        cuda_id = -1
    torch.cuda.set_device(cuda_id)

    print(f'Dataset: {dataset}. Language: {lang}.')
    assert dataset in DATASETS, f'Error: {dataset} processing is not implemented.'
    assert (dataset == 'imdb' and lang == 'en') or not dataset == 'imdb',\
        'Error: IMDb is only available in English.'

    data_dir = Path(data_dir)
    assert data_dir.name == 'data',\
        f'Error: Name of data directory should be data, not {data_dir.name}.'
    dataset_dir = data_dir / dataset
    model_dir = Path(model_dir)

    if qrnn:
        print('Using QRNNs...')
    model_name = 'qrnn' if qrnn else 'lstm'
    lm_name = f'{model_name}_{pretrain_name}'
    pretrained_fname = (lm_name, f'itos_{pretrain_name}')

    ensure_paths_exists(data_dir, dataset_dir, model_dir,
                        model_dir / f"{pretrained_fname[0]}.pth",
                        model_dir / f"{pretrained_fname[1]}.pkl")

    tmp_dir = dataset_dir / 'tmp'
    tmp_dir.mkdir(exist_ok=True)
    vocab_file = tmp_dir / f'vocab_{lang}.pkl'

    if not (tmp_dir / f'{TRN}_{lang}_ids.npy').exists():
        print('Reading the data...')
        toks, lbls = read_clas_data(dataset_dir, dataset, lang)

        # create the vocabulary
        counter = Counter(word for example in toks[TRN] for word in example)
        itos = [word for word, count in counter.most_common(n=max_vocab)]
        itos.insert(0, PAD)
        itos.insert(0, UNK)
        vocab = Vocab(itos)
        stoi = vocab.stoi
        with open(vocab_file, 'wb') as f:
            pickle.dump(vocab, f)

        ids = {}
        for split in [TRN, VAL, TST]:
            ids[split] = np.array([([stoi.get(w, stoi[UNK]) for w in s])
                                   for s in toks[split]])
            np.save(tmp_dir / f'{split}_{lang}_ids.npy', ids[split])
            np.save(tmp_dir / f'{split}_{lang}_lbl.npy', lbls[split])
    else:
        print('Loading the pickled data...')
        ids, lbls = {}, {}
        for split in [TRN, VAL, TST]:
            ids[split] = np.load(tmp_dir / f'{split}_{lang}_ids.npy')
            lbls[split] = np.load(tmp_dir / f'{split}_{lang}_lbl.npy')
        with open(vocab_file, 'rb') as f:
            vocab = pickle.load(f)

    print(f'Train size: {len(ids[TRN])}. Valid size: {len(ids[VAL])}. '
          f'Test size: {len(ids[TST])}.')

    if ds_pct < 1.0:
        print(f"Makeing the dataset smaller {ds_pct}")
        for split in [TRN, VAL, TST]:
            ids[split] = ids[split][:int(len(ids[split]) * ds_pct)]

    data_lm = TextLMDataBunch.from_ids(path=tmp_dir,
                                       vocab=vocab,
                                       train_ids=ids[TRN],
                                       valid_ids=ids[VAL],
                                       bs=bs,
                                       bptt=bptt)

    # TODO TextClasDataBunch allows tst_ids as input, but not tst_lbls?
    data_clas = TextClasDataBunch.from_ids(path=tmp_dir,
                                           vocab=vocab,
                                           train_ids=ids[TRN],
                                           valid_ids=ids[VAL],
                                           train_lbls=lbls[TRN],
                                           valid_lbls=lbls[VAL],
                                           bs=bs)

    if qrnn:
        emb_sz, nh, nl = 400, 1550, 3
    else:
        emb_sz, nh, nl = 400, 1150, 3
    learn = language_model_learner(data_lm,
                                   bptt=bptt,
                                   emb_sz=emb_sz,
                                   nh=nh,
                                   nl=nl,
                                   qrnn=qrnn,
                                   pad_token=PAD_TOKEN_ID,
                                   pretrained_fnames=pretrained_fname,
                                   path=model_dir.parent,
                                   model_dir=model_dir.name)
    lm_enc_finetuned = f"{lm_name}_{dataset}_enc"
    if fine_tune and not (model_dir / f"{lm_enc_finetuned}.pth").exists():
        print('Fine-tuning the language model...')
        learn.unfreeze()
        learn.fit(2, slice(1e-4, 1e-2))

        # save encoder
        learn.save_encoder(lm_enc_finetuned)

    print("Starting classifier training")
    learn = text_classifier_learner(data_clas,
                                    bptt=bptt,
                                    pad_token=PAD_TOKEN_ID,
                                    path=model_dir.parent,
                                    model_dir=model_dir.name,
                                    qrnn=qrnn,
                                    emb_sz=emb_sz,
                                    nh=nh,
                                    nl=nl)

    learn.load_encoder(lm_enc_finetuned)

    learn.fit_one_cycle(1, 2e-2, moms=(0.8, 0.7), wd=1e-7)

    learn.freeze_to(-2)
    learn.fit_one_cycle(1,
                        slice(1e-2 / (2.6**4), 1e-2),
                        moms=(0.8, 0.7),
                        wd=1e-7)

    learn.freeze_to(-3)
    learn.fit_one_cycle(1,
                        slice(5e-3 / (2.6**4), 5e-3),
                        moms=(0.8, 0.7),
                        wd=1e-7)

    learn.unfreeze()
    learn.fit_one_cycle(2,
                        slice(1e-3 / (2.6**4), 1e-3),
                        moms=(0.8, 0.7),
                        wd=1e-7)
    results['accuracy'] = learn.validate()[1]
    print(f"Saving models at {learn.path / learn.model_dir}")
    learn.save(f'{model_name}_{name}')
    return results