def load_wiki_data(self, bs=70):
    trn_path = self.dataset_path / f'{self.lang}.wiki.train.tokens'
    val_path = self.dataset_path / f'{self.lang}.wiki.valid.tokens'
    tst_path = self.dataset_path / f'{self.lang}.wiki.test.tokens'
    for path_ in [trn_path, val_path, tst_path]:
        assert path_.exists(), f'Error: {path_} does not exist.'
    args = self.tokenzier_to_fastai_args(
        trn_data_loading_func=self.load_train_text, add_moses=False)
    try:
        data_lm = TextLMDataBunch.load(self.cache_dir, '.',
                                       lm_type=self.lm_type, bs=bs)
        print("Tokenized data loaded")
    except FileNotFoundError:
        print("Running tokenization")
        data_lm = TextLMDataBunch.from_df(path=self.cache_dir,
                                          train_df=read_wiki_articles(trn_path),
                                          valid_df=read_wiki_articles(val_path),
                                          classes=None,
                                          lm_type=self.lm_type,
                                          max_vocab=self.max_vocab,
                                          bs=bs,
                                          text_cols='texts',
                                          **args)
        data_lm.save('.')
    itos = data_lm.vocab.itos
    print('Size of vocabulary:', len(itos))
    print('First 20 words in vocab:', itos[:20])
    return data_lm
def save(
    data: TextLMDataBunch,
    learn: LanguageLearner,
    label: str,
    suffix: str,
    accuracy: float,
):
    with open("models/" + label + "_accuracy.metric", "w") as f:
        f.write(str(accuracy))
    click.echo("Saving...")
    learn.save("model_" + label + "_" + suffix)
    learn.save_encoder("encoder_" + label + "_" + suffix)
    click.echo("Exporting...")
    data.export("models/" + label + "_empty_data")
    learn.export("models/learner_" + label + "_" + suffix + ".pkl")
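# Usage sketch (added; not part of the original snippet): reloading the learner
# exported by `save` above with the fastai v1 API. Assumes the working directory
# matches the learner's original path; the label/suffix values are whatever was
# passed to `save`.
from fastai.basic_train import load_learner

def load_saved_learner(label: str, suffix: str):
    # `learn.export(...)` above pickled the full Learner; `load_learner`
    # restores it with an empty DataBunch, ready for inference.
    return load_learner("models", "learner_" + label + "_" + suffix + ".pkl")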
def initialize_learner(
    data: TextLMDataBunch,
    pretrained_encoder: str,
    pretrained_itos: str,
    local_rank: int,
    label: str,
    kind: str,
    gpus: int,
) -> LanguageLearner:
    data.path = Path(".")
    click.echo("Training language model...")
    learn = language_model_learner(
        data,
        TransformerXL,
        pretrained_fnames=[
            "./../" + pretrained_encoder.replace(".pth", ""),
            "./../" + pretrained_itos.replace(".pkl", ""),
        ],
        drop_mult=0.1,
    )
    tboard_path = Path("logs/" + label)
    node_name = "gpu-" + str(local_rank) + "-" + kind
    learn.callback_fns.append(
        partial(LearnerTensorboardWriter,
                base_dir=tboard_path,
                gpus=gpus,
                name=node_name))
    if gpus > 1:
        learn.to_distributed(local_rank)
    return learn
def _fit_lm(self, df_train, df_val):
    # Language model data
    data_lm = TextLMDataBunch.from_df(train_df=df_train, valid_df=df_val, path="")
    lm_learner = language_model_learner(
        data_lm, self.arch, drop_mult=self.dropout_class
    )
    # train the learner object
    lm_learner.fit_one_cycle(1, self.lr_class)
    # TODO: can we return lm_learner and load it via memory so we don't have to save it?
    lm_learner.save_encoder(self.path_lm.name)
    return data_lm
def get_datasets(dataset, dataset_dir, bptt, bs, lang, max_vocab, ds_pct, lm_type):
    tmp_dir = dataset_dir / 'tmp'
    tmp_dir.mkdir(exist_ok=True)
    vocab_file = tmp_dir / f'vocab_{lang}.pkl'
    if not (tmp_dir / f'{TRN}_{lang}_ids.npy').exists():
        print('Reading the data...')
        toks, lbls = read_clas_data(dataset_dir, dataset, lang)
        # create the vocabulary
        counter = Counter(word for example in toks[TRN] + toks[TST] + toks[VAL]
                          for word in example)
        itos = [word for word, count in counter.most_common(n=max_vocab)]
        itos.insert(0, PAD)
        itos.insert(0, UNK)
        vocab = Vocab(itos)
        stoi = vocab.stoi
        with open(vocab_file, 'wb') as f:
            pickle.dump(vocab, f)
        ids = {}
        for split in [TRN, VAL, TST]:
            ids[split] = np.array([[stoi.get(w, stoi[UNK]) for w in s]
                                   for s in toks[split]])
            np.save(tmp_dir / f'{split}_{lang}_ids.npy', ids[split])
            np.save(tmp_dir / f'{split}_{lang}_lbl.npy', lbls[split])
    else:
        print('Loading the pickled data...')
        ids, lbls = {}, {}
        for split in [TRN, VAL, TST]:
            ids[split] = np.load(tmp_dir / f'{split}_{lang}_ids.npy')
            lbls[split] = np.load(tmp_dir / f'{split}_{lang}_lbl.npy')
        with open(vocab_file, 'rb') as f:
            vocab = pickle.load(f)
    print(f'Train size: {len(ids[TRN])}. Valid size: {len(ids[VAL])}. '
          f'Test size: {len(ids[TST])}.')
    if ds_pct < 1.0:
        print(f"Making the dataset smaller: {ds_pct}")
    for split in [TRN, VAL, TST]:
        ids[split] = np.array([np.array(e, dtype=np.int) for e in ids[split]])
        if split == TRN:
            print("Processing TRN labels...")
        lbls[split] = np.array([np.array(e, dtype=np.int) for e in lbls[split]])
        if split == TRN:
            print("Info: converted the train labels to np.array successfully.")
    data_lm = TextLMDataBunch.from_ids(path=tmp_dir,
                                       vocab=vocab,
                                       train_ids=np.concatenate([ids[TRN], ids[TST]]),
                                       valid_ids=ids[VAL],
                                       bs=bs,
                                       bptt=bptt,
                                       lm_type=lm_type)
    # TODO TextClasDataBunch allows tst_ids as input, but not tst_lbls?
    data_clas = TextClasDataBunch.from_ids(
        path=tmp_dir,
        vocab=vocab,
        train_ids=ids[TRN],
        valid_ids=ids[VAL],
        train_lbls=lbls[TRN],
        valid_lbls=lbls[VAL],
        bs=bs,
        classes={l: l for l in lbls[TRN]})
    print(f"Sizes of train_ds {len(data_clas.train_ds)}, valid_ds {len(data_clas.valid_ds)}")
    return data_clas, data_lm
def prepare_clas_dataset(input_path, output_dir=None, valid_split=0.2,
                         tokenizer_lang="xx", min_freq=2, seed=42):
    """
    Reads a CSV file with texts and labels, splits it into training and
    validation sets, tokenizes the texts and saves datasets for fine-tuning
    and for classification.

    Args:
        input_path (str): Path to a CSV file with texts in the first and labels in the second column.
        output_dir (str): Folder where the processed dataset is stored.
        valid_split (float): Fraction of the data used for validation.
        tokenizer_lang (str): Language setting for the tokenizer.
        min_freq (int): Minimal number of occurrences of a word for it to be considered for the vocabulary.
        seed (int): Random seed that determines the training-validation split.
    """
    input_path = Path(input_path)
    output_dir = Path(output_dir or input_path.parent)
    output_dir.mkdir(parents=True, exist_ok=True)
    train_df, valid_df = csv_to_train_valid_df(input_path, valid_split, seed)

    data_finetune_lm = TextLMDataBunch.from_df(
        output_dir, train_df, valid_df,
        tokenizer=Tokenizer(lang=tokenizer_lang), text_cols=0, min_freq=min_freq)
    data_clas = TextClasDataBunch.from_df(
        output_dir, train_df, valid_df,
        tokenizer=Tokenizer(lang=tokenizer_lang), text_cols=0, label_cols=1,
        vocab=data_finetune_lm.train_ds.vocab, bs=32, min_freq=min_freq)

    data_finetune_lm.save("data_finetune_lm.pkl")
    data_clas.save("data_clas.pkl")
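# Usage sketch (added; not part of the original snippet): reloading the two
# DataBunches saved by prepare_clas_dataset above, using fastai v1's `load_data`,
# the documented counterpart of `DataBunch.save`.
from fastai.basic_data import load_data

def reload_clas_datasets(output_dir, bs=32):
    # load_data rebuilds the DataLoaders with the requested batch size
    data_finetune_lm = load_data(output_dir, "data_finetune_lm.pkl", bs=bs)
    data_clas = load_data(output_dir, "data_clas.pkl", bs=bs)
    return data_finetune_lm, data_clas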
def main():
    train_ds = TextDataset.from_folder(IMDB_PATH, name='train', shuffle=True)
    valid_ds = TextDataset.from_folder(IMDB_PATH, name='test')
    lm_data = [train_ds, valid_ds]
    lm_bunch = TextLMDataBunch.create(lm_data, path=LM_PATH)
    learner = RNNLearner.language_model(lm_bunch)
    n = sum(len(ds) for ds in lm_data)
    num_epochs, phases = create_phases(3, n)
    callbacks = [
        EarlyStopping(learner, patience=2),
        SaveModel(learner),
        GeneralScheduler(learner, phases)
    ]
    learner.fit(num_epochs, callbacks=callbacks)
def evaluate_lm(data_path, model_dir, tokenizer_lang="xx",
                evaluate_custom_perplexity=False):
    """
    Evaluate metrics of a trained language model on any dataset of texts from a CSV file.

    Args:
        data_path (str): Path to a CSV file with texts in the first column.
        model_dir (str): Directory with a trained language model.
        tokenizer_lang (str): Language setting for the tokenizer.
        evaluate_custom_perplexity (bool): The perplexity is estimated as e^(avg. loss),
            but the average loss changes slightly with batch size. To get the perplexity
            computed in a slower but controlled fashion, set `evaluate_custom_perplexity`
            to True. The discrepancy between the two estimates is empirically about 1%.
    """
    model_dir = Path(model_dir)
    with open(model_dir / "lm_itos.pkl", "rb") as f:
        itos = pickle.load(f)
    data_df = pd.read_csv(data_path, header=None)
    data = TextLMDataBunch.from_df("", data_df, data_df, text_cols=0,
                                   tokenizer=Tokenizer(lang=tokenizer_lang),
                                   vocab=Vocab(itos))
    with open(model_dir / "model_hparams.json", "r") as model_hparams_file:
        model_hparams = json.load(model_hparams_file)
    learner = lm_learner(data, AWD_LSTM, model_dir, pretrained=True,
                         config=model_hparams)
    loss, acc = learner.validate()
    print("Loss: {}, Perplexity: {}, Accuracy: {}".format(loss, exp(loss), acc))
    if evaluate_custom_perplexity:
        print("Custom perplexity: {}, Fraction OOV: {}, OOV perplexity contribution: {}"
              .format(*evaluate_perplexity(learner, data.valid_ds.x)))
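# Added illustration (plain Python, no fastai dependency): the "Perplexity"
# printed above is simply e raised to the average cross-entropy loss.
from math import exp

def perplexity_from_loss(avg_loss: float) -> float:
    # e.g. an average loss of 4.0 gives a perplexity of exp(4.0) ~= 54.6
    return exp(avg_loss)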
def prepare_lm_dataset(input_path, output_dir=None, valid_split=0.2,
                       tokenizer_lang="xx", min_freq=2, seed=42):
    """
    Reads a CSV file with texts for training a language model, splits it into
    training and validation sets, tokenizes and saves the dataset.

    Args:
        input_path (str): Path to a CSV file with texts in the first column.
        output_dir (str): Folder where the processed dataset is stored.
        valid_split (float): Fraction of the data used for validation.
        tokenizer_lang (str): Language setting for the tokenizer.
        min_freq (int): Minimal number of occurrences of a word for it to be considered for the vocabulary.
        seed (int): Random seed that determines the training-validation split.
    """
    input_path = Path(input_path)
    output_dir = Path(output_dir or input_path.parent)
    output_dir.mkdir(parents=True, exist_ok=True)
    train_df, valid_df = csv_to_train_valid_df(input_path, valid_split, seed)

    data_lm = TextLMDataBunch.from_df(output_dir, train_df, valid_df, text_cols=0,
                                      tokenizer=Tokenizer(lang=tokenizer_lang),
                                      min_freq=min_freq)
    data_lm.save("data_lm.pkl")
    with open(output_dir / "data_lm_tokenized_train.txt", "w") as f:
        f.write("\n".join(map(str, list(data_lm.train_ds.x))))
    with open(output_dir / "data_lm_tokenized_valid.txt", "w") as f:
        f.write("\n".join(map(str, list(data_lm.valid_ds.x))))
def pre_execution_hook(self, mode=ExecutionModeKeys.TEST):
    self.data_lm_name = "data_lm.pkl"
    self.data_class_name = "data_class_name"
    self.fwd_enc_name = "fwd_enc"
    self.bwd_enc_name = "bwd_enc"
    self.fwd_class_name = 'fwd_clas'
    self.bwd_class_name = 'bwd_clas'

    # to make sure the outputs are also logged
    fastprogress.fastprogress.WRITER_FN = self._get_master_bar_write_fn()

    data_lm_path = os.path.join(self.experiment_dir, self.data_lm_name)
    if not os.path.exists(os.path.dirname(data_lm_path)):
        os.makedirs(os.path.dirname(data_lm_path))
    if not os.path.exists(data_lm_path):
        data_lm = TextLMDataBunch.from_df(path=self.experiment_dir,
                                          train_df=self.dataloader.get_train_input(),
                                          valid_df=self.dataloader.get_test_input(),
                                          text_cols='utterance',
                                          bs=BATCH_SIZE)
        data_lm.save(self.data_lm_name)
    self.data_lm = load_data(self.experiment_dir, self.data_lm_name,
                             bs=BATCH_SIZE, bptt=BPTT)
    self.data_bwd = load_data(self.experiment_dir, self.data_lm_name,
                              bs=BATCH_SIZE, bptt=BPTT, backwards=True)

    data_class_path = os.path.join(self.experiment_dir, self.data_class_name)
    if not os.path.exists(data_class_path):
        # reuse the LM vocab loaded above (a local data_lm may not exist here)
        data_class = TextDataBunch.from_df(path=self.experiment_dir,
                                           train_df=self.dataloader.get_train_input(),
                                           valid_df=self.dataloader.get_test_input(),
                                           text_cols='utterance',
                                           label_cols='functions',
                                           vocab=self.data_lm.train_ds.vocab,
                                           bs=BATCH_SIZE)
        data_class.save(self.data_class_name)
    self.data_class = load_data(self.experiment_dir, self.data_class_name,
                                bs=BATCH_SIZE)
    self.data_class_bwd = load_data(self.experiment_dir, self.data_class_name,
                                    bs=BATCH_SIZE, backwards=True)
def load_cls_data_imdb(self, bs, force=False, use_test_for_validation=False):
    trn_df = pd.read_csv(self.dataset_path / 'train.csv', header=None)
    tst_df = pd.read_csv(self.dataset_path / 'test.csv', header=None)
    unsp_df = pd.read_csv(self.dataset_path / 'unsup.csv', header=None)

    lm_trn_df = pd.concat([unsp_df, trn_df, tst_df])
    val_len = max(int(len(lm_trn_df) * 0.1), 2)
    # split off the validation rows before dropping them from the training set
    lm_val_df = lm_trn_df[:val_len]
    lm_trn_df = lm_trn_df[val_len:]

    if use_test_for_validation:
        val_df = tst_df
        cls_cache = 'notst'
    else:
        val_len = max(int(len(trn_df) * 0.1), 2)
        trn_len = len(trn_df) - val_len
        trn_df, val_df = trn_df[:trn_len], trn_df[trn_len:]
        cls_cache = '.'

    if self.tokenizer is Tokenizers.SUBWORD:
        args = get_sentencepiece(self.dataset_path,
                                 self.dataset_path / 'train.csv',
                                 self.name,
                                 vocab_size=self.max_vocab,
                                 pre_rules=[],
                                 post_rules=[])
    elif self.tokenizer is Tokenizers.MOSES:
        args = dict(tokenizer=Tokenizer(tok_func=MosesTokenizerFunc,
                                        lang='en',
                                        pre_rules=[],
                                        post_rules=[]))
    elif self.tokenizer is Tokenizers.MOSES_FA:
        # use default pre/post rules
        args = dict(tokenizer=Tokenizer(tok_func=MosesTokenizerFunc, lang='en'))
    elif self.tokenizer is Tokenizers.FASTAI:
        args = dict()
    else:
        raise ValueError(
            f"self.tokenizer has wrong value {self.tokenizer}. Allowed values are taken from {Tokenizers}"
        )

    try:
        if force:
            raise FileNotFoundError("Forcing reloading of caches")
        data_lm = TextLMDataBunch.load(self.cache_dir, 'lm',
                                       lm_type=self.lm_type, bs=bs)
        print(f"Tokenized data loaded, lm.trn {len(data_lm.train_ds)}, lm.val {len(data_lm.valid_ds)}")
    except FileNotFoundError:
        print("Running tokenization...")
        data_lm = TextLMDataBunch.from_df(path=self.cache_dir,
                                          train_df=lm_trn_df,
                                          valid_df=lm_val_df,
                                          max_vocab=self.max_vocab,
                                          bs=bs,
                                          lm_type=self.lm_type,
                                          **args)
        print(f"Saving tokenized: lm.trn {len(data_lm.train_ds)}, lm.val {len(data_lm.valid_ds)}")
        data_lm.save('lm')

    try:
        if force:
            raise FileNotFoundError("Forcing reloading of caches")
        data_cls = TextClasDataBunch.load(self.cache_dir, cls_cache, bs=bs)
        print(f"Tokenized data loaded, cls.trn {len(data_cls.train_ds)}, cls.val {len(data_cls.valid_ds)}")
    except FileNotFoundError:
        args['vocab'] = data_lm.vocab  # make sure we use the same vocab for classification
        print("Running tokenization...")
        data_cls = TextClasDataBunch.from_df(path=self.cache_dir,
                                             train_df=trn_df,
                                             valid_df=val_df,
                                             test_df=tst_df,
                                             max_vocab=self.max_vocab,
                                             bs=bs,
                                             **args)
        print(f"Saving tokenized: cls.trn {len(data_cls.train_ds)}, cls.val {len(data_cls.valid_ds)}")
        data_cls.save(cls_cache)

    print('Size of vocabulary:', len(data_lm.vocab.itos))
    print('First 20 words in vocab:', data_lm.vocab.itos[:20])
    return data_cls, data_lm
def load_cls_data(self, bs, force=False, use_test_for_validation=False, **kwargs):
    src_path = self.dataset_path
    csv_name = self.csv_name
    tgt_paths = [Path(tgt_path) for tgt_path in self.target_paths]

    # concatenate the source CSV with all target-language CSVs
    mixed_csv = pd.read_csv(src_path / csv_name, header=None)
    for tgt_path in tgt_paths:
        mixed_csv = pd.concat(
            [mixed_csv, pd.read_csv(tgt_path / csv_name, header=None)])
    xcsv_name = 'x_' + csv_name
    mixed_csv.to_csv(src_path / xcsv_name, header=None, index=False)

    # column 1 of the header-less CSV holds the texts
    args = self.tokenzier_to_fastai_args(
        trn_data_loading_func=lambda: mixed_csv[1], add_moses=True)

    try:
        if force:
            raise FileNotFoundError("Forcing reloading of caches")
        data_lm = TextLMDataBunch.load(src_path, 'xlm', lm_type=self.lm_type, bs=bs)
        print(f"Tokenized data loaded, xlm.trn {len(data_lm.train_ds)}, xlm.val {len(data_lm.valid_ds)}")
    except FileNotFoundError:
        print("Running tokenization...")
        data_lm = TextLMDataBunch.from_csv(path=src_path,
                                           csv_name=xcsv_name,
                                           bs=bs,
                                           lm_type=self.lm_type,
                                           **kwargs,
                                           **args)
        print(f"Saving tokenized: xlm.trn {len(data_lm.train_ds)}, xlm.val {len(data_lm.valid_ds)}")
        data_lm.save('xlm')

    try:
        if force:
            raise FileNotFoundError("Forcing reloading of caches")
        data_cls = TextClasDataBunch.load(src_path, 'cls', bs=bs)
        print(f"Tokenized data loaded, cls.trn {len(data_cls.train_ds)}, cls.val {len(data_cls.valid_ds)}")
    except FileNotFoundError:
        args['vocab'] = data_lm.vocab  # make sure we use the same vocab for classification
        print("Running tokenization...")
        data_cls = TextClasDataBunch.from_csv(path=src_path,
                                              csv_name=csv_name,
                                              bs=bs,
                                              **kwargs,
                                              **args)
        print(f"Saving tokenized: cls.trn {len(data_cls.train_ds)}, cls.val {len(data_cls.valid_ds)}")
        data_cls.save('cls')

    print('Size of vocabulary:', len(data_lm.vocab.itos))
    print('First 20 words in vocab:', data_lm.vocab.itos[:20])
    return data_cls, data_lm
def main(input_file, output_dir, max_size=0):
    crate_dir(output_dir)
    cpus = multiprocessing.cpu_count() - 1
    p = multiprocessing.Pool(cpus)
    with open(input_file, 'r', encoding='utf-8') as input_text, \
         open(os.path.join(output_dir, 'train.txt'), 'w',
              encoding='utf-8') as output_train:
        count = 0
        for text in p.imap(preprocessing, input_text):
            text_size = len(text.split(' '))
            count += 1
            print(count)
            if text_size < 2:
                continue
            if max_size and text_size > max_size:
                text = ' '.join(text.split(' ')[:max_size])
            output_train.write(text + '\n')

    print('\033[1;34m', 'Creating the DataBunch', '\033[0;0m')
    with open(os.path.join(output_dir, 'train.txt'), 'r', encoding='utf-8') as txt:
        phrases = [line.replace('\n', '').split(' ') for line in txt]
    freq = collections.Counter(w for s in phrases for w in s)
    max_vocab = 30000
    min_freq = 5
    # getting rid of the rare words
    itos = [o for o, c in freq.most_common(max_vocab) if c > min_freq]
    itos.insert(0, '_pad_')
    itos.insert(0, '_unk_')
    # itos is the list of all the strings in the vocab;
    # stoi maps each string back to its index (unknown words map to 0, i.e. '_unk_')
    stoi = collections.defaultdict(lambda: 0, {v: k for k, v in enumerate(itos)})
    max_data = int(len(phrases) * 0.9)
    # creating an index representation for our train and validation datasets
    trn_lm = np.array([[stoi[o] for o in p] for p in phrases])
    data_lm = TextLMDataBunch.from_ids(output_dir,
                                       transform.Vocab(itos),
                                       train_ids=trn_lm[:max_data],
                                       valid_ids=trn_lm[max_data:])
    data_lm.save('data_save.pkl')
SAMPLES_PER_CLASS = 12500

print('loading data')
texts = []
target = []
for class_index, classname in enumerate(CLASS_NAMES):
    for n, line in enumerate(open(DATA_FOLDER + classname + '.txt')):
        texts.append(preprocess_string(line, False))
        target.append(class_index)
        if n >= SAMPLES_PER_CLASS - 1:
            break

df = DataFrame({'label': target, 'text': texts})
df_train, df_val = train_test_split(df, stratify=df['label'],
                                    test_size=0.4, random_state=12)

data_lm = TextLMDataBunch.from_df(train_df=df_train, valid_df=df_val, path="")
data_clas = TextClasDataBunch.from_df(path="", train_df=df_train, valid_df=df_val,
                                      vocab=data_lm.train_ds.vocab, bs=32)

# fine-tune the pretrained WT103 language model, then reuse its encoder
learn = language_model_learner(data_lm, pretrained_model=URLs.WT103, drop_mult=0.7)
learn.fit_one_cycle(1, 1e-2)
learn.save_encoder('ft_enc')

# train the classifier on top of the fine-tuned encoder
learn = text_classifier_learner(data_clas, drop_mult=0.7)
learn.load_encoder('ft_enc')
learn.fit_one_cycle(1, 1e-2)
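# Usage sketch (added; not part of the original script): classifying a new
# document with the classifier trained above. `learn`, `CLASS_NAMES` and
# `preprocess_string` come from that script; the return format follows
# fastai v1's Learner.predict.
def classify(text):
    # predict returns (Category, class-index tensor, per-class probability tensor)
    category, class_idx, probs = learn.predict(preprocess_string(text, False))
    return CLASS_NAMES[int(class_idx)], float(probs.max())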
ingredients = [" ".join(item["ingredients"]) for item in train_json]
cuisine = [item["cuisine"] for item in train_json]
df = pd.DataFrame({"cuisine": cuisine, "ingredients": ingredients})
train_df, valid_df = train_test_split(
    df,
    stratify=df["cuisine"],
    test_size=0.2,
    random_state=1024,
)
text_lm = TextLMDataBunch.from_df(
    train_df=train_df,
    valid_df=valid_df,
    path="",
)
lm_learner = language_model_learner(
    text_lm,
    arch=AWD_LSTM,
    drop_mult=0.2,
)
lm_learner.lr_find()
lm_learner.recorder.plot(suggestion=True)
lm_learner.fit_one_cycle(1, lm_learner.recorder.min_grad_lr)
lm_learner.save_encoder(model)
def load_cls_data_old_for_xnli(self, bs):
    tmp_dir = self.cache_dir
    tmp_dir.mkdir(exist_ok=True)
    vocab_file = tmp_dir / f'vocab_{self.lang}.pkl'
    if not (tmp_dir / f'{TRN}_{self.lang}_ids.npy').exists():
        print('Reading the data...')
        toks, lbls = read_clas_data(self.dataset_dir, self.dataset_dir.name,
                                    self.lang)
        # create the vocabulary
        counter = Counter(word for example in toks[TRN] + toks[TST] + toks[VAL]
                          for word in example)
        itos = [word for word, count in counter.most_common(n=self.max_vocab)]
        itos.insert(0, PAD)
        itos.insert(0, UNK)
        vocab = Vocab(itos)
        stoi = vocab.stoi
        with open(vocab_file, 'wb') as f:
            pickle.dump(vocab, f)
        ids = {}
        for split in [TRN, VAL, TST]:
            ids[split] = np.array([[stoi.get(w, stoi[UNK]) for w in s]
                                   for s in toks[split]])
            np.save(tmp_dir / f'{split}_{self.lang}_ids.npy', ids[split])
            np.save(tmp_dir / f'{split}_{self.lang}_lbl.npy', lbls[split])
    else:
        print('Loading the pickled data...')
        ids, lbls = {}, {}
        for split in [TRN, VAL, TST]:
            ids[split] = np.load(tmp_dir / f'{split}_{self.lang}_ids.npy')
            lbls[split] = np.load(tmp_dir / f'{split}_{self.lang}_lbl.npy')
        with open(vocab_file, 'rb') as f:
            vocab = pickle.load(f)
    print(f'Train size: {len(ids[TRN])}. Valid size: {len(ids[VAL])}. '
          f'Test size: {len(ids[TST])}.')
    for split in [TRN, VAL, TST]:
        ids[split] = np.array([np.array(e, dtype=np.int) for e in ids[split]])
        lbls[split] = np.array([np.array(e, dtype=np.int) for e in lbls[split]])
    data_lm = TextLMDataBunch.from_ids(path=tmp_dir,
                                       vocab=vocab,
                                       train_ids=np.concatenate([ids[TRN], ids[TST]]),
                                       valid_ids=ids[VAL],
                                       bs=bs,
                                       bptt=self.bptt,
                                       lm_type=self.lm_type)
    # TODO TextClasDataBunch allows tst_ids as input, but not tst_lbls?
    data_clas = TextClasDataBunch.from_ids(
        path=tmp_dir,
        vocab=vocab,
        train_ids=ids[TRN],
        valid_ids=ids[VAL],
        train_lbls=lbls[TRN],
        valid_lbls=lbls[VAL],
        bs=bs,
        classes={l: l for l in lbls[TRN]})
    print(f"Sizes of train_ds {len(data_clas.train_ds)}, valid_ds {len(data_clas.valid_ds)}")
    return data_clas, data_lm
def df_to_emb(self, dataframe: pd.DataFrame, bs=100) -> np.ndarray:
    """
    Retrieve document embeddings for a dataframe with the columns `title` and `body`.

    Uses batching for efficient computation, which is useful when you have
    many documents to retrieve embeddings for.

    Parameters
    ----------
    dataframe: pandas.DataFrame
        Dataframe with columns `title` and `body`, which represent the title
        and body of a GitHub issue.
    bs: int
        Batch size for inference. Set this according to your available GPU
        memory; the default is 100 (a batch size of 200 was stable on an
        Nvidia Tesla V100).

    Returns
    -------
    numpy.ndarray
        An array of shape (number of dataframe rows, 2400) that represents
        the latent features of the GitHub issues.

    Example
    -------
    >>> import pandas as pd
    >>> wrapper = InferenceWrapper(model_path='/path/to/model',
    ...                            model_file_name='model.pkl')
    >>> # load 200 sample GitHub issues
    >>> testdf = pd.read_csv(f'https://bit.ly/2GDY5NY').head(200)
    >>> embeddings = wrapper.df_to_emb(testdf)
    >>> embeddings.shape
    (200, 2400)
    """
    new_df = self.process_df(dataframe)
    # to get the benefit of batching similar-length sequences together,
    # have a minimum of 20 batches
    bs = min(bs, (len(new_df) // 20) + 1)

    # use the machinery of the data block to numericalize text in parallel
    data_lm = lmdb.from_df(path=self.path,
                           train_df=new_df.head(),  # train_df gets sample data only
                           valid_df=new_df,
                           text_cols='text',
                           tokenizer=self.model_tokenizer,
                           vocab=self.vocab)

    # extract numericalized arrays and convert to pytorch
    docs = data_lm.valid_dl.x.items
    lengths = []
    numericalized_docs = []
    for arr in docs:
        numericalized_docs.append(tensor(arr).cuda())  # convert to torch.Tensor
        lengths.append(arr.shape[0])

    # sort the data by sequence length and assemble batches
    length_arr = np.array(lengths)
    len_mask = length_arr.argsort()
    len_mask_reversed = len_mask.argsort()
    batched_features = list(chunked([numericalized_docs[i] for i in len_mask], bs))
    batched_lengths = list(chunked(length_arr[len_mask], bs))

    # perform model inference
    pooled_states = []
    for i, b in tqdm(enumerate(batched_features), desc="Model inference:"):
        # pad the batch to the same length
        bp = pad_sequence(b, batch_first=True, padding_value=self.pad_idx)
        # perform inference
        hidden_states = self._forward_pass(bp)
        empty_cache()
        # fetch the summary of the hidden states as the embedding
        pooled_states.append(self.batch_seq_pool(hidden_states, batched_lengths[i]))

    # restore the original order of the data by unsorting
    pooled_states = cat(pooled_states)[len_mask_reversed, :]
    assert pooled_states.shape[0] == length_arr.shape[0] == len(dataframe)
    return pooled_states
def new_train_clas(data_dir, lang='en', cuda_id=0, pretrain_name='wt103',
                   model_dir='models', qrnn=False, fine_tune=True,
                   max_vocab=30000, bs=20, bptt=70, name='imdb-clas',
                   dataset='imdb', ds_pct=1.0):
    """
    :param data_dir: The path to the `data` directory
    :param lang: the language unicode
    :param cuda_id: The id of the GPU. Uses GPU 0 by default or no GPU when run on CPU.
    :param pretrain_name: name of the pretrained model
    :param model_dir: The path to the directory where the pretrained model is saved
    :param qrnn: Use a QRNN. Requires installing cupy.
    :param fine_tune: Fine-tune the pretrained language model
    :param max_vocab: The maximum size of the vocabulary.
    :param bs: The batch size.
    :param bptt: The back-propagation-through-time sequence length.
    :param name: The name used for both the model and the vocabulary.
    :param dataset: The dataset used for evaluation. Currently only IMDb and
        XNLI are implemented. Assumes dataset is located in `data` folder and
        that name of folder is the same as dataset name.
    :param ds_pct: The proportion of the dataset to use for training and evaluation.
    """
    results = {}
    if not torch.cuda.is_available():
        print('CUDA not available. Setting device=-1.')
        cuda_id = -1
    torch.cuda.set_device(cuda_id)

    print(f'Dataset: {dataset}. Language: {lang}.')
    assert dataset in DATASETS, f'Error: {dataset} processing is not implemented.'
    assert (dataset == 'imdb' and lang == 'en') or not dataset == 'imdb', \
        'Error: IMDb is only available in English.'

    data_dir = Path(data_dir)
    assert data_dir.name == 'data', \
        f'Error: Name of data directory should be data, not {data_dir.name}.'
    dataset_dir = data_dir / dataset
    model_dir = Path(model_dir)

    if qrnn:
        print('Using QRNNs...')
    model_name = 'qrnn' if qrnn else 'lstm'
    lm_name = f'{model_name}_{pretrain_name}'
    pretrained_fname = (lm_name, f'itos_{pretrain_name}')

    ensure_paths_exists(data_dir, dataset_dir, model_dir,
                        model_dir / f"{pretrained_fname[0]}.pth",
                        model_dir / f"{pretrained_fname[1]}.pkl")

    tmp_dir = dataset_dir / 'tmp'
    tmp_dir.mkdir(exist_ok=True)
    vocab_file = tmp_dir / f'vocab_{lang}.pkl'

    if not (tmp_dir / f'{TRN}_{lang}_ids.npy').exists():
        print('Reading the data...')
        toks, lbls = read_clas_data(dataset_dir, dataset, lang)
        # create the vocabulary
        counter = Counter(word for example in toks[TRN] for word in example)
        itos = [word for word, count in counter.most_common(n=max_vocab)]
        itos.insert(0, PAD)
        itos.insert(0, UNK)
        vocab = Vocab(itos)
        stoi = vocab.stoi
        with open(vocab_file, 'wb') as f:
            pickle.dump(vocab, f)
        ids = {}
        for split in [TRN, VAL, TST]:
            ids[split] = np.array([[stoi.get(w, stoi[UNK]) for w in s]
                                   for s in toks[split]])
            np.save(tmp_dir / f'{split}_{lang}_ids.npy', ids[split])
            np.save(tmp_dir / f'{split}_{lang}_lbl.npy', lbls[split])
    else:
        print('Loading the pickled data...')
        ids, lbls = {}, {}
        for split in [TRN, VAL, TST]:
            ids[split] = np.load(tmp_dir / f'{split}_{lang}_ids.npy')
            lbls[split] = np.load(tmp_dir / f'{split}_{lang}_lbl.npy')
        with open(vocab_file, 'rb') as f:
            vocab = pickle.load(f)
    print(f'Train size: {len(ids[TRN])}. Valid size: {len(ids[VAL])}. '
          f'Test size: {len(ids[TST])}.')

    if ds_pct < 1.0:
        print(f"Making the dataset smaller: {ds_pct}")
        for split in [TRN, VAL, TST]:
            ids[split] = ids[split][:int(len(ids[split]) * ds_pct)]

    data_lm = TextLMDataBunch.from_ids(path=tmp_dir,
                                       vocab=vocab,
                                       train_ids=ids[TRN],
                                       valid_ids=ids[VAL],
                                       bs=bs,
                                       bptt=bptt)
    # TODO TextClasDataBunch allows tst_ids as input, but not tst_lbls?
    data_clas = TextClasDataBunch.from_ids(path=tmp_dir,
                                           vocab=vocab,
                                           train_ids=ids[TRN],
                                           valid_ids=ids[VAL],
                                           train_lbls=lbls[TRN],
                                           valid_lbls=lbls[VAL],
                                           bs=bs)

    if qrnn:
        emb_sz, nh, nl = 400, 1550, 3
    else:
        emb_sz, nh, nl = 400, 1150, 3
    learn = language_model_learner(data_lm,
                                   bptt=bptt,
                                   emb_sz=emb_sz,
                                   nh=nh,
                                   nl=nl,
                                   qrnn=qrnn,
                                   pad_token=PAD_TOKEN_ID,
                                   pretrained_fnames=pretrained_fname,
                                   path=model_dir.parent,
                                   model_dir=model_dir.name)
    lm_enc_finetuned = f"{lm_name}_{dataset}_enc"
    if fine_tune and not (model_dir / f"{lm_enc_finetuned}.pth").exists():
        print('Fine-tuning the language model...')
        learn.unfreeze()
        learn.fit(2, slice(1e-4, 1e-2))
        # save encoder
        learn.save_encoder(lm_enc_finetuned)

    print("Starting classifier training")
    learn = text_classifier_learner(data_clas,
                                    bptt=bptt,
                                    pad_token=PAD_TOKEN_ID,
                                    path=model_dir.parent,
                                    model_dir=model_dir.name,
                                    qrnn=qrnn,
                                    emb_sz=emb_sz,
                                    nh=nh,
                                    nl=nl)
    learn.load_encoder(lm_enc_finetuned)

    # gradual unfreezing with discriminative learning rates
    learn.fit_one_cycle(1, 2e-2, moms=(0.8, 0.7), wd=1e-7)
    learn.freeze_to(-2)
    learn.fit_one_cycle(1, slice(1e-2 / (2.6 ** 4), 1e-2), moms=(0.8, 0.7), wd=1e-7)
    learn.freeze_to(-3)
    learn.fit_one_cycle(1, slice(5e-3 / (2.6 ** 4), 5e-3), moms=(0.8, 0.7), wd=1e-7)
    learn.unfreeze()
    learn.fit_one_cycle(2, slice(1e-3 / (2.6 ** 4), 1e-3), moms=(0.8, 0.7), wd=1e-7)

    results['accuracy'] = learn.validate()[1]
    print(f"Saving models at {learn.path / learn.model_dir}")
    learn.save(f'{model_name}_{name}')
    return results
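# Added illustration: the slice(base_lr / 2.6**4, base_lr) arguments above follow
# the ULMFiT discriminative fine-tuning heuristic, giving the lowest layer group
# a learning rate 2.6**4 (about 45.7) times smaller than the top group.
base_lr = 1e-2
print(f"layer-group learning rates span {base_lr / 2.6 ** 4:.2e} .. {base_lr:.2e}")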