        (0, 0, offset, 0))  # add 2 rows of zeros for the __unk__ and __pad__ tokens
    return tensor, wdict


if __name__ == "__main__":
    opt = get_args()
    opt.test_batch_size = opt.test_batch_size if opt.test_batch_size else opt.batch_size

    os.makedirs(opt.model_folder, exist_ok=True)
    os.makedirs(opt.data_folder, exist_ok=True)

    logger = lib.get_logger(logdir=opt.model_folder, logname="logs.txt")
    logger.info("parameters: {}".format(vars(opt)))

    dataset = load_datasets(names=[opt.dataset])[0]
    dataset_name = dataset.__class__.__name__
    n_classes = dataset.n_classes
    logger.info("dataset: {}, n_classes: {}".format(dataset_name, n_classes))

    # cached artifact paths
    tr_seq_path = "{}/train_sequences.pkl".format(opt.data_folder)
    te_seq_path = "{}/test_sequences.pkl".format(opt.data_folder)
    tr_lab_path = "{}/train_labels.pkl".format(opt.data_folder)
    te_lab_path = "{}/test_labels.pkl".format(opt.data_folder)
    wdict_path = "{}/word_dict.pkl".format(opt.data_folder)
    embedding_path = "{}/word_embeddings".format(opt.data_folder)

    # check if datasets exist
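    # A minimal sketch of the existence check hinted at above, not from the
    # original source: rebuild the cached pickles only when one is missing.
    # The variable name `all_exist` is hypothetical.
    all_exist = all(
        os.path.exists(p)
        for p in [tr_seq_path, te_seq_path, tr_lab_path, te_lab_path, wdict_path]
    )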
            w: i for i, w in enumerate(self.word_counter, start=2)
        }
        self.word_dict["_pad_"] = 0
        self.word_dict["_unk_"] = 1
        print("Dictionary has {} words".format(len(self.word_dict)))

    def transform(self, s):
        # NOTE: this def line and the tokenization producing the token list
        # `l` are assumptions; only the statements below survive verbatim.
        self.n_transform += 1
        assert self.word_dict, ("No dictionary to vectorize text"
                                "\n-> call method build_dict"
                                "\n-> or set a word_dict attribute first")
        l = s.split()  # placeholder tokenization (assumption)
        s = [self.word_dict.get(w, self.word_dict["_unk_"]) for w in l]
        return s


if __name__ == "__main__":
    import spacy
    from tqdm import tqdm
    from src.datasets import load_datasets

    dataset = load_datasets(names=['db_pedia'])[0]
    tr_examples = [
        txt for txt, lab in tqdm(dataset.load_train_data(), desc="counting train samples")
    ]

    nlp = spacy.load("en", disable=["parser", "tagger", "ner"])
    prepro = Preprocessing()
    tokens = [
        prepro.transform(sentence)
        for sentence in tqdm(tr_examples, total=len(tr_examples))
    ]
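    # Hypothetical quick check, not in the original source, of the reserved
    # index scheme: _pad_ is 0, _unk_ is 1, real words start at 2, and any
    # out-of-vocabulary word falls back to _unk_. Assumes Preprocessing()
    # initializes n_transform and that transform splits on whitespace, as in
    # the reconstruction above.
    demo = Preprocessing()
    demo.word_dict = {"_pad_": 0, "_unk_": 1, "the": 2, "cat": 3}
    assert demo.transform("the dog") == [2, 1]  # "dog" is unknown -> 1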