Example #1
0
        # NOTE(review): fragment — the start of this call (and the function
        # signature) is outside the visible chunk; presumably a padding call
        # (e.g. F.pad / np.pad) prepending `offset` rows — TODO confirm.
        (0, 0, offset, 0))  # add rows of zeros for the __unk__ and __pad__ tokens
    return tensor, wdict


if __name__ == "__main__":

    opt = get_args()
    opt.test_batch_size = opt.test_batch_size if opt.test_batch_size else opt.batch_size

    os.makedirs(opt.model_folder, exist_ok=True)
    os.makedirs(opt.data_folder, exist_ok=True)

    logger = lib.get_logger(logdir=opt.model_folder, logname="logs.txt")
    logger.info("parameters: {}".format(vars(opt)))

    dataset = load_datasets(names=[opt.dataset])[0]
    dataset_name = dataset.__class__.__name__
    n_classes = dataset.n_classes
    logger.info("dataset: {}, n_classes: {}".format(dataset_name, n_classes))

    tr_seq_path = "{}/train_sequences.pkl".format(opt.data_folder)
    te_seq_path = "{}/test_sequences.pkl".format(opt.data_folder)

    tr_lab_path = "{}/train_labels.pkl".format(opt.data_folder)
    te_lab_path = "{}/test_labels.pkl".format(opt.data_folder)

    wdict_path = "{}/word_dict.pkl".format(opt.data_folder)

    embedding_path = "{}/word_embeddings".format(opt.data_folder)

    # check if datasets exist
Example #2
0
                # NOTE(review): fragment — the dict-comprehension opening and
                # the enclosing method signature are outside the visible chunk.
                # Word ids start at 2 so that 0/1 stay free for the specials.
                w: i
                for i, w in enumerate(self.word_counter, start=2)
            }
            # Reserve id 0 for padding and id 1 for unknown (OOV) words.
            self.word_dict["_pad_"] = 0
            self.word_dict["_unk_"] = 1
            print("Dictionnary has {} words".format(len(self.word_dict)))
        # Counter of how many calls have gone through this transform —
        # presumably used elsewhere to trigger dict building; TODO confirm.
        self.n_transform += 1

        assert self.word_dict, "No dictionnary to vectorize text \n-> call method build_dict \n-> or set a word_dict attribute \n first"

        # Map each token of `l` to its id, falling back to the _unk_ id
        # for out-of-vocabulary words.
        s = [self.word_dict.get(w, self.word_dict["_unk_"]) for w in l]
        return s


if __name__ == "__main__":

    import spacy
    from tqdm import tqdm
    from src.datasets import load_datasets
    dataset = load_datasets(names=['db_pedia'])[0]
    tr_examples = [
        txt for txt, lab in tqdm(dataset.load_train_data(),
                                 desc="counting train samples")
    ]

    nlp = spacy.load("en", disable=["parser", "tagger", "ner"])
    prepro = Preprocessing()
    tokens = [
        prepro.transform(sentence)
        for sentence in tqdm(tr_examples, total=len(tr_examples))
    ]