Example #1
 def _init_token_data(self):
     to_tokenize = {}
     for mode in ["train", "test"]:
         token_data_attr = "_{mode}_tokens".format(mode=mode)
         token_data_file = "_{mode}_tokens.pkl".format(mode=mode)
         token_data_path = join(self._gen_dir, token_data_file)
         if exists(token_data_path):
             token_data = unpickle_file(token_data_path)
             setattr(self, token_data_attr, token_data)
         else:
             data_dict_attr = "_{mode}_dict".format(mode=mode)
             data_dict = getattr(self, data_dict_attr)
             to_tokenize[mode] = data_dict
     if to_tokenize:
         #! Regardless of buckets, all vocab must be tokenized,
         #! otherwise risk experiment failing with empty target
         include = set(self._vocab) | set(
             corpora_vocab(self._train_corpus, self._test_corpus))
         include_tokens_path = join(self._gen_dir, "_incl_tokens.pkl")
         pickle_file(path=include_tokens_path, data=include)
         tokens_dict = tokenize_data(
             include=include,
             case_insensitive=self._embedding.case_insensitive,
             **to_tokenize,
         )
         for mode, token_data in tokens_dict.items():
             token_data_attr = "_{mode}_tokens".format(mode=mode)
             token_data_file = "_{mode}_tokens.pkl".format(mode=mode)
             token_data_path = join(self._gen_dir, token_data_file)
             pickle_file(path=token_data_path, data=token_data)
             setattr(self, token_data_attr, token_data)
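
Every example on this page follows the same caching pattern: look for a pickle under the generated-files directory and load it, otherwise build the object and pickle it. A minimal sketch of what helpers like `pickle_file` and `unpickle_file` could look like is below; the function names mirror the calls above, but the bodies are assumptions, not the project's actual implementations.

# Hypothetical stand-ins for the pickle_file / unpickle_file helpers called
# throughout these examples; the real project may add extra handling.
import pickle


def pickle_file(path, data):
    # Serialize `data` to `path`.
    with open(path, "wb") as pkl:
        pickle.dump(data, pkl, protocol=pickle.HIGHEST_PROTOCOL)


def unpickle_file(path):
    # Load and return whatever object was pickled at `path`.
    with open(path, "rb") as pkl:
        return pickle.load(pkl)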
Example #2
 def _init_corpus(self, mode):
     corpus_attr = "_{mode}_corpus".format(mode=mode)
     corpus_file = "_{mode}_corpus.pkl".format(mode=mode)
     corpus_path = join(self._gen_dir, corpus_file)
     if exists(corpus_path):
         corpus = unpickle_file(corpus_path)
     else:
         corpora = (lower_corpus(getattr(dataset, corpus_attr))
                    if self._embedding.case_insensitive else getattr(
                        dataset, corpus_attr) for dataset in self.datasets)
         corpus = merge_corpora(*corpora)
         pickle_file(path=corpus_path, data=corpus)
     setattr(self, corpus_attr, corpus)
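
Example 7 reads token counts out of `self._train_corpus`, so a corpus here is presumably a token-to-frequency mapping. Under that assumption, `lower_corpus` and `merge_corpora` could be as simple as the following sketch (illustrative only, not the project's code):

# Illustrative sketch only: assumes a corpus is a {token: count} dict,
# which is how self._train_corpus.get(word) is used in Example 7.
from collections import Counter


def lower_corpus(corpus):
    # Fold the counts of case variants into their lowercase form.
    lowered = Counter()
    for token, count in corpus.items():
        lowered[token.lower()] += count
    return dict(lowered)


def merge_corpora(*corpora):
    # Sum token frequencies across any number of corpora.
    merged = Counter()
    for corpus in corpora:
        merged.update(corpus)
    return dict(merged)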
Example #3
 def _init_data_dict(self, mode):
     data_dict_attr = "_{mode}_dict".format(mode=mode)
     data_dict_file = "_{mode}_dict.pkl".format(mode=mode)
     data_dict_path = join(self._gen_dir, data_dict_file)
     if exists(data_dict_path):
         data_dict = unpickle_file(data_dict_path)
     else:
         data_dicts = (getattr(dataset, data_dict_attr)
                       for dataset in self.datasets)
         data_dict = accumulate_dicts(*data_dicts)
         pickle_file(path=data_dict_path, data=data_dict)
     class_labels = self._class_labels or []
     class_labels = set(class_labels + data_dict["labels"])
     self._class_labels = list(class_labels)
     setattr(self, data_dict_attr, data_dict)
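
Here `accumulate_dicts` merges the per-dataset dicts key by key (concatenating their parallel lists), while Examples 7 and 8 show it also accepts an `accum_fn` and keyword dicts. A rough sketch of that core behaviour, ignoring the `default` factory and nested-dict handling the real helper evidently supports:

# Rough, assumed sketch of accumulate_dicts' core behaviour; the project's
# helper clearly does more (a `default` factory, accumulation inside a
# single nested dict as used in Example 7).
def accumulate_dicts(*dicts, accum_fn=None, **named_dicts):
    accum_fn = accum_fn or (lambda prev, curr: prev + curr)
    merged = {}
    for source in list(dicts) + list(named_dicts.values()):
        for key, value in source.items():
            merged[key] = accum_fn(merged[key], value) if key in merged else value
    return merged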
Example #4
def generate_dataset_files(args):
    dataset_name = args.dataset_name or basename(normpath(args.path))
    parsing_fn = get_parsing_fn(args.path, args.parser_name)
    ftrain, ftest = get_raw_file_paths(args.path)
    train_dict, test_dict = get_dataset_dicts(ftrain, ftest, parsing_fn)
    target_path = join(DATASET_DATA_PATH, dataset_name)
    if exists(target_path):
        if not args.force:
            cprnt(warn="Dataset '{}' already exists, use -f to overwrite".
                  format(dataset_name))
            return
        cprnt(info="Overwriting previous '{}' dataset".format(dataset_name))
        rmtree(target_path)
    makedirs(target_path)
    pickle_file(join(target_path, "_train_dict.pkl"), train_dict)
    pickle_file(join(target_path, "_test_dict.pkl"), test_dict)
    return dataset_name
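
`generate_dataset_files` only needs an object exposing `path`, `dataset_name`, `parser_name`, and `force`, so it can be driven from a small argparse front end. The wrapper below is a hypothetical illustration (it assumes the function from Example 4 is in scope), not part of the project.

# Hypothetical CLI wrapper around generate_dataset_files; the attribute
# names are taken from the example above, everything else is assumed.
import argparse


def main():
    parser = argparse.ArgumentParser(description="Generate dataset pickles")
    parser.add_argument("path", help="directory containing the raw train/test files")
    parser.add_argument("--dataset-name", default=None)
    parser.add_argument("--parser-name", default=None)
    parser.add_argument("-f", "--force", action="store_true",
                        help="overwrite an existing dataset of the same name")
    generate_dataset_files(parser.parse_args())


if __name__ == "__main__":
    main()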
Example #5
 def _init_data_dict(self, mode, redist):
     data_dict_file = "_{mode}_dict.pkl".format(mode=mode)
     data_dict_attr = "_{mode}_dict".format(mode=mode)
     redist = redist.get(mode) if isinstance(redist, dict) else redist
     srcdir_attr = "_{mode}_srcdir".format(mode=mode)
     data_dict_dir = getattr(self, srcdir_attr)
     data_dict_path = join(data_dict_dir, data_dict_file)
     if exists(data_dict_path):
         data_dict = unpickle_file(data_dict_path)
      elif redist:
          # No cached pickle for this source dir: build this split by
          # resampling the base data dict according to the requested
          # redistribution (one of these two branches is expected to apply).
         source_data_dict_path = join(self.gen_dir, data_dict_file)
         source_data_dict = unpickle_file(source_data_dict_path)
         data_dict = resample_data_dict(source_data_dict, redist)
         pickle_file(path=data_dict_path, data=data_dict)
     class_labels = self._class_labels or []
     class_labels = set(class_labels + data_dict["labels"])
     self._class_labels = list(class_labels)
     setattr(self, data_dict_attr, data_dict)
Example #6
 def _init_corpus(self, mode):
     corpus_pkl_file = "_{mode}_corpus.pkl".format(mode=mode)
     srcdir_attr = "_{mode}_srcdir".format(mode=mode)
     corpus_pkl_dir = getattr(self, srcdir_attr)
     corpus_pkl_path = join(corpus_pkl_dir, corpus_pkl_file)
     if exists(corpus_pkl_path):
         corpus = unpickle_file(corpus_pkl_path)
     else:
         dict_attr = "_{mode}_dict".format(mode=mode)
         data_dict = getattr(self, dict_attr)
          # Splice each target back into its sentence at the recorded
          # character offset, normalizing the surrounding whitespace.
         docs = [
             " ".join([s[:o].strip(), t, s[(o + len(t)):].strip()])
             for s, t, o in zip(
                 data_dict["sentences"],
                 data_dict["targets"],
                 data_dict["offsets"],
             )
         ]
         corpus = generate_corpus(docs, mode)
         pickle_file(data=corpus, path=corpus_pkl_path)
     corpus_attr = "_{mode}_corpus".format(mode=mode)
     setattr(self, corpus_attr, corpus)
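
The list comprehension above splices each target term back into its sentence at the recorded character offset. A standalone check of that splice with made-up data:

# Worked illustration of the splice in Example 6, using made-up data.
sentences = ["the battery life is great", "loved the camera quality"]
targets = ["battery life", "camera"]
offsets = [4, 10]  # character offset of each target within its sentence

docs = [
    " ".join([s[:o].strip(), t, s[(o + len(t)):].strip()])
    for s, t, o in zip(sentences, targets, offsets)
]
print(docs)  # ['the battery life is great', 'loved the camera quality']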
Example #7
    def _vocab_coverage(self):
        _ci = self._embedding.case_insensitive
        v_orig = set(self._embedding.vocab)
        v_extd = set(self._vocab)
        v_train = set(
            sum(
                accumulate_dicts(
                    self._train_tokens,
                    accum_fn=(lambda prev, curr: list(set(prev) | set(curr))),
                    default=lambda v=None: set(sum(v, [])) if v else set(),
                ).values(),
                [],
            ))
        v_test = set(
            sum(
                accumulate_dicts(
                    self._test_tokens,
                    accum_fn=(lambda prev, curr: list(set(prev) | set(curr))),
                    default=lambda v=None: set(sum(v, [])) if v else set(),
                ).values(),
                [],
            ))
        v_train_oov_over_t = (
            set(w for w in v_train
                if self._train_corpus.get(w) >= self._oov_train_threshold) -
            v_orig)
        v_tot = v_train | v_test
        v_oov = v_tot - v_orig

        vocab_data_path = join(self._gen_dir, "vocab.pkl")
        if exists(vocab_data_path):
            self._vocab_data = unpickle_file(vocab_data_path)
        else:
            self._vocab_data = {
                "_ts": datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
                "_threshold": self._oov_train_threshold,
                "embedding": {
                    "original": v_orig,
                    "extended": v_extd
                },
                "datasets": {
                    "total": v_tot,
                    "oov": v_oov,
                    "train": {
                        "total": v_train,
                        "oov": {
                            "total": v_train - v_orig,
                            "embedded": v_train_oov_over_t,
                            "bucketed": v_train - v_extd,
                        },
                    },
                    "test": {
                        "total": v_test,
                        "oov": {
                            "total": v_test - v_orig,
                            "bucketed": v_test - v_extd,
                            "exclusive": v_test - (v_extd | v_train),
                        },
                    },
                },
            }
            pickle_file(path=vocab_data_path, data=self._vocab_data)

        n_tot = len(v_tot)
        n_oov = len(v_oov)
        n_train = len(v_train)
        n_test = len(v_test)
        n_train_oov = len(v_train - v_orig)
        n_train_oov_embd = len(v_train_oov_over_t)
        n_train_oov_bktd = len(v_train - v_extd)
        n_test_oov = len(v_test - v_orig)
        n_test_oov_bktd = len(v_test - v_extd)
        n_test_oov_excl = len(v_test - (v_extd | v_train))
        portion = lambda p, tot=None: str(p) + (" ({:.2f}%)".format(
            (p / tot) * 100) if tot else "")
        return {
            "total": {
                "size": n_tot,
                "in_vocab": portion(n_tot - n_oov, tot=n_tot),
                "out_of_vocab": portion(n_oov, tot=n_tot),
            },
            "train": {
                "size": n_train,
                "oov": {
                    "total":
                    portion(n_train_oov, tot=n_train),
                    "embedded":
                    portion(n_train_oov_embd, tot=n_train),
                    **({
                        "bucketed": portion(n_train_oov_bktd, tot=n_train)
                    } if n_train_oov_bktd > 0 else {}),
                },
            },
            "test": {
                "size": n_test,
                "oov": {
                    "total":
                    portion(n_test_oov, tot=n_test),
                    "bucketed":
                    portion(n_test_oov_bktd, tot=n_test),
                    **({
                        "exclusive": portion(n_test_oov_excl, tot=n_test)
                    } if n_train_oov_bktd > 0 else {}),
                },
            },
        }
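
The `portion` lambda near the end of Example 7 just renders a count with an optional percentage of a total:

# Same formatting as the `portion` lambda in Example 7.
portion = lambda p, tot=None: str(p) + (
    " ({:.2f}%)".format((p / tot) * 100) if tot else "")

print(portion(37, tot=148))  # 37 (25.00%)
print(portion(37))           # 37  (no total given, so no percentage)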
Example #8
 def _init_tfrecords(self):
     tokens_lists = {}
     for mode in ["train", "test"]:
         tfrecord_folder = "_{mode}".format(mode=mode)
         tfrecord_path = join(self._gen_dir, tfrecord_folder)
         if not exists(tfrecord_path):
             tokens_attr = "_{mode}_tokens".format(mode=mode)
             tokens_dict = getattr(self, tokens_attr)
             tokens_list = tokens_dict.values()
             tokens_lists[mode] = sum(tokens_list, [])
     fetch_results_path = join(self._gen_dir, "_fetch_results.pkl")
     if tokens_lists and not exists(fetch_results_path):
         vocab_file_templ = "_vocab{ext}"
         filtered_vocab_file = vocab_file_templ.format(ext=".filt.txt")
         filtered_vocab_path = join(self._gen_dir, filtered_vocab_file)
         if not exists(filtered_vocab_path):
             filtered_vocab = list(
                 set(self._vocab)
                 & set(corpora_vocab(self._train_corpus, self._test_corpus))
             )
             indices = [self._vocab.index(word) for word in filtered_vocab]
             write_vocab_file(filtered_vocab_path, filtered_vocab, indices)
         #! There has to be at least 1 bucket for any
         #! test-time oov tokens (possibly targets)
         lookup_table = ids_lookup_table(
             filtered_vocab_path,
             self._num_oov_buckets,
             vocab_size=len(self._vocab),
         )
         fetch_dict = fetch_lookup_ops(lookup_table, **tokens_lists)
         fetch_results = run_lookups(fetch_dict, metadata_path=self.gen_dir)
         pickle_file(path=fetch_results_path, data=fetch_results)
     else:
         fetch_results = unpickle_file(fetch_results_path)
     oov_buckets = {}
     for mode, values in fetch_results.items():
         data_dict_attr = "_{mode}_dict".format(mode=mode)
         data_dict = getattr(self, data_dict_attr)
         string_features, int_features = split_list(values, parts=2)
         tfexamples = make_tf_examples(string_features, int_features,
                                       data_dict["labels"])
         tfrecord_folder = "_{mode}".format(mode=mode)
         tfrecord_path = join(self._gen_dir, tfrecord_folder)
         if not exists(tfrecord_path):
             write_tfrecords(tfrecord_path, tfexamples)
         #! There has to be at least 1 bucket for any
         #! test-time oov tokens (possibly targets)
         buckets = [
             BUCKET_TOKEN.format(num=n + 1)
             for n in range(self._num_oov_buckets)
         ]
         oov_buckets[mode] = tokens_by_assigned_id(
             string_features,
             int_features,
             start=len(self._vocab),
             keys=buckets,
         )
     accum_oov_buckets = accumulate_dicts(
         **oov_buckets,
         accum_fn=lambda prev, curr: list(set(prev) | set(curr)),
     )
     self._oov_buckets = {
         buckets[i]: accum_oov_buckets[buckets[i]]
         for i in sorted(
             [buckets.index(key) for key in [*accum_oov_buckets]])
     }
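
The closing dict comprehension only reorders `accum_oov_buckets` so its keys follow bucket order. A standalone illustration with made-up bucket names:

# Illustrates the reordering at the end of Example 8 (bucket names made up).
buckets = ["<bkt-1>", "<bkt-2>", "<bkt-3>"]
accum_oov_buckets = {"<bkt-3>": ["foo"], "<bkt-1>": ["bar", "baz"]}

oov_buckets = {
    buckets[i]: accum_oov_buckets[buckets[i]]
    for i in sorted(buckets.index(key) for key in accum_oov_buckets)
}
print(oov_buckets)  # {'<bkt-1>': ['bar', 'baz'], '<bkt-3>': ['foo']}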