Example #1
    def load(self, path, features='BoW', match_avitm=True):
        if path.startswith('~/'):
            path = os.path.expanduser(path)

        ### Specify the file locations
        train_path = path + '/train.npz'
        dev_path = path + '/dev.npz'
        test_path = path + '/test.npz'
        vocab_path = path + '/train.vocab.json'

        ### Load train
        train_csr = load_sparse(train_path)
        train = np.array(train_csr.todense()).astype('float32')

        ### Load dev
        self.dev_counts = load_sparse(dev_path).tocsc() # will be used for NPMI

        ### Load test
        test_csr = load_sparse(test_path)
        test = np.array(test_csr.todense()).astype('float32')

        ### load vocab
        # ENCODING = "ISO-8859-1"
        ENCODING = "utf-8"
        with open(vocab_path, encoding=ENCODING) as f:
            vocab_list = json.load(f)

        # construct maps
        vocab2dim = dict(zip(vocab_list, range(len(vocab_list))))
        dim2vocab = reverse_dict(vocab2dim)

        return [train, None, test, None, None, None], [None, None, None], [vocab2dim, dim2vocab, None, None]
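
The nested-list return value above is easy to misread, so here is a minimal sketch of how a caller might unpack it. The slot meanings (data splits in the first list, labels in the second, vocabulary maps in the third) and the `DataHandler` name are assumptions inferred from the positions used in this example, not a documented interface.

# Hypothetical usage; DataHandler stands in for whichever class defines load().
loader = DataHandler()
datasets, labels, vocab_maps = loader.load('~/data/20ng')

train, test = datasets[0], datasets[2]               # dense float32 count matrices
vocab2dim, dim2vocab = vocab_maps[0], vocab_maps[1]  # word <-> column index maps
print(train.shape, test.shape, len(vocab2dim))
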
Example #2
    def load(self, data_path, features='BoW', match_avitm=True):

        ### Specify the file locations
        train_path = data_path + '/train.npz'
        dev_path = data_path + '/dev.npz'
        test_path = data_path + '/test.npz'
        vocab_path = data_path + '/train.vocab.json'

        ### Load train
        train_csr = load_sparse(train_path)
        train_counts = np.array(train_csr.todense()).astype('float32')
        train_bert_logits = np.load(self.logit_path + "/train.npy")
        train = np.concatenate([train_counts, train_bert_logits], axis=1)

        if self.logit_clip is not None:
            # keep only the largest logits for each document; the budget k
            # scales with the number of distinct terms in the document, and
            # everything below the cutoff is masked out with -inf
            doc_tokens = np.sum(train_counts > 0, axis=1)
            vocab_size = train_counts.shape[1]

            for i, (row, total) in enumerate(zip(train_bert_logits,
                                                 doc_tokens)):
                k = self.logit_clip * total  # keep this many logits
                if k < vocab_size:
                    min_logit = np.quantile(row, 1 - k / vocab_size)
                    train_bert_logits[
                        i, train_bert_logits[i] < min_logit] = -np.inf

        #min_logits = np.quantile(train_bert_logits, np.quantile(train_counts.sum(1), 0.9) / 20_000, axis=1)
        #train_bert_logits[train_bert_logits < min_logits.reshape(-1, 1)] = -np.inf

        ### Load dev
        self.dev_counts = load_sparse(
            dev_path).tocsc()  # will be used for NPMI

        ### Load test
        test_csr = load_sparse(test_path)
        test_counts = np.array(test_csr.todense()).astype('float32')
        # constant ones stand in for the BERT logits at test time
        test_bert_logits = np.ones_like(test_counts)
        test = np.concatenate([test_counts, test_bert_logits], axis=1)

        ### load vocab
        # ENCODING = "ISO-8859-1"
        ENCODING = "utf-8"
        with open(vocab_path, encoding=ENCODING) as f:
            vocab_list = json.load(f)

        # construct maps
        vocab2dim = dict(zip(vocab_list, range(len(vocab_list))))
        dim2vocab = reverse_dict(vocab2dim)

        return [train, None, test, None, None,
                None], [None, None, None], [vocab2dim, dim2vocab, None, None]
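
The logit-clipping branch is the least obvious part of this variant, so below is a small, self-contained sketch of the same idea on toy data. `clip_logits_to_top_k` is a name introduced here for illustration; the quantile-based masking mirrors the loop in the method above rather than any API from the repository.

import numpy as np

def clip_logits_to_top_k(logits, counts, logit_clip):
    """Keep roughly logit_clip * (distinct terms in the document) of the
    largest logits per row and mask the rest with -inf."""
    logits = logits.copy()
    vocab_size = counts.shape[1]
    doc_tokens = np.sum(counts > 0, axis=1)
    for i, (row, total) in enumerate(zip(logits, doc_tokens)):
        k = logit_clip * total
        if k < vocab_size:
            min_logit = np.quantile(row, 1 - k / vocab_size)
            logits[i, row < min_logit] = -np.inf
    return logits

# toy usage with random data
rng = np.random.default_rng(0)
counts = (rng.random((4, 10)) > 0.6).astype('float32')
logits = rng.normal(size=(4, 10))
clipped = clip_logits_to_top_k(logits, counts, logit_clip=2)
print(np.isfinite(clipped).sum(axis=1))  # surviving logits per document
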
Example #3
import json

import numpy as np
from nltk.stem import WordNetLemmatizer
from sklearn.datasets import fetch_20newsgroups
from tqdm import tqdm

# project-local helpers; the exact module path is an assumption
from utils import load_sparse, process_raw_doc


def toks_to_onehot(doc, vocab):
    # despite the name, this returns a bag-of-words count vector: entry j is
    # the number of times vocabulary word j occurs in the document
    tokens = [vocab[word] for word in doc]
    return np.bincount(tokens, minlength=len(vocab))


if __name__ == "__main__":
    ## Re-processing
    REMOVE_HEADER = True

    with open("./aligned/train.vocab.json", "r") as infile:
        vocab = json.load(infile)
        vocab_dict = dict(zip(vocab, range(len(vocab))))

    # Read in the ProdLDA 20ng data
    orig_counts_train = load_sparse("./aligned/train.npz").todense()
    orig_counts_test = load_sparse("./aligned/test.npz").todense()

    # Get the original raw text
    raw_train = fetch_20newsgroups(data_home="./intermediate", subset="train")
    raw_test = fetch_20newsgroups(data_home="./intermediate", subset="test")

    # Turn the raw text into count data
    wnl = WordNetLemmatizer()
    raw_tokens_train = [
        process_raw_doc(doc, vocab_dict, wnl, REMOVE_HEADER)
        for doc in tqdm(raw_train.data)
    ]
    raw_tokens_test = [
        process_raw_doc(doc, vocab_dict, wnl, REMOVE_HEADER)
        for doc in tqdm(raw_test.data)
    ]
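
For reference, a tiny self-contained check of the `toks_to_onehot` helper defined at the top of this example; the toy vocabulary is made up for illustration.

import numpy as np

def toks_to_onehot(doc, vocab):
    tokens = [vocab[word] for word in doc]
    return np.bincount(tokens, minlength=len(vocab))

toy_vocab = {"topic": 0, "model": 1, "data": 2}
print(toks_to_onehot(["topic", "model", "topic"], toy_vocab))  # -> [2 1 0]
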
Example #4
    indir = Path("../data/20ng")
    outdir = Path("processed-dev")
    outdir = Path(indir, outdir)
    outdir.mkdir(exist_ok=True)

    # copy over the train files
    shutil.copy(Path(indir, "train.jsonlist"), Path(outdir, "train.jsonlist"))
    shutil.copy(Path(indir, "processed/train.npz"), Path(outdir, "train.npz"))
    shutil.copy(Path(indir, "processed/train.ids.json"),
                Path(outdir, "train.ids.json"))
    shutil.copy(Path(indir, "processed/train.vocab.json"),
                Path(outdir, "train.vocab.json"))

    # read in test
    test_jsonlist = utils.load_jsonlist(Path(indir, "test.jsonlist"))
    test_counts = utils.load_sparse(Path(indir, "processed/test.npz"))
    test_ids = utils.load_json(Path(indir, "processed/test.ids.json"))

    # split into a dev set
    dev_jsonlist, test_jsonlist, dev_counts, test_counts, dev_ids, test_ids = (
        train_test_split(test_jsonlist,
                         test_counts,
                         test_ids,
                         test_size=0.5,
                         random_state=11225))

    # save
    utils.save_jsonlist(dev_jsonlist, Path(outdir, "dev.jsonlist"))
    utils.save_sparse(dev_counts, Path(outdir, "dev.npz"))
    utils.save_json(dev_ids, Path(outdir, "dev.ids.json"))
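
The dev/test split above relies on `train_test_split` applying the same shuffle to every positional argument it is given, which is what keeps the jsonlist, count matrix, and id list aligned. A minimal illustration on toy data (none of it from this repository):

from sklearn.model_selection import train_test_split

docs = [{"id": i} for i in range(6)]
counts = [[i, i * 10] for i in range(6)]
ids = list(range(6))

dev_docs, test_docs, dev_counts, test_counts, dev_ids, test_ids = (
    train_test_split(docs, counts, ids, test_size=0.5, random_state=11225))

# every split keeps the same rows together across the three inputs
assert [d["id"] for d in dev_docs] == dev_ids
assert [c[0] for c in dev_counts] == dev_ids
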
Example #5
import shutil
from pathlib import Path

from sklearn.model_selection import train_test_split

import utils

if __name__ == "__main__":
    outdir = Path("processed-dev")
    outdir.mkdir(exist_ok=True)

    # copy over the train files
    shutil.copy("train.jsonlist", Path(outdir, "train.jsonlist"))
    shutil.copy("processed/train.npz", Path(outdir, "train.npz"))
    shutil.copy("processed/train.ids.json", Path(outdir, "train.ids.json"))
    shutil.copy("processed/train.vocab.json", Path(outdir, "train.vocab.json"))

    # read in test
    test_jsonlist = utils.load_jsonlist("test.jsonlist")
    test_counts = utils.load_sparse("processed/test.npz")
    test_ids = utils.load_json("processed/test.ids.json")

    # split into a dev set
    dev_jsonlist, test_jsonlist, dev_counts, test_counts, dev_ids, test_ids = (
        train_test_split(test_jsonlist,
                         test_counts,
                         test_ids,
                         test_size=0.5,
                         random_state=11225))

    # save
    utils.save_jsonlist(dev_jsonlist, Path(outdir, "dev.jsonlist"))
    utils.save_sparse(dev_counts, Path(outdir, "dev.npz"))
    utils.save_json(dev_ids, Path(outdir, "dev.ids.json"))
Example #6
import shutil
from pathlib import Path

import numpy as np

from utils import load_jsonlist, save_jsonlist, load_sparse, save_sparse, load_json, save_json

if __name__ == "__main__":

    dev_dir = Path("./aligned/dev")
    dev_dir.mkdir(exist_ok=True)
    for fpath in Path("./aligned").glob("*"):
        if fpath.is_file():
            shutil.copy(str(fpath), str(Path(dev_dir, fpath.name)))

    # Load the ids from the replicated run; sets make the membership checks
    # below fast
    repl_train_ids = set(load_json("./replicated/dev/train.ids.json"))
    repl_dev_ids = set(load_json("./replicated/dev/dev.ids.json"))

    data = load_jsonlist(Path(dev_dir, "train.jsonlist"))
    counts = load_sparse(Path(dev_dir, "train.npz"))
    ids = load_json(Path(dev_dir, "train.ids.json"))

    # split based on how the replication data was split
    data_train = [doc for doc in data if doc['id'] in repl_train_ids]
    data_dev = [doc for doc in data if doc['id'] in repl_dev_ids]

    counts_train = counts[
        np.array([doc['id'] in repl_train_ids for doc in data]), :]
    counts_dev = counts[np.array([doc['id'] in repl_dev_ids
                                  for doc in data]), :]

    ids_train = [doc_id for doc_id in ids if doc_id in repl_train_ids]
    ids_dev = [doc_id for doc_id in ids if doc_id in repl_dev_ids]

    assert (len(data_train) == counts_train.shape[0] == len(ids_train))
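
The id-based filtering above scans the document list once per split. As a design note, a single boolean mask per split can be built once and reused for the jsonlist, the sparse count matrix, and the id list, so the three stay aligned by construction. The sketch below is hypothetical: the helper name and the assumption that `ids[i] == data[i]['id']` are introduced here, not taken from the repository.

import numpy as np

def split_by_ids(data, counts, ids, keep_ids):
    keep_ids = set(keep_ids)  # set lookup avoids an O(n) list scan per document
    mask = np.array([doc["id"] in keep_ids for doc in data])
    data_kept = [doc for doc, keep in zip(data, mask) if keep]
    ids_kept = [doc_id for doc_id, keep in zip(ids, mask) if keep]
    return data_kept, counts[mask, :], ids_kept

# e.g., with the variables defined above:
# data_train, counts_train, ids_train = split_by_ids(data, counts, ids, repl_train_ids)
# data_dev, counts_dev, ids_dev = split_by_ids(data, counts, ids, repl_dev_ids)
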