Example #1
0
def load_winer(split="train",
               shuffle=True,
               inc_outside=True,
               merge_entities=True):
    """Load the WiNER dataset for the requested split.

    Args:
        split(str): Which split of the data to collect, one of
            ["train", "test"].
        shuffle(bool): Should the data be shuffled with random.shuffle?
        inc_outside(bool): Forwarded to _load_data_spacy; presumably
            controls inclusion of outside ("O") tokens — confirm there.
        merge_entities(bool): Forwarded to _load_data_spacy.

    Returns:
        tuple: (X, Y) parallel sequences of texts and labels. Note that
            when shuffle is True, X and Y are returned as tuples.

    Raises:
        ValueError: If split is not "train" or "test".
    """
    path = check_cache_and_download("winer")

    # Map each supported split to its data file instead of duplicating
    # the loading code per branch. Only train/test exist for WiNER, so
    # the error message must not advertise "evaluate".
    split_files = {"train": "train.txt", "test": "test.txt"}
    if split not in split_files:
        raise ValueError(
            f"Split argument {split} is not one of train or test")

    data_path = os.path.join(path, split_files[split])
    X, Y = _load_data_spacy(data_path,
                            inc_outside=inc_outside,
                            merge_entities=merge_entities)

    if shuffle:
        data = list(zip(X, Y))
        random.shuffle(data)
        X, Y = zip(*data)

    return X, Y
Example #2
0
def load_conll(split="train",
               shuffle=True,
               inc_outside=True,
               dataset: str = "conll"):
    """Load the conll dataset

    Args:
        split(str): Which split of the data to collect, one of ["train", "test",
            "evaluate"].
        shuffle(bool): Should the data be shuffled with random.shuffle?
        inc_outside(bool): Should outside characters be included?
        dataset(str): Which dataset to load. This defaults to "conll" and should
            only be altered for test purposes in which case it should be set to
            "test_conll".

    Returns:
        tuple: (X, Y) parallel sequences of texts and labels. Note that
            when shuffle is True, X and Y are returned as tuples.

    Raises:
        KeyError: If split is not one of "train", "test" or "evaluate".
    """
    path = check_cache_and_download(dataset)

    # Renamed from `map` to avoid shadowing the builtin. The try body is
    # kept minimal so a KeyError raised inside _load_data_spacy is not
    # misreported as a bad split argument.
    split_files = {"train": "eng.train",
                   "test": "eng.testa",
                   "evaluate": "eng.testb"}
    try:
        file_name = split_files[split]
    except KeyError:
        raise KeyError(
            f"Split argument {split} is not one of train, test or evaluate")

    data_path = os.path.join(path, file_name)
    X, Y = _load_data_spacy(data_path, inc_outside=inc_outside)

    if shuffle:
        data = list(zip(X, Y))
        random.shuffle(data)
        X, Y = zip(*data)

    return X, Y
Example #3
0
def load_hoc(split="train", shuffle=True):
    """Load the HoC dataset for the requested split.

    Args:
        split(str): Which split to load, one of ["train", "test"].
        shuffle(bool): Should the data be shuffled with random.shuffle?

    Returns:
        tuple: (X, Y) parallel sequences of texts and labels.

    Raises:
        ValueError: If split is not "train" or "test".
    """
    path = check_cache_and_download("hoc")

    # Dispatch on a split -> filename table rather than an if/elif chain.
    split_to_file = {"train": "train.tsv", "test": "test.tsv"}
    if split not in split_to_file:
        raise ValueError(f"Split argument {split} is not one of train or test")

    X, Y = load_split(os.path.join(path, split_to_file[split]))

    if shuffle:
        pairs = list(zip(X, Y))
        random.shuffle(pairs)
        X, Y = zip(*pairs)

    return X, Y
Example #4
0
def load_conll(split="train", shuffle=True, inc_outside=True):
    """Load the CoNLL dataset for the requested split.

    Args:
        split(str): Which split to load, one of ["train", "test",
            "evaluate"].
        shuffle(bool): Should the data be shuffled with random.shuffle?
        inc_outside(bool): Forwarded to _load_data_spacy; presumably
            controls inclusion of outside ("O") tokens — confirm there.

    Returns:
        tuple: (X, Y) parallel sequences of texts and labels.

    Raises:
        ValueError: If split is not one of the three supported values.
    """
    path = check_cache_and_download("conll")

    # Table-driven split selection replaces the original if/elif chain.
    file_for_split = {
        "train": "eng.train",
        "test": "eng.testa",
        "evaluate": "eng.testb",
    }
    if split not in file_for_split:
        raise ValueError(
            f"Split argument {split} is not one of train, test or evaluate"
        )

    X, Y = _load_data_spacy(os.path.join(path, file_for_split[split]),
                            inc_outside=inc_outside)

    if shuffle:
        paired = list(zip(X, Y))
        random.shuffle(paired)
        X, Y = zip(*paired)

    return X, Y
Example #5
0
                                merge_entities=merge_entities)
    else:
        raise ValueError(
            f"Split argument {split} is not one of train, test or evaluate")

    if shuffle:
        data = list(zip(X, Y))
        random.shuffle(data)
        X, Y = zip(*data)

    return X, Y


if __name__ == "__main__":

    path = check_cache_and_download("winer")

    train_processed_path = os.path.join(path, "train.txt")
    test_processed_path = os.path.join(path, "test.txt")

    if not os.path.exists(train_processed_path):
        # Since this has been done once it shouldnt need to be done again,
        # including here for completeness or in th case we want to increase
        # the sample size
        logger.info("No {} training data file found, generating ...".format(
            train_processed_path))

        NE_path = os.path.join(path, "CoarseNE.tar.bz2")
        docs_path = os.path.join(path, "Documents.tar.bz2")
        vocab_path = os.path.join(path, "document.vocab")