Ejemplo n.º 1
0
def load_raw(max_examples=None):
    """
    Reads the train and test splits of the [HAHA 2019 corpus](https://www.fing.edu.uy/inco/grupos/pln/haha/index.html#data)
    and returns them as two Pandas dataframes.

    ##### Parameters

    * `max_examples`: when not `None`, each dataframe is sliced with
      `[:max_examples]` before being returned (default=`None`, no slicing).

    ##### Returns

    A `(train, test)` tuple of dataframes.

    ##### Examples

    ```python
    >>> train, test = load_raw()
    >>> len(train), len(test)
    (24000, 6000)
    >>> train.columns
    Index(['id', 'text', 'is_humor', 'votes_no', 'votes_1', 'votes_2', 'votes_3',
           'votes_4', 'votes_5', 'funniness_average'],
          dtype='object')
    >>> train["funniness_average"].mean()
    2.0464498676235694

    ```
    """

    download("haha_2019")

    corpus_dir = datapath("haha_2019")
    csv_names = ("haha_2019_train.csv", "haha_2019_test_gold.csv")

    # Train first, test second: the unpacking below relies on this order.
    frames = [pd.read_csv(corpus_dir / csv_name) for csv_name in csv_names]

    if max_examples is not None:
        frames = [frame[:max_examples] for frame in frames]

    train_df, test_df = frames
    return train_df, test_df
Ejemplo n.º 2
0
def load():
    """
    Loads train and valid datasets from [Gisette uci dataset](https://archive.ics.uci.edu/ml/datasets/Gisette).

    Every line of a `.data` file is read as whitespace-separated integers;
    only strictly positive values are stored in the sparse matrices. Every
    line of a `.labels` file is read as an integer and converted to a boolean
    (`True` when the integer is greater than zero).

    ##### Returns

    A tuple `(X_train, y_train, X_valid, y_valid)` where the `X` values are
    CSR sparse matrices and the `y` values are NumPy arrays.

    ##### Examples

    ```python
    >>> X_train, y_train, X_valid, y_valid = load()
    >>> X_train.shape, X_valid.shape
    ((6000, 5000), (1000, 5000))
    >>> len(y_train), len(y_valid)
    (6000, 1000)

    ```
    """

    try:
        download("gisette")
    except Exception:
        # The failure is re-raised untouched; the message is only a hint.
        print(
            "Error loading data. This may be caused due to bad connection. Please delete badly downloaded data and retry"
        )
        raise

    base = datapath("gisette")

    # LIL format is used while filling cell by cell; the matrices are
    # converted to CSR right before returning.
    Xtrain = sp.lil_matrix((6000, 5000))
    Xvalid = sp.lil_matrix((1000, 5000))

    # Each file is opened in a `with` block so its handle is always released.
    for matrix, filename in (
        (Xtrain, "gisette_train.data"),
        (Xvalid, "gisette_valid.data"),
    ):
        with open(base / filename, "r") as data_file:
            for i, line in enumerate(data_file):
                for j, value in enumerate(line.split()):
                    value = int(value)
                    if value > 0:
                        matrix[i, j] = value

    with open(base / "gisette_train.labels", "r") as labels_file:
        ytrain = [int(line) > 0 for line in labels_file]

    with open(base / "gisette_valid.labels", "r") as labels_file:
        yvalid = [int(line) > 0 for line in labels_file]

    return Xtrain.tocsr(), np.asarray(ytrain), Xvalid.tocsr(), np.asarray(
        yvalid)
Ejemplo n.º 3
0
def load(training_batches=5):
    """
    Load the CIFAR-10 dataset

    ##### Parameters

    * `training_batches`: maximum number of batches to load for training,
      each batch has 10,000 examples (min=`1`, max=`5`, default=`5`).

    ##### Returns

    A tuple `(X_train, y_train, X_test, y_test)`. The `X` values are arrays
    reshaped to `(-1, 3, 32, 32)` and then transposed to channel-last order;
    the `y` values are the label sequences stored in the batch files.

    ##### Examples

    >>> X_train, y_train, X_test, y_test = load(training_batches=5)
    >>> X_train.shape
    (50000, 32, 32, 3)
    >>> len(y_train)
    50000
    >>> X_test.shape
    (10000, 32, 32, 3)
    >>> len(y_test)
    10000
    >>> y_train[0]
    6

    """
    download("cifar10")

    def _read_batch(batch_path):
        # NOTE(review): pickle executes arbitrary code on load; this is only
        # safe as long as the downloaded archive is trusted.
        with open(batch_path, 'rb') as fp:
            return pickle.load(fp, encoding='bytes')

    def _to_images(flat_pixels):
        # Rows are reshaped to (3, 32, 32) and the channel axis is moved last.
        return np.reshape(flat_pixels, (-1, 3, 32, 32)).transpose(0, 2, 3, 1)

    pixel_chunks = []
    y_train = []

    for batch_number in range(1, training_batches + 1):
        contents = _read_batch(datapath('cifar10') / f'data_batch_{batch_number}')
        pixel_chunks.append(contents[b'data'])
        y_train.extend(contents[b'labels'])

    X_train = _to_images(np.vstack(pixel_chunks))

    test_contents = _read_batch(datapath('cifar10') / 'test_batch')
    y_test = test_contents[b'labels']
    X_test = _to_images(test_contents[b'data'])

    return X_train, y_train, X_test, y_test
Ejemplo n.º 4
0
def load(representation='onehot'):
    """
    Loads the `car.data` file of the `uci_cars` dataset.

    Every non-blank line is split on commas. The first six fields become the
    features `buying`, `maint`, `doors`, `persons`, `lug_boot` and `safety`
    (kept as strings), and the seventh field is the target.

    ##### Parameters

    * `representation`: either `'numeric'` or `'onehot'` (default=`'onehot'`);
      selects which encoding helper receives the parsed rows.

    ##### Returns

    Whatever the selected encoding helper returns for `(X, y)`.

    ##### Raises

    * `ValueError`: if `representation` is not one of the accepted values.
    """
    download("uci_cars")

    feature_names = ("buying", "maint", "doors", "persons", "lug_boot", "safety")

    X = []
    y = []

    # The context manager guarantees the file handle is released.
    with open(datapath("uci_cars") / "car.data", "r") as f:
        for raw_line in f:
            stripped = raw_line.strip()

            # Blank lines (e.g. a trailing newline) carry no record.
            if not stripped:
                continue

            clean_line = stripped.split(",")

            X.append(dict(zip(feature_names, clean_line[:6])))
            y.append(clean_line[6])

    if representation == 'numeric':
        return _load_numeric(X, y)
    elif representation == 'onehot':
        return _load_onehot(X, y)

    raise ValueError("Invalid value for represenation: %s" % representation)
Ejemplo n.º 5
0
def load(max_examples=None):
    """
    Loads the `movie_reviews` dataset as raw texts with their polarity.

    Every `*.txt` file found recursively under the dataset folder is read in
    full. A file is labelled `"neg"` when one of its directories, relative to
    the dataset folder, is named `neg`; otherwise it is labelled `"pos"`.
    Files are visited in random order.

    ##### Parameters

    * `max_examples`: when truthy, reading stops as soon as that many
      examples have been collected (default=`None`, read everything).

    ##### Returns

    A tuple `(sentences, classes)` of two parallel lists.
    """
    try:
        download("movie_reviews")
    except Exception:
        # The failure is re-raised untouched; the message is only a hint.
        print(
            "Error loading data. This may be caused due to bad connection. Please delete badly downloaded data and retry"
        )
        raise

    sentences = []
    classes = []

    path = datapath("movie_reviews")

    ids = list(path.rglob("*.txt"))
    random.shuffle(ids)

    for fd in ids:
        # Compare path components instead of searching for "neg/" in the
        # string form: this works with any path separator and ignores the
        # directories above the dataset folder.
        if "neg" in fd.relative_to(path).parts[:-1]:
            cls = "neg"
        else:
            cls = "pos"

        with fd.open() as fp:
            sentences.append(fp.read())
            classes.append(cls)

        if max_examples and len(classes) >= max_examples:
            break

    return sentences, classes
Ejemplo n.º 6
0
def load(white=True, red=True, max_examples=None):
    """
    Loads the `wine_quality` dataset.

    Each selected CSV file is read skipping its first (title) line. Rows are
    split on `;`; every field but the last is converted to `float` and used
    as a feature, and the last field is the target (also a `float`). Two
    indicator columns are prepended to each row: `[1, 0]` for white wines and
    `[0, 1]` for red wines. White wines, when selected, come first.

    ##### Parameters

    * `white`: include `winequality-white.csv` (default=`True`).
    * `red`: include `winequality-red.csv` (default=`True`).
    * `max_examples`: when truthy, stop once that many rows have been
      collected in total (default=`None`, read everything).

    ##### Returns

    A tuple `(X, y)` of NumPy arrays.

    ##### Raises

    * `ValueError`: if both `white` and `red` are false.
    """
    if not red and not white:
        raise ValueError("Either red or white must be selected")

    download("wine_quality")

    base = datapath("wine_quality")

    # Only the requested files are opened, in white-then-red order.
    selected = []
    if white:
        selected.append(("winequality-white.csv", [1, 0]))
    if red:
        selected.append(("winequality-red.csv", [0, 1]))

    X = []
    y = []

    for filename, color_flags in selected:
        # The context manager guarantees the file handle is released.
        with open(base / filename, "r") as csv_file:
            next(csv_file, None)  # skip the title line

            for raw_line in csv_file:
                if max_examples and len(X) >= max_examples:
                    break

                stripped = raw_line.strip()

                # Blank lines (e.g. a trailing newline) carry no record.
                if not stripped:
                    continue

                clean_line = stripped.split(";")

                X.append(color_flags + [float(field) for field in clean_line[:-1]])
                y.append(float(clean_line[-1]))

    return np.asarray(X), np.asarray(y)
Ejemplo n.º 7
0
def load():
    """
    Loads train and valid datasets from [DOROTHEA uci dataset](https://archive.ics.uci.edu/ml/datasets/dorothea).

    Every line of a `.data` file is read as whitespace-separated 1-based
    column indices; the corresponding cells of that row are set to `1`.
    Every line of a `.labels` file is read as an integer.

    ##### Returns

    A tuple `(X_train, y_train, X_valid, y_valid)` where the `X` values are
    CSR sparse matrices and the `y` values are NumPy arrays.

    ##### Examples

    ```python
    >>> X_train, y_train, X_valid, y_valid = load()
    >>> X_train.shape, X_valid.shape
    ((800, 100000), (350, 100000))
    >>> len(y_train), len(y_valid)
    (800, 350)

    ```
    """

    download("dorothea")

    base = datapath('dorothea')

    # LIL format is used while filling cell by cell; the matrices are
    # converted to CSR right before returning.
    Xtrain = sp.lil_matrix((800, 100000), dtype=int)
    Xvalid = sp.lil_matrix((350, 100000), dtype=int)

    # Each file is opened in a `with` block so its handle is always released.
    for matrix, filename in (
        (Xtrain, "dorothea_train.data"),
        (Xvalid, "dorothea_valid.data"),
    ):
        with open(base / filename, "r") as data_file:
            for row, line in enumerate(data_file):
                for col in line.split():
                    # Indices in the file start at 1.
                    matrix[row, int(col) - 1] = 1

    with open(base / "dorothea_train.labels", "r") as labels_file:
        ytrain = [int(line) for line in labels_file]

    with open(base / "dorothea_valid.labels", "r") as labels_file:
        yvalid = [int(line) for line in labels_file]

    return Xtrain.tocsr(), np.asarray(ytrain), Xvalid.tocsr(), np.asarray(
        yvalid)
Ejemplo n.º 8
0
def load(representation="numeric"):
    """
    Loads corpora from [ABALONE uci dataset](https://archive.ics.uci.edu/ml/datasets/Abalone).

    Every non-blank line of `abalone.data` is split on commas. The first
    eight fields become named features (kept as strings) and the ninth field
    is the target.

    ##### Parameters

    * `representation`: either `"numeric"` or `"onehot"` (default=`"numeric"`);
      selects which encoding helper receives the parsed rows.

    ##### Returns

    Whatever the selected encoding helper returns for `(X, y)`.

    ##### Raises

    * `ValueError`: if `representation` is not one of the accepted values.

    ##### Examples

    ```python
    >>> X, y = load()
    >>> X.shape
    (4177, 6047)
    >>> len(y)
    4177

    ```
    """

    try:
        download("abalone")
    except Exception:
        # The failure is re-raised untouched; the message is only a hint.
        print(
            "Error loading data. This may be caused due to bad connection. Please delete badly downloaded data and retry"
        )
        raise

    # Feature names in file-column order.
    # NOTE(review): the UCI attribute list appears to put "Whole weight"
    # before "Shucked weight"; the order below is kept exactly as it was so
    # the encoded output does not change -- confirm against abalone.names.
    feature_names = (
        "Sex",
        "Length",
        "Diameter",
        "Height",
        "Shucked weight",
        "Whole weight",
        "Viscera weight",
        "Shell weight",
    )

    X = []
    y = []

    # The context manager guarantees the file handle is released.
    with open(datapath("abalone") / "abalone.data", "r") as f:
        for raw_line in f:
            stripped = raw_line.strip()

            # Blank lines (e.g. a trailing newline) carry no record.
            if not stripped:
                continue

            clean_line = stripped.split(",")

            X.append(dict(zip(feature_names, clean_line[:8])))
            y.append(clean_line[8])

    if representation == "numeric":
        return _load_numeric(X, y)
    elif representation == "onehot":
        return _load_onehot(X, y)

    raise ValueError("Invalid value for represenation: %s" % representation)
Ejemplo n.º 9
0
def load():
    """
    Loads corpora from [Yeast uci dataset](https://archive.ics.uci.edu/ml/datasets/Yeast).

    Every non-blank line of `yeast.data` is split on whitespace. The first
    field is skipped, fields 1 to 8 are converted to `float` and stored under
    the keys `"1"` to `"8"`, and field 9 is the target.

    ##### Returns

    Whatever the one-hot encoding helper returns for `(X, y)`.

    ##### Examples

    ```python
    >>> X, y = load()
    >>> X.shape
    (1484, 8)
    >>> len(y)
    1484

    ```
    """

    try:
        download("yeast")
    except Exception:
        # The failure is re-raised untouched; the message is only a hint.
        print(
            "Error loading data. This may be caused due to bad connection. Please delete badly downloaded data and retry"
        )
        raise

    X = []
    y = []

    # Read from the dataset folder addressed by name, like the other loaders
    # do, instead of a path derived from this module's location.
    with open(datapath("yeast") / "yeast.data", "r") as f:
        for raw_line in f:
            clean_line = raw_line.strip().split()

            # Blank lines (e.g. a trailing newline) carry no record.
            if not clean_line:
                continue

            X.append({str(k): float(clean_line[k]) for k in range(1, 9)})
            y.append(clean_line[9])

    return _load_onehot(X, y)
Ejemplo n.º 10
0
def data_list():
    """
    🔍 List the available datasets.
    """

    catalog = get_datasets_list()

    listing = Table("📚 Dataset", "💾", "🔗 URL")

    # Rows are emitted ordered by dataset name.
    by_name = sorted(catalog.items(), key=lambda pair: pair[0])

    for name, link in by_name:
        # The middle column shows a check mark only when the dataset's local
        # path already exists.
        mark = "✔️" if datapath(name).exists() else ""
        listing.add_row(name, mark, link)

    console.print(listing)
Ejemplo n.º 11
0
def load(max_examples=None):
    """
    Loads the `german_credit` dataset from its `german.data` file.

    Every non-blank line is split on whitespace. All fields but the last are
    passed through `_parse` and stored under the keys `feature_0`,
    `feature_1`, ...; the last field is the target, `True` when it equals
    `2`. The rows are then encoded with a dense `DictVectorizer`.

    ##### Parameters

    * `max_examples`: when truthy, stop once that many rows have been read
      (default=`None`, read everything).

    ##### Returns

    A tuple `(X, y)` with the vectorized features and a NumPy array of
    booleans.
    """
    download("german_credit")

    X = []
    y = []

    # The context manager guarantees the file handle is released.
    with open(datapath("german_credit") / "german.data", "r") as f:
        for raw_line in f:

            if max_examples and len(X) >= max_examples:
                break

            clean_line = raw_line.strip().split()

            # Blank lines (e.g. a trailing newline) carry no record.
            if not clean_line:
                continue

            line = {
                'feature_%i' % index: _parse(value)
                for index, value in enumerate(clean_line[:-1])
            }

            X.append(line)
            y.append(int(clean_line[-1]) == 2)

    return DictVectorizer(sparse=False).fit_transform(X), np.asarray(y)
Ejemplo n.º 12
0
def _collect_brat_dir(directory, X, y):
    """
    Appends to `X` and `y` the tokens and tags of every annotation file in
    `directory` whose tags pass the `compare_tags` check.
    """
    # `scandir` is used as a context manager so the iterator is closed.
    with os.scandir(directory) as entries:
        for entry in entries:
            name_parts = entry.name.split(".")

            # Only names whose second dot-separated part is "ann" are
            # annotation files; names without a dot are skipped.
            if len(name_parts) < 2 or name_parts[1] != "ann":
                continue

            text, phi = parse_text_and_tags(entry.path)
            brat_corpora, text, ibo_corpora = get_tagged_tokens(text, phi)

            if compare_tags(brat_corpora, phi):
                X.extend(text)
                y.extend(ibo_corpora)


def load(max_examples=None):
    """
    Loads train and test datasets from [MEDDOCAN iberleaf 2018](https://github.com/PlanTL-SANIDAD/SPACCC_MEDDOCAN).

    The files under `train/brat` and `dev/brat` are merged into the training
    set; the files under `test/brat` form the second set.

    ##### Parameters

    * `max_examples`: when not `None`, each returned list is sliced with
      `[:max_examples]` (default=`None`, no slicing).

    ##### Returns

    A tuple `(X_train, y_train, X_test, y_test)` of lists.

    ##### Examples

    ```python
    >>> X_train, y_train, X_valid, y_valid = load()
    >>> len(X_train), len(X_valid)
    (25622, 8432)
    >>> len(y_train), len(y_valid)
    (25622, 8432)

    ```
    """

    try:
        download("meddocan_2018")
    except Exception:
        # The failure is re-raised untouched; the message is only a hint.
        print(
            "Error loading data. This may be caused due to bad connection. Please delete badly downloaded data and retry"
        )
        raise

    base = datapath("meddocan_2018")

    X_train = []
    X_test = []
    y_train = []
    y_test = []

    # Train and dev both feed the training set, in that order.
    _collect_brat_dir(base / "train/brat", X_train, y_train)
    _collect_brat_dir(base / "dev/brat", X_train, y_train)
    _collect_brat_dir(base / "test/brat", X_test, y_test)

    if max_examples is not None:
        X_train = X_train[:max_examples]
        X_test = X_test[:max_examples]
        y_train = y_train[:max_examples]
        y_test = y_test[:max_examples]

    return X_train, y_train, X_test, y_test
Ejemplo n.º 13
0
def _read_shuttle_split(path, max_examples):
    """
    Parses one Shuttle file into a list of feature dicts and a list of labels.
    """
    rows = []
    labels = []

    # The context manager guarantees the file handle is released.
    with open(path, "r") as split_file:
        for raw_line in split_file:
            clean_line = raw_line.strip().split()

            # Blank lines (e.g. a trailing newline) carry no record.
            if not clean_line:
                continue

            # Fields 0..8 are stored as ints under the keys "1".."9".
            rows.append({str(k + 1): int(clean_line[k]) for k in range(9)})
            labels.append(clean_line[9])

            if max_examples and len(rows) >= max_examples:
                break

    return rows, labels


def load(max_examples=None):
    """
    Loads train and valid datasets from [Shuttle uci dataset](https://archive.ics.uci.edu/ml/datasets/Statlog+(Shuttle)).

    Every non-blank line of `shuttle.trn` and `shuttle.tst` is split on
    whitespace: the first nine fields are integer features and the tenth is
    the target. Each split is encoded independently by the one-hot helper.

    ##### Parameters

    * `max_examples`: when truthy, at most that many rows are read from each
      file (default=`None`, read everything).

    ##### Returns

    A tuple `(X_train, y_train, X_test, y_test)`.

    ##### Examples

    ```python
    >>> X_train, y_train, X_valid, y_valid = load()
    >>> X_train.shape, X_valid.shape
    ((43500, 9), (14500, 9))
    >>> len(y_train), len(y_valid)
    (43500, 14500)

    ```
    """

    try:
        download("shuttle")
    except Exception:
        # The failure is re-raised untouched; the message is only a hint.
        print(
            "Error loading data. This may be caused due to bad connection. Please delete badly downloaded data and retry"
        )
        raise

    base = datapath("shuttle")

    X_train, y_train = _read_shuttle_split(base / "shuttle.trn", max_examples)
    X_test, y_test = _read_shuttle_split(base / "shuttle.tst", max_examples)

    # NOTE(review): train and test are encoded by separate calls; whether
    # both calls yield the same encoding depends on the helper -- confirm.
    X_train, y_train = _load_onehot(X_train, y_train)
    X_test, y_test = _load_onehot(X_test, y_test)

    return X_train, y_train, X_test, y_test
Ejemplo n.º 14
0
def load_training_entities():
    # download("ehealthkd20")

    training_path = datapath("ehealthkd20") / "training"

    collection = Collection().load_dir(training_path)