Example #1
def fetch_fashion_mnist(data_target = True, custom_path = os.getcwd()):  # note: the os.getcwd() default is evaluated once, at definition time
    train_dict = {}
    for file_key, file_value in train_files.items():
        train_dict.update({file_key : maybe_download(custom_path + '/../../ztlearn/datasets/fashion/', URL + file_value)})

    # relies on dict insertion order (labels file first, then images);
    # IDX format: label files carry an 8-byte header, image files a 16-byte header
    with gzip.open(list(train_dict.values())[0], 'rb') as label_path:
        train_label = np.frombuffer(label_path.read(), dtype = np.uint8, offset = 8)

    with gzip.open(list(train_dict.values())[1], 'rb') as data_path:
        train_data = np.frombuffer(data_path.read(), dtype = np.uint8, offset = 16).reshape(len(train_label), 784) # 28 x 28, flattened

    test_dict = {}
    for file_key, file_value in test_files.items():
        test_dict.update({file_key : maybe_download(custom_path + '/../../ztlearn/datasets/fashion/', URL + file_value)})

    with gzip.open(list(test_dict.values())[0], 'rb') as label_path:
        test_label = np.frombuffer(label_path.read(), dtype = np.uint8, offset = 8)

    with gzip.open(list(test_dict.values())[1], 'rb') as data_path:
        test_data = np.frombuffer(data_path.read(), dtype = np.uint8, offset = 16).reshape(len(test_label), 784)

    if data_target:
        return DataSet(np.concatenate((train_data,  test_data),  axis = 0),
                       np.concatenate((train_label, test_label), axis = 0))
    else:
        return train_data, test_data, train_label, test_label
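
A quick usage sketch for the combined variant; the data/target attribute names on DataSet are assumed here, not confirmed by the snippet above:

    fashion = fetch_fashion_mnist(data_target = True)
    print(fashion.data.shape)    # (70000, 784): 60,000 train + 10,000 test images
    print(fashion.target.shape)  # (70000,)
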
Example #2
def fetch_digits(data_target=True):
    file_path = maybe_download('../../ztlearn/datasets/digits/', URL)

    with gzip.open(file_path, 'rb') as digits_path:
        digits_data = np.loadtxt(digits_path, delimiter=',')

    data, target = digits_data[:, :-1], digits_data[:, -1].astype(int)  # np.int was removed in NumPy 1.24

    if data_target:
        return DataSet(data, target)
    else:
        return train_test_split(data, target, test_size=0.33, random_seed=5)
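
Note that train_test_split here is the project's own helper (it takes random_seed, where scikit-learn's version takes random_state). A hedged sketch of the split variant, assuming it returns values in the same order as the tuple-returning fetchers in these examples:

    train_data, test_data, train_label, test_label = fetch_digits(data_target=False)
    print(train_data.shape[1])  # 64 features per sample, if this is the classic 8x8 digits set
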
Example #3
def fetch_pima_indians(data_target=True):
    file_path = maybe_download('../../ztlearn/datasets/pima/', URL)
    describe = [
        'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
        'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome (0 or 1)'
    ]  # ordered to match the columns of the standard pima-indians-diabetes.csv

    dataframe = pd.read_csv(file_path, names=describe)
    data, target = dataframe.values[:, 0:8], dataframe.values[:, 8]

    if data_target:
        return DataSet(data, target, describe)
    else:
        return train_test_split(data, target, test_size=0.2, random_seed=2)
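
The describe list doubles as the CSV header and is handed to DataSet as a third, descriptive argument. A small sketch, assuming DataSet exposes data and target attributes:

    pima = fetch_pima_indians(data_target=True)
    print(pima.data.shape)   # (768, 8) for the standard Pima Indians diabetes CSV
    print(set(pima.target))  # {0.0, 1.0}: binary outcome
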
Example #4
def fetch_boston(data_target=True):
    file_path = maybe_download('../../ztlearn/datasets/boston/', URL)
    describe = [
        'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
        'PTRATIO', 'B', 'LSTAT', 'MEDV'
    ]

    dataframe = pd.read_csv(file_path, sep=r'\s+', names=describe)  # delim_whitespace is deprecated since pandas 2.2
    data, target = dataframe.values[:, 0:13], dataframe.values[:, 13]

    if data_target:
        return DataSet(data, target, describe)
    else:
        return train_test_split(data, target, test_size=0.2, random_seed=2)
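
Usage follows the same pattern; the standard Boston housing file has 506 rows with MEDV as the regression target:

    boston = fetch_boston(data_target=True)
    print(boston.data.shape)  # (506, 13), assuming the standard housing.data file
    print(boston.target[:3])  # first three median home values (MEDV)
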
Example #5
def fetch_steel_plates_faults(data_target=True, custom_path=os.getcwd()):
    file_path = maybe_download(custom_path + '/../../ztlearn/datasets/steel/',
                               URL)
    file_path_2 = maybe_download(
        custom_path + '/../../ztlearn/datasets/steel/', URL_2)
    describe = [
        'Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps',
        'Other_Faults'
    ]

    input_header = pd.read_csv(file_path_2, header=None)
    input_data = pd.read_csv(file_path, header=None, sep="\t")
    # set_axis(..., inplace=True) was removed in pandas 2.0; assign the columns directly
    input_data.columns = input_header.values.flatten()

    dataframe = input_data.copy()
    dataframe.drop(describe, axis=1, inplace=True)  # features: all columns except the fault indicators
    targetframe = input_data[describe].copy()       # targets: the seven one-hot fault columns

    data, target = dataframe.values, targetframe.values

    if data_target:
        return DataSet(data, target, describe)
    else:
        return train_test_split(data, target, test_size=0.2, random_seed=2)
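
Unlike the fetchers above, the target here is multi-label: seven one-hot fault-indicator columns rather than a single class column. A sketch, assuming the standard UCI file with 1941 rows:

    steel = fetch_steel_plates_faults(data_target=True)
    print(steel.data.shape)    # (1941, 27): numeric attributes with fault columns dropped
    print(steel.target.shape)  # (1941, 7): one-hot fault indicators
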
Example #6
def fetch_cifar_100(data_target=True, custom_path=os.getcwd()):
    extract_files(custom_path + CIFAR_100_BASE_PATH,
                  maybe_download(custom_path + CIFAR_100_BASE_PATH, URL))

    if not os.path.exists(
            os.path.join(custom_path + CIFAR_100_BASE_PATH,
                         CIFAR_100_BATCHES_FOLDER, train_files[0])):
        raise FileNotFoundError('{} File Not Found'.format(
            train_files[0]))  # don't continue

    if not os.path.exists(
            os.path.join(custom_path + CIFAR_100_BASE_PATH,
                         CIFAR_100_BATCHES_FOLDER, test_files[0])):
        raise FileNotFoundError('{} File Not Found'.format(
            test_files[0]))  # don't continue

    with open(
            os.path.join(custom_path + CIFAR_100_BASE_PATH,
                         CIFAR_100_BATCHES_FOLDER, train_files[0]),
            'rb') as file:
        data = cPickle.load(file, encoding='latin1')
        train_data = np.reshape(data['data'],
                                (data['data'].shape[0], 3, 32, 32))
        train_label = np.asarray(data['fine_labels'])  # already flat; no reshape needed

    with open(
            os.path.join(custom_path + CIFAR_100_BASE_PATH,
                         CIFAR_100_BATCHES_FOLDER, test_files[0]),
            'rb') as file:
        data = cPickle.load(file, encoding='latin1')
        test_data = np.reshape(data['data'],
                               (data['data'].shape[0], 3, 32, 32))
        test_label = np.asarray(data['fine_labels'])

    if data_target:
        return DataSet(np.concatenate((train_data, test_data), axis=0),
                       np.concatenate((train_label, test_label), axis=0))
    else:
        return train_data, test_data, train_label, test_label
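
A sketch of the combined return, assuming DataSet keeps the arrays as concatenated channel-first images with fine (100-class) labels:

    cifar100 = fetch_cifar_100(data_target=True)
    print(cifar100.data.shape)    # (60000, 3, 32, 32): 50,000 train + 10,000 test
    print(cifar100.target.max())  # 99: fine labels span 100 classes
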
Example #7
def fetch_iris(data_target = True):
    file_path = maybe_download('../../ztlearn/datasets/iris/', URL)
    describe  = [
        'sepal-length (cm)',
        'sepal-width (cm)',
        'petal-length (cm)',
        'petal-width (cm)',
        'petal_type'
    ]

    dataframe = pd.read_csv(file_path, names = describe)

    # convert the petal_type column to categorical codes, i.e. {0: 'Iris-setosa', 1: 'Iris-versicolor', 2: 'Iris-virginica'}
    dataframe.petal_type    = pd.Categorical(dataframe.petal_type)
    dataframe['petal_type'] = dataframe.petal_type.cat.codes

    data, target = dataframe.values[:,0:4], dataframe.values[:,4].astype('int')

    if data_target:
        return DataSet(data, target, describe)
    else:
        return train_test_split(data, target, test_size = 0.2, random_seed = 2)
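
A short sketch; the categorical encoding above means target already holds integer class codes:

    iris = fetch_iris(data_target = True)
    print(iris.data.shape)         # (150, 4) for the standard Iris CSV
    print(np.unique(iris.target))  # [0 1 2]: setosa, versicolor, virginica
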
Example #8
def fetch_cifar_10(data_target=True, custom_path=os.getcwd()):
    extract_files(custom_path + CIFAR_10_BASE_PATH,
                  maybe_download(custom_path + CIFAR_10_BASE_PATH, URL))

    for train_file in train_files:
        if not os.path.exists(
                os.path.join(custom_path + CIFAR_10_BASE_PATH,
                             CIFAR_10_BATCHES_FOLDER, train_file)):
            raise FileNotFoundError(
                '{} File Not Found'.format(train_file))  # don't continue

    # CIFAR-10 ships five training batches of 10,000 images each
    train_data = np.zeros((50000, 3, 32, 32), dtype='uint8')
    train_label = np.zeros((50000, ), dtype='uint8')
    for idx, train_file in enumerate(train_files):

        with open(
                os.path.join(custom_path + CIFAR_10_BASE_PATH,
                             CIFAR_10_BATCHES_FOLDER, train_file),
                'rb') as file:
            data = cPickle.load(file, encoding='latin1')
            batch_data = data['data'].reshape((-1, 3, 32, 32)).astype('uint8')
            batch_label = np.asarray(data['labels'])

        train_data[idx * 10000:(idx + 1) * 10000, ...] = batch_data
        train_label[idx * 10000:(idx + 1) * 10000] = batch_label

    with open(
            os.path.join(custom_path + CIFAR_10_BASE_PATH,
                         CIFAR_10_BATCHES_FOLDER, test_files[0]),
            'rb') as file:
        data = cPickle.load(file, encoding='latin1')
        test_data = data['data'].reshape((-1, 3, 32, 32)).astype('uint8')
        test_label = np.asarray(data['labels'])

    if data_target:
        return DataSet(np.concatenate((train_data, test_data), axis=0),
                       np.concatenate((train_label, test_label), axis=0))
    else:
        return train_data, test_data, train_label, test_label
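
With data_target=False this fetcher skips train_test_split entirely and returns the canonical CIFAR-10 split directly:

    train_data, test_data, train_label, test_label = fetch_cifar_10(data_target=False)
    print(train_data.shape)  # (50000, 3, 32, 32)
    print(test_data.shape)   # (10000, 3, 32, 32)
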