Example 1
def test_fetch_covtype_true_shuffle():
    hold1 = fetch_covtype(download_if_missing=True, shuffle=True)
    hold2 = fetch_covtype(download_if_missing=False, shuffle=False)

    data1, data2 = hold1['data'], hold2['data']
    target1, target2 = hold1['target'], hold2['target']

    assert_false(np.array_equal(data1, data2))
    assert_false(np.array_equal(target1, target2))
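
The same check can be written with plain assert statements; a minimal sketch, assuming pytest collects the test (the test name here is hypothetical, and assert_false comes from sklearn's older testing utilities):

import numpy as np
from sklearn.datasets import fetch_covtype

def test_fetch_covtype_shuffle_changes_order():
    # A shuffled fetch and an unshuffled fetch should not return rows in the same order.
    shuffled = fetch_covtype(shuffle=True, random_state=0)
    plain = fetch_covtype(shuffle=False)
    assert not np.array_equal(shuffled['data'], plain['data'])
    assert not np.array_equal(shuffled['target'], plain['target'])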
def covtype_binary(dataset_dir: Path) -> bool:
    """
    Cover type dataset from UCI machine learning repository
    https://archive.ics.uci.edu/ml/datasets/covertype

    y contains 7 unique class labels from 1 to 7 inclusive.
    Classification task. n_classes = 7.
    covtype X train dataset (464809, 54)
    covtype y train dataset (464809, 1)
    covtype X test dataset  (116203,  54)
    covtype y test dataset  (116203,  1)
    """
    dataset_name = 'covtype_binary'
    os.makedirs(dataset_dir, exist_ok=True)

    nrows_train, nrows_test = 100000, 100000
    logging.info(f'Started loading {dataset_name}')
    X, y = fetch_covtype(return_X_y=True)  # pylint: disable=unexpected-keyword-arg
    logging.info(f'{dataset_name} is loaded, started parsing...')

    y = (y > 3).astype(int)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        random_state=77,
                                                        train_size=nrows_train,
                                                        test_size=nrows_test,
                                                        shuffle=False)
    for data, name in zip((X_train, X_test, y_train, y_test),
                          ('x_train', 'x_test', 'y_train', 'y_test')):
        filename = f'{dataset_name}_{name}.npy'
        np.save(os.path.join(dataset_dir, filename), data)
    logging.info(f'dataset {dataset_name} is ready.')
    return True
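
A minimal usage sketch for the helper above, assuming it is called with dataset_dir=Path('data'); the file names follow the '{dataset_name}_{name}.npy' pattern used in the save loop:

from pathlib import Path
import numpy as np

dataset_dir = Path('data')          # assumed output directory
covtype_binary(dataset_dir)         # writes the four .npy files

# Reload the cached split; the names mirror the save loop above.
X_train = np.load(dataset_dir / 'covtype_binary_x_train.npy')
y_train = np.load(dataset_dir / 'covtype_binary_y_train.npy')
print(X_train.shape, y_train.mean())  # (100000, 54) and the positive-class fraction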
Example 3
def testCovtype():
    from sklearn.datasets import fetch_covtype
    covtype = fetch_covtype()
    total = len(covtype.data)

    onePercent = int(total * 0.01)
    baseMap = map(zipToMap,
                  zip(covtype.data[:onePercent], covtype.target[:onePercent]))
    onePercentDataFrame = pd.DataFrame(baseMap)

    init = time.time()
    clusters = minasOffline(onePercentDataFrame)
    print(
        f'minasOffline(testCovtype) => {len(clusters)}, {time.time() - init} seconds'
    )
    print(len(clusters))

    fivePercent = int(total * 0.05)
    fivePercentZip = zip(covtype.data[onePercent + 1:fivePercent],
                         map(str, covtype.target[onePercent + 1:fivePercent]))
    inputStream = (Example(item=i, label=t) for i, t in fivePercentZip)
    init = time.time()
    for o in metaMinas(minasOnline(inputStream, clusters)):
        print(o)
    print(f'metaMinas(minasOnline(testCovtype) {time.time() - init} seconds')
Example 4
def generate_covertype():
    covtype = fetch_covtype()

    X = np.array(covtype["data"], dtype=float)
    y = np.array(covtype["target"]) == 2

    # Very Easy
    clf = xgb.XGBClassifier(objective="reg:logistic",
                            nthread=4,
                            tree_method="hist",
                            max_depth=4,
                            learning_rate=0.5,
                            n_estimators=10)
    model = clf.fit(X, y)
    at = addtree_from_xgb_model(model)
    at.base_score = 0.0
    err = sum(y != model.predict(X)) / len(y)
    mae = mean_absolute_error(model.predict(X[:1000], output_margin=True),
                              at.predict(X[:1000]))
    print(f"easy covtype: error rate {err}")
    print(f"easy covtype: mae model difference {mae}")

    # edge case test
    _, feat_id, split_value = at[0].get_split(0)
    Xt = [X[12]]
    Xt[0][feat_id] = split_value
    print("edge case diff: ",
          model.predict(Xt, output_margin=True) - at.predict(Xt))

    at.write("tests/models/xgb-covtype-easy.json")
Example 5
def forest_dataload():
    from sklearn.datasets import fetch_covtype
    import numpy as np
    forest = fetch_covtype()
    Data = forest['data']
    label = forest['target']
    return Data, label
Example 6
def load_data(dtype=np.float32, order='C'):
    ######################################################################
    ## Load dataset
    print("Loading dataset...")
    data = fetch_covtype(download_if_missing=True,
                         shuffle=True,
                         random_state=opts.random_seed)
    X, y = data['data'], data['target']
    X = np.asarray(X, dtype=dtype)

    if order.lower() == 'f':
        X = np.asfortranarray(X)

    # class 1 vs. all others.
    y[np.where(y != 1)] = -1

    ######################################################################
    ## Create train-test split (as [Joachims, 2006])
    logger.info("Creating train-test split...")
    n_train = 522911

    X_train = X[:n_train]
    y_train = y[:n_train]
    X_test = X[n_train:]
    y_test = y[n_train:]

    ######################################################################
    ## Standardize first 10 features (the numerical ones)
    mean = X_train.mean(axis=0)
    std = X_train.std(axis=0)
    mean[10:] = 0.0
    std[10:] = 1.0
    X_train = (X_train - mean) / std
    X_test = (X_test - mean) / std
    return X_train, X_test, y_train, y_test
Example 7
    def setUpClass(cls):
        # setupLog()
        with open('logging.conf.yaml', 'r') as f:
            config = yaml.load(f, Loader=yaml.FullLoader)
        logging.config.dictConfig(config)

        dataset = fetch_covtype()
        cls.dataset = dataset

        total = len(dataset.data)
        print('sizeof dataset', sizeof_fmt(dataset.data.nbytes), 'len', total)
        print('dataset', dataset.data[0], dataset.target[0])

        zipToMap = lambda x: {'item': x[0], 'label': str(x[1])}

        onePercent = int(total * 0.01)
        baseMap = map(
            zipToMap,
            zip(dataset.data[:onePercent], dataset.target[:onePercent]))
        cls.onPercentDataFrame = pd.DataFrame(baseMap)
        fivePercent = int(total * 0.05)
        fivePercentZip = zip(
            dataset.data[onePercent + 1:fivePercent],
            map(str, dataset.target[onePercent + 1:fivePercent]))
        cls.fivePercentDataIterator = list(fivePercentZip)

        tenPercent = int(total * 0.10)
        baseMap = map(
            zipToMap,
            zip(dataset.data[:tenPercent], dataset.target[:tenPercent]))
        cls.tenPercentDataFrame = pd.DataFrame(baseMap)
        cls.allDataIterator = list(zip(dataset.data, map(str, dataset.target)))
Example 8
def load_data(dtype=np.float32, order='C', random_state=13):
    """Load the data, then cache and memmap the train/test split"""
    ######################################################################
    # Load dataset
    print("Loading dataset...")
    data = fetch_covtype(download_if_missing=True, shuffle=True,
                         random_state=random_state)
    X = check_array(data['data'], dtype=dtype, order=order)
    y = (data['target'] != 1).astype(int)

    # Create train-test split (as [Joachims, 2006])
    print("Creating train-test split...")
    n_train = 522911
    X_train = X[:n_train]
    y_train = y[:n_train]
    X_test = X[n_train:]
    y_test = y[n_train:]

    # Standardize first 10 features (the numerical ones)
    mean = X_train.mean(axis=0)
    std = X_train.std(axis=0)
    mean[10:] = 0.0
    std[10:] = 1.0
    X_train = (X_train - mean) / std
    X_test = (X_test - mean) / std
    return X_train, X_test, y_train, y_test
Example 9
def load_data():
    # Load dataset
    print("Loading dataset...")
    data = fetch_covtype(download_if_missing=True,
                         shuffle=True,
                         random_state=RANDOM_STATE)
    X = check_array(data["data"], dtype=np.float32, order="C")
    y = (data["target"] != 1).astype(np.int)

    # Create train-test split (as [Joachims, 2006])
    print("Creating train-test split...")
    n_train = 522911
    X_train = X[:n_train]
    y_train = y[:n_train]
    X_test = X[n_train:]
    y_test = y[n_train:]

    # Standardize first 10 features (the numerical ones)
    mean = X_train.mean(axis=0)
    std = X_train.std(axis=0)
    mean[10:] = 0.0
    std[10:] = 1.0
    X_train = (X_train - mean) / std
    X_test = (X_test - mean) / std
    return X_train, X_test, y_train, y_test
def load_data(dtype=np.float32, order='C', random_state=13):
    """Load the data, then cache and memmap the train/test split"""
    ######################################################################
    # Load dataset
    print("Loading dataset...")
    data = fetch_covtype(download_if_missing=True,
                         shuffle=True,
                         random_state=random_state)
    X = check_array(data['data'], dtype=dtype, order=order)
    y = (data['target'] != 1).astype(int)

    # Create train-test split (as [Joachims, 2006])
    print("Creating train-test split...")
    n_train = 522911
    X_train = X[:n_train]
    y_train = y[:n_train]
    X_test = X[n_train:]
    y_test = y[n_train:]

    # Standardize first 10 features (the numerical ones)
    mean = X_train.mean(axis=0)
    std = X_train.std(axis=0)
    mean[10:] = 0.0
    std[10:] = 1.0
    X_train = (X_train - mean) / std
    X_test = (X_test - mean) / std
    return X_train, X_test, y_train, y_test
Example 11
def get_cover_type(num_rows=None):
    data = datasets.fetch_covtype()
    data = data.data
    if num_rows is not None:
        data = data[0:num_rows]

    return data
Example 12
def load_data(dtype=np.float32, order='F'):
    ######################################################################
    ## Load dataset
    print("Loading dataset...")
    data = fetch_covtype(download_if_missing=True, shuffle=True,
                         random_state=opts.random_seed)
    X, y = data['data'], data['target']
    if order.lower() == 'f':
        X = np.asfortranarray(X)

    # class 1 vs. all others.
    y[np.where(y != 1)] = -1

    ######################################################################
    ## Create train-test split (as [Joachims, 2006])
    logger.info("Creating train-test split...")
    n_train = 522911

    X_train = X[:n_train]
    y_train = y[:n_train]
    X_test = X[n_train:]
    y_test = y[n_train:]

    ######################################################################
    ## Standardize first 10 features (the numerical ones)
    mean = X_train.mean(axis=0)
    std = X_train.std(axis=0)
    mean[10:] = 0.0
    std[10:] = 1.0
    X_train = (X_train - mean) / std
    X_test = (X_test - mean) / std
    return X_train, X_test, y_train, y_test
def make_forest_cover_data():
    forest_cover = fetch_covtype()
    X, y = forest_cover.data, forest_cover.target

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        stratify=y,
                                                        random_state=0)

    cols = [
        'Cover_Type', 'Elevation', 'Aspect', 'Slope',
        'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
        'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon',
        'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points',
        'Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3',
        'Wilderness_Area4', 'Soil_Type1', 'Soil_Type2', 'Soil_Type3',
        'Soil_Type4', 'Soil_Type5', 'Soil_Type6', 'Soil_Type7', 'Soil_Type8',
        'Soil_Type9', 'Soil_Type10', 'Soil_Type11', 'Soil_Type12',
        'Soil_Type13', 'Soil_Type14', 'Soil_Type15', 'Soil_Type16',
        'Soil_Type17', 'Soil_Type18', 'Soil_Type19', 'Soil_Type20',
        'Soil_Type21', 'Soil_Type22', 'Soil_Type23', 'Soil_Type24',
        'Soil_Type25', 'Soil_Type26', 'Soil_Type27', 'Soil_Type28',
        'Soil_Type29', 'Soil_Type30', 'Soil_Type31', 'Soil_Type32',
        'Soil_Type33', 'Soil_Type34', 'Soil_Type35', 'Soil_Type36',
        'Soil_Type37', 'Soil_Type38', 'Soil_Type39', 'Soil_Type40'
    ]

    train_df = pd.DataFrame(np.hstack([y_train.reshape(-1, 1), X_train]),
                            columns=cols)
    test_df = pd.DataFrame(np.hstack([y_test.reshape(-1, 1), X_test]),
                           columns=cols)

    return train_df, test_df
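
A usage sketch for the frame builder above; the CSV file names are placeholders:

train_df, test_df = make_forest_cover_data()
train_df.to_csv('covtype_train.csv', index=False)
test_df.to_csv('covtype_test.csv', index=False)
print(train_df['Cover_Type'].value_counts(normalize=True))  # class balance preserved by stratify=y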
Example 14
def forestcover(random_state=1):
    data = fetch_covtype()

    x = data.data
    x = MinMaxScaler().fit_transform(x)
    y = data.target
    y = np.array([1 if l == 2 else -1 if l == 4 else 0 for l in y])

    normal = x[np.where(y == 1)]
    anomalies = x[np.where(y == -1)]

    x = np.concatenate((normal, anomalies), axis=0)
    y = np.concatenate(([1]*len(normal), [-1]*len(anomalies)), axis=0)
    x, y = shuffle(x, y, random_state=random_state)

    normal = x[np.where(y == 1)]
    test_normal = normal[int(len(normal)/2):]
    normal = normal[:int(len(normal)/2)]

    anomalies = x[np.where(y == -1)]
    test_anomalies = anomalies[int(len(anomalies)/2):]
    anomalies = anomalies[:int(len(anomalies)/2)]

    x_train = np.concatenate((normal, anomalies), axis=0)
    y_train = np.concatenate(([1]*len(normal), [-1]*len(anomalies)), axis=0)
    x_train, y_train = shuffle(x_train, y_train, random_state=1)

    x_test = np.concatenate((test_normal, test_anomalies), axis=0)
    y_test = np.concatenate(([1]*len(test_normal), [-1]*len(test_anomalies)), axis=0)
    x_test, y_test = shuffle(x_test, y_test, random_state=1)

    return x_train, y_train, x_test, y_test
Example 15
def main():

    newsgroups_train = fetch_covtype()

    # Retrieve the data and the target
    data = newsgroups_train.data
    # Drop rows to speed up processing, which otherwise did not finish on my machine
    data = data[:len(data) - 575000]
    target = newsgroups_train.target
    target = target[:len(target) - 575000]

    classes = set(target)

    # Build the train and test sets
    x_train, x_test, y_train, y_test = train_test_split(data,
                                                        target,
                                                        test_size=0.2)
    # Build a list of (sample, label) tuples
    test_values = [(x_test[index], value)
                   for index, value in enumerate(y_test)]

    start_time = time.time()
    # Build the One-vs-One classifiers
    o_vs_o_classifiers = generateOvOClassifier(classes, x_train, y_train)
    print("OvO classifiers done")
    # Make predictions with the One-vs-One classifiers
    predictOVO(test_values, o_vs_o_classifiers)
    print("OvO execution time: %s seconds" % (time.time() - start_time))
    print()

    start_time = time.time()
    # Build the One-vs-Rest classifiers
    ovrclassifier = generateOvRClassifier(classes, x_train, y_train)
    print("OvR classifiers done")
    # Make predictions with the One-vs-Rest classifiers
    predictOVR(test_values, ovrclassifier)
    print("OvR execution time: %s seconds" % (time.time() - start_time))
    print()

    start_time = time.time()
    # Build the random forest classifier
    forestclassifier = RandomForestClassifier(n_estimators=10).fit(
        x_train, y_train)
    print("Forest classifier done")
    # Make predictions with the random forest classifier
    predictForest(test_values, forestclassifier)
    print("Forest execution time: %s seconds" %
          (time.time() - start_time))
    print()

    start_time = time.time()
    # Build the SVM classifier
    SVMclassifier = svm.SVC(gamma='scale',
                            decision_function_shape='ovo',
                            probability=True).fit(x_train, y_train)
    print("SVM classifier done")
    # Make predictions with the SVM classifier
    predictSVM(test_values, SVMclassifier)
    print("SVM execution time: %s seconds" % (time.time() - start_time))
Example 16
def create_covtype():
    covtype_data = datasets.fetch_covtype()
    print(covtype_data.__dict__)
    data = data_class.Data()
    data.x = covtype_data.data
    data.y = covtype_data.target
    helper_functions.save_object("data_sets/covtype/raw_data.pkl")
    pass
Example 17
def fetch_data():
    from sklearn.datasets import fetch_covtype
    import fcntl
    with open("sklearn_download.lock", mode="ab") as f:
        fcntl.lockf(f, fcntl.LOCK_EX)
        data = fetch_covtype()
        fcntl.lockf(f, fcntl.LOCK_UN)
        return data
Example 18
def create_covtype():
    covtype_data = datasets.fetch_covtype()
    print(covtype_data.__dict__)
    data = data_class.Data()
    data.x = covtype_data.data
    data.y = covtype_data.target
    helper_functions.save_object('data_sets/covtype/raw_data.pkl')
    pass
Example 19
def test_xgboost_covtype(n_gpus):
    import xgboost as xgb
    import numpy as np
    from sklearn.datasets import fetch_covtype
    from sklearn.model_selection import train_test_split
    import time

    # Fetch dataset using sklearn
    cov = fetch_covtype()
    X = cov.data
    y = cov.target

    # Create 0.75/0.25 train/test split
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        train_size=0.75,
                                                        random_state=42)

    # Specify sufficient boosting iterations to reach a minimum
    num_round = 10

    # Leave most parameters as default
    param = {
        'objective': 'multi:softmax',  # Specify multiclass classification
        'num_class': 8,  # Number of possible output classes
        'tree_method': 'gpu_hist',  # Use GPU accelerated algorithm
    }
    if n_gpus is not None:
        param['n_gpus'] = n_gpus

    # Convert input data from numpy to XGBoost format
    dtrain = xgb.DMatrix(X_train, label=y_train, nthread=-1)
    dtest = xgb.DMatrix(X_test, label=y_test, nthread=-1)

    gpu_res = {}  # Store accuracy result
    tmp = time.time()
    # Train model
    xgb.train(param,
              dtrain,
              num_round,
              evals=[(dtest, 'test')],
              evals_result=gpu_res)
    print("GPU Training Time: %s seconds" % (str(time.time() - tmp)))

    # TODO: https://github.com/dmlc/xgboost/issues/4518
    dtrain = xgb.DMatrix(X_train, label=y_train, nthread=-1)
    dtest = xgb.DMatrix(X_test, label=y_test, nthread=-1)
    # Repeat for CPU algorithm
    tmp = time.time()
    param['tree_method'] = 'hist'
    cpu_res = {}
    xgb.train(param,
              dtrain,
              num_round,
              evals=[(dtest, 'test')],
              evals_result=cpu_res)
    print("CPU Training Time: %s seconds" % (str(time.time() - tmp)))
Example 20
def get_data(random_state, test_size=0.2):
    data = fetch_covtype(download_if_missing=True,
                         shuffle=True,
                         random_state=random_state)

    X = data['data']
    y = data['target']
    n_train = int((1 - test_size) * X.shape[0])
    return X[:n_train], y[:n_train], X[n_train:], y[n_train:]
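
Because fetch_covtype(shuffle=True, random_state=...) already randomizes the row order, the slicing above is a plain holdout split. An equivalent sketch with train_test_split, assuming X and y as loaded inside get_data above (note the different return order, and sizes may differ by one row due to rounding):

from sklearn.model_selection import train_test_split

# shuffle=False because the rows were already shuffled by fetch_covtype.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)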
Example 21
def get_covtype(num_rows=None):
    data = datasets.fetch_covtype()
    X = data.data
    y = data.target
    if num_rows is not None:
        X = X[0:num_rows]
        y = y[0:num_rows]

    return X, y
Example 22
def closure(mu):
    ds = fetch_covtype()
    X, _, y, _ = train_test_split(ds["data"],
                                  ds["target"],
                                  random_state=42,
                                  test_size=0.9,
                                  stratify=ds["target"])
    y = y - 1
    return preprocess_and_noise({"data": X, "target": y}, mu=mu)
Example 23
def getdata_covtype():
    from sklearn.datasets import fetch_covtype
    data = fetch_covtype()
    X = data['data']
    y_ = data['target'] - 1
    y = np.zeros((len(y_), 7))
    y[np.arange(len(y_)), y_] = 1

    return X, y
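
The manual index assignment above builds a one-hot matrix; indexing an identity matrix is an equivalent one-liner (a sketch reusing the same y_ labels):

import numpy as np

# Row i of eye(7) is the one-hot code for label i.
y = np.eye(7)[y_]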
Example 24
def prepare_covtype(dataset_folder, nrows):  # pylint: disable=unused-argument
    X, y = fetch_covtype(return_X_y=True)  # pylint: disable=unexpected-keyword-arg
    if nrows is not None:
        X = X[0:nrows]
        y = y[0:nrows]

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=77,
                                                        test_size=0.2,
                                                        )
    return Data(X_train, X_test, y_train, y_test, LearningTask.MULTICLASS_CLASSIFICATION)
Example 25
    def get(self, **kwargs) -> tuple:
        try:
            print('Fetching CovType dataset...')
            data = fetch_covtype(download_if_missing=True, shuffle=True)
            X, y = data['data'], data['target']

            return X, y

        except Exception as exc:
            raise RuntimeError('Failed to fetch the CovType dataset') from exc
Example 26
def raw_frame():
    dataset = datasets.fetch_covtype()
    feature_names = [
        f"feature_{ix}" for ix in range(dataset.data.shape[1])
    ]
    df = pd.DataFrame(data=dataset.data, columns=feature_names)
    # This is a multiclass dataset, but we want to treat it as a binary one.
    # We'll just try to detect class 2, since that one is the most common.
    df["target"] = dataset.target == 2
    return df
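
A quick usage sketch for the helper above; cover type 2 (lodgepole pine) accounts for roughly half of the rows, so the binary target is close to balanced:

df = raw_frame()
print(df['target'].value_counts(normalize=True))  # roughly 0.49 True / 0.51 False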
Example 27
def load_covtype_dataset(subset=0.1, test_size=0.2, random_state=None):
    '''Load & Split training/test covtype dataset.'''
    print ('\nDataset used: \t\tForest covertypes from UCI ({:.1%} random subset)'.format(subset))
    X, y = datasets.fetch_covtype(return_X_y=True)
    y = make_binary_classification_target(y, 7, verbose=True)
    X, y = imbalance_random_subset(
        X, y, size=subset, random_state=random_state)
    X_train, X_test, y_train, y_test = imbalance_train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test
Example 28
def dataSetGenCovtype(log):
    from sklearn.datasets import fetch_covtype
    covtype = fetch_covtype()
    # covtype.data.shape Out[2]: (581012, 54) 581 012
    log.info(f'Dataset len {covtype.data.shape}')
    allData = list()
    for data, target in zip(covtype.data, covtype.target):
        data = [float(i) for i in data]
        allData.append((data, str(target)))
    #
    return allData
Example 29
    def test_select_data(self):
        """
        Tests the select_data function in algo_runner.py
        :return: None
        """

        # case less than 10000 rows:
        iris = load_iris()
        data = iris['data']
        target = iris['target']
        data = pd.DataFrame(data)
        target = pd.DataFrame(target)
        pct_examples_1, pct_examples_2, pct_examples_3 = \
            algo_runner.select_data(data, target)
        self.assertTrue(np.isclose(pct_examples_1, 0.05, rtol=0.01, atol=0.01))
        self.assertTrue(np.isclose(pct_examples_2, 0.10, rtol=0.01, atol=0.01))
        self.assertTrue(np.isclose(pct_examples_3, 0.15, rtol=0.01, atol=0.01))

        # case greater than 10000 less than 100000 rows:
        (x_train, y_train), (x_test, y_test) = mnist.load_data()
        num_pixels = x_train.shape[1] * x_train.shape[2]
        x_train = x_train.reshape(x_train.shape[0],
                                  num_pixels).astype('float32')
        x_train = x_train / 255
        y_train = np_utils.to_categorical(y_train)

        x_train = pd.DataFrame(x_train)
        y_train = pd.DataFrame(y_train)

        pct_examples_1, pct_examples_2, pct_examples_3 = \
            algo_runner.select_data(x_train, y_train)
        self.assertTrue(
            np.isclose(pct_examples_1, 0.02, rtol=0.001, atol=0.001))
        self.assertTrue(
            np.isclose(pct_examples_2, 0.04, rtol=0.001, atol=0.001))
        self.assertTrue(
            np.isclose(pct_examples_3, 0.06, rtol=0.001, atol=0.001))

        # Case greater than 100000 rows:
        covtype = fetch_covtype()
        data = covtype['data']
        target = covtype['target']
        data = pd.DataFrame(data)
        target = pd.DataFrame(target)
        pct_examples_1, pct_examples_2, pct_examples_3 = \
            algo_runner.select_data(data, target)

        self.assertTrue(
            np.isclose(pct_examples_1, 0.01, rtol=0.001, atol=0.001))
        self.assertTrue(
            np.isclose(pct_examples_2, 0.02, rtol=0.001, atol=0.001))
        self.assertTrue(
            np.isclose(pct_examples_3, 0.03, rtol=0.001, atol=0.001))
        return None
Example 30
def load_cover_type(random_state=None, dtype=np.float32, order='C'):
    """Load cover type data

    Parameters
    ----------
    random_state : int, np.random.RandomState or None, optional (default=None)
        The random state used to shuffle the data if needed.

    dtype : np.dtype, optional (default=np.float32)
        The type for the data to be returned.

    order : 'C', 'F' or None, optional (default='C')
        Whether the array will be forced to be Fortran- or C-contiguous.
        When order is None, nothing is ensured about the memory layout of
        the returned array.

    Returns
    -------
    X_train : ndarray, shape (n_train_samples, n_features)

    y_train : ndarray, shape (n_train_samples, )

    X_test : ndarray, shape (n_test_samples, n_features)

    y_test : ndarray, shape (n_test_samples, )
    """
    ######################################################################
    # Load dataset
    print("Loading dataset...")
    data = fetch_covtype(download_if_missing=True,
                         shuffle=True,
                         random_state=random_state)
    X = check_array(data['data'], dtype=dtype, order=order)
    y = (data['target'] != 1).astype(int)

    # Create train-test split (as [Joachims, 2006])
    print("Creating train-test split...")
    n_train = 522911
    X_train = X[:n_train]
    y_train = y[:n_train]
    X_test = X[n_train:]
    y_test = y[n_train:]

    # Standardize first 10 features (the numerical ones)
    mean = X_train.mean(axis=0)
    std = X_train.std(axis=0)
    mean[10:] = 0.0
    std[10:] = 1.0
    X_train = (X_train - mean) / std
    X_test = (X_test - mean) / std

    return X_train, y_train, X_test, y_test
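
Several examples above standardize only the first 10 numerical columns by forcing zero mean and unit scale on the remaining indicator columns. The same preprocessing can be expressed with a ColumnTransformer; a sketch, assuming X_train and X_test hold the raw split before the manual standardization step (StandardScaler additionally guards against zero-variance columns):

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# Scale the 10 numerical columns; pass the binary indicator columns through unchanged.
# Column order is preserved because the scaled block is already the first 10 columns.
scaler = ColumnTransformer([('num', StandardScaler(), list(range(10)))],
                           remainder='passthrough')
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)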
Example 31
def run_in_cluster():
    dataset = datasets.fetch_covtype(data_home=tempfile.mkdtemp())
    X, y = dataset.data, dataset.target - 1
    training_size = 400000
    max_depth = 10
    clf = DecisionTreeClassifier(max_depth=max_depth)
    start = time.time()
    clf.fit(X[:training_size], y[:training_size])
    end = time.time()
    y_pred = clf.predict(X[training_size:])
    accuracy = metrics.accuracy_score(y[training_size:], y_pred)
    return end - start, accuracy
def get_cd_data(num_samples=500):

    # Load the forest covertype data from the scikit-learn package
    cov = fetch_covtype()
    all_data = np.array(cov.data)
    all_targets = np.array(cov.target)

    # Set class pairings as described in the multiview clustering paper
    view1_classes = [1, 2, 3]
    view2_classes = [4, 5, 6]

    # Create lists to hold data and labels for each of the classes across
    # 2 different views
    labels = [
        num for num in range(len(view1_classes)) for _ in range(num_samples)
    ]
    labels = np.array(labels)
    view1_data = list()
    view2_data = list()

    # Randomly sample 500 items from each of the selected classes in view1
    for class_num in view1_classes:
        class_data = all_data[(all_targets == class_num)]
        indices = np.random.choice(class_data.shape[0], num_samples)
        view1_data.append(class_data[indices])
    view1_data = np.concatenate(view1_data)

    # Construct view 2 by applying a nonlinear transformation
    # to data from view 1 comprised of a linear transformation
    # and a logistic nonlinearity
    t_mat = np.random.random((view1_data.shape[1], 50))
    noise = 0.005 - 0.01 * np.random.random((view1_data.shape[1], 50))
    t_mat *= noise
    transformed = view1_data @ t_mat
    view2_data = scp.special.expit(transformed)

    # Shuffle and normalize vectors
    shuffled_inds = np.random.permutation(num_samples * len(view1_classes))
    view1_data = np.vstack(view1_data)
    view2_data = np.vstack(view2_data)
    view1_data = view1_data[shuffled_inds]
    view2_data = view2_data[shuffled_inds]
    magnitudes1 = np.linalg.norm(view1_data, axis=0)
    magnitudes2 = np.linalg.norm(view2_data, axis=0)
    magnitudes1[magnitudes1 == 0] = 1
    magnitudes2[magnitudes2 == 0] = 1
    magnitudes1 = magnitudes1.reshape((1, -1))
    magnitudes2 = magnitudes2.reshape((1, -1))
    view1_data /= magnitudes1
    view2_data /= magnitudes2
    labels = labels[shuffled_inds]
    return [view1_data, view2_data], labels
Example 33
def load_classification_data():
    dataset = fetch_covtype(data_home="data")
    data = np.hstack([dataset.data, dataset.target.reshape(-1, 1)])[:10000, :]
    col_names = [f"feature_{i}" for i in range(data.shape[-1])]
    col_names[-1] = "target"
    data = pd.DataFrame(data, columns=col_names)
    data["feature_0_cat"] = pd.qcut(data["feature_0"], q=4)
    data["feature_0_cat"] = "feature_0_" + data.feature_0_cat.cat.codes.astype(
        str)
    test_idx = data.sample(int(0.2 * len(data)), random_state=42).index
    test = data[data.index.isin(test_idx)]
    train = data[~data.index.isin(test_idx)]
    return (train, test, ["target"])
def get_covertype(train_test_ratio):
    covertype = datasets.fetch_covtype()
    x = covertype.data

    y = convert_to_1_hot(covertype.target - 1, 7)
    cutoff = int(x.shape[0] * train_test_ratio)

    x = x.astype(np.float32)
    y = y.astype(np.float32)

    x_train = x[0:cutoff, :]
    x_test = x[cutoff:, :]
    y_train = y[0:cutoff]
    y_test = y[cutoff:]
    return x_train, y_train, x_test, y_test
Example 35
def fun():
    import xgboost as xgb
    import numpy as np
    from sklearn.datasets import fetch_covtype
    from sklearn.model_selection import train_test_split
    import time

    # Fetch dataset using sklearn
    cov = fetch_covtype()
    X = cov.data
    y = cov.target

    # Create 0.75/0.25 train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, train_size=0.75,
                                                        random_state=42)

    # Specify sufficient boosting iterations to reach a minimum
    num_round = 10

    # Leave most parameters as default
    param = {'objective': 'multi:softmax', # Specify multiclass classification
             'num_class': 8, # Number of possible output classes
             'tree_method': 'gpu_hist' # Use GPU accelerated algorithm
             }

    # Convert input data from numpy to XGBoost format
    dtrain = xgb.DMatrix(X_train, label=y_train, nthread=-1)
    dtest = xgb.DMatrix(X_test, label=y_test, nthread=-1)

    gpu_res = {} # Store accuracy result
    tmp = time.time()
    # Train model
    xgb.train(param, dtrain, num_round, evals=[(dtest, 'test')], evals_result=gpu_res)
    print("GPU Training Time: %s seconds" % (str(time.time() - tmp)))

    # Repeat for CPU algorithm
    tmp = time.time()
    param['tree_method'] = 'hist'
    cpu_res = {}
    xgb.train(param, dtrain, num_round, evals=[(dtest, 'test')], evals_result=cpu_res)
    print("CPU Training Time: %s seconds" % (str(time.time() - tmp)))
import numpy as np

from sklearn.ensemble import GradientBoostingRegressorCV, GradientBoostingClassifierCV
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier

from sklearn.datasets import load_boston, fetch_covtype, load_iris, make_classification
from sklearn.datasets import fetch_20newsgroups_vectorized, fetch_california_housing

gbccv = GradientBoostingClassifierCV(n_jobs=8, random_state=42)

covtype = fetch_covtype()

X, y = covtype.data[::2], covtype.target[::2]
gbccv.fit(X, y)
Example 37
def load_data(ds_name):
  data_dir = path.dirname(__file__) + "/../data/"
  global _img_data
  if ds_name == "digits":
    ds = load_digits()
    x_train = ds.data
    y_train = ds.target
  elif ds_name == "iris":
    ds = load_iris()
    x_train = ds.data
    y_train = ds.target
  elif ds_name == "diabetes":
    ds = load_diabetes()
    x_train = ds.data 
    y_train = ds.target > 140 
  elif ds_name == "covtype":
    ds = fetch_covtype(download_if_missing = True)
    x_train = ds.data 
    y_train = ds.target 
  elif ds_name == "cf10":
    with open(data_dir + "data_batch_1", "r") as f:
      ds = cPickle.load(f)
      x_train = ds['data']
      y_train = np.array(ds['labels'])
  elif ds_name == "cf100":
    with open(data_dir + "train", "r") as f:
      ds = cPickle.load(f)
      x_train = ds['data']
      y_train = np.array(ds['fine_labels'])
  elif ds_name == "cd10_test":
    with open(data_dir + "test_batch", "r") as f:
      ds = cPickle.load(f)
      x_train = ds['data']
      y_train = np.array(ds['labels'])
  elif ds_name == "cf100_test":
    with open(data_dir + "test", "r") as f:
      ds = cPickle.load(f)
      x_train = ds['data']
      y_train = np.array(ds['fine_labels'])
  elif ds_name == "inet":
    if _img_data is None:
      with open("/ssd/imagenet-subset.pickle", "r") as f:
        _img_data = cPickle.load(f)
    return _img_data['x'][0:10000],  _img_data['Y'][0:10000] 
  elif ds_name == "inet_test":
    if _img_data is None:
      with open("/ssd/imagenet-subset.pickle", "r") as f:
        _img_data = cPickle.load(f)
    return _img_data['x'][10000:],  _img_data['Y'][10000:] 
  elif ds_name == "kdd":
    data = np.load(data_dir + "data.npy")
    x_train = data[:, :-1]
    y_train = data[:, -1]
  elif ds_name == "poker":
    data = sklearn.datasets.fetch_mldata("poker")
    x_train = data.data
    y_train = data.target
  elif ds_name == "pamap":
    data = np.load(data_dir + "pamap.npz")
    x_train = data['x']
    y_train = data['y']
  else:
    assert False, "Unrecognized data set name %s" % ds_name
  return x_train, y_train
Example 38
        X = dataset.data
        y = dataset.target

    if dataset_name == 'shuttle':
        dataset = fetch_openml('shuttle')
        X = dataset.data
        y = dataset.target
        # we remove data with label 4
        # normal data are then those of class 1
        s = (y != 4)
        X = X[s, :]
        y = y[s]
        y = (y != 1).astype(int)

    if dataset_name == 'forestcover':
        dataset = fetch_covtype()
        X = dataset.data
        y = dataset.target
        # normal data are those with attribute 2
        # abnormal those with attribute 4
        s = (y == 2) + (y == 4)
        X = X[s, :]
        y = y[s]
        y = (y != 2).astype(int)

    print('vectorizing data')

    if dataset_name == 'SF':
        lb = LabelBinarizer()
        x1 = lb.fit_transform(X[:, 1].astype(str))
        X = np.c_[X[:, :1], x1, X[:, 2:]]
Example 39
import xgboost as xgb
import numpy as np
from sklearn.datasets import fetch_covtype
from sklearn.model_selection import train_test_split
import time

# Fetch dataset using sklearn
cov = fetch_covtype()
X = cov.data
y = cov.target

# Create 0.75/0.25 train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, train_size=0.75,
                                                    random_state=42)

# Specify sufficient boosting iterations to reach a minimum
num_round = 3000

# Leave most parameters as default
param = {'objective': 'multi:softmax', # Specify multiclass classification
         'num_class': 8, # Number of possible output classes
         'tree_method': 'gpu_hist' # Use GPU accelerated algorithm
         }

# Convert input data from numpy to XGBoost format
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

gpu_res = {} # Store accuracy result
tmp = time.time()
# Train model
xgb.train(param, dtrain, num_round, evals=[(dtest, 'test')], evals_result=gpu_res)
print("GPU Training Time: %s seconds" % (str(time.time() - tmp)))

def fetch(*args, **kwargs):
    return fetch_covtype(*args, download_if_missing=False, **kwargs)
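
The thin wrapper above can also be written with functools.partial; a one-line sketch:

from functools import partial
from sklearn.datasets import fetch_covtype

fetch = partial(fetch_covtype, download_if_missing=False)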
Example 41
        y = dataset.target

    if dat == 'shuttle':
        dataset = fetch_mldata('shuttle')
        X = dataset.data
        y = dataset.target
        X, y = sh(X, y)
        # we remove data with label 4
        # normal data are then those of class 1
        s = (y != 4)
        X = X[s, :]
        y = y[s]
        y = (y != 1).astype(int)

    if dat == 'forestcover':
        dataset = fetch_covtype(shuffle=True)
        X = dataset.data
        y = dataset.target
        # normal data are those with attribute 2
        # abnormal those with attribute 4
        s = (y == 2) + (y == 4)
        X = X[s, :]
        y = y[s]
        y = (y != 2).astype(int)

    if dat == 'SF':
        lb = LabelBinarizer()
        lb.fit(X[:, 1])
        x1 = lb.transform(X[:, 1])
        X = np.c_[X[:, :1], x1, X[:, 2:]]
        y = (y != 'normal.').astype(int)
Example 42
import time

import numpy as np

from sklearn.datasets import fetch_covtype
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from ivalice.regression import RFRegressor

data = fetch_covtype(download_if_missing=True, shuffle=True, random_state=0)
X, y = data.data, data.target

n_samples = 10000
mask = y <= 2
Xb = X[mask][:n_samples]
yb = y[mask][:n_samples]

Xb_tr, Xb_te, yb_tr, yb_te = train_test_split(Xb, yb, train_size=0.75,
                                              test_size=0.2, random_state=0)

rf = RandomForestRegressor(n_estimators=100,
                           max_depth=3,
                           max_features=0.6)
start = time.time()
rf.fit(Xb_tr, yb_tr)
print "RandomForestRegressor"
print time.time() - start, "seconds"
y_pred = rf.predict(Xb_te)
print mean_squared_error(yb_te, y_pred)
    if dat == 'shuttle':
        dataset = fetch_mldata('shuttle')
        X = dataset.data
        y = dataset.target
        X, y = sh(X, y, random_state=random_state)
        # we remove data with label 4
        # normal data are then those of class 1
        s = (y != 4)
        X = X[s, :]
        y = y[s]
        y = (y != 1).astype(int)
        print('----- ')

    if dat == 'forestcover':
        dataset = fetch_covtype(shuffle=True, random_state=random_state)
        X = dataset.data
        y = dataset.target
        # normal data are those with attribute 2
        # abnormal those with attribute 4
        s = (y == 2) + (y == 4)
        X = X[s, :]
        y = y[s]
        y = (y != 2).astype(int)
        print_outlier_ratio(y)

    print('--- Vectorizing data...')

    if dat == 'SF':
        lb = LabelBinarizer()
        x1 = lb.fit_transform(X[:, 1].astype(str))