Example #1
0
    def train_test_sampling(self, validation_split: float = 0.1) -> dict:
        """
        Split the internal dataframe into train / test (and optionally
        validation) sets and materialize them.

        :param validation_split: float
            Fraction of the training data held out to validate quality
            during training. If <= 0, no validation split is produced and
            the returned ``x_val`` / ``y_val`` entries are ``None``.

        :return dict:
            Keys ``x_train``, ``x_test``, ``y_train``, ``y_test``,
            ``x_val``, ``y_val`` holding computed splits.
        """
        # NOTE(review): stratified sampling was disabled here; dask_ml's
        # train_test_split does not accept a stratify argument -- confirm.
        _x_train, _x_test, _y_train, _y_test = train_test_split(
            self.df[self.features],
            self.df[self.target],
            test_size=self.test_size,
            train_size=self.train_size,
            random_state=self.seed,
            shuffle=self.random_sample,
        )
        if validation_split > 0:
            # Carve a validation set out of the training portion only.
            _x_train, _x_val, _y_train, _y_val = train_test_split(
                _x_train,
                _y_train,
                test_size=validation_split,
                train_size=1 - validation_split,
                random_state=self.seed,
                shuffle=self.random_sample)
        else:
            _x_val = None
            _y_val = None
        # Materialize the lazy dask collections exactly once, in one place
        # (the original duplicated this return block in both branches).
        return dict(x_train=_x_train.compute(),
                    x_test=_x_test.compute(),
                    y_train=_y_train.compute(),
                    y_test=_y_test.compute(),
                    x_val=_x_val.compute() if _x_val is not None else None,
                    y_val=_y_val.compute() if _y_val is not None else None)
def process_data(df, pca_level):
    """Feature-engineer *df*, split it, and run an impute/scale/PCA pipeline.

    Returns (x_train_transformed, x_test_transformed, y_train, y_test).
    """
    features = feature_engine(df)
    target = df['meter_reading']

    # mean-impute -> min-max scale -> project to `pca_level` components
    pipeline = Pipeline([
        ('Imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
        ('Scaler', preprocessing.MinMaxScaler()),
        ('PCA', PCA(n_components=pca_level)),
    ])

    x_tr, x_te, y_tr, y_te = train_test_split(features,
                                              target,
                                              test_size=0.3,
                                              random_state=4)

    # Fit on training data only; apply the same transform to test data.
    x_tr_pp = pipeline.fit_transform(x_tr)
    x_te_pp = pipeline.transform(x_te)

    print('\n')
    print('Completed Preprocessing and Dimensionality Reduction')
    print('\n')

    return x_tr_pp, x_te_pp, y_tr, y_te
Example #3
0
def main():
    """Train a distributed XGBoost classifier on iris and report metrics."""
    t0 = time.time()
    initialize(interface='ib0')
    client = Client()

    iris = datasets.load_iris()

    # Tiny 5% hold-out; the test fold goes into a plain DMatrix.
    X_tr, X_te, y_tr, y_te = train_test_split(iris.data,
                                              iris.target,
                                              test_size=0.05)
    d_test = xgb.DMatrix(X_te, label=y_te)

    params = {
        'eta': 0.3,
        'max_depth': 3,
        'objective': 'multi:softprob',
        'num_class': 3
    }

    booster = dxgb.train(client,
                         params,
                         da.asarray(X_tr),
                         da.asarray(y_tr),
                         num_boost_round=10)

    # softprob output: pick the most probable class per row.
    probs = booster.predict(d_test)
    best_preds = np.asarray([np.argmax(row) for row in probs])

    print("Precision = {}".format(
        precision_score(y_te, best_preds, average='macro')))
    print("Recall = {}".format(
        recall_score(y_te, best_preds, average='macro')))
    print("Accuracy = {}".format(accuracy_score(y_te, best_preds)))
    elapsed = (time.time() - t0)
    print(f"Elapsed time: {elapsed}")
Example #4
0
    def setup_class(self):
        """Load a dask-backed adult dataset and fit a DeepTable with CV."""
        setup_dask(self)

        print("Loading datasets...")
        df_train = dd.from_pandas(dsutils.load_adult().head(1000),
                                  npartitions=2)
        # Column 14 is the target; the rest are features.
        self.y = df_train.pop(14)
        self.X = df_train

        conf = deeptable.ModelConfig(metrics=['AUC'],
                                     apply_gbm_features=False,
                                     auto_categorize=False,
                                     auto_discrete=False)
        self.dt = deeptable.DeepTable(config=conf)

        # Hold out 20% for evaluation, then 3-fold CV on the rest.
        split = train_test_split(self.X, self.y,
                                 test_size=0.2, random_state=42)
        self.X_train, self.X_eval, self.y_train, self.y_test = split

        cv_out = self.dt.fit_cross_validation(self.X_train,
                                              self.y_train,
                                              self.X_eval,
                                              num_folds=3,
                                              epochs=1,
                                              n_jobs=1)
        self.oof_proba, self.eval_proba, self.test_proba = cv_out
Example #5
0
def objective(trial):
    """Optuna objective: tune a dask-ml LogisticRegression on iris.

    :param trial: optuna Trial used to sample solver, C and penalty.
    :return: held-out accuracy score of the fitted classifier.
    """
    iris = load_iris()
    X, y = iris.data, iris.target
    # Five roughly equal chunks so dask can parallelize the solver.
    X, y = da.from_array(X,
                         chunks=len(X) // 5), da.from_array(y,
                                                            chunks=len(y) // 5)

    solver = trial.suggest_categorical(
        'solver', ['admm', 'gradient_descent', 'proximal_grad'])
    # suggest_float replaces the deprecated suggest_uniform (identical
    # sampling behavior; matches the other objective() in this file).
    C = trial.suggest_float('C', 0.0, 1.0)

    if solver == 'admm' or solver == 'proximal_grad':
        penalty = trial.suggest_categorical('penalty',
                                            ['l1', 'l2', 'elastic_net'])
    else:
        # 'penalty' parameter isn't relevant for this solver,
        # so we always specify 'l2' as the dummy value.
        penalty = 'l2'

    classifier = LogisticRegression(max_iter=200,
                                    solver=solver,
                                    C=C,
                                    penalty=penalty)

    X_train, X_test, y_train, y_test = train_test_split(X, y)
    classifier.fit(X_train, y_train)

    score = classifier.score(X_test, y_test)
    return score
Example #6
0
    def train_test_split(*data,
                         shuffle=True,
                         random_state=None,
                         stratify=None,
                         **kwargs):
        """Split *data* into train/test parts, dispatching on data type.

        Dask dataframes/series are aligned onto a common partitioning and
        split with dask_ml; any other data falls through to scikit-learn.

        NOTE(review): on the dask path ``stratify`` is silently dropped
        (it is never passed to ``dm_sel.train_test_split``) -- confirm
        this is intentional (dask_ml's splitter may not support it).
        """
        if DaskToolBox.exist_dask_dataframe(*data):
            if len(data) > 1:
                # Normalize every input to a dask frame/series with known
                # divisions so mismatched inputs can be aligned below.
                data = [
                    DaskToolBox.make_divisions_known(
                        DaskToolBox.to_dask_frame_or_series(x)) for x in data
                ]
                head = data[0]
                for i in range(1, len(data)):
                    # Repartition onto the first item's divisions so that
                    # corresponding rows stay together across the split.
                    if data[i].divisions != head.divisions:
                        logger.info(
                            f'repartition {i} from {data[i].divisions} to {head.divisions}'
                        )
                        data[i] = data[i].repartition(divisions=head.divisions)
            result = dm_sel.train_test_split(*data,
                                             shuffle=shuffle,
                                             random_state=random_state,
                                             **kwargs)
            # Divisions are no longer meaningful after a shuffled split.
            result = [x.clear_divisions() for x in result]
        else:
            result = sk_sel.train_test_split(*data,
                                             shuffle=shuffle,
                                             random_state=random_state,
                                             stratify=stratify,
                                             **kwargs)

        return result
    def split_dataset(self, dataset, random_state):
        """
        Partition *dataset* into X/y train and test subsets.

        Randomness currently comes from the CV-fold index passed in as
        *random_state*; a refactor to dask_ml KFold is planned.
        """
        hpo_log.info('> train-test split')
        label_column = self.hpo_config.label_column

        train, test = train_test_split(dataset, random_state=random_state)

        # Peel the label column off each subset.
        y_train, X_train = train[label_column], train.drop(label_column, axis=1)
        y_test, X_test = test[label_column], test.drop(label_column, axis=1)

        # Materialize the splits on the cluster ...
        X_train, y_train, X_test, y_test = persist_across_workers(
            self.client, [X_train, y_train, X_test, y_test],
            workers=self.client.has_what().keys())

        # ... and block until they are resident in memory.
        wait([X_train, y_train, X_test, y_test])

        dtype = self.hpo_config.dataset_dtype
        return (X_train.astype(dtype),
                X_test.astype(dtype),
                y_train.astype(dtype),
                y_test.astype(dtype))
def objective(trial):
    """Optuna objective: accuracy of a tuned dask-ml logistic regression."""
    iris = load_iris()
    X, y = iris.data, iris.target
    # Wrap in dask arrays split into five roughly equal chunks.
    X = da.from_array(X, chunks=len(X) // 5)
    y = da.from_array(y, chunks=len(y) // 5)

    solver = trial.suggest_categorical(
        "solver", ["admm", "gradient_descent", "proximal_grad"])
    C = trial.suggest_float("C", 0.0, 1.0)

    if solver in ("admm", "proximal_grad"):
        penalty = trial.suggest_categorical("penalty",
                                            ["l1", "l2", "elastic_net"])
    else:
        # 'penalty' parameter isn't relevant for this solver,
        # so we always specify 'l2' as the dummy value.
        penalty = "l2"

    classifier = LogisticRegression(max_iter=200,
                                    solver=solver,
                                    C=C,
                                    penalty=penalty)

    X_train, X_valid, y_train, y_valid = train_test_split(X, y)
    classifier.fit(X_train, y_train)

    return classifier.score(X_valid, y_valid)
Example #9
0
    def split_data(self,
                   dataset,
                   y_label,
                   train_size=.8,
                   random_state=0,
                   shuffle=True):
        """Split *dataset* into X/y train and test parts.

        :param dataset: input dataframe holding features plus the label
            column named by *y_label* (presumably dask_cudf -- TODO confirm).
        :param y_label: name of the label column.
        :param train_size: accepted but currently UNUSED -- the underlying
            train_test_split is called with its defaults only.
        :param random_state: seed forwarded to train_test_split.
        :param shuffle: accepted but currently UNUSED (see inline comment).
        :return: (X_train, X_test, y_train, y_test)
        """

        with PerfTimer(self, 'split_timer'):
            train, test = train_test_split(
                dataset, random_state=random_state
            )  # unable to shuffle -- no dask_cudf sampler implemented

            # Features cast to float32, labels to int32 (class ids).
            X_train, y_train = train.drop(
                y_label,
                axis=1).astype('float32'), train[y_label].astype('int32')
            X_test, y_test = test.drop(
                y_label,
                axis=1).astype('float32'), test[y_label].astype('int32')

        if 'multi-GPU' in self.compute_type:
            # Pin the split frames in distributed memory so later stages
            # do not recompute them; wait() blocks until they are ready.
            with PerfTimer(self, 'persist_timer'):
                workers = self.client.has_what().keys()
                X_train, X_test, y_train, y_test = persist_across_workers(
                    self.client, [X_train, X_test, y_train, y_test],
                    workers=workers)
                wait([X_train, X_test, y_train, y_test])

        return X_train, X_test, y_train, y_test
Example #10
0
    def split_dataset(self, dataset, random_state):
        """
        Split *dataset* into train and test subsets and persist the
        training portion. Randomness comes from the CV-fold index for
        now; a refactor to dask_ml KFold is planned.
        """

        hpo_log.info("> train-test split")
        target = self.hpo_config.label_column

        train, test = train_test_split(dataset, random_state=random_state)

        # Peel the label column off each subset.
        y_train, X_train = train[target], train.drop(target, axis=1)
        y_test, X_test = test[target], test.drop(target, axis=1)

        # Keep only the training data resident; block until it is ready.
        X_train, y_train = X_train.persist(), y_train.persist()
        wait([X_train, y_train])

        dtype = self.hpo_config.dataset_dtype
        return (
            X_train.astype(dtype),
            X_test.astype(dtype),
            y_train.astype(dtype),
            y_test.astype(dtype),
        )
Example #11
0
def main():
    """End-to-end HyperGBM random search on the dask-loaded bank dataset."""
    # client = Client("tcp://127.0.0.1:64958")
    client = Client(processes=False, threads_per_worker=2, n_workers=1, memory_limit='4GB')
    print(client)

    rs = RandomSearcher(get_space_num_cat_pipeline_complex, optimize_direction=OptimizeDirection.Maximize)
    hk = HyperGBM(rs, task='classification', reward_metric='accuracy',
                  cache_dir=f'{test_output_dir}/hypergbm_cache',
                  callbacks=[SummaryCallback(), FileLoggingCallback(rs, output_dir=f'{test_output_dir}/hyn_logs')])

    df = dsutils.load_bank_by_dask()
    # BUG FIX: DataFrame.drop is not in-place -- the result must be
    # reassigned, otherwise the 'id' column leaks into training.
    df = df.drop(['id'], axis=1)
    df['y'] = dm_pre.LabelEncoder().fit_transform(df['y'])
    # df = df.sample(frac=0.1)

    # NOTE: test_size=0.8 keeps only 20% for training (as in the original).
    X_train, X_test = train_test_split(df, test_size=0.8, random_state=42)
    y_train = X_train.pop('y')
    y_test = X_test.pop('y')

    # ('max_trails' / 'get_best_trail' are the library's own spellings.)
    hk.search(X_train, y_train, X_test, y_test, max_trails=50)
    print('-' * 30)

    best_trial = hk.get_best_trail()
    print(f'best_train:{best_trial}')
    estimator = hk.final_train(best_trial.space_sample, X_train, y_train)
    score = estimator.predict(X_test)
    result = estimator.evaluate(X_test, y_test, metrics=['accuracy', 'auc', 'logloss'])
    print(f'final result:{result}')
Example #12
0
def main(client):
    """Train a dask-distributed XGBoost regressor with custom early stopping."""
    n_samples, n_features = 100000, 100
    X, y = make_regression(n_samples=n_samples,
                           n_features=n_features,
                           chunks=200,
                           random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    dtrain = DaskDMatrix(client, X_train, y_train)
    dtest = DaskDMatrix(client, X_test, y_test)

    booster_params = {
        "verbosity": 1,
        "tree_method": "hist",
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "max_depth": 6,
        "learning_rate": 1.0,
    }
    # Stop boosting when test rmse stops improving.
    early_stop = CustomEarlyStopping(validation_set="test",
                                     target_metric="rmse",
                                     maximize=False,
                                     seed=0)

    output = xgb.dask.train(
        client,
        booster_params,
        dtrain,
        num_boost_round=1000,
        evals=[(dtrain, "train"), (dtest, "test")],
        callbacks=[early_stop],
    )
Example #13
0
def process_data(X, y=None, test_size=0.2):
    """Build train (and optionally test) Dataset objects from X/y.

    If *y* is missing, pseudo-labels are produced by KMeans clustering of
    the flattened values. When test_size > 0, a random per-class sample
    gallery is attached to the returned training dataset.

    :param X: input array of samples.
    :param y: optional 2-D label array (first column holds class ids).
    :param test_size: fraction held out for testing; 0 disables the split.
    :return: Dataset, or (train Dataset, test Dataset).
    """
    if y is None:
        # No labels given: cluster raw values into 10 pseudo-classes.
        km = dask_ml.cluster.KMeans(n_clusters=10, init_max_iter=100)
        km.fit(X.flatten().reshape(-1, 1))
        y = km.labels_
    y_uniqs = np.unique(y[:, 0])

    len_ = X.shape[0]
    X = prepare_dataset(X)

    shape_ = list(X.shape[1:])

    if test_size != 0:
        samples = list()
        samples_labels = list()
        print('Preparing samples ...')
        # Two passes: collect random representatives of each class.
        for _ in range(2):
            for y_uniq in y_uniqs:
                sample = list()
                label = list()
                for xa, ya in zip(chunks(X, 10), chunks(y[:, 0], 10)):
                    # randint raises ValueError on an empty class slice and
                    # indexing can raise IndexError; skip those chunks
                    # instead of aborting (was a bare `except:` before).
                    try:
                        matches = xa[ya == y_uniq]
                        sample.append([matches[random.randint(0, len(matches) - 1)]])
                        label.append(y_uniq)
                        if len(sample) >= len(y_uniqs):
                            break
                    except (IndexError, ValueError):
                        pass
                samples += sample
                samples_labels += label
        samples = da.vstack(samples)
        samples_labels = da.vstack(samples_labels)

    if test_size == 0:
        print('Training dataset shape x: ', X.shape)
        print('Training dataset shape y: ', y.shape)

        train_dataset = Dataset(X, y)
        return train_dataset

    X_train, X_test, y_train, y_test = train_test_split(
        X.flatten().reshape(len_, -1), y,
        test_size=test_size, random_state=config.seeds)

    # Restore the original per-sample shape after the flattened split.
    X_train = X_train.reshape([X_train.shape[0]] + shape_)
    X_test = X_test.reshape([X_test.shape[0]] + shape_)

    print('Training dataset shape: ', X_train.shape)
    print('Validation dataset shape: ', X_test.shape)

    train_dataset = Dataset(X_train, y_train)
    test_dataset = Dataset(X_test, y_test)

    train_dataset.samples = samples
    train_dataset.samples_labels = samples_labels

    print('Sample dataset shape: ', train_dataset.samples.shape)
    return train_dataset, test_dataset
Example #14
0
    def _preprocess(
            self, df: "dask.DataFrame",
            inferencing: bool) -> Tuple["dask.DataFrame", "dask.DataFrame"]:
        """Clean, feature-engineer, one-hot encode and standardize *df*.

        :param df: raw input frame with the feature columns, 'fruit',
            'nullable_feature' and -- when training -- 'label'.
        :param inferencing: True reuses the already-fitted transformers and
            returns (data, None); False fits them and returns
            (train frame, test frame), each with the label re-attached.
        """
        df = df.loc[:, df.columns != "index"]
        # remove nulls and/or NaNs scalably with dask
        print(f"step1: drop nulls from rows")
        df = df.dropna(subset=["nullable_feature"])

        print(f"step2: creating new_col and updatingfeature_1")
        df["new_col"] = (df["feature_1"] - 2 * df["feature_2"] +
                         df["feature_3"]) / 3.
        df["feature_1"] = 2. * df["feature_1"] + 0.1
        # TODO: this doesn't work with more than 1 parquet file
        # df['mean_by_fruit'] = df.groupby('fruit')['feature_1'].transform('mean')

        print(f"step3: one-hot encoding fruit")
        df = df.astype({"fruit": "category"})
        df = df.categorize()
        # BUG FIX: persist() is not in-place on dask collections -- the
        # persisted frame it returns must be kept, or the call is a no-op.
        df = df.persist()

        if inferencing:
            assert self.column_transformer is not None
            df_fruits = self.column_transformer.transform(df)
        else:
            assert self.column_transformer is None
            self.column_transformer = ColumnTransformer([
                ("one-hot", OneHotEncoder(sparse=False), ["fruit"])
            ])
            df_fruits = self.column_transformer.fit_transform(df)

        df_data = df.loc[:, (df.columns != "label") & (df.columns != "fruit")]
        df_data = dd.concat([df_data, df_fruits], axis=1)

        assert df_data.isnull().sum().sum().compute(
        ) == 0, "There are nulls or Nans in the data!"

        if inferencing:
            print(f"step4: standardrize inference dataset")
            assert self.scaler is not None
            df_data_inference = self.scaler.transform(df_data)
            return df_data_inference, None
        else:
            print(f"step4: standardrize train dataset")
            df_labels = df.loc[:, df.columns == "label"]
            df_data_train, df_data_test, df_label_train, df_label_test = train_test_split(
                df_data, df_labels)
            # BUG FIX: keep the persisted training frame (see above).
            df_data_train = df_data_train.persist()
            assert self.scaler is None
            self.scaler = StandardScaler(
            )  # this just turns col values to z-scores
            df_data_train = self.scaler.fit_transform(df_data_train)
            df_data_test = self.scaler.transform(df_data_test)
            df_train = dd.concat([df_data_train, df_label_train], axis=1)
            df_test = dd.concat([df_data_test, df_label_test], axis=1)
            return df_train, df_test
Example #15
0
def split(full_set):
    """Read the full CSV dataset and return an 80/20 train/test split."""
    frame = dd.read_csv(full_set, assume_missing=True)

    # 'y' is the target column; everything else is a feature.
    target = frame.y
    features = frame.drop('y', axis=1)

    x_tr, x_te, y_tr, y_te = train_test_split(features,
                                              target,
                                              train_size=0.8,
                                              test_size=0.2)
    return (x_tr, x_te, y_tr, y_te)
Example #16
0
def prepare_dataset(X, y):
    """Split X/y into train/test/valid parts and standardize the targets."""
    scaler = StandardScaler()

    # Resolve unknown chunk sizes, then make each row one chunk wide
    # before the 75/25 split.
    X.compute_chunk_sizes()
    X_train, X_test, y_train, y_test = train_test_split(
        X.rechunk({1: X.shape[1]}), y, test_size=0.25)
    del X
    del y

    # Split the held-out 25% evenly into test and validation halves.
    X_test, X_valid, y_test, y_valid = train_test_split(
        X_test, y_test, test_size=0.5)

    # Scale targets using statistics fitted on the training targets only.
    y_train = scaler.fit_transform(y_train.compute().reshape(-1, 1))
    y_test = scaler.transform(y_test.compute().reshape(-1, 1))
    y_valid = scaler.transform(y_valid.compute().reshape(-1, 1))

    return X_train, X_test, X_valid, y_train, y_test, y_valid
Example #17
0
def split_and_write_data(df: pd.DataFrame, *, seed=42) -> None:
    """Split *df* into train/test feature & target frames; write gzipped CSVs."""
    df = df.drop(['ID', 'lat', 'lon', 'year'], axis=1)

    # 'n2o' and 'gwp' are the two target columns; the rest are features.
    features = df.drop(['n2o', 'gwp'], axis=1)
    targets = df[['n2o', 'gwp']]
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        targets,
                                                        shuffle=True,
                                                        train_size=0.8,
                                                        random_state=seed)
    y_train_n2o = y_train['n2o']
    y_train_gwp = y_train['gwp']

    gzip_args = {'method': 'gzip', 'compresslevel': 1}
    outputs = (("x_train", X_train), ("x_test", X_test),
               ("y_train", y_train), ("y_test", y_test),
               ("y_train_n2o", y_train_n2o), ("y_train_gwp", y_train_gwp))
    for name, frame in outputs:
        frame.to_csv(DEST / f"{name}.csv.gz", compression=gzip_args)
def process_data(X, y=None, test_size=0.20, dummies=False):
    """Prepare train/test Dataset objects plus a per-class sample gallery.

    :param X: input array of samples.
    :param y: optional labels; defaults to all-ones when absent.
    :param test_size: fraction of the data held out for the test Dataset.
    :param dummies: if True, one-hot encode the labels.
    :return: (train Dataset, test Dataset); the train Dataset carries a
        `samples` gallery of randomly drawn per-class examples.
    """
    if y is None:
        y = da.ones(X.shape[0])
    y_uniqs = np.unique(y)

    len_ = X.shape[0]
    X = prepare_dataset(X)

    if dummies:
        y = dd.get_dummies(y)

    shape_ = list(X.shape[1:])

    # Ten passes drawing up to 500 random representatives per class.
    samples = list()
    for _ in range(10):
        for y_uniq in y_uniqs:
            sample = list()
            for xa, ya in zip(chunks(X, 10), chunks(y, 10)):
                # randint raises ValueError on an empty class slice and
                # indexing can raise IndexError; skip those chunks instead
                # of aborting (was a bare `except:` before).
                try:
                    matches = xa[ya == y_uniq]
                    sample.append([matches[random.randint(0, len(matches) - 1)]])
                    if len(sample) >= 500:
                        break
                except (IndexError, ValueError):
                    pass
            samples += sample
    samples = da.vstack(samples)

    X_train, X_test, y_train, y_test = train_test_split(
        X.flatten().reshape(len_, -1), y,
        test_size=test_size, random_state=4891)

    # Restore the original per-sample shape after the flattened split.
    X_train = X_train.reshape([X_train.shape[0]] + shape_)
    X_test = X_test.reshape([X_test.shape[0]] + shape_)

    print('Training dataset shape: ', X_train.shape)
    print('Validation dataset shape: ', X_test.shape)

    train_dataset = Dataset(X_train, y_train)
    test_dataset = Dataset(X_test, y_test)

    train_dataset.samples = samples
    print('Sample dataset shape: ', train_dataset.samples.shape)
    return train_dataset, test_dataset
Example #19
0
def train(seed, epochs, n_gpus, dataset):
    """Train XGBoost on a local GPU cluster via dask.

    :param seed: RNG seed applied through random_seed(seed, param).
    :param epochs: number of boosting rounds.
    :param n_gpus: number of CUDA workers in the local cluster.
    :param dataset: 'boston' (regression) or 'covertype' (multi-class);
        any other value leaves `dataset`/`param` unbound and raises.
    """
    with LocalCUDACluster(n_workers=n_gpus, threads_per_worker=4) as cluster:
        with Client(cluster) as client:
            # Fetch dataset using sklearn
            if dataset == 'boston':
                dataset = load_boston()
                param = {}
            elif dataset == 'covertype':
                dataset = fetch_covtype()
                param = {
                    'objective': 'multi:softmax',
                    # NOTE(review): num_class=8 presumably accounts for
                    # covertype's 1-based labels (1..7) -- confirm.
                    'num_class': 8
                    # 'single_precision_histogram': True
                }

            param['verbosity'] = 2
            param['tree_method'] = 'gpu_hist'

            # Rechunking is required for the covertype dataset
            X = da.from_array(dataset.data, chunks=1000)
            y = da.from_array(dataset.target, chunks=1000)

            # Create 0.75/0.25 train/test split
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.25, train_size=0.75, random_state=0)

            dtrain = DaskDMatrix(client, X_train, y_train)
            dtest = DaskDMatrix(client, X_test, y_test)

            # Mutates `param` in place with the seed (per its usage here).
            random_seed(seed, param)

            gpu_runtime = time.time()
            model_training_results = xgb.dask.train(client,
                                                    param,
                                                    dtrain,
                                                    num_boost_round=epochs,
                                                    evals=[(dtest, 'test')])

            print(model_training_results)
            print(f'GPU Run Time: {str(time.time() - gpu_runtime)} seconds')
Example #20
0
def train_test_split(*data, shuffle=True, random_state=None, **kwargs):
    """Split *data*, using dask_ml when any input is a dask dataframe."""
    if not exist_dask_dataframe(*data):
        # Plain in-memory data: defer directly to scikit-learn.
        return sk_sel.train_test_split(*data,
                                       shuffle=shuffle,
                                       random_state=random_state,
                                       **kwargs)

    if len(data) > 1:
        # Align every input on the first one's divisions so rows that
        # belong together land in the same split.
        data = [make_divisions_known(to_dask_type(x)) for x in data]
        head = data[0]
        for i in range(1, len(data)):
            if data[i].divisions != head.divisions:
                print(
                    '-' * 10,
                    f'repartition {i} from {data[i].divisions} to {head.divisions}'
                )
                data[i] = data[i].repartition(divisions=head.divisions)

    return dm_sel.train_test_split(*data,
                                   shuffle=shuffle,
                                   random_state=random_state,
                                   **kwargs)
Example #21
0
def searchBestForest(params, client):
    """Exhaustive grid search over RandomForestRegressor hyper-parameters.

    :param params: dict with lists under the keys 'n_estimators',
        'max_features', 'max_depth', 'min_samples_split',
        'min_samples_leaf' and 'bootstrap'.
    :param client: dask distributed client used as the joblib backend.

    The best model (by R2 on the held-out 20%) is tracked in `bestMod`
    and printed whenever it improves.
    """
    from itertools import product

    print(client)
    data = getDataForTraining(getData())
    data.to_csv('../ignore/dataPrepared.csv')
    data = dd.read_csv('../ignore/dataPrepared.csv').drop(columns='Unnamed: 0')
    X_train, X_test, y_train, y_test = train_test_split(
        data.drop(columns='price'), data.price, test_size=0.2)
    # NOTE(review): these computed results are discarded -- the dask
    # collections are still what is passed to fit/predict below. Confirm
    # whether the intent was to reassign the materialized frames.
    [ele.compute() for ele in [X_train, X_test, y_train, y_test]]

    def _make_forest(n_estimators=200, max_features='auto', max_depth=10,
                     min_samples_split=2, min_samples_leaf=1, bootstrap=True):
        # Single place for the (previously duplicated) fixed configuration.
        return RandomForestRegressor(bootstrap=bootstrap,
                                     criterion='mse',
                                     max_depth=max_depth,
                                     max_features=max_features,
                                     max_leaf_nodes=None,
                                     min_impurity_decrease=0.0,
                                     min_impurity_split=None,
                                     min_samples_leaf=min_samples_leaf,
                                     min_samples_split=min_samples_split,
                                     min_weight_fraction_leaf=0.0,
                                     n_estimators=n_estimators,
                                     n_jobs=None,
                                     oob_score=False,
                                     random_state=None,
                                     verbose=0,
                                     warm_start=False)

    with joblib.parallel_backend('dask'):
        # Baseline model with the default configuration.
        model = _make_forest()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        bestMod = {'model': model, 'R2_score': r2_score(y_test, y_pred)}
        print(bestMod)
        # itertools.product replaces the original six nested loops.
        grid = product(params['n_estimators'], params['max_features'],
                       params['max_depth'], params['min_samples_split'],
                       params['min_samples_leaf'], params['bootstrap'])
        for estimators, features, dep, samples, samplesL, boot in grid:
            model = _make_forest(n_estimators=estimators,
                                 max_features=features,
                                 max_depth=dep,
                                 min_samples_split=samples,
                                 min_samples_leaf=samplesL,
                                 bootstrap=boot)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            r2 = r2_score(y_test, y_pred)
            if r2 > bestMod['R2_score']:
                bestMod = {'model': model, 'R2_score': r2}
                print(bestMod)
            del model
Example #22
0
# Wrap the remaining pre-computed feature arrays in lazy dask arrays.
# (X_data1 is loaded above, outside this excerpt.)
x_data1 = da.array(X_data1)
X_data2 =np.load('D:/GAT/Sound/2next2b+.npy')
x_data2 = da.array(X_data2)
X_data3 =np.load('D:/GAT/Sound/3next2b+.npy')
x_data3 = da.array(X_data3)

# Stack the three feature blocks along the last (channel) axis.
x_data=da.concatenate([x_data1,x_data2,x_data3],axis=-1)
print(x_data.shape)

# Targets: one column per label in the answer sheet; keep column names
# around for later reporting.
y_data=pd.read_csv('D:/GAT/subm/train_answer.csv', index_col=0)
y_labels = y_data.columns.values
y_data=y_data.values
Y_data=y_data

# #Preprocessing----------------------------------------------------------------------------------------------------------
# 80/20 train/test split, then a further 80/20 train/validation split of
# the training portion (both seeded for reproducibility).
x_train,xtest,y_train,ytest=train_test_split(x_data,Y_data,train_size=0.8,random_state=42)
x_train,x_val,y_train,y_val=train_test_split(x_train,y_train,train_size=0.8,random_state=42)

# kf = KFold(n_splits=4)
# for train_index, test_index in kf.split(x_data):
#     x_train, x_test = x_data[train_index], x_data[test_index]
#     y_train, y_test = y_data[train_index], y_data[test_index]
batch_size=32
# train_generator = ImageDataGenerator(horizontal_flip=True, width_shift_range=0.1)
# train_Iterator = train_generator.flow(x_train, y_train,batch_size=batch_size)
#
# valid_generator = ImageDataGenerator()
# valid_Iterator = valid_generator.flow(x_test, y_test,batch_size=8)

# Model input shape: (height, width, channels) of one training sample.
# (NOTE: `input` shadows the builtin -- left as-is to avoid code changes.)
input=(x_train.shape[1],x_train.shape[2],x_train.shape[3])
Example #23
0
#!/usr/bin/env python
# coding: utf-8

# In[ ]:

# https://www.kaggle.com/puneetgrover/speed-up-your-algorithms-dask
# dask_kaggle_Regression

# In[1]:

from dask_ml.datasets import make_regression
import dask.dataframe as dd

# Synthetic regression problem as lazy dask arrays (20 chunks of 50k rows).
X, y = make_regression(n_samples=1e6, chunks=50000)

# In[2]:

df = dd.from_dask_array(X)
df.head()

# In[3]:

from dask_ml.model_selection import train_test_split, GridSearchCV

# BUG FIX: train_test_split returns (X_train, X_test, y_train, y_test);
# the original unpacking `xtr, ytr, xval, yval` mislabeled the pieces
# (ytr received X_test and xval received y_train).
xtr, xval, ytr, yval = train_test_split(X, y)

# In[ ]:
    results = compute('float64', *shapes)  # trigger computation to find shape
    dtype, shapes = results[0], results[1:]

    chunks = [
        da.from_delayed(part.values, shape, dtype)
        for part, shape in zip(partitions, shapes)
    ]
    return da.concatenate(chunks, axis=0)


# Test-train split
from dask_ml.model_selection import train_test_split

# Split lazily; random_state pins the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(to_dask_array(X),
                                                    to_dask_array(y),
                                                    random_state=99)

###################################################################################

# Fitting the Logistic Regression Classifier
from dask_ml.linear_model import LogisticRegression

lr = LogisticRegression()

# ProgressBar reports scheduler progress while the fit executes.
with ProgressBar():
    lr.fit(X_train, y_train)

# score() is lazy here; .compute() materializes the scalar accuracy.
print('Logistic Regression Score : ', lr.score(X_test, y_test).compute())
##### OUTPUT --------> Logistic Regression Score :  0.70025
Example #25
0
def run():
    """Train incremental classifiers on isHealth.csv with dask.

    Reads the CSV lazily, drops pandas duplicate-suffixed and identifier
    columns, holds out a 10% test set, then (1) repeatedly partial-fits an
    MLPClassifier wrapped in dask-ml's Incremental and (2) runs an
    IncrementalSearchCV over the regularisation strength. Test identifiers,
    labels and predictions are written to CSV files as side effects.
    """
    # Keep a reference so the distributed client stays alive for the run.
    client = Client()
    df = dd.read_csv("isHealth.csv",
                     assume_missing=True,
                     sample=640000000,
                     blocksize="10MB")
    # BUG FIX: the original chained .fillna(0).fillna(0); the second pass
    # was a no-op and has been removed.
    df = df.fillna(0)
    # Columns containing '.' are duplicate-name suffixes (e.g. 'col.1') — drop.
    for column in df.columns:
        if '.' in column:
            df = df.drop(column, axis=1)
    y = df['acquired']
    X = df.drop('acquired', axis=1)
    from dask_ml.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1)
    # Persist the test-row identifiers before the id columns are dropped below.
    x_test_tickers = X_test['ticker'].values.compute()
    x_test_dates = X_test['date'].values.compute()
    print(x_test_tickers[0])
    np.savetxt("x_test_tickers.csv", [x_test_tickers, x_test_dates],
               delimiter=",",
               fmt='%s')
    np.savetxt("x_test_dates.csv", x_test_dates, delimiter=",", fmt='%s')
    print("GOOD")
    # Remove identifier columns so only numeric features reach the models.
    for column in X_train.columns:
        if 'ticker' in column or 'date' in column:
            X_train = X_train.drop(column, axis=1)
            X_test = X_test.drop(column, axis=1)
    X_train = X_train.to_dask_array()
    X_test = X_test.values.compute()
    y_train = y_train.to_dask_array()
    y_test = y_test.values.compute()
    np.savetxt("y_test.csv", y_test, delimiter=",")
    from dask_ml.wrappers import Incremental
    from sklearn.linear_model import SGDClassifier
    from sklearn.neural_network import MLPClassifier
    from dask_ml.wrappers import ParallelPostFit

    est = MLPClassifier(solver='adam', activation='relu', random_state=0)
    inc = Incremental(est, scoring='neg_log_loss')
    print("WORKING")
    # Each pass streams the dask-backed training data through partial_fit.
    for _ in range(10):
        inc.partial_fit(X_train, y_train, classes=[0, 1])
        print("FITTED")
        np.savetxt("predictions.csv", inc.predict_proba(X_test))
        print('Score:', inc.score(X_test, y_test))

    # Hyperparameter search over regularisation strength with early stopping.
    params = {'alpha': np.logspace(-2, 1, num=1000)}
    from dask_ml.model_selection import IncrementalSearchCV
    search = IncrementalSearchCV(est,
                                 params,
                                 n_initial_parameters=100,
                                 patience=20,
                                 max_iter=100)
    search.fit(X_train, y_train, classes=[0, 1])
    print(search)
    print("SCORE")
    print("FITTED")
    # NOTE(review): these two lines reuse `inc`, not the freshly fitted
    # `search`; search.predict_proba / search.score were probably intended —
    # confirm before relying on the reported score.
    np.savetxt("predictions.csv", inc.predict_proba(X_test))
    print('Score:', inc.score(X_test, y_test))
# Release intermediates from earlier preprocessing (originals commented out).
#del Y_vector, outputset
gc.collect()
'''
#a = np.argmax(dummy_y, axis=1)
from dask_ml.model_selection import train_test_split
from dask_ml.preprocessing import DummyEncoder
encoder = DummyEncoder()
yyy = encoder.fit_transform(Y_cat)
dummy_y = dummy_y.values
'''

# Splitting the dataset into train/test with dask-ml.
# `dask_X` / `dask_dummy_y` are assumed to be dask collections built
# upstream (features and one-hot labels) — TODO confirm.
from dask_ml.model_selection import train_test_split
#X_train, X_test, Y_train, Y_test = train_test_split(X, dummy_y, test_size=0.3, random_state=42)
X_train, X_test, Y_train, Y_test = train_test_split(dask_X,
                                                    dask_dummy_y,
                                                    random_state=42)

#del X, dummy_y, dask_X
gc.collect()

# Number of categories = width of one one-hot label row.
# NOTE(review): `Y_test[0]` as positional row access depends on the concrete
# type of Y_test (dask array vs dataframe) — verify this does what is intended.
y_catagories = len(Y_test[0])
#number of rows - outcome_size = len(Y_test)
'''
#Feature scaling
don't need to feature scale because all data is binary
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
Example #27
0
# Preview the assembled dataframe.
print("\n Dataframe: ")
print(df.head())

# Wrap the raw target array as a one-column dask dataframe.
targets = dd.from_array(data['target'])
targets.columns = ["target"]

# Preview the target column.
print("\n Target: ")
print(targets.head())

# Hold out a test split of features and labels.
from dask_ml.model_selection import train_test_split
train, test, train_labels, test_labels = train_test_split(
    df, targets, random_state=123)

# Fit a dask-ml XGBoost classifier on the training split.
from dask_ml.xgboost import XGBClassifier
est = XGBClassifier()
model = est.fit(train, train_labels)

# Rank the features by the fitted model's importance scores.
import pandas as pd
featureimp = pd.DataFrame(model.feature_importances_)
featureimp.columns = ['classifier_feature_importance']
featureimp["variable"] = data['feature_names']
print("\n\n === Xgboost Classifier Feature Importance: === ")
Example #28
0
# In[8]:


# Standardise the feature matrix before fitting.
print("Scaler")
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# In[7]:

# Hold out a third of the rows for evaluation; target is the p.ERK column.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y["p.ERK"], test_size=0.33, random_state=101, shuffle=True)


# In[9]:


# Fit a baseline SGD regressor, routing joblib work through the dask backend.
print("train base model")
import joblib
model = SGDRegressor(verbose=2)
with joblib.parallel_backend('dask'):
    model.fit(X_train.compute(), y_train.compute())


# In[10]:
Example #29
0
def train_model(context: MLClientCtx,
                dataset: DataItem,
                model_pkg_class: str,
                label_column: str = "label",
                train_validation_size: float = 0.75,
                sample: float = 1.0,
                models_dest: str = "models",
                test_set_key: str = "test_set",
                plots_dest: str = "plots",
                dask_key: str = "dask_key",
                dask_persist: bool = False,
                scheduler_key: str = '',
                file_ext: str = "parquet",
                random_state: int = 42) -> None:
    """
    Train a sklearn classifier with Dask

    :param context:                 Function context.
    :param dataset:                 Raw data file.
    :param model_pkg_class:         Model to train, e.g, "sklearn.ensemble.RandomForestClassifier",
                                    or json model config.
    :param label_column:            (label) Ground-truth y labels.
    :param train_validation_size:   (0.75) Train validation set proportion out of the full dataset.
    :param sample:                  (1.0) Select sample from dataset (n-rows/% of total), randomzie rows as default.
    :param models_dest:             (models) Models subfolder on artifact path.
    :param test_set_key:            (test_set) Mlrun db key of held out data in artifact store.
    :param plots_dest:              (plots) Plot subfolder on artifact path.
    :param dask_key:                (dask key) Key of dataframe in dask client "datasets" attribute.
    :param dask_persist:            (False) Should the data be persisted (through the `client.persist`)
    :param scheduler_key:           (scheduler) Dask scheduler configuration, json also logged as an artifact.
    :param file_ext:                (parquet) format for test_set_key hold out data
    :param random_state:            (42) sklearn seed
    """
    # NOTE(review): `dask_key` and `dask_persist` are accepted for interface
    # compatibility but are never used in this function body — confirm intent.

    # Connect to a configured scheduler when given, else start a local client;
    # the reference keeps the client alive for the duration of the run.
    if scheduler_key:
        client = Client(scheduler_key)
    else:
        client = Client()

    context.logger.info("Read Data")
    df = dataset.as_df(df_module=dd)

    context.logger.info("Prep Data")
    # Keep numeric columns only; the pipeline below expects numeric input.
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    df = df.select_dtypes(include=numerics)

    # BUG FIX: compare truthiness directly instead of `== True`, and fix the
    # typo in the error message ('NAs valus' -> 'NA values').
    if df.isna().any().any().compute():
        raise Exception('NA values found')

    # Snapshot the column order for feature-importance labels and the
    # held-out dataset logged at the end.
    df_header = df.columns

    # Shuffle/subsample the rows, then label-encode the target column.
    df = df.sample(frac=sample).reset_index(drop=True)
    encoder = LabelEncoder()
    encoder = encoder.fit(df[label_column])
    X = df.drop(label_column, axis=1).to_dask_array(lengths=True)
    y = encoder.transform(df[label_column])

    classes = df[label_column].drop_duplicates()  # no unique values in dask
    classes = [str(i) for i in classes]

    context.logger.info("Split and Train")
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, train_size=train_validation_size, random_state=random_state)

    # Fit the scaler on the training split only, then apply to both splits.
    scaler = StandardScaler()
    scaler = scaler.fit(X_train)
    X_train_transformed = scaler.transform(X_train)
    X_test_transformed = scaler.transform(X_test)

    model_config = gen_sklearn_model(model_pkg_class,
                                     context.parameters.items())

    model_config["FIT"].update({"X": X_train_transformed, "y": y_train})

    ClassifierClass = create_class(model_config["META"]["class"])

    model = ClassifierClass(**model_config["CLASS"])

    # Route joblib-parallel work inside fit() through the dask cluster.
    with joblib.parallel_backend("dask"):

        model = model.fit(**model_config["FIT"])

    context.logger.info("Evaluate")
    extra_data_dict = {}
    for report in (ROCAUC, ClassificationReport, ConfusionMatrix):

        report_name = str(report.__name__)
        # Reset matplotlib state so each report draws on a clean figure.
        plt.cla()
        plt.clf()
        plt.close()

        viz = report(model, classes=classes, per_class=True, is_fitted=True)
        viz.fit(X_train_transformed,
                y_train)  # Fit the training data to the visualizer
        viz.score(X_test_transformed,
                  y_test.compute())  # Evaluate the model on the test data

        plot = context.log_artifact(PlotArtifact(report_name,
                                                 body=viz.fig,
                                                 title=report_name),
                                    db_key=False)
        extra_data_dict[str(report)] = plot

        if report_name == 'ROCAUC':
            context.log_results({
                "micro": viz.roc_auc.get("micro"),
                "macro": viz.roc_auc.get("macro")
            })

        elif report_name == 'ClassificationReport':
            for score_name in viz.scores_:
                for score_class in viz.scores_[score_name]:

                    context.log_results({
                        score_name + "-" + score_class:
                        viz.scores_[score_name].get(score_class)
                    })

    # Feature-importance report, labelled with every column but the target.
    viz = FeatureImportances(model,
                             classes=classes,
                             per_class=True,
                             is_fitted=True,
                             labels=df_header.delete(
                                 df_header.get_loc(label_column)))
    viz.fit(X_train_transformed, y_train)
    viz.score(X_test_transformed, y_test)

    plot = context.log_artifact(PlotArtifact("FeatureImportances",
                                             body=viz.fig,
                                             title="FeatureImportances"),
                                db_key=False)
    extra_data_dict[str("FeatureImportances")] = plot

    plt.cla()
    plt.clf()
    plt.close()

    context.logger.info("Log artifacts")
    # BUG FIX: artifact_path/plots_path were computed twice (once before the
    # evaluation loop and again here); the first pair was dead code and
    # `plots_path` was never used anywhere, so both redundancies are removed.
    artifact_path = context.artifact_subpath(models_dest)

    context.set_label('class', model_pkg_class)

    context.log_model("model",
                      body=dumps(model),
                      artifact_path=artifact_path,
                      model_file="model.pkl",
                      extra_data=extra_data_dict,
                      metrics=context.results,
                      labels={"class": model_pkg_class})

    context.log_artifact("standard_scaler",
                         body=dumps(scaler),
                         artifact_path=artifact_path,
                         model_file="scaler.gz",
                         label="standard_scaler")

    context.log_artifact("label_encoder",
                         body=dumps(encoder),
                         artifact_path=artifact_path,
                         model_file="encoder.gz",
                         label="label_encoder")

    # Materialise the held-out split as a single numpy table and log it.
    df_to_save = delayed(np.column_stack)((X_test, y_test)).compute()
    context.log_dataset(
        test_set_key,
        df=pd.DataFrame(df_to_save,
                        columns=df_header),  # improve log dataset ability
        format=file_ext,
        index=False,
        labels={"data-type": "held-out"},
        artifact_path=context.artifact_subpath('data'))

    context.logger.info("Done!")
Example #30
0
# Display the dask client (notebook cell output); `client` is created upstream.
client

# In[4]:

from dask_ml.datasets import make_regression

# Synthetic regression problem: 4M rows x 32 features in 1000-row chunks,
# 10 informative features.
X, y = make_regression(n_samples=4000000,
                       n_features=32,
                       chunks=1000,
                       n_informative=10,
                       random_state=101)

# In[5]:

# Hold out a third of the data for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# In[6]:

# XGBoost training parameters.
# NOTE(review): 'n_estimators' is an sklearn-wrapper argument; with
# dask_xgboost.train the round count comes from num_boost_round below —
# confirm whether this key has any effect here.
params = {
    'objective': 'reg:squarederror',
    'n_estimators': 100000,
    'max_depth': 4,
    'eta': 0.01,
    'subsample': 0.5,
    'min_child_weight': 0.5
}

bst = dask_xgboost.train(client, params, X_train, y_train, num_boost_round=100)

# In[7]: