Example #1
def test_get_dummies_errors():
    msg = 'data must have category dtype'
    with tm.assertRaisesRegexp(ValueError, msg):
        # not Categorical
        s = pd.Series([1, 1, 1, 2, 2, 1, 3, 4])
        ds = dd.from_pandas(s, 2)
        dd.get_dummies(ds)
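For reference, the error above disappears once the data has a known categorical dtype; a minimal sketch of the working pattern (assuming the usual pandas/dask imports):

import pandas as pd
import dask.dataframe as dd

s = pd.Series([1, 1, 1, 2, 2, 1, 3, 4])
ds = dd.from_pandas(s, npartitions=2)

# Cast to categorical and materialize the categories before encoding.
dummies = dd.get_dummies(ds.astype('category').cat.as_known())
print(dummies.compute())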
Example #2
def test_get_dummies_kwargs():
    s = pd.Series([1, 1, 1, 2, 2, 1, 3, 4], dtype='category')
    exp = pd.get_dummies(s, prefix='X', prefix_sep='-')

    ds = dd.from_pandas(s, 2)
    res = dd.get_dummies(ds, prefix='X', prefix_sep='-')
    assert_eq(res, exp)
    tm.assert_index_equal(res.columns, pd.Index(['X-1', 'X-2', 'X-3', 'X-4']))

    exp = pd.get_dummies(s, drop_first=True)

    ds = dd.from_pandas(s, 2)
    res = dd.get_dummies(ds, drop_first=True)
    assert_eq(res, exp)
    tm.assert_index_equal(res.columns, exp.columns)

    # nan
    s = pd.Series([1, 1, 1, 2, np.nan, 3, np.nan, 5], dtype='category')
    exp = pd.get_dummies(s)

    ds = dd.from_pandas(s, 2)
    res = dd.get_dummies(ds)
    assert_eq(res, exp)
    tm.assert_index_equal(res.columns, exp.columns)

    # dummy_na
    exp = pd.get_dummies(s, dummy_na=True)

    ds = dd.from_pandas(s, 2)
    res = dd.get_dummies(ds, dummy_na=True)
    assert_eq(res, exp)
    tm.assert_index_equal(res.columns, pd.Index([1, 2, 3, 5, np.nan]))
Example #3
def test_get_dummies_dtype_raises():
    df = pd.DataFrame({
        "A": pd.Categorical(['a', 'b', 'a'], categories=['a', 'b', 'c']),
        "B": [0, 0, 1],
    })
    ddf = dd.from_pandas(df, 2)

    with pytest.raises(ValueError) as m:
        dd.get_dummies(ddf, dtype='float64')

    assert m.match("0.23.0")
Example #4
def test_get_dummies_dtype():
    df = pd.DataFrame({
        "A": pd.Categorical(['a', 'b', 'a'], categories=['a', 'b', 'c']),
        "B": [0, 0, 1],
    })
    ddf = dd.from_pandas(df, 2)

    exp = pd.get_dummies(df, dtype='float64')
    res = dd.get_dummies(ddf, dtype='float64')
    assert_eq(exp, res)
    assert res.compute().A_a.dtype == 'float64'

    # dask's get_dummies on a pandas dataframe.
    assert_eq(dd.get_dummies(df, dtype='float64'), exp)
    assert res.compute().A_a.dtype == 'float64'
Example #5
def test_get_dummies_sparse():
    s = pd.Series(pd.Categorical(['a', 'b', 'a'], categories=['a', 'b', 'c']))
    ds = dd.from_pandas(s, 2)

    exp = pd.get_dummies(s, sparse=True)
    res = dd.get_dummies(ds, sparse=True)
    assert_eq(exp, res)

    assert res.compute().a.dtype == 'uint8'
    assert pd.api.types.is_sparse(res.a.compute())

    exp = pd.get_dummies(s.to_frame(name='a'), sparse=True)
    res = dd.get_dummies(ds.to_frame(name='a'), sparse=True)
    assert_eq(exp, res)
    assert pd.api.types.is_sparse(res.a_a.compute())
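If a downstream step cannot handle the sparse extension columns, one plausible follow-up (assuming pandas >= 0.25 for the .sparse accessor) is to densify after computing:

# Hypothetical: convert the sparse dummy columns back to dense uint8.
dense = res.compute().sparse.to_dense()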
Example #6
def test_get_dummies(data):
    exp = pd.get_dummies(data)

    ddata = dd.from_pandas(data, 2)
    res = dd.get_dummies(ddata)
    assert_eq(res, exp)
    tm.assert_index_equal(res.columns, exp.columns)
Example #7
def test_get_dummies_sparse_mix():
    df = pd.DataFrame({
        "A": pd.Categorical(['a', 'b', 'a'], categories=['a', 'b', 'c']),
        "B": [0, 0, 1],
    })
    ddf = dd.from_pandas(df, 2)

    exp = pd.get_dummies(df, sparse=True)
    res = dd.get_dummies(ddf, sparse=True)
    assert_eq(exp, res)

    assert res.compute().A_a.dtype == 'uint8'
    assert pd.api.types.is_sparse(res.A_a.compute())
Example #8
def test_get_dummies_object():
    df = pd.DataFrame({'a': pd.Categorical([1, 2, 3, 4, 4, 3, 2, 1]),
                       'b': list('abcdabcd'),
                       'c': pd.Categorical(list('abcdabcd'))})
    # exclude object columns
    exp = pd.get_dummies(df, columns=['a', 'c'])

    ddf = dd.from_pandas(df, 2)
    res = dd.get_dummies(ddf)
    assert_eq(res, exp)
    tm.assert_index_equal(res.columns, exp.columns)

    exp = pd.get_dummies(df, columns=['a'])

    ddf = dd.from_pandas(df, 2)
    res = dd.get_dummies(ddf, columns=['a'])
    assert_eq(res, exp)
    tm.assert_index_equal(res.columns, exp.columns)

    # cannot target object columns
    msg = 'target columns must have category dtype'
    with tm.assertRaisesRegexp(ValueError, msg):
        dd.get_dummies(ddf, columns=['b'])
Example #9
def test_get_dummies_errors():
    with pytest.raises(NotImplementedError):
        # not Categorical
        s = pd.Series([1, 1, 1, 2, 2, 1, 3, 4])
        ds = dd.from_pandas(s, 2)
        dd.get_dummies(ds)

    # unknown categories
    df = pd.DataFrame({'x': list('abcbc'), 'y': list('bcbcb')})
    ddf = dd.from_pandas(df, npartitions=2)
    ddf._meta = make_meta({'x': 'category', 'y': 'category'})

    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf)

    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf, columns=['x', 'y'])

    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf.x)
Example #10
def test_get_dummies_errors():
    with pytest.raises(NotImplementedError):
        # not Categorical
        s = pd.Series([1, 1, 1, 2, 2, 1, 3, 4])
        ds = dd.from_pandas(s, 2)
        dd.get_dummies(ds)

    # unknown categories
    df = pd.DataFrame({'x': list('abcbc'), 'y': list('bcbcb')})
    ddf = dd.from_pandas(df, npartitions=2)
    ddf._meta = make_meta({'x': 'category', 'y': 'category'})

    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf)

    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf, columns=['x', 'y'])

    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf.x)
Example #11
def test_get_dummies_errors():
    with pytest.raises(NotImplementedError):
        # not Categorical
        s = pd.Series([1, 1, 1, 2, 2, 1, 3, 4])
        ds = dd.from_pandas(s, 2)
        dd.get_dummies(ds)

    # unknown categories
    df = pd.DataFrame({"x": list("abcbc"), "y": list("bcbcb")})
    ddf = dd.from_pandas(df, npartitions=2)
    ddf._meta = make_meta({"x": "category", "y": "category"})

    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf)

    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf, columns=["x", "y"])

    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf.x)
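The unknown-categories failures above are typically resolved by materializing the categories first; a minimal sketch using DataFrame.categorize():

import pandas as pd
import dask.dataframe as dd

df = pd.DataFrame({'x': list('abcbc'), 'y': list('bcbcb')})
ddf = dd.from_pandas(df, npartitions=2)

# categorize() scans the data once so every partition shares known categories.
ddf = ddf.categorize(columns=['x', 'y'])
print(dd.get_dummies(ddf).compute())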
Example #12
def test_get_dummies_sparse_mix():
    df = pd.DataFrame(
        {
            "A": pd.Categorical(["a", "b", "a"], categories=["a", "b", "c"]),
            "B": [0, 0, 1],
        }
    )
    ddf = dd.from_pandas(df, 2)

    exp = pd.get_dummies(df, sparse=True)
    res = dd.get_dummies(ddf, sparse=True)
    assert_eq(exp, res)

    if PANDAS_GT_0240:
        exp_dtype = "Sparse[uint8, 0]"
    else:
        exp_dtype = "uint8"
    assert res.compute().A_a.dtype == exp_dtype
    assert pd.api.types.is_sparse(res.A_a.compute())
Example #13
    def run(self):
        dtype = {
            'bedrooms': 'float32',
            'beds': 'float32',
            'review_scores_accuracy': 'float32',
            'review_scores_checkin': 'float32',
            'review_scores_cleanliness': 'float32',
            'review_scores_communication': 'float32',
            'review_scores_location': 'float32',
            'review_scores_rating': 'float32',
            'review_scores_value': 'float32'
        }

        ddf_listing = dd.read_csv(self.listings_csv_filename, dtype=dtype)
        use_columns_in_listing = [
            'id',
            'latitude',
            'longitude',
            'property_type',
            'room_type',
            'accommodates',
            'bedrooms',
            'beds',
            'cancellation_policy',
        ]
        ddf_listing = ddf_listing.loc[:, use_columns_in_listing]

        # property_type, room_type, cancellation_policy
        ddf_listing = ddf_listing.categorize(
            columns=['property_type', 'room_type', 'cancellation_policy'])
        ddf_listing = dd.get_dummies(
            ddf_listing,
            columns=['property_type', 'room_type', 'cancellation_policy'])

        # ddf_listing = ddf_listing.reset_index()
        ddf_listing = ddf_listing.rename(columns={'id': 'listing_id'})
        ddf_listing = ddf_listing.compute()

        print(ddf_listing.shape)
        print(ddf_listing.head())

        with open(self.output().path, "w") as target:
            ddf_listing.to_csv(target)
Example #14
def process_data(X, y=None, test_size=0.20, dummies=False):
    if y is None:
        y = da.ones(X.shape[0])
    y_uniqs = np.unique(y)

    len_ = X.shape[0]
    X = prepare_dataset(X)

    if dummies:
        y = dd.get_dummies(y)

    shape_ = list(X.shape[1:])

    samples = list()
    for _ in range(10):
        for y_uniq in y_uniqs:
            sample = list()
            for xa, ya in zip(chunks(X, 10), chunks(y, 10)):
                try:
                    sample.append([xa[ya == y_uniq][random.randint(0, len(xa[ya == y_uniq]) - 1)]])
                    if len(sample) >= 500:
                        break
                except:
                    pass
            samples += sample
    samples = da.vstack(samples)

    X_train, X_test, y_train, y_test = train_test_split(X.flatten().reshape(len_, -1), y, test_size=test_size,
                                                        random_state=4891)

    X_train = X_train.reshape([X_train.shape[0]] + shape_)
    X_test = X_test.reshape([X_test.shape[0]] + shape_)

    print('Training dataset shape: ', X_train.shape)
    print('Validation dataset shape: ', X_test.shape)

    train_dataset = Dataset(X_train, y_train)
    test_dataset = Dataset(X_test, y_test)

    train_dataset.samples = samples
    print('Sample dataset shape: ', train_dataset.samples.shape)
    return train_dataset, test_dataset
Example #15
def preprocess(csv_dir, types_path, preproc_path, output_path):
    """
    Perform preprocessing steps on the raw CSV data as prescribed in the preprocessors file and store the output to HDF.
    :param csv_dir: directory with raw CSV data.
    :param types_path: JSON with type information to correctly parse CSVs.
    :param preproc_path: JSON with preprocessors descriptions.
    :param output_path: destination path for the resulting HDF file with processed data.
    :return:
    """
    print("** Start preprocessing **")

    if not os.path.exists(output_path):
        # Read type & transform configs
        with open(types_path) as jf:
            dtypes = json.load(jf)

        with open(preproc_path) as jf:
            preproc = json.load(jf)

        with LocalCluster() as cluster:
            with Client(cluster) as client:
                df = dd.read_csv(os.path.join(csv_dir, '*.csv'), dtype=dtypes)

                # Separate categoricals
                cat_cols = [col for col, tp in dtypes.items() if tp == 'category' and col in df.columns]
                num_cols = [col for col in df.columns if col not in cat_cols]

                for desc in preproc["preprocessors"]:
                    p = create_preprocessor(desc)
                    df = p.apply(df)

                # {"column": "TailNum", "name": "FillValue", "value": "UNKNOW"},

                # Convert to known categoricals
                df = dd.get_dummies(df.categorize())
                print("Columns after preprocessing: ", df.columns)

                df.to_hdf(output_path, '/data')
    else:
        print("- file {} already exists, skipping preprocessing".format(output_path))

    print("** Finished preprocessing **")
Example #16
def make_categorical(
    client: Client,
    n_samples: int,
    n_features: int,
    n_categories: int,
    onehot: bool = False,
) -> Tuple[dd.DataFrame, dd.Series]:
    workers = _get_client_workers(client)
    n_workers = len(workers)
    dfs = []

    def pack(**kwargs: Any) -> dd.DataFrame:
        X, y = tm.make_categorical(**kwargs)
        X["label"] = y
        return X

    meta = pack(n_samples=1,
                n_features=n_features,
                n_categories=n_categories,
                onehot=False)

    for i, worker in enumerate(workers):
        l_n_samples = min(n_samples // n_workers,
                          n_samples - i * (n_samples // n_workers))
        future = client.submit(
            pack,
            n_samples=l_n_samples,
            n_features=n_features,
            n_categories=n_categories,
            onehot=False,
            workers=[worker],
        )
        dfs.append(future)

    df = dd.from_delayed(dfs, meta=meta)
    y = df["label"]
    X = df[df.columns.difference(["label"])]

    if onehot:
        return dd.get_dummies(X), y
    return X, y
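A hypothetical call site, assuming a running distributed scheduler; onehot=True routes the features through dd.get_dummies as above:

from distributed import Client

client = Client()  # assumed local cluster
X, y = make_categorical(client, n_samples=10_000, n_features=8,
                        n_categories=4, onehot=True)
print(X.columns)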
Example #17
def test_get_dummies_object():
    df = pd.DataFrame({'a': pd.Categorical([1, 2, 3, 4, 4, 3, 2, 1]),
                       'b': list('abcdabcd'),
                       'c': pd.Categorical(list('abcdabcd'))})
    ddf = dd.from_pandas(df, 2)

    # Explicitly exclude object columns
    exp = pd.get_dummies(df, columns=['a', 'c'])
    res = dd.get_dummies(ddf, columns=['a', 'c'])
    assert_eq(res, exp)
    tm.assert_index_equal(res.columns, exp.columns)

    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf)

    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf.b)

    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf, columns=['b'])
Example #18
def weather_cluster(data):
    """
    Creates a column that gives a cluster id based on KMeans clustering of only weather-related features

    :param data: a pandas dataframe where each row is an hour
    :return: a pandas dataframe containing the new column
    """
    print("\tAdding clustering variable based on weather-related features...")
    df = data.copy()[["weathersit", "temp", "atemp", "hum", "windspeed"]]
    to_cluster = dd.get_dummies(df)
    train = get_train(to_cluster)
    holdout = get_holdout(to_cluster)

    kmeans = KMeans(n_clusters=5,
                    random_state=SEED).fit(train)  # magic numbers, blech

    data["weather_cluster"] = da.append(kmeans.labels_,
                                        kmeans.predict(holdout))

    data["weather_cluster"] = data["weather_cluster"].astype("category")

    return data
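A hypothetical call, assuming data is the hourly frame the (undefined here) get_train/get_holdout helpers expect:

# Hypothetical usage: adds a categorical 'weather_cluster' column to data.
data = weather_cluster(data)
print(data['weather_cluster'].cat.categories)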
Example #19
def cluster_variable(data):
    """
    Creates a column that gives a cluster id based on KMeans clustering of all features

    :param data: a pandas dataframe where each row is an hour
    :return: a pandas dataframe containing the new column
    """
    print("\tAdding cluster variable...")
    data = data.copy()
    to_cluster = dd.get_dummies(data)
    train = get_train(to_cluster)
    holdout = get_holdout(to_cluster)

    kmeans = KMeans(n_clusters=5, random_state=SEED).fit(
        train.drop("cnt", axis=1))  # magic numbers, blech

    data["cluster"] = da.append(kmeans.labels_,
                                kmeans.predict(holdout.drop("cnt", axis=1)))

    data["cluster"] = data["cluster"].astype("category")

    return data
Example #20
def preprocessing(data):
    print('Preprocessing started!')
    start_time = time.time()

    # One-Hot Encoding
    data['DayofWeek'] = data['DayofWeek'].astype('category')
    data_encoded = dd.get_dummies(data[['UniqueCarrier', 'Origin', 'Dest', 'DayofWeek']].categorize()).compute()
    print('Data encoded: ', (time.time()-start_time))

    data_reduced = data.drop(['UniqueCarrier', 'Origin', 'Dest', 'FlightNum', 'Diverted', 'DayofWeek'], axis=1).compute()
    print('Data reduced: ', (time.time() - start_time))

    X = pd.concat([data_reduced, data_encoded], axis=1)
    print('Data concatenated: ', (time.time() - start_time))


    #y[y<0] = 0


    end_time = time.time()
    duration = end_time - start_time

    # print(data_encoded.info())
    # print(data_full.info())
    # print(data_reduced.info())
    #
    # print(h.heap())

    del data_reduced
    del data_encoded

    gc.collect()

    #print('Afer Deletion:', h.heap())

    print('Duration Preprocessing: ', duration)

    return X
Example #21
    def transform(self, X, y=None):
        """Dummy encode the categorical columns in X

        Parameters
        ----------
        X : pd.DataFrame or dd.DataFrame
        y : ignored

        Returns
        -------
        transformed : pd.DataFrame or dd.DataFrame
            Same type as the input
        """
        if not X.columns.equals(self.columns_):
            raise ValueError("Columns of 'X' do not match the training "
                             "columns. Got {!r}, expected {!r}".format(
                                 X.columns, self.columns_))
        if isinstance(X, pd.DataFrame):
            return pd.get_dummies(X, drop_first=self.drop_first)
        elif isinstance(X, dd.DataFrame):
            return dd.get_dummies(X, drop_first=self.drop_first)
        else:
            raise TypeError("Unexpected type {}".format(type(X)))
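A hypothetical round trip with a transformer exposing this transform(); dask_ml.preprocessing.DummyEncoder behaves this way:

import pandas as pd
import dask.dataframe as dd
from dask_ml.preprocessing import DummyEncoder

df = pd.DataFrame({'g': pd.Categorical(['a', 'b', 'a'])})
ddf = dd.from_pandas(df, npartitions=2)

enc = DummyEncoder(drop_first=True)
out = enc.fit_transform(ddf)  # a dd.DataFrame, same type as the input
print(out.compute())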
Example #22
def test_get_dummies_object():
    df = pd.DataFrame({
        'a': pd.Categorical([1, 2, 3, 4, 4, 3, 2, 1]),
        'b': list('abcdabcd'),
        'c': pd.Categorical(list('abcdabcd'))
    })
    ddf = dd.from_pandas(df, 2)

    # Explicitly exclude object columns
    exp = pd.get_dummies(df, columns=['a', 'c'])
    res = dd.get_dummies(ddf, columns=['a', 'c'])
    assert_eq(res, exp)
    tm.assert_index_equal(res.columns, exp.columns)

    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf)

    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf.b)

    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf, columns=['b'])
Example #23
def test_get_dummies_object():
    df = pd.DataFrame({
        "a": pd.Categorical([1, 2, 3, 4, 4, 3, 2, 1]),
        "b": list("abcdabcd"),
        "c": pd.Categorical(list("abcdabcd")),
    })
    ddf = dd.from_pandas(df, 2)

    # Explicitly exclude object columns
    exp = pd.get_dummies(df, columns=["a", "c"])
    res = dd.get_dummies(ddf, columns=["a", "c"])
    assert_eq(res, exp)
    tm.assert_index_equal(res.columns, exp.columns)

    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf)

    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf.b)

    with pytest.raises(NotImplementedError):
        dd.get_dummies(ddf, columns=["b"])
Example #24
def test_get_dummies_kwargs():
    s = pd.Series([1, 1, 1, 2, 2, 1, 3, 4], dtype='category')
    exp = pd.get_dummies(s, prefix='X', prefix_sep='-')

    ds = dd.from_pandas(s, 2)
    res = dd.get_dummies(ds, prefix='X', prefix_sep='-')
    assert_eq(res, exp)
    tm.assert_index_equal(res.columns, pd.Index(['X-1', 'X-2', 'X-3', 'X-4']))

    exp = pd.get_dummies(s, drop_first=True)

    ds = dd.from_pandas(s, 2)
    res = dd.get_dummies(ds, drop_first=True)
    assert_eq(res, exp)
    tm.assert_index_equal(res.columns, exp.columns)

    # nan
    s = pd.Series([1, 1, 1, 2, np.nan, 3, np.nan, 5], dtype='category')
    exp = pd.get_dummies(s)

    ds = dd.from_pandas(s, 2)
    res = dd.get_dummies(ds)
    assert_eq(res, exp)
    tm.assert_index_equal(res.columns, exp.columns)

    # dummy_na
    exp = pd.get_dummies(s, dummy_na=True)

    ds = dd.from_pandas(s, 2)
    res = dd.get_dummies(ds, dummy_na=True)
    assert_eq(res, exp)
    tm.assert_index_equal(res.columns, pd.Index([1, 2, 3, 5, np.nan]))

    msg = 'sparse=True is not supported'
    with pytest.raises(NotImplementedError) as err:
        dd.get_dummies(ds, sparse=True)
    assert msg in str(err.value)
Example #26
def pipeline_casero(data, preprocessing=[], creation=[], reduction=[], selection=[], models=[]):
    """
    A homemade pipeline to automate all the steps of data preparation, feature creation, feature selection, feature
    reduction, and outputting a fitted model

    This is not strictly necessary as it does not add more functionality than an sklearn Pipeline, but we thought it
    would be easier to use for our purposes and it has the added benefit of allowing us to control the verbosity of
    the output.


    :param data: A DataFrame containing the data for the pipeline
    :param preprocessing: An iterable of all the preprocessing steps (functions with signature [DataFrame -> DataFrame])
    :param creation: An iterable of all the feature creation steps (functions with signature [DataFrame -> DataFrame])
    :param reduction: An iterable of all the dimensionality reduction steps (functions with signature [DataFrame -> DataFrame])
    :param selection: An iterable of all the feature selection steps (functions with signature [DataFrame -> DataFrame])
    :param models: An array of dicts containing the name for the model ("name"), the sklearn estimator ("model"),
                    and the parameters for Grid Search Cross Validation ("params")
    :return: A fitted model that represents the best model out of all the ones in 'models'
    """
    print("Beginning pipeline at {}\n".format(datetime.now()))
    print("Performing preprocessing steps...")
    data = update_df(data, preprocessing)
    print("Preprocessing completed at {}, performed {} steps".format(datetime.now(), len(preprocessing)))
    print("New Shape of data: {0}\n".format(len(data.columns)))

    print("Performing feature creation...")
    data = update_df(data, creation)
    print("Feature Creation completed at {}, performed {} steps".format(datetime.now(), len(creation)))
    print("New Shape of data: {0}\n".format(len(data.columns)))

    print("Dummifying...")
    data = dd.get_dummies(data)
    print("New Shape of data: {0}\n".format(len(data.columns)))


    print("Performing dimensionality reduction...")
    data = update_df(data, reduction)
    print("Dimensionality reduction completed at {}, performed {} steps".format(datetime.now(), len(reduction)))
    print("New Shape of data: {0}\n".format(len(data.columns)))

    train = get_train(data)
    holdout = get_holdout(data)
    print("Performing feature selection...")
    train = update_df(train, selection)
    print("Feature Selection completed at {}, performed {} steps".format(datetime.now(), len(selection)))
    print("New Shape of train: {0}\n".format(len(train.columns)))

    holdout = holdout[train.columns]

    # The issue here is that this assumes the models need the data in the same
    # format, unless we submit pipelines here, which seems silly.

    print("Scoring models....")

    best_model = select_best_model(models, train)

    print("Evaluating model on the holdout...")
    final_r2 = r2_score(holdout.cnt, best_model.predict(holdout.drop("cnt", axis=1)))
    print("Final R2: {0}".format(final_r2))
    print("\nPipeline finished! Completed execution at {}. Returning model...".format(datetime.now()))

    return best_model
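A hypothetical invocation of the pipeline, with the model specs assumed:

from sklearn.linear_model import Ridge

best = pipeline_casero(
    data,
    preprocessing=[],
    creation=[],
    models=[{'name': 'ridge', 'model': Ridge(),
             'params': {'alpha': [0.1, 1.0]}}],
)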
Example #27
dummies = pd.get_dummies(race_column, prefix='race')

ipums_rejoined = ipums_race_df_sample.join(dummies)
ipums_rejoined['Hisp_1'] = ipums_rejoined['Hisp'].apply(lambda x: 1 if (x > 0 ) and (x != 9) else 0)
ipums_rejoined['Educ_1'] = ipums_rejoined['Educ'].apply(lambda x: 1 if x > 9 else 0)

ipums_grped = ipums_rejoined.groupby(['State', 'Year'])[['Per_Urban', 'Mean_Num_Veh', 'Educ_1', 'Poverty', 'race_1', 'race_2', 'race_3', 'race_4', 'Hisp_1']].mean()

ipums_race_df = dd.read_csv('/home/jaala/win-Python/Projects/Abortion/data/ipums_demo_various.csv', \
    names=['Year', 'State', 'Per_Urban', 'Mean_Num_Veh', 'Hisp', 'Race', 'Educ_1', 'Per_Poverty'], \
    usecols=[0, 6, 7, 9, 12, 14, 16, 18], header=0, blocksize=25e6, \
    dtype={'Race': 'category', 'Hisp': 'category', 'State': 'category', 'Region': 'category', 'Per_Urban': 'float64'})


race_column = ipums_race_df['Race']
dummies = dd.get_dummies(race_column.to_frame().categorize(), prefix='race')
ipums_rejoined = ipums_race_df.join(dummies)
ipums_rejoined['Per_Hisp'] = ipums_rejoined['Hisp'].apply(lambda x: 1 if (x > 0 ) and (x != 9) else 0, meta=('float'))
ipums_rejoined['Perc_w_Bachelors'] = ipums_rejoined['Educ_1'].apply(lambda x: 1 if x > 9 else 0, meta=('float'))
ipums_grped = ipums_rejoined.groupby(['State', 'Year'])[['Per_Urban', 'Mean_Num_Veh', 'Perc_w_Bachelors', 'Per_Poverty', 'race_1', 'race_2', 'race_3', 'race_4', 'race_5', 'Per_Hisp']].mean()

ipums_df = ipums_grped.compute()


#df_ipums

ipums_grped['race_1'].describe()


Example #28
    },
    blocksize="16 MiB",
    storage_options={"anon": True},
)  #.head(n=1000)

print(df.columns)
print(len(df))  # 7,667,792

#storage_options={'key': settings.AWS_ACCESS_KEY_ID,
#                'secret': settings.AWS_SECRET_ACCESS_KEY})

df = df.repartition(partition_size="10 MiB").persist()

# one hot encode the categorical columns
df = df.categorize(categorical_features)
df = dd.get_dummies(df, columns=categorical_features)

# persist so only download once
df = df.persist()

data = df[[c for c in df.columns if c not in output]]
data = data.fillna(0)

durations = (df["tpep_dropoff_datetime"] -
             df["tpep_pickup_datetime"]).dt.total_seconds() / 60  # minutes

from dask_ml.model_selection import train_test_split
import dask

X = data.to_dask_array(lengths=True).astype("float32")
y = durations.to_dask_array(lengths=True).astype("float32")
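A plausible continuation (not shown in the snippet): carving out a validation split with the dask_ml splitter imported above:

# Hypothetical: hold out 20% of the rows for validation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)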
Example #29
def test_get_dummies_errors():
    with pytest.raises(NotImplementedError):
        # not Categorical
        s = pd.Series([1, 1, 1, 2, 2, 1, 3, 4])
        ds = dd.from_pandas(s, 2)
        dd.get_dummies(ds)
Example #30
#ddf = ddf.drop(["cell_line"],axis=1)

print("filling NA")
# y = y.compute()
with ProgressBar():
    for m in ["p.HER2", "p.PLCg2"]:
        #         print (df[m])
        train["%s_c" % (m)] = train[m].fillna(
            train[m].mean())  #, inplace=True )
        test["%s_c" % (m)] = test[m].fillna(test[m].mean())
#       ddf["%s_c"%(m)] =ddf[m].fillna(ddf[m].mean() )#, inplace=True )

# In[7]:

#ddf = dd.get_dummies(ddf.categorize()).persist()
train = dd.get_dummies(train.categorize()).persist()
test = dd.get_dummies(test.categorize()).persist()

# In[8]:

#ddf = ddf.drop(["p.HER2","p.PLCg2","cellID","fileID"],axis=1)
train = train.drop(["p.HER2", "p.PLCg2", "cellID", "fileID"], axis=1)
test = test.drop(["p.HER2", "p.PLCg2", "cellID", "fileID"], axis=1)

# In[9]:

rounds = {}
genes = [
    'b.CATENIN', 'cleavedCas', 'CyclinB', 'GAPDH', 'IdU', 'Ki.67', 'p.4EBP1',
    'p.Akt.Ser473.', 'p.AKT.Thr308.', 'p.AMPK', 'p.BTK', 'p.CREB', 'p.ERK',
    'p.FAK', 'p.GSK3b', 'p.H3', 'p.HER2_c', 'p.JNK', 'p.MAP2K3', 'p.MAPKAPK2',
Example #31
#having a look at the head of the dataset
df.head()

#finding the null values in the dataset
df.isnull().sum().compute()

#defining the data and target
categorical_variables = df[[
    'Gender', 'Age', 'Occupation', 'City_Category',
    'Stay_In_Current_City_Years', 'Marital_Status'
]]
target = df['Purchase']

#creating dummies for the categorical variables
data = dd.get_dummies(categorical_variables.categorize()).compute()

#converting dataframe to array
datanew = data.values

#fit the model
from dask_ml.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(datanew, target)

#preparing the test data
test_categorical = test[[
    'Gender', 'Age', 'Occupation', 'City_Category',
    'Stay_In_Current_City_Years', 'Marital_Status'
]]
test_dummy = dd.get_dummies(test_categorical.categorize()).compute()
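A plausible final step (assumed, not shown in the source): scoring the fitted model on the encoded test matrix:

# Hypothetical: predict purchases for the encoded test rows.
predictions = lr.predict(test_dummy.values)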
Example #32
def test_get_dummies_errors():
    with pytest.raises(NotImplementedError):
        # not Categorical
        s = pd.Series([1, 1, 1, 2, 2, 1, 3, 4])
        ds = dd.from_pandas(s, 2)
        dd.get_dummies(ds)
Example #33
# In[22]:
#Now that we have extracted the required derived features from the pickup and dropoff datetimes, drop them
ddf = ddf.drop(['tpep_pickup_datetime', 'tpep_dropoff_datetime'], axis=1)

# #### One Hot encoding
# In[23]:
get_dummy_col = [
    "VendorID", "RatecodeID", "store_and_fwd_flag", "PULocationID",
    "DOLocationID", "payment_type", "pickup_hour", "dropoff_hour",
    "pickup_day", "dropoff_day"
]
ddf = ddf.categorize(columns=get_dummy_col)

# In[24]:
ddf = dpd.get_dummies(ddf, columns=get_dummy_col)

print("After one-hot encoding")
print(ddf.__class__)
print(ddf.shape[0].compute())

# #### Use tip_amount as the target label for training
# In[25]:
label = ddf[['tip_amount']].compute()
ddf = ddf.drop(['tip_amount'], axis=1)
ddf = ddf.drop(['total_amount'], axis=1)

# #### Transform features and then normalize values
# In[26]:
from dask_ml.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()
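A plausible next step (assumed): fitting the scaler on the feature frame and transforming it in one pass:

# Hypothetical continuation: scale the remaining numeric columns to [0, 1].
ddf_scaled = min_max_scaler.fit_transform(ddf)
print(ddf_scaled.head())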
Example #34

X = train_df.drop(target_genes, axis=1)
y = train_df[target_genes]


# In[14]:


print ("cat4egorize")

X = X.categorize(columns=["treatment"])

print ("dummies")

my_dummies = dd.get_dummies(X["treatment"])


X= X.drop(['treatment', 'cell_line', 'time', 'cellID', 'fileID'],axis=1)


# In[15]:


#  y.columns


# In[16]:


# test  = my_dummies.compute()
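A plausible way to rejoin the encoded treatment columns onto the feature matrix (assumed step; the snippet never uses my_dummies):

# Hypothetical: concatenate the one-hot treatment columns back onto X.
X = dd.concat([X, my_dummies], axis=1)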
Example #35
cat_variables = hour.dtypes[hour.dtypes == "object"].index
cat_variables

# In[29]:

ddf_hour = hour.categorize()

# In[30]:

ddf_hour.head()

# Now we proceed to dummify the selected variables so they can be used in our models.

# In[31]:

hour = ddf.get_dummies(ddf_hour, columns=cat_variables)
print("The dataset now contains {} columns.".format(hour.shape[1]))

# ## Skewness

# Now we will check whether there is any skewness in our target variable; if so, we will take its log to bring it closer to a normal distribution.

# In[32]:

plt.subplots(figsize=(15, 6))
sns.distplot(ddf_hour.cnt.compute(), color="red")
plt.title("Distribution of Total Count")

# In[33]:

ddf_hour['cnt'] = np.log1p(ddf_hour['cnt'])