Example 1
def test_dataframes():
    df = dd.from_pandas(
        pd.DataFrame({"A": [1, 2, 3, 4, 5], "B": [6, 7, 8, 9, 10]}), npartitions=2
    )

    kmeans = DKKMeans()
    kmeans.fit(df)
Example 2
    def test_basic(self, Xl_blobs_easy):
        X, _ = Xl_blobs_easy

        # make it super easy to cluster
        a = DKKMeans(n_clusters=3, random_state=0)
        b = SKKMeans(n_clusters=3, random_state=0)
        a.fit(X)
        b.fit(X)
        assert_estimator_equal(
            a,
            b,
            exclude=['n_iter_', 'inertia_', 'cluster_centers_', 'labels_'])
        assert abs(a.inertia_ - b.inertia_) < 0.01
        # order is arbitrary, so align first
        a_order = np.argsort(a.cluster_centers_, 0)[:, 0]
        b_order = np.argsort(b.cluster_centers_, 0)[:, 0]
        a_centers = a.cluster_centers_[a_order]
        b_centers = b.cluster_centers_[b_order]
        np.testing.assert_allclose(a_centers, b_centers, rtol=1e-3)
        b_labels = replace(b.labels_, [0, 1, 2], a_order[b_order])
        assert_eq(a.labels_.compute(), b_labels)
        assert a.n_iter_
        # this is hacky
        b.cluster_centers_ = b_centers
        a.cluster_centers_ = a_centers
        assert_eq(a.transform(X), b.transform(X), rtol=1e-3)
Example 3
def test_fit_raises():
    km = DKKMeans()
    with pytest.raises(ValueError):
        km.fit(np.array([]).reshape(0, 1))

    with pytest.raises(ValueError):
        km.fit(np.array([]).reshape(1, 0))
Example 4
    def test_fit_given_init(self, X_blobs):
        X_ = X_blobs.compute()
        x_squared_norms = k_means_.row_norms(X_, squared=True)
        rs = np.random.RandomState(0)
        init = k_means_._k_init(X_, 3, x_squared_norms, rs)
        dkkm = DKKMeans(3, init=init, random_state=rs)
        skkm = SKKMeans(3, init=init, random_state=rs, n_init=1)
        dkkm.fit(X_blobs)
        skkm.fit(X_)
        assert_eq(dkkm.inertia_, skkm.inertia_)
Example 5
    def test_kmeanspp_init(self, Xl_blobs_easy):
        X, y = Xl_blobs_easy
        X_ = X.compute()
        rs = np.random.RandomState(0)
        dkkm = DKKMeans(3, init="k-means++", random_state=rs)
        skkm = SKKMeans(3, init="k-means++", random_state=rs)
        dkkm.fit(X)
        skkm.fit(X_)
        assert abs(dkkm.inertia_ - skkm.inertia_) < 1e-4
        assert dkkm.init == "k-means++"
Example 6
    def test_random_init(self, Xl_blobs_easy):
        X, y = Xl_blobs_easy
        X_ = X.compute()
        rs = 0
        dkkm = DKKMeans(3, init="random", random_state=rs)
        skkm = SKKMeans(3, init="random", random_state=rs, n_init=1)
        dkkm.fit(X)
        skkm.fit(X_)
        assert abs(dkkm.inertia_ - skkm.inertia_) < 1e-4
        assert dkkm.init == "random"
Example 7
    def run(self):
        if self.word_vectors not in {"fasttext", "word2vec"}:
            raise ValueError(
                f'Expected fasttext or word2vec; got {self.word_vectors}')

        print(
            f'Initializing dask dataframe of word embeddings at {datetime.now()}'
        )
        ddf = dask.dataframe.read_csv(config.ARTICLE_EMBEDDINGS_DIR /
                                      f'{self.word_vectors}_to_csv' / "*.part")

        print(
            f'Dropping columns and converting to design matrix (dask array) at {datetime.now()}'
        )
        X = ddf.drop(['Unnamed: 0', "id", "url", "title"], axis=1)
        X = X.to_dask_array(lengths=True)

        # Perform k-means clustering
        print(f'Starting K-Means clustering at {datetime.now()}')
        k_means_clustering_model = KMeans(n_clusters=self.num_clusters,
                                          n_jobs=-1,
                                          max_iter=config.K_MEANS_MAX_ITER)
        # fit() returns the estimator itself; the per-row labels live in labels_
        k_means_clustering_model.fit(X)
        k_means_cluster_labels = k_means_clustering_model.labels_

        # Write k-means results to disk
        print(
            f'Joining K-means results and writing to disk at {datetime.now()}')
        # labels_ is a dask array whose chunks line up with ddf's partitions,
        # so it can be attached directly as a new column
        k_means_results_ddf = ddf.assign(k_means_cluster=k_means_cluster_labels)
        k_means_ddf_output_path = config.CLUSTERING_RESULTS_DIR / f'{self.word_vectors}_w_k_means'
        k_means_ddf_output_path.mkdir(parents=True, exist_ok=True)
        dask.dataframe.to_csv(k_means_results_ddf, k_means_ddf_output_path)

        # Perform spectral clustering
        print(f'Starting Spectral clustering at {datetime.now()}')
        spectral_clustering_model = SpectralClustering(
            n_clusters=self.num_clusters,
            n_jobs=-1,
            persist_embedding=True,
            kmeans_params={"max_iter": config.K_MEANS_MAX_ITER})
        # As above, fit() returns the estimator; read the per-row labels from labels_
        spectral_clustering_model.fit(X)
        spectral_cluster_labels = spectral_clustering_model.labels_

        # Write spectral results to disk
        print(
            f'Joining Spectral results and writing to disk at {datetime.now()}'
        )
        spectral_results_ddf = ddf.assign(spectral_cluster=spectral_cluster_labels)
        spectral_ddf_output_path = config.CLUSTERING_RESULTS_DIR / f'{self.word_vectors}_w_spectral'
        spectral_ddf_output_path.mkdir(parents=True, exist_ok=True)
        dask.dataframe.to_csv(spectral_results_ddf, spectral_ddf_output_path)

        # And save the success flag
        with self.output().open("w") as f:
            # f.write(f'Clustering {self.word_vectors} k={self.num_clusters}: {silhouette_score_result}' + "\n")
            # f.write(spectral_clustering_model.get_params(deep=True))
            f.write(f'{self.word_vectors}: Success!')
Example 8
    def test_fit_given_init(self):
        X, y = sklearn.datasets.make_blobs(n_samples=1000, n_features=4, random_state=1)
        X = da.from_array(X, chunks=500)
        X_ = X.compute()
        x_squared_norms = sklearn.utils.extmath.row_norms(X_, squared=True)
        rs = np.random.RandomState(0)
        init = _k_init(X_, 3, x_squared_norms, rs)
        dkkm = DKKMeans(3, init=init, random_state=0)
        skkm = SKKMeans(3, init=init, random_state=0, n_init=1)
        dkkm.fit(X)
        skkm.fit(X_)
        assert_eq(dkkm.inertia_, skkm.inertia_)
Example 9
    def test_dtypes(self):
        X = da.random.uniform(size=(100, 2), chunks=(50, 2))
        X2 = X.astype("f4")
        pairs = [(X, X), (X2, X2), (X, X2), (X2, X)]

        for xx, yy in pairs:
            a = DKKMeans()
            b = SKKMeans()
            a.fit(xx)
            b.fit(xx)
            assert a.cluster_centers_.dtype == b.cluster_centers_.dtype
            assert a.labels_.dtype == b.labels_.dtype
            assert a.transform(xx).dtype == b.transform(xx).dtype
            assert a.transform(yy).dtype == b.transform(yy).dtype
Example 10
    def genmask(self, ddf: dask.dataframe.DataFrame):
        center = None
        if self.lat_lon:
            center = self.lat_lon
        else:
            # If lat_lon is not given, estimate the center with k-means
            model = KMeans(n_clusters=1, init_max_iter=self.max_iter)
            model.fit(ddf[[self.lat_col,
                           self.lon_col]].to_dask_array(lengths=True))
            center = tuple(model.cluster_centers_[0])

        return ddf.map_partitions(
            lambda df: df.apply(self.applyfunc, axis=1, center=center).rename(self.name),
            meta=(self.name, 'bool'))
Example 11
def main():
    cfg = Path(__file__).parent.joinpath("kmeans_config.yaml")
    cfg = load_config(str(cfg))
    kmeans = KMeans(n_clusters=3, random_state=0)
    X = read(cfg)
    fit(cfg, kmeans, X)
    print(timings)
Example 12
def fit(data, use_scikit_learn=False):
    logger.info("Starting to cluster")
    # Cluster
    n_clusters = 8
    oversampling_factor = 2
    if use_scikit_learn:
        km = sk.KMeans(n_clusters=n_clusters, random_state=0)
    else:
        km = KMeans(n_clusters=n_clusters,
                    oversampling_factor=oversampling_factor,
                    random_state=0)
    t0 = tic()
    logger.info("Starting n_clusters=%2d, oversampling_factor=%2d",
                n_clusters, oversampling_factor)
    km.fit(data)
    t1 = tic()
    logger.info("Finished in %.2f", t1 - t0)
Example 13
def fit(data, use_scikit_learn=False):
    logger.info("Starting to cluster")
    # Cluster
    n_clusters = 8
    oversampling_factor = 2
    if use_scikit_learn:
        km = sk.KMeans(n_clusters=n_clusters, random_state=0)
    else:
        km = KMeans(
            n_clusters=n_clusters,
            oversampling_factor=oversampling_factor,
            random_state=0,
        )
    logger.info(
        "Starting n_clusters=%2d, oversampling_factor=%2d",
        n_clusters,
        oversampling_factor,
    )
    with _timer("km.fit", _logger=logger):
        km.fit(data)
Example 14
    def test_kmeanspp_init_random_state(self, Xl_blobs_easy):
        X, y = Xl_blobs_easy
        a = DKKMeans(3, init="k-means++")
        a.fit(X)

        b = DKKMeans(3, init="k-means++", random_state=0)
        b.fit(X)
Example 15
def cluster_variable(data):
    """
    Creates a column that gives a cluster id based on KMeans clustering of all features

    :param data: a pandas dataframe where each row is an hour
    :return: a pandas dataframe containing the new column
    """
    print("\tAdding cluster variable...")
    data = data.copy()
    to_cluster = dd.get_dummies(data)
    train = get_train(to_cluster)
    holdout = get_holdout(to_cluster)

    kmeans = KMeans(n_clusters=5, random_state=SEED).fit(
        train.drop("cnt", axis=1))  # magic numbers, blech

    data["cluster"] = da.append(kmeans.labels_,
                                kmeans.predict(holdout.drop("cnt", axis=1)))

    data["cluster"] = data["cluster"].astype("category")

    return data
Example 16
def weather_cluster(data):
    """
    Creates a column that gives a cluster id based on KMeans clustering of only weather-related features

    :param data: a pandas dataframe where each row is an hour
    :return: a pandas dataframe containing the new column
    """
    print("\tAdding clustering variable based on weather-related features...")
    df = data.copy()[["weathersit", "temp", "atemp", "hum", "windspeed"]]
    to_cluster = dd.get_dummies(df)
    train = get_train(to_cluster)
    holdout = get_holdout(to_cluster)

    kmeans = KMeans(n_clusters=5,
                    random_state=SEED).fit(train)  # magic numbers, blech

    data["weather_cluster"] = da.append(kmeans.labels_,
                                        kmeans.predict(holdout))

    data["weather_cluster"] = data["weather_cluster"].astype("category")

    return data
Example 17
    def test_inputs(self, X):
        km = DKKMeans(n_clusters=3)
        km.fit(X)
        km.transform(X)
Example 18
def learn_clusters(n_clust):
    client = Client(n_workers=4, processes=True)

    # 1. Learn clusters

    # Full set
    kmeans_path = 'Clustering/KMeans/n{}posts.joblib'.format(n_clust)

    array = da.from_npy_stack(npy_stack_path)
    kmeans = KMeans(n_clusters=n_clust)

    # Learn on a part of set
    # array = np.load('Clustering/npy_post_vecs_part/0.npy')
    # kmeans = SKMeans(n_clusters=n_clust)

    print('Fitting')
    kmeans.fit(array)

    del array
    # Dump centroids to the disk

    # Dump as a sklearn object, for (maybe) faster prediction and less problems
    skmeans = SKMeans(n_clusters=n_clust)
    skmeans.cluster_centers_ = kmeans.cluster_centers_
    skmeans._n_threads = _openmp_effective_n_threads()
    dump(skmeans, kmeans_path)
    del kmeans, skmeans

    # dump(kmeans, kmeans_path) # For learning on a part of set
    # del kmeans
    print('Fitted')

    # 3. Turn posts into clusters
    kmeans_path = 'Clustering/KMeans/n{}posts.joblib'.format(n_clust)

    df = dd.read_parquet('preprocessed.parquet')
    df = df.map_partitions(df_to_vector_predict,
                           kmeans_path,
                           meta={
                               'user_id': int,
                               'post_id': int,
                               'text': object,
                               'type': str,
                               'date': str,
                               'cluster': int
                           })
    df.to_parquet('Clustering/KMeans/n{}posts.parquet'.format(n_clust))
    print('Clustered')

    # 2.5. Filter out outdated posts. (Next time, store the parsing date in user_info.)
    # Find the most recent like in the set and drop likes older than that minus a cutoff
    df = dd.read_parquet('Clustering/KMeans/n{}posts.parquet'.format(n_clust))
    print('Original df len: {}'.format(len(df)))

    year = 31536000  # one year in Unix-timestamp seconds
    kyear = 20
    break_time = kyear * year  # cutoff window: kyear years
    # The set was fully collected on 8 June 2020
    last_like = df['date'].max().compute()

    # Keep only likes more recent than the last like minus the cutoff
    df = df[df['date'] > last_like - break_time]
    print('max date: {}'.format(df['date'].max().compute()))
    print('min date: {}'.format(df['date'].min().compute()))
    print('Filtered df len: {}'.format(len(df)))
    print('Likes have been filtered out by date')

    # 3. Group clusters by user_id and turn them into a single vector for each user

    # df = dd.read_parquet('Clustering/KMeans/n{}posts.parquet'.format(n_clust)) # INSTEAD OF FILTER!

    # - Count text_likes number for each user (and later merge with user_info)
    count = df.drop(columns=['post_id', 'type', 'date', 'cluster']).groupby(
        'user_id')['text'].count().compute()
    count.rename('text_likes', inplace=True)

    # Generate meta
    meta = {'user_id': int}
    for i in range(n_clust):
        meta[i] = float

    df = df.map_partitions(
        lambda df_part: kt.clusters_to_vector(df_part, n_clust), meta=meta)

    df.to_parquet(
        'Clustering/KMeans/n{}posts-cluster_vecs.parquet'.format(n_clust))

    # 5. Merge clusters and user_info dataframes. (Working with pandas frames)
    df_info = pd.read_csv('users_info.csv')

    df_info = df_info.merge(count, on='user_id', how='inner')
    del count

    df = pd.read_parquet(
        'Clustering/KMeans/n{}posts-cluster_vecs.parquet'.format(n_clust))

    df = df_info.merge(
        df, on='user_id', how='inner'
    )  # Merging user's info and clusters. Maybe, mistake is here

    df.to_csv('Clustering/KMeans/n{}-final_dataset-{}year.csv'.format(
        n_clust, kyear))
    print('Final dataset has been saved')
    del df_info

    # Filter some users out
    # df = pd.read_csv('Clustering/KMeans/n{}-final_dataset.csv'.format(n_clust)).drop(columns=['Unnamed: 0']) # TESTING

    df = df.loc[(df['text_likes'] > 100) & (df['text_likes'] < 1000)]

    df['bdate'] = df['bdate'].apply(
        lambda bd: time.mktime(datetime.strptime(bd, "%d.%m.%Y").timetuple()))

    # Clean up the dataset
    df = df.drop(columns=[
        'posts_n', 'text_likes', 'status', 'sex', 'smoking', 'alcohol',
        'parth_id', 'country', 'city', 'user_id'
    ]).dropna().reset_index(drop=True)

    # 6. Train a Linear Regression model
    regr = LinearRegression()
    R2 = train(df, regr)

    client.close()
    return R2
Example 19
def do(X, n_clusters, factor):
    km = KMeans(n_clusters=n_clusters, oversampling_factor=factor)
    km.fit(X)
    return km
Example 20
    def test_dask_dataframe_raises(self):
        km = DKKMeans(n_clusters=3)
        X = dd.from_pandas(pd.DataFrame({"A": range(50)}), npartitions=2)
        with pytest.raises(TypeError):
            km.fit(X)
Example 21
n_centers = 12
n_features = 20
X_small, y_small = make_blobs(n_samples=1000,
                              centers=n_centers,
                              n_features=n_features,
                              random_state=0)
centers = np.zeros((n_centers, n_features))
for i in range(n_centers):
    centers[i] = X_small[y_small == i].mean(0)
print(centers)

n_samples_per_block = 20000
n_blocks = 500
delayeds = [
    dask.delayed(make_blobs)(n_samples=n_samples_per_block,
                             centers=centers,
                             n_features=n_features,
                             random_state=i)[0] for i in range(n_blocks)
]
arrays = [
    da.from_delayed(obj,
                    shape=(n_samples_per_block, n_features),
                    dtype=X_small.dtype) for obj in delayeds
]
X = da.concatenate(arrays)
print(X.nbytes / 1e9)
X = X.persist()  # actually run the stuff

clf = KMeans(init_max_iter=3, oversampling_factor=10)
clf.fit(X)
print(clf.labels_[:10].compute())  # actually run the stuff
Example 22
def test_too_small():
    km = DKKMeans()
    X = da.random.uniform(size=(20, 2), chunks=(10, 2))
    km.fit(X)
Example 23
from dask.distributed import Client
import time
import sys
from dask_ml.cluster import KMeans
import dask.dataframe as dd

client = Client(n_workers=4)
t0 = time.time()
dataset = dd.read_csv(sys.argv[1], header=None)
dataset = dataset[[
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 21
]]
clf = KMeans(n_clusters=5, tol=0.0001)
clf.fit(dataset)
a = clf.transform(dataset)
a.compute()
print(clf.cluster_centers_)
print('Elapsed time:', time.time() - t0)
client.close()
Example 24
#selecting categorical variables from the training dataframe
categorical_variables = df[[
    'Gender', 'Age', 'Occupation', 'City_Category',
    'Stay_In_Current_City_Years', 'Marital_Status'
]]
target = df['Purchase']

#creating dummies for the categorical variables
data = dd.get_dummies(categorical_variables.categorize()).compute()

#converting dataframe to array
datanew = data.values

#fit the model
from dask_ml.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(datanew, target)

#preparing the test data
test_categorical = test[[
    'Gender', 'Age', 'Occupation', 'City_Category',
    'Stay_In_Current_City_Years', 'Marital_Status'
]]
test_dummy = dd.get_dummies(test_categorical.categorize()).compute()
testnew = test_dummy.values

#predict on test and upload
pred = lr.predict(testnew)

#Clustering/K-Means
from dask_ml.cluster import KMeans
model = KMeans()
model.fit(datanew)  # k-means is unsupervised, so no target is passed