Example #1
 def cal_k_shape(self, data, num_cluster):
     """
     use best of cluster
     :param df: time series dataset
     :param num_cluster:
     :return:cluster label
     """
     ks = KShape(n_clusters=num_cluster,
                 n_init=5,
                 verbose=True,
                 random_state=self.seed)
     y_pred = ks.fit_predict(data)
     return y_pred
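
A hedged usage sketch (assumes this method lives on a class with `self.seed` set and an instance called `clusterer`; `random_walks` is just a stand-in data source from tslearn):

from tslearn.generators import random_walks
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

X = random_walks(n_ts=50, sz=32, d=1, random_state=0)  # toy dataset
X = TimeSeriesScalerMeanVariance().fit_transform(X)    # k-Shape expects z-normalised series
labels = clusterer.cal_k_shape(X, num_cluster=3)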
Example #2
def clustering_Kshape(tsdata, n_clusters, random_state, n_init, max_iter=100):
	np.random.seed(random_state)
	# Inputs must be z-normalised for the cross-correlation k-Shape uses, e.g.:
	# tsdata = TimeSeriesScalerMeanVariance(mu=0.0, std=1.0).fit_transform(tsdata)

	# Instantiate the KShape class
	ks = KShape(
		n_clusters=n_clusters,
		n_init=n_init,
		verbose=True,
		random_state=random_state,
		max_iter=max_iter
	)
	y_pred = ks.fit_predict(tsdata)

	return y_pred
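
As the commented-out scaler above hints, k-Shape is built on normalised cross-correlation, so inputs are usually z-normalised first. A minimal hedged sketch (toy data via tslearn's random_walks):

from tslearn.generators import random_walks
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

tsdata = random_walks(n_ts=20, sz=64, d=1, random_state=0)
tsdata = TimeSeriesScalerMeanVariance(mu=0.0, std=1.0).fit_transform(tsdata)
labels = clustering_Kshape(tsdata, n_clusters=3, random_state=0, n_init=2)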
Example #3
    def do_kshape(days, km_size):
        """
        From a time series DataFrame `days` (one sub-series per `n_day_` value),
        creates km_size clusters using the k-Shape algorithm.
        """
        # Arrange data for our lib
        unq = days["n_day_"].unique()
        values = [days[days["n_day_"] == l]["val_"].values for l in unq]
        formatted_dataset = to_time_series_dataset(values)

        # Configure k-Shape
        kshape = KShape(n_clusters=km_size, random_state=42, verbose=False)

        y_pred = kshape.fit_predict(formatted_dataset)

        return kshape, y_pred
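
A hedged sketch of the expected input: `days` must carry the `n_day_` and `val_` columns the function reads (the synthetic data below is illustrative only):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
days = pd.DataFrame({
    "n_day_": np.repeat(np.arange(5), 24),  # five days of...
    "val_": rng.normal(size=5 * 24),        # ...hourly synthetic values
})
model, labels = do_kshape(days, km_size=2)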
Example #4
def kshape(container: DataFrameContainer,
           data_column: str,
           n_clusters: int,
           max_iter: int = 300,
           tol: float = 1e-6,
           n_init: int = 1,
           verbose: bool = True,
           random_state: Union[int, None] = None,
           init: Union[str, np.ndarray] = 'random',
           centroid_seeds: np.ndarray = None):
    """

    :param container:
    :param data_column:
    :param n_clusters:
    :param max_iter:
    :param tol:
    :param n_init:
    :param verbose:
    :param random_state:
    :param init:
    :param centroid_seeds: arrays of shape [n_clusters, ts_size]
    :return:
    """
    if centroid_seeds is not None:
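        # reshape seeds from [n_clusters, ts_size] to (n_clusters, ts_size, 1), the layout KShape accepts for init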
        init = np.swapaxes(np.array([centroid_seeds]).T, 0, 1)

    ks = KShape(n_clusters=n_clusters,
                max_iter=max_iter,
                tol=tol,
                n_init=n_init,
                verbose=verbose,
                random_state=random_state,
                init=init)

    X = np.vstack(container.dataframe[data_column].values)

    y = ks.fit_predict(X)

    container.dataframe['KSHAPE_CLUSTER'] = y

    return container
Example #5
 def plot_best_shape(self, data, num_cluster):
     """
     time series cluster plot
     :param df:
     :param num_cluster:
     :return:
     """
     ks = KShape(n_clusters=num_cluster,
                 n_init=5,
                 verbose=True,
                 random_state=self.seed)
     y_pred = ks.fit_predict(data)
     for yi in range(num_cluster):
         for xx in data[y_pred == yi]:
             plt.plot(xx.ravel(), "k-", alpha=.3)
         plt.plot(ks.cluster_centers_[yi].ravel(), "r-")
         plt.text(0.55,
                  0.85,
                  'Cluster %d' % (yi + 1),
                  transform=plt.gca().transAxes)
         plt.tight_layout()
         plt.show()
Example #6
import numpy
import matplotlib.pyplot as plt

from tslearn.clustering import KShape
from tslearn.datasets import CachedDatasets
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

seed = 0
numpy.random.seed(seed)
X_train, y_train, X_test, y_test = CachedDatasets().load_dataset("Trace")
X_train = X_train[y_train < 4]  # Keep first 3 classes
numpy.random.shuffle(X_train)
X_train = TimeSeriesScalerMeanVariance().fit_transform(
    X_train[:50])  # Keep only 50 time series
sz = X_train.shape[1]

# k-Shape clustering
ks = KShape(n_clusters=3, verbose=True, random_state=seed)
y_pred = ks.fit_predict(X_train)

plt.figure()
for yi in range(3):
    plt.subplot(3, 1, 1 + yi)
    for xx in X_train[y_pred == yi]:
        plt.plot(xx.ravel(), "k-", alpha=.2)
    plt.plot(ks.cluster_centers_[yi].ravel(), "r-")
    plt.xlim(0, sz)
    plt.ylim(-4, 4)
    plt.title("Cluster %d" % (yi + 1))

plt.tight_layout()
plt.show()
Example #7
def mass_upload(startDate, endDate, id_unit_usaha):
    print(id_unit_usaha)
    login = ""
    password = ""
    # engine = sqlalchemy.create_engine('mysql+pymysql://energy:energy2x5=10@localhost:3306/pgn')
    engine = sqlalchemy.create_engine(
        'mssql+pyodbc://sa:[email protected]/SIPG?driver=SQL+Server')

    sql = " SELECT a.IDREFPELANGGAN, a.ID_UNIT_USAHA, 1 AS FSTREAMID, DATEPART(dw, a.FDATETIME) as FDAYOFWEEK, a.FHOUR, AVG(a.FDVC) as AVG_FDVC\
            FROM(SELECT IDREFPELANGGAN, ID_UNIT_USAHA, FDATETIME, FHOUR, SUM(FDVC) as FDVC\
                FROM amr_bridge\
                WHERE FDATETIME >= '" + startDate + "'\
                and FDATETIME < '" + endDate + "'\
                GROUP BY IDREFPELANGGAN, ID_UNIT_USAHA, FDATETIME, FHOUR) a\
            GROUP BY a.IDREFPELANGGAN, a.ID_UNIT_USAHA, DATEPART(dw, a.FDATETIME), a.FHOUR\
            ORDER BY a.IDREFPELANGGAN, a.ID_UNIT_USAHA, DATEPART(dw, a.FDATETIME), a.FHOUR"

    df = pd.read_sql_query(sql, engine)
    print('total Data: ' + str(len(df)))
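
    # A hedged, safer alternative to concatenating the dates into the SQL
    # string would bind them as parameters (assumes `from sqlalchemy import text`):
    #   sql_safe = text("SELECT ... WHERE FDATETIME >= :start AND FDATETIME < :end ...")
    #   df = pd.read_sql_query(sql_safe, engine, params={"start": startDate, "end": endDate})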

    # rslt_df = df.loc[df['ID_UNIT_USAHA'] == '014']

    # print(startDate)
    # print('\nResult dataframe :\n', rslt_df)

    # df.to_csv('pgn_customer_cluster_v1_{}.csv'.format(id_unit_usaha), index=False)

    # df.to_hdf("amr_bridge_22122020.hdf", key='hdf5')

    # df = pd.read_hdf("amr_bridge_22122020.hdf")

    def select_data(id_unit):
        query = "ID_UNIT_USAHA == '{}'".format(id_unit)
        columns = ['FDAYOFWEEK', 'FHOUR', 'IDREFPELANGGAN', 'AVG_FDVC']

        # df = df.set_index('FDATETIME')
        df_selected = df.query(query, engine='python')[columns]
        return df_selected

    def pivot_data(df):
        # df_pivoted = df.pivot(index='FDATETIME', columns='IDREFPELANGGAN', values='FDVC')
        df_pivoted = df.pivot(index=['FDAYOFWEEK', 'FHOUR'],
                              columns='IDREFPELANGGAN',
                              values='AVG_FDVC')
        return df_pivoted

    def remove_zerocolumns(df):
        # Columns whose mean is zero (with the zero-filled, non-negative
        # volumes used here, that means the column is entirely zero)
        cols = df.columns[df.mean() == 0]
        # Drop those all-zero columns
        df = df.drop(cols, axis=1)
        return df

    df_week1 = select_data(id_unit_usaha)
    df_week1.fillna(0.0, inplace=True)

    df_pivoted1 = pivot_data(df_week1)
    df_pivoted1.fillna(0.0, inplace=True)

    df_pivoted1 = remove_zerocolumns(df_pivoted1)
    cols = list(df_pivoted1.columns)
    df_pivoted1.head()

    # Function to plot cluster

    # def plot_clusters(ds, y_pred, n_clusters, ks, filename):
    #     plt.figure(figsize=(12, 40))
    #     for yi in range(n_clusters):
    #         plt.subplot(n_clusters, 1, 1 + yi)
    #         for xx in ds[y_pred == yi]:
    #             plt.plot(xx.ravel(), "k-", alpha=.2)
    #         plt.plot(ks.cluster_centers_[yi].ravel(), "r-")
    #         plt.xlim(0, sz)
    #         plt.ylim(-7, 7)
    #         plt.title("Cluster %d" % (yi))

    #     plt.tight_layout()
    #     plt.savefig(filename, format='jpg', dpi=300, quality=95)
    #     plt.show()

    def create_cluster_info(y_pred, cols):

        df_cluster = pd.DataFrame(y_pred.copy(),
                                  index=cols.copy(),
                                  columns=['cluster'])
        df_cluster.reset_index(inplace=True)
        df_cluster.rename(columns={'index': 'idrefpelanggan'}, inplace=True)

        unique_cluster = df_cluster['cluster'].unique()

        # Get ID ref based on cluster
        idrefs_list = []
        for i, x in enumerate(unique_cluster):
            idref_list = df_cluster.query(
                "cluster == {}".format(x))['idrefpelanggan'].values.tolist()
            # idrefs_list[x] = idref_list

            # Create dictionary
            idref_cluster_dict = {'cluster': x, 'idrefpelanggan': idref_list}
            idrefs_list.append(idref_cluster_dict)

        idrefs_cluster = pd.DataFrame(idrefs_list)
        return idrefs_cluster

    # def run_once(startime, totalData, _has_run=[]):
    #     if _has_run:
    #         return
    #     # print("run_once doing stuff")
    #     print(startime)
    #     endtime = time.time_ns()
    #     print(endtime)
    #     invTime = endtime-startime

    #     estTime = invTime * totalData
    #     _has_run.append(1)

    #     print(totalData)
    #     print(estTime)
    #     return estTime

    seed = 0
    np.random.seed(seed)

    # Convert data frame to list of series
    pivoted_series = []
    pivoted_columns = []
    for i, y in enumerate(cols):
        length = len(df_pivoted1[y])
        cst = df_pivoted1[y].values
        pivoted_series.append(cst)
        pivoted_columns.append(y)

    # Convert the list of series to the standard time series format
    formatted_dataset = to_time_series_dataset(pivoted_series)
    print("Data shape: {}".format(formatted_dataset.shape))

    formatted_norm_dataset = TimeSeriesScalerMeanVariance().fit_transform(
        formatted_dataset)
    sz = formatted_norm_dataset.shape[1]
    print("Data shape: {}".format(sz))

    clusters = 5
    totalColumn = formatted_norm_dataset.shape[0]
    totalRow = formatted_norm_dataset.shape[1]
    totalData = totalRow * totalColumn + totalRow * clusters

    ks = KShape(n_clusters=clusters, verbose=True, random_state=seed)
    y_pred_ks = ks.fit_predict(formatted_norm_dataset)

    formatted_norm_dataset_2d = formatted_norm_dataset[:, :, 0]
    # pd.DataFrame(A.T.reshape(2, -1), columns=cols)

    df_normalized = pd.DataFrame(formatted_norm_dataset_2d)
    # df_normalized = df_normalized.pivot()
    # formatted_norm_dataset[0]

    df_cluster = pd.DataFrame(y_pred_ks,
                              index=pivoted_columns,
                              columns=['cluster'])
    df_cluster.reset_index(inplace=True)
    df_cluster.rename(columns={'index': 'idrefpelanggan'}, inplace=True)
    df_cluster.sort_values(['cluster'])

    df_normalized_detail = pd.DataFrame.join(df_normalized, df_cluster)

    # df_cluster.to_csv('pgn_customer_cluster_{}.csv'.format(
    #     id_unit_usaha), index=False)

    # Create data frame for customer and its cluster
    create_cluster_info(y_pred_ks, cols)

    # plot_clusters(formatted_norm_dataset, y_pred_ks, clusters, ks,
    #               'pgn_customer_cluster_{}.jpg'.format(id_unit_usaha))

    # startime = time.time_ns()
    # for i in range(totalColumn):

    #     idref = df_normalized_detail.iloc[i, totalRow]
    #     cluster = int(df_normalized_detail.iloc[i, totalRow+1])
    #     print("idref = " + idref)
    #     cluster_num = df_normalized_detail.iloc[i, totalRow-1]
    #     for j in range(totalRow):

    #         hour_num = df_normalized_detail.columns[j]
    #         fdvc = df_normalized_detail.iloc[i, j]

    #         sql = ""

    #         # insert into table
    #         item = PL_CUSTOMER_CLUSTER(DATE_STAMP=startDate, IDREFPELANGGAN=idref,
    #                                    HOUR_NUM=hour_num, CLUSTER_NUM=cluster, FDVC_NORMALIZED=fdvc, AREA_ID=id_unit_usaha)
    #         session.add(item)

    #     # commit per id ref pelanngan
    #     session.commit()

    engine2 = sqlalchemy.create_engine(
        'mssql+pyodbc://sa:[email protected]/SIPG?driver=SQL+Server')

    Session = sessionmaker(bind=engine2)
    session = Session()

    Base = declarative_base()

    class PL_CUSTOMER_CLUSTER(Base):
        __tablename__ = 'PL_CUSTOMER_CLUSTER'

        ID = Column(Integer, primary_key=True)
        DATE_STAMP = Column(DateTime)
        IDREFPELANGGAN = Column(String(30))
        HOUR_NUM = Column(Integer)
        CLUSTER_NUM = Column(Integer)
        FDVC_NORMALIZED = Column(Float)
        AREA_ID = Column(String(5))


    for i in range(5):
        print("cluster: " + str(i))
        CLUSTER_NAME = "CENTROID_ID" + str(i)
        cluster = i
        for j in range(totalRow):
            fdvc_norm = ks.cluster_centers_[i][j][0]
            hour_num = j

            sql = ""
            item = PL_CUSTOMER_CLUSTER(DATE_STAMP=startDate,
                                       IDREFPELANGGAN=CLUSTER_NAME,
                                       HOUR_NUM=hour_num,
                                       CLUSTER_NUM=cluster,
                                       FDVC_NORMALIZED=fdvc_norm,
                                       AREA_ID=id_unit_usaha)
            session.add(item)
            print("fdvc:" + str(fdvc_norm) + "Hour:" + str(hour_num))
        # commit per idrefpelanggan
        session.commit()
        print(str(j) + ", " + str(fdvc_norm))

    return totalData
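
A hedged invocation sketch (the date format and the '014' unit id are assumptions taken from the SQL and comments above):

total = mass_upload('2021-01-01', '2021-02-01', '014')
print(total)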
Example #8
# z-normalise: scale each waveform to zero mean, unit variance
input_waves = TimeSeriesScalerMeanVariance(mu=0., std=1.).fit_transform(waves)

# run clustering or skip and load results if desired
if skipClustering:
    clustFile = h5py.File(
        templatePath + str(numCluster) + "/" + str(numCluster) +
        "_cluster_predictions_" + str(prefiltFreq[0]) + "-" +
        str(prefiltFreq[1]) + "Hz.h5", "r")
    pred = np.array(list(clustFile["cluster_index"]))
    centroids = list(clustFile["centroids"])
    clustFile.close()
else:
    print("Clustering...")
    ks = KShape(n_clusters=numCluster, n_init=1, random_state=0)
    pred = ks.fit_predict(input_waves)

    clustFile = h5py.File(
        templatePath + str(numCluster) + "/" + str(numCluster) +
        "_cluster_predictions_" + str(prefiltFreq[0]) + "-" +
        str(prefiltFreq[1]) + "Hz.h5", "w")
    clustFile.create_dataset("cluster_index", data=pred)
    clustFile.create_dataset("centroids", data=ks.cluster_centers_)
    clustFile.create_dataset("inertia", data=ks.inertia_)
    clustFile.close()

    modelFile = templatePath + str(numCluster) + "/" + str(
        numCluster) + "_cluster_model_" + str(prefiltFreq[0]) + "-" + str(
            prefiltFreq[1]) + "Hz.h5"
    ks.to_hdf5(modelFile)
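
Since the fitted model is persisted with to_hdf5, a later run can restore it instead of refitting; a minimal sketch using tslearn's matching loader:

ks = KShape.from_hdf5(modelFile)
pred = ks.predict(input_waves)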
print("Data shape: {}".format(formatted_dataset.shape))

formatted_norm_dataset = TimeSeriesScalerMeanVariance().fit_transform(
    formatted_dataset)
sz = formatted_norm_dataset.shape[1]
print("Data shape: {}".format(sz))

formatted_norm_dataset = TimeSeriesScalerMeanVariance().fit_transform(
    formatted_dataset)

totalColumn = formatted_norm_dataset.shape[0]
totalRow = formatted_norm_dataset.shape[1]

clusters = 5
ks = KShape(n_clusters=clusters, verbose=True, random_state=seed)
y_pred_ks = ks.fit_predict(formatted_norm_dataset)
formatted_norm_dataset.shape
data = formatted_norm_dataset
data.shape

formatted_norm_dataset_2d = formatted_norm_dataset[:, :, 0]
formatted_norm_dataset_2d.shape
#pd.DataFrame(A.T.reshape(2, -1), columns=cols)

df_normalized = pd.DataFrame(formatted_norm_dataset_2d)
df_normalized
#df_normalized = df_normalized.pivot()
# formatted_norm_dataset[0]

df_cluster = pd.DataFrame(y_pred_ks,
                          index=pivoted_columns,
Example #10
def k_shape(X_train, n_clusters, verbose=True, seed=0):
    # k-Shape clustering
    ks = KShape(n_clusters=n_clusters, verbose=verbose, random_state=seed)

    return ks, ks.fit_predict(X_train)
Example #11
class Kshape(cs):
    """Classe de partitionnement des donnees avec l'algorithm K-shape

    Parameters:
        * ss : SeriesSupp
            instance du manager de series temporelles

    Variables:
        * seed: int
            Valeur d'initialisation de l'algo, random.
        * counter: Counter
            repartition des objets au sein des clusters
        * km: TimeSeriesKMeans
            Instance de l'algo
        * clust_name: String
            Nom de l'algo(affichage des plots)
        * metric: String
            Choix du metrics utilise, principalement softdtw ici car tres efficace et rapide
    """
    def __init__(self, ss):
        super().__init__(ss)
        self.seed = 0
        np.random.seed(self.seed)
        self.counter = None
        self.km = None
        self.clust_name = "Kshape"
        self.metric = "shape"

    def k_init(self, v=True):
        """
        initialisation de l'instance de l'algorithm avec les parametres actuels

        Parameters:
            * v: boolean
                Verbose, affiche les info lie au partitionnement

        Returns:
            NA
        """
        self.km = KShape(n_clusters=self.n, verbose=v, random_state=self.seed)

    def k_fit(self):
        """
        Effectue le partitionnement

        Parameters:
            NA

        Returns:
            NA
        """
        self.ts_clust = self.km.fit_predict(self.ts)

    def cluster_counter(self):
        """
        Compte les objets au sein des clusters

        Parameters:
            NA

        Returns:
            NA
        """
        self.counter = Counter(self.ts_clust)
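
A hedged usage sketch (assumes `ss` is a SeriesSupp instance and that the parent class `cs` provides `self.n`, the cluster count, and `self.ts`, the series array):

model = Kshape(ss)
model.k_init(v=False)
model.k_fit()
model.cluster_counter()
print(model.counter)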
Example #12
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
from tslearn.clustering import KShape
from tslearn.utils import to_time_series_dataset

print(df)

# ...........Building the input dataset..............
my_time_series = []

for i in range(df.shape[0]):
    # for i in range(50):
    my_time_series.append(df.iloc[i].values)
formatted_dataset = to_time_series_dataset(my_time_series)

print(formatted_dataset.shape)

ks = KShape(n_clusters=100, verbose=True)
# ks=KShape(n_clusters=10,verbose=True)
y_pred = ks.fit_predict(formatted_dataset)
print(y_pred)
centroid = ks.cluster_centers_
centroid = centroid.reshape((centroid.shape[0], centroid.shape[1]))
print(centroid.shape)
# np.savetxt("Results/centroid.csv", centroid, delimiter=",")
pd.DataFrame(centroid).to_csv('Results/centroid_lv10.csv')

# Distance from every series to every centroid
D = cdist(
    formatted_dataset.reshape(
        (formatted_dataset.shape[0], formatted_dataset.shape[1])), centroid)
print(D.shape)
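# D.T has shape (n_clusters, n_series): argmin over axis 1 picks, for each centroid, its nearest series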
selected_ts = np.argmin(D.T, axis=1)
pd.DataFrame(selected_ts).to_csv('Results/selected_ts_lv12.csv')
pd.DataFrame(y_pred).to_csv('Results/clusterprediction_lv12.csv')
Example #13
class Kshape():
    """
    Input: time-span data.
    data is a pd.DataFrame whose columns are [DEVICE_DATETIME, TEMPERATURE],
    with DEVICE_DATETIME as the index.
    data must be sorted by index, ascending=True.
    data is sampled every 10 seconds.
    time_span = 1 means one time series covers 1 minute of data.
    batch is the number of elements each time series holds.
    """
    def __init__(
        self,
        time_span=1,
        batch=60,
        data=None,
    ):
        self.time_span = time_span * 6
        self.data = data
        self.batch = batch
        self.km = KShape(n_clusters=2,
                         max_iter=50,
                         verbose=True,
                         random_state=0)

    def Preprocess(self, x=None):
        """
        Reshape the data into (len(data) // time_span, batch) form.
        """
        if x is None:
            self.n_data = len(self.data) // self.time_span
            self.n_use = self.time_span * self.n_data
            ts = self.data.loc[:self.data.index[self.n_use - 1]]
            ts = np.array(ts.TEMPERATURE).reshape(1, -1)
            ts = TimeSeriesScalerMeanVariance().fit_transform(ts)
            ts = np.array(ts).reshape(self.n_data, -1)
            ts = TimeSeriesResampler(sz=self.batch).fit_transform(ts)
            self.ts = ts
        else:
            self.x_data = len(x) // self.time_span
            self.x_use = self.time_span * self.x_data
            ts = x.loc[:x.index[self.x_use - 1]]
            ts = np.array(ts.TEMPERATURE).reshape(1, -1)
            ts = TimeSeriesScalerMeanVariance().fit_transform(ts)
            ts = np.array(ts).reshape(self.x_data, -1)
            ts = TimeSeriesResampler(sz=self.batch).fit_transform(ts)
            return ts

    def classification(self):
        """
        Classify with KShape.
        Leftover data is upsampled with TimeSeriesResampler and reused.
        After classification, add a 'cluster' column to self.data.
        """
        self.Preprocess()
        self.y_pred = self.km.fit_predict(self.ts)
        # build the cluster column
        self.cluster = []
        for i in range(self.n_data):
            list_item = [self.y_pred[i]] * self.time_span
            self.cluster.extend(list_item)
        # When data is left over, build a single resampled series and predict its cluster.
        if not self.n_use == len(self.data):
            self.ts_c = self.data.loc[self.data.index[self.n_use]:]
            self.ts_c = np.array(self.ts_c.TEMPERATURE).reshape(1, -1)
            self.ts_batch = TimeSeriesResampler(sz=self.batch).fit_transform(
                self.ts_c)
            self.y_pred_c = [int(self.km.predict(self.ts_batch))
                             ] * self.ts_c.shape[1]
            self.cluster.extend(self.y_pred_c)
        self.data["CLUSTER"] = self.cluster

    def draw_graph(self, x=None):
        # plot self.data unless another frame is given
        df_plot = self.data if x is None else x
        fig, ax = plt.subplots()
        sns.scatterplot(data=df_plot,
                        x="DEVICE_DATETIME",
                        y="TEMPERATURE",
                        hue="CLUSTER")
        locator = mdates.AutoDateLocator(minticks=4, maxticks=10)
        formatter = mdates.ConciseDateFormatter(locator=locator)
        ax.xaxis.set_major_locator(locator)
        ax.xaxis.set_major_formatter(formatter)
        plt.show()

    def predict(self, x):
        ts = self.Preprocess(x=x)
        pred = self.km.predict(ts)
        cluster = []
        for i in range(self.x_data):
            list_item = [pred[i]] * self.time_span
            cluster.extend(list_item)
        # When data is left over, build a single resampled series and predict its cluster.
        if not self.x_use == len(x):
            self.x_c = x.loc[x.index[self.x_use]:]
            self.x_c = np.array(self.x_c.TEMPERATURE).reshape(1, -1)
            self.x_batch = TimeSeriesResampler(sz=self.batch).fit_transform(
                self.x_c)
            y_pred_c = [int(self.km.predict(self.x_batch))] * self.x_c.shape[1]
            cluster.extend(y_pred_c)
        x["CLUSTER"] = cluster
        self.draw_graph(x=x)
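
A hedged usage sketch (assumes `df` is indexed by DEVICE_DATETIME, holds a TEMPERATURE column sampled every 10 seconds, and that pandas, seaborn and matplotlib are imported as the class requires):

model = Kshape(time_span=1, batch=60, data=df)
model.classification()  # fits KShape and adds a CLUSTER column to model.data
model.draw_graph()      # scatter plot coloured by cluster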
Example #14
# # print("cosine:", distance)
# plt.imshow(distance)
# plt.show()

# In[2]
np.random.shuffle(all_data)
print(all_data.shape)

# For this method to operate properly, prior scaling is required
x_train = TimeSeriesScalerMeanVariance().fit_transform(all_data)
sz = x_train.shape[1]

# kShape clustering
seed = 0
ks = KShape(n_clusters=2, verbose=True, random_state=seed)
y_pred = ks.fit_predict(x_train)

print(x_train.shape)
print(y_pred.shape)

plt.figure()
for yi in range(2):
    N = len(x_train[y_pred == yi])
    ind = 0
    for xx in x_train[y_pred == yi]:
        plt.subplot(N, 2, 2*ind+yi+1)
        plt.plot(xx)
        ind += 1
        # plt.plot(xx.ravel(), "k-", alpha=.2)
    # plt.plot(ks.cluster_centers_[yi].ravel(), "r-")
    # plt.xlim(0, sz)
Example #15
import numpy as np
import matplotlib.pyplot as plt

from tslearn.clustering import KShape
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from tslearn.utils import to_time_series_dataset

hum_sub = np.loadtxt('../../HUM_subs.csv', delimiter=',', skiprows=0)
print(hum_sub.shape)

X = to_time_series_dataset(hum_sub)
print(X.shape)
X = TimeSeriesScalerMeanVariance().fit_transform(X)
sz = X.shape[1]

seed = 4
np.random.seed(seed)

nclust = 3
ks = KShape(n_clusters=nclust, verbose=True, random_state=seed)
y_pred = ks.fit_predict(X)

print(y_pred + 1)
print(len(y_pred))

# for i,j in enumerate(y_pred+1):
#     if j == 2:
#         print(i+1)

plt.figure()
for yi in range(nclust):
    plt.subplot(nclust, 1, 1 + yi)
    for xx in X[y_pred == yi]:
        plt.plot(xx.ravel(), "k-", alpha=0.2)
    plt.plot(ks.cluster_centers_[yi].ravel(), "r-")
    plt.xlim(0, sz)
Example #16
def main(argv):
    # define global timer to obtain global execution time
    start_global = timer()
    
    # define globals variables
    global euclidean_clustered_data, \
        dtw_clustered_data, \
        soft_dtw_clustered_data, \
        k_shape_clustered_data, \
        gak_clustered_data
    
    #############################################################################################
    # Input arguments parsing
    #############################################################################################
    
    # define help message
    help_message = \
        'clustering.py -h \n\n' \
        'usage: clustering.py [-c <number_clusters>] [-i <input_file>] [-ansEDSKG] \n' \
        'by default: processing input data (without any sampling) ' \
        '(euclidean, dtw, soft-dtw and GAK k-means, k-shape)\n' \
        'options list: \n' \
        '  -c / --clusters <number_clusters>  # set number of clusters (default 3) \n\n' \
        '  -i / --ifile <input_file>          # set input filename \n' \
        '  -n / --normalise                   # normalise input data \n' \
        '  -s / --standardise                 # standardise input data \n\n' \
        '  -a / --all                         # perform all 5 implemented methods of clustering: \n' \
        '                                       euclidean, dtw, soft-dtw, gak k-means and k-shape\n' \
        '  -E / --euclidean                   # perform euclidean k-means clustering \n' \
        '  -D / --dtw                         # perform dtw k-means clustering \n' \
        '  -S / --soft-dtw                    # perform soft-dtw k-means clustering \n' \
        '  -K / --k-shape                     # perform k-shape clustering \n' \
        '  -G / --gak                         # perform GAK k-means clustering \n'
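
    # Illustrative invocation (the filename is a placeholder):
    #   python clustering.py -c 4 -i input.csv -s -K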
    
    # Create new object to save arguments
    i_args = Arguments()
    
    # number of rows in plot to create correct number of subplots
    # default = 3 (raw data plus distribution histograms)
    n_rows_plot = 3
    
    # define validation rules for arguments
    try:
        opts, args = getopt.getopt(
            argv,
            "hc:i:nsaEDSKG",
            [
                "help",
                "clusters=",
                "ifile=",
                "normalise",
                "standardise",
                "all",
                "euclidean",
                "dtw",
                "soft-dtw",
                "k-shape",
                "gak"
            ]
        )
    except getopt.GetoptError:
        print(help_message)
        sys.exit(2)
    
    # parse arguments
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(help_message)
            sys.exit()
        elif opt in ("-c", "--clusters"):
            i_args.number_clusters = int(arg)
        elif opt in ("-i", "--ifile"):
            i_args.input_file = arg
        elif opt in ("-n", "--normalise"):
            i_args.normalise_data = True
        elif opt in ("-s", "--standardise"):
            i_args.standardise_data = True
        elif opt in ("-E", "--euclidean"):
            n_rows_plot += 1
            i_args.euclidean_clustering = True
        elif opt in ("-D", "--dtw"):
            n_rows_plot += 1
            i_args.dtw_clustering = True
        elif opt in ("-S", "--soft-dtw"):
            n_rows_plot += 1
            i_args.soft_dtw_clustering = True
        elif opt in ("-K", "--k-shape"):
            n_rows_plot += 1
            i_args.k_shape_clustering = True
        elif opt in ("-G", "--gak"):
            n_rows_plot += 1
            i_args.gak_clustering = True
        elif opt in ("-a", "--all"):
            n_rows_plot = 8
            i_args.euclidean_clustering = True
            i_args.dtw_clustering = True
            i_args.soft_dtw_clustering = True
            i_args.k_shape_clustering = True
            i_args.gak_clustering = True
    
    # normalise maximum number of subplots levels
    n_rows_plot = 8 if n_rows_plot > 8 else n_rows_plot
    
    #############################################################################################
    # Raw data processing stage
    #############################################################################################
    
    # set style to matplotlib plot
    mpl.style.use('seaborn')
    
    # set seed value and seed the generator
    seed = 0
    numpy.random.seed(seed)
    
    # import data and print first 5 rows
    raw_data = import_data()
    print(raw_data.head())
    
    # convert raw data to the format which can be used by tslearn
    # (3-d dimensional array)
    # built-in behaviour: all time series are adjusted to one length
    # (NaN values are appended to the shorter ones)
    formatted_data = to_time_series_dataset(raw_data)
    
    # print shape of new array
    print(formatted_data.shape)
    
    # obtain number of measuring
    n_measuring = formatted_data.shape[1]
    
    # define figure, grid_spec to create layout of the plot
    fig = plt.figure(constrained_layout=True)
    grid_spec = fig.add_gridspec(
        n_rows_plot,
        i_args.number_clusters
    )
    
    # set A4 size to figure
    fig.set_size_inches(8.5, 11.75)
    
    # setup count of layers of subplots
    count_layer = 3
    # setup first subplot and draw raw time series
    f_ax_raw_data = fig.add_subplot(grid_spec[:2, :])
    
    for xx in formatted_data:
        f_ax_raw_data.plot(xx.ravel(), alpha=.2)
    
    formatted_data_min = formatted_data.min()
    formatted_data_max = formatted_data.max()
    # draw title for chart with min and max values
    f_ax_raw_data.set_title('Raw Data (min = %.2f, max = %.2f)' % (formatted_data_min, formatted_data_max))

    # obtain and print the executing time of the data processing stage
    timer_tick = get_time_tick(start_global)
    plt.ion()
    plt.show()
    
    print("Raw data processing time: %s" % timer_tick)
    
    #############################################################################################
    # Data preprocessing stage
    #############################################################################################
    
    start = timer()
    
    # Convert NaNs to value predicted by interpolation
    # linearly interpolate for NaN/NaNs
    n_nan_changes = 0
    for ind in range(formatted_data.shape[0]):
        mask = numpy.isnan(formatted_data[ind])
        n_nan_changes += mask.sum()
        formatted_data[ind][mask] = numpy.interp(
            numpy.flatnonzero(mask),
            numpy.flatnonzero(~mask),
            formatted_data[ind][~mask]
        )
    print("%d NaN values was/were interpolated" % n_nan_changes)
    
    # Scaling
    # to know should we use normalization or standardization, we need to see
    # the distribution of values.
    
    # pick number_clusters random measurements to draw histograms
    random_indexes = numpy.random.choice(n_measuring, i_args.number_clusters, replace=False)
    
    # create new arrays with values of randomly chosen measurements
    histogram_data = formatted_data[:, random_indexes]
    
    # draw histograms
    for i_histogram in range(i_args.number_clusters):
        f_ax_histogram = fig.add_subplot(grid_spec[2, i_histogram])
        f_ax_histogram.hist(
            histogram_data[:, i_histogram],
            bins=25, density=True
        )
        
        f_ax_histogram.text(0.55, 0.98,
                            'Measurement #%d' % random_indexes[i_histogram],
                            transform=plt.gca().transAxes,
                            color="navy"
                            )
        if i_histogram == 1:
            preprocessing = ''
            if i_args.normalise_data:
                preprocessing += "normalised"
                if i_args.standardise_data:
                    preprocessing += " and standardised"
            elif i_args.standardise_data:
                preprocessing += "standardised"

            preprocessing = '' if preprocessing == '' else "(data will be %s)" % preprocessing
            f_ax_histogram.set_title(
                "Distributions histograms %s" % preprocessing,
                color='navy', y=1, pad=14
            )
    
    # if no processing data option chosen continue with raw data
    processed_data = formatted_data
    
    # since for this concrete challenge data the distributions are more/less
    # Gaussian/Normal we can use standardization
    
    # normalize data: Min-Max scaling ranging between 0 and 1
    if i_args.normalise_data:
        processed_data = TimeSeriesScalerMinMax().fit_transform(processed_data)
        print("Data was normalised")
    
    # standardize data: scaling technique where the values are centered around
    # the mean with a unit standard deviation
    if i_args.standardise_data:
        processed_data = TimeSeriesScalerMeanVariance().fit_transform(processed_data)
        print("Data was standardised")
    
    # obtain max value of data (to be used in visualization subplots)
    max_data = processed_data.max() * 1.2
    min_data = processed_data.min() * 1.2
    
    timer_tick = get_time_tick(start)
    print("#############################################################################################")
    print("Data processing stage elapsed time: %s" % timer_tick)
    
    #############################################################################################
    # Implementing Euclidean k-means clustering algorithm
    #############################################################################################
    
    if i_args.euclidean_clustering:
        
        start = timer()
        print("Euclidean k-means")
        
        # define parameters of the model of the algorithm
        k_means_euclidean = TimeSeriesKMeans(
            n_clusters=i_args.number_clusters,
            verbose=True,
            random_state=seed,
            n_jobs=4
        )
        
        # calculate cluster's label array
        euclidean_clustered_data = k_means_euclidean.fit_predict(processed_data)
        
        # draw subplots with attributed clusters of time series as well as
        # cluster centers' lines
        for i_cluster in range(i_args.number_clusters):
            f_ax_euclidean = create_figure_axes(fig, grid_spec, count_layer, i_cluster,
                                                n_measuring, min_data, max_data,
                                                processed_data, euclidean_clustered_data, 'tab:blue')
            
            f_ax_euclidean.plot(
                k_means_euclidean.cluster_centers_[i_cluster].ravel(),
                "tab:green"
            )
            
            if i_cluster == 1:
                middle_axis = f_ax_euclidean
        
        # increment count of filled layer of subplots
        count_layer += 1
        
        # obtain processing time, print it to console and
        # add it to the title of the series of subplots
        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "Euclidean $k$-means (%s)" % timer_tick,
            color='tab:green', y=1, pad=14
        )
        print("#############################################################################################")
        print("Euclidean k-means time processing: %s" % timer_tick)
        
    #############################################################################################
    # Implementing DTW k-means clustering algorithm
    # use dtw (Dynamic Time Warping Distance) metric to calculate
    # distance between means
    #############################################################################################
    
    if i_args.dtw_clustering:
        
        start = timer()
        print("DTW k-means")
        k_means_DTW = TimeSeriesKMeans(n_clusters=i_args.number_clusters,
                                       n_init=3,
                                       metric="dtw",
                                       verbose=True,
                                       max_iter_barycenter=10,
                                       random_state=seed,
                                       n_jobs=6
                                       )
        dtw_clustered_data = k_means_DTW.fit_predict(processed_data)
        
        for i_cluster in range(i_args.number_clusters):
            f_ax_dtw = create_figure_axes(fig, grid_spec, count_layer, i_cluster,
                                          n_measuring, min_data, max_data,
                                          processed_data, dtw_clustered_data, 'tab:blue')
            
            f_ax_dtw.plot(
                k_means_DTW.cluster_centers_[i_cluster].ravel(),
                "tab:red"
            )
            if i_cluster == 1:
                middle_axis = f_ax_dtw

        # increment count of filled layer of subplots
        count_layer += 1
        
        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "DTW $k$-means (%s)" % timer_tick,
            color='tab:red', y=1, pad=14
        )
        print("#############################################################################################")
        print("DTW k-means time processing: %s" % timer_tick)
    
    #############################################################################################
    # Implementing soft DTW k-means clustering algorithm
    # use soft dtw (Dynamic Time Warping Distance) metric to calculate
    # distance between means
    #############################################################################################
    
    if i_args.soft_dtw_clustering:
        
        start = timer()
        print("Soft-DTW k-means")
        k_means_soft_DTW = TimeSeriesKMeans(n_clusters=i_args.number_clusters,
                                            metric="softdtw",
                                            metric_params={"gamma": .025},
                                            verbose=True,
                                            random_state=seed,
                                            n_jobs=6
                                            )
        soft_dtw_clustered_data = k_means_soft_DTW.fit_predict(processed_data)
        
        for i_cluster in range(i_args.number_clusters):
            f_ax_soft_dtw = create_figure_axes(fig, grid_spec, count_layer, i_cluster,
                                               n_measuring, min_data, max_data,
                                               processed_data, soft_dtw_clustered_data, 'tab:blue')
            
            f_ax_soft_dtw.plot(
                k_means_soft_DTW.cluster_centers_[i_cluster].ravel(),
                "tab:purple"
            )
            
            if i_cluster == 1:
                middle_axis = f_ax_soft_dtw

        # increment count of filled layer of subplots
        count_layer += 1

        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "Soft-DTW $k$-means (%s)" % timer_tick,
            color='tab:purple', y=1, pad=14
        )
        print("#############################################################################################")
        print("Soft-DTW k-means time processing: %s" % timer_tick)
    
    #############################################################################################
    # Implementing k-Shape clustering algorithm
    #############################################################################################
    
    if i_args.k_shape_clustering:
        
        start = timer()
        print("K-Shape")
        k_shape = KShape(n_clusters=i_args.number_clusters,
                         verbose=True,
                         random_state=seed
                         )
        k_shape_clustered_data = k_shape.fit_predict(processed_data)
        
        for i_cluster in range(i_args.number_clusters):
            
            min_axe_value = min(min_data, k_shape.cluster_centers_[i_cluster].ravel().min())
            max_axe_value = max(max_data, k_shape.cluster_centers_[i_cluster].ravel().max())
            
            f_ax_k_shape = create_figure_axes(fig, grid_spec, count_layer, i_cluster,
                                              n_measuring, min_axe_value, max_axe_value,
                                              processed_data, k_shape_clustered_data, 'tab:blue')
            
            f_ax_k_shape.plot(
                k_shape.cluster_centers_[i_cluster].ravel(),
                "tab:orange"
            )
            
            if i_cluster == 1:
                middle_axis = f_ax_k_shape

        # increment count of filled layer of subplots
        count_layer += 1
        
        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "$K$-Shape (%s)" % timer_tick,
            color='tab:orange', y=1, pad=14
        )
        print("#############################################################################################")
        print("K-Shape time processing: %s" % timer_tick)
    
    #############################################################################################
    # Implementing Global Alignment kernel k-means clustering algorithm
    # since kernel is used, there is no centroid of the cluster
    #############################################################################################
    
    if i_args.gak_clustering:
        
        start = timer()
        print("GAK-k-means")
        gak_k_means = KernelKMeans(n_clusters=i_args.number_clusters,
                                   kernel="gak",
                                   kernel_params={"sigma": "auto"},
                                   n_init=10,
                                   verbose=True,
                                   random_state=seed,
                                   n_jobs=6
                                   )
        
        gak_clustered_data = gak_k_means.fit_predict(processed_data)
        
        for i_cluster in range(i_args.number_clusters):
            f_ax_gak_k_means = create_figure_axes(fig, grid_spec, count_layer, i_cluster,
                                                  n_measuring, min_data, max_data,
                                                  processed_data, gak_clustered_data, 'tab:blue')
            
            if i_cluster == 1:
                middle_axis = f_ax_gak_k_means

        # increment count of filled layer of subplots
        count_layer += 1
        
        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "Global Alignment kernel $k$-means (%s)" % timer_tick,
            color='tab:cyan', y=1, pad=14)
        print("#############################################################################################")
        print("GAK k-means time processing: %s" % timer_tick)
    
    #############################################################################################
    
    # return string with current datetime
    now = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")

    # define the name of the directory to be created
    path = "./out/%s" % now

    print("#############################################################################################")
    try:
        os.mkdir(path)
    except OSError:
        print("Creation of the directory %s failed" % path)
    else:
        print("Successfully created the directory %s " % path)
    
    try:
        # save figure as pdf to out folder
        fig.savefig("./out/%s/visual_result.pdf" % now)
    
        # save clustering results
        if i_args.euclidean_clustering:
            numpy.savetxt(
                "./out/%s/euclidean_clustering_result.csv" % now,
                euclidean_clustered_data,
                delimiter=","
            )
        if i_args.dtw_clustering:
            numpy.savetxt(
                "./out/%s/dtw_clustering_result.csv" % now,
                dtw_clustered_data,
                delimiter=","
            )
        if i_args.soft_dtw_clustering:
            numpy.savetxt(
                "./out/%s/soft_dtw_clustering_result.csv" % now,
                soft_dtw_clustered_data,
                delimiter=","
            )
        if i_args.k_shape_clustering:
            numpy.savetxt(
                "./out/%s/k_shape_clustering_result.csv" % now,
                k_shape_clustered_data,
                delimiter=","
            )
        if i_args.gak_clustering:
            numpy.savetxt(
                "./out/%s/gak_clustering_result.csv" % now,
                gak_clustered_data,
                delimiter=","
            )
    except RuntimeError:
        print("Saving results failed")
    else:
        print("Successfully saved results in the path %s " % path)

    #############################################################################################
    
    # obtain and print global executing time
    timer_tick = get_time_tick(start_global)
    print("#############################################################################################")
    print("All algorithms elapsed time: % s" % timer_tick)
    
    #############################################################################################

    # render and show plot
    # plt.show()
    plt.draw()
    plt.pause(0.001)
    input("Press [enter] to finish.")
    print("#############################################################################################")
Example #17
# In[111]:

data = np.reshape(np.nan_to_num(users), (-1, len(starttimes), 1))

seed = 0
np.random.seed(seed)
sz = data.shape[1]

output = pd.DataFrame()
output['cellname'] = full_cells[0:1000]

for cluster_number in cluster_numbers:
    print(cluster_number)
    ks = KShape(n_clusters=cluster_number, verbose=True, random_state=seed)
    y_pred = ks.fit_predict(data)
    output[cluster_number] = y_pred

# In[ ]:

try:
    output = pd.read_csv(
        "/vha/home/61072380/seca_hh_clusters_20190930_200800.csv")
    output = output.drop(output.columns[0], axis=1)
except Exception:
    print("No scaled data")

cluster_counts = pd.DataFrame()
cluster_counts['index'] = cluster_numbers
cluster_counts = cluster_counts.set_index('index', drop=True)
cluster_medians = pd.DataFrame()