Example #1

# imports needed by the snippet below (omitted in the original listing)
import time

import numpy as np
import pandas as pd
import sqlalchemy
from sqlalchemy import Column, DateTime, Float, Integer, String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sklearn.metrics import silhouette_score
from tslearn.clustering import KShape
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from tslearn.utils import to_time_series_dataset

def getNormalize(data_baseline):

    data_baseline = TimeSeriesScalerMeanVariance().fit_transform(data_baseline)
    data_baseline = data_baseline.reshape(
        (data_baseline.shape[0], data_baseline.shape[1]))

    return data_baseline
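
# Usage sketch (hypothetical toy input): TimeSeriesScalerMeanVariance
# rescales every series (row) to zero mean and unit variance.
# >>> toy = np.array([[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]])
# >>> getNormalize(toy).round(2)
# array([[-1.22,  0.  ,  1.22],
#        [-1.22,  0.  ,  1.22]])
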
def mass_upload(startDate, endDate, id_unit_usaha):
    print(id_unit_usaha)
    login = ""
    password = ""
    # engine = sqlalchemy.create_engine('mysql+pymysql://energy:energy2x5=10@localhost:3306/pgn')
    engine = sqlalchemy.create_engine(
        'mssql+pyodbc://sa:[email protected]/SIPG?driver=SQL+Server')

    sql = " SELECT a.IDREFPELANGGAN, a.ID_UNIT_USAHA, 1 AS FSTREAMID, DATEPART(dw, a.FDATETIME) as FDAYOFWEEK, a.FHOUR, AVG(a.FDVC) as AVG_FDVC\
            FROM(SELECT IDREFPELANGGAN, ID_UNIT_USAHA, FDATETIME, FHOUR, SUM(FDVC) as FDVC\
                FROM amr_bridge\
                WHERE FDATETIME >= '" + startDate + "'\
                and FDATETIME < '" + endDate + "'\
                GROUP BY IDREFPELANGGAN, ID_UNIT_USAHA, FDATETIME, FHOUR) a\
            GROUP BY a.IDREFPELANGGAN, a.ID_UNIT_USAHA, DATEPART(dw, a.FDATETIME), a.FHOUR\
            ORDER BY a.IDREFPELANGGAN, a.ID_UNIT_USAHA, DATEPART(dw, a.FDATETIME), a.FHOUR"

    df = pd.read_sql_query(sql, engine)
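
    # Safer alternative (a sketch): bind the dates with SQLAlchemy instead of
    # formatting them into the string, which avoids SQL injection issues.
    # from sqlalchemy import text
    # sql_safe = text(sql_with_markers)  # same SELECT with :start / :end
    # df = pd.read_sql_query(sql_safe, engine,
    #                        params={"start": startDate, "end": endDate})
    # (`sql_with_markers` is hypothetical: the query above with the quoted
    # '{start}' / '{end}' placeholders replaced by :start / :end.)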
    print('total Data: {}'.format(len(df)))

    # rslt_df = df.loc[df['ID_UNIT_USAHA'] == '014']

    # print(startDate)
    # print('\nResult dataframe :\n', rslt_df)

    # df.to_csv('pgn_customer_cluster_v1_{}.csv'.format(id_unit_usaha), index=False)

    # df.to_hdf("amr_bridge_22122020.hdf", key='hdf5')

    # df = pd.read_hdf("amr_bridge_22122020.hdf")

    def select_data(id_unit):
        query = "ID_UNIT_USAHA == '{}'".format(id_unit_usaha)
        columns = ['FDAYOFWEEK', 'FHOUR', 'IDREFPELANGGAN', 'AVG_FDVC']

        # df = df.set_index('FDATETIME')
        df_selected = df.query(query, engine='python')[columns]
        return df_selected

    def pivot_data(df):
        # df_pivoted = df.pivot(index='FDATETIME', columns='IDREFPELANGGAN', values='FDVC')
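        # NOTE: passing a list as `index` to DataFrame.pivot needs pandas >= 1.1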
        df_pivoted = df.pivot(index=['FDAYOFWEEK', 'FHOUR'],
                              columns='IDREFPELANGGAN',
                              values='AVG_FDVC')
        return df_pivoted

    def remove_zerocolumns(df):
        # Get all columns which have all zero values
        cols = df.columns[df.mean() == 0]
        # Drop columns which has all zero values
        df = df.drop(cols, axis=1)
        return df

    df_week1 = select_data(id_unit_usaha)
    df_week1.fillna(0.0, inplace=True)

    df_pivoted1 = pivot_data(df_week1)
    df_pivoted1.fillna(0.0, inplace=True)

    df_pivoted1 = remove_zerocolumns(df_pivoted1)
    cols = list(df_pivoted1.columns)

    # Function to plot cluster

    # def plot_clusters(ds, y_pred, n_clusters, ks, filename):
    #     plt.figure(figsize=(12, 40))
    #     for yi in range(n_clusters):
    #         plt.subplot(n_clusters, 1, 1 + yi)
    #         for xx in ds[y_pred == yi]:
    #             plt.plot(xx.ravel(), "k-", alpha=.2)
    #         plt.plot(ks.cluster_centers_[yi].ravel(), "r-")
    #         plt.xlim(0, sz)
    #         plt.ylim(-7, 7)
    #         plt.title("Cluster %d" % (yi))

    #     plt.tight_layout()
    #     plt.savefig(filename, format='jpg', dpi=300, quality=95)
    #     plt.show()

    def create_cluster_info(y_pred, cols):

        df_cluster = pd.DataFrame(y_pred.copy(),
                                  index=cols.copy(),
                                  columns=['cluster'])
        df_cluster.reset_index(inplace=True)
        df_cluster.rename(columns={'index': 'idrefpelanggan'}, inplace=True)

        unique_cluster = df_cluster['cluster'].unique()

        # Get ID ref based on cluster
        idrefs_list = []
        for x in unique_cluster:
            idref_list = df_cluster.query(
                "cluster == {}".format(x))['idrefpelanggan'].values.tolist()
            # idrefs_list[x] = idref_list

            # Create dictionary
            idref_cluster_dict = {'cluster': x, 'idrefpelanggan': idref_list}
            idrefs_list.append(idref_cluster_dict)

        idrefs_cluster = pd.DataFrame(idrefs_list)
        return idrefs_cluster

    # def run_once(startime, totalData, _has_run=[]):
    #     if _has_run:
    #         return
    #     # print("run_once doing stuff")
    #     print(startime)
    #     endtime = time.time_ns()
    #     print(endtime)
    #     invTime = endtime-startime

    #     estTime = invTime * totalData
    #     _has_run.append(1)

    #     print(totalData)
    #     print(estTime)
    #     return estTime

    seed = 0
    np.random.seed(seed)

    # Convert data frame to list of series
    pivoted_series = []
    pivoted_columns = []
    for y in cols:
        pivoted_series.append(df_pivoted1[y].values)
        pivoted_columns.append(y)

    # Convert the data set to the standard tslearn time series format
    formatted_dataset = to_time_series_dataset(pivoted_series)
    print("Data shape: {}".format(formatted_dataset.shape))

    formatted_norm_dataset = TimeSeriesScalerMeanVariance().fit_transform(
        formatted_dataset)
    sz = formatted_norm_dataset.shape[1]
    print("Series length: {}".format(sz))

    if formatted_dataset.shape[0] <= 5:
        clusters = formatted_dataset.shape[0]
    else:
        clusters = 5

    totalColumn = formatted_norm_dataset.shape[0]  # number of series (customers)
    totalRow = formatted_norm_dataset.shape[1]     # points per series
    totalData = totalRow * totalColumn + totalRow * clusters

    # The silhouette sweep below selects the number of clusters and refits.

    # silhouette_score expects a 2-D array, so flatten each series
    nsamples, nx, ny = formatted_norm_dataset.shape
    formatted_norm_dataset2 = formatted_norm_dataset.reshape(
        (nsamples, nx * ny))
    # formatted_norm_dataset.to_csv('OutputData.csv')
    sil_score_max = -1  # this is the minimum possible score
    for n_clusters in range(2, 10):
        # model = KMeans(n_clusters=n_clusters, init='k-means++',
        #                max_iter=100, n_init=1)
        # labels = model.fit_predict(formatted_norm_dataset)
        ks = KShape(n_clusters=n_clusters, verbose=True, random_state=seed)
        y_pred_ks = ks.fit_predict(formatted_norm_dataset)
        sil_score = silhouette_score(formatted_norm_dataset2, y_pred_ks)
        print("The average silhouette score for %i clusters is %0.2f" %
              (n_clusters, sil_score))
        if sil_score > sil_score_max:
            sil_score_max = sil_score
            best_n_clusters = n_clusters
    print("Best number of clusters: {}".format(best_n_clusters))

    # Refit with the silhouette-selected k: the sweep above leaves ks and
    # y_pred_ks at the last k tried, which would not match the `clusters`
    # count used in the centroid loop below.
    clusters = best_n_clusters
    ks = KShape(n_clusters=clusters, verbose=True, random_state=seed)
    y_pred_ks = ks.fit_predict(formatted_norm_dataset)

    # dba_km = TimeSeriesKMeans(n_clusters=clusters,
    #                           metric="dtw",
    #                           max_iter_barycenter=20,
    #                           verbose=False,
    #                           random_state=seed)
    # y_pred_dbakm = dba_km.fit_predict(formatted_norm_dataset)

    # Drop the trailing singleton axis and wrap the series in a DataFrame
    formatted_norm_dataset_2d = formatted_norm_dataset[:, :, 0]
    df_normalized = pd.DataFrame(formatted_norm_dataset_2d)

    df_cluster = pd.DataFrame(y_pred_ks,
                              index=pivoted_columns,
                              columns=['cluster'])
    df_cluster.reset_index(inplace=True)
    df_cluster.rename(columns={'index': 'idrefpelanggan'}, inplace=True)

    df_normalized_detail = pd.DataFrame.join(df_normalized, df_cluster)

    # df_cluster.to_csv('pgn_customer_cluster_{}.csv'.format(
    #     id_unit_usaha), index=False)

    # Create data frame for customer and its cluster
    create_cluster_info(y_pred_ks, cols)

    # plot_clusters(formatted_norm_dataset, y_pred_ks, clusters, ks,
    #               'pgn_customer_cluster_{}.jpg'.format(id_unit_usaha))

    engine2 = sqlalchemy.create_engine(
        'mssql+pyodbc://sa:[email protected]/SIPG?driver=SQL+Server')

    Session = sessionmaker(bind=engine2)
    session = Session()

    Base = declarative_base()

    class PL_CUSTOMER_CLUSTER(Base):

        __tablename__ = 'PL_CUSTOMER_CLUSTER'

        ID = Column(Integer, primary_key=True)
        DATE_STAMP = Column(DateTime)
        IDREFPELANGGAN = Column(String(30))
        HOUR_NUM = Column(Integer)
        CLUSTER_NUM = Column(Integer)
        FDVC_NORMALIZED = Column(Float)
        AREA_ID = Column(String(5))

    startime = time.time_ns()
    for i in range(totalColumn):

        idref = df_normalized_detail.iloc[i, totalRow]             # 'idrefpelanggan'
        cluster = int(df_normalized_detail.iloc[i, totalRow + 1])  # 'cluster'
        print("idref = " + str(idref))
        for j in range(totalRow):

            hour_num = df_normalized_detail.columns[j]
            fdvc = df_normalized_detail.iloc[i, j]

            sql = ""

            # insert into table
            item = PL_CUSTOMER_CLUSTER(DATE_STAMP=startDate,
                                       IDREFPELANGGAN=idref,
                                       HOUR_NUM=hour_num,
                                       CLUSTER_NUM=cluster,
                                       FDVC_NORMALIZED=fdvc,
                                       AREA_ID=id_unit_usaha)
            session.add(item)

        # commit per idrefpelanggan
        session.commit()

    arr_column = []
    # arr_centroid = []

    for i in range(clusters):
        print("cluster: " + str(i))
        CLUSTER_NAME = "CENTROID_ID" + str(i)
        cluster = i
        df = df_normalized_detail
        df = df[df["cluster"] == cluster].reset_index(drop=True)
        df_transpose = df.transpose()

        column_count = df_transpose.shape[1]
        idrefpelanggan = df_transpose.loc['idrefpelanggan']

        for k in range(column_count):
            strid = str(int(idrefpelanggan[k]))
            df_transpose = df_transpose.rename({k: strid}, axis=1)
            arr_column.append(strid)
        df_transpose.drop("idrefpelanggan", inplace=True)
        df_transpose.drop("cluster", inplace=True)

        for j in range(totalRow):
            fdvc_norm = np.mean(df_transpose.iloc[j])
            # fdvc_norm = dba_km.cluster_centers_[i][j][0]
            hour_num = j

            sql = ""
            item = PL_CUSTOMER_CLUSTER(DATE_STAMP=startDate,
                                       IDREFPELANGGAN=CLUSTER_NAME,
                                       HOUR_NUM=hour_num,
                                       CLUSTER_NUM=cluster,
                                       FDVC_NORMALIZED=fdvc_norm,
                                       AREA_ID=id_unit_usaha)
            session.add(item)
            print("fdvc:" + str(fdvc_norm) + "Hour:" + str(hour_num))
        # commit per id ref pelanngan
        session.commit()
        print(str(j) + ", " + str(fdvc_norm))

    return totalData
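
# A minimal invocation sketch; the date window is hypothetical, and '014'
# is the area id that appears in the commented-out filter above.
if __name__ == "__main__":
    total = mass_upload('2020-12-01', '2020-12-08', '014')
    print("totalData estimate: " + str(total))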
Example #3

import numpy as np
import pandas as pd
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

import cluster  # project-local helper that builds/loads the similarity matrix

# `datapath` is assumed to be defined by the caller and to point at a CSV
# with one time series per column (first column is an index/time column).
series = pd.read_csv(datapath, header=0)

X_train = series.values[:, 1:].T
headers = list(series)[1:]

n_samples = 100
if X_train.shape[0] > n_samples:
    # truncate the number of series extracted, if speed is an issue
    X_train = X_train[:n_samples]
else:
    n_samples = X_train.shape[0]
X_train = X_train.astype(float)  # np.float was removed in NumPy 1.24

# scaling can sometimes improve performance
X_train = TimeSeriesScalerMeanVariance().fit_transform(X_train)
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1]))

nrows, ncols = X_train.shape

print("DEBUG::shape of final data for clustering:")
print(X_train.shape)

## this is the first clustering method, via dbscan
# some hyper-parameters
eps = 90
min_samples = 2
LOAD = False  # load the similarity matrix from file if it was computed before
if LOAD:
    SimilarityMatrix = cluster.LoadSimilarityMatrix('SimilarityMatrix_alt')
else:
    SimilarityMatrix = cluster.GenerateSimilarityMatrix(X_train)
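
# The snippet ends after building the similarity matrix. A plausible next
# step (a sketch, assuming SimilarityMatrix holds pairwise distances) is
# scikit-learn's DBSCAN with a precomputed metric:
from sklearn.cluster import DBSCAN

db = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed')
labels = db.fit_predict(SimilarityMatrix)
print("DEBUG::number of dbscan clusters found:")
print(len(set(labels)) - (1 if -1 in labels else 0))  # -1 marks noise points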