# Example #1
def test_clustering_tree(directory=None):
    """Cluster seven short series hierarchically and render the full tree.

    Checks that every merge performed by the Hierarchical model is one of
    the expected (from_idx, to_idx) pairs and that the root cluster holds
    all seven series, then writes a dendrogram image and a Graphviz dot
    file — into ``directory`` when given, otherwise next to a temp file.
    """
    series = np.array([[0., 0, 1, 2, 1, 0, 1, 0, 0],
                       [0., 1, 2, 0, 0, 0, 0, 0, 0],
                       [1., 2, 0, 0, 0, 0, 0, 1, 1],
                       [0., 0, 1, 2, 1, 0, 1, 0, 0],
                       [0., 1, 2, 0, 0, 0, 0, 0, 0],
                       [1., 2, 0, 0, 0, 0, 0, 1, 1],
                       [1., 2, 0, 0, 0, 0, 0, 1, 1]])

    expected_merges = [(3, 0), (4, 1), (5, 2), (6, 2), (1, 0), (2, 0)]

    def test_hook(from_idx, to_idx, distance):
        # Every merge must be one of the anticipated pairs.
        assert (from_idx, to_idx) in expected_merges

    base_model = clustering.Hierarchical(dtw.distance_matrix_fast, {},
                                         merge_hook=test_hook,
                                         show_progress=False)
    tree_model = clustering.HierarchicalTree(base_model)
    cluster_idx = tree_model.fit(series)
    assert cluster_idx[0] == {0, 1, 2, 3, 4, 5, 6}

    if directory:
        hierarchy_fn = os.path.join(directory, "hierarchy.png")
        graphviz_fn = os.path.join(directory, "hierarchy.dot")
    else:
        tmp = tempfile.NamedTemporaryFile()
        hierarchy_fn = tmp.name + "_hierarchy.png"
        graphviz_fn = tmp.name + "_hierarchy.dot"

    tree_model.plot(hierarchy_fn)
    print("Figure saved to", hierarchy_fn)
    with open(graphviz_fn, "w") as ofile:
        print(tree_model.to_dot(), file=ofile)
    print("Dot saved to", graphviz_fn)
def main():
    """Cluster seven example series (several reversed) and plot the tree."""
    series = np.array([
        np.flip([0., 0, 1, 2, 1, 0, 1, 0, 0, 1]),
        [0., 1, 2, 0, 0, 0, 0, 0, 0, 1],
        np.flip([1., 2, 0, 0, 0, 0, 0, 1, 1, 1], 0),
        [0., 0, 1, 2, 1, 0, 1, 0, 0, 1],
        [0., 1, 2, 0, 0, 0, 0, 0, 0, 1],
        np.flip([1., 2, 0, 0, 0, 0, 0, 1, 1, 1], 0),
        np.flip([1., 2, 0, 0, 0, 0, 0, 1, 1, 1], 0),
    ])

    # Custom hierarchical clustering.
    base_model = clustering.Hierarchical(dtw.distance_matrix_fast, {})
    cluster_idx = base_model.fit(series)
    # The HierarchicalTree wrapper records the complete merge tree.
    tree_model = clustering.HierarchicalTree(base_model)
    cluster_idx = tree_model.fit(series)

    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 10))

    def show_ts_label(idx):
        return "ts-" + str(idx)

    tree_model.plot('hierarchy.jpg',
                    axes=ax,
                    show_ts_label=show_ts_label,
                    show_tr_label=True,
                    ts_label_margin=-10,
                    ts_left_margin=10,
                    ts_sample_length=1)

    # Load the rendered image back and display it.
    im = img.imread('hierarchy.jpg')
    plt.imshow(im)
# Example #3
def test_clustering_tree_ndim():
    """Hierarchically cluster three 2-dimensional series into one cluster."""
    with util_numpy.test_uses_numpy() as np:
        series = np.array([
            [[0., 0.], [0, 0], [1, 0], [2, 0], [1, 0], [0, 0], [1, 0], [0, 0], [0, 0]],
            [[0., 0.], [1, 0], [2, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]],
            [[1., 0.], [2, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [1, 0], [1, 0]],
        ])

        model = clustering.Hierarchical(dtw_ndim.distance_matrix_fast,
                                        {'ndim': 2},
                                        show_progress=False)
        cluster_idx = model.fit(series)
        # All three series end up merged into a single root cluster.
        assert cluster_idx[0] == {0, 1, 2}
# Example #4
def test_clustering():
    """Cluster six short series, checking every merge the model performs."""
    series = np.array([[0., 0, 1, 2, 1, 0, 1, 0, 0],
                       [0., 1, 2, 0, 0, 0, 0, 0, 0],
                       [1., 2, 0, 0, 0, 0, 0, 1, 1],
                       [0., 0, 1, 2, 1, 0, 1, 0, 0],
                       [0., 1, 2, 0, 0, 0, 0, 0, 0],
                       [1., 2, 0, 0, 0, 0, 0, 1, 1]])

    def test_hook(from_idx, to_idx, distance):
        # Only these merges should ever be attempted.
        assert (from_idx, to_idx) in [(3, 0), (4, 1), (5, 2), (1, 0)]

    # Third positional argument (2) kept exactly as in the original call.
    model = clustering.Hierarchical(dtw.distance_matrix_fast, {}, 2,
                                    merge_hook=test_hook,
                                    show_progress=False)
    cluster_idx = model.fit(series)
    assert cluster_idx[0] == {0, 1, 3, 4}
    assert cluster_idx[2] == {2, 5}
 def d():
     """Fit a plain Hierarchical clustering and return the cluster index.

     NOTE(review): ``s`` is not defined in this scope — together with the
     one-space indentation this looks like a fragment of an enclosing
     function that was mangled during extraction; confirm before use.
     """
     c = clustering.Hierarchical(dtw.distance_matrix_fast, {})
     return c.fit(s)
# Example #6
def get_cluster():
    """Cluster the spindle-load series of one operation/program pair.

    Reads the auxiliary segment table and the raw 2018 data, extracts one
    spindle-load series per segment, then renders both a custom
    hierarchical dendrogram and a SciPy-linkage dendrogram to disk.
    Failures in either plotting stage are printed and swallowed.
    """
    aux_file_path = r'C:\TFM\auxdata\hist_protected.csv'
    data_path = r'C:\TFM\data\2018\2018.csv'

    hierarchical_plot = r'C:\TFM\dtw\hierarchical_cluster.png'
    linkage_plot = r'C:\TFM\dtw\linkage_cluster.png'

    df_aux = pd.read_csv(aux_file_path, header=0, delimiter=',',
                         parse_dates=[SEGMENT_BEGIN, SEGMENT_END])
    df_data = pd.read_csv(data_path, header=0, delimiter=',',
                          parse_dates=[DATE])

    op_no = 28
    program_number = 1108805036

    # Segment boundaries for the requested operation/program combination.
    selection = ((df_aux[OPERATION_ID_NUMBER] == op_no)
                 & (df_aux[PROGRAM_NAME] == program_number))
    begin_date = df_aux[selection][SEGMENT_BEGIN]
    end_date = df_aux[selection][SEGMENT_END]

    series = []
    for item in begin_date.index:
        if item > YEAR_INDEX_LIMIT:
            break
        # All data rows falling inside this segment's time window.
        window = df_data.loc[(df_data[DATE] >= begin_date[item])
                             & (df_data[DATE] <= end_date[item])]
        if not window.empty:
            series.append(np.array(window[SPINDLE_LOAD]))

    # Custom hierarchical clustering.
    model1 = clustering.Hierarchical(dtw.distance_matrix_fast, {})
    cluster_idx = model1.fit(series)

    try:
        # Augment the fitted model to keep track of the full merge tree.
        model2 = clustering.HierarchicalTree(model1)
        cluster_idx = model2.fit(series)
        model2.plot(hierarchical_plot, show_tr_label=True)
    except Exception as ex:
        print(ex)

    try:
        # SciPy linkage clustering.
        model3 = clustering.LinkageTree(dtw.distance_matrix_fast, {})
        cluster_idx = model3.fit(series)
        model3.plot(linkage_plot, show_tr_label=True)
    except Exception as ex:
        print(ex)
# Flat script: cluster the Scania machine time series with dtaidistance.
df = pd.read_csv("Scania_Data_Clustering.csv", header=0) # header=0 is default
head = list(df.columns.values) # column names = machine names
print("head", head) # print machine names

df = df.T # transpose: one row per machine
df = df.values # plain ndarray for distance_matrix_fast

ds = dtw.distance_matrix_fast(df) # pairwise DTW distance matrix
# Replace infinities (presumably dtaidistance's marker for uncomputed
# entries — TODO confirm) with 0 so the matrix can be exported.
ds[ds == inf] = 0

pd.DataFrame(ds).to_excel("ds.xlsx") # persist the distance matrix

# clustering starts
# Custom Hierarchical clustering
model1 = clustering.Hierarchical(dtw.distance_matrix_fast, {})
# Augment Hierarchical object to keep track of the full tree
model2 = clustering.HierarchicalTree(model1)
# SciPy linkage clustering
model3 = clustering.LinkageTree(dtw.distance_matrix_fast, {})

# NOTE(review): only the linkage model is fitted below; model1/model2 are
# created but unused here — confirm whether they are needed later.
cluster_idx = model3.fit(df)

# Plot the dendrogram using the machine names as series labels.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 15))
model3.plot("hierarchy.png", axes=ax, show_ts_label=head,
           show_tr_label=True, ts_label_margin=-10,
           ts_left_margin=10, ts_sample_length=1)

# Candidate cluster counts for a later model-selection sweep.
NumberOfClusters=range(2,30)
# Example #8
def clusterTest(pq, XTest, distParams, generalClusterParams, pqClusterParams, doAll=True, groundTruth=None):
    """Compare exact hierarchical clustering against quantizer-based variants.

    Parameters
    ----------
    pq : trained product quantizer, installed via ``model.setQuantizer``.
    XTest : sequence of time series to cluster.
    distParams : dict of options for the DTW distance functions.
    generalClusterParams : dict of Hierarchical options; ``'min_clusters'``
        is forced to 20 on entry (the dict is mutated, as in the original).
    pqClusterParams : dict of extra options for HierarchicalWithQuantizer.
    doAll : when False, run one exact-vs-approximate comparison and return
        a result dict; when True, sweep single/complete/prototype linkage
        and several quantizer budgets, printing scores as it goes.
    groundTruth : optional reference clustering; when given (and doAll is
        True) exact results are scored against it and the function returns.

    Fix: ``time.clock()`` was removed in Python 3.8; all timings now use
    ``time.perf_counter()``, its documented replacement.
    """
    # Force the target cluster count for every run below.
    generalClusterParams['min_clusters'] = 20
    n_clust = generalClusterParams['min_clusters']

    # Exact (non-quantized) reference model.
    modelN = clustering.Hierarchical(
        dtaidistance.dtw.distance_matrix_fast, distParams,
        **generalClusterParams)

    if not doAll:
        print('Exact clustering')
        start = time.perf_counter()
        cluster_idxN = modelN.fit(XTest)
        end = time.perf_counter()
        dtwTime = end - start
        print('dtw time', dtwTime)
    else:
        # Exact clustering, single linkage.
        generalClusterParams = {'dists_merger': clustering.singleLinkageUpdater, 'min_clusters': n_clust}
        modelS = clustering.Hierarchical(
            dtaidistance.dtw.distance_matrix_fast, distParams,
            **generalClusterParams)
        print('Exact single')
        start = time.perf_counter()
        cluster_idxS = modelS.fit(XTest)
        end = time.perf_counter()
        pqTime = end - start
        print('time', pqTime)

        # Exact clustering, complete linkage.
        generalClusterParams = {'dists_merger': clustering.completeLinkageUpdater, 'min_clusters': n_clust}
        modelC = clustering.Hierarchical(
            dtaidistance.dtw.distance_matrix_fast, distParams,
            **generalClusterParams)
        print('Exact complete')
        start = time.perf_counter()
        cluster_idxC = modelC.fit(XTest)
        end = time.perf_counter()
        pqTime = end - start
        print('time', pqTime)

        # Exact clustering, prototype merging (no dists_merger).
        generalClusterParams = {'dists_merger': None, 'min_clusters': n_clust}
        modelP = clustering.Hierarchical(
            dtaidistance.dtw.distance_matrix_fast, distParams,
            **generalClusterParams)
        print('Exact prototypes')
        start = time.perf_counter()
        cluster_idxP = modelP.fit(XTest)
        end = time.perf_counter()
        pqTime = end - start
        print('time', pqTime)

        if groundTruth is not None:
            # Score all three exact variants against the ground truth.
            jaccards, aris = equaliseClusterLabelsAndCalculateScores(groundTruth, cluster_idxS, XTest)
            print(jaccards, aris)
            jaccardc, aric = equaliseClusterLabelsAndCalculateScores(groundTruth, cluster_idxC, XTest)
            print(jaccardc, aric)
            jaccardp, arip = equaliseClusterLabelsAndCalculateScores(groundTruth, cluster_idxP, XTest)
            print(jaccardp, arip)
            return

    if not doAll:
        # Single approximate run, scored against the exact result.
        model = clustering.HierarchicalWithQuantizer(
            dtaidistance.dtw.distance_fast, distParams,
            **generalClusterParams,
            **pqClusterParams)
        model.setQuantizer(pq)
        print('Approximate clustering')
        start = time.perf_counter()
        cluster_idx = model.fit(XTest)
        end = time.perf_counter()
        pqTime = end - start

        jaccard, ari = equaliseClusterLabelsAndCalculateScores(cluster_idxN, cluster_idx, XTest)
        print(jaccard, ari, dtwTime)
        return {'jaccard': jaccard, 'ari': ari, 'DTWTime': dtwTime, 'PQTime': pqTime}
    else:
        def performCluster(trueResults, pq, XTest, distParams, generalClusterParams, pqClusterParams):
            # One approximate run, scored against the matching exact result.
            model = clustering.HierarchicalWithQuantizer(
                dtaidistance.dtw.distance_fast, distParams,
                **generalClusterParams,
                **pqClusterParams)
            model.setQuantizer(pq)
            start = time.perf_counter()
            cluster_idx = model.fit(XTest)
            end = time.perf_counter()
            pqTime = end - start
            jaccard, ari = equaliseClusterLabelsAndCalculateScores(trueResults, cluster_idx, XTest)
            print({'jaccard': jaccard, 'ari': ari, 'PQTime': pqTime})

        tot = len(XTest)
        calcs = (tot * tot - tot) / 2  # number of pairwise distance computations
        # Budgets swept below: percent of all pairs (testperc) and percent
        # of series re-evaluated per merge (testkpermerge).
        testkpermerge = [0.5, 2.0, 5.0, 10.0, 20.0]
        testperc = [2, 5, 10, 25, 50]
        print(calcs, tot)
        n_clust = generalClusterParams['min_clusters']

        print('approx', 'single')
        generalClusterParams = {'dists_merger': clustering.singleLinkageUpdater, 'min_clusters': n_clust}
        pqClusterParams = {'k': 199800, 'quantizer_usage': clustering.QuantizerUsage.ONLY_APPROXIMATES}
        performCluster(cluster_idxS, pq, XTest, distParams, generalClusterParams, pqClusterParams)
        for k in testperc:
            print('percent', k, 'single', k)
            pqClusterParams = {'k': int(k * calcs / 100), 'quantizer_usage': clustering.QuantizerUsage.TOP_K_ONLY_AT_INITIALISATION}
            performCluster(cluster_idxS, pq, XTest, distParams, generalClusterParams, pqClusterParams)

        print('approx', 'complete')
        generalClusterParams = {'dists_merger': clustering.completeLinkageUpdater, 'min_clusters': n_clust}
        pqClusterParams = {'k': 199800, 'quantizer_usage': clustering.QuantizerUsage.ONLY_APPROXIMATES}
        performCluster(cluster_idxC, pq, XTest, distParams, generalClusterParams, pqClusterParams)
        for k in testperc:
            print('percent', k, 'complete', k)
            pqClusterParams = {'k': int(k * calcs / 100.0), 'quantizer_usage': clustering.QuantizerUsage.TOP_K_ONLY_AT_INITIALISATION}
            performCluster(cluster_idxC, pq, XTest, distParams, generalClusterParams, pqClusterParams)

        print('approx', 'proto')
        generalClusterParams = {'dists_merger': None, 'min_clusters': n_clust}
        pqClusterParams = {'k': 199800, 'quantizer_usage': clustering.QuantizerUsage.ONLY_APPROXIMATES}
        performCluster(cluster_idxP, pq, XTest, distParams, generalClusterParams, pqClusterParams)
        for k in testperc:
            print('percent', k, 'proto', k)
            pqClusterParams = {'k': int(k * calcs / 100.0), 'quantizer_usage': clustering.QuantizerUsage.TOP_K_ONLY_AT_INITIALISATION}
            performCluster(cluster_idxP, pq, XTest, distParams, generalClusterParams, pqClusterParams)
        for k in testkpermerge:
            print('partEachMerge', k, 'proto', k)
            pqClusterParams = {'k': int(k * tot / 100), 'quantizer_usage': clustering.QuantizerUsage.TOP_K}
            performCluster(cluster_idxP, pq, XTest, distParams, generalClusterParams, pqClusterParams)
# Example #9
def cluster(time_series_set, name):
    """Z-normalise the stored series, add *name* if new, and cluster them.

    Reads previously saved series from ``./static/cluster_data.csv`` (first
    column is the series name, the rest are values), z-scores each series,
    appends the incoming *time_series_set* under *name* when it is not
    stored yet, then runs hierarchical and linkage clustering and plots the
    dendrogram to ``hierarchy.png``.

    Fixes: the original leaked the read handle (``csv.reader(open(...))``
    with no close) and called ``f.close()`` inside a ``with`` block; both
    files are now opened with ``newline=''`` as the csv module requires.
    """
    path = "./static/cluster_data.csv"

    name_list = []
    series_list = []

    with open(path, 'r', newline='') as csv_file:
        for row in csv.reader(csv_file):
            name_list.append(row[0])
            # Remaining columns are the series values as strings.
            float_series = [float(value) for value in row[1:]]
            series_list.append(stats.zscore(np.array(float_series)))

    if name not in name_list:
        # time_series holds the raw performance-metric values; the row
        # written to disk is the same values prefixed with the series name.
        time_series = []
        time_series_with_name = [name]
        for row in time_series_set:
            time_series.append(row[1])
            time_series_with_name.append(row[1])

        with open(path, 'a', newline='') as f:
            csv.writer(f).writerow(time_series_with_name)

        name_list.append(name)
        float_series = [float(value) for value in time_series]
        series_list.append(stats.zscore(np.array(float_series)))

    # Custom Hierarchical clustering.
    model1 = clustering.Hierarchical(dtw.distance_matrix_fast, {})
    cluster_idx = model1.fit(series_list)
    # Augment Hierarchical object to keep track of the full tree.
    model2 = clustering.HierarchicalTree(model1)
    cluster_idx = model2.fit(series_list)
    # SciPy linkage clustering.
    model3 = clustering.LinkageTree(dtw.distance_matrix_fast, {})
    cluster_idx = model3.fit(series_list)

    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 10))
    show_ts_label = lambda idx: name_list[idx]
    model2.plot("hierarchy.png",
                axes=ax,
                show_ts_label=show_ts_label,
                show_tr_label=True,
                ts_label_margin=-10,
                ts_left_margin=10,
                ts_sample_length=1)