Esempio n. 1
0
def test_controlchart():
    """Cluster a sample of the synthetic control chart data and plot the tree.

    Reads ``rsrc/synthetic_control.data`` (located next to this file) into a
    600x60 array, keeps every 20th row, fits a ``LinkageTree`` on that sample
    and draws the hierarchy on a two-column figure.

    The image goes to ``<directory>/hierarchy.png`` when the free name
    ``directory`` (defined outside this function) is truthy; otherwise the
    name of a fresh temporary file is used as prefix for the image path.
    """
    import matplotlib.pyplot as plt

    data_fn = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'rsrc',
                           'synthetic_control.data')
    series = np.zeros((600, 60))
    with open(data_fn, 'r') as data_file:
        for row_no, row_text in enumerate(data_file):
            # Whitespace-separated fields are converted on assignment.
            series[row_no, :] = row_text.split()

    # Keep one series out of every twenty.
    sample = [series[row_no, :] for row_no in range(0, 600, 20)]

    model = clustering.LinkageTree(dtw.distance_matrix_fast, {})
    model.fit(sample)

    if not directory:
        # Only the unique name of the temporary file is used, as a prefix.
        tmp_handle = tempfile.NamedTemporaryFile()
        hierarchy_fn = tmp_handle.name + "_hierarchy.png"
    else:
        hierarchy_fn = os.path.join(directory, "hierarchy.png")

    def ts_label(idx):
        """Return the label shown next to series number ``idx``."""
        return "ts-" + str(idx)

    _, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 10))
    plot_options = dict(
        axes=axes,
        show_ts_label=ts_label,
        show_tr_label=True,
        ts_label_margin=-10,
        ts_left_margin=10,
        ts_sample_length=1,
    )
    model.plot(hierarchy_fn, **plot_options)
    print("Figure saved to", hierarchy_fn)
def test_bug3():
    """Regression test: cluster (and optionally plot) series of unequal length.

    Prints the DTW distance matrix and the result of fitting a
    ``LinkageTree`` on six series whose lengths differ.  The plot is only
    produced when ``dtwvis.test_without_visualization()`` is false; it is
    written to ``directory / "bug3.png"`` when the free name ``directory``
    (defined outside this function) is truthy, otherwise next to a fresh
    temporary file.
    """
    with util_numpy.test_uses_numpy() as np:
        # Keep the series in a plain list: building a single ndarray from rows
        # of different lengths raises ValueError on NumPy >= 1.24.
        series = [
            np.array([1, 2, 1]),
            np.array([0., 1, 2, 0, 0, 0, 0, 0, 0]),
            np.array([1., 2, 0, 0, 0, 0, 0, 1, 1, 3, 4, 5]),
            np.array([0., 0, 1, 2, 1, 0, 1]),
            np.array([0., 1, 2, 0, 0, 0, 0, 0]),
            np.array([1., 2, 0, 0, 0, 0, 0, 1, 1])
        ]
        ds = dtw.distance_matrix(series)
        print(ds)

        model = clustering.LinkageTree(dtw.distance_matrix, {})
        cluster_idx = model.fit(series)
        print(cluster_idx)

        if not dtwvis.test_without_visualization():
            # Resolve the output name only when a figure is actually drawn,
            # so no temporary file is created for nothing.
            if directory:
                fn = directory / "bug3.png"
            else:
                file = tempfile.NamedTemporaryFile()
                fn = Path(file.name + "_bug3.png")
            model.plot(fn, show_ts_label=True)
Esempio n. 3
0
def test_linkage_tree():
    """Fit a ``LinkageTree`` on seven short series, then plot and export it.

    Two files are written: the hierarchy image and the Graphviz source
    returned by ``model.to_dot()``.  Both live in ``directory`` when that free
    name (defined outside this function) is truthy; otherwise their paths are
    derived from the name of a fresh temporary file.
    """
    bump_mid = [0., 0, 1, 2, 1, 0, 1, 0, 0]
    bump_start = [0., 1, 2, 0, 0, 0, 0, 0, 0]
    bump_ends = [1., 2, 0, 0, 0, 0, 0, 1, 1]
    s = np.array([bump_mid, bump_start, bump_ends,
                  bump_mid, bump_start, bump_ends,
                  bump_ends])

    model = clustering.LinkageTree(dtw.distance_matrix_fast, {})
    model.fit(s)

    if not directory:
        # Only the unique name of the temporary file is used, as a prefix.
        tmp_handle = tempfile.NamedTemporaryFile()
        hierarchy_fn = tmp_handle.name + "_hierarchy.png"
        graphviz_fn = tmp_handle.name + "_hierarchy.dot"
    else:
        hierarchy_fn = os.path.join(directory, "hierarchy.png")
        graphviz_fn = os.path.join(directory, "hierarchy.dot")

    model.plot(hierarchy_fn)
    print("Figure saved to", hierarchy_fn)

    with open(graphviz_fn, "w") as dot_file:
        print(model.to_dot(), file=dot_file)
    print("Dot saved to", graphviz_fn)
 def linkage_tree(self, df):
     """Fit a DTW linkage tree on ``df``, persist the model and return the fit result.

     The fitted model is kept on ``self.model`` and pickled to ``model.pkl``
     in the current working directory.  When the free name ``run_plots``
     (defined outside this method) is truthy, the tree is also plotted on a
     17x20 inch figure.

     Returns:
         Whatever ``LinkageTree.fit`` returns for ``df``.
     """
     print('Producing linkage Tree')
     self.model = clustering.LinkageTree(dtw.distance_matrix_fast, {})
     clusters_dtw = self.model.fit(df)

     # The original returned right after fitting, which left the persistence
     # and plotting code below unreachable.
     with open('model.pkl', 'wb') as model_file:
         pickle.dump(self.model, model_file)

     if run_plots:
         f, _ = self.model.plot()
         f.set_size_inches(17, 20)

     return clusters_dtw
Esempio n. 5
0
def dtai_dendogram(series, dir_name):
    """Fit a DTW ``LinkageTree`` on ``series`` and save its dendrogram as a PNG.

    The DTW options (``window_size``, ``psi_size``) and the values used in the
    file name and sample length (``num_series``, ``num_pts_per_series``) are
    free names read from outside this function.

    Args:
        series: the time series handed to ``LinkageTree.fit``.
        dir_name: directory prefix (string) of the output image.
    """
    from dtaidistance import clustering

    def ts_label(x):
        """Return the label shown next to series number ``x``."""
        return "ts-" + str(x)

    dtw_options = {'window': window_size, 'psi': psi_size}
    model = clustering.LinkageTree(dtw.distance_matrix_fast, dtw_options)
    model.fit(series)

    png_name = (dir_name + "/dendogram_dtai_" + str(num_series) + "_"
                + str(num_pts_per_series) + ".png")
    model.plot(
        filename=png_name,
        axes=None,
        ts_height=.5,
        bottom_margin=.4,
        top_margin=.4,
        ts_left_margin=.2,
        ts_sample_length=1/num_pts_per_series,
        tr_label_margin=.1,
        tr_left_margin=0,
        ts_label_margin=-.25,
        show_ts_label=ts_label,
        show_tr_label=True,
        cmap='viridis_r',
        ts_color=None,
    )
Esempio n. 6
0
def test_plotbug1():
    """Fit a ``LinkageTree`` on two series of unequal length and plot it.

    The image goes to ``<directory>/clustering.png`` when the free name
    ``directory`` (defined outside this function) is truthy; otherwise the
    name of a fresh temporary file is used as prefix for the image path.
    """
    first = np.array([0., 0, 1, 2, 1, 0, 1, 0, 0, 2, 1, 0, 0])
    second = np.array([0., 1, 2, 3, 1, 0, 0, 0, 2, 1, 0, 0])

    tree = clustering.LinkageTree(dtw.distance_matrix, {})
    tree.fit((first, second))

    if not directory:
        # Only the unique name of the temporary file is used, as a prefix.
        tmp_handle = tempfile.NamedTemporaryFile()
        hierarchy_fn = tmp_handle.name + "_clustering.png"
    else:
        hierarchy_fn = os.path.join(directory, "clustering.png")

    tree.plot(hierarchy_fn)
    print("Figure save to", hierarchy_fn)
def test_bug1():
    """Fit a ``LinkageTree`` on six equal-length series and plot the hierarchy.

    The data consists of three distinct rows, each appearing twice.  The image
    goes to ``directory / "hierarchy.png"`` when the free name ``directory``
    (defined outside this function) is truthy; otherwise the name of a fresh
    temporary file is used as prefix for the image path.
    """
    bump_mid = [0., 0, 1, 2, 1, 0, 1, 0, 0]
    bump_start = [0., 1, 2, 0, 0, 0, 0, 0, 0]
    bump_ends = [1., 2, 0, 0, 0, 0, 0, 1, 1]
    series = np.array([bump_mid, bump_start, bump_ends,
                       bump_mid, bump_start, bump_ends])

    model = clustering.LinkageTree(dtw.distance_matrix_fast, {})
    model.fit(series)

    if not directory:
        # Only the unique name of the temporary file is used, as a prefix.
        tmp_handle = tempfile.NamedTemporaryFile()
        hierarchy_fn = Path(tmp_handle.name + "_hierarchy.png")
    else:
        hierarchy_fn = directory / "hierarchy.png"

    model.plot(hierarchy_fn)
    print("Figure saved to", hierarchy_fn)
Esempio n. 8
0
def test_plotbug1():
    """Fit a ``LinkageTree`` on two series of unequal length; plot if allowed.

    NumPy is obtained from ``util_numpy.test_uses_numpy()``.  The figure is
    only produced when ``dtwvis.test_without_visualization()`` is false.  It
    goes to ``<directory>/clustering.png`` when the free name ``directory``
    (defined outside this function) is truthy; otherwise the name of a fresh
    temporary file is used as prefix for the image path.
    """
    with util_numpy.test_uses_numpy() as np:
        first = np.array([0., 0, 1, 2, 1, 0, 1, 0, 0, 2, 1, 0, 0])
        second = np.array([0., 1, 2, 3, 1, 0, 0, 0, 2, 1, 0, 0])

        tree = clustering.LinkageTree(dtw.distance_matrix, {})
        tree.fit((first, second))

        if not dtwvis.test_without_visualization():
            if not directory:
                # Only the unique temporary name is used, as a prefix.
                tmp_handle = tempfile.NamedTemporaryFile()
                hierarchy_fn = tmp_handle.name + "_clustering.png"
            else:
                hierarchy_fn = os.path.join(directory, "clustering.png")
            tree.plot(hierarchy_fn)
            print("Figure save to", hierarchy_fn)
def test_bug1():
    """Fit a ``LinkageTree`` on six equal-length series; plot if allowed.

    NumPy is obtained from ``util_numpy.test_uses_numpy()``.  The output path
    is ``directory / "hierarchy.png"`` when the free name ``directory``
    (defined outside this function) is truthy, otherwise it is derived from
    the name of a fresh temporary file.  The figure itself is only drawn when
    ``dtwvis.test_without_visualization()`` is false.
    """
    with util_numpy.test_uses_numpy() as np:
        bump_mid = [0., 0, 1, 2, 1, 0, 1, 0, 0]
        bump_start = [0., 1, 2, 0, 0, 0, 0, 0, 0]
        bump_ends = [1., 2, 0, 0, 0, 0, 0, 1, 1]
        series = np.array([bump_mid, bump_start, bump_ends,
                           bump_mid, bump_start, bump_ends])

        model = clustering.LinkageTree(dtw.distance_matrix_fast, {})
        model.fit(series)

        if not directory:
            # Only the unique temporary name is used, as a prefix.
            tmp_handle = tempfile.NamedTemporaryFile()
            hierarchy_fn = Path(tmp_handle.name + "_hierarchy.png")
        else:
            hierarchy_fn = directory / "hierarchy.png"

        if dtwvis.test_without_visualization():
            return
        model.plot(hierarchy_fn)
        print("Figure saved to", hierarchy_fn)
Esempio n. 10
0
def test_controlchart(directory=None):
    """Cluster a sample of the synthetic control chart data and plot the tree.

    Reads ``rsrc/synthetic_control.data`` (located next to this file) into a
    600x60 array, keeps every 20th row and fits a ``LinkageTree`` on it.

    Args:
        directory: where ``hierarchy.png`` is written.  When falsy, the name
            of a fresh temporary file is used as prefix for the image path.
    """
    data_fn = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'rsrc',
                           'synthetic_control.data')
    series = np.zeros((600, 60))
    with open(data_fn, 'r') as data_file:
        for row_no, row_text in enumerate(data_file):
            # Whitespace-separated fields are converted on assignment.
            series[row_no, :] = row_text.split()

    # Keep one series out of every twenty.
    sample = [series[row_no, :] for row_no in range(0, 600, 20)]

    model = clustering.LinkageTree(dtw.distance_matrix_fast, {})
    model.fit(sample)

    if not directory:
        # Only the unique name of the temporary file is used, as a prefix.
        tmp_handle = tempfile.NamedTemporaryFile()
        hierarchy_fn = tmp_handle.name + "_hierarchy.png"
    else:
        hierarchy_fn = os.path.join(directory, "hierarchy.png")

    model.plot(hierarchy_fn)
    print("Figure saved to", hierarchy_fn)
Esempio n. 11
0
def test_bug3():
    """Regression test: cluster and plot series of unequal length.

    Prints the DTW distance matrix and the result of fitting a
    ``LinkageTree`` on six series whose lengths differ, then plots the tree.
    The image goes to ``directory / "bug3.png"`` when the free name
    ``directory`` (defined outside this function) is truthy; otherwise the
    name of a fresh temporary file is used as prefix for the image path.
    """
    # Keep the series in a plain list: building a single ndarray from rows of
    # different lengths raises ValueError on NumPy >= 1.24.
    series = [
        np.array([1, 2, 1]),
        np.array([0., 1, 2, 0, 0, 0, 0, 0, 0]),
        np.array([1., 2, 0, 0, 0, 0, 0, 1, 1, 3, 4, 5]),
        np.array([0., 0, 1, 2, 1, 0, 1]),
        np.array([0., 1, 2, 0, 0, 0, 0, 0]),
        np.array([1., 2, 0, 0, 0, 0, 0, 1, 1])
    ]
    ds = dtw.distance_matrix(series)
    print(ds)

    model = clustering.LinkageTree(dtw.distance_matrix, {})
    cluster_idx = model.fit(series)
    print(cluster_idx)

    if directory:
        fn = directory / "bug3.png"
    else:
        # Only the unique name of the temporary file is used, as a prefix.
        file = tempfile.NamedTemporaryFile()
        fn = Path(file.name + "_bug3.png")

    model.plot(fn, show_ts_label=True)
 def d():
     """Fit a fresh DTW ``LinkageTree`` on the free name ``s`` and return the result."""
     tree = clustering.LinkageTree(dtw.distance_matrix_fast, {})
     fit_result = tree.fit(s)
     return fit_result
Esempio n. 13
0
def get_cluster(aux_file_path=r'C:\TFM\auxdata\hist_protected.csv',
                data_path=r'C:\TFM\data\2018\2018.csv',
                hierarchical_plot=r'C:\TFM\dtw\hierarchical_cluster.png',
                linkage_plot=r'C:\TFM\dtw\linkage_cluster.png',
                op_no=28,
                program_number=1108805036):
    """
    Function to get the clustering for the time series getting the distances between
    each operation.

    Rows of the auxiliary file whose OPERATION_ID_NUMBER equals ``op_no`` and
    whose PROGRAM_NAME equals ``program_number`` each define a
    [SEGMENT_BEGIN, SEGMENT_END] interval.  For every interval the
    SPINDLE_LOAD values of the data rows whose DATE falls inside it form one
    series.  The series are clustered with DTW and two tree plots are saved.

    All parameters default to the values that used to be hard-coded, so
    calling ``get_cluster()`` without arguments behaves as before.

    Args:
        aux_file_path: CSV holding the segment boundaries.
        data_path: CSV holding the measurements.
        hierarchical_plot: output image of the ``HierarchicalTree`` model.
        linkage_plot: output image of the ``LinkageTree`` model.
        op_no: operation id used to select segments.
        program_number: program identifier used to select segments.
    """
    df_aux = pd.read_csv(aux_file_path,
                         header=0,
                         delimiter=',',
                         parse_dates=[SEGMENT_BEGIN, SEGMENT_END])
    df_data = pd.read_csv(data_path,
                          header=0,
                          delimiter=',',
                          parse_dates=[DATE])

    # Select the matching segments once; begin and end dates share its index.
    selected = df_aux[(df_aux[OPERATION_ID_NUMBER] == op_no)
                      & (df_aux[PROGRAM_NAME] == program_number)]
    begin_date = selected[SEGMENT_BEGIN]
    end_date = selected[SEGMENT_END]

    series = []
    for item in begin_date.index:
        # Stops at the first index label above the limit.
        # NOTE(review): this assumes the index is in increasing order - confirm.
        if item > YEAR_INDEX_LIMIT:
            break
        series_begin = begin_date[item]
        series_end = end_date[item]
        aux_series = df_data.loc[(df_data[DATE] >= series_begin)
                                 & (df_data[DATE] <= series_end)]
        if not aux_series.empty:
            series.append(np.array(aux_series[SPINDLE_LOAD]))

    if not series:
        # Nothing matched the selection, so there is nothing to cluster.
        print('No series found for the selected operation and program')
        return

    # Custom Hierarchical clustering
    model1 = clustering.Hierarchical(dtw.distance_matrix_fast, {})
    model1.fit(series)

    # Plotting is best effort: a failure of one model is reported and does
    # not prevent the other one from being tried.
    try:
        # Augment Hierarchical object to keep track of the full tree
        model2 = clustering.HierarchicalTree(model1)
        model2.fit(series)
        model2.plot(hierarchical_plot, show_tr_label=True)
    except Exception as ex:
        print(ex)
    # SciPy linkage clustering
    try:
        model3 = clustering.LinkageTree(dtw.distance_matrix_fast, {})
        model3.fit(series)
        model3.plot(linkage_plot, show_tr_label=True)
    except Exception as ex:
        print(ex)
# Script: compute a DTW distance matrix for the data in ``df``, export it,
# cluster the series with a linkage tree and plot the resulting hierarchy.
# ``df``, ``inf``, ``head``, ``pd``, ``plt``, ``dtw`` and ``clustering`` are
# expected to be defined before this point.

df = df.T # transpose so that the original columns become rows
df = df.values # keep only the underlying array

ds = dtw.distance_matrix_fast(df) # pairwise DTW distance matrix
# Replace every infinite entry with 0.
# NOTE(review): presumably the infinities are the entries the distance matrix
# leaves unfilled - confirm that 0 is the intended replacement.
ds[ds == inf] = 0

pd.DataFrame(ds).to_excel("ds.xlsx") # save the distance matrix as an xlsx file

# Clustering starts here.
# Custom Hierarchical clustering (created but not fitted in the visible code)
model1 = clustering.Hierarchical(dtw.distance_matrix_fast, {})
# Augment Hierarchical object to keep track of the full tree
# (also not fitted in the visible code)
model2 = clustering.HierarchicalTree(model1)
# SciPy linkage clustering - the only model that is fitted and plotted below
model3 = clustering.LinkageTree(dtw.distance_matrix_fast, {})

cluster_idx = model3.fit(df)

# Plot the hierarchy on a two-column figure and save it as "hierarchy.png".
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 15))
model3.plot("hierarchy.png", axes=ax, show_ts_label=head,
           show_tr_label=True, ts_label_margin=-10,
           ts_left_margin=10, ts_sample_length=1)
		   
# Candidate cluster counts (2..29) and the list meant to collect one
# silhouette score per candidate.
NumberOfClusters=range(2,30)
silhouette_score_values=list()

for i in NumberOfClusters:
    
Esempio n. 15
0
def cluster_the_ts_curves(infile, outfolder, maturity, smoothing):
    """Cluster per-venue time series with DTW and export the largest clusters.

    Each line of ``infile`` is split on tabs: the first field is the venue
    name, the remaining fields are parsed as floats and form its series.  A
    ``LinkageTree`` is fitted on all series and its linkage matrix is replayed
    merge by merge.  After every merge, the clusters holding more than 100 and
    fewer than half of the series are ranked by size and the top 10 are kept.
    When these together cover more than half of the series, a figure, the
    venue lists and the binned average curves are written below
    ``outfolder/maturity``.

    Args:
        infile: path of the tab-separated input file.
        outfolder: root output directory (string).
        maturity: name of the sub-directory inside ``outfolder`` (string).
        smoothing: ``'smooth'`` passes every series through
            ``savgol_filter(values, 5, 3)``; ``'notsmooth'`` keeps the raw
            values.  For any other value a message is printed for each input
            line and no series is stored.
    """
    series = {}
    # Axes positions of the 2x5 figure in filling order.  The last entry lies
    # outside that grid but is never reached: at most 10 clusters are drawn.
    indicies = [(0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (1, 0), (1, 1), (1, 2),
                (1, 3), (1, 4), (1, 5)]

    # Context manager so the input handle is closed deterministically.
    with open(infile) as source:
        for line in source:
            fields = line.strip().split('\t')
            venue = fields[0]
            ts = fields[1:]

            if smoothing == 'smooth':
                series[venue] = savgol_filter(
                    np.asarray([float(fff) for fff in ts]), 5, 3)
            elif smoothing == 'notsmooth':
                series[venue] = np.asarray([float(fff) for fff in ts])
            else:
                print('F**K OFF')

    # The dict is not modified past this point, so materialise its keys and
    # values once instead of rebuilding the lists for every node.
    series_names = list(series.keys())
    series_values = list(series.values())

    model3 = clustering.LinkageTree(dtw.distance_matrix_fast, {})
    model3.fit(series_values)
    linkage_matrix = model3.linkage

    nnn = len(series)
    cluster_dict = {}

    MINCSIZE = 100
    MAXSIZE = nnn / 2

    # NOTE(review): this creates ``maturity`` relative to the current working
    # directory although every output goes below ``outfolder`` - confirm.
    if not os.path.exists(maturity):
        os.makedirs(maturity)

    for i in range(0, nnn - 1):

        # Row i of the linkage matrix merges two clusters into cluster nnn + i.
        new_cluster_id = nnn + i
        combined_ids = []
        for old_cluster_id in (linkage_matrix[i, 0], linkage_matrix[i, 1]):
            if old_cluster_id in cluster_dict:
                # A previously merged cluster: absorb its members.
                combined_ids += cluster_dict.pop(old_cluster_id)
            else:
                # An original series.
                combined_ids += [old_cluster_id]
        cluster_dict[new_cluster_id] = combined_ids

        for NNN in [10]:

            base = outfolder + '/' + maturity
            figfolder = base + '/figs_clusters_' + smoothing + '/' + str(NNN)
            curvefodler = base + '/avg_curves_' + smoothing + '/' + str(NNN)
            vensfolder = base + '/clusters_venues_' + smoothing + '/' + str(NNN)

            for folder in (figfolder, curvefodler, vensfolder):
                if not os.path.exists(folder):
                    os.makedirs(folder)

            # Mid-sized clusters, largest first, at most NNN of them.
            cnt = [(c, len(n)) for (c, n) in cluster_dict.items()
                   if MINCSIZE < len(n) < MAXSIZE]
            cnt = sorted(cnt, key=lambda tup: tup[1], reverse=True)[:NNN]

            biggest = sum(size for _, size in cnt)
            top5cluster = [c for c, _ in cnt]

            # Only export when the kept clusters cover more than half of the
            # series.
            if biggest <= nnn / 2:
                continue

            fig, ax = plt.subplots(2, 5, figsize=(20, 8))
            ind = 0

            for ccc, nodes in cluster_dict.items():

                if ccc not in top5cluster:
                    continue

                subseries = [series_values[int(n)] for n in nodes]
                cluster_vens = [series_names[int(n)] for n in nodes]
                panel = ax[indicies[ind]]

                # Pool all points of the cluster for the binned average.
                ttt = []
                sss = []
                for values in subseries:
                    sss += list(values)
                    ttt += transform_ts(list(range(len(values))), 11)

                # Draw every member as a faint grey line.
                for values in subseries:
                    panel.plot(transform_ts(list(range(len(values))), 11),
                               values,
                               linewidth=0.4,
                               color='grey',
                               alpha=0.15)

                name_tail = (str(ind) + '_' + str(biggest) + '_venuesnum=' +
                             str(len(subseries)) + '.dat')

                with open(vensfolder + '/venues_in_' + name_tail, 'w') as ffout:
                    ffout.write('\n'.join(cluster_vens))

                panel.set_title('Number of venues = ' + str(len(subseries)),
                                fontsize=15)

                bx, by = getBinnedDistribution(ttt, sss, 8)
                # Midpoint of every pair of consecutive ``bx`` values.
                bx = (bx[1:] + bx[:-1]) / 2

                with open(curvefodler + '/avg_curve_' + name_tail, 'w') as fout:
                    fout.write('\t'.join([str(b) for b in bx]) + '\n')
                    fout.write('\t'.join([str(b) for b in by]) + '\n')
                panel.plot(bx, by, linewidth=3, color='r')

                ind += 1

            plt.savefig(figfolder + '/top_' + str(NNN) + '_clusters_' +
                        str(biggest) + '.png')
            plt.close()
Esempio n. 16
0
def cluster(time_series_set, name):
    """Add a named series to the stored data set and plot the DTW hierarchy.

    Every non-empty row of ``./static/cluster_data.csv`` is read as a name
    followed by the values of its series; each series is z-scored.  When
    ``name`` is not stored yet, the second element of every item in
    ``time_series_set`` is taken as its series, appended to the CSV file and
    added to the data set.  A ``HierarchicalTree`` is then fitted on all
    series and plotted to ``hierarchy.png``, labelled with the series names.

    Args:
        time_series_set: iterable of indexable items; ``item[1]`` is the value
            used for the new series.
        name: label of the new series.

    Raises:
        ValueError: if a value cannot be converted to ``float``.  For the new
            series this now happens before anything is written to the file.
    """
    path = "./static/cluster_data.csv"

    name_list = []
    series_list = []

    # newline='' is what the csv module asks for; the context manager closes
    # the handle before the file is reopened for appending below.
    with open(path, 'r', newline='') as f:
        for row in csv.reader(f):
            if not row:
                # Blank lines are yielded as empty rows; skip them.
                continue
            name_list.append(row[0])
            np_series = np.array([float(i) for i in row[1:]])
            series_list.append(stats.zscore(np_series))

    if name not in name_list:
        # time_series is the performance-metric series
        time_series = [row[1] for row in time_series_set]
        # Convert first, so a malformed value is rejected before it is stored.
        np_series = np.array([float(i) for i in time_series])

        with open(path, 'a', newline='') as f:
            csv.writer(f).writerow([name] + time_series)

        name_list.append(name)
        series_list.append(stats.zscore(np_series))

    # Only the tree-tracking model is plotted, so it is the only one fitted.
    # Custom Hierarchical clustering
    model1 = clustering.Hierarchical(dtw.distance_matrix_fast, {})
    # Augment Hierarchical object to keep track of the full tree
    model2 = clustering.HierarchicalTree(model1)
    model2.fit(series_list)

    def show_ts_label(idx):
        """Return the stored name of series number ``idx``."""
        return name_list[idx]

    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 10))
    model2.plot("hierarchy.png",
                axes=ax,
                show_ts_label=show_ts_label,
                show_tr_label=True,
                ts_label_margin=-10,
                ts_left_margin=10,
                ts_sample_length=1)