def split_superinstance(self, si, k):
    """Split super-instance ``si`` into child super-instances with k-Shape.

    The rows of ``self.data`` selected by ``si.indices`` are clustered with
    ``kshape``. Every resulting cluster that contains at least one index
    from ``self.train_indices`` becomes a new ``SuperInstance_kShape`` whose
    parent is ``si``. Clusters without any training index are not kept as
    separate super-instances; their indices are appended to the child whose
    SBD centroid is closest (as measured by ``_sbd``) to their own centroid.

    Parameters
    ----------
    si
        The super-instance to split; its ``indices`` are read and its
        ``children`` attribute is overwritten with the result.
    k
        Number of clusters requested from ``kshape``.

    Returns
    -------
    list
        The newly created child super-instances (also stored in
        ``si.children``).

    Raises
    ------
    ValueError
        If a cluster without training instances has to be merged but no
        child super-instance is available to merge it into.
    """
    # the actual splitting
    pred = kshape(self.data[si.indices, :], k)

    # Built once so that every membership test below is O(1) instead of a
    # scan over self.train_indices (assumes the indices are hashable, e.g.
    # plain or numpy integers).
    train_index_set = set(self.train_indices)

    # making sure that all of the super-instances contain at least one
    # training instance; super-instances without training instance are merged
    # with the closest one that does contain a training instance
    training = []
    no_training = []
    for new_si_centroid, new_si_idx in pred:
        # go from super instance indices to global ones
        cur_indices = [si.indices[idx] for idx in new_si_idx]

        if any(x in train_index_set for x in cur_indices):
            training.append(
                SuperInstance_kShape(
                    self.data,
                    cur_indices,
                    self.train_indices,
                    new_si_centroid,
                    si,
                )
            )
        else:
            no_training.append((cur_indices, new_si_centroid))

    for indices, centroid in no_training:
        # sets of indices without a training point are merged with their
        # closest super-instance; closeness is based on the SBD centroid
        closest_train = None
        closest_train_dist = np.inf
        for training_si in training:
            cur_dist, _ = _sbd(training_si.sbd_centroid, centroid)
            # strict '<' keeps the first candidate on ties
            if cur_dist < closest_train_dist:
                closest_train_dist = cur_dist
                closest_train = training_si

        if closest_train is None:
            # Happens when no cluster contained a training instance (or no
            # finite distance could be computed); fail with a clear message
            # instead of an AttributeError on None.
            raise ValueError(
                "cannot merge a cluster without training instances: "
                "no super-instance containing a training instance is available"
            )

        closest_train.indices.extend(indices)

    si.children = training
    return training
def run():
    """Parse the command line, build the input matrix and time one k-Shape run.

    The input is either a random matrix of shape
    ``(args.number, args.length)`` or the vertically stacked train/valid/test
    splits returned by ``load_time_series.load_data``. In the latter case
    the number of clusters is replaced by the number of distinct training
    labels, overriding ``args.clusters``. The data is cast to ``args.type``
    and clustered on the selected device/framework; the elapsed wall-clock
    time is printed, and the result too when ``args.print`` is set.

    The process exits with status 1 when the data type is invalid, when the
    GPU is requested but CUDA is unavailable, or when the device or
    framework is not recognised.
    """
    args = parser.parse_args(sys.argv[1:])

    print("number of clusters: ", args.clusters)
    print(time_series_number, ": ", args.number)
    print(time_series_length, ": ", args.length)
    print(framework_help, ": ", args.framework)
    print("selected device: ", args.device)
    print("selected data type: ", args.type)
    print("selected source of data: ", args.sourcedata)

    clusters = args.clusters
    if args.sourcedata == "random":
        x = np.random.rand(args.number, args.length)
    else:
        datasets = load_time_series.load_data(args.sourcedata)
        train_set_x, train_set_y = datasets[0]
        valid_set_x, valid_set_y = datasets[1]
        test_set_x, test_set_y = datasets[2]
        x = np.vstack((train_set_x, valid_set_x, test_set_x))
        print("loaded ", x.shape[0], " data points of length ", x.shape[1])
        # For labelled data the cluster count comes from the training labels.
        clusters = len(np.unique(train_set_y))
        print("clusters: ", clusters)

    try:
        x = x.astype(dtype=args.type, copy=False)
    except TypeError as err:
        print(err)
        print("Error: ", datatype_help, " got: ", args.type)
        sys.stdout.flush()
        # sys.exit rather than the site-provided exit(), which is meant for
        # the interactive shell and is not guaranteed to exist.
        sys.exit(1)

    if args.device == gpu:
        if torch.cuda.is_available():
            print("CUDA is available via PyTorch")
        else:
            print(
                "CUDA is not available via PyTorch, please install cuda and libcudnn from NVIDIA"
            )
            sys.exit(1)

    result = None
    start = time.time()
    if args.device == gpu:
        result = core_gpu.kshape_gpu(x=x, k=clusters, device="cuda")
    elif args.device == cpu:
        if args.framework == torch_lib:
            result = core_gpu.kshape_gpu(x=x, k=clusters, device=cpu)
        elif args.framework == numpy_lib:
            result = core.kshape(x=x, k=clusters)
        else:
            # Previously an unknown framework fell through silently and the
            # script reported a timing for a run that never happened.
            print("Error: ", framework_help)
            sys.exit(1)
    else:
        print("Error: ", device_help)
        sys.exit(1)
    print("elapsed time, ", time.time() - start, ",sec")

    if args.print:
        print(result)
def _prepare_kshape(data: pd.DataFrame, cluster_num: int) -> List[Tuple]:
    """Cluster ``data`` into ``cluster_num`` groups by delegating to ``kshape``.

    Parameters
    ----------
    data: pd.DataFrame
        The series to cluster; passed to ``kshape`` unchanged.
    cluster_num: int
        The number of clusters requested from ``kshape``.

    Returns
    -------
    List[Tuple]
        Whatever ``kshape`` returns for these arguments, unmodified.
    """
    clustering = kshape(data, cluster_num)
    return clustering
from kshape.core import kshape, zscore

# Four short example series, one per row.
time_series = [
    [1, 2, 3, 4],
    [0, 1, 2, 3],
    [0, 1, 2, 3],
    [1, 2, 2, 3],
]

# Number of clusters requested from k-Shape.
cluster_num = 2

# Normalise along axis 1 (i.e. each row/series on its own), then cluster.
clusters = kshape(
    zscore(time_series, axis=1),
    cluster_num,
)
#def data_plotter(data):
if __name__ == '__main__':
    # NOTE(review): pickle.load can execute arbitrary code while loading;
    # only point this at files from a trusted source.
    with open(
            'C://Users//k_mathin//PycharmProjects//Masters//ciena_trials//Kamal//data//vodafone_data_clusters_filtered.pkl',
            'rb') as f:
        data_set = pickle.load(f)

    # Stack the individual series into one 2-D array (one series per row).
    data = np.asarray(list(data_set['data']))
    #data = data[:,:15]
    print(data.shape[0])

    # Encode the 'osid' labels as integers; the number of distinct labels is
    # used as the number of clusters.
    label_data = np.asarray(data_set['osid'])
    labels, levels = pd.factorize(label_data)
    shelves = np.asarray(data_set['shelf'])
    cluster_num = levels.shape[0]
    print(cluster_num)

    clusters = kshape(zscore(data, axis=1), cluster_num)
    #clusters = kshape(data,cluster_num)

    # Map every series index to the first cluster that lists it. The
    # original scanned every cluster for every index and used `continue`
    # where `break` was intended, so an index listed twice would have been
    # appended twice and misaligned y_pred with labels.
    assigned_cluster = {}
    for j in range(cluster_num):
        for member in clusters[j][1]:
            assigned_cluster.setdefault(member, j)
    y_pred = [
        assigned_cluster[i]
        for i in range(data.shape[0])
        if i in assigned_cluster
    ]

    # Compute the confusion matrix once and print that result.
    conf = conf_mat(labels, y_pred)
    print(conf)
    print("done")
def kshape_clusters(arr, cluster_num, ax=1):
    """Cluster ``arr`` with k-Shape and hand the clustering to ``apply_clusters``.

    ``arr`` is first passed through ``zscore`` with ``ax`` as its second
    argument, the result is clustered into ``cluster_num`` groups by
    ``kshape``, and ``apply_clusters`` is then called with that clustering
    and the original (un-normalised) ``arr``.

    Returns whatever ``apply_clusters`` returns.
    """
    # Imported lazily so the dependency is only needed when this is called.
    from kshape.core import kshape, zscore

    normalised = zscore(arr, ax)
    clustering = kshape(normalised, cluster_num)
    return apply_clusters(clustering, arr)
from kshape.core import kshape, zscore

# Four example series, one per row.
time_series = [
    [1, 2, 3, 4, 5],
    [0, 1, 2, 3, 4],
    [3, 2, 1, 0, -1],
    [1, 2, 2, 3, 3],
]
cluster_num = 2

# Each series (row) must be z-normalised on its own, hence axis=1. Without
# an explicit axis the values are normalised across the series at each time
# step instead, which is not the per-series normalisation k-Shape expects.
clusters = kshape(zscore(time_series, axis=1), cluster_num)
print(clusters)
#%%
causes = pd.read_pickle('data/causes.pkl')


def cluster_show(cluster_rep, cluster_id):
    """Plot the ' In' series of every event in one cluster on a single figure.

    Parameters
    ----------
    cluster_rep
        The cluster representative; currently unused (its plot call is
        commented out).
    cluster_id
        Positional row indices into ``causes`` for the cluster members.

    For each member the event id is looked up in ``causes``, an ``Event`` is
    built from it and its ' In' column is plotted. The legend shows the
    members' 'cause' values. Reads the module-level ``start`` and ``end``,
    which must already be defined when this is called
    (NOTE(review): in this cell order they are assigned only further down,
    so presumably an earlier cell sets them - confirm).
    """
    #plt.plot(cluster_rep)
    for i in cluster_id:
        ev = causes.iloc[i]['id']
        temp_In = list(Event(ev, start, end).data[' In'].values)
        plt.plot(temp_In)
    plt.legend(list(causes.iloc[cluster_id]['cause']))
    plt.show()


cluster_num = 6
clusters = kshape(zscore(I_ns, axis=1), cluster_num)
for i in range(cluster_num):
    # clusters[i] is a (representative, member indices) pair.
    print(causes.iloc[clusters[i][1]], '\n', '----------------------')
    cluster_show(clusters[i][0], clusters[i][1])

#%%
import statsmodels.api as sm

dta = sm.datasets.co2.load_pandas().data
# deal with missing values. see issue
# Assign the interpolated column back explicitly: a chained
# `dta.co2.interpolate(inplace=True)` operates on a temporary Series and
# does not update `dta` when pandas Copy-on-Write is active.
dta['co2'] = dta['co2'].interpolate()

# NOTE(review): `id` shadows the builtin of the same name; kept because
# other cells may read this variable.
id = whole_events[100]
start = 0
end = -1
e = Event(id, start, end)