def decode(self, bit_msg, morse_format):
    if not bit_msg or '1' not in bit_msg:
        raise ValueError(f'Invalid bit sequence: "{bit_msg}"')
    len_set = {'0': set(), '1': set()}
    # Remove enclosing zeroes (transmission noise)
    bit_msg = bit_msg.strip('0')
    parsed = []
    prev = None
    count = 1
    for bit in bit_msg:
        if bit not in ('0', '1'):
            raise ValueError(f'Invalid bit: "{bit}"')
        if bit == prev:
            count += 1
            continue
        if prev is not None:  # skip the initial None element
            len_set[prev].add(count)
            parsed.append((prev, count))
        count = 1
        prev = bit
    # Process the last bit run
    len_set[bit].add(count)
    parsed.append((bit, count))
    # jenks_breaks expects a sequence, so the sets of run lengths are
    # converted to sorted lists before classification
    pulse_satisfies_jenks = len(len_set['1']) > 2
    if pulse_satisfies_jenks:
        pulse_breaks = jenkspy.jenks_breaks(sorted(len_set['1']), 2)
    else:
        pulse_breaks = [0] + sorted(len_set['1'])
    space_satisfies_jenks = len(len_set['0']) > 3
    if space_satisfies_jenks:
        space_breaks = jenkspy.jenks_breaks(sorted(len_set['0']), 3)
    else:
        space_breaks = sorted(len_set['0'])
    normalized = []
    for bit, count in parsed:
        if bit == '1':
            is_dot = count <= pulse_breaks[1]  # first break
            seq = morse_format.dot if is_dot else morse_format.dash
        else:
            if len(space_breaks) < 2:
                is_intra_char = True
            else:
                is_intra_char = count < space_breaks[1]  # first break
                is_inter_word = count >= space_breaks[2]  # second break
            if is_intra_char:
                seq = morse_format.intra_char
            elif is_inter_word:
                seq = morse_format.inter_word
            else:
                seq = morse_format.inter_char
        normalized.append(seq)
    return ''.join(normalized)
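# Usage sketch for decode (hypothetical -- the original MorseFormat type is
# not shown in this snippet, so a minimal stand-in is assumed):
# from collections import namedtuple
# MorseFormat = namedtuple(
#     'MorseFormat', ['dot', 'dash', 'intra_char', 'inter_char', 'inter_word'])
# fmt = MorseFormat('.', '-', '', ' ', ' / ')
# decoder.decode('10111010001110111', fmt)  # pulse/gap runs classified by length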
def goodness_of_variance_fit(array, classes):
    # get the break points
    classes = jenkspy.jenks_breaks(array, classes)
    # do the actual classification
    classified = np.array([classify(i, classes) for i in array])
    # max value of zones
    maxz = max(classified)
    # nested list of zone indices
    zone_indices = [
        [idx for idx, val in enumerate(classified) if zone + 1 == val]
        for zone in range(maxz)
    ]
    # sum of squared deviations from array mean
    sdam = np.sum((array - array.mean()) ** 2)
    # sorted polygon stats
    array_sort = [
        np.array([array[index] for index in zone]) for zone in zone_indices
    ]
    # sum of squared deviations about each class mean
    sdcm = sum(
        np.sum((zone_values - zone_values.mean()) ** 2)
        for zone_values in array_sort
    )
    # goodness of variance fit
    gvf = (sdam - sdcm) / sdam
    return gvf
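# The snippet above relies on a `classify` helper that is not defined here.
# A minimal stand-in (hypothetical; assumes `breaks` includes both the lower
# and upper data bounds, as jenks_breaks returns):
def classify(value, breaks):
    # Return the 1-based index of the first class whose upper break
    # contains the value.
    for i in range(1, len(breaks)):
        if value <= breaks[i]:
            return i
    return len(breaks) - 1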
def optimal_jenk(image, threshold, gvf=0.0, nclasses=2):
    """
    Find GVF at incrementing class numbers until the threshold GVF is reached.

    Parameters
    ----------
    image: np.array
        shape = (x, x).ravel(), unrolled input image from a spatial slice of
        the swe cube
    threshold: float
        cutoff value for how high to optimize gvf
    gvf: float
        initial gvf value (default = 0.0)
    nclasses: int
        how many classes to start with

    Returns
    -------
    [nclasses, bounds]
        nclasses: int
            optimal number of classes
        bounds: list
            generated Jenks boundaries for the given image
    """
    while gvf < threshold:
        # `govf` was undefined in the original; goodness_of_variance_fit
        # (defined above) is the function being iterated
        gvf = goodness_of_variance_fit(image, nclasses)
        nclasses += 1
    bounds = jenkspy.jenks_breaks(image, nclasses)
    return nclasses, bounds
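# Usage sketch for optimal_jenk (synthetic data, hypothetical; assumes numpy
# and jenkspy are imported and the helpers above are in scope):
# import numpy as np
# img = np.random.gamma(shape=2.0, scale=10.0, size=(32, 32))
# nclasses, bounds = optimal_jenk(img.ravel(), threshold=0.8)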
def emperical_histogram(observes, k):
    # observes is a list of observations
    # k is the number of bins
    # here we use Jenks natural breaks optimization to select the bins
    observes.sort()
    breaks = jenkspy.jenks_breaks(observes, nb_class=k)
    print(observes)
    print(breaks)
    omega = []
    phi = []
    curr_sum = 0
    curr_count = 0
    brk_idx = 1
    curr_break = breaks[brk_idx]
    N = len(observes)
    for v in observes:
        # print(v, curr_break)
        if v < curr_break:
            curr_sum += v
            curr_count += 1
        else:
            omega.append(curr_sum / curr_count)
            phi.append(curr_count / N)
            curr_sum = v
            curr_count = 1
            if brk_idx < k:
                brk_idx += 1
                curr_break = breaks[brk_idx]
    # the final bin is never flushed inside the loop, so append it here
    if curr_count > 0:
        omega.append(curr_sum / curr_count)
        phi.append(curr_count / N)
    print("Precision of the phi values: " + str(abs(1 - sum(phi))))
    return multinomial(omega, phi)
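# `multinomial` is not defined in this snippet. One plausible minimal
# stand-in (hypothetical) pairs the per-bin means with the per-bin
# probability masses:
from collections import namedtuple

multinomial = namedtuple("multinomial", ["omega", "phi"])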
def cluster_data(confirmed, clusters_config=None):
    if len(confirmed) == 0:
        return {"data": [], "clusters": []}
    if clusters_config is None:
        clusters_config = {"clusters": CLUSTERS, "labels": CLUSTERS_LABELS}
    df = pd.DataFrame(confirmed)
    df = df.dropna(how="any", axis=0, subset=["confirmed"])
    breaks = jenkspy.jenks_breaks(df["confirmed"],
                                  nb_class=clusters_config["clusters"])
    # round to 3 significant figures; `sigfigs` is a keyword of sigfig.round
    # (the builtin round has no such parameter)
    rounded_breaks = list(map(lambda limit: round(limit, sigfigs=3), breaks))
    df["group"] = pd.cut(
        df["confirmed"],
        bins=breaks,
        labels=clusters_config["labels"],
        include_lowest=True,
    )
    df = df.where(pd.notnull(df), None)  # convert NaN to None
    # add 1 to all lower limits (except 0) so the cluster ranges do not overlap
    non_inclusive_lower_limits = list(
        map(lambda limit: 0 if limit == 0 else limit + 1, rounded_breaks))
    return {
        "data": df.to_dict("records"),
        "clusters": list(zip(non_inclusive_lower_limits, rounded_breaks[1:])),
    }
def assess(df, column):
    import jenkspy
    import pandas as pd
    if len(df[column]) > 6:
        breaks = jenkspy.jenks_breaks(df[column], 5)
        df['p'] = 0
        df.loc[df[column] <= breaks[1], 'p'] = 0.75
        df.loc[(df[column] > breaks[1]) & (df[column] <= breaks[2]), 'p'] = 0.25
        df.loc[(df[column] > breaks[2]) & (df[column] <= breaks[3]), 'p'] = 0.20
        df.loc[(df[column] > breaks[3]) & (df[column] <= breaks[4]), 'p'] = 0.15
        df.loc[(df[column] > breaks[4]) & (df[column] <= breaks[5]), 'p'] = 0.10
    else:
        sum_df = df[column].sum()
        df['prop'] = df[column] / sum_df
        df['p'] = 0
        # the thresholds below are fractions, so they are compared against the
        # computed proportions rather than the raw column values
        df.loc[df['prop'] <= 0.1, 'p'] = 0.75
        df.loc[(df['prop'] > 0.1) & (df['prop'] <= (0.8 / 3) + 0.1), 'p'] = 0.25
        df.loc[(df['prop'] > (0.8 / 3) + 0.1) &
               (df['prop'] <= (0.8 / 3) * 2 + 0.1), 'p'] = 0.20
        df.loc[(df['prop'] > (0.8 / 3) * 2 + 0.1) &
               (df['prop'] <= 0.9), 'p'] = 0.15
        df.loc[df['prop'] > 0.9, 'p'] = 0.10
    return df['p']
def jenks_clustering(Ms, clique_start_nr=1, n_clusters=2):
    '''
    Return the clusters.

    Parameters
    ----------
    Ms : pd.Series
        1D array to be clustered
    clique_start_nr : int
        Number from which to start the clique numbering
    n_clusters : int
        Number of clusters to cluster the data into

    Returns
    -------
    result : pd.Series
        Series containing which cluster number each of the vertices
        belongs to (result[vertex] = cluster_nr)
    '''
    if n_clusters > 1:
        breaks = np.unique(
            jenkspy.jenks_breaks(list(Ms), nb_class=n_clusters)
        )
    else:
        breaks = np.array([min(Ms), max(Ms)])
    labels_list = [int(i) + int(clique_start_nr)
                   for i in range(len(breaks) - 1)]
    return pd.cut(Ms, bins=breaks, labels=labels_list, include_lowest=True)
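# Usage sketch for jenks_clustering (synthetic data, hypothetical; assumes
# numpy, pandas and jenkspy are imported as elsewhere in this module):
# Ms = pd.Series([0.10, 0.15, 0.20, 0.80, 0.85, 0.90])
# jenks_clustering(Ms, clique_start_nr=1, n_clusters=2)
# # low values land in clique 1, high values in clique 2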
def Jenks_algorithm_to_bin_in_3_classes(y_train, y_test):
    """
    Implements the Jenks algorithm to cluster the continuous target variable.
    This algorithm allows us to find natural breaks in a 1D array. Three
    classes are selected, aiming to represent a low-value pass, a
    medium-value pass and a high-value pass.

    Parameters
    -----------
    y_train: the continuous target array of the training set
    y_test: the continuous target array of the test set

    Returns
    -----------
    y_train_binned, y_test_binned: the target arrays of the training and
    test sets, each with 3 distinct classes
    """
    breaks = jenkspy.jenks_breaks(y_train, nb_class=3)
    y_train_binned = pd.cut(y_train, bins=breaks, labels=[0, 1, 2],
                            include_lowest=True)
    # use inclusive upper bounds so values exactly on a break are binned
    # instead of being left at their original continuous value
    y_test_binned = y_test.copy()
    y_test_binned.loc[y_test <= breaks[1]] = 0
    y_test_binned.loc[(y_test > breaks[1]) & (y_test <= breaks[2])] = 1
    y_test_binned.loc[y_test > breaks[2]] = 2
    return y_train_binned, y_test_binned
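# Usage sketch (synthetic data, hypothetical; assumes pandas, numpy and
# jenkspy are imported at module level):
# y_train = pd.Series(np.random.exponential(10.0, size=500))
# y_test = pd.Series(np.random.exponential(10.0, size=100))
# y_train_binned, y_test_binned = Jenks_algorithm_to_bin_in_3_classes(y_train, y_test)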
def define_levels(self, nb_class, disc_func):
    zi = self.zi
    _min = np.nanmin(zi)
    if not nb_class:
        # nb_class = int(get_opt_nb_class(len(zi)) - 2)
        nb_class = 8
    if not disc_func or "prog_geom" in disc_func:
        levels = [_min] + [
            np.nanmax(zi) / i for i in range(1, nb_class + 1)][::-1]
    elif "equal_interval" in disc_func:
        _bin = np.nanmax(zi) / nb_class
        levels = [_min] + [_bin * i for i in range(1, nb_class + 1)]
    elif "percentiles" in disc_func:
        levels = np.percentile(
            np.concatenate((zi[zi.nonzero()], np.array([_min]))),
            np.linspace(0.0, 100.0, nb_class + 1))
    elif "jenks" in disc_func:
        levels = list(
            jenks_breaks(np.concatenate(([_min], zi[zi.nonzero()])),
                         nb_class))
        levels[0] = levels[0] - _min * 0.01
    elif "head_tail" in disc_func:
        levels = head_tail_breaks(
            np.concatenate(([_min], zi[zi.nonzero()])))
    elif "maximal_breaks" in disc_func:
        levels = maximal_breaks(
            np.concatenate(([_min], zi[zi.nonzero()])), nb_class)
    else:
        raise ValueError(
            'Unknown discretization function: {}'.format(disc_func))
    return levels
def goodness_of_variance_fit(self, array, classes):
    """Cluster one-dimensional data with Jenks natural breaks.

    :param array: type: array. One-dimensional array of values.
    :param classes: type: int. Number of classes to cluster into.
    :return: gvf: (The Goodness of Variance Fit) type: float. The larger
        the value, the better the classification.
        classes: the class break intervals.
    """
    # get the break points
    classes = jenks_breaks(array, classes)
    classified = np.array([self.classify(i, classes) for i in array])
    # largest zone index
    maxz = max(classified)
    zone_indices = [
        [idx for idx, val in enumerate(classified) if zone + 1 == val]
        for zone in range(maxz)
    ]
    # sum of squared deviations from the array mean
    sdam = np.sum((array - array.mean()) ** 2)
    array_sort = [
        np.array([array[index] for index in zone]) for zone in zone_indices
    ]
    # sum of squared deviations about each class mean
    sdcm = sum(
        np.sum((zone_values - zone_values.mean()) ** 2)
        for zone_values in array_sort
    )
    gvf = (sdam - sdcm) / sdam
    return gvf, classes
def ver_poligonos(request):
    poligonos_para_analizar = Poligono.objects.all()
    poligonos_clasificados = {'ide': [], 'pob': [], 'clase': []}
    for row in poligonos_para_analizar:
        poligonos_clasificados['ide'].append(row.id)
        poligonos_clasificados['pob'].append(row.poblacion)
    categorias = jenkspy.jenks_breaks(poligonos_clasificados['pob'],
                                      nb_class=3)
    print(categorias)
    print(len(poligonos_clasificados['ide']))
    for i in range(len(poligonos_clasificados['ide'])):
        if poligonos_clasificados['pob'][i] <= categorias[1]:
            poligonos_clasificados['clase'].append('clase1')
        elif poligonos_clasificados['pob'][i] <= categorias[2]:
            poligonos_clasificados['clase'].append('clase2')
        elif poligonos_clasificados['pob'][i] <= categorias[3]:
            poligonos_clasificados['clase'].append('clase3')
    # ------------------------------------------------------------------------
    poligonos_py = Poligono.objects.all()
    poligonos_json = serialize('geojson', poligonos_py)
    poligonos = json.loads(poligonos_json)
    return render(request, 'mapa/ver_poligonos.html', {
        'poligonos': poligonos,
        'poligonos_clasificados': poligonos_clasificados
    })
def get_split_point(values, classes, mode, result_decimal):
    rets = []
    if mode == 'equal_count':  # equal count (quantiles)
        df = pd.DataFrame(np.array(values), columns=['data'])
        for i in range(classes):
            p = (i + 1) / classes
            # pass keywords explicitly; the third positional argument of
            # DataFrame.quantile is numeric_only, not interpolation
            rets.append(df.quantile(p, axis=0,
                                    interpolation='linear')['data'])
    elif mode == 'equal_interval':  # equal intervals
        value_interval = (max(values) - min(values)) / classes
        for i in range(classes):
            rets.append(min(values) + value_interval * (i + 1))
    elif mode == 'nature_breaks':  # natural breaks
        rets = jenkspy.jenks_breaks(values, classes)
        rets.pop(0)
    elif mode == 'standard_deviation':  # standard deviation
        avg_value = sum(values) / len(values)
        p = pd.Series(values).std()
        delta = (math.ceil(classes / 2) - 1) * 0.5
        for i in range(classes - 1):
            rets.append(avg_value + p * (i * 0.5 - delta))
        rets.append(max(values))
    return [round(x, int(result_decimal)) for x in rets]
def mask_ocean_winter(swe_matrix, day=0, nclasses=3):
    """
    Use a winter day to mask ocean pixels out of coastal imagery in the arctic.

    There is a clear difference between winter land pixels and ocean pixels
    that classification can sort out for us using a simple jenks
    classification. Data should have already moved through "vector_clean"
    and "apply_filter".

    Parameters
    ----------
    swe_matrix: np.array
        swe time cube
    day: int
        julian date of time series to use for classification (should be winter)
    nclasses: int
        number of classes to use in jenks classification, defaults to 3
    """
    winter_day = swe_matrix[day, :, :]
    breaks_jenk = jenkspy.jenks_breaks(winter_day.ravel(), nclasses)
    # jenks_breaks returns break values, not per-pixel labels, so digitize
    # the image first and then mask the lowest (ocean) class
    classes = np.digitize(winter_day, breaks_jenk)
    mask = classes == 1
    winter_day[mask] = -8888
    matrix_mask = np.zeros(swe_matrix.shape, dtype=bool)
    matrix_mask[:, :, :] = winter_day[np.newaxis, :, :] == -8888
    swe_matrix[matrix_mask] = -8888
    return swe_matrix
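# Usage sketch for mask_ocean_winter (synthetic cube, hypothetical; assumes
# numpy and jenkspy are imported at module level):
# swe_cube = np.random.rand(365, 64, 64) * 100
# masked_cube = mask_ocean_winter(swe_cube, day=10, nclasses=3)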
def get_breaks(atoms, n_layers):
    atom_zs = np.array([atom.z for atom in atoms])
    try:
        breaks = jenkspy.jenks_breaks(atom_zs, nb_class=n_layers)
    except ValueError:
        breaks = atom_zs
    return get_unique_list(sorted(breaks))
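# `get_unique_list` is defined elsewhere in the original source; a minimal
# order-preserving stand-in (hypothetical) would be:
def get_unique_list(seq):
    # Deduplicate while keeping the first occurrence of each element.
    seen = set()
    return [x for x in seq if not (x in seen or seen.add(x))]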
def decision(lista):
    import jenkspy as jk
    lista_sem_na = lista[~np.isnan(lista)]
    if len(lista_sem_na) < 4:
        quebra = max(lista_sem_na)
    else:
        quebra = jk.jenks_breaks(lista_sem_na, nb_class=3)[-2]
    return quebra
def jenks_break(act_score_map, num):
    values = []
    act_score_map = sorted(act_score_map.items(), key=lambda x: x[1],
                           reverse=True)
    for _, val in act_score_map:
        values.append(val)
    return jenkspy.jenks_breaks(values, nb_class=num)[1:]
import sys

import jenkspy


def natural_banding():
    fn = sys.argv[1]
    num_of_breaks = int(sys.argv[2])
    input_list = []
    with open(fn, 'r') as f:
        for line in f:
            input_list.append(float(line))
    print(jenkspy.jenks_breaks(input_list, num_of_breaks))
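# Command-line usage sketch (hypothetical file name):
#   python natural_banding.py values.txt 5
# where values.txt holds one numeric value per line.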
def jenks_discretize(values, number_of_bins=None):
    import jenkspy
    import math
    if number_of_bins is None:
        number_of_bins = min(len(set(values)),
                             round(math.sqrt(len(values))))
    # print(number_of_bins)
    breaks = jenkspy.jenks_breaks(values, int(number_of_bins))
    values_in_bins = [classify(value, breaks) for value in values]
    return values_in_bins
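# Usage sketch for jenks_discretize (assumes a `classify` helper like the
# one sketched earlier in this file is in scope):
# jenks_discretize([1.0, 1.1, 5.0, 5.2, 9.9], number_of_bins=3)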
def _discretize_continuous(ar, func=None, num_bins=None):
    '''Assign a continuous vector to bins; available functions:
    - quantile, uniform, kmeans
      (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html)
    - jenks
    '''
    if func is None:  # no discretization is needed
        return ar
    if num_bins is None:
        # by default, num_bins = cube_root(n) or #unique(n), whichever is smaller
        num_bins = min(math.floor(len(ar) ** (1. / 3.)), len(set(ar)))
    elif num_bins == 0:
        raise ValueError('# bins should be > 0')
    else:
        num_bins = min(num_bins, len(set(ar)))
    if func in ['quantile', 'uniform', 'kmeans']:
        warnings.simplefilter('ignore')  # ignore nan warning
        # remove NaNs temporarily since KBinsDiscretizer currently doesn't handle NaNs
        temp_ar = np.array([x for x in ar if x == x])
        nans = np.array([not x == x for x in ar])
        discretizer = KBinsDiscretizer(n_bins=num_bins, encode='ordinal',
                                       strategy=func)
        temp_discretized_result = discretizer.fit_transform(
            temp_ar.reshape(len(temp_ar), 1)).reshape(len(temp_ar))
        # assign NaNs to a separate bin
        discretized_result = np.zeros(len(ar))
        discretized_result[~nans] = temp_discretized_result
        discretized_result[nans] = np.nan
        warnings.resetwarnings()
    elif func == 'jenks':
        warnings.simplefilter('ignore')  # ignore nan warning
        breaks = jenkspy.jenks_breaks(ar, nb_class=num_bins)
        # widen the last upper bound so the maximum value falls inside the
        # final half-open bin below
        breaks[-1] += 1
        discretized_result = []
        for val in ar:
            if np.isnan(val):
                discretized_result.append(np.nan)
                continue
            for bound_i in range(1, len(breaks)):
                if val >= breaks[bound_i - 1] and val < breaks[bound_i]:
                    discretized_result.append(bound_i - 1)
                    break
        discretized_result = np.array(discretized_result, dtype=float)
        warnings.resetwarnings()
    else:
        raise ValueError('Discretization function not available...')
    # tidy up data in case some bins are empty
    counter = 0
    for i in range(int(np.nanmax(discretized_result) + 1)):
        if i not in discretized_result:
            continue
        discretized_result[discretized_result == i] = counter
        counter += 1
    # assign missing values to a separate bin
    discretized_result[np.isnan(discretized_result)] = \
        np.nanmax(discretized_result) + 1
    return discretized_result
def test_errors(self):
    # Using a wrong 'nb_class' argument:
    with self.assertRaises(ValueError):
        jenks_breaks([1, 2, 3, 4], 32)
    with self.assertRaises(ValueError):
        jenks_breaks(self.data2, -5)
    # Using a wrong 'values' argument:
    with self.assertRaises(TypeError):
        jenks_breaks("a sequence of characters", 4)
    with self.assertRaises(TypeError):
        jenks_breaks(['a', 'b', 'c', 'd'], 3)
def recovery_rate_for_all(df, dgf):
    # ------------------------Find ruptures in time------------------------
    # temp_arr = {}
    # for cell_id in dgf.id.unique():
    #     print("******************************")
    #     print("ID = {}".format(cell_id))
    #     dchange = dgf[(dgf.id == cell_id)][["date_c", "rad_corr"]].drop_duplicates()
    #     dates_rpt = changepoint_detection_singlecell(dchange, cell_id,
    #                                                  penalty=15, create_plot=False)
    #     print("dates = {}".format(dates_rpt))
    #     temp_arr[cell_id] = dates_rpt
    # code.interact(local=locals())

    # -----------------Create baseline for recovery tracking-----------------
    di = pd.read_hdf("yemen_groups.h5", key="zeal")
    dbase = df[(df.date_c < "2015-03-26")].groupby(
        ["id", "Latitude", "Longitude"]).mean()[["rad_corr"]].reset_index()
    dbase = dbase.rename(columns={"rad_corr": "rad_base"})
    dbase = pd.merge(dbase, di, left_on=["id"], right_on=["id"], how="left")
    # base_mean = dbase.rad_base.values[0]

    # ---------------------- PICKLED ALREADY----------------
    # dm = df[(df.date_c >= "2016-06-04")]
    # dm = dm.groupby(["id", pd.Grouper(freq="1M", key="date_c")]).mean()[["rad_corr"]].reset_index()
    # dm = dm.rename(columns={"rad_corr": "rad_month"})
    # temp_dict = {}
    # for cell_id in df.id.unique():
    #     print("******************************")
    #     print("ID = {}".format(cell_id))
    #     dm_rec = dm[(dm.id == cell_id)].reset_index().drop(columns=["index"])
    #     rate_of_recovery = run_OLS(dm_rec, column=["rad_month"])
    #     temp_dict[cell_id] = rate_of_recovery
    #     print("ROR = {}".format(rate_of_recovery))

    dr = pd.read_hdf("id_ror.h5", key="zeal")
    dr = pd.merge(dr[["id", "ror"]].drop_duplicates(), dbase,
                  left_on=["id"], right_on=["id"], how="left")
    # min-max normalization of the recovery rate
    dr["ror_norm"] = (dr["ror"] - min(dr["ror"])) / \
                     (max(dr["ror"]) - min(dr["ror"]))
    breaks = jenkspy.jenks_breaks(dr.ror.values, nb_class=3)
    dr["ror_group"] = dr.ror.apply(
        lambda x: "Low" if breaks[0] <= x < breaks[1]
        else "Medium" if breaks[1] <= x < breaks[2]
        else "High")
    dr_gdf = create_geodataframe(dr, buffered=True, radius=462, cap_style=3)

    # ---------create plots---------------------------------------------
    # fig, ax1 = plt.subplots(figsize=(4, 5))
    # plot_geospatial_heatmap_with_event_locs(
    #     geo_df=dr_gdf, col_name="ror_norm", events_data=None, title=None,
    #     cmap=cm.seismic, cmap_type="seismic", marker_color=None,
    #     events_data_type="locations_points", needs_colormapping=False,
    #     add_title=False, event_locs_included=False, include_colorbar=True,
    #     with_streetmap=True, ax=ax1)
    # plt.rc('font', size=14)
    # plt.tight_layout()
    # plt.show()
    code.interact(local=locals())
    return None
def goodness_of_variance_fit(array, classes):
    classes = jenkspy.jenks_breaks(array, nb_class=classes)
    classified = np.array([classify(i, classes) for i in array])
    maxz = np.amax(classified)
    zone_indices = [
        [idx for idx, val in enumerate(classified) if zone + 1 == val]
        for zone in range(maxz)
    ]
    sdam = np.sum((array - array.mean()) ** 2)
    array_sort = [np.array([array[index] for index in zone])
                  for zone in zone_indices]
    sdcm = np.sum([np.sum((cla - cla.mean()) ** 2) for cla in array_sort])
    gvf = (sdam - sdcm) / sdam
    return gvf
def test_json_ref(self):
    # Test it against break values computed using another library
    # implementing jenks natural breaks:
    res = jenks_breaks(self.data1, 5)
    self.assertEqual(len(self.res1), len(res))
    for break_values in zip(res, self.res1):
        self.assertAlmostEqual(break_values[0], break_values[1], places=6)
    # Test the result is the same using a python array as input:
    res_py_array = jenks_breaks(array('d', self.data1), 5)
    self.assertEqual(len(self.res1), len(res_py_array))
    for break_values in zip(res_py_array, self.res1):
        self.assertAlmostEqual(break_values[0], break_values[1], places=6)
    # Test the result is the same using a numpy array as input:
    if np:
        data_np = np.array(self.data1)
        res_np = jenks_breaks(data_np, 5)
        self.assertEqual(res_np, res)
def calculateJenks(auth, args, es):
    # Build the view string from arguments
    view = "dsra_{eq_scenario}_{retrofit_prefix}_{dbview}".format(**{
        'eq_scenario': args.eqScenario,
        'retrofit_prefix': args.retrofitPrefix,
        'dbview': args.dbview})
    response = Search(using=es, index=view)
    # Create a dataframe containing the full series of values from the
    # specified view and field
    df = pd.DataFrame(
        [getattr(hit.properties, args.field) for hit in response.scan()],
        columns=[args.field])
    # Use jenkspy to create natural breaks
    breaks = jenkspy.jenks_breaks(df[args.field], nb_class=args.bins)
    return breaks
def plot_jenks(image, gvt, interactive=False):
    """
    Given a swe image, classify using jenks classification. Uses goodness of
    variance fit to optimize the number of classes given a threshold to
    maximize to.

    Parameters
    ----------
    image: np.array
        2-d image of swe values to be classified
    gvt: float
        goodness of variance threshold value. Optimize gvf until this value
        is reached. Values between 0-1, generally around 0.8
    """
    list_colors = [
        "blue",
        "green",
        "orange",
        "magenta",
        "cyan",
        "gray",
        "red",
        "yellow",
    ]
    classes_jenk = jenkspy.jenks_breaks(
        image.ravel(), optimal_jenk(image.ravel(), gvt)[0]
    )
    classes = np.digitize(image, classes_jenk)
    # n break values delimit n - 1 classes
    nclasses = len(classes_jenk) - 1
    fig, ax = plt.subplots(1, 1, figsize=(14, 5))
    xlabel = str(nclasses) + " Classes, Jenks Classification"
    ax.set_title("Jenks Classification with " + str(nclasses) + " Classes")
    ax.set_xlabel(xlabel)
    bounds = range(0, nclasses + 1)
    cmap = c.ListedColormap(list_colors[0:nclasses])
    kmp = ax.imshow(
        classes,
        interpolation="nearest",
        aspect="auto",
        cmap=cmap,
        origin="lower",
    )
    plt.colorbar(kmp, cmap=cmap, ticks=bounds, ax=ax, orientation="vertical")
    if interactive:
        plt.ion()
        plt.show()
        plt.pause(0.001)
        plt.close()
    else:
        plt.show()
    return fig
def create_weights(self):
    avg_prices = self.df.groupby(["brand"]).mean()["price"]
    prices_df = pd.DataFrame(data={"avg_prices": avg_prices})
    prices_df = prices_df.sort_values("avg_prices")
    breaks = jenkspy.jenks_breaks(avg_prices, nb_class=28)
    mapping_dict = {}
    for brand, price in zip(prices_df.index, prices_df["avg_prices"]):
        mapping_dict[brand] = bisect.bisect_left(breaks, price) + 1
    self.df["brand_weight"] = self.df["brand"].map(mapping_dict)
    brands_df = {}
    for brand in self.df["brand"]:
        brands_df[brand], _ = [
            x for _, x in self.df.groupby(self.df['brand'] != brand)
        ]
    mapping_dict = {}
    for brand, dataframe in brands_df.items():
        sub_brand_averages = pd.DataFrame(data={
            "sub_brand_averages":
                dataframe.groupby(["sub-brand"]).mean()["price"]
        })
        sub_brand_breaks = jenkspy.jenks_breaks(
            sub_brand_averages["sub_brand_averages"],
            nb_class=int(len(sub_brand_averages) - 1))
        for sub_brand, weight in zip(
                sub_brand_averages.index,
                sub_brand_averages["sub_brand_averages"]):
            mapping_dict[sub_brand] = bisect.bisect_left(
                sub_brand_breaks, weight) + 1
    self.df["sub_brand_weight"] = self.df["sub-brand"].map(mapping_dict)
    self.df = self.df.drop(1095).dropna()
    self.df = self.df.drop(1728).dropna()
    self.df["no. of cylinders"] = self.df["no. of cylinders"].astype(float)
def _get_clfs_weights(self):
    gu = self.global_utilities
    if self.jenks:
        self.natural_breaks = jenkspy.jenks_breaks(gu, nb_class=5)
        gu = [
            i if i >= self.natural_breaks[-self.jenks_limit] else 0
            for i in gu
        ]
    gu_sum = sum(gu)
    for value in gu:
        self.weights.append(value / gu_sum)
def breaks(self):
    # todo: handle custom bucket counts
    if not self._breaks:
        values = [
            feat['properties']['value']
            for feat in self.as_geojson()['features']
            if feat['properties']['value'] is not None
        ]
        self._breaks = jenks_breaks(values, nb_class=min(len(values), 6))
    return self._breaks
def __determine_knots(self, X: np.ndarray):
    """
    Determine the locations of the knots for every feature using Jenks
    Natural Breaks.

    Arguments:
        X (np.ndarray): The train input used to determine the location of
            the knots.
    """
    self.knots = []
    for column, num_curves in zip(X.T, self.num_curves):
        breaks = jenks_breaks(column, nb_class=num_curves)
        self.knots.append(breaks)
    # note: the transpose assumes every feature uses the same num_curves;
    # otherwise the rows are ragged and np.array(...).T fails
    self.knots = np.array(self.knots).T
def cluster_by_attention_weight(self):
    attention_weight = np.load("average_weight.npy")
    attention_weight_array = attention_weight.reshape(
        [-1, self._num_features])
    all_feature_breaks = []
    for nums in range(self._num_features):
        one_feature_breaks = jenkspy.jenks_breaks(
            attention_weight_array[:, nums], nb_class=5)
        print(one_feature_breaks)
        all_feature_breaks.append(one_feature_breaks)
    np.save("all_features_breaks_ave.npy", all_feature_breaks)