def _draw_peer_rates():
    """Draw one random integer rate per node, in three value bands.

    Reads the module-level ``c1_peerId``, ``c3_peerId`` and ``ND`` names.
    The draw order (low band, high band, middle band) matches the original
    code so a seeded RNG yields the same sequence.

    :return: ``(low, mid, high, combined)`` where ``combined`` is the
        concatenation ``low + mid + high``.
    """
    low = np.random.randint(1, 5, len(c1_peerId))
    high = np.random.randint(75, 100, len(c3_peerId))
    mid_count = len(ND.get_allNodes()) - len(c3_peerId) - len(c1_peerId)
    mid = np.random.randint(30, 50, mid_count)
    return low, mid, high, np.concatenate((low, mid, high))


def gen_list_gvf(classes):
    """Generate random rates until their Jenks classification fits well.

    Random rates are drawn with ``_draw_peer_rates`` and scored with
    ``goodness_of_variance_fit``. While the score is below 0.5 a fresh set
    of rates is drawn and scored again. The loop is skipped entirely when
    the class count equals 2, in which case empty results are returned.

    :param classes: initial number of Jenks classes to try
    :return: ``(segs, arr, gvf)`` -- the Jenks result, the rate array it was
        computed from, and the goodness-of-variance-fit score of that array
    """
    gvf = 0.0
    nclasses = classes
    _, _, _, randar = _draw_peer_rates()
    print(np.sort(randar))

    segs = []
    arr = []
    while gvf < .5 and nclasses != 2:
        gvf = goodness_of_variance_fit(randar, nclasses)
        print("RandIntegers:" + str(np.sort(randar)))
        print("GVF: " + str(gvf))
        # print() with one argument behaves the same on Python 2 and 3.
        print(nclasses)
        segs = jenks(randar, nclasses)
        print(segs)
        arr = randar
        nclasses += 1
        if nclasses > 3:
            # Cap the class count at 3 and retry with new random rates.
            print("Randoming randar")
            nclasses = 3
            randar1, randar2, randar3, randar = _draw_peer_rates()
            print("New Randar" + str(randar))
            print("New Randar1" + str(randar1))
            print("New Randar2" + str(randar2))
            print("New Randar3" + str(randar3))
    return segs, arr, gvf
def calc_stops(vals, ramp, nodata=-99):
    """Pair data values with colors from a named ramp.

    Numeric data is split with ``jenks`` into 8 classes and each break is
    zipped with the 8-class ramp. Data whose first value cannot be converted
    with ``float`` is treated as categorical: each unique value gets a ramp
    color, and the ramp repeats when there are more values than colors.

    :param vals: iterable of values; entries equal to ``nodata`` are dropped
    :param ramp: key into the module-level ``colors`` mapping
    :param nodata: sentinel value to exclude from ``vals``
    :return: list of ``(value_or_break, color)`` tuples; empty when no values
        remain after filtering
    :raises KeyError: if ``ramp`` is not a key of ``colors`` or the ramp has
        no 8-class entry
    """
    vals = [v for v in vals if v != nodata]
    try:
        _ramps = colors[ramp]
    except KeyError:
        raise KeyError("ramp must be one of {}".format(', '.join(colors.keys())))

    n_classes = 8
    # Copy the ramp so the shared entry in ``colors`` is never modified; the
    # previous in-place ``+=`` doubled the stored ramp for all later calls.
    _colors = list(_ramps[n_classes])

    if not vals:
        # Nothing to classify (previously an IndexError on vals[0]).
        return []

    try:
        float(vals[0])
        _breaks = [float(x) for x in jenks(vals, n_classes)]
        _stops = list(zip(_breaks, _colors))
    except ValueError:
        uniq = list(set(vals))
        # NOTE(review): the original comment "Try for ramp of exact length"
        # suggests a lookup by len(uniq) was intended, but the code always
        # re-read the 8-class ramp; that behavior is kept -- confirm intent.
        # Repeat the ramp cyclically so every unique value gets a color.
        _cycled = [_colors[i % len(_colors)] for i in range(len(uniq))]
        _stops = list(zip(uniq, _cycled))
    return _stops
def fit_posteriors(self, document, desired_gvf=0.8):
    """
    Cluster the posteriors using Jenks Natural Breaks algorithm

    The number of classes starts at 1 and grows until the goodness of
    variance fit reaches ``desired_gvf``. The search is capped so it always
    terminates. The class count that met the threshold is the one passed to
    ``jenks``; previously the count was incremented once more before use.

    :param document: document in {problems, questions}
    :param desired_gvf: A number between [0, 1] showing goodness of fit
    :return: A list of the stored items whose posterior lies closest to the
        last value returned by ``jenks``; ``[]`` when the collection is
        empty; ``None`` when ``document`` is not a recognised name
    """
    if document == 'problems':
        cursor = self.db.problems.find()
    elif document == 'questions':
        cursor = self.db.questions.find()
    else:
        return

    items = list(cursor)
    posteriors = [float(item['posterior']) for item in items]
    if not posteriors:
        return []

    array = np.array(posteriors)

    # gvf denotes the goodness of fit, n the number of classes in Jenks
    n = 1
    max_classes = max(len(posteriors) - 1, 1)
    gvf = natural_break.gvf(array, n)
    while gvf < desired_gvf and n < max_classes:
        # Keep increasing n till gvf is at least the desired_gvf
        n += 1
        gvf = natural_break.gvf(array, n)

    breaks = jenks(array, n)
    last = len(breaks) - 1

    most_likely = list()
    for item, posterior in zip(items, posteriors):
        # min() returns the first minimum, so ties resolve to the lowest
        # index, the same as sorting (distance, index) pairs.
        nearest = min(range(len(breaks)),
                      key=lambda k: abs(posterior - breaks[k]))
        if nearest == last:
            most_likely.append(item)
    return most_likely
def goodness_of_variance_fit(array, classes):
    """Return the goodness of variance fit of a Jenks classification.

    The score is ``(SDAM - SDCM) / SDAM``: SDAM is the sum of squared
    deviations from the overall mean, SDCM the sum of squared deviations of
    each zone's members from that zone's mean.

    :param array: data to classify; must support ``.mean()`` and indexing
    :param classes: number of classes passed to ``jenks``
    :return: the goodness-of-variance-fit score
    """
    # Break points, then the zone label of every value.
    breaks = jenks(array, classes)
    labels = np.array([classify(value, breaks) for value in array])
    zone_count = max(labels)

    # Total sum of squared deviations from the array mean.
    total_ss = np.sum((array - array.mean()) ** 2)

    # Accumulate squared deviations inside each zone (zones are 1-based).
    within_ss = 0
    for zone in range(1, zone_count + 1):
        members = np.array(
            [array[pos] for pos, label in enumerate(labels) if label == zone])
        within_ss += np.sum((members - members.mean()) ** 2)

    return (total_ss - within_ss) / total_ss
def goodness_of_variance_fit(array, classes):
    '''Score how well a Jenks classification explains the data's variance.

    This and the next function were written by camdenl:
    https://stats.stackexchange.com/questions/143974/jenks-natural-breaks-in-python-how-to-find-the-optimum-number-of-breaks/144075

    :param array: data to classify; must support ``.mean()`` and indexing
    :param classes: number of classes passed to ``jenks``
    :return: ``(sdam - sdcm) / sdam``
    '''
    break_points = jenks(array, classes)
    zone_of = np.array([classify(item, break_points) for item in array])

    # Bucket the positions of the values by 1-based zone number.
    buckets = [[] for _ in range(max(zone_of))]
    for position, zone in enumerate(zone_of):
        if zone >= 1:
            buckets[zone - 1].append(position)

    # Squared deviations from the overall mean.
    sdam = np.sum((array - array.mean()) ** 2)

    # Squared deviations of every bucket from its own mean.
    per_zone = []
    for bucket in buckets:
        values = np.array([array[position] for position in bucket])
        per_zone.append(np.sum((values - values.mean()) ** 2))
    sdcm = sum(per_zone)

    return (sdam - sdcm) / sdam
def calc_breaks_natural(values, n_classes):
    """Return the Jenks break points of ``values`` as floats.

    :param values: data to classify; a falsy value (empty or ``None``)
        short-circuits to an empty result without calling ``jenks``
    :param n_classes: number of classes passed to ``jenks``
    :return: list of floats, one per value returned by ``jenks``
    """
    if not values:
        return []
    return [float(bp) for bp in jenks(values, n_classes)]
def calc_stops(vals, ramp, nodata=-99):
    """Build ``(value, color)`` stops for a data set from a named color ramp.

    If the first remaining value converts with ``float``, the data is split
    with ``jenks`` into 8 classes and the breaks are zipped with the 8-class
    ramp. Otherwise the data is treated as categorical and every unique
    value is paired with a ramp color, cycling through the ramp as needed.

    :param vals: iterable of values; entries equal to ``nodata`` are dropped
    :param ramp: key into the module-level ``colors`` mapping
    :param nodata: sentinel value to exclude from ``vals``
    :return: list of ``(value_or_break, color)`` tuples; empty when no values
        remain after filtering
    :raises KeyError: if ``ramp`` is not a key of ``colors`` or the ramp has
        no 8-class entry
    """
    vals = [v for v in vals if v != nodata]
    try:
        _ramps = colors[ramp]
    except KeyError:
        raise KeyError("ramp must be one of {}".format(', '.join(colors.keys())))

    n_classes = 8
    # Work on a private copy: extending the stored ramp in place (as the old
    # ``_colors += _colors`` did) changed the result of every later call.
    _colors = list(_ramps[n_classes])

    if not vals:
        # No data left after filtering; the old code failed on vals[0].
        return []

    try:
        float(vals[0])
        _breaks = [float(x) for x in jenks(vals, n_classes)]
        _stops = list(zip(_breaks, _colors))
    except ValueError:
        uniq = list(set(vals))
        # NOTE(review): a ramp of exact length len(uniq) may have been the
        # intent ("Try for ramp of exact length"), but the code only ever
        # re-read the 8-class ramp; that behavior is preserved here.
        # Broadcast colors in cycle
        _cycled = [_colors[i % len(_colors)] for i in range(len(uniq))]
        _stops = list(zip(uniq, _cycled))
    return _stops
def get_no_transaction_segments(self, n_breaks=10, visualize=False):
    """
    Compute Jenks breaks over column 1 of the merchant data
    (the number-of-transactions / frequency column).

    :param n_breaks: number of segments requested from the jenks algorithm
    :type n_breaks: int
    :param visualize: when true, also plot the sorted values with the breaks
    :type visualize: bool
    :return: the breaks returned by jenks, wrapped in a numpy array
    :rtype: numpy.ndarray
    """
    merchant_rows = self._merchant_data
    # jenks is fed the column values in ascending order.
    frequencies = sorted(merchant_rows[:, 1].tolist())
    breaks = jenks(frequencies, n_breaks)

    if visualize:
        plot_title = "Segmentations of Number of Transactions With Jenks Natural Breaks"
        plotlyvisualize.segments_plot(frequencies,
                                      vertical_lines=breaks,
                                      title=plot_title,
                                      out_path=PLOT_OUT_DIR)

    return np.array(breaks)
def get_cb_range(arr=np.empty([2, 2]), xaxis_min=0.0, xaxis_max=1.1, xaxis_step=0.1, do_jenks=True):
    """
    Compute colorbar tick values, from Jenks breaks or a fixed step.

    https://github.com/perrygeo/jenks

    :param arr: 2-D array of values; any other rank terminates the process
    :param xaxis_min: start of the evenly spaced range (``do_jenks=False``)
    :param xaxis_max: exclusive end of the evenly spaced range
    :param xaxis_step: step of the evenly spaced range
    :param do_jenks: if true derive the values from Jenks breaks of ``arr``,
        otherwise return ``np.arange(xaxis_min, xaxis_max, xaxis_step)``
    :return: 1-D numpy array of tick values
    """
    # Array can only have shape == 2
    # NOTE(review): exiting with status 0 on invalid input hides the error
    # from callers; kept for compatibility -- consider raising ValueError.
    if len(np.shape(arr)) != 2:
        sys.exit(0)

    if not do_jenks:
        return np.arange(xaxis_min, xaxis_max, xaxis_step)

    # np.round replaces np.round_, which was removed in NumPy 2.0.
    rounded = np.unique(np.round(arr, decimals=1))
    # Select 11 elements, discard the highest
    breaks = np.array(jenks(rounded.data, 11))[:-1]
    # return only the unique elements, sometimes jenks selects duplicate elements
    return np.unique(breaks)
def segmentation(self, data_series, n_breaks, all_breaks=None, limit=1000):
    """
    Recursively segment ``data_series`` with Jenks breaks.

    Breaks are computed with ``n_breaks`` classes and merged into
    ``all_breaks``. The most populous interval is located with
    ``_find_most_populous_break``. If that interval holds more than
    ``limit`` values, strictly between its bounds, it is segmented again
    with half as many breaks. Recursion stops once the halved count
    drops below 1.

    :param data_series: data series that need to be segmented
    :type data_series: pandas.core.series.Series
    :param n_breaks: number of breaks used for each jenks call (not the
        total number of breaks finally returned)
    :type n_breaks: int
    :param all_breaks: accumulator of breaks found so far; leave unset on
        the initial call
    :type all_breaks: list or None
    :param limit: population above which the most populous interval is
        segmented again
    :type limit: int
    :return: all the breaks, as produced by ``_merge_breaks``
    """
    # A fresh list per call: a ``[]`` default is shared across calls and
    # could leak breaks from one run to the next if it were ever mutated.
    if all_breaks is None:
        all_breaks = []

    breaks = jenks(data_series.tolist(), n_breaks)
    start_interval, end_interval = self._find_most_populous_break(
        breaks, data_series)
    inside = (data_series > start_interval) & (data_series < end_interval)
    most_populous_chunk_series = data_series[inside]
    all_breaks = self._merge_breaks(all_breaks, breaks)

    halved = int(n_breaks / 2)
    if most_populous_chunk_series.size > limit and halved >= 1:
        return self.segmentation(most_populous_chunk_series, halved,
                                 all_breaks, limit)
    return all_breaks
def test_json():
    """Check jenks on the fixture in ``test.json`` against known breaks."""
    # The context manager closes the fixture file deterministically.
    with open('test.json') as fixture:
        data = json.load(fixture)
    breaks = jenks(data, 5)
    expected = [0.002811, 2.093548, 4.205495, 6.178148, 8.091759, 9.997983]
    assert [round(v, 6) for v in breaks] == expected
def test_json():
    """Check the breaks half of jenks' ``(breaks, groups)`` result."""
    # The context manager closes the fixture file deterministically.
    with open('test.json') as fixture:
        data = json.load(fixture)
    # Only the breaks are asserted on; the groups are unpacked and ignored.
    breaks, groups = jenks(data, 5)
    expected = [0.002811, 2.093548, 4.205495, 6.178148, 8.091759, 9.997983]
    assert [round(v, 6) for v in breaks] == expected
def get_jenks_breaks(col_index, num_breaks):
    """Compute Jenks breaks for one column of the ``QUAKE_PARSED`` file.

    The file is read as comma-separated text; its first line is discarded
    and the chosen column of every remaining line is parsed as a float.

    :param col_index: zero-based index of the column to read
    :param num_breaks: number of classes passed to ``jenks``
    :return: whatever ``jenks`` returns for the column values
    """
    with open(QUAKE_PARSED, 'r') as handle:
        rows = handle.readlines()
    # Drop the first (header) line.
    del rows[0]
    column = [float(row.strip().split(',')[col_index]) for row in rows]
    return jenks(column, num_breaks)
def build_jenks(target):
    """Classify ``target`` with the smallest adequate number of Jenks classes.

    Class counts 2 through 9 are tried in order; the search stops at the
    first count whose goodness of variance fit is not below 0.95 (or at 9).

    :param target: values to classify
    :return: column vector (shape ``(-1, 1)``) of the ``classify`` result
        for every value in ``target``
    """
    chosen = 2
    for candidate in range(2, 10):
        chosen = candidate
        gvf = goodness_of_variance_fit(target, candidate)
        print("\tGVF for {0} classes: {1}".format(candidate, gvf))
        if not gvf < 0.95:
            break

    breaks = jenks(target, chosen)
    print("Breaks: ", breaks)

    labels = [classify(value, breaks) for value in target]
    classified = np.array(labels).reshape(-1, 1)
    print(classified.shape)
    return classified
def cluster(items, value=None, K=None):
    """Group ``items`` into clusters delimited by Jenks natural breaks.

    :param items: sized collection of items to cluster
    :param value: callable mapping an item to the number it is clustered on
    :param K: number of classes passed to ``jenks``
    :return: list of non-empty clusters (lists of items); when there are at
        most ``K`` items each item becomes its own single-element cluster
    :raises ValueError: if ``value`` or ``K`` is not given
    """
    if value is None:
        raise ValueError("Distance function not set")
    if K is None:
        raise ValueError("Parameter K not set")
    if len(items) <= K:
        sys.stderr.write(
            "WARNING: NOT ENOUGH ITEMS!\nInput List Size: {}\n".format(
                len(items)))
        return [[i] for i in items]

    breakpoints = sorted(map(value, items))
    sys.stderr.write("Sorted values list:\n{}\n".format(breakpoints))

    # Find natural jenks breakpoints of items
    breakpoints = jenks(breakpoints, K)
    sys.stderr.write("Breakpoints:\n{}\n".format(breakpoints))

    clustered_items = []
    last_bp = None
    # Group items using breakpoints, one interval (last_bp, bp] at a time.
    # NOTE(review): the interval is open on the left, so items whose value
    # equals the first breakpoint land in no cluster -- confirm intended.
    for bp in breakpoints:
        if last_bp is None:
            # The first breakpoint only opens the first interval.
            last_bp = bp
            continue
        # Real lists (not lazy filter/map objects) so that the emptiness
        # check and the log output also work on Python 3.
        between_items = [item for item in items
                         if last_bp < value(item) <= bp]
        sys.stderr.write("Values between {} and {}:\n{}\n".format(
            last_bp, bp, [value(item) for item in between_items]))
        if between_items:
            clustered_items.append(between_items)
        last_bp = bp

    sys.stderr.write("{} clusters found (K={})\nItems:\n{}\n".format(
        len(clustered_items), K, clustered_items))
    return clustered_items
def myjenks(array, label, sz=6):
    """Create classification breaks for the array

    The unique values returned by ``jenks`` are sorted. Breaks after the
    first that are not above 0.01 are dropped, but only when the second
    break is itself below 0.01. A leading 0 is then nudged up to 0.001.

    :param array: values to classify
    :param label: unused by the computation (only referenced by
        commented-out debug output in the original)
    :param sz: number of classes passed to ``jenks``
    :return: list of float breaks, or ``[0]`` when fewer than two usable
        breaks exist or the largest break is 0
    """
    breaks = sorted(set(jenks(array, sz)))
    # Some failures happen when number of values > 0 is less than 6: with a
    # single unique break the old code hit an IndexError on breaks[1].
    if len(breaks) < 2 or max(breaks) == 0:
        return [0]

    if breaks[1] < 0.01:
        # Keep the lowest break, drop the other near-zero ones.
        breaks = [breaks[0]] + [b for b in breaks[1:] if b > 0.01]

    if max(breaks) == 0 or len(breaks) < 2:
        return [0]

    if breaks[0] == 0 and breaks[1] > 0.001:
        breaks[0] = 0.001
    return [float(b) for b in breaks]
def gjsonJenks(polyStats, polys, inP, classes, spacing = None): #get the break points classes = jenks(polyStats, classes) #do the actual classification classified = np.array([classify(i,classes) for i in polyStats]) #max value of zones maxz = max(classified) #nested list of zone indices zoneIndices = [[idx for idx,val in enumerate(classified) if zone + 1 == val] for zone in range(maxz)] #nested list of polygons corresponding to each zone number polySort = [[polys[index] for index in zone] for zone in zoneIndices] #merge geometries, generate list of zones, create geojson feature collection from list polyComb = [cascaded_union(polyz).simplify(.01) for polyz in polySort] #if simplifying is needed if spacing is not None:
def cluster(items, value=None, K=None):
    """Cluster ``items`` by the natural (Jenks) breaks of ``value(item)``.

    :param items: sized collection of items to cluster
    :param value: callable returning the numeric key of an item
    :param K: number of classes passed to ``jenks``
    :return: list of non-empty lists of items, one per populated interval
        between consecutive breakpoints; if there are at most ``K`` items,
        every item is returned as its own cluster
    :raises ValueError: if ``value`` or ``K`` is missing
    """
    if value is None:
        raise ValueError("Distance function not set")
    if K is None:
        raise ValueError("Parameter K not set")
    if len(items) <= K:
        sys.stderr.write("WARNING: NOT ENOUGH ITEMS!\nInput List Size: {}\n".format(len(items)))
        return [[i] for i in items]

    breakpoints = sorted(map(value, items))
    sys.stderr.write("Sorted values list:\n{}\n".format(breakpoints))

    # Find natural jenks breakpoints of items
    breakpoints = jenks(breakpoints, K)
    sys.stderr.write("Breakpoints:\n{}\n".format(breakpoints))

    clustered_items = []
    last_bp = None
    for bp in breakpoints:
        if last_bp is None:
            # The first breakpoint is only the lower bound of interval one.
            last_bp = bp
            continue
        # Interval is (last_bp, bp]. Lists are built eagerly because on
        # Python 3 filter()/map() are lazy: len() on them raises TypeError.
        # NOTE(review): values equal to the first breakpoint match no
        # interval and are left out of every cluster -- confirm intended.
        between_items = [item for item in items if last_bp < value(item) <= bp]
        between_values = [value(item) for item in between_items]
        sys.stderr.write("Values between {} and {}:\n{}\n".format(last_bp, bp, between_values))
        if between_items:
            clustered_items.append(between_items)
        last_bp = bp

    sys.stderr.write("{} clusters found (K={})\nItems:\n{}\n".format(len(clustered_items), K, clustered_items))
    return clustered_items
def StatesJson(request, word):
    """Return a JSON response classifying each state's score for ``word``.

    With at least three states, the second and third values returned by a
    3-class ``jenks`` call are the negative and positive thresholds.
    Otherwise the fixed thresholds 1 and 10 are used. The response maps
    state name to its fill key, score and recurrence, ordered by descending
    score.

    :param request: the incoming request (not read by this view)
    :param word: URL-quoted word whose states are looked up
    :return: ``HttpResponse`` whose body is the JSON-encoded mapping
    """
    word = urllib.unquote(word)
    states = State.objects.filter(word__word=word)

    # Calculate subgroups with jenks
    if len(states) >= 3:
        breaks = jenks([state.score for state in states], 3)
        negative = float(breaks[1])
        positive = float(breaks[2])
    else:
        # We force to be neutral
        negative, positive = 1, 10

    # Prepare the dict
    by_state = dict()
    for state in states:
        score = state.score
        if score <= negative:
            fill_key = 'negative'
        else:
            fill_key = 'positive' if score >= positive else 'neutral'
        by_state[state.state] = {"fillKey": fill_key, "score": state.score, "recurrence": state.recurrence}

    ranked = sorted(by_state.items(), key=lambda pair: pair[1]['score'], reverse=True)
    return HttpResponse(json.dumps(OrderedDict(ranked)))
def get_sum_amounts_segments(self, n_breaks=10, visualize=False):
    """
    Compute Jenks breaks over column 2 of the merchant data
    (the sum-of-amounts / money column).

    :param n_breaks: number of segments requested from the jenks algorithm
    :type n_breaks: int
    :param visualize: when true, also plot the sorted values with the breaks
    :type visualize: bool
    :return: the breaks returned by jenks, wrapped in a numpy array
    :rtype: numpy.ndarray
    """
    merchant_rows = self._merchant_data
    # jenks is fed the column values in ascending order.
    amounts = sorted(merchant_rows[:, 2].tolist())
    breaks = jenks(amounts, n_breaks)

    if visualize:
        plot_title = "Segmentations of Sum Amount With Jenks Natural Breaks"
        plotlyvisualize.segments_plot(amounts,
                                      vertical_lines=breaks,
                                      title=plot_title,
                                      out_path=PLOT_OUT_DIR)

    return np.array(breaks)
def get_harmonic_segments(self, n_breaks=10, visualize=False):
    """
    Compute Jenks breaks over column 0 of the merchant data
    (the harmonic-sum / recency column).

    :param n_breaks: number of segments requested from the jenks algorithm
    :type n_breaks: int
    :param visualize: when true, also plot the sorted values with the breaks
    :type visualize: bool
    :return: the breaks returned by jenks, wrapped in a numpy array
    :rtype: numpy.ndarray
    """
    merchant_rows = self._merchant_data
    # jenks is fed the column values in ascending order.
    harmonic_values = sorted(merchant_rows[:, 0].tolist())
    breaks = jenks(harmonic_values, n_breaks)

    if visualize:
        plot_title = "Segmentations of Harmonic sum With Jenks Natural Breaks"
        plotlyvisualize.segments_plot(harmonic_values,
                                      vertical_lines=breaks,
                                      title=plot_title,
                                      out_path=PLOT_OUT_DIR)

    return np.array(breaks)
def test_short():
    """jenks on a four-value list with two classes yields three breaks."""
    expected = [1.0, 3.0, 100.0]
    breaks = jenks([1, 2, 3, 100], 2)
    rounded = [round(v, 5) for v in breaks]
    assert rounded == expected
def test_json():
    """Check jenks on ``test.json`` to five decimal places."""
    # The context manager closes the fixture file deterministically.
    with open('test.json') as fixture:
        data = json.load(fixture)
    breaks = jenks(data, 5)
    expected = [0.00281, 2.09355, 4.2055, 6.17815, 8.09176, 9.99798]
    # float() first: the break values may not be plain Python floats.
    assert [round(float(v), 5) for v in breaks] == expected
def field_jenks(in_table, field_name, class_num):
    """Return the upper bound of each Jenks group of a table field.

    :param in_table: table handed to ``get_rows``
    :param field_name: name of the field read from every row via
        ``row.getValue``
    :param class_num: number of classes passed to ``jenks``
    :return: list with the last element of each group returned by ``jenks``,
        converted to float
    """
    rows = get_rows(in_table)
    data_list = [row.getValue(field_name) for row in rows]
    result_data_list = jenks(data_list, class_num)
    # print() with one argument behaves the same on Python 2 and 3; the old
    # print statement was a SyntaxError on Python 3.
    print(result_data_list)
    # NOTE(review): assumes this jenks variant returns a sequence of groups
    # (each indexable with [-1]), not a flat list of breaks -- confirm.
    return [float(group[-1]) for group in result_data_list]
X.append(float(line.strip())) X = np.array(X) ########### #Jenks gvf = 0.0 for k in range(2,end): gvf = goodness_of_variance_fit(X, k) if round(gvf,2) >= thres: break print(k) breaks = jenks(X,k) print("breaks =",breaks) print("\nmean clust_n") means = [] for i in range(1,k+1): if i == 1: X_ = X[X<= breaks[i]] means += [X_.mean()] print(round(X_.mean(),2)," ",X_.size) else: x = X[X > breaks[i-1]] X_ = x[x<=breaks[i]] means += [X_.mean()] print(round(X_.mean(),2)," ",X_.size)
randar = np.random.randint(1,100,8)''' #a = np.array([2,2, 4, 20, 18, 22, 28, 35, 42]) #a = np.array([1, 2, 10, 8, 0, 5, 6, 0, 10, 8, 49, 40, 49, 46, 30, 42, 39, 46, 44, 33, 32, 34, 37, 30, 39, 33, 39, 46, 32, 37, 46, 43, 49, 39, 50, 50, 50, 34, 38, 49, 34, 44, 39, 36, 32, 39, 40, 48, 49, 39, 35, 47, 43, 39, 50, 30, 37, 46, 50, 31, 45, 45, 38, 32, 33, 30, 32, 47, 47, 42, 42, 48, 44, 44, 31, 50, 38, 39, 50, 33, 37, 36, 50, 44, 31, 34, 50, 47, 30, 44, 78, 74, 73, 77, 62, 67, 70, 66, 64, 65, 72, 65, 83, 66, 69, 60, 62, 85, 80, 65]) a = np.array([ 8, 10, 7, 11, 9, 6, 10, 8, 9, 10, 36, 39, 38, 42, 35, 37, 47, 41, 46, 38, 32, 48, 42, 38, 43, 33, 40, 41, 45, 37, 36, 41, 30, 43, 48, 49, 36, 50, 45, 32, 35, 34, 36, 48, 31, 33, 38, 49, 50, 48, 34, 33, 36, 36, 40, 49, 32, 34, 45, 48, 49, 46, 47, 50, 33, 49, 40, 48, 49, 33, 88, 70, 79, 63, 86, 60, 83, 75, 88, 60, 77, 62, 65, 73, 72, 64, 62, 66, 76, 71, 75, 63, 66, 60, 85, 65, 61, 62, 69, 75 ]) #a = np.random.randint(1,100,8) gvf = goodness_of_variance_fit(a, 3) print(np.sort(a)) print("GVF: " + str(gvf)) segs = jenks(a, 3) print("SEGS:" + str(segs)) rates = np.sort(a).tolist() peers = ['d1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8'] c1 = {} c2 = {} c3 = {} c1_rates = [] c2_rates = [] c3_rates = [] random.shuffle(peers) print(peers) print("NP SORT" + str(np.sort(a))) ind = 0 for i in rates: if i <= segs[1]: