class Getis(object):
    def __init__(self, data_provider=None):
        if data_provider is None:
            self.data_provider = AnalysisDataProvider()
        else:
            self.data_provider = data_provider

    def getis_ord(self, subquery, attr, w_type, num_ngbrs, permutations,
                  geom_col, id_col):
        """
        Getis-Ord's G*
        Implementation building neighbors with a PostGIS database and
        PySAL's Getis-Ord's G* hotspot/coldspot module.
        Andy Eschbacher
        """
        # geometries with attributes that are null are ignored
        # resulting in a collection of not as near neighbors if kNN is chosen
        params = OrderedDict([("id_col", id_col),
                              ("attr1", attr),
                              ("geom_col", geom_col),
                              ("subquery", subquery),
                              ("num_ngbrs", num_ngbrs)])

        result = self.data_provider.get_getis(w_type, params)
        attr_vals = pu.get_attributes(result)

        # build PySAL weight object
        weight = pu.get_weight(result, w_type, num_ngbrs)

        # calculate Getis-Ord's G* z- and p-values
        getis = ps.esda.getisord.G_Local(attr_vals, weight,
                                         star=True,
                                         permutations=permutations)

        return list(zip(getis.z_sim, getis.p_sim, getis.p_z_sim,
                        weight.id_order))
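# For context, a minimal standalone sketch of the same G* computation on a
# synthetic lattice, using only the legacy PySAL API the class relies on.
# The lattice, values, and permutation count are illustrative assumptions,
# not part of the library.
import numpy as np
import pysal as ps

w = ps.lat2W(4, 4)               # 4x4 rook lattice standing in for the
                                 # PostGIS-built neighborhoods
y = np.random.uniform(0, 100, 16)

# same call the class makes; star=True includes each unit in its own
# neighborhood, turning Getis-Ord G into G*
g_star = ps.esda.getisord.G_Local(y, w, star=True, permutations=99)
print(list(zip(g_star.z_sim, g_star.p_sim, g_star.p_z_sim, w.id_order)))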
class Segmentation(object):
    """
    Trains a regression model on the data from one query and predicts a
    target variable for the rows of another query.
    """
    def __init__(self, data_provider=None):
        if data_provider is None:
            self.data_provider = AnalysisDataProvider()
        else:
            self.data_provider = data_provider

    def create_and_predict_segment_agg(self, target, features,
                                       target_features, target_ids,
                                       model_parameters):
        """
        Version of create_and_predict_segment that works on arrays that come
        straight from the SQL calling the function.

        Input:
            @param target: The 1D array of length NSamples containing the
                target variable we want the model to predict
            @param features: The 2D array of size NSamples * NFeatures that
                form the input to the model
            @param target_ids: A 1D array of target_ids that will be used
                to associate the results of the prediction with the rows
                which they come from
            @param model_parameters: A dictionary containing parameters for
                the model.
        """
        clean_target, _ = replace_nan_with_mean(target)
        clean_features, _ = replace_nan_with_mean(features)
        target_features, _ = replace_nan_with_mean(target_features)

        model, accuracy = train_model(clean_target, clean_features,
                                      model_parameters, 0.2)
        prediction = model.predict(target_features)
        accuracy_array = [accuracy] * prediction.shape[0]
        return list(zip(target_ids, prediction, accuracy_array))

    def create_and_predict_segment(self, query, variable, feature_columns,
                                   target_query, model_params,
                                   id_col='cartodb_id'):
        """
        generate a segment with machine learning
        Stuart Lynn

        @param query: subquery that data is pulled from for packaging
        @param variable: name of the target variable
        @param feature_columns: list of column names
        @param target_query: The query to run to obtain the data to predict
        @param model_params: A dictionary of model parameters, the full
            specification can be found on the scikit-learn page for
            [GradientBoostingRegressor](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)
        """
        params = {"subquery": target_query,
                  "id_col": id_col}

        (target, features, target_mean,
         feature_means) = self.clean_data(query, variable, feature_columns)

        model, accuracy = train_model(target, features, model_params, 0.2)
        result = self.predict_segment(model, feature_columns, target_query,
                                      feature_means)
        accuracy_array = [accuracy] * result.shape[0]

        rowid = self.data_provider.get_segmentation_data(params)
        '''
        rowid = [{'ids': [2.9, 4.9, 4, 5, 6]}]
        '''
        return list(zip(rowid[0]['ids'], result, accuracy_array))

    def predict_segment(self, model, feature_columns, target_query,
                        feature_means):
        """
        Use the provided model to predict the values for the new feature set

        Input:
            @param model: The pretrained model
            @param feature_columns: A list of features to use in the model
                prediction (list of column names)
            @param target_query: The query to run to obtain the data to
                predict on and the cartodb_ids associated with it.
        """
        batch_size = 1000
        params = {"subquery": target_query,
                  "feature_columns": feature_columns}

        results = []
        cursors = self.data_provider.get_segmentation_predict_data(params)
        '''
        cursors = [{'features': [[m1[0], m2[0], m3[0]],
                                 [m1[1], m2[1], m3[1]],
                                 [m1[2], m2[2], m3[2]]]}]
        '''

        while True:
            rows = cursors.fetch(batch_size)
            if not rows:
                break
            batch = np.row_stack([np.array(row['features'])
                                  for row in rows]).astype(float)

            batch = replace_nan_with_mean(batch, feature_means)[0]
            prediction = model.predict(batch)
            results.append(prediction)

        # NOTE: we removed the cartodb_ids calculation in here
        return np.concatenate(results)

    def clean_data(self, query, variable, feature_columns):
        """
        Fetch the training data and replace null values with column means.
        """
        params = {"subquery": query,
                  "target": variable,
                  "features": feature_columns}

        data = self.data_provider.get_segmentation_model_data(params)
        '''
        data = [{'target': [2.9, 4.9, 4, 5, 6],
                 'feature1': [1, 2, 3, 4],
                 'feature2': [2, 3, 4, 5]}]
        '''

        # extract target data from data_provider object
        target = np.array(data[0]['target'], dtype=float)

        # put n feature data arrays into an n x m array of arrays
        features = np.column_stack([np.array(data[0][col])
                                    for col in feature_columns]).astype(float)

        features, feature_means = replace_nan_with_mean(features)
        target, target_mean = replace_nan_with_mean(target)
        return target, features, target_mean, feature_means
class Markov(object):
    def __init__(self, data_provider=None):
        if data_provider is None:
            self.data_provider = AnalysisDataProvider()
        else:
            self.data_provider = data_provider

    def spatial_trend(self, subquery, time_cols, num_classes=7,
                      w_type='knn', num_ngbrs=5, permutations=0,
                      geom_col='the_geom', id_col='cartodb_id'):
        """
        Predict the trends of a unit based on:
        1. history of its transitions to different classes (e.g., 1st
           quantile -> 2nd quantile)
        2. average class of its neighbors

        Inputs:
        @param subquery string: e.g., SELECT the_geom, cartodb_id,
            interesting_time_column FROM table_name
        @param time_cols list of strings: list of strings of column names
        @param num_classes (optional): number of classes to break
            distribution of values into. Currently uses quantile bins.
        @param w_type string (optional): weight type ('knn' or 'queen')
        @param num_ngbrs int (optional): number of neighbors (if knn type)
        @param permutations int (optional): number of permutations for test
            stats
        @param geom_col string (optional): name of column which contains
            the geometries
        @param id_col string (optional): name of column which has the ids
            of the table

        Outputs:
        @param trend_up float: probability that a geom will move to a
            higher class
        @param trend_down float: probability that a geom will move to a
            lower class
        @param trend float: (trend_up - trend_down) / trend_static
        @param volatility float: a measure of the volatility based on
            probability stddev(prob array)
        """
        if len(time_cols) < 2:
            plpy.error('At least two time columns need to be passed')

        params = {"id_col": id_col,
                  "time_cols": time_cols,
                  "geom_col": geom_col,
                  "subquery": subquery,
                  "num_ngbrs": num_ngbrs}

        result = self.data_provider.get_markov(w_type, params)

        # build weight
        weights = pu.get_weight(result, w_type)
        weights.transform = 'r'

        # prep time data
        t_data = get_time_data(result, time_cols)

        sp_markov_result = ps.Spatial_Markov(t_data,
                                             weights,
                                             k=num_classes,
                                             fixed=False,
                                             permutations=permutations)

        # get lag classes
        lag_classes = ps.Quantiles(
            ps.lag_spatial(weights, t_data[:, -1]),
            k=num_classes).yb

        # look up probability distribution for each unit according to class
        # and lag class
        prob_dist = get_prob_dist(sp_markov_result.P,
                                  lag_classes,
                                  sp_markov_result.classes[:, -1])

        # find the ups and downs and overall distribution of each cell
        trend_up, trend_down, trend, volatility = get_prob_stats(
            prob_dist, sp_markov_result.classes[:, -1])

        # output the results
        return zip(trend, trend_up, trend_down, volatility,
                   weights.id_order)
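# For intuition, a standalone sketch of the Spatial_Markov step above on
# synthetic data; the unit count, period count, and lattice here are
# illustrative assumptions, not part of the library.
import numpy as np
import pysal as ps

t_data = np.random.rand(50, 5)   # 50 units observed over 5 time periods,
                                 # standing in for the queried time_cols
w = ps.lat2W(5, 10)              # rook weights on a 5x10 lattice in place
                                 # of the PostGIS-built neighbors
w.transform = 'r'

# discretize each period into quantile classes and estimate one transition
# matrix per spatial-lag class, as spatial_trend does above
sm = ps.Spatial_Markov(t_data, w, k=7, fixed=False, permutations=0)
print(sm.P.shape)                # (7, 7, 7): lag class x from-class x to-class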
class Moran(object): """Class for calculation of Moran's I statistics (global, local, and local rate) Parameters: data_provider (:obj:`AnalysisDataProvider`): Class for fetching data. See the `crankshaft.analysis_data_provider` module for more information. """ def __init__(self, data_provider=None): if data_provider is None: self.data_provider = AnalysisDataProvider() else: self.data_provider = data_provider def global_stat(self, subquery, attr_name, w_type, num_ngbrs, permutations, geom_col, id_col): """ Moran's I (global) Implementation building neighbors with a PostGIS database and Moran's I core clusters with PySAL. Args: subquery (str): Query to give access to the data needed. This query must give access to ``attr_name``, ``geom_col``, and ``id_col``. attr_name (str): Column name of data to analyze w_type (str): Type of spatial weight. Must be one of `knn` or `queen`. See `PySAL documentation <http://pysal.readthedocs.io/en/latest/users/tutorials/weights.html>`__ for more information. num_ngbrs (int): If using `knn` for ``w_type``, this specifies the number of neighbors to be used to define the spatial neighborhoods. permutations (int): Number of permutations for performing conditional randomization to find the p-value. Higher numbers takes a longer time for getting results. geom_col (str): Name of the geometry column in the dataset for finding the spatial neighborhoods. id_col (str): Row index for each value. Usually the database index. """ params = OrderedDict([("id_col", id_col), ("attr1", attr_name), ("geom_col", geom_col), ("subquery", subquery), ("num_ngbrs", num_ngbrs)]) result = self.data_provider.get_moran(w_type, params) # collect attributes attr_vals = pu.get_attributes(result) # calculate weights weight = pu.get_weight(result, w_type, num_ngbrs) # calculate moran global moran_global = ps.esda.moran.Moran(attr_vals, weight, permutations=permutations) return zip([moran_global.I], [moran_global.EI]) def local_stat(self, subquery, attr, w_type, num_ngbrs, permutations, geom_col, id_col): """ Moran's I (local) Args: subquery (str): Query to give access to the data needed. This query must give access to ``attr_name``, ``geom_col``, and ``id_col``. attr (str): Column name of data to analyze w_type (str): Type of spatial weight. Must be one of `knn` or `queen`. See `PySAL documentation <http://pysal.readthedocs.io/en/latest/users/tutorials/weights.html>`__ for more information. num_ngbrs (int): If using `knn` for ``w_type``, this specifies the number of neighbors to be used to define the spatial neighborhoods. permutations (int): Number of permutations for performing conditional randomization to find the p-value. Higher numbers takes a longer time for getting results. geom_col (str): Name of the geometry column in the dataset for finding the spatial neighborhoods. id_col (str): Row index for each value. Usually the database index. 
Returns: list of tuples: Where each tuple consists of the following values: - quadrants classification (one of `HH`, `HL`, `LL`, or `LH`) - p-value - spatial lag - standardized spatial lag (centered on the mean, normalized by the standard deviation) - original value - standardized value - Moran's I statistic - original row index """ # geometries with attributes that are null are ignored # resulting in a collection of not as near neighbors params = OrderedDict([("id_col", id_col), ("attr1", attr), ("geom_col", geom_col), ("subquery", subquery), ("num_ngbrs", num_ngbrs)]) result = self.data_provider.get_moran(w_type, params) attr_vals = pu.get_attributes(result) weight = pu.get_weight(result, w_type, num_ngbrs) # calculate LISA values lisa = ps.esda.moran.Moran_Local(attr_vals, weight, permutations=permutations) # find quadrants for each geometry quads = quad_position(lisa.q) # calculate spatial lag lag = ps.weights.spatial_lag.lag_spatial(weight, lisa.y) lag_std = ps.weights.spatial_lag.lag_spatial(weight, lisa.z) return zip(quads, lisa.p_sim, lag, lag_std, lisa.y, lisa.z, lisa.Is, weight.id_order) def global_rate_stat(self, subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col): """ Moran's I Rate (global) Args: subquery (str): Query to give access to the data needed. This query must give access to ``attr_name``, ``geom_col``, and ``id_col``. numerator (str): Column name of numerator to analyze denominator (str): Column name of the denominator w_type (str): Type of spatial weight. Must be one of `knn` or `queen`. See `PySAL documentation <http://pysal.readthedocs.io/en/latest/users/tutorials/weights.html>`__ for more information. num_ngbrs (int): If using `knn` for ``w_type``, this specifies the number of neighbors to be used to define the spatial neighborhoods. permutations (int): Number of permutations for performing conditional randomization to find the p-value. Higher numbers takes a longer time for getting results. geom_col (str): Name of the geometry column in the dataset for finding the spatial neighborhoods. id_col (str): Row index for each value. Usually the database index. """ params = OrderedDict([("id_col", id_col), ("attr1", numerator), ("attr2", denominator), ("geom_col", geom_col), ("subquery", subquery), ("num_ngbrs", num_ngbrs)]) result = self.data_provider.get_moran(w_type, params) # collect attributes numer = pu.get_attributes(result, 1) denom = pu.get_attributes(result, 2) weight = pu.get_weight(result, w_type, num_ngbrs) # calculate moran global rate lisa_rate = ps.esda.moran.Moran_Rate(numer, denom, weight, permutations=permutations) return zip([lisa_rate.I], [lisa_rate.EI]) def local_rate_stat(self, subquery, numerator, denominator, w_type, num_ngbrs, permutations, geom_col, id_col): """ Moran's I Local Rate Args: subquery (str): Query to give access to the data needed. This query must give access to ``attr_name``, ``geom_col``, and ``id_col``. numerator (str): Column name of numerator to analyze denominator (str): Column name of the denominator w_type (str): Type of spatial weight. Must be one of `knn` or `queen`. See `PySAL documentation <http://pysal.readthedocs.io/en/latest/users/tutorials/weights.html>`__ for more information. num_ngbrs (int): If using `knn` for ``w_type``, this specifies the number of neighbors to be used to define the spatial neighborhoods. permutations (int): Number of permutations for performing conditional randomization to find the p-value. Higher numbers takes a longer time for getting results. 
geom_col (str): Name of the geometry column in the dataset for finding the spatial neighborhoods. id_col (str): Row index for each value. Usually the database index. Returns: list of tuples: Where each tuple consists of the following values: - quadrants classification (one of `HH`, `HL`, `LL`, or `LH`) - p-value - spatial lag - standardized spatial lag (centered on the mean, normalized by the standard deviation) - original value (roughly numerator divided by denominator) - standardized value - Moran's I statistic - original row index """ # geometries with values that are null are ignored # resulting in a collection of not as near neighbors params = OrderedDict([("id_col", id_col), ("numerator", numerator), ("denominator", denominator), ("geom_col", geom_col), ("subquery", subquery), ("num_ngbrs", num_ngbrs)]) result = self.data_provider.get_moran(w_type, params) # collect attributes numer = pu.get_attributes(result, 1) denom = pu.get_attributes(result, 2) weight = pu.get_weight(result, w_type, num_ngbrs) # calculate LISA values lisa = ps.esda.moran.Moran_Local_Rate(numer, denom, weight, permutations=permutations) # find quadrants for each geometry quads = quad_position(lisa.q) # spatial lag lag = ps.weights.spatial_lag.lag_spatial(weight, lisa.y) lag_std = ps.weights.spatial_lag.lag_spatial(weight, lisa.z) return zip(quads, lisa.p_sim, lag, lag_std, lisa.y, lisa.z, lisa.Is, weight.id_order) def local_bivariate_stat(self, subquery, attr1, attr2, permutations, geom_col, id_col, w_type, num_ngbrs): """ Moran's I (local) Bivariate (untested) """ params = OrderedDict([("id_col", id_col), ("attr1", attr1), ("attr2", attr2), ("geom_col", geom_col), ("subquery", subquery), ("num_ngbrs", num_ngbrs)]) result = self.data_provider.get_moran(w_type, params) # collect attributes attr1_vals = pu.get_attributes(result, 1) attr2_vals = pu.get_attributes(result, 2) # create weights weight = pu.get_weight(result, w_type, num_ngbrs) # calculate LISA values lisa = ps.esda.moran.Moran_Local_BV(attr1_vals, attr2_vals, weight, permutations=permutations) # find clustering of significance lisa_sig = quad_position(lisa.q) return zip(lisa.Is, lisa_sig, lisa.p_sim, weight.id_order)
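# A standalone sketch of the local Moran step above on a synthetic lattice;
# the data are illustrative assumptions, and the quadrant-label mapping
# mirrors what this module's quad_position produces from lisa.q.
import numpy as np
import pysal as ps

w = ps.lat2W(5, 5)               # 5x5 rook lattice in place of the
                                 # PostGIS-built neighbors
y = np.random.rand(25)

lisa = ps.Moran_Local(y, w, permutations=99)

# lisa.q codes quadrants 1-4; map them to the HH/LH/LL/HL labels used in
# the docstrings above
quads = np.array(['HH', 'LH', 'LL', 'HL'])[lisa.q - 1]
lag = ps.weights.spatial_lag.lag_spatial(w, lisa.y)
print(list(zip(quads, lisa.p_sim, lag, lisa.Is))[:3])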
class GWR(object):
    def __init__(self, data_provider=None):
        if data_provider:
            self.data_provider = data_provider
        else:
            self.data_provider = AnalysisDataProvider()

    def gwr(self, subquery, dep_var, ind_vars,
            bw=None, fixed=False, kernel='bisquare',
            geom_col='the_geom', id_col='cartodb_id'):
        """
        subquery: 'select * from demographics'
        dep_var: 'pctbachelor'
        ind_vars: ['intercept', 'pctpov', 'pctrural', 'pctblack']
        bw: value of bandwidth, if None then select optimal
        fixed: False (kNN) or True ('distance')
        kernel: 'bisquare' (default), or 'exponential', 'gaussian'
        """
        params = {'geom_col': geom_col,
                  'id_col': id_col,
                  'subquery': subquery,
                  'dep_var': dep_var,
                  'ind_vars': ind_vars}

        # get data from data provider
        query_result = self.data_provider.get_gwr(params)

        # exit if data to analyze is empty
        if len(query_result) == 0:
            plpy.error('No data passed to analysis or independent variables '
                       'are all null-valued')

        # unique ids and variable names list
        rowid = np.array(query_result[0]['rowid'], dtype=np.int)

        # x, y are centroids of input geometries
        x = np.array(query_result[0]['x'], dtype=np.float)
        y = np.array(query_result[0]['y'], dtype=np.float)
        coords = list(zip(x, y))

        # extract dependent variable
        Y = np.array(query_result[0]['dep_var'],
                     dtype=np.float).reshape((-1, 1))

        n = Y.shape[0]
        k = len(ind_vars)
        X = np.zeros((n, k))

        # extract query result
        for attr in range(0, k):
            attr_name = 'attr' + str(attr + 1)
            X[:, attr] = np.array(query_result[0][attr_name],
                                  dtype=np.float).flatten()

        # add intercept variable name
        ind_vars.insert(0, 'intercept')

        # calculate bandwidth if none is supplied
        if bw is None:
            bw = Sel_BW(coords, Y, X,
                        fixed=fixed, kernel=kernel).search()
        model = PySAL_GWR(coords, Y, X, bw,
                          fixed=fixed, kernel=kernel).fit()

        # containers for outputs
        coeffs = []
        stand_errs = []
        t_vals = []
        filtered_t_vals = []

        # extracted model information
        c_alpha = model.adj_alpha
        filtered_t = model.filter_tvals(c_alpha[1])
        predicted = model.predy.flatten()
        residuals = model.resid_response
        r_squared = model.localR2.flatten()
        bw = np.repeat(float(bw), n)

        # create lists of json objs for model outputs
        for idx in range(n):
            coeffs.append(json.dumps({var: model.params[idx, k]
                                      for k, var in enumerate(ind_vars)}))
            stand_errs.append(json.dumps({var: model.bse[idx, k]
                                          for k, var in
                                          enumerate(ind_vars)}))
            t_vals.append(json.dumps({var: model.tvalues[idx, k]
                                      for k, var in enumerate(ind_vars)}))
            filtered_t_vals.append(
                json.dumps({var: filtered_t[idx, k]
                            for k, var in enumerate(ind_vars)}))

        return list(zip(coeffs, stand_errs, t_vals, filtered_t_vals,
                        predicted, residuals, r_squared, bw, rowid))

    def gwr_predict(self, subquery, dep_var, ind_vars,
                    bw=None, fixed=False, kernel='bisquare',
                    geom_col='the_geom', id_col='cartodb_id'):
        """
        subquery: 'select * from demographics'
        dep_var: 'pctbachelor'
        ind_vars: ['intercept', 'pctpov', 'pctrural', 'pctblack']
        bw: value of bandwidth, if None then select optimal
        fixed: False (kNN) or True ('distance')
        kernel: 'bisquare' (default), or 'exponential', 'gaussian'
        """
        params = {'geom_col': geom_col,
                  'id_col': id_col,
                  'subquery': subquery,
                  'dep_var': dep_var,
                  'ind_vars': ind_vars}

        # get data from data provider
        query_result = self.data_provider.get_gwr_predict(params)

        # exit if data to analyze is empty
        if len(query_result) == 0:
            plpy.error('No data passed to analysis or independent variables '
                       'are all null-valued')

        # unique ids and variable names list
        rowid = np.array(query_result[0]['rowid'], dtype=np.int)

        x = np.array(query_result[0]['x'], dtype=np.float)
        y = np.array(query_result[0]['y'], dtype=np.float)
        coords = np.array(list(zip(x, y)), dtype=np.float)

        # extract dependent variable
        Y = np.array(query_result[0]['dep_var']).reshape((-1, 1))

        n = Y.shape[0]
        k = len(ind_vars)
        X = np.empty((n, k), dtype=np.float)

        for attr in range(0, k):
            attr_name = 'attr' + str(attr + 1)
            X[:, attr] = np.array(query_result[0][attr_name],
                                  dtype=np.float).flatten()

        # add intercept variable name
        ind_vars.insert(0, 'intercept')

        # split data into "training" and "test" for predictions
        # create index to split based on null y values
        train = np.where(Y != np.array(None))[0]
        test = np.where(Y == np.array(None))[0]

        # report error if there is no data to predict
        if len(test) < 1:
            plpy.error('No rows flagged for prediction: verify that rows '
                       'denoting prediction locations have a dependent '
                       'variable value of `null`')

        # split dependent variable (only need training which is non-Null's)
        Y_train = Y[train].reshape((-1, 1))
        Y_train = Y_train.astype(np.float)

        # split coords
        coords_train = coords[train]
        coords_test = coords[test]

        # split explanatory variables
        X_train = X[train]
        X_test = X[test]

        # calculate bandwidth if none is supplied
        if bw is None:
            bw = Sel_BW(coords_train, Y_train, X_train,
                        fixed=fixed, kernel=kernel).search()

        # estimate model and predict at new locations
        model = PySAL_GWR(coords_train, Y_train, X_train, bw,
                          fixed=fixed,
                          kernel=kernel).predict(coords_test, X_test)

        coeffs = []
        stand_errs = []
        t_vals = []
        r_squared = model.localR2.flatten()
        predicted = model.predy.flatten()

        m = len(model.predy)
        for idx in range(m):
            coeffs.append(json.dumps({var: model.params[idx, k]
                                      for k, var in enumerate(ind_vars)}))
            stand_errs.append(json.dumps({var: model.bse[idx, k]
                                          for k, var in
                                          enumerate(ind_vars)}))
            t_vals.append(json.dumps({var: model.tvalues[idx, k]
                                      for k, var in enumerate(ind_vars)}))

        return list(zip(coeffs, stand_errs, t_vals, r_squared, predicted,
                        rowid[test]))
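# A hypothetical test-style usage sketch: a stub provider returning rows in
# the exact shape gwr() reads above. All names and values are illustrative,
# and this assumes the module's Sel_BW/PySAL_GWR imports are available and
# that 30 points suffice for the bandwidth search.
import numpy as np


class StubGWRProvider(object):
    """Stub standing in for AnalysisDataProvider in tests."""
    def get_gwr(self, params):
        n = 30
        return [{'rowid': list(range(1, n + 1)),
                 'x': np.random.uniform(0, 10, n).tolist(),
                 'y': np.random.uniform(0, 10, n).tolist(),
                 'dep_var': np.random.rand(n).tolist(),
                 'attr1': np.random.rand(n).tolist()}]


rows = GWR(data_provider=StubGWRProvider()).gwr(
    'SELECT * FROM demographics', 'pctbachelor', ['pctpov'])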
class Kmeans(object):
    def __init__(self, data_provider=None):
        if data_provider is None:
            self.data_provider = AnalysisDataProvider()
        else:
            self.data_provider = data_provider

    def spatial(self, query, no_clusters, no_init=20):
        """
        find centers based on clusters of latitude/longitude pairs

        query: SQL query that has a WGS84 geometry (the_geom)
        """
        params = {"subquery": query,
                  "geom_col": "the_geom",
                  "id_col": "cartodb_id"}

        result = self.data_provider.get_spatial_kmeans(params)

        # Unpack query response
        xs = result[0]['xs']
        ys = result[0]['ys']
        ids = result[0]['ids']

        km = KMeans(n_clusters=no_clusters, n_init=no_init)
        labels = km.fit_predict(list(zip(xs, ys)))
        return zip(ids, labels)

    def nonspatial(self, subquery, colnames, no_clusters=5,
                   standardize=True, id_col='cartodb_id'):
        """
        Arguments:
            subquery (string): A SQL query to retrieve the data required
                to do the k-means clustering analysis, like so:
                SELECT * FROM iris_flower_data
            colnames (list): a list of the column names which contain the
                data of interest, like so:
                ['sepal_width', 'petal_width',
                 'sepal_length', 'petal_length']
            no_clusters (int): number of clusters (greater than zero)
            id_col (string): name of the input id_column

        Returns:
            A list of tuples with the following columns:
                cluster labels: a label for the cluster that the row
                    belongs to
                centers: center of the cluster that this row belongs to
                silhouettes: silhouette measure for this value
                inertia: within-cluster sum of squares for the fit
                rowid: row that these values belong to (corresponds to
                    the value in `id_col`)
        """
        import json
        from sklearn import metrics

        params = {"colnames": colnames,
                  "subquery": subquery,
                  "id_col": id_col}

        data = self.data_provider.get_nonspatial_kmeans(params)

        # fill array with values for k-means clustering
        if standardize:
            cluster_columns = _scale_data(_extract_columns(data))
        else:
            cluster_columns = _extract_columns(data)

        kmeans = KMeans(n_clusters=no_clusters,
                        random_state=0).fit(cluster_columns)

        centers = [json.dumps(dict(zip(colnames, c)))
                   for c in kmeans.cluster_centers_[kmeans.labels_]]

        silhouettes = metrics.silhouette_samples(cluster_columns,
                                                 kmeans.labels_,
                                                 metric='sqeuclidean')

        return zip(kmeans.labels_,
                   centers,
                   silhouettes,
                   [kmeans.inertia_] * kmeans.labels_.shape[0],
                   data[0]['rowid'])
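# A test-style sketch of the dependency injection all of these constructors
# share: a stub provider returning canned rows in the shape spatial()
# unpacks above. Names and values are illustrative.
from sklearn.cluster import KMeans


class StubKmeansProvider(object):
    """Stub standing in for AnalysisDataProvider in tests."""
    def get_spatial_kmeans(self, params):
        # two obvious clusters around (0, 0) and (5, 5)
        return [{'xs': [0.0, 0.2, 0.1, 5.0, 5.2, 5.1],
                 'ys': [0.1, 0.0, 0.2, 5.1, 5.0, 5.2],
                 'ids': [1, 2, 3, 4, 5, 6]}]


pairs = Kmeans(data_provider=StubKmeansProvider()).spatial(
    'SELECT * FROM points', no_clusters=2)
print(list(pairs))  # e.g. [(1, 0), (2, 0), ..., (6, 1)]; labels may swap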