def fit(self, tweets):
    """Fit the region-to-region transition model from a table of tweets.

    Parameters
    ----------
    tweets : pandas.DataFrame
        Must provide the columns ``region``, ``latitude`` and ``longitude``.
        The first row encountered for each region supplies that region's
        coordinates, which are converted from degrees to radians here.

    Notes
    -----
    Reads ``self.beta`` and ``self.zipfs`` and sets (nothing is returned):

    * ``regions`` -- one row per region, indexed and sorted by region.
    * ``distances`` -- stacked pairwise haversine distances scaled by
      6371.0088 (kilometres on a sphere of Earth's mean radius).
    * ``seed`` -- row-normalised ``exp(-beta * distance)`` matrix.
    * ``region_probabilities`` -- normalised ``rank ** -zipfs`` weights,
      where rank 1 is the region with the most tweets.
    * ``transition_mx`` -- stacked, row-normalised product of the two.
    """
    # Small constant added before normalising so no weight is exactly zero.
    epsilon = 0.0000001

    reggrp = tweets.groupby('region')
    regions = reggrp.head(1).set_index('region').sort_index()
    self.regions = regions

    coords_rad = np.radians(regions[['latitude', 'longitude']])
    distances_km = pd.DataFrame(
        6371.0088 * haversine_distances(coords_rad),
        index=regions.index,
        columns=regions.index,
    )
    self.distances = distances_km.stack()

    seed = np.exp(-self.beta * distances_km)
    seed += epsilon
    seed = seed.div(seed.sum(axis=1), axis=0)
    self.seed = seed

    region_counts = reggrp.size().sort_values(ascending=False)
    # Float ranks: NumPy refuses to raise an integer array to a negative
    # integer power, which would make an integer ``zipfs`` crash here.
    ranks = np.arange(1, region_counts.shape[0] + 1, dtype=float)
    region_probs = np.power(ranks, -self.zipfs)
    region_probs += epsilon
    region_probs = pd.Series(
        region_probs / np.sum(region_probs),
        index=region_counts.index,
    ).sort_index()
    self.region_probabilities = region_probs

    # Series * DataFrame aligns the series with the columns, so every
    # destination column is weighted by that region's popularity.
    fitted = region_probs * seed
    fitted = fitted.div(fitted.sum(axis=1), axis=0)
    self.transition_mx = fitted.stack()
def test_haversine_vectorized(self):
    """Compare ``haversine_dist`` with scikit-learn on every staypoint pair."""
    csv_path = os.path.join("tests", "data", "geolife", "geolife_staypoints.csv")
    staypoints = ti.read_staypoints_csv(csv_path, tz="utc", index_col="id")
    xs = staypoints.geometry.x.values
    ys = staypoints.geometry.y.values

    # Index pairs (i, j) with i < j: every unordered pair exactly once.
    first, second = np.triu_indices(len(xs), k=1)

    # our distance
    ours = haversine_dist(xs[first], ys[first], xs[second], ys[second])

    # their distance: rows are (y, x) in radians, y first
    ys_rad = np.asarray([radians(value) for value in ys])
    xs_rad = np.asarray([radians(value) for value in xs])
    yx_rad = np.column_stack((ys_rad, xs_rad))
    theirs_full = haversine_distances(yx_rad, yx_rad) * 6371000
    theirs = theirs_full[first, second]

    assert np.sum(np.abs(ours - theirs)) < 0.01  # 1cm for 58 should be good enough
def process_similarity(self, similarity):
    """Fill ``self._similarity_matrix`` from ``self._attribute_matrix``.

    ``"dot"`` replaces the whole matrix with the dense product of the
    attribute matrix and its transpose.  Every other supported name
    computes a full pairwise score matrix and copies only its strict upper
    triangle into the existing ``self._similarity_matrix``.  Except for
    ``"cosine"``, the score is ``1 / (1 + value)`` where ``value`` comes
    from the matching scikit-learn pairwise function.

    Raises
    ------
    Exception
        If ``similarity`` is not one of the supported names.
    """
    attributes = self._attribute_matrix

    if similarity == "dot":
        self._similarity_matrix = (attributes @ attributes.T).toarray()
        return

    # Metrics forwarded to pairwise_distances on the matrix as stored.
    forwarded_as_is = ('cityblock', 'l1', 'l2')
    # Metrics forwarded to pairwise_distances after densifying the matrix.
    forwarded_dense = (
        'braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice',
        'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'minkowski',
        'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener',
        'sokalsneath', 'sqeuclidean', 'yule',
    )

    if similarity == "cosine":
        scores = cosine_similarity(attributes)
    elif similarity == "euclidean":
        scores = 1 / (1 + euclidean_distances(attributes))
    elif similarity == "manhattan":
        scores = 1 / (1 + manhattan_distances(attributes))
    elif similarity == "haversine":
        scores = 1 / (1 + haversine_distances(attributes))
    elif similarity == "chi2":
        scores = 1 / (1 + chi2_kernel(attributes))
    elif similarity in forwarded_as_is:
        scores = 1 / (1 + pairwise_distances(attributes, metric=similarity))
    elif similarity in forwarded_dense:
        scores = 1 / (1 + pairwise_distances(attributes.toarray(), metric=similarity))
    else:
        raise Exception("Not implemented similarity")

    # Only entries strictly above the diagonal are overwritten.
    upper_rows, upper_cols = np.triu_indices(self._similarity_matrix.shape[0], k=1)
    self._similarity_matrix[upper_rows, upper_cols] = scores[upper_rows, upper_cols]
def compute_clusters(self):
    """
    Find clusters using DBSCAN algorithm

    Returns
    -------
    tup : tuple
        centroids, number of points, and sizes of the clusters found,
        exactly as returned by ``get_cluster_data``
    """
    X, date_distances = self.transform_data()
    X_rad = np.radians(np.asarray(X))  # scikit method takes radians

    # 2-D table of haversine distances between each pair of points
    distance_pairs = haversine_distances(X_rad, X_rad)
    # Normalize distances.  When every point coincides the maximum is 0;
    # dividing would fill the table with NaN, so it is left as all zeros.
    max_distance = distance_pairs.max()
    if max_distance > 0:
        distance_pairs /= max_distance

    # Weight of space and time distances
    # Found by experimentation
    prop = 0.98

    # Distance is weighted average of space distance and time distance
    space_time_distance = prop * distance_pairs + (1 - prop) * date_distances

    # epsilon is the max distance for 2 points to be considered "close"
    # 0.014 has been found by experimentation
    Y = DBSCAN(eps=0.014, metric="precomputed").fit_predict(space_time_distance)
    return self.get_cluster_data(X, Y)
def find_closest_ll(input_ll, reference_ll, n=1):
    """
    Find the closest pairing of longitude and latitudes from the input
    set to the reference set

    Parameters
    ----------
    input_ll, reference_ll : pandas.DataFrame
        Must contain ``longitude`` and ``latitude`` columns (degrees).
        Both frames gain ``longitude_rad`` and ``latitude_rad`` columns as
        a side effect.
    n : int
        Number of reference neighbours to look up per input row.

    Returns
    -------
    distances : numpy.ndarray
        1-D array aligned with ``indices.flatten()``: the haversine
        distance between each input row and each of its ``n`` neighbours,
        scaled by ``EARTH_RADIUS / 1000`` (presumably kilometres if
        ``EARTH_RADIUS`` is in metres -- verify against its definition).
    indices : numpy.ndarray
        Neighbour positions in ``reference_ll`` as returned by
        ``NearestNeighbors.kneighbors``.

    Notes
    -----
    Neighbours are selected with scikit-learn's default metric on the raw
    longitude/latitude values; only the reported distance is great-circle.
    """
    nbrs = NearestNeighbors(n_neighbors=n).fit(
        reference_ll[['longitude', 'latitude']].values)
    _, indices = nbrs.kneighbors(input_ll[['longitude', 'latitude']].values)

    input_ll['longitude_rad'] = input_ll['longitude'].apply(radians)
    input_ll['latitude_rad'] = input_ll['latitude'].apply(radians)
    reference_ll['longitude_rad'] = reference_ll['longitude'].apply(radians)
    reference_ll['latitude_rad'] = reference_ll['latitude'].apply(radians)

    # haversine_distances reads each point as (latitude, longitude).
    rad_columns = ['latitude_rad', 'longitude_rad']
    # Repeat every input row once per neighbour so rows stay paired with
    # the flattened neighbour indices when n > 1.
    queries = np.repeat(input_ll[rad_columns].values, indices.shape[1], axis=0)
    matches = reference_ll.iloc[indices.flatten()][rad_columns].values

    distances = np.array([
        # Off-diagonal entry of the 2x2 matrix is the pair's distance.
        haversine_distances([query, match])[0, 1] * EARTH_RADIUS / 1000
        for query, match in zip(queries, matches)
    ])
    return distances, indices
def binned_variance_batch(inds1, inds2, bin_edges, coords, X):
    """Bin pairwise differences of ``X`` by distance for one batch of pairs.

    Rows ``inds1[0]:inds1[1]`` of ``coords`` are compared with rows
    ``inds2[0]:inds2[1]``.  The haversine distances are scaled by
    6371000 / 1000, then every entry on or above diagonal
    ``abs(inds1[0] - inds2[0])`` is set to -1.

    Returns
    -------
    n_samples, means, variances : numpy.ndarray
        One value per ``[lower, upper)`` interval of ``bin_edges``; means
        and variances are 0 for intervals that matched no pair.
    """
    row_offset = inds1[0]
    col_offset = inds2[0]

    # Compute distances and convert to km.
    dist_km = haversine_distances(
        coords[row_offset:inds1[1]],
        coords[col_offset:inds2[1]],
    )
    dist_km *= 6371000 / 1000

    # Mark the excluded triangle with -1.
    excluded = np.triu_indices(
        n=dist_km.shape[0],
        m=dist_km.shape[1],
        k=abs(row_offset - col_offset),
    )
    dist_km[excluded] = -1

    n_bins = bin_edges.shape[0] - 1
    n_samples = np.empty((n_bins, ), dtype=np.int64)
    means = np.empty((n_bins, ))
    variances = np.empty((n_bins, ))

    for bin_index, (lower, upper) in enumerate(zip(bin_edges[:-1], bin_edges[1:])):
        # Pairs whose distance falls inside this bin.
        in_bin = (dist_km >= lower) & (dist_km < upper)
        rows, cols = np.where(in_bin)

        diffs = np.empty((rows.size, ))
        for position, (i, j) in enumerate(zip(rows, cols)):
            diffs[position] = X[row_offset + i] - X[col_offset + j]

        count = diffs.size
        n_samples[bin_index] = count
        if count:
            means[bin_index] = np.mean(diffs)
            variances[bin_index] = np.var(diffs)
        else:
            means[bin_index] = 0
            variances[bin_index] = 0

    return n_samples, means, variances
def obtain_dist(c_a, c_b):
    """Return the haversine distance between two points, scaled by 6371000.

    ``c_a`` and ``c_b`` are sequences of angles in degrees; they are
    converted to radians and handed to ``haversine_distances``.
    """
    points_rad = [[radians(angle) for angle in point] for point in (c_a, c_b)]
    # 2x2 matrix; the off-diagonal entry is the distance between the points.
    pairwise = haversine_distances(points_rad)
    return pairwise[0][1] * 6371000
def gen_distance_matrix(df: pd.DataFrame, cluster: int) -> pd.DataFrame:
    """
    This function takes in a dataframe, with lon lat coordinates, and a cluster number and calculates the distance
    matrix between points for a single cluster

    :param df: dataframe with the columns 'cluster', 'x_coordinate', 'y_coordinate' and 'identificatie_vbo';
        'x_coordinate' is taken to be the longitude and 'y_coordinate' the latitude, both in degrees
    :param cluster: the cluster number to select
    :return: return_df: a pandas dataframe, containing 'identificatie' as both row and column names, with the
        distance between those 'identificatie' as value in kilometers (empty if the cluster has no rows)
    """
    # Take only the data belonging to the cluster we want
    only_cluster = df[df['cluster'] == cluster]
    ids = only_cluster.identificatie_vbo

    # Nothing to compare: haversine_distances rejects empty input.
    if only_cluster.empty:
        return pd.DataFrame(columns=ids, index=ids, dtype=float)

    # Take out only the coordinates
    cluster_coords = only_cluster[['x_coordinate', 'y_coordinate']]

    # haversine_distances reads each point as (latitude, longitude) in
    # radians, so y (latitude) goes first and x (longitude) second.
    in_radians = [[radians(y), radians(x)] for x, y in cluster_coords.values]

    # Multiply the unit-sphere distances by the mean radius of the earth to get kilometers
    result = haversine_distances(in_radians) * 6371.0088

    # Add 'identificatie' as column and row names
    return pd.DataFrame(result, columns=ids, index=ids)
def distance_matrix(X1: np.ndarray, X2: np.ndarray, units: str = "km", fast_dist: bool = False) -> np.ndarray:
    """
    Pairwise distances between every point of ``X1`` and every point of ``X2``.

    Three modes, checked in this order:

    - ``fast_dist=True``: haversine distance of the radian-converted
      inputs multiplied by ``EARTH_RADIUS``; ``units`` is ignored.
    - ``units`` given: ``scipy.spatial.distance.cdist`` with
      ``geopy.distance.geodesic`` as metric, read out in ``units``.
    - ``units=None``: plain ``cdist`` (Euclidean).

    NOTE:
    - points should be formatted in rows as [lat, lon]
    - a single point may be passed as a 1-D array
    """
    # enforce 2d array in case of single point
    X1 = np.atleast_2d(X1)
    X2 = np.atleast_2d(X2)

    if fast_dist:
        return haversine_distances(np.radians(X1), np.radians(X2)) * EARTH_RADIUS

    if units is None:
        # Euclidean distance
        return cdist(X1, X2)

    def _geodesic_in_units(s_i, s_j):
        return getattr(geodesic(s_i, s_j), units)

    # geodesic distances in specified units
    return cdist(X1, X2, _geodesic_in_units)
def get_haversine(x, ref_lat=41.8889, ref_long=-87.6264, radius=6357000):
    """Return the haversine distance from a row to a reference point.

    Parameters
    ----------
    x : mapping
        Provides ``'Latitude'`` and ``'Longitude'`` in degrees (for
        example a DataFrame row).
    ref_lat, ref_long : float
        Reference point in degrees.  The defaults are the values that were
        previously hard-coded.
    radius : float
        Sphere radius the unit-sphere distance is multiplied by; the
        result is in the same unit.  The default 6357000 is the value
        previously hard-coded.  NOTE(review): it is smaller than the
        6371000 m mean Earth radius commonly used with the haversine
        formula -- confirm it is intentional.
    """
    point = [radians(x['Latitude']), radians(x['Longitude'])]
    reference = [radians(ref_lat), radians(ref_long)]
    # 2x2 matrix; the off-diagonal entry is the distance between the points.
    return (haversine_distances([point, reference]) * radius)[0][1]
def sklearn_example():
    """Print the 2x2 distance matrix between two airports."""
    # Ezeiza Airport (Buenos Aires, Argentina) and
    # Charles de Gaulle Airport (Paris, France)
    airports_deg = (
        [-34.83333, -58.5166646],
        [49.0083899664, 2.53844117956],
    )
    airports_rad = [[radians(angle) for angle in airport] for airport in airports_deg]
    unit_sphere = haversine_distances(airports_rad)
    print(unit_sphere * 6371000/1000)  # multiply by Earth radius to get kilometers
def haversine(row):
    """Return the haversine distance between the two stations of ``row``.

    ``row`` supplies ``rad_lat_i``/``rad_lon_i`` and ``rad_lat_j``/
    ``rad_lon_j``; the values are passed on unchanged (the names suggest
    they are already in radians).
    """
    endpoints = [
        [row['rad_lat_i'], row['rad_lon_i']],
        [row['rad_lat_j'], row['rad_lon_j']],
    ]
    # multiply by Earth radius to get kilometers
    km_matrix = haversine_distances(endpoints) * 6371000/1000
    return km_matrix[0][1]
def haversine_distance(orig_long, orig_lat, dest_long, dest_lat):
    """Return the haversine distance between origin and destination.

    All four arguments are angles in degrees.  Each point is passed to
    ``haversine_distances`` as (latitude, longitude) in radians and the
    result is scaled by 6371000 / 1000.
    """
    endpoints_rad = [
        [radians(orig_lat), radians(orig_long)],
        [radians(dest_lat), radians(dest_long)],
    ]
    unit_sphere = haversine_distances(endpoints_rad)[0][1]
    return unit_sphere * 6371000 / 1000
def great_circle(loc1, lat2, long2):
    """Return haversine distances from every point in ``loc1`` to one point.

    ``loc1`` is array-like in degrees and ``(lat2, long2)`` is the single
    comparison point in degrees.  The result is the matrix returned by
    ``haversine_distances`` (one column), scaled by 6371000 / 1000.
    """
    points_rad = np.radians(np.array(loc1))
    target_rad = np.radians(np.array([lat2, long2]).reshape(1, 2))
    unit_sphere = haversine_distances(points_rad, target_rad)
    return unit_sphere * 6371000 / 1000
def calc_matrices(invar, lon, lat, return_all=False):
    """
    Calculate correlation, covariance, and distance matrices in preparation
    for clustering.

    Parameters
    ----------
    invar : ARRAY (Time x Lat x Lon)
        Input variable
    lon : ARRAY (Lon)
        Longitudes
    lat : ARRAY (Lat)
        Latitudes
    return_all : BOOL, optional
        Set to true to return non-nan points, indices, and coordinates.
        The default is False.

    Returns
    -------
    srho: ARRAY [npts x npts]
        Correlation Matrix
    scov: ARRAY [npts x npts]
        Covariance Matrix
    sdist: ARRAY [npts x npts]
        Distance Matrix (haversine distance scaled by 6371)
    okdata, okpts, coords2 : only when return_all is True
        The data and point selector returned by ``proc.find_nan`` and the
        [npts x 2] array of (latitude, longitude) in radians.
    """
    # ---------------------
    # Remove All NaN Points
    # ---------------------
    ntime, nlat, nlon = invar.shape
    varrs = invar.reshape(ntime, nlat * nlon)
    okdata, _, okpts = proc.find_nan(varrs, 0)

    # ---------------------------------------------
    # Calculate Correlation and Covariance Matrices
    # ---------------------------------------------
    # One row per point.  Passing the data once yields the same
    # npts x npts block that stacking it with itself and slicing did,
    # without building a 2*npts x 2*npts matrix.  atleast_2d keeps the
    # result 2-D when only one point survives.
    by_point = okdata.T
    srho = np.atleast_2d(np.corrcoef(by_point))
    scov = np.atleast_2d(np.cov(by_point))

    # --------------------------
    # Calculate Distance Matrix
    # --------------------------
    lonmesh, latmesh = np.meshgrid(lon, lat)
    lon_ok = lonmesh.flatten()[okpts]
    lat_ok = latmesh.flatten()[okpts]
    # First column is latitude, second is longitude (radians).
    coords2 = np.radians(np.column_stack((lat_ok, lon_ok)))
    sdist = haversine_distances(coords2, coords2) * 6371

    if return_all:
        return srho, scov, sdist, okdata, okpts, coords2
    return srho, scov, sdist
def get_pairwise_dists(df, lat_col, lng_col):
    """Return a DataFrame of pairwise haversine distances between rows.

    ``lat_col`` and ``lng_col`` name the columns of ``df`` holding
    latitude and longitude in degrees.  The result is indexed by
    ``df.index`` on both axes and scaled by the radius below.
    """
    # approximate radius of earth in ft (mi * ft/mi)
    radius_ft = 3959.87433 * 5280

    lat_rad = df[lat_col].apply(math.radians)
    lng_rad = df[lng_col].apply(math.radians)
    # Two rows (lat, lng) transposed into one (lat, lng) row per point.
    points_rad = pd.DataFrame([lat_rad, lng_rad]).T

    unit_sphere = pd.DataFrame(
        haversine_distances(points_rad),
        index=df.index,
        columns=df.index,
    )
    return unit_sphere * radius_ft  # (converting radians to feet)
def mask_sig_to_cluster(mask_and_data_s, wght_area, distance_eps, min_area_samples, n_jobs=-1):
    """Label significant grid cells with DBSCAN, separately per sign.

    For each sign (-1, then +1) the cells where ``mask`` is False and
    ``data`` has that sign are gathered across all lags and clustered on
    their haversine distances (scaled by 6371000/1000), using ``wght_area``
    as sample weights.  Labels of the second sign continue after the
    largest label written for the first.

    Returns
    -------
    np_regs : numpy.ndarray of int, shape (lag, latitude, longitude)
        Cluster label per cell; 0 where no cluster was assigned.
    labels_sign_lag : list of (label, sign) tuples
        One entry per non-zero label found.
    """
    from sklearn import cluster
    from math import radians as _r
    from sklearn.metrics.pairwise import haversine_distances

    # True where the input mask is False.
    significant = ~mask_and_data_s.mask.astype('bool').values
    data = mask_and_data_s.data
    lons = mask_and_data_s.longitude.values
    lats = mask_and_data_s.latitude.values
    n_lags = mask_and_data_s.lag.size
    grid_shape = (n_lags, lats.size, lons.size)

    np_dbregs = np.zeros(grid_shape, dtype=int)
    labels_sign_lag = []
    label_start = 0

    for sign in [-1, 1]:
        mask = significant.copy()
        mask[np.sign(data) != sign] = False
        n_gc_sig_sign = np.count_nonzero(mask)
        # Row `lag` flags which of the pooled samples belong to that lag.
        labels_for_lag = np.zeros((n_lags, n_gc_sig_sign), dtype=bool)
        lon_grid, lat_grid = np.meshgrid(lons.data, lats.data)
        mask_sig = np.reshape(mask, grid_shape)

        sign_coords = []
        weights_core_samples = []
        count = 0
        for lag in range(n_lags):
            selected = mask_sig[lag, :, :]
            lon_sel = lon_grid[selected]
            lat_sel = lat_grid[selected]
            n_sign_c_lag = len(lon_sel)
            labels_for_lag[lag][count:count + n_sign_c_lag] = True
            count += n_sign_c_lag
            # shape sign_coords = [(lats, lons)]
            sign_coords.append(
                [[_r(lat), _r(lon - 180)] for lon, lat in zip(lon_sel, lat_sel)]
            )
            weights_core_samples.append(wght_area[selected].reshape(-1))

        sign_coords = flatten(sign_coords)
        if len(sign_coords) == 0:
            continue

        weights_core_samples = flatten(weights_core_samples)
        # calculate distance between sign coords accross all lags to keep
        # labels more consistent when clustering
        # multiply by Earth radius to get kilometers
        distance = haversine_distances(sign_coords) * 6371000/1000
        dbscan = cluster.DBSCAN(
            eps=distance_eps,
            min_samples=min_area_samples,
            metric='precomputed',
            n_jobs=n_jobs,
        )
        dbresult = dbscan.fit(distance, sample_weight=weights_core_samples)
        labels = dbresult.labels_ + 1
        # all labels == -1 (now 0) are seen as noise; after the shift
        # below they end up as 0 again:
        labels[labels == 0] = -label_start
        individual_labels = labels + label_start
        labels_sign_lag.extend(
            (label, sign) for label in np.unique(individual_labels) if label != 0
        )
        for lag in range(n_lags):
            mask_sig_lag = mask[lag, :, :] == True
            np_dbregs[lag, :, :][mask_sig_lag] = individual_labels[labels_for_lag[lag]]
        label_start = int(np_dbregs[mask].max())

    np_regs = np.array(np_dbregs, dtype='int')
    return np_regs, labels_sign_lag
def take_dist_mat(df):
    '''
    Pairwise haversine distance matrix of the rows of ``df`` (columns
    ``lat`` and ``lon`` in degrees), in km, rounded to 2 decimals.
    '''
    coords_rad = [
        [radians(lat), radians(lon)]
        for lat, lon in zip(df.lat.tolist(), df.lon.tolist())
    ]
    km_matrix = haversine_distances(coords_rad, coords_rad) * 6371
    return np.round(km_matrix, 2)
def distance_to_station(my_cords, station_cords):
    """Calculates distance from one coordinate to another.

    Both arguments are sequences of angles in degrees; the result is the
    haversine distance scaled by 6371000 / 1000.
    """
    endpoints_rad = [
        [radians(angle) for angle in point]
        for point in (my_cords, station_cords)
    ]
    # multiply by Earth radius to get kilometers
    km_matrix = haversine_distances(endpoints_rad) * 6371000 / 1000
    return km_matrix[1][0]
def test_example_from_sklean(self):
    """Compare ``haversine_dist`` with scikit-learn on two fixed points."""
    bsas = [-34.83333, -58.5166646]
    paris = [49.0083899664, 2.53844117956]

    both_rad = [[radians(angle) for angle in point] for point in (bsas, paris)]
    theirs = haversine_distances(both_rad) * 6371000

    # haversine_dist receives each point with its two values swapped
    # relative to the order scikit-learn gets them in.
    ours = haversine_dist(bsas[1], bsas[0], paris[1], paris[0])

    assert np.abs(theirs[1][0] - ours) < 0.01
def calc_distance(a, b):
    """Return the haversine distance between positions ``a`` and ``b``.

    Both positions are sequences of angles in degrees; the result is
    scaled by 6371.
    """
    # Convert both positions from degrees to radians
    positions_rad = [[math.radians(angle) for angle in position] for position in (a, b)]

    # 2x2 matrix of unit-sphere distances
    pairwise = haversine_distances(positions_rad)
    pairwise *= 6371  # multiply by Earth radius to get kilometers
    return pairwise[0, 1]
def get_max_distance(coordinates):
    """Gets the maximum distance between a set of co-ordinates.

    Parameters:
        coordinates (numpy array of lat, lon): list of points, passed to
            ``haversine_distances`` unchanged

    Returns:
        largest entry of the pairwise haversine matrix, with no scaling
        applied.  NOTE(review): scikit-learn documents the input as
        radians and the output as unit-sphere distance -- confirm callers
        convert both.
    """
    pairwise = haversine_distances(coordinates)
    return np.max(pairwise)
def haversine_adapted(point_1, point_2):
    """Return the haversine distance between two points given in degrees."""
    # lat lon to radians for haversine
    endpoints_rad = [[radians(angle) for angle in point] for point in (point_1, point_2)]
    pairwise = haversine_distances(endpoints_rad)
    # convert to km
    pairwise *= 6371000 / 1000
    # pairwise is a 2d distance matrix,
    #    0, dist
    # dist,    0
    return pairwise[0][1]
def test_haversine_distances():
    """Check haversine_distances against a pair-by-pair reference."""

    def slow_haversine_distances(x, y):
        # Reference formula for a single pair of points.
        diff_lat = y[0] - x[0]
        diff_lon = y[1] - x[1]
        lat_term = np.sin(diff_lat / 2)**2
        lon_term = np.cos(x[0]) * np.cos(y[0]) * np.sin(diff_lon / 2)**2
        return 2 * np.arcsin(np.sqrt(lat_term + lon_term))

    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 2))
    Y = rng.random_sample((10, 2))
    expected = np.array([[slow_haversine_distances(x, y) for y in Y]
                         for x in X])
    actual = haversine_distances(X, Y)
    assert_array_almost_equal(expected, actual)

    # Test haversine distance does not accept X where n_feature != 2
    X = rng.random_sample((10, 3))
    err_msg = "Haversine distance only valid in 2 dimensions"
    with pytest.raises(ValueError, match=err_msg):
        haversine_distances(X)
def gps_distance(p1: list, p2: list):
    """
    @param p[1/2]: Coordinate Point (latitude, longitude) in floating angular notation
    @return: The Distance between the coordinate points [meter]
    """
    r_earth = 6371000  # earth radius in meter
    endpoints_rad = [[math.radians(angle) for angle in point] for point in (p1, p2)]
    # 2x2 matrix of unit-sphere distances, scaled to meters
    meters = haversine_distances(endpoints_rad) * r_earth
    return meters[0][1]
def get_pairs(geos, df):
    """Get pairwise comparisons

    Within each state, every treated tract (``status == "Selected"``) is
    paired with the nearest untreated tract that has a non-missing
    ``annual_change`` for 2018.  Nearness is the haversine distance
    between the tract centroids ``intptlat``/``intptlon``; ``dist`` is
    left unscaled (unit-sphere distance).  States that have no treated
    or no untreated tracts contribute no pairs.

    Returns a long DataFrame with one row per tract, pair and year,
    including ``pair_id``, ``treatment`` and ``post_treatment``.
    """
    # Clean centroid data
    geos_for_pairwise_comp = (
        geos.set_index("geoid")
        .assign(treated=lambda x: x["status"] == "Selected")[
            ["statefp", "intptlat", "intptlon", "treated"]
        ]
        .transform_column("intptlat", float)
        .transform_column("intptlon", float)
    )

    # Tracts with nonmissing housing price data
    with_data = set(
        df.query("year == 2018")
        .dropna(subset=["annual_change"])["tract"]
        .unique()
    )

    rad_per_degree = 1 / 360 * 2 * np.pi
    # haversine_distances reads each point as (latitude, longitude).
    lat_lon = ["intptlat", "intptlon"]

    pair_dfs = []
    for state in geos_for_pairwise_comp.statefp.unique():
        state_data = geos_for_pairwise_comp.query("statefp == @state").copy()
        x = state_data.query("treated")[lat_lon] * rad_per_degree
        y = state_data.query("not treated")[lat_lon] * rad_per_degree
        if x.empty or y.empty:
            # No pair can be formed, and haversine_distances rejects
            # empty input.
            continue
        x_index = x.index
        y_index = y.index
        y_index_data = y_index.isin(with_data)

        dist_mat = haversine_distances(X=x, Y=y)
        # Distance is infinity to places with missing data in order to exclude them
        dist_mat[:, ~y_index_data] = np.inf
        min_dist_control = y_index[dist_mat.argmin(axis=1)]
        pair_dfs.append(
            pd.DataFrame({
                "treated": x_index,
                "untreated": min_dist_control,
                "dist": dist_mat.min(axis=1),
            }).assign(statefp=state))

    pair_df = pd.concat(pair_dfs)
    pair_df = (
        pair_df.reset_index(drop=True)
        .reset_index()
        .melt(["statefp", "index", "dist"])
        .sort_values("index")
        .rename_column("variable", "treatment")
        .rename_column("value", "tract")
        .reset_index(drop=True)
        .merge(df[["tract", "annual_change", "year"]], on="tract", how="left")
        .sort_values(["statefp", "year", "index", "treatment"])
        .rename_column("index", "pair_id")
        .assign(post_treatment=lambda x: x.year >= 2018)
    )
    return pair_df
def kantenmodell(d):
    """Add per-edge deltas between each row and the next one to ``d``.

    ``d`` is modified in place and returned.  It must contain the columns
    ``Elevation``, ``HV Battery SOC_%_``, ``Latitude_deg_`` and
    ``Longitude_deg_``.  Three columns are added, each describing the
    step from a row to its successor (the last row therefore gets NaN):

    * ``elev_delta`` -- change in elevation
    * ``soc_delta`` -- change in state of charge
    * ``distance`` -- great-circle distance in kilometers
    """
    ### Elevation Change
    d['elev_delta'] = d['Elevation'].shift(-1) - d['Elevation']

    ### State of Charge Change
    d['soc_delta'] = d['HV Battery SOC_%_'].shift(-1) - d['HV Battery SOC_%_']

    ### Distance
    position = d[['Latitude_deg_', 'Longitude_deg_']]
    # The haversine formula works on radians; the columns hold degrees.
    start = np.radians(position.astype(float).values)
    end = np.radians(position.shift(-1).astype(float).values)

    # Only consecutive pairs are needed, so the formula is applied row by
    # row instead of building a full n x n distance matrix.
    half_dlat = (end[:, 0] - start[:, 0]) / 2
    half_dlon = (end[:, 1] - start[:, 1]) / 2
    a = (np.sin(half_dlat) ** 2
         + np.cos(start[:, 0]) * np.cos(end[:, 0]) * np.sin(half_dlon) ** 2)
    # Clip guards against rounding pushing `a` just outside [0, 1]; NaN
    # (the last row has no successor) passes through unchanged.
    central_angle = 2 * np.arcsin(np.sqrt(np.clip(a, 0.0, 1.0)))
    d['distance'] = central_angle * 6371000 / 1000

    return d
def get_cluster_data(self, X, Y):
    """
    Use clustering computed by DBSCAN to find:
        * centroid of each cluster
        * number of points per cluster
        * radius of each cluster

    Parameters
    ----------
    X : numpy array
        points clustered, in degrees
    Y : numpy array
        cluster decision vector; labels 0..max(Y) are reported, negative
        labels are ignored

    Returns
    -------
    centroids : list
        centroids of clusters
    num_points : list
        number of points in clusters
    sizes : list
        sizes of clusters (in kilometers)
    """
    centroids = []
    num_points = []
    sizes = []

    labels = np.asarray(Y)
    if labels.size == 0:
        # Nothing was clustered; np.max would raise on an empty vector.
        return centroids, num_points, sizes

    for label in range(np.max(labels) + 1):
        points_in_cluster = X[labels == label]

        # Centroid is arithmetic mean of point coordinates
        centroid = np.mean(points_in_cluster, axis=0)
        centroids.append(centroid)
        num_points.append(len(points_in_cluster))

        # Radius of cluster is distance from centroid to farthest point
        if len(points_in_cluster):
            to_centroid = haversine_distances(
                np.radians(points_in_cluster),
                np.radians(centroid).reshape(1, -1),
            )
            # Multiply by radius of Earth to get kilometers
            size = to_centroid.max() * 6371
        else:
            size = 0
        sizes.append(size)

    return centroids, num_points, sizes
def parse_toy_data(data_dir="."):
    """Load ``cities-us0.txt`` and return normalised pairwise distances.

    After the header line, every line is split on whitespace: field 1 and
    field 2 are read as the two coordinates in degrees and the fields from
    index 3 on form the name.

    Returns
    -------
    tuple
        ``(condensed, names, count)``: the haversine distance matrix
        divided by its maximum and passed through ``squareform``, the
        array of names, and the number of names.
    """
    coords_rad = []
    names = []
    with open(f"{data_dir}/cities-us0.txt", "r") as in_file:
        # ignore first line
        next(in_file, None)
        for line in in_file:
            fields = line.split()
            coords_rad.append((radians(float(fields[1])), radians(float(fields[2]))))
            names.append(" ".join(fields[3:]))

    X = np.array(coords_rad)
    dists = haversine_distances(X)
    # * 6_371_000 / 1_000 to km
    dists /= dists.max()
    return squareform(dists), np.array(names), len(names)
def connectTrafficData(accData, trafData, inplace=True, hardsave=False):
    '''
    Attaches traffic data to accident data as 'Traffic' column

    Parameters:
        accData: Pandas dataframe of the accident data; needs the columns
            'Year', 'Latitude' and 'Longitude' (degrees)
        trafData: Pandas dataframe of traffic data; needs the columns
            'year', 'latitude', 'longitude' (degrees), 'count_point_id'
            and 'all_motor_vehicles'
        inplace: Default True. If True, adds the columns 'CP', 'Traffic',
            'CPlatitude' and 'CPlongitude' to accData and returns None.
            If false will return closest array which can be used to add
            traffic data.
        hardsave: Default False. If true (and inplace is true) will save
            the resulting DataFrame in the Data directory.

    Returns:
        closest: Array with one row per accident, in the row order of
            accData: great-circle distance to the closest traffic CP
            (checkpoint) of the same year in radians on the unit sphere
            (multiply by ~6371 for kilometers), CP id, traffic volume, CP
            latitude, CP longitude. Only returned when inplace is False.
    '''
    # Haversine distance finds the actual distance between two points given their latitude and longitude
    # Accuracy for Haversine formula is within 1%, doesn't account for ellipsoidal shape of the earth.
    from sklearn.metrics.pairwise import haversine_distances

    closest = np.ones((len(accData), 5)) * 10

    acc_years = accData['Year'].values
    # haversine_distances expects (latitude, longitude) in radians.
    acc_rad = np.radians(accData[['Latitude', 'Longitude']].values.astype(float))

    for year in np.unique(acc_years):
        # Positions of this year's accidents inside accData, so results
        # land on the right rows even when accData is not sorted by year.
        rows = np.flatnonzero(acc_years == year)
        curTraf = trafData[trafData['year'] == year]
        traf_rad = np.radians(curTraf[['latitude', 'longitude']].values.astype(float))
        cp_ids = curTraf['count_point_id'].values
        cp_traffic = curTraf['all_motor_vehicles'].values
        cp_lat = curTraf['latitude'].values
        cp_lon = curTraf['longitude'].values

        # One accident at a time keeps memory at a single row of distances.
        for row in rows:
            distances = haversine_distances(acc_rad[row].reshape((1, -1)), traf_rad)
            CPindex = distances.argmin()
            closest[row, 0] = distances[0, CPindex]
            closest[row, 1] = cp_ids[CPindex]
            closest[row, 2] = cp_traffic[CPindex]
            closest[row, 3] = cp_lat[CPindex]
            closest[row, 4] = cp_lon[CPindex]

    if inplace:
        accData['CP'] = closest[:, 1].copy()
        accData['Traffic'] = closest[:, 2].copy()
        accData['CPlatitude'] = closest[:, 3].copy()
        accData['CPlongitude'] = closest[:, 4].copy()
        if hardsave:
            accData.to_csv("data/accidents_2005_to_2014_wTraffic.csv")
    else:
        return closest
def test_haversine_distances():
    """Check haversine_distances against a pair-by-pair reference."""

    def slow_haversine_distances(x, y):
        # Reference formula for a single pair of points.
        half_dlat = (y[0] - x[0]) / 2
        half_dlon = (y[1] - x[1]) / 2
        a = np.sin(half_dlat) ** 2 + (
            np.cos(x[0]) * np.cos(y[0]) * np.sin(half_dlon) ** 2
        )
        return 2 * np.arcsin(np.sqrt(a))

    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 2))
    Y = rng.random_sample((10, 2))
    reference = np.array([[slow_haversine_distances(x, y) for y in Y]
                          for x in X])
    computed = haversine_distances(X, Y)
    assert_array_almost_equal(reference, computed)

    # Test haversine distance does not accept X where n_feature != 2
    X = rng.random_sample((10, 3))
    assert_raise_message(
        ValueError,
        "Haversine distance only valid in 2 dimensions",
        haversine_distances,
        X,
    )
def test_pairwise_distances():
    """Test the pairwise_distances helper against the direct functions."""
    rng = np.random.RandomState(0)

    def _assert_shape(matrix, n_rows, n_cols):
        assert_equal(matrix.shape[0], n_rows)
        assert_equal(matrix.shape[1], n_cols)

    # Euclidean distance should be equivalent to calling the function.
    X = rng.random_sample((5, 4))
    via_helper = pairwise_distances(X, metric="euclidean")
    assert_array_almost_equal(via_helper, euclidean_distances(X))

    # Euclidean distance, with Y != X.
    Y = rng.random_sample((2, 4))
    via_helper = pairwise_distances(X, Y, metric="euclidean")
    assert_array_almost_equal(via_helper, euclidean_distances(X, Y))

    # Test with tuples as X and Y
    X_tuples = tuple(tuple(row) for row in X)
    Y_tuples = tuple(tuple(row) for row in Y)
    from_tuples = pairwise_distances(X_tuples, Y_tuples, metric="euclidean")
    assert_array_almost_equal(via_helper, from_tuples)

    # Test haversine distance
    # The data should be valid latitude and longitude
    X = rng.random_sample((5, 2))
    X[:, 0] = (X[:, 0] - 0.5) * 2 * np.pi/2
    X[:, 1] = (X[:, 1] - 0.5) * 2 * np.pi
    via_helper = pairwise_distances(X, metric="haversine")
    assert_array_almost_equal(via_helper, haversine_distances(X))

    # Test haversine distance, with Y != X
    Y = rng.random_sample((2, 2))
    Y[:, 0] = (Y[:, 0] - 0.5)*2*np.pi/2
    Y[:, 1] = (Y[:, 1] - 0.5)*2*np.pi
    via_helper = pairwise_distances(X, Y, metric="haversine")
    assert_array_almost_equal(via_helper, haversine_distances(X, Y))

    # "cityblock" uses scikit-learn metric, cityblock (function) is
    # scipy.spatial.
    via_name = pairwise_distances(X, metric="cityblock")
    via_callable = pairwise_distances(X, metric=cityblock)
    assert_equal(via_name.shape[0], via_name.shape[1])
    assert_equal(via_name.shape[0], X.shape[0])
    assert_array_almost_equal(via_name, via_callable)

    # The manhattan metric should be equivalent to cityblock.
    via_name = pairwise_distances(X, Y, metric="manhattan")
    via_callable = pairwise_distances(X, Y, metric=cityblock)
    _assert_shape(via_name, X.shape[0], Y.shape[0])
    assert_array_almost_equal(via_name, via_callable)

    # Test cosine as a string metric versus cosine callable
    # The string "cosine" uses sklearn.metric,
    # while the function cosine is scipy.spatial
    via_name = pairwise_distances(X, Y, metric="cosine")
    via_callable = pairwise_distances(X, Y, metric=cosine)
    _assert_shape(via_name, X.shape[0], Y.shape[0])
    assert_array_almost_equal(via_name, via_callable)

    # Test with sparse X and Y,
    # currently only supported for Euclidean, L1 and cosine.
    X_sparse = csr_matrix(X)
    Y_sparse = csr_matrix(Y)

    via_helper = pairwise_distances(X_sparse, Y_sparse, metric="euclidean")
    assert_array_almost_equal(via_helper, euclidean_distances(X_sparse, Y_sparse))

    via_helper = pairwise_distances(X_sparse, Y_sparse, metric="cosine")
    assert_array_almost_equal(via_helper, cosine_distances(X_sparse, Y_sparse))

    via_helper = pairwise_distances(X_sparse, Y_sparse.tocsc(), metric="manhattan")
    direct = manhattan_distances(X_sparse.tobsr(), Y_sparse.tocoo())
    assert_array_almost_equal(via_helper, direct)
    # The dense inputs must give the same manhattan result.
    assert_array_almost_equal(via_helper, manhattan_distances(X, Y))

    # Test with scipy.spatial.distance metric, with a kwd
    kwds = {"p": 2.0}
    via_name = pairwise_distances(X, Y, metric="minkowski", **kwds)
    via_callable = pairwise_distances(X, Y, metric=minkowski, **kwds)
    assert_array_almost_equal(via_name, via_callable)

    # same with Y = None
    kwds = {"p": 2.0}
    via_name = pairwise_distances(X, metric="minkowski", **kwds)
    via_callable = pairwise_distances(X, metric=minkowski, **kwds)
    assert_array_almost_equal(via_name, via_callable)

    # Test that scipy distance metrics throw an error if sparse matrix given
    assert_raises(TypeError, pairwise_distances, X_sparse, metric="minkowski")
    assert_raises(TypeError, pairwise_distances, X, Y_sparse,
                  metric="minkowski")

    # Test that a value error is raised if the metric is unknown
    assert_raises(ValueError, pairwise_distances, X, Y, metric="blah")