def process_trip(x, start_time):
    """Build the feature list for a trip from its first and last points.

    Args:
        x: trajectory indexed as ``x[i, j]`` (``i`` selects a point, ``j`` a
            coordinate). It must contain at least one point.
        start_time: trip start, passed unchanged to ``time.localtime``.

    Returns:
        list: ``[weekday, hour, x[-1, 0], x[-1, 1], d_st, h_st, v_mn, head]``
        where ``d_st``/``h_st`` are the distance/heading from the first point
        to ``CITY_CENTER`` and ``v_mn``/``head`` are the distance/heading
        from the first to the last point (both 0 for a single-point trip).
    """
    started = time.localtime(start_time)
    first_point = x[0, :]
    last_point = x[-1, :]

    # First-to-last displacement only exists when there are two or more points.
    if len(x) > 1:
        v_mn = haversine_distance(first_point, last_point)[0]
        head = heading(first_point, last_point)
    else:
        v_mn = 0
        head = 0

    # Position of the trip start relative to the city center.
    # NOTE(review): unlike v_mn, this result is not indexed with [0], so d_st
    # keeps whatever container haversine_distance returns -- confirm intended.
    d_st = haversine_distance(first_point, CITY_CENTER)
    h_st = heading(first_point, CITY_CENTER[0])

    return [
        started.tm_wday,
        started.tm_hour,
        x[-1, 0],
        x[-1, 1],
        d_st,
        h_st,
        v_mn,
        head,
    ]
def distance_intra_cluster(transformed_df, main_df, centroids):
    """Sum customer-to-centroid distances per zone and overall.

    ``transformed_df`` is melted on ``Id_Cliente`` so that every other column
    becomes a ``zones`` value; only rows whose melted ``value`` is > 0 are
    kept. Each remaining row is joined with the customer's ``lat``/``lon``
    from ``main_df`` and with the centroid of its zone, then the Manhattan
    and haversine distances between the two are computed.

    Args:
        transformed_df: frame with an ``Id_Cliente`` column plus one column
            per zone (presumably named ``D1`` .. ``D6`` -- confirm with caller).
        main_df: frame providing ``Id_Cliente``, ``lat`` and ``lon``.
        centroids: iterable of (lat, lon) pairs, paired positionally with the
            zone labels ``D1`` .. ``D6``.

    Returns:
        tuple: ``(total_manhattan, total_haversine)`` over all joined rows.
        The per-zone sums are printed as a side effect.
    """
    zone_labels = ["D1", "D2", "D3", "D4", "D5", "D6"]
    centroids_df = (
        pd.DataFrame.from_dict(
            dict(zip(zone_labels, centroids)),
            orient="index",
            columns=["lat_centroids", "lon_centroids"],
        )
        .reset_index()
        .rename(columns={"index": "zones"})
    )

    # Long format: one row per (customer, zone); keep only positive entries.
    melted = pd.melt(transformed_df, id_vars=['Id_Cliente'], var_name="zones")
    assigned = melted[melted.value > 0]

    customer_coords = main_df[["Id_Cliente", "lat", "lon"]]
    joined_df = assigned.merge(customer_coords, on="Id_Cliente", how="left")
    joined_df = joined_df.merge(centroids_df, on="zones", how="left")

    coordinates = (
        joined_df["lat"],
        joined_df["lon"],
        joined_df["lat_centroids"],
        joined_df["lon_centroids"],
    )
    joined_df["manhattan"] = manhattan_distance(*coordinates)
    joined_df["haversine"] = haversine_distance(*coordinates)

    report = (
        ("Manhattan Distance", "manhattan"),
        ("Haversine Distance", "haversine"),
    )
    for title, column in report:
        print(title)
        print(joined_df.groupby("zones")[column].sum())

    return (joined_df["manhattan"].sum(), joined_df["haversine"].sum())
def process_trip(x, start_time):
    """Build the feature list for a trip described by a single point.

    Args:
        x: the point; ``x[0]`` and ``x[1]`` are its two coordinates.
        start_time: trip start, passed unchanged to ``time.localtime``.

    Returns:
        list: ``[weekday, hour, x[0], x[1], d_st, head]`` where ``d_st`` and
        ``head`` are the haversine distance and the heading computed between
        ``x`` and ``CITY_CENTER``.
    """
    started = time.localtime(start_time)

    # Position of the point relative to the city center.
    dist_to_center = haversine_distance(x, CITY_CENTER)
    heading_to_center = heading(x, CITY_CENTER[0])

    return [
        started.tm_wday,
        started.tm_hour,
        x[0],
        x[1],
        dist_to_center,
        heading_to_center,
    ]
def _compute_or_fetch(self, code_one, loc_one, code_two, loc_two):
    """Computes the distance between two languages or fetches it.

    The cache is symmetric: a distance stored under ``(code_one, code_two)``
    is also valid for ``(code_two, code_one)``. Either ordering counts as a
    hit, and the value is read from whichever key is actually present, so a
    cache holding only the reversed key no longer raises ``KeyError``.

    Args:
        code_one: identifier of the first language (used for the cache key).
        loc_one: location of the first language, passed to
            ``utils.haversine_distance``.
        code_two: identifier of the second language.
        loc_two: location of the second language.

    Returns:
        The cached distance if available, otherwise the freshly computed
        one (which is stored under both key orderings).
    """
    key_one = (code_one, code_two)
    key_two = (code_two, code_one)
    cache = self._distance_cache

    for key in (key_one, key_two):
        if key in cache:
            return cache[key]

    dist = utils.haversine_distance(loc_one, loc_two)
    cache[key_one] = dist
    cache[key_two] = dist
    return dist
def process_trip(x, start_time):
    """Build the feature list for a trip from its whole trajectory.

    Args:
        x: trajectory with ``x.shape[0]`` points, indexed as ``x[i, j]``.
            It must contain at least one point.
        start_time: trip start, passed unchanged to ``time.localtime``.

    Returns:
        list: ``[weekday, hour, n_points, x[0, 0], x[0, 1], x[-1, 0],
        x[-1, 1], d_st, h_st, d_cut, h_cut, d_cs, vmed, vcar, head]``.
        ``d_cs``, ``vmed`` and ``vcar`` are the sum, median and last value
        of the distances between consecutive points; ``head`` is the heading
        of the last segment. All four are 0 for a single-point trip.
    """
    started = time.localtime(start_time)
    n_points = x.shape[0]

    # Statistics over consecutive-point distances need at least one segment.
    if n_points > 1:
        segment_lengths = haversine_distance(x[:-1, :], x[1:, :])
        d_cs = np.sum(segment_lengths)
        vmed = np.median(segment_lengths)
        vcar = segment_lengths[-1]
        head = heading(x[-2, :], x[-1, :])
    else:
        d_cs = 0
        vcar = 0
        vmed = 0
        head = 0

    # First and last point relative to the city center. Note that the
    # heading for the last point is taken from the center to the point,
    # whereas for the first point it is taken from the point to the center.
    d_st = haversine_distance(x[0, :], CITY_CENTER)[0]
    h_st = heading(x[0, :], CITY_CENTER[0])
    d_cut = haversine_distance(x[-1, :], CITY_CENTER)[0]
    h_cut = heading(CITY_CENTER[0], x[-1, :])

    return [
        started.tm_wday,
        started.tm_hour,
        n_points,
        x[0, 0],
        x[0, 1],
        x[-1, 0],
        x[-1, 1],
        d_st,
        h_st,
        d_cut,
        h_cut,
        d_cs,
        vmed,
        vcar,
        head,
    ]
def SearchNearestPharmacy(cls, currentLocation: Dict, range: int, limit: int) -> tuple:
    """Return the pharmacies closest to ``currentLocation``.

    The pharmacy list is downloaded from the URL configured as ``DATA_URL``;
    every feature whose haversine distance from the current location is at
    most ``range`` is kept, sorted by distance and cut to ``limit`` entries.

    Args:
        currentLocation: mapping that must contain ``latitude`` and
            ``longitude``.
        range: maximum distance (same unit as ``utils.haversine_distance``).
        limit: maximum number of pharmacies to return.

    Returns:
        tuple: ``(payload, status)``:
        * ``({"message": ...}, 422)`` if a coordinate key is missing;
        * ``({"message": ...}, 503)`` on a connection error;
        * ``({"message": ...}, 500)`` on any other request exception;
        * ``({"message": "Service error"}, <upstream status>)`` when the
          data service does not answer 200;
        * ``("No resources", 404)`` when nothing lies within ``range``;
        * ``({"pharmacies": [...]}, 200)`` otherwise.
    """
    if not {"latitude", "longitude"} <= currentLocation.keys():
        return {
            "message": "Validation error: currentLocation must contain 'latitude' and 'longitude'"
        }, 422

    uri = env("DATA_URL")
    try:
        res = requests.get(uri, timeout=1)
    except requests.ConnectionError:
        logger.error("Connection error")
        return {"message": "Connection Error"}, 503
    except Exception as e:
        logger.error(str(e))
        return {"message": str(e)}, 500

    if res.status_code != 200:
        logger.error(res.status_code)
        return {"message": "Service error"}, res.status_code

    user_latitude = currentLocation['latitude']
    user_longitude = currentLocation['longitude']

    within_range = []
    for feature in res.json()['features']:
        # Coordinates are stored as [longitude, latitude].
        coordinates = feature['geometry']['coordinates']
        pharmacy_name = feature['properties']['Descrizione']
        latitude, longitude = coordinates[1], coordinates[0]
        distance = utils.haversine_distance(
            user_latitude, user_longitude, latitude, longitude)
        if distance > range:
            continue
        within_range.append({
            "name": pharmacy_name,
            "distance": distance,
            "location": {
                "latitude": latitude,
                "longitude": longitude
            }
        })

    if not within_range:
        return "No resources", 404

    # Nearest first; slicing already copes with limit >= number of results.
    within_range.sort(key=lambda entry: entry['distance'])
    return {"pharmacies": within_range[:limit]}, 200
def match_telemetry(self):
    """
    Match visitors telemetry fields

    Every IP record of each previous visitor is compared with every IP
    record of the new visitor. A pair is a red flag when the two locations
    are more than 10 apart (unit of ``haversine_distance``) and covering
    that distance in the elapsed time would exceed
    ``self.MAX_PLAUSIBLE_SPEED``; a zero time difference counts as
    implausible instead of raising ``ZeroDivisionError``. Other pairs
    contribute a distance score in [0, 1]: 0 below 10, otherwise
    ``min(1000, distance) / 1000``.

    Pairs under 10 used to receive a distance score of 1, which produced the
    lowest possible ``geographic_proximity`` for the closest IPs; they now
    score 0. If every pair of a previous visitor is red-flagged, there is no
    distance score to average and the proximity is reported as 0 rather
    than dividing by zero.

    :return score: calculated telemetry score based on all attributes
    :return ip_timing_red_flag: True if IP timing/geographic location values indicate it can't
        be the same person
    :raises ValueError: if there are no previous visitors or a visitor has no IP records
        (``max`` of an empty sequence) -- unchanged from the original behaviour
    """
    ip_timing_red_flag = False
    match_scores = []

    for prev_v in self.prev_vs:
        distance_scores, ip_matches = [], []

        for previous_ip_data in prev_v["ips"]:
            for new_ip_data in self.new_v["ips"]:
                distance = haversine_distance(previous_ip_data["props"], new_ip_data["props"])
                time_delta = timestamp_difference(
                    previous_ip_data["updated_at"], new_ip_data["updated_at"]
                )
                ip_matches.append(exact_match(previous_ip_data["ip"], new_ip_data["ip"]))

                # Far apart with no elapsed time is treated as implausible.
                implausible_travel = distance > 10 and (
                    time_delta == 0
                    or (distance / time_delta) > self.MAX_PLAUSIBLE_SPEED
                )
                if implausible_travel:
                    ip_timing_red_flag = True
                elif distance < 10:
                    # Effectively the same place: smallest distance score.
                    distance_scores.append(0)
                else:
                    distance_scores.append(min(1000, distance) / 1000)

        if distance_scores:
            average_distance_score = sum(distance_scores) / len(distance_scores)
        else:
            # Only red-flagged pairs: treat as maximally distant.
            average_distance_score = 1

        results = {
            "ip_match": max(ip_matches),
            "geographic_proximity": 1 - average_distance_score,
            "creation_time_proximity": 1
            - min(
                1,
                timestamp_difference(
                    prev_v["visitors"]["createdAt"], self.new_v["visitors"]["createdAt"]
                )
                / 86400,  # seconds in a day
            ),
            "visitor_age_proximity": 1
            - age_difference(
                prev_v["visitors"]["createdAt"], self.new_v["visitors"]["createdAt"]
            ),
        }
        match_scores.append(generate_match_score(results, self.weights["telemetry"]))

    return {"score": max(match_scores), "ip_timing_red_flag": ip_timing_red_flag}
def find_close_languages(lat1, lng1, languages, distance_cache):
    """Returns the indices of all languages close to the given coordinates.

    A language counts as close when its haversine distance from
    ``(lat1, lng1)`` is below ``FLAGS.close_enough``. Distances are memoized
    in ``distance_cache`` under both orderings of the location pair, so the
    cache is updated in place.

    Args:
        lat1: latitude of the reference location (anything ``float`` accepts).
        lng1: longitude of the reference location.
        languages: sequence of mappings with ``latitude`` and ``longitude``.
        distance_cache: mutable mapping from ``(loc_a, loc_b)`` to distance.

    Returns:
        list: positions in ``languages`` of the close languages, in order.
    """
    close_language_indices = []
    for index, language in enumerate(languages):
        raw_lat = language["latitude"]
        raw_lng = language["longitude"]
        origin = (float(lat1), float(lng1))
        target = (float(raw_lat), float(raw_lng))

        pair = (origin, target)
        if pair in distance_cache:
            dist = distance_cache[pair]
        else:
            dist = utils.haversine_distance(origin, target)
            # The distance is symmetric, so remember it for both orderings.
            distance_cache[pair] = dist
            distance_cache[(target, origin)] = dist

        if dist < FLAGS.close_enough:
            close_language_indices.append(index)
    return close_language_indices
def get_user_shop_distance(result):
    """Add user-to-shop coordinate differences and distance features.

    Reads ``user_latitude``, ``user_longitude``, ``shop_latitude`` and
    ``shop_longitude`` from ``result`` and writes seven new columns into it:
    the signed and absolute longitude/latitude differences and the
    Euclidean, haversine and Manhattan distances.

    Args:
        result: table-like object supporting column access and assignment
            by name (presumably a pandas DataFrame -- confirm with caller).

    Returns:
        The same ``result`` object, modified in place.
    """
    user_lat = result['user_latitude']
    user_lon = result['user_longitude']
    shop_lat = result['shop_latitude']
    shop_lon = result['shop_longitude']

    # Signed differences first, then their magnitudes.
    result['feature_user_shop_lon_sub'] = (user_lon - shop_lon)
    result['feature_user_shop_lat_sub'] = (user_lat - shop_lat)
    result['feature_user_shop_lon_sub_abs'] = abs(
        result['feature_user_shop_lon_sub'])
    result['feature_user_shop_lat_sub_abs'] = abs(
        result['feature_user_shop_lat_sub'])

    # One column per distance metric, all fed the same four coordinates.
    metrics = (
        ('feature_user_shop_uclidean_dis', euclidean_distance),
        ('feature_user_shop_haversine_dis', haversine_distance),
        ('feature_user_shop_manhattan_dis', manhattan_distance),
    )
    for column, metric in metrics:
        result[column] = metric(user_lat, user_lon, shop_lat, shop_lon)

    return result
def get_user_shop_average_distance(refer, result):
    """Add distances between each user and the shop's average position.

    The mean ``longitude`` and ``latitude`` per ``shop_id`` are computed
    from ``refer`` and left-joined onto ``result``; the Euclidean, haversine
    and Manhattan distances between the user coordinates and that average
    position are stored as new feature columns. The helper columns with the
    averages are removed again before returning.

    The per-shop means used to be computed with
    ``SeriesGroupBy.agg({'new_name': 'mean'})``. That dict-renaming form was
    removed in pandas 1.0 (it raises ``SpecificationError``), so the means
    are now computed with ``mean()`` followed by ``rename``.

    Args:
        refer: DataFrame with ``shop_id``, ``longitude`` and ``latitude``.
        result: DataFrame with ``shop_id``, ``user_latitude`` and
            ``user_longitude``.

    Returns:
        A new DataFrame: ``result`` plus the three
        ``feature_user_shop_aver_*_dis`` columns. Shops absent from
        ``refer`` get NaN averages from the left join.
    """
    shop_average = (
        refer.groupby('shop_id')[['longitude', 'latitude']]
        .mean()
        .rename(columns={
            'longitude': 'shop_average_longitude',
            'latitude': 'shop_average_latitude',
        })
        .reset_index()
    )
    result = pd.merge(result, shop_average, on=['shop_id'], how='left')

    coordinates = (
        result['user_latitude'],
        result['user_longitude'],
        result['shop_average_latitude'],
        result['shop_average_longitude'],
    )
    result['feature_user_shop_aver_uclidean_dis'] = euclidean_distance(*coordinates)
    result['feature_user_shop_aver_haversine_dis'] = haversine_distance(*coordinates)
    result['feature_user_shop_aver_manhattan_dis'] = manhattan_distance(*coordinates)

    # The averages were only needed to derive the features above.
    return result.drop(
        columns=['shop_average_longitude', 'shop_average_latitude'])
import os
import time

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

from utils import haversine_distance

DATA_DIR = '../data'

t0 = time.time()
for filename in ['train_pp_N2.csv', 'train_pp_N3.csv', 'train_pp_N1.csv']:
    print('reading training data from %s ...' % filename)
    df = pd.read_csv(os.path.join(DATA_DIR, filename))
    # Distance between the (xs, ys) and (xe, ye) columns of every row; it is
    # used further down to filter out the longest tracks.
    d1 = haversine_distance(df[['xs', 'ys']].values, df[['xe', 'ye']].values)

    # create training set
    # NOTE(review): rows with len == 1 give log(0) = -inf -- confirm that the
    # preprocessed files cannot contain them.
    y = np.log((df['len'] - 1) * 15)
    # remove non-predictive features
    df.drop(['CALL_TYPE', 'TAXI_ID', 'xe', 'ye', 'len'], axis=1, inplace=True)
    # np.float was only an alias of the builtin float and was removed in
    # NumPy 1.24, so the builtin is used directly.
    X = np.array(df, dtype=float)

    # clean data by removing long distance tracks
    th1 = np.percentile(d1, [99.9])
    X = X[(d1 < th1), :]
    y = y[(d1 < th1)]

    print('training a random forest regressor ...')
    # Initialize the famous Random Forest Regressor from scikit-learn
    clf = RandomForestRegressor(n_estimators=200, n_jobs=3, random_state=21)
def getDistanceTo(self, baum):
    """Return the distance between this object and ``baum`` in metres.

    The value is ``haversine_distance`` of the two latitude/longitude pairs
    multiplied by 1000 (presumably the helper returns kilometres -- confirm).

    Args:
        baum: object exposing ``latitude`` and ``longitude`` attributes.
    """
    distance = haversine_distance(
        self.latitude, self.longitude, baum.latitude, baum.longitude)
    return 1000 * distance
if not os.path.isfile(filename): continue df = pd.read_csv(filename) if df.shape[0] < 1000: print('skipping key point %i (%i)' % (id_, df.shape[0])) continue # factorize categorical columns in training set #df['CALL_TYPE'], ct_index = pd.factorize(df['CALL_TYPE']) #df = df[df['CALL_TYPE'] == 0] # A=2, B=1, C=0 # fill all NaN values with -1 #df = df.fillna(-1) # remove long distance d1 = haversine_distance(df[['xs', 'ys']], df[['xe', 'ye']]) th1 = np.percentile(d1, [99.9]) df = df.loc[d1 < th1] y = np.ravel(np.log(df['len']*15 + 1)) df.drop(['CALL_TYPE', 'TAXI_ID', 'xe', 'ye', 'len'], axis=1, inplace=True) X = np.array(df, dtype=np.float) print('training classifier of key point %i (sz=%i) ...' % (id_, X.shape[0])) # Initialize the famous Random Forest Regressor from scikit-learn clf = RandomForestRegressor(n_estimators=200, n_jobs=3, random_state=21) clf.fit(X, y) pred_rf = clf.predict(X_tst[id_:id_+1, :]) clf = GradientBoostingRegressor(n_estimators=200, max_depth=3, random_state=21) clf.fit(X, y)