def stationsInGeometry(geometry, stations):
    """Return the stations that fall inside a polygon geometry.

    Parameters
    ----------
    geometry : list
        GeoJSON-like feature list; only the first feature's ``'geometry'``
        member is used as the containment polygon.
    stations : dict
        Mapping with a ``'results'`` list of station dicts, each carrying
        ``name``, ``longitude``, ``latitude``, ``elevation`` and ``id``.

    Returns
    -------
    list of list
        A header row followed by one row per contained station:
        ``[ID, NAME, LONG, LAT, ELEVATION, STATIONID]`` where ``ID`` is a
        1-based running index over the matched stations.
    """
    intersect_stations = [
        ["ID", "NAME", "LONG", "LAT", "ELEVATION", "STATIONID"],
    ]
    polygon = shape(geometry[0]['geometry'])
    print(f"Number of stations: {len(stations['results'])}")

    station_index = 0
    for station in stations["results"]:
        # Station coordinates are assumed to be WGS84 lon/lat matching the
        # polygon — TODO confirm with the data source. Shapely geometries
        # are CRS-agnostic, so no CRS attribute is attached to the point
        # (assigning one is a no-op on Shapely 1.x and raises on >= 2.0).
        point = Point(station["longitude"], station["latitude"])
        if polygon.contains(point):
            station_index += 1
            intersect_stations.append([
                station_index,
                station["name"],
                station["longitude"],
                station["latitude"],
                station["elevation"],
                station["id"],
            ])

    print(f"Number of stations in geometry:{station_index}")
    return intersect_stations
def do_KNN_and_naive_bayes(self):
    """Predict the ZIP code of the point ``self.new_p``.

    ``self.new_p`` is a ``[longitude, latitude]`` pair. Two signals are
    combined:

    1. k-nearest-neighbours over previously located tweets
       (``core/tweets_location.csv``), summarised by ``self.most_frequent``
       into a candidate ZIP.
    2. Point-in-polygon joins against four ZIP boundary shapefiles
       (USPS, ESRI, ZCTA and a custom map). Those four ZIPs plus the KNN
       candidate feed a Gaussian naive-Bayes classifier trained on
       ``core/k5_forNaive_allmap.csv``.

    Returns
    -------
    dict
        ``zip_with_probs`` — JSON string of per-ZIP probabilities from the
        Gaussian model — plus the KNN ``candidates``, ``candidate_loc``,
        ``candidate_annotation`` and ``raw_tweets`` produced by
        ``self.most_frequent``.
    """
    new_p = self.new_p  # [longitude, latitude]

    # ---- KNN over historical tweet locations ---------------------------
    df = pd.read_csv('core/tweets_location.csv')
    n_neighbors = 5
    # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is the
    # supported replacement.
    samples = df[['longi', 'lat']].to_numpy()
    neigh = NearestNeighbors(n_neighbors=n_neighbors)
    neigh.fit(samples)
    # NOTE(review): the original also built a second, immediately discarded
    # NearestNeighbors(algorithm='auto', metric='haversine') — it never
    # affected `neigh`, so it has been removed.
    a, b = neigh.kneighbors([new_p])
    candidates, tweetzip, candidate_loc, annotation, raw_tweets = (
        self.most_frequent(a, b, df))

    # Single-row frame matching the layout of the naive-Bayes train file.
    new_point = pd.DataFrame(index=range(1))
    new_point['predicted_k=%s' % str(n_neighbors)] = tweetzip

    # ---- Spatial joins: which ZIP polygon contains the point? ----------
    # Shapely geometries are CRS-agnostic: carry the CRS on the
    # GeoDataFrame instead of poking a `.crs` attribute onto the Point
    # (attribute assignment fails on Shapely >= 2.0).
    pnt = gpd.GeoDataFrame(geometry=[Point(new_p[0], new_p[1])],
                           crs='EPSG:4326')

    usps = gpd.GeoDataFrame.from_file('core/maps/usps_wgs84.shp')
    esri = gpd.GeoDataFrame.from_file('core/maps/esri_wgs84.shp')
    zcta = gpd.GeoDataFrame.from_file('core/maps/zcta_wgs84.shp')
    map1 = gpd.GeoDataFrame.from_file('core/maps/map1_wgs84.shp')
    # The {'init': 'epsg:4326'} dict form is deprecated PROJ syntax; the
    # authority string is the modern equivalent.
    usps.crs = 'EPSG:4326'
    esri.crs = 'EPSG:4326'
    zcta.crs = 'EPSG:4326'
    map1.crs = 'EPSG:4326'

    # Keep only ZIPs above 30000 — presumably restricts the maps to the
    # study region; TODO confirm the intended cut-off with the data owner.
    usps = usps.loc[usps.zipc > 30000]
    esri = esri.loc[esri.ZIP_number > 30000]
    zcta.ZCTA5CE10 = zcta.ZCTA5CE10.astype(int)
    zcta = zcta.loc[zcta.ZCTA5CE10 > 30000]
    map1.ZIP = map1.ZIP.astype(int)
    map1 = map1.loc[map1.ZIP > 30000]

    # `op=` was removed from geopandas.sjoin in 1.0; `predicate=` is the
    # replacement (available since geopandas 0.10).
    usps_int = gpd.sjoin(pnt, usps[['zipc', 'geometry']],
                         how='left', predicate='intersects')
    esri_int = gpd.sjoin(pnt, esri[['ZIP_number', 'geometry']],
                         how='left', predicate='intersects')
    zcta_int = gpd.sjoin(pnt, zcta[['ZCTA5CE10', 'geometry']],
                         how='left', predicate='intersects')
    map1_int = gpd.sjoin(pnt, map1[['ZIP', 'geometry']],
                         how='left', predicate='intersects')

    new_point['tweet_usps'] = usps_int.zipc
    new_point['tweet_map'] = map1_int.ZIP
    new_point['tweet_zcta'] = zcta_int.ZCTA5CE10
    new_point['tweet_esri'] = esri_int.ZIP_number
    new_point = new_point[[
        'tweet_usps', 'tweet_map', 'tweet_zcta', 'tweet_esri',
        'predicted_k=5'
    ]]

    # ---- Gaussian naive-Bayes prediction -------------------------------
    # Only the Gaussian model's output is returned; the multinomial model
    # the original also fitted was dead code (its predictions were never
    # used) and has been dropped.
    feature_cols = ['tweet_map', 'tweet_zcta', 'tweet_esri',
                    'predicted_k=5']
    naive_train = pd.read_csv('core/k5_forNaive_allmap.csv')
    X_train = naive_train[feature_cols].values
    # ravel() flattens the (n, 1) target to the 1-D array sklearn expects.
    y_train = naive_train[['tweet_usps']].values.ravel()
    X_test = new_point[feature_cols].values

    clf_gs = GaussianNB().fit(X_train, y_train)
    zip_prob_gs = pd.DataFrame(clf_gs.predict_proba(X_test),
                               columns=clf_gs.classes_)

    return {
        "zip_with_probs": zip_prob_gs.to_json(),
        "candidates": candidates,
        "candidate_loc": candidate_loc,
        "candidate_annotation": annotation,
        "raw_tweets": raw_tweets
    }