def sanitizeData(df, FACE_PAGE=None):
    """
    Normalise a DataFrame of Facebook posts.

    * Optionally tags every row with FACE_PAGE (stored in 'page_ref').
    * Rewrites 'created_time' into a 'datahora' column (ISO 'T' separator
      replaced by a space, trailing 5 chars — presumably a timezone
      suffix, TODO confirm — stripped) and drops the original column.
    * Renames 'id' to 'post_id'.
    * Expands the 'place' dict-column (and its nested 'location') into
      flat columns, then discards the leftover unnamed columns.

    Returns the sanitised DataFrame.
    """
    from gasp.mng.fld.df import listval_to_newcols

    if FACE_PAGE:
        df['page_ref'] = FACE_PAGE

    # Snapshot the column set once, BEFORE any rename/drop below —
    # the later 'place' check intentionally uses this original view.
    original_cols = df.columns.values

    # --- Sanitize created_time ---------------------------------------
    if 'created_time' in original_cols:
        df['datahora'] = df.created_time.str.replace('T', ' ').str[:-5]
        df.drop(['created_time'], axis=1, inplace=True)

    # --- Sanitize ID --------------------------------------------------
    df.rename(columns={'id': 'post_id'}, inplace=True)

    # --- Sanitize Places ----------------------------------------------
    if 'place' in original_cols:
        df = listval_to_newcols(df, 'place')
        df.rename(columns={
            'id': 'place_id', 'name': 'place_name', 0: 'unk1'
        }, inplace=True)

        df = listval_to_newcols(df, 'location')
        df.rename(columns={0: 'unk2'}, inplace=True)

        # The integer-keyed spill-over columns carry no useful data.
        df.drop(['unk1', 'unk2'], axis=1, inplace=True)

    return df
def find_places(inShp, epsg, radius, output, keyword=None, type=None):
    """
    Extract places from Google Maps around every point in inShp.

    For each point of the input shapefile, queries places within `radius`
    (optionally filtered by `keyword` and place `type`), flattens the
    responses into a single table and writes it to `output` as a shapefile.

    inShp   - path to a point shapefile
    epsg    - EPSG code of inShp (re-projected to 4326 for the API,
              results re-projected back if != 4326)
    radius  - search radius passed to the places service
    output  - path of the shapefile to create; returned on success
    """
    import pandas

    from gasp.fm import tbl_to_obj
    from gasp.to.geom import pnt_dfwxy_to_geodf
    from gasp.mng.prj import project
    from gasp.mng.fld.df import listval_to_newcols
    from gasp.to.shp import df_to_shp

    # The 'type' parameter shadows the builtin (kept for backward
    # compatibility with existing callers); alias it so the builtin
    # remains usable and intent is clear below.
    placeType = type

    pntDf = tbl_to_obj(inShp)
    pntDf = project(pntDf, None, 4326, gisApi='pandas') if epsg != 4326 else pntDf

    pntDf['latitude'] = pntDf.geometry.y.astype(str)
    pntDf['longitude'] = pntDf.geometry.x.astype(str)

    # Accumulate one DataFrame of results per input point.
    # NOTE: the previous implementation rebound a closed-over variable
    # inside the nested function, which raises UnboundLocalError in
    # Python (no 'nonlocal' declaration), and also called the shadowed
    # 'type' parameter as if it were the builtin. Appending to a
    # mutable list avoids both problems.
    foundFrames = []

    def get_places(row):
        places = get_places_by_radius(
            row.latitude, row.longitude, radius, keyword, placeType
        )
        foundFrames.append(pandas.DataFrame(places['results']))

    # apply() is used purely for its side effect of filling foundFrames.
    pntDf.apply(lambda x: get_places(x), axis=1)

    DATA = pandas.concat(foundFrames, ignore_index=True)

    # Flatten the nested 'geometry' dict returned by the API.
    DATA = listval_to_newcols(DATA, 'geometry')

    # Drop bulky/unneeded API fields when present.
    fldsToDelete = ['viewport', 'opening_hours', 'icon', 'plus_code', 'photos']
    realDeletion = [x for x in fldsToDelete if x in DATA.columns.values]
    DATA.drop(realDeletion, axis=1, inplace=True)

    # 'location' holds the lng/lat pair; expand and build geometries.
    DATA = listval_to_newcols(DATA, 'location')
    DATA = pnt_dfwxy_to_geodf(DATA, 'lng', 'lat', 4326)

    if epsg != 4326:
        DATA = project(DATA, None, epsg, gisApi='pandas')

    # Shapefiles cannot store list values; serialise 'types' as text.
    DATA["types"] = DATA.types.astype(str)

    df_to_shp(DATA, output)

    return output
def search_photos(lat=None, lng=None, radius=None, keyword=None, apiKey=None):
    """
    Query Flickr for photos using a keyword and/or a point-plus-radius
    buffer.

    apiKey - optional (public, secret) pair; falls back to the
             module-level FLICKR_PUBLIC / FLICKR_SECRET constants.

    Returns a pandas DataFrame of photos (with the 'description' field
    expanded into its own columns), or None when nothing was found.
    """
    import pandas
    from flickrapi import FlickrAPI

    from gasp import unicode_to_str
    from gasp.mng.fld.df import listval_to_newcols

    # Explicit credentials win over the module-level defaults.
    if apiKey:
        pub_key, sec_key = apiKey
    else:
        pub_key, sec_key = FLICKR_PUBLIC, FLICKR_SECRET

    engine = FlickrAPI(
        pub_key, sec_key, format='parsed-json', store_token=False
    )

    # Extra attributes we want Flickr to include in each record.
    extras = 'url_l,geo,date_taken,date_upload,description'

    if not keyword:
        keyword = ''
    elif type(keyword) == unicode:
        # Python-2 unicode objects are converted to byte strings
        # before being handed to the API wrapper.
        keyword = unicode_to_str(keyword)

    # A spatial query needs all three of lat/lng/radius; otherwise
    # fall back to a keyword-only search.
    if lat and lng and radius:
        data = engine.photos.search(
            text=keyword, lat=lat, lon=lng,
            radius=radius, pp=500, extras=extras
        )
    else:
        data = engine.photos.search(text=keyword, pp=500, extras=extras)

    photos = pandas.DataFrame(data['photos']['photo'])

    if not photos.shape[0]:
        return None

    return listval_to_newcols(photos, "description")
def matrix_od(originsShp, destinationShp, originsEpsg, destinationEpsg,
              resultShp, modeTrans="driving"):
    """
    Use Pandas to Retrieve data from MapBox Matrix OD Service

    Builds an origin-destination duration matrix: for every origin point,
    queries the MapBox matrix service against the destinations (in
    chunks of 24), and writes one 'dest_<i>' column per destination to
    the output shapefile.

    originsShp / destinationShp - point shapefiles (raises ValueError
                                  for non-point geometries)
    originsEpsg / destinationEpsg - EPSG codes of the inputs; both are
                                  re-projected to 4326 for the API call
    resultShp - output shapefile path
    modeTrans - MapBox transportation profile (default "driving")
    """
    import time
    from threading import Thread

    from gasp.web.mapbx import get_keys, matrix
    from gasp.fm import tbl_to_obj
    from gasp.mng.split import split_df, split_df_inN
    from gasp.mng.fld.df import listval_to_newcols
    from gasp.fm.geom import pointxy_to_cols
    from gasp.mng.prj import project
    from gasp.mng.gen import merge_df
    from gasp.prop.feat import get_geom_type
    from gasp.to.shp import df_to_shp

    # Data to GeoDataFrame
    origens = tbl_to_obj(originsShp)
    destinos = tbl_to_obj(destinationShp)

    # Check if SHPs are points
    inGeomType = get_geom_type(origens, geomCol="geometry", gisApi='pandas')

    if inGeomType != 'Point' and inGeomType != 'MultiPoint':
        raise ValueError('The input geometry must be of type point')

    inGeomType = get_geom_type(destinos, geomCol="geometry", gisApi='pandas')

    if inGeomType != 'Point' and inGeomType != 'MultiPoint':
        raise ValueError('The input geometry must be of type point')

    # Re-Project data to WGS (4326) as required by the MapBox API
    if originsEpsg != 4326:
        origens = project(origens, None, 4326, gisApi='pandas')

    if destinationEpsg != 4326:
        destinos = project(destinos, None, 4326, gisApi='pandas')

    # Extract X/Y of each point into longitude/latitude columns
    origens = pointxy_to_cols(
        origens, geomCol="geometry", colX="longitude", colY="latitude"
    ); destinos = pointxy_to_cols(
        destinos, geomCol="geometry", colX="longitude", colY="latitude"
    )

    # Prepare coordinates Str ("lng,lat" as expected by MapBox)
    origens["location"] = origens.longitude.astype(str) + "," + \
        origens.latitude.astype(str)
    destinos["location"] = destinos.longitude.astype(str) + "," + \
        destinos.latitude.astype(str)

    # Split destinations DataFrame into Dafaframes with
    # 24 rows (API limit per request, presumably — TODO confirm)
    lst_destinos = split_df(destinos, 24)

    # Get Keys to use
    KEYS = get_keys()
    # Split origins by key — one chunk of origins per available API key
    origensByKey = split_df_inN(origens, KEYS.shape[0])

    lst_keys = KEYS["key"].tolist()

    # Produce matrix
    # NOTE(review): 'results' is shared by all worker threads; list.append
    # is relied upon to be thread-safe under the GIL.
    results = []

    def get_matrix(origins, key):
        def def_apply(row):
            rowResults = []
            # One request per destination chunk; durations for this
            # origin are concatenated across chunks.
            for df in lst_destinos:
                strDest = df.location.str.cat(sep=";")

                strLocations = row["location"] + ";" + strDest

                dados = matrix(
                    strLocations, idxSources="0",
                    idxDestinations=";".join(
                        [str(i) for i in range(1, df.shape[0] + 1)]),
                    useKey=key, modeTransportation=modeTrans
                )

                # Throttle requests to avoid hitting API rate limits
                time.sleep(5)

                rowResults += dados["durations"][0]

            row["od_matrix"] = rowResults

            return row

        newOrigins = origins.apply(
            lambda x: def_apply(x), axis=1
        )

        results.append(newOrigins)

    # Create threads — one per origins chunk / API key
    thrds = []
    i = 1
    for df in origensByKey:
        thrds.append(Thread(
            name="tk{}".format(str(i)), target=get_matrix,
            args=(df, lst_keys[i - 1])
        ))
        i += 1

    # Start all threads
    for thr in thrds:
        thr.start()

    # Wait for all threads to finish
    for thr in thrds:
        thr.join()

    # Join all dataframes
    RESULT = merge_df(results, ignIndex=False)

    # Expand the per-row duration list into one column per destination
    RESULT = listval_to_newcols(RESULT, "od_matrix")

    # Integer-named columns produced by the expansion become 'dest_<i>'.
    # NOTE(review): 'long' exists only in Python 2 — this module targets
    # Python 2 (see the 'unicode' checks elsewhere in the file).
    RESULT.rename(
        columns={
            c: "dest_{}".format(c) for c in RESULT.columns.values
            if type(c) == int or type(c) == long
        }, inplace=True
    )

    if originsEpsg != 4326:
        RESULT = project(RESULT, None, originsEpsg, gisApi='pandas')

    return df_to_shp(RESULT, resultShp)

    # NOTE(review): unreachable — the function already returned above.
    return results
def dist_matrix_by_shp(oShp, dShp, oEpsg, dEpsg, result, transMode=None):
    """
    Create distance matrix using shapes and Google Maps API

    - Uses my first API_KEY

    For every destination point, queries the Google Distance Matrix
    service against the origins (in chunks of 95), collects per-pair
    costs and writes the full origin/destination cost table to 'result'
    (a spreadsheet, via obj_to_tbl).

    oShp / dShp   - point shapefiles (raises ValueError otherwise)
    oEpsg / dEpsg - EPSG codes; both re-projected to 4326 if needed
    transMode     - transport mode forwarded to dist_matrix
    """
    import time
    import pandas

    from gasp.fm import tbl_to_obj
    from gasp.mng.split import split_df
    from gasp.mng.prj import project
    from gasp.mng.fld.df import listval_to_newcols
    from gasp.prop.feat import get_geom_type
    from gasp.mng.gen import merge_df
    from gasp.web.glg.distmx import dist_matrix
    from gasp.to import obj_to_tbl
    from gasp.to.obj import df_to_list
    from gasp.oss import get_filename

    # Origins and Destionations to GeoDataframe
    originsDf = tbl_to_obj(oShp)
    destnatDf = tbl_to_obj(dShp)

    # Check Geometries type - shapes should be of type point
    originsGeom = get_geom_type(originsDf, gisApi='pandas')
    destGeom = get_geom_type(destnatDf, gisApi='pandas')

    if (originsGeom != 'Point' and originsGeom != 'MultiPoint') or \
        (destGeom != 'Point' and destGeom != 'MultiPoint'):
        raise ValueError('All input geometries must be of type point')

    # Re-project GeoDataframes if needed
    originsDf = originsDf if oEpsg == 4326 else \
        project(originsDf, None, 4326, gisApi='pandas')
    destnatDf = destnatDf if dEpsg == 4326 else \
        project(destnatDf, None, 4326, gisApi='pandas')

    # Geom to Field as str — Google expects "lat,lng" text
    originsDf["geom"] = originsDf["geometry"].y.astype(str) + "," + \
        originsDf["geometry"].x.astype(str)
    destnatDf["geom"] = destnatDf["geometry"].y.astype(str) + "," + \
        destnatDf["geometry"].x.astype(str)

    # Remember original row ids so results can be joined back later
    originsDf["old_fid"] = originsDf.index
    destnatDf["old_fid"] = destnatDf.index

    # Split origins into chunks of 95 (API element limit, presumably
    # — TODO confirm); destinations are iterated one at a time.
    lstOrigins = split_df(originsDf, 95)
    for odf in lstOrigins:
        odf.reset_index(inplace=True)

    lstDestinations = df_to_list(destnatDf)

    RESULTS = []
    for destino in lstDestinations:
        for oDf in lstOrigins:
            # SECURITY NOTE(review): hard-coded API key checked into
            # source — should be moved to configuration/secret storage.
            matrix = dist_matrix(
                str(oDf.geom.str.cat(sep="|")),
                str(destino["geom"]),
                oDf.shape[0], 1,
                transport_mode=transMode,
                useKey='AIzaSyAmyPmqtxD20urqtpCpn4ER74a6J4N403k')

            matrix = pandas.DataFrame(matrix)
            # Expand the per-row 'elements' list into columns
            matrix = listval_to_newcols(matrix, "elements")

            # Row i of the API response corresponds to origin row i
            matrix = matrix.merge(
                oDf, how='inner', left_index=True, right_index=True)

            matrix.rename(columns={
                'old_fid': "fid_origin", 0: "cost"
            }, inplace=True)

            matrix["fid_destin"] = destino['old_fid']

            RESULTS.append(matrix)

            # Throttle requests to stay within API rate limits
            time.sleep(5)

    # Join all dataframes
    RESULT = merge_df(RESULTS, ignIndex=False)

    # NOTE(review): 'sanitizeDataCols' is not defined in this module's
    # visible code — possibly a typo for 'sanitizeData' or defined
    # elsewhere in the file; verify before relying on this path.
    RESULT = sanitizeDataCols(RESULT, "cost")

    # Drop origin attribute columns, keep only geometry/old_fid
    RESULT.drop([
        x for x in originsDf.columns.values
        if x != "geometry" and x != "old_fid"
    ], axis=1, inplace=True)
    RESULT.rename(columns={"geometry": "origin_geom"}, inplace=True)

    # Attach the destination geometry via the remembered fid
    RESULT = RESULT.merge(
        destnatDf, how='inner',
        left_on=["fid_destin"], right_on=["old_fid"])
    RESULT.drop([
        x for x in destnatDf.columns.values if x != "geometry"
    ], axis=1, inplace=True)
    RESULT.rename(columns={"geometry": "destin_geom"}, inplace=True)

    # Geometries serialised as text for the spreadsheet output
    RESULT["origin_geom"] = RESULT.origin_geom.astype(str)
    RESULT["destin_geom"] = RESULT.destin_geom.astype(str)

    obj_to_tbl(RESULT, result, sheetsName=get_filename(result))

    return result
def search_tweets(lat=None, lng=None, radius=None, keyword=None, NR_ITEMS=500,
                  only_geo=None, __lang=None, key=None, resultType='mixed'):
    """
    Basic tool to extract data from Twitter using a keyword and/or a buffer

    * radius should be in Km
    * options for resulType: mixed, recent, popular

    Returns an array with the encountered data

    lat/lng/radius - all three must be given to geocode the search
    NR_ITEMS       - max number of tweets requested via the cursor
    only_geo       - if truthy, keep only tweets with coordinates
    key            - optional (token, secret, consumer_key,
                     consumer_secret) tuple; falls back to the
                     module-level TWITTER_TOKEN dict
    Returns a pandas DataFrame, or None when nothing (usable) came back.
    """
    import tweepy
    import pandas

    from gasp.mng.fld.df import listval_to_newcols
    from gasp import unicode_to_str

    # Credentials: explicit tuple wins over the module-level defaults
    if not key:
        TOKEN, SECRET, CONSUMER_KEY, CONSUMER_SECRET = TWITTER_TOKEN['TOKEN'],\
            TWITTER_TOKEN['SECRET'], TWITTER_TOKEN['CONSUMER_KEY'],\
            TWITTER_TOKEN['CONSUMER_SECRET']
    else:
        TOKEN, SECRET, CONSUMER_KEY, CONSUMER_SECRET = key

    # 'mixed' is Twitter's default result_type, so pass None in that case
    resultType = None if resultType == 'mixed' else resultType

    # Give our credentials to the Twitter API
    auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)

    auth.set_access_token(TOKEN, SECRET)

    api = tweepy.API(auth)

    # Request data from twitter
    if not keyword:
        keyword = ''
    else:
        # Python-2 only: convert unicode keywords to byte strings
        if type(keyword) == unicode:
            keyword = unicode_to_str(keyword)

    if not lat or not lng or not radius:
        # Keyword-only search (no geographic filter)
        data = [
            i._json for i in tweepy.Cursor(
                api.search, q=keyword, lang=__lang, count=50,
                result_type=resultType
            ).items(NR_ITEMS)
        ]
    else:
        # "lat,lng,<radius>km" geocode string expected by the API
        __geostr = '{_lat},{_lng},{r}km'.format(
            _lat=str(lat), _lng=str(lng), r=str(radius)
        )

        data = [
            i._json for i in tweepy.Cursor(
                api.search, q=keyword, geocode=__geostr, lang=__lang,
                count=50, result_type=resultType
            ).items(NR_ITEMS)
        ]

    data = pandas.DataFrame(data)

    if not data.shape[0]:
        return None

    data.rename(columns={
        "id": "fid", "created_at": "tweet_time", "lang": "tweet_lang"
    }, inplace=True)

    if "place" in data.columns.values:
        from shapely.geometry import shape

        # GeoJSON dict -> WKT string; anything else becomes the
        # literal string 'None'
        def get_wkt(x):
            if type(x) == dict:
                g = shape(x)

                return str(g.wkt)

            else:
                return 'None'

        # Split in several columns
        data = listval_to_newcols(data, "place")

        cols = list(data.columns.values)
        colsRename = {}

        # Prefix place attributes so they don't clash with the
        # user-level 'name'/'id' columns expanded later
        for c in cols:
            if c == "name":
                colsRename[c] = "place_name"
            elif c == "country":
                colsRename[c] = "place_country"
            elif c == "country_code":
                colsRename[c] = "place_countryc"
            elif c == "id":
                colsRename[c] = "place_id"
            else:
                continue

        data.rename(columns=colsRename, inplace=True)

        if 'bounding_box' in data.columns.values:
            data["place_box"] = data.bounding_box.apply(get_wkt)
        else:
            data["place_box"] = 'None'

    cols = list(data.columns.values)

    # Whitelist of columns kept in the output
    INTEREST_COLS = [
        'user', 'text', 'fid', 'geo', 'tweet_time', 'retweeted',
        'tweet_lang', 'place_name', 'place_country', 'place_countryc',
        'place_id', 'place_box'
    ]

    delCols = [x for x in cols if x not in INTEREST_COLS]

    data.drop(delCols, axis=1, inplace=True)

    # Rows with real coordinates in 'geo' (string compare vs 'None')
    dfGeom = data[data["geo"].astype(str) != 'None']

    if only_geo and not dfGeom.shape[0]:
        return None

    elif not only_geo and not dfGeom.shape[0]:
        # No geotagged tweets at all: latitude/longitude columns are
        # filled with the (None) 'geo' values as placeholders
        result = data

        result["latitude"] = result["geo"]
        result["longitude"] = result["geo"]

        result.drop("geo", axis=1, inplace=True)

    else:
        # Expand geo dict -> 'coordinates'/'type', then the
        # coordinates pair -> latitude (0) / longitude (1)
        dfGeom = pandas.concat([
            dfGeom.drop(["geo"], axis=1),
            dfGeom["geo"].apply(pandas.Series)
        ], axis=1)

        dfGeom = pandas.concat([
            dfGeom.drop(["coordinates"], axis=1),
            dfGeom["coordinates"].apply(pandas.Series)
        ], axis=1)

        dfGeom.rename(columns={0: 'latitude', 1: 'longitude'}, inplace=True)

        dfGeom.drop("type", axis=1, inplace=True)

        if only_geo:
            result = dfGeom

        else:
            # Append the non-geotagged tweets with placeholder coords
            dfNoGeom = data[data["geo"].astype(str) == 'None']

            dfNoGeom["latitude"] = dfNoGeom["geo"]
            dfNoGeom["longitude"] = dfNoGeom["geo"]

            dfNoGeom.drop("geo", axis=1, inplace=True)

            result = dfGeom.append(dfNoGeom, ignore_index=True)

    # Expand the 'user' dict into its own columns
    result = pandas.concat([
        result.drop(["user"], axis=1),
        result["user"].apply(pandas.Series)
    ], axis=1)

    result.rename(columns={
        'screen_name': 'user', 'id': 'user_id',
        'location': 'user_location', 'name': 'username'
    }, inplace=True)

    INTEREST_COLS += [
        'user', 'followers_count', 'user_id', 'user_location',
        'username', 'latitude', 'longitude'
    ]

    cols = list(result.columns.values)

    delCols = [c for c in cols if c not in INTEREST_COLS]

    result.drop(delCols, axis=1, inplace=True)

    # Canonical tweet URL from screen name + tweet id
    result["url"] = 'https://twitter.com/' + \
        result["user"].astype(str) + '/status/' + \
        result["fid"].astype(str)

    return result