def tweets_to_xls(outxls, searchword=None, searchGeom=None, srs=None,
                  lng='pt', NTW=1000, twType='mixed', Key=None):
    """
    Search for Tweets and Export them to XLS
    """
    
    from glass.ng.wt import obj_to_tbl
    
    # search_tweets is assumed to be available in this module's namespace
    data = search_tweets(
        keyword=searchword, in_geom=searchGeom, epsg=srs,
        __lang=lng, NR_ITEMS=NTW, resultType=twType, key=Key
    )
    
    # data may be a DataFrame, whose truth value is ambiguous;
    # hence the try/except around the emptiness check
    try:
        if not data:
            return 0
    except:
        pass
    
    obj_to_tbl(data, outxls, sheetsName='twitter')
    
    return outxls
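# Example (hypothetical values; assumes a valid Twitter API key and that
# search_tweets is importable in this module's namespace):
#
# tweets_to_xls(
#     '/tmp/tweets.xlsx', searchword='floods', srs=4326,
#     lng='en', NTW=500, twType='recent', Key='MY_API_KEY'
# )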
def get_day_table(day):
    # Relies on names defined in the enclosing scope: EXCLUDE_DAYS,
    # INTERVALS, INTERVAL_COLUMNS, psqldb, pgtable, DAY_FIELD, HOUR_FIELD,
    # MINUTES_FIELD, ENTITY_FIELD, workspace_day_tables, and the helpers
    # count_by_period_entity, combine_dfs and obj_to_tbl.
    print('Starting: ' + day)
    
    if EXCLUDE_DAYS:
        if day in EXCLUDE_DAYS:
            print('Ending: ' + day)
            return 0
    
    COUNTING = []
    for __int in INTERVALS:
        start, end = __int
        
        COUNT_FIELD = 'p{}h{}_{}h{}'.format(
            str(start[0]), str(start[1]), str(end[0]), str(end[1])
        )
        
        if COUNT_FIELD not in INTERVAL_COLUMNS:
            INTERVAL_COLUMNS.append(COUNT_FIELD)
        
        countTbl = count_by_period_entity(
            psqldb, start, end, pgtable, DAY_FIELD, day,
            HOUR_FIELD, MINUTES_FIELD, ENTITY_FIELD
        )
        
        COUNTING.append(countTbl)
    
    main_table = COUNTING[0]
    for i in range(1, len(COUNTING)):
        main_table = combine_dfs(main_table, COUNTING[i], ENTITY_FIELD)
    
    if workspace_day_tables:
        obj_to_tbl(main_table, os.path.join(
            workspace_day_tables, 'ti_{}.xlsx'.format(day)
        ))
    
    return main_table
def count_entity_periods_with_certain_duration(db, PERIOD_INTERVAL, PGTABLE,
                                               TIME_FIELD, ENTITY_FIELD,
                                               OUT_TABLE, filterWhere=None):
    """
    Count rows in a pgtable for a given period of X minutes for each
    interest entity
    
    PERIOD_INTERVAL = "01:00:00"
    """
    
    from glass.pys.tm import day_to_intervals2
    from glass.ng.sql.q import q_to_obj
    from glass.ng.pd.joins import combine_dfs
    from glass.ng.wt import obj_to_tbl
    
    # Get Intervals
    INTERVALS = day_to_intervals2(PERIOD_INTERVAL)
    
    # For each interval/period, count the number of rows by entity
    counting = []
    for _int in INTERVALS:
        Q = (
            "SELECT {entityCol}, COUNT({entityCol}) AS {cntCol} "
            "FROM {table} WHERE "
            "TO_TIMESTAMP({timeCol}, 'HH24:MI:SS') >= "
            "TO_TIMESTAMP('{minLower}', 'HH24:MI:SS') AND "
            "TO_TIMESTAMP({timeCol}, 'HH24:MI:SS') < "
            "TO_TIMESTAMP('{minUpper}', 'HH24:MI:SS'){whr} "
            "GROUP BY {entityCol}"
        ).format(
            cntCol="s{}_e{}".format(_int[0][:5], _int[1][:5]).replace(":", "_"),
            table=PGTABLE, timeCol=TIME_FIELD, entityCol=ENTITY_FIELD,
            minLower=_int[0], minUpper=_int[1],
            whr="" if not filterWhere else " AND ({}) ".format(filterWhere)
        )
        
        count = q_to_obj(db, Q, db_api='psql')
        
        counting.append(count)
    
    mainDf = combine_dfs(counting[0], counting[1:], ENTITY_FIELD)
    
    obj_to_tbl(mainDf, OUT_TABLE)
    
    return OUT_TABLE
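# Note on the generated column names: with PERIOD_INTERVAL = "01:00:00",
# day_to_intervals2 is expected to yield tuples such as
# ('00:00:00', '01:00:00'), which the cntCol expression above turns into
# a count column named "s00_00_e01_00".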
def show_duplicates_in_xls(db_name, table, pkCols, outFile,
                           tableIsQuery=None):
    """
    Find duplicates and write these objects in a table
    """
    
    from glass.pys import obj_to_lst
    from glass.ng.sql.q import q_to_obj
    from glass.ng.wt import obj_to_tbl
    
    pkCols = obj_to_lst(pkCols)
    
    if not pkCols:
        raise ValueError("pkCols value is not valid")
    
    if not tableIsQuery:
        q = (
            "SELECT {t}.* FROM {t} INNER JOIN ("
                "SELECT {cls}, COUNT({cnt}) AS conta FROM {t} "
                "GROUP BY {cls}"
            ") AS foo ON {rel} "
            "WHERE conta > 1"
        ).format(
            t=table, cls=", ".join(pkCols), cnt=pkCols[0],
            rel=" AND ".join([
                "{t}.{c} = foo.{c}".format(t=table, c=col) for col in pkCols
            ])
        )
    
    else:
        q = (
            "SELECT foo.* FROM ({q_}) AS foo INNER JOIN ("
                "SELECT {cls}, COUNT({cnt}) AS conta "
                "FROM ({q_}) AS foo2 GROUP BY {cls}"
            ") AS jt ON {rel} "
            "WHERE conta > 1"
        ).format(
            q_=table, cls=", ".join(pkCols), cnt=pkCols[0],
            rel=" AND ".join([
                "foo.{c} = jt.{c}".format(c=x) for x in pkCols
            ])
        )
    
    data = q_to_obj(db_name, q, db_api='psql')
    
    obj_to_tbl(data, outFile)
    
    return outFile
def count_by_periods_with_certain_duration(db, PERIOD_INTERVAL, pgtable,
                                           TIME_FIELD, outTable,
                                           filterWhere=None):
    """
    Count rows in a pgtable by periods of X minutes
    
    PERIOD_INTERVAL = "01:00:00"
    """
    
    import pandas
    from glass.pys.tm import day_to_intervals2
    from glass.ng.sql.q import q_to_obj
    from glass.ng.wt import obj_to_tbl
    
    # Get Intervals
    INTERVALS = day_to_intervals2(PERIOD_INTERVAL)
    
    # For each interval/period, count the number of rows
    counting = None
    for _int_ in INTERVALS:
        QUERY = (
            "SELECT COUNT(*) AS count FROM {table} WHERE "
            "TO_TIMESTAMP({timeCol}, 'HH24:MI:SS') >= "
            "TO_TIMESTAMP('{minLower}', 'HH24:MI:SS') AND "
            "TO_TIMESTAMP({timeCol}, 'HH24:MI:SS') < "
            "TO_TIMESTAMP('{minUpper}', 'HH24:MI:SS'){whr}"
        ).format(
            table=pgtable, timeCol=TIME_FIELD,
            minLower=_int_[0], minUpper=_int_[1],
            whr="" if not filterWhere else " AND ({})".format(filterWhere)
        )
        
        count = q_to_obj(db, QUERY, db_api='psql')
        
        count.rename(index={
            0: "{}-{}".format(_int_[0][:5], _int_[1][:5])
        }, inplace=True)
        
        if type(counting) != pandas.DataFrame:
            counting = count.copy()
        
        else:
            counting = pandas.concat([counting, count])
    
    obj_to_tbl(counting, outTable)
    
    return outTable
def field_sum_two_tables(tableOne, tableTwo, joinFieldOne, joinFieldTwo,
                         field_to_sum, outTable):
    """
    Sum same field in different tables
    
    Table 1:
    id | field
    0  |  10
    1  |  11
    2  |  13
    3  |  10
    
    Table 2:
    id | field
    0  |  10
    1  |   9
    2  |  17
    4  |  15
    
    Create the new table
    id | field
    0  |  20
    1  |  20
    2  |  30
    3  |  10
    4  |  15
    """
    
    from glass.ng.rd import tbl_to_obj
    from glass.ng.wt import obj_to_tbl
    from glass.ng.pd.joins import sum_field_of_two_tables
    
    # Open two tables
    df_one = tbl_to_obj(tableOne)
    df_two = tbl_to_obj(tableTwo)
    
    # Do it!
    outDf = sum_field_of_two_tables(
        df_one, joinFieldOne, df_two, joinFieldTwo, field_to_sum
    )
    
    obj_to_tbl(outDf, outTable)
    
    return outTable
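# Example (hypothetical file names): summing the 'field' column of the two
# tables shown in the docstring above, joined on 'id':
#
# field_sum_two_tables(
#     '/tmp/table_one.xlsx', '/tmp/table_two.xlsx',
#     'id', 'id', 'field', '/tmp/table_sum.xlsx'
# )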
def merge_tbls(folder, out_tbl, tbl_format='.dbf'):
    """
    Merge all tables in folder into one single table
    """
    
    from glass.pys.oss import lst_ff
    from glass.ng.rd import tbl_to_obj
    from glass.ng.wt import obj_to_tbl
    from glass.ng.pd import merge_df
    
    tbls = lst_ff(folder, file_format=tbl_format)
    
    tbls_dfs = [tbl_to_obj(t) for t in tbls]
    
    out_df = merge_df(tbls_dfs)
    
    obj_to_tbl(out_df, out_tbl)
    
    return out_tbl
def record_time_consumed(timeData, outXls):
    """
    Record the time consumed by an OSM2LULC procedure version in an
    excel table
    """
    
    import pandas
    from glass.ng.wt import obj_to_tbl
    
    # Produce main table - Time consumed by rule
    main = [{
        'rule': timeData[i][0], 'time': timeData[i][1]
    } for i in range(len(timeData.keys())) if timeData[i]]
    
    # Produce detailed table - Time consumed inside rules
    timeInsideRule = []
    timeDataKeys = list(timeData.keys())
    timeDataKeys.sort()
    
    for i in timeDataKeys:
        if not timeData[i]:
            continue
        
        if len(timeData[i]) == 2:
            timeInsideRule.append({
                'rule': timeData[i][0], 'task': timeData[i][0],
                'time': timeData[i][1]
            })
        
        elif len(timeData[i]) == 3:
            taskKeys = list(timeData[i][2].keys())
            taskKeys.sort()
            for task in taskKeys:
                if not timeData[i][2][task]:
                    continue
                
                timeInsideRule.append({
                    'rule': timeData[i][0],
                    'task': timeData[i][2][task][0],
                    'time': timeData[i][2][task][1]
                })
        
        else:
            print('timeData object with key {} is not valid'.format(i))
    
    # Export tables to excel
    dfs = [pandas.DataFrame(main), pandas.DataFrame(timeInsideRule)]
    
    return obj_to_tbl(dfs, outXls, sheetsName=['general', 'detailed'])
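# The expected shape of timeData, inferred from the access patterns above
# (integer keys; each value is None, a (rule, time) pair, or a
# (rule, time, tasks) triple whose tasks dict has the same pair layout):
#
# timeData = {
#     0: ('rule_1', 12.3),
#     1: ('rule_2', 45.6, {0: ('task_a', 20.1), 1: ('task_b', 25.5)}),
#     2: None
# }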
def model_conf_matrix(tblFile, refCol, clsCol, outMxt):
    """
    Model Evaluation
    """
    
    import pandas as pd
    from glass.ng.rd import tbl_to_obj
    from glass.ng.wt import obj_to_tbl
    from sklearn.metrics import confusion_matrix, classification_report
    
    data = tbl_to_obj(tblFile)
    
    data[refCol] = data[refCol].astype(str)
    data[clsCol] = data[clsCol].astype(str)
    
    ref_id = data[[refCol]].drop_duplicates().sort_values(refCol)
    
    conf_mat = confusion_matrix(data[refCol], data[clsCol])
    
    mxt = pd.DataFrame(
        conf_mat,
        columns=ref_id[refCol].values, index=ref_id[refCol].values
    )
    mxt.reset_index(inplace=True)
    mxt.rename(columns={'index': 'confusion_mxt'}, inplace=True)
    
    # Get classification report
    report = classification_report(
        data[refCol], data[clsCol],
        target_names=ref_id[refCol], output_dict=True
    )
    
    global_keys = ['accuracy', 'macro avg', 'micro avg', 'weighted avg']
    
    cls_eval = {k: report[k] for k in report if k not in global_keys}
    glb_eval = {k: report[k] for k in report if k in global_keys}
    
    if 'accuracy' in glb_eval:
        glb_eval['accuracy'] = {
            'f1-score': glb_eval['accuracy'],
            'precision': 0, 'recall': 0, 'support': 0
        }
    
    cls_eval = pd.DataFrame(cls_eval).T
    gbl_eval = pd.DataFrame(glb_eval).T
    
    return obj_to_tbl(
        [gbl_eval, cls_eval, mxt], outMxt,
        sheetsName=['global', 'report', 'matrix']
    )
def merge_xls_in_folder(tbl_folder, out_table):
    """
    Get all excel tables in a folder and make one table of them
    """
    
    import pandas
    from glass.pys.oss import lst_ff
    from glass.ng.rd import tbl_to_obj
    from glass.ng.wt import obj_to_tbl
    
    tables = lst_ff(tbl_folder, file_format=['.xls', '.xlsx'])
    
    dfs = [tbl_to_obj(table) for table in tables]
    
    result = pandas.concat(dfs)
    
    out_table = obj_to_tbl(result, out_table)
    
    return out_table
def tbl_to_tbl(inTbl, outTbl, inSheet=None, txtDelimiter=None,
               inTxtDelimiter=None, inEncoding='utf-8'):
    """
    Convert data format
    """
    
    from glass.ng.rd import tbl_to_obj
    from glass.ng.wt import obj_to_tbl
    
    data = tbl_to_obj(
        inTbl, sheet=inSheet, encoding_=inEncoding,
        _delimiter=inTxtDelimiter
    )
    
    outTbl = obj_to_tbl(data, outTbl, delimiter=txtDelimiter)
    
    return outTbl
def exp_by_group_relfeat(shp, group_col, relfeat, relfeat_id,
                         reltbl, reltbl_sheet, group_fk, relfeat_fk,
                         out_folder, out_tbl):
    """
    Identify groups in shp, get features related to these groups and
    export group features and related features to new files
    """
    
    import os
    import pandas as pd
    from glass.ng.rd import tbl_to_obj
    from glass.ng.wt import obj_to_tbl
    from glass.g.rd.shp import shp_to_obj
    from glass.g.wt.shp import obj_to_shp
    from glass.g.prop.prj import get_shp_epsg
    
    epsg = get_shp_epsg(shp)
    
    # Open data
    shp_df = shp_to_obj(shp)
    rel_df = shp_to_obj(relfeat)
    
    # Get table with relations N-N
    nn_tbl = tbl_to_obj(reltbl, sheet=reltbl_sheet)
    
    # Relate relfeat with shp groups
    rel_df = rel_df.merge(
        nn_tbl, how='inner', left_on=relfeat_id, right_on=relfeat_fk
    )
    
    # List Groups
    grp_df = pd.DataFrame({
        'cnttemp': shp_df.groupby([group_col])[group_col].agg('count')
    }).reset_index()
    
    ntbls = []
    # Filter and export
    for idx, row in grp_df.iterrows():
        # Get shp_df filter
        new_shp = shp_df[shp_df[group_col] == row[group_col]]
        
        # Get relfeat filter
        new_relf = rel_df[rel_df[group_fk] == row[group_col]]
        
        # Export
        shp_i = obj_to_shp(new_shp, 'geometry', epsg, os.path.join(
            out_folder, 'lyr_{}.shp'.format(row[group_col])
        ))
        
        rel_i = obj_to_shp(new_relf, 'geometry', epsg, os.path.join(
            out_folder, 'rel_{}.shp'.format(row[group_col])
        ))
        
        ntbls.append([row[group_col], shp_i, rel_i])
    
    ntbls = pd.DataFrame(ntbls, columns=['group_id', 'shp_i', 'rel_i'])
    
    obj_to_tbl(ntbls, out_tbl)
    
    return out_tbl
def timedist_stopsPairs(db, GTFS_SCHEMA, outfile):
    """
    Use GTFS DB to calculate the mean time between all stops pairs
    for all route_id.
    
    Definition of a stop pair: for a route with 10 stops, the time distance
    will be estimated for the following pairs:
    1|2; 2|3; 3|4; 4|5; 5|6; 6|7; 7|8; 8|9; 9|10.
    
    So, the time distance will not be calculated for all possible
    combinations of bus stops.
    
    GTFS_SCHEMA = {
        "TRIPS" : {
            "TNAME"    : "trips",
            "TRIP_ID"  : "trip_id",
            "ROUTE_ID" : "route_id"
        },
        "ROUTES" : {
            "TNAME"      : "routes",
            "ROUTE_ID"   : "route_id",
            "ROUTE_NAME" : "route_short_name"
        },
        "STOP_TIMES" : {
            "TNAME"     : "stop_times",
            "TRIP_ID"   : "trip_id",
            "STOP_ID"   : "stop_id",
            "ORDER"     : "stop_sequence",
            "ARRIVAL"   : "arrival_time",
            "DEPARTURE" : "departure_time"
        }
    }
    
    The output will be something like this:
    
    route | origin | o_order | destination | d_order | duration
     12E  |  XXX   |    1    |     XXX     |    2    | XX:XX:XX
     12E  |  XXX   |    2    |     XXX     |    3    | XX:XX:XX
     12E  |  XXX   |    3    |     XXX     |    4    | XX:XX:XX
     12E  |  XXX   |    4    |     XXX     |    5    | XX:XX:XX
     12E  |  XXX   |    5    |     XXX     |    6    | XX:XX:XX
     12E  |  XXX   |    6    |     XXX     |    7    | XX:XX:XX
     15E  |  XXX   |    1    |     XXX     |    2    | XX:XX:XX
     15E  |  XXX   |    2    |     XXX     |    3    | XX:XX:XX
     15E  |  XXX   |    3    |     XXX     |    4    | XX:XX:XX
     15E  |  XXX   |    4    |     XXX     |    5    | XX:XX:XX
    """
    
    from glass.ng.sql.q import q_to_obj
    from glass.ng.wt import obj_to_tbl
    
    SQL_QUERY = (
        "SELECT route, origin, o_order, destination, d_order, "
        "AVG(duration) AS duration FROM ("
            "SELECT foo.*, "
            "(foo.time_arrival - foo.time_departure) AS duration FROM ("
                "SELECT {tripid}, {stopid} AS origin, "
                "{stp_order} AS o_order, "
                "LEAD({stopid}) OVER("
                    "PARTITION BY {tripid} ORDER BY {tripid}, {stp_order}"
                ") AS destination, "
                "LEAD({stp_order}) OVER("
                    "PARTITION BY {tripid} ORDER BY {tripid}, {stp_order}"
                ") AS d_order, "
                "TO_TIMESTAMP({dep_time}, 'HH24:MI:SS') AS time_departure, "
                "LEAD(TO_TIMESTAMP({arr_time}, 'HH24:MI:SS')) OVER("
                    "PARTITION BY {tripid} ORDER BY {tripid}, {stp_order}"
                ") AS time_arrival, "
                "{route_name} AS route FROM {stopTm} INNER JOIN ("
                    "SELECT {tripsT}.{Ttripsid} AS trip_fid, "
                    "{routesT}.{route_name} "
                    "FROM {tripsT} INNER JOIN {routesT} ON "
                    "{tripsT}.{Ttrouteid} = {routesT}.{Rrouteid}"
                ") AS trips_routes "
                "ON {stopTm}.{tripid} = trips_routes.trip_fid "
                "ORDER BY {tripid}, {stp_order}"
            ") AS foo "
            "WHERE time_arrival IS NOT NULL "
            "ORDER BY {tripid}, o_order"
        ") AS allods "
        "GROUP BY route, origin, o_order, destination, d_order "
        "ORDER BY route, o_order"
    ).format(
        tripid=GTFS_SCHEMA["STOP_TIMES"]["TRIP_ID"],
        stopid=GTFS_SCHEMA["STOP_TIMES"]["STOP_ID"],
        stp_order=GTFS_SCHEMA["STOP_TIMES"]["ORDER"],
        dep_time=GTFS_SCHEMA["STOP_TIMES"]["DEPARTURE"],
        arr_time=GTFS_SCHEMA["STOP_TIMES"]["ARRIVAL"],
        stopTm=GTFS_SCHEMA["STOP_TIMES"]["TNAME"],
        route_name=GTFS_SCHEMA["ROUTES"]["ROUTE_NAME"],
        routesT=GTFS_SCHEMA["ROUTES"]["TNAME"],
        Rrouteid=GTFS_SCHEMA["ROUTES"]["ROUTE_ID"],
        Ttrouteid=GTFS_SCHEMA["TRIPS"]["ROUTE_ID"],
        tripsT=GTFS_SCHEMA["TRIPS"]["TNAME"],
        Ttripsid=GTFS_SCHEMA["TRIPS"]["TRIP_ID"]
    )
    
    table = q_to_obj(db, SQL_QUERY)
    
    return obj_to_tbl(table, outfile)
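# Example (hypothetical db and output path), using the GTFS column names
# shown in the docstring above:
#
# timedist_stopsPairs(db, {
#     "TRIPS"      : {"TNAME" : "trips", "TRIP_ID" : "trip_id",
#                     "ROUTE_ID" : "route_id"},
#     "ROUTES"     : {"TNAME" : "routes", "ROUTE_ID" : "route_id",
#                     "ROUTE_NAME" : "route_short_name"},
#     "STOP_TIMES" : {"TNAME" : "stop_times", "TRIP_ID" : "trip_id",
#                     "STOP_ID" : "stop_id", "ORDER" : "stop_sequence",
#                     "ARRIVAL" : "arrival_time",
#                     "DEPARTURE" : "departure_time"}
# }, '/tmp/stops_pairs_timedist.xlsx')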
def meanrowsday_of_periods_by_entity(psql_con, pgtable, dayField, hourField,
                                     minutesField, secondField, entityField,
                                     PERIODS, outFile, filterData=None,
                                     numberDays=None):
    """
    Evolution of meanday_of_periods_by_entity:
    For every day in a pgtable, count the number of rows by periods of X
    minutes for each interest entity.
    
    At the end, calculate the mean between every day for each period.
    
    This method uses SQL and TimeInterval columns.
    
    PERIODS = [('07:30:00', '09:30:00'), ('07:30:00', '09:30:00')]
    
    It is not complete because the output table does not have a column
    for each period.
    """
    
    from glass.pys import obj_to_lst
    from glass.ng.sql.q import q_to_obj
    from glass.ng.wt import obj_to_tbl
    
    def get_case(PTUPLE, PFIELD):
        return (
            "CASE "
                "WHEN TO_TIMESTAMP("
                    "COALESCE(CAST({h} AS text), '') || ':' || "
                    "COALESCE(CAST({m} AS text), '') || ':' || "
                    "COALESCE(CAST({s} AS text), ''), 'HH24:MI:SS'"
                ") >= TO_TIMESTAMP('{tLower}', 'HH24:MI:SS') AND "
                "TO_TIMESTAMP("
                    "COALESCE(CAST({h} AS text), '') || ':' || "
                    "COALESCE(CAST({m} AS text), '') || ':' || "
                    "COALESCE(CAST({s} AS text), ''), 'HH24:MI:SS'"
                ") < TO_TIMESTAMP('{tUpper}', 'HH24:MI:SS') "
                "THEN 1 ELSE 0 "
            "END AS {fld}"
        ).format(
            h=hourField, m=minutesField, s=secondField,
            tLower=PTUPLE[0], tUpper=PTUPLE[1], fld=PFIELD
        )
    
    entityField = obj_to_lst(entityField)
    
    periodsCols = [
        "p{ha}h{ma}_{hb}h{mb}".format(
            ha=p[0].split(':')[0], ma=p[0].split(':')[1],
            hb=p[1].split(':')[0], mb=p[1].split(':')[1]
        ) for p in PERIODS
    ]
    
    ndaysQ = "SELECT {} AS nday".format(numberDays) if numberDays else (
        "SELECT MAX(nday) AS nday FROM ("
            "SELECT row_number() OVER(ORDER BY {dayF}) AS nday "
            "FROM {t} {whr}"
            "GROUP BY {dayF}"
        ") AS dayt"
    ).format(
        dayF=dayField, t=pgtable,
        whr="" if not filterData else "WHERE {} ".format(filterData)
    )
    
    # Get mean rows of all days by entity and period
    q = (
        "SELECT {entityF}, {meanSq}, nday FROM ("
            "SELECT {entityF}, {dayF}, {sumSeq} FROM ("
                "SELECT {entityF}, {dayF}, {caseSt} FROM {t} {whr}"
            ") AS foo "
            "WHERE {whrSq} "
            "GROUP BY {entityF}, {dayF}"
        ") AS foo2, ({getND}) AS fooday "
        "GROUP BY {entityF}, nday"
    ).format(
        entityF=", ".join(entityField),
        meanSq=", ".join([
            "(SUM({f}) / nday) AS {f}".format(f=p) for p in periodsCols
        ]),
        dayF=dayField,
        sumSeq=", ".join([
            "SUM({f}) AS {f}".format(f=p) for p in periodsCols
        ]),
        caseSt=", ".join([
            get_case(PERIODS[x], periodsCols[x]) for x in range(len(PERIODS))
        ]),
        t=pgtable,
        whr="" if not filterData else "WHERE {} ".format(filterData),
        whrSq=" OR ".join(["{}=1".format(p) for p in periodsCols]),
        getND=ndaysQ
    )
    
    data = q_to_obj(psql_con, q, db_api='psql')
    
    obj_to_tbl(data, outFile)
    
    return outFile
def get_not_used_tags(OSM_FILE, OUT_TBL):
    """
    Use a file OSM to detect tags not considered in the
    OSM2LULC procedure
    """
    
    import os
    from glass.ng.wt import obj_to_tbl
    from glass.g.tbl.filter import sel_by_attr
    from glass.ng.sql.q import q_to_obj
    from glass.ng.pd.split import df_split
    from glass.pys.oss import fprop
    from glass.g.it.osm import osm_to_gpkg
    
    OSM_TAG_MAP = {
        "DB" : os.path.join(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
            'osmtolulc.sqlite'
        ),
        "OSM_FEAT"  : "osm_features",
        "KEY_COL"   : "key",
        "VALUE_COL" : "value",
        "GEOM_COL"  : "geom"
    }
    
    WORKSPACE = os.path.dirname(OUT_TBL)
    
    sqdb = osm_to_gpkg(OSM_FILE, os.path.join(
        WORKSPACE, fprop(OSM_FILE, 'fn') + '.gpkg'
    ))
    
    # Get Features we are considering
    ourOSMFeatures = q_to_obj(OSM_TAG_MAP["DB"], (
        "SELECT {key} AS key_y, {value} AS value_y, {geom} AS geom_y "
        "FROM {tbl}"
    ).format(
        key=OSM_TAG_MAP["KEY_COL"], value=OSM_TAG_MAP["VALUE_COL"],
        geom=OSM_TAG_MAP["GEOM_COL"], tbl=OSM_TAG_MAP["OSM_FEAT"]
    ), db_api='sqlite')
    
    # Get Features in File
    TABLES_TAGS = {
        'points' : ['highway', 'man_made', 'building'],
        'lines'  : [
            'highway', 'waterway', 'aerialway', 'barrier', 'man_made',
            'railway'
        ],
        'multipolygons' : [
            'aeroway', 'amenity', 'barrier', 'building', 'craft',
            'historic', 'land_area', 'landuse', 'leisure', 'man_made',
            'military', 'natural', 'office', 'place', 'shop', 'sport',
            'tourism', 'waterway', 'power', 'railway', 'healthcare',
            'highway'
        ]
    }
    
    Qs = [
        " UNION ALL ".join([(
            "SELECT '{keycol}' AS key, {keycol} AS value, "
            "'{geomtype}' AS geom FROM {tbl} WHERE "
            "{keycol} IS NOT NULL"
        ).format(
            keycol=c,
            geomtype='Point' if table == 'points' else 'Line' \
                if table == 'lines' else 'Polygon',
            tbl=table
        ) for c in TABLES_TAGS[table]]) for table in TABLES_TAGS
    ]
    
    fileOSMFeatures = q_to_obj(sqdb, (
        "SELECT key, value, geom FROM ({}) AS foo "
        "GROUP BY key, value, geom"
    ).format(" UNION ALL ".join(Qs)), db_api='sqlite')
    
    _fileOSMFeatures = fileOSMFeatures.merge(
        ourOSMFeatures, how='outer',
        left_on=["key", "value", "geom"],
        right_on=["key_y", "value_y", "geom_y"]
    )
    
    # Select OSM Features of file without correspondence
    _fileOSMFeatures["isnew"] = _fileOSMFeatures.key_y.fillna(value='nenhum')
    
    newTags = _fileOSMFeatures[_fileOSMFeatures.isnew == 'nenhum']
    
    newTags["value"] = newTags.value.str.replace("'", "''")
    
    newTags["whr"] = newTags.key + "='" + newTags.value + "'"
    
    # Export tags not being used to new shapefile
    def to_regular_str(row):
        # sanitize WHERE clause string (currently a pass-through)
        san_str = row.whr
        row["whr_san"] = san_str
        
        return row
    
    for t in TABLES_TAGS:
        if t == 'points':
            filterDf = newTags[newTags.geom == 'Point']
        elif t == 'lines':
            filterDf = newTags[newTags.geom == 'Line']
        elif t == 'multipolygons':
            filterDf = newTags[newTags.geom == 'Polygon']
        
        if filterDf.shape[0] > 500:
            dfs = df_split(filterDf, 500, nrows=True)
        else:
            dfs = [filterDf]
        
        Q = "SELECT * FROM {} WHERE {}"
        
        i = 1
        for df in dfs:
            fn = t + '.shp' if len(dfs) == 1 else '{}_{}.shp'.format(
                t, str(i)
            )
            
            try:
                shp = sel_by_attr(sqdb, Q.format(
                    t, df.whr.str.cat(sep=" OR ")
                ), os.path.join(WORKSPACE, fn), api_gis='ogr')
            except:
                __df = df.apply(lambda x: to_regular_str(x), axis=1)
                shp = sel_by_attr(sqdb, Q.format(
                    t, __df.whr.str.cat(sep=" OR ")
                ), os.path.join(WORKSPACE, fn))
            
            i += 1
    
    # Export OUT_TBL with tags not being used
    newTags.drop(['key_y', 'value_y', 'geom_y', 'isnew', 'whr'],
                 axis=1, inplace=True)
    obj_to_tbl(newTags, OUT_TBL, sheetsName="new_tags", sanitizeUtf8=True)
    
    return OUT_TBL
def datatocls_multiref(shpfile, mapstbl, sheet, slugs, titles, ncls,
                       decplace, outshp, outmapstbl, method="QUANTILE"):
    """
    Create classes/intervals for each layout in table (mapstbl)
    One layout could have more than one map... deal with that situation
    
    method options:
    * QUANTILE;
    * JENKS - natural breaks (jenks);
    """
    
    import pandas as pd
    import numpy as np
    from glass.pys import obj_to_lst
    from glass.g.rd.shp import shp_to_obj
    from glass.g.wt.shp import df_to_shp
    from glass.ng.rd import tbl_to_obj
    from glass.ng.wt import obj_to_tbl
    from glass.ng.pd.fld import listval_to_newcols
    from glass.g.lyt.diutils import eval_intervals
    
    methods = ["QUANTILE", "JENKS"]
    
    if method not in methods:
        raise ValueError(f'Method {method} is not available')
    
    if method == "QUANTILE":
        from glass.ng.pd.stats import get_intervals
    elif method == "JENKS":
        import jenkspy
    
    slugs = obj_to_lst(slugs)
    titles = obj_to_lst(titles)
    
    # Read data
    shp = shp_to_obj(shpfile)
    maps = tbl_to_obj(mapstbl, sheet=sheet)
    
    # Get intervals for each map
    istats = []
    cols = []
    for i, row in maps.iterrows():
        ddig = row[decplace]
        icols = [row[slug] for slug in slugs]
        ititles = [row[title] for title in titles]
        
        istatsrow = []
        for _i in range(len(icols)):
            min_v = shp[icols[_i]].min()
            max_v = shp[icols[_i]].max()
            mean_v = shp[icols[_i]].mean()
            std_v = shp[icols[_i]].std()
            
            if method == "QUANTILE":
                intervals = get_intervals(
                    shp, icols[_i], ncls, method="QUANTILE")
                intervals.append(max_v)
            
            elif method == "JENKS":
                breaks = jenkspy.jenks_breaks(shp[icols[_i]], nb_class=ncls)
                intervals = breaks[1:]
            
            if not str(shp[icols[_i]].dtype).startswith('int'):
                __intervals = [round(itv, ddig) for itv in intervals]
                
                __intervals, ndig = eval_intervals(
                    intervals, __intervals, ddig, round(min_v, ddig)
                )
                
                istatsrow.extend([
                    icols[_i], ititles[_i], round(min_v, ndig),
                    round(max_v, ndig), round(mean_v, ddig),
                    round(std_v, ddig), __intervals
                ])
                
                shp[icols[_i]] = shp[icols[_i]].round(ddig)
            
            else:
                for _e in range(len(intervals)):
                    if not _e:
                        rzero = 1 if round(intervals[_e], 0) > min_v else 0
                    else:
                        rzero = 1 if round(intervals[_e], 0) > \
                            round(intervals[_e - 1], 0) else 0
                    
                    if not rzero:
                        break
                
                __intervals = [round(
                    _o, ddig if not rzero else 0
                ) for _o in intervals]
                
                __intervals, ndig = eval_intervals(
                    intervals, __intervals, ddig, min_v
                )
                
                istatsrow.extend([
                    icols[_i], ititles[_i], min_v, max_v,
                    int(round(mean_v, 0)) if rzero else round(mean_v, ddig),
                    int(round(std_v, 0)) if rzero else round(std_v, ddig),
                    __intervals
                ])
            
            if not i:
                cols.extend([
                    f'slug{str(_i+1)}', f'title{str(_i+1)}',
                    f'min_value{str(_i+1)}', f'max_value{str(_i+1)}',
                    f'mean_value{str(_i+1)}', f'std_value{str(_i+1)}',
                    f'intervals{str(_i+1)}'
                ])
        
        istats.append(istatsrow)
    
    istats = pd.DataFrame(istats, columns=cols)
    
    rename_cols = {}
    for idx, row in istats.iterrows():
        for _i in range(len(slugs)):
            # Get intervals
            int_ = row[f'intervals{str(_i+1)}']
            
            # Add columns for intervals ids
            newcol = 'i_' + row[f'slug{str(_i+1)}']
            shp[newcol] = 0
            
            for itv in range(len(int_)):
                if not itv:
                    shp[newcol] = np.where(
                        shp[row[f'slug{str(_i+1)}']] <= int_[itv],
                        itv + 1, shp[newcol]
                    )
                else:
                    shp[newcol] = np.where(
                        (shp[row[f'slug{str(_i+1)}']] > int_[itv-1]) &
                        (shp[row[f'slug{str(_i+1)}']] <= int_[itv]),
                        itv + 1, shp[newcol]
                    )
            
            rename_cols[newcol] = row[f'slug{str(_i+1)}']
    
    dc = []
    for c in range(len(slugs)):
        dc.extend(istats[f'slug{str(c+1)}'].tolist())
    
    shp.drop(dc, axis=1, inplace=True)
    shp.rename(columns=rename_cols, inplace=True)
    
    for i in range(len(slugs)):
        istats = listval_to_newcols(istats, f'intervals{str(i+1)}')
        istats.rename(columns={
            ii : f'intervals{str(i+1)}_{str(ii+1)}' for ii in range(ncls)
        }, inplace=True)
    
    # Write outputs
    df_to_shp(shp, outshp)
    obj_to_tbl(istats, outmapstbl)
    
    return outshp, outmapstbl
def datatocls_meanstd(shp_data, maps_table, sheet, slug, title, ncls,
                      decplace, nodata, out_shp, out_maps_tbl, grpcol=None):
    """
    Create classes based on mean and standard deviation
    
    decplace - number of decimal places to show in the layout values
    nodata - Must be always smaller than the min of min values
    """
    
    import pandas as pd
    import numpy as np
    from glass.g.rd.shp import shp_to_obj
    from glass.g.wt.shp import df_to_shp
    from glass.ng.rd import tbl_to_obj
    from glass.ng.wt import obj_to_tbl
    from glass.ng.pd.fld import listval_to_newcols
    from glass.g.lyt.diutils import eval_intervals
    
    # Read data
    shp_df = shp_to_obj(shp_data)
    
    maps_df = tbl_to_obj(maps_table, sheet=sheet)
    
    if grpcol:
        maps_cols = maps_df[slug].tolist()
        for c in maps_cols:
            shp_df[c] = shp_df[c].astype(float)
        agg_dict = {c : 'mean' for c in maps_cols}
        shp_df = pd.DataFrame(shp_df.groupby([grpcol]).agg(
            agg_dict
        )).reset_index()
    
    def get_intervals(_ncls, mean, std):
        mean_class = mean + (std / 2)
        
        less_mean = []
        major_mean = []
        for e in range(_ncls):
            if not e:
                less_mean.append(mean - (std / 2))
                major_mean.append(mean_class + std)
            else:
                less_mean.append(less_mean[e - 1] - std)
                major_mean.append(major_mean[e - 1] + std)
        
        less_mean.reverse()
        intervals = less_mean + [mean_class] + major_mean
        
        return intervals
    
    # Compute intervals for each indicator
    # (intervals based on the mean and the standard deviation)
    # Get min, max, mean and standard deviation
    # Round values
    i_stats = []
    for idx, row in maps_df.iterrows():
        ddig = row[decplace]
        i = row[slug]
        t = row[title]
        if nodata in shp_df[i].unique():
            vals = list(shp_df[i].unique())
            vals.sort()
            
            min_v = vals[1]
            
            tdf = shp_df[[i]].copy()
            
            tdf = tdf[tdf[i] >= min_v]
            tdf.reset_index(drop=True, inplace=True)
            
            max_v = tdf[i].max()
            mean_v = tdf[i].mean()
            std_v = tdf[i].std()
        
        else:
            min_v = shp_df[i].min()
            max_v = shp_df[i].max()
            mean_v = shp_df[i].mean()
            std_v = shp_df[i].std()
        
        fbreak = min_v - 1
        __std = std_v
        while fbreak <= min_v:
            intervals = get_intervals(ncls, mean_v, __std)
            
            repeat = 0
            for __i in intervals[:-1]:
                if __i > max_v:
                    repeat = 1
                
                if repeat:
                    break
            
            fbreak = intervals[0] if not repeat else min_v - 1
            __std = __std / 2
        
        intervals[-1] = max_v
        
        if not str(shp_df[i].dtype).startswith('int'):
            __intervals = [round(_i, ddig) for _i in intervals]
            
            repeat = 1
            
            __intervals, ndig = eval_intervals(
                intervals, __intervals, ddig, round(min_v, ddig)
            )
            
            i_stats.append([
                i, t, round(min_v, ndig), round(max_v, ndig),
                round(mean_v, ddig), round(std_v, ddig), __intervals
            ])
            
            shp_df[i] = shp_df[i].round(ddig)
        
        else:
            for _e in range(len(intervals)):
                if not _e:
                    rzero = 1 if round(intervals[_e], 0) > min_v else 0
                else:
                    rzero = 1 if round(intervals[_e], 0) > \
                        round(intervals[_e - 1], 0) else 0
                
                if not rzero:
                    break
            
            __intervals = [round(
                _o, ddig if not rzero else 0
            ) for _o in intervals]
            
            __intervals, ndig = eval_intervals(
                intervals, __intervals, ddig, min_v
            )
            
            i_stats.append([
                i, t, min_v, max_v,
                int(round(mean_v, 0)) if rzero else round(mean_v, ddig),
                int(round(std_v, 0)) if rzero else round(std_v, ddig),
                __intervals
            ])
    
    i_stats = pd.DataFrame(i_stats, columns=[
        'slug', 'title', 'min_value', 'max_value',
        'mean_value', 'std_value', 'intervals'
    ])
    
    rename_cols = {}
    for idx, row in i_stats.iterrows():
        # Get intervals
        int_ = row.intervals
        
        # Add columns for intervals
        i_col = 'i_' + row.slug
        shp_df[i_col] = 0
        
        for _i in range(len(int_)):
            if not _i:
                shp_df[i_col] = np.where(
                    (shp_df[row.slug] > nodata) &
                    (shp_df[row.slug] <= int_[_i]),
                    _i + 1, shp_df[i_col]
                )
            else:
                shp_df[i_col] = np.where(
                    (shp_df[row.slug] > int_[_i - 1]) &
                    (shp_df[row.slug] <= int_[_i]),
                    _i + 1, shp_df[i_col]
                )
        
        rename_cols[i_col] = row.slug
    
    shp_df.drop(i_stats.slug, axis=1, inplace=True)
    shp_df.rename(columns=rename_cols, inplace=True)
    
    i_stats = listval_to_newcols(i_stats, 'intervals')
    i_stats.rename(columns={
        i : 'interval_' + str(i+1) for i in range((ncls * 2) + 1)
    }, inplace=True)
    
    if grpcol:
        nshp_df = shp_to_obj(shp_data)
        
        nshp_df.drop(maps_cols, axis=1, inplace=True)
        
        shp_df.rename(columns={grpcol : grpcol + '_y'}, inplace=True)
        
        shp_df = nshp_df.merge(
            shp_df, how='left',
            left_on=grpcol, right_on=grpcol + '_y'
        )
    
    df_to_shp(shp_df, out_shp)
    obj_to_tbl(i_stats, out_maps_tbl)
    
    return out_shp, out_maps_tbl
def meandays_by_entity(db, pgtable, DAY_FIELD, ENTITY_FIELD,
                       COUNT_FIELD_NAME, OUTPUT_FILE, EXCLUDE_DAYS=None):
    """
    For every day in a pgtable, count the number of rows for each
    interest entity.
    
    At the end, calculate the mean of rows between every day for
    each entity.
    
    Day field must be of type text
    """
    
    from glass.ng.sql.q import q_to_obj
    from glass.ng.wt import obj_to_tbl
    
    # Get days
    VALUES = q_to_obj(db, "SELECT {col} FROM {t} GROUP BY {col}".format(
        col=DAY_FIELD, t=pgtable
    ), db_api='psql')[DAY_FIELD].tolist()
    
    # For every day, Group rows by entities
    tableArray = []
    for day in VALUES:
        if EXCLUDE_DAYS:
            if day in EXCLUDE_DAYS:
                continue
        
        QUERY = (
            "SELECT {col}, COUNT({col}) AS {countname} FROM {table} "
            "WHERE {dayF}='{d}' GROUP BY {col}"
        ).format(
            col=ENTITY_FIELD, countname=COUNT_FIELD_NAME,
            table=pgtable, dayF=DAY_FIELD, d=day
        )
        
        countTbl = q_to_obj(db, QUERY, db_api='psql')
        
        tableArray.append(countTbl)
    
    # Get mean for all entities
    main_table = tableArray[0]
    TMP_COUNT_FIELD_NAME = 'join_' + COUNT_FIELD_NAME
    TMP_JOIN_FIELD = 'id_entity'
    
    for i in range(1, len(tableArray)):
        tableArray[i].rename(columns={
            COUNT_FIELD_NAME : TMP_COUNT_FIELD_NAME,
            ENTITY_FIELD     : TMP_JOIN_FIELD
        }, inplace=True)
        
        main_table = main_table.merge(
            tableArray[i], how='outer',
            left_on=ENTITY_FIELD, right_on=TMP_JOIN_FIELD
        )
        
        main_table.fillna(0, inplace=True)
        
        main_table[ENTITY_FIELD].replace(
            0, main_table[TMP_JOIN_FIELD], inplace=True
        )
        
        main_table[COUNT_FIELD_NAME] = main_table[COUNT_FIELD_NAME] + \
            main_table[TMP_COUNT_FIELD_NAME]
        
        main_table.drop(
            [TMP_COUNT_FIELD_NAME, TMP_JOIN_FIELD], axis=1, inplace=True
        )
    
    main_table[COUNT_FIELD_NAME] = main_table[COUNT_FIELD_NAME] / len(
        tableArray
    )
    
    obj_to_tbl(main_table, OUTPUT_FILE)
    
    return OUTPUT_FILE
def datatocls(shpfile, mapstbl, sheet, slug, title, ncls, decplace,
              outshp, outmapstbl, method="QUANTILE"):
    """
    Create classes/intervals for each map in table
    
    method options:
    * QUANTILE;
    * JENKS - natural breaks (jenks);
    """
    
    import pandas as pd
    import numpy as np
    from glass.g.rd.shp import shp_to_obj
    from glass.g.wt.shp import df_to_shp
    from glass.ng.rd import tbl_to_obj
    from glass.ng.wt import obj_to_tbl
    from glass.ng.pd.fld import listval_to_newcols
    from glass.g.lyt.diutils import eval_intervals
    
    methods = ["QUANTILE", "JENKS"]
    
    if method not in methods:
        raise ValueError(f'Method {method} is not available')
    
    if method == "QUANTILE":
        from glass.ng.pd.stats import get_intervals
    elif method == "JENKS":
        import jenkspy
    
    # Read data
    shp = shp_to_obj(shpfile)
    maps = tbl_to_obj(mapstbl, sheet=sheet)
    
    # Get intervals for each map
    istats = []
    for i, row in maps.iterrows():
        ddig = row[decplace]
        icol = row[slug]
        titl = row[title]
        
        min_v = shp[icol].min()
        max_v = shp[icol].max()
        mean_v = shp[icol].mean()
        std_v = shp[icol].std()
        
        if method == "QUANTILE":
            intervals = get_intervals(shp, icol, ncls, method="QUANTILE")
            intervals.append(max_v)
        
        elif method == "JENKS":
            breaks = jenkspy.jenks_breaks(shp[icol], nb_class=ncls)
            intervals = breaks[1:]
        
        if not str(shp[icol].dtype).startswith('int'):
            __intervals = [round(i, ddig) for i in intervals]
            
            __intervals, ndig = eval_intervals(
                intervals, __intervals, ddig, round(min_v, ddig)
            )
            
            istats.append([
                icol, titl, round(min_v, ndig), round(max_v, ndig),
                round(mean_v, ddig), round(std_v, ddig), __intervals
            ])
            
            shp[icol] = shp[icol].round(ddig)
        
        else:
            for _e in range(len(intervals)):
                if not _e:
                    rzero = 1 if round(intervals[_e], 0) > min_v else 0
                else:
                    rzero = 1 if round(intervals[_e], 0) > \
                        round(intervals[_e - 1], 0) else 0
                
                if not rzero:
                    break
            
            __intervals = [round(
                _o, ddig if not rzero else 0
            ) for _o in intervals]
            
            __intervals, ndig = eval_intervals(
                intervals, __intervals, ddig, min_v)
            
            istats.append([
                icol, titl, min_v, max_v,
                int(round(mean_v, 0)) if rzero else round(mean_v, ddig),
                int(round(std_v, 0)) if rzero else round(std_v, ddig),
                __intervals
            ])
    
    istats = pd.DataFrame(istats, columns=[
        "slug", "title", "min_value", "max_value",
        "mean_value", "std_value", "intervals"
    ])
    
    rename_cols = {}
    for idx, row in istats.iterrows():
        # Get intervals
        int_ = row.intervals
        
        # Add columns for intervals
        i_col = 'i_' + row.slug
        shp[i_col] = 0
        
        for _i in range(len(int_)):
            if not _i:
                shp[i_col] = np.where(
                    shp[row.slug] <= int_[_i],
                    _i + 1, shp[i_col]
                )
            else:
                shp[i_col] = np.where(
                    (shp[row.slug] > int_[_i - 1]) &
                    (shp[row.slug] <= int_[_i]),
                    _i + 1, shp[i_col]
                )
        
        rename_cols[i_col] = row.slug
    
    shp.drop(istats.slug, axis=1, inplace=True)
    shp.rename(columns=rename_cols, inplace=True)
    
    istats = listval_to_newcols(istats, 'intervals')
    istats.rename(columns={
        i : 'interval_' + str(i+1) for i in range(ncls)
    }, inplace=True)
    
    # Write outputs
    df_to_shp(shp, outshp)
    obj_to_tbl(istats, outmapstbl)
    
    return outshp, outmapstbl
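# Example (hypothetical paths and column names): classify every map listed
# in a 'maps' sheet into 5 quantile classes:
#
# datatocls(
#     '/tmp/indicators.shp', '/tmp/maps.xlsx', 'maps',
#     'slug', 'title', 5, 'decplaces',
#     '/tmp/indicators_cls.shp', '/tmp/maps_cls.xlsx',
#     method="QUANTILE"
# )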
def calc_mean_samecol_sevshp(intbls, pk, meancol, output, tformat='.shp'):
    """
    Calculate the mean of the same column across several tables
    
    Assume we have N tables with a numerical column with the same name.
    This script calculates the mean of all these columns.
    """
    
    import os
    from glass.ng.wt import obj_to_tbl
    from glass.g.rd.shp import shp_to_obj
    
    if os.path.isdir(intbls):
        from glass.pys.oss import lst_ff
        
        tbls = lst_ff(intbls, file_format='.shp' if not tformat else tformat)
    
    else:
        if type(intbls) == list:
            tbls = intbls
        else:
            raise ValueError('intbls has an invalid value')
    
    # Read data
    dfs = [shp_to_obj(t) for t in tbls]
    
    # Drop unnecessary cols
    mantain_cols = [pk, meancol]
    for d in range(len(dfs)):
        dfs[d].drop(
            [c for c in dfs[d].columns.values if c not in mantain_cols],
            axis=1, inplace=True
        )
        
        if d:
            dfs[d].rename(columns={
                pk      : "{}_{}".format(pk, str(d)),
                meancol : "{}_{}".format(meancol, str(d))
            }, inplace=True)
    
    # Join all DFS
    main_df = dfs[0]
    
    for d in range(1, len(dfs)):
        main_df = main_df.merge(
            dfs[d], how='outer',
            left_on=pk, right_on="{}_{}".format(pk, str(d))
        )
        
        main_df[meancol] = main_df[meancol] + main_df[meancol + "_" + str(d)]
    
    # Get mean
    main_df[meancol] = main_df[meancol] / len(dfs)
    
    # Drop unnecessary cols
    drop_cols = []
    for d in range(1, len(dfs)):
        drop_cols.append("{}_{}".format(pk, str(d)))
        drop_cols.append("{}_{}".format(meancol, str(d)))
    
    main_df.drop(drop_cols, axis=1, inplace=True)
    
    # Export Result
    obj_to_tbl(main_df, output)
    
    return output
def join_tables_in_table(mainTable, mainIdField, joinTables, outTable):
    """
    Join one table with all tables in a folder
    
    joinTables = {
        r'D:\TRENMO_JASP\CARRIS\valid_by_para\period_16_17h59\sabado\fvalidacoes_v6_2018-01-06.xlsx' : {
            "JOIN_FIELD"   : 'paragem',
            "COLS_TO_JOIN" : {'n_validacao' : 'dia_6'}
        },
        r'D:\TRENMO_JASP\CARRIS\valid_by_para\period_16_17h59\sabado\fvalidacoes_v6_2018-01-13.xlsx' : {
            "JOIN_FIELD"   : 'paragem',
            "COLS_TO_JOIN" : {'n_validacao' : 'dia_13'}
        },
        r'D:\TRENMO_JASP\CARRIS\valid_by_para\period_16_17h59\sabado\fvalidacoes_v6_2018-01-20.xlsx' : {
            "JOIN_FIELD"   : 'paragem',
            "COLS_TO_JOIN" : {'n_validacao' : 'dia_20'}
        },
        r'D:\TRENMO_JASP\CARRIS\valid_by_para\period_16_17h59\sabado\fvalidacoes_v6_2018-01-27.xlsx' : {
            "JOIN_FIELD"   : 'paragem',
            "COLS_TO_JOIN" : {'n_validacao' : 'dia_27'}
        }
    }
    
    #TODO: only works with xlsx tables as join TABLES
    """
    
    # Modules
    import os
    import pandas
    from glass.ng.rd import tbl_to_obj
    from glass.ng.wt import obj_to_tbl
    
    # Get table format
    tableType = os.path.splitext(mainTable)[1]
    
    tableDf = tbl_to_obj(mainTable)
    
    for table in joinTables:
        xlsDf = tbl_to_obj(table)
        
        join_field = 'id_entity' if joinTables[table]["JOIN_FIELD"] == \
            mainIdField else joinTables[table]["JOIN_FIELD"]
        
        if joinTables[table]["JOIN_FIELD"] == mainIdField:
            xlsDf.rename(columns={mainIdField: join_field}, inplace=True)
        
        xlsDf.rename(columns=joinTables[table]["COLS_TO_JOIN"], inplace=True)
        
        tableDf = tableDf.merge(
            xlsDf, how='outer',
            left_on=mainIdField, right_on=join_field
        )
        
        tableDf.fillna(0, inplace=True)
        tableDf[mainIdField].replace(0, tableDf[join_field], inplace=True)
        
        tableDf.drop(join_field, axis=1, inplace=True)
    
    obj_to_tbl(tableDf, outTable)
    
    return outTable
def meanrowsday_by_entity(psqldb, pgtable, dayField, entityField, out_file,
                          filterData=None, newMeanField=None,
                          numberDays=None):
    """
    For every day in a pgtable, count the number of rows for each
    interest entity.
    
    At the end, calculate the mean of rows between every day for
    each entity.
    
    Day field must be of type text
    
    Difference in relation to meandays_by_entity: this one uses only SQL
    and PGSQL and not Pandas.
    
    if numberDays=None, the number of days used will be based on the days
    included in the data. If you want the mean for 5 days, but there are
    no data for one of these days, with numberDays=None, the mean will be
    only for 4 days.
    """
    
    from glass.pys import obj_to_lst
    from glass.ng.sql.q import q_to_obj
    from glass.ng.wt import obj_to_tbl
    
    entityField = obj_to_lst(entityField)
    
    mean_field = "mean_rows" if not newMeanField else newMeanField
    
    ndaysQ = "SELECT {} AS nday".format(numberDays) if numberDays else (
        "SELECT MAX(nday) AS nday FROM ("
            "SELECT row_number() OVER(ORDER BY {dayF}) AS nday "
            "FROM {t} {whr}"
            "GROUP BY {dayF}"
        ") AS fooday"
    ).format(
        whr="" if not filterData else "WHERE {} ".format(filterData),
        dayF=dayField, t=pgtable
    )
    
    # Get mean rows of all days by entity
    q = (
        "SELECT {entityF}, (SUM(conta) / nday) AS {mF} "
        "FROM ("
            "SELECT {entityF}, {dayF}, COUNT({cnt}) AS conta "
            "FROM {t} {whr}"
            "GROUP BY {entityF}, {dayF}"
        ") AS foo, ({getD}) AS foo2 "
        "GROUP BY {entityF}, nday"
    ).format(
        entityF=", ".join(entityField), dayF=dayField,
        mF=mean_field, cnt=entityField[0], t=pgtable,
        whr="" if not filterData else "WHERE {} ".format(filterData),
        getD=ndaysQ
    )
    
    data = q_to_obj(psqldb, q, db_api='psql')
    
    obj_to_tbl(data, out_file)
    
    return out_file
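# Example (hypothetical table and column names; psqldb is whatever
# connection object/parameters q_to_obj expects): mean number of daily
# validations per stop, fixing the denominator at 5 days even if some
# days have no data:
#
# meanrowsday_by_entity(
#     psqldb, 'validations', 'day', 'stop_id',
#     '/tmp/mean_by_stop.xlsx', numberDays=5
# )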
def ID_rows_with_temporal_proximity_by_entities(db, table, entity_field,
                                                day_field, hour_field,
                                                hour_decimal_field,
                                                time_tolerance, outXlsPath):
    """
    Retrieve rows from one pgtable with some temporal proximity
    
    Table structure should be
    entity | day        | hour | hour_decimal
      0    | 2018-01-02 |  5   | 5,10
      0    | 2018-01-03 |  4   | 4,15
      0    | 2018-01-02 |  5   | 5,12
      0    | 2018-01-02 |  5   | 5,8
      1    | 2018-01-02 |  4   | 4,10
      1    | 2018-01-02 |  5   | 5,12
      1    | 2018-01-02 |  4   | 4,20
      1    | 2018-01-02 |  4   | 4,12
      1    | 2018-01-02 |  4   | 4,6
    
    For a time_tolerance of 5 minutes, the output table will have the rows
    with a temporal difference within/below that time tolerance
    
    entity_field could be more than one field
    
    This method only identifies if one entity, for one day, has rows very
    close to each other in terms of time.
    
    Not a good strategy for large tables. For large tables, SQL based
    methods are needed
    """
    
    from glass.pys import obj_to_lst
    from glass.ng.sql.q import q_to_obj
    from glass.ng.prop.sql import cols_type
    from glass.ng.wt import obj_to_tbl
    
    entity_field = obj_to_lst(entity_field)
    COLS = entity_field + [day_field, hour_field]
    COLS_TYPE = cols_type(db, table)
    
    # TIME TOLERANCE IN HOURS
    TIME_TOLERANCE = time_tolerance / 60.0
    
    def thereIsRowsSameTimeInt(row):
        whr = []
        for c in COLS:
            if COLS_TYPE[c] == str:
                whr.append("{}='{}'".format(c, row[c]))
            else:
                whr.append("{}={}".format(c, row[c]))
        
        hourRows = q_to_obj(db, "SELECT {} FROM {} WHERE {}".format(
            hour_decimal_field, table, " AND ".join(whr)
        ), db_api='psql')[hour_decimal_field].tolist()
        
        # Flag pairs of rows whose time difference is within the tolerance
        isProx = 0
        for i in range(len(hourRows)):
            for e in range(i + 1, len(hourRows)):
                dif = abs(hourRows[i] - hourRows[e])
                
                if dif < TIME_TOLERANCE:
                    isProx = 1
                    break
            
            if isProx:
                break
        
        row['time_difference'] = isProx
        
        return row
    
    # Count entity occurrences for one day and hour
    countsByEntityTime = q_to_obj(db, (
        "SELECT {scols}, conta FROM ("
            "SELECT {scols}, COUNT({ent}) AS conta FROM {tbl} "
            "GROUP BY {scols}"
        ") AS foo WHERE conta > 1"
    ).format(
        scols=', '.join(COLS), ent=entity_field[0], tbl=table
    ), db_api='psql')
    
    # For each row in the last count, When count is > 1
    # Check time difference between rows for one day and hour
    countsByEntityTime = countsByEntityTime.apply(
        lambda x: thereIsRowsSameTimeInt(x), axis=1
    )
    
    obj_to_tbl(countsByEntityTime, outXlsPath)
    
    return outXlsPath
def name_circulations(db, GTFS_SCHEMA, OTHER_SCHEMA, output, other_db=None,
                      serviceSchema=None, routeIdColName=None,
                      tripIdColName=None):
    """
    Get all circulations from GTFS and associate these circulations to
    other meta columns of other database
    
    GTFS_SCHEMA = {
        "TNAME"     : "stop_times",
        "TRIP"      : "trip_id",
        "STOP"      : "stop_id",
        "SEQUENCE"  : "stop_sequence",
        "DEPARTURE" : "departure_time"
    }
    
    OTHER_SCHEMA = {
        "TNAME"    : "percursos_geom_v2",
        "ROUTE"    : ["carreira", "variante", "sentido"],
        "SEQUENCE" : "ordem",
        "STOP"     : "paragem"
    }
    
    serviceSchema = {
        "TRIPS" : {
            "TNAME"   : "trips",
            "TRIP"    : "trip_id",
            "SERVICE" : "service_id"
        },
        "CALENDAR" : {
            "TNAME"   : "calendar_dates",
            "SERVICE" : "service_id",
            "DATE"    : "date"
        },
        "FILTER_DAY" : 20180308
    }
    """
    
    import os
    from glass.pys import obj_to_lst
    from glass.ng.sql.q import q_to_obj
    
    other_db = db if not other_db else other_db
    
    # Sanitize Route ID in Other Schema
    OTHER_SCHEMA_ROUTE = obj_to_lst(OTHER_SCHEMA["ROUTE"])
    
    if len(OTHER_SCHEMA_ROUTE) > 1:
        from glass.ng.sql.col import txt_cols_to_col
        
        ROUTE_COL = routeIdColName if routeIdColName else "fid_route"
        
        txt_cols_to_col(
            other_db, OTHER_SCHEMA["TNAME"], OTHER_SCHEMA_ROUTE,
            "|", ROUTE_COL
        )
    
    else:
        ROUTE_COL = routeIdColName if routeIdColName else \
            OTHER_SCHEMA_ROUTE[0]
    
    """
    Get all circulations in GTFS and their start time
    """
    
    if serviceSchema:
        serviceSchema["FILTER_DAY"] = obj_to_lst(serviceSchema["FILTER_DAY"])
    
    where = "" if not serviceSchema else (" WHERE {} ").format(" OR ".join([
        "{}.{} = {}".format(
            serviceSchema["CALENDAR"]["TNAME"],
            serviceSchema["CALENDAR"]["DATE"], d
        ) for d in serviceSchema["FILTER_DAY"]
    ]))
    
    injoinQ = "" if not serviceSchema else (
        "INNER JOIN ("
            "SELECT {tripsTbl}.{tripsTripId} "
            "FROM {tripsTbl} INNER JOIN {calenTbl} ON "
            "{tripsTbl}.{tripsServId} = {calenTbl}.{calenServId}{whr} "
            "GROUP BY {tripsTbl}.{tripsTripId}"
        ") AS trip_service ON {stopTimeTbl}.{stopTimeTrip} "
        "= trip_service.{tripsTripId} "
    ).format(
        tripsTbl=serviceSchema["TRIPS"]["TNAME"],
        tripsTripId=serviceSchema["TRIPS"]["TRIP"],
        tripsServId=serviceSchema["TRIPS"]["SERVICE"],
        calenTbl=serviceSchema["CALENDAR"]["TNAME"],
        calenServId=serviceSchema["CALENDAR"]["SERVICE"],
        stopTimeTbl=GTFS_SCHEMA["TNAME"],
        stopTimeTrip=GTFS_SCHEMA["TRIP"],
        whr=where
    )
    
    newTripCol = tripIdColName if tripIdColName else GTFS_SCHEMA["TRIP"]
    
    Q = (
        "SELECT {stopTimesT}.{tripId} AS {newTrip}, "
        "array_agg({stopTimesT}.{stopId} "
            "ORDER BY {stopTimesT}.{tripId}, {stopTimesT}.{stopSq}"
        ") AS stops, "
        "array_agg({stopTimesT}.{stopSq} "
            "ORDER BY {stopTimesT}.{tripId}, {stopTimesT}.{stopSq}"
        ") AS stops_order, "
        "MIN({stopTimesT}.{depTime}) AS departure, "
        "MAX({stopTimesT}.{depTime}) AS depar_last_stop "
        "FROM {stopTimesT} {injoin}"
        "GROUP BY {stopTimesT}.{tripId}"
    ).format(
        tripId=GTFS_SCHEMA["TRIP"], stopId=GTFS_SCHEMA["STOP"],
        stopSq=GTFS_SCHEMA["SEQUENCE"], depTime=GTFS_SCHEMA["DEPARTURE"],
        stopTimesT=GTFS_SCHEMA["TNAME"],
        injoin=injoinQ, newTrip=newTripCol
    )
    
    circ = q_to_obj(db, Q)
    
    """
    Get all routes metadata in the "Other Database/Table"
    """
    
    Q = (
        "SELECT {idRoute}, "
        "array_agg({stopF} ORDER BY {idRoute}, {stopSq}) AS stops, "
        "array_agg({stopSq} ORDER BY {idRoute}, {stopSq}) AS stops_order "
        "FROM {t} GROUP BY {idRoute}"
    ).format(
        idRoute=ROUTE_COL, stopF=OTHER_SCHEMA["STOP"],
        stopSq=OTHER_SCHEMA["SEQUENCE"], t=OTHER_SCHEMA["TNAME"]
    )
    
    routes = q_to_obj(other_db, Q)
    
    def sanitizeDf(df, col):
        df[col] = df[col].astype(str)
        df[col] = df[col].str.replace('L', '')
        df[col] = df[col].str.replace(' ', '')
        df[col] = df[col].str.replace('[', '')
        df[col] = df[col].str.replace(']', '')
        
        return df
    
    circ = sanitizeDf(circ, "stops")
    routes = sanitizeDf(routes, "stops")
    
    newDf = circ.merge(
        routes, how='inner', left_on="stops", right_on="stops"
    )
    
    if os.path.dirname(output):
        # Write XLS
        from glass.ng.wt import obj_to_tbl
        
        obj_to_tbl(newDf, output)
    
    else:
        # Send to pgsql
        from glass.g.wt.sql import df_to_db
        
        df_to_db(db, newDf, output, api='psql')
    
    return output
def correlated_words(dataFile, refCol, dataCol, outTbl, lang='english',
                     N=2, refSheet=None):
    """
    Get words correlated with some text class
    """
    
    import numpy as np
    import pandas as pd
    from sklearn.feature_selection import chi2
    from glass.ng.wt import obj_to_tbl
    from glass.ng.rd import tbl_to_obj
    from glass.ng.clstxt import txt_to_num_representation
    
    # Data to DataFrame
    trainDf = tbl_to_obj(
        dataFile, sheet=refSheet
    ) if type(dataFile) != pd.DataFrame else dataFile
    
    # Just in case, delete rows with NULL refCol and NULL dataCol
    trainDf = trainDf[pd.notnull(trainDf[dataCol])]
    trainDf = trainDf[pd.notnull(trainDf[refCol])]
    
    """
    Add a column encoding the reference classes as an integer because
    categorical variables are often better represented by integers
    than strings
    """
    
    # Get a ID for Ref/text classes values
    trainDf['ref_id'] = trainDf[refCol].factorize()[0]
    
    # Create Dataframe only with ref_id's, without duplicates
    ref_id_df = trainDf[[refCol, 'ref_id']].drop_duplicates().sort_values(
        'ref_id'
    )
    
    # Create dicts to easy relate ref_id with ref_value
    ref_to_id = dict(ref_id_df.values)
    id_to_ref = dict(ref_id_df[['ref_id', refCol]].values)
    
    """
    Text to numbers
    """
    features, tfidf = txt_to_num_representation(
        trainDf, dataCol, lang, returnTfiDf=True
    )
    
    labels = trainDf.ref_id
    
    """
    Get most correlated words
    """
    corr_words = []
    for ref_name, ref_id in sorted(ref_to_id.items()):
        features_chi2 = chi2(features, labels == ref_id)
        
        indices = np.argsort(features_chi2[0])
        
        feat_names = np.array(tfidf.get_feature_names())[indices]
        
        unigrams = [v for v in feat_names if len(v.split(' ')) == 1][-N:]
        bigrams = [v for v in feat_names if len(v.split(' ')) == 2][-N:]
        
        cols_d = [ref_name] + unigrams + bigrams
        
        corr_words.append(cols_d)
    
    COLS_NAME = ['ref_name'] + [
        'unigram_{}'.format(str(i+1)) for i in range(N)
    ] + [
        'bigram_{}'.format(str(i+1)) for i in range(N)
    ]
    
    dfCorrWords = pd.DataFrame(corr_words, columns=COLS_NAME)
    
    return obj_to_tbl(dfCorrWords, outTbl)
def binary_eval(refTbl, refId, refCol, tstTbl, tstId, outTbl=None,
                tstCol=None):
    """
    Evaluation of a binary classification
    
    When tstCol is None, the script assumes that in tstTbl there are
    only positives
    
    The reference table must have positives and negatives; the test
    table may have only positives.
    """
    
    import numpy as np
    import pandas
    import math
    from glass.ng.rd import tbl_to_obj
    from glass.ng.wt import obj_to_tbl
    
    # Data to Pandas Dataframe
    ref_df = tbl_to_obj(refTbl, fields=[
        refId, refCol
    ]) if type(refTbl) != pandas.DataFrame else refTbl[[refId, refCol]]
    
    tst_df = tbl_to_obj(
        tstTbl, fields=[tstId] if not tstCol else [tstId, tstCol]
    ) if type(tstTbl) != pandas.DataFrame else tstTbl[[tstId]] \
        if not tstCol else tstTbl[[tstId, tstCol]]
    
    # Check if refId is equal to tstId; they must be different
    if refId == tstId:
        colRename = {tstId: 'tst_fid__'}
        
        # Do the same for refCol and tstCol
        if refCol == tstCol:
            colRename[tstCol] = 'tst_col__'
        
        tst_df.rename(columns=colRename, inplace=True)
        
        tstId = 'tst_fid__'
        
        if refCol == tstCol:
            tstCol = 'tst_col__'
    
    df = ref_df.merge(tst_df, how='left', left_on=refId, right_on=tstId)
    
    # Check if we have a tstCol
    if not tstCol:
        df[tstId].fillna('None', inplace=True)
        
        tstCol = 'cls_tst'
        df[tstCol] = np.where(df[tstId] == 'None', 0, 1)
    
    # Get VP, VN, FP, FN
    df['confusion'] = np.where(
        (df[refCol] == 1) & (df[tstCol] == 1), 'VP', np.where(
            (df[refCol] == 0) & (df[tstCol] == 0), 'VN', np.where(
                (df[refCol] == 1) & (df[tstCol] == 0), 'FN', 'FP'
            )
        )
    )
    
    # Summary table
    conf_tbl = pandas.DataFrame()
    conf_tbl['nrows'] = df.groupby(['confusion'])[refId].nunique()
    conf_tbl.reset_index(inplace=True)
    
    conf_tbl['percentage'] = (conf_tbl.nrows * 100) / df.shape[0]
    
    # Get some evaluation measures
    dConf = {}
    for row in conf_tbl.to_dict(orient='records'):
        dConf[row['confusion']] = row['nrows']
    
    l = ['VP', 'VN', 'FP', 'FN']
    for i in l:
        if i not in dConf:
            dConf[i] = 0
    
    """
    Error rate
    
    Error rate (ERR) is calculated as the number of all incorrect
    predictions divided by the total number of the dataset.
    
    The best error rate is 0.0, whereas the worst is 1.0.
    """
    
    ERR = (dConf['FP'] + dConf['FN']) / (
        dConf['VP'] + dConf['VN'] + dConf['FN'] + dConf['FP'])
    
    """
    Accuracy
    
    Accuracy (ACC) is calculated as the number of all correct predictions
    divided by the total number of the dataset.
    
    The best accuracy is 1.0, whereas the worst is 0.0. It can also be
    calculated by 1 - ERR.
    """
    
    ACC = (dConf['VP'] + dConf['VN']) / (
        dConf['VP'] + dConf['VN'] + dConf['FN'] + dConf['FP'])
    
    """
    Sensitivity (Recall or True positive rate)
    
    Sensitivity (SN) is calculated as the number of correct positive
    predictions divided by the total number of positives. It is also
    called recall (REC) or true positive rate (TPR).
    
    The best sensitivity is 1.0, whereas the worst is 0.0.
    """
    
    try:
        SN = dConf['VP'] / (dConf['VP'] + dConf['FN'])
    except:
        SN = -99
    
    """
    Specificity (True negative rate)
    
    Specificity (SP) is calculated as the number of correct negative
    predictions divided by the total number of negatives. It is also
    called true negative rate (TNR).
    
    The best specificity is 1.0, whereas the worst is 0.0.
    """
    
    SP = dConf['VN'] / (dConf['VN'] + dConf['FP'])
    
    """
    Precision (Positive predictive value)
    
    Precision (PREC) is calculated as the number of correct positive
    predictions divided by the total number of positive predictions.
    It is also called positive predictive value (PPV).
    
    The best precision is 1.0, whereas the worst is 0.0.
    """
    
    PREC = dConf["VP"] / (dConf["VP"] + dConf['FP'])
    
    """
    False positive rate
    
    False positive rate (FPR) is calculated as the number of incorrect
    positive predictions divided by the total number of negatives.
    
    The best false positive rate is 0.0 whereas the worst is 1.0. It can
    also be calculated as 1 - specificity.
    """
    
    FPR = dConf['FP'] / (dConf['VN'] + dConf['FP'])
    
    """
    Matthews correlation coefficient
    
    Matthews correlation coefficient (MCC) is a correlation coefficient
    calculated using all four values in the confusion matrix.
    """
    
    try:
        MCC = (dConf['VP'] * dConf['VN'] - dConf['FP'] * dConf['FN']) / (
            math.sqrt(
                (dConf['VP'] + dConf['FP']) * (dConf['VP'] + dConf['FN']) *
                (dConf['VN'] + dConf['FP']) * (dConf['VN'] + dConf['FN'])
            )
        )
    except:
        MCC = -99
    
    """
    F-score
    
    F-score is a harmonic mean of precision and recall.
    """
    
    F0_5 = ((1 + 0.5**2) * (PREC * SN)) / (0.5**2 * PREC + SN)
    F_1 = (2 * PREC * SN) / (PREC + SN)
    F_2 = (5 * PREC * SN) / (4 * PREC + SN)
    
    evalMeasures = pandas.DataFrame([
        ['Error rate', ERR],
        ['Accuracy', ACC],
        ['Sensitivity', SN],
        ['Specificity', SP],
        ['Precision', PREC],
        ['False positive rate', FPR],
        ['Matthews correlation coefficient', MCC],
        ['F-score 0.5', F0_5],
        ['F-score 1', F_1],
        ['F-score 2', F_2]
    ], columns=['eval_mesure', 'value'])
    
    if outTbl:
        return obj_to_tbl(
            [conf_tbl, evalMeasures, df], outTbl,
            sheetsName=['matrix', 'eval_mesures', 'tbl']
        )
    else:
        return conf_tbl, evalMeasures, df
def tbl_to_areamtx(inShp, col_a, col_b, outXls, db=None, with_metrics=None):
    """
    Table to Matrix
    
    Table as:
    
    FID | col_a | col_b | geom
     1  |   A   |   A   | ....
     2  |   A   |   B   | ....
     3  |   A   |   A   | ....
     4  |   A   |   C   | ....
     5  |   A   |   B   | ....
     6  |   B   |   A   | ....
     7  |   B   |   A   | ....
     8  |   B   |   B   | ....
     9  |   B   |   B   | ....
    10  |   C   |   A   | ....
    11  |   C   |   B   | ....
    12  |   C   |   D   | ....
    
    To:
    
    classe | A | B | C | D
       A   |   |   |   |
       B   |   |   |   |
       C   |   |   |   |
       D   |   |   |   |
    
    col_a = rows
    col_b = cols
    
    api options (selected by db):
    * pandas (db=None);
    * psql (db given);
    """
    
    # TODO: check if col_a and col_b exist in table
    
    if not db:
        import pandas as pd
        import numpy as np
        from glass.g.rd.shp import shp_to_obj
        from glass.ng.wt import obj_to_tbl
        
        # Open data
        df = shp_to_obj(inShp)
        
        # Remove nan values
        df = df[pd.notnull(df[col_a])]
        df = df[pd.notnull(df[col_b])]
        
        # Get Area (km2, assuming a metric CRS)
        df['realarea'] = df.geometry.area / 1000000
        
        # Get rows and Cols
        rows = df[col_a].unique()
        cols = df[col_b].unique()
        refval = list(np.sort(np.unique(np.append(rows, cols))))
        
        # Produce matrix
        outDf = []
        for row in refval:
            newCols = [row]
            for col in refval:
                newDf = df[(df[col_a] == row) & (df[col_b] == col)]
                
                if not newDf.shape[0]:
                    newCols.append(0)
                else:
                    area = newDf.realarea.sum()
                    newCols.append(area)
            
            outDf.append(newCols)
        
        outcols = ['class'] + refval
        
        outDf = pd.DataFrame(outDf, columns=outcols)
        
        if with_metrics:
            from glass.ng.cls.eval import get_measures_for_mtx
            
            out_df = get_measures_for_mtx(outDf, 'class')
            
            return obj_to_tbl(out_df, outXls)
        
        # Export to Excel
        return obj_to_tbl(outDf, outXls)
    
    else:
        from glass.pys.oss import fprop
        from glass.ng.sql.db import create_db
        from glass.ng.prop.sql import db_exists
        from glass.g.it.db import shp_to_psql
        from glass.g.dp.tomtx.sql import tbl_to_area_mtx
        from glass.ng.it import db_to_tbl
        
        # Create database if not exists
        is_db = db_exists(db)
        
        if not is_db:
            create_db(db, api='psql')
        
        # Add data to database
        tbl = shp_to_psql(db, inShp, api='shp2pgsql')
        
        # Create matrix
        mtx = tbl_to_area_mtx(db, tbl, col_a, col_b, fprop(outXls, 'fn'))
        
        # Export result
        return db_to_tbl(db, mtx, outXls, sheetsNames='matrix')
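# Example (hypothetical paths and column names): cross 'lulc_1995' against
# 'lulc_2018' and write the class-by-class area matrix to excel using the
# pandas api (db=None):
#
# tbl_to_areamtx(
#     '/tmp/lulc.shp', 'lulc_1995', 'lulc_2018',
#     '/tmp/transition_mtx.xlsx', with_metrics=True
# )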
def model_selection(dataFile, refCol, dataCol, outTbl, lang='english', CV=5):
    """
    See which model is better to use in text classification for a
    specific data sample
    
    Compare:
    Logistic Regression (LogisticRegression)
    (Multinomial) Naive Bayes (MultinomialNB)
    Linear Support Vector Machine (LinearSVC)
    Random Forest (RandomForestClassifier)
    """
    
    import os
    import pandas as pd
    from glass.pys.oss import fprop
    from glass.ng.rd import tbl_to_obj
    from glass.ng.clstxt import txt_to_num_representation
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.svm import LinearSVC
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.model_selection import cross_val_score
    from glass.ng.wt import obj_to_tbl
    
    # Data to DataFrame
    trainDf = tbl_to_obj(dataFile)
    
    # Just in case, delete rows with NULL refCol and NULL dataCol
    trainDf = trainDf[pd.notnull(trainDf[dataCol])]
    trainDf = trainDf[pd.notnull(trainDf[refCol])]
    
    # Ref col to integers
    trainDf['ref_id'] = trainDf[refCol].factorize()[0]
    
    # Text to numbers
    features = txt_to_num_representation(trainDf, dataCol, lang)
    
    labels = trainDf.ref_id
    
    """
    Test Models
    """
    models = [
        RandomForestClassifier(
            n_estimators=200, max_depth=3, random_state=0),
        LinearSVC(),
        MultinomialNB(),
        LogisticRegression(random_state=0)
    ]
    
    entries = []
    for model in models:
        m_name = model.__class__.__name__
        accuracies = cross_val_score(
            model, features, labels, scoring='accuracy', cv=CV
        )
        
        for fold_idx, accuracy in enumerate(accuracies):
            entries.append((m_name, fold_idx, accuracy))
    
    # Create and Export evaluation table
    cv_df = pd.DataFrame(
        entries, columns=['model_name', 'fold_idx', 'accuracy']
    )
    
    cv_df_gp = pd.DataFrame(cv_df.groupby('model_name').accuracy.mean())
    cv_df_gp.reset_index(inplace=True)
    
    # Export Graphic
    import seaborn as sns
    
    a = sns.boxplot(x='model_name', y='accuracy', data=cv_df)
    
    b = sns.stripplot(
        x='model_name', y='accuracy', data=cv_df,
        size=10, jitter=True, edgecolor="gray", linewidth=2
    )
    
    fig = b.get_figure()
    fig.savefig(os.path.join(
        os.path.dirname(outTbl), fprop(outTbl, 'fn') + '.png'
    ))
    
    return obj_to_tbl(cv_df_gp, outTbl)
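# Example (hypothetical paths and column names): compare the four
# classifiers with 5-fold cross validation; a boxplot of the fold
# accuracies is written next to the output table:
#
# model_selection(
#     '/tmp/labelled_texts.xlsx', 'class', 'text',
#     '/tmp/model_comparison.xlsx', lang='english', CV=5
# )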
def clsep_matrix(ref, var, out, fileformat=None):
    """
    Produce matrix with classes separability from satellite images
    """
    
    import os
    import numpy as np
    import pandas as pd
    from osgeo import gdal, gdal_array
    from glass.ng.wt import obj_to_tbl
    
    # bha_dist (Bhattacharyya distance) and jm_dist (Jeffries-Matusita
    # distance) are assumed to be available in this module's namespace
    
    # Open data
    ref_src = gdal.Open(ref, gdal.GA_ReadOnly)
    
    if type(var) != list:
        # Check if it is a folder
        if os.path.isdir(var):
            # List images in folder
            from glass.pys.oss import lst_ff
            
            var = lst_ff(
                var, file_format=fileformat if fileformat else '.tif'
            )
        
        else:
            var = [var]
    
    var_src = [gdal.Open(i, gdal.GA_ReadOnly) for i in var]
    
    # Get Band number for each raster
    img_bnd = [i.RasterCount for i in var_src]
    
    # Check images shape
    # Return error if the shapes are different
    ref_shp = (ref_src.RasterYSize, ref_src.RasterXSize)
    
    for r in var_src:
        rst_shp = (r.RasterYSize, r.RasterXSize)
        
        if ref_shp != rst_shp:
            raise ValueError(
                'There are at least two raster files with different shape'
            )
    
    # Get NoData Value
    nd_val = ref_src.GetRasterBand(1).GetNoDataValue()
    
    # Get Number of features
    nvar = sum(img_bnd)
    
    # Convert imgs to Array, remove nodata values and reshape
    ref_num = ref_src.GetRasterBand(1).ReadAsArray()
    ref_num = ref_num.reshape((-1, 1))
    
    ref_num_ = ref_num[ref_num != nd_val]
    
    X = np.zeros(
        (ref_num_.shape[0], nvar),
        gdal_array.GDALTypeCodeToNumericTypeCode(
            var_src[0].GetRasterBand(1).DataType
        )
    )
    
    f = 0
    for r in range(len(var_src)):
        for b in range(img_bnd[r]):
            a = var_src[r].GetRasterBand(b + 1).ReadAsArray()
            a = a.reshape((-1, 1))
            a = a[ref_num != nd_val]
            
            X[:, f] = a
            
            f += 1
    
    # Create arrays for each class
    classes = list(np.sort(np.unique(ref_num_)))
    
    clsdata = [X[ref_num_ == c] for c in classes]
    
    # Get separability matrix
    mtx_b = []
    mtx_jm = []
    for v in range(len(classes)):
        row_b = []
        row_jm = []
        for v_ in range(len(classes)):
            if v < v_:
                b = None
                jm = None
            else:
                b = bha_dist(clsdata[v], clsdata[v_])
                jm = jm_dist(b)
            
            row_b.append(b)
            row_jm.append(jm)
        
        mtx_b.append(row_b)
        mtx_jm.append(row_jm)
    
    mtx_bd = pd.DataFrame(mtx_b, columns=classes, index=classes)
    mtx_bd.reset_index(inplace=True)
    mtx_bd.rename(columns={'index': 'class_id'}, inplace=True)
    
    mtx_jm = pd.DataFrame(mtx_jm, columns=classes, index=classes)
    mtx_jm.reset_index(inplace=True)
    mtx_jm.rename(columns={'index': 'class_id'}, inplace=True)
    
    obj_to_tbl(
        [mtx_bd, mtx_jm], out,
        sheetsName=['Bhattacharyya_Distance', 'Jeffries-Matusita']
    )
    
    return out
def text_prediction(trainData, classData, trainRefCol, trainClsCol,
                    clsDataCol, outfile, method='NaiveBayes',
                    lang='english'):
    """
    Text classification
    
    Classifier Options:
    1) NaiveBayes;
    2) LinearSupportVectorMachine;
    3) RandomForest;
    4) LogisticRegression.
    """
    
    import pandas as pd
    from glass.ng.rd import tbl_to_obj
    from glass.ng.wt import obj_to_tbl
    from glass.ng.clstxt import txt_to_num_representation
    
    # Data to Dataframe
    trainDf = tbl_to_obj(trainData) if type(trainData) != pd.DataFrame \
        else trainData
    classDf = tbl_to_obj(classData) if type(classData) != pd.DataFrame \
        else classData
    
    # Just in case, delete rows with NULL refCol and NULL dataCol
    trainDf = trainDf[pd.notnull(trainDf[trainClsCol])]
    trainDf = trainDf[pd.notnull(trainDf[trainRefCol])]
    
    classDf = classDf[pd.notnull(classDf[clsDataCol])]
    
    if method == 'NaiveBayes':
        from sklearn.naive_bayes import MultinomialNB
        from sklearn.feature_extraction.text import CountVectorizer
        from sklearn.feature_extraction.text import TfidfTransformer
        
        """
        Train Model
        """
        
        # X train is trainClsCol
        # Y train is trainRefCol
        x_train, y_train = trainDf[trainClsCol], trainDf[trainRefCol]
        
        count_vect = CountVectorizer()
        
        X_train_counts = count_vect.fit_transform(x_train)
        
        tfidf_transformer = TfidfTransformer()
        
        X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
        
        clf = MultinomialNB().fit(X_train_tfidf, y_train)
        
        """
        Predict
        """
        result = clf.predict(count_vect.transform(classDf[clsDataCol]))
        
        classDf['classification'] = result
    
    elif method == 'LinearSupportVectorMachine':
        from sklearn.svm import LinearSVC
        
        # Get features and Labels
        trainDf['ref_id'] = trainDf[trainRefCol].factorize()[0]
        labels = trainDf.ref_id
        
        features, tvect = txt_to_num_representation(
            trainDf, trainClsCol, __lang=lang, returnTfiDf=True
        )
        
        featTst = tvect.transform(classDf[clsDataCol])
        
        """
        Train model
        """
        model = LinearSVC()
        
        model.fit(features, labels)
        
        y_pred = model.predict(featTst)
        
        classDf['classification'] = y_pred
        
        # Create Dataframe only with ref_id's, without duplicates
        ref_id_df = trainDf[[
            trainRefCol, 'ref_id'
        ]].drop_duplicates().sort_values('ref_id')
        
        ref_id_df.columns = ['class_name', 'ref_fid']
        
        classDf = classDf.merge(
            ref_id_df, how='inner',
            left_on='classification', right_on='ref_fid'
        )
        
        classDf.loc[:, 'classification'] = classDf.class_name
        
        classDf.drop(['ref_fid', 'class_name'], axis=1, inplace=True)
    
    elif method == 'RandomForest':
        from sklearn.ensemble import RandomForestClassifier
        
        # Get features
        features, tvect = txt_to_num_representation(
            trainDf, trainClsCol, __lang=lang, returnTfiDf=True
        )
        
        featTst = tvect.transform(classDf[clsDataCol])
        
        classifier = RandomForestClassifier(
            n_estimators=1000, random_state=0
        )
        
        classifier.fit(features, trainDf[trainRefCol])
        
        y_pred = classifier.predict(featTst)
        
        classDf['classification'] = y_pred
    
    elif method == 'LogisticRegression':
        from sklearn.feature_extraction.text import CountVectorizer
        from sklearn.feature_extraction.text import TfidfTransformer
        from sklearn.pipeline import Pipeline
        from sklearn.linear_model import LogisticRegression
        
        logreg = Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', LogisticRegression(
                n_jobs=1, C=1e5, multi_class='auto', solver='lbfgs'
            )),
        ])
        
        logreg.fit(trainDf[trainClsCol], trainDf[trainRefCol])
        
        y_pred = logreg.predict(classDf[clsDataCol])
        
        classDf['classification'] = y_pred
    
    return obj_to_tbl(classDf, outfile)
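# Example (hypothetical paths and column names): train on a labelled table
# and classify an unlabelled one with the Naive Bayes option:
#
# text_prediction(
#     '/tmp/train.xlsx', '/tmp/to_classify.xlsx',
#     'class', 'text', 'text', '/tmp/classified.xlsx',
#     method='NaiveBayes', lang='english'
# )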