def get_stop_words(inTbl, fidCol, txtCol, outFile,
                   lang='portuguese', inSheet=None, db=None):
    """
    Run a text column through a PostgreSQL text-search dictionary and
    export the processed text to a new table.

    The input table is loaded into PostgreSQL; the text column is
    lower-cased, passed through unaccent(), stripped of 'http://...'
    fragments and non-word characters, and converted with
    to_tsvector() using the ``lang`` configuration. The exported table
    has every original column plus:

    * ``clean_<txtCol>`` - the tsvector words re-assembled in position
      order, separated by a single space;
    * ``no_duplicated`` - the text form of STRIP(tsvector) with the
      single quotes removed.

    NOTE(review): the original description said the new column holds
    "only the stop words"; to_tsvector() discards stop words, so the
    column presumably holds the text *without* them - confirm intent.

    Parameters:
    inTbl   - path to the input table (sent to PostgreSQL by tbl_to_db);
    fidCol  - name of the column that identifies each row;
    txtCol  - name of the column with the text to process;
    outFile - path to the output table;
    lang    - PostgreSQL text search configuration name;
    inSheet - sheet name, forwarded to tbl_to_db and db_to_tbl;
    db      - database name; when not given, "db_<input file name>".

    Returns whatever db_to_tbl returns.

    The query calls unaccent(), so that function must be available in
    the target database. Column and table names are interpolated
    directly into the SQL text; pass trusted identifiers only.
    """

    from glass.pys.oss import fprop
    from glass.ng.prop.sql import cols_name
    from glass.ng.sql.db import create_db
    from glass.ng.it.sql import tbl_to_db
    from glass.ng.it import db_to_tbl

    FILENAME = fprop(inTbl, 'fn')

    # Create Temp database (named after the input file unless db is given)
    db = create_db(db if db else "db_" + FILENAME)

    # Send table to PostgreSQL
    tbl = tbl_to_db(inTbl, db, FILENAME, sheet=inSheet, api_db='psql')

    cols = cols_name(db, tbl, sanitizeSpecialWords=None, api='psql')

    # Sanitize data and build the tsvector column.
    # The fragments holding regular expressions are raw strings so that
    # \s, \S and \w reach PostgreSQL untouched without relying on
    # Python's deprecated handling of invalid escape sequences.
    Q1 = (
        "(SELECT *, to_tsvector('{_lang}', regexp_replace("
        r"regexp_replace(lower(unaccent({txt_c})), 'http://[^:\s]+(\S+)', "
        r"' ', 'g'), '[^\w]+', ' ', 'g')) "
        "AS txt_data FROM {t}) AS stop_table"
    ).format(_lang=lang, txt_c=txtCol, t=tbl)

    # Unnest the tsvector, split each entry into the word and its
    # positions (the REPLACE chain rewrites the text form of each
    # unnested record so that ';' separates the word from the position
    # list), then rebuild the sentence ordered by position.
    Q2 = (
        "SELECT {selCols}, ARRAY_TO_STRING(array_agg("
        "word ORDER BY word_index), ' ', '*') AS {outCol}, "
        "REPLACE(CAST(STRIP("
        "stop_table.txt_data) AS text), '''', '') AS no_duplicated "
        "FROM ("
        "SELECT fid, word, CAST(UNNEST(word_index) AS integer) "
        "AS word_index FROM ("
        "SELECT fid, SPLIT_PART(tst, ';', 1) AS word, "
        "STRING_TO_ARRAY(SPLIT_PART(tst, ';', 2), ',') AS word_index FROM ("
        "SELECT {fid} AS fid, REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("
        "CAST(UNNEST(txt_data) AS text), "
        "',{{', ',\"{{'), ',\"{{', ';'), '}}\"', ''), "
        "'(', ''), '}}', '') AS tst "
        "FROM {tbl}"
        ") AS foo"
        ") AS foo2"
        ") AS foo3 INNER JOIN {tbl} ON foo3.fid = stop_table.{fid} "
        "GROUP BY {selCols}, stop_table.txt_data"
    ).format(
        outCol="clean_" + txtCol, tbl=Q1, fid=fidCol,
        selCols=", ".join(["stop_table.{}".format(i) for i in cols])
    )

    # Export new table
    return db_to_tbl(db, Q2, outFile, sheetsNames=inSheet)
def matrix_od_mean_dist_by_group(MATRIX_OD, ORIGIN_COL, GROUP_ORIGIN_ID,
                                 GROUP_ORIGIN_NAME, GROUP_DESTINA_ID,
                                 GROUP_DESTINA_NAME, TIME_COL, epsg, db,
                                 RESULT_MATRIX):
    """
    Aggregate an Origin-Destination matrix to group level.

    The value in TIME_COL is first averaged per origin entity and
    origin/destination group, and those averages are then averaged per
    origin/destination group pair.

    OD MATRIX EXAMPLE
    | origin_entity | origin_group | destina_entity | destina_group | distance
    |     XXXX      |     XXXX     |      XXXX      |      XXX      |   XXX

    OUTPUT EXAMPLE
    | origin_group | destina_group | mean_time
    |     XXXX     |     XXXX      |   XXXX

    NOTE: the ``db`` argument is not used as given - it is replaced by
    the value returned by create_db(), which is called with the file
    name of MATRIX_OD and overwrite=True.

    Returns whatever db_to_tbl returns after exporting the result table
    to RESULT_MATRIX (sheet "matrix").
    """

    from glass.pys.oss import fprop
    from glass.g.it.db import shp_to_psql
    from glass.ng.sql.db import create_db
    from glass.ng.sql.q import q_to_ntbl
    from glass.ng.it import db_to_tbl

    # Database named after the input matrix file
    db = create_db(fprop(MATRIX_OD, 'fn'), overwrite=True, api='psql')

    # Load the OD matrix into the database
    od_table = shp_to_psql(
        db, MATRIX_OD, pgTable="tbl_{}".format(db),
        api="pandas", srsEpsgCode=epsg
    )

    result_name = fprop(RESULT_MATRIX, 'fn')

    # The four grouping columns always appear together and in this order
    group_cols = "{}, {}, {}, {}".format(
        GROUP_ORIGIN_ID, GROUP_ORIGIN_NAME,
        GROUP_DESTINA_ID, GROUP_DESTINA_NAME
    )

    # Inner query: mean per origin entity; outer query: mean per group pair
    query = (
        "SELECT {grp}, AVG(mean_time) AS mean_time FROM ("
        "SELECT {origin}, {grp}, "
        "AVG({timeCol}) AS mean_time FROM {t} "
        "GROUP BY {origin}, {grp}"
        ") AS foo "
        "GROUP BY {grp} "
        "ORDER BY {originCod}, {destCod}"
    ).format(
        grp=group_cols, origin=ORIGIN_COL, timeCol=TIME_COL, t=od_table,
        originCod=GROUP_ORIGIN_ID, destCod=GROUP_DESTINA_ID
    )

    out_table = q_to_ntbl(db, result_name, query, api='psql')

    return db_to_tbl(
        db, "SELECT * FROM {}".format(out_table), RESULT_MATRIX,
        sheetsNames="matrix", dbAPI='psql'
    )
def tbl_to_areamtx(inShp, col_a, col_b, outXls, db=None, with_metrics=None):
    """
    Build an area cross-tabulation (matrix) from two columns of a
    feature class.

    Input table:
    FID | col_a | col_b | geom
      1 |   A   |   A   | ....
      2 |   A   |   B   | ....
      3 |   B   |   A   | ....
      4 |   C   |   D   | ....

    Output table:
    class | A | B | C | D
        A |   |   |   |
        B |   |   |   |
        C |   |   |   |
        D |   |   |   |

    col_a values are the rows and col_b values are the columns; both
    axes list every distinct value found in either column.

    When ``db`` is not given the matrix is computed with pandas; each
    cell is the sum of geometry areas divided by 1 000 000 (presumably
    m2 to km2 - depends on the units of the data CRS), or 0 when no
    feature has that combination. When ``db`` is given the data is sent
    to that PostgreSQL database (created if it does not exist) and the
    matrix is produced by tbl_to_area_mtx.

    with_metrics is only considered in the pandas branch: when truthy,
    the matrix goes through get_measures_for_mtx before being exported.

    Returns whatever the export function (obj_to_tbl / db_to_tbl)
    returns.
    """

    # TODO: check if col_a and col_b exists in table

    if db:
        from glass.pys.oss import fprop
        from glass.ng.sql.db import create_db
        from glass.ng.prop.sql import db_exists
        from glass.g.it.db import shp_to_psql
        from glass.g.dp.tomtx.sql import tbl_to_area_mtx
        from glass.ng.it import db_to_tbl

        # Create database if not exists
        if not db_exists(db):
            create_db(db, api='psql')

        # Add data to database
        pg_table = shp_to_psql(db, inShp, api='shp2pgsql')

        # Create matrix
        matrix_tbl = tbl_to_area_mtx(
            db, pg_table, col_a, col_b, fprop(outXls, 'fn')
        )

        # Export result
        return db_to_tbl(db, matrix_tbl, outXls, sheetsNames='matrix')

    import pandas as pd
    import numpy as np
    from glass.g.rd.shp import shp_to_obj
    from glass.ng.wt import obj_to_tbl

    # Open data and drop the rows without a value in either column
    feats = shp_to_obj(inShp)
    feats = feats[pd.notnull(feats[col_a])]
    feats = feats[pd.notnull(feats[col_b])]

    # Get Area
    feats['realarea'] = feats.geometry.area / 1000000

    # Sorted distinct values found in either column: used for both axes
    classes = list(np.sort(np.unique(np.append(
        feats[col_a].unique(), feats[col_b].unique()
    ))))

    def _cell_area(row_cls, col_cls):
        # Summed area of the features with this (col_a, col_b) pair
        match = feats[(feats[col_a] == row_cls) & (feats[col_b] == col_cls)]
        return match.realarea.sum() if match.shape[0] else 0

    # One record per class: the class label followed by one cell per class
    records = [
        [row_cls] + [_cell_area(row_cls, col_cls) for col_cls in classes]
        for row_cls in classes
    ]

    matrix = pd.DataFrame(records, columns=['class'] + classes)

    if not with_metrics:
        # Export to Excel
        return obj_to_tbl(matrix, outXls)

    from glass.ng.cls.eval import get_measures_for_mtx

    return obj_to_tbl(get_measures_for_mtx(matrix, 'class'), outXls)
def check_shape_diff(SHAPES_TO_COMPARE, OUT_FOLDER, REPORT, DB,
                     GRASS_REGION_TEMPLATE):
    """
    Script to check differences between pairs of Feature Classes

    Suppose we have several Feature Classes (FC), each one with a given
    attribute; suppose also that, considering every possible pair of
    these FC, we want to compare the differences in the distribution of
    the values of that attribute for each pair.

    Parameters (as used by the code below):
    SHAPES_TO_COMPARE     - iterated and, for non-raster inputs, indexed
                            by item (SHAPES_TO_COMPARE[s]) to get the
                            name of the column to compare, i.e. a
                            mapping {file path: column name};
    OUT_FOLDER            - working/output folder; it must NOT exist
                            (ValueError is raised otherwise);
    REPORT                - path of the output table written by
                            db_to_tbl; also the return value;
    DB                    - name of the PostgreSQL database to create;
    GRASS_REGION_TEMPLATE - passed as srs to run_grass and also to
                            optimized_union_anls.

    NOTE: optimized_union_anls is not imported inside this function, so
    it has to be available in the enclosing module scope.

    * Dependencies:
    - GRASS;
    - PostgreSQL;
    - PostGIS.
    """

    import datetime
    import os
    import pandas
    from glass.ng.sql.q import q_to_obj
    from glass.ng.it import db_to_tbl
    from glass.g.wt.sql import df_to_db
    from glass.g.dp.rst.toshp import rst_to_polyg
    from glass.g.it.db import shp_to_psql
    from glass.g.dp.tomtx import tbl_to_area_mtx
    from glass.g.prop import check_isRaster
    from glass.pys.oss import fprop
    from glass.ng.sql.db import create_db
    from glass.ng.sql.tbl import tbls_to_tbl
    from glass.ng.sql.q import q_to_ntbl

    # Check if folder exists, if not create it
    if not os.path.exists(OUT_FOLDER):
        from glass.pys.oss import mkdir
        mkdir(OUT_FOLDER, overwrite=None)
    else:
        raise ValueError('{} already exists!'.format(OUT_FOLDER))

    # Start a GRASS session; the grass.script imports come only after
    # run_grass() (presumably because run_grass prepares the environment
    # they need - confirm)
    from glass.g.wenv.grs import run_grass

    gbase = run_grass(OUT_FOLDER, grassBIN='grass78', location='shpdif',
                      srs=GRASS_REGION_TEMPLATE)

    import grass.script as grass
    import grass.script.setup as gsetup

    gsetup.init(gbase, OUT_FOLDER, 'shpdif', 'PERMANENT')

    from glass.g.it.shp import shp_to_grs, grs_to_shp
    from glass.g.it.rst import rst_to_grs
    from glass.g.tbl.col import rn_cols

    # Convert to SHAPE if file is Raster
    # Every input ends up as a shapefile in OUT_FOLDER whose comparison
    # column is named lulc_<i>; _SHP_TO_COMPARE maps shapefile -> column
    i = 0
    _SHP_TO_COMPARE = {}
    for s in SHAPES_TO_COMPARE:
        isRaster = check_isRaster(s)

        if isRaster:
            # To GRASS
            rstName = fprop(s, 'fn')
            inRst = rst_to_grs(s, "rst_" + rstName, as_cmd=True)

            # To Vector
            d = rst_to_polyg(inRst, rstName,
                             rstColumn="lulc_{}".format(i), gisApi="grass")

            # Export Shapefile
            shp = grs_to_shp(d, os.path.join(OUT_FOLDER, d + '.shp'), "area")

            _SHP_TO_COMPARE[shp] = "lulc_{}".format(i)

        else:
            # To GRASS
            grsV = shp_to_grs(s, fprop(s, 'fn'), asCMD=True)

            # Change name of column with comparing value
            # (ncol is assigned but not used afterwards)
            ncol = "lulc_{}".format(str(i))
            rn_cols(grsV, {
                SHAPES_TO_COMPARE[s]: "lulc_{}".format(str(i))
            }, api="grass")

            # Export
            shp = grs_to_shp(
                grsV, os.path.join(OUT_FOLDER, grsV + '_rn.shp'), "area")

            _SHP_TO_COMPARE[shp] = "lulc_{}".format(str(i))

        i += 1

    # From here on only the converted/renamed shapefiles are used
    SHAPES_TO_COMPARE = _SHP_TO_COMPARE
    __SHAPES_TO_COMPARE = SHAPES_TO_COMPARE

    # Create database
    create_db(DB, api='psql')

    """
    Union SHAPEs
    """

    # UNION_SHAPE maps (shp_a, shp_b) -> union result; FIX_GEOM is
    # created but not used afterwards
    UNION_SHAPE = {}
    FIX_GEOM = {}

    SHPS = list(__SHAPES_TO_COMPARE.keys())
    # Every unordered pair (i < e) of shapefiles
    for i in range(len(SHPS)):
        for e in range(i + 1, len(SHPS)):
            # Optimized Union
            print("Union between {} and {}".format(SHPS[i], SHPS[e]))
            time_a = datetime.datetime.now().replace(microsecond=0)
            __unShp = optimized_union_anls(
                SHPS[i], SHPS[e],
                os.path.join(OUT_FOLDER, "un_{}_{}.shp".format(i, e)),
                GRASS_REGION_TEMPLATE,
                os.path.join(OUT_FOLDER, "work_{}_{}".format(i, e)),
                multiProcess=True)
            time_b = datetime.datetime.now().replace(microsecond=0)
            # Elapsed time of the union
            print(time_b - time_a)

            # Rename cols
            # Columns named a_<col> / b_<col> in the union result are
            # renamed back to the plain lulc_<i> names
            unShp = rn_cols(
                __unShp, {
                    "a_" + __SHAPES_TO_COMPARE[SHPS[i]]:
                        __SHAPES_TO_COMPARE[SHPS[i]],
                    "b_" + __SHAPES_TO_COMPARE[SHPS[e]]:
                        __SHAPES_TO_COMPARE[SHPS[e]]
                })

            UNION_SHAPE[(SHPS[i], SHPS[e])] = unShp

    # Send data to postgresql
    # SYNTH_TBL maps (shp_a, shp_b) -> {"TOTAL": table, "MATRIX": table}
    SYNTH_TBL = {}

    for uShp in UNION_SHAPE:
        # Send data to PostgreSQL
        union_tbl = shp_to_psql(DB, UNION_SHAPE[uShp], api='shp2pgsql')

        # Produce table with % of area equal in both maps
        # One row per pair: area where both class columns are equal
        # (divided by 1 000 000), its percentage of the total area of
        # the union table, and that total area (divided by 1 000 000)
        areaMapTbl = q_to_ntbl(
            DB, "{}_syn".format(union_tbl),
            ("SELECT CAST('{lulc_1}' AS text) AS lulc_1, "
             "CAST('{lulc_2}' AS text) AS lulc_2, "
             "round("
             "CAST(SUM(g_area) / 1000000 AS numeric), 4"
             ") AS agree_area, round("
             "CAST((SUM(g_area) / MIN(total_area)) * 100 AS numeric), 4"
             ") AS agree_percentage, "
             "round("
             "CAST(MIN(total_area) / 1000000 AS numeric), 4"
             ") AS total_area FROM ("
             "SELECT {map1_cls}, {map2_cls}, ST_Area(geom) AS g_area, "
             "CASE "
             "WHEN {map1_cls} = {map2_cls} "
             "THEN 1 ELSE 0 "
             "END AS isthesame, total_area FROM {tbl}, ("
             "SELECT SUM(ST_Area(geom)) AS total_area FROM {tbl}"
             ") AS foo2"
             ") AS foo WHERE isthesame = 1 "
             "GROUP BY isthesame").format(
                lulc_1=fprop(uShp[0], 'fn'), lulc_2=fprop(uShp[1], 'fn'),
                map1_cls=__SHAPES_TO_COMPARE[uShp[0]],
                map2_cls=__SHAPES_TO_COMPARE[uShp[1]],
                tbl=union_tbl), api='psql')

        # Produce confusion matrix for the pair in comparison
        matrixTbl = tbl_to_area_mtx(
            DB, union_tbl,
            __SHAPES_TO_COMPARE[uShp[0]], __SHAPES_TO_COMPARE[uShp[1]],
            union_tbl + '_mtx')

        SYNTH_TBL[uShp] = {"TOTAL": areaMapTbl, "MATRIX": matrixTbl}

    # UNION ALL TOTAL TABLES
    total_table = tbls_to_tbl(
        DB, [SYNTH_TBL[k]["TOTAL"] for k in SYNTH_TBL], 'total_table')

    # Create table with % of agreement between each pair of maps
    # mapsNames: sorted distinct map names found in lulc_1 / lulc_2
    mapsNames = q_to_obj(
        DB,
        ("SELECT lulc FROM ("
         "SELECT lulc_1 AS lulc FROM {tbl} GROUP BY lulc_1 "
         "UNION ALL "
         "SELECT lulc_2 AS lulc FROM {tbl} GROUP BY lulc_2"
         ") AS lu GROUP BY lulc ORDER BY lulc").format(tbl=total_table),
        db_api='psql').lulc.tolist()

    FLDS_TO_PIVOT = ["agree_percentage", "total_area"]

    # crosstab() template: pairs are taken in both directions (a,b) and
    # (b,a), joined with the (x,x) diagonal pairs (which get value 0),
    # and pivoted so that each map is a row and a column.
    # The value column is always aliased agree_percentage, whatever
    # {valCol} is.
    Q = ("SELECT * FROM crosstab('"
         "SELECT CASE "
         "WHEN foo.lulc_1 IS NOT NULL THEN foo.lulc_1 ELSE jtbl.tmp1 "
         "END AS lulc_1, CASE "
         "WHEN foo.lulc_2 IS NOT NULL THEN foo.lulc_2 ELSE jtbl.tmp2 "
         "END AS lulc_2, CASE "
         "WHEN foo.{valCol} IS NOT NULL THEN foo.{valCol} ELSE 0 "
         "END AS agree_percentage FROM ("
         "SELECT lulc_1, lulc_2, {valCol} FROM {tbl} UNION ALL "
         "SELECT lulc_1, lulc_2, {valCol} FROM ("
         "SELECT lulc_1 AS lulc_2, lulc_2 AS lulc_1, {valCol} "
         "FROM {tbl}"
         ") AS tst"
         ") AS foo FULL JOIN ("
         "SELECT lulc_1 AS tmp1, lulc_2 AS tmp2 FROM ("
         "SELECT lulc_1 AS lulc_1 FROM {tbl} GROUP BY lulc_1 "
         "UNION ALL "
         "SELECT lulc_2 AS lulc_1 FROM {tbl} GROUP BY lulc_2"
         ") AS tst_1, ("
         "SELECT lulc_1 AS lulc_2 FROM {tbl} GROUP BY lulc_1 "
         "UNION ALL "
         "SELECT lulc_2 AS lulc_2 FROM {tbl} GROUP BY lulc_2"
         ") AS tst_2 WHERE lulc_1 = lulc_2 GROUP BY lulc_1, lulc_2"
         ") AS jtbl ON foo.lulc_1 = jtbl.tmp1 AND foo.lulc_2 = jtbl.tmp2 "
         "ORDER BY lulc_1, lulc_2"
         "') AS ct("
         "lulc_map text, {crossCols}"
         ")")

    # First field of FLDS_TO_PIVOT goes to "agreement_table"; the next
    # one goes to "area_table" (the else branch runs once
    # TOTAL_AGREE_TABLE holds a truthy value)
    TOTAL_AGREE_TABLE = None
    TOTAL_AREA_TABLE = None
    for f in FLDS_TO_PIVOT:
        if not TOTAL_AGREE_TABLE:
            TOTAL_AGREE_TABLE = q_to_ntbl(
                DB, "agreement_table",
                Q.format(tbl=total_table, valCol=f, crossCols=", ".join([
                    "{} numeric".format(map_) for map_ in mapsNames
                ])), api='psql')

        else:
            TOTAL_AREA_TABLE = q_to_ntbl(
                DB, "area_table",
                Q.format(tbl=total_table, valCol=f, crossCols=", ".join([
                    "{} numeric".format(map_) for map_ in mapsNames
                ])), api='psql')

    # Union Mapping
    # Table that tells which union shapefile belongs to which pair
    UNION_MAPPING = pandas.DataFrame(
        [[k[0], k[1], fprop(UNION_SHAPE[k], 'fn')] for k in UNION_SHAPE],
        columns=['shp_a', 'shp_b', 'union_shp'])

    UNION_MAPPING = df_to_db(DB, UNION_MAPPING, 'union_map', api='psql')

    # Export Results
    # One sheet per table: mapping, agreement, area and one matrix per pair
    TABLES = [UNION_MAPPING, TOTAL_AGREE_TABLE, TOTAL_AREA_TABLE
              ] + [SYNTH_TBL[x]["MATRIX"] for x in SYNTH_TBL]

    # Matrix sheet names: each file name is cut to 15 characters
    SHEETS = ["union_map", "agreement_percentage", "area_with_data_km"] + [
        "{}_{}".format(fprop(x[0], 'fn')[:15], fprop(x[1], 'fn')[:15])
        for x in SYNTH_TBL
    ]

    db_to_tbl(DB, ["SELECT * FROM {}".format(x) for x in TABLES],
              REPORT, sheetsNames=SHEETS, dbAPI='psql')

    return REPORT
def get_rows_related_with_event(db, tblSchema, words, resultTbl,
                                startTime=None, endTime=None):
    """
    Export the rows of a database table whose text is related to some
    event.

    A row is considered related to the event when its text contains at
    least one of the given words (SQL LIKE, combined with OR).

    tblSchema = {
        "TNAME"    : "facedata",
        "TEXTCOL"  : "message",
        "TIMECOL"  : "datahora",
        "SELCOL"   : ["post_id", "type"],
        "TEXTCASE" : (
            "CASE WHEN type = 'link' THEN lower(unaccent(description)) "
            "ELSE lower(unaccent(message)) END"
        )
    }

    * TNAME and TEXTCOL are mandatory;
    * SELCOL (optional) - extra column(s) to keep in the result;
    * TIMECOL (optional) - column used for the time filter; the filter
      is only applied when TIMECOL, startTime and endTime are all given
      (strict > startTime and < endTime, format
      'YYYY-MM-DD HH24:MI:SS');
    * TEXTCASE (optional) - SQL expression evaluated in a subquery and
      exposed under the TEXTCOL name; the words are then searched in
      that expression's result.

    Parameters:
    db        - database name, forwarded to db_to_tbl;
    tblSchema - dict described above;
    words     - word or list of words to search for;
    resultTbl - path to the output table;
    startTime, endTime - optional time interval limits.

    Returns whatever db_to_tbl returns.

    Raises ValueError when TNAME/TEXTCOL are missing or when no word is
    given.

    NOTE: only works for PostgreSQL.
    NOTE: table names, column names, words and time values are
    interpolated directly into the SQL text (no escaping is done here);
    pass trusted values only.
    """

    from glass.pys import obj_to_lst
    from glass.ng.it import db_to_tbl

    if "TNAME" not in tblSchema or "TEXTCOL" not in tblSchema:
        raise ValueError((
            "tblSchema input should be a dict with at least TNAME and TEXTCOL "
            "keys. The value of the first should be a table name; "
            "the value of the second shoulbe the name of a column with text "
            "to classify!"
        ))

    # Words to list
    words = obj_to_lst(words)

    # Without words the WHERE clause would be empty (invalid SQL)
    if not words:
        raise ValueError("words should have at least one word to search for!")

    # SELCOL is optional: fall back to an empty list so that both query
    # shapes below can be built without it
    cols = (
        obj_to_lst(tblSchema["SELCOL"]) if "SELCOL" in tblSchema else None
    ) or []

    text_col = tblSchema["TEXTCOL"]
    has_text_case = "TEXTCASE" in tblSchema

    like_words = " OR ".join([
        "{} LIKE '%%{}%%'".format(text_col, word) for word in words
    ])

    # Time filter: it is the only condition of the inner query when
    # TEXTCASE is used (WHERE); otherwise it follows the words condition
    # (AND)
    if "TIMECOL" not in tblSchema or not startTime or not endTime:
        time_where = ""
    else:
        time_where = (
            " {w} TO_TIMESTAMP({c}, 'YYYY-MM-DD HH24:MI:SS') > "
            "TO_TIMESTAMP('{l}', 'YYYY-MM-DD HH24:MI:SS') AND "
            "TO_TIMESTAMP({c}, 'YYYY-MM-DD HH24:MI:SS') < "
            "TO_TIMESTAMP('{h}', 'YYYY-MM-DD HH24:MI:SS')"
        ).format(
            w="WHERE" if has_text_case else "AND",
            c=tblSchema["TIMECOL"], l=startTime, h=endTime
        )

    if has_text_case:
        # The CASE expression is computed in a subquery under the
        # TEXTCOL name, so the LIKE conditions apply to its result
        sel_cols = "{} AS {}".format(tblSchema["TEXTCASE"], text_col)
        if cols:
            sel_cols = "{}, {}".format(", ".join(cols), sel_cols)

        Q = (
            "SELECT * FROM ("
            "SELECT {selCols} FROM {tbl}{timeWhr}"
            ") AS foo WHERE {wordsWhr}"
        ).format(
            tbl=tblSchema["TNAME"], wordsWhr=like_words,
            selCols=sel_cols, timeWhr=time_where
        )

    else:
        Q = (
            "SELECT {selCols} FROM {tbl} WHERE ({wordsWhr}){timeWhr}"
        ).format(
            selCols=", ".join(cols + [text_col]),
            tbl=tblSchema["TNAME"], wordsWhr=like_words,
            timeWhr=time_where
        )

    return db_to_tbl(db, Q, resultTbl)