def change_field_type(db, table, fields, outable, cols=None):
    """
    Imagine a table with numeric data saved as text. This method
    converts that numeric data to a numeric field.

    fields = {'field_name' : 'field_type'}

    db      - database identifier
    table   - input table name
    outable - name of the table to be created with the converted columns
    cols    - columns to keep; defaults to every column in table
    """

    from gasp.sql.i import cols_name
    # FIX: sqlcon was referenced without being imported in this function,
    # unlike sibling helpers (e.g. exec_write_q) that import it locally
    from gasp.sql.c import sqlcon

    if not cols:
        cols = cols_name(db, table)
    else:
        from gasp.pyt import obj_to_lst

        cols = obj_to_lst(cols)

    # Columns that are not converted are selected unchanged
    select_fields = [f for f in cols if f not in fields]

    con = sqlcon(db)

    # Create new table with the new field with converted values
    cursor = con.cursor()
    cursor.execute(('CREATE TABLE {} AS SELECT {}, {} FROM {}').format(
        outable, ', '.join(select_fields),
        ', '.join([
            'CAST({f_} AS {t}) AS {f_}'.format(f_=f, t=fields[f])
            for f in fields
        ]),
        table))

    con.commit()
    cursor.close()
    con.close()
def del_cols(lyr, cols, api='grass', lyrn=1):
    """
    Remove Columns from Tables

    lyr  - GRASS vector layer
    cols - column name or list of column names to drop
    api  - 'grass' (shell command) or 'pygrass' (Python module)
    lyrn - layer number
    """

    from gasp.pyt import obj_to_lst

    drop_cols = obj_to_lst(cols)

    # Validate the API choice before doing any work
    if api != 'grass' and api != 'pygrass':
        raise ValueError("API {} is not available".format(api))

    if api == 'grass':
        from gasp import exec_cmd

        exec_cmd((
            "v.db.dropcolumn map={} layer={} columns={} "
            "--quiet"
        ).format(lyr, str(lyrn), ','.join(drop_cols)))

    else:
        from grass.pygrass.modules import Module

        Module(
            "v.db.dropcolumn", map=lyr, layer=lyrn, columns=drop_cols,
            quiet=True, run_=True)

    return lyr
def geom_to_points(db, table, geomCol, outTable, selCols=None,
                   newGeomCol=None):
    """
    Convert a Polygon/Polyline Geometry to Points

    Equivalent to feature to point tool
    """

    from gasp.pyt import obj_to_lst
    from gasp.sql.to import q_to_ntbl

    sel_cols = obj_to_lst(selCols)

    # Extra columns to carry into the output ('' when none given)
    cols_sql = "" if not sel_cols else "{}, ".format(", ".join(sel_cols))
    out_geom = "geom" if not newGeomCol else newGeomCol

    Q = (
        "SELECT {cols}(ST_DumpPoints({geom})).geom AS {newCol} "
        "FROM {tbl}"
    ).format(cols=cols_sql, geom=geomCol, newCol=out_geom, tbl=table)

    return q_to_ntbl(db, outTable, Q, api='psql')
def tweets_to_df(keyword=None, inGeom=None, epsg=None, LANG='pt',
                 NTWEETS=1000, tweetType='mixed', apiKey=None,
                 dropFields=None):
    """
    Search for Tweets and Export them to XLS

    Returns 0 when nothing is found, otherwise a DataFrame with an
    added 'keyword' column.
    """

    from gasp.pyt import obj_to_lst

    if not inGeom and not keyword:
        raise ValueError('inGeom or keyword, one of them are required')

    if inGeom and not epsg:
        raise ValueError('inGeom implies epsg')

    if inGeom:
        from gasp.gt.prop.feat.bf import getBufferParam

        x, y, dist = getBufferParam(inGeom, epsg, outSRS=4326)

        # assumes getBufferParam returns distance in meters - search
        # radius is given in km; TODO confirm
        dist = float(dist) / 1000
    else:
        x, y, dist = None, None, None

    data = search_tweets(
        lat=y, lng=x, radius=dist, keyword=keyword, NR_ITEMS=NTWEETS,
        only_geo=None, __lang=LANG, resultType=tweetType, key=apiKey)

    # FIX: the original used a bare "except: pass", swallowing every
    # error; only the ValueError raised by truth-testing a non-empty
    # DataFrame should be ignored here
    try:
        if not data:
            return 0
    except ValueError:
        pass

    if keyword:
        data["keyword"] = keyword
    else:
        data["keyword"] = 'nan'

    dropFields = obj_to_lst(dropFields)

    if dropFields:
        data.drop(dropFields, axis=1, inplace=True)

    return data
def by_query(search_type, keyword=None, x_center=None, y_center=None,
             dist=None, limit='100', face_fields=None):
    """
    Search data on facebook based on:
    - Keyword;
    - search type (user, page, event, place, placetopic);
    - location (center and distance from center);
    - limit (maximum number of users/pages/etc. to be returned)*.

    * Our default is 100, but the Facebook default is 60.

    Returns an array with the id of the data in facebook
    """

    import pandas
    from gasp.pyt import obj_to_lst
    from gasp.fm.web import http_to_json

    # Deal with spaces in the keyword expression and with special characters
    keyword = keyword.replace(' ', '%20') if keyword and ' ' in keyword \
        else keyword

    face_fields = obj_to_lst(face_fields)

    URL = (
        '{graph}search?access_token={_id}|{scrt}'
        '{_q}{typ}{cnt}{dst}{lmt}{flds}'
    ).format(
        graph=FACEBOOK_GRAPH_URL,
        _id=FACEBOOK_TOKEN['APP_ID'],
        scrt=FACEBOOK_TOKEN['APP_SECRET'],
        _q='' if not keyword else '&q={}'.format(keyword),
        typ='&type={}'.format(search_type),
        # BUGFIX: the parameter prefix was mojibake ("¢er=", the HTML
        # entity garbling of "&center="); the Graph API expects
        # "&center=lat,lng"
        cnt='' if not x_center and not y_center else '&center={},{}'.format(
            y_center, x_center
        ),
        dst='' if not dist else '&distance={}'.format(dist),
        lmt='' if not limit else '&limit={}'.format(str(limit)),
        flds='' if not face_fields else '&fields={}'.format(
            ','.join(face_fields))
    )

    face_table = pandas.DataFrame(http_to_json(URL)['data'])

    if not face_table.shape[0]:
        return None

    face_table["url"] = "https://facebook.com//" + face_table["id"]

    if face_fields:
        if "location" in face_fields:
            # Flatten the nested location dict into one column per key
            face_table = pandas.concat([
                face_table.drop(["location"], axis=1),
                face_table["location"].apply(pandas.Series)
            ], axis=1)

    return face_table
def q_to_obj(dbname, query, db_api='psql', geomCol=None, epsg=None,
             of='df', cols=None):
    """
    Query database and convert data to Pandas Dataframe/GeoDataFrame

    API's Available:
    * psql;
    * sqlite;
    * mysql;

    output format options ("of" parameter):
    * df (Pandas Dataframe);
    * dict (Python Dict);
    """

    if not query.startswith('SELECT '):
        # Assuming query is a table name - build a SELECT over it
        from gasp.pyt import obj_to_lst
        from gasp.sql.i import cols_name

        cols = cols_name(dbname, query) if not cols else obj_to_lst(cols)

        sel_what = ", ".join([
            "{t}.{c} AS {c}".format(t=query, c=i) for i in cols])

        query = "SELECT {} FROM {}".format(sel_what, query)

    if not geomCol:
        import pandas
        from gasp.sql.c import alchemy_engine

        pgengine = alchemy_engine(dbname, api=db_api)

        df = pandas.read_sql(query, pgengine, columns=None)

    else:
        from geopandas import GeoDataFrame
        from gasp.sql.c import sqlcon

        con = sqlcon(dbname, sqlAPI='psql')

        df = GeoDataFrame.from_postgis(
            query, con, geom_col=geomCol,
            crs="epsg:{}".format(str(epsg)) if epsg else None)

    return df.to_dict(orient="records") if of == 'dict' else df
def del_file(_file):
    """
    Delete files if exists

    _file - path or list of paths
    """

    from gasp.pyt import obj_to_lst

    for ff in obj_to_lst(_file):
        # os.path.isfile already implies the path exists; the original
        # "isfile(ff) and exists(ff)" double-check was redundant
        if os.path.isfile(ff):
            os.remove(ff)
def shape_to_rst_wShapeCheck(inShp, maxCellNumber, desiredCellsizes, outRst,
                             inEPSG):
    """
    Convert one Feature Class to Raster using the cellsizes included
    in desiredCellsizes. For each cellsize, check if the number of cells
    exceeds maxCellNumber. The raster with lower cellsize but lower than
    maxCellNumber will be the returned raster
    """

    import os
    from gasp.pyt import obj_to_lst
    from gasp.gt.prop.rst import rst_shape

    desiredCellsizes = obj_to_lst(desiredCellsizes)
    if not desiredCellsizes:
        raise ValueError('desiredCellsizes does not have a valid value')

    workspace = os.path.dirname(outRst)

    # Create a test raster for each candidate cellsize.
    # BUGFIX: the original passed "cellsize" (lowercase) to shp_to_rst
    # while the comprehension variable is "cellSize" - NameError at runtime
    RASTERS = [shp_to_rst(
        inShp, cellSize, -1,
        os.path.join(workspace, 'tst_cell_{}.tif'.format(cellSize)),
        inEPSG
    ) for cellSize in desiredCellsizes]

    # Replace each (rows, cols) shape by the total number of cells
    tstShape = rst_shape(RASTERS, gisApi='gdal')

    for rst in tstShape:
        NCELLS = tstShape[rst][0] * tstShape[rst][1]
        tstShape[rst] = NCELLS

    # First (i.e. lowest) cellsize whose raster respects the cell budget
    NICE_RASTER = None
    for i in range(len(desiredCellsizes)):
        if tstShape[RASTERS[i]] <= maxCellNumber:
            NICE_RASTER = RASTERS[i]
            break

    if not NICE_RASTER:
        return None

    os.rename(NICE_RASTER, outRst)

    # Remove the remaining test rasters
    for rst in RASTERS:
        if os.path.isfile(rst):
            os.remove(rst)

    return outRst
def txt_cols_to_col(db, inTable, columns, strSep, newCol, outTable=None):
    """
    Several text columns to a single column

    Concatenates 'columns' with 'strSep' between them into 'newCol'.
    Writes a new table when outTable is given, otherwise adds the
    column to inTable.
    """

    from gasp.pyt import obj_to_lst
    from gasp.sql.i import cols_type

    mergeCols = obj_to_lst(columns)

    tblCols = cols_type(db, inTable, sanitizeColName=None, pyType=False)

    # Every column to merge must be textual
    for c in mergeCols:
        if tblCols[c] != 'text' and tblCols[c] != 'varchar':
            raise ValueError('{} should be of type text'.format(c))

    # COALESCE(a, '') || 'sep' || COALESCE(b, '') || ...
    coalesce = " || '{}' || ".format(strSep).join(
        ["COALESCE({}, '')".format(c) for c in mergeCols])

    if outTable:
        # Write new table with the concatenated column
        colsToSelect = [_c for _c in tblCols if _c not in mergeCols]

        merged = coalesce + " AS {}".format(newCol)
        sel = merged if not colsToSelect else "{}, {}".format(
            ", ".join(colsToSelect), merged)

        q_to_ntbl(
            db, outTable, "SELECT {} FROM {}".format(sel, inTable),
            api='psql')

        return outTable

    else:
        # Add column to inTable
        from gasp.sql.tbl import update_table

        add_field(db, inTable, {newCol: 'text'})
        update_table(db, inTable, {newCol: coalesce})

        return inTable
def fprop(__file, prop, forceLower=None, fs_unit=None):
    """
    Return some property of file

    prop options:
    * filename or fn - return filename
    * fileformat or ff - return file extension
    * filesize or fs - return file size (fs_unit: 'MB' default, 'KB')

    Returns the bare value when a single prop is asked for, otherwise
    a dict. NOTE(review): forceLower is currently unused - confirm.
    """

    from gasp.pyt import obj_to_lst

    prop = obj_to_lst(prop)

    result = {}

    if 'filename' in prop or 'fn' in prop:
        fn, ff = os.path.splitext(os.path.basename(__file))

        result['filename'] = fn

        # BUGFIX: condition was "'fileformat' in prop or 'fn' in prop",
        # so asking for ['filename', 'ff'] never produced the format
        if 'fileformat' in prop or 'ff' in prop:
            result['fileformat'] = ff

    elif 'fileformat' in prop or 'ff' in prop:
        result['fileformat'] = os.path.splitext(__file)[1]

    if 'filesize' in prop or 'fs' in prop:
        fs_unit = 'MB' if not fs_unit else fs_unit

        fs = os.path.getsize(__file)

        if fs_unit == 'MB':
            fs = (fs / 1024.0) / 1024
        elif fs_unit == 'KB':
            fs = fs / 1024.0

        result['filesize'] = fs

    # With a single prop, return the bare value instead of a dict
    if len(prop) == 1:
        if prop[0] == 'fn':
            return result['filename']
        elif prop[0] == 'ff':
            return result['fileformat']
        elif prop[0] == 'fs':
            return result['filesize']
        else:
            return result[prop[0]]
    else:
        return result
def distinct_val(db, pgtable, column):
    """
    Get distinct values in one column of one pgtable
    """

    from gasp.pyt import obj_to_lst
    from gasp.sql.fm import q_to_obj

    # GROUP BY yields the distinct value combinations
    q = "SELECT {col} FROM {t} GROUP BY {col};".format(
        col=", ".join(obj_to_lst(column)), t=pgtable)

    return q_to_obj(db, q, db_api='psql').to_dict(orient="records")
def trim_char_in_col(db, pgtable, cols, trim_str, outTable,
                     onlyTrailing=None, onlyLeading=None):
    """
    Python implementation of the TRIM PSQL Function

    The PostgreSQL trim function is used to remove spaces or set of
    characters from the leading or trailing or both side from a string.
    """

    from gasp.pyt import obj_to_lst
    from gasp.sql.i import cols_type

    cols = obj_to_lst(cols)

    colsTypes = cols_type(db, pgtable, sanitizeColName=None, pyType=False)

    # TRIM only applies to textual columns
    for col in cols:
        if colsTypes[col] != 'text' and colsTypes[col] != 'varchar':
            raise ValueError('{} should be of type text'.format(col))

    colsToSelect = [_c for _c in colsTypes if _c not in cols]

    # TRAILING / LEADING / both (empty string means both sides)
    tail_lead_str = "" if not onlyTrailing and not onlyLeading else \
        "TRAILING " if onlyTrailing and not onlyLeading else \
        "LEADING " if not onlyTrailing and onlyLeading else ""

    trimCols = [
        "TRIM({tol}{char} FROM {c}) AS {c}".format(
            c=col, tol=tail_lead_str, char=trim_str) for col in cols
    ]

    if not colsToSelect:
        cols_to_select = "{}".format(", ".join(trimCols))
    else:
        cols_to_select = "{}, {}".format(
            ", ".join(colsToSelect), ", ".join(trimCols))

    # BUGFIX: the final query interpolated colsToSelect (the raw Python
    # list of untouched columns) instead of cols_to_select, producing
    # invalid SQL and dropping the TRIM expressions entirely
    # (compare the correct sibling replace_char_in_col)
    q_to_ntbl(
        db, outTable,
        "SELECT {} FROM {}".format(cols_to_select, pgtable), api='psql')

    # Return the output table name, consistent with replace_char_in_col
    return outTable
def tbl_fromdb_todb(from_db, to_db, tables, qForTbl=None, api='pandas'):
    """
    Send PGSQL Tables from one database to other

    api='pandas' copies each table through a DataFrame;
    api='psql' uses pg_dump/pg_restore via a temporary folder.
    qForTbl optionally maps table name -> custom SELECT query.
    """

    from gasp.pyt import obj_to_lst

    # Any unknown api value silently falls back to 'pandas'
    api = 'pandas' if api != 'pandas' and api != 'psql' else api

    tables = obj_to_lst(tables)

    if api == 'pandas':
        from gasp.sql.fm import q_to_obj

        for table in tables:
            # Use the custom query for this table if one was provided,
            # otherwise copy the whole table
            if not qForTbl:
                tblDf = q_to_obj(from_db, "SELECT * FROM {}".format(
                    table), db_api='psql')
            else:
                if table not in qForTbl:
                    tblDf = q_to_obj(from_db, "SELECT * FROM {}".format(
                        table), db_api='psql')
                else:
                    tblDf = q_to_obj(from_db, qForTbl[table], db_api='psql')

            # NOTE(review): df_to_db is assumed to be imported at module
            # level - confirm
            df_to_db(to_db, tblDf, table, api='psql')

    else:
        import os
        from gasp.pyt.oss import mkdir, del_folder
        from gasp.sql.fm import dump_tbls
        from gasp.sql.to import restore_tbls

        # Temporary folder (random name) next to this module
        tmpFolder = mkdir(
            os.path.dirname(os.path.abspath(__file__)), randName=True
        )

        # Dump
        sqlScript = dump_tbls(from_db, tables, os.path.join(
            tmpFolder, "tables_data.sql"
        ))

        # Restore
        restore_tbls(to_db, sqlScript, tables)

        del_folder(tmpFolder)
def show_duplicates_in_xls(db_name, table, pkCols, outFile,
                           tableIsQuery=None):
    """
    Find duplicates and write these objects in a table

    Rows are duplicates when the pkCols combination appears more than
    once; 'table' may be a query when tableIsQuery is truthy.
    """

    import pandas
    from gasp.pyt import obj_to_lst
    from gasp.sql.fm import q_to_obj
    from gasp.to import obj_to_tbl

    pkCols = obj_to_lst(pkCols)

    if not pkCols:
        raise ValueError("pkCols value is not valid")

    if not tableIsQuery:
        join_on = " AND ".join([
            "{t}.{c} = foo.{c}".format(t=table, c=col) for col in pkCols])

        q = (
            "SELECT {t}.* FROM {t} INNER JOIN ("
            "SELECT {cls}, COUNT({cnt}) AS conta FROM {t} "
            "GROUP BY {cls}"
            ") AS foo ON {rel} "
            "WHERE conta > 1"
        ).format(
            t=table, cls=", ".join(pkCols), cnt=pkCols[0], rel=join_on)

    else:
        join_on = " AND ".join([
            "foo.{c} = jt.{c}".format(c=x) for x in pkCols])

        q = (
            "SELECT foo.* FROM ({q_}) AS foo INNER JOIN ("
            "SELECT {cls}, COUNT({cnt}) AS conta "
            "FROM ({q_}) AS foo2 GROUP BY {cls}"
            ") AS jt ON {rel} "
            "WHERE conta > 1"
        ).format(
            q_=table, cls=", ".join(pkCols), cnt=pkCols[0], rel=join_on)

    data = q_to_obj(db_name, q, db_api='psql')

    obj_to_tbl(data, outFile)

    return outFile
def st_dissolve(db, table, geomColumn, outTable, whrClause=None,
                diss_cols=None, outTblIsFile=None, api='sqlite'):
    """
    Dissolve a Polygon table

    diss_cols    - optional attribute columns to dissolve by
    outTblIsFile - when truthy, export result to a file instead of a
                   new database table
    """

    from gasp.pyt import obj_to_lst

    diss_cols = obj_to_lst(diss_cols) if diss_cols else None
    geomcol = "geometry" if api == 'sqlite' else 'geom'

    sql = (
        "SELECT{selCols} ST_UnaryUnion(ST_Collect({geom})) AS {gout} "
        "FROM {tbl}{whr}{grpBy}"
    ).format(
        selCols="" if not diss_cols else " {},".format(", ".join(diss_cols)),
        geom=geomColumn, tbl=table,
        whr="" if not whrClause else " WHERE {}".format(whrClause),
        grpBy="" if not diss_cols else " GROUP BY {}".format(
            ", ".join(diss_cols)
        ),
        gout=geomcol
    )

    if outTblIsFile:
        if api == 'sqlite':
            from gasp.gt.attr import sel_by_attr

            sel_by_attr(db, sql, outTable, api_gis='ogr')

        elif api == 'psql':
            from gasp.gt.toshp.db import dbtbl_to_shp

            # BUGFIX: the original passed the raw table name together
            # with tableIsQuery=True, exporting the undissolved table;
            # the dissolve query itself must be exported
            dbtbl_to_shp(
                db, sql, geomColumn, outTable, api='pgsql2shp',
                tableIsQuery=True
            )

    else:
        from gasp.sql.to import q_to_ntbl

        q_to_ntbl(
            db, outTable, sql,
            api='ogr2ogr' if api == 'sqlite' else 'psql'
        )

    return outTable
def sql_proj(dbname, tbl, otbl, oepsg, cols=None, geomCol=None,
             newGeom=None, whr=None, new_pk=None):
    """
    Reproject geometric layer to another spatial reference system (srs)
    """

    from gasp.pyt import obj_to_lst
    from gasp.sql.to import q_to_ntbl

    geomCol = 'geom' if not geomCol else geomCol
    newGeom = 'geom' if not newGeom else newGeom

    if not cols:
        from gasp.sql.i import cols_name

        cols = cols_name(dbname, tbl)
        cols.remove(geomCol)
    else:
        cols = obj_to_lst(cols)

        # When the new geometry name collides with the original column,
        # keep the original geometry under an "old_" alias
        if geomCol in cols and geomCol == newGeom:
            cols.remove(geomCol)
            cols.append('{c} AS old_{c}'.format(c=geomCol))

    where_sql = "" if not whr else " WHERE {}".format(whr)

    Q = ("SELECT {}, ST_Transform({}, {}) AS {} "
         "FROM {}{}").format(
        ", ".join(cols), geomCol, str(oepsg), newGeom, tbl, where_sql)

    otbl = q_to_ntbl(dbname, otbl, Q, api='psql')

    if new_pk:
        from gasp.sql.k import create_pk

        create_pk(dbname, otbl, new_pk)

    return otbl
def sel_where_groupByIs(db, table, groupByCols, grpByOp, grpByVal, outTable,
                        filterWhere=None):
    """
    Select rows in table where the GROUP BY values of the groupByCols
    agrees with the statment formed by grpByOp and grpByVal

    For the following parameters:
    table=tst_table, groupByCols=[day, hour], grpByOp=>, grpByVal=1
    This method will create a new table using a query such
    SELECT tst_table.* FROM tst_table INNER JOIN (
        SELECT day, hour, COUNT(day) AS cnt_day FROM tst_table
        GROUP BY day, hour
    ) AS foo ON tst_table.day = foo.day AND tst_table.hour = foo.hour
    WHERE foo.cnt_day > 1
    """

    from gasp.pyt import obj_to_lst
    from gasp.sql.to import q_to_ntbl

    grp_cols = obj_to_lst(groupByCols)

    # Join the outer table with the grouped counts on every group column
    join_on = " AND ".join([
        "{t}.{c} = foo.{c}".format(t=table, c=c) for c in grp_cols])

    extra_whr = "" if not filterWhere else " AND ({})".format(filterWhere)

    q = (
        "SELECT {t}.* FROM {t} INNER JOIN ("
        "SELECT {cls}, COUNT({col}) AS cnt_{col} "
        "FROM {t} GROUP BY {cls}"
        ") AS foo ON {jOn} "
        "WHERE foo.cnt_{col} {op} {val}{fwhr}"
    ).format(
        t=table, cls=", ".join(grp_cols), col=grp_cols[0], jOn=join_on,
        op=grpByOp, val=grpByVal, fwhr=extra_whr)

    return q_to_ntbl(db, outTable, q, api='psql')
def get_sheet_position(xlsObj, sheetNames):
    """
    Return sheet position by name

    Returns {sheet_name: index} for every sheet in xlsObj whose name is
    in sheetNames.
    """

    from gasp.pyt import obj_to_lst

    names = obj_to_lst(sheetNames)

    return {
        sh.name: idx for idx, sh in enumerate(xlsObj.sheets())
        if sh.name in names
    }
def geomext_to_rst_wShapeCheck(inGeom, maxCellNumber, desiredCellsizes,
                               outRst, inEPSG):
    """
    Convert one Geometry to Raster using the cellsizes included
    in desiredCellsizes. For each cellsize, check if the number of cells
    exceeds maxCellNumber. The raster with lower cellsize but lower than
    maxCellNumber will be the returned raster
    """

    import os
    from gasp.pyt import obj_to_lst

    cellsizes = obj_to_lst(desiredCellsizes)

    if not cellsizes:
        raise ValueError('desiredCellsizes does not have a valid value')

    # Geometry extent
    left, right, bottom, top = inGeom.GetEnvelope()

    # Pick the first cellsize whose grid respects the cell budget
    SEL_CELLSIZE = None
    for csize in cellsizes:
        nrows = int(round((top - bottom) / csize, 0))
        ncols = int(round((right - left) / csize, 0))

        if nrows * ncols <= maxCellNumber:
            SEL_CELLSIZE = csize
            break

    if not SEL_CELLSIZE:
        return None

    shpext_to_rst(
        inGeom, outRst, SEL_CELLSIZE, epsg=inEPSG,
        invalidResultAsNone=True)

    return outRst
def drop_col(db, pg_table, columns):
    """
    Delete column from pg_table

    columns - a single column name or a list of names
    """

    from gasp.pyt import obj_to_lst

    con = sqlcon(db)
    cursor = con.cursor()

    # One ALTER TABLE with a DROP COLUMN clause per column
    drops = ['DROP COLUMN {}'.format(c) for c in obj_to_lst(columns)]

    cursor.execute(
        'ALTER TABLE {} {};'.format(pg_table, ', '.join(drops)))

    con.commit()
    cursor.close()
    con.close()
def kernel_density_for_field(points, fields, radius, folderoutput, template):
    """
    Run Kernel Density for every field in fields

    Output rasters are written to folderoutput as
    <points_basename>_<field>.tif
    """

    import os
    from gasp.pyt import obj_to_lst

    fields = obj_to_lst(fields)

    if not fields:
        raise ValueError('fields value is not valid')

    # Hoist the basename computation out of the loop
    basename = os.path.splitext(os.path.basename(points))[0]

    for fld in fields:
        out_rst = os.path.join(
            folderoutput, basename + '_{}.tif'.format(fld))

        kernel_density(points, fld, radius, template, out_rst)
def exec_write_q(db_name, queries, api='psql'):
    """
    Execute Queries and save result in the database

    queries - one SQL statement or a list of statements
    api     - 'psql' or 'sqlite'
    """

    from gasp.pyt import obj_to_lst

    qs = obj_to_lst(queries)

    if not qs:
        raise ValueError("queries value is not valid")

    # Only the connection setup differs per API; the execute/commit/close
    # sequence was duplicated in both branches of the original
    if api == 'psql':
        from gasp.sql.c import sqlcon

        con = sqlcon(db_name)

    elif api == 'sqlite':
        import sqlite3

        con = sqlite3.connect(db_name)

    else:
        raise ValueError('API {} is not available'.format(api))

    cs = con.cursor()

    for q in qs:
        cs.execute(q)

    con.commit()
    cs.close()
    con.close()
def lst_views(db, schema='public', basename=None):
    """
    List Views in database

    basename - optional name fragment(s); only views whose name matches
    any fragment (SQL LIKE) are returned.
    """

    from gasp.pyt import obj_to_lst
    from gasp.sql.fm import q_to_obj

    basename = obj_to_lst(basename)

    # Optional name filter: table_name LIKE '%a%' OR table_name LIKE '%b%'
    basenameStr = "" if not basename else "{}".format(" OR ".join(
        ["{} LIKE '%%{}%%'".format("table_name", b) for b in basename]))

    name_filter = "" if not basename else " AND ({})".format(basenameStr)

    views = q_to_obj(db, (
        "SELECT table_name FROM information_schema.views "
        "WHERE table_schema='{}'{}"
    ).format(schema, name_filter), db_api='psql')

    return views.table_name.tolist()
def replace_char_in_col(db, pgtable, cols, match_str, replace_str, outTable):
    """
    Replace char in all columns in cols for the value of replace_str

    Python implementation of the REPLACE PSQL Function
    """

    from gasp.pyt import obj_to_lst
    from gasp.sql.i import cols_type

    cols = obj_to_lst(cols)

    colsTypes = cols_type(db, pgtable, sanitizeColName=None, pyType=False)

    # REPLACE only applies to textual columns
    for c in cols:
        if colsTypes[c] != 'text' and colsTypes[c] != 'varchar':
            raise ValueError('{} should be of type text'.format(c))

    untouched = [_c for _c in colsTypes if _c not in cols]

    replaced = [
        "REPLACE({c}, '{char}', '{nchar}') AS {c}".format(
            c=c, char=match_str, nchar=replace_str) for c in cols
    ]

    if not untouched:
        cols_to_select = "{}".format(", ".join(replaced))
    else:
        cols_to_select = "{}, {}".format(
            ", ".join(untouched), ", ".join(replaced))

    q_to_ntbl(db, outTable, "SELECT {cols} FROM {tbl}".format(
        cols=cols_to_select, tbl=pgtable), api='psql')

    return outTable
def rst_val_to_points2(pntShp, listRasters):
    """
    Pick raster value for each point in pntShp

    Returns {feature_fid: [value_in_raster_1, value_in_raster_2, ...]}
    """

    # BUGFIX: gdal was used below without being imported
    from osgeo import gdal, ogr
    from gasp.pyt import obj_to_lst
    from gasp.gt.prop.ff import drv_name

    listRasters = obj_to_lst(listRasters)

    # BUGFIX: the original opened "pnt", an undefined name; the parameter
    # is pntShp
    shp = ogr.GetDriverByName(drv_name(pntShp)).Open(pntShp, 0)
    lyr = shp.GetLayer()

    pntDict = {}
    for feat in lyr:
        geom = feat.GetGeometryRef()
        x, y = geom.GetX(), geom.GetY()

        l = []
        for rst in listRasters:
            img = gdal.Open(rst)
            geo_transform = img.GetGeoTransform()
            band = img.GetRasterBand(1)

            # World coordinates -> pixel/line offsets
            px = int((x - geo_transform[0]) / geo_transform[1])
            py = int((y - geo_transform[3]) / geo_transform[5])
            value = band.ReadAsArray(px, py, 1, 1)

            l.append(list(value)[0])

            del img, geo_transform, band, px, py

        pntDict[feat.GetFID()] = l

    shp.Destroy()

    return pntDict
def df_groupBy(df, grpCols, STAT=None, STAT_FIELD=None):
    """
    Group By Pandas Dataframe

    STAT OPTIONS:
    * MIN
    * MAX
    * SUM

    Without STAT, the raw GroupBy object is returned.
    """

    from gasp.pyt import obj_to_lst

    grpCols = obj_to_lst(grpCols)

    if not grpCols:
        raise ValueError("grpCols value is not valid")

    grouped = df.groupby(grpCols, axis=0, as_index=False)

    if not STAT:
        return grouped

    if not STAT_FIELD:
        raise ValueError("To use STAT, you must specify STAT_FIELD")

    if STAT == 'MIN':
        return grouped[STAT_FIELD].min()
    elif STAT == 'MAX':
        return grouped[STAT_FIELD].max()
    elif STAT == 'SUM':
        return grouped[STAT_FIELD].sum()

    raise ValueError("{} is not a valid option".format(STAT))
def dump_tbls(db, tables, outsql, startWith=None):
    """
    Dump one table into a SQL File

    startWith - when truthy, 'tables' is treated as a list of name
    prefixes and every real table starting with one of them is dumped.
    """

    from gasp import exec_cmd
    from gasp.pyt import obj_to_lst
    from gasp.cons.psql import con_psql

    tbls = obj_to_lst(tables)

    if startWith:
        from gasp.sql.i import lst_tbl

        db_tbls = lst_tbl(db, api='psql')

        # One entry per (table, matching prefix) pair, as before
        tbls = [t for t in db_tbls for b in tbls if t.startswith(b)]

    condb = con_psql()

    outcmd = exec_cmd((
        "pg_dump -Fc -U {user} -h {host} -p {port} "
        "-w {tbl} {db} > {out}"
    ).format(
        user=condb["USER"], host=condb["HOST"], port=condb["PORT"],
        db=db, out=outsql,
        tbl=" ".join(["-t {}".format(t) for t in tbls])))

    return outsql
def get_text_in_CssClass(url, classTag, cssCls, texTags=['p']):
    """
    Get text from tags inside a specific object with one tag (classTag)
    and CSS Class (cssCls)

    Not recursive: textTags must be direct child of the classTag/cssCls
    """

    import re
    # BUGFIX: urllib2 is Python 2 only and would raise ImportError on
    # Python 3; urllib.request.urlopen is the direct replacement
    from urllib.request import urlopen
    from bs4 import BeautifulSoup
    from gasp.pyt import obj_to_lst

    resp = urlopen(url)
    html_doc = resp.read()

    soup = BeautifulSoup(html_doc, 'html.parser')

    data = soup.find_all(classTag, class_=cssCls)

    rslt = {}
    texTags = obj_to_lst(texTags)

    for node in data:
        for t in texTags:
            chld = node.findChildren(t, recursive=False)

            # Strip any residual markup, keeping only the inner text
            l = [re.sub('<[^>]+>', '', str(x)).strip('\n') for x in chld]

            if t not in rslt:
                rslt[t] = l
            else:
                rslt[t] += l

    return rslt
def col_to_timestamp(db, inTbl, dayCol, hourCol, minCol, secCol, newTimeCol,
                     outTbl, selColumns=None, whr=None):
    """
    Columns to timestamp column

    Concatenates day/hour/minute/second columns and casts the result to
    a TIMESTAMP in newTimeCol, writing the result to outTbl.
    """

    from gasp.pyt import obj_to_lst

    selCols = obj_to_lst(selColumns)

    sel = "*" if not selCols else ", ".join(selCols)
    whr_sql = "" if not whr else " WHERE {}".format(whr)

    sql = (
        "SELECT {C}, TO_TIMESTAMP("
        "COALESCE(CAST({day} AS text), '') || ' ' || "
        "COALESCE(CAST({hor} AS text), '') || ':' || "
        "COALESCE(CAST({min} AS text), '') || ':' || "
        "COALESCE(CAST({sec} AS text), ''), 'YYYY-MM-DD HH24:MI:SS'"
        ") AS {TC} FROM {T}{W}"
    ).format(
        C=sel, day=dayCol, hor=hourCol, min=minCol, sec=secCol,
        TC=newTimeCol, T=inTbl, W=whr_sql)

    q_to_ntbl(db, outTbl, sql, api='psql')

    return outTbl
def osm_to_relationaldb(osmData, inSchema, osmGeoTbl, osmCatTbl, osmRelTbl,
                        outSQL=None, db_name=None):
    """
    PostgreSQL - OSM Data to Relational Model

    TODO: Just work for one geom table at once

    E.g.
    osmData = '/home/jasp/flainar/osm_centro.xml'

    inSchema = {
        "TBL" : ['points', 'lines', 'multipolygons'],
        'FID' : 'CAST(osm_id AS bigint)',
        "COLS" : [
            'name',
            "ST_X(wkb_geometry) AS longitude",
            "ST_Y(wkb_geometry) AS latitude",
            "wkb_geometry AS geom",
            "NULL AS featurecategoryid",
            "NULL AS flainarcategoryid",
            "NULL AS createdby",
            "NOW() AS createdon",
            "NULL AS updatedon", "NULL AS deletedon"
        ],
        "NOT_KEYS" : [
            'ogc_fid', 'osm_id', 'name', "wkb_geometry",
            'healthcare2', 'other_tags'
        ]
    }

    osmGeoTbl = {"TBL" : 'position', "FID" : 'positionid'}

    osmCatTbl = {
        "TBL" : 'osmcategory', "FID" : "osmcategoryid",
        "KEY_COL" : "keycategory", "VAL_COL" : "value",
        "COLS" : [
            "NULL AS createdby", "NOW() AS createdon",
            "NULL AS updatedon", "NULL AS deletedon"
        ]
    }

    osmRelTbl = {
        "TBL" : "position_osmcat", "FID" : 'pososmcatid'
    }
    """

    from gasp.pyt import obj_to_lst
    from gasp.pyt.oss import fprop
    from gasp.sql.i import cols_name
    from gasp.sql.to import q_to_ntbl
    from gasp.sql.db import create_db

    inSchema["TBL"] = obj_to_lst(inSchema["TBL"])

    # Create DB (named after the OSM file unless db_name is given)
    db = create_db(fprop(osmData, 'fn') if not db_name else db_name,
                   api='psql')

    # Send OSM data to Database
    # NOTE(review): osm_to_psql is assumed to be imported at module
    # level - confirm
    osm_to_psql(osmData, db)

    # Get KEYS COLUMNS - per geometry table, the columns that hold OSM
    # tag values (everything not listed in NOT_KEYS)
    transcols = {}
    for tbl in inSchema["TBL"]:
        transcols[tbl] = [c for c in cols_name(
            db, tbl, sanitizeSpecialWords=None
        ) if c not in inSchema["NOT_KEYS"]]

    # Create osmGeoTbl - one geometry table per input table, keeping
    # only FID and the configured COLS
    osmgeotbl = [q_to_ntbl(db, osmGeoTbl[tbl]['TBL'], (
        "SELECT {} AS {}, {} FROM {}"
    ).format(
        inSchema["FID"], osmGeoTbl[tbl]["FID"],
        ", ".join(inSchema["COLS"]), tbl
    ), api='psql') for tbl in inSchema["TBL"]]

    # Create OSM categories table - one (key, value) row per distinct
    # tag value found in any key column of any table
    qs = []
    for tbl in inSchema["TBL"]:
        qs.extend([(
            "SELECT '{keyV}' AS {keyC}, CAST({t}.{keyV} AS text) AS {valC} "
            "FROM {t} WHERE {t}.{keyV} IS NOT NULL "
            "GROUP BY {t}.{keyV}"
        ).format(
            keyV=c, t=tbl, keyC=osmCatTbl["KEY_COL"],
            valC=osmCatTbl["VAL_COL"]
        ) for c in transcols[tbl]])

    # With several source tables, deduplicate (key, value) pairs before
    # numbering them with row_number()
    osmcatbl = q_to_ntbl(db, osmCatTbl["TBL"], (
        "SELECT row_number() OVER(ORDER BY {keyC}) "
        "AS {osmcatid}, {keyC}, {valC}{ocols} "
        "FROM ({q}) AS foo"
    ).format(
        q="SELECT {k}, {v} FROM ({t}) AS kvtbl GROUP BY {k}, {v}".format(
            k=osmCatTbl["KEY_COL"], v=osmCatTbl["VAL_COL"],
            t=" UNION ALL ".join(qs),
        ) if len(inSchema["TBL"]) > 1 else " UNION ALL ".join(qs),
        keyC=osmCatTbl["KEY_COL"],
        osmcatid=osmCatTbl["FID"],
        valC=osmCatTbl["VAL_COL"],
        ocols="" if "COLS" not in osmCatTbl else ", {}".format(
            ", ".join(osmCatTbl["COLS"]))
    ), api='psql')

    # Create relation table - N:M link between geometries and categories,
    # resolved by joining on (key, value)
    osmreltbl = []
    for tbl in inSchema["TBL"]:
        qs = [(
            "SELECT {fid}, '{keyV}' AS key, CAST({t}.{keyV} AS text) AS osmval "
            "FROM {t} WHERE {t}.{keyV} IS NOT NULL"
        ).format(fid=inSchema["FID"], keyV=c, t=tbl) for c in transcols[tbl]]

        osmreltbl.append(q_to_ntbl(db, osmRelTbl[tbl]["TBL"], (
            "SELECT foo.{fid} AS {nfid}, catbl.{osmcatfid} "
            "FROM ({mtbl}) AS foo INNER JOIN {catTbl} AS catbl "
            "ON foo.key = catbl.{catkey} AND foo.osmval = catbl.{catval}"
        ).format(
            mtbl=" UNION ALL ".join(qs), fid=inSchema["FID"],
            nfid=osmRelTbl[tbl]["FID"], catTbl=osmCatTbl["TBL"],
            osmcatfid=osmCatTbl["FID"], catkey=osmCatTbl["KEY_COL"],
            catval=osmCatTbl["VAL_COL"]
        ), api='psql'))

    if not outSQL:
        return osmgeotbl, osmcatbl, osmreltbl
    else:
        from gasp.sql.fm import dump_tbls

        return dump_tbls(db, osmgeotbl + [osmcatbl] + osmreltbl, outSQL)