Beispiel #1
0
def get_stop_words(inTbl, fidCol, txtCol, outFile,
                   lang='portuguese', inSheet=None, db=None):
    """
    Pick a text column and save it in a new column only with the stop words.
    
    Uses PostgreSQL dictionaries to get stop words
    """
    
    from glass.pys.oss     import fprop
    from glass.ng.prop.sql    import cols_name
    from glass.ng.sql.db      import create_db
    from glass.ng.it.sql import tbl_to_db
    from glass.ng.it       import db_to_tbl
    
    FILENAME = fprop(inTbl, 'fn')
    
    # Create Temp database
    db = create_db("db_" + FILENAME if not db else db)
    
    # Send table to PostgreSQL
    tbl = tbl_to_db(inTbl, db, FILENAME, sheet=inSheet, api_db='psql')
    
    cols = cols_name(db, tbl, sanitizeSpecialWords=None, api='psql')
    
    # Sanitize data  and create a new column only with stop words
    Q1 = (
        "(SELECT *, to_tsvector('{_lang}', regexp_replace("
            "regexp_replace(lower(unaccent({txt_c})), 'http://[^:\s]+(\S+)', "
            "' ', 'g'), '[^\w]+', ' ', 'g')) "
        "AS txt_data FROM {t}) AS stop_table"
    ).format(_lang=lang, txt_c=txtCol, t=tbl)
    
    Q2 = (
        "SELECT {selCols}, ARRAY_TO_STRING(array_agg("
            "word ORDER BY word_index), ' ', '*') AS {outCol}, "
        "REPLACE(CAST(STRIP("
            "stop_table.txt_data) AS text), '''', '') AS no_duplicated "
        "FROM ("
            "SELECT fid, word, CAST(UNNEST(word_index) AS integer) "
            "AS word_index FROM ("
                "SELECT fid, SPLIT_PART(tst, ';', 1) AS word, "
                "STRING_TO_ARRAY(SPLIT_PART(tst, ';', 2), ',') AS word_index FROM ("
                    "SELECT {fid} AS fid, REPLACE(REPLACE(REPLACE(REPLACE(REPLACE("
                        "CAST(UNNEST(txt_data) AS text), "
                            "',{{', ',\"{{'), ',\"{{', ';'), '}}\"', ''), "
                            "'(', ''), '}}', '') AS tst "
                    "FROM {tbl}"
                ") AS foo"
            ") AS foo2"
        ") AS foo3 INNER JOIN {tbl} ON foo3.fid = stop_table.{fid} "
        "GROUP BY {selCols}, stop_table.txt_data"
    ).format(
        outCol="clean_" + txtCol, tbl=Q1, fid=fidCol,
        selCols=", ".join(["stop_table.{}".format(i) for i in cols])
    )
    
    # Export new table
    return db_to_tbl(db, Q2, outFile, sheetsNames=inSheet)
Beispiel #2
0
def matrix_od_mean_dist_by_group(MATRIX_OD, ORIGIN_COL, GROUP_ORIGIN_ID,
                                 GROUP_ORIGIN_NAME, GROUP_DESTINA_ID,
                                 GROUP_DESTINA_NAME, TIME_COL, epsg, db,
                                 RESULT_MATRIX):
    """
    Calculate Mean GROUP distance from OD Matrix
    
    OD MATRIX EXAMPLE
    | origin_entity | origin_group | destina_entity | destina_group | distance
    |     XXXX      |     XXXX     |      XXXX      |      XXX      |   XXX
    
    OUTPUT EXAMPLE
    | origin_group | destina_group | mean_distance
    |     XXXX     |      XXXX     |      XXXX
    """

    from glass.pys.oss import fprop
    from glass.g.it.db import shp_to_psql
    from glass.ng.sql.db import create_db
    from glass.ng.sql.q import q_to_ntbl
    from glass.ng.it import db_to_tbl

    db = create_db(fprop(MATRIX_OD, 'fn'), overwrite=True, api='psql')

    TABLE = shp_to_psql(db,
                        MATRIX_OD,
                        pgTable="tbl_{}".format(db),
                        api="pandas",
                        srsEpsgCode=epsg)

    OUT_TABLE = q_to_ntbl(
        db,
        fprop(RESULT_MATRIX, 'fn'),
        ("SELECT {groupOriginCod}, {groupOriginName}, {groupDestCod}, "
         "{groupDestName}, AVG(mean_time) AS mean_time FROM ("
         "SELECT {origin}, {groupOriginCod}, {groupOriginName}, "
         "{groupDestCod}, {groupDestName}, "
         "AVG({timeCol}) AS mean_time FROM {t} "
         "GROUP BY {origin}, {groupOriginCod}, {groupOriginName}, "
         "{groupDestCod}, {groupDestName}"
         ") AS foo "
         "GROUP BY {groupOriginCod}, {groupOriginName}, "
         "{groupDestCod}, {groupDestName} "
         "ORDER BY {groupOriginCod}, {groupDestCod}").format(
             groupOriginCod=GROUP_ORIGIN_ID,
             groupOriginName=GROUP_ORIGIN_NAME,
             groupDestCod=GROUP_DESTINA_ID,
             groupDestName=GROUP_DESTINA_NAME,
             origin=ORIGIN_COL,
             timeCol=TIME_COL,
             t=TABLE),
        api='psql')

    return db_to_tbl(db,
                     "SELECT * FROM {}".format(OUT_TABLE),
                     RESULT_MATRIX,
                     sheetsNames="matrix",
                     dbAPI='psql')
Beispiel #3
0
def tbl_to_areamtx(inShp, col_a, col_b, outXls, db=None, with_metrics=None):
    """
    Table to Matrix
    
    Table as:
        FID | col_a | col_b | geom
    0 |  1  |   A   |   A   | ....
    0 |  2  |   A   |   B   | ....
    0 |  3  |   A   |   A   | ....
    0 |  4  |   A   |   C   | ....
    0 |  5  |   A   |   B   | ....
    0 |  6  |   B   |   A   | ....
    0 |  7  |   B   |   A   | ....
    0 |  8  |   B   |   B   | ....
    0 |  9  |   B   |   B   | ....
    0 | 10  |   C   |   A   | ....
    0 | 11  |   C   |   B   | ....
    0 | 11  |   C   |   D   | ....
    
    To:
    classe | A | B | C | D
       A   |   |   |   | 
       B   |   |   |   |
       C   |   |   |   |
       D   |   |   |   |
    
    col_a = rows
    col_b = cols

    api options:
    * pandas;
    * psql;
    """

    # TODO: check if col_a and col_b exists in table

    if not db:
        import pandas as pd
        import numpy as np
        from glass.g.rd.shp import shp_to_obj
        from glass.ng.wt    import obj_to_tbl
    
        # Open data
        df = shp_to_obj(inShp)

        # Remove nan values
        df = df[pd.notnull(df[col_a])]
        df = df[pd.notnull(df[col_b])]
    
        # Get Area
        df['realarea'] = df.geometry.area / 1000000
    
        # Get rows and Cols
        rows = df[col_a].unique()
        cols = df[col_b].unique()
        refval = list(np.sort(np.unique(np.append(rows, cols))))
    
        # Produce matrix
        outDf = []
        for row in refval:
            newCols = [row]
            for col in refval:
                newDf = df[(df[col_a] == row) & (df[col_b] == col)]

                if not newDf.shape[0]:
                    newCols.append(0)
                
                else:
                    area = newDf.realarea.sum()
            
                    newCols.append(area)
        
            outDf.append(newCols)
    
        outcols = ['class'] + refval
        outDf = pd.DataFrame(outDf, columns=outcols)

        if with_metrics:
            from glass.ng.cls.eval import get_measures_for_mtx

            out_df = get_measures_for_mtx(outDf, 'class')

            return obj_to_tbl(out_df, outXls)
    
        # Export to Excel
        return obj_to_tbl(outDf, outXls)
    
    else:
        from glass.pys.oss        import fprop
        from glass.ng.sql.db      import create_db
        from glass.ng.prop.sql    import db_exists
        from glass.g.it.db       import shp_to_psql
        from glass.g.dp.tomtx.sql import tbl_to_area_mtx
        from glass.ng.it          import db_to_tbl

        # Create database if not exists
        is_db = db_exists(db)

        if not is_db:
            create_db(db, api='psql')

        # Add data to database
        tbl = shp_to_psql(db, inShp, api='shp2pgsql')

        # Create matrix
        mtx = tbl_to_area_mtx(db, tbl, col_a, col_b, fprop(outXls, 'fn'))

        # Export result
        return db_to_tbl(db, mtx, outXls, sheetsNames='matrix')
Beispiel #4
0
def check_shape_diff(SHAPES_TO_COMPARE, OUT_FOLDER, REPORT, DB,
                     GRASS_REGION_TEMPLATE):
    """
    Script to check differences between pairs of Feature Classes
    
    Suponha que temos diversas Feature Classes (FC) e que cada uma delas
    possui um determinado atributo; imagine tambem que,
    considerando todos os pares possiveis entre estas FC,
    se pretende comparar as diferencas na distribuicao dos valores
    desse atributo para cada par.
    
    * Dependencias:
    - GRASS;
    - PostgreSQL;
    - PostGIS.
    """

    import datetime
    import os
    import pandas
    from glass.ng.sql.q import q_to_obj
    from glass.ng.it import db_to_tbl
    from glass.g.wt.sql import df_to_db
    from glass.g.dp.rst.toshp import rst_to_polyg
    from glass.g.it.db import shp_to_psql
    from glass.g.dp.tomtx import tbl_to_area_mtx
    from glass.g.prop import check_isRaster
    from glass.pys.oss import fprop
    from glass.ng.sql.db import create_db
    from glass.ng.sql.tbl import tbls_to_tbl
    from glass.ng.sql.q import q_to_ntbl

    # Check if folder exists, if not create it
    if not os.path.exists(OUT_FOLDER):
        from glass.pys.oss import mkdir
        mkdir(OUT_FOLDER, overwrite=None)
    else:
        raise ValueError('{} already exists!'.format(OUT_FOLDER))

    from glass.g.wenv.grs import run_grass

    gbase = run_grass(OUT_FOLDER,
                      grassBIN='grass78',
                      location='shpdif',
                      srs=GRASS_REGION_TEMPLATE)

    import grass.script as grass
    import grass.script.setup as gsetup

    gsetup.init(gbase, OUT_FOLDER, 'shpdif', 'PERMANENT')

    from glass.g.it.shp import shp_to_grs, grs_to_shp
    from glass.g.it.rst import rst_to_grs
    from glass.g.tbl.col import rn_cols

    # Convert to SHAPE if file is Raster
    i = 0
    _SHP_TO_COMPARE = {}
    for s in SHAPES_TO_COMPARE:
        isRaster = check_isRaster(s)

        if isRaster:
            # To GRASS
            rstName = fprop(s, 'fn')
            inRst = rst_to_grs(s, "rst_" + rstName, as_cmd=True)
            # To Vector
            d = rst_to_polyg(inRst,
                             rstName,
                             rstColumn="lulc_{}".format(i),
                             gisApi="grass")

            # Export Shapefile
            shp = grs_to_shp(d, os.path.join(OUT_FOLDER, d + '.shp'), "area")

            _SHP_TO_COMPARE[shp] = "lulc_{}".format(i)

        else:
            # To GRASS
            grsV = shp_to_grs(s, fprop(s, 'fn'), asCMD=True)

            # Change name of column with comparing value
            ncol = "lulc_{}".format(str(i))
            rn_cols(grsV, {SHAPES_TO_COMPARE[s]: "lulc_{}".format(str(i))},
                    api="grass")

            # Export
            shp = grs_to_shp(grsV, os.path.join(OUT_FOLDER, grsV + '_rn.shp'),
                             "area")

            _SHP_TO_COMPARE[shp] = "lulc_{}".format(str(i))

        i += 1

    SHAPES_TO_COMPARE = _SHP_TO_COMPARE
    __SHAPES_TO_COMPARE = SHAPES_TO_COMPARE

    # Create database
    create_db(DB, api='psql')
    """ Union SHAPEs """

    UNION_SHAPE = {}
    FIX_GEOM = {}

    SHPS = list(__SHAPES_TO_COMPARE.keys())
    for i in range(len(SHPS)):
        for e in range(i + 1, len(SHPS)):
            # Optimized Union
            print("Union between {} and {}".format(SHPS[i], SHPS[e]))
            time_a = datetime.datetime.now().replace(microsecond=0)
            __unShp = optimized_union_anls(
                SHPS[i],
                SHPS[e],
                os.path.join(OUT_FOLDER, "un_{}_{}.shp".format(i, e)),
                GRASS_REGION_TEMPLATE,
                os.path.join(OUT_FOLDER, "work_{}_{}".format(i, e)),
                multiProcess=True)
            time_b = datetime.datetime.now().replace(microsecond=0)
            print(time_b - time_a)

            # Rename cols
            unShp = rn_cols(
                __unShp, {
                    "a_" + __SHAPES_TO_COMPARE[SHPS[i]]:
                    __SHAPES_TO_COMPARE[SHPS[i]],
                    "b_" + __SHAPES_TO_COMPARE[SHPS[e]]:
                    __SHAPES_TO_COMPARE[SHPS[e]]
                })

            UNION_SHAPE[(SHPS[i], SHPS[e])] = unShp

    # Send data to postgresql
    SYNTH_TBL = {}

    for uShp in UNION_SHAPE:
        # Send data to PostgreSQL
        union_tbl = shp_to_psql(DB, UNION_SHAPE[uShp], api='shp2pgsql')

        # Produce table with % of area equal in both maps
        areaMapTbl = q_to_ntbl(
            DB,
            "{}_syn".format(union_tbl),
            ("SELECT CAST('{lulc_1}' AS text) AS lulc_1, "
             "CAST('{lulc_2}' AS text) AS lulc_2, "
             "round("
             "CAST(SUM(g_area) / 1000000 AS numeric), 4"
             ") AS agree_area, round("
             "CAST((SUM(g_area) / MIN(total_area)) * 100 AS numeric), 4"
             ") AS agree_percentage, "
             "round("
             "CAST(MIN(total_area) / 1000000 AS numeric), 4"
             ") AS total_area FROM ("
             "SELECT {map1_cls}, {map2_cls}, ST_Area(geom) AS g_area, "
             "CASE "
             "WHEN {map1_cls} = {map2_cls} "
             "THEN 1 ELSE 0 "
             "END AS isthesame, total_area FROM {tbl}, ("
             "SELECT SUM(ST_Area(geom)) AS total_area FROM {tbl}"
             ") AS foo2"
             ") AS foo WHERE isthesame = 1 "
             "GROUP BY isthesame").format(
                 lulc_1=fprop(uShp[0], 'fn'),
                 lulc_2=fprop(uShp[1], 'fn'),
                 map1_cls=__SHAPES_TO_COMPARE[uShp[0]],
                 map2_cls=__SHAPES_TO_COMPARE[uShp[1]],
                 tbl=union_tbl),
            api='psql')

        # Produce confusion matrix for the pair in comparison
        matrixTbl = tbl_to_area_mtx(DB, union_tbl,
                                    __SHAPES_TO_COMPARE[uShp[0]],
                                    __SHAPES_TO_COMPARE[uShp[1]],
                                    union_tbl + '_mtx')

        SYNTH_TBL[uShp] = {"TOTAL": areaMapTbl, "MATRIX": matrixTbl}

    # UNION ALL TOTAL TABLES
    total_table = tbls_to_tbl(DB, [SYNTH_TBL[k]["TOTAL"] for k in SYNTH_TBL],
                              'total_table')

    # Create table with % of agreement between each pair of maps
    mapsNames = q_to_obj(
        DB,
        ("SELECT lulc FROM ("
         "SELECT lulc_1 AS lulc FROM {tbl} GROUP BY lulc_1 "
         "UNION ALL "
         "SELECT lulc_2 AS lulc FROM {tbl} GROUP BY lulc_2"
         ") AS lu GROUP BY lulc ORDER BY lulc").format(tbl=total_table),
        db_api='psql').lulc.tolist()

    FLDS_TO_PIVOT = ["agree_percentage", "total_area"]

    Q = ("SELECT * FROM crosstab('"
         "SELECT CASE "
         "WHEN foo.lulc_1 IS NOT NULL THEN foo.lulc_1 ELSE jtbl.tmp1 "
         "END AS lulc_1, CASE "
         "WHEN foo.lulc_2 IS NOT NULL THEN foo.lulc_2 ELSE jtbl.tmp2 "
         "END AS lulc_2, CASE "
         "WHEN foo.{valCol} IS NOT NULL THEN foo.{valCol} ELSE 0 "
         "END AS agree_percentage FROM ("
         "SELECT lulc_1, lulc_2, {valCol} FROM {tbl} UNION ALL "
         "SELECT lulc_1, lulc_2, {valCol} FROM ("
         "SELECT lulc_1 AS lulc_2, lulc_2 AS lulc_1, {valCol} "
         "FROM {tbl}"
         ") AS tst"
         ") AS foo FULL JOIN ("
         "SELECT lulc_1 AS tmp1, lulc_2 AS tmp2 FROM ("
         "SELECT lulc_1 AS lulc_1 FROM {tbl} GROUP BY lulc_1 "
         "UNION ALL "
         "SELECT lulc_2 AS lulc_1 FROM {tbl} GROUP BY lulc_2"
         ") AS tst_1, ("
         "SELECT lulc_1 AS lulc_2 FROM {tbl} GROUP BY lulc_1 "
         "UNION ALL "
         "SELECT lulc_2 AS lulc_2 FROM {tbl} GROUP BY lulc_2"
         ") AS tst_2 WHERE lulc_1 = lulc_2 GROUP BY lulc_1, lulc_2"
         ") AS jtbl ON foo.lulc_1 = jtbl.tmp1 AND foo.lulc_2 = jtbl.tmp2 "
         "ORDER BY lulc_1, lulc_2"
         "') AS ct("
         "lulc_map text, {crossCols}"
         ")")

    TOTAL_AGREE_TABLE = None
    TOTAL_AREA_TABLE = None
    for f in FLDS_TO_PIVOT:
        if not TOTAL_AGREE_TABLE:
            TOTAL_AGREE_TABLE = q_to_ntbl(
                DB,
                "agreement_table",
                Q.format(tbl=total_table,
                         valCol=f,
                         crossCols=", ".join([
                             "{} numeric".format(map_) for map_ in mapsNames
                         ])),
                api='psql')

        else:
            TOTAL_AREA_TABLE = q_to_ntbl(DB,
                                         "area_table",
                                         Q.format(tbl=total_table,
                                                  valCol=f,
                                                  crossCols=", ".join([
                                                      "{} numeric".format(map_)
                                                      for map_ in mapsNames
                                                  ])),
                                         api='psql')

    # Union Mapping
    UNION_MAPPING = pandas.DataFrame(
        [[k[0], k[1], fprop(UNION_SHAPE[k], 'fn')] for k in UNION_SHAPE],
        columns=['shp_a', 'shp_b', 'union_shp'])

    UNION_MAPPING = df_to_db(DB, UNION_MAPPING, 'union_map', api='psql')

    # Export Results
    TABLES = [UNION_MAPPING, TOTAL_AGREE_TABLE, TOTAL_AREA_TABLE
              ] + [SYNTH_TBL[x]["MATRIX"] for x in SYNTH_TBL]

    SHEETS = ["union_map", "agreement_percentage", "area_with_data_km"] + [
        "{}_{}".format(fprop(x[0], 'fn')[:15],
                       fprop(x[1], 'fn')[:15]) for x in SYNTH_TBL
    ]

    db_to_tbl(DB, ["SELECT * FROM {}".format(x) for x in TABLES],
              REPORT,
              sheetsNames=SHEETS,
              dbAPI='psql')

    return REPORT
Beispiel #5
0
def get_rows_related_with_event(db, tblSchema, words, resultTbl,
                                startTime=None, endTime=None):
    """
    Take a table of a database and see if the text in one column
    is related with some event. The relation between rows and event will
    be true when a set of words given as input exists in a row.
    
    tblSchema = {
        "TNAME"   : "facedata",
        "TEXTCOL" : "message",
        "TIMECOL" : "datahora",
        "SELCOL"  : [post_id, type],
        "TEXTCASE" : (
            "CASE WHEN type = 'link' THEN lower(unaccent(description)) "
            "ELSE lower(unaccent(message)) END"
        )
    }
    
    NOTE: only works for PostgreSQL
    """
    
    from glass.pys   import obj_to_lst
    from glass.ng.it import db_to_tbl
    
    if "TNAME" not in tblSchema or "TEXTCOL" not in tblSchema:
        raise ValueError((
            "tblSchema input should be a dict with at least TNAME and TEXTCOL "
            "keys. The value of the first should be a table name; "
            "the value of the second shoulbe the name of a column with text "
            "to classify!"
        ))
    
    # Words to list
    words = obj_to_lst(words)
    
    cols = None if "SELCOL" not in tblSchema else obj_to_lst(tblSchema["SELCOL"])
    
    like_words = " OR ".join(["{} LIKE '%%{}%%'".format(
        tblSchema["TEXTCOL"], word) for word in words])
    
    time_where = "" if "TIMECOL" not in tblSchema or not startTime \
        or not endTime else (
            " {w} TO_TIMESTAMP({c}, 'YYYY-MM-DD HH24:MI:SS') > "
            "TO_TIMESTAMP('{l}', 'YYYY-MM-DD HH24:MI:SS') AND "
            "TO_TIMESTAMP({c}, 'YYYY-MM-DD HH24:MI:SS') < "
            "TO_TIMESTAMP('{h}', 'YYYY-MM-DD HH24:MI:SS')"
        ).format(
            w="WHERE" if "TEXTCASE" in tblSchema else "AND",
            c=tblSchema["TIMECOL"], l=startTime, h=endTime
        )
    
    Q = (
        "SELECT * FROM ("
            "SELECT {selCols} FROM {tbl}{timeWhr}"
        ") AS foo WHERE {wordsWhr}"
        ).format(
            tbl=tblSchema["TNAME"], wordsWhr=like_words,
            selCols="{} AS {}".format(
                tblSchema["TEXTCASE"], tblSchema["TEXTCOL"]
            ) if not cols else "{}, {} AS {}".format(
                ", ".join(cols), tblSchema["TEXTCASE"], tblSchema["TEXTCOL"]
            ),
            timeWhr=time_where
        ) if "TEXTCASE" in tblSchema else (
            "SELECT {selCols} FROM {tbl} WHERE ({wordsWhr}){timeWhr}"
        ).format(
            selCols=", ".join(cols + [tblSchema["TEXTCOL"]]),
            tbl=tblSchema["TNAME"], wordsWhr=like_words,
            timeWhr=time_where
            
        )
    
    return db_to_tbl(db, Q, resultTbl)