Code example #1
File: tw.py Project: jasp382/glass
def tweets_to_xls(outxls,
                  searchword=None,
                  searchGeom=None,
                  srs=None,
                  lng='pt',
                  NTW=1000,
                  twType='mixed',
                  Key=None):
    """
    Search for Tweets and Export them to XLS
    """

    from glass.ng.wt import obj_to_tbl

    # search_tweets is assumed to be defined/imported at module level in the
    # original tw.py (not shown in this snippet)
    data = search_tweets(keyword=searchword,
                         in_geom=searchGeom,
                         epsg=srs,
                         __lang=lng,
                         NR_ITEMS=NTW,
                         resultType=twType,
                         key=Key)

    # search_tweets returns 0/None when nothing is found; when it returns a
    # DataFrame, its truth value raises an exception, hence the guard
    try:
        if not data:
            return 0
    except:
        pass

    obj_to_tbl(data, outxls, sheetsName='twitter')

    return outxls
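
A minimal usage sketch for the function above, assuming it is in scope; the
output path, keyword, and API key are illustrative placeholders, not values
from the source.

# Hypothetical call: fetch up to 500 Portuguese tweets mentioning "porto"
# and write them to an XLSX file (Key is a placeholder, not a real credential)
out = tweets_to_xls('/tmp/porto_tweets.xlsx',
                    searchword='porto',
                    lng='pt',
                    NTW=500,
                    Key='YOUR-API-KEY')

if not out:
    print('No tweets matched the search')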
Code example #2
File: mean.py Project: jasp382/glass
    # Nested helper: relies on psqldb, pgtable, INTERVALS, EXCLUDE_DAYS and
    # the *_FIELD/workspace names from the enclosing scope (not shown here)
    def get_day_table(day):
        print('Starting: ' + day)

        if EXCLUDE_DAYS:
            if day in EXCLUDE_DAYS:
                print('Ending: ' + day)
                return 0

        COUNTING = []
        for __int in INTERVALS:
            start, end = __int
            COUNT_FIELD = 'p{}h{}_{}h{}'.format(str(start[0]), str(start[1]),
                                                str(end[0]), str(end[1]))

            if COUNT_FIELD not in INTERVAL_COLUMNS:
                INTERVAL_COLUMNS.append(COUNT_FIELD)

            countTbl = count_by_period_entity(psqldb, start, end, pgtable,
                                              DAY_FIELD, day, HOUR_FIELD,
                                              MINUTES_FIELD, ENTITY_FIELD)
            COUNTING.append(countTbl)

        main_table = COUNTING[0]
        for i in range(1, len(COUNTING)):
            main_table = combine_dfs(main_table, COUNTING[i], ENTITY_FIELD)

        if workspace_day_tables:
            obj_to_tbl(main_table,
                       os.path.join(workspace_day_tables,
                                    'ti_{}.xlsx'.format(day)))

        return main_table
Code example #3
File: count.py Project: jasp382/glass
def count_entity_periods_with_certain_duration(db,
                                               PERIOD_INTERVAL,
                                               PGTABLE,
                                               TIME_FIELD,
                                               ENTITY_FIELD,
                                               OUT_TABLE,
                                               filterWhere=None):
    """
    Count rows in a pgtable for a given period of X minutes for each
    interest entity
    
    PERIOD_INTERVAL = "01:00:00"
    """

    from glass.pys.tm import day_to_intervals2
    from glass.ng.pd.joins import combine_dfs
    from glass.ng.sql.q import q_to_obj
    from glass.ng.wt import obj_to_tbl

    # Get Intervals
    INTERVALS = day_to_intervals2(PERIOD_INTERVAL)

    # For each interval/period, count the number of rows by entity
    counting = []
    for _int in INTERVALS:
        Q = ("SELECT {entityCol}, COUNT({entityCol}) AS {cntCol} "
             "FROM {table} WHERE "
             "TO_TIMESTAMP({timeCol}, 'HH24:MI:SS') >= "
             "TO_TIMESTAMP('{minLower}', 'HH24:MI:SS') AND "
             "TO_TIMESTAMP({timeCol}, 'HH24:MI:SS') < "
             "TO_TIMESTAMP('{minUpper}', 'HH24:MI:SS'){whr} "
             "GROUP BY {entityCol}").format(cntCol="s{}_e{}".format(
                 _int[0][:5], _int[1][:5]).replace(":", "_"),
                                            table=PGTABLE,
                                            timeCol=TIME_FIELD,
                                            entityCol=ENTITY_FIELD,
                                            minLower=_int[0],
                                            minUpper=_int[1],
                                            whr="" if not filterWhere else
                                            " AND ({}) ".format(filterWhere))

        count = q_to_obj(db, Q, db_api='psql')

        counting.append(count)

    mainDf = combine_dfs(counting[0], counting[1:], ENTITY_FIELD)

    obj_to_tbl(mainDf, OUT_TABLE)

    return OUT_TABLE
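
A hedged sketch of how this counter might be invoked; the connection
parameters, table, and column names below are assumptions for illustration,
not values from the source.

# Hypothetical invocation: hourly counts of validation records per bus stop
con = {'HOST': 'localhost', 'PORT': '5432', 'USER': 'postgres',
       'PASSWORD': 'secret', 'DATABASE': 'transitdb'}  # placeholder credentials

count_entity_periods_with_certain_duration(
    con, "01:00:00", 'validations', 'val_time', 'stop_id',
    '/tmp/counts_by_stop.xlsx',
    filterWhere="route_id = '12E'")  # optional extra WHERE condition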
Code example #4
File: duplicate.py Project: jasp382/glass
def show_duplicates_in_xls(db_name, table, pkCols, outFile,
                           tableIsQuery=None):
    """
    Find duplicates and write these objects in a table
    """
    
    from glass.pys      import obj_to_lst
    from glass.ng.sql.q import q_to_obj
    from glass.ng.wt    import obj_to_tbl
    
    pkCols = obj_to_lst(pkCols)
    
    if not pkCols:
        raise ValueError("pkCols value is not valid")
    
    if not tableIsQuery:
        q = (
            "SELECT {t}.* FROM {t} INNER JOIN ("
                "SELECT {cls}, COUNT({cnt}) AS conta FROM {t} "
                "GROUP BY {cls}"
            ") AS foo ON {rel} "
            "WHERE conta > 1"
        ).format(
            t=table, cls=", ".join(pkCols), cnt=pkCols[0],
            rel=" AND ".join([
                "{t}.{c} = foo.{c}".format(t=table, c=col) for col in pkCols
            ])
        )
    
    else:
        q = (
            "SELECT foo.* FROM ({q_}) AS foo INNER JOIN ("
                "SELECT {cls}, COUNT({cnt}) AS conta "
                "FROM ({q_}) AS foo2 GROUP BY {cls}"
            ") AS jt ON {rel} "
            "WHERE conta > 1" 
        ).format(
            q_=table, cls=", ".join(pkCols), cnt=pkCols[0],
            rel=" AND ".join([
                "foo.{c} = jt.{c}".format(c=x) for x in pkCols
            ])
        )
    
    data = q_to_obj(db_name, q, db_api='psql')
    
    obj_to_tbl(data, outFile)
    
    return outFile
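
A short usage sketch; the database, table, and key columns are hypothetical.

# Hypothetical call: export rows of 'sensors' that repeat the same
# (day, station) combination
show_duplicates_in_xls('mydb', 'sensors', ['day', 'station'],
                       '/tmp/duplicated_rows.xlsx')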
Code example #5
File: count.py Project: jasp382/glass
def count_by_periods_with_certain_duration(db,
                                           PERIOD_INTERVAL,
                                           pgtable,
                                           TIME_FIELD,
                                           outTable,
                                           filterWhere=None):
    """
    Count rows in a pgtable by periods of X minutes
    
    PERIOD_INTERVAL = "01:00:00"
    """

    import pandas
    from glass.pys.tm import day_to_intervals2
    from glass.ng.sql.q import q_to_obj
    from glass.ng.wt import obj_to_tbl

    # Get Intervals
    INTERVALS = day_to_intervals2(PERIOD_INTERVAL)

    # For each interval/period, count the number of rows
    counting = None
    for _int_ in INTERVALS:
        QUERY = ("SELECT COUNT(*) AS count FROM {table} WHERE "
                 "TO_TIMESTAMP({timeCol}, 'HH24:MI:SS') >= "
                 "TO_TIMESTAMP('{minLower}', 'HH24:MI:SS') AND "
                 "TO_TIMESTAMP({timeCol}, 'HH24:MI:SS') < "
                 "TO_TIMESTAMP('{minUpper}', 'HH24:MI:SS'){whr}").format(
                     table=pgtable,
                     timeCol=TIME_FIELD,
                     minLower=_int_[0],
                     minUpper=_int_[1],
                     whr=""
                     if not filterWhere else " AND ({})".format(filterWhere))

        count = q_to_obj(db, QUERY, db_api='psql')

        count.rename(index={0: "{}-{}".format(_int_[0][:5], _int_[1][:5])},
                     inplace=True)

        if not isinstance(counting, pandas.DataFrame):
            counting = count.copy()

        else:
            # DataFrame.append was removed in pandas 2.0; concat is equivalent
            counting = pandas.concat([counting, count], ignore_index=False)

    obj_to_tbl(counting, outTable)

    return outTable
Code example #6
File: joins.py Project: jasp382/glass
def field_sum_two_tables(tableOne, tableTwo, joinFieldOne, joinFieldTwo,
                         field_to_sum, outTable):
    """
    Sum same field in different tables
    
    Table 1:
    id | field
    0 |  10
    1 |  11
    2 |  13
    3 |  10
    
    Table 2:
    id | field
    0 |  10
    1 |   9
    2 |  17
    4 |  15
    
    Output table:
    id | field
    0 |  20
    1 |  20
    2 |  30
    3 |  10
    4 |  15
    """

    from glass.ng.rd import tbl_to_obj
    from glass.ng.wt import obj_to_tbl
    from glass.ng.pd.joins import sum_field_of_two_tables

    # Open two tables
    df_one = tbl_to_obj(tableOne)
    df_two = tbl_to_obj(tableTwo)

    # Do it!
    outDf = sum_field_of_two_tables(df_one, joinFieldOne, df_two, joinFieldTwo,
                                    field_to_sum)

    obj_to_tbl(outDf, outTable)

    return outTable
Code example #7
File: tbl.py Project: jasp382/glass
def merge_tbls(folder, out_tbl, tbl_format='.dbf'):
    """
    Merge all tables in folder into one single table
    """

    from glass.pys.oss import lst_ff
    from glass.ng.rd import tbl_to_obj
    from glass.ng.wt import obj_to_tbl
    from glass.ng.pd import merge_df

    tbls = lst_ff(folder, file_format=tbl_format)

    tbls_dfs = [tbl_to_obj(t) for t in tbls]

    out_df = merge_df(tbls_dfs)

    obj_to_tbl(out_df, out_tbl)

    return out_tbl
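
Usage is a single call; the folder and output path below are placeholders.

# Hypothetical call: concatenate every .dbf table in a folder into one file
merge_tbls('/data/tables', '/data/all_tables.xlsx', tbl_format='.dbf')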
Code example #8
def record_time_consumed(timeData, outXls):
    """
    Record the time consumed by an OSM2LULC procedure version
    in an Excel table
    """

    import pandas
    from glass.ng.wt import obj_to_tbl

    # Produce main table - time consumed by rule
    # (timeData is assumed to be keyed by consecutive integers 0..n-1)
    main = [{
        'rule': timeData[i][0],
        'time': timeData[i][1]
    } for i in range(len(timeData.keys())) if timeData[i]]

    # Produce detailed table - Time consumed inside rules
    timeInsideRule = []
    timeDataKeys = list(timeData.keys())
    timeDataKeys.sort()

    for i in timeDataKeys:
        if not timeData[i]:
            continue

        if len(timeData[i]) == 2:
            timeInsideRule.append({
                'rule': timeData[i][0],
                'task': timeData[i][0],
                'time': timeData[i][1]
            })

        elif len(timeData[i]) == 3:
            taskKeys = list(timeData[i][2].keys())
            taskKeys.sort()
            for task in taskKeys:
                if not timeData[i][2][task]:
                    continue

                timeInsideRule.append({
                    'rule': timeData[i][0],
                    'task': timeData[i][2][task][0],
                    'time': timeData[i][2][task][1]
                })

        else:
            print('timeData object with key {} is not valid'.format(i))

    # Export tables to excel
    dfs = [pandas.DataFrame(main), pandas.DataFrame(timeInsideRule)]

    return obj_to_tbl(dfs, outXls, sheetsName=['general', 'detailed'])
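
A hedged sketch of the timeData structure this function expects, inferred
from how keys and tuples are accessed above; the rule and task names are
invented for illustration.

# Inferred shape of timeData: keys are consecutive integers; values are
# None for skipped rules, 2-tuples for simple rules, and 3-tuples with a
# nested task dict for rules with sub-steps
timeData = {
    0: ('rule_1', 12.3),            # rule with a single timing
    1: ('rule_2', 45.6, {           # rule with per-task timings
        0: ('build_grid', 20.0),
        1: ('overlay', 25.6)
    }),
    2: None                         # rule that did not run
}

record_time_consumed(timeData, '/tmp/osm2lulc_times.xlsx')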
Code example #9
File: eval.py Project: jasp382/glass
def model_conf_matrix(tblFile, refCol, clsCol, outMxt):
    """
    Model Evaluation
    """

    import pandas as pd
    from glass.ng.rd import tbl_to_obj
    from glass.ng.wt import obj_to_tbl
    from sklearn.metrics import confusion_matrix, classification_report

    data = tbl_to_obj(tblFile)

    data[refCol] = data[refCol].astype(str)
    data[clsCol] = data[clsCol].astype(str)

    ref_id = data[[refCol]].drop_duplicates().sort_values(refCol)

    conf_mat = confusion_matrix(data[refCol], data[clsCol])

    mxt = pd.DataFrame(conf_mat,
                       columns=ref_id[refCol].values,
                       index=ref_id[refCol].values)
    mxt.reset_index(inplace=True)
    mxt.rename(columns={'index': 'confusion_mxt'}, inplace=True)

    # Get classification report
    report = classification_report(data[refCol],
                                   data[clsCol],
                                   target_names=ref_id[refCol],
                                   output_dict=True)

    global_keys = ['accuracy', 'macro avg', 'micro avg', 'weighted avg']

    cls_eval = {k: report[k] for k in report if k not in global_keys}
    glb_eval = {k: report[k] for k in report if k in global_keys}

    if 'accuracy' in glb_eval:
        glb_eval['accuracy'] = {
            'f1-score': glb_eval['accuracy'],
            'precision': 0,
            'recall': 0,
            'support': 0
        }

    cls_eval = pd.DataFrame(cls_eval).T
    gbl_eval = pd.DataFrame(glb_eval).T

    return obj_to_tbl([gbl_eval, cls_eval, mxt],
                      outMxt,
                      sheetsName=['global', 'report', 'matrix'])
Code example #10
File: gen.py Project: jasp382/glass
def merge_xls_in_folder(tbl_folder, out_table):
    """
    Get all excel tables in a folder and make one table of them
    """

    import pandas
    from glass.pys.oss import lst_ff
    from glass.ng.rd import tbl_to_obj
    from glass.ng.wt import obj_to_tbl

    tables = lst_ff(tbl_folder, file_format=['.xls', '.xlsx'])

    dfs = [tbl_to_obj(table) for table in tables]

    result = pandas.concat(dfs)

    out_table = obj_to_tbl(result, out_table)

    return out_table
Code example #11
def tbl_to_tbl(inTbl,
               outTbl,
               inSheet=None,
               txtDelimiter=None,
               inTxtDelimiter=None,
               inEncoding='utf-8'):
    """
    Convert data format
    """

    from glass.ng.rd import tbl_to_obj
    from glass.ng.wt import obj_to_tbl

    data = tbl_to_obj(inTbl,
                      sheet=inSheet,
                      encoding_=inEncoding,
                      _delimiter=inTxtDelimiter)

    outTbl = obj_to_tbl(data, outTbl, delimiter=txtDelimiter)

    return outTbl
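
A minimal sketch of both directions of the conversion; paths, sheet index,
and delimiters are illustrative.

# Hypothetical conversion: semicolon-delimited CSV to XLSX
tbl_to_tbl('/data/raw.csv', '/data/raw.xlsx', inTxtDelimiter=';')

# And back: first sheet of the XLSX to a comma-delimited CSV
tbl_to_tbl('/data/raw.xlsx', '/data/again.csv', inSheet=0, txtDelimiter=',')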
Code example #12
def exp_by_group_relfeat(shp, group_col, relfeat, relfeat_id, reltbl,
                         reltbl_sheet, group_fk, relfeat_fk, out_folder,
                         out_tbl):
    """
    Identify groups in shp, get features related with
    these groups and export group features and related
    features to new file
    """

    import os
    import pandas as pd
    from glass.ng.rd import tbl_to_obj
    from glass.ng.wt import obj_to_tbl
    from glass.g.rd.shp import shp_to_obj
    from glass.g.wt.shp import obj_to_shp
    from glass.g.prop.prj import get_shp_epsg

    epsg = get_shp_epsg(shp)

    # Open data
    shp_df = shp_to_obj(shp)
    rel_df = shp_to_obj(relfeat)

    # Get table with relations N-N
    nn_tbl = tbl_to_obj(reltbl, sheet=reltbl_sheet)

    # Relate relfeat with shp groups
    rel_df = rel_df.merge(nn_tbl,
                          how='inner',
                          left_on=relfeat_id,
                          right_on=relfeat_fk)

    # List Groups
    grp_df = pd.DataFrame({
        'cnttemp':
        shp_df.groupby([group_col])[group_col].agg('count')
    }).reset_index()

    ntbls = []
    # Filter and export
    for idx, row in grp_df.iterrows():
        # Get shp_df filter
        new_shp = shp_df[shp_df[group_col] == row[group_col]]

        # Get relfeat filter
        new_relf = rel_df[rel_df[group_fk] == row[group_col]]

        # Export
        shp_i = obj_to_shp(
            new_shp, 'geometry', epsg,
            os.path.join(out_folder, 'lyr_{}.shp'.format(row[group_col])))
        rel_i = obj_to_shp(
            new_relf, 'geometry', epsg,
            os.path.join(out_folder, 'rel_{}.shp'.format(row[group_col])))

        ntbls.append([row[group_col], shp_i, rel_i])

    ntbls = pd.DataFrame(ntbls, columns=['group_id', 'shp_i', 'rel_i'])

    obj_to_tbl(ntbls, out_tbl)

    return out_tbl
Code example #13
File: timedist.py Project: jasp382/glass
def timedist_stopsPairs(db, GTFS_SCHEMA, outfile):
    """
    Use GTFS DB to calculate the mean time between all stops pairs for all
    route_id.
    
    Definition of a stop pair:
    For a route with 10 stops, the time distance will be estimated
    for the following pairs: 1|2; 2|3; 3|4; 4|5; 5|6; 6|7; 7|8; 8|9; 9|10.
    So, the time distance will not be calculated for all possible combinations
    of bus stops.
    
    GTFS_SCHEMA = {
        "TRIPS" : {
            "TNAME"    : "trips",
            "TRIP_ID"  : "trip_id",
            "ROUTE_ID" : "route_id"
        },
        "ROUTES" : {
            "TNAME"      : "routes",
            "ROUTE_ID"   : "route_id",
            "ROUTE_NAME" : "route_short_name"
        },
        "STOP_TIMES" : {
            "TNAME"     : "stop_times",
            "TRIP_ID"   : "trip_id",
            "STOP_ID"   : "stop_id",
            "ORDER"     : "stop_sequence",
            "ARRIVAL"   : "arrival_time",
            "DEPARTURE" : "departure_time"
        }
    }
    
    The output will be something like this:
    route | origin | o_order | destination | d_order | duration
     12E  |  XXX   |    1    |    XXX      |    2    | XX:XX:XX
     12E  |  XXX   |    2    |    XXX      |    3    | XX:XX:XX
     12E  |  XXX   |    3    |    XXX      |    4    | XX:XX:XX
     12E  |  XXX   |    4    |    XXX      |    5    | XX:XX:XX
     12E  |  XXX   |    5    |    XXX      |    6    | XX:XX:XX
     12E  |  XXX   |    6    |    XXX      |    7    | XX:XX:XX
     15E  |  XXX   |    1    |    XXX      |    2    | XX:XX:XX
     15E  |  XXX   |    2    |    XXX      |    3    | XX:XX:XX
     15E  |  XXX   |    3    |    XXX      |    4    | XX:XX:XX
     15E  |  XXX   |    4    |    XXX      |    5    | XX:XX:XX
    """

    from glass.ng.sql.q import q_to_obj
    from glass.ng.wt import obj_to_tbl

    SQL_QUERY = (
        "SELECT route, origin, o_order, destination, d_order, AVG(duration) AS duration FROM ("
        "SELECT foo.*, (foo.time_arrival - foo.time_departure) AS duration FROM ("
        "SELECT {tripid}, {stopid} AS origin, {stp_order} AS o_order, "
        "LEAD({stopid}) OVER(PARTITION BY {tripid} ORDER BY {tripid}, {stp_order}) AS destination, "
        "LEAD({stp_order}) OVER(PARTITION BY {tripid} ORDER BY {tripid}, {stp_order}) AS d_order, "
        "TO_TIMESTAMP({dep_time}, 'HH24:MI:SS') AS time_departure, "
        "LEAD(TO_TIMESTAMP({arr_time}, 'HH24:MI:SS')) OVER("
        "PARTITION BY {tripid} ORDER BY {tripid}, {stp_order}) AS time_arrival, "
        "{route_name} AS route FROM {stopTm} INNER JOIN ("
        "SELECT {tripsT}.{Ttripsid} AS trip_fid, "
        "{routesT}.{route_name} FROM {tripsT} INNER JOIN {routesT} ON "
        "{tripsT}.{Ttrouteid} = {routesT}.{Rrouteid}"
        ") AS trips_routes ON {stopTm}.{tripid} = trips_routes.trip_fid "
        "ORDER BY {tripid}, {stp_order}"
        ") AS foo "
        "WHERE time_arrival IS NOT NULL "
        "ORDER BY {tripid}, o_order"
        ") AS allods "
        "GROUP BY route, origin, o_order, destination, d_order "
        "ORDER BY route, o_order").format(
            tripid=GTFS_SCHEMA["STOP_TIMES"]["TRIP_ID"],
            stopid=GTFS_SCHEMA["STOP_TIMES"]["STOP_ID"],
            stp_order=GTFS_SCHEMA["STOP_TIMES"]["ORDER"],
            dep_time=GTFS_SCHEMA["STOP_TIMES"]["DEPARTURE"],
            arr_time=GTFS_SCHEMA["STOP_TIMES"]["ARRIVAL"],
            stopTm=GTFS_SCHEMA["STOP_TIMES"]["TNAME"],
            route_name=GTFS_SCHEMA["ROUTES"]["ROUTE_NAME"],
            routesT=GTFS_SCHEMA["ROUTES"]["TNAME"],
            Rrouteid=GTFS_SCHEMA["ROUTES"]["ROUTE_ID"],
            Ttrouteid=GTFS_SCHEMA["TRIPS"]["ROUTE_ID"],
            tripsT=GTFS_SCHEMA["TRIPS"]["TNAME"],
            Ttripsid=GTFS_SCHEMA["TRIPS"]["TRIP_ID"])

    table = q_to_obj(db, SQL_QUERY)

    return obj_to_tbl(table, outfile)
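
A usage sketch reusing the GTFS_SCHEMA layout documented in the docstring;
the database reference and output path are placeholders.

# Hypothetical call with the schema documented above
schema = {
    "TRIPS"      : {"TNAME": "trips", "TRIP_ID": "trip_id",
                    "ROUTE_ID": "route_id"},
    "ROUTES"     : {"TNAME": "routes", "ROUTE_ID": "route_id",
                    "ROUTE_NAME": "route_short_name"},
    "STOP_TIMES" : {"TNAME": "stop_times", "TRIP_ID": "trip_id",
                    "STOP_ID": "stop_id", "ORDER": "stop_sequence",
                    "ARRIVAL": "arrival_time", "DEPARTURE": "departure_time"}
}

timedist_stopsPairs('gtfs_db', schema, '/tmp/stop_pair_durations.xlsx')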
Code example #14
File: mean.py Project: jasp382/glass
def meanrowsday_of_periods_by_entity(psql_con,
                                     pgtable,
                                     dayField,
                                     hourField,
                                     minutesField,
                                     secondField,
                                     entityField,
                                     PERIODS,
                                     outFile,
                                     filterData=None,
                                     numberDays=None):
    """
    Evolution of meanday_of_periods_by_entity:
    For every day in a pgtable, count the number of rows by periods of X minutes
    for each interest entity.
    
    At the end, calculate the mean between every day for each period.
    
    This method uses SQL and TimeInterval columns.
    
    PERIODS = [('07:30:00', '09:30:00'), ('07:30:00', '09:30:00')]
    
    It is not complete because the output table does not have a column
    for each period
    """

    from glass.pys import obj_to_lst
    from glass.ng.sql.q import q_to_obj
    from glass.ng.wt import obj_to_tbl

    def get_case(PTUPLE, PFIELD):
        return ("CASE "
                "WHEN TO_TIMESTAMP("
                "COALESCE(CAST({h} AS text), '') || ':' || "
                "COALESCE(CAST({m} AS text), '') || ':' || "
                "COALESCE(CAST({s} AS text), ''), 'HH24:MI:SS'"
                ") >= TO_TIMESTAMP('{tLower}', 'HH24:MI:SS') AND "
                "TO_TIMESTAMP("
                "COALESCE(CAST({h} AS text), '') || ':' || "
                "COALESCE(CAST({m} AS text), '') || ':' || "
                "COALESCE(CAST({s} AS text), ''), 'HH24:MI:SS'"
                ") < TO_TIMESTAMP('{tUpper}', 'HH24:MI:SS') "
                "THEN 1 ELSE 0 "
                "END AS {fld}").format(h=hourField,
                                       m=minutesField,
                                       s=secondField,
                                       tLower=PTUPLE[0],
                                       tUpper=PTUPLE[1],
                                       fld=PFIELD)

    entityField = obj_to_lst(entityField)

    periodsCols = [
        "p{ha}h{ma}_{hb}h{mb}".format(ha=p[0].split(':')[0],
                                      ma=p[0].split(':')[1],
                                      hb=p[1].split(':')[0],
                                      mb=p[1].split(':')[1]) for p in PERIODS
    ]

    ndaysQ = "SELECT {} AS nday".format(numberDays) if numberDays else (
        "SELECT MAX(nday) AS nday FROM ("
            "SELECT row_number() OVER(ORDER BY {dayF}) AS nday "
            "FROM {t} {whr}"
            "GROUP BY {dayF}"
        ") AS dayt"
    ).format(
        dayF=dayField, t=pgtable,
        whr="" if not filterData else "WHERE {} ".format(filterData)
    )

    # Get mean rows of all days by entity and period
    q = ("SELECT {entityF}, {meanSq}, nday FROM ("
         "SELECT {entityF}, {dayF}, {sumSeq} FROM ("
         "SELECT {entityF}, {dayF}, {caseSt} FROM {t} {whr}"
         ") AS foo "
         "WHERE {whrSq} "
         "GROUP BY {entityF}, {dayF}"
         ") AS foo2, ({getND}) AS fooday "
         "GROUP BY {entityF}, nday").format(
             entityF=", ".join(entityField),
             meanSq=", ".join([
                 "(SUM({f}) / nday) AS {f}".format(f=p) for p in periodsCols
             ]),
             dayF=dayField,
             sumSeq=", ".join(
                 ["SUM({f}) AS {f}".format(f=p) for p in periodsCols]),
             caseSt=", ".join([
                 get_case(PERIODS[x], periodsCols[x])
                 for x in range(len(PERIODS))
             ]),
             t=pgtable,
             whr="" if not filterData else "WHERE {} ".format(filterData),
             whrSq=" OR ".join(["{}=1".format(p) for p in periodsCols]),
             getND=ndaysQ)

    data = q_to_obj(psql_con, q, db_api='psql')

    obj_to_tbl(data, outFile)

    return outFile
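
A hedged invocation sketch; the database, table, and column names are
assumptions for illustration.

# Hypothetical call: mean daily rows per stop for two peak periods
meanrowsday_of_periods_by_entity(
    'psql_db', 'validations', 'day', 'hour', 'minute', 'second',
    'stop_id', [('07:30:00', '09:30:00'), ('17:30:00', '19:30:00')],
    '/tmp/mean_rows_by_period.xlsx')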
Code example #15
File: tags.py Project: jasp382/glass
def get_not_used_tags(OSM_FILE, OUT_TBL):
    """
    Use a file OSM to detect tags not considered in the
    OSM2LULC procedure
    """
    
    import os
    from glass.ng.wt        import obj_to_tbl
    from glass.g.tbl.filter import sel_by_attr
    from glass.ng.sql.q     import q_to_obj
    from glass.ng.pd.split  import df_split
    from glass.pys.oss      import fprop
    from glass.g.it.osm     import osm_to_gpkg
    
    OSM_TAG_MAP = {
        "DB"        : os.path.join(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
            'osmtolulc.sqlite'
        ),
        "OSM_FEAT"  : "osm_features",
        "KEY_COL"   : "key",
        "VALUE_COL" : "value",
        "GEOM_COL"  : "geom"
    }
    
    WORKSPACE = os.path.dirname(OUT_TBL)
    
    sqdb = osm_to_gpkg(OSM_FILE, os.path.join(
        WORKSPACE, fprop(OSM_FILE, 'fn') + '.gpkg'
    ))
    
    # Get Features we are considering
    ourOSMFeatures = q_to_obj(OSM_TAG_MAP["DB"], (
        "SELECT {key} AS key_y, {value} AS value_y, {geom} AS geom_y "
        "FROM {tbl}"
    ).format(
        key=OSM_TAG_MAP["KEY_COL"], value=OSM_TAG_MAP["VALUE_COL"],
        geom=OSM_TAG_MAP["GEOM_COL"], tbl=OSM_TAG_MAP["OSM_FEAT"]
    ), db_api='sqlite')
    
    # Get Features in File
    TABLES_TAGS = {
        'points'        : ['highway', 'man_made', 'building'],
        'lines'         : ['highway', 'waterway', 'aerialway', 'barrier',
                           'man_made', 'railway'],
        'multipolygons' : ['aeroway', 'amenity', 'barrier', 'building',
                           'craft', 'historic', 'land_area',
                           'landuse', 'leisure', 'man_made', 'military',
                           'natural', 'office', 'place', 'shop',
                           'sport', 'tourism', 'waterway', 'power',
                           'railway', 'healthcare', 'highway']
    }
    
    Qs = [
        " UNION ALL ".join([(
            "SELECT '{keycol}' AS key, {keycol} AS value, "
            "'{geomtype}' AS geom FROM {tbl} WHERE "
            "{keycol} IS NOT NULL"
        ).format(
            keycol=c, geomtype='Point' if table == 'points' else 'Line' \
                if table == 'lines' else 'Polygon',
            tbl=table
        ) for c in TABLES_TAGS[table]]) for table in TABLES_TAGS
    ]
    
    fileOSMFeatures = q_to_obj(sqdb, (
        "SELECT key, value, geom FROM ({}) AS foo "
        "GROUP BY key, value, geom"
    ).format(" UNION ALL ".join(Qs)), db_api='sqlite')
    
    _fileOSMFeatures = fileOSMFeatures.merge(
        ourOSMFeatures, how='outer',
        left_on=["key", "value", "geom"],
        right_on=["key_y", "value_y", "geom_y"]
    )
    
    # Select OSM Features of file without correspondence
    _fileOSMFeatures["isnew"] = _fileOSMFeatures.key_y.fillna(value='nenhum')
    
    newTags = _fileOSMFeatures[_fileOSMFeatures.isnew == 'nenhum']
    
    newTags["value"] = newTags.value.str.replace("'", "''")
    
    newTags["whr"] = newTags.key + "='" + newTags.value + "'"
    
    # Export tags not being used to new shapefile
    def to_regular_str(row):
        # Fallback sanitization hook; currently copies whr unchanged
        row["whr_san"] = row.whr
        
        return row
    
    for t in TABLES_TAGS:
        if t == 'points':
            filterDf = newTags[newTags.geom == 'Point']
        
        elif t == 'lines':
            filterDf = newTags[newTags.geom == 'Line']
        
        elif t == 'multipolygons':
            filterDf = newTags[newTags.geom == 'Polygon']
        
        if filterDf.shape[0] > 500:
            dfs = df_split(filterDf, 500, nrows=True)
        else:
            dfs = [filterDf]
        
        Q = "SELECT * FROM {} WHERE {}".format(
            t, filterDf.whr.str.cat(sep=" OR "))
        
        i = 1
        for df in dfs:
            fn = t + '.shp' if len(dfs) == 1 else '{}_{}.shp'.format(
                t, str(i)
            )
            try:
                shp = sel_by_attr(sqdb, Q.format(
                    t, df.whr.str.cat(sep=" OR ")
                ), os.path.join(WORKSPACE, fn), api_gis='ogr')
            except:
                __df = df.apply(lambda x: to_regular_str(x), axis=1)
            
                shp = sel_by_attr(sqdb, Q.format(
                    t, __df.whr.str.cat(sep=" OR ")
                ), os.path.join(WORKSPACE, fn))
            
            i += 1
    
    # Export OUT_TBL with tags not being used
    newTags.drop(['key_y', 'value_y', 'geom_y', 'isnew', 'whr'], axis=1, inplace=True)
    obj_to_tbl(newTags, OUT_TBL, sheetsName="new_tags", sanitizeUtf8=True)
    
    return OUT_TBL
Code example #16
def datatocls_multiref(shpfile, mapstbl, sheet, slugs, titles, ncls, decplace,
    outshp, outmapstbl, method="QUANTILE"):
    """
    Create classes/intervals for each layout in a table (mapstbl).
    One layout can have more than one map; this version handles that case

    method options:
    * QUANTILE;
    * JENKS - natural breaks (jenks);
    """

    import pandas            as pd
    import numpy             as np
    from glass.pys           import obj_to_lst
    from glass.g.rd.shp      import shp_to_obj
    from glass.g.wt.shp      import df_to_shp
    from glass.ng.rd         import tbl_to_obj
    from glass.ng.wt         import obj_to_tbl
    from glass.ng.pd.fld     import listval_to_newcols
    from glass.g.lyt.diutils import eval_intervals

    methods = ["QUANTILE", "JENKS"]

    if method not in methods:
        raise ValueError(f'Method {method} is not available')
    
    if method == "QUANTILE":
        from glass.ng.pd.stats import get_intervals
    
    elif method == "JENKS":
        import jenkspy
    
    slugs  = obj_to_lst(slugs)
    titles = obj_to_lst(titles)
    
    # Read data
    shp  = shp_to_obj(shpfile)
    maps = tbl_to_obj(mapstbl, sheet=sheet)

    # Get intervals for each map
    istats = []
    cols   = []
    for i, row in maps.iterrows():
        ddig  = row[decplace]
        icols = [row[slug] for slug in slugs]
        ititles = [row[title] for title in titles]

        istatsrow = []
        for _i in range(len(icols)):
            min_v  = shp[icols[_i]].min()
            max_v  = shp[icols[_i]].max()
            mean_v = shp[icols[_i]].mean()
            std_v  = shp[icols[_i]].std()

            if method == "QUANTILE":
                intervals = get_intervals(
                    shp, icols[_i], ncls, method="QUANTILE")
                intervals.append(max_v)
            
            elif method == "JENKS":
                breaks = jenkspy.jenks_breaks(shp[icols[_i]], nb_class=ncls)
                intervals = breaks[1:]
            
            if not str(shp[icols[_i]].dtype).startswith('int'):
                __intervals = [round(itv, ddig) for itv in intervals]

                __intervals, ndig = eval_intervals(
                    intervals, __intervals, ddig, round(min_v, ddig)
                )

                istatsrow.extend([
                    icols[_i], ititles[_i], round(min_v, ndig),
                    round(max_v, ndig), round(mean_v, ddig),
                    round(std_v, ddig), __intervals
                ])

                shp[icols[_i]] = shp[icols[_i]].round(ddig)
            
            else:
                for _e in range(len(intervals)):
                    if not _e:
                        rzero = 1 if round(intervals[_e], 0) > min_v else 0
                    
                    else:
                        rzero = 1 if round(intervals[_e], 0) > \
                            round(intervals[_e -1], 0) else 0
                    
                    if not rzero:
                        break
                
                __intervals = [round(
                    _o, ddig if not rzero else 0
                ) for _o in intervals]

                __intervals, ndig = eval_intervals(
                    intervals, __intervals, ddig, min_v
                )

                istatsrow.extend([
                    icols[_i], ititles[_i], min_v, max_v,
                    int(round(mean_v, 0)) if rzero else round(mean_v, ddig),
                    int(round(std_v, 0)) if rzero else round(std_v, ddig),
                    __intervals
                ])
            
            if not i:
                cols.extend([
                    f'slug{str(_i+1)}', f'title{str(_i+1)}',
                    f'min_value{str(_i+1)}', f'max_value{str(_i+1)}',
                    f'mean_value{str(_i+1)}',
                    f'std_value{str(_i+1)}', f'intervals{str(_i+1)}'
                ])
        
        istats.append(istatsrow)
    
    istats = pd.DataFrame(istats, columns=cols)

    rename_cols = {}
    for idx, row in istats.iterrows():
        for _i in range(len(slugs)):
            # Get intervals
            int_ = row[f'intervals{str(_i+1)}']

            # Add columns for intervals ids
            newcol = 'i_' + row[f'slug{str(_i+1)}']
            shp[newcol] = 0

            for itv in range(len(int_)):
                if not itv:
                    shp[newcol] = np.where(
                        shp[row[f'slug{str(_i+1)}']] <= int_[itv],
                        itv + 1, shp[newcol]
                    )
                
                else:
                    shp[newcol] = np.where(
                        (shp[row[f'slug{str(_i+1)}']] > int_[itv-1]) & (shp[row[f'slug{str(_i+1)}']] <= int_[itv]),
                        itv + 1, shp[newcol]
                    )
            
            rename_cols[newcol] = row[f'slug{str(_i+1)}']
    
    dc = []
    for c in range(len(slugs)):
        dc.extend(istats[f'slug{str(c+1)}'].tolist())
    
    shp.drop(dc, axis=1, inplace=True)
    shp.rename(columns=rename_cols, inplace=True)

    
    for i in range(len(slugs)):
        istats = listval_to_newcols(istats, f'intervals{str(i+1)}')
        istats.rename(columns={
            ii : f'intervals{str(i+1)}_{str(ii+1)}' for ii in range(ncls)
        }, inplace=True)
    
    # Write outputs
    df_to_shp(shp, outshp)
    obj_to_tbl(istats, outmapstbl)

    return outshp, outmapstbl
Code example #17
def datatocls_meanstd(shp_data, maps_table, sheet, slug, title,
    ncls, decplace, nodata, out_shp, out_maps_tbl, grpcol=None):
    """
    Create classes based on mean and standard deviation

    decplace - number of decimal places shown in the layout values
    nodata - must always be smaller than the minimum of the min values
    """

    import pandas            as pd
    import numpy             as np
    from glass.g.rd.shp      import shp_to_obj
    from glass.g.wt.shp      import df_to_shp
    from glass.ng.rd         import tbl_to_obj
    from glass.ng.wt         import obj_to_tbl
    from glass.ng.pd.fld     import listval_to_newcols
    from glass.g.lyt.diutils import eval_intervals

    # Read data
    shp_df = shp_to_obj(shp_data)

    maps_df = tbl_to_obj(maps_table, sheet=sheet)

    if grpcol:
        maps_cols = maps_df[slug].tolist()
        for c in maps_cols:
            shp_df[c] = shp_df[c].astype(float)
        agg_dict = {c : 'mean' for c in maps_cols}
        shp_df = pd.DataFrame(shp_df.groupby([grpcol]).agg(
            agg_dict
        )).reset_index()
    
    def get_intervals(_ncls, mean, std):
        mean_class = mean + (std / 2)
    
        less_mean = []
        major_mean = []
        for e in range(_ncls):
            if not e:
                less_mean.append(mean - (std / 2))
                major_mean.append(mean_class + std)
            else:
                less_mean.append(less_mean[e - 1] - std)
                major_mean.append(major_mean[e - 1] + std)
        
        less_mean.reverse()
        intervals = less_mean + [mean_class] + major_mean
    
        return intervals
    
    # Compute the intervals for each indicator using the
    # mean / standard deviation method

    # Get min, max, mean and standard deviation
    # Round values
    i_stats = []
    for idx, row in maps_df.iterrows():
        ddig = row[decplace]
        i    = row[slug]
        t    = row[title]

        if nodata in shp_df[i].unique():
            vals = list(shp_df[i].unique())
            vals.sort()

            min_v = vals[1]
        
            tdf = shp_df[[i]].copy()
        
            tdf = tdf[tdf[i] >= min_v]
            tdf.reset_index(drop=True, inplace=True)
        
            max_v = tdf[i].max()
            mean_v = tdf[i].mean()
            std_v = tdf[i].std()
        
        else:
            min_v  = shp_df[i].min()
            max_v  = shp_df[i].max()
            mean_v = shp_df[i].mean()
            std_v  = shp_df[i].std()
        
        # Halve the standard deviation until the first break rises above
        # the minimum value (i.e. every class stays within range)
        fbreak = min_v - 1
        __std = std_v
        while fbreak <= min_v:
            intervals = get_intervals(ncls, mean_v, __std)

            repeat = 0
            for __i in intervals[:-1]:
                if __i > max_v:
                    repeat = 1
                
                if repeat:
                    break
            
            fbreak = intervals[0] if not repeat else min_v - 1
            __std = __std / 2
        
        intervals[-1] = max_v

        if not str(shp_df[i].dtype).startswith('int'):
            __intervals = [round(_i, ddig) for _i in intervals]
        
            repeat = 1
            __intervals, ndig = eval_intervals(
                intervals, __intervals, ddig,
                round(min_v, ddig)
            )
        
            i_stats.append([
                i, t, round(min_v, ndig), round(max_v, ndig),
                round(mean_v, ddig), round(std_v, ddig), __intervals
            ])
        
            shp_df[i] = shp_df[i].round(ddig)
        
        else:
            for _e in range(len(intervals)):
                if not _e:
                    rzero = 1 if round(intervals[_e], 0) > min_v else 0
                
                else:
                    rzero = 1 if round(intervals[_e], 0) > \
                        round(intervals[_e - 1], 0) else 0
            
                if not rzero:
                    break
            
            __intervals = [round(_o, ddig if not rzero else 0) for _o in intervals]

            __intervals, ndig = eval_intervals(intervals, __intervals, ddig, min_v)

            i_stats.append([
                i, t, min_v, max_v,
                int(round(mean_v, 0)) if rzero else round(mean_v, ddig),
                int(round(std_v, 0)) if rzero else round(std_v, ddig),
                __intervals
            ])
    
    i_stats = pd.DataFrame(i_stats, columns=[
        'slug', 'title', 'min_value', 'max_value',
        'mean_value', 'std_value', 'intervals'
    ])

    rename_cols = {}
    for idx, row in i_stats.iterrows():
        # Get intervals.
        int_ = row.intervals
    
        # Add columns for intervals
        i_col = 'i_' + row.slug
        shp_df[i_col] = 0
    
        for _i in range(len(int_)):
            if not _i:
                shp_df[i_col] = np.where(
                    (shp_df[row.slug] > nodata) & (shp_df[row.slug] <= int_[_i]),
                    _i + 1, shp_df[i_col]
                )
            else:
                shp_df[i_col] = np.where(
                    (shp_df[row.slug] > int_[_i - 1]) & (shp_df[row.slug] <= int_[_i]),
                    _i + 1, shp_df[i_col]
                )
    
        rename_cols[i_col] = row.slug
    
    shp_df.drop(i_stats.slug, axis=1, inplace=True)
    shp_df.rename(columns=rename_cols, inplace=True)

    i_stats = listval_to_newcols(i_stats, 'intervals')

    i_stats.rename(columns={
        i : 'interval_' + str(i+1) for i in range((ncls * 2) + 1)
    }, inplace=True)

    if grpcol:
        nshp_df = shp_to_obj(shp_data)

        nshp_df.drop(maps_cols, axis=1, inplace=True)

        shp_df.rename(columns={grpcol : grpcol + '_y'}, inplace=True)

        shp_df = nshp_df.merge(shp_df, how='left', left_on=grpcol, right_on=grpcol + '_y')
    
    df_to_shp(shp_df, out_shp)

    obj_to_tbl(i_stats, out_maps_tbl)

    return out_shp, out_maps_tbl
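
The inner get_intervals helper builds 2*ncls + 1 class breaks placed
symmetrically around the mean; a standalone copy with invented numbers makes
its output concrete.

# Standalone copy of the inner helper, for illustration only
def demo_intervals(ncls, mean, std):
    mean_class = mean + (std / 2)
    less, major = [], []
    for e in range(ncls):
        less.append(mean - (std / 2) if not e else less[e - 1] - std)
        major.append(mean_class + std if not e else major[e - 1] + std)
    less.reverse()
    return less + [mean_class] + major

# With mean=10, std=2, ncls=2, the middle break sits half a std above the
# mean and the remaining breaks step one std outward on each side
print(demo_intervals(2, 10, 2))  # [7.0, 9.0, 11.0, 13.0, 15.0]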
Code example #18
File: mean.py Project: jasp382/glass
def meandays_by_entity(db,
                       pgtable,
                       DAY_FIELD,
                       ENTITY_FIELD,
                       COUNT_FIELD_NAME,
                       OUTPUT_FILE,
                       EXCLUDE_DAYS=None):
    """
    For every day in a pgtable, count the number of rows for each interest entity.
    At the end, calculate the mean of rows between every day for each entity.
    
    Day field must be of type text
    """

    from glass.ng.sql.q import q_to_obj
    from glass.ng.wt import obj_to_tbl

    # Get days
    VALUES = q_to_obj(db,
                      "SELECT {col} FROM {t} GROUP BY {col}".format(
                          col=DAY_FIELD, t=pgtable),
                      db_api='psql')[DAY_FIELD].tolist()

    # For every day, Group rows by entities
    tableArray = []
    for day in VALUES:
        if EXCLUDE_DAYS:
            if day in EXCLUDE_DAYS:
                continue

        QUERY = ("SELECT {col}, COUNT({col}) AS {countname} FROM {table} "
                 "WHERE {dayF}='{d}' GROUP BY {col}").format(
                     col=ENTITY_FIELD,
                     countname=COUNT_FIELD_NAME,
                     table=pgtable,
                     dayF=DAY_FIELD,
                     d=day)

        countTbl = q_to_obj(db, QUERY, db_api='psql')

        tableArray.append(countTbl)

    # Get mean for all entities
    main_table = tableArray[0]
    TMP_COUNT_FIELD_NAME = 'join_' + COUNT_FIELD_NAME
    TMP_JOIN_FIELD = 'id_entity'

    for i in range(1, len(tableArray)):
        tableArray[i].rename(columns={
            COUNT_FIELD_NAME: TMP_COUNT_FIELD_NAME,
            ENTITY_FIELD: TMP_JOIN_FIELD
        },
                             inplace=True)

        main_table = main_table.merge(tableArray[i],
                                      how='outer',
                                      left_on=ENTITY_FIELD,
                                      right_on=TMP_JOIN_FIELD)

        main_table.fillna(0, inplace=True)
        main_table[ENTITY_FIELD].replace(0,
                                         main_table[TMP_JOIN_FIELD],
                                         inplace=True)

        main_table[COUNT_FIELD_NAME] = main_table[COUNT_FIELD_NAME] + \
            main_table[TMP_COUNT_FIELD_NAME]
        main_table.drop([TMP_COUNT_FIELD_NAME, TMP_JOIN_FIELD],
                        axis=1,
                        inplace=True)

    main_table[COUNT_FIELD_NAME] = main_table[COUNT_FIELD_NAME] / len(
        tableArray)

    obj_to_tbl(main_table, OUTPUT_FILE)

    return OUTPUT_FILE
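
A short hedged usage sketch; database, table, and field names are
placeholders.

# Hypothetical call: mean daily count of rows per stop, skipping a holiday
meandays_by_entity('psql_db', 'validations', 'day', 'stop_id',
                   'mean_rows', '/tmp/mean_by_stop.xlsx',
                   EXCLUDE_DAYS=['2018-01-01'])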
Code example #19
def datatocls(shpfile, mapstbl, sheet, slug, title, ncls, decplace,
    outshp, outmapstbl, method="QUANTILE"):
    """
    Create classes/intervals for each map in table

    method options:
    * QUANTILE;
    * JENKS - natural breaks (jenks);
    """

    import pandas            as pd
    import numpy             as np
    from glass.g.rd.shp      import shp_to_obj
    from glass.g.wt.shp      import df_to_shp
    from glass.ng.rd         import tbl_to_obj
    from glass.ng.wt         import obj_to_tbl
    from glass.ng.pd.fld     import listval_to_newcols
    from glass.g.lyt.diutils import eval_intervals

    methods = ["QUANTILE", "JENKS"]

    if method not in methods:
        raise ValueError(f'Method {method} is not available')

    if method == "QUANTILE":
        from glass.ng.pd.stats import get_intervals
    
    elif method == "JENKS":
        import jenkspy

    # Read data
    shp  = shp_to_obj(shpfile)
    maps = tbl_to_obj(mapstbl, sheet=sheet)

    # Get intervals for each map
    istats = []
    for i, row in maps.iterrows():
        ddig = row[decplace]
        icol = row[slug]
        titl = row[title]
    
        min_v  = shp[icol].min()
        max_v  = shp[icol].max()
        mean_v = shp[icol].mean()
        std_v  = shp[icol].std()

        if method == "QUANTILE":
            intervals = get_intervals(shp, icol, ncls, method="QUANTILE")
            intervals.append(max_v)
        
        elif method == "JENKS":
            breaks = jenkspy.jenks_breaks(shp[icol], nb_class=ncls)
            intervals = breaks[1:]
        
        if not str(shp[icol].dtype).startswith('int'):
            __intervals = [round(itv, ddig) for itv in intervals]

            __intervals, ndig = eval_intervals(
                intervals, __intervals, ddig, round(min_v, ddig)
            )

            istats.append([
                icol, titl, round(min_v, ndig),
                round(max_v, ndig), round(mean_v, ddig),
                round(std_v, ddig), __intervals
            ])

            shp[icol] = shp[icol].round(ddig)
        
        else:
            for _e in range(len(intervals)):
                if not _e:
                    rzero = 1 if round(intervals[_e], 0) > min_v else 0
                
                else:
                    rzero = 1 if round(intervals[_e], 0) > \
                        round(intervals[_e - 1], 0) else 0
                
                if not rzero:
                    break
            
            __intervals = [round(
                _o, ddig if not rzero else 0
            ) for _o in intervals]

            __intervals, ndig = eval_intervals(
                intervals, __intervals, ddig, min_v)
            
            istats.append([
                icol, titl, min_v, max_v,
                int(round(mean_v, 0)) if rzero else round(mean_v, ddig),
                int(round(std_v, 0)) if rzero else round(std_v, ddig),
                __intervals
            ])
    
    istats = pd.DataFrame(istats, columns=[
        "slug", "title", "min_value", "max_value",
        "mean_value", "std_value", "intervals"
    ])

    rename_cols = {}
    for idx, row in istats.iterrows():
        # Get intervals
        int_ = row.intervals
    
        # Add columns for intervals
        i_col = 'i_' + row.slug
        shp[i_col] = 0
    
        for _i in range(len(int_)):
            if not _i:
                shp[i_col] = np.where(
                    shp[row.slug] <= int_[_i],
                    _i + 1, shp[i_col]
                )
        
            else:
                shp[i_col] = np.where(
                    (shp[row.slug] > int_[_i - 1]) & (shp[row.slug] <= int_[_i]),
                    _i + 1, shp[i_col]
                )
    
        rename_cols[i_col] = row.slug
    
    shp.drop(istats.slug, axis=1, inplace=True)
    shp.rename(columns=rename_cols, inplace=True)

    istats = listval_to_newcols(istats, 'intervals')

    istats.rename(columns={
        i : 'interval_' + str(i+1) for i in range(ncls)
    }, inplace=True)

    # Write outputs
    df_to_shp(shp, outshp)
    obj_to_tbl(istats, outmapstbl)

    return outshp, outmapstbl
Code example #20
File: joins.py Project: jasp382/glass
def calc_mean_samecol_sevshp(intbls, pk, meancol, output, tformat='.shp'):
    """
    Calculate mean of the same column in different tables

    Assume we have N tables with a numerical column with the same name

    This script calculate the mean of all these columns
    """

    import os
    from glass.ng.wt import obj_to_tbl
    from glass.g.rd.shp import shp_to_obj

    if os.path.isdir(intbls):
        from glass.pys.oss import lst_ff

        tbls = lst_ff(intbls, file_format='.shp' if not tformat else tformat)

    else:
        if type(intbls) == list:
            tbls = intbls
        else:
            raise ValueError('intbls has an invalid value')

    # Read data
    dfs = [shp_to_obj(t) for t in tbls]

    # Keep only the key column and the column to average
    mantain_cols = [pk, meancol]
    for d in range(len(dfs)):
        dfs[d].drop(
            [c for c in dfs[d].columns.values if c not in mantain_cols],
            axis=1,
            inplace=True)

        if d:
            dfs[d].rename(columns={
                pk: "{}_{}".format(pk, str(d)),
                meancol: "{}_{}".format(meancol, str(d))
            },
                          inplace=True)

    # Join all DFS
    main_df = dfs[0]

    for d in range(1, len(dfs)):
        main_df = main_df.merge(dfs[d],
                                how='outer',
                                left_on=pk,
                                right_on="{}_{}".format(pk, str(d)))

        main_df[meancol] = main_df[meancol] + main_df[meancol + "_" + str(d)]

    # Get mean
    main_df[meancol] = main_df[meancol] / len(dfs)

    # Drop the helper join columns
    drop_cols = []
    for d in range(1, len(dfs)):
        drop_cols.append("{}_{}".format(pk, str(d)))
        drop_cols.append("{}_{}".format(meancol, str(d)))

    main_df.drop(drop_cols, axis=1, inplace=True)

    # Export Result
    obj_to_tbl(main_df, output)

    return output
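
A hedged usage sketch; the folder, key, and column names are illustrative.

# Hypothetical call: average a 'density' column across all shapefiles in a
# folder, matching rows on the 'zone_id' key
calc_mean_samecol_sevshp('/data/model_runs', 'zone_id', 'density',
                         '/data/mean_density.xlsx')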
Code example #21
File: joins.py Project: jasp382/glass
def join_tables_in_table(mainTable, mainIdField, joinTables, outTable):
    """
    Join one table with all tables in a folder
    
    joinTables = {
        r'D:\TRENMO_JASP\CARRIS\valid_by_para\period_16_17h59\sabado\fvalidacoes_v6_2018-01-06.xlsx' : {
            "JOIN_FIELD"    : 'paragem',
            "COLS_TO_JOIN"  : {'n_validacao' : 'dia_6'}
        },
        r'D:\TRENMO_JASP\CARRIS\valid_by_para\period_16_17h59\sabado\fvalidacoes_v6_2018-01-13.xlsx' : {
            "JOIN_FIELD"    : 'paragem',
            "COLS_TO_JOIN"  : {'n_validacao' : 'dia_13'}
        },
        r'D:\TRENMO_JASP\CARRIS\valid_by_para\period_16_17h59\sabado\fvalidacoes_v6_2018-01-20.xlsx' : {
            "JOIN_FIELD"    : 'paragem',
            "COLS_TO_JOIN"  : {'n_validacao' : 'dia_20'}
        },
        r'D:\TRENMO_JASP\CARRIS\valid_by_para\period_16_17h59\sabado\fvalidacoes_v6_2018-01-27.xlsx' : {
            "JOIN_FIELD"    : 'paragem',
            "COLS_TO_JOIN"  : {'n_validacao' : 'dia_27'}
        }
    }
    
    #TODO: only works with xlsx tables as join TABLES
    """

    # Modules
    import os
    import pandas
    from glass.ng.rd import tbl_to_obj
    from glass.ng.wt import obj_to_tbl

    # Get table format
    tableType = os.path.splitext(mainTable)[1]

    tableDf = tbl_to_obj(mainTable)

    for table in joinTables:
        xlsDf = tbl_to_obj(table)

        join_field = 'id_entity' if joinTables[table]["JOIN_FIELD"] == mainIdField \
            else joinTables[table]["JOIN_FIELD"]

        if joinTables[table]["JOIN_FIELD"] == mainIdField:
            xlsDf.rename(columns={mainIdField: join_field}, inplace=True)

        xlsDf.rename(columns=joinTables[table]["COLS_TO_JOIN"], inplace=True)

        tableDf = tableDf.merge(xlsDf,
                                how='outer',
                                left_on=mainIdField,
                                right_on=join_field)

        tableDf.fillna(0, inplace=True)
        tableDf[mainIdField].replace(0, tableDf[join_field], inplace=True)

        tableDf.drop(join_field, axis=1, inplace=True)

    obj_to_tbl(tableDf, outTable)

    return outTable
Code example #22
File: mean.py Project: jasp382/glass
def meanrowsday_by_entity(psqldb,
                          pgtable,
                          dayField,
                          entityField,
                          out_file,
                          filterData=None,
                          newMeanField=None,
                          numberDays=None):
    """
    For every day in a pgtable, count the number of rows for each interest entity.
    At the end, calculate the mean of rows between every day for each entity.
    
    Day field must be of type text
    
    Difference in relation to meandays_by_entity:
    this one uses only SQL and PGSQL and not Pandas.
    
    If numberDays=None, the number of days used is taken from the data
    itself. So if you want the mean over 5 days but one of those days has
    no data, the mean will be computed over 4 days only.
    """

    from glass.pys import obj_to_lst
    from glass.ng.sql.q import q_to_obj
    from glass.ng.wt import obj_to_tbl

    entityField = obj_to_lst(entityField)
    mean_field = "mean_rows" if not newMeanField else newMeanField

    ndaysQ = "SELECT {} AS nday".format(numberDays) if numberDays else \
        ("SELECT MAX(nday) AS nday FROM ("
            "SELECT row_number() OVER(ORDER BY {dayF}) AS nday "
            "FROM {t} {whr}"
            "GROUP BY {dayF}"
        ") AS fooday").format(
            whr="" if not filterData else "WHERE {} ".format(filterData),
            dayF=dayField, t=pgtable
        )

    # Get mean rows of all days by entity
    q = ("SELECT {entityF}, (SUM(conta) / nday) AS {mF} "
         "FROM ("
         "SELECT {entityF}, {dayF}, COUNT({cnt}) AS conta "
         "FROM {t} {whr}"
         "GROUP BY {entityF}, {dayF}"
         ") AS foo, ({getD}) AS foo2 "
         "GROUP BY {entityF}, nday").format(
             entityF=", ".join(entityField),
             dayF=dayField,
             mF=mean_field,
             cnt=entityField[0],
             t=pgtable,
             whr="" if not filterData else "WHERE {} ".format(filterData),
             getD=ndaysQ)

    data = q_to_obj(psqldb, q, db_api='psql')

    obj_to_tbl(data, out_file)

    return out_file
Code example #23
def ID_rows_with_temporal_proximity_by_entities(db, table, entity_field,
                                                day_field, hour_field,
                                                hour_decimal_field,
                                                time_tolerance, outXlsPath):
    """
    Retrieve rows from one pgtable with some temporal proximity
    
    Table structure should be
    entity |     day    | hour | hour_decimal
      0    | 2018-01-02 |  5   |   5,10
      0    | 2018-01-03 |  4   |   4,15
      0    | 2018-01-02 |  5   |   5,12
      0    | 2018-01-02 |  5   |   5,8
      1    | 2018-01-02 |  4   |   4,10
      1    | 2018-01-02 |  5   |   5,12
      1    | 2018-01-02 |  4   |   4,20
      1    | 2018-01-02 |  4   |   4,12
      1    | 2018-01-02 |  4   |   4,6
    
    For a time_tolerance of 5 minutes, the output table will have
    the rows with a temporal difference within/below that time tolerance
    
    entity_field can be more than one field
    
    This method only identifies whether one entity, on one day, has rows
    very close to each other in time.
    
    Not a good strategy for large tables. For large tables, SQL-based
    methods are needed
    """

    from glass.pys import obj_to_lst
    from glass.ng.sql.q import q_to_obj
    from glass.ng.prop.sql import cols_type
    from glass.ng.wt import obj_to_tbl

    entity_field = obj_to_lst(entity_field)
    COLS = entity_field + [day_field, hour_field]
    COLS_TYPE = cols_type(db, table)

    # TIME TOLERANCE IN HOURS
    TIME_TOLERANCE = time_tolerance / 60.0

    def thereIsRowsSameTimeInt(row):
        whr = []
        for c in COLS:
            if COLS_TYPE[c] == str:
                whr.append("{}='{}'".format(c, row[c]))
            else:
                whr.append("{}={}".format(c, row[c]))

        hourRows = q_to_obj(db,
                            "SELECT {} FROM {} WHERE {}".format(
                                hour_decimal_field, table, " AND ".join(whr)),
                            db_api='psql')[hour_decimal_field].tolist()

        # Initialize above the threshold so rows with no close pair
        # end up classified as 0; hourRows holds plain decimal-hour values
        dif = TIME_TOLERANCE

        for i in range(len(hourRows)):
            for e in range(i + 1, len(hourRows)):
                dif = abs(hourRows[i] - hourRows[e])

                if dif < TIME_TOLERANCE:
                    break

            if dif < TIME_TOLERANCE:
                break

        if dif < TIME_TOLERANCE:
            row['time_difference'] = 1
        else:
            row['time_difference'] = 0

        return row

    # Count entity occurrences for one day and hour
    countsByEntityTime = q_to_obj(
        db,
        ("SELECT {scols}, conta FROM "
         "(SELECT {scols}, COUNT({ent}) AS conta FROM {tbl} "
         "GROUP BY {scols}) AS foo WHERE conta > 1").format(
             scols=', '.join(COLS), ent=entity_field[0], tbl=table),
        db_api='psql')

    # For each row in the last count, When count is > 1
    # Check time difference between rows for one day and hour
    countsByEntityTime = countsByEntityTime.apply(
        lambda x: thereIsRowsSameTimeInt(x), axis=1)

    obj_to_tbl(countsByEntityTime, outXlsPath)

    return outXlsPath
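
A hedged invocation sketch; database, table, and column names are
assumptions, and time_tolerance is given in minutes per the function body.

# Hypothetical call: flag stops with repeated records less than 5 minutes
# apart on the same day and hour
ID_rows_with_temporal_proximity_by_entities(
    'psql_db', 'validations', 'stop_id', 'day', 'hour',
    'hour_decimal', 5, '/tmp/temporally_close_rows.xlsx')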
Code example #24
File: circ.py Project: jasp382/glass
def name_circulations(db,
                      GTFS_SCHEMA,
                      OTHER_SCHEMA,
                      output,
                      other_db=None,
                      serviceSchema=None,
                      routeIdColName=None,
                      tripIdColName=None):
    """
    Get all circulations from GTFS and associate them with
    meta columns from another database
    
    GTFS_SCHEMA = {
        "TNAME"     : "stop_times",
        "TRIP"      : "trip_id",
        "STOP"      : "stop_id",
        "SEQUENCE"  : "stop_sequence",
        "DEPARTURE" : "departure_time"
    }
    
    OTHER_SCHEMA = {
        "TNAME"    : "percursos_geom_v2",
        "ROUTE"    : ["carreira", "variante", "sentido"],
        "SEQUENCE" : "ordem",
        "STOP"     : "paragem"
    }
    
    serviceSchema = {
        "TRIPS" : {
            "TNAME"   : "trips",
            "TRIP"    : "trip_id",
            "SERVICE" : "service_id"
        },
        "CALENDAR" : {
            "TNAME"   : "calendar_dates",
            "SERVICE" : "service_id",
            "DATE"    : "date"
        },
        "FILTER_DAY" : 20180308
    }
    """

    import os
    from glass.pys import obj_to_lst
    from glass.ng.sql.q import q_to_obj

    other_db = db if not other_db else other_db

    # Sanitize Route ID in Other Schema
    OTHER_SCHEMA_ROUTE = obj_to_lst(OTHER_SCHEMA["ROUTE"])

    if len(OTHER_SCHEMA_ROUTE) > 1:
        from glass.ng.sql.col import txt_cols_to_col

        ROUTE_COL = routeIdColName if routeIdColName else "fid_route"

        txt_cols_to_col(other_db, OTHER_SCHEMA["TNAME"], OTHER_SCHEMA_ROUTE,
                        "|", ROUTE_COL)

    else:
        ROUTE_COL = routeIdColName if routeIdColName else \
            OTHER_SCHEMA_ROUTE[0]
    """
    Get all circulations in GTFS and their start time
    """
    if serviceSchema:
        serviceSchema["FILTER_DAY"] = obj_to_lst(serviceSchema["FILTER_DAY"])

    where = "" if not serviceSchema else (" WHERE {} ").format(" OR ".join([
        "{}.{} = {}".format(serviceSchema["CALENDAR"]["TNAME"],
                            serviceSchema["CALENDAR"]["DATE"], d)
        for d in serviceSchema["FILTER_DAY"]
    ]))

    injoinQ = "" if not serviceSchema else (
        "INNER JOIN ("
        "SELECT {tripsTbl}.{tripsTripId} "
        "FROM {tripsTbl} INNER JOIN {calenTbl} ON "
        "{tripsTbl}.{tripsServId} = {calenTbl}.{calenServId}{whr} "
        "GROUP BY {tripsTbl}.{tripsTripId}"
        ") AS trip_service ON {stopTimeTbl}.{stopTimeTrip} "
        "= trip_service.{tripsTripId} ").format(
            tripsTbl=serviceSchema["TRIPS"]["TNAME"],
            tripsTripId=serviceSchema["TRIPS"]["TRIP"],
            tripsServId=serviceSchema["TRIPS"]["SERVICE"],
            calenTbl=serviceSchema["CALENDAR"]["TNAME"],
            calenServId=serviceSchema["CALENDAR"]["SERVICE"],
            stopTimeTbl=GTFS_SCHEMA["TNAME"],
            stopTimeTrip=GTFS_SCHEMA["TRIP"],
            whr=where)

    newTripCol = tripIdColName if tripIdColName else GTFS_SCHEMA["TRIP"]

    Q = (
        "SELECT {stopTimesT}.{tripId} AS {newTrip}, "
        "array_agg({stopTimesT}.{stopId} "
        "ORDER BY {stopTimesT}.{tripId}, {stopTimesT}.{stopSq}) AS stops, "
        "array_agg({stopTimesT}.{stopSq} "
        "ORDER BY {stopTimesT}.{tripId}, {stopTimesT}.{stopSq}) AS stops_order, "
        "MIN({stopTimesT}.{depTime}) AS departure, "
        "MAX({stopTimesT}.{depTime}) AS depar_last_stop "
        "FROM {stopTimesT} {injoin}"
        "GROUP BY {stopTimesT}.{tripId}").format(
            tripId=GTFS_SCHEMA["TRIP"],
            stopId=GTFS_SCHEMA["STOP"],
            stopSq=GTFS_SCHEMA["SEQUENCE"],
            depTime=GTFS_SCHEMA["DEPARTURE"],
            stopTimesT=GTFS_SCHEMA["TNAME"],
            injoin=injoinQ,
            newTrip=newTripCol)

    circ = q_to_obj(db, Q)
    """
    Get all routes metadata in the "Other Database/Table"
    """
    Q = ("SELECT {idRoute}, "
         "array_agg({stopF} ORDER BY {idRoute}, {stopSq}) AS stops, "
         "array_agg({stopSq} ORDER BY {idRoute}, {stopSq}) AS stops_order "
         "FROM {t} GROUP BY {idRoute}").format(idRoute=ROUTE_COL,
                                               stopF=OTHER_SCHEMA["STOP"],
                                               stopSq=OTHER_SCHEMA["SEQUENCE"],
                                               t=OTHER_SCHEMA["TNAME"])

    routes = q_to_obj(other_db, Q)

    def sanitizeDf(df, col):
        # Normalise the array-like strings so the merge below can match
        # stop sequences ('[' is a regex metacharacter, hence regex=False)
        df[col] = df[col].astype(str)
        for char in ('L', ' ', '[', ']'):
            df[col] = df[col].str.replace(char, '', regex=False)

        return df

    circ = sanitizeDf(circ, "stops")
    routes = sanitizeDf(routes, "stops")

    newDf = circ.merge(routes, how='inner', left_on="stops", right_on="stops")

    if os.path.dirname(output):
        # Write XLS
        from glass.ng.wt import obj_to_tbl

        obj_to_tbl(newDf, output)

    else:
        # Send to pgsql
        from glass.g.wt.sql import df_to_db

        df_to_db(db, newDf, output, api='psql')

    return output
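A hedged usage sketch; the database names and output path are hypothetical, while the schema dictionaries follow the docstring examples:

# Hypothetical call; schema values follow the docstring examples
name_circulations(
    "gtfs_db",
    {"TNAME": "stop_times", "TRIP": "trip_id", "STOP": "stop_id",
     "SEQUENCE": "stop_sequence", "DEPARTURE": "departure_time"},
    {"TNAME": "percursos_geom_v2", "ROUTE": ["carreira", "variante", "sentido"],
     "SEQUENCE": "ordem", "STOP": "paragem"},
    "/tmp/circulations.xlsx"  # a path with a dirname selects the XLS branch
)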
Code example #25
0
File: freq.py Project: jasp382/glass
def correlated_words(dataFile, refCol, dataCol, outTbl, lang='english', N=2,
                     refSheet=None):
    """
    Get words correlated with some text class 
    """
    
    import numpy  as np
    import pandas as pd
    from sklearn.feature_selection import chi2
    from glass.ng.wt               import obj_to_tbl
    from glass.ng.rd               import tbl_to_obj
    from glass.ng.clstxt           import txt_to_num_representation
    
    # Data to DataFrame
    trainDf = tbl_to_obj(
        dataFile, sheet=refSheet
    ) if type(dataFile) != pd.DataFrame else dataFile
    
    # Just in case, delete rows with NULL refCol and NULL dataCol
    trainDf = trainDf[pd.notnull(trainDf[dataCol])]
    trainDf = trainDf[pd.notnull(trainDf[refCol])]
    
    """
    Add a column encoding the reference classes as an integer because
    categorical variables are often better represented by integers
    than strings
    """
    
    # Get an ID for Ref/text classes values
    trainDf['ref_id'] = trainDf[refCol].factorize()[0]
    
    # Create Dataframe only with ref_id's, without duplicates
    ref_id_df = trainDf[[refCol, 'ref_id']].drop_duplicates().sort_values(
        'ref_id'
    )
    
    # Create dicts to easily relate ref_id with ref_value
    ref_to_id = dict(ref_id_df.values)
    id_to_ref = dict(ref_id_df[['ref_id', refCol]].values)
    
    """
    Text to numbers
    """
    features, tfidf = txt_to_num_representation(
        trainDf, dataCol, lang, returnTfiDf=True)
    
    labels = trainDf.ref_id
    
    """
    Get most correlated words
    """
    
    corr_words = []
    for ref_name, ref_id in sorted(ref_to_id.items()):
        features_chi2 = chi2(features, labels == ref_id)
        
        indices = np.argsort(features_chi2[0])
        
        feat_names = np.array(tfidf.get_feature_names())[indices]
        
        unigrams = [v for v in feat_names if len(v.split(' ')) == 1][-N:]
        bigrams  = [v for v in feat_names if len(v.split(' ')) == 2][-N:]
        cols_d = [ref_name] + unigrams + bigrams
        
        corr_words.append(cols_d)
    
    COLS_NAME = ['ref_name'] + [
        'unigram_{}'.format(str(i+1)) for i in range(N)
    ] + [
        'bigram_{}'.format(str(i+1)) for i in range(N)
    ]
    dfCorrWords = pd.DataFrame(corr_words, columns=COLS_NAME)
    
    return obj_to_tbl(dfCorrWords, outTbl)
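A usage sketch with hypothetical paths and column names; the input table needs a class-label column and a text column:

# Hypothetical paths and column names
correlated_words(
    "/tmp/train.xlsx",       # table with labelled texts
    "category",              # refCol: class label
    "text",                  # dataCol: raw text
    "/tmp/corr_words.xlsx",  # output table
    lang='english', N=2      # top-2 unigrams and bigrams per class
)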
Code example #26
0
File: eval.py Project: jasp382/glass
def binary_eval(refTbl,
                refId,
                refCol,
                tstTbl,
                tstId,
                outTbl=None,
                tstCol=None):
    """
    Evaluation of a binary classification
    
    When tstCol is None, the script assumes that in tstTbl
    there are only positives
    
    The reference table must contain positives and negatives;
    the test table may contain only positives.
    """

    import numpy as np
    import pandas
    import math
    from glass.ng.rd import tbl_to_obj
    from glass.ng.wt import obj_to_tbl

    # Data to Pandas Dataframe
    ref_df = tbl_to_obj(refTbl, fields=[
        refId, refCol
    ]) if type(refTbl) != pandas.DataFrame else refTbl[[refId, refCol]]
    tst_df = tbl_to_obj(
        tstTbl, fields=[tstId] if not tstCol else [tstId, tstCol]
    ) if type(tstTbl) != pandas.DataFrame else tstTbl[[tstId]] \
        if not tstCol else tstTbl[[tstId, tstCol]]

    # Check if refId is equal to tstId; they must be different
    if refId == tstId:
        colRename = {tstId: 'tst_fid__'}

        # Do the same for refCol and tstCol
        if refCol == tstCol:
            colRename[tstCol] = 'tst_col__'

        tst_df.rename(columns=colRename, inplace=True)
        tstId = 'tst_fid__'

        if refCol == tstCol:
            tstCol = 'tst_col__'

    df = ref_df.merge(tst_df, how='left', left_on=refId, right_on=tstId)

    # Check if we have a tstCol
    if not tstCol:
        df[tstId].fillna('None', inplace=True)

        tstCol = 'cls_tst'
        df[tstCol] = np.where(df[tstId] == 'None', 0, 1)

    # Get VP, VN, FP, FN (true/false positives and negatives)
    df['confusion'] = np.where(
        (df[refCol] == 1) & (df[tstCol] == 1), 'VP',
        np.where((df[refCol] == 0) & (df[tstCol] == 0), 'VN',
                 np.where((df[refCol] == 1) & (df[tstCol] == 0), 'FN', 'FP')))

    # Summary table
    conf_tbl = pandas.DataFrame()
    conf_tbl['nrows'] = df.groupby(['confusion'])[refId].nunique()

    conf_tbl.reset_index(inplace=True)

    conf_tbl['percentage'] = (conf_tbl.nrows * 100) / df.shape[0]

    # Get some evaluation measures
    dConf = {}

    for row in conf_tbl.to_dict(orient='records'):
        dConf[row['confusion']] = row['nrows']

    for i in ('VP', 'VN', 'FP', 'FN'):
        if i not in dConf:
            dConf[i] = 0
    """
    Error rate

    Error rate (ERR) is calculated as the number of all
    incorrect predictions divided by the total number of
    the dataset. The best error rate is 0.0, whereas the
    worst is 1.0.
    """

    ERR = (dConf['FP'] + dConf['FN']) / (dConf['VP'] + dConf['VN'] +
                                         dConf['FN'] + dConf['FP'])
    """
    Accuracy

    Accuracy (ACC) is calculated as the number of all correct
    predictions divided by the total number of the dataset.
    The best accuracy is 1.0, whereas the worst is 0.0. It can
    also be calculated by 1 – ERR.
    """

    ACC = (dConf['VP'] + dConf['VN']) / (dConf['VP'] + dConf['VN'] +
                                         dConf['FN'] + dConf['FP'])
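    # Worked example with hypothetical counts VP=40, VN=50, FP=5, FN=5:
    # ERR = (5 + 5) / 100 = 0.1 and ACC = (40 + 50) / 100 = 0.9 = 1 - ERR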
    """
    Sensitivity (Recall or True positive rate)
    
    Sensitivity (SN) is calculated as the number of correct
    positive predictions divided by the total number of positives.
    It is also called recall (REC) or true positive rate (TPR).
    The best sensitivity is 1.0, whereas the worst is 0.0.
    """

    try:
        SN = dConf['VP'] / (dConf['VP'] + dConf['FN'])
    except ZeroDivisionError:
        SN = -99
    """
    Specificity (True negative rate)

    Specificity (SP) is calculated as the number of correct negative
    predictions divided by the total number of negatives. It is
    also called true negative rate (TNR). The best specificity is 1.0,
    whereas the worst is 0.0.
    """

    SP = dConf['VN'] / (dConf['VN'] + dConf['FP'])
    """
    Precision (Positive predictive value)

    Precision (PREC) is calculated as the number of correct
    positive predictions divided by the total number of positive
    predictions. It is also called positive predictive value (PPV).
    The best precision is 1.0, whereas the worst is 0.0.
    """

    PREC = dConf["VP"] / (dConf["VP"] + dConf['FP'])
    """
    False positive rate

    False positive rate (FPR) is calculated as the number of
    incorrect positive predictions divided by the total number
    of negatives. The best false positive rate is 0.0 whereas the
    worst is 1.0. It can also be calculated as 1 – specificity.
    """

    FPR = dConf['FP'] / (dConf['VN'] + dConf['FP'])
    """
    Matthews correlation coefficient

    Matthews correlation coefficient (MCC) is a correlation
    coefficient calculated using all four values in the
    confusion matrix.
    """
    try:
        MCC = (dConf['VP'] * dConf['VN'] -
               dConf['FP'] * dConf['FN']) / (math.sqrt(
                   (dConf['VP'] + dConf['FP']) * (dConf['VP'] + dConf['FN']) *
                   (dConf['VN'] + dConf['FP']) * (dConf['VN'] + dConf['FN'])))
    except ZeroDivisionError:
        MCC = -99
    """
    F-score

    F-score is a harmonic mean of precision and recall.
    """

    F0_5 = ((1 + 0.5**2) * (PREC * SN)) / (0.5**2 * PREC + SN)
    F_1 = (2 * PREC * SN) / (PREC + SN)
    F_2 = (5 * PREC * SN) / (4 * PREC + SN)
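    # E.g. with hypothetical PREC = 0.8 and SN = 0.5:
    # F_1 = (2 * 0.8 * 0.5) / (0.8 + 0.5) ~= 0.615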

    evalMeasures = pandas.DataFrame(
        [['Error rate', ERR], ['Accuracy', ACC], ['Sensitivity', SN],
         ['Specificity', SP], ['Precision', PREC], [
             'False positive rate', FPR
         ], ['Matthews correlation coefficient', MCC], ['F-score 0.5', F0_5],
         ['F-score 1', F_1], ['F-score 2', F_2]],
        columns=['eval_measure', 'value'])

    if outTbl:
        return obj_to_tbl([conf_tbl, evalMeasures, df],
                          outTbl,
                          sheetsName=['matrix', 'eval_measures', 'tbl'])
    else:
        return conf_tbl, evalMeasures, df
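A usage sketch with hypothetical paths and columns; tstCol is omitted, so the test table is assumed to hold positives only:

# Hypothetical paths/columns; returns the three objects when outTbl is None
conf_mtx, measures, rows = binary_eval(
    "/tmp/reference.xlsx", "fid", "is_positive",  # reference with 0/1 labels
    "/tmp/predicted.xlsx", "fid"                  # test table: positives only
)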
Code example #27
0
File: __init__.py Project: jasp382/glass
def tbl_to_areamtx(inShp, col_a, col_b, outXls, db=None, with_metrics=None):
    """
    Table to Matrix
    
    Table as:
        FID | col_a | col_b | geom
         1  |   A   |   A   | ....
         2  |   A   |   B   | ....
         3  |   A   |   A   | ....
         4  |   A   |   C   | ....
         5  |   A   |   B   | ....
         6  |   B   |   A   | ....
         7  |   B   |   A   | ....
         8  |   B   |   B   | ....
         9  |   B   |   B   | ....
        10  |   C   |   A   | ....
        11  |   C   |   B   | ....
        12  |   C   |   D   | ....
    
    To:
    classe | A | B | C | D
       A   |   |   |   | 
       B   |   |   |   |
       C   |   |   |   |
       D   |   |   |   |
    
    col_a = rows
    col_b = cols

    Execution options:
    * pandas (used when db is None);
    * psql (used when a db name is given).
    """

    # TODO: check if col_a and col_b exist in the table

    if not db:
        import pandas as pd
        import numpy as np
        from glass.g.rd.shp import shp_to_obj
        from glass.ng.wt    import obj_to_tbl
    
        # Open data
        df = shp_to_obj(inShp)

        # Remove nan values
        df = df[pd.notnull(df[col_a])]
        df = df[pd.notnull(df[col_b])]
    
        # Get area in km2 (assumes a projected CRS with metric units)
        df['realarea'] = df.geometry.area / 1000000
    
        # Get rows and Cols
        rows = df[col_a].unique()
        cols = df[col_b].unique()
        refval = list(np.sort(np.unique(np.append(rows, cols))))
    
        # Produce matrix
        outDf = []
        for row in refval:
            newCols = [row]
            for col in refval:
                newDf = df[(df[col_a] == row) & (df[col_b] == col)]

                if not newDf.shape[0]:
                    newCols.append(0)
                
                else:
                    area = newDf.realarea.sum()
            
                    newCols.append(area)
        
            outDf.append(newCols)
    
        outcols = ['class'] + refval
        outDf = pd.DataFrame(outDf, columns=outcols)

        if with_metrics:
            from glass.ng.cls.eval import get_measures_for_mtx

            out_df = get_measures_for_mtx(outDf, 'class')

            return obj_to_tbl(out_df, outXls)
    
        # Export to Excel
        return obj_to_tbl(outDf, outXls)
    
    else:
        from glass.pys.oss        import fprop
        from glass.ng.sql.db      import create_db
        from glass.ng.prop.sql    import db_exists
        from glass.g.it.db       import shp_to_psql
        from glass.g.dp.tomtx.sql import tbl_to_area_mtx
        from glass.ng.it          import db_to_tbl

        # Create database if not exists
        is_db = db_exists(db)

        if not is_db:
            create_db(db, api='psql')

        # Add data to database
        tbl = shp_to_psql(db, inShp, api='shp2pgsql')

        # Create matrix
        mtx = tbl_to_area_mtx(db, tbl, col_a, col_b, fprop(outXls, 'fn'))

        # Export result
        return db_to_tbl(db, mtx, outXls, sheetsNames='matrix')
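A usage sketch of the pandas branch (hypothetical shapefile and columns; leaving db as None keeps everything in pandas):

# Hypothetical inputs; cross-tabulates areas between two class columns
tbl_to_areamtx(
    "/tmp/lulc.shp",         # polygon layer in a projected, metric CRS
    "cls_1990", "cls_2018",  # col_a (rows) and col_b (cols)
    "/tmp/area_matrix.xlsx"
)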
Code example #28
0
def model_selection(dataFile, refCol, dataCol, outTbl, lang='english', CV=5):
    """
    See which model is better to use in text classification for a specific
    data sample
    
    Compare:
    Logistic Regression (LogisticRegression)
    (Multinomial) Naive Bayes (MultinomialNB)
    Linear Support Vector Machine (LinearSVC)
    Random Forest (RandomForestClassifier)
    """
    
    import os
    import pandas as pd
    from glass.pys.oss                   import fprop
    from glass.ng.rd                     import tbl_to_obj
    from glass.ng.clstxt                 import txt_to_num_representation
    from sklearn.linear_model            import LogisticRegression
    from sklearn.ensemble                import RandomForestClassifier
    from sklearn.svm                     import LinearSVC
    from sklearn.naive_bayes             import MultinomialNB
    from sklearn.model_selection         import cross_val_score
    from glass.ng.wt                     import obj_to_tbl
    
    # Data to DataFrame
    trainDf = tbl_to_obj(dataFile)
    
    # Just in case, delete rows with NULL refCol and NULL dataCol
    trainDf = trainDf[pd.notnull(trainDf[dataCol])]
    trainDf = trainDf[pd.notnull(trainDf[refCol])]
    
    # Ref col to integers
    trainDf['ref_id'] = trainDf[refCol].factorize()[0]
    
    # Text to numbers
    features = txt_to_num_representation(trainDf, dataCol, lang)
    
    labels = trainDf.ref_id
    
    """ Test Models """
    models = [
        RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
        LinearSVC(),
        MultinomialNB(),
        LogisticRegression(random_state=0)
    ]
    
    entries = []
    
    for model in models:
        m_name = model.__class__.__name__
        accuracies = cross_val_score(
            model, features, labels, scoring='accuracy', cv=CV
        )
        
        for fold_idx, accuracy in enumerate(accuracies):
            entries.append((m_name, fold_idx, accuracy))
    
    # Create and Export evaluation table
    cv_df = pd.DataFrame(
        entries, columns=['model_name', 'fold_idx', 'accuracy'])
    cv_df_gp = pd.DataFrame(cv_df.groupby('model_name').accuracy.mean())
    cv_df_gp.reset_index(inplace=True)
    
    # Export Graphic
    import seaborn as sns
        
    sns.boxplot(x='model_name', y='accuracy', data=cv_df)
        
    b = sns.stripplot(
        x='model_name', y='accuracy', data=cv_df,
        size=10, jitter=True, edgecolor="gray", linewidth=2)
        
    fig = b.get_figure()
    fig.savefig(os.path.join(
        os.path.dirname(outTbl), fprop(outTbl, 'fn') + '.png'
    ))
    
    return obj_to_tbl(cv_df_gp, outTbl)
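A usage sketch with hypothetical paths and columns; the PNG boxplot is written next to the output table:

# Hypothetical paths/columns; compares the four classifiers with 5-fold CV
model_selection(
    "/tmp/train.xlsx", "category", "text",
    "/tmp/model_scores.xlsx", lang='english', CV=5
)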
Code example #29
0
File: sep.py Project: jasp382/glass
def clsep_matrix(ref, var, out, fileformat=None):
    """
    Produce a matrix with class separability measures computed from
    satellite images
    """

    import os

    import numpy as np
    import pandas as pd
    from osgeo import gdal, gdal_array
    from glass.ng.wt import obj_to_tbl

    # Open data
    ref_src = gdal.Open(ref, gdal.GA_ReadOnly)

    if type(var) != list:
        # Check if it is a folder
        if os.path.isdir(var):
            # List images in folder
            from glass.pys.oss import lst_ff

            var = lst_ff(var, file_format=fileformat if fileformat else '.tif')

        else:
            var = [var]

    var_src = [gdal.Open(i, gdal.GA_ReadOnly) for i in var]

    # Get Band number for each raster
    img_bnd = [i.RasterCount for i in var_src]

    # Check images shape
    # Return error if the shapes are different
    ref_shp = (ref_src.RasterYSize, ref_src.RasterXSize)
    for r in var_src:
        rst_shp = (r.RasterYSize, r.RasterXSize)

        if ref_shp != rst_shp:
            raise ValueError(
                'There are at least two raster files with different shape')

    # Get NoData Value
    nd_val = ref_src.GetRasterBand(1).GetNoDataValue()

    # Get Number of features
    nvar = sum(img_bnd)

    # Convert imgs to Array, remove nodata values and reshape
    ref_num = ref_src.GetRasterBand(1).ReadAsArray()
    ref_num = ref_num.reshape((-1, 1))

    ref_num_ = ref_num[ref_num != nd_val]

    X = np.zeros((ref_num_.shape[0], nvar),
                 gdal_array.GDALTypeCodeToNumericTypeCode(
                     var_src[0].GetRasterBand(1).DataType))

    f = 0
    for r in range(len(var_src)):
        for b in range(img_bnd[r]):
            a = var_src[r].GetRasterBand(b + 1).ReadAsArray()
            a = a.reshape((-1, 1))
            a = a[ref_num != nd_val]

            X[:, f] = a

            f += 1

    # Create arrays for each class
    classes = list(np.sort(np.unique(ref_num_)))

    clsdata = [X[ref_num_ == c] for c in classes]

    # Get separability matrix
    # (bha_dist and jm_dist are assumed to be defined elsewhere in sep.py)
    mtx_b = []
    mtx_jm = []

    for v in range(len(classes)):
        row_b = []
        row_jm = []
        for v_ in range(len(classes)):
            if v < v_:
                b = None
                jm = None
            else:
                b = bha_dist(clsdata[v], clsdata[v_])

                jm = jm_dist(b)

            row_b.append(b)
            row_jm.append(jm)

        mtx_b.append(row_b)
        mtx_jm.append(row_jm)

    mtx_bd = pd.DataFrame(mtx_b, columns=classes, index=classes)
    mtx_bd.reset_index(inplace=True)
    mtx_bd.rename(columns={'index': 'class_id'}, inplace=True)

    mtx_jmd = pd.DataFrame(mtx_jm, columns=classes, index=classes)
    mtx_jmd.reset_index(inplace=True)
    mtx_jmd.rename(columns={'index': 'class_id'}, inplace=True)

    obj_to_tbl([mtx_bd, mtx_jmd],
               out,
               sheetsName=['Bhattacharyya_Distance', 'Jeffries-Matusita'])

    return out
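A usage sketch with hypothetical raster paths; var may be a single raster, a list, or a folder:

# Hypothetical paths; a folder is expanded to its .tif files by default
clsep_matrix(
    "/tmp/reference_classes.tif",  # classified reference raster
    "/tmp/explanatory_vars",       # folder with explanatory rasters
    "/tmp/separability.xlsx"
)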
Code example #30
0
def text_prediction(trainData, classData, trainRefCol, trainClsCol, clsDataCol,
                    outfile, method='NaiveBayes', lang='english'):
    """
    Text classification
    
    Classifier Options:
    1) NaiveBayes;
    2) LinearSupportVectorMachine;
    3) RandomForest;
    4) LogisticRegression.
    """
    
    import pandas as pd
    from glass.ng.rd import tbl_to_obj
    from glass.ng.wt import obj_to_tbl
    from glass.ng.clstxt import txt_to_num_representation
    
    # Data to Dataframe
    trainDf = tbl_to_obj(trainData) if type(trainData) != pd.DataFrame else trainData
    classDf = tbl_to_obj(classData) if type(classData) != pd.DataFrame else classData
    
    # Just in case, delete rows with NULL refCol and NULL dataCol
    trainDf = trainDf[pd.notnull(trainDf[trainClsCol])]
    trainDf = trainDf[pd.notnull(trainDf[trainRefCol])]
    classDf = classDf[pd.notnull(classDf[clsDataCol])]
    
    if method == 'NaiveBayes':
        from sklearn.naive_bayes             import MultinomialNB
        from sklearn.feature_extraction.text import CountVectorizer
        from sklearn.feature_extraction.text import TfidfTransformer
        
        """" Train Model """
        # X train is trainClsCol
        # Y train is trainRefCol
        x_train, y_train = trainDf[trainClsCol], trainDf[trainRefCol]
    
        count_vect = CountVectorizer()
    
        X_train_counts = count_vect.fit_transform(x_train)
    
        tfidf_transformer = TfidfTransformer()
    
        X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    
        clf = MultinomialNB().fit(X_train_tfidf, y_train)
    
        """ Predict """
        result = clf.predict(count_vect.transform(classDf[clsDataCol]))
    
        classDf['classification'] = result
    
    elif method == 'LinearSupportVectorMachine':
        from sklearn.svm import LinearSVC
        
        # Get features and Labels
        trainDf['ref_id'] = trainDf[trainRefCol].factorize()[0]
        labels = trainDf.ref_id
        
        features, tvect = txt_to_num_representation(
            trainDf, trainClsCol, __lang=lang, returnTfiDf=True)
        
        featTst = tvect.transform(classDf[clsDataCol])
        
        """ Train model """
        model = LinearSVC()
        
        model.fit(features, labels)
        
        y_pred = model.predict(featTst)
        
        classDf['classification'] = y_pred
        
        # Create Dataframe only with ref_id's, without duplicates
        ref_id_df = trainDf[[
            trainRefCol, 'ref_id'
        ]].drop_duplicates().sort_values('ref_id')
        ref_id_df.columns = ['class_name', 'ref_fid']
        
        classDf = classDf.merge(
            ref_id_df, how='inner',
            left_on='classification', right_on='ref_fid'
        )
        
        classDf.loc[:, 'classification'] = classDf.class_name
        
        classDf.drop(['ref_fid', 'class_name'], axis=1, inplace=True)
    
    elif method == 'RandomForest':
        from sklearn.ensemble import RandomForestClassifier
        # Get features
        
        features, tvect = txt_to_num_representation(
            trainDf, trainClsCol, __lang=lang, returnTfiDf=True)
        
        featTst = tvect.transform(classDf[clsDataCol])
        
        classifier = RandomForestClassifier(
            n_estimators=1000, random_state=0
        )
        classifier.fit(features, trainDf[trainRefCol])
        
        y_pred = classifier.predict(featTst)
        
        classDf['classification'] = y_pred
    
    elif method == 'LogisticRegression':
        from sklearn.feature_extraction.text import CountVectorizer
        from sklearn.feature_extraction.text import TfidfTransformer
        from sklearn.pipeline                import Pipeline
        from sklearn.linear_model            import LogisticRegression
        
        logreg = Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', LogisticRegression(n_jobs=1, C=1e5, multi_class='auto', solver='lbfgs')),
        ])
        
        logreg.fit(trainDf[trainClsCol], trainDf[trainRefCol])
        
        y_pred = logreg.predict(classDf[clsDataCol])
        
        classDf['classification'] = y_pred
    
    return obj_to_tbl(classDf, outfile)
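A usage sketch with hypothetical paths and columns, using the default NaiveBayes classifier:

# Hypothetical paths/columns
text_prediction(
    "/tmp/train.xlsx",        # labelled training table
    "/tmp/to_classify.xlsx",  # table with texts to classify
    "category",               # trainRefCol: class labels
    "text",                   # trainClsCol: training texts
    "text",                   # clsDataCol: texts to classify
    "/tmp/classified.xlsx",
    method='NaiveBayes', lang='english'
)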