Example 1
File: tw.py Project: jasp382/gasp
def tweets_to_xls(outxls,
                  searchword=None,
                  searchGeom=None,
                  srs=None,
                  lng='pt',
                  NTW=1000,
                  twType='mixed',
                  Key=None):
    """
    Search for Tweets and Export them to XLS
    """

    from gasp.to import obj_to_tbl

    data = tweets_to_df(keyword=searchword,
                        inGeom=searchGeom,
                        epsg=srs,
                        LANG=lng,
                        NTWEETS=NTW,
                        tweetType=twType,
                        apiKey=Key)

    try:
        # If tweets_to_df returned a falsy value (e.g. no results), stop here;
        # a DataFrame raises on the truth test, hence the try/except guard
        if not data:
            return 0
    except:
        pass

    obj_to_tbl(data, outxls, sheetsName='twitter')

    return outxls
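
A minimal usage sketch; the output path, search term, and API key object below are hypothetical placeholders.

# Hypothetical call: export up to 500 Portuguese tweets matching "lisboa"
my_api_key = None  # placeholder; whatever credentials object tweets_to_df expects
tweets_to_xls(
    '/tmp/tweets_lisboa.xlsx',
    searchword='lisboa',
    lng='pt', NTW=500, twType='mixed',
    Key=my_api_key
)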
Example 2
def join_tables_in_table(mainTable, mainIdField, joinTables, outTable):
    """
    Join one table with all tables in a folder
    
    joinTables = {
        r'D:\TRENMO_JASP\CARRIS\valid_by_para\period_16_17h59\sabado\fvalidacoes_v6_2018-01-06.xlsx' : {
            "JOIN_FIELD"    : 'paragem',
            "COLS_TO_JOIN"  : {'n_validacao' : 'dia_6'}
        },
        r'D:\TRENMO_JASP\CARRIS\valid_by_para\period_16_17h59\sabado\fvalidacoes_v6_2018-01-13.xlsx' : {
            "JOIN_FIELD"    : 'paragem',
            "COLS_TO_JOIN"  : {'n_validacao' : 'dia_13'}
        },
        r'D:\TRENMO_JASP\CARRIS\valid_by_para\period_16_17h59\sabado\fvalidacoes_v6_2018-01-20.xlsx' : {
            "JOIN_FIELD"    : 'paragem',
            "COLS_TO_JOIN"  : {'n_validacao' : 'dia_20'}
        },
        r'D:\TRENMO_JASP\CARRIS\valid_by_para\period_16_17h59\sabado\fvalidacoes_v6_2018-01-27.xlsx' : {
            "JOIN_FIELD"    : 'paragem',
            "COLS_TO_JOIN"  : {'n_validacao' : 'dia_27'}
        }
    }
    
    #TODO: only works with xlsx tables as join TABLES
    """
    
    # Modules
    import os
    import pandas
    from gasp.fm import tbl_to_obj
    from gasp.to import obj_to_tbl
    
    # Get table format
    tableType = os.path.splitext(mainTable)[1]
    
    tableDf = tbl_to_obj(mainTable)
    
    for table in joinTables:
        xlsDf = tbl_to_obj(table)
        
        join_field = 'id_entity' if joinTables[table]["JOIN_FIELD"] == mainIdField \
            else joinTables[table]["JOIN_FIELD"]
        
        if joinTables[table]["JOIN_FIELD"] == mainIdField:
            xlsDf.rename(columns={mainIdField : join_field}, inplace=True)
        
        xlsDf.rename(columns=joinTables[table]["COLS_TO_JOIN"], inplace=True)
        
        tableDf = tableDf.merge(
            xlsDf, how='outer', left_on=mainIdField,
            right_on=join_field
        )
        
        tableDf.fillna(0, inplace=True)
        tableDf[mainIdField].replace(0, tableDf[join_field], inplace=True)
        
        tableDf.drop(join_field, axis=1, inplace=True)
    
    obj_to_tbl(tableDf, outTable)
    
    return outTable
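
A minimal usage sketch following the joinTables structure documented above; the file paths and field names are hypothetical.

# Hypothetical call: join two daily validation tables onto a main stops table
join_tables_in_table(
    r'D:\data\paragens.xlsx', 'paragem',
    {
        r'D:\data\fvalidacoes_2018-01-06.xlsx': {
            "JOIN_FIELD"   : 'paragem',
            "COLS_TO_JOIN" : {'n_validacao': 'dia_6'}
        },
        r'D:\data\fvalidacoes_2018-01-13.xlsx': {
            "JOIN_FIELD"   : 'paragem',
            "COLS_TO_JOIN" : {'n_validacao': 'dia_13'}
        }
    },
    r'D:\data\paragens_joined.xlsx'
)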
Example 3
    def get_day_table(day):
        print('Starting: ' + day)

        if EXCLUDE_DAYS:
            if day in EXCLUDE_DAYS:
                print('Ending: ' + day)
                return 0

        COUNTING = []
        for __int in INTERVALS:
            start, end = __int
            COUNT_FIELD = 'p{}h{}_{}h{}'.format(str(start[0]), str(start[1]),
                                                str(end[0]), str(end[1]))

            if COUNT_FIELD not in INTERVAL_COLUMNS:
                INTERVAL_COLUMNS.append(COUNT_FIELD)

            countTbl = count_by_period_entity(psqldb, start, end, pgtable,
                                              DAY_FIELD, day, HOUR_FIELD,
                                              MINUTES_FIELD, ENTITY_FIELD)
            COUNTING.append(countTbl)

        main_table = COUNTING[0]
        for i in range(1, len(COUNTING)):
            main_table = combine_dfs(main_table, COUNTING[i], ENTITY_FIELD)

        if workspace_day_tables:
            obj_to_tbl(main_table,
                       os.path.join(workspace_day_tables,
                                    'ti_{}.xlsx'.format(day)))

        return main_table
Example 4
def count_entity_periods_with_certain_duration(db,
                                               PERIOD_INTERVAL,
                                               PGTABLE,
                                               TIME_FIELD,
                                               ENTITY_FIELD,
                                               OUT_TABLE,
                                               filterWhere=None):
    """
    Count rows in a pgtable for a given period of X minutes for each
    interest entity
    
    PERIOD_INTERVAL = "01:00:00"
    """

    import pandas
    from gasp.pyt.tm import day_to_intervals2
    from gasp.pyt.df.joins import combine_dfs
    from gasp.sql.fm import q_to_obj
    from gasp.to import obj_to_tbl

    # Get Intervals
    INTERVALS = day_to_intervals2(PERIOD_INTERVAL)

    # For each interval/period, count the number of rows by entity
    counting = []
    for _int in INTERVALS:
        Q = ("SELECT {entityCol}, COUNT({entityCol}) AS {cntCol} "
             "FROM {table} WHERE "
             "TO_TIMESTAMP({timeCol}, 'HH24:MI:SS') >= "
             "TO_TIMESTAMP('{minLower}', 'HH24:MI:SS') AND "
             "TO_TIMESTAMP({timeCol}, 'HH24:MI:SS') < "
             "TO_TIMESTAMP('{minUpper}', 'HH24:MI:SS'){whr} "
             "GROUP BY {entityCol}").format(cntCol="s{}_e{}".format(
                 _int[0][:5], _int[1][:5]).replace(":", "_"),
                                            table=PGTABLE,
                                            timeCol=TIME_FIELD,
                                            entityCol=ENTITY_FIELD,
                                            minLower=_int[0],
                                            minUpper=_int[1],
                                            whr="" if not filterWhere else
                                            " AND ({}) ".format(filterWhere))

        count = q_to_obj(db, Q, db_api='psql')

        counting.append(count)

    mainDf = combine_dfs(counting[0], counting[1:], ENTITY_FIELD)

    obj_to_tbl(mainDf, OUT_TABLE)

    return OUT_TABLE
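
A usage sketch with hypothetical table and field names; the first argument is assumed to be whatever database reference q_to_obj expects.

# Hypothetical call: count rows per 'paragem' in one-hour periods of 'val_time'
my_db = 'transit_db'  # assumed: the database reference expected by q_to_obj
count_entity_periods_with_certain_duration(
    my_db, "01:00:00", 'validations', 'val_time', 'paragem',
    '/tmp/counts_by_hour.xlsx',
    filterWhere="dia='2018-01-06'"
)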
Example 5
def count_by_periods_with_certain_duration(conParam, PERIOD_INTERVAL, pgtable,
                                           TIME_FIELD, outTable,
                                           filterWhere=None):
    """
    Count rows in a pgtable by periods of X minutes
    
    PERIOD_INTERVAL = "01:00:00"
    """
    
    import pandas
    from gasp import day_to_intervals2
    from gasp.fm.sql import query_to_df
    from gasp.to import obj_to_tbl
    
    # Get Intervals
    INTERVALS = day_to_intervals2(PERIOD_INTERVAL)
    
    # For each interval/period, count the number of rows
    counting = None
    for _int_ in INTERVALS:
        QUERY = (
            "SELECT COUNT(*) AS count FROM {table} WHERE "
            "TO_TIMESTAMP({timeCol}, 'HH24:MI:SS') >= "
            "TO_TIMESTAMP('{minLower}', 'HH24:MI:SS') AND "
            "TO_TIMESTAMP({timeCol}, 'HH24:MI:SS') < "
            "TO_TIMESTAMP('{minUpper}', 'HH24:MI:SS'){whr}"
        ).format(
            table    =  pgtable, timeCol  = TIME_FIELD,
            minLower = _int_[0], minUpper =   _int_[1],
            whr      = "" if not filterWhere else " AND ({})".format(
                filterWhere
            )
        )
        
        count = query_to_df(conParam, QUERY, db_api='psql')
        
        count.rename(index={0 : "{}-{}".format(
            _int_[0][:5], _int_[1][:5]
        )}, inplace=True)
        
        if type(counting) != pandas.DataFrame:
            counting = count.copy()
        
        else:
            counting = counting.append(count, ignore_index=False)
    
    obj_to_tbl(counting, outTable)
    
    return outTable
Example 6
def field_sum_two_tables(tableOne, tableTwo,
                         joinFieldOne, joinFieldTwo,
                         field_to_sum, outTable):
    """
    Sum same field in different tables
    
    Table 1:
    id | field
    0 |  10
    1 |  11
    2 |  13
    3 |  10
    
    Table 2:
    id | field
    0 |  10
    1 |   9
    2 |  17
    4 |  15
    
    Create the new table
    id | field
    0 |  20
    1 |  20
    2 |  30
    3 |  10
    4 |  15
    """
    
    from gasp.fm        import tbl_to_obj
    from gasp.to        import obj_to_tbl
    from gasp.mng.joins import sum_field_of_two_tables
    
    # Open two tables
    df_one = tbl_to_obj(tableOne)
    df_two = tbl_to_obj(tableTwo)
    
    # Do it!
    outDf = sum_field_of_two_tables(
        df_one, joinFieldOne,
        df_two, joinFieldTwo,
        field_to_sum
    )
    
    obj_to_tbl(outDf, outTable)
    
    return outTable
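
A usage sketch mirroring the tables in the docstring; the file paths are hypothetical.

# Hypothetical call: sum 'field' from two tables matched on 'id'
field_sum_two_tables(
    '/data/table_one.xlsx', '/data/table_two.xlsx',
    'id', 'id', 'field',
    '/data/summed.xlsx'
)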
Example 7
def show_duplicates_in_xls(db_name, table, pkCols, outFile, tableIsQuery=None):
    """
    Find duplicates and write these objects in a table
    """

    import pandas
    from gasp.pyt import obj_to_lst
    from gasp.sql.fm import q_to_obj
    from gasp.to import obj_to_tbl

    pkCols = obj_to_lst(pkCols)

    if not pkCols:
        raise ValueError("pkCols value is not valid")

    if not tableIsQuery:
        q = ("SELECT {t}.* FROM {t} INNER JOIN ("
             "SELECT {cls}, COUNT({cnt}) AS conta FROM {t} "
             "GROUP BY {cls}"
             ") AS foo ON {rel} "
             "WHERE conta > 1").format(t=table,
                                       cls=", ".join(pkCols),
                                       cnt=pkCols[0],
                                       rel=" AND ".join([
                                           "{t}.{c} = foo.{c}".format(t=table,
                                                                      c=col)
                                           for col in pkCols
                                       ]))

    else:
        q = ("SELECT foo.* FROM ({q_}) AS foo INNER JOIN ("
             "SELECT {cls}, COUNT({cnt}) AS conta "
             "FROM ({q_}) AS foo2 GROUP BY {cls}"
             ") AS jt ON {rel} "
             "WHERE conta > 1").format(q_=table,
                                       cls=", ".join(pkCols),
                                       cnt=pkCols[0],
                                       rel=" AND ".join([
                                           "foo.{c} = jt.{c}".format(c=x)
                                           for x in pkCols
                                       ]))

    data = q_to_obj(db_name, q, db_api='psql')

    obj_to_tbl(data, outFile)

    return outFile
Example 8
def record_time_consumed(timeData, outXls):
    """
    Record the time consumed by an OSM2LULC procedure version
    in an Excel table
    """

    import pandas
    from gasp.to import obj_to_tbl

    # Produce main table - Time consumed by rule
    main = [{
        'rule': timeData[i][0],
        'time': timeData[i][1]
    } for i in range(len(timeData.keys())) if timeData[i]]

    # Produce detailed table - Time consumed inside rules
    timeInsideRule = []
    timeDataKeys = sorted(timeData.keys())

    for i in timeDataKeys:
        if not timeData[i]:
            continue

        if len(timeData[i]) == 2:
            timeInsideRule.append({
                'rule': timeData[i][0],
                'task': timeData[i][0],
                'time': timeData[i][1]
            })

        elif len(timeData[i]) == 3:
            taskKeys = sorted(timeData[i][2].keys())
            for task in taskKeys:
                if not timeData[i][2][task]:
                    continue

                timeInsideRule.append({
                    'rule': timeData[i][0],
                    'task': timeData[i][2][task][0],
                    'time': timeData[i][2][task][1]
                })

        else:
            print('timeData object with key {} is not valid'.format(i))

    # Export tables to excel
    dfs = [pandas.DataFrame(main), pandas.DataFrame(timeInsideRule)]

    return obj_to_tbl(dfs, outXls, sheetsName=['general', 'detailed'])
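
From the way the function indexes timeData, it appears to expect a dict keyed by consecutive integers starting at 0, where each value is None, a (rule, time) pair, or a (rule, time, {task_index: (task, time)}) triple. A hypothetical input illustrating that assumed shape:

# Assumed structure of timeData (integer keys 0..n-1; values may be None)
timeData = {
    0: ('Rule 1', 12.5),
    1: ('Rule 2', 30.1, {
        0: ('task_a', 10.0),
        1: ('task_b', 20.1)
    }),
    2: None
}
record_time_consumed(timeData, '/tmp/osm2lulc_times.xlsx')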
Example 9
def model_conf_matrix(tblFile, refCol, clsCol, outMxt):
    """
    Model Evaluation
    """

    import pandas as pd
    from gasp.fm import tbl_to_obj
    from gasp.to import obj_to_tbl
    from sklearn.metrics import confusion_matrix, classification_report

    data = tbl_to_obj(tblFile)

    data[refCol] = data[refCol].astype(str)
    data[clsCol] = data[clsCol].astype(str)

    ref_id = data[[refCol]].drop_duplicates().sort_values(refCol)

    conf_mat = confusion_matrix(data[refCol], data[clsCol])

    mxt = pd.DataFrame(conf_mat,
                       columns=ref_id[refCol].values,
                       index=ref_id[refCol].values)
    mxt.reset_index(inplace=True)
    mxt.rename(columns={'index': 'confusion_mxt'}, inplace=True)

    # Get classification report
    report = classification_report(data[refCol],
                                   data[clsCol],
                                   target_names=ref_id[refCol],
                                   output_dict=True)

    global_keys = ['accuracy', 'macro avg', 'micro avg', 'weighted avg']

    cls_eval = {k: report[k] for k in report if k not in global_keys}
    glb_eval = {k: report[k] for k in report if k in global_keys}

    if 'accuracy' in glb_eval:
        glb_eval['accuracy'] = {
            'f1-score': glb_eval['accuracy'],
            'precision': 0,
            'recall': 0,
            'support': 0
        }

    cls_eval = pd.DataFrame(cls_eval).T
    gbl_eval = pd.DataFrame(glb_eval).T

    return obj_to_tbl([gbl_eval, cls_eval, mxt],
                      outMxt,
                      sheetsName=['global', 'report', 'matrix'])
Example 10
File: gen.py Project: jasp382/gasp
def merge_xls_in_folder(tbl_folder, out_table):
    """
    Get all excel tables in a folder and make one table of them
    """

    import pandas
    from gasp.pyt.oss import lst_ff
    from gasp.fm import tbl_to_obj
    from gasp.to import obj_to_tbl

    tables = lst_ff(tbl_folder, file_format=['.xls', '.xlsx'])

    dfs = [tbl_to_obj(table) for table in tables]

    result = pandas.concat(dfs)

    out_table = obj_to_tbl(result, out_table)

    return out_table
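
A one-line usage sketch; the folder and output paths are hypothetical.

# Hypothetical call: concatenate every .xls/.xlsx in a folder into one table
merge_xls_in_folder('/data/monthly_tables', '/data/all_months.xlsx')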
Example 11
def meanday_of_periods_by_entity(psqldb,
                                 pgtable,
                                 DAY_FIELD,
                                 HOUR_FIELD,
                                 MINUTES_FIELD,
                                 ENTITY_FIELD,
                                 OUTPUT_FILE,
                                 PERIODS=None,
                                 PERIODS_INTERVAL=None,
                                 EXCLUDE_DAYS=None,
                                 workspace_day_tables=None):
    """
    For every day in a pgtable, count the number of rows by periods of X minutes
    for each interest entity.
    
    At the end, calculate the mean between every day for each period.
    """

    import os
    import pandas
    from gasp.pyt.tm import day_to_intervals
    from gasp.pyt.df.joins import combine_dfs
    from gasp.sql.fm import q_to_obj
    from gasp.to import obj_to_tbl
    from gasp.sql.q.count import count_by_period_entity

    if not PERIODS and not PERIODS_INTERVAL:
        raise ValueError(
            ("Please give value to PERIODS or PERIODS_INTERAL. "
             "If PERIODS and PERIODS_INTERVAL, PERIODS will have priority."))

    # Get intervals
    INTERVALS = day_to_intervals(PERIODS_INTERVAL) if not PERIODS else PERIODS

    # Get unique values
    VALUES = q_to_obj(
        psqldb, "SELECT {col} FROM {t} GROUP BY {col}".format(
            col=DAY_FIELD, t=pgtable))[DAY_FIELD].tolist()

    DAYS_ARRAY = []
    INTERVAL_COLUMNS = []

    def get_day_table(day):
        print('Starting: ' + day)

        if EXCLUDE_DAYS:
            if day in EXCLUDE_DAYS:
                print('Ending: ' + day)
                return 0

        COUNTING = []
        for __int in INTERVALS:
            start, end = __int
            COUNT_FIELD = 'p{}h{}_{}h{}'.format(str(start[0]), str(start[1]),
                                                str(end[0]), str(end[1]))

            if COUNT_FIELD not in INTERVAL_COLUMNS:
                INTERVAL_COLUMNS.append(COUNT_FIELD)

            countTbl = count_by_period_entity(psqldb, start, end, pgtable,
                                              DAY_FIELD, day, HOUR_FIELD,
                                              MINUTES_FIELD, ENTITY_FIELD)
            COUNTING.append(countTbl)

        main_table = COUNTING[0]
        for i in range(1, len(COUNTING)):
            main_table = combine_dfs(main_table, COUNTING[i], ENTITY_FIELD)

        if workspace_day_tables:
            obj_to_tbl(main_table,
                       os.path.join(workspace_day_tables,
                                    'ti_{}.xlsx'.format(day)))

        return main_table

    for day in VALUES:
        t = get_day_table(day[0])
        if type(t) == int:
            continue
        else:
            DAYS_ARRAY.append(t)

        print('Ending: ' + day[0])

    main_table = DAYS_ARRAY[0]

    for i in range(1, len(DAYS_ARRAY)):
        join_field = 'id_entity'

        renameDict = {col: 'join_' + col for col in INTERVAL_COLUMNS}
        renameDict.update({ENTITY_FIELD: join_field})

        DAYS_ARRAY[i].rename(columns=renameDict, inplace=True)

        main_table = main_table.merge(DAYS_ARRAY[i],
                                      how='outer',
                                      left_on=ENTITY_FIELD,
                                      right_on=join_field)

        main_table.fillna(0, inplace=True)
        main_table[ENTITY_FIELD].replace(0,
                                         main_table[join_field],
                                         inplace=True)

        main_table.drop(join_field, axis=1, inplace=True)
        for k in INTERVAL_COLUMNS:
            main_table[k] = main_table[k] + main_table[renameDict[k]]
            main_table.drop(renameDict[k], axis=1, inplace=True)

    for col in INTERVAL_COLUMNS:
        main_table[col] = main_table[col] / len(DAYS_ARRAY)

    obj_to_tbl(main_table, OUTPUT_FILE)
Example 12
File: cls.py Project: jasp382/gasp
def model_selection(dataFile, refCol, dataCol, outTbl, lang='english', CV=5):
    """
    See which model is better to use in text classification for a specific
    data sample
    
    Compare:
    Logistic Regression (LogisticRegression)
    (Multinomial) Naive Bayes (MultinomialNB)
    Linear Support Vector Machine (LinearSVC)
    Random Forest (RandomForestClassifier)
    """
    
    import os
    import pandas as pd
    from gasp.pyt.oss                    import fprop
    from gasp.fm                         import tbl_to_obj
    from gasp.pyt.txtcls                 import txt_to_num_representation
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model            import LogisticRegression
    from sklearn.ensemble                import RandomForestClassifier
    from sklearn.svm                     import LinearSVC
    from sklearn.naive_bayes             import MultinomialNB
    from sklearn.model_selection         import cross_val_score
    from gasp.to                         import obj_to_tbl
    
    # Data to DataFrame
    trainDf = tbl_to_obj(dataFile)
    
    # Just in case, delete rows with NULL refCol and NULL dataCol
    trainDf = trainDf[pd.notnull(trainDf[dataCol])]
    trainDf = trainDf[pd.notnull(trainDf[refCol])]
    
    # Ref col to integers
    from io import StringIO
    
    trainDf['ref_id'] = trainDf[refCol].factorize()[0]
    
    # Text to numbers
    features = txt_to_num_representation(trainDf, dataCol, lang)
    
    labels = trainDf.ref_id
    
    """ Test Models """
    models = [
        RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
        LinearSVC(),
        MultinomialNB(),
        LogisticRegression(random_state=0)
    ]
    
    cv_df = pd.DataFrame(index=range(CV * len(models)))
    entries = []
    
    for model in models:
        m_name = model.__class__.__name__
        accuracies = cross_val_score(
            model, features, labels, scoring='accuracy', cv=CV
        )
        
        for fold_idx, accuracy in enumerate(accuracies):
            entries.append((m_name, fold_idx, accuracy))
    
    # Create and Export evaluation table
    cv_df = pd.DataFrame(
        entries, columns=['model_name', 'fold_idx', 'accuracy'])
    cv_df_gp = pd.DataFrame(cv_df.groupby('model_name').accuracy.mean())
    cv_df_gp.reset_index(inplace=True)
    
    # Export Graphic
    import seaborn as sns
        
    a = sns.boxplot(x='model_name', y='accuracy', data=cv_df)
        
    b = sns.stripplot(
        x='model_name', y='accuracy', data=cv_df,
        size=10, jitter=True, edgecolor="gray", linewidth=2)
        
    fig = b.get_figure()
    fig.savefig(os.path.join(
        os.path.dirname(outTbl), fprop(outTbl, 'fn') + '.png'
    ))
    
    return obj_to_tbl(cv_df_gp, outTbl)
Example 13
File: tags.py Project: jasp382/gasp
def get_not_used_tags(OSM_FILE, OUT_TBL):
    """
    Use a file OSM to detect tags not considered in the
    OSM2LULC procedure
    """

    import os
    from gasp.to import obj_to_tbl
    from gasp.gt.attr import sel_by_attr
    from gasp.sql.fm import q_to_obj
    from gasp.pyt.df.split import df_split
    from gasp.pyt.oss import fprop
    from gasp.gt.toshp.osm import osm_to_gpkg

    OSM_TAG_MAP = {
        "DB":
        os.path.join(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
            'osmtolulc.sqlite'),
        "OSM_FEAT":
        "osm_features",
        "KEY_COL":
        "key",
        "VALUE_COL":
        "value",
        "GEOM_COL":
        "geom"
    }

    WORKSPACE = os.path.dirname(OUT_TBL)

    sqdb = osm_to_gpkg(
        OSM_FILE, os.path.join(WORKSPACE,
                               fprop(OSM_FILE, 'fn') + '.gpkg'))

    # Get Features we are considering
    ourOSMFeatures = q_to_obj(
        OSM_TAG_MAP["DB"],
        ("SELECT {key} AS key_y, {value} AS value_y, {geom} AS geom_y "
         "FROM {tbl}").format(key=OSM_TAG_MAP["KEY_COL"],
                              value=OSM_TAG_MAP["VALUE_COL"],
                              geom=OSM_TAG_MAP["GEOM_COL"],
                              tbl=OSM_TAG_MAP["OSM_FEAT"]),
        db_api='sqlite')

    # Get Features in File
    TABLES_TAGS = {
        'points': ['highway', 'man_made', 'building'],
        'lines':
        ['highway', 'waterway', 'aerialway', 'barrier', 'man_made', 'railway'],
        'multipolygons': [
            'aeroway', 'amenity', 'barrier', 'building', 'craft', 'historic',
            'land_area', ''
            'landuse', 'leisure', 'man_made', 'military', 'natural', 'office',
            'place', 'shop', 'sport', 'tourism', 'waterway', 'power',
            'railway', 'healthcare', 'highway'
        ]
    }

    Qs = [
        " UNION ALL ".join([(
            "SELECT '{keycol}' AS key, {keycol} AS value, "
            "'{geomtype}' AS geom FROM {tbl} WHERE "
            "{keycol} IS NOT NULL"
        ).format(
            keycol=c, geomtype='Point' if table == 'points' else 'Line' \
                if table == 'lines' else 'Polygon',
            tbl=table
        ) for c in TABLES_TAGS[table]]) for table in TABLES_TAGS
    ]

    fileOSMFeatures = q_to_obj(sqdb,
                               ("SELECT key, value, geom FROM ({}) AS foo "
                                "GROUP BY key, value, geom").format(
                                    " UNION ALL ".join(Qs)),
                               db_api='sqlite')

    _fileOSMFeatures = fileOSMFeatures.merge(
        ourOSMFeatures,
        how='outer',
        left_on=["key", "value", "geom"],
        right_on=["key_y", "value_y", "geom_y"])

    # Select OSM Features of file without correspondence
    _fileOSMFeatures["isnew"] = _fileOSMFeatures.key_y.fillna(value='nenhum')

    newTags = _fileOSMFeatures[_fileOSMFeatures.isnew == 'nenhum']

    newTags["value"] = newTags.value.str.replace("'", "''")

    newTags["whr"] = newTags.key + "='" + newTags.value + "'"

    # Export tags not being used to new shapefile
    def to_regular_str(row):
        san_str = row.whr

        row["whr_san"] = san_str

        return row

    for t in TABLES_TAGS:
        if t == 'points':
            filterDf = newTags[newTags.geom == 'Point']

        elif t == 'lines':
            filterDf = newTags[newTags.geom == 'Line']

        elif t == 'multipolygons':
            filterDf = newTags[newTags.geom == 'Polygon']

        if filterDf.shape[0] > 500:
            dfs = df_split(filterDf, 500, nrows=True)
        else:
            dfs = [filterDf]

        Q = "SELECT * FROM {} WHERE {}".format(
            t, filterDf.whr.str.cat(sep=" OR "))

        i = 1
        for df in dfs:
            fn = t + '.shp' if len(dfs) == 1 else '{}_{}.shp'.format(t, str(i))
            try:
                shp = sel_by_attr(sqdb,
                                  Q.format(t, df.whr.str.cat(sep=" OR ")),
                                  os.path.join(WORKSPACE, fn),
                                  api_gis='ogr')
            except:
                __df = df.apply(lambda x: to_regular_str(x), axis=1)

                shp = sel_by_attr(sqdb,
                                  Q.format(t, __df.whr.str.cat(sep=" OR ")),
                                  os.path.join(WORKSPACE, fn))

            i += 1

    # Export OUT_TBL with tags not being used
    newTags.drop(['key_y', 'value_y', 'geom_y', 'isnew', 'whr'],
                 axis=1,
                 inplace=True)
    obj_to_tbl(newTags, OUT_TBL, sheetsName="new_tags", sanitizeUtf8=True)

    return OUT_TBL
Example 14
File: cls.py Project: jasp382/gasp
def text_prediction(trainData, classData, trainRefCol, trainClsCol, clsDataCol,
                    outfile, method='NaiveBayes', lang='english'):
    """
    Text classification
    
    Classifier Options:
    1) NaiveBayes;
    2) LinearSupportVectorMachine;
    3) RandomForest;
    4) LogisticRegression.
    """
    
    import pandas as pd
    from gasp.fm import tbl_to_obj
    from gasp.to import obj_to_tbl
    from gasp.pyt.txtcls import txt_to_num_representation
    
    # Data to Dataframe
    trainDf = tbl_to_obj(trainData) if type(trainData) != pd.DataFrame else  trainData
    classDf = tbl_to_obj(classData) if type(classData) != pd.DataFrame else classData
    
    # Just in case, delete rows with NULL refCol and NULL dataCol
    trainDf = trainDf[pd.notnull(trainDf[trainClsCol])]
    trainDf = trainDf[pd.notnull(trainDf[trainRefCol])]
    classDf = classDf[pd.notnull(classDf[clsDataCol])]
    
    if method == 'NaiveBayes':
        from sklearn.naive_bayes             import MultinomialNB
        from sklearn.feature_extraction.text import CountVectorizer
        from sklearn.feature_extraction.text import TfidfTransformer
        
        """" Train Model """
        # X train is trainClsCol
        # Y train is trainRefCol
        x_train, y_train = trainDf[trainClsCol], trainDf[trainRefCol]
    
        count_vect = CountVectorizer()
    
        X_train_counts = count_vect.fit_transform(x_train)
    
        tfidf_transformer = TfidfTransformer()
    
        X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    
        clf = MultinomialNB().fit(X_train_tfidf, y_train)
    
        """ Predict """
        result = clf.predict(count_vect.transform(classDf[clsDataCol]))
    
        classDf['classification'] = result
    
    elif method == 'LinearSupportVectorMachine':
        import numpy
        from sklearn.svm import LinearSVC
        
        # Get features and Labels
        trainDf['ref_id'] = trainDf[trainRefCol].factorize()[0]
        labels = trainDf.ref_id
        
        features, tvect = txt_to_num_representation(
            trainDf, trainClsCol, __lang=lang, returnTfiDf=True)
        
        featTst = tvect.transform(classDf[clsDataCol])
        
        """ Train model """
        model = LinearSVC()
        
        model.fit(features, labels)
        
        y_pred = model.predict(featTst)
        
        classDf['classification'] = y_pred
        
        # Create Dataframe only with ref_id's, without duplicates
        ref_id_df = trainDf[[
            trainRefCol, 'ref_id'
        ]].drop_duplicates().sort_values('ref_id')
        ref_id_df.columns = ['class_name', 'ref_fid']
        
        classDf = classDf.merge(
            ref_id_df, how='inner',
            left_on='classification', right_on='ref_fid'
        )
        
        classDf.loc[:, 'classification'] = classDf.class_name
        
        classDf.drop(['ref_fid', 'class_name'], axis=1, inplace=True)
    
    elif method == 'RandomForest':
        from sklearn.ensemble import RandomForestClassifier
        # Get features
        
        features, tvect = txt_to_num_representation(
            trainDf, trainClsCol, __lang=lang, returnTfiDf=True)
        
        featTst = tvect.transform(classDf[clsDataCol])
        
        classifier = RandomForestClassifier(
            n_estimators=1000, random_state=0
        )
        classifier.fit(features, trainDf[trainRefCol])
        
        y_pred = classifier.predict(featTst)
        
        classDf['classification'] = y_pred
    
    elif method == 'LogisticRegression':
        from sklearn.feature_extraction.text import CountVectorizer
        from sklearn.feature_extraction.text import TfidfTransformer
        from sklearn.pipeline                import Pipeline
        from sklearn.linear_model            import LogisticRegression
        
        logreg = Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', LogisticRegression(n_jobs=1, C=1e5, multi_class='auto', solver='lbfgs')),
        ])
        
        logreg.fit(trainDf[trainClsCol], trainDf[trainRefCol])
        
        y_pred = logreg.predict(classDf[clsDataCol])
        
        classDf['classification'] = y_pred
    
    return obj_to_tbl(classDf, outfile)
Example 15
File: freq.py Project: jasp382/gasp
def correlated_words(dataFile, refCol, dataCol, outTbl, lang='english', N=2,
                     refSheet=None):
    """
    Get words correlated with some text class 
    """
    
    import numpy as np
    import pandas as pd
    from sklearn.feature_selection import chi2
    from gasp.to                   import obj_to_tbl
    from gasp.fm                   import tbl_to_obj
    from gasp.pyt.txtcls           import txt_to_num_representation
    
    # Data to DataFrame
    trainDf = tbl_to_obj(
        dataFile, sheet=refSheet
    ) if type(dataFile) != pd.DataFrame else dataFile
    
    # Just in case, delete rows with NULL refCol and NULL dataCol
    trainDf = trainDf[pd.notnull(trainDf[dataCol])]
    trainDf = trainDf[pd.notnull(trainDf[refCol])]
    
    """
    Add a column encoding the reference classes as an integer because
    categorical variables are often better represented by integers
    than strings
    """
    
    from io import StringIO
    
    # Get a ID for Ref/text classes values
    trainDf['ref_id'] = trainDf[refCol].factorize()[0]
    
    # Create Dataframe only with ref_id's, without duplicates
    ref_id_df = trainDf[[refCol, 'ref_id']].drop_duplicates().sort_values(
        'ref_id'
    )
    
    # Create dicts to easy relate ref_id with ref_value
    ref_to_id = dict(ref_id_df.values)
    id_to_ref = dict(ref_id_df[['ref_id', refCol]].values)
    
    """
    Text to numbers
    """
    features, tfidf = txt_to_num_representation(
        trainDf, dataCol, lang, returnTfiDf=True)
    
    labels = trainDf.ref_id
    
    """
    Get most correlated words
    """
    
    corr_words = []
    for ref_name, ref_id in sorted(ref_to_id.items()):
        features_chi2 = chi2(features, labels == ref_id)
        
        indices = np.argsort(features_chi2[0])
        
        feat_names = np.array(tfidf.get_feature_names())[indices]
        
        unigrams = [v for v in feat_names if len(v.split(' ')) == 1][-N:]
        bigrams  = [v for v in feat_names if len(v.split(' ')) == 2][-N:]
        cols_d = [ref_name] + unigrams + bigrams
        
        corr_words.append(cols_d)
    
    COLS_NAME = ['ref_name'] + [
        'unigram_{}'.format(str(i+1)) for i in range(N)
    ] + [
        'bigram_{}'.format(str(i+1)) for i in range(N)
    ]
    dfCorrWords = pd.DataFrame(corr_words,columns=COLS_NAME)
    
    return obj_to_tbl(dfCorrWords, outTbl)
Example 16
File: exct.py Project: zonakre/gasp
def run_query_for_values_in_col(conParam, query, table_interest_col,
                                interest_col, outworkspace):
    """
    Execute a query for each distinct value in one column.
    In each iteration, the current value is substituted into the query.
    
    Export the several tables to excel
    
    Example:
    ID_PERCURSO | PARAGEM |    DIA     | GEOM
        0       |   255   |'2018-01-01 | xxxx
        0       |   255   |'2018-01-01 | xxxx
        0       |   254   |'2018-01-01 | xxxx
        0       |   254   |'2018-01-01 | xxxx
        0       |   255   |'2018-01-02 | xxxx
        0       |   255   |'2018-01-02 | xxxx
        0       |   254   |'2018-01-02 | xxxx
        0       |   254   |'2018-01-02 | xxxx
    
    For a query as:
    SELECT ID_PERCURSO, PARAGEM, GEOM, DIA, COUNT(PARAGEM) AS conta FROM
    table WHERE DIA={} GROUP BY PARAGEM, GEOM, DIA;
    
    This method will generate two tables:
    First table:
    ID_PERCURSO | PARAGEM |    DIA     | GEOM | conta
         0     |   255   |'2018-01-01 | xxxx |   2
         0     |   254   |'2018-01-01 | xxxx |   2
    
    Second table:
    ID_PERCURSO | PARAGEM |    DIA     | GEOM | conta
          0     |   255   |'2018-01-02 | xxxx |   2
          0     |   254   |'2018-01-02 | xxxx |   2
    
    {} will be replaced by each value of interest_col, with the values
    iterated one by one
    """

    import os
    from gasp.fm.sql import query_to_df
    from gasp.sql.mng.fld import get_columns_type
    from gasp.to import obj_to_tbl

    fields_types = get_columns_type(conParam, table_interest_col)

    # Get  unique values
    VALUES = query_to_df(conParam,
                         "SELECT {col} FROM {t} GROUP BY {col}".format(
                             col=interest_col, t=table_interest_col),
                         db_api='psql')[interest_col].tolist()

    # Apply the query for every value in VALUES
    # Write data in excel
    for value in VALUES:
        data = query_to_df(conParam, query.format(
            str(value[0]) if fields_types[interest_col] != str \
            else "'{}'".format(str(value[0]))
        ), db_api='psql')

        obj_to_tbl(
            data,
            os.path.join(
                outworkspace, '{}_{}.xlsx'.format(table_interest_col,
                                                  str(value[0]))))
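
A usage sketch following the docstring's example, where the {} placeholder in the query is replaced by each distinct DIA value; the connection dictionary shape, table, and paths are hypothetical.

# Hypothetical call: run the per-day count query once per distinct DIA value
con_psql = {
    'HOST': 'localhost', 'PORT': '5432', 'DATABASE': 'transit',
    'USER': 'postgres', 'PASSWORD': 'admin'
}  # assumed shape of the connection parameters
run_query_for_values_in_col(
    con_psql,
    ("SELECT ID_PERCURSO, PARAGEM, GEOM, DIA, COUNT(PARAGEM) AS conta "
     "FROM mytable WHERE DIA={} GROUP BY ID_PERCURSO, PARAGEM, GEOM, DIA"),
    'mytable', 'DIA', '/tmp/by_day'
)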
Example 17
def binary_eval(refTbl,
                refId,
                refCol,
                tstTbl,
                tstId,
                outTbl=None,
                tstCol=None):
    """
    Evaluation of a binary classification
    
    When tstCol is None, the script assumes that in tstTbl
    there are only positives
    
    The reference table must contain positives and negatives;
    the test table may contain only positives.
    """

    import numpy as np
    import pandas
    import math
    from gasp.fm import tbl_to_obj
    from gasp.to import obj_to_tbl

    # Data to Pandas Dataframe
    ref_df = tbl_to_obj(refTbl, fields=[
        refId, refCol
    ]) if type(refTbl) != pandas.DataFrame else refTbl[[refId, refCol]]
    tst_df = tbl_to_obj(
        tstTbl, fields=[tstId] if not tstCol else [tstId, tstCol]
    ) if type(refTbl) != pandas.DataFrame else tstTbl[[tstId]] \
        if not tstCol else tstTbl[[tstId, tstCol]]

    # Check if refId is equal to tstId; they must be different
    if refId == tstId:
        colRename = {tstId: 'tst_fid__'}

        # Do the same for refCol and tstCol
        if refCol == tstCol:
            colRename[tstCol] = 'tst_col__'

        tst_df.rename(columns=colRename, inplace=True)
        tstId = 'tst_fid__'

        if refCol == tstCol:
            tstCol = 'tst_col__'

    df = ref_df.merge(tst_df, how='left', left_on=refId, right_on=tstId)

    # Check if we have a tstCol
    if not tstCol:
        df[tstId].fillna('None', inplace=True)

        tstCol = 'cls_tst'
        df[tstCol] = np.where(df[tstId] == 'None', 0, 1)

    # Get VP, VN, FP, FN
    df['confusion'] = np.where(
        (df[refCol] == 1) & (df[tstCol] == 1), 'VP',
        np.where((df[refCol] == 0) & (df[tstCol] == 0), 'VN',
                 np.where((df[refCol] == 1) & (df[tstCol] == 0), 'FN', 'FP')))

    # Summary table
    conf_tbl = pandas.DataFrame()
    conf_tbl['nrows'] = df.groupby(['confusion'])[refId].nunique()

    conf_tbl.reset_index(inplace=True)

    conf_tbl['percentage'] = (conf_tbl.nrows * 100) / df.shape[0]

    # Get some evaluation mesures
    dConf = {}

    for row in conf_tbl.to_dict(orient='records'):
        dConf[row['confusion']] = row['nrows']

    l = ['VP', 'VN', 'FP', 'FN']
    for i in l:
        if i not in dConf:
            dConf[i] = 0
    """
    Error rate

    Error rate (ERR) is calculated as the number of all
    incorrect predictions divided by the total number of
    the dataset. The best error rate is 0.0, whereas the
    worst is 1.0.
    """

    ERR = (dConf['FP'] + dConf['FN']) / (dConf['VP'] + dConf['VN'] +
                                         dConf['FN'] + dConf['FP'])
    """
    Accuracy

    Accuracy (ACC) is calculated as the number of all correct
    predictions divided by the total number of the dataset.
    The best accuracy is 1.0, whereas the worst is 0.0. It can
    also be calculated by 1 – ERR.
    """

    ACC = (dConf['VP'] + dConf['VN']) / (dConf['VP'] + dConf['VN'] +
                                         dConf['FN'] + dConf['FP'])
    """
    Sensitivity (Recall or True positive rate)
    
    Sensitivity (SN) is calculated as the number of correct
    positive predictions divided by the total number of positives.
    It is also called recall (REC) or true positive rate (TPR).
    The best sensitivity is 1.0, whereas the worst is 0.0.
    """

    try:
        SN = dConf['VP'] / (dConf['VP'] + dConf['FN'])
    except:
        SN = -99
    """
    Specificity (True negative rate)

    Specificity (SP) is calculated as the number of correct negative
    predictions divided by the total number of negatives. It is
    also called true negative rate (TNR). The best specificity is 1.0,
    whereas the worst is 0.0.
    """

    SP = dConf['VN'] / (dConf['VN'] + dConf['FP'])
    """
    Precision (Positive predictive value)

    Precision (PREC) is calculated as the number of correct
    positive predictions divided by the total number of positive
    predictions. It is also called positive predictive value (PPV).
    The best precision is 1.0, whereas the worst is 0.0.
    """

    PREC = dConf["VP"] / (dConf["VP"] + dConf['FP'])
    """
    False positive rate

    False positive rate (FPR) is calculated as the number of
    incorrect positive predictions divided by the total number
    of negatives. The best false positive rate is 0.0 whereas the
    worst is 1.0. It can also be calculated as 1 – specificity.
    """

    FPR = dConf['FP'] / (dConf['VN'] + dConf['FP'])
    """
    Matthews correlation coefficient

    Matthews correlation coefficient (MCC) is a correlation
    coefficient calculated using all four values in the
    confusion matrix.
    """
    try:
        MCC = (dConf['VP'] * dConf['VN'] -
               dConf['FP'] * dConf['FN']) / (math.sqrt(
                   (dConf['VP'] + dConf['FP']) * (dConf['VP'] + dConf['FN']) *
                   (dConf['VN'] + dConf['FP']) * (dConf['VN'] + dConf['FN'])))
    except:
        MCC = -99
    """
    F-score

    F-score is a harmonic mean of precision and recall.
    """

    F0_5 = ((1 + 0.5**2) * (PREC * SN)) / (0.5**2 * PREC + SN)
    F_1 = (2 * PREC * SN) / (PREC + SN)
    F_2 = (5 * PREC * SN) / (4 * PREC + SN)

    evalMeasures = pandas.DataFrame(
        [['Error rate', ERR], ['Accuracy', ACC], ['Sensitivity', SN],
         ['Specificity', SP], ['Precision', PREC], [
             'False positive rate', FPR
         ], ['Matthews correlation coefficient', MCC], ['F-score 0.5', F0_5],
         ['F-score 1', F_1], ['F-score 2', F_2]],
        columns=['eval_mesure', 'value'])

    if outTbl:
        return obj_to_tbl([conf_tbl, evalMeasures, df],
                          outTbl,
                          sheetsName=['matrix', 'eval_mesures', 'tbl'])
    else:
        return conf_tbl, evalMeasures, df
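
A usage sketch, assuming the reference table stores classes as 1/0 in refCol and the test table lists only predicted positives; paths and field names are hypothetical.

# Hypothetical call: reference table has 0/1 classes; test table has only positives
binary_eval(
    '/data/reference.xlsx', 'fid', 'is_positive',
    '/data/classified_positives.xlsx', 'fid',
    outTbl='/data/binary_eval.xlsx'
)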
Example 18
def count_by_groupcols_and_periods(conParam, pgtable, COLUMNS_TO_GROUP,
                                   HOUR_FIELD, MINUTES_FIELD, COUNT_FIELD_NAME,
                                   OUTPUT_FILE,
                                   PERIOD_INTERVAL=None, PERIODS=None):
    """
    Count rows in a pgtable by periods of X minutes, grouping by column values
    """
    
    from gasp import day_to_intervals
    from gasp.fm.sql import query_to_df
    from gasp.to import obj_to_tbl
    
    if not PERIODS and not PERIOD_INTERVAL:
        raise ValueError((
            "Please give a value to PERIODS or PERIOD_INTERVAL. "
            "If both are given, PERIODS will have priority."
        ))
    
    INTERVALS = day_to_intervals(PERIOD_INTERVAL) if not PERIODS else PERIODS
    
    i = 0
    for interval in INTERVALS:
        start, end = interval
        
        INTERVAL_STR = '{}h{}-{}h{}'.format(start[0], start[1], end[0], end[1])
        
        if start[0] == end[0]:
            QUERY = (
                "SELECT {cols}, COUNT({col}) AS {countname} FROM {table} "
                "WHERE {hourF}={hour} AND "
                "{minF} >= {minLower} AND {minF} < {minUpper} "
                "GROUP BY {cols}"
            ).format(
                table=pgtable, cols=', '.join(COLUMNS_TO_GROUP),
                col=COLUMNS_TO_GROUP[0], countname=COUNT_FIELD_NAME,
                hourF=HOUR_FIELD, hour=str(start[0]),
                minF=MINUTES_FIELD, minLower=str(start[1]), minUpper=str(end[1])
            )
        
        else:
            if end[0] - start[0] == 1:
                QUERY = (
                    "SELECT {cols}, COUNT({col}) AS {countname} FROM {table} "
                    "WHERE ({hourF}={hourLower} AND {minF}>={minLower}) OR "
                    "({hourF}={hourUpper} AND {minF} < {minUpper}) "
                    "GROUP BY {cols}"
                ).format(
                    table=pgtable, cols=', '.join(COLUMNS_TO_GROUP),
                    col=COLUMNS_TO_GROUP[0], countname=COUNT_FIELD_NAME,
                    hourF=HOUR_FIELD, hourLower=str(start[0]), hourUpper=str(end[0]),
                    minF=MINUTES_FIELD, minLower=str(start[1]), minUpper=str(end[1])
                )
            
            else:
                mHours = [start[0] + i for i in range(1, end[0] - start[0])]
                
                QUERY = (
                    "SELECT {cols}, COUNT({col}) AS {countname} FROM {table} "
                    "WHERE ({hourF}={hourLower} AND {minF}>={minLower}) OR "
                    "{mean_hours_exp} OR "
                    "({hourF}={hourUpper} AND {minF} < {minUpper}) "
                    "GROUP BY {cols}"
                ).format(
                    table=pgtable, cols=', '.join(COLUMNS_TO_GROUP),
                    col=COLUMNS_TO_GROUP[0], countname=COUNT_FIELD_NAME,
                    hourF=HOUR_FIELD, hourLower=str(start[0]), hourUpper=str(end[0]),
                    minF=MINUTES_FIELD, minLower=str(start[1]), minUpper=str(end[1]),
                    mean_hours_exp=" OR ".join([
                        "({}={} AND {} >= 0)".format(
                            HOUR_FIELD, h, MINUTES_FIELD
                        ) for h in mHours
                    ])
                )
        
        countTbl = query_to_df(conParam, QUERY, db_api='psql')
        
        countTbl[HOUR_FIELD] = INTERVAL_STR
        
        if not i:
            table = countTbl
            i+=1
        else:
            table = table.append(countTbl, ignore_index=True)
    
    obj_to_tbl(table, OUTPUT_FILE)
Example 19
def meanrowsday_of_periods_by_entity(psql_con,
                                     pgtable,
                                     dayField,
                                     hourField,
                                     minutesField,
                                     secondField,
                                     entityField,
                                     PERIODS,
                                     outFile,
                                     filterData=None,
                                     numberDays=None):
    """
    Evolution of meanday_of_periods_by_entity:
    For every day in a pgtable, count the number of rows by periods of X minutes
    for each interest entity.
    
    At the end, calculate the mean between every day for each period.
    
    This method uses SQL and TimeInterval columns.
    
    PERIODS = [('07:30:00', '09:30:00'), ('07:30:00', '09:30:00')]
    
    It is not complete because the output table does not have a column for
    each period
    """

    import pandas
    from gasp.pyt import obj_to_lst
    from gasp.sql.fm import q_to_obj
    from gasp.to import obj_to_tbl

    def get_case(PTUPLE, PFIELD):
        return ("CASE "
                "WHEN TO_TIMESTAMP("
                "COALESCE(CAST({h} AS text), '') || ':' || "
                "COALESCE(CAST({m} AS text), '') || ':' || "
                "COALESCE(CAST({s} AS text), ''), 'HH24:MI:SS'"
                ") >= TO_TIMESTAMP('{tLower}', 'HH24:MI:SS') AND "
                "TO_TIMESTAMP("
                "COALESCE(CAST({h} AS text), '') || ':' || "
                "COALESCE(CAST({m} AS text), '') || ':' || "
                "COALESCE(CAST({s} AS text), ''), 'HH24:MI:SS'"
                ") < TO_TIMESTAMP('{tUpper}', 'HH24:MI:SS') "
                "THEN 1 ELSE 0 "
                "END AS {fld}").format(h=hourField,
                                       m=minutesField,
                                       s=secondField,
                                       tLower=PTUPLE[0],
                                       tUpper=PTUPLE[1],
                                       fld=PFIELD)

    entityField = obj_to_lst(entityField)

    periodsCols = [
        "p{ha}h{ma}_{hb}h{mb}".format(ha=p[0].split(':')[0],
                                      ma=p[0].split(':')[1],
                                      hb=p[1].split(':')[0],
                                      mb=p[1].split(':')[1]) for p in PERIODS
    ]

    ndaysQ = "SELECT {} AS nday".format(numberDays) if numberDays else \
        ("SELECT MAX(nday) AS nday FROM ("
            "SELECT row_number() OVER(ORDER BY {dayF}) AS nday "
            "FROM {t} {whr}"
            "GROUP BY {dayF}"
        ") AS dayt")

    # Get mean rows of all days by entity and period
    q = ("SELECT {entityF}, {meanSq}, nday FROM ("
         "SELECT {entityF}, {dayF}, {sumSeq} FROM ("
         "SELECT {entityF}, {dayF}, {caseSt} FROM {t} {whr}"
         ") AS foo "
         "WHERE {whrSq} "
         "GROUP BY {entityF}, {dayF}"
         ") AS foo2, ({getND}) AS fooday "
         "GROUP BY {entityF}, nday").format(
             entityF=", ".join(entityField),
             meanSq=", ".join([
                 "(SUM({f}) / nday) AS {f}".format(f=p) for p in periodsCols
             ]),
             dayF=dayField,
             sumSeq=", ".join(
                 ["SUM({f}) AS {f}".format(f=p) for p in periodsCols]),
             caseSt=", ".join([
                 get_case(PERIODS[x], periodsCols[x])
                 for x in range(len(PERIODS))
             ]),
             t=pgtable,
             whr="" if not filterData else "WHERE {} ".format(filterData),
             whrSq=" OR ".join(["{}=1".format(p) for p in periodsCols]),
             getND=ndaysQ)

    data = q_to_obj(psql_con, q, db_api='psql')

    obj_to_tbl(data, outFile)

    return outFile
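
A usage sketch using the PERIODS format shown in the docstring; the database reference, table, and field names are hypothetical.

# Hypothetical call: mean rows per 'paragem' for two periods of the day
meanrowsday_of_periods_by_entity(
    'transit_db',  # assumed: the connection reference expected by q_to_obj
    'validations', 'dia', 'hora', 'minuto', 'segundo', 'paragem',
    [('07:30:00', '09:30:00'), ('17:30:00', '19:30:00')],
    '/tmp/mean_by_period.xlsx'
)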
Example 20
def get_not_used_tags(OSM_FILE, OUT_TBL):
    """
    Use a file OSM to detect tags not considered in the
    OSM2LULC procedure
    """

    import os
    from gasp.anls.exct import sel_by_attr
    from gasp.fm.sql import query_to_df
    from gasp.oss import get_filename
    from gasp.osm2lulc.utils import osm_to_sqdb
    from gasp.to import obj_to_tbl

    OSM_TAG_MAP = {
        "DB":
        os.path.join(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
            'osmtolulc.sqlite'),
        "OSM_FEAT":
        "osm_features",
        "KEY_COL":
        "key",
        "VALUE_COL":
        "value",
        "GEOM_COL":
        "geom"
    }

    WORKSPACE = os.path.dirname(OUT_TBL)

    sqdb = osm_to_sqdb(
        OSM_FILE, os.path.join(WORKSPACE,
                               get_filename(OSM_FILE) + '.sqlite'))

    # Get Features we are considering
    ourOSMFeatures = query_to_df(
        OSM_TAG_MAP["DB"],
        ("SELECT {key} AS key_y, {value} AS value_y, {geom} AS geom_y "
         "FROM {tbl}").format(key=OSM_TAG_MAP["KEY_COL"],
                              value=OSM_TAG_MAP["VALUE_COL"],
                              geom=OSM_TAG_MAP["GEOM_COL"],
                              tbl=OSM_TAG_MAP["OSM_FEAT"]),
        db_api='sqlite')

    # Get Features in File
    TABLES_TAGS = {
        'points': ['highway', 'man_made', 'building'],
        'lines':
        ['highway', 'waterway', 'aerialway', 'barrier', 'man_made', 'railway'],
        'multipolygons': [
            'aeroway', 'amenity', 'barrier', 'building', 'craft', 'historic',
            'land_area', ''
            'landuse', 'leisure', 'man_made', 'military', 'natural', 'office',
            'place', 'shop', 'sport', 'tourism', 'waterway', 'power',
            'railway', 'healthcare', 'highway'
        ]
    }

    Qs = [
        " UNION ALL ".join([(
            "SELECT '{keycol}' AS key, {keycol} AS value, "
            "'{geomtype}' AS geom FROM {tbl} WHERE "
            "{keycol} IS NOT NULL"
        ).format(
            keycol=c, geomtype='Point' if table == 'points' else 'Line' \
                if table == 'lines' else 'Polygon',
            tbl=table
        ) for c in TABLES_TAGS[table]]) for table in TABLES_TAGS
    ]

    fileOSMFeatures = query_to_df(sqdb,
                                  ("SELECT key, value, geom FROM ({}) AS foo "
                                   "GROUP BY key, value, geom").format(
                                       " UNION ALL ".join(Qs)),
                                  db_api='sqlite')

    _fileOSMFeatures = fileOSMFeatures.merge(
        ourOSMFeatures,
        how='outer',
        left_on=["key", "value", "geom"],
        right_on=["key_y", "value_y", "geom_y"])

    # Select OSM Features of file without correspondence
    _fileOSMFeatures["isnew"] = _fileOSMFeatures.key_y.fillna(value='nenhum')

    newTags = _fileOSMFeatures[_fileOSMFeatures.isnew == 'nenhum']

    newTags["value"] = newTags.value.str.replace("'", "''")

    newTags["whr"] = newTags.key.str.encode('utf-8').astype(str) + "='" + \
        newTags.value.str.encode('utf-8').astype(str) + "'"

    # Export OUT_TBL with tags not being used
    obj_to_tbl(newTags, OUT_TBL, sheetsName="new_tags", sanitizeUtf8=True)

    # Export tags not being used to new shapefile
    def to_regular_str(row):
        from gasp import unicode_to_str

        san_str = unicode_to_str(row.whr)

        row["whr_san"] = san_str

        return row

    for t in TABLES_TAGS:
        if t == 'points':
            filterDf = newTags[newTags.geom == 'Point']

        elif t == 'lines':
            filterDf = newTags[newTags.geom == 'Line']

        elif t == 'multipolygons':
            filterDf = newTags[newTags.geom == 'Polygon']

        # NOTE: Python 2 idiom (unicode); a Python 3 variant of this function
        # appears earlier in this listing
        Q = unicode("SELECT * FROM {} WHERE {}",
                    'utf-8').format(unicode(t, 'utf-8'),
                                    filterDf.whr.str.cat(sep=" OR "))

        try:
            shp = sel_by_attr(sqdb,
                              Q,
                              os.path.join(WORKSPACE, t + '.shp'),
                              api_gis='ogr')
        except:
            __filterDf = filterDf.apply(lambda x: to_regular_str(x), axis=1)

            _Q = "SELECT * FROM {} WHERE {}".format(
                t, __filterDf.whr_san.str.cat(sep=" OR "))

            shp = sel_by_attr(sqdb, _Q, os.path.join(WORKSPACE, t + '.shp'))

    return OUT_TBL
Example 21
File: time.py Project: zonakre/gasp
def ID_rows_with_temporal_proximity_by_entities(conParam, table, entity_field,
                                 day_field, hour_field, hour_decimal_field,
                                 time_tolerance, outXlsPath):
    """
    Retrieve rows from one pgtable with some temporal proximity
    
    Table structure should be
    entity |     day    | hour | hour_decimal
      0    | 2018-01-02 |  5   |   5,10
      0    | 2018-01-03 |  4   |   4,15
      0    | 2018-01-02 |  5   |   5,12
      0    | 2018-01-02 |  5   |   5,8
      1    | 2018-01-02 |  4   |   4,10
      1    | 2018-01-02 |  5   |   5,12
      1    | 2018-01-02 |  4   |   4,20
      1    | 2018-01-02 |  4   |   4,12
      1    | 2018-01-02 |  4   |   4,6
    
    For a time_tolerance of 5 minutes, the output table will have
    the rows with a temporal difference within/below that time tolerance
    
    entity_field could be more than one field
    
    This method only identifies if one entity, for one day, has rows
    very close to each other in terms of time.
    
    Not a good strategy for large tables. For large tables, SQL based methods
    are needed
    """
    
    import pandas
    from gasp             import goToList
    from gasp.fm.sql      import query_to_df
    from gasp.sql.mng.fld import get_columns_type
    from gasp.to          import obj_to_tbl
    
    entity_field = goToList(entity_field)
    COLS = entity_field + [day_field, hour_field]
    COLS_TYPE = get_columns_type(conParam, table)
    
    # TIME TOLERANCE IN HOURS
    TIME_TOLERANCE = time_tolerance / 60.0
    
    def thereIsRowsSameTimeInt(row):
        whr = []
        for c in COLS:
            if COLS_TYPE[c] == str:
                whr.append("{}='{}'".format(c, row[c]))
            else:
                whr.append("{}={}".format(c, row[c]))
        
        hourRows = query_to_df(conParam,
            "SELECT {} FROM {} WHERE {}".format(
                hour_decimal_field, table,
                " AND ".join(whr)
            ), db_api='psql'
        )[hour_decimal_field].tolist()
        
        for i in range(len(hourRows)):
            for e in range(i+1, len(hourRows)):
                dif = abs(hourRows[i][0] - hourRows[e][0])
                
                if dif < TIME_TOLERANCE:
                    break
            
            if dif < TIME_TOLERANCE:
                break
        
        if dif < TIME_TOLERANCE:
            row['time_difference'] = 1
        else:
            row['time_difference'] = 0
        
        return row
    
    # Count entity occurrences for one day and hour
    countsByEntityTime = query_to_df(conParam, (
        "SELECT {scols}, conta FROM "
        "(SELECT {scols}, COUNT({ent}) AS conta FROM {tbl} "
        "GROUP BY {scols}) AS foo WHERE conta > 1"
    ).format(
        scols = ', '.join(COLS),
        ent = entity_field[0],
        tbl = table
    ), db_api='psql')
    
    # For each row in the last count, When count is > 1
    # Check time difference between rows for one day and hour
    countsByEntityTime = countsByEntityTime.apply(
        lambda x: thereIsRowsSameTimeInt(x), axis=1
    )
    
    obj_to_tbl(countsByEntityTime, outXlsPath)
    
    return outXlsPath
Example 22
                df = df[~df.b_refid.isnull()]

            if fn == 'ovl_union':
                df['areav'] = df.geometry.area

            df = pd.DataFrame({
                'areav':
                df.groupby(['a_FID'])['areav'].agg('sum')
            }).reset_index()

            fish_df = fish_df.merge(df,
                                    how='left',
                                    left_on='fid',
                                    right_on='a_FID')

            if fn != 'ovl_union':
                fish_df[fn] = fish_df.areav * 100 / fish_df.area

            else:
                fish_df['overlay'] = fish_df.areav * 100 / fish_df.area

            fish_df.drop(['areav', 'a_FID'], axis=1, inplace=True)

        # Save file
        df_to_shp(fish_df, os.path.join(results, os.path.basename(fishp)))

    # Write List of Fishnet
    from gasp.to import obj_to_tbl

    obj_to_tbl(df_fnet, os.path.join(results, 'fishnet_list.xlsx'))
Example 23
def meanrowsday_by_entity(psqldb,
                          pgtable,
                          dayField,
                          entityField,
                          out_file,
                          filterData=None,
                          newMeanField=None,
                          numberDays=None):
    """
    For every day in a pgtable, count the number of rows for each interest entity.
    At the end, calculate the mean of rows between every day for each entity.
    
    Day field must be of type text
    
    Difference in relation to meandays_by_entity:
    this one uses only SQL and PGSQL and not Pandas.
    
    if numberDays=None, the number of days used will be based on the days
    included in the data. If you want the mean for 5 days, but there are no data
    for one of these days, with numberDays=None, the mean will be only for
    4 days.
    """

    import pandas
    from gasp.pyt import obj_to_lst
    from gasp.sql.fm import q_to_obj
    from gasp.to import obj_to_tbl

    entityField = obj_to_lst(entityField)
    mean_field = "mean_rows" if not newMeanField else newMeanField

    ndaysQ = "SELECT {} AS nday".format(numberDays) if numberDays else \
        ("SELECT MAX(nday) AS nday FROM ("
            "SELECT row_number() OVER(ORDER BY {dayF}) AS nday "
            "FROM {t} {whr}"
            "GROUP BY {dayF}"
        ") AS fooday").format(
            whr="" if not filterData else "WHERE {} ".format(filterData),
            dayF=dayField, t=pgtable
        )

    # Get mean rows of all days by entity
    q = ("SELECT {entityF}, (SUM(conta) / nday) AS {mF} "
         "FROM ("
         "SELECT {entityF}, {dayF}, COUNT({cnt}) AS conta "
         "FROM {t} {whr}"
         "GROUP BY {entityF}, {dayF}"
         ") AS foo, ({getD}) AS foo2 "
         "GROUP BY {entityF}, nday").format(
             entityF=", ".join(entityField),
             dayF=dayField,
             mF=mean_field,
             cnt=entityField[0],
             t=pgtable,
             whr="" if not filterData else "WHERE {} ".format(filterData),
             getD=ndaysQ)

    data = q_to_obj(psqldb, q, db_api='psql')

    obj_to_tbl(data, out_file)

    return out_file
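A hedged usage sketch for meanrowsday_by_entity; the database reference, table and field names below are illustrative only (the exact connection object expected by q_to_obj depends on the gasp configuration):

meanrowsday_by_entity(
    'transit_db',                      # hypothetical database reference
    'validations',                     # hypothetical PostgreSQL table
    dayField='day', entityField='stop_id',
    out_file='/tmp/mean_rows_by_stop.xlsx',
    filterData="route='28E'",          # optional SQL WHERE clause
    newMeanField='mean_valid',
    numberDays=5                       # force a fixed 5-day denominator
)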
Esempio n. 24
0
def meandays_by_entity(db,
                       pgtable,
                       DAY_FIELD,
                       ENTITY_FIELD,
                       COUNT_FIELD_NAME,
                       OUTPUT_FILE,
                       EXCLUDE_DAYS=None):
    """
    For every day in a pgtable, count the number of rows for each interest entity.
    At the end, calculate the mean of rows between every day for each entity.
    
    Day field must be of type text
    """

    import os
    import pandas
    from gasp.sql.fm import q_to_obj
    from gasp.to import obj_to_tbl

    # Get days
    VALUES = q_to_obj(db,
                      "SELECT {col} FROM {t} GROUP BY {col}".format(
                          col=DAY_FIELD, t=pgtable),
                      db_api='psql')[DAY_FIELD].tolist()

    # For every day, Group rows by entities
    tableArray = []
    for day in VALUES:
        if EXCLUDE_DAYS:
            if day in EXCLUDE_DAYS:
                continue

        QUERY = ("SELECT {col}, COUNT({col}) AS {countname} FROM {table} "
                 "WHERE {dayF}='{d}' GROUP BY {col}").format(
                     col=ENTITY_FIELD,
                     countname=COUNT_FIELD_NAME,
                     table=pgtable,
                     dayF=DAY_FIELD,
                     d=day)

        countTbl = q_to_obj(db, QUERY, db_api='psql')

        tableArray.append(countTbl)

    # Get mean for all entities
    main_table = tableArray[0]
    TMP_COUNT_FIELD_NAME = 'join_' + COUNT_FIELD_NAME
    TMP_JOIN_FIELD = 'id_entity'

    for i in range(1, len(tableArray)):
        tableArray[i].rename(columns={
            COUNT_FIELD_NAME: TMP_COUNT_FIELD_NAME,
            ENTITY_FIELD: TMP_JOIN_FIELD
        },
                             inplace=True)

        main_table = main_table.merge(tableArray[i],
                                      how='outer',
                                      left_on=ENTITY_FIELD,
                                      right_on=TMP_JOIN_FIELD)

        main_table.fillna(0, inplace=True)
        main_table[ENTITY_FIELD].replace(0,
                                         main_table[TMP_JOIN_FIELD],
                                         inplace=True)

        main_table[COUNT_FIELD_NAME] = main_table[COUNT_FIELD_NAME] + \
            main_table[TMP_COUNT_FIELD_NAME]
        main_table.drop([TMP_COUNT_FIELD_NAME, TMP_JOIN_FIELD],
                        axis=1,
                        inplace=True)

    main_table[COUNT_FIELD_NAME] = main_table[COUNT_FIELD_NAME] / len(
        tableArray)

    obj_to_tbl(main_table, OUTPUT_FILE)
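For reference, a minimal sketch of the same aggregation done directly in pandas: stack the per-day counts and divide the summed counts per entity by the number of days, treating days without data for an entity as zero (which is what the merge/fillna chain above effectively does). Column names are illustrative:

import pandas as pd

day1 = pd.DataFrame({'stop': ['A', 'B'], 'n': [10, 4]})
day2 = pd.DataFrame({'stop': ['A', 'C'], 'n': [6, 2]})

ndays = 2  # number of days being averaged
mean_rows = pd.concat([day1, day2]).groupby('stop', as_index=False)['n'].sum()
mean_rows['n'] = mean_rows['n'] / ndays
print(mean_rows)  # A -> 8.0, B -> 2.0, C -> 1.0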
Esempio n. 25
0
def tbl_to_areamtx(inShp, col_a, col_b, outXls, db=None, with_metrics=None):
    """
    Table to Matrix
    
    Table as:
        FID | col_a | col_b | geom
         1  |   A   |   A   | ....
         2  |   A   |   B   | ....
         3  |   A   |   A   | ....
         4  |   A   |   C   | ....
         5  |   A   |   B   | ....
         6  |   B   |   A   | ....
         7  |   B   |   A   | ....
         8  |   B   |   B   | ....
         9  |   B   |   B   | ....
        10  |   C   |   A   | ....
        11  |   C   |   B   | ....
        12  |   C   |   D   | ....
    
    To:
    class  | A | B | C | D
       A   |   |   |   | 
       B   |   |   |   |
       C   |   |   |   |
       D   |   |   |   |
    
    col_a = rows
    col_b = cols

    Two implementations are available:
    * pandas - used when db is None;
    * psql   - used when a database name is passed in db;
    """

    if not db:
        import pandas as pd
        import numpy as np
        from gasp.gt.fmshp import shp_to_obj
        from gasp.to import obj_to_tbl

        # Open data
        df = shp_to_obj(inShp)

        # Remove rows with NaN in col_a or col_b
        df = df[pd.notnull(df[col_a])]
        df = df[pd.notnull(df[col_b])]

        # Get area in square kilometres (assumes a metric CRS)
        df['realarea'] = df.geometry.area / 1000000

        # Get rows and Cols
        rows = df[col_a].unique()
        cols = df[col_b].unique()
        refval = list(np.sort(np.unique(np.append(rows, cols))))

        # Produce matrix
        outDf = []
        for row in refval:
            newCols = [row]
            for col in refval:
                newDf = df[(df[col_a] == row) & (df[col_b] == col)]

                if not newDf.shape[0]:
                    newCols.append(0)

                else:
                    area = newDf.realarea.sum()

                    newCols.append(area)

            outDf.append(newCols)

        outcols = ['class'] + refval
        outDf = pd.DataFrame(outDf, columns=outcols)

        if with_metrics:
            from gasp.pyt.dtcls.eval import get_measures_for_mtx

            out_df = get_measures_for_mtx(outDf, 'class')

            return obj_to_tbl(out_df, outXls)

        # Export to Excel
        return obj_to_tbl(outDf, outXls)

    else:
        from gasp.pyt.oss import fprop
        from gasp.sql.db import create_db
        from gasp.sql.i import db_exists
        from gasp.gql.to import shp_to_psql
        from gasp.gql.tomtx import tbl_to_area_mtx
        from gasp.to import db_to_tbl

        # Create database if not exists
        is_db = db_exists(db)

        if not is_db:
            create_db(db, api='psql')

        # Add data to database
        tbl = shp_to_psql(db, inShp, api='shp2pgsql')

        # Create matrix
        mtx = tbl_to_area_mtx(db, tbl, col_a, col_b, fprop(outXls, 'fn'))

        # Export result
        return db_to_tbl(db, mtx, outXls, sheetsNames='matrix')
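A hedged usage sketch for tbl_to_areamtx; paths and column names are illustrative. With db=None the pandas branch runs; passing a database name switches to the PostgreSQL branch:

tbl_to_areamtx(
    '/data/lulc_intersection.shp',   # polygon layer holding both class columns
    col_a='ref_class',               # becomes the matrix rows
    col_b='map_class',               # becomes the matrix columns
    outXls='/data/area_matrix.xlsx',
    db=None,                         # None -> pandas; a DB name -> psql
    with_metrics=True                # also derive accuracy measures
)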
Esempio n. 26
0
File: dmx.py Progetto: zonakre/gasp
def dist_matrix_using_shp(originsShp,
                          destinationsShp,
                          originsEpsg,
                          destinationsEpsg,
                          outTable,
                          transMode=None):
    """
    Create a distance matrix using shapes and Google Maps API
    """

    import time
    import pandas
    from threading import Thread
    from gasp.mng.split import split_df, split_df_inN
    from gasp.mng.prj import project
    from gasp.prop.feat import get_geom_type
    from gasp.mng.gen import merge_df
    from gasp.fm import tbl_to_obj
    from gasp.to import obj_to_tbl
    from gasp.web.glg import get_keys
    from gasp.web.glg.distmx import dist_matrix

    # Origins and Destinations to GeoDataframe
    originsDf = tbl_to_obj(originsShp)
    destnatDf = tbl_to_obj(destinationsShp)

    # Check Geometries type - shapes should be of type point
    originsGeom = get_geom_type(originsDf, gisApi='pandas')
    destGeom = get_geom_type(destnatDf, gisApi='pandas')
    if (originsGeom != 'Point' and originsGeom != 'MultiPoint') or \
        (destGeom != 'Point' and destGeom != 'MultiPoint'):
        raise ValueError('All input geometries must be of type point')

    # Re-project GeoDataframes if needed
    originsDf = originsDf if originsEpsg == 4326 else \
        project(originsDf, None, 4326, gisApi='pandas')

    destnatDf = destnatDf if destinationsEpsg == 4326 else \
        project(destnatDf, None, 4326, gisApi='pandas')

    # Geom to Field as str
    originsDf["geom"] = originsDf["geometry"].y.astype(str) + "," + \
        originsDf["geometry"].x.astype(str)

    destnatDf["geom"] = destnatDf["geometry"].y.astype(str) + "," + \
        destnatDf["geometry"].x.astype(str)

    originsDf["old_fid"] = originsDf.index
    destnatDf["old_fid"] = destnatDf.index

    # Split the destinations DataFrame into smaller DataFrames (split_df)
    lst_destinos = split_df(destnatDf, 10)

    # Get Keys
    KEYS = get_keys()
    lst_keys = KEYS["key"].tolist()
    origensByKey = split_df_inN(originsDf, KEYS.shape[0])

    if len(origensByKey) == len(lst_keys) + 1:
        origensByKey[-2] = origensByKey[-2].append(origensByKey[-1])
        del origensByKey[-1]

    # Produce a matrix for each group of origins in origensByKey
    results = []

    def get_matrix(origins, key):
        subOrigins = split_df(origins, 10)

        for df in subOrigins:
            for __df in lst_destinos:
                matrix = dist_matrix(str(df.geom.str.cat(sep="|")),
                                     str(__df.geom.str.cat(sep="|")),
                                     df.shape[0],
                                     __df.shape[0],
                                     transport_mode=transMode,
                                     useKey=str(key))

                matrix = pandas.DataFrame(matrix)
                matrix = pandas.concat([
                    matrix.drop(["elements"], axis=1),
                    matrix["elements"].apply(pandas.Series)
                ],
                                       axis=1)

                originsFID = df.old_fid.tolist()
                destinaFID = __df.old_fid.tolist()

                mm = []
                for i in range(len(originsFID)):
                    for e in range(len(destinaFID)):
                        ll = [originsFID[i], destinaFID[e], matrix.iloc[i, e]]
                        mm.append(ll)

                Fmatrix = pandas.DataFrame(
                    mm, columns=["fid_origin", "fid_destin", "cost"])

                results.append(Fmatrix)

                time.sleep(5)

    # Create threads
    thrds = []
    i = 1

    for df in origensByKey:
        thrds.append(
            Thread(name="tk{}".format(str(i)),
                   target=get_matrix,
                   args=(df, lst_keys[i - 1])))
        i += 1

    # Start all threads
    for thr in thrds:
        thr.start()

    # Wait for all threads to finish
    for thr in thrds:
        thr.join()

    # Join all dataframes
    RESULT = merge_df(results, ignIndex=False)
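    # sanitizeDataCols is assumed to be defined or imported elsewhere in dmx.py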
    RESULT = sanitizeDataCols(RESULT, "cost")

    RESULT = RESULT.merge(originsDf,
                          how='inner',
                          left_on=["fid_origin"],
                          right_on=["old_fid"])
    RESULT.drop([x for x in originsDf.columns.values if x != "geometry"],
                axis=1,
                inplace=True)
    RESULT.rename(columns={"geometry": "origin_geom"}, inplace=True)

    RESULT = RESULT.merge(destnatDf,
                          how='inner',
                          left_on=["fid_destin"],
                          right_on=["old_fid"])
    RESULT.drop([x for x in destnatDf.columns.values if x != "geometry"],
                axis=1,
                inplace=True)
    RESULT.rename(columns={"geometry": "destin_geom"}, inplace=True)

    RESULT["origin_geom"] = RESULT.origin_geom.astype(str)
    RESULT["destin_geom"] = RESULT.destin_geom.astype(str)

    return obj_to_tbl(RESULT, outTable)
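A hedged usage sketch; file paths are illustrative and the transMode string is passed through to the Google Distance Matrix API (e.g. 'walking' or 'driving'). Inputs are re-projected to EPSG:4326 internally when needed:

dist_matrix_using_shp(
    '/data/bus_stops.shp', '/data/schools.shp',
    originsEpsg=3763, destinationsEpsg=3763,
    outTable='/data/od_costs.xlsx',
    transMode='walking'
)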
Esempio n. 27
0
def cost_od(shpOrigins, shpDestinations, epsgOrigins, epsgDestinations,
            table_result, mode='foot-walking'):
    """
    OD matrix service implementation (openrouteservice API)
    """
    
    import pandas
    from threading              import Thread
    from gasp.fm.api.orouteserv import get_keys
    from gasp.fm.api.orouteserv import matrix_od
    from gasp.fm                import tbl_to_obj
    from gasp.mng.split         import split_df_inN
    from gasp.fm.geom           import pointxy_to_cols
    from gasp.mng.prj           import project
    from gasp.mng.gen           import merge_df
    from gasp.prop.feat         import get_geom_type
    from gasp.to                import obj_to_tbl
    
    origensDf = tbl_to_obj(shpOrigins)
    destinoDf = tbl_to_obj(shpDestinations)
    
    # Check if SHPs are points
    inGeomType = get_geom_type(origensDf, geomCol="geometry", gisApi='pandas')
    
    if inGeomType != 'Point' and inGeomType != 'MultiPoint':
        raise ValueError('The input geometry must be of type point')
    
    inGeomType = get_geom_type(destinoDf, geomCol="geometry", gisApi='pandas')
    
    if inGeomType != 'Point' and inGeomType != 'MultiPoint':
        raise ValueError('The input geometry must be of type point')
    
    # Re-project if needed
    if epsgOrigins != 4326:
        origensDf = project(origensDf, None, 4326, gisApi='pandas')
    
    if epsgDestinations != 4326:
        destinoDf = project(destinoDf, None, 4326, gisApi='pandas')
    
    origensDf = pointxy_to_cols(
        origensDf, geomCol="geometry",
        colX="longitude", colY="latitude"
    )
    destinoDf = pointxy_to_cols(
        destinoDf, geomCol="geometry",
        colX="longitude", colY="latitude"
    )
    
    origensDf["location"] = origensDf.longitude.astype(str) + "," + \
        origensDf.latitude.astype(str)
    destinoDf["location"] = destinoDf.longitude.astype(str) + "," + \
        destinoDf.latitude.astype(str)
    
    origensDf["old_fid"] = origensDf.index
    destinoDf["old_fid"] = destinoDf.index
    
    # Get Keys
    KEYS = get_keys()

    origensByKey = split_df_inN(origensDf, KEYS.shape[0])

    lst_keys = KEYS["key"].tolist()
    
    # Produce matrix
    results = []
    def get_matrix(origins, key):
        origins.reset_index(inplace=True)
        origins["rqst_idx"] = origins.index.astype(str)
        
        destinations = destinoDf.copy()
        
        strSource = origins.location.str.cat(sep="|")
        idxSource = origins.rqst_idx.str.cat(sep=",")
        
        destinations["rqst_idx"] = destinations.old_fid + origins.shape[0]
        destinations["rqst_idx"] = destinations.rqst_idx.astype(str)
        strDestin = destinations.location.str.cat(sep="|")
        idxDestin = destinations.rqst_idx.str.cat(sep=",")
        
        rslt = matrix_od(
            strSource + "|" + strDestin,
            idxSources=idxSource, idxDestinations=idxDestin,
            useKey=key, modeTransportation=mode
        )
        
        rslt = pandas.DataFrame(rslt["durations"])
        
        originsFID = origins.old_fid.tolist()
        destinaFID = destinations.old_fid.tolist()
        
        mm = []
        for lnh in range(len(originsFID)):
            for col in range(len(destinaFID)):
                ll = [
                    originsFID[lnh], destinaFID[col], rslt.iloc[lnh, col]
                ]
                mm.append(ll)
        
        matrix = pandas.DataFrame(
            mm, columns=["fid_origin", "fid_destin", "cost"])
        
        results.append(matrix)
    
    # Create threads
    thrds = []
    i = 1
    for df in origensByKey:
        thrds.append(Thread(
            name="tk{}".format(str(i)), target=get_matrix,
            args=(df, lst_keys[i - 1])
        ))
        i += 1
    
    # Start all threads
    for thr in thrds:
        thr.start()
    
    # Wait for all threads to finish
    for thr in thrds:
        thr.join()
    
    # Join all dataframes
    RESULT = merge_df(results, ignIndex=False)
    
    RESULT = RESULT.merge(
        origensDf, how='inner',
        left_on=["fid_origin"], right_on=["old_fid"]
    )
    RESULT.drop(
        [x for x in origensDf.columns.values if x != "geometry"],
        axis=1, inplace=True
    )
    RESULT.rename(columns={"geometry" : "origin_geom"}, inplace=True)
    
    RESULT = RESULT.merge(
        destinoDf, how='inner',
        left_on=["fid_destin"], right_on=["old_fid"]
    )
    RESULT.drop(
        [x for x in destinoDf.columns.values if x != "geometry"],
        axis=1, inplace=True
    )
    RESULT.rename(columns={"geometry" : "destin_geom"}, inplace=True)
    
    RESULT["origin_geom"] = RESULT.origin_geom.astype(str)
    RESULT["destin_geom"] = RESULT.destin_geom.astype(str)
    
    return obj_to_tbl(RESULT, table_result)
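The request above concatenates origins and destinations into a single "lon,lat|lon,lat|..." string and tells them apart purely by positional index (the first N locations are the sources, the rest the destinations). A minimal sketch of that assembly, with made-up coordinates:

origins      = ["-8.42,40.20", "-8.41,40.21"]
destinations = ["-8.40,40.19"]

# One location string; sources come first, destinations after them
locations   = "|".join(origins + destinations)
idx_sources = ",".join(str(i) for i in range(len(origins)))                       # "0,1"
idx_destins = ",".join(str(i + len(origins)) for i in range(len(destinations)))   # "2"
print(locations, idx_sources, idx_destins)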
Esempio n. 28
0
File: dmx.py Progetto: zonakre/gasp
def dist_matrix_by_shp(oShp, dShp, oEpsg, dEpsg, result, transMode=None):
    """
    Create distance matrix using shapes and Google Maps API
    
    - Uses my first API_KEY
    """

    import time
    import pandas
    from gasp.fm import tbl_to_obj
    from gasp.mng.split import split_df
    from gasp.mng.prj import project
    from gasp.mng.fld.df import listval_to_newcols
    from gasp.prop.feat import get_geom_type
    from gasp.mng.gen import merge_df
    from gasp.web.glg.distmx import dist_matrix
    from gasp.to import obj_to_tbl
    from gasp.to.obj import df_to_list
    from gasp.oss import get_filename

    # Origins and Destinations to GeoDataframe
    originsDf = tbl_to_obj(oShp)
    destnatDf = tbl_to_obj(dShp)

    # Check Geometries type - shapes should be of type point
    originsGeom = get_geom_type(originsDf, gisApi='pandas')
    destGeom = get_geom_type(destnatDf, gisApi='pandas')
    if (originsGeom != 'Point' and originsGeom != 'MultiPoint') or \
        (destGeom != 'Point' and destGeom != 'MultiPoint'):
        raise ValueError('All input geometries must be of type point')

    # Re-project GeoDataframes if needed
    originsDf = originsDf if oEpsg == 4326 else \
        project(originsDf, None, 4326, gisApi='pandas')

    destnatDf = destnatDf if dEpsg == 4326 else \
        project(destnatDf, None, 4326, gisApi='pandas')

    # Geom to Field as str
    originsDf["geom"] = originsDf["geometry"].y.astype(str) + "," + \
        originsDf["geometry"].x.astype(str)

    destnatDf["geom"] = destnatDf["geometry"].y.astype(str) + "," + \
        destnatDf["geometry"].x.astype(str)

    originsDf["old_fid"] = originsDf.index
    destnatDf["old_fid"] = destnatDf.index

    # Split origins (split_df); destinations are handled one by one below
    lstOrigins = split_df(originsDf, 95)
    for odf in lstOrigins:
        odf.reset_index(inplace=True)

    lstDestinations = df_to_list(destnatDf)
    RESULTS = []
    for destino in lstDestinations:
        for oDf in lstOrigins:
            matrix = dist_matrix(
                str(oDf.geom.str.cat(sep="|")),
                str(destino["geom"]),
                oDf.shape[0],
                1,
                transport_mode=transMode,
                useKey='AIzaSyAmyPmqtxD20urqtpCpn4ER74a6J4N403k')

            matrix = pandas.DataFrame(matrix)
            matrix = listval_to_newcols(matrix, "elements")

            matrix = matrix.merge(oDf,
                                  how='inner',
                                  left_index=True,
                                  right_index=True)

            matrix.rename(columns={
                'old_fid': "fid_origin",
                0: "cost"
            },
                          inplace=True)

            matrix["fid_destin"] = destino['old_fid']

            RESULTS.append(matrix)

            time.sleep(5)

    # Join all dataframes
    RESULT = merge_df(RESULTS, ignIndex=False)
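    # sanitizeDataCols is assumed to be defined or imported elsewhere in dmx.py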
    RESULT = sanitizeDataCols(RESULT, "cost")

    RESULT.drop([
        x
        for x in originsDf.columns.values if x != "geometry" and x != "old_fid"
    ],
                axis=1,
                inplace=True)
    RESULT.rename(columns={"geometry": "origin_geom"}, inplace=True)

    RESULT = RESULT.merge(destnatDf,
                          how='inner',
                          left_on=["fid_destin"],
                          right_on=["old_fid"])
    RESULT.drop([x for x in destnatDf.columns.values if x != "geometry"],
                axis=1,
                inplace=True)
    RESULT.rename(columns={"geometry": "destin_geom"}, inplace=True)

    RESULT["origin_geom"] = RESULT.origin_geom.astype(str)
    RESULT["destin_geom"] = RESULT.destin_geom.astype(str)

    obj_to_tbl(RESULT, result, sheetsName=get_filename(result))

    return result
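A hedged usage sketch; paths are illustrative. The result table holds one row per origin/destination pair with the cost returned by the Google API:

dist_matrix_by_shp(
    '/data/households.shp', '/data/hospitals.shp',
    oEpsg=4326, dEpsg=4326,
    result='/data/dist_households_hospitals.xlsx',
    transMode='driving'
)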