Example #1
def san_report_combine(report):
    from glass.ng.rd import tbl_to_obj
    from glass.ng.pd.fld import splitcol_to_newcols

    repdata = tbl_to_obj(report, _delimiter="z")

    repdata.rename(columns={repdata.columns.values[0]: 'data'}, inplace=True)
    repdata.drop([
        0, 1, 2, 3, repdata.shape[0] - 1, repdata.shape[0] - 2,
        repdata.shape[0] - 3, repdata.shape[0] - 4
    ],
                 axis=0,
                 inplace=True)

    repdata["data"] = repdata.data.str.replace(' ', '').str.replace(
        '.', '').str.replace('category',
                             '').str.replace("Category",
                                             '').str.replace(';', '|')

    repdata["data"] = repdata.data.str[1:-1]

    repdata = splitcol_to_newcols(
        repdata, "data", "|", {
            0: "new_value",
            1: "first_raster_val",
            2: "second_raster_val",
            3: "n_cells"
        })

    return repdata
Example #2
def field_sum_two_tables(tableOne, tableTwo, joinFieldOne, joinFieldTwo,
                         field_to_sum, outTable):
    """
    Sum same field in different tables
    
    Table 1:
    id | field
    0 |  10
    1 |  11
    2 |  13
    3 |  10
    
    Table 2:
    id | field
    0 |  10
    1 |   9
    2 |  17
    4 |  15
    
    Create the new table
    id | field
    0 |  20
    1 |  20
    2 |  30
    3 |  10
    4 |  15
    """

    from glass.ng.rd import tbl_to_obj
    from glass.ng.wt import obj_to_tbl
    from glass.ng.pd.joins import sum_field_of_two_tables

    # Open two tables
    df_one = tbl_to_obj(tableOne)
    df_two = tbl_to_obj(tableTwo)

    # Sum field_to_sum across the two tables
    outDf = sum_field_of_two_tables(df_one, joinFieldOne, df_two, joinFieldTwo,
                                    field_to_sum)

    obj_to_tbl(outDf, outTable)

    return outTable
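A minimal usage sketch for the function above; the file paths are hypothetical and assume the glass package is installed:

# Hypothetical invocation: sums 'field' across both tables,
# matching rows on their 'id' columns, as in the docstring example
out = field_sum_two_tables(
    '/tmp/table_one.xlsx', '/tmp/table_two.xlsx',
    'id', 'id', 'field', '/tmp/summed.xlsx'
)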
Example #3
def select_using_excel_refs(db_name, excel_file, sheet_name,
                            pgtable, ref_fields,
                            tableInRef, tableOutRef=None):
    """
    Split PGTABLE using references in excel table
    
    Create two tables:
    * One with similar rows - columns combination are in excel table;
    * One with rows not in excel table.
    
    TODO: Check if it works.
    """
    
    from glass.ng.rd    import tbl_to_obj
    from glass.ng.prop.sql import cols_type
    from glass.ng.sql.q    import q_to_ntbl
    
    def to_and(row, cols, ctype):
        def get_equal(_type):
            return '{}=\'{}\'' if _type == str else '{}={}'
        
        row['AND_E'] = ' AND '.join(
            get_equal(ctype[col]).format(col, row[col]) for col in cols
        )
        
        row['AND_E'] = '(' + row['AND_E'] + ')'
        
        return row
    
    # Get excel data
    table = tbl_to_obj(excel_file, sheet=sheet_name)
    
    # Get reference fields type
    TYPE_COLS = cols_type(db_name, pgtable)
    
    # axis=1 so to_and receives each row (not each column)
    table = table.apply(lambda x: to_and(x, ref_fields, TYPE_COLS), axis=1)
    
    whr_equal = ' OR '.join(table['AND_E'])
    
    q_to_ntbl(db_name, tableInRef, "SELECT * FROM {} WHERE {}".format(
        pgtable, whr_equal
    ), api='psql')
    
    if tableOutRef:
        COLS_RELATION = " AND ".join(["{ft}.{f} = {st}.{f}".format(
            ft=pgtable, f=col, st=tableInRef
        ) for col in TYPE_COLS])
    
        q_to_ntbl(db_name, tableOutRef, (
            "SELECT {ft}.* FROM {ft} LEFT JOIN {st} ON "
            "{rel} WHERE {st}.{c} IS NULL"
        ).format(
            ft=pgtable, st=tableInRef, rel=COLS_RELATION,
            c=list(TYPE_COLS.keys())[0]  # .keys() is not indexable in Python 3
        ), api='psql')
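To make the query construction concrete, here is a standalone sketch of what to_and produces for two reference rows (pure Python, no database needed; names are illustrative):

# Mirrors get_equal/to_and: str columns get quoted values
ctype = {'col_a': str, 'col_b': int}
rows = [{'col_a': 'x', 'col_b': 1}, {'col_a': 'y', 'col_b': 2}]

def and_expr(row):
    tmpl = lambda t: '{}=\'{}\'' if t == str else '{}={}'
    return '(' + ' AND '.join(
        tmpl(ctype[c]).format(c, row[c]) for c in ctype
    ) + ')'

whr = ' OR '.join(and_expr(r) for r in rows)
# whr == "(col_a='x' AND col_b=1) OR (col_a='y' AND col_b=2)"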
Example #4
def layoutv1(mxd, geodata, mapstbl, lyt_template,
    map_template, lyrint, outmaps):
    """
    Layout V1

    only the layer with quantitative values grouped
    into value intervals changes between maps
    """

    import arcpy
    import os
    from glass.ng.rd import tbl_to_obj

    mapstodo = tbl_to_obj(mapstbl)

    mapsattr = list(mapstodo.columns.values)

    aprx = arcpy.mp.ArcGISProject(mxd)

    # Get map
    mapobj = aprx.listMaps(map_template)[0]
    lyr = mapobj.listLayers(lyrint)[0]

    for i, r in mapstodo.iterrows():
        current_dict = lyr.connectionProperties

        replace_dict = {
            'connection_info' : {'database' : geodata},
            'dataset' : '{}.shp'.format(r.slug),
            'workspace_factory' : 'Shape File'
        }

        lyr.updateConnectionProperties(current_dict, replace_dict)

        # Get Layout
        lyt = aprx.listLayouts(lyt_template)[0]

        # List elements
        elm = lyt.listElements("TEXT_ELEMENT")

        # Replace elements
        for e in elm:
            if e.name in mapsattr:
                if type(r[e.name]) == float:
                    e.text = str(r[e.name]).replace('.', ',')
                else:
                    e.text = str(r[e.name])
        
        lyt.exportToJPEG(os.path.join(
            outmaps, '{}.jpg'.format(r.slug)), resolution=500)
        
        aprx.saveACopy(os.path.join(outmaps, '{}.aprx'.format(r.slug)))
    
    return outmaps
Example #5
def layoutv1_nmaps(nmaps, mxd, geodata, mapstbl, lyttmp, mapst, lyrints, outmaps):
    """
    Layout v1 with N maps
    """

    import arcpy
    import os
    from glass.ng.rd import tbl_to_obj

    mapstodo = tbl_to_obj(mapstbl)

    return outmaps
Example #6
def model_conf_matrix(tblFile, refCol, clsCol, outMxt):
    """
    Model Evaluation
    """

    import pandas as pd
    from glass.ng.rd import tbl_to_obj
    from glass.ng.wt import obj_to_tbl
    from sklearn.metrics import confusion_matrix, classification_report

    data = tbl_to_obj(tblFile)

    data[refCol] = data[refCol].astype(str)
    data[clsCol] = data[clsCol].astype(str)

    ref_id = data[[refCol]].drop_duplicates().sort_values(refCol)

    conf_mat = confusion_matrix(data[refCol], data[clsCol])

    mxt = pd.DataFrame(conf_mat,
                       columns=ref_id[refCol].values,
                       index=ref_id[refCol].values)
    mxt.reset_index(inplace=True)
    mxt.rename(columns={'index': 'confusion_mxt'}, inplace=True)

    # Get classification report
    report = classification_report(data[refCol],
                                   data[clsCol],
                                   target_names=ref_id[refCol],
                                   output_dict=True)

    global_keys = ['accuracy', 'macro avg', 'micro avg', 'weighted avg']

    cls_eval = {k: report[k] for k in report if k not in global_keys}
    glb_eval = {k: report[k] for k in report if k in global_keys}

    if 'accuracy' in glb_eval:
        glb_eval['accuracy'] = {
            'f1-score': glb_eval['accuracy'],
            'precision': 0,
            'recall': 0,
            'support': 0
        }

    cls_eval = pd.DataFrame(cls_eval).T
    gbl_eval = pd.DataFrame(glb_eval).T

    return obj_to_tbl([gbl_eval, cls_eval, mxt],
                      outMxt,
                      sheetsName=['global', 'report', 'matrix'])
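A hedged usage sketch; the column names and paths below are hypothetical:

# Writes three sheets ('global', 'report', 'matrix') to the output file
model_conf_matrix(
    '/tmp/classified.xlsx',  # table with one row per sample
    'ref_class',             # column with reference labels
    'pred_class',            # column with predicted labels
    '/tmp/evaluation.xlsx'
)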
Example #7
def export_cells_not_in(inTable, noTable, outTable, inSheet, noSheet, inFID,
                        noFID):
    """
    Export to a new file the cells of in Table not in noTable
    """

    import xlrd
    import xlwt
    from glass.ng.rd import tbl_to_obj
    from glass.ng.xls.fld import col_name, get_columns_position
    from glass.ng.xls.summ import list_unique_values_column

    # TODO: check if tables are xls

    # Get Data
    inData = tbl_to_obj(inTable, sheet=inSheet, output='array')
    COLUMNS = col_name(inTable, sheet_name=inSheet)

    # From noDATA, get IDS that will not be in the outTable
    noXls = xlrd.open_workbook(noTable)
    _noSheet = noXls.sheet_by_name(noSheet)
    colsPosition = get_columns_position(_noSheet, noFID)
    noFIDS = list_unique_values_column(_noSheet, colsPosition[noFID])

    # Create Output
    out_xls = xlwt.Workbook()
    new_sheet = out_xls.add_sheet(inSheet)

    # Write columns titles
    for c in range(len(COLUMNS)):
        new_sheet.write(0, c, COLUMNS[c])

    # Write data not in noData
    l = 1
    for row in inData:
        if row[inFID] not in noFIDS:
            c = 0
            for col in COLUMNS:
                new_sheet.write(l, c, row[col])
                c += 1

            l += 1

    out_xls.save(outTable)

    return outTable
Example #8
def merge_tbls(folder, out_tbl, tbl_format='.dbf'):
    """
    Merge all tables in folder into one single table
    """

    from glass.pys.oss import lst_ff
    from glass.ng.rd import tbl_to_obj
    from glass.ng.wt import obj_to_tbl
    from glass.ng.pd import merge_df

    tbls = lst_ff(folder, file_format=tbl_format)

    tbls_dfs = [tbl_to_obj(t) for t in tbls]

    out_df = merge_df(tbls_dfs)

    obj_to_tbl(out_df, out_tbl)

    return out_tbl
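A one-line usage sketch, assuming a folder of same-schema tables (paths hypothetical):

# Merge every .dbf table found in the folder into one table
merge_tbls('/tmp/tables', '/tmp/merged.dbf', tbl_format='.dbf')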
Example #9
def pointXls_to_shp(xlsFile, outShp, x_col, y_col, epsg, sheet=None):
    """
    Excel table with Point information to ESRI Shapefile
    """

    from glass.ng.rd import tbl_to_obj
    from glass.g.it.pd import pnt_dfwxy_to_geodf
    from glass.g.wt.shp import df_to_shp

    # XLS TO PANDAS DATAFRAME
    dataDf = tbl_to_obj(xlsFile, sheet=sheet)

    # DATAFRAME TO GEO DATAFRAME
    geoDataDf = pnt_dfwxy_to_geodf(dataDf, x_col, y_col, epsg)

    # GEODATAFRAME TO ESRI SHAPEFILE
    df_to_shp(geoDataDf, outShp)

    return outShp
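A usage sketch, assuming columns 'lon'/'lat' hold WGS84 coordinates (names and paths hypothetical):

pointXls_to_shp(
    '/tmp/points.xlsx', '/tmp/points.shp',
    'lon', 'lat', 4326, sheet='data'
)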
Example #10
def merge_xls_in_folder(tbl_folder, out_table):
    """
    Get all excel tables in a folder and make one table of them
    """

    import pandas
    from glass.pys.oss import lst_ff
    from glass.ng.rd import tbl_to_obj
    from glass.ng.wt import obj_to_tbl

    tables = lst_ff(tbl_folder, file_format=['.xls', '.xlsx'])

    dfs = [tbl_to_obj(table) for table in tables]

    result = pandas.concat(dfs)

    out_table = obj_to_tbl(result, out_table)

    return out_table
Example #11
def shpcols_to_shp(inshp, tbl, col_cols, outcolname, outfolder):
    """
    Read a table with a list of columns in a shapefile

    For each column:
    in the input shapefile, delete all other columns
    rename the column, and save the changed shapefile

    explain why col_cols could be a list
    """

    import os
    from glass.pys import obj_to_lst
    from glass.g.rd.shp import shp_to_obj
    from glass.ng.rd import tbl_to_obj
    from glass.g.wt.shp import df_to_shp

    dfshp = shp_to_obj(inshp)
    dfcols = tbl_to_obj(tbl)

    col_cols = obj_to_lst(col_cols)

    refcols = []
    for cc in col_cols:
        refcols.extend(dfcols[cc].tolist())

    for i, r in dfcols.iterrows():
        for cc in col_cols:
            newdf = dfshp.copy()

            dc = [c for c in refcols if c != r[cc]]

            if outcolname in list(newdf.columns.values):
                dc.append(outcolname)

            newdf.drop(dc, axis=1, inplace=True)

            newdf.rename(columns={r[cc]: outcolname}, inplace=True)

            df_to_shp(newdf, os.path.join(outfolder, r[cc] + '.shp'))

    return outfolder
Example #12
def tbl_to_tbl(inTbl,
               outTbl,
               inSheet=None,
               txtDelimiter=None,
               inTxtDelimiter=None,
               inEncoding='utf-8'):
    """
    Convert data format
    """

    from glass.ng.rd import tbl_to_obj
    from glass.ng.wt import obj_to_tbl

    data = tbl_to_obj(inTbl,
                      sheet=inSheet,
                      encoding_=inEncoding,
                      _delimiter=inTxtDelimiter)

    outTbl = obj_to_tbl(data, outTbl, delimiter=txtDelimiter)

    return outTbl
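A usage sketch converting between formats (paths hypothetical):

# Convert a semicolon-delimited CSV into an Excel file
tbl_to_tbl('/tmp/data.csv', '/tmp/data.xlsx', inTxtDelimiter=';')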
Example #13
def predict_fm_mdl(mdlFile, vFile, data, txtCol, method='NaiveBayes'):
    """
    Text classification using file with fit data
    """
    
    from joblib      import load
    import pandas    as pd
    from glass.ng.rd import tbl_to_obj
    
    classDf = tbl_to_obj(data) if type(data) != pd.DataFrame else data
    classDf = classDf[pd.notnull(classDf[txtCol])]
    
    clf   = load(mdlFile)
    tvect = None if not vFile else load(vFile)
    
    # classDf (not the raw data argument, which may be a file path)
    # is used for prediction and receives the results
    if method == 'NaiveBayes':
        result = clf.predict(tvect.transform(classDf[txtCol]))
        
        classDf.loc[:, 'classification'] = result
    
    elif method == 'LinearSupportVectorMachine':
        feaTst = tvect.transform(classDf[txtCol])
        
        y_pred = clf.predict(feaTst)
        
        classDf.loc[:, 'classification'] = y_pred
    
    elif method == 'RandomForest':
        feaTst = tvect.transform(classDf[txtCol])
        
        y_pred = clf.predict(feaTst)
        
        classDf.loc[:, 'classification'] = y_pred
    
    elif method == 'LogisticRegression':
        y_pred = clf.predict(classDf[txtCol])
        
        classDf.loc[:, 'classification'] = y_pred
    
    return classDf
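A usage sketch; the joblib files are hypothetical and would come from a fitting routine such as txtclsmdl_to_file (Example #26):

classified = predict_fm_mdl(
    '/tmp/clf.joblib', '/tmp/vect.joblib',
    '/tmp/texts.xlsx', 'text_col', method='NaiveBayes'
)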
Example #14
def exp_by_group_relfeat(shp, group_col, relfeat, relfeat_id, reltbl,
                         reltbl_sheet, group_fk, relfeat_fk, out_folder,
                         out_tbl):
    """
    Identify groups in shp, get features related with
    these groups and export group features and related
    features to new file
    """

    import os
    import pandas as pd
    from glass.ng.rd import tbl_to_obj
    from glass.ng.wt import obj_to_tbl
    from glass.g.rd.shp import shp_to_obj
    from glass.g.wt.shp import obj_to_shp
    from glass.g.prop.prj import get_shp_epsg

    epsg = get_shp_epsg(shp)

    # Open data
    shp_df = shp_to_obj(shp)
    rel_df = shp_to_obj(relfeat)

    # Get table with relations N-N
    nn_tbl = tbl_to_obj(reltbl, sheet=reltbl_sheet)

    # Relate relfeat with shp groups
    rel_df = rel_df.merge(nn_tbl,
                          how='inner',
                          left_on=relfeat_id,
                          right_on=relfeat_fk)

    # List Groups
    grp_df = pd.DataFrame({
        'cnttemp':
        shp_df.groupby([group_col])[group_col].agg('count')
    }).reset_index()

    ntbls = []
    # Filter and export
    for idx, row in grp_df.iterrows():
        # Get shp_df filter
        new_shp = shp_df[shp_df[group_col] == row[group_col]]

        # Get relfeat filter
        new_relf = rel_df[rel_df[group_fk] == row[group_col]]

        # Export
        shp_i = obj_to_shp(
            new_shp, 'geometry', epsg,
            os.path.join(out_folder, 'lyr_{}.shp'.format(row[group_col])))
        rel_i = obj_to_shp(
            new_relf, 'geometry', epsg,
            os.path.join(out_folder, 'rel_{}.shp'.format(row[group_col])))

        ntbls.append([row[group_col], shp_i, rel_i])

    ntbls = pd.DataFrame(ntbls, columns=['group_id', 'shp_i', 'rel_i'])

    obj_to_tbl(ntbls, out_tbl)

    return out_tbl
Example #15
def binary_eval(refTbl,
                refId,
                refCol,
                tstTbl,
                tstId,
                outTbl=None,
                tstCol=None):
    """
    Evaluation of a binary classification
    
    When tstCol is None, the script assumes that in tstTbl
    there are only positives
    
    The reference table must have positives and negatives,
    but the test table may have only positives.
    """

    import numpy as np
    import pandas
    import math
    from glass.ng.rd import tbl_to_obj
    from glass.ng.wt import obj_to_tbl

    # Data to Pandas Dataframe
    ref_df = tbl_to_obj(refTbl, fields=[
        refId, refCol
    ]) if type(refTbl) != pandas.DataFrame else refTbl[[refId, refCol]]
    tst_df = tbl_to_obj(
        tstTbl, fields=[tstId] if not tstCol else [tstId, tstCol]
    ) if type(tstTbl) != pandas.DataFrame else tstTbl[[tstId]] \
        if not tstCol else tstTbl[[tstId, tstCol]]

    # Check if refId is equal to tstId; they must be different
    if refId == tstId:
        colRename = {tstId: 'tst_fid__'}

        # Do the same for refCol and tstCol
        if refCol == tstCol:
            colRename[tstCol] = 'tst_col__'

        tst_df.rename(columns=colRename, inplace=True)
        tstId = 'tst_fid__'

        if refCol == tstCol:
            tstCol = 'tst_col__'

    df = ref_df.merge(tst_df, how='left', left_on=refId, right_on=tstId)

    # Check if we have a tstCol
    if not tstCol:
        df[tstId].fillna('None', inplace=True)

        tstCol = 'cls_tst'
        df[tstCol] = np.where(df[tstId] == 'None', 0, 1)

    # Get VP, VN, FP, FN
    df['confusion'] = np.where(
        (df[refCol] == 1) & (df[tstCol] == 1), 'VP',
        np.where((df[refCol] == 0) & (df[tstCol] == 0), 'VN',
                 np.where((df[refCol] == 1) & (df[tstCol] == 0), 'FN', 'FP')))

    # tabela sintese
    conf_tbl = pandas.DataFrame()
    conf_tbl['nrows'] = df.groupby(['confusion'])[refId].nunique()

    conf_tbl.reset_index(inplace=True)

    conf_tbl['percentage'] = (conf_tbl.nrows * 100) / df.shape[0]

    # Get some evaluation mesures
    dConf = {}

    for row in conf_tbl.to_dict(orient='records'):
        dConf[row['confusion']] = row['nrows']

    l = ['VP', 'VN', 'FP', 'FN']
    for i in l:
        if i not in dConf:
            dConf[i] = 0
    """
    Error rate

    Error rate (ERR) is calculated as the number of all
    incorrect predictions divided by the total number of
    the dataset. The best error rate is 0.0, whereas the
    worst is 1.0.
    """

    ERR = (dConf['FP'] + dConf['FN']) / (dConf['VP'] + dConf['VN'] +
                                         dConf['FN'] + dConf['FP'])
    """
    Accuracy

    Accuracy (ACC) is calculated as the number of all correct
    predictions divided by the total number of the dataset.
    The best accuracy is 1.0, whereas the worst is 0.0. It can
    also be calculated by 1 – ERR.
    """

    ACC = (dConf['VP'] + dConf['VN']) / (dConf['VP'] + dConf['VN'] +
                                         dConf['FN'] + dConf['FP'])
    """
    Sensitivity (Recall or True positive rate)
    
    Sensitivity (SN) is calculated as the number of correct
    positive predictions divided by the total number of positives.
    It is also called recall (REC) or true positive rate (TPR).
    The best sensitivity is 1.0, whereas the worst is 0.0.
    """

    try:
        SN = dConf['VP'] / (dConf['VP'] + dConf['FN'])
    except ZeroDivisionError:
        SN = -99
    """
    Specificity (True negative rate)

    Specificity (SP) is calculated as the number of correct negative
    predictions divided by the total number of negatives. It is
    also called true negative rate (TNR). The best specificity is 1.0,
    whereas the worst is 0.0.
    """

    SP = dConf['VN'] / (dConf['VN'] + dConf['FP'])
    """
    Precision (Positive predictive value)

    Precision (PREC) is calculated as the number of correct
    positive predictions divided by the total number of positive
    predictions. It is also called positive predictive value (PPV).
    The best precision is 1.0, whereas the worst is 0.0.
    """

    PREC = dConf["VP"] / (dConf["VP"] + dConf['FP'])
    """
    False positive rate

    False positive rate (FPR) is calculated as the number of
    incorrect positive predictions divided by the total number
    of negatives. The best false positive rate is 0.0 whereas the
    worst is 1.0. It can also be calculated as 1 – specificity.
    """

    FPR = dConf['FP'] / (dConf['VN'] + dConf['FP'])
    """
    Matthews correlation coefficient

    Matthews correlation coefficient (MCC) is a correlation
    coefficient calculated using all four values in the
    confusion matrix.
    """
    try:
        MCC = (dConf['VP'] * dConf['VN'] -
               dConf['FP'] * dConf['FN']) / (math.sqrt(
                   (dConf['VP'] + dConf['FP']) * (dConf['VP'] + dConf['FN']) *
                   (dConf['VN'] + dConf['FP']) * (dConf['VN'] + dConf['FN'])))
    except ZeroDivisionError:
        MCC = -99
    """
    F-score

    F-score is a harmonic mean of precision and recall.
    """

    F0_5 = ((1 + 0.5**2) * (PREC * SN)) / (0.5**2 * PREC + SN)
    F_1 = (2 * PREC * SN) / (PREC + SN)
    F_2 = (5 * PREC * SN) / (4 * PREC + SN)

    evalMeasures = pandas.DataFrame(
        [['Error rate', ERR], ['Accuracy', ACC], ['Sensitivity', SN],
         ['Specificity', SP], ['Precision', PREC], [
             'False positive rate', FPR
         ], ['Matthews correlation coefficient', MCC], ['F-score 0.5', F0_5],
         ['F-score 1', F_1], ['F-score 2', F_2]],
        columns=['eval_mesure', 'value'])

    if outTbl:
        return obj_to_tbl([conf_tbl, evalMeasures, df],
                          outTbl,
                          sheetsName=['matrix', 'eval_mesures', 'tbl'])
    else:
        return conf_tbl, evalMeasures, df
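A worked example of the measures above, for a hypothetical confusion count (VP=40, VN=35, FP=15, FN=10 out of 100 rows):

dConf = {'VP': 40, 'VN': 35, 'FP': 15, 'FN': 10}
total = sum(dConf.values())                       # 100
ERR  = (dConf['FP'] + dConf['FN']) / total        # 0.25
ACC  = (dConf['VP'] + dConf['VN']) / total        # 0.75
SN   = dConf['VP'] / (dConf['VP'] + dConf['FN'])  # 0.8
SP   = dConf['VN'] / (dConf['VN'] + dConf['FP'])  # 0.7
PREC = dConf['VP'] / (dConf['VP'] + dConf['FP'])  # ~0.727
F_1  = 2 * PREC * SN / (PREC + SN)                # ~0.762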
Example #16
def text_prediction(trainData, classData, trainRefCol, trainClsCol, clsDataCol,
                    outfile, method='NaiveBayes', lang='english'):
    """
    Text classification
    
    Classifier Options:
    1) NaiveBayes;
    2) LinearSupportVectorMachine;
    3) RandomForest;
    4) LogisticRegression.
    """
    
    import pandas as pd
    from glass.ng.rd import tbl_to_obj
    from glass.ng.wt import obj_to_tbl
    
    # Data to Dataframe
    trainDf = tbl_to_obj(trainData) if type(trainData) != pd.DataFrame else trainData
    classDf = tbl_to_obj(classData) if type(classData) != pd.DataFrame else classData
    
    # Just in case, delete rows with NULL refCol and NULL dataCol
    trainDf = trainDf[pd.notnull(trainDf[trainClsCol])]
    trainDf = trainDf[pd.notnull(trainDf[trainRefCol])]
    classDf = classDf[pd.notnull(classDf[clsDataCol])]
    
    if method == 'NaiveBayes':
        from sklearn.naive_bayes             import MultinomialNB
        from sklearn.feature_extraction.text import CountVectorizer
        from sklearn.feature_extraction.text import TfidfTransformer
        
        """" Train Model """
        # X train is trainClsCol
        # Y train is trainRefCol
        x_train, y_train = trainDf[trainClsCol], trainDf[trainRefCol]
    
        count_vect = CountVectorizer()
    
        X_train_counts = count_vect.fit_transform(x_train)
    
        tfidf_transformer = TfidfTransformer()
    
        X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    
        clf = MultinomialNB().fit(X_train_tfidf, y_train)
    
        """ Predict """
        result = clf.predict(count_vect.transform(classDf[clsDataCol]))
    
        classDf['classification'] = result
    
    elif method == 'LinearSupportVectorMachine':
        import numpy
        from sklearn.svm import LinearSVC
        
        # Get features and Labels
        trainDf['ref_id'] = trainDf[trainRefCol].factorize()[0]
        labels = trainDf.ref_id
        
        # txt_to_num_representation is assumed to be defined elsewhere in
        # this module (returns TF-IDF features and the fitted vectorizer)
        features, tvect = txt_to_num_representation(
            trainDf, trainClsCol, __lang=lang, returnTfiDf=True)
        
        featTst = tvect.transform(classDf[clsDataCol])
        
        """ Train model """
        model = LinearSVC()
        
        model.fit(features, labels)
        
        y_pred = model.predict(featTst)
        
        classDf['classification'] = y_pred
        
        # Create Dataframe only with ref_id's, without duplicates
        ref_id_df = trainDf[[
            trainRefCol, 'ref_id'
        ]].drop_duplicates().sort_values('ref_id')
        ref_id_df.columns = ['class_name', 'ref_fid']
        
        classDf = classDf.merge(
            ref_id_df, how='inner',
            left_on='classification', right_on='ref_fid'
        )
        
        classDf.loc[:, 'classification'] = classDf.class_name
        
        classDf.drop(['ref_fid', 'class_name'], axis=1, inplace=True)
    
    elif method == 'RandomForest':
        from sklearn.ensemble import RandomForestClassifier
        # Get features
        
        features, tvect = txt_to_num_representation(
            trainDf, trainClsCol, __lang=lang, returnTfiDf=True)
        
        featTst = tvect.transform(classDf[clsDataCol])
        
        classifier = RandomForestClassifier(
            n_estimators=1000, random_state=0
        )
        classifier.fit(features, trainDf[trainRefCol])
        
        y_pred = classifier.predict(featTst)
        
        classDf['classification'] = y_pred
    
    elif method == 'LogisticRegression':
        from sklearn.feature_extraction.text import CountVectorizer
        from sklearn.feature_extraction.text import TfidfTransformer
        from sklearn.pipeline                import Pipeline
        from sklearn.linear_model            import LogisticRegression
        
        logreg = Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', LogisticRegression(n_jobs=1, C=1e5, multi_class='auto', solver='lbfgs')),
        ])
        
        logreg.fit(trainDf[trainClsCol], trainDf[trainRefCol])
        
        y_pred = logreg.predict(classDf[clsDataCol])
        
        classDf['classification'] = y_pred
    
    return obj_to_tbl(classDf, outfile)
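A usage sketch; paths and column names are hypothetical:

# Train on labelled texts, classify the unlabelled ones, write xlsx
text_prediction(
    '/tmp/train.xlsx', '/tmp/to_classify.xlsx',
    'label', 'text', 'text',
    '/tmp/classified.xlsx', method='LinearSupportVectorMachine'
)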
Example #17
def randomtime_to_shprows(in_shp, o_shp, start_date, end_date):
    """
    Relate time value to rows in one shapefile.

    The time is determined randomly from interval

    start_date and end_date must be datetime objects.
    """

    import datetime as dt
    import random as rdn
    from calendar import monthrange
    from glass.ng.rd import tbl_to_obj
    from glass.g.wt.shp import df_to_shp

    # Shape to Pandas.Dataframe
    gdf = tbl_to_obj(in_shp)

    # Get Random Dates
    def get_year(start, end):
        # Get Year
        if start.year == end.year:
            year = start.year

        else:
            year = rdn.randint(start.year, end.year)

        return year

    def get_month(start, end, year):
        # Get Month
        if start.year == end.year:
            if start.month == end.month:
                month = start.month
            else:
                month = rdn.randint(start.month, end.month)

        else:
            if year == start.year:
                month = rdn.randint(start.month,
                                    12) if start.month < 12 else 12

            elif year == end.year:
                month = rdn.randint(1, end.month) if end.month > 1 else 1

        return month

    def get_day(s, e, y, m):
        # Get Day
        ndays = monthrange(y, m)[1]

        if s.year == e.year and s.month == e.month:
            if s.day == e.day:
                day = s.day
            else:
                day = rdn.randint(s.day, e.day)

        elif s.year == e.year and s.month != e.month:
            if m == s.month:
                day = rdn.randint(s.day, ndays)  # start at the interval's first day, not the month number
            elif m == e.month:
                day = rdn.randint(1, e.day)
            else:
                day = rdn.randint(1, ndays)

        elif s.year != e.year:
            if y == s.year:
                if m == s.month:
                    day = rdn.randint(s.day, ndays)  # s.day, not the month number
                else:
                    day = rdn.randint(1, ndays)
            elif y == e.year:
                if m == e.month:
                    day = rdn.randint(1, e.day)
                else:
                    day = rdn.randint(1, ndays)
            else:
                day = rdn.randint(1, ndays)

        return day

    def get_hour(s, e, y, m, d):
        # Get Hour
        sDay = dt.datetime(s.year, s.month, s.day)
        eDay = dt.datetime(e.year, e.month, e.day)
        cDay = dt.datetime(y, m, d)

        if sDay == eDay:
            hour = rdn.randint(s.hour, e.hour)
        else:
            if sDay == cDay:
                hour = rdn.randint(s.hour, 23)
            elif eDay == cDay:
                hour = rdn.randint(0, e.hour)
            else:
                hour = rdn.randint(0, 23)

        return hour

    def get_minute(s, e, y, m, d, h):
        # Get minute

        sHour = dt.datetime(s.year, s.month, s.day, s.hour)
        eHour = dt.datetime(e.year, e.month, e.day, e.hour)
        cHour = dt.datetime(y, m, d, h)

        if sHour == eHour:
            minute = rdn.randint(s.minute, e.minute)
        else:
            if sHour == cHour:
                minute = rdn.randint(s.minute, 59)
            elif eHour == cHour:
                minute = rdn.randint(0, e.minute)
            else:
                minute = rdn.randint(0, 59)

        return minute

    def get_second(s, e, y, m, d, h, mi):
        # Get second

        sMinute = dt.datetime(s.year, s.month, s.day, s.hour, s.minute)
        eMinute = dt.datetime(e.year, e.month, e.day, e.hour, e.minute)
        cMinute = dt.datetime(y, m, d, h, mi)

        if sMinute == eMinute:
            second = rdn.randint(s.second, e.second)
        else:
            if sMinute == cMinute:
                second = rdn.randint(s.second, 59)
            elif eMinute == cMinute:
                second = rdn.randint(0, e.second)
            else:
                second = rdn.randint(0, 59)

        return second

    def sanitize(s):
        return "0{}".format(str(s)) if len(str(s)) == 1 else str(s)

    dates = []
    times = []
    for i in range(gdf.shape[0]):
        year = get_year(start_date, end_date)

        month = get_month(start_date, end_date, year)

        day = get_day(start_date, end_date, year, month)

        hour = get_hour(start_date, end_date, year, month, day)

        minute = get_minute(start_date, end_date, year, month, day, hour)

        second = get_second(start_date, end_date, year, month, day, hour,
                            minute)

        month, day, hour, minute, second = [
            sanitize(i) for i in [month, day, hour, minute, second]
        ]

        dates.append('{}-{}-{}'.format(year, month, day))
        times.append('{}:{}:{}'.format(hour, minute, second))

    # Set dates and times
    gdf['date'] = dates
    gdf['time'] = times

    # Export
    df_to_shp(gdf, o_shp)

    return o_shp
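A usage sketch assigning random timestamps within 2020 (paths hypothetical):

import datetime as dt

randomtime_to_shprows(
    '/tmp/points.shp', '/tmp/points_time.shp',
    dt.datetime(2020, 1, 1, 0, 0, 0),
    dt.datetime(2020, 12, 31, 23, 59, 59)
)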
Example #18
def datatocls_multiref(shpfile, mapstbl, sheet, slugs, titles, ncls, decplace,
    outshp, outmapstbl, method="QUANTILE"):
    """
    Create classes/intervals for each layout in table (mapstbl)
    One layout may contain more than one map; this function handles that case

    method options:
    * QUANTILE;
    * JENKS - natural breaks (jenks);
    """

    import pandas            as pd
    import numpy             as np
    from glass.pys           import obj_to_lst
    from glass.g.rd.shp      import shp_to_obj
    from glass.g.wt.shp      import df_to_shp
    from glass.ng.rd         import tbl_to_obj
    from glass.ng.wt         import obj_to_tbl
    from glass.ng.pd.fld     import listval_to_newcols
    from glass.g.lyt.diutils import eval_intervals

    methods = ["QUANTILE", "JENKS"]

    if method not in methods:
        raise ValueError(f'Method {method} is not available')
    
    if method == "QUANTILE":
        from glass.ng.pd.stats import get_intervals
    
    elif method == "JENKS":
        import jenkspy
    
    slugs  = obj_to_lst(slugs)
    titles = obj_to_lst(titles)
    
    # Read data
    shp  = shp_to_obj(shpfile)
    maps = tbl_to_obj(mapstbl, sheet=sheet)

    # Get intervals for each map
    istats = []
    cols   = []
    for i, row in maps.iterrows():
        ddig  = row[decplace]
        icols = [row[slug] for slug in slugs]
        ititles = [row[title] for title in titles]

        istatsrow = []
        for _i in range(len(icols)):
            min_v  = shp[icols[_i]].min()
            max_v  = shp[icols[_i]].max()
            mean_v = shp[icols[_i]].mean()
            std_v  = shp[icols[_i]].std()

            if method == "QUANTILE":
                intervals = get_intervals(
                    shp, icols[_i], ncls, method="QUANTILE")
                intervals.append(max_v)
            
            elif method == "JENKS":
                breaks = jenkspy.jenks_breaks(shp[icols[_i]], nb_class=ncls)
                intervals = breaks[1:]
            
            if not str(shp[icols[_i]].dtype).startswith('int'):
                __intervals = [round(itv, ddig) for itv in intervals]

                __intervals, ndig = eval_intervals(
                    intervals, __intervals, ddig, round(min_v, ddig)
                )

                istatsrow.extend([
                    icols[_i], ititles[_i], round(min_v, ndig),
                    round(max_v, ndig), round(mean_v, ddig),
                    round(std_v, ddig), __intervals
                ])

                shp[icols[_i]] = shp[icols[_i]].round(ddig)
            
            else:
                for _e in range(len(intervals)):
                    if not _e:
                        rzero = 1 if round(intervals[_e], 0) > min_v else 0
                    
                    else:
                        rzero = 1 if round(intervals[_e], 0) > \
                            round(intervals[_e -1], 0) else 0
                    
                    if not rzero:
                        break
                
                __intervals = [round(
                    _o, ddig if not rzero else 0
                ) for _o in intervals]

                __intervals, ndig = eval_intervals(
                    intervals, __intervals, ddig, min_v
                )

                istatsrow.extend([
                    icols[_i], ititles[_i], min_v, max_v,
                    int(round(mean_v, 0)) if rzero else round(mean_v, ddig),
                    int(round(std_v, 0)) if rzero else round(std_v, ddig),
                    __intervals
                ])
            
            if not i:
                cols.extend([
                    f'slug{str(_i+1)}', f'title{str(_i+1)}',
                    f'min_value{str(_i+1)}', f'max_value{str(_i+1)}',
                    f'mean_value{str(_i+1)}',
                    f'std_value{str(_i+1)}', f'intervals{str(_i+1)}'
                ])
        
        istats.append(istatsrow)
    
    istats = pd.DataFrame(istats, columns=cols)

    rename_cols = {}
    for idx, row in istats.iterrows():
        for _i in range(len(slugs)):
            # Get intervals
            int_ = row[f'intervals{str(_i+1)}']

            # Add columns for intervals ids
            newcol = 'i_' + row[f'slug{str(_i+1)}']
            shp[newcol] = 0

            for itv in range(len(int_)):
                if not itv:
                    shp[newcol] = np.where(
                        shp[row[f'slug{str(_i+1)}']] <= int_[itv],
                        itv + 1, shp[newcol]
                    )
                
                else:
                    shp[newcol] = np.where(
                        (shp[row[f'slug{str(_i+1)}']] > int_[itv-1]) & (shp[row[f'slug{str(_i+1)}']] <= int_[itv]),
                        itv + 1, shp[newcol]
                    )
            
            rename_cols[newcol] = row[f'slug{str(_i+1)}']
    
    dc = []
    for c in range(len(slugs)):
        dc.extend(istats[f'slug{str(c+1)}'].tolist())
    
    shp.drop(dc, axis=1, inplace=True)
    shp.rename(columns=rename_cols, inplace=True)

    
    for i in range(len(slugs)):
        istats = listval_to_newcols(istats, f'intervals{str(i+1)}')
        istats.rename(columns={
            ii : f'intervals{str(i+1)}_{str(ii+1)}' for ii in range(ncls)
        }, inplace=True)
    
    # Write outputs
    df_to_shp(shp, outshp)
    obj_to_tbl(istats, outmapstbl)

    return outshp, outmapstbl
Example #19
def datatocls_meanstd(shp_data, maps_table, sheet, slug, title,
    ncls, decplace, nodata, out_shp, out_maps_tbl, grpcol=None):
    """
    Create classes based on mean and standard deviation

    decplace - number of decimal places to show in the layout values
    nodata - must always be smaller than the minimum of the min values
    """

    import pandas            as pd
    import numpy             as np
    from glass.g.rd.shp      import shp_to_obj
    from glass.g.wt.shp      import df_to_shp
    from glass.ng.rd         import tbl_to_obj
    from glass.ng.wt         import obj_to_tbl
    from glass.ng.pd.fld     import listval_to_newcols
    from glass.g.lyt.diutils import eval_intervals

    # Read data
    shp_df = shp_to_obj(shp_data)

    maps_df = tbl_to_obj(maps_table, sheet=sheet)

    if grpcol:
        maps_cols = maps_df[slug].tolist()
        for c in maps_cols:
            shp_df[c] = shp_df[c].astype(float)
        agg_dict = {c : 'mean' for c in maps_cols}
        shp_df = pd.DataFrame(shp_df.groupby([grpcol]).agg(
            agg_dict
        )).reset_index()
    
    def get_intervals(_ncls, mean, std):
        mean_class = mean + (std / 2)
    
        less_mean = []
        major_mean = []
        for e in range(_ncls):
            if not e:
                less_mean.append(mean - (std / 2))
                major_mean.append(mean_class + std)
            else:
                less_mean.append(less_mean[e - 1] - std)
                major_mean.append(major_mean[e - 1] + std)
        
        less_mean.reverse()
        intervals = less_mean + [mean_class] + major_mean
    
        return intervals
    
    # Compute intervals for each indicator using the
    # mean / standard deviation method

    # Get min, max, mean and standard deviation
    # Round values
    i_stats = []
    for idx, row in maps_df.iterrows():
        ddig = row[decplace]
        i    = row[slug]
        t    = row[title]

        if nodata in shp_df[i].unique():
            vals = list(shp_df[i].unique())
            vals.sort()

            min_v = vals[1]
        
            tdf = shp_df[[i]].copy()
        
            tdf = tdf[tdf[i] >= min_v]
            tdf.reset_index(drop=True, inplace=True)
        
            max_v = tdf[i].max()
            mean_v = tdf[i].mean()
            std_v = tdf[i].std()
        
        else:
            min_v  = shp_df[i].min()
            max_v  = shp_df[i].max()
            mean_v = shp_df[i].mean()
            std_v  = shp_df[i].std()
        
        fbreak = min_v - 1
        __std = std_v
        while fbreak <= min_v:
            intervals = get_intervals(ncls, mean_v, __std)

            repeat = 0
            for __i in intervals[:-1]:
                if __i > max_v:
                    repeat = 1
                
                if repeat:
                    break
            
            fbreak = intervals[0] if not repeat else min_v - 1
            __std = __std / 2
        
        intervals[-1] = max_v

        if not str(shp_df[i].dtype).startswith('int'):
            __intervals = [round(_i, ddig) for _i in intervals]
        
            repeat = 1
            __intervals, ndig = eval_intervals(
                intervals, __intervals, ddig,
                round(min_v, ddig)
            )
        
            i_stats.append([
                i, t, round(min_v, ndig), round(max_v, ndig),
                round(mean_v, ddig), round(std_v, ddig), __intervals
            ])
        
            shp_df[i] = shp_df[i].round(ddig)
        
        else:
            for _e in range(len(intervals)):
                if not _e:
                    rzero = 1 if round(intervals[_e], 0) > min_v else 0
                
                else:
                    rzero = 1 if round(intervals[_e], 0) > \
                        round(intervals[_e - 1], 0) else 0
            
                if not rzero:
                    break
            
            __intervals = [round(_o, ddig if not rzero else 0) for _o in intervals]

            __intervals, ndig = eval_intervals(intervals, __intervals, ddig, min_v)

            i_stats.append([
                i, t, min_v, max_v,
                int(round(mean_v, 0)) if rzero else round(mean_v, ddig),
                int(round(std_v, 0)) if rzero else round(std_v, ddig),
                __intervals
            ])
    
    i_stats = pd.DataFrame(i_stats, columns=[
        'slug', 'title', 'min_value', 'max_value',
        'mean_value', 'std_value', 'intervals'
    ])

    rename_cols = {}
    for idx, row in i_stats.iterrows():
        # Get intervals.
        int_ = row.intervals
    
        # Add columns for intervals
        i_col = 'i_' + row.slug
        shp_df[i_col] = 0
    
        for _i in range(len(int_)):
            if not _i:
                shp_df[i_col] = np.where(
                    (shp_df[row.slug] > nodata) & (shp_df[row.slug] <= int_[_i]),
                    _i + 1, shp_df[i_col]
                )
            else:
                shp_df[i_col] = np.where(
                    (shp_df[row.slug] > int_[_i - 1]) & (shp_df[row.slug] <= int_[_i]),
                    _i + 1, shp_df[i_col]
                )
    
        rename_cols[i_col] = row.slug
    
    shp_df.drop(i_stats.slug, axis=1, inplace=True)
    shp_df.rename(columns=rename_cols, inplace=True)

    i_stats = listval_to_newcols(i_stats, 'intervals')

    i_stats.rename(columns={
        i : 'interval_' + str(i+1) for i in range((ncls * 2) + 1)
    }, inplace=True)

    if grpcol:
        nshp_df = shp_to_obj(shp_data)

        nshp_df.drop(maps_cols, axis=1, inplace=True)

        shp_df.rename(columns={grpcol : grpcol + '_y'}, inplace=True)

        shp_df = nshp_df.merge(shp_df, how='left', left_on=grpcol, right_on=grpcol + '_y')
    
    df_to_shp(shp_df, out_shp)

    obj_to_tbl(i_stats, out_maps_tbl)

    return out_shp, out_maps_tbl
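The inner get_intervals builds ncls breaks below and above the mean, each one standard deviation apart. A standalone trace for mean=10, std=2, ncls=2 (pure Python, same logic as above):

def get_intervals(_ncls, mean, std):
    mean_class = mean + (std / 2)
    less_mean, major_mean = [], []
    for e in range(_ncls):
        if not e:
            less_mean.append(mean - (std / 2))
            major_mean.append(mean_class + std)
        else:
            less_mean.append(less_mean[e - 1] - std)
            major_mean.append(major_mean[e - 1] + std)
    less_mean.reverse()
    return less_mean + [mean_class] + major_mean

print(get_intervals(2, 10, 2))   # [7.0, 9.0, 11.0, 13.0, 15.0]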
Example #20
# NOTE: the original function header was lost in extraction; the name and
# signature below are reconstructed from the parameters used in the body.
def split_col_by_rule(xls_path, sheet, interest_column, rule, out_path,
                      overwrite=True):
    """
    Split the values of interest_column into two new columns, using
    rule as separator.

    Sample row:
    5  |  R. xxx  |    149   |   x   | ... |   x

    FID should be the first column
    """

    from glass.ng.rd   import tbl_to_obj
    from glass.pys.oss import del_file
    from glass.ng.it   import dict_to_xls

    if overwrite:
        del_file(out_path)

    # XLS data to dict
    data = tbl_to_obj(
        xls_path, sheet=sheet, useFirstColAsIndex=True, output='dict'
    )

    # Split interest_column (attribute)
    for fid in data:
        for col in data[fid]:
            if str(col) == str(interest_column):
                str_lst = data[fid][col].split(rule)

        data[fid][interest_column+'_1'] = str_lst[0]
        data[fid][interest_column+'_2'] = str_lst[1] if len(str_lst) > 1 else ''

    # Write data in a new file
    dict_to_xls(data, out_path, sheet)

    return out_path
Example #21
def datatocls(shpfile, mapstbl, sheet, slug, title, ncls, decplace,
    outshp, outmapstbl, method="QUANTILE"):
    """
    Create classes/intervals for each map in table

    method options:
    * QUANTILE;
    * JENKS - natural breaks (jenks);
    """

    import pandas            as pd
    import numpy             as np
    from glass.g.rd.shp      import shp_to_obj
    from glass.g.wt.shp      import df_to_shp
    from glass.ng.rd         import tbl_to_obj
    from glass.ng.wt         import obj_to_tbl
    from glass.ng.pd.fld     import listval_to_newcols
    from glass.g.lyt.diutils import eval_intervals

    methods = ["QUANTILE", "JENKS"]

    if method not in methods:
        raise ValueError(f'Method {method} is not available')

    if method == "QUANTILE":
        from glass.ng.pd.stats import get_intervals
    
    elif method == "JENKS":
        import jenkspy

    # Read data
    shp  = shp_to_obj(shpfile)
    maps = tbl_to_obj(mapstbl, sheet=sheet)

    # Get intervals for each map
    istats = []
    for i, row in maps.iterrows():
        ddig = row[decplace]
        icol = row[slug]
        titl = row[title]
    
        min_v  = shp[icol].min()
        max_v  = shp[icol].max()
        mean_v = shp[icol].mean()
        std_v  = shp[icol].std()

        if method == "QUANTILE":
            intervals = get_intervals(shp, icol, ncls, method="QUANTILE")
            intervals.append(max_v)
        
        elif method == "JENKS":
            breaks = jenkspy.jenks_breaks(shp[icol], nb_class=ncls)
            intervals = breaks[1:]
        
        if not str(shp[icol].dtype).startswith('int'):
            __intervals = [round(i, ddig) for i in intervals]

            __intervals, ndig = eval_intervals(
                intervals, __intervals, ddig, round(min_v, ddig)
            )

            istats.append([
                icol, titl, round(min_v, ndig),
                round(max_v, ndig), round(mean_v, ddig),
                round(std_v, ddig), __intervals
            ])

            shp[icol] = shp[icol].round(ddig)
        
        else:
            for _e in range(len(intervals)):
                if not _e:
                    rzero = 1 if round(intervals[_e], 0) > min_v else 0
                
                else:
                    rzero = 1 if round(intervals[_e], 0) > \
                        round(intervals[_e - 1], 0) else 0
                
                if not rzero:
                    break
            
            __intervals = [round(
                _o, ddig if not rzero else 0
            ) for _o in intervals]

            __intervals, ndig = eval_intervals(
                intervals, __intervals, ddig, min_v)
            
            istats.append([
                icol, titl, min_v, max_v,
                int(round(mean_v, 0)) if rzero else round(mean_v, ddig),
                int(round(std_v, 0)) if rzero else round(std_v, ddig),
                __intervals
            ])
    
    istats = pd.DataFrame(istats, columns=[
        "slug", "title", "min_value", "max_value",
        "mean_value", "std_value", "intervals"
    ])

    rename_cols = {}
    for idx, row in istats.iterrows():
        # Get intervals
        int_ = row.intervals
    
        # Add columns for intervals
        i_col = 'i_' + row.slug
        shp[i_col] = 0
    
        for _i in range(len(int_)):
            if not _i:
                shp[i_col] = np.where(
                    shp[row.slug] <= int_[_i],
                    _i + 1, shp[i_col]
                )
        
            else:
                shp[i_col] = np.where(
                    (shp[row.slug] > int_[_i - 1]) & (shp[row.slug] <= int_[_i]),
                    _i + 1, shp[i_col]
                )
    
        rename_cols[i_col] = row.slug
    
    shp.drop(istats.slug, axis=1, inplace=True)
    shp.rename(columns=rename_cols, inplace=True)

    istats = listval_to_newcols(istats, 'intervals')

    istats.rename(columns={
        i : 'interval_' + str(i+1) for i in range(ncls)
    }, inplace=True)

    # Write outputs
    df_to_shp(shp, outshp)
    obj_to_tbl(istats, outmapstbl)

    return outshp, outmapstbl
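A usage sketch; 'slug', 'title' and 'decplaces' name columns of the maps table, and all paths are hypothetical:

# Classify each map column listed in the xlsx into 5 quantile classes
datatocls(
    '/tmp/munis.shp', '/tmp/maps.xlsx', 'maps',
    'slug', 'title', 5, 'decplaces',
    '/tmp/munis_cls.shp', '/tmp/maps_cls.xlsx',
    method="QUANTILE"
)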
Example #22
def model_selection(dataFile, refCol, dataCol, outTbl, lang='english', CV=5):
    """
    See which model is better to use in text classification for a specific
    data sample
    
    Compare:
    Logistic Regression (LogisticRegression)
    (Multinomial) Naive Bayes (MultinomialNB)
    Linear Support Vector Machine (LinearSVC)
    Random Forest (RandomForestClassifier)
    """
    
    import os
    import pandas as pd
    from glass.pys.oss                   import fprop
    from glass.ng.rd                     import tbl_to_obj
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model            import LogisticRegression
    from sklearn.ensemble                import RandomForestClassifier
    from sklearn.svm                     import LinearSVC
    from sklearn.naive_bayes             import MultinomialNB
    from sklearn.model_selection         import cross_val_score
    from glass.ng.wt                     import obj_to_tbl
    
    # Data to DataFrame
    trainDf = tbl_to_obj(dataFile)
    
    # Just in case, delete rows with NULL refCol and NULL dataCol
    trainDf = trainDf[pd.notnull(trainDf[dataCol])]
    trainDf = trainDf[pd.notnull(trainDf[refCol])]
    
    # Ref col to integers
    trainDf['ref_id'] = trainDf[refCol].factorize()[0]
    
    # Text to numbers
    # txt_to_num_representation is assumed to be defined elsewhere in this module
    features = txt_to_num_representation(trainDf, dataCol, lang)
    
    labels = trainDf.ref_id
    
    """ Test Models """
    models = [
        RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
        LinearSVC(),
        MultinomialNB(),
        LogisticRegression(random_state=0)
    ]
    
    entries = []
    
    for model in models:
        m_name = model.__class__.__name__
        accuracies = cross_val_score(
            model, features, labels, scoring='accuracy', cv=CV
        )
        
        for fold_idx, accuracy in enumerate(accuracies):
            entries.append((m_name, fold_idx, accuracy))
    
    # Create and Export evaluation table
    cv_df = pd.DataFrame(
        entries, columns=['model_name', 'fold_idx', 'accuracy'])
    cv_df_gp = pd.DataFrame(cv_df.groupby('model_name').accuracy.mean())
    cv_df_gp.reset_index(inplace=True)
    
    # Export Graphic
    import seaborn as sns
        
    a = sns.boxplot(x='model_name', y='accuracy', data=cv_df)
        
    b = sns.stripplot(
        x='model_name', y='accuracy', data=cv_df,
        size=10, jitter=True, edgecolor="gray", linewidth=2)
        
    fig = b.get_figure()
    fig.savefig(os.path.join(
        os.path.dirname(outTbl), fprop(outTbl, 'fn') + '.png'
    ))
    
    return obj_to_tbl(cv_df_gp, outTbl)
Example #23
def write_sld(attr_name,
              attr_colors,
              mapAttrKeys,
              sld_path,
              geometry=None,
              DATA='CATEGORICAL'):
    """
    Write a sld file using an association between field attributes and a color

    * attr_name -> name of a column in a layer
    
    * DATA -> CATEGORICAL | QUANTITATIVE
    
    * attr_colors -> list or table with styles for some category or interval
    
    QUANTITATIVE - TABLE EXAMPLE (Sheet Index = 0):
         | min | max | R | G | B
       1 |  0  |  5  | X | X | X
       2 |  5  |  10 | X | X | X
       3 |  10 |  15 | X | X | X
       4 |  15 |  20 | X | X | X
       5 |  20 |  25 | X | X | X
       
    QUANTITATIVE - LIST EXAMPLE:
    attr_colors = [
        {'min':  0, 'max':  5, 'R': X, 'G': X, 'B': X},
        {'min':  5, 'max': 10, 'R': X, 'G': X, 'B': X},
        {'min': 10, 'max': 15, 'R': X, 'G': X, 'B': X},
        {'min': 15, 'max': 20, 'R': X, 'G': X, 'B': X},
        {'min': 20, 'max': 25, 'R': X, 'G': X, 'B': X}
    ]
    
    CATEGORICAL - TABLE EXAMPLE
    
    CATEGORICAL - LIST EXAMPLE
    
    * mapAttrKeys -> dict with the relation between the meaning of the 
    columns/keys in attr_colors
    
    EXAMPLE:
    mapAttrKeys = {
        'r' : 'R', 'g' : 'G', 'b' : 'B', 'interval_min' : 'min',
        'interval_max' : 'max'
    }
    
    keys that could be used:
    * r -> attr_colors key/column with red of red|green|blue cat color
    * g -> attr_colors key/column with green of red|green|blue cat color
    * b -> attr_colors key/column with blue of red|green|blue cat color
    * hex -> attr_colors key/column with color hex
    * interval_min -> attr_colors key/column with the interval lower bound
    * interval_max -> attr_colors key/column with the interval upper bound
    * stroke_hex -> attr_colors key/column with color hex for stroke
    * stroke_r -> attr_colors key/column with red of red|green|blue stroke color
    * stroke_g -> attr_colors key/column with green of red|green|blue stroke color
    * stroke_b -> attr_colors key/column with blue of red|green|blue stroke color
    * width -> attr_colors key/column with stroke width
    * opacity -> attr_colors key/column with opacity value for some category
    * category -> attr_colors key/column with category value
    
    sld_path -> path to sld file
    
    GEOMETRY -> Polygon | Line

    NOTE: This will work only for polygon/linear features
    """

    import os
    from glass.pys.Xml import write_xml_tree
    from glass.pys.oss import fprop
    from glass.g.wg.sld.rules import get_categorical_rules
    from glass.g.wg.sld.rules import get_quantitative_rules

    if DATA != 'CATEGORICAL' and DATA != 'QUANTITATIVE':
        raise ValueError(
            'DATA should be CATEGORICAL or QUANTITATIVE')

    if type(attr_colors) != list:
        if os.path.exists(attr_colors):
            ff = fprop(attr_colors, 'ff')

            if ff == '.json':
                import json

                attr_colors = json.load(open(attr_colors, 'r'))

            elif ff == '.xlsx' or ff == '.xls':
                from glass.ng.rd import tbl_to_obj

                attr_colors = tbl_to_obj(attr_colors,
                                         sheet=0,
                                         useFirstColAsIndex=None,
                                         output='array')

            elif ff == '.dbf':
                from glass.ng.rd import tbl_to_obj

                attr_colors = tbl_to_obj(attr_colors, output='array')

            else:
                raise ValueError('Your file is not a json, xls or dbf')
        else:
            raise ValueError(
                ('ERROR in argument attribute_value_colors: '
                 'You need to define a list or give a valid path to a json '
                 'file or to a xls file'))

    GEOMETRY = str(geometry) if geometry else 'Polygon'

    # Create Feature Type Style RULES
    sldRules = get_categorical_rules(
        attr_colors, attr_name, GEOMETRY,
        mapAttrKeys) if DATA == 'CATEGORICAL' else get_quantitative_rules(
            attr_colors, attr_name, GEOMETRY,
            mapAttrKeys) if DATA == 'QUANTITATIVE' else None

    # SLD Basic structure
    xml_sld_root = ('sld:StyledLayerDescriptor', 'xmlns',
                    'http://www.opengis.net/sld', 'xmlns:sld',
                    'http://www.opengis.net/sld', 'xmlns:gml',
                    'http://www.opengis.net/gml', 'xmlns:ogc',
                    'http://www.opengis.net/ogc', 'version', '1.0.0')

    sld = {
        xml_sld_root: {
            'sld:UserLayer': {
                'sld:LayerFeatureConstraints': {
                    'sld:FeatureTypeConstraint': ''
                },
                'sld:UserStyle': {
                    'sld:Name': 'Default Styler',
                    'sld:IsDefault': '1',
                    'sld:FeatureTypeStyle': {
                        'sld:Name':
                        'group 0',
                        'sld:FeatureTypeName':
                        'Feature',
                        (1, 'sld:SemanticTypeIdentifier'):
                        'generic:geometry',
                        (2, 'sld:SemanticTypeIdentifier'):
                        'colorbrewer:unique:corinne'
                    }
                }
            }
        }
    }

    sld_order = {
        xml_sld_root: ['sld:UserLayer'],
        'sld:UserLayer': ['sld:LayerFeatureConstraints', 'sld:UserStyle'],
        'sld:UserStyle': ['sld:Name', 'sld:IsDefault', 'sld:FeatureTypeStyle'],
        'sld:FeatureTypeStyle': [
            'sld:Name',
            'sld:FeatureTypeName', (1, 'sld:SemanticTypeIdentifier'),
            (2, 'sld:SemanticTypeIdentifier')
        ],
        'ogc:PropertyIsEqualTo': ['ogc:PropertyName', 'ogc:Literal'],
        'ogc:And':
        ['ogc:PropertyIsLessThanOrEqualTo', 'ogc:PropertyIsGreaterThan'],
        'ogc:PropertyIsLessThanOrEqualTo': ['ogc:PropertyName', 'ogc:Literal'],
        'ogc:PropertyIsGreaterThan': ['ogc:PropertyName', 'ogc:Literal'],
        'sld:Fill': [('sld:CssParameter', 'name', 'fill'),
                     ('sld:CssParameter', 'name', 'fill-opacity')]
    }

    sld[xml_sld_root]['sld:UserLayer']['sld:UserStyle'][
        'sld:FeatureTypeStyle'].update(sldRules)

    symbolizer = 'sld:PolygonSymbolizer' if GEOMETRY == 'Polygon' \
        else 'sld:LineSymbolizer' if GEOMETRY == 'Line' \
        else 'sld:PolygonSymbolizer'

    for i in range(len(sldRules.keys())):
        sld_order['sld:FeatureTypeStyle'].append((i + 1, 'sld:Rule'))
        sld_order[(i + 1, 'sld:Rule')] = [
            'sld:Name', 'sld:Title', 'ogc:Filter', symbolizer
        ]

    if GEOMETRY == 'Polygon':
        for i in range(len(sldRules.keys())):
            sld_order['sld:PolygonSymbolizer'] = ['sld:Fill', 'sld:Stroke']

    write_xml_tree(sld, sld_path, nodes_order=sld_order)

    return sld_path
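A usage sketch following the QUANTITATIVE list example from the docstring; the attribute name, RGB values and path are illustrative:

attr_colors = [
    {'min': 0,  'max': 5,  'R': 255, 'G': 255, 'B': 178},
    {'min': 5,  'max': 10, 'R': 253, 'G': 141, 'B': 60},
    {'min': 10, 'max': 15, 'R': 189, 'G': 0,   'B': 38}
]
mapAttrKeys = {
    'r': 'R', 'g': 'G', 'b': 'B',
    'interval_min': 'min', 'interval_max': 'max'
}
write_sld('pop_dens', attr_colors, mapAttrKeys,
          '/tmp/style.sld', geometry='Polygon', DATA='QUANTITATIVE')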
Example #24
def join_tables_in_table(mainTable, mainIdField, joinTables, outTable):
    """
    Join one table with all tables in a folder
    
    joinTables = {
        r'D:\TRENMO_JASP\CARRIS\valid_by_para\period_16_17h59\sabado\fvalidacoes_v6_2018-01-06.xlsx' : {
            "JOIN_FIELD"    : 'paragem',
            "COLS_TO_JOIN"  : {'n_validacao' : 'dia_6'}
        },
        r'D:\TRENMO_JASP\CARRIS\valid_by_para\period_16_17h59\sabado\fvalidacoes_v6_2018-01-13.xlsx' : {
            "JOIN_FIELD"    : 'paragem',
            "COLS_TO_JOIN"  : {'n_validacao' : 'dia_13'}
        },
        r'D:\TRENMO_JASP\CARRIS\valid_by_para\period_16_17h59\sabado\fvalidacoes_v6_2018-01-20.xlsx' : {
            "JOIN_FIELD"    : 'paragem',
            "COLS_TO_JOIN"  : {'n_validacao' : 'dia_20'}
        },
        r'D:\TRENMO_JASP\CARRIS\valid_by_para\period_16_17h59\sabado\fvalidacoes_v6_2018-01-27.xlsx' : {
            "JOIN_FIELD"    : 'paragem',
            "COLS_TO_JOIN"  : {'n_validacao' : 'dia_27'}
        }
    }
    
    #TODO: only works with xlsx tables as join TABLES
    """

    # Modules
    from glass.ng.rd import tbl_to_obj
    from glass.ng.wt import obj_to_tbl

    # Open main table
    tableDf = tbl_to_obj(mainTable)

    for table in joinTables:
        xlsDf = tbl_to_obj(table)

        join_field = 'id_entity' if joinTables[table]["JOIN_FIELD"] == mainIdField \
            else joinTables[table]["JOIN_FIELD"]

        if joinTables[table]["JOIN_FIELD"] == mainIdField:
            xlsDf.rename(columns={mainIdField: join_field}, inplace=True)

        xlsDf.rename(columns=joinTables[table]["COLS_TO_JOIN"], inplace=True)

        tableDf = tableDf.merge(xlsDf,
                                how='outer',
                                left_on=mainIdField,
                                right_on=join_field)

        tableDf.fillna(0, inplace=True)

        # Recover ids lost in the outer join: where the main id is 0 after
        # fillna, take the id that came from the join table
        tableDf[mainIdField] = tableDf[mainIdField].where(
            tableDf[mainIdField] != 0, tableDf[join_field])

        tableDf.drop(join_field, axis=1, inplace=True)

    obj_to_tbl(tableDf, outTable)

    return outTable
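# Usage sketch for join_tables_in_table (the paths and column names below are
# hypothetical): each join table contributes one renamed count column to the
# main table.
#
# join_tables_in_table(
#     r'D:\data\stops.xlsx', 'paragem',
#     {
#         r'D:\data\counts_week1.xlsx': {
#             "JOIN_FIELD"   : 'paragem',
#             "COLS_TO_JOIN" : {'n_validacao': 'week_1'}
#         },
#         r'D:\data\counts_week2.xlsx': {
#             "JOIN_FIELD"   : 'paragem',
#             "COLS_TO_JOIN" : {'n_validacao': 'week_2'}
#         }
#     },
#     r'D:\data\stops_with_counts.xlsx'
# )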
Exemple #25
0
def join_xls_table(main_table,
                   fid_main,
                   join_table,
                   fid_join,
                   copy_fields,
                   out_table,
                   main_sheet=None,
                   join_sheet=None):
    """
    Join tables using a common attribute
    
    Relations:
    - 1 to 1
    - N to 1
    
    TODO: Use Pandas Instead
    """

    import xlwt
    from glass.ng.rd import tbl_to_obj
    from glass.ng.xls.fld import col_name

    copy_fields = [copy_fields] if isinstance(copy_fields, str) else \
        copy_fields if isinstance(copy_fields, list) else None

    if not copy_fields:
        raise ValueError('copy_fields should be a list or a string')

    # main_table to dict
    mainData = tbl_to_obj(main_table,
                          sheet=main_sheet,
                          useFirstColAsIndex=True,
                          output='dict')

    # join table to dict
    joinData = tbl_to_obj(join_table,
                          sheet=join_sheet,
                          useFirstColAsIndex=True,
                          output='dict')

    # write output data
    # Output sheet name: prefer main_sheet, then join_sheet, else 'data'
    out_sheet_name = main_sheet or join_sheet or 'data'

    out_xls = xlwt.Workbook()
    new_sheet = out_xls.add_sheet(out_sheet_name)

    # Write titles
    COLUMNS_ORDER = col_name(main_table, sheet_name=main_sheet)

    TITLES = COLUMNS_ORDER + copy_fields
    for i in range(len(TITLES)):
        new_sheet.write(0, i, TITLES[i])

    # Parse data
    row = 1
    for fid in mainData:
        new_sheet.write(row, 0, fid)

        col_idx = 1
        for col in COLUMNS_ORDER[1:]:
            new_sheet.write(row, col_idx, mainData[fid][col])
            col_idx += 1

        for col in copy_fields:
            # Rows missing from the join table leave the copied cells empty
            if fid in joinData:
                new_sheet.write(row, col_idx, joinData[fid][col])
            col_idx += 1

        row += 1

    out_xls.save(out_table)

    return out_table
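# A hedged sketch of the pandas rewrite suggested by the TODO above: a left
# merge reproduces the 1:1 / N:1 join and pandas handles the xls/xlsx I/O.
# This is an illustration under those assumptions, not a drop-in replacement
# for the glass helpers.

import pandas as pd

def join_xls_table_pd(main_table, fid_main, join_table, fid_join,
                      copy_fields, out_table, main_sheet=0, join_sheet=0):
    main_df = pd.read_excel(main_table, sheet_name=main_sheet)
    join_df = pd.read_excel(join_table, sheet_name=join_sheet)

    # Keep only the join key and the columns to copy
    join_df = join_df[[fid_join] + list(copy_fields)]

    out_df = main_df.merge(join_df, how='left',
                           left_on=fid_main, right_on=fid_join)

    out_df.to_excel(out_table, index=False)

    return out_table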
Exemple #26
0
def txtclsmdl_to_file(train, tRef, tData, outMdl, outTf,
                      method='NaiveBayes'):
    """
    Fit Text classification model and save model to file
    
    Classifier Options:
    1) NaiveBayes;
    2) LinearSupportVectorMachine;
    3) RandomForest;
    4) LogisticRegression.
    """
    
    import pandas as pd
    import joblib
    from glass.ng.rd import tbl_to_obj
    from glass.ng.clstxt import txt_to_num_representation
    
    # Data to Dataframe
    trainDf = tbl_to_obj(train) if not isinstance(train, pd.DataFrame) else train
    
    # Just in case, delete rows with NULL refCol and NULL dataCol
    trainDf = trainDf[pd.notnull(trainDf[tData])]
    trainDf = trainDf[pd.notnull(trainDf[tRef])]
    
    if method == 'NaiveBayes':
        from sklearn.naive_bayes             import MultinomialNB
        from sklearn.feature_extraction.text import CountVectorizer
        from sklearn.feature_extraction.text import TfidfTransformer
        
        """" Train Model """
        # X train is trainClsCol
        # Y train is trainRefCol
        x_train, y_train = trainDf[tData], trainDf[tRef]
    
        tvect = CountVectorizer()
    
        X_train_counts = tvect.fit_transform(x_train)
    
        tfidf_transformer = TfidfTransformer()
    
        X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    
        clf = MultinomialNB().fit(X_train_tfidf, y_train)
    
    elif method == 'LinearSupportVectorMachine':
        from sklearn.svm import LinearSVC
        
        feat, tvect = txt_to_num_representation(
            trainDf, tData, __lang='english', returnTfiDf=True)
        
        # Train model
        clf = LinearSVC().fit(feat, trainDf[tRef])
    
    elif method == 'RandomForest':
        from sklearn.ensemble import RandomForestClassifier
        
        feat, tvect = txt_to_num_representation(
            trainDf, tData, __lang='english', returnTfiDf=True)
        
        clf = RandomForestClassifier(n_estimators=1000, random_state=0)
        clf.fit(feat, trainDf[tRef])
    
    elif method == 'LogisticRegression':
        from sklearn.feature_extraction.text import CountVectorizer
        from sklearn.feature_extraction.text import TfidfTransformer
        from sklearn.pipeline                import Pipeline
        from sklearn.linear_model            import LogisticRegression
        
        clf = Pipeline([
            ('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
            ('clf', LogisticRegression(
                n_jobs=1, C=1e5, multi_class='auto', solver='lbfgs'))
        ])
        
        clf.fit(trainDf[tData], trainDf[tRef])
    
    if method != 'LogisticRegression':
        # NOTE: for the NaiveBayes branch only the CountVectorizer is saved;
        # the fitted TfidfTransformer is not persisted, so a reloaded model
        # will not reproduce the tf-idf scaling used at training time
        joblib.dump(tvect, outTf)
        joblib.dump(clf, outMdl)
    
    else:
        joblib.dump(clf, outMdl)
        outTf = None
    
    return outMdl, outTf
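# Usage sketch (the file names are hypothetical): fit a model, then reload it
# with joblib for prediction. For the non-LogisticRegression methods, loading
# mirrors the dump logic above -- one file for the vectorizer, one for the
# classifier.
#
# mdl, tf = txtclsmdl_to_file(
#     'train.xlsx', 'label_col', 'text_col',
#     'model.joblib', 'vect.joblib', method='RandomForest')
#
# import joblib
# clf   = joblib.load('model.joblib')
# tvect = joblib.load('vect.joblib')
# preds = clf.predict(tvect.transform(["some new text to classify"]))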
Exemple #27
0
def otp_cf_based_on_rel(incidents, group_incidents_col, facilities,
                        facilities_id, rel_inc_fac, sheet, group_fk,
                        facilities_fk, hour, day, output):
    """
    Calculate time travel considering specific facilities
    for each group of incidents

    Relations between incidents and facilities are in a auxiliar table (rel_inc_fac).
    Auxiliar table must be a xlsx file
    """

    import os
    import pandas as pd
    from glass.ng.rd import tbl_to_obj
    from glass.g.rd.shp import shp_to_obj
    from glass.g.wt.shp import obj_to_shp
    from glass.g.mob.otp.log import clsfacility
    from glass.g.prop.prj import get_shp_epsg
    from glass.ng.pd import merge_df
    from glass.pys.oss import fprop
    from glass.g.prj.obj import df_prj

    # Avoid problems when facilities_id == facilities_fk
    facilities_fk = facilities_fk + '_fk' if facilities_id == facilities_fk else \
        facilities_fk

    # Open data
    idf = df_prj(shp_to_obj(incidents), 4326)
    fdf = df_prj(shp_to_obj(facilities), 4326)

    rel_df = tbl_to_obj(rel_inc_fac, sheet=sheet)

    oepsg = get_shp_epsg(incidents)

    # Relate facilities with incidents groups
    fdf = fdf.merge(rel_df,
                    how='inner',
                    left_on=facilities_id,
                    right_on=facilities_fk)

    # List Groups
    grp_df = pd.DataFrame({
        'cnttemp': idf.groupby([group_incidents_col])[group_incidents_col].agg('count')
    }).reset_index()

    # Do calculations
    res = []
    logs = []
    for _, row in grp_df.iterrows():
        # Get incidents for that group
        new_i = idf[idf[group_incidents_col] == row[group_incidents_col]]

        # Get facilities for that group
        new_f = fdf[fdf[group_fk] == row[group_incidents_col]]

        # calculate closest facility
        cfres, l = clsfacility(new_i, new_f, hour, day, out_epsg=oepsg)

        res.append(cfres)
        logs.extend(l)

    # Merge results
    out_df = merge_df(res)

    # Recover the facility id
    fdf.drop([c for c in fdf.columns.values if c != facilities_id],
             axis=1,
             inplace=True)
    out_df = out_df.merge(fdf, how='left', left_on='ffid', right_index=True)

    # Export result
    obj_to_shp(out_df, "geom", oepsg, output)

    # Write logs
    if len(logs) > 0:
        with open(
                os.path.join(os.path.dirname(output),
                             fprop(output, 'fn') + '_log.txt'), 'w') as txt:
            for i in logs:
                txt.write(("Incident_id: {}\n"
                           "Facility_id: {}\n"
                           "ERROR message:\n"
                           "{}\n"
                           "\n\n\n\n\n\n").format(str(i[0]), str(i[1]),
                                                  str(i[2])))

    return output
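# Usage sketch (all paths, sheet and column names, and the hour/day values
# are hypothetical): incidents are grouped by a 'freguesia' column, and the
# xlsx relation table lists which facilities serve each group.
#
# otp_cf_based_on_rel(
#     '/tmp/incidents.shp', 'freguesia',
#     '/tmp/hospitals.shp', 'hosp_id',
#     '/tmp/rel_inc_fac.xlsx', 'sheet1',
#     'group_fk', 'hosp_fk',
#     '08:00', '2021-05-12',
#     '/tmp/closest_hospital.shp'
# )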
Exemple #28
0
def correlated_words(dataFile, refCol, dataCol, outTbl, lang='english', N=2,
                     refSheet=None):
    """
    Get words correlated with some text class 
    """
    
    import numpy as np
    import pandas as pd
    from sklearn.feature_selection import chi2
    from glass.ng.wt               import obj_to_tbl
    from glass.ng.rd               import tbl_to_obj
    from glass.ng.clstxt           import txt_to_num_representation
    
    # Data to DataFrame
    trainDf = tbl_to_obj(
        dataFile, sheet=refSheet
    ) if type(dataFile) != pd.DataFrame else dataFile
    
    # Just in case, delete rows with NULL refCol and NULL dataCol
    trainDf = trainDf[pd.notnull(trainDf[dataCol])]
    trainDf = trainDf[pd.notnull(trainDf[refCol])]
    
    """
    Add a column encoding the reference classes as an integer because
    categorical variables are often better represented by integers
    than strings
    """
    
    # Get a ID for Ref/text classes values
    trainDf['ref_id'] = trainDf[refCol].factorize()[0]
    
    # Create Dataframe only with ref_id's, without duplicates
    ref_id_df = trainDf[[refCol, 'ref_id']].drop_duplicates().sort_values(
        'ref_id'
    )
    
    # Create dicts to easy relate ref_id with ref_value
    ref_to_id = dict(ref_id_df.values)
    id_to_ref = dict(ref_id_df[['ref_id', refCol]].values)
    
    """
    Text to numbers
    """
    features, tfidf = txt_to_num_representation(
        trainDf, dataCol, lang, returnTfiDf=True)
    
    labels = trainDf.ref_id
    
    """
    Get most correlated words
    """
    
    corr_words = []
    for ref_name, ref_id in sorted(ref_to_id.items()):
        features_chi2 = chi2(features, labels == ref_id)
        
        indices = np.argsort(features_chi2[0])
        
        # get_feature_names() was removed in scikit-learn 1.2
        feat_names = np.array(tfidf.get_feature_names_out())[indices]
        
        unigrams = [v for v in feat_names if len(v.split(' ')) == 1][-N:]
        bigrams  = [v for v in feat_names if len(v.split(' ')) == 2][-N:]
        cols_d = [ref_name] + unigrams + bigrams
        
        corr_words.append(cols_d)
    
    COLS_NAME = ['ref_name'] + [
        'unigram_{}'.format(str(i+1)) for i in range(N)
    ] + [
        'bigram_{}'.format(str(i+1)) for i in range(N)
    ]
    dfCorrWords = pd.DataFrame(corr_words, columns=COLS_NAME)
    
    return obj_to_tbl(dfCorrWords, outTbl)
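# Usage sketch (hypothetical table and column names): writes, for each class
# in 'category', the N unigrams and bigrams with the highest chi-squared
# association to that class.
#
# correlated_words(
#     'classified_docs.xlsx', 'category', 'text',
#     'correlated_words.xlsx', lang='english', N=3
# )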
Exemple #29
0
def tbl_to_db(tblFile, db, sqlTbl, delimiter=None, encoding_='utf-8',
              sheet=None, isAppend=None, api_db='psql', colsMap=None):
    """
    Table file to Database Table
    
    API's available:
    * psql;
    * sqlite;
    """
    
    import os
    from glass.pys     import obj_to_lst
    from glass.pys.oss import fprop
    from glass.ng.rd   import tbl_to_obj
    from glass.g.wt.sql import df_to_db
    
    if os.path.isdir(tblFile):
        from glass.pys.oss import lst_ff
        
        tbls = lst_ff(tblFile)
    
    else:
        tbls = obj_to_lst(tblFile)
    
    outSQLTbl = obj_to_lst(sqlTbl)
    
    RTBL = []
    for i in range(len(tbls)):
        fp = fprop(tbls[i], ['fn', 'ff'])
        ff = fp['fileformat']
        fn = fp['filename']
    
        if ff in ('.csv', '.txt', '.tsv'):
            if not delimiter:
                raise ValueError((
                    "To convert TXT to DB table, you need to give a value for the "
                    "delimiter input parameter"
                ))
        
            __enc = 'utf-8' if not encoding_ else encoding_
        
            data = tbl_to_obj(
                tbls[i], _delimiter=delimiter, encoding_=__enc
            )
    
        elif ff == '.dbf':
            data = tbl_to_obj(tbls[i])
    
        elif ff in ('.xls', '.xlsx'):
            data = tbl_to_obj(tbls[i], sheet=sheet)
    
        elif ff == '.ods':
            if not sheet:
                raise ValueError((
                    "To convert ODS to DB table, you need to give a value "
                    "for the sheet input parameter"
                ))
        
            data = tbl_to_obj(tbls[i], sheet=sheet)
    
        else:
            raise ValueError('{} is not a valid table format!'.format(ff))
        
        if colsMap:
            data.rename(columns=colsMap, inplace=True)
    
        # Send data to database; fall back to the file name when sqlTbl
        # has fewer names than there are input tables
        out_tbl = fn if not outSQLTbl else outSQLTbl[i] \
            if i < len(outSQLTbl) else fn
        _rtbl = df_to_db(
            db, data, out_tbl,
            append=isAppend, api=api_db
        )
        
        RTBL.append(_rtbl)
    
    return RTBL[0] if len(RTBL) == 1 else RTBL
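# Usage sketches (hypothetical names): load a single csv into PostgreSQL,
# or every table file in a folder in one call.
#
# tbl_to_db('/tmp/sensors.csv', 'mydb', 'sensors', delimiter=';')
# tbl_to_db('/tmp/monthly_reports/', 'mydb', None, sheet='data')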
Exemple #30
0
def join_shp_with_tbl(shp,
                      shp_pk,
                      tbl,
                      tbl_fk,
                      outShp,
                      joinFieldsMantain=None,
                      newNames=None,
                      csv_delimiter=';',
                      isbgri=None,
                      sheet=None):
    """
    Join BGRI ESRI Shapefile with table in xlsx or csv formats
    """

    import pandas as pd
    from glass.pys import obj_to_lst
    from glass.ng.rd import tbl_to_obj
    from glass.g.rd.shp import shp_to_obj
    from glass.g.wt.shp import df_to_shp

    # Read main_table
    mainDf = shp_to_obj(shp)

    # Read join table
    joinDf = tbl_to_obj(tbl,
                        _delimiter=csv_delimiter,
                        encoding_='utf-8',
                        sheet=sheet)

    # Force ids to strings
    mainDf[shp_pk] = mainDf[shp_pk].astype(str)
    joinDf[tbl_fk] = joinDf[tbl_fk].astype(str)

    # Sanitize the join ids of BGRI tables (GEO_COD values come quoted)
    if isbgri:
        joinDf[tbl_fk] = joinDf[tbl_fk].str.replace("'", "")

    if joinFieldsMantain:
        joinFieldsMantain = obj_to_lst(joinFieldsMantain)

        dropCols = []
        for col in joinDf.columns.values:
            # Keep the join key (tbl_fk) plus the requested columns
            if col not in [tbl_fk] + joinFieldsMantain:
                dropCols.append(col)

        joinDf.drop(dropCols, axis=1, inplace=True)

    # Force numeric columns to be numeric
    # (errors='ignore' leaves non-numeric columns untouched; it is
    # deprecated in pandas >= 2.1)
    for c in joinDf.columns.values:
        if c != tbl_fk:
            joinDf[c] = pd.to_numeric(joinDf[c], errors='ignore')

    resultDf = mainDf.merge(joinDf,
                            how='inner',
                            left_on=shp_pk,
                            right_on=tbl_fk)
    if newNames and joinFieldsMantain:
        newNames = obj_to_lst(newNames)
        renDict = {
            joinFieldsMantain[n]: newNames[n]
            for n in range(len(joinFieldsMantain))
        }

        resultDf.rename(columns=renDict, inplace=True)

    df_to_shp(resultDf, outShp)

    return outShp
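# Usage sketch (hypothetical paths): attach two census columns from a csv to
# a BGRI shapefile, renaming them in the output.
#
# join_shp_with_tbl(
#     '/tmp/bgri.shp', 'GEO_COD',
#     '/tmp/census.csv', 'GEO_COD',
#     '/tmp/bgri_census.shp',
#     joinFieldsMantain=['N_INDIVIDUOS', 'N_EDIFICIOS'],
#     newNames=['inds', 'builds'],
#     csv_delimiter=';', isbgri=True
# )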