def san_report_combine(report):
    """
    Sanitize a GRASS GIS r.report output file (produced with two rasters)
    and return it as a pandas DataFrame with one row per category combination.
    """
    
    from glass.ng.rd     import tbl_to_obj
    from glass.ng.pd.fld import splitcol_to_newcols
    
    # Use a delimiter that does not occur in the report, so each
    # report line is read as a single column
    repdata = tbl_to_obj(report, _delimiter="z")
    
    repdata.rename(columns={repdata.columns.values[0]: 'data'}, inplace=True)
    
    # Drop the report header (first four lines) and footer (last four lines)
    repdata.drop([
        0, 1, 2, 3,
        repdata.shape[0] - 1, repdata.shape[0] - 2,
        repdata.shape[0] - 3, repdata.shape[0] - 4
    ], axis=0, inplace=True)
    
    # Remove literal characters; regex=False so '.' is treated as a
    # literal dot instead of a regular expression matching everything
    repdata["data"] = repdata.data.str.replace(' ', '')\
        .str.replace('.', '', regex=False)\
        .str.replace('category', '').str.replace("Category", '')\
        .str.replace(';', '|')
    
    # Strip the leading and trailing '|'
    repdata["data"] = repdata.data.str[1:-1]
    
    repdata = splitcol_to_newcols(repdata, "data", "|", {
        0 : "new_value", 1 : "first_raster_val",
        2 : "second_raster_val", 3 : "n_cells"
    })
    
    return repdata
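# Minimal illustration of the string sanitization above on a single line,
# using an assumed r.report line layout (real reports may differ).
def _example_san_line():
    import pandas as pd
    
    s = pd.DataFrame({'data': ['|11;1;2;250|']})
    
    s['data'] = s.data.str.replace(' ', '')\
        .str.replace('.', '', regex=False).str.replace('category', '')\
        .str.replace(';', '|')
    s['data'] = s.data.str[1:-1]
    
    return s.data[0]  # -> '11|1|2|250'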
def field_sum_two_tables(tableOne, tableTwo, joinFieldOne, joinFieldTwo,
                         field_to_sum, outTable):
    """
    Sum same field in different tables
    
    Table 1:
    id | field
    0  |  10
    1  |  11
    2  |  13
    3  |  10
    
    Table 2:
    id | field
    0  |  10
    1  |   9
    2  |  17
    4  |  15
    
    Create the new table:
    id | field
    0  |  20
    1  |  20
    2  |  30
    3  |  10
    4  |  15
    """
    
    from glass.ng.rd import tbl_to_obj
    from glass.ng.wt import obj_to_tbl
    from glass.ng.pd.joins import sum_field_of_two_tables
    
    # Open two tables
    df_one = tbl_to_obj(tableOne)
    df_two = tbl_to_obj(tableTwo)
    
    # Do it!
    outDf = sum_field_of_two_tables(
        df_one, joinFieldOne, df_two, joinFieldTwo, field_to_sum
    )
    
    obj_to_tbl(outDf, outTable)
    
    return outTable
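# A minimal pandas sketch of the merge-and-sum step documented above.
# It reproduces the docstring example; sum_field_of_two_tables itself
# may differ in details.
def _example_sum_two_fields():
    import pandas as pd
    
    one = pd.DataFrame({'id': [0, 1, 2, 3], 'field': [10, 11, 13, 10]})
    two = pd.DataFrame({'id': [0, 1, 2, 4], 'field': [10, 9, 17, 15]})
    
    # Outer join keeps ids present in only one of the tables
    df = one.merge(two, how='outer', on='id', suffixes=('_a', '_b'))
    df['field'] = df.field_a.fillna(0) + df.field_b.fillna(0)
    
    return df[['id', 'field']]  # field -> 20, 20, 30, 10, 15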
def select_using_excel_refs(db_name, excel_file, sheet_name, pgtable,
                            ref_fields, tableInRef, tableOutRef=None):
    """
    Split PGTABLE using references in an excel table
    
    Create two tables:
    * one with the rows whose column combinations are in the excel table;
    * one with the rows not in the excel table.
    
    TODO: Check if it works.
    """
    
    from glass.ng.rd import tbl_to_obj
    from glass.ng.prop.sql import cols_type
    from glass.ng.sql.q import q_to_ntbl
    
    def to_and(row, cols, ctype):
        def get_equal(_type):
            return '{}=\'{}\'' if _type == str else '{}={}'
        
        row['AND_E'] = ' AND '.join(
            get_equal(ctype[col]).format(col, row[col]) for col in cols
        )
        
        row['AND_E'] = '(' + row['AND_E'] + ')'
        
        return row
    
    # Get excel data
    table = tbl_to_obj(excel_file, sheet=sheet_name)
    
    # Get reference fields type
    TYPE_COLS = cols_type(db_name, pgtable)
    
    # to_and works row by row, so apply it along axis=1
    table = table.apply(lambda x: to_and(x, ref_fields, TYPE_COLS), axis=1)
    
    whr_equal = ' OR '.join(table['AND_E'])
    
    q_to_ntbl(db_name, tableInRef, "SELECT * FROM {} WHERE {}".format(
        pgtable, whr_equal
    ), api='psql')
    
    if tableOutRef:
        COLS_RELATION = " AND ".join(["{ft}.{f} = {st}.{f}".format(
            ft=pgtable, f=col, st=tableInRef
        ) for col in TYPE_COLS])
        
        q_to_ntbl(db_name, tableOutRef, (
            "SELECT {ft}.* FROM {ft} LEFT JOIN {st} ON "
            "{rel} WHERE {st}.{c} IS NULL"
        ).format(
            ft=pgtable, st=tableInRef, rel=COLS_RELATION,
            # dict views are not indexable in Python 3
            c=list(TYPE_COLS.keys())[0]
        ), api='psql')
def layoutv1(mxd, geodata, mapstbl, lyt_template, map_template, lyrint, outmaps):
    """
    Layout V1
    
    Only the layer with quantitative values grouped into value
    intervals changes between maps
    """
    
    import arcpy
    import os
    from glass.ng.rd import tbl_to_obj
    
    mapstodo = tbl_to_obj(mapstbl)
    mapsattr = list(mapstodo.columns.values)
    
    aprx = arcpy.mp.ArcGISProject(mxd)
    
    # Get map
    mapobj = aprx.listMaps(map_template)[0]
    
    lyr = mapobj.listLayers(lyrint)[0]
    
    for i, r in mapstodo.iterrows():
        # Point the interval layer to the data of the current map
        current_dict = lyr.connectionProperties
        replace_dict = {
            'connection_info' : {'database' : geodata},
            'dataset' : '{}.shp'.format(r.slug),
            'workspace_factory' : 'Shape File'
        }
        
        lyr.updateConnectionProperties(current_dict, replace_dict)
        
        # Get Layout
        lyt = aprx.listLayouts(lyt_template)[0]
        
        # List elements
        elm = lyt.listElements("TEXT_ELEMENT")
        
        # Replace text elements with the attributes of the current map
        for e in elm:
            if e.name in mapsattr:
                if type(r[e.name]) == float:
                    e.text = str(r[e.name]).replace('.', ',')
                else:
                    e.text = str(r[e.name])
        
        lyt.exportToJPEG(os.path.join(
            outmaps, '{}.jpg'.format(r.slug)), resolution=500)
        
        aprx.saveACopy(os.path.join(outmaps, '{}.aprx'.format(r.slug)))
    
    return outmaps
def layoutv1_nmaps(nmaps, mxd, geodata, mapstbl, lyttmp, mapst, lyrints, outmaps):
    """
    Layout v1 with N maps
    
    TODO: not implemented yet; for now this stub only reads the maps
    table and returns the output folder unchanged
    """
    
    import arcpy
    import os
    from glass.ng.rd import tbl_to_obj
    
    mapstodo = tbl_to_obj(mapstbl)
    
    return outmaps
def model_conf_matrix(tblFile, refCol, clsCol, outMxt):
    """
    Model Evaluation: confusion matrix and classification report
    """
    
    import pandas as pd
    from glass.ng.rd import tbl_to_obj
    from glass.ng.wt import obj_to_tbl
    from sklearn.metrics import confusion_matrix, classification_report
    
    data = tbl_to_obj(tblFile)
    
    data[refCol] = data[refCol].astype(str)
    data[clsCol] = data[clsCol].astype(str)
    
    ref_id = data[[refCol]].drop_duplicates().sort_values(refCol)
    
    conf_mat = confusion_matrix(data[refCol], data[clsCol])
    
    mxt = pd.DataFrame(
        conf_mat,
        columns=ref_id[refCol].values, index=ref_id[refCol].values
    )
    mxt.reset_index(inplace=True)
    mxt.rename(columns={'index': 'confusion_mxt'}, inplace=True)
    
    # Get classification report
    report = classification_report(
        data[refCol], data[clsCol],
        target_names=ref_id[refCol], output_dict=True
    )
    
    # Split per-class entries from global measures
    global_keys = ['accuracy', 'macro avg', 'micro avg', 'weighted avg']
    
    cls_eval = {k: report[k] for k in report if k not in global_keys}
    glb_eval = {k: report[k] for k in report if k in global_keys}
    
    if 'accuracy' in glb_eval:
        # accuracy is a scalar; give it the same shape as the other rows
        glb_eval['accuracy'] = {
            'f1-score': glb_eval['accuracy'], 'precision': 0,
            'recall': 0, 'support': 0
        }
    
    cls_eval = pd.DataFrame(cls_eval).T
    glb_eval = pd.DataFrame(glb_eval).T
    
    return obj_to_tbl(
        [glb_eval, cls_eval, mxt], outMxt,
        sheetsName=['global', 'report', 'matrix']
    )
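# Tiny runnable illustration of the two sklearn calls model_conf_matrix
# relies on, with made-up labels.
def _example_sklearn_eval():
    from sklearn.metrics import confusion_matrix, classification_report
    
    ref = ['a', 'a', 'b', 'b', 'b']
    cls = ['a', 'b', 'b', 'b', 'a']
    
    mtx = confusion_matrix(ref, cls)  # rows = reference, cols = classified
    rep = classification_report(ref, cls, output_dict=True)
    
    # rep mixes per-class dicts with global keys such as 'accuracy',
    # which is why model_conf_matrix splits them before export
    return mtx, rep['accuracy']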
def export_cells_not_in(inTable, noTable, outTable, inSheet, noSheet,
                        inFID, noFID):
    """
    Export the rows of inTable that are not in noTable to a new file
    """
    
    import xlrd
    import xlwt
    from glass.ng.rd import tbl_to_obj
    from glass.ng.xls.fld import col_name, get_columns_position
    from glass.ng.xls.summ import list_unique_values_column
    
    # TODO: check if tables are xls
    
    # Get Data
    inData = tbl_to_obj(inTable, sheet=inSheet, output='array')
    
    COLUMNS = col_name(inTable, sheet_name=inSheet)
    
    # From noTable, get the IDs that will not be in the outTable
    noXls = xlrd.open_workbook(noTable)
    _noSheet = noXls.sheet_by_name(noSheet)
    
    colsPosition = get_columns_position(_noSheet, noFID)
    noFIDS = list_unique_values_column(_noSheet, colsPosition[noFID])
    
    # Create Output
    out_xls = xlwt.Workbook()
    new_sheet = out_xls.add_sheet(inSheet)
    
    # Write column titles
    for c in range(len(COLUMNS)):
        new_sheet.write(0, c, COLUMNS[c])
    
    # Write rows whose FID is not in noTable
    l = 1
    for row in inData:
        if row[inFID] not in noFIDS:
            c = 0
            for col in COLUMNS:
                new_sheet.write(l, c, row[col])
                c += 1
            l += 1
    
    out_xls.save(outTable)
    
    return outTable
def merge_tbls(folder, out_tbl, tbl_format='.dbf'): """ Merge all tables in folder into one single table """ from glass.pys.oss import lst_ff from glass.ng.rd import tbl_to_obj from glass.ng.wt import obj_to_tbl from glass.ng.pd import merge_df tbls = lst_ff(folder, file_format=tbl_format) tbls_dfs = [tbl_to_obj(t) for t in tbls] out_df = merge_df(tbls_dfs) obj_to_tbl(out_df, out_tbl) return out_tbl
def pointXls_to_shp(xlsFile, outShp, x_col, y_col, epsg, sheet=None): """ Excel table with Point information to ESRI Shapefile """ from glass.ng.rd import tbl_to_obj from glass.g.it.pd import pnt_dfwxy_to_geodf from glass.g.wt.shp import df_to_shp # XLS TO PANDAS DATAFRAME dataDf = tbl_to_obj(xlsFile, sheet=sheet) # DATAFRAME TO GEO DATAFRAME geoDataDf = pnt_dfwxy_to_geodf(dataDf, x_col, y_col, epsg) # GEODATAFRAME TO ESRI SHAPEFILE df_to_shp(geoDataDf, outShp) return outShp
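# A hedged sketch of the same XY-to-points conversion using geopandas
# directly, assuming pnt_dfwxy_to_geodf builds point geometries from two
# coordinate columns; file names and column names below are hypothetical.
def _example_points_from_xy():
    import pandas as pd
    import geopandas as gpd
    
    df = pd.read_excel('/tmp/points.xlsx')
    
    gdf = gpd.GeoDataFrame(
        df, geometry=gpd.points_from_xy(df['x'], df['y']),
        crs='EPSG:4326'
    )
    
    gdf.to_file('/tmp/points.shp')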
def merge_xls_in_folder(tbl_folder, out_table): """ Get all excel tables in a folder and make one table of them """ import pandas from glass.pys.oss import lst_ff from glass.ng.rd import tbl_to_obj from glass.ng.wt import obj_to_tbl tables = lst_ff(tbl_folder, file_format=['.xls', '.xlsx']) dfs = [tbl_to_obj(table) for table in tables] result = pandas.concat(dfs) out_table = obj_to_tbl(result, out_table) return out_table
def shpcols_to_shp(inshp, tbl, col_cols, outcolname, outfolder):
    """
    Read a table listing columns of a shapefile.
    
    For each listed column: copy the input shapefile, delete all other
    listed columns, rename the remaining column to outcolname, and save
    the result as a new shapefile.
    
    col_cols may be a list so that several columns of tbl can provide
    column names to export.
    """
    
    import os
    from glass.pys import obj_to_lst
    from glass.g.rd.shp import shp_to_obj
    from glass.ng.rd import tbl_to_obj
    from glass.g.wt.shp import df_to_shp
    
    dfshp = shp_to_obj(inshp)
    dfcols = tbl_to_obj(tbl)
    
    col_cols = obj_to_lst(col_cols)
    
    refcols = []
    for cc in col_cols:
        refcols.extend(dfcols[cc].tolist())
    
    for i, r in dfcols.iterrows():
        for cc in col_cols:
            newdf = dfshp.copy()
            
            # Drop every listed column except the current one
            dc = [c for c in refcols if c != r[cc]]
            
            if outcolname in list(newdf.columns.values):
                dc.append(outcolname)
            
            newdf.drop(dc, axis=1, inplace=True)
            
            newdf.rename(columns={r[cc]: outcolname}, inplace=True)
            
            df_to_shp(newdf, os.path.join(outfolder, r[cc] + '.shp'))
    
    return outfolder
def tbl_to_tbl(inTbl, outTbl, inSheet=None, txtDelimiter=None, inTxtDelimiter=None, inEncoding='utf-8'): """ Convert data format """ from glass.ng.rd import tbl_to_obj from glass.ng.wt import obj_to_tbl data = tbl_to_obj(inTbl, sheet=inSheet, encoding_=inEncoding, _delimiter=inTxtDelimiter) outTbl = obj_to_tbl(data, outTbl, delimiter=txtDelimiter) return outTbl
def predict_fm_mdl(mdlFile, vFile, data, txtCol, method='NaiveBayes'):
    """
    Text classification using a file with fitted model data
    """
    
    from joblib import load
    import pandas as pd
    from glass.ng.rd import tbl_to_obj
    
    classDf = tbl_to_obj(data) if type(data) != pd.DataFrame else data
    
    classDf = classDf[pd.notnull(classDf[txtCol])]
    
    clf = load(mdlFile)
    tvect = None if not vFile else load(vFile)
    
    # Operate on classDf everywhere (not on the raw data argument, which
    # may be a file path and may contain the NULL rows dropped above)
    if method == 'NaiveBayes':
        result = clf.predict(tvect.transform(classDf[txtCol]))
        
        classDf.loc[:, 'classification'] = result
    
    elif method == 'LinearSupportVectorMachine':
        feaTst = tvect.transform(classDf[txtCol])
        
        y_pred = clf.predict(feaTst)
        
        classDf.loc[:, 'classification'] = y_pred
    
    elif method == 'RandomForest':
        feaTst = tvect.transform(classDf[txtCol])
        
        y_pred = clf.predict(feaTst)
        
        classDf.loc[:, 'classification'] = y_pred
    
    elif method == 'LogisticRegression':
        y_pred = clf.predict(classDf[txtCol])
        
        classDf.loc[:, 'classification'] = y_pred
    
    return classDf
def exp_by_group_relfeat(shp, group_col, relfeat, relfeat_id, reltbl, reltbl_sheet, group_fk, relfeat_fk, out_folder, out_tbl): """ Identify groups in shp, get features related with these groups and export group features and related features to new file """ import os import pandas as pd from glass.ng.rd import tbl_to_obj from glass.ng.wt import obj_to_tbl from glass.g.rd.shp import shp_to_obj from glass.g.wt.shp import obj_to_shp from glass.g.prop.prj import get_shp_epsg epsg = get_shp_epsg(shp) # Open data shp_df = shp_to_obj(shp) rel_df = shp_to_obj(relfeat) # Get table with relations N-N nn_tbl = tbl_to_obj(reltbl, sheet=reltbl_sheet) # Relate relfeat with shp groups rel_df = rel_df.merge(nn_tbl, how='inner', left_on=relfeat_id, right_on=relfeat_fk) # List Groups grp_df = pd.DataFrame({ 'cnttemp': shp_df.groupby([group_col])[group_col].agg('count') }).reset_index() ntbls = [] # Filter and export for idx, row in grp_df.iterrows(): # Get shp_df filter new_shp = shp_df[shp_df[group_col] == row[group_col]] # Get relfeat filter new_relf = rel_df[rel_df[group_fk] == row[group_col]] # Export shp_i = obj_to_shp( new_shp, 'geometry', epsg, os.path.join(out_folder, 'lyr_{}.shp'.format(row[group_col]))) rel_i = obj_to_shp( new_relf, 'geometry', epsg, os.path.join(out_folder, 'rel_{}.shp'.format(row[group_col]))) ntbls.append([row[group_col], shp_i, rel_i]) ntbls = pd.DataFrame(ntbls, columns=['group_id', 'shp_i', 'rel_i']) obj_to_tbl(ntbls, out_tbl) return out_tbl
def binary_eval(refTbl, refId, refCol, tstTbl, tstId, outTbl=None, tstCol=None):
    """
    Evaluation of a binary classification
    
    When tstCol is None, the script assumes that tstTbl contains
    only positives
    
    The reference table must have positives and negatives; the test
    table may have only positives.
    """
    
    import numpy as np
    import pandas
    import math
    from glass.ng.rd import tbl_to_obj
    from glass.ng.wt import obj_to_tbl
    
    # Data to Pandas Dataframe
    ref_df = tbl_to_obj(
        refTbl, fields=[refId, refCol]
    ) if type(refTbl) != pandas.DataFrame else refTbl[[refId, refCol]]
    
    tst_df = tbl_to_obj(
        tstTbl, fields=[tstId] if not tstCol else [tstId, tstCol]
    ) if type(tstTbl) != pandas.DataFrame else tstTbl[[tstId]] \
        if not tstCol else tstTbl[[tstId, tstCol]]
    
    # Check if refId is equal to tstId; they must be different
    if refId == tstId:
        colRename = {tstId: 'tst_fid__'}
        
        # Do the same for refCol and tstCol
        if refCol == tstCol:
            colRename[tstCol] = 'tst_col__'
        
        tst_df.rename(columns=colRename, inplace=True)
        tstId = 'tst_fid__'
        
        if refCol == tstCol:
            tstCol = 'tst_col__'
    
    df = ref_df.merge(tst_df, how='left', left_on=refId, right_on=tstId)
    
    # Check if we have a tstCol
    if not tstCol:
        df[tstId].fillna('None', inplace=True)
        tstCol = 'cls_tst'
        df[tstCol] = np.where(df[tstId] == 'None', 0, 1)
    
    # Get VP, VN, FP, FN
    df['confusion'] = np.where(
        (df[refCol] == 1) & (df[tstCol] == 1), 'VP', np.where(
            (df[refCol] == 0) & (df[tstCol] == 0), 'VN', np.where(
                (df[refCol] == 1) & (df[tstCol] == 0), 'FN', 'FP'
            )
        )
    )
    
    # Summary table
    conf_tbl = pandas.DataFrame()
    conf_tbl['nrows'] = df.groupby(['confusion'])[refId].nunique()
    conf_tbl.reset_index(inplace=True)
    conf_tbl['percentage'] = (conf_tbl.nrows * 100) / df.shape[0]
    
    # Get some evaluation measures
    dConf = {}
    for row in conf_tbl.to_dict(orient='records'):
        dConf[row['confusion']] = row['nrows']
    
    for i in ['VP', 'VN', 'FP', 'FN']:
        if i not in dConf:
            dConf[i] = 0
    
    """
    Error rate
    
    Error rate (ERR) is calculated as the number of all incorrect
    predictions divided by the total number of the dataset. The best
    error rate is 0.0, whereas the worst is 1.0.
    """
    ERR = (dConf['FP'] + dConf['FN']) / (
        dConf['VP'] + dConf['VN'] + dConf['FN'] + dConf['FP'])
    
    """
    Accuracy
    
    Accuracy (ACC) is calculated as the number of all correct predictions
    divided by the total number of the dataset. The best accuracy is 1.0,
    whereas the worst is 0.0. It can also be calculated by 1 - ERR.
    """
    ACC = (dConf['VP'] + dConf['VN']) / (
        dConf['VP'] + dConf['VN'] + dConf['FN'] + dConf['FP'])
    
    """
    Sensitivity (Recall or True positive rate)
    
    Sensitivity (SN) is calculated as the number of correct positive
    predictions divided by the total number of positives. It is also
    called recall (REC) or true positive rate (TPR). The best sensitivity
    is 1.0, whereas the worst is 0.0.
    """
    try:
        SN = dConf['VP'] / (dConf['VP'] + dConf['FN'])
    except ZeroDivisionError:
        SN = -99
    
    """
    Specificity (True negative rate)
    
    Specificity (SP) is calculated as the number of correct negative
    predictions divided by the total number of negatives. It is also
    called true negative rate (TNR). The best specificity is 1.0, whereas
    the worst is 0.0.
    """
    SP = dConf['VN'] / (dConf['VN'] + dConf['FP'])
    
    """
    Precision (Positive predictive value)
    
    Precision (PREC) is calculated as the number of correct positive
    predictions divided by the total number of positive predictions. It
    is also called positive predictive value (PPV). The best precision is
    1.0, whereas the worst is 0.0.
""" PREC = dConf["VP"] / (dConf["VP"] + dConf['FP']) """ False positive rate False positive rate (FPR) is calculated as the number of incorrect positive predictions divided by the total number of negatives. The best false positive rate is 0.0 whereas the worst is 1.0. It can also be calculated as 1 – specificity. """ FPR = dConf['FP'] / (dConf['VN'] + dConf['FP']) """ Matthews correlation coefficient Matthews correlation coefficient (MCC) is a correlation coefficient calculated using all four values in the confusion matrix. """ try: MCC = (dConf['VP'] * dConf['VN'] - dConf['FP'] * dConf['FN']) / (math.sqrt( (dConf['VP'] + dConf['FP']) * (dConf['VP'] + dConf['FN']) * (dConf['VN'] + dConf['FP']) * (dConf['VN'] + dConf['FN']))) except: MCC = -99 """ F-score F-score is a harmonic mean of precision and recall. """ F0_5 = ((1 + 0.5**2) * (PREC * SN)) / (0.5**2 * PREC + SN) F_1 = (2 * PREC * SN) / (PREC + SN) F_2 = (5 * PREC * SN) / (4 * PREC + SN) evalMeasures = pandas.DataFrame( [['Error rate', ERR], ['Accuracy', ACC], ['Sensitivity', SN], ['Specificity', SP], ['Precision', PREC], [ 'False positive rate', FPR ], ['Matthews correlation coefficient', MCC], ['F-score 0.5', F0_5], ['F-score 1', F_1], ['F-score 2', F_2]], columns=['eval_mesure', 'value']) if outTbl: return obj_to_tbl([conf_tbl, evalMeasures, df], outTbl, sheetsName=['matrix', 'eval_mesures', 'tbl']) else: return conf_tbl, evalMeasures, df
def text_prediction(trainData, classData, trainRefCol, trainClsCol,
                    clsDataCol, outfile, method='NaiveBayes', lang='english'):
    """
    Text classification
    
    Classifier Options:
    1) NaiveBayes;
    2) LinearSupportVectorMachine;
    3) RandomForest;
    4) LogisticRegression.
    """
    
    import pandas as pd
    from glass.ng.rd import tbl_to_obj
    from glass.ng.wt import obj_to_tbl
    from glass.ng.clstxt import txt_to_num_representation
    
    # Data to Dataframe
    trainDf = tbl_to_obj(trainData) if type(trainData) != pd.DataFrame \
        else trainData
    classDf = tbl_to_obj(classData) if type(classData) != pd.DataFrame \
        else classData
    
    # Just in case, delete rows with NULL refCol and NULL dataCol
    trainDf = trainDf[pd.notnull(trainDf[trainClsCol])]
    trainDf = trainDf[pd.notnull(trainDf[trainRefCol])]
    
    classDf = classDf[pd.notnull(classDf[clsDataCol])]
    
    if method == 'NaiveBayes':
        from sklearn.naive_bayes import MultinomialNB
        from sklearn.feature_extraction.text import CountVectorizer
        from sklearn.feature_extraction.text import TfidfTransformer
        
        """ Train Model """
        # X train is trainClsCol
        # Y train is trainRefCol
        x_train, y_train = trainDf[trainClsCol], trainDf[trainRefCol]
        
        count_vect = CountVectorizer()
        
        X_train_counts = count_vect.fit_transform(x_train)
        
        tfidf_transformer = TfidfTransformer()
        
        X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
        
        clf = MultinomialNB().fit(X_train_tfidf, y_train)
        
        """ Predict """
        result = clf.predict(count_vect.transform(classDf[clsDataCol]))
        
        classDf['classification'] = result
    
    elif method == 'LinearSupportVectorMachine':
        from sklearn.svm import LinearSVC
        
        # Get features and Labels
        trainDf['ref_id'] = trainDf[trainRefCol].factorize()[0]
        labels = trainDf.ref_id
        
        features, tvect = txt_to_num_representation(
            trainDf, trainClsCol, __lang=lang, returnTfiDf=True)
        
        featTst = tvect.transform(classDf[clsDataCol])
        
        """ Train model """
        model = LinearSVC()
        
        model.fit(features, labels)
        
        y_pred = model.predict(featTst)
        
        classDf['classification'] = y_pred
        
        # Create Dataframe only with ref_id's, without duplicates
        ref_id_df = trainDf[[
            trainRefCol, 'ref_id'
        ]].drop_duplicates().sort_values('ref_id')
        ref_id_df.columns = ['class_name', 'ref_fid']
        
        classDf = classDf.merge(
            ref_id_df, how='inner',
            left_on='classification', right_on='ref_fid'
        )
        
        classDf.loc[:, 'classification'] = classDf.class_name
        
        classDf.drop(['ref_fid', 'class_name'], axis=1, inplace=True)
    
    elif method == 'RandomForest':
        from sklearn.ensemble import RandomForestClassifier
        
        # Get features
        features, tvect = txt_to_num_representation(
            trainDf, trainClsCol, __lang=lang, returnTfiDf=True)
        
        featTst = tvect.transform(classDf[clsDataCol])
        
        classifier = RandomForestClassifier(
            n_estimators=1000, random_state=0
        )
        
        classifier.fit(features, trainDf[trainRefCol])
        
        y_pred = classifier.predict(featTst)
        
        classDf['classification'] = y_pred
    
    elif method == 'LogisticRegression':
        from sklearn.feature_extraction.text import CountVectorizer
        from sklearn.feature_extraction.text import TfidfTransformer
        from sklearn.pipeline import Pipeline
        from sklearn.linear_model import LogisticRegression
        
        logreg = Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', LogisticRegression(
                n_jobs=1, C=1e5, multi_class='auto', solver='lbfgs')),
        ])
        
        logreg.fit(trainDf[trainClsCol], trainDf[trainRefCol])
        
        y_pred = logreg.predict(classDf[clsDataCol])
        
        classDf['classification'] = y_pred
    
    return obj_to_tbl(classDf, outfile)
def randomtime_to_shprows(in_shp, o_shp, start_date, end_date):
    """
    Relate a time value to each row in one shapefile. The time is drawn
    randomly from the interval
    
    start_date and end_date must be datetime objects.
    """
    
    import datetime as dt
    import random as rdn
    from calendar import monthrange
    from glass.ng.rd import tbl_to_obj
    from glass.g.wt.shp import df_to_shp
    
    # Shape to Pandas.Dataframe
    gdf = tbl_to_obj(in_shp)
    
    # Get Random Dates
    def get_year(start, end):
        # Get Year
        if start.year == end.year:
            year = start.year
        else:
            year = rdn.randint(start.year, end.year)
        
        return year
    
    def get_month(start, end, year):
        # Get Month
        if start.year == end.year:
            if start.month == end.month:
                month = start.month
            else:
                month = rdn.randint(start.month, end.month)
        else:
            if year == start.year:
                month = rdn.randint(start.month, 12) if start.month < 12 else 12
            elif year == end.year:
                month = rdn.randint(1, end.month) if end.month > 1 else 1
            else:
                # Any month of an intermediate year is valid
                month = rdn.randint(1, 12)
        
        return month
    
    def get_day(s, e, y, m):
        # Get Day
        ndays = monthrange(y, m)[1]
        
        if s.year == e.year and s.month == e.month:
            if s.day == e.day:
                day = s.day
            else:
                day = rdn.randint(s.day, e.day)
        
        elif s.year == e.year and s.month != e.month:
            if m == s.month:
                # The lower bound is the start day, not the month number
                day = rdn.randint(s.day, ndays)
            elif m == e.month:
                day = rdn.randint(1, e.day)
            else:
                day = rdn.randint(1, ndays)
        
        elif s.year != e.year:
            if y == s.year:
                if m == s.month:
                    day = rdn.randint(s.day, ndays)
                else:
                    day = rdn.randint(1, ndays)
            elif y == e.year:
                if m == e.month:
                    day = rdn.randint(1, e.day)
                else:
                    day = rdn.randint(1, ndays)
            else:
                day = rdn.randint(1, ndays)
        
        return day
    
    def get_hour(s, e, y, m, d):
        # Get Hour
        sDay = dt.datetime(s.year, s.month, s.day)
        eDay = dt.datetime(e.year, e.month, e.day)
        cDay = dt.datetime(y, m, d)
        
        if sDay == eDay:
            hour = rdn.randint(s.hour, e.hour)
        else:
            if sDay == cDay:
                hour = rdn.randint(s.hour, 23)
            elif eDay == cDay:
                hour = rdn.randint(0, e.hour)
            else:
                hour = rdn.randint(0, 23)
        
        return hour
    
    def get_minute(s, e, y, m, d, h):
        # Get minute
        sHour = dt.datetime(s.year, s.month, s.day, s.hour)
        eHour = dt.datetime(e.year, e.month, e.day, e.hour)
        cHour = dt.datetime(y, m, d, h)
        
        if sHour == eHour:
            minute = rdn.randint(s.minute, e.minute)
        else:
            if sHour == cHour:
                minute = rdn.randint(s.minute, 59)
            elif eHour == cHour:
                minute = rdn.randint(0, e.minute)
            else:
                minute = rdn.randint(0, 59)
        
        return minute
    
    def get_second(s, e, y, m, d, h, mi):
        # Get second
        sMinute = dt.datetime(s.year, s.month, s.day, s.hour, s.minute)
        eMinute = dt.datetime(e.year, e.month, e.day, e.hour, e.minute)
        cMinute = dt.datetime(y, m, d, h, mi)
        
        if sMinute == eMinute:
            second = rdn.randint(s.second, e.second)
        else:
            if sMinute == cMinute:
                second = rdn.randint(s.second, 59)
            elif eMinute == cMinute:
                second = rdn.randint(0, e.second)
            else:
                second = rdn.randint(0, 59)
        
        return second
    
    def sanitize(s):
        # Zero-pad one digit values
        return "0{}".format(str(s)) if len(str(s)) == 1 else str(s)
    
    dates = []
    times = []
    for i in range(gdf.shape[0]):
        year   = get_year(start_date, end_date)
        month  = get_month(start_date, end_date, year)
        day    = get_day(start_date, end_date, year, month)
        hour   = get_hour(start_date, end_date, year, month, day)
        minute = get_minute(start_date, end_date, year, month, day, hour)
        second = get_second(start_date, end_date, year, month, day, hour, minute)
        
        month, day, hour, minute, second = [
            sanitize(i) for i in [month, day, hour, minute, second]
        ]
        
        dates.append('{}-{}-{}'.format(year, month, day))
        times.append('{}:{}:{}'.format(hour, minute, second))
    
    # Set dates and times
    gdf['date'] = dates
    gdf['time'] = times
    
    # Export
    df_to_shp(gdf, o_shp)
    
    return o_shp
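# A shorter equivalent sketch: draw a random datetime in [start, end] by
# sampling a second offset instead of sampling each date component
# separately as above. Sampling components one by one is simple but not
# perfectly uniform over the interval; this version is.
def _example_random_datetime():
    import datetime as dt
    import random as rdn
    
    start = dt.datetime(2020, 1, 1, 0, 0, 0)
    end   = dt.datetime(2020, 12, 31, 23, 59, 59)
    
    offset = rdn.randint(0, int((end - start).total_seconds()))
    
    return start + dt.timedelta(seconds=offset)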
def datatocls_multiref(shpfile, mapstbl, sheet, slugs, titles, ncls, decplace, outshp, outmapstbl, method="QUANTILE"): """ Create classes/intervals for each layout in table (mapstbl) One layout could have more than one map... deal with that situation method options: * QUANTILE; * JENKS - natural breaks (jenks); """ import pandas as pd import numpy as np from glass.pys import obj_to_lst from glass.g.rd.shp import shp_to_obj from glass.g.wt.shp import df_to_shp from glass.ng.rd import tbl_to_obj from glass.ng.wt import obj_to_tbl from glass.ng.pd.fld import listval_to_newcols from glass.g.lyt.diutils import eval_intervals methods = ["QUANTILE", "JENKS"] if method not in methods: raise ValueError(f'Method {method} is not available') if method == "QUANTILE": from glass.ng.pd.stats import get_intervals elif method == "JENKS": import jenkspy slugs = obj_to_lst(slugs) titles = obj_to_lst(titles) # Read data shp = shp_to_obj(shpfile) maps = tbl_to_obj(mapstbl, sheet=sheet) # Get intervals for each map istats = [] cols = [] for i, row in maps.iterrows(): ddig = row[decplace] icols = [row[slug] for slug in slugs] ititles = [row[title] for title in titles] istatsrow = [] for _i in range(len(icols)): min_v = shp[icols[_i]].min() max_v = shp[icols[_i]].max() mean_v = shp[icols[_i]].mean() std_v = shp[icols[_i]].std() if method == "QUANTILE": intervals = get_intervals( shp, icols[_i], ncls, method="QUANTILE") intervals.append(max_v) elif method == "JENKS": breaks = jenkspy.jenks_breaks(shp[icols[_i]], nb_class=ncls) intervals = breaks[1:] if not str(shp[icols[_i]].dtype).startswith('int'): __intervals = [round(itv, ddig) for itv in intervals] __intervals, ndig = eval_intervals( intervals, __intervals, ddig, round(min_v, ddig) ) istatsrow.extend([ icols[_i], ititles[_i], round(min_v, ndig), round(max_v, ndig), round(mean_v, ddig), round(std_v, ddig), __intervals ]) shp[icols[_i]] = shp[icols[_i]].round(ddig) else: for _e in range(len(intervals)): if not _e: rzero = 1 if round(intervals[_e], 0) > min_v else 0 else: rzero = 1 if round(intervals[_e], 0) > \ round(intervals[_e -1], 0) else 0 if not rzero: break __intervals = [round( _o, ddig if not rzero else 0 ) for _o in intervals] __intervals, ndig = eval_intervals( intervals, __intervals, ddig, min_v ) istatsrow.extend([ icols[_i], ititles[_i], min_v, max_v, int(round(mean_v, 0)) if rzero else round(mean_v, ddig), int(round(std_v, 0)) if rzero else round(std_v, ddig), __intervals ]) if not i: cols.extend([ f'slug{str(_i+1)}', f'title{str(_i+1)}', f'min_value{str(_i+1)}', f'max_value{str(_i+1)}', f'mean_value{str(_i+1)}', f'std_value{str(_i+1)}', f'intervals{str(_i+1)}' ]) istats.append(istatsrow) istats = pd.DataFrame(istats, columns=cols) rename_cols = {} for idx, row in istats.iterrows(): for _i in range(len(slugs)): # Get intervals int_ = row[f'intervals{str(_i+1)}'] # Add columns for intervals ids newcol = 'i_' + row[f'slug{str(_i+1)}'] shp[newcol] = 0 for itv in range(len(int_)): if not itv: shp[newcol] = np.where( shp[row[f'slug{str(_i+1)}']] <= int_[itv], itv + 1, shp[newcol] ) else: shp[newcol] = np.where( (shp[row[f'slug{str(_i+1)}']] > int_[itv-1]) & (shp[row[f'slug{str(_i+1)}']] <= int_[itv]), itv + 1, shp[newcol] ) rename_cols[newcol] = row[f'slug{str(_i+1)}'] dc = [] for c in range(len(slugs)): dc.extend(istats[f'slug{str(c+1)}'].tolist()) shp.drop(dc, axis=1, inplace=True) shp.rename(columns=rename_cols, inplace=True) for i in range(len(slugs)): istats = listval_to_newcols(istats, f'intervals{str(i+1)}') istats.rename(columns={ ii : 
f'intervals{str(i+1)}_{str(ii+1)}' for ii in range(ncls) }, inplace=True) # Write outputs df_to_shp(shp, outshp) obj_to_tbl(istats, outmapstbl) return outshp, outmapstbl
def datatocls_meanstd(shp_data, maps_table, sheet, slug, title,
                      ncls, decplace, nodata, out_shp, out_maps_tbl,
                      grpcol=None):
    """
    Create classes based on mean and standard deviation
    
    decplace - number of decimal places shown in the layout values
    nodata - must always be smaller than the minimum of the min values
    """
    
    import pandas as pd
    import numpy as np
    from glass.g.rd.shp import shp_to_obj
    from glass.g.wt.shp import df_to_shp
    from glass.ng.rd import tbl_to_obj
    from glass.ng.wt import obj_to_tbl
    from glass.ng.pd.fld import listval_to_newcols
    from glass.g.lyt.diutils import eval_intervals
    
    # Read data
    shp_df = shp_to_obj(shp_data)
    
    maps_df = tbl_to_obj(maps_table, sheet=sheet)
    
    if grpcol:
        maps_cols = maps_df[slug].tolist()
        for c in maps_cols:
            shp_df[c] = shp_df[c].astype(float)
        
        agg_dict = {c : 'mean' for c in maps_cols}
        
        shp_df = pd.DataFrame(shp_df.groupby([grpcol]).agg(
            agg_dict
        )).reset_index()
    
    def get_intervals(_ncls, mean, std):
        mean_class = mean + (std / 2)
        
        less_mean = []
        major_mean = []
        for e in range(_ncls):
            if not e:
                less_mean.append(mean - (std / 2))
                major_mean.append(mean_class + std)
            else:
                less_mean.append(less_mean[e - 1] - std)
                major_mean.append(major_mean[e - 1] + std)
        
        less_mean.reverse()
        intervals = less_mean + [mean_class] + major_mean
        
        return intervals
    
    # Compute the intervals for each indicator with the method based on
    # the mean and the standard deviation
    # Get min, max, mean and standard deviation
    # Round values
    i_stats = []
    for idx, row in maps_df.iterrows():
        ddig = row[decplace]
        i    = row[slug]
        t    = row[title]
        if nodata in shp_df[i].unique():
            vals = list(shp_df[i].unique())
            vals.sort()
            
            min_v = vals[1]
            
            tdf = shp_df[[i]].copy()
            
            tdf = tdf[tdf[i] >= min_v]
            tdf.reset_index(drop=True, inplace=True)
            
            max_v  = tdf[i].max()
            mean_v = tdf[i].mean()
            std_v  = tdf[i].std()
        
        else:
            min_v  = shp_df[i].min()
            max_v  = shp_df[i].max()
            mean_v = shp_df[i].mean()
            std_v  = shp_df[i].std()
        
        # Shrink the standard deviation until the first break is above
        # the minimum and no break exceeds the maximum
        fbreak = min_v - 1
        __std = std_v
        while fbreak <= min_v:
            intervals = get_intervals(ncls, mean_v, __std)
            
            repeat = 0
            for __i in intervals[:-1]:
                if __i > max_v:
                    repeat = 1
                
                if repeat:
                    break
            
            fbreak = intervals[0] if not repeat else min_v - 1
            __std = __std / 2
        
        intervals[-1] = max_v
        
        if not str(shp_df[i].dtype).startswith('int'):
            __intervals = [round(_i, ddig) for _i in intervals]
            
            __intervals, ndig = eval_intervals(
                intervals, __intervals, ddig,
                round(min_v, ddig)
            )
            
            i_stats.append([
                i, t, round(min_v, ndig), round(max_v, ndig),
                round(mean_v, ddig), round(std_v, ddig),
                __intervals
            ])
            
            shp_df[i] = shp_df[i].round(ddig)
        
        else:
            for _e in range(len(intervals)):
                if not _e:
                    rzero = 1 if round(intervals[_e], 0) > min_v else 0
                else:
                    rzero = 1 if round(intervals[_e], 0) > \
                        round(intervals[_e - 1], 0) else 0
                
                if not rzero:
                    break
            
            __intervals = [round(
                _o, ddig if not rzero else 0
            ) for _o in intervals]
            
            __intervals, ndig = eval_intervals(
                intervals, __intervals, ddig, min_v)
            
            i_stats.append([
                i, t, min_v, max_v,
                int(round(mean_v, 0)) if rzero else round(mean_v, ddig),
                int(round(std_v, 0)) if rzero else round(std_v, ddig),
                __intervals
            ])
    
    i_stats = pd.DataFrame(i_stats, columns=[
        'slug', 'title', 'min_value', 'max_value',
        'mean_value', 'std_value', 'intervals'
    ])
    
    rename_cols = {}
    for idx, row in i_stats.iterrows():
        # Get intervals.
int_ = row.intervals # Add columns for intervals i_col = 'i_' + row.slug shp_df[i_col] = 0 for _i in range(len(int_)): if not _i: shp_df[i_col] = np.where( (shp_df[row.slug] > nodata) & (shp_df[row.slug] <= int_[_i]), _i + 1, shp_df[i_col] ) else: shp_df[i_col] = np.where( (shp_df[row.slug] > int_[_i - 1]) & (shp_df[row.slug] <= int_[_i]), _i + 1, shp_df[i_col] ) rename_cols[i_col] = row.slug shp_df.drop(i_stats.slug, axis=1, inplace=True) shp_df.rename(columns=rename_cols, inplace=True) i_stats = listval_to_newcols(i_stats, 'intervals') i_stats.rename(columns={ i : 'interval_' + str(i+1) for i in range((ncls * 2) + 1) }, inplace=True) if grpcol: nshp_df = shp_to_obj(shp_data) nshp_df.drop(maps_cols, axis=1, inplace=True) shp_df.rename(columns={grpcol : grpcol + '_y'}, inplace=True) shp_df = nshp_df.merge(shp_df, how='left', left_on=grpcol, right_on=grpcol + '_y') df_to_shp(shp_df, out_shp) obj_to_tbl(i_stats, out_maps_tbl) return out_shp, out_maps_tbl
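# Minimal runnable sketch of the mean/std interval construction used in
# datatocls_meanstd, so the class-break logic can be inspected in
# isolation; the input values are made up for illustration.
def _example_meanstd_intervals(mean=10.0, std=2.0, ncls=2):
    mean_class = mean + (std / 2)
    
    less_mean, major_mean = [], []
    for e in range(ncls):
        if not e:
            less_mean.append(mean - (std / 2))
            major_mean.append(mean_class + std)
        else:
            less_mean.append(less_mean[e - 1] - std)
            major_mean.append(major_mean[e - 1] + std)
    
    less_mean.reverse()
    
    # For mean=10, std=2, ncls=2 -> [7.0, 9.0, 11.0, 13.0, 15.0],
    # i.e. (2 * ncls) + 1 breaks centred on mean + std/2, which matches
    # the interval_* column renaming at the end of datatocls_meanstd
    return less_mean + [mean_class] + major_mean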
def split_col_to_newfile(xls_path, sheet, interest_column, rule,
                         out_path, overwrite=True):
    # NOTE: the original def line was lost; the signature above is
    # reconstructed from the parameters used in the body, and the
    # function name and the overwrite default are assumptions.
    """
    Split the values of interest_column by a delimiter (rule) into two
    new columns and write the result to a new XLS file
    
    Example row:
    5 | R. xxx | 149 | x | ... | x
    
    FID should be the first column
    """
    
    from glass.ng.rd import tbl_to_obj
    from glass.pys.oss import del_file
    from glass.ng.it import dict_to_xls
    
    if overwrite:
        del_file(out_path)
    
    # XLS data to dict
    data = tbl_to_obj(
        xls_path, sheet=sheet, useFirstColAsIndex=True, output='dict'
    )
    
    # Split interest_column (attribute); iterate over a copy of the keys
    # because new keys are added during the loop
    for fid in data:
        for col in list(data[fid]):
            if str(col) == str(interest_column):
                str_lst = data[fid][col].split(rule)
                
                data[fid][interest_column + '_1'] = str_lst[0]
                data[fid][interest_column + '_2'] = str_lst[1] \
                    if len(str_lst) > 1 else ''
    
    # Write data in a new file
    dict_to_xls(data, out_path, sheet)
def datatocls(shpfile, mapstbl, sheet, slug, title, ncls, decplace, outshp, outmapstbl, method="QUANTILE"): """ Create classes/intervals for each map in table method options: * QUANTILE; * JENKS - natural breaks (jenks); """ import pandas as pd import numpy as np from glass.g.rd.shp import shp_to_obj from glass.g.wt.shp import df_to_shp from glass.ng.rd import tbl_to_obj from glass.ng.wt import obj_to_tbl from glass.ng.pd.fld import listval_to_newcols from glass.g.lyt.diutils import eval_intervals methods = ["QUANTILE", "JENKS"] if method not in methods: raise ValueError(f'Method {method} is not available') if method == "QUANTILE": from glass.ng.pd.stats import get_intervals elif method == "JENKS": import jenkspy # Read data shp = shp_to_obj(shpfile) maps = tbl_to_obj(mapstbl, sheet=sheet) # Get intervals for each map istats = [] for i, row in maps.iterrows(): ddig = row[decplace] icol = row[slug] titl = row[title] min_v = shp[icol].min() max_v = shp[icol].max() mean_v = shp[icol].mean() std_v = shp[icol].std() if method == "QUANTILE": intervals = get_intervals(shp, icol, ncls, method="QUANTILE") intervals.append(max_v) elif method == "JENKS": breaks = jenkspy.jenks_breaks(shp[icol], nb_class=ncls) intervals = breaks[1:] if not str(shp[icol].dtype).startswith('int'): __intervals = [round(i, ddig) for i in intervals] __intervals, ndig = eval_intervals( intervals, __intervals, ddig, round(min_v, ddig) ) istats.append([ icol, titl, round(min_v, ndig), round(max_v, ndig), round(mean_v, ddig), round(std_v, ddig), __intervals ]) shp[icol] = shp[icol].round(ddig) else: for _e in range(len(intervals)): if not _e: rzero = 1 if round(intervals[_e], 0) > min_v else 0 else: rzero = 1 if round(intervals[_e], 0) > \ round(intervals[_e - 1], 0) else 0 if not rzero: break __intervals = [round( _o, ddig if not rzero else 0 ) for _o in intervals] __intervals, ndig = eval_intervals( intervals, __intervals, ddig, min_v) istats.append([ icol, titl, min_v, max_v, int(round(mean_v, 0)) if rzero else round(mean_v, ddig), int(round(std_v, 0)) if rzero else round(std_v, ddig), __intervals ]) istats = pd.DataFrame(istats, columns=[ "slug", "title", "min_value", "max_value", "mean_value", "std_value", "intervals" ]) rename_cols = {} for idx, row in istats.iterrows(): # Get intervals int_ = row.intervals # Add columns for intervals i_col = 'i_' + row.slug shp[i_col] = 0 for _i in range(len(int_)): if not _i: shp[i_col] = np.where( shp[row.slug] <= int_[_i], _i + 1, shp[i_col] ) else: shp[i_col] = np.where( (shp[row.slug] > int_[_i - 1]) & (shp[row.slug] <= int_[_i]), _i + 1, shp[i_col] ) rename_cols[i_col] = row.slug shp.drop(istats.slug, axis=1, inplace=True) shp.rename(columns=rename_cols, inplace=True) istats = listval_to_newcols(istats, 'intervals') istats.rename(columns={ i : 'interval_' + str(i+1) for i in range(ncls) }, inplace=True) # Write outputs df_to_shp(shp, outshp) obj_to_tbl(istats, outmapstbl) return outshp, outmapstbl
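# A small runnable illustration of the natural-breaks call used above
# (argument name nb_class as in the code here; newer jenkspy releases
# renamed it to n_classes). It also shows why the code keeps breaks[1:]:
# jenks_breaks returns the minimum as the first element.
def _example_jenks():
    import jenkspy
    
    vals = [1, 2, 4, 5, 7, 9, 10, 20, 21, 24]
    breaks = jenkspy.jenks_breaks(vals, nb_class=3)
    
    # breaks[0] is min(vals); the class upper bounds are breaks[1:]
    return breaks[1:]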
def model_selection(dataFile, refCol, dataCol, outTbl, lang='english', CV=5):
    """
    See which model is better to use in text classification for a
    specific data sample
    
    Compare:
    Logistic Regression (LogisticRegression)
    (Multinomial) Naive Bayes (MultinomialNB)
    Linear Support Vector Machine (LinearSVC)
    Random Forest (RandomForestClassifier)
    """
    
    import os
    import pandas as pd
    from glass.pys.oss import fprop
    from glass.ng.rd import tbl_to_obj
    from glass.ng.clstxt import txt_to_num_representation
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.svm import LinearSVC
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.model_selection import cross_val_score
    from glass.ng.wt import obj_to_tbl
    
    # Data to DataFrame
    trainDf = tbl_to_obj(dataFile)
    
    # Just in case, delete rows with NULL refCol and NULL dataCol
    trainDf = trainDf[pd.notnull(trainDf[dataCol])]
    trainDf = trainDf[pd.notnull(trainDf[refCol])]
    
    # Ref col to integers
    trainDf['ref_id'] = trainDf[refCol].factorize()[0]
    
    # Text to numbers
    features = txt_to_num_representation(trainDf, dataCol, lang)
    
    labels = trainDf.ref_id
    
    """ Test Models """
    models = [
        RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
        LinearSVC(),
        MultinomialNB(),
        LogisticRegression(random_state=0)
    ]
    
    entries = []
    for model in models:
        m_name = model.__class__.__name__
        accuracies = cross_val_score(
            model, features, labels, scoring='accuracy', cv=CV
        )
        
        for fold_idx, accuracy in enumerate(accuracies):
            entries.append((m_name, fold_idx, accuracy))
    
    # Create and Export evaluation table
    cv_df = pd.DataFrame(
        entries, columns=['model_name', 'fold_idx', 'accuracy'])
    
    cv_df_gp = pd.DataFrame(cv_df.groupby('model_name').accuracy.mean())
    cv_df_gp.reset_index(inplace=True)
    
    # Export Graphic
    import seaborn as sns
    
    sns.boxplot(x='model_name', y='accuracy', data=cv_df)
    b = sns.stripplot(
        x='model_name', y='accuracy', data=cv_df,
        size=10, jitter=True, edgecolor="gray", linewidth=2)
    
    fig = b.get_figure()
    
    fig.savefig(os.path.join(
        os.path.dirname(outTbl), fprop(outTbl, 'fn') + '.png'
    ))
    
    return obj_to_tbl(cv_df_gp, outTbl)
def write_sld(attr_name, attr_colors, mapAttrKeys, sld_path,
              geometry=None, DATA='CATEGORICAL'):
    """
    Write a sld file using an association between field attributes and a color
    
    * attr_name -> name of a column in a layer
    
    * DATA -> CATEGORICAL | QUANTITATIVE
    
    * attr_colors -> list or table with styles for some category or interval
    
    QUANTITATIVE - TABLE EXAMPLE (Sheet Index = 0):
      | min | max | R | G | B
    1 |  0  |  5  | X | X | X
    2 |  5  | 10  | X | X | X
    3 | 10  | 15  | X | X | X
    4 | 15  | 20  | X | X | X
    5 | 20  | 25  | X | X | X
    
    QUANTITATIVE - LIST EXAMPLE:
    attr_colors = [
        {'min':  0, 'max':  5, 'R': X, 'G': X, 'B': X},
        {'min':  5, 'max': 10, 'R': X, 'G': X, 'B': X},
        {'min': 10, 'max': 15, 'R': X, 'G': X, 'B': X},
        {'min': 15, 'max': 20, 'R': X, 'G': X, 'B': X},
        {'min': 20, 'max': 25, 'R': X, 'G': X, 'B': X}
    ]
    
    CATEGORICAL - TABLE EXAMPLE
    CATEGORICAL - LIST EXAMPLE
    
    * mapAttrKeys -> dict mapping the semantic keys used by this function
    to the columns/keys in attr_colors
    EXAMPLE:
    mapAttrKeys = {
        'r' : 'R', 'g' : 'G', 'b' : 'B',
        'interval_min' : 'min', 'interval_max' : 'max'
    }
    
    keys that could be used:
    * r -> attr_colors key/column with red of red|green|blue cat color
    * g -> attr_colors key/column with green of red|green|blue cat color
    * b -> attr_colors key/column with blue of red|green|blue cat color
    * hex -> attr_colors key/column with color hex
    * interval_min -> attr_colors key/column with the interval's lower bound
    * interval_max -> attr_colors key/column with the interval's upper bound
    * stroke_hex -> attr_colors key/column with color hex for stroke
    * stroke_r -> attr_colors key/column with red of red|green|blue stroke color
    * stroke_g -> attr_colors key/column with green of red|green|blue stroke color
    * stroke_b -> attr_colors key/column with blue of red|green|blue stroke color
    * width -> attr_colors key/column with stroke width
    * opacity -> attr_colors key/column with opacity value for some category
    * category -> attr_colors key/column with category value
    
    sld_path -> path to sld file
    
    GEOMETRY -> Polygon | Line
    
    NOTE: This will work only for polygon/linear features
    """
    
    import os
    from glass.pys.Xml import write_xml_tree
    from glass.pys.oss import fprop
    from glass.g.wg.sld.rules import get_categorical_rules
    from glass.g.wg.sld.rules import get_quantitative_rules
    
    if DATA != 'CATEGORICAL' and DATA != 'QUANTITATIVE':
        raise ValueError(
            'DATA must be CATEGORICAL or QUANTITATIVE')
    
    if type(attr_colors) != list:
        if os.path.exists(attr_colors):
            ff = fprop(attr_colors, 'ff')
            
            if ff == '.json':
                import json
                attr_colors = json.load(open(attr_colors, 'r'))
            
            elif ff == '.xlsx' or ff == '.xls':
                from glass.ng.rd import tbl_to_obj
                attr_colors = tbl_to_obj(
                    attr_colors, sheet=0,
                    useFirstColAsIndex=None, output='array'
                )
            
            elif ff == '.dbf':
                from glass.ng.rd import tbl_to_obj
                attr_colors = tbl_to_obj(attr_colors, output='array')
            
            else:
                raise ValueError('Your file is not a JSON or a XLS file')
        else:
            raise ValueError(
                ('ERROR in argument attribute_value_colors: '
                 'You need to define a list or give a valid path to a json '
                 'file or to a xls file'))
    
    GEOMETRY = str(geometry) if geometry else 'Polygon'
    
    # Create Feature Type Style RULES
    sldRules = get_categorical_rules(
        attr_colors, attr_name, GEOMETRY, mapAttrKeys
    ) if DATA == 'CATEGORICAL' else get_quantitative_rules(
        attr_colors, attr_name, GEOMETRY, mapAttrKeys
    ) if DATA == 'QUANTITATIVE' else None
    
    # SLD Basic structure
    xml_sld_root = (
        'sld:StyledLayerDescriptor',
        'xmlns', 'http://www.opengis.net/sld',
        'xmlns:sld', 'http://www.opengis.net/sld',
        'xmlns:gml', 'http://www.opengis.net/gml',
        'xmlns:ogc', 'http://www.opengis.net/ogc',
        'version', '1.0.0'
    )
    
    sld = {
        xml_sld_root: {
            'sld:UserLayer': {
                'sld:LayerFeatureConstraints': {
                    'sld:FeatureTypeConstraint': ''
                },
                'sld:UserStyle': {
                    'sld:Name': 'Default Styler',
                    'sld:IsDefault': '1',
                    'sld:FeatureTypeStyle': {
                        'sld:Name': 'group 0',
                        'sld:FeatureTypeName': 'Feature',
                        (1, 'sld:SemanticTypeIdentifier'): 'generic:geometry',
                        (2, 'sld:SemanticTypeIdentifier'): 'colorbrewer:unique:corinne'
                    }
                }
            }
        }
    }
    
    sld_order = {
        xml_sld_root: ['sld:UserLayer'],
        'sld:UserLayer': ['sld:LayerFeatureConstraints', 'sld:UserStyle'],
        'sld:UserStyle': ['sld:Name', 'sld:IsDefault', 'sld:FeatureTypeStyle'],
        'sld:FeatureTypeStyle': [
            'sld:Name', 'sld:FeatureTypeName',
            (1, 'sld:SemanticTypeIdentifier'),
            (2, 'sld:SemanticTypeIdentifier')
        ],
        'ogc:PropertyIsEqualTo': ['ogc:PropertyName', 'ogc:Literal'],
        'ogc:And': [
            'ogc:PropertyIsLessThanOrEqualTo', 'ogc:PropertyIsGreaterThan'],
        'ogc:PropertyIsLessThanOrEqualTo': ['ogc:PropertyName', 'ogc:Literal'],
        'ogc:PropertyIsGreaterThan': ['ogc:PropertyName', 'ogc:Literal'],
        'sld:Fill': [
            ('sld:CssParameter', 'name', 'fill'),
            ('sld:CssParameter', 'name', 'fill-opacity')
        ]
    }
    
    sld[xml_sld_root]['sld:UserLayer']['sld:UserStyle'][
        'sld:FeatureTypeStyle'].update(sldRules)
    
    symbolizer = 'sld:PolygonSymbolizer' if GEOMETRY == 'Polygon' \
        else 'sld:LineSymbolizer' if GEOMETRY == 'Line' \
        else 'sld:PolygonSymbolizer'
    
    for i in range(len(sldRules.keys())):
        sld_order['sld:FeatureTypeStyle'].append((i + 1, 'sld:Rule'))
        sld_order[(i + 1, 'sld:Rule')] = [
            'sld:Name', 'sld:Title', 'ogc:Filter', symbolizer
        ]
    
    if GEOMETRY == 'Polygon':
        sld_order['sld:PolygonSymbolizer'] = ['sld:Fill', 'sld:Stroke']
    
    write_xml_tree(sld, sld_path, nodes_order=sld_order)
    
    return sld_path
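# Hedged usage sketch for write_sld with a QUANTITATIVE style; the colors,
# breaks, attribute name and output path are hypothetical placeholders.
def _example_write_sld():
    rules = [
        {'min': 0,  'max': 5,  'R': 255, 'G': 255, 'B': 178},
        {'min': 5,  'max': 10, 'R': 253, 'G': 141, 'B': 60},
        {'min': 10, 'max': 15, 'R': 189, 'G': 0,   'B': 38}
    ]
    
    return write_sld(
        'n_events', rules,
        {'r': 'R', 'g': 'G', 'b': 'B',
         'interval_min': 'min', 'interval_max': 'max'},
        '/tmp/style.sld', geometry='Polygon', DATA='QUANTITATIVE'
    )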
def join_tables_in_table(mainTable, mainIdField, joinTables, outTable):
    """
    Join one table with several other tables
    
    joinTables = {
        r'D:\TRENMO_JASP\CARRIS\valid_by_para\period_16_17h59\sabado\fvalidacoes_v6_2018-01-06.xlsx' : {
            "JOIN_FIELD"    : 'paragem',
            "COLS_TO_JOIN"  : {'n_validacao' : 'dia_6'}
        },
        r'D:\TRENMO_JASP\CARRIS\valid_by_para\period_16_17h59\sabado\fvalidacoes_v6_2018-01-13.xlsx' : {
            "JOIN_FIELD"    : 'paragem',
            "COLS_TO_JOIN"  : {'n_validacao' : 'dia_13'}
        },
        r'D:\TRENMO_JASP\CARRIS\valid_by_para\period_16_17h59\sabado\fvalidacoes_v6_2018-01-20.xlsx' : {
            "JOIN_FIELD"    : 'paragem',
            "COLS_TO_JOIN"  : {'n_validacao' : 'dia_20'}
        },
        r'D:\TRENMO_JASP\CARRIS\valid_by_para\period_16_17h59\sabado\fvalidacoes_v6_2018-01-27.xlsx' : {
            "JOIN_FIELD"    : 'paragem',
            "COLS_TO_JOIN"  : {'n_validacao' : 'dia_27'}
        }
    }
    
    #TODO: only works with xlsx tables as join TABLES
    """
    
    # Modules
    from glass.ng.rd import tbl_to_obj
    from glass.ng.wt import obj_to_tbl
    
    tableDf = tbl_to_obj(mainTable)
    
    for table in joinTables:
        xlsDf = tbl_to_obj(table)
        
        join_field = 'id_entity' if joinTables[table]["JOIN_FIELD"] == mainIdField \
            else joinTables[table]["JOIN_FIELD"]
        
        if joinTables[table]["JOIN_FIELD"] == mainIdField:
            xlsDf.rename(columns={mainIdField: join_field}, inplace=True)
        
        xlsDf.rename(columns=joinTables[table]["COLS_TO_JOIN"], inplace=True)
        
        tableDf = tableDf.merge(
            xlsDf, how='outer',
            left_on=mainIdField, right_on=join_field
        )
        
        tableDf.fillna(0, inplace=True)
        tableDf[mainIdField].replace(0, tableDf[join_field], inplace=True)
        
        tableDf.drop(join_field, axis=1, inplace=True)
    
    obj_to_tbl(tableDf, outTable)
    
    return outTable
def join_xls_table(main_table, fid_main, join_table, fid_join, copy_fields,
                   out_table, main_sheet=None, join_sheet=None):
    """
    Join tables using a common attribute
    
    Relations:
    - 1 to 1
    - N to 1
    
    TODO: Use Pandas Instead
    """
    
    import xlwt
    from glass.ng.rd import tbl_to_obj
    from glass.ng.xls.fld import col_name
    
    copy_fields = [copy_fields] if type(copy_fields) == str else \
        copy_fields if type(copy_fields) == list else None
    
    if not copy_fields:
        raise ValueError('copy_fields should be a list or a string')
    
    # main_table to dict
    mainData = tbl_to_obj(
        main_table, sheet=main_sheet,
        useFirstColAsIndex=True, output='dict'
    )
    
    # join table to dict
    joinData = tbl_to_obj(
        join_table, sheet=join_sheet,
        useFirstColAsIndex=True, output='dict'
    )
    
    # write output data
    out_sheet_name = 'data' if not main_sheet and not join_sheet else join_sheet \
        if join_sheet and not main_sheet else main_sheet
    
    out_xls = xlwt.Workbook()
    new_sheet = out_xls.add_sheet(out_sheet_name)
    
    # Write titles
    COLUMNS_ORDER = col_name(main_table, sheet_name=main_sheet)
    
    TITLES = COLUMNS_ORDER + copy_fields
    for i in range(len(TITLES)):
        new_sheet.write(0, i, TITLES[i])
    
    # parse data
    l = 1
    for fid in mainData:
        new_sheet.write(l, 0, fid)
        
        c = 1
        for col in COLUMNS_ORDER[1:]:
            new_sheet.write(l, c, mainData[fid][col])
            c += 1
        
        for col in copy_fields:
            if fid in joinData:
                new_sheet.write(l, c, joinData[fid][col])
            c += 1
        
        l += 1
    
    out_xls.save(out_table)
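# A minimal pandas sketch in the spirit of the "Use Pandas Instead" TODO
# above: the same 1-to-1 / N-to-1 join expressed as a merge. File names
# and column names are hypothetical.
def _example_join_with_pandas():
    import pandas as pd
    
    main = pd.read_excel('/tmp/main.xlsx')
    join = pd.read_excel('/tmp/join.xlsx')
    
    # Keep every row of the main table; copy two fields from the join table
    out = main.merge(
        join[['fid', 'copy_a', 'copy_b']],
        how='left', left_on='fid', right_on='fid'
    )
    
    out.to_excel('/tmp/joined.xlsx', index=False)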
def txtclsmdl_to_file(train, tRef, tData, outMdl, outTf, method='NaiveBayes'):
    """
    Fit a text classification model and save the model to file
    
    Classifier Options:
    1) NaiveBayes;
    2) LinearSupportVectorMachine;
    3) RandomForest;
    4) LogisticRegression.
    """
    
    import pandas as pd
    import joblib
    from glass.ng.rd import tbl_to_obj
    from glass.ng.clstxt import txt_to_num_representation
    
    # Data to Dataframe
    trainDf = tbl_to_obj(train) if type(train) != pd.DataFrame else train
    
    # Just in case, delete rows with NULL tRef and NULL tData
    trainDf = trainDf[pd.notnull(trainDf[tData])]
    trainDf = trainDf[pd.notnull(trainDf[tRef])]
    
    if method == 'NaiveBayes':
        from sklearn.naive_bayes import MultinomialNB
        from sklearn.feature_extraction.text import CountVectorizer
        from sklearn.feature_extraction.text import TfidfTransformer
        
        """ Train Model """
        # X train is tData
        # Y train is tRef
        x_train, y_train = trainDf[tData], trainDf[tRef]
        
        tvect = CountVectorizer()
        
        X_train_counts = tvect.fit_transform(x_train)
        
        tfidf_transformer = TfidfTransformer()
        
        X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
        
        clf = MultinomialNB().fit(X_train_tfidf, y_train)
    
    elif method == 'LinearSupportVectorMachine':
        from sklearn.svm import LinearSVC
        
        feat, tvect = txt_to_num_representation(
            trainDf, tData, __lang='english', returnTfiDf=True)
        
        # Train model
        clf = LinearSVC().fit(feat, trainDf[tRef])
    
    elif method == 'RandomForest':
        from sklearn.ensemble import RandomForestClassifier
        
        feat, tvect = txt_to_num_representation(
            trainDf, tData, __lang='english', returnTfiDf=True)
        
        clf = RandomForestClassifier(n_estimators=1000, random_state=0)
        
        clf.fit(feat, trainDf[tRef])
    
    elif method == 'LogisticRegression':
        from sklearn.feature_extraction.text import CountVectorizer
        from sklearn.feature_extraction.text import TfidfTransformer
        from sklearn.pipeline import Pipeline
        from sklearn.linear_model import LogisticRegression
        
        clf = Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', LogisticRegression(
                n_jobs=1, C=1e5, multi_class='auto', solver='lbfgs'))
        ])
        
        clf.fit(trainDf[tData], trainDf[tRef])
    
    if method != 'LogisticRegression':
        joblib.dump(tvect, outTf)
        joblib.dump(clf, outMdl)
    else:
        # The pipeline already includes the vectorizer
        joblib.dump(clf, outMdl)
        outTf = None
    
    return outMdl, outTf
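# Hedged end-to-end sketch: fit a model with txtclsmdl_to_file and reuse
# it with predict_fm_mdl (defined earlier in this module). All paths and
# column names are hypothetical.
def _example_fit_then_predict():
    mdl, tf = txtclsmdl_to_file(
        '/tmp/train.xlsx', 'label', 'text',
        '/tmp/model.joblib', '/tmp/tfidf.joblib',
        method='RandomForest'
    )
    
    return predict_fm_mdl(
        mdl, tf, '/tmp/to_classify.xlsx', 'text',
        method='RandomForest'
    )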
def otp_cf_based_on_rel(incidents, group_incidents_col, facilities,
                        facilities_id, rel_inc_fac, sheet, group_fk,
                        facilities_fk, hour, day, output):
    """
    Calculate travel time considering specific facilities for each group
    of incidents
    
    Relations between incidents and facilities are in an auxiliary table
    (rel_inc_fac). The auxiliary table must be an xlsx file
    """
    
    import os
    import pandas as pd
    from glass.ng.rd import tbl_to_obj
    from glass.g.rd.shp import shp_to_obj
    from glass.g.wt.shp import obj_to_shp
    from glass.g.mob.otp.log import clsfacility
    from glass.g.prop.prj import get_shp_epsg
    from glass.ng.pd import merge_df
    from glass.pys.oss import fprop
    from glass.g.prj.obj import df_prj
    
    # Open data
    idf = df_prj(shp_to_obj(incidents), 4326)
    fdf = df_prj(shp_to_obj(facilities), 4326)
    
    rel_df = tbl_to_obj(rel_inc_fac, sheet=sheet)
    
    # Avoid problems when facilities_id == facilities_fk: rename the
    # foreign key column of the relation table as well, so the merge
    # below still finds it
    if facilities_id == facilities_fk:
        rel_df.rename(
            columns={facilities_fk: facilities_fk + '_fk'}, inplace=True)
        facilities_fk = facilities_fk + '_fk'
    
    oepsg = get_shp_epsg(incidents)
    
    # Relate facilities with incidents groups
    fdf = fdf.merge(
        rel_df, how='inner',
        left_on=facilities_id, right_on=facilities_fk
    )
    
    # List Groups
    grp_df = pd.DataFrame({
        'cnttemp': idf.groupby([group_incidents_col])[
            group_incidents_col].agg('count')
    }).reset_index()
    
    # Do calculations
    res = []
    logs = []
    for idx, row in grp_df.iterrows():
        # Get incidents for that group
        new_i = idf[idf[group_incidents_col] == row[group_incidents_col]]
        
        # Get facilities for that group
        new_f = fdf[fdf[group_fk] == row[group_incidents_col]]
        
        # calculate closest facility
        cfres, l = clsfacility(new_i, new_f, hour, day, out_epsg=oepsg)
        
        res.append(cfres)
        logs.extend(l)
    
    # Merge results
    out_df = merge_df(res)
    
    # Recover facility id
    fdf.drop([
        c for c in fdf.columns.values if c != facilities_id
    ], axis=1, inplace=True)
    
    out_df = out_df.merge(fdf, how='left', left_on='ffid', right_index=True)
    
    # Export result
    obj_to_shp(out_df, "geom", oepsg, output)
    
    # Write logs
    if len(logs) > 0:
        with open(os.path.join(
            os.path.dirname(output), fprop(output, 'fn') + '_log.txt'
        ), 'w') as txt:
            for i in logs:
                txt.write((
                    "Incident_id: {}\n"
                    "Facility_id: {}\n"
                    "ERROR message:\n"
                    "{}\n"
                    "\n\n\n\n\n\n"
                ).format(str(i[0]), str(i[1]), str(i[2])))
    
    return output
def correlated_words(dataFile, refCol, dataCol, outTbl, lang='english',
                     N=2, refSheet=None):
    """
    Get words correlated with some text class
    """
    
    import numpy as np
    import pandas as pd
    from sklearn.feature_selection import chi2
    from glass.ng.wt import obj_to_tbl
    from glass.ng.rd import tbl_to_obj
    from glass.ng.clstxt import txt_to_num_representation
    
    # Data to DataFrame
    trainDf = tbl_to_obj(
        dataFile, sheet=refSheet
    ) if type(dataFile) != pd.DataFrame else dataFile
    
    # Just in case, delete rows with NULL refCol and NULL dataCol
    trainDf = trainDf[pd.notnull(trainDf[dataCol])]
    trainDf = trainDf[pd.notnull(trainDf[refCol])]
    
    """
    Add a column encoding the reference classes as an integer because
    categorical variables are often better represented by integers
    than strings
    """
    
    # Get an ID for the Ref/text classes values
    trainDf['ref_id'] = trainDf[refCol].factorize()[0]
    
    # Create Dataframe only with ref_id's, without duplicates
    ref_id_df = trainDf[[refCol, 'ref_id']].drop_duplicates().sort_values(
        'ref_id'
    )
    
    # Create dicts to easily relate ref_id with ref_value
    ref_to_id = dict(ref_id_df.values)
    id_to_ref = dict(ref_id_df[['ref_id', refCol]].values)
    
    """ Text to numbers """
    features, tfidf = txt_to_num_representation(
        trainDf, dataCol, lang, returnTfiDf=True)
    
    labels = trainDf.ref_id
    
    """ Get most correlated words """
    corr_words = []
    for ref_name, ref_id in sorted(ref_to_id.items()):
        features_chi2 = chi2(features, labels == ref_id)
        
        indices = np.argsort(features_chi2[0])
        
        feat_names = np.array(tfidf.get_feature_names())[indices]
        
        unigrams = [v for v in feat_names if len(v.split(' ')) == 1][-N:]
        bigrams  = [v for v in feat_names if len(v.split(' ')) == 2][-N:]
        
        corr_words.append([ref_name] + unigrams + bigrams)
    
    COLS_NAME = ['ref_name'] + [
        'unigram_{}'.format(str(i+1)) for i in range(N)
    ] + [
        'bigram_{}'.format(str(i+1)) for i in range(N)
    ]
    
    dfCorrWords = pd.DataFrame(corr_words, columns=COLS_NAME)
    
    return obj_to_tbl(dfCorrWords, outTbl)
def tbl_to_db(tblFile, db, sqlTbl, delimiter=None, encoding_='utf-8', sheet=None, isAppend=None, api_db='psql', colsMap=None): """ Table file to Database Table API's available: * psql; * sqlite; """ import os from glass.pys import obj_to_lst from glass.pys.oss import fprop from glass.ng.rd import tbl_to_obj from glass.g.wt.sql import df_to_db if os.path.isdir(tblFile): from glass.pys.oss import lst_ff tbls = lst_ff(tblFile) else: tbls = obj_to_lst(tblFile) outSQLTbl = obj_to_lst(sqlTbl) RTBL = [] for i in range(len(tbls)): fp = fprop(tbls[i], ['fn', 'ff']) ff = fp['fileformat'] fn = fp['filename'] if ff == '.csv' or ff == '.txt' or ff == '.tsv': if not delimiter: raise ValueError(( "To convert TXT to DB table, you need to give a value for the " "delimiter input parameter" )) __enc = 'utf-8' if not encoding_ else encoding_ data = tbl_to_obj( tbls[i], _delimiter=delimiter, encoding_=__enc ) elif ff == '.dbf': data = tbl_to_obj(tbls[i]) elif ff == '.xls' or ff == '.xlsx': data = tbl_to_obj(tbls[i], sheet=sheet) elif ff == '.ods': if not sheet: raise ValueError(( "To convert ODS to DB table, you need to give a value " "for the sheet input parameter" )) data = tbl_to_obj(tbls[i], sheet=sheet) else: raise ValueError('{} is not a valid table format!'.format(ff)) if colsMap: data.rename(columns=colsMap, inplace=True) # Send data to database out_tbl = fn if not outSQLTbl else outSQLTbl[i] \ if i+1 <= len(tbls) else fn _rtbl = df_to_db( db, data, out_tbl, append=isAppend, api=api_db ) RTBL.append(_rtbl) return RTBL[0] if len(RTBL) == 1 else RTBL
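# Hedged usage sketch for tbl_to_db: load one CSV file into a PostgreSQL
# table (database, table and file names are hypothetical placeholders).
def _example_tbl_to_db():
    return tbl_to_db(
        '/tmp/sensors.csv', 'mydb', 'sensors',
        delimiter=';', encoding_='utf-8', api_db='psql'
    )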
def join_shp_with_tbl(shp, shp_pk, tbl, tbl_fk, outShp, joinFieldsMantain=None, newNames=None, csv_delimiter=';', isbgri=None, sheet=None): """ Join BGRI ESRI Shapefile with table in xlsx or csv formats """ import pandas as pd from glass.pys import obj_to_lst from glass.ng.rd import tbl_to_obj from glass.g.rd.shp import shp_to_obj from glass.g.wt.shp import df_to_shp # Read main_table mainDf = shp_to_obj(shp) # Read join table joinDf = tbl_to_obj(tbl, _delimiter=csv_delimiter, encoding_='utf-8', sheet=sheet) # Force ids to strings mainDf[shp_pk] = mainDf[shp_pk].astype(str) joinDf[tbl_fk] = joinDf[tbl_fk].astype(str) # Sanitize GEO_COD of bgriCsv if isbgri: joinDf[tbl_fk] = joinDf[tbl_fk].str.replace("'", "") if joinFieldsMantain: joinFieldsMantain = obj_to_lst(joinFieldsMantain) dropCols = [] for col in joinDf.columns.values: if col not in [shp_pk] + joinFieldsMantain: dropCols.append(col) joinDf.drop(dropCols, axis=1, inplace=True) # Force numeric columns to be numeric for c in joinDf.columns.values: if c != tbl_fk: joinDf[c] = pd.to_numeric(joinDf[c], errors='ignore') resultDf = mainDf.merge(joinDf, how='inner', left_on=shp_pk, right_on=tbl_fk) if newNames: newNames = obj_to_lst(newNames) renDict = { joinFieldsMantain[n]: newNames[n] for n in range(len(joinFieldsMantain)) } resultDf.rename(columns=renDict, inplace=True) df_to_shp(resultDf, outShp) return outShp