class Job(Mysql_Job):
    OUTPUT_TYPES = {
        'some_field1': types.INT(),
        'some_field2': types.INT(),
        'some_field3': types.VARCHAR(100)}

    def transform(self):
        return self.query_mysql("""SELECT * FROM some_table """)
def sqlcol(dfparam, text=None):
    dtypedict = {}
    for i, j in zip(dfparam.columns, dfparam.dtypes):
        if "object" in str(j):
            if text == "postgresql-text":
                dtypedict.update({i: TEXT()})
            else:
                try:
                    # Size the column at roughly twice the longest string, in 80-char steps.
                    x = int(dfparam[i].str.len().max() / 40) + 1  # was `df[i]`, an undefined name
                except Exception:
                    x = 50
                dtypedict.update({i: types.VARCHAR(length=x * 80)})
        if "datetime" in str(j):
            dtypedict.update({i: types.DateTime()})
        if "float" in str(j):
            dtypedict.update({i: types.Float(precision=3, asdecimal=True)})
        if "int" in str(j):
            dtypedict.update({i: types.INT()})
    return dtypedict
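For context, a minimal usage sketch (assumed, not from the source): the engine URL, table name, and sample frame are placeholders, and sqlcol needs `from sqlalchemy import types` plus `from sqlalchemy.types import TEXT` in scope.

import pandas as pd
from sqlalchemy import create_engine

df = pd.DataFrame({
    'name': ['alice', 'bob'],                                # object -> VARCHAR
    'joined': pd.to_datetime(['2020-01-01', '2020-06-01']),  # datetime64 -> DateTime
    'score': [1.5, 2.5],                                     # float -> Float
    'visits': [3, 7],                                        # int -> INT
})

engine = create_engine('postgresql://user:pass@localhost/db')  # placeholder DSN
# Pass the generated mapping to pandas so the created table gets explicit column types.
df.to_sql('users', engine, if_exists='replace', index=False, dtype=sqlcol(df))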
def sql_col(data_frame):
    """Convert python data types to sql data types"""
    dtypes_dict = {}
    for column, dtype in zip(data_frame.columns, data_frame.dtypes):
        if "object" in str(dtype):
            str_max_len = col_length(data_frame[column].str.len().max())
            dtypes_dict.update({column: types.VARCHAR(length=str_max_len)})
        if "datetime" in str(dtype):
            dtypes_dict.update({column: types.DateTime()})
        if "float" in str(dtype):
            dtypes_dict.update({column: types.FLOAT})  # types.Float(precision=3, asdecimal=True)
        if "int" in str(dtype):
            dtypes_dict.update({column: types.INT()})
    return dtypes_dict
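col_length is not shown in the snippet above. A hypothetical stand-in (the name, padding scheme, and default are assumptions) that rounds the observed maximum string length up to a bucket:

import math

def col_length(max_len, padding=20, default=500):
    # Hypothetical helper: round the max observed length up to the next
    # multiple of `padding`; fall back to `default` when the column is
    # empty (pandas returns NaN for the max of an all-null column).
    if max_len is None or (isinstance(max_len, float) and math.isnan(max_len)):
        return default
    return max(int(math.ceil(max_len / padding)) * padding, padding)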
class Job(ETL_Base):
    OUTPUT_TYPES = {
        'session_id': types.VARCHAR(16),
        'count_events': types.INT(),
    }

    def transform(self, some_events, other_events):
        df = self.query("""
            SELECT se.session_id, count(*) AS count_events
            FROM some_events se
            JOIN other_events oe ON se.session_id = oe.session_id
            WHERE se.action = 'searchResultPage' AND se.n_results > 0
            GROUP BY se.session_id
            ORDER BY count(*) DESC
            """)
        return df
class Job(ETL_Base):
    OUTPUT_TYPES = {
        'session_id': types.VARCHAR(16),
        'count_events': types.INT(),
    }

    def transform(self):
        cred_profiles = Cred_Ops_Dispatcher().retrieve_secrets(self.jargs.storage)
        query_str = """
            SELECT session_id, count_events
            FROM test_ex5_pyspark_job
            WHERE rownum < 200
            """
        df = query_oracle(query_str, db=self.db_creds, creds_or_file=cred_profiles)
        # TODO: Check to get OUTPUT_TYPES from query_oracle, so not required here.
        sdf = pdf_to_sdf(df, self.OUTPUT_TYPES, self.sc, self.sc_sql)
        return sdf
def gen_types_from_pandas_to_sql(table):
    r"""Generate a dictionary with the database types related to the dataframe dtypes"""
    dtypedict = {}
    for i, j in zip(table.columns, table.dtypes):
        if 'object' in str(j):
            dtypedict.update({i: types.NVARCHAR(length=500)})
        if 'datetime' in str(j):
            dtypedict.update({i: types.DateTime()})
        if 'float' in str(j):
            dtypedict.update({i: types.Float(precision=3, asdecimal=True)})
        if 'int' in str(j):
            # Promote to BIGINT when any non-null value exceeds the INT limit.
            if max([l for l in table[i].tolist() if not pd.isnull(l)]) > cp.INT_LIMIT:
                dtypedict.update({i: types.BIGINT()})
            else:
                dtypedict.update({i: types.INT()})
    return dtypedict
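The cp.INT_LIMIT constant comes from elsewhere in that project; a small sketch assuming it is the signed 32-bit maximum (2**31 - 1), the usual ceiling for a SQL INT column:

import pandas as pd
from sqlalchemy import types

INT_LIMIT = 2**31 - 1  # assumed value for cp.INT_LIMIT

df = pd.DataFrame({
    'small_id': [1, 2, 3],
    'big_id': [1, 2, 3_000_000_000],  # exceeds the 32-bit signed range
})

for col in df.columns:
    # Same promotion rule as gen_types_from_pandas_to_sql: BIGINT above the limit.
    sql_type = types.BIGINT() if df[col].max() > INT_LIMIT else types.INT()
    print(col, type(sql_type).__name__)
# small_id INTEGER  (types.INT is an alias of INTEGER)
# big_id BIGINT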
def get_spark_type(field, required_type):
    if isinstance(required_type, type(db_types.DATE())):
        return spk_types.StructField(field, spk_types.DateType(), True)
    elif isinstance(required_type, type(db_types.DATETIME())):
        return spk_types.StructField(field, spk_types.TimestampType(), True)
    elif isinstance(required_type, type(db_types.VARCHAR())):
        return spk_types.StructField(field, spk_types.StringType(), True)
    elif isinstance(required_type, type(db_types.INT())):
        # DB types are enforced before Spark types, so the Spark type must be
        # less restrictive than the DB one; use LongType instead of IntegerType.
        return spk_types.StructField(field, spk_types.LongType(), True)
    elif isinstance(required_type, type(db_types.FLOAT())):
        return spk_types.StructField(field, spk_types.FloatType(), True)
    elif isinstance(required_type, type(db_types.BOOLEAN())):
        return spk_types.StructField(field, spk_types.BooleanType(), True)
    else:
        raise Exception("Type not recognized, field={}, required_type={}".format(
            field, required_type))
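A minimal sketch of how such a field mapper can assemble a complete Spark schema from an OUTPUT_TYPES dict (assuming the db_types/spk_types aliases used above point at sqlalchemy.types and pyspark.sql.types):

from sqlalchemy import types as db_types
from pyspark.sql import types as spk_types

OUTPUT_TYPES = {
    'session_id': db_types.VARCHAR(16),
    'count_events': db_types.INT(),
}

# StructType takes the StructFields in order, so column order is preserved.
schema = spk_types.StructType(
    [get_spark_type(field, required_type)
     for field, required_type in OUTPUT_TYPES.items()])
# sdf = spark.createDataFrame(pandas_df, schema=schema)  # with an active SparkSession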
def db_write(self, df, tb_name, dbtype='mssql', conp=None, rora='replace'):
    cs = None
    if len(df) > 20000:
        cs = 10000  # write large frames in chunks
    con = self.getconstr(dbtype, conp)
    dtypedict = {}
    for i, j in zip(df.columns, df.dtypes):
        if "object" in str(j):
            try:
                # Roughly twice the longest string, rounded up in 80-char steps.
                x = int(df[i].str.len().max() / 40) + 1
            except Exception:
                x = 50
            dtypedict.update({i: types.VARCHAR(length=x * 80)})
        if "datetime" in str(j):
            dtypedict.update({i: types.DateTime()})
        if "float" in str(j):
            dtypedict.update({i: types.Float(precision=3, asdecimal=True)})
        if "int" in str(j):
            dtypedict.update({i: types.INT()})
    if dbtype in ['postgresql', 'mssql']:
        df.to_sql(tb_name, con, if_exists=rora, index=False, schema=conp[4],
                  dtype=dtypedict, chunksize=cs)
    elif dbtype in ['oracle', 'mysql', 'sqlite']:
        df.to_sql(tb_name, con, if_exists=rora, index=False,
                  dtype=dtypedict, chunksize=cs)
def cast_value(value, required_type, field_name):
    # TODO: make it less ugly.. or avoid using pandas to not require this.
    try:
        if isinstance(required_type, type(db_types.DATE())):
            if isinstance(value, str):
                return datetime.strptime(value, "%Y-%m-%d")  # assuming iso format
            elif isinstance(value, pd.Timestamp):  # == datetime
                return value.to_pydatetime().date()
            elif isinstance(value, date):
                return value
            elif pd.isnull(value):
                return None
            else:
                return required_type.python_type(value)
        elif isinstance(required_type, type(db_types.DATETIME())):
            if isinstance(value, str):
                return datetime.strptime(value, "%Y-%m-%d %H:%M:%S")  # assuming iso format
            elif isinstance(value, pd.Timestamp):
                return value.to_pydatetime()
            elif pd.isnull(value):
                return None
            else:
                return required_type.python_type(value)
        elif isinstance(required_type, type(db_types.VARCHAR())):
            return None if pd.isnull(value) else str(value)
        elif isinstance(required_type, type(db_types.INT())):
            return None if pd.isnull(value) else int(float(value))
        elif isinstance(required_type, type(db_types.BIGINT())):
            return None if pd.isnull(value) else int(value)  # `long` in the original is Python 2 only
        elif isinstance(required_type, type(db_types.FLOAT())):
            return None if pd.isnull(value) else float(value)
        else:
            return required_type.python_type(value)
    except Exception as e:
        logger.error(u"cast_value issue: {}, {}, {}, {}, {}.".format(
            field_name, value, type(value), required_type, str(e)))
        return None
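A sketch of applying cast_value column-wise to coerce a pandas frame into declared OUTPUT_TYPES before conversion to Spark (the frame and the type dict here are illustrative):

import pandas as pd
from sqlalchemy import types as db_types

OUTPUT_TYPES = {
    'session_id': db_types.VARCHAR(16),
    'count_events': db_types.INT(),
}

df = pd.DataFrame({
    'session_id': [101, 102],
    'count_events': ['3', None],  # mixed strings and nulls
})

for field, required_type in OUTPUT_TYPES.items():
    df[field] = df[field].apply(lambda v: cast_value(v, required_type, field))
# session_id becomes '101'/'102' (str); count_events becomes 3/None (int or null)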
def dtype_to_sqldtype(self, df):
    """
    Creates a dict of SQL-dtypes to pass to the pd.to_sql() command.

    Args:
        df (DataFrame): Dataframe with data to be converted.

    Returns:
        sqldtype_dict (dict): Dict with SQL-dtype per column.
    """
    sqldtype_dict = {}
    for i, j in zip(df.columns, df.dtypes):
        if "object" in str(j):
            sqldtype_dict.update({i: sql_dtype.VARCHAR(12)})
        if "datetime" in str(j):
            sqldtype_dict.update({i: sql_dtype.DateTime()})
        if "float" in str(j):
            sqldtype_dict.update({i: sql_dtype.FLOAT(precision=12)})
        if "int" in str(j):
            sqldtype_dict.update({i: sql_dtype.INT()})
    return sqldtype_dict
def osm_delineation(param):
    """Delineate waterway reaches from OSM for the plan-limit points and update the GIS database."""
    osm.op_endpoint = param['osm']['op_endpoint']

    ########################################
    ### Load data

    # run_time_start = pd.Timestamp.today().strftime('%Y-%m-%d %H:%M:%S')
    # print(run_time_start)

    ## Read in source data
    print('--Reading in source data...')
    json_lst = get_json_from_api(param['plan_limits']['api_url'],
                                 param['plan_limits']['api_headers'])
    json_lst1 = json_filters(json_lst, only_operative=True, only_reach_points=True)
    gjson1, hydro_units, pts_alt, sg1 = geojson_convert(json_lst1)

    combined_zones1 = [j for j in json_lst if j['id'] == param['other']['combined_zones_id']][0]
    combined_zones2 = [s['id'] for s in combined_zones1['spatialUnit']]

    no_limit1 = [j for j in json_lst if j['id'] == param['other']['no_limit_id']][0]
    no_limit2 = [s['id'] for s in no_limit1['spatialUnit']][0]

    # Commented-out variants in the source passed username/password to the
    # rd_sql / update_table_rows / del_table_rows calls explicitly.
    pts = mssql.rd_sql(
        param['gis_waterdata']['server'], param['gis_waterdata']['database'],
        param['gis_waterdata']['pts']['table'], [param['gis_waterdata']['pts']['id']],
        where_in={param['gis_waterdata']['pts']['id']: pts_alt.id.unique().tolist()},
        geo_col=True, rename_cols=[id_col])

    ## Point checks
    excluded_points = pts_alt[~pts_alt.id.isin(pts.SpatialUnitId)].copy()
    if not excluded_points.empty:
        print('These points are in the Plan Limits db, but have no GIS data:')
        print(excluded_points)

    bad_geo = pts[pts.geom_type != 'Point']
    if not bad_geo.empty:
        print('These points do not have a "Point" geometry (likely "MultiPoint"):')
        print(bad_geo)
        pts = pts[~pts.SpatialUnitId.isin(bad_geo.SpatialUnitId)].copy()

    cwms1 = mssql.rd_sql(
        param['gis_prod']['server'], param['gis_prod']['database'],
        param['gis_prod']['cwms']['table'], param['gis_prod']['cwms']['col_names'],
        rename_cols=param['gis_prod']['cwms']['rename_cols'], geo_col=True,
        username=param['gis_prod']['username'], password=param['gis_prod']['password'])

    zones3 = mssql.rd_sql(
        param['gis_waterdata']['server'], param['gis_waterdata']['database'],
        param['gis_waterdata']['allo_zones']['table'], [param['gis_waterdata']['allo_zones']['id']],
        where_in={param['gis_waterdata']['allo_zones']['id']: combined_zones2},
        geo_col=True, rename_cols=[id_col])

    pts['geometry'] = pts.geometry.simplify(1)

    #######################################
    ### Run query

    print('--Pull out the waterways from OSM')
    pts1, bad_points = osm.get_nearest_waterways(pts, id_col, param['other']['search_distance'], 'all')
    waterways, nodes = osm.get_waterways(pts1, 'all')

    print('--Delineating Reaches from OSM')
    site_delin = osm.waterway_delineation(waterways, True)
    osm_delin = osm.to_osm(site_delin, nodes)
    gdf1 = osm.to_gdf(osm_delin)
    gdf2 = gdf1.to_crs(pts.crs)
    gdf3 = gdf2.merge(pts1.rename(columns={'id': 'start_node'})[['start_node', id_col]],
                      on='start_node')

    print('--Pulling out all of Canterbury...')
    cant2 = osm.get_waterways_within_boundary(cwms1, buffer=0, waterway_type='all')

    combined1, poly1 = vector.pts_poly_join(cant2, zones3, id_col, op='intersects')
    gdf3 = gdf3[~gdf3.way_id.isin(combined1.way_id.unique())].copy()
    all_others1 = cant2[~cant2.way_id.isin(combined1.way_id)]
    all_others2 = all_others1[~all_others1.way_id.isin(gdf3.way_id.unique().tolist())].copy()
    all_others2[id_col] = no_limit2

    print('--Combine all reach data')
    gdf4 = pd.concat([gdf3, combined1, all_others2]).reset_index(drop=True)
    gdf4.rename(columns={'way_id': 'OSMWaterwayId', 'waterway': 'OSMWaterwayType',
                         'name': 'RiverName', 'start_node': 'StartNode'}, inplace=True)
    gdf4['OSMWaterwayId'] = gdf4['OSMWaterwayId'].astype('int64')

    print('--Compare existing reaches in the database')
    cols = gdf4.columns.drop('geometry').tolist()
    cols.extend(['OBJECTID'])
    old1 = mssql.rd_sql(
        param['gis_waterdata']['server'], param['gis_waterdata']['database'],
        param['gis_waterdata']['reaches']['table'], cols, geo_col=True)

    comp_dict = util.compare_dfs(old1.drop('OBJECTID', axis=1), gdf4,
                                 on=['SpatialUnitId', 'OSMWaterwayId'])
    new1 = comp_dict['new'].copy()
    diff1 = comp_dict['diff'].copy()
    rem1 = comp_dict['remove'][['SpatialUnitId', 'OSMWaterwayId']].copy()

    print('--Save to database')
    sql_dtypes = {'StartNode': types.BIGINT(), 'OSMWaterwayId': types.BIGINT(),
                  'RiverName': types.NVARCHAR(200), 'OSMWaterwayType': types.NVARCHAR(30),
                  'SpatialUnitId': types.NVARCHAR(8), 'SHAPE_': types.VARCHAR(),
                  'OBJECTID': types.INT(), 'ModifiedDate': types.DATETIME()}

    if not new1.empty:
        max_id = old1['OBJECTID'].max() + 1
        new1['ModifiedDate'] = today_str
        new1['OBJECTID'] = list(range(max_id, max_id + len(new1)))
        new1.rename(columns={'geometry': 'SHAPE'}, inplace=True)
        mssql.update_table_rows(
            new1, param['gis_waterdata']['server'], param['gis_waterdata']['database'],
            param['gis_waterdata']['reaches']['table'], on=['SpatialUnitId', 'OSMWaterwayId'],
            index=False, append=True, geo_col='SHAPE', clear_table=False, dtype=sql_dtypes)
    if not diff1.empty:
        diff2 = pd.merge(diff1, old1[['SpatialUnitId', 'OSMWaterwayId', 'OBJECTID']],
                         on=['SpatialUnitId', 'OSMWaterwayId'])
        diff2['ModifiedDate'] = today_str
        diff2.rename(columns={'geometry': 'SHAPE'}, inplace=True)
        mssql.update_table_rows(
            diff2, param['gis_waterdata']['server'], param['gis_waterdata']['database'],
            param['gis_waterdata']['reaches']['table'], on=['SpatialUnitId', 'OSMWaterwayId'],
            index=False, append=True, geo_col='SHAPE', clear_table=False, dtype=sql_dtypes)
    if not rem1.empty:
        mssql.del_table_rows(param['gis_waterdata']['server'], param['gis_waterdata']['database'],
                             param['gis_waterdata']['reaches']['table'], pk_df=rem1)

    return gdf4, excluded_points, bad_geo, bad_points
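For orientation, a hypothetical driver for the pipeline above (the parameters.yml file name and its layout are assumptions; the keys must mirror those read inside osm_delineation):

import yaml

with open('parameters.yml') as f:  # hypothetical config file
    param = yaml.safe_load(f)

gdf4, excluded_points, bad_geo, bad_points = osm_delineation(param)
print('{} reaches delineated; {} points had no nearby waterway'.format(
    len(gdf4), len(bad_points)))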