Example #1
class Job(Mysql_Job):
    OUTPUT_TYPES = {
        'some_field1': types.INT(),
        'some_field2': types.INT(),
        'some_field3': types.VARCHAR(100)}

    def transform(self):
        return self.query_mysql("""SELECT * FROM some_table """)
Example #2
    def sqlcol(dfparam, text=None):

        dtypedict = {}
        for i, j in zip(dfparam.columns, dfparam.dtypes):

            if "object" in str(j):
                if text == "postgresql-text":
                    dtypedict.update({i: TEXT()})
                else:
                    try:
                        x = int(dfparam[i].str.len().max() / 40) + 1
                    except Exception:  # e.g. empty or all-null column (max() is NaN)
                        x = 50
                    dtypedict.update({i: types.VARCHAR(length=x * 80)})

            if "datetime" in str(j):
                dtypedict.update({i: types.DateTime()})

            if "float" in str(j):
                dtypedict.update({i: types.Float(precision=3, asdecimal=True)})

            if "int" in str(j):
                dtypedict.update({i: types.INT()})

        return dtypedict
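The dict that sqlcol returns is meant to be passed as the dtype argument of pandas.DataFrame.to_sql. A minimal usage sketch, assuming a SQLAlchemy engine pointed at a hypothetical PostgreSQL database:

import pandas as pd
from sqlalchemy import create_engine

# Hypothetical connection string; point it at your own database.
engine = create_engine("postgresql://user:pass@localhost:5432/mydb")

df = pd.DataFrame({"name": ["ada", "grace"], "score": [9.5, 9.9]})

# sqlcol maps each pandas dtype to a SQLAlchemy type, so to_sql creates
# typed columns instead of falling back to the driver defaults.
df.to_sql("people", engine, if_exists="replace", index=False, dtype=sqlcol(df))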
Example #3
def sql_col(data_frame):
    """Convert python data types to sql data types"""
    dtypes_dict = {}
    for column, dtype in zip(data_frame.columns, data_frame.dtypes):
        if "object" in str(dtype):
            str_max_len = col_length(data_frame[column].str.len().max())
            dtypes_dict.update({column: types.VARCHAR(length=str_max_len)})
        if "datetime" in str(dtype):
            dtypes_dict.update({column: types.DateTime()})
        if "float" in str(dtype):
            dtypes_dict.update({column: types.FLOAT()})  # or types.Float(precision=3, asdecimal=True)
        if "int" in str(dtype):
            dtypes_dict.update({column: types.INT()})
    return dtypes_dict
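sql_col calls a col_length helper that is not shown in the snippet. A plausible sketch of it (an assumption, not the original code): pad the observed maximum string length and fall back to a default when the column is empty or all-null.

import math

def col_length(max_len, default=255, ceiling=4000):
    # Hypothetical helper: .str.len().max() returns NaN for all-null columns.
    if max_len is None or (isinstance(max_len, float) and math.isnan(max_len)):
        return default
    # Double the observed length for headroom, capped at a sane VARCHAR size.
    return min(int(max_len) * 2, ceiling) or default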
Example #4
class Job(ETL_Base):
    OUTPUT_TYPES = {
        'session_id': types.VARCHAR(16),
        'count_events': types.INT(),
    }

    def transform(self, some_events, other_events):
        df = self.query("""
            SELECT se.session_id, count(*) as count_events
            FROM some_events se
            JOIN other_events oe on se.session_id=oe.session_id
            WHERE se.action='searchResultPage' and se.n_results>0
            group by se.session_id
            order by count(*) desc
            """)
        return df
Example #5
class Job(ETL_Base):

    OUTPUT_TYPES = {
        'session_id': types.VARCHAR(16),
        'count_events': types.INT(),
        }

    def transform(self):
        cred_profiles = Cred_Ops_Dispatcher().retrieve_secrets(self.jargs.storage)
        query_str = """
            SELECT session_id, count_events
            FROM test_ex5_pyspark_job
            where rownum < 200
            """
        df = query_oracle(query_str, db=self.db_creds, creds_or_file=cred_profiles)
        # TODO: Check to get OUTPUT_TYPES from query_oracle, so not required here.
        sdf = pdf_to_sdf(df, self.OUTPUT_TYPES, self.sc, self.sc_sql)
        return sdf
Example #6
def gen_types_from_pandas_to_sql(table):
    r"""
    Generate a dictionnary with the database types related to the dataframe dtypes
    """
    dtypedict = {}
    for i, j in zip(table.columns, table.dtypes):
        if 'object' in str(j):
            dtypedict.update({i: types.NVARCHAR(length=500)})
        if 'datetime' in str(j):
            dtypedict.update({i: types.DateTime()})
        if 'float' in str(j):
            dtypedict.update({i: types.Float(precision=3, asdecimal=True)})
        if 'int' in str(j):
            if max(l for l in table[i].tolist() if not pd.isnull(l)) > cp.INT_LIMIT:
                dtypedict.update({i: types.BIGINT()})
            else:
                dtypedict.update({i: types.INT()})
    return dtypedict
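cp.INT_LIMIT comes from a config module that is not shown; it presumably guards against overflowing a signed 32-bit SQL INTEGER. A sketch of what such a constant would be (an assumption):

# Hypothetical `cp` config module: the largest value a signed 32-bit
# INTEGER column can hold; anything larger is promoted to BIGINT above.
INT_LIMIT = 2**31 - 1  # 2,147,483,647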
Example #7
def get_spark_type(field, required_type):
    if isinstance(required_type, type(db_types.DATE())):
        return spk_types.StructField(field, spk_types.DateType(), True)
    elif isinstance(required_type, type(db_types.DATETIME())):
        return spk_types.StructField(field, spk_types.TimestampType(), True)
    elif isinstance(required_type, type(db_types.VARCHAR())):
        return spk_types.StructField(field, spk_types.StringType(), True)
    elif isinstance(required_type, type(db_types.INT())):
        # The db types are enforced before the Spark ones, so the Spark type
        # must be the less restrictive of the two: LongType instead of IntegerType.
        return spk_types.StructField(field, spk_types.LongType(), True)
    elif isinstance(required_type, type(db_types.FLOAT())):
        return spk_types.StructField(field, spk_types.FloatType(), True)
    elif isinstance(required_type, type(db_types.BOOLEAN())):
        return spk_types.StructField(field, spk_types.BooleanType(), True)
    else:
        raise Exception(
            "Type not recognized, field={}, required_type={}".format(
                field, required_type))
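get_spark_type maps one column at a time; building a full Spark schema from an OUTPUT_TYPES dict like the ones in the earlier examples is then a one-liner. A minimal sketch, assuming the same db_types/spk_types aliases:

from sqlalchemy import types as db_types
from pyspark.sql import types as spk_types

OUTPUT_TYPES = {
    'session_id': db_types.VARCHAR(16),
    'count_events': db_types.INT(),
}

# One nullable StructField per declared column; get_spark_type picks the
# matching Spark type (StringType for VARCHAR, LongType for INT, ...).
schema = spk_types.StructType(
    [get_spark_type(field, t) for field, t in OUTPUT_TYPES.items()])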
Example #8
    def db_write(self, df, tb_name, dbtype='mssql', conp=None, rora='replace'):
        # Write in chunks when the frame is large.
        cs = 10000 if len(df) > 20000 else None
        con = self.getconstr(dbtype, conp)
        dtypedict = {}
        for i, j in zip(df.columns, df.dtypes):
            if "object" in str(j):
                try:
                    x = int(df[i].str.len().max() / 40) + 1
                except Exception:  # e.g. empty or all-null column (max() is NaN)
                    x = 50
                dtypedict.update({i: types.VARCHAR(length=x * 80)})
            if "datetime" in str(j):
                dtypedict.update({i: types.DateTime()})
            if "float" in str(j):
                dtypedict.update({i: types.Float(precision=3, asdecimal=True)})
            if "int" in str(j):
                dtypedict.update({i: types.INT()})
        if dbtype in ['postgresql', 'mssql']:
            df.to_sql(tb_name, con, if_exists=rora, index=False, schema=conp[4],
                      dtype=dtypedict, chunksize=cs)
        elif dbtype in ['oracle', 'mysql', 'sqlite']:
            df.to_sql(tb_name, con, if_exists=rora, index=False,
                      dtype=dtypedict, chunksize=cs)
Example #9
def cast_value(value, required_type, field_name):
    # TODO: make it less ugly.. or avoid using pandas to not require this.
    try:
        if isinstance(required_type, type(db_types.DATE())):
            if isinstance(value, str):
                return datetime.strptime(value, "%Y-%m-%d").date()  # assuming ISO format
            elif isinstance(value, pd.Timestamp):  # == datetime
                return value.to_pydatetime().date()
            elif isinstance(value, date):
                return value
            elif pd.isnull(value):
                return None
            else:
                return required_type.python_type(value)
        if isinstance(required_type, type(db_types.DATETIME())):
            if isinstance(value, str):
                return datetime.strptime(value, "%Y-%m-%d %H:%M:%S")  # assuming ISO format
            elif isinstance(value, pd.Timestamp):
                return value.to_pydatetime()
            elif pd.isnull(value):
                return None
            else:
                return required_type.python_type(value)
        elif isinstance(required_type, type(db_types.VARCHAR())):
            return None if pd.isnull(value) else str(value)
        elif isinstance(required_type, type(db_types.INT())):
            return None if pd.isnull(value) else int(float(value))
        elif isinstance(required_type, type(db_types.BIGINT())):
            return None if pd.isnull(value) else int(value)  # Python 3 ints are unbounded, covering BIGINT
        elif isinstance(required_type, type(db_types.FLOAT())):
            return None if pd.isnull(value) else float(value)
        else:
            return required_type.python_type(value)
    except Exception as e:
        logger.error(u"cast_value issue: {}, {}, {}, {}, {}.".format(
            field_name, value, type(value), required_type, str(e)))
        return None
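cast_value is applied element-wise when coercing a pandas frame to its declared types before the Spark conversion; a minimal sketch of that loop (the frame and the type dict here are illustrative):

import pandas as pd
from sqlalchemy import types as db_types

df = pd.DataFrame({'session_id': ['a1', None], 'count_events': ['3', '7']})
output_types = {'session_id': db_types.VARCHAR(16), 'count_events': db_types.INT()}

# Coerce every cell to the Python type its declared DB type expects;
# failures are logged inside cast_value and come back as None.
for field, required_type in output_types.items():
    df[field] = df[field].map(lambda v: cast_value(v, required_type, field))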
Example #10
    def dtype_to_sqldtype(self, df):
        """
        Creates a dict of SQL dtypes to pass to the DataFrame.to_sql() method.
        Args:
            df (DataFrame): Dataframe with data to be converted.

        Returns:
            sqldtype_dict (dict): Dict with SQL-dtype per column.
        """
        sqldtype_dict = {}
        for i, j in zip(df.columns, df.dtypes):
            if "object" in str(j):
                sqldtype_dict.update({i: sql_dtype.VARCHAR(12)})

            if "datetime" in str(j):
                sqldtype_dict.update({i: sql_dtype.DateTime()})

            if "float" in str(j):
                sqldtype_dict.update({i: sql_dtype.FLOAT(precision=12)})

            if "int" in str(j):
                sqldtype_dict.update({i: sql_dtype.INT()})

        return sqldtype_dict
Example #11
def osm_delineation(param):
    """
    Delineate waterway reaches from OSM for plan-limit points and sync the
    results to the GIS database.
    """
    osm.op_endpoint = param['osm']['op_endpoint']

    ########################################
    ### Load data

    # run_time_start = pd.Timestamp.today().strftime('%Y-%m-%d %H:%M:%S')
    # print(run_time_start)

    ## Read in source data
    print('--Reading in source data...')

    json_lst = get_json_from_api(param['plan_limits']['api_url'], param['plan_limits']['api_headers'])
    json_lst1 = json_filters(json_lst, only_operative=True, only_reach_points=True)
    gjson1, hydro_units, pts_alt, sg1 = geojson_convert(json_lst1)

    combined_zones1 = [j for j in json_lst if j['id'] == param['other']['combined_zones_id']][0]
    combined_zones2 = [s['id'] for s in combined_zones1['spatialUnit']]

    no_limit1 = [j for j in json_lst if j['id'] == param['other']['no_limit_id']][0]
    no_limit2 = [s['id'] for s in no_limit1['spatialUnit']][0]

    # pts = mssql.rd_sql(param['gis_waterdata']['server'], param['gis_waterdata']['database'], param['gis_waterdata']['pts']['table'], [param['gis_waterdata']['pts']['id']], where_in={param['gis_waterdata']['pts']['id']: pts_alt.id.unique().tolist()}, geo_col=True, username=param['gis_waterdata']['username'], password=param['gis_waterdata']['password'], rename_cols=[id_col])
    pts = mssql.rd_sql(param['gis_waterdata']['server'], param['gis_waterdata']['database'], param['gis_waterdata']['pts']['table'], [param['gis_waterdata']['pts']['id']], where_in={param['gis_waterdata']['pts']['id']: pts_alt.id.unique().tolist()}, geo_col=True, rename_cols=[id_col])

    ## Point checks
    excluded_points = pts_alt[~pts_alt.id.isin(pts.SpatialUnitId)].copy()
    if not excluded_points.empty:
        print('These points are in the Plan Limits db, but have no GIS data:')
        print(excluded_points)

    bad_geo = pts[pts.geom_type != 'Point']
    if not bad_geo.empty:
        print('These points do not have a "Point" geometry (likely "MultiPoint"):')
        print(bad_geo)
        pts = pts[~pts.SpatialUnitId.isin(bad_geo.SpatialUnitId)].copy()

    cwms1 = mssql.rd_sql(param['gis_prod']['server'], param['gis_prod']['database'], param['gis_prod']['cwms']['table'], param['gis_prod']['cwms']['col_names'], rename_cols=param['gis_prod']['cwms']['rename_cols'], geo_col=True, username=param['gis_prod']['username'], password=param['gis_prod']['password'])

    # zones3 = mssql.rd_sql(param['gis_waterdata']['server'], param['gis_waterdata']['database'], param['gis_waterdata']['allo_zones']['table'], [param['gis_waterdata']['allo_zones']['id']], where_in={param['gis_waterdata']['allo_zones']['id']: combined_zones2}, username=param['gis_waterdata']['username'], password=param['gis_waterdata']['password'], geo_col=True, rename_cols=[id_col])
    zones3 = mssql.rd_sql(param['gis_waterdata']['server'], param['gis_waterdata']['database'], param['gis_waterdata']['allo_zones']['table'], [param['gis_waterdata']['allo_zones']['id']], where_in={param['gis_waterdata']['allo_zones']['id']: combined_zones2}, geo_col=True, rename_cols=[id_col])

    pts['geometry'] = pts.geometry.simplify(1)

    #######################################
    ### Run query
    print('--Pull out the waterways from OSM')

    pts1, bad_points = osm.get_nearest_waterways(pts, id_col, param['other']['search_distance'], 'all')

    waterways, nodes = osm.get_waterways(pts1, 'all')

    print('--Delineating Reaches from OSM')

    site_delin = osm.waterway_delineation(waterways, True)
    osm_delin = osm.to_osm(site_delin, nodes)
    gdf1 = osm.to_gdf(osm_delin)

    gdf2 = gdf1.to_crs(pts.crs)

    gdf3 = gdf2.merge(pts1.rename(columns={'id': 'start_node'})[['start_node', id_col]], on='start_node')

    print('--Pulling out all of Canterbury...')

    cant2 = osm.get_waterways_within_boundary(cwms1, buffer=0, waterway_type='all')

    combined1, poly1 = vector.pts_poly_join(cant2, zones3, id_col, op='intersects')
    gdf3 = gdf3[~gdf3.way_id.isin(combined1.way_id.unique())].copy()

    all_others1 = cant2[~cant2.way_id.isin(combined1.way_id)]
    all_others2 = all_others1[~all_others1.way_id.isin(gdf3.way_id.unique().tolist())].copy()
    all_others2[id_col] = no_limit2

    print('--Combine all reach data')

    gdf4 = pd.concat([gdf3, combined1, all_others2]).reset_index(drop=True)

    gdf4.rename(columns={'way_id': 'OSMWaterwayId', 'waterway': 'OSMWaterwayType', 'name': 'RiverName', 'start_node': 'StartNode'}, inplace=True)
    gdf4['OSMWaterwayId'] = gdf4['OSMWaterwayId'].astype('int64')

    print('--Compare existing reaches in the database')

    cols = gdf4.columns.drop('geometry').tolist()
    cols.extend(['OBJECTID'])

    # old1 = mssql.rd_sql(param['gis_waterdata']['server'], param['gis_waterdata']['database'], param['gis_waterdata']['reaches']['table'], cols, username=param['gis_waterdata']['username'], password=param['gis_waterdata']['password'], geo_col=True)
    old1 = mssql.rd_sql(param['gis_waterdata']['server'], param['gis_waterdata']['database'], param['gis_waterdata']['reaches']['table'], cols, geo_col=True)

    comp_dict = util.compare_dfs(old1.drop('OBJECTID', axis=1), gdf4, on=['SpatialUnitId', 'OSMWaterwayId'])
    new1 = comp_dict['new'].copy()
    diff1 = comp_dict['diff'].copy()
    rem1 = comp_dict['remove'][['SpatialUnitId', 'OSMWaterwayId']].copy()

    print('--Save to database')

    sql_dtypes = {'StartNode': types.BIGINT(), 'OSMWaterwayId': types.BIGINT(), 'RiverName': types.NVARCHAR(200), 'OSMWaterwayType': types.NVARCHAR(30), 'SpatialUnitId': types.NVARCHAR(8), 'SHAPE_': types.VARCHAR(), 'OBJECTID': types.INT(), 'ModifiedDate': types.DATETIME()}

    if not new1.empty:
        max_id = old1['OBJECTID'].max() + 1

        new1['ModifiedDate'] = today_str
        new1['OBJECTID'] = list(range(max_id, max_id + len(new1)))
        new1.rename(columns={'geometry': 'SHAPE'}, inplace=True)

        # mssql.update_table_rows(new1, param['gis_waterdata']['server'], param['gis_waterdata']['database'], param['gis_waterdata']['reaches']['table'], on=['SpatialUnitId', 'OSMWaterwayId'], index=False, append=True, username=param['gis_waterdata']['username'], password=param['gis_waterdata']['password'], geo_col='SHAPE', clear_table=False, dtype=sql_dtypes)
        mssql.update_table_rows(new1, param['gis_waterdata']['server'], param['gis_waterdata']['database'], param['gis_waterdata']['reaches']['table'], on=['SpatialUnitId', 'OSMWaterwayId'], index=False, append=True, geo_col='SHAPE', clear_table=False, dtype=sql_dtypes)

    if not diff1.empty:
        diff2 = pd.merge(diff1, old1[['SpatialUnitId', 'OSMWaterwayId', 'OBJECTID']], on=['SpatialUnitId', 'OSMWaterwayId'])
        diff2['ModifiedDate'] = today_str
        diff2.rename(columns={'geometry': 'SHAPE'}, inplace=True)

        # mssql.update_table_rows(diff2, param['gis_waterdata']['server'], param['gis_waterdata']['database'], param['gis_waterdata']['reaches']['table'], on=['SpatialUnitId', 'OSMWaterwayId'], index=False, append=True, username=param['gis_waterdata']['username'], password=param['gis_waterdata']['password'], geo_col='SHAPE', clear_table=False, dtype=sql_dtypes)
        mssql.update_table_rows(diff2, param['gis_waterdata']['server'], param['gis_waterdata']['database'], param['gis_waterdata']['reaches']['table'], on=['SpatialUnitId', 'OSMWaterwayId'], index=False, append=True, geo_col='SHAPE', clear_table=False, dtype=sql_dtypes)

    if not rem1.empty:
        # mssql.del_table_rows(param['gis_waterdata']['server'], param['gis_waterdata']['database'], param['gis_waterdata']['reaches']['table'], pk_df=rem1, username=param['gis_waterdata']['username'], password=param['gis_waterdata']['password'])
        mssql.del_table_rows(param['gis_waterdata']['server'], param['gis_waterdata']['database'], param['gis_waterdata']['reaches']['table'], pk_df=rem1)

    return gdf4, excluded_points, bad_geo, bad_points