def distributed_pv():
    # for now, just reuse old data

    # store data in postgresql tables
    shared_tables.create_table("project")
    shared_tables.create_table("cap_factor")

    # remove old records (best before removing indexes)
    execute("""
        DELETE FROM cap_factor WHERE project_id IN
            (SELECT project_id FROM project WHERE technology = 'DistPV');
    """)
    execute("""
        DELETE FROM project WHERE technology = 'DistPV';
    """)

    # remove indexes
    shared_tables.drop_indexes("cap_factor")  # drop and recreate is faster than incremental sorting

    execute("""
        INSERT INTO project (load_zone, technology, site, orientation, max_capacity)
            SELECT load_zone, technology, 'DistPV' AS site, orientation, max_capacity
            FROM max_capacity_pre_2016_06_21
            WHERE technology = 'DistPV';
    """)
    execute("""
        INSERT INTO cap_factor (project_id, date_time, cap_factor)
            SELECT project_id, date_time, cap_factor
            FROM cap_factor_pre_2016_06_21 cf
                JOIN project USING (load_zone, technology, orientation)
            WHERE cf.technology = 'DistPV';
    """)

    # restore indexes
    shared_tables.create_indexes("cap_factor")
def generator_info():
    # note: for now, these must always be run in this sequence, because
    # new_generator_info() creates the generator_info and part_load_fuel_consumption
    # tables, and then existing_generator_info() appends to them (without clearing
    # out any prior records)
    shared_tables.create_table('project')
    shared_tables.create_table('generator_info')
    new_generator_info()
    existing_generator_info()
def tracking_pv():
    # make a list of all available NSRDB data files
    nsrdb_file_dict, years = get_nsrdb_file_dict()

    cluster_cell = pd.DataFrame.from_csv(
        db_path('GIS/Utility-Scale Solar Sites/solar_cluster_nsrdb_grid_renamed.csv'),
        index_col='gridclstid'
    )
    cluster_cell = cluster_cell[cluster_cell['solar_covg'] > 0]
    cell = cluster_cell.groupby('nsrdb_id')
    cluster = cluster_cell.groupby('cluster_id')
    cluster_total_solar_area = cluster['solar_area'].sum()
    cluster_ids = cluster.groups.keys()  # list of all cluster ids, for convenience
    cluster_id_digits = len(str(max(cluster_ids)))  # max number of digits for a cluster id
    # site_ids for each cluster_id (these distinguish PV sites from wind sites that may have the same number)
    site_ids = ['PV_' + str(cluster_id).zfill(cluster_id_digits) for cluster_id in cluster_ids]

    # calculate weighted average lat and lon for each cluster
    # (note: the axis=0 and axis=1 keep pandas from generating lots of nans due to
    # trying to match column name in addition to row index)
    cluster_coords = pd.concat([
        cluster_cell['cluster_id'],
        cluster_cell[['solar_lat', 'solar_lon']].multiply(cluster_cell['solar_area'], axis=0)
    ], axis=1).groupby('cluster_id').sum().div(cluster_total_solar_area, axis=0)
    cluster_coords.columns = ['latitude', 'longitude']

    # get list of technologies to be defined
    technologies = central_solar_techs.index.values

    # calculate capacity factors for all projects
    # This dict will hold vectors of capacity factors for each cluster for each year and technology.
    # This arrangement is simpler than using a DataFrame because we don't yet know the
    # indexes (timesteps) of the data for each year.
    cluster_cap_factors = dict()
    for tech in technologies:
        # go through all the needed nsrdb cells and add them to the capacity factor for the
        # relevant cluster and year
        for cell_id, grp in cell:
            # grp has one row for each cluster that uses data from this cell
            lat = round_coord(grp['nsrdb_lat'].iloc[0])
            lon = round_coord(grp['nsrdb_lon'].iloc[0])
            for year in years:
                cap_factors = get_cap_factors(
                    nsrdb_file_dict[lat, lon, year],
                    central_solar_techs.loc[tech, 'array_type']
                )
                # note: iterrows() would convert everything to a single (float) series, but itertuples doesn't
                for clst in grp.itertuples():
                    contrib = cap_factors * clst.solar_area / cluster_total_solar_area[clst.cluster_id]
                    key = (tech, clst.cluster_id, year)
                    if key in cluster_cap_factors:
                        cluster_cap_factors[key] += contrib
                    else:
                        cluster_cap_factors[key] = contrib

    # get timesteps for each year (based on lat and lon of first cell in the list)
    timesteps = dict()
    lat = round_coord(cluster_cell['nsrdb_lat'].iloc[0])
    lon = round_coord(cluster_cell['nsrdb_lon'].iloc[0])
    for year in years:
        timesteps[year] = get_timesteps(nsrdb_file_dict[(lat, lon, year)])

    # make an index of all timesteps
    timestep_index = pd.concat([pd.DataFrame(index=x) for x in timesteps.values()]).index.sort_values()

    # make a single dataframe to hold all the data
    cap_factor_df = pd.DataFrame(
        index=timestep_index,
        columns=pd.MultiIndex.from_product([technologies, site_ids]),
        dtype=float
    )
    # assign values to the dataframe
    for ((tech, cluster_id, year), cap_factors) in cluster_cap_factors.iteritems():
        cap_factor_df.update(pd.DataFrame(
            cap_factors,
            index=timesteps[year],
            columns=[(tech, 'PV_' + str(cluster_id).zfill(cluster_id_digits))]
        ))
    cap_factor_df.columns.names = ['technology', 'site']
    cap_factor_df.index.names = ['date_time']

    # add load_zone and orientation to the index
    cap_factor_df['load_zone'] = load_zone
    cap_factor_df['orientation'] = 'na'
    cap_factor_df.set_index(['load_zone', 'orientation'], append=True, inplace=True)

    # convert to database orientation, with natural order for indexes,
    # but also keep as a DataFrame
    cap_factor_df = pd.DataFrame(
        {'cap_factor': cap_factor_df.stack(cap_factor_df.columns.names)}
    )
    # sort table, then switch to using z, t, s, o as index (to match with project table)
    cap_factor_df = cap_factor_df.reorder_levels(
        ['load_zone', 'technology', 'site', 'orientation', 'date_time']
    ).sort_index().reset_index('date_time')

    # make a dataframe showing potential projects (same structure as "project" table)
    # note: for now we don't really handle multiple load zones and we don't worry about orientation
    # (may eventually have projects available with different azimuth and slope)
    # This concatenates a list of DataFrames, one for each technology
    project_df = pd.concat([
        pd.DataFrame(dict(
            load_zone=load_zone,
            technology=tech,
            site=site_ids,
            orientation='na',
            max_capacity=cluster_total_solar_area * central_solar_techs.loc[tech, 'mw_per_m2'],
            latitude=cluster_coords['latitude'],
            longitude=cluster_coords['longitude'],
        ))
        for tech in technologies
    ], axis=0).set_index(['load_zone', 'technology', 'site', 'orientation'])

    # store data in postgresql tables
    shared_tables.create_table("project")
    execute("DELETE FROM project WHERE technology IN %s;", [tuple(technologies)])
    project_df.to_sql('project', db_engine, if_exists='append')

    # retrieve the project IDs (created automatically in the database)
    project_ids = pd.read_sql(
        "SELECT project_id, load_zone, technology, site, orientation "
        + "FROM project WHERE technology IN %(techs)s;",
        db_engine,
        index_col=['load_zone', 'technology', 'site', 'orientation'],
        params={'techs': tuple(technologies)}
    )
    cap_factor_df['project_id'] = project_ids['project_id']

    # convert date_time values into strings for insertion into postgresql.
    # Inserting a timezone-aware DatetimeIndex into postgresql fails; see
    # http://stackoverflow.com/questions/35435424/pandas-to-sql-gives-valueerror-with-timezone-aware-column/35552061
    # note: the string conversion is pretty slow
    cap_factor_df['date_time'] = pd.DatetimeIndex(cap_factor_df['date_time']).strftime("%Y-%m-%d %H:%M:%S%z")
    cap_factor_df.set_index(['project_id', 'date_time'], inplace=True)

    # Do we need error checking here? If any projects aren't in cap_factor_df, they'll
    # create single rows with NaNs (and any prior existing cap_factors for them will
    # get dropped below).
    # If any rows in cap_factor_df aren't matched to a project, they'll go in with
    # a null project_id.
    shared_tables.create_table("cap_factor")  # only created if it doesn't exist
    shared_tables.drop_indexes("cap_factor")  # drop and recreate is faster than incremental sorting
    execute("DELETE FROM cap_factor WHERE project_id IN %s;", [tuple(project_ids['project_id'])])
    cap_factor_df.to_sql('cap_factor', db_engine, if_exists='append', chunksize=10000)
    shared_tables.create_indexes("cap_factor")
def distributed_pv():
    # TODO: break up the major sub-sections of the main loop into separate functions
    # TODO: merge the code that gets capacity factors for each configuration here
    # with the code that gets capacity factors for each cell for utility-scale PV.
    # TODO: write a general function that adds a block of projects and capacity
    # factors to the postgresql database (including reading back the project IDs),
    # and use that for distributed PV, utility-scale PV and wind projects

    # read roof areas from load_zone_grid_cell.csv
    # treat any NaNs (missing data) as 0.0 coverage
    all_cells = pd.DataFrame.from_csv(
        db_path('GIS/General/load_zone_nsrdb_cell.csv'),
        index_col=('load_zone', 'nsrdb_id')
    ).fillna(0.0)

    # make sure tables exist, and clear out existing DistPV data;
    # the loops below will add records back to this table, one load zone at a time
    shared_tables.create_table("project")
    shared_tables.create_table("cap_factor")
    shared_tables.drop_indexes("cap_factor")  # drop and recreate is faster than incremental sorting
    execute("""
        DELETE FROM cap_factor c USING project p
            WHERE c.project_id=p.project_id AND p.technology='DistPV';
        DELETE FROM project WHERE technology='DistPV';
        DROP TABLE IF EXISTS dist_pv_details;
    """)

    # calculate hourly capacity factor for all dist pv configurations
    # for each cell in each load zone
    for lz in load_zones:
        lz_cells = all_cells.loc[lz, :]
        lz_cells = lz_cells[lz_cells.roof_area > 0.0]
        # create an array to hold hourly capacity factors for all cells in this load zone;
        # it will end up with one row for each cell and one column for each hour
        cap_factors = None
        for cell_n, cell in enumerate(lz_cells.itertuples()):
            cell_capacities, cell_cap_factors = get_dist_pv_cap_factors(
                cell.nsrdb_lat, cell.nsrdb_lon, cell.roof_area)
            # note: this is the first time when we know how many configurations
            # and timesteps there are, so this is when we create the cap_factors array
            if cap_factors is None:
                capacities = np.empty((len(lz_cells),) + cell_capacities.shape)
                cap_factors = np.empty((len(lz_cells),) + cell_cap_factors.shape)
                # fill them with nans, so we'll see if any aren't filled later
                capacities.fill(np.nan)
                cap_factors.fill(np.nan)
            capacities[cell_n, :] = cell_capacities
            cap_factors[cell_n, :, :] = cell_cap_factors

        # reshape into a long list of resources instead of a cell x config matrix
        capacities = capacities.reshape((-1,))
        cap_factors = cap_factors.reshape((-1, cap_factors.shape[2]))

        # cluster available resources into 20 tranches with similar timing and quality
        # (we assume the better-suited ones will be developed before the worse ones)
        # (This could be sped up by using a subsample of the timesteps if needed, but then
        # the cluster means would have to be calculated afterwards.)
        # An alternative approach would be to cluster resources based on annual average
        # capacity factor, but that neglects differences in timing between different
        # orientations.
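        # (For reference only, a rough sketch of that simpler annual-average alternative,
        # assuming pandas is available as pd; it is not used because it ignores timing:
        #     annual_cf = cap_factors.mean(axis=1)
        #     tranche = pd.qcut(annual_cf, 20, labels=False)
        # )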
        km = KMeans(20, X=cap_factors, size=capacities)
        import time
        start = time.time()
        km.init_centers()  # 3 mins
        print("init_centers(): {} s".format(time.time() - start))
        start = time.time()
        km.find_centers()  # 9 mins
        print("find_centers(): {} s".format(time.time() - start))
        # now km.mu is a matrix of capacity factors, with one row per cluster
        # and one column per timestep,
        # and km.cluster_id shows which cluster each resource belongs to
        cluster_capacities = np.bincount(km.cluster_id, weights=capacities)
        cluster_cap_factors = km.mu.T

        # PROJECT TABLE
        # store project definitions and capacity factors
        project_df = pd.DataFrame.from_items([
            ('load_zone', load_zone),
            ('technology', 'DistPV'),
            ('site', ['Oahu_DistPV_' + str(i) for i in range(len(cluster_capacities))]),
            ('orientation', 'na'),
            ('max_capacity', cluster_capacities),
            ('connect_cost_per_mw', 0.0)
        ]).set_index(['load_zone', 'technology', 'site', 'orientation'])
        project_df.to_sql('project', db_engine, if_exists='append')

        # CAP_FACTOR TABLE
        # get timesteps for each year (based on lat and lon of last cell in the list)
        timesteps = [
            get_timesteps(nsrdb_file_dict[(cell.nsrdb_lat, cell.nsrdb_lon, year)])
            for year in years
        ]
        # make an index of all timesteps
        timestep_index = pd.concat(
            (pd.DataFrame(index=x) for x in timesteps)
        ).index.sort_values()

        # make an index of all site_ids
        # TODO: change this code and project_df code to zero-fill site numbers up to 2 digits
        # (enough to cover the number of tranches in each zone)
        site_ids = [
            load_zone + '_DistPV_' + str(i)
            for i in range(cluster_cap_factors.shape[1])
        ]
        # multiindex of load_zone, technology, site, orientation
        proj_index = pd.MultiIndex.from_product([[load_zone], ['DistPV'], site_ids, ['na']])

        # make a single dataframe to hold all the data
        cap_factor_df = pd.DataFrame(
            cluster_cap_factors,
            index=timestep_index,
            columns=proj_index,
        )
        cap_factor_df.columns.names = ['load_zone', 'technology', 'site', 'orientation']
        cap_factor_df.index.names = ['date_time']

        # convert to database orientation, with natural order for indexes,
        # but also keep as a DataFrame
        cap_factor_df = pd.DataFrame(
            {'cap_factor': cap_factor_df.stack(cap_factor_df.columns.names)}
        )
        # sort table, then switch to using z, t, s, o as index (to match with project table)
        # (takes a few minutes)
        cap_factor_df = cap_factor_df.reorder_levels(
            ['load_zone', 'technology', 'site', 'orientation', 'date_time']
        ).sort_index().reset_index('date_time')

        # retrieve the project IDs (created automatically in the database earlier)
        # note: this read-back could potentially be done earlier, and then the
        # project ids could be included in cap_factor_df when it is first created.
        # but this provides cross-referencing by z, t, s, o automatically, which is helpful.
        project_ids = pd.read_sql(
            "SELECT project_id, load_zone, technology, site, orientation "
            + "FROM project WHERE technology = 'DistPV';",
            db_engine,
            index_col=['load_zone', 'technology', 'site', 'orientation']
        )
        cap_factor_df['project_id'] = project_ids['project_id']

        # convert date_time values into strings for insertion into postgresql.
        # Inserting a timezone-aware DatetimeIndex into postgresql fails; see
        # http://stackoverflow.com/questions/35435424/pandas-to-sql-gives-valueerror-with-timezone-aware-column/35552061
        # note: the string conversion is pretty slow
        cap_factor_df['date_time'] = pd.DatetimeIndex(
            cap_factor_df['date_time']
        ).strftime("%Y-%m-%d %H:%M:%S%z")
        cap_factor_df.set_index(['project_id', 'date_time'], inplace=True)

        # Do we need error checking here?
        # If any projects aren't in cap_factor_df, they'll
        # create single rows with NaNs (and any prior existing cap_factors for them will
        # get dropped below).
        # If any rows in cap_factor_df aren't matched to a project, they'll go in with
        # a null project_id.

        # The next line is very slow. But it only seems possible to speed it up by
        # copying the data to a csv and doing a bulk insert, which is more cumbersome.
        # progress can be monitored via this command in psql:
        # select query from pg_stat_activity where query like 'INSERT%';
        cap_factor_df.to_sql('cap_factor', db_engine, if_exists='append', chunksize=10000)

        # DIST_PV_DETAILS TABLE
        # store cluster details for later reference
        # would be interesting to see mean and stdev of lat, lon,
        # cap factor, azimuth, tilt for each cluster, so we can describe them.
        dist_pv_details = pd.Panel(
            {
                'capacity_mw': capacities.reshape((len(lz_cells), -1)),
                'site': (
                    'Oahu_DistPV_' + km.cluster_id.astype(str).astype(np.object)
                ).reshape((len(lz_cells), -1))
            },
            major_axis=[lz_cells[col] for col in ['nsrdb_lat', 'nsrdb_lon']],
            minor_axis=[dist_pv_configs[col] for col in dist_pv_configs.columns]
        ).to_frame().reset_index()
        dist_pv_details.insert(0, 'load_zone', load_zone)
        # store in postgresql database
        dist_pv_details.to_sql('dist_pv_details', db_engine, if_exists='append')

    # restore indexes, final cleanup
    shared_tables.create_indexes("cap_factor")
    execute("ALTER TABLE dist_pv_details OWNER TO admin;")
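
# The comment above notes that cap_factor_df.to_sql() is very slow and that the only
# apparent speed-up is copying the data to a csv and doing a bulk insert. The helper
# below is a minimal sketch of that approach (not part of the original workflow): it
# assumes db_engine is a SQLAlchemy engine backed by psycopg2, and streams a CSV dump
# of the frame into the cap_factor table with PostgreSQL's COPY command.
import tempfile

def bulk_insert_cap_factor(df):
    """Hypothetical COPY-based alternative to df.to_sql('cap_factor', ...)."""
    # df is expected to be indexed by (project_id, date_time) with a cap_factor column,
    # matching cap_factor_df at the point where to_sql() is called above.
    with tempfile.NamedTemporaryFile(mode='w+', suffix='.csv') as f:
        df.to_csv(f, header=False)
        f.flush()
        f.seek(0)
        conn = db_engine.raw_connection()
        try:
            cur = conn.cursor()
            cur.copy_expert(
                "COPY cap_factor (project_id, date_time, cap_factor) "
                "FROM STDIN WITH (FORMAT csv)",
                f
            )
            conn.commit()
            cur.close()
        finally:
            conn.close()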
def tracking_pv():
    # note: this uses global nsrdb_file_dict and years variables
    cluster_cell = pd.DataFrame.from_csv(
        db_path('GIS/Utility-Scale Solar Sites/solar_cluster_nsrdb_grid_final.csv'),
        index_col='gridclstid'
    )
    cluster_cell = cluster_cell[cluster_cell['solar_covg'] > 0]
    cell = cluster_cell.groupby('nsrdb_id')
    cluster = cluster_cell.groupby('cluster_id')
    cluster_total_solar_area = cluster['solar_area'].sum()
    cluster_ids = cluster.groups.keys()  # list of all cluster ids, for convenience
    cluster_id_digits = len(str(max(cluster_ids)))  # max number of digits for a cluster id
    # site_ids for each cluster_id (these distinguish PV sites from wind sites that may have the same number)
    site_ids = ['PV_' + str(cluster_id).zfill(cluster_id_digits) for cluster_id in cluster_ids]

    # calculate weighted average lat and lon for each cluster
    # (note: the axis=0 and axis=1 keep pandas from generating lots of nans due to
    # trying to match column name in addition to row index)
    cluster_coords = pd.concat([
        cluster_cell['cluster_id'],
        cluster_cell[['solar_lat', 'solar_lon']].multiply(cluster_cell['solar_area'], axis=0)
    ], axis=1).groupby('cluster_id').sum().div(cluster_total_solar_area, axis=0)
    cluster_coords.columns = ['latitude', 'longitude']

    # get list of technologies to be defined
    technologies = central_solar_techs.index.values

    # calculate capacity factors for all projects
    # This dict will hold vectors of capacity factors for each cluster for each year and technology.
    # This arrangement is simpler than using a DataFrame because we don't yet know the
    # indexes (timesteps) of the data for each year.
    cluster_cap_factors = dict()
    for tech in technologies:
        # go through all the needed nsrdb cells and add them to the capacity factor for the
        # relevant cluster and year
        for cell_id, grp in cell:
            # grp has one row for each cluster that uses data from this cell
            lat = round_coord(grp['nsrdb_lat'].iloc[0])
            lon = round_coord(grp['nsrdb_lon'].iloc[0])
            array_type = central_solar_techs.loc[tech, 'array_type']
            gcr = central_solar_techs.loc[tech, 'gcr']
            azimuth = 180  # south-facing
            if array_type == 0:  # fixed-slope
                tilt = lat
            else:
                tilt = 0  # could eventually be southern component of ground slope for trackers
            for year in years:
                cap_factors = get_cap_factors(
                    nsrdb_file_dict[lat, lon, year],
                    array_type, azimuth=azimuth, tilt=tilt, gcr=gcr
                )
                # note: iterrows() would convert everything to a single (float) series, but itertuples doesn't
                for clst in grp.itertuples():
                    contrib = cap_factors * clst.solar_area / cluster_total_solar_area[clst.cluster_id]
                    key = (tech, clst.cluster_id, year)
                    if key in cluster_cap_factors:
                        cluster_cap_factors[key] += contrib
                    else:
                        cluster_cap_factors[key] = contrib

    # get timesteps for each year (based on lat and lon of first cell in the list)
    timesteps = dict()
    lat = round_coord(cluster_cell['nsrdb_lat'].iloc[0])
    lon = round_coord(cluster_cell['nsrdb_lon'].iloc[0])
    for year in years:
        timesteps[year] = get_timesteps(nsrdb_file_dict[(lat, lon, year)])

    # make an index of all timesteps
    timestep_index = pd.concat([pd.DataFrame(index=x) for x in timesteps.values()]).index.sort_values()

    # make a single dataframe to hold all the data
    cap_factor_df = pd.DataFrame(
        index=timestep_index,
        columns=pd.MultiIndex.from_product([technologies, site_ids]),
        dtype=float
    )
    # assign values to the dataframe
    for ((tech, cluster_id, year), cap_factors) in cluster_cap_factors.iteritems():
        cap_factor_df.update(pd.DataFrame(
            cap_factors,
            index=timesteps[year],
            columns=[(tech, 'PV_' + str(cluster_id).zfill(cluster_id_digits))]
        ))
    cap_factor_df.columns.names = ['technology', 'site']
    cap_factor_df.index.names = ['date_time']

    # add load_zone and orientation to the index
    cap_factor_df['load_zone'] = load_zone
    cap_factor_df['orientation'] = 'na'
    cap_factor_df.set_index(['load_zone', 'orientation'], append=True, inplace=True)

    # convert to database orientation, with natural order for indexes,
    # but also keep as a DataFrame
    cap_factor_df = pd.DataFrame(
        {'cap_factor': cap_factor_df.stack(cap_factor_df.columns.names)}
    )
    # sort table, then switch to using z, t, s, o as index (to match with project table)
    cap_factor_df = cap_factor_df.reorder_levels(
        ['load_zone', 'technology', 'site', 'orientation', 'date_time']
    ).sort_index().reset_index('date_time')

    # make a dataframe showing potential projects (same structure as "project" table)
    # note: for now we don't really handle multiple load zones and we don't worry about orientation
    # (may eventually have projects available with different azimuth and slope)
    # This concatenates a list of DataFrames, one for each technology
    project_df = pd.concat([
        pd.DataFrame(dict(
            load_zone=load_zone,
            technology=tech,
            site=site_ids,
            orientation='na',
            max_capacity=cluster_total_solar_area * central_solar_techs.loc[tech, 'mw_per_m2'],
            latitude=cluster_coords['latitude'],
            longitude=cluster_coords['longitude'],
        ))
        for tech in technologies
    ], axis=0).set_index(['load_zone', 'technology', 'site', 'orientation'])

    # store data in postgresql tables
    shared_tables.create_table("project")
    execute("DELETE FROM project WHERE technology IN %s;", [tuple(technologies)])
    project_df.to_sql('project', db_engine, if_exists='append')

    # retrieve the project IDs (created automatically in the database)
    project_ids = pd.read_sql(
        "SELECT project_id, load_zone, technology, site, orientation "
        + "FROM project WHERE technology IN %(techs)s;",
        db_engine,
        index_col=['load_zone', 'technology', 'site', 'orientation'],
        params={'techs': tuple(technologies)}
    )
    cap_factor_df['project_id'] = project_ids['project_id']

    # convert date_time values into strings for insertion into postgresql.
    # Inserting a timezone-aware DatetimeIndex into postgresql fails; see
    # http://stackoverflow.com/questions/35435424/pandas-to-sql-gives-valueerror-with-timezone-aware-column/35552061
    # note: the string conversion is pretty slow
    cap_factor_df['date_time'] = pd.DatetimeIndex(
        cap_factor_df['date_time']
    ).strftime("%Y-%m-%d %H:%M:%S%z")
    cap_factor_df.set_index(['project_id', 'date_time'], inplace=True)

    # Do we need error checking here? If any projects aren't in cap_factor_df, they'll
    # create single rows with NaNs (and any prior existing cap_factors for them will
    # get dropped below).
    # If any rows in cap_factor_df aren't matched to a project, they'll go in with
    # a null project_id.
    shared_tables.create_table("cap_factor")  # only created if it doesn't exist
    shared_tables.drop_indexes("cap_factor")  # drop and recreate is faster than incremental sorting
    execute("DELETE FROM cap_factor WHERE project_id IN %s;", [tuple(project_ids['project_id'])])
    cap_factor_df.to_sql('cap_factor', db_engine, if_exists='append', chunksize=10000)
    shared_tables.create_indexes("cap_factor")