# read in cell-level data
# table contains site_id, grid_id, i, j, central_area, net_pv_capacity.
with open('cell_central_pv_capacity_original.csv') as csvfile:
    data = list(csv.DictReader(csvfile))
x, y, area = np.array(
    list((r["i"], r["j"], r["central_area"]) for r in data), dtype=float
).T
# data = csv_to_dict('cell_central_pv_capacity_original.csv')
# i = np.array(data["i"], dtype=float)
# j = np.array(data["j"], dtype=float)
# area = np.array(data["central_area"], dtype=float)

# cluster the cells into 150 projects (somewhat arbitrarily) instead of ~750,
# and use the cluster numbers as new site_id's.
km = KMeans(150, np.c_[x, y], size=0.0001*area)
km.init_centers()
km.find_centers()
# km.plot()
for i in range(len(x)):
    # km.cluster_id is an array of cluster id's, same length as x and y
    data[i]["cluster_id"] = km.cluster_id[i]

# insert the modified data into the database
# note: it is reportedly faster to construct a single insert query with all the
# values using python's string construction operators, since executemany runs
# numerous separate inserts. However, it's considered more secure to use the
# database library's template substitution, so we do that.
# each %(name)s placeholder below is filled from the matching key in the row dicts
executemany("""
    INSERT INTO cell_central_pv_capacity
        (cluster_id, site_id, grid_id, i, j, central_area, net_pv_capacity)
    VALUES (
        %(cluster_id)s, %(site_id)s, %(grid_id)s, %(i)s, %(j)s,
        %(central_area)s, %(net_pv_capacity)s
    );
""", data)
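
# Illustrative only: the executemany() helper used above is defined elsewhere in
# this codebase; it is assumed to be a thin wrapper around a DB-API
# cursor.executemany() call, roughly like the hypothetical sketch below
# (the real helper's name, connection handling and commit behavior may differ).
def _executemany_sketch(query, rows, connection):
    # `rows` is a sequence of dicts whose keys match the %(name)s placeholders
    # in `query`; the driver quotes each value, so no manual string building.
    cur = connection.cursor()
    cur.executemany(query, rows)
    connection.commit()
    cur.close()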
def distributed_pv():
    # TODO: break up the major sub-sections of the main loop into separate functions
    # TODO: merge the code that gets capacity factors for each configuration here
    # with the code that gets capacity factors for each cell for utility-scale PV.
    # TODO: write a general function that adds a block of projects and capacity
    # factors to the postgresql database (including reading back the project IDs),
    # and use that for distributed PV, utility-scale PV and wind projects

    # read roof areas from load_zone_nsrdb_cell.csv
    # treat any NaNs (missing data) as 0.0 coverage
    all_cells = pd.DataFrame.from_csv(
        db_path('GIS/General/load_zone_nsrdb_cell.csv'),
        index_col=('load_zone', 'nsrdb_id')).fillna(0.0)

    # make sure tables exist, and clear out existing DistPV data;
    # the loops below will add records back to these tables, one load zone at a time
    shared_tables.create_table("project")
    shared_tables.create_table("cap_factor")
    # drop and recreate indexes is faster than incremental sorting
    shared_tables.drop_indexes("cap_factor")
    execute("""
        DELETE FROM cap_factor c
            USING project p
            WHERE c.project_id=p.project_id AND p.technology='DistPV';
        DELETE FROM project WHERE technology='DistPV';
        DROP TABLE IF EXISTS dist_pv_details;
    """)

    # calculate hourly capacity factor for all dist pv configurations
    # for each cell in each load zone
    for load_zone in load_zones:
        lz_cells = all_cells.loc[load_zone, :]
        lz_cells = lz_cells[lz_cells.roof_area > 0.0]
        # create arrays to hold capacities and hourly capacity factors for all
        # cells in this load zone; cap_factors will end up with one row per cell
        # and configuration and one column per hour (after the reshape below)
        cap_factors = None
        for cell_n, cell in enumerate(lz_cells.itertuples()):
            cell_capacities, cell_cap_factors = get_dist_pv_cap_factors(
                cell.nsrdb_lat, cell.nsrdb_lon, cell.roof_area)
            # note: this is the first time when we know how many configurations
            # and timesteps there are, so this is when we create the cap_factors array
            if cap_factors is None:
                capacities = np.empty((len(lz_cells),) + cell_capacities.shape)
                cap_factors = np.empty((len(lz_cells),) + cell_cap_factors.shape)
                # fill them with nans, so we'll see if any aren't filled later
                capacities.fill(np.nan)
                cap_factors.fill(np.nan)
            capacities[cell_n, :] = cell_capacities
            cap_factors[cell_n, :, :] = cell_cap_factors

        # reshape into a long list of resources instead of a cell x config matrix
        capacities = capacities.reshape((-1,))
        cap_factors = cap_factors.reshape((-1, cap_factors.shape[2]))

        # cluster available resources into 20 tranches with similar timing and quality
        # (we assume the better-suited ones will be developed before the worse ones)
        # (This could be sped up by using a subsample of the timesteps if needed, but then
        # the cluster means would have to be calculated afterwards.)
        # an alternative approach would be to cluster resources based on annual average
        # capacity factor, but that neglects differences in timing between different
        # orientations.
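        # note: the KMeans class used here (and for the utility-scale cells above)
        # is assumed to be a custom, size-weighted k-means rather than sklearn's:
        # each row of X is one resource, `size` weights that row when centers are
        # updated, and after find_centers() km.mu holds the cluster-mean rows and
        # km.cluster_id the cluster assigned to each row.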
        km = KMeans(20, X=cap_factors, size=capacities)
        import time
        start = time.time()
        km.init_centers()   # 3 mins
        print("init_centers(): {} s".format(time.time() - start))
        start = time.time()
        km.find_centers()   # 9 mins
        print("find_centers(): {} s".format(time.time() - start))
        # now km.mu is a matrix of capacity factors, with one row per cluster
        # and one column per timestep,
        # and km.cluster_id shows which cluster each resource belongs to
        cluster_capacities = np.bincount(km.cluster_id, weights=capacities)
        cluster_cap_factors = km.mu.T

        # PROJECT TABLE
        # store project definitions
        project_df = pd.DataFrame.from_items([
            ('load_zone', load_zone),
            ('technology', 'DistPV'),
            ('site', [load_zone + '_DistPV_' + str(i)
                      for i in range(len(cluster_capacities))]),
            ('orientation', 'na'),
            ('max_capacity', cluster_capacities),
            ('connect_cost_per_mw', 0.0)
        ]).set_index(['load_zone', 'technology', 'site', 'orientation'])
        project_df.to_sql('project', db_engine, if_exists='append')

        # CAP_FACTOR TABLE
        # get timesteps for each year (based on lat and lon of last cell in the list)
        timesteps = [
            get_timesteps(nsrdb_file_dict[(cell.nsrdb_lat, cell.nsrdb_lon, year)])
            for year in years
        ]
        # make an index of all timesteps
        timestep_index = pd.concat(
            (pd.DataFrame(index=x) for x in timesteps)).index.sort_values()
        # make an index of all site_ids (must match the names used in project_df above)
        # TODO: change this code and the project_df code to zero-fill site numbers up to
        # 2 digits (enough to cover the number of tranches in each zone)
        site_ids = [
            load_zone + '_DistPV_' + str(i)
            for i in range(cluster_cap_factors.shape[1])
        ]
        # multiindex of load_zone, technology, site, orientation
        proj_index = pd.MultiIndex.from_product(
            [[load_zone], ['DistPV'], site_ids, ['na']])
        # make a single dataframe to hold all the data
        cap_factor_df = pd.DataFrame(
            cluster_cap_factors, index=timestep_index, columns=proj_index)
        cap_factor_df.columns.names = ['load_zone', 'technology', 'site', 'orientation']
        cap_factor_df.index.names = ['date_time']
        # convert to database orientation, with natural order for indexes,
        # but also keep as a DataFrame
        cap_factor_df = pd.DataFrame(
            {'cap_factor': cap_factor_df.stack(cap_factor_df.columns.names)})
        # sort table, then switch to using z, t, s, o as index (to match with project table)
        # (takes a few minutes)
        cap_factor_df = cap_factor_df.reorder_levels(
            ['load_zone', 'technology', 'site', 'orientation', 'date_time']
        ).sort_index().reset_index('date_time')
        # retrieve the project IDs (created automatically in the database earlier)
        # note: this read-back could potentially be done earlier, and then the
        # project ids could be included in cap_factor_df when it is first created,
        # but this provides cross-referencing by z, t, s, o automatically, which is helpful.
        project_ids = pd.read_sql(
            "SELECT project_id, load_zone, technology, site, orientation "
            + "FROM project WHERE technology = 'DistPV';",
            db_engine,
            index_col=['load_zone', 'technology', 'site', 'orientation'])
        cap_factor_df['project_id'] = project_ids['project_id']
        # convert date_time values into strings for insertion into postgresql.
        # Inserting a timezone-aware DatetimeIndex into postgresql fails; see
        # http://stackoverflow.com/questions/35435424/pandas-to-sql-gives-valueerror-with-timezone-aware-column/35552061
        # note: the string conversion is pretty slow
        cap_factor_df['date_time'] = pd.DatetimeIndex(
            cap_factor_df['date_time']).strftime("%Y-%m-%d %H:%M:%S%z")
        cap_factor_df.set_index(['project_id', 'date_time'], inplace=True)
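        # at this point cap_factor_df should have a (project_id, date_time) index
        # and a single 'cap_factor' column, i.e. one row per tranche per hour,
        # which is the layout inserted into the cap_factor table below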
        # Do we need error checking here?
        # If any projects aren't in cap_factor_df, they'll create single rows with
        # NaNs (and any prior existing cap_factors for them will get dropped below).
        # If any rows in cap_factor_df aren't matched to a project, they'll go in
        # with a null project_id.
        # The next line is very slow. But it only seems possible to speed it up by
        # copying the data to a csv and doing a bulk insert (see the sketch after
        # this function), which is more cumbersome.
        # progress can be monitored via this command in psql:
        # select query from pg_stat_activity where query like 'INSERT%';
        cap_factor_df.to_sql('cap_factor', db_engine, if_exists='append', chunksize=10000)

        # DIST_PV_DETAILS TABLE
        # store cluster details for later reference
        # would be interesting to see mean and stdev of lat, lon,
        # cap factor, azimuth, tilt for each cluster, so we can describe them.
        dist_pv_details = pd.Panel(
            {
                'capacity_mw': capacities.reshape((len(lz_cells), -1)),
                'site': (
                    load_zone + '_DistPV_'
                    + km.cluster_id.astype(str).astype(np.object)
                ).reshape((len(lz_cells), -1))
            },
            major_axis=[lz_cells[col] for col in ['nsrdb_lat', 'nsrdb_lon']],
            minor_axis=[dist_pv_configs[col] for col in dist_pv_configs.columns]
        ).to_frame().reset_index()
        dist_pv_details.insert(0, 'load_zone', load_zone)
        # store in postgresql database
        dist_pv_details.to_sql('dist_pv_details', db_engine, if_exists='append')

    # restore indexes, final cleanup
    shared_tables.create_indexes("cap_factor")
    execute("ALTER TABLE dist_pv_details OWNER TO admin;")
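
# Hypothetical sketch of the faster bulk-load alternative mentioned in
# distributed_pv() above: dump the frame to in-memory CSV text and load it with
# PostgreSQL's COPY. Not used here because to_sql() is simpler; the helper name,
# the psycopg2 connection argument and the column handling are all assumptions
# (assumes Python 3's io.StringIO and that `columns` lists the table columns in
# the same order as the frame's index + data columns).
def _copy_from_csv_sketch(df, table, columns, connection):
    import io
    buf = io.StringIO()
    # write index + data columns as CSV text, no header row
    df.to_csv(buf, header=False)
    buf.seek(0)
    cur = connection.cursor()
    cur.copy_expert(
        "COPY {} ({}) FROM STDIN WITH (FORMAT csv)".format(
            table, ", ".join(columns)),
        buf)
    connection.commit()
    cur.close()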