def organize_independent_networks(connections):
    rconn = nhd_network.reverse_network(connections)
    independent_networks = nhd_network.reachable_network(rconn)
    reaches_bytw = {}
    for tw, net in independent_networks.items():
        path_func = partial(nhd_network.split_at_junction, net)
        reaches_bytw[tw] = nhd_network.dfs_decomposition(net, path_func)

    return independent_networks, reaches_bytw, rconn
def organize_independent_networks(connections, wbodies=None):
    rconn = nhd_network.reverse_network(connections)
    independent_networks = nhd_network.reachable_network(rconn)
    reaches_bytw = {}
    for tw, net in independent_networks.items():
        if wbodies:
            path_func = partial(
                nhd_network.split_at_waterbodies_and_junctions, set(wbodies), net
            )
        else:
            path_func = partial(nhd_network.split_at_junction, net)

        reaches_bytw[tw] = nhd_network.dfs_decomposition(net, path_func)

    return independent_networks, reaches_bytw, rconn
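# Illustrative sketch (not part of the routing code path): a minimal example of
# driving organize_independent_networks with a toy connections dictionary.
# The segment IDs below are made up for illustration, and the exact reach
# ordering returned by dfs_decomposition may differ from the comments.
def _example_organize_independent_networks():
    # Each key is a segment; each value lists its downstream segment(s).
    # Segment 4 drains to nothing, so it is the tailwater of this network.
    toy_connections = {1: [3], 2: [3], 3: [4], 4: []}

    independent_networks, reaches_bytw, rconn = organize_independent_networks(
        toy_connections
    )

    # independent_networks is keyed by tailwater (here, 4); reaches_bytw[4]
    # holds that network's reaches, split where segments 1 and 2 join upstream
    # of segment 3; rconn is the reversed (downstream-to-upstream) connection
    # dictionary.
    return independent_networks, reaches_bytw, rconn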
def main():
    args = _handle_args()

    next_gen_input_folder = test_folder.joinpath("input", "next_gen")
    if args.input:
        next_gen_input_folder = pathlib.Path(args.input)

    # The following two values are currently hard-coded for this test domain
    nts = 720  # number of routing timesteps
    dt_mc = 300.0  # time interval (s) for MC routing

    # Currently tested on the Sugar Creek domain
    ngen_network_df = nhd_io.read_geopandas(args.supernetwork)
    if args.subset:
        ngen_network_df = ngen_network_df[
            ngen_network_df['realized_catchment'].isin(args.subset)
        ]

    # Create dictionary mapping each connection ID
    ngen_network_dict = dict(zip(ngen_network_df.id, ngen_network_df.toid))
    # ngen_network_dict = dict(zip(ngen_network_df.ID, ngen_network_df.toID))

    def node_key_func(x):
        return int(x[3:])

    # Extract the ID integer values
    waterbody_connections = {
        node_key_func(k): node_key_func(v) for k, v in ngen_network_dict.items()
    }

    # Convert dictionary connections to data frame and make ID column the index
    waterbody_df = pd.DataFrame.from_dict(
        waterbody_connections, orient='index', columns=['to']
    )

    # Sort ID index column
    waterbody_df = waterbody_df.sort_index()

    waterbody_df = nhd_io.replace_downstreams(waterbody_df, "to", 0)

    connections = nhd_network.extract_connections(waterbody_df, "to")

    # Read and convert catchment lateral flows to a format that can be
    # processed by compute_network
    qlats = next_gen_io.read_catchment_lateral_flows(next_gen_input_folder)
    print(qlats)

    rconn = nhd_network.reverse_network(connections)
    subnets = nhd_network.reachable_network(rconn, check_disjoint=False)

    # Read the routelink file
    nhd_routelink = nhd_io.read_netcdf("data/RouteLink_NHDPLUS.nc")
    nhd_routelink['dt'] = 300.0
    nhd_routelink.set_index("link", inplace=True)

    routelink_cols = {
        "downstream": "to",
        "dx": "Length",
        "n": "n",
        "ncc": "nCC",
        "s0": "So",
        "bw": "BtmWdth",
        "tw": "TopWdth",
        "twcc": "TopWdthCC",
        "waterbody": "NHDWaterbodyComID",
        "musk": "MusK",
        "musx": "MusX",
        "cs": "ChSlp",
    }
    routelink_cols = dict([(value, key) for key, value in routelink_cols.items()])

    nhd_routelink.rename(columns=routelink_cols, inplace=True)

    with open(next_gen_input_folder / 'coarse/crosswalk.json') as f:
        crosswalk_data = json.load(f)
    waterbody_df['comid'] = waterbody_df.apply(
        lambda x: crosswalk_data['cat-' + str(x.name)]['outlet_COMID'], axis=1
    )

    waterbody_df = waterbody_df.join(nhd_routelink, on='comid', how='left')

    del nhd_routelink

    # Initial conditions, assumed to be zero
    # TODO: Allow optional reading of initial conditions from WRF
    q0 = pd.DataFrame(
        0, index=waterbody_df.index, columns=["qu0", "qd0", "h0"], dtype="float32"
    )

    # Set types as float32
    waterbody_df = waterbody_df.astype(
        {
            "dt": "float32",
            "bw": "float32",
            "tw": "float32",
            "twcc": "float32",
            "dx": "float32",
            "n": "float32",
            "ncc": "float32",
            "cs": "float32",
            "s0": "float32",
        }
    )

    subreaches = {}
    for tw, net in subnets.items():
        path_func = partial(nhd_network.split_at_junction, net)
        subreaches[tw] = nhd_network.dfs_decomposition(net, path_func)

    results = []
    for twi, (tw, reach) in enumerate(subreaches.items(), 1):
        r = list(chain.from_iterable(reach))
        data_sub = waterbody_df.loc[
            r, ['dt', 'bw', 'tw', 'twcc', 'dx', 'n', 'ncc', 'cs', 's0']
        ].sort_index()
        # data_sub = waterbody_df.loc[r, ['dt', 'bw', 'tw', 'twcc', 'dx', 'n', 'ncc', 'cs', 's0']]
        qlat_sub = qlats.loc[r].sort_index()
        q0_sub = q0.loc[r].sort_index()
        results.append(
            mc_reach.compute_network(
                nts,
                reach,
                subnets[tw],
                data_sub.index.values,
                data_sub.columns.values,
                data_sub.values,
                qlat_sub.values,
                q0_sub.values,
            )
        )

    fdv_columns = pd.MultiIndex.from_product(
        [range(nts), ['q', 'v', 'd']]
    ).to_flat_index()

    flowveldepth = pd.concat(
        [pd.DataFrame(d, index=i, columns=fdv_columns) for i, d in results],
        copy=False,
    )
    flowveldepth = flowveldepth.sort_index()

    outfile_base_name = (args.supernetwork).split(".")[0]
    flowveldepth.to_csv(f"{outfile_base_name}_mc_results.csv")
    print(flowveldepth)
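# Illustrative sketch: the flowveldepth table produced above uses a flattened
# MultiIndex, so each column label is a (timestep, variable) tuple such as
# (0, 'q'), (0, 'v'), (0, 'd'), (1, 'q'), and so on. The helper below shows one
# way to pull a single variable's time series for one segment; segment_id and
# the default variable name are placeholders, not values from any run.
def _example_extract_timeseries(flowveldepth, segment_id, variable="q"):
    # Select the columns whose second tuple element matches the requested
    # variable, then return that segment's row as a timestep-indexed Series.
    cols = [c for c in flowveldepth.columns if c[1] == variable]
    series = flowveldepth.loc[segment_id, cols]
    series.index = [c[0] for c in cols]  # timestep index
    return series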
def compute_nhd_routing_v02( connections, rconn, wbody_conn, reaches_bytw, compute_func_name, parallel_compute_method, subnetwork_target_size, cpu_pool, dt, nts, qts_subdivisions, independent_networks, param_df, q0, qlats, usgs_df, lastobs_df, da_parameter_dict, assume_short_ts, return_courant, waterbodies_df, waterbody_parameters, waterbody_types_df, waterbody_type_specified, diffusive_parameters=None, ): da_decay_coefficient = da_parameter_dict.get("da_decay_coefficient", 0) param_df["dt"] = dt param_df = param_df.astype("float32") start_time = time.time() compute_func = _compute_func_map[compute_func_name] if parallel_compute_method == "by-subnetwork-jit-clustered": networks_with_subnetworks_ordered_jit = nhd_network.build_subnetworks( connections, rconn, subnetwork_target_size ) subnetworks_only_ordered_jit = defaultdict(dict) subnetworks = defaultdict(dict) for tw, ordered_network in networks_with_subnetworks_ordered_jit.items(): intw = independent_networks[tw] for order, subnet_sets in ordered_network.items(): subnetworks_only_ordered_jit[order].update(subnet_sets) for subn_tw, subnetwork in subnet_sets.items(): subnetworks[subn_tw] = {k: intw[k] for k in subnetwork} reaches_ordered_bysubntw = defaultdict(dict) for order, ordered_subn_dict in subnetworks_only_ordered_jit.items(): for subn_tw, subnet in ordered_subn_dict.items(): conn_subn = {k: connections[k] for k in subnet if k in connections} rconn_subn = {k: rconn[k] for k in subnet if k in rconn} if waterbodies_df.empty: path_func = partial(nhd_network.split_at_junction, rconn_subn) else: path_func = partial( nhd_network.split_at_waterbodies_and_junctions, set(waterbodies_df.index.values), rconn_subn ) reaches_ordered_bysubntw[order][ subn_tw ] = nhd_network.dfs_decomposition(rconn_subn, path_func) cluster_threshold = 0.65 # When a job has a total segment count 65% of the target size, compute it # Otherwise, keep adding reaches. reaches_ordered_bysubntw_clustered = defaultdict(dict) for order in subnetworks_only_ordered_jit: cluster = 0 reaches_ordered_bysubntw_clustered[order][cluster] = { "segs": [], "upstreams": {}, "tw": [], "subn_reach_list": [], } for twi, (subn_tw, subn_reach_list) in enumerate( reaches_ordered_bysubntw[order].items(), 1 ): segs = list(chain.from_iterable(subn_reach_list)) reaches_ordered_bysubntw_clustered[order][cluster]["segs"].extend(segs) reaches_ordered_bysubntw_clustered[order][cluster]["upstreams"].update( subnetworks[subn_tw] ) reaches_ordered_bysubntw_clustered[order][cluster]["tw"].append(subn_tw) reaches_ordered_bysubntw_clustered[order][cluster][ "subn_reach_list" ].extend(subn_reach_list) if ( len(reaches_ordered_bysubntw_clustered[order][cluster]["segs"]) >= cluster_threshold * subnetwork_target_size ) and ( twi < len(reaches_ordered_bysubntw[order]) # i.e., we haven't reached the end # TODO: perhaps this should be a while condition... ): cluster += 1 reaches_ordered_bysubntw_clustered[order][cluster] = { "segs": [], "upstreams": {}, "tw": [], "subn_reach_list": [], } if 1 == 1: print("JIT Preprocessing time %s seconds." 
% (time.time() - start_time)) print("starting Parallel JIT calculation") start_para_time = time.time() # if 1 == 1: with Parallel(n_jobs=cpu_pool, backend="threading") as parallel: results_subn = defaultdict(list) flowveldepth_interorder = {} for order in range(max(subnetworks_only_ordered_jit.keys()), -1, -1): jobs = [] for cluster, clustered_subns in reaches_ordered_bysubntw_clustered[ order ].items(): segs = clustered_subns["segs"] offnetwork_upstreams = set() segs_set = set(segs) for seg in segs: for us in rconn[seg]: if us not in segs_set: offnetwork_upstreams.add(us) segs.extend(offnetwork_upstreams) common_segs = list(param_df.index.intersection(segs)) wbodies_segs = set(segs).symmetric_difference(common_segs) #Declare empty dataframe waterbody_types_df_sub = pd.DataFrame() if not waterbodies_df.empty: lake_segs = list(waterbodies_df.index.intersection(segs)) waterbodies_df_sub = waterbodies_df.loc[ lake_segs, [ "LkArea", "LkMxE", "OrificeA", "OrificeC", "OrificeE", "WeirC", "WeirE", "WeirL", "ifd", "qd0", "h0", ], ] #If reservoir types other than Level Pool are active if not waterbody_types_df.empty: waterbody_types_df_sub = waterbody_types_df.loc[ lake_segs, [ "reservoir_type", ], ] else: lake_segs = [] waterbodies_df_sub = pd.DataFrame() param_df_sub = param_df.loc[ common_segs, ["dt", "bw", "tw", "twcc", "dx", "n", "ncc", "cs", "s0", "alt"], ].sort_index() param_df_sub_super = param_df_sub.reindex( param_df_sub.index.tolist() + lake_segs ).sort_index() if order < max(subnetworks_only_ordered_jit.keys()): for us_subn_tw in offnetwork_upstreams: subn_tw_sortposition = param_df_sub_super.index.get_loc( us_subn_tw ) flowveldepth_interorder[us_subn_tw][ "position_index" ] = subn_tw_sortposition subn_reach_list = clustered_subns["subn_reach_list"] upstreams = clustered_subns["upstreams"] subn_reach_list_with_type = _build_reach_type_list(subn_reach_list, wbodies_segs) qlat_sub = qlats.loc[param_df_sub.index] q0_sub = q0.loc[param_df_sub.index] #Determine model_start_time from qlat_start_time qlat_start_time = list(qlat_sub)[0] qlat_time_step_seconds = qts_subdivisions * dt qlat_start_time_datetime_object = _format_qlat_start_time(qlat_start_time) model_start_time_datetime_object = qlat_start_time_datetime_object \ - timedelta(seconds=qlat_time_step_seconds) model_start_time = model_start_time_datetime_object.strftime('%Y-%m-%d_%H:%M:%S') param_df_sub = param_df_sub.reindex( param_df_sub.index.tolist() + lake_segs ).sort_index() usgs_df_sub, lastobs_df_sub, da_positions_list_byseg = _prep_da_dataframes(usgs_df, lastobs_df, param_df_sub.index, offnetwork_upstreams) da_positions_list_byreach, da_positions_list_bygage = _prep_da_positions_byreach(subn_reach_list, lastobs_df_sub.index) qlat_sub = qlat_sub.reindex(param_df_sub.index) q0_sub = q0_sub.reindex(param_df_sub.index) # results_subn[order].append( # compute_func( jobs.append( delayed(compute_func)( nts, dt, qts_subdivisions, subn_reach_list_with_type, upstreams, param_df_sub.index.values, param_df_sub.columns.values, param_df_sub.values, q0_sub.values.astype("float32"), qlat_sub.values.astype("float32"), lake_segs, waterbodies_df_sub.values, waterbody_parameters, waterbody_types_df_sub.values.astype("int32"), waterbody_type_specified, model_start_time, usgs_df_sub.values.astype("float32"), # flowveldepth_interorder, # obtain keys and values from this dataset np.array(da_positions_list_byseg, dtype="int32"), np.array(da_positions_list_byreach, dtype="int32"), np.array(da_positions_list_bygage, dtype="int32"), lastobs_df_sub.get( 
"last_obs_discharge", pd.Series(index=lastobs_df_sub.index, name="Null"), ).values.astype("float32"), lastobs_df_sub.get( "time_since_lastobs", pd.Series(index=lastobs_df_sub.index, name="Null"), ).values.astype("float32"), da_decay_coefficient, { us: fvd for us, fvd in flowveldepth_interorder.items() if us in offnetwork_upstreams }, assume_short_ts, return_courant, diffusive_parameters, ) ) results_subn[order] = parallel(jobs) if order > 0: # This is not needed for the last rank of subnetworks flowveldepth_interorder = {} for ci, (cluster, clustered_subns) in enumerate( reaches_ordered_bysubntw_clustered[order].items() ): for subn_tw in clustered_subns["tw"]: # TODO: This index step is necessary because we sort the segment index # TODO: I think there are a number of ways we could remove the sorting step # -- the binary search could be replaced with an index based on the known topology flowveldepth_interorder[subn_tw] = {} subn_tw_sortposition = ( results_subn[order][ci][0].tolist().index(subn_tw) ) flowveldepth_interorder[subn_tw]["results"] = results_subn[ order ][ci][1][subn_tw_sortposition] # what will it take to get just the tw FVD values into an array to pass to the next loop? # There will be an empty array initialized at the top of the loop, then re-populated here. # we don't have to bother with populating it after the last group results = [] for order in subnetworks_only_ordered_jit: results.extend(results_subn[order]) if 1 == 1: print("PARALLEL TIME %s seconds." % (time.time() - start_para_time)) elif parallel_compute_method == "by-subnetwork-jit": networks_with_subnetworks_ordered_jit = nhd_network.build_subnetworks( connections, rconn, subnetwork_target_size ) subnetworks_only_ordered_jit = defaultdict(dict) subnetworks = defaultdict(dict) for tw, ordered_network in networks_with_subnetworks_ordered_jit.items(): intw = independent_networks[tw] for order, subnet_sets in ordered_network.items(): subnetworks_only_ordered_jit[order].update(subnet_sets) for subn_tw, subnetwork in subnet_sets.items(): subnetworks[subn_tw] = {k: intw[k] for k in subnetwork} reaches_ordered_bysubntw = defaultdict(dict) for order, ordered_subn_dict in subnetworks_only_ordered_jit.items(): for subn_tw, subnet in ordered_subn_dict.items(): conn_subn = {k: connections[k] for k in subnet if k in connections} rconn_subn = {k: rconn[k] for k in subnet if k in rconn} if waterbodies_df.empty: path_func = partial(nhd_network.split_at_junction, rconn_subn) else: path_func = partial( nhd_network.split_at_waterbodies_and_junctions, set(waterbodies_df.index.values), rconn_subn ) reaches_ordered_bysubntw[order][ subn_tw ] = nhd_network.dfs_decomposition(rconn_subn, path_func) if 1 == 1: print("JIT Preprocessing time %s seconds." 
% (time.time() - start_time)) print("starting Parallel JIT calculation") start_para_time = time.time() with Parallel(n_jobs=cpu_pool, backend="threading") as parallel: results_subn = defaultdict(list) flowveldepth_interorder = {} for order in range(max(subnetworks_only_ordered_jit.keys()), -1, -1): jobs = [] for twi, (subn_tw, subn_reach_list) in enumerate( reaches_ordered_bysubntw[order].items(), 1 ): # TODO: Confirm that a list here is best -- we are sorting, # so a set might be sufficient/better segs = list(chain.from_iterable(subn_reach_list)) offnetwork_upstreams = set() segs_set = set(segs) for seg in segs: for us in rconn[seg]: if us not in segs_set: offnetwork_upstreams.add(us) segs.extend(offnetwork_upstreams) common_segs = list(param_df.index.intersection(segs)) wbodies_segs = set(segs).symmetric_difference(common_segs) #Declare empty dataframe waterbody_types_df_sub = pd.DataFrame() if not waterbodies_df.empty: lake_segs = list(waterbodies_df.index.intersection(segs)) waterbodies_df_sub = waterbodies_df.loc[ lake_segs, [ "LkArea", "LkMxE", "OrificeA", "OrificeC", "OrificeE", "WeirC", "WeirE", "WeirL", "ifd", "qd0", "h0", ], ] #If reservoir types other than Level Pool are active if not waterbody_types_df.empty: waterbody_types_df_sub = waterbody_types_df.loc[ lake_segs, [ "reservoir_type", ], ] else: lake_segs = [] waterbodies_df_sub = pd.DataFrame() param_df_sub = param_df.loc[ common_segs, ["dt", "bw", "tw", "twcc", "dx", "n", "ncc", "cs", "s0", "alt"], ].sort_index() param_df_sub_super = param_df_sub.reindex( param_df_sub.index.tolist() + lake_segs ).sort_index() if order < max(subnetworks_only_ordered_jit.keys()): for us_subn_tw in offnetwork_upstreams: subn_tw_sortposition = param_df_sub_super.index.get_loc( us_subn_tw ) flowveldepth_interorder[us_subn_tw][ "position_index" ] = subn_tw_sortposition subn_reach_list_with_type = _build_reach_type_list(subn_reach_list, wbodies_segs) qlat_sub = qlats.loc[param_df_sub.index] q0_sub = q0.loc[param_df_sub.index] #Determine model_start_time from qlat_start_time qlat_start_time = list(qlat_sub)[0] qlat_time_step_seconds = qts_subdivisions * dt qlat_start_time_datetime_object = _format_qlat_start_time(qlat_start_time) model_start_time_datetime_object = qlat_start_time_datetime_object \ - timedelta(seconds=qlat_time_step_seconds) model_start_time = model_start_time_datetime_object.strftime('%Y-%m-%d_%H:%M:%S') param_df_sub = param_df_sub.reindex( param_df_sub.index.tolist() + lake_segs ).sort_index() usgs_df_sub, lastobs_df_sub, da_positions_list_byseg = _prep_da_dataframes(usgs_df, lastobs_df, param_df_sub.index, offnetwork_upstreams) da_positions_list_byreach, da_positions_list_bygage = _prep_da_positions_byreach(subn_reach_list, lastobs_df_sub.index) qlat_sub = qlat_sub.reindex(param_df_sub.index) q0_sub = q0_sub.reindex(param_df_sub.index) jobs.append( delayed(compute_func)( nts, dt, qts_subdivisions, subn_reach_list_with_type, subnetworks[subn_tw], param_df_sub.index.values, param_df_sub.columns.values, param_df_sub.values, q0_sub.values.astype("float32"), qlat_sub.values.astype("float32"), lake_segs, waterbodies_df_sub.values, waterbody_parameters, waterbody_types_df_sub.values.astype("int32"), waterbody_type_specified, model_start_time, usgs_df_sub.values.astype("float32"), # flowveldepth_interorder, # obtain keys and values from this dataset np.array(da_positions_list_byseg, dtype="int32"), np.array(da_positions_list_byreach, dtype="int32"), np.array(da_positions_list_bygage, dtype="int32"), lastobs_df_sub.get( 
"last_obs_discharge", pd.Series(index=lastobs_df_sub.index, name="Null"), ).values.astype("float32"), lastobs_df_sub.get( "time_since_lastobs", pd.Series(index=lastobs_df_sub.index, name="Null"), ).values.astype("float32"), da_decay_coefficient, { us: fvd for us, fvd in flowveldepth_interorder.items() if us in offnetwork_upstreams }, assume_short_ts, return_courant, diffusive_parameters, ) ) results_subn[order] = parallel(jobs) if order > 0: # This is not needed for the last rank of subnetworks flowveldepth_interorder = {} for twi, subn_tw in enumerate(reaches_ordered_bysubntw[order]): # TODO: This index step is necessary because we sort the segment index # TODO: I think there are a number of ways we could remove the sorting step # -- the binary search could be replaced with an index based on the known topology flowveldepth_interorder[subn_tw] = {} subn_tw_sortposition = ( results_subn[order][twi][0].tolist().index(subn_tw) ) flowveldepth_interorder[subn_tw]["results"] = results_subn[ order ][twi][1][subn_tw_sortposition] # what will it take to get just the tw FVD values into an array to pass to the next loop? # There will be an empty array initialized at the top of the loop, then re-populated here. # we don't have to bother with populating it after the last group results = [] for order in subnetworks_only_ordered_jit: results.extend(results_subn[order]) if 1 == 1: print("PARALLEL TIME %s seconds." % (time.time() - start_para_time)) elif parallel_compute_method == "by-subnetwork-diffusive": reaches_ordered_bysubntw, subnetworks, subnetworks_only_ordered_jit = nhd_network.build_subnetworks_btw_reservoirs( connections, rconn, wbody_conn, independent_networks, sources=None ) if 1 == 1: print("JIT Preprocessing time %s seconds." % (time.time() - start_time)) print("starting Parallel JIT calculation") start_para_time = time.time() with Parallel(n_jobs=cpu_pool, backend="threading") as parallel: results_subn = defaultdict(list) flowveldepth_interorder = {} for order in range(max(reaches_ordered_bysubntw.keys()), -1, -1): jobs = [] for twi, (subn_tw, subn_reach_list) in enumerate( reaches_ordered_bysubntw[order].items(), 1 ): # TODO: Confirm that a list here is best -- we are sorting, # so a set might be sufficient/better segs = list(chain.from_iterable(subn_reach_list)) offnetwork_upstreams = set() segs_set = set(segs) for seg in segs: for us in rconn[seg]: if us not in segs_set: offnetwork_upstreams.add(us) segs.extend(offnetwork_upstreams) common_segs = list(param_df.index.intersection(segs)) wbodies_segs = set(segs).symmetric_difference(common_segs) # Declare empty dataframe waterbody_types_df_sub = pd.DataFrame() # Set compute_func_switch to compute_func. # compute_func for this function should be set to "diffusive" compute_func_switch = compute_func # Can comment out above statement and uncomment below # if need to run compute_network_structured in mc_reach # for every subnetwork for debugging purposes. #compute_func_switch = compute_network_structured if not waterbodies_df.empty: lake_segs = list(waterbodies_df.index.intersection(segs)) if subn_tw in waterbodies_df.index: # Since this subn_tw is a resevoir, set compute_func_switch # to compute_network_structured. 
compute_func_switch = compute_network_structured waterbodies_df_sub = waterbodies_df.loc[ lake_segs, [ "LkArea", "LkMxE", "OrificeA", "OrificeC", "OrificeE", "WeirC", "WeirE", "WeirL", "ifd", "qd0", "h0", ], ] #If reservoir types other than Level Pool are active if not waterbody_types_df.empty: waterbody_types_df_sub = waterbody_types_df.loc[ lake_segs, [ "reservoir_type", ], ] else: lake_segs = [] waterbodies_df_sub = pd.DataFrame() param_df_sub = param_df.loc[ common_segs, ["dt", "bw", "tw", "twcc", "dx", "n", "ncc", "cs", "s0", "alt"], ].sort_index() param_df_sub_super = param_df_sub.reindex( param_df_sub.index.tolist() + lake_segs ).sort_index() if order < max(reaches_ordered_bysubntw.keys()): for us_subn_tw in offnetwork_upstreams: subn_tw_sortposition = param_df_sub_super.index.get_loc( us_subn_tw ) flowveldepth_interorder[us_subn_tw][ "position_index" ] = subn_tw_sortposition subn_reach_list_with_type = _build_reach_type_list(subn_reach_list, wbodies_segs) qlat_sub = qlats.loc[param_df_sub.index] q0_sub = q0.loc[param_df_sub.index] #Determine model_start_time from qlat_start_time qlat_start_time = list(qlat_sub)[0] qlat_time_step_seconds = qts_subdivisions * dt qlat_start_time_datetime_object = _format_qlat_start_time(qlat_start_time) model_start_time_datetime_object = qlat_start_time_datetime_object \ - timedelta(seconds=qlat_time_step_seconds) model_start_time = model_start_time_datetime_object.strftime('%Y-%m-%d_%H:%M:%S') param_df_sub = param_df_sub.reindex( param_df_sub.index.tolist() + lake_segs ).sort_index() usgs_df_sub, lastobs_df_sub, da_positions_list_byseg = _prep_da_dataframes(usgs_df, lastobs_df, param_df_sub.index, offnetwork_upstreams) da_positions_list_byreach, da_positions_list_bygage = _prep_da_positions_byreach(subn_reach_list, lastobs_df_sub.index) qlat_sub = qlat_sub.reindex(param_df_sub.index) q0_sub = q0_sub.reindex(param_df_sub.index) jobs.append( delayed(compute_func_switch)( nts, dt, qts_subdivisions, subn_reach_list_with_type, subnetworks[subn_tw], param_df_sub.index.values, param_df_sub.columns.values, param_df_sub.values, q0_sub.values.astype("float32"), qlat_sub.values.astype("float32"), lake_segs, waterbodies_df_sub.values, waterbody_parameters, waterbody_types_df_sub.values.astype("int32"), waterbody_type_specified, model_start_time, usgs_df_sub.values.astype("float32"), np.array(da_positions_list_byseg, dtype="int32"), np.array(da_positions_list_byreach, dtype="int32"), np.array(da_positions_list_bygage, dtype="int32"), lastobs_df_sub.get("last_obs_discharge", pd.Series(index=lastobs_df_sub.index, name="Null")).values.astype("float32"), lastobs_df_sub.get("time_since_lastobs", pd.Series(index=lastobs_df_sub.index, name="Null")).values.astype("float32"), da_decay_coefficient, # flowveldepth_interorder, # obtain keys and values from this dataset { us: fvd for us, fvd in flowveldepth_interorder.items() if us in offnetwork_upstreams }, assume_short_ts, return_courant, diffusive_parameters, ) ) results_subn[order] = parallel(jobs) if order > 0: # This is not needed for the last rank of subnetworks flowveldepth_interorder = {} for twi, subn_tw in enumerate(reaches_ordered_bysubntw[order]): # TODO: This index step is necessary because we sort the segment index # TODO: I think there are a number of ways we could remove the sorting step # -- the binary search could be replaced with an index based on the known topology flowveldepth_interorder[subn_tw] = {} subn_tw_sortposition = ( results_subn[order][twi][0].tolist().index(subn_tw) ) 
flowveldepth_interorder[subn_tw]["results"] = results_subn[ order ][twi][1][subn_tw_sortposition] # what will it take to get just the tw FVD values into an array to pass to the next loop? # There will be an empty array initialized at the top of the loop, then re-populated here. # we don't have to bother with populating it after the last group results = [] for order in subnetworks_only_ordered_jit: results.extend(results_subn[order]) if 1 == 1: print("PARALLEL TIME %s seconds." % (time.time() - start_para_time)) elif parallel_compute_method == "by-network": with Parallel(n_jobs=cpu_pool, backend="threading") as parallel: jobs = [] for twi, (tw, reach_list) in enumerate(reaches_bytw.items(), 1): # The X_sub lines use SEGS... # which is now invalid with the wbodies included. # So we define "common_segs" to identify regular routing segments # and wbodies_segs for the waterbody reaches/segments segs = list(chain.from_iterable(reach_list)) common_segs = param_df.index.intersection(segs) # Assumes everything else is a waterbody... wbodies_segs = set(segs).symmetric_difference(common_segs) #Declare empty dataframe waterbody_types_df_sub = pd.DataFrame() # If waterbody parameters exist if not waterbodies_df.empty: lake_segs = list(waterbodies_df.index.intersection(segs)) waterbodies_df_sub = waterbodies_df.loc[ lake_segs, [ "LkArea", "LkMxE", "OrificeA", "OrificeC", "OrificeE", "WeirC", "WeirE", "WeirL", "ifd", "qd0", "h0", ], ] #If reservoir types other than Level Pool are active if not waterbody_types_df.empty: waterbody_types_df_sub = waterbody_types_df.loc[ lake_segs, [ "reservoir_type", ], ] else: lake_segs = [] waterbodies_df_sub = pd.DataFrame() param_df_sub = param_df.loc[ common_segs, ["dt", "bw", "tw", "twcc", "dx", "n", "ncc", "cs", "s0", "alt"], ].sort_index() reaches_list_with_type = _build_reach_type_list(reach_list, wbodies_segs) # qlat_sub = qlats.loc[common_segs].sort_index() # q0_sub = q0.loc[common_segs].sort_index() qlat_sub = qlats.loc[param_df_sub.index] q0_sub = q0.loc[param_df_sub.index] #Determine model_start_time from qlat_start_time qlat_start_time = list(qlat_sub)[0] qlat_time_step_seconds = qts_subdivisions * dt qlat_start_time_datetime_object = _format_qlat_start_time(qlat_start_time) model_start_time_datetime_object = qlat_start_time_datetime_object \ - timedelta(seconds=qlat_time_step_seconds) model_start_time = model_start_time_datetime_object.strftime('%Y-%m-%d_%H:%M:%S') param_df_sub = param_df_sub.reindex( param_df_sub.index.tolist() + lake_segs ).sort_index() usgs_df_sub, lastobs_df_sub, da_positions_list_byseg = _prep_da_dataframes(usgs_df, lastobs_df, param_df_sub.index) da_positions_list_byreach, da_positions_list_bygage = _prep_da_positions_byreach(reach_list, lastobs_df_sub.index) qlat_sub = qlat_sub.reindex(param_df_sub.index) q0_sub = q0_sub.reindex(param_df_sub.index) jobs.append( delayed(compute_func)( nts, dt, qts_subdivisions, reaches_list_with_type, independent_networks[tw], param_df_sub.index.values.astype("int64"), param_df_sub.columns.values, param_df_sub.values, q0_sub.values.astype("float32"), qlat_sub.values.astype("float32"), lake_segs, waterbodies_df_sub.values, waterbody_parameters, waterbody_types_df_sub.values.astype("int32"), waterbody_type_specified, model_start_time, usgs_df_sub.values.astype("float32"), np.array(da_positions_list_byseg, dtype="int32"), np.array(da_positions_list_byreach, dtype="int32"), np.array(da_positions_list_bygage, dtype="int32"), lastobs_df_sub.get("last_obs_discharge", pd.Series(index=lastobs_df_sub.index, 
name="Null")).values.astype("float32"), lastobs_df_sub.get("time_since_lastobs", pd.Series(index=lastobs_df_sub.index, name="Null")).values.astype("float32"), da_decay_coefficient, {}, assume_short_ts, return_courant, diffusive_parameters, ) ) results = parallel(jobs) else: # Execute in serial results = [] for twi, (tw, reach_list) in enumerate(reaches_bytw.items(), 1): # The X_sub lines use SEGS... # which becomes invalid with the wbodies included. # So we define "common_segs" to identify regular routing segments # and wbodies_segs for the waterbody reaches/segments segs = list(chain.from_iterable(reach_list)) common_segs = param_df.index.intersection(segs) # Assumes everything else is a waterbody... wbodies_segs = set(segs).symmetric_difference(common_segs) #Declare empty dataframe waterbody_types_df_sub = pd.DataFrame() # If waterbody parameters exist if not waterbodies_df.empty: lake_segs = list(waterbodies_df.index.intersection(segs)) waterbodies_df_sub = waterbodies_df.loc[ lake_segs, [ "LkArea", "LkMxE", "OrificeA", "OrificeC", "OrificeE", "WeirC", "WeirE", "WeirL", "ifd", "qd0", "h0", ], ] #If reservoir types other than Level Pool are active if not waterbody_types_df.empty: waterbody_types_df_sub = waterbody_types_df.loc[ lake_segs, [ "reservoir_type", ], ] else: lake_segs = [] waterbodies_df_sub = pd.DataFrame() param_df_sub = param_df.loc[ common_segs, ["dt", "bw", "tw", "twcc", "dx", "n", "ncc", "cs", "s0", "alt"], ].sort_index() reaches_list_with_type = _build_reach_type_list(reach_list, wbodies_segs) # qlat_sub = qlats.loc[common_segs].sort_index() # q0_sub = q0.loc[common_segs].sort_index() qlat_sub = qlats.loc[param_df_sub.index] q0_sub = q0.loc[param_df_sub.index] #Determine model_start_time from qlat_start_time qlat_start_time = list(qlat_sub)[0] qlat_time_step_seconds = qts_subdivisions * dt qlat_start_time_datetime_object = _format_qlat_start_time(qlat_start_time) model_start_time_datetime_object = qlat_start_time_datetime_object \ - timedelta(seconds=qlat_time_step_seconds) model_start_time = model_start_time_datetime_object.strftime('%Y-%m-%d_%H:%M:%S') param_df_sub = param_df_sub.reindex( param_df_sub.index.tolist() + lake_segs ).sort_index() usgs_df_sub, lastobs_df_sub, da_positions_list_byseg = _prep_da_dataframes(usgs_df, lastobs_df, param_df_sub.index) da_positions_list_byreach, da_positions_list_bygage = _prep_da_positions_byreach(reach_list, lastobs_df_sub.index) qlat_sub = qlat_sub.reindex(param_df_sub.index) q0_sub = q0_sub.reindex(param_df_sub.index) results.append( compute_func( nts, dt, qts_subdivisions, reaches_list_with_type, independent_networks[tw], param_df_sub.index.values.astype("int64"), param_df_sub.columns.values, param_df_sub.values, q0_sub.values.astype("float32"), qlat_sub.values.astype("float32"), lake_segs, waterbodies_df_sub.values, waterbody_parameters, waterbody_types_df_sub.values.astype("int32"), waterbody_type_specified, model_start_time, usgs_df_sub.values.astype("float32"), np.array(da_positions_list_byseg, dtype="int32"), np.array(da_positions_list_byreach, dtype="int32"), np.array(da_positions_list_bygage, dtype="int32"), lastobs_df_sub.get("last_obs_discharge", pd.Series(index=lastobs_df_sub.index, name="Null")).values.astype("float32"), lastobs_df_sub.get("time_since_lastobs", pd.Series(index=lastobs_df_sub.index, name="Null")).values.astype("float32"), da_decay_coefficient, {}, assume_short_ts, return_courant, diffusive_parameters, ) ) return results
def main():
    args = _handle_args()

    nts = args.nts
    debuglevel = -1 * args.debuglevel
    verbose = args.verbose
    showtiming = args.showtiming
    supernetwork = args.supernetwork
    break_network_at_waterbodies = args.break_network_at_waterbodies
    csv_output_folder = args.csv_output_folder
    assume_short_ts = args.assume_short_ts

    test_folder = pathlib.Path(root, "test")
    geo_input_folder = test_folder.joinpath("input", "geo")

    # TODO: Make these commandline args
    """##NHD Subset (Brazos/Lower Colorado)"""
    # supernetwork = 'Brazos_LowerColorado_Named_Streams'
    # supernetwork = 'Brazos_LowerColorado_ge5'
    # supernetwork = 'Pocono_TEST1'
    """##NHD CONUS order 5 and greater"""
    # supernetwork = 'CONUS_ge5'
    """These are large -- be careful"""
    # supernetwork = 'Mainstems_CONUS'
    # supernetwork = 'CONUS_FULL_RES_v20'
    # supernetwork = 'CONUS_Named_Streams'  # create a subset of the full resolution by reading the GNIS field
    # supernetwork = 'CONUS_Named_combined'  # process the Named streams through the Full-Res paths to join the many hanging reaches

    if verbose:
        print("creating supernetwork connections set")
    if showtiming:
        start_time = time.time()

    # STEP 1
    network_data = nnu.set_supernetwork_data(
        supernetwork=args.supernetwork,
        geo_input_folder=geo_input_folder,
        verbose=False,
        debuglevel=debuglevel,
    )

    cols = network_data["columns"]
    param_df = nhd_io.read(network_data["geo_file_path"])
    param_df = param_df[list(cols.values())]
    param_df = param_df.set_index(cols["key"])

    if "mask_file_path" in network_data:
        data_mask = nhd_io.read_mask(
            network_data["mask_file_path"],
            layer_string=network_data["mask_layer_string"],
        )
        param_df = param_df.filter(data_mask.iloc[:, network_data["mask_key"]], axis=0)

    param_df = param_df.sort_index()
    param_df = nhd_io.replace_downstreams(param_df, cols["downstream"], 0)

    if args.ql:
        qlats = nhd_io.read_qlat(args.ql)
    else:
        qlats = constant_qlats(param_df, nts, 10.0)

    # initial conditions, assumed to be zero
    # TODO: Allow optional reading of initial conditions from WRF
    q0 = pd.DataFrame(
        0, index=param_df.index, columns=["qu0", "qd0", "h0"], dtype="float32"
    )

    connections = nhd_network.extract_connections(param_df, cols["downstream"])
    wbodies = nhd_network.extract_waterbodies(
        param_df, cols["waterbody"], network_data["waterbody_null_code"]
    )

    if verbose:
        print("supernetwork connections set complete")
    if showtiming:
        print("... in %s seconds." % (time.time() - start_time))

    # STEP 2
    if showtiming:
        start_time = time.time()
    if verbose:
        print("organizing connections into reaches ...")

    rconn = nhd_network.reverse_network(connections)
    independent_networks = nhd_network.reachable_network(rconn)
    reaches_bytw = {}
    for tw, net in independent_networks.items():
        path_func = partial(nhd_network.split_at_junction, net)
        reaches_bytw[tw] = nhd_network.dfs_decomposition(net, path_func)

    if verbose:
        print("reach organization complete")
    if showtiming:
        print("... in %s seconds." % (time.time() - start_time))

    if showtiming:
        start_time = time.time()

    param_df["dt"] = 300.0
    param_df = param_df.rename(columns=nnu.reverse_dict(cols))
    param_df = param_df.astype("float32")

    # datasub = data[['dt', 'bw', 'tw', 'twcc', 'dx', 'n', 'ncc', 'cs', 's0']]
    parallel_compute_method = args.parallel_compute_method
    cpu_pool = args.cpu_pool
    compute_method = args.compute_method

    if compute_method == "standard cython compute network":
        compute_func = mc_reach.compute_network
    else:
        compute_func = mc_reach.compute_network

    if parallel_compute_method == "by-network":
        with Parallel(n_jobs=cpu_pool, backend="threading") as parallel:
            jobs = []
            for twi, (tw, reach_list) in enumerate(reaches_bytw.items(), 1):
                r = list(chain.from_iterable(reach_list))
                param_df_sub = param_df.loc[
                    r, ["dt", "bw", "tw", "twcc", "dx", "n", "ncc", "cs", "s0"]
                ].sort_index()
                qlat_sub = qlats.loc[r].sort_index()
                q0_sub = q0.loc[r].sort_index()
                jobs.append(
                    delayed(compute_func)(
                        nts,
                        reach_list,
                        independent_networks[tw],
                        param_df_sub.index.values,
                        param_df_sub.columns.values,
                        param_df_sub.values,
                        qlat_sub.values,
                        q0_sub.values,
                    )
                )
            results = parallel(jobs)

    else:  # Execute in serial
        results = []
        for twi, (tw, reach_list) in enumerate(reaches_bytw.items(), 1):
            r = list(chain.from_iterable(reach_list))
            param_df_sub = param_df.loc[
                r, ["dt", "bw", "tw", "twcc", "dx", "n", "ncc", "cs", "s0"]
            ].sort_index()
            qlat_sub = qlats.loc[r].sort_index()
            q0_sub = q0.loc[r].sort_index()
            results.append(
                compute_func(
                    nts,
                    reach_list,
                    independent_networks[tw],
                    param_df_sub.index.values,
                    param_df_sub.columns.values,
                    param_df_sub.values,
                    qlat_sub.values,
                    q0_sub.values,
                )
            )

    if (debuglevel <= -1) or csv_output_folder:
        qvd_columns = pd.MultiIndex.from_product(
            [range(nts), ["q", "v", "d"]]
        ).to_flat_index()
        flowveldepth = pd.concat(
            [pd.DataFrame(d, index=i, columns=qvd_columns) for i, d in results],
            copy=False,
        )

        if csv_output_folder:
            flowveldepth = flowveldepth.sort_index()
            output_path = pathlib.Path(csv_output_folder).resolve()
            flowveldepth.to_csv(output_path.joinpath(f"{args.supernetwork}.csv"))

        if debuglevel <= -1:
            print(flowveldepth)

    if verbose:
        print("ordered reach computation complete")
    if showtiming:
        print("... in %s seconds." % (time.time() - start_time))
def compute_nhd_routing_v02( connections, rconn, wbodies, reaches_bytw, compute_func_name, parallel_compute_method, subnetwork_target_size, cpu_pool, dt, nts, qts_subdivisions, independent_networks, param_df, q0, qlats, usgs_df, last_obs_df, assume_short_ts, return_courant, waterbodies_df, diffusive_parameters=None, ): param_df["dt"] = dt param_df = param_df.astype("float32") start_time = time.time() compute_func = _compute_func_map[compute_func_name] if parallel_compute_method == "by-subnetwork-jit-clustered": networks_with_subnetworks_ordered_jit = nhd_network.build_subnetworks( connections, rconn, subnetwork_target_size ) subnetworks_only_ordered_jit = defaultdict(dict) subnetworks = defaultdict(dict) for tw, ordered_network in networks_with_subnetworks_ordered_jit.items(): intw = independent_networks[tw] for order, subnet_sets in ordered_network.items(): subnetworks_only_ordered_jit[order].update(subnet_sets) for subn_tw, subnetwork in subnet_sets.items(): subnetworks[subn_tw] = {k: intw[k] for k in subnetwork} reaches_ordered_bysubntw = defaultdict(dict) for order, ordered_subn_dict in subnetworks_only_ordered_jit.items(): for subn_tw, subnet in ordered_subn_dict.items(): conn_subn = {k: connections[k] for k in subnet if k in connections} rconn_subn = {k: rconn[k] for k in subnet if k in rconn} path_func = partial(nhd_network.split_at_junction, rconn_subn) reaches_ordered_bysubntw[order][ subn_tw ] = nhd_network.dfs_decomposition(rconn_subn, path_func) cluster_threshold = 0.65 # When a job has a total segment count 65% of the target size, compute it # Otherwise, keep adding reaches. reaches_ordered_bysubntw_clustered = defaultdict(dict) for order in subnetworks_only_ordered_jit: cluster = 0 reaches_ordered_bysubntw_clustered[order][cluster] = { "segs": [], "upstreams": {}, "tw": [], "subn_reach_list": [], } for twi, (subn_tw, subn_reach_list) in enumerate( reaches_ordered_bysubntw[order].items(), 1 ): segs = list(chain.from_iterable(subn_reach_list)) reaches_ordered_bysubntw_clustered[order][cluster]["segs"].extend(segs) reaches_ordered_bysubntw_clustered[order][cluster]["upstreams"].update( subnetworks[subn_tw] ) reaches_ordered_bysubntw_clustered[order][cluster]["tw"].append(subn_tw) reaches_ordered_bysubntw_clustered[order][cluster][ "subn_reach_list" ].extend(subn_reach_list) if ( len(reaches_ordered_bysubntw_clustered[order][cluster]["segs"]) >= cluster_threshold * subnetwork_target_size ) and ( twi < len(reaches_ordered_bysubntw[order]) # i.e., we haven't reached the end # TODO: perhaps this should be a while condition... ): cluster += 1 reaches_ordered_bysubntw_clustered[order][cluster] = { "segs": [], "upstreams": {}, "tw": [], "subn_reach_list": [], } if 1 == 1: print("JIT Preprocessing time %s seconds." 
% (time.time() - start_time)) print("starting Parallel JIT calculation") start_para_time = time.time() # if 1 == 1: with Parallel(n_jobs=cpu_pool, backend="threading") as parallel: results_subn = defaultdict(list) flowveldepth_interorder = {} for order in range(max(subnetworks_only_ordered_jit.keys()), -1, -1): jobs = [] for cluster, clustered_subns in reaches_ordered_bysubntw_clustered[ order ].items(): segs = clustered_subns["segs"] offnetwork_upstreams = set() segs_set = set(segs) for seg in segs: for us in rconn[seg]: if us not in segs_set: offnetwork_upstreams.add(us) segs.extend(offnetwork_upstreams) param_df_sub = param_df.loc[ segs, ["dt", "bw", "tw", "twcc", "dx", "n", "ncc", "cs", "s0", "alt"], ].sort_index() if order < max(subnetworks_only_ordered_jit.keys()): for us_subn_tw in offnetwork_upstreams: subn_tw_sortposition = param_df_sub.index.get_loc( us_subn_tw ) flowveldepth_interorder[us_subn_tw][ "position_index" ] = subn_tw_sortposition subn_reach_list = clustered_subns["subn_reach_list"] upstreams = clustered_subns["upstreams"] if not usgs_df.empty: usgs_segs = list(usgs_df.index.intersection(param_df_sub.index)) nudging_positions_list = param_df_sub.index.get_indexer( usgs_segs ) usgs_df_sub = usgs_df.loc[usgs_segs] usgs_df_sub.drop( usgs_df_sub.columns[range(0, 1)], axis=1, inplace=True ) else: usgs_df_sub = pd.DataFrame() nudging_positions_list = [] last_obs_sub = pd.DataFrame() qlat_sub = qlats.loc[param_df_sub.index] q0_sub = q0.loc[param_df_sub.index] # TODO: Wire in the proper reservoir distinction # At present, in by-subnetwork-jit/jit-clustered, these next two lines # only produce a dummy list, but... # Eventually, the wiring for reservoir simulation needs to be added. subn_reach_type_list = [0 for reaches in subn_reach_list] subn_reach_list_with_type = list( zip(subn_reach_list, subn_reach_type_list) ) # results_subn[order].append( # compute_func( jobs.append( delayed(compute_func)( nts, qts_subdivisions, subn_reach_list_with_type, upstreams, param_df_sub.index.values, param_df_sub.columns.values, param_df_sub.values, q0_sub.values.astype("float32"), qlat_sub.values.astype("float32"), [], # lake_segs np.empty( shape=(0, 0), dtype="float64" ), # waterbodies_df_sub.values usgs_df_sub.values.astype("float32"), # flowveldepth_interorder, # obtain keys and values from this dataset np.array(nudging_positions_list, dtype="int32"), last_obs_sub.values.astype("float32"), { us: fvd for us, fvd in flowveldepth_interorder.items() if us in offnetwork_upstreams }, assume_short_ts, return_courant, diffusive_parameters, ) ) results_subn[order] = parallel(jobs) if order > 0: # This is not needed for the last rank of subnetworks flowveldepth_interorder = {} for ci, (cluster, clustered_subns) in enumerate( reaches_ordered_bysubntw_clustered[order].items() ): for subn_tw in clustered_subns["tw"]: # TODO: This index step is necessary because we sort the segment index # TODO: I think there are a number of ways we could remove the sorting step # -- the binary search could be replaced with an index based on the known topology flowveldepth_interorder[subn_tw] = {} subn_tw_sortposition = ( results_subn[order][ci][0].tolist().index(subn_tw) ) flowveldepth_interorder[subn_tw]["results"] = results_subn[ order ][ci][1][subn_tw_sortposition] # what will it take to get just the tw FVD values into an array to pass to the next loop? # There will be an empty array initialized at the top of the loop, then re-populated here. 
# we don't have to bother with populating it after the last group results = [] for order in subnetworks_only_ordered_jit: results.extend(results_subn[order]) if 1 == 1: print("PARALLEL TIME %s seconds." % (time.time() - start_para_time)) elif parallel_compute_method == "by-subnetwork-jit": networks_with_subnetworks_ordered_jit = nhd_network.build_subnetworks( connections, rconn, subnetwork_target_size ) subnetworks_only_ordered_jit = defaultdict(dict) subnetworks = defaultdict(dict) for tw, ordered_network in networks_with_subnetworks_ordered_jit.items(): intw = independent_networks[tw] for order, subnet_sets in ordered_network.items(): subnetworks_only_ordered_jit[order].update(subnet_sets) for subn_tw, subnetwork in subnet_sets.items(): subnetworks[subn_tw] = {k: intw[k] for k in subnetwork} reaches_ordered_bysubntw = defaultdict(dict) for order, ordered_subn_dict in subnetworks_only_ordered_jit.items(): for subn_tw, subnet in ordered_subn_dict.items(): conn_subn = {k: connections[k] for k in subnet if k in connections} rconn_subn = {k: rconn[k] for k in subnet if k in rconn} path_func = partial(nhd_network.split_at_junction, rconn_subn) reaches_ordered_bysubntw[order][ subn_tw ] = nhd_network.dfs_decomposition(rconn_subn, path_func) if 1 == 1: print("JIT Preprocessing time %s seconds." % (time.time() - start_time)) print("starting Parallel JIT calculation") start_para_time = time.time() with Parallel(n_jobs=cpu_pool, backend="threading") as parallel: results_subn = defaultdict(list) flowveldepth_interorder = {} for order in range(max(subnetworks_only_ordered_jit.keys()), -1, -1): jobs = [] for twi, (subn_tw, subn_reach_list) in enumerate( reaches_ordered_bysubntw[order].items(), 1 ): # TODO: Confirm that a list here is best -- we are sorting, # so a set might be sufficient/better segs = list(chain.from_iterable(subn_reach_list)) offnetwork_upstreams = set() segs_set = set(segs) for seg in segs: for us in rconn[seg]: if us not in segs_set: offnetwork_upstreams.add(us) segs.extend(offnetwork_upstreams) param_df_sub = param_df.loc[ segs, ["dt", "bw", "tw", "twcc", "dx", "n", "ncc", "cs", "s0", "alt"], ].sort_index() if order < max(subnetworks_only_ordered_jit.keys()): for us_subn_tw in offnetwork_upstreams: subn_tw_sortposition = param_df_sub.index.get_loc( us_subn_tw ) flowveldepth_interorder[us_subn_tw][ "position_index" ] = subn_tw_sortposition if not usgs_df.empty: usgs_segs = list(usgs_df.index.intersection(param_df_sub.index)) nudging_positions_list = param_df_sub.index.get_indexer( usgs_segs ) usgs_df_sub = usgs_df.loc[usgs_segs] usgs_df_sub.drop( usgs_df_sub.columns[range(0, 1)], axis=1, inplace=True ) else: usgs_df_sub = pd.DataFrame() nudging_positions_list = [] last_obs_sub = pd.DataFrame() qlat_sub = qlats.loc[param_df_sub.index] q0_sub = q0.loc[param_df_sub.index] # At present, in by-subnetwork-jit/jit-clustered, these next two lines # only produce a dummy list, but... # Eventually, the wiring for reservoir simulation needs to be added. 
subn_reach_type_list = [0 for reaches in subn_reach_list] subn_reach_list_with_type = list( zip(subn_reach_list, subn_reach_type_list) ) jobs.append( delayed(compute_func)( nts, qts_subdivisions, subn_reach_list_with_type, subnetworks[subn_tw], param_df_sub.index.values, param_df_sub.columns.values, param_df_sub.values, q0_sub.values.astype("float32"), qlat_sub.values.astype("float32"), [], # lake_segs np.empty( shape=(0, 0), dtype="float64" ), # waterbodies_df_sub.values usgs_df_sub.values.astype("float32"), # flowveldepth_interorder, # obtain keys and values from this dataset np.array(nudging_positions_list, dtype="int32"), { us: fvd for us, fvd in flowveldepth_interorder.items() if us in offnetwork_upstreams }, assume_short_ts, return_courant, diffusive_parameters, ) ) results_subn[order] = parallel(jobs) if order > 0: # This is not needed for the last rank of subnetworks flowveldepth_interorder = {} for twi, subn_tw in enumerate(reaches_ordered_bysubntw[order]): # TODO: This index step is necessary because we sort the segment index # TODO: I think there are a number of ways we could remove the sorting step # -- the binary search could be replaced with an index based on the known topology flowveldepth_interorder[subn_tw] = {} subn_tw_sortposition = ( results_subn[order][twi][0].tolist().index(subn_tw) ) flowveldepth_interorder[subn_tw]["results"] = results_subn[ order ][twi][1][subn_tw_sortposition] # what will it take to get just the tw FVD values into an array to pass to the next loop? # There will be an empty array initialized at the top of the loop, then re-populated here. # we don't have to bother with populating it after the last group results = [] for order in subnetworks_only_ordered_jit: results.extend(results_subn[order]) if 1 == 1: print("PARALLEL TIME %s seconds." % (time.time() - start_para_time)) elif parallel_compute_method == "by-network": with Parallel(n_jobs=cpu_pool, backend="threading") as parallel: jobs = [] for twi, (tw, reach_list) in enumerate(reaches_bytw.items(), 1): # The X_sub lines use SEGS... # which is now invalid with the wbodies included. # So we define "common_segs" to identify regular routing segments # and wbodies_segs for the waterbody reaches/segments segs = list(chain.from_iterable(reach_list)) common_segs = param_df.index.intersection(segs) # Assumes everything else is a waterbody... 
wbodies_segs = set(segs).symmetric_difference(common_segs) # If waterbody parameters exist if not waterbodies_df.empty: lake_segs = list(waterbodies_df.index.intersection(segs)) waterbodies_df_sub = waterbodies_df.loc[ lake_segs, [ "LkArea", "LkMxE", "OrificeA", "OrificeC", "OrificeE", "WeirC", "WeirE", "WeirL", "ifd", "qd0", "h0", ], ] else: lake_segs = [] waterbodies_df_sub = pd.DataFrame() param_df_sub = param_df.loc[ common_segs, ["dt", "bw", "tw", "twcc", "dx", "n", "ncc", "cs", "s0", "alt"], ].sort_index() if not usgs_df.empty: usgs_segs = list(usgs_df.index.intersection(param_df_sub.index)) nudging_positions_list = param_df_sub.index.get_indexer(usgs_segs) usgs_df_sub = usgs_df.loc[usgs_segs] usgs_df_sub.drop( usgs_df_sub.columns[range(0, 1)], axis=1, inplace=True ) else: usgs_df_sub = pd.DataFrame() nudging_positions_list = [] last_obs_sub = pd.DataFrame() reaches_list_with_type = [] for reaches in reach_list: if set(reaches) & wbodies_segs: reach_type = 1 # type 1 for waterbody/lake else: reach_type = 0 # type 0 for reach reach_and_type_tuple = (reaches, reach_type) reaches_list_with_type.append(reach_and_type_tuple) # qlat_sub = qlats.loc[common_segs].sort_index() # q0_sub = q0.loc[common_segs].sort_index() qlat_sub = qlats.loc[param_df_sub.index] q0_sub = q0.loc[param_df_sub.index] param_df_sub = param_df_sub.reindex( param_df_sub.index.tolist() + lake_segs ).sort_index() qlat_sub = qlat_sub.reindex(param_df_sub.index) q0_sub = q0_sub.reindex(param_df_sub.index) jobs.append( delayed(compute_func)( nts, qts_subdivisions, reaches_list_with_type, independent_networks[tw], param_df_sub.index.values.astype("int64"), param_df_sub.columns.values, param_df_sub.values, q0_sub.values.astype("float32"), qlat_sub.values.astype("float32"), lake_segs, waterbodies_df_sub.values, usgs_df_sub.values.astype("float32"), np.array(nudging_positions_list, dtype="int32"), last_obs_sub.values.astype("float32"), {}, assume_short_ts, return_courant, diffusive_parameters, ) ) results = parallel(jobs) else: # Execute in serial results = [] for twi, (tw, reach_list) in enumerate(reaches_bytw.items(), 1): # The X_sub lines use SEGS... # which becomes invalid with the wbodies included. # So we define "common_segs" to identify regular routing segments # and wbodies_segs for the waterbody reaches/segments segs = list(chain.from_iterable(reach_list)) common_segs = param_df.index.intersection(segs) # Assumes everything else is a waterbody... 
wbodies_segs = set(segs).symmetric_difference(common_segs) # If waterbody parameters exist if not waterbodies_df.empty: lake_segs = list(waterbodies_df.index.intersection(segs)) waterbodies_df_sub = waterbodies_df.loc[ lake_segs, [ "LkArea", "LkMxE", "OrificeA", "OrificeC", "OrificeE", "WeirC", "WeirE", "WeirL", "ifd", "qd0", "h0", ], ] else: lake_segs = [] waterbodies_df_sub = pd.DataFrame() param_df_sub = param_df.loc[ common_segs, ["dt", "bw", "tw", "twcc", "dx", "n", "ncc", "cs", "s0", "alt"], ].sort_index() if not usgs_df.empty: usgs_segs = list(usgs_df.index.intersection(param_df_sub.index)) nudging_positions_list = param_df_sub.index.get_indexer(usgs_segs) usgs_df_sub = usgs_df.loc[usgs_segs] usgs_df_sub.drop(usgs_df_sub.columns[range(0, 1)], axis=1, inplace=True) else: usgs_df_sub = pd.DataFrame() nudging_positions_list = [] if not last_obs_df.empty: pass # lastobs_segs = list(last_obs_df.index.intersection(param_df_sub.index)) # nudging_positions_list = param_df_sub.index.get_indexer(lastobs_segs) # last_obs_sub = last_obs_df.loc[lastobs_segs] else: last_obs_sub = pd.DataFrame() # nudging_positions_list = [] # qlat_sub = qlats.loc[common_segs].sort_index() # q0_sub = q0.loc[common_segs].sort_index() qlat_sub = qlats.loc[param_df_sub.index] q0_sub = q0.loc[param_df_sub.index] param_df_sub = param_df_sub.reindex( param_df_sub.index.tolist() + lake_segs ).sort_index() qlat_sub = qlat_sub.reindex(param_df_sub.index) q0_sub = q0_sub.reindex(param_df_sub.index) reach_type_list = [ 1 if (set(reaches) & wbodies_segs) else 0 for reaches in reach_list ] reaches_list_with_type = list(zip(reach_list, reach_type_list)) """ reaches_list_with_type = [] for reaches in reach_list: if (set(reaches) & wbodies_segs): reach_type = 1 # type 1 for waterbody/lake else: reach_type = 0 # type 0 for reach reach_and_type_tuple = (reaches, reach_type) reaches_list_with_type.append(reach_and_type_tuple) """ results.append( compute_func( nts, qts_subdivisions, reaches_list_with_type, independent_networks[tw], param_df_sub.index.values.astype("int64"), param_df_sub.columns.values, param_df_sub.values, q0_sub.values.astype("float32"), qlat_sub.values.astype("float32"), lake_segs, waterbodies_df_sub.values, usgs_df_sub.values.astype("float32"), np.array(nudging_positions_list, dtype="int32"), last_obs_sub.values.astype("float32"), {}, assume_short_ts, return_courant, diffusive_parameters, ) ) return results
def compute_nhd_routing_v02(
    connections,
    rconn,
    reaches_bytw,
    compute_func,
    parallel_compute_method,
    subnetwork_target_size,
    cpu_pool,
    nts,
    qts_subdivisions,
    independent_networks,
    param_df,
    qlats,
    q0,
    assume_short_ts,
):
    start_time = time.time()

    if parallel_compute_method == "by-subnetwork-jit-clustered":
        networks_with_subnetworks_ordered_jit = nhd_network.build_subnetworks(
            connections, rconn, subnetwork_target_size
        )
        subnetworks_only_ordered_jit = defaultdict(dict)
        subnetworks = defaultdict(dict)
        for tw, ordered_network in networks_with_subnetworks_ordered_jit.items():
            intw = independent_networks[tw]
            for order, subnet_sets in ordered_network.items():
                subnetworks_only_ordered_jit[order].update(subnet_sets)
                for subn_tw, subnetwork in subnet_sets.items():
                    subnetworks[subn_tw] = {k: intw[k] for k in subnetwork}

        reaches_ordered_bysubntw = defaultdict(dict)
        for order, ordered_subn_dict in subnetworks_only_ordered_jit.items():
            for subn_tw, subnet in ordered_subn_dict.items():
                conn_subn = {k: connections[k] for k in subnet if k in connections}
                rconn_subn = {k: rconn[k] for k in subnet if k in rconn}
                path_func = partial(nhd_network.split_at_junction, rconn_subn)
                reaches_ordered_bysubntw[order][
                    subn_tw
                ] = nhd_network.dfs_decomposition(rconn_subn, path_func)

        cluster_threshold = 0.65
        # When a job has a total segment count 65% of the target size, compute it.
        # Otherwise, keep adding reaches.
        reaches_ordered_bysubntw_clustered = defaultdict(dict)
        for order in subnetworks_only_ordered_jit:
            cluster = 0
            reaches_ordered_bysubntw_clustered[order][cluster] = {
                "segs": [],
                "upstreams": {},
                "tw": [],
                "subn_reach_list": [],
            }
            for twi, (subn_tw, subn_reach_list) in enumerate(
                reaches_ordered_bysubntw[order].items(), 1
            ):
                segs = list(chain.from_iterable(subn_reach_list))
                reaches_ordered_bysubntw_clustered[order][cluster]["segs"].extend(segs)
                reaches_ordered_bysubntw_clustered[order][cluster]["upstreams"].update(
                    subnetworks[subn_tw]
                )
                reaches_ordered_bysubntw_clustered[order][cluster]["tw"].append(subn_tw)
                reaches_ordered_bysubntw_clustered[order][cluster][
                    "subn_reach_list"
                ].extend(subn_reach_list)

                if (
                    len(reaches_ordered_bysubntw_clustered[order][cluster]["segs"])
                    >= cluster_threshold * subnetwork_target_size
                ) and (
                    twi < len(reaches_ordered_bysubntw[order])
                    # i.e., we haven't reached the end
                    # TODO: perhaps this should be a while condition...
                ):
                    cluster += 1
                    reaches_ordered_bysubntw_clustered[order][cluster] = {
                        "segs": [],
                        "upstreams": {},
                        "tw": [],
                        "subn_reach_list": [],
                    }

        if 1 == 1:
            print("JIT Preprocessing time %s seconds." % (time.time() - start_time))
            print("starting Parallel JIT calculation")

        start_para_time = time.time()
        # if 1 == 1:
        with Parallel(n_jobs=cpu_pool, backend="threading") as parallel:
            results_subn = defaultdict(list)
            flowveldepth_interorder = {}

            for order in range(max(subnetworks_only_ordered_jit.keys()), -1, -1):
                jobs = []
                for cluster, clustered_subns in reaches_ordered_bysubntw_clustered[
                    order
                ].items():
                    segs = clustered_subns["segs"]
                    offnetwork_upstreams = set()
                    segs_set = set(segs)
                    for seg in segs:
                        for us in rconn[seg]:
                            if us not in segs_set:
                                offnetwork_upstreams.add(us)
                    segs.extend(offnetwork_upstreams)

                    param_df_sub = param_df.loc[
                        segs,
                        ["dt", "bw", "tw", "twcc", "dx", "n", "ncc", "cs", "s0"],
                    ].sort_index()

                    if order < max(subnetworks_only_ordered_jit.keys()):
                        for us_subn_tw in offnetwork_upstreams:
                            subn_tw_sortposition = param_df_sub.index.get_loc(
                                us_subn_tw
                            )
                            flowveldepth_interorder[us_subn_tw][
                                "position_index"
                            ] = subn_tw_sortposition

                    qlat_sub = qlats.loc[segs].sort_index()
                    q0_sub = q0.loc[segs].sort_index()

                    subn_reach_list = clustered_subns["subn_reach_list"]
                    upstreams = clustered_subns["upstreams"]

                    # results_subn[order].append(
                    #     compute_func(
                    jobs.append(
                        delayed(compute_func)(
                            nts,
                            qts_subdivisions,
                            subn_reach_list,
                            upstreams,
                            param_df_sub.index.values,
                            param_df_sub.columns.values,
                            param_df_sub.values,
                            qlat_sub.values,
                            q0_sub.values,
                            # flowveldepth_interorder,  # obtain keys and values from this dataset
                            {
                                us: fvd
                                for us, fvd in flowveldepth_interorder.items()
                                if us in offnetwork_upstreams
                            },
                            assume_short_ts,
                        )
                    )

                results_subn[order] = parallel(jobs)

                if order > 0:  # This is not needed for the last rank of subnetworks
                    flowveldepth_interorder = {}
                    for ci, (cluster, clustered_subns) in enumerate(
                        reaches_ordered_bysubntw_clustered[order].items()
                    ):
                        for subn_tw in clustered_subns["tw"]:
                            # TODO: This index step is necessary because we sort the segment index
                            # TODO: I think there are a number of ways we could remove the sorting step
                            #  -- the binary search could be replaced with an index based on the known topology
                            flowveldepth_interorder[subn_tw] = {}
                            subn_tw_sortposition = (
                                results_subn[order][ci][0].tolist().index(subn_tw)
                            )
                            flowveldepth_interorder[subn_tw]["results"] = results_subn[
                                order
                            ][ci][1][subn_tw_sortposition]
                            # what will it take to get just the tw FVD values into an array to pass to the next loop?
                            # There will be an empty array initialized at the top of the loop, then re-populated here.
                            # we don't have to bother with populating it after the last group

        results = []
        for order in subnetworks_only_ordered_jit:
            results.extend(results_subn[order])

        if 1 == 1:
            print("PARALLEL TIME %s seconds." % (time.time() - start_para_time))

    elif parallel_compute_method == "by-subnetwork-jit":
        networks_with_subnetworks_ordered_jit = nhd_network.build_subnetworks(
            connections, rconn, subnetwork_target_size
        )
        subnetworks_only_ordered_jit = defaultdict(dict)
        subnetworks = defaultdict(dict)
        for tw, ordered_network in networks_with_subnetworks_ordered_jit.items():
            intw = independent_networks[tw]
            for order, subnet_sets in ordered_network.items():
                subnetworks_only_ordered_jit[order].update(subnet_sets)
                for subn_tw, subnetwork in subnet_sets.items():
                    subnetworks[subn_tw] = {k: intw[k] for k in subnetwork}

        reaches_ordered_bysubntw = defaultdict(dict)
        for order, ordered_subn_dict in subnetworks_only_ordered_jit.items():
            for subn_tw, subnet in ordered_subn_dict.items():
                conn_subn = {k: connections[k] for k in subnet if k in connections}
                rconn_subn = {k: rconn[k] for k in subnet if k in rconn}
                path_func = partial(nhd_network.split_at_junction, rconn_subn)
                reaches_ordered_bysubntw[order][
                    subn_tw
                ] = nhd_network.dfs_decomposition(rconn_subn, path_func)

        if 1 == 1:
            print("JIT Preprocessing time %s seconds." % (time.time() - start_time))
            print("starting Parallel JIT calculation")

        start_para_time = time.time()
        with Parallel(n_jobs=cpu_pool, backend="threading") as parallel:
            results_subn = defaultdict(list)
            flowveldepth_interorder = {}

            for order in range(max(subnetworks_only_ordered_jit.keys()), -1, -1):
                jobs = []
                for twi, (subn_tw, subn_reach_list) in enumerate(
                    reaches_ordered_bysubntw[order].items(), 1
                ):
                    # TODO: Confirm that a list here is best -- we are sorting,
                    # so a set might be sufficient/better
                    segs = list(chain.from_iterable(subn_reach_list))
                    offnetwork_upstreams = set()
                    segs_set = set(segs)
                    for seg in segs:
                        for us in rconn[seg]:
                            if us not in segs_set:
                                offnetwork_upstreams.add(us)
                    segs.extend(offnetwork_upstreams)

                    param_df_sub = param_df.loc[
                        segs,
                        ["dt", "bw", "tw", "twcc", "dx", "n", "ncc", "cs", "s0"],
                    ].sort_index()

                    if order < max(subnetworks_only_ordered_jit.keys()):
                        for us_subn_tw in offnetwork_upstreams:
                            subn_tw_sortposition = param_df_sub.index.get_loc(
                                us_subn_tw
                            )
                            flowveldepth_interorder[us_subn_tw][
                                "position_index"
                            ] = subn_tw_sortposition

                    qlat_sub = qlats.loc[segs].sort_index()
                    q0_sub = q0.loc[segs].sort_index()

                    jobs.append(
                        delayed(compute_func)(
                            nts,
                            qts_subdivisions,
                            subn_reach_list,
                            subnetworks[subn_tw],
                            param_df_sub.index.values,
                            param_df_sub.columns.values,
                            param_df_sub.values,
                            qlat_sub.values,
                            q0_sub.values,
                            # flowveldepth_interorder,  # obtain keys and values from this dataset
                            {
                                us: fvd
                                for us, fvd in flowveldepth_interorder.items()
                                if us in offnetwork_upstreams
                            },
                            assume_short_ts,
                        )
                    )

                results_subn[order] = parallel(jobs)

                if order > 0:  # This is not needed for the last rank of subnetworks
                    flowveldepth_interorder = {}
                    for twi, subn_tw in enumerate(reaches_ordered_bysubntw[order]):
                        # TODO: This index step is necessary because we sort the segment index
                        # TODO: I think there are a number of ways we could remove the sorting step
                        #  -- the binary search could be replaced with an index based on the known topology
                        flowveldepth_interorder[subn_tw] = {}
                        subn_tw_sortposition = (
                            results_subn[order][twi][0].tolist().index(subn_tw)
                        )
                        flowveldepth_interorder[subn_tw]["results"] = results_subn[
                            order
                        ][twi][1][subn_tw_sortposition]
                        # what will it take to get just the tw FVD values into an array to pass to the next loop?
                        # There will be an empty array initialized at the top of the loop, then re-populated here.
                        # we don't have to bother with populating it after the last group

        results = []
        for order in subnetworks_only_ordered_jit:
            results.extend(results_subn[order])

        if 1 == 1:
            print("PARALLEL TIME %s seconds." % (time.time() - start_para_time))

    elif parallel_compute_method == "by-network":
        with Parallel(n_jobs=cpu_pool, backend="threading") as parallel:
            jobs = []
            for twi, (tw, reach_list) in enumerate(reaches_bytw.items(), 1):
                segs = list(chain.from_iterable(reach_list))
                param_df_sub = param_df.loc[
                    segs, ["dt", "bw", "tw", "twcc", "dx", "n", "ncc", "cs", "s0"]
                ].sort_index()
                qlat_sub = qlats.loc[segs].sort_index()
                q0_sub = q0.loc[segs].sort_index()
                jobs.append(
                    delayed(compute_func)(
                        nts,
                        qts_subdivisions,
                        reach_list,
                        independent_networks[tw],
                        param_df_sub.index.values,
                        param_df_sub.columns.values,
                        param_df_sub.values,
                        qlat_sub.values,
                        q0_sub.values,
                        {},
                        assume_short_ts,
                    )
                )
            results = parallel(jobs)

    else:  # Execute in serial
        results = []
        for twi, (tw, reach_list) in enumerate(reaches_bytw.items(), 1):
            segs = list(chain.from_iterable(reach_list))
            param_df_sub = param_df.loc[
                segs, ["dt", "bw", "tw", "twcc", "dx", "n", "ncc", "cs", "s0"]
            ].sort_index()
            qlat_sub = qlats.loc[segs].sort_index()
            q0_sub = q0.loc[segs].sort_index()
            results.append(
                compute_func(
                    nts,
                    qts_subdivisions,
                    reach_list,
                    independent_networks[tw],
                    param_df_sub.index.values,
                    param_df_sub.columns.values,
                    param_df_sub.values,
                    qlat_sub.values,
                    q0_sub.values,
                    {},
                    assume_short_ts,
                )
            )

    return results
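# Note on dispatch in the function above: parallel_compute_method selects the
# work decomposition. "by-subnetwork-jit-clustered" groups ordered subnetworks
# into clusters of roughly subnetwork_target_size segments before submitting
# jobs; "by-subnetwork-jit" submits one job per subnetwork and hands tailwater
# flow/velocity/depth values downstream between orders via
# flowveldepth_interorder; "by-network" submits one job per independent
# network; any other value falls through to serial execution over reaches_bytw.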