def load_separate_model_results(results_dir: str = BUILDING_RESULTS_DIR,
                                exp_datetime: datetime.datetime = None) -> Dict:
    all_experiments = os.listdir(results_dir)
    if exp_datetime is None:
        # no datetime given - take the most recently created experiment directory
        paths = [os.path.join(results_dir, experiment) for experiment in all_experiments]
        exp_dir = max(paths, key=os.path.getctime)
    else:
        date_str = exp_datetime.isoformat(' ', 'seconds')
        exp_dir = os.path.join(results_dir, date_str)
        if not os.path.exists(exp_dir):
            logging.warning(f"Experiment path {exp_dir} doesn't exist")
            return {'model_results': []}

    print(f"loading from {exp_dir}")
    all_files = os.listdir(exp_dir)

    chunk_size = 10  # TODO: ugly because pickles are large (1800 models take ~2 hours to load)

    def load_chunk(i):  # currently unused; superseded by load_file below
        all_results = []
        for f in all_files[min(len(all_files), i * chunk_size):
                           min(len(all_files), (i + 1) * chunk_size)]:
            with open(os.path.join(exp_dir, f, 'results.pickle'), 'rb') as file:
                res = pickle.load(file)['kfold_results']
                # TODO: we might want it in the future
                # del res['X_df']
                # del res['y']
                all_results.append(res)
        return all_results

    def load_file(fpath):
        with open(os.path.join(exp_dir, fpath, 'results.pickle'), 'rb') as file:
            res = pickle.load(file)['kfold_results']
            # TODO: we might want it in the future
            # del res['X_df']
            # del res['y']
            return res

    print(f"number of files is {len(all_files)}")
    all_results = parmap(load_file, all_files, chunk_size=5, use_tqdm=True,
                         desc="Opening all model results", unit="model")
    return {'model_results': all_results}
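
# Hypothetical usage sketch (not part of the pipeline): load either the most recent
# experiment under BUILDING_RESULTS_DIR, or a specific run addressed by the directory
# named after its start datetime. The timestamp below is illustrative.
def _example_load_separate_model_results():
    import datetime
    latest = load_separate_model_results()  # newest experiment directory
    specific = load_separate_model_results(
        exp_datetime=datetime.datetime(2020, 1, 1, 12, 0, 0))  # illustrative run time
    print(len(latest['model_results']), len(specific['model_results']))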
def transform(self, input_gs: GeoSeries, use_cache: bool = True) -> pd.DataFrame:
    """
    Extract the desired features for the desired geometries.

    Args:
        input_gs: a GeoSeries with the desired geometries
        use_cache: if set and self.cache_table is filled, load/save the features from/to the cache

    Returns:
        a pandas DataFrame with the features as columns and the geometries of input_gs as rows
    """
    assert len(input_gs.apply(lambda p: p.wkt).unique()) == len(input_gs), \
        "Shouldn't have duplicate geometries in transform"
    required_feats, loaded_feats_dfs = self.features, []
    if use_cache:
        logging.debug(f"Starting load from cache for {len(input_gs)} objects")
        required_feats, loaded_feats_dfs = self.load_from_cache(self.features, input_gs)
        if len(required_feats) == 0:
            logging.debug("loaded all from cache!")
            return pd.concat(loaded_feats_dfs, axis=1)  # append by column
        else:
            logging.debug(f"loaded from cache {len(loaded_feats_dfs)}/{len(self.features)}")
    else:
        logging.debug("Not loading from cache")

    feature_factory = PostgresFeatureFactory(required_feats, input_gs=input_gs)
    with feature_factory:
        features_gs_list = parmap(lambda feature: feature.extract(input_gs),
                                  feature_factory.features,
                                  use_tqdm=True,
                                  desc=f"Calculating Features for {len(input_gs)} geoms",
                                  unit='feature',
                                  leave=False)  # TODO: if want, extract_object_set

    all_features_df = pd.concat(features_gs_list + loaded_feats_dfs, axis=1)[self.all_feat_names]

    if self.cache_table and use_cache:
        calculated_features_df = pd.concat(features_gs_list, axis=1)
        save_features_to_db(input_gs, calculated_features_df, self.cache_table)

    return all_features_df
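
# Hypothetical usage sketch: run a small GeoSeries of points through an already-constructed
# FeaturesBuilder. The point coordinates are illustrative; the builder's feature list and
# cache configuration are assumed to be set up elsewhere.
def _example_features_builder_transform(builder: 'FeaturesBuilder') -> pd.DataFrame:
    from geopandas import GeoSeries
    from shapely.geometry import Point
    gs = GeoSeries([Point(34.8, 32.1), Point(34.9, 32.2)])
    # one row per geometry, one column per feature
    return builder.transform(gs, use_cache=True)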
def fit_all_models(self, x: pd.DataFrame, y_true, cv=None):
    """Fit every model in self.models_dict in parallel and return the fitted models."""
    # trained_models_and_scores = parmap(lambda model: model.fit(x, y_true, cv=cv), self.models_dict.items(),
    #                                    use_tqdm=True, desc="Fitting Models", unit="model")
    # y_true_soft = self.get_soft_labels(gpd.GeoSeries(data=x.index.values), radius=TRUE_POSITIVE_RADIUS,
    #                                    cache_dir=DISTANCE_CACHE_DIR)
    # y_true_soft = y_true  # TODO: delete this and uncomment last row

    def fit_model(model):
        if cv is not None:  # TODO: is this really needed?
            model.fit(x, y_true, cv=cv)
        else:
            model.fit(x, y_true)
        return model

    models = parmap(fit_model, self.models_dict.values(), use_tqdm=True,
                    desc="Fitting Models", unit="model", nprocs=32)
    # for name, model in tqdm(self.models_dict.items(), desc="Fitting Models", unit="model"):
    return models
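
# Hypothetical usage sketch: fit all models held by an ensemble-like object that exposes
# fit_all_models. The `ensemble`, `X_df`, and `y` names are illustrative assumptions.
def _example_fit_all_models(ensemble, X_df: pd.DataFrame, y):
    fitted_with_cv = ensemble.fit_all_models(X_df, y, cv=5)  # each model gets fit(x, y, cv=5)
    fitted_plain = ensemble.fit_all_models(X_df, y)          # each model gets fit(x, y)
    return fitted_with_cv, fitted_plain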
def test_building_to_CLSTR_multiple_pipe_runs_and_returns_same(self):
    # TODO: weird - if this runs second, "pyproj database disk image is malformed" errors arise;
    #  it also sometimes fails when run in parallel with pytest
    neighb_init_states, poly_feat_builder = self.get_setup_variables()
    res = parmap(
        lambda b: building_to_CLSTR(
            b.hull,
            poly_feat_builder,
            _ClosestToSpecificArea(),
            # partial(hill_climbing, iterations_limit=2)),
            partial(beam, beam_size=50, iterations_limit=3)),
        neighb_init_states)  # , nprocs=1
    print(*[r[1] for r in res])
    self.assertSetEqual({r[1] for r in res}, {res[0][1]})
    for init_s, (bc, v) in zip(neighb_init_states, res):
        self.assertIsInstance(bc, BuildingCluster)
        self.assertIsInstance(v, float)
def test_parmap_does_calculation_correctly_with_chunks(self):
    res = parmap(f, range(500), chunk_size=5)
    self.assertListEqual(res, [x + 1 for x in range(500)])
def test_parmap_does_calculation_correctly(self):
    res = parmap(f, range(500))
    self.assertListEqual(res, [x + 1 for x in range(500)])
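
# The two tests above rely on a module-level helper `f`; judging from the assertions it
# presumably just adds one to its argument, and it has to live at module level so the
# multiprocessing workers behind parmap can pickle it. A minimal sketch:
#
#   def f(x):
#       return x + 1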
def train_predict_on_split(task, building_gs, buildings_y, source_indices, geos, y,
                           source_train_indices, source_test_indices):
    building_train_indices = np.isin(source_indices, source_train_indices)
    building_test_indices = np.isin(source_indices, source_test_indices)

    # fetch train-set and fit
    buildings_train_gs = building_gs.iloc[building_train_indices].reset_index(drop=True)
    y_train_buildings = buildings_y[building_train_indices]
    buildings_test_gs = building_gs.iloc[building_test_indices].reset_index(drop=True)
    y_test_buildings = buildings_y[building_test_indices]

    train_true_geos = geos[np.isin(range(len(geos)), source_train_indices) & y]  # train-test in CLSTRs
    test_true_geos = geos[np.isin(range(len(geos)), source_test_indices) & y]  # train-test in CLSTRs

    fpb = task.embedder  # feature extractor for polygons
    # add the building scores feature
    train_hash = hash_geoseries(geos[source_train_indices])
    fpb.features += [
        BuildingScores(SCORES_TABLE, BUILDING_EXPERIMENT_NAME,
                       'BalancedRF1000',  # TODO: doesn't match current MetaModel naming
                       train_geom_hash=train_hash,
                       radius=radius)
        for radius in [0, 25]
    ]

    heuristic_guiding_model = BaselineModel()
    heuristic_guiding_model.fit(task.transform(train_true_geos))
    # for i in trange(5, desc="Training CLSTR heuristic"):
    #     potential_CLSTRs_test = parmap(lambda b: building_to_CLSTR(b, fpb, heuristic_guiding_model),
    #                                    random.sample(buildings_train_gs[y_train]), use_tqdm=True,
    #                                    desc="Calculating potential CLSTRs")
    #     # heuristic_guiding_model = OneClassSVM()
    #     heuristic_guiding_model.fit(task.transform(train_true_geos))

    # TODO: make a smarter choice of which buildings to start from?
    score_extractor = FeaturesBuilder([
        BuildingScores(SCORES_TABLE, BUILDING_EXPERIMENT_NAME, 'BalancedRF1000',
                       radius=0, train_geom_hash=train_hash)
    ])
    building_scores_sorted = score_extractor.transform(
        buildings_test_gs)['building_scores_avg_0m'].sort_values(ascending=False)
    building_scores = pd.Series(
        index=buildings_test_gs.iloc[building_scores_sorted.index],
        data=building_scores_sorted.values)
    # building_scores = gpd.GeoDataFrame(
    #     zip(buildings_test_gs, np.random.random(len(buildings_test_gs))),
    #     columns=['geometry', 'score'], geometry='geometry').set_index('geometry')

    # TODO: make a smarter choice of which buildings to start from;
    #  currently a random sample of 500 out of the top-scoring 1000
    best_test_buildings_with_scores = building_scores.iloc[random.sample(range(1000), 500)]
    potential_CLSTRs_test = parmap(
        lambda b: building_to_CLSTR(b, fpb, heuristic_guiding_model,
                                    partial(beam, beam_size=15, iterations_limit=15)),
        best_test_buildings_with_scores.index,
        use_tqdm=True, desc="Calculating potential CLSTRs",
        keep_child_tqdm=True, nprocs=16)

    # TODO: postprocessing - which CLSTRs to return; related to how they fit together
    print([p[1] for p in potential_CLSTRs_test])
    print([len(p[0].buildings) for p in potential_CLSTRs_test])
    sorted_potential_CLSTRs_test = list(
        sorted(potential_CLSTRs_test, key=lambda p: p[1], reverse=True))
    # TODO: choose intelligently, depending on pluga, etc.
    best_potential_CLSTRs_test = pd.Series(
        index=[p[0].hull for p in sorted_potential_CLSTRs_test],
        data=MinMaxScaler().fit_transform(
            [[p[1]] for p in sorted_potential_CLSTRs_test])[:, 0])  # normalize scores, IMPORTANT
    print(best_potential_CLSTRs_test)

    return (building_scores, geos.iloc[source_train_indices], y_train_buildings,
            geos.iloc[source_test_indices], test_true_geos, y_test_buildings,
            best_potential_CLSTRs_test)
def _create_intersection_table(
        self, geom_table_name: str,
        eng: sa.engine.Engine) -> Dict[float, Dict[str, str]]:
    """
    Create temporary intersection tables, to be used later by all the sub-features.
    The tables are created using self.table_filter_dict and the features' radii.

    Args:
        geom_table_name: name of the geometries table to intersect with
        eng: SQLAlchemy engine

    Returns:
        The name of the temporary intersection table for each (radius, original table) pair
    """
    radius_table_to_tmp_table_names = {}
    all_tpls = []
    for table, filters_dict in self.table_filter_dict.items():
        all_radii = self.table_radii[table]
        for radius in all_radii:
            all_tpls.append((table, filters_dict[radius], radius))

    def calc_intersect(tpl_idx, table, filters_dict, radius):
        # each worker opens its own connection instead of sharing the outer engine
        eng = get_connection('POSTGRES')
        filters_columns_sql = ',\n'.join([
            f"CASE WHEN {filter_sql} THEN 1 ELSE 0 end as {filter_name}"
            for filter_name, filter_sql in filters_dict.items()
        ])
        filters_sql = ' or '.join(filters_dict.values())
        tbl_name = f"{get_temp_table_name()}{tpl_idx}"

        # add height columns if they exist in the source table
        height_exists = column_exists('height', table, eng)
        inner_height_sql = "height, absolute_ground_surface_height," if height_exists else ""
        outer_height_sql = """t.height as height,
                              t.absolute_ground_surface_height as ground_height,
                              t.absolute_ground_surface_height + t.height as absolute_height,""" \
            if height_exists else ""
        query = f"""
            create UNLOGGED TABLE {tbl_name} as
            select 1.0 as coverage,
                   {outer_height_sql}
                   q.geom_id as geom_id,
                   q.geom as q_geom,
                   t.geom as t_geom,
                   Geography(t.geom) as t_geog,
                   {', '.join(filters_dict.keys())}
            from {geom_table_name} q
                JOIN (select way as geom, {inner_height_sql} {filters_columns_sql}
                      from {table}
                      WHERE {filters_sql}) t
                ON ST_DWithin(t.geom, q.geom, {radius}, true)
            """
        eng.execute(query)
        add_postgis_index(eng, tbl_name, 'q_geom')
        add_postgis_index(eng, tbl_name, 't_geom')
        add_postgis_index(eng, tbl_name, 't_geog')
        eng.dispose()
        return radius, table, tbl_name

    res = parmap(lambda p: calc_intersect(p[0], *p[1]), list(enumerate(all_tpls)),
                 use_tqdm=True, desc="Calculating intersection", unit="table", leave=False)
    for radius, table, tbl_name in res:
        radius_table_to_tmp_table_names.setdefault(radius, {}).update({table: tbl_name})
    return radius_table_to_tmp_table_names
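
# Hypothetical usage sketch: the returned mapping is keyed by radius and then by source
# table, so a sub-feature can look up the temporary intersection table it should query.
# The radius value and 'planet_osm_polygon' table name below are illustrative assumptions.
def _example_lookup_intersection_table(feature, geom_table_name: str,
                                       eng: sa.engine.Engine) -> str:
    tmp_tables = feature._create_intersection_table(geom_table_name, eng)
    # temp table of source rows within 25m of the query geometries, per the illustrative names
    return tmp_tables[25]['planet_osm_polygon']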