def _classify_imgs(self, img_filepaths, clf, output_dir):
    pred_imgs_lazy = []
    pred_img_filepaths = []
    for img_filepath in img_filepaths:
        # filename, ext = path.splitext(path.basename(img_filepath))
        # pred_img_filepath = path.join(
        #     output_dir, f"{filename}-pred{ext}")
        pred_img_filepath = path.join(output_dir,
                                      path.basename(img_filepath))
        pred_imgs_lazy.append(
            dask.delayed(self.classify_img)(img_filepath, clf,
                                            pred_img_filepath))
        pred_img_filepaths.append(pred_img_filepath)

    with diagnostics.ProgressBar():
        dask.compute(*pred_imgs_lazy)

    return pred_img_filepaths
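# The method above follows a general dask pattern: build one `dask.delayed`
# task per input file, then trigger them all with a single `dask.compute`
# under a `ProgressBar`. A minimal self-contained sketch of that pattern
# (the `_double` function is made up for illustration):
import dask
from dask import diagnostics


def _double(x):
    return 2 * x


lazy_results = [dask.delayed(_double)(i) for i in range(10)]
with diagnostics.ProgressBar():
    results = dask.compute(*lazy_results)  # runs the tasks in parallel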
def make_confusion_df(lidar_gdf, lidar_raw_dir, split_df=None,
                      img_filepaths=None, n=None, frac=0.05, clf=None,
                      clf_dict=None):
    c = dtr.Classifier()
    truth_pred_lazy = []
    if clf is not None:
        if split_df is None:
            num_validation_tiles = int(frac * len(img_filepaths))
            # sample validation tiles without replacement so that no tile
            # is evaluated twice
            test_filepaths = random.sample(img_filepaths,
                                           num_validation_tiles)
        else:
            test_filepaths = _get_validation_df(split_df, n,
                                                frac)["img_filepath"]
        for img_filepath in test_filepaths:
            truth_pred_lazy.append(
                dask.delayed(_inner_loop)(img_filepath, lidar_gdf,
                                          lidar_raw_dir, c, clf))
    else:
        validation_df = _get_validation_df(split_df, n, frac)
        for img_cluster, cluster_df in validation_df.groupby("img_cluster"):
            clf = clf_dict[img_cluster]
            for img_filepath in cluster_df["img_filepath"]:
                truth_pred_lazy.append(
                    dask.delayed(_inner_loop)(img_filepath, lidar_gdf,
                                              lidar_raw_dir, c, clf))

    with diagnostics.ProgressBar():
        truth_pred = np.hstack(dask.compute(*truth_pred_lazy))

    truth_ser = pd.Series(truth_pred[0], name="actual")
    pred_ser = pd.Series(truth_pred[1], name="predicted")
    return pd.crosstab(truth_ser, pred_ser) / len(truth_ser)
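# Dividing the `pd.crosstab` result by the number of samples, as above,
# yields per-cell proportions; `normalize="all"` is the equivalent
# built-in. A self-contained sketch with toy labels:
import pandas as pd

actual = pd.Series([1, 0, 1, 1, 0], name="actual")
predicted = pd.Series([1, 0, 0, 1, 0], name="predicted")
conf_df = pd.crosstab(actual, predicted, normalize="all")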
def descr_feature_matrix(self):
    try:
        return self._descr_feature_matrix
    except AttributeError:
        kernels = filters.get_gabor_filter_bank(
            frequencies=self.gabor_frequencies,
            num_orientations=self.gabor_num_orientations)
        # num_blocks = self.response_bins_per_axis**2
        # feature_rows = [
        #     TrainingSelector._get_image_descr(
        #         img_filepath, kernels, self.response_bins_per_axis,
        #         num_blocks, self.num_color_bins)
        #     for img_filepath in self.img_filepaths
        # ]
        values = [
            dask.delayed(
                image_descriptor.compute_image_descriptor_from_filepath)(
                    img_filepath, kernels, self.response_bins_per_axis,
                    self.num_color_bins)
            for img_filepath in self.img_filepaths
        ]

        with diagnostics.ProgressBar():
            feature_rows = dask.compute(*values)

        self._descr_feature_matrix = np.vstack(feature_rows)

        # TODO: cache as an instance attribute (or even use a property and
        # pass this method's arguments to `__init__`), and then let people
        # interactively choose the number of PCA components until they're
        # happy with the represented variance? I vote yes.
        # TODO: cache this (via persistence): if `img_filepaths` and the
        # technical parameters coincide, load from a file instead of
        # recomputing it
        # TODO: return a copy?
        return self._descr_feature_matrix
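# The try/except idiom above lazily computes the matrix on first access and
# caches it on the instance; `functools.cached_property` (Python >= 3.8)
# expresses the same thing more compactly, which the first TODO hints at.
# A minimal sketch with a made-up class:
from functools import cached_property

import numpy as np


class FeatureHolder:
    @cached_property
    def descr_feature_matrix(self):
        # computed once, then stored on the instance
        return np.zeros((4, 4))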
def train_classifiers(self, split_df, response_img_dir):
    """
    Train a classifier for each first-level cluster in `split_df`. See the
    `background <https://bit.ly/2KlCICO>`_ example notebook for more
    details.

    Parameters
    ----------
    split_df : pandas DataFrame
        Data frame with the train/test split, which must have an
        `img_cluster` column with the first-level cluster labels.
    response_img_dir : str representing path to a directory
        Path to the directory where the response tiles are located.

    Returns
    -------
    clf_dict : dictionary
        Dictionary mapping each first-level cluster label to its trained
        scikit-learn AdaBoostClassifier.
    """
    if 'img_cluster' not in split_df:
        raise ValueError(
            "`split_df` must have an 'img_cluster' column ('cluster-II'). "
            "For 'cluster-I', use `train_classifier`.")

    clfs_lazy = {}
    for img_cluster, _ in split_df.groupby('img_cluster'):
        clfs_lazy[img_cluster] = dask.delayed(self.train_classifier)(
            split_df=split_df, response_img_dir=response_img_dir,
            method='cluster-II', img_cluster=img_cluster)

    with diagnostics.ProgressBar():
        clfs_dict = dask.compute(clfs_lazy)[0]

    return clfs_dict
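# `dask.compute` traverses containers: passing a dict of delayed values
# returns a one-element tuple holding a dict of concrete results, hence the
# `[0]` above. A self-contained sketch:
import dask


def _square(x):
    return x ** 2


lazy = {k: dask.delayed(_square)(k) for k in range(3)}
results = dask.compute(lazy)[0]  # {0: 0, 1: 1, 2: 4}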
def convert(data, fname_data, df_artefacts=None, fname_uncorrected=None):
    """Convert TIFF files from a 2p dataset to HDF5.

    Optionally create an artefact-removed dataset.
    """
    # Important: the code expects no chunking in z, y, x -- need to have -1
    # for these dimensions.
    # 64 frames will be processed together for artefact removal.
    data = data.rechunk((64, -1, -1, -1))

    with diagnostics.ProgressBar():
        if df_artefacts is None:
            logger.info('Writing data to %s', fname_data)
            unlink(fname_data)
            os.makedirs(fname_data.parent, exist_ok=True)
            data.to_hdf5(fname_data, HDF5_KEY)
        else:
            # This writes 2 hdf5 files, where the 2nd one depends on the
            # same data being written to the first. Ideally, both would be
            # written simultaneously, but that cannot be done using dask.
            # Instead, the 1st file is written and then read back to write
            # the 2nd one.
            logger.info('Writing uncorrected data to %s', fname_uncorrected)
            unlink(fname_uncorrected)
            os.makedirs(fname_uncorrected.parent, exist_ok=True)
            data.to_hdf5(fname_uncorrected, HDF5_KEY)

            logger.info('Writing corrected data to %s', fname_data)
            with h5py.File(fname_uncorrected, 'r') as hfile:
                arr = da.from_array(hfile[HDF5_KEY])
                # Depth of 1 in the first coordinate means to bring in the
                # frames before and after the chunk -- needed for doing
                # diffs.
                depth = (1, 0, 0, 0)
                data_corrected = arr.map_overlap(remove_artefacts,
                                                 depth=depth,
                                                 dtype=data.dtype,
                                                 df=df_artefacts,
                                                 mydepth=depth)
                unlink(fname_data)
                os.makedirs(fname_data.parent, exist_ok=True)
                data_corrected.to_hdf5(fname_data, HDF5_KEY)
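# `map_overlap` with depth=(1, 0, 0, 0) hands each chunk one extra frame of
# context on either side of the first (time) axis, which frame-to-frame
# diffs need; dask trims the halo off the result afterwards. A minimal
# self-contained sketch (the `frame_diff` function is made up):
import dask.array as da
import numpy as np


def frame_diff(block):
    # same shape in, same shape out; the one-frame halo is trimmed by dask
    return np.diff(block, axis=0, prepend=block[:1])


arr = da.random.random((128, 4, 32, 32), chunks=(64, -1, -1, -1))
out = arr.map_overlap(frame_diff, depth=(1, 0, 0, 0), dtype=arr.dtype)
out.compute()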
map_output = line_bag.map(line_to_words).flatten()
map_output

# In[11]:

# we cheat a bit for the reduce step
reduce_output = map_output.frequencies()
top10 = reduce_output.topk(10, lambda x: x[1])
bot10 = reduce_output.topk(10, lambda x: -x[1])

# In[12]:

import dask.diagnostics as diag

with diag.ProgressBar(), diag.Profiler() as prof, diag.ResourceProfiler(
        0.5) as rprof:
    print("Top 10\n", top10.compute(num_workers=4))
    print("Bottom 10\n", bot10.compute(num_workers=4))

# In[13]:

diag.visualize([prof, rprof])

# # Hadoop
#
# Hadoop is the open-source implementation of MapReduce, developed at Yahoo
# and released as an Apache project. It provides the underlying
# infrastructure and a distributed filesystem that handle storing and
# distributing the data, so that each machine stores part of the data
# locally and processing jobs run where the data are stored.
# - Non-local data is copied over the network.
# - Storage is automatically expanded along with processing power.
# - It's how Amazon, Microsoft, Yahoo, Facebook, ... deal with exabytes of
#   data.
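# In[ ]:

# The word-count cells above assume `line_bag` and `line_to_words` were
# defined earlier in the notebook. A minimal self-contained version of the
# same map/reduce pipeline (the filename pattern is made up):
import dask.bag as db


def line_to_words(line):
    return line.lower().split()


line_bag = db.read_text("data/*.txt")  # one bag element per line
word_counts = line_bag.map(line_to_words).flatten().frequencies()
print(word_counts.topk(10, lambda x: x[1]).compute())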
def simulate_scenario_t_da(scenario_lulc_da, biophysical_table_filepath,
                           ref_et_raster_filepath, t_ref, uhi_max,
                           ucm_params, dst_t_dtype='float32', rio_meta=None,
                           cc_method='factors'):
    if rio_meta is None:
        x = scenario_lulc_da['x'].values
        y = scenario_lulc_da['y'].values
        west = x[0]
        north = y[0]
        # TODO: does the method to get the transform work for all grids,
        # i.e., regardless of whether the origin is in the upper left or
        # lower left?
        rio_meta = dict(driver='GTiff', dtype=scenario_lulc_da.dtype,
                        nodata=scenario_lulc_da.attrs['nodata'],
                        width=len(x), height=len(y), count=1,
                        crs=scenario_lulc_da.attrs['pyproj_srs'],
                        transform=transform.from_origin(
                            west, north, x[1] - west, north - y[1]))

    # define the function here so that the fixed arguments are curried
    def _t_from_lulc(lulc_arr):
        with tempfile.TemporaryDirectory() as tmp_dir:
            lulc_raster_filepath = path.join(tmp_dir, 'lulc.tif')
            with rio.open(lulc_raster_filepath, 'w', **rio_meta) as dst:
                dst.write(lulc_arr, 1)
            ucm_wrapper = iuc.UCMWrapper(
                lulc_raster_filepath, biophysical_table_filepath, cc_method,
                ref_et_raster_filepath, t_ref, uhi_max,
                extra_ucm_args=ucm_params, workspace_dir=tmp_dir)
            return ucm_wrapper.predict_t_arr(0)

    scenario_t_da = xr.DataArray(
        dims=scenario_lulc_da.dims, coords=scenario_lulc_da.coords,
        attrs=dict(nodata=np.nan,
                   pyproj_srs=scenario_lulc_da.attrs['pyproj_srs']))

    change_nums = scenario_t_da['change_num'].values
    scenario_runs = scenario_t_da.coords.get('scenario_run', None)

    def _simulate_and_repeat(change_num):
        # simulate once and repeat it for all scenario runs
        lulc_da = scenario_lulc_da.sel(change_num=change_num)
        if scenario_runs is not None:
            t_arr = _t_from_lulc(lulc_da.isel(scenario_run=0))
            t_arr = np.array(
                [t_arr for scenario_run in scenario_t_da['scenario_run']],
                dtype=dst_t_dtype)
        else:
            t_arr = _t_from_lulc(lulc_da)
        return t_arr

    if change_nums[0] == 0:
        scenario_t_da.loc[dict(change_num=0)] = _simulate_and_repeat(0)
        change_nums = change_nums[1:]

    scenario_dims = scenario_lulc_da.dims[:-2]
    stacked_da = scenario_lulc_da.sel(change_num=change_nums).stack(
        scenario=scenario_dims).transpose('scenario', 'y', 'x')
    with diagnostics.ProgressBar():
        scenario_t_da.loc[dict(change_num=change_nums)] = xr.DataArray(
            np.array(
                dask.compute(*[
                    dask.delayed(_t_from_lulc)(_scenario_lulc_da)
                    for _scenario_lulc_da in stacked_da
                ], scheduler='processes')).astype(dst_t_dtype),
            dims=stacked_da.dims,
            coords={dim: stacked_da.coords[dim]
                    for dim in stacked_da.dims},
            attrs=dict(dtype=dst_t_dtype)).unstack(dim='scenario').transpose(
                *scenario_dims, 'y', 'x')

    # replace nodata values: the UCM/InVEST model uses minus infinity as
    # nodata, so any temperature below absolute zero (-273.15 °C), which is
    # physically impossible, can safely be treated as nodata
    return scenario_t_da.where(scenario_t_da > -273.15, np.nan)
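# The stack/unstack round trip above flattens every non-spatial dimension
# into a single "scenario" dimension so that each 2D LULC slice can be
# simulated in its own dask task, then restores the original shape. A
# self-contained sketch of that pattern with toy dimensions:
import numpy as np
import xarray as xr

da_ = xr.DataArray(np.zeros((2, 3, 4, 4)),
                   dims=('change_num', 'scenario_run', 'y', 'x'))
stacked = da_.stack(scenario=('change_num', 'scenario_run')).transpose(
    'scenario', 'y', 'x')  # shape (6, 4, 4): one 2D slice per task
unstacked = stacked.unstack('scenario').transpose('change_num',
                                                  'scenario_run', 'y', 'x')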
def build_features(self, split_df=None, img_filepaths=None, img_dir=None,
                   img_filename_pattern=None, method=None,
                   img_cluster=None):
    """
    Build the pixel features for a list of images.

    Parameters
    ----------
    split_df : pd.DataFrame, optional
        Data frame with the train/test split.
    img_filepaths : list of image file paths, optional
        List of images to be transformed into features. Alternatively, the
        same information can be provided by means of the `img_dir` and
        `img_filename_pattern` keyword arguments. Ignored if `split_df` is
        provided.
    img_dir : str representing path to a directory, optional
        Path to the directory where the images whose filename matches
        `img_filename_pattern` are to be located. Ignored if `split_df` or
        `img_filepaths` is provided.
    img_filename_pattern : str representing a file-name pattern, optional
        Filename pattern to be matched in order to obtain the list of
        images. If no value is provided, the default value set in
        `settings.IMG_DEFAULT_FILENAME_PATTERN` will be taken. Ignored if
        `split_df` or `img_filepaths` is provided.
    method : {'cluster-I', 'cluster-II'}, optional
        Method used in the train/test split.
    img_cluster : int, optional
        The label of the cluster of images. Only used if `method` is
        'cluster-II'.

    Returns
    -------
    X : np.ndarray
        Array with the pixel features.
    """
    # TODO: accept `neighborhoods` kwarg
    if split_df is not None:
        if method is None:
            if 'img_cluster' in split_df:
                method = 'cluster-II'
            else:
                method = 'cluster-I'

        if method == 'cluster-I':
            # dump_train_feature_arrays(split_df, output_filepath)
            img_filepaths = split_df[split_df['train']]['img_filepath']
        else:
            if img_cluster is None:
                raise ValueError(
                    "If `method` is 'cluster-II', `img_cluster` must be "
                    "provided")
            img_filepaths = utils.get_img_filepaths(split_df, img_cluster,
                                                    True)
    else:
        if img_filepaths is None:
            if img_filename_pattern is None:
                img_filename_pattern = \
                    settings.IMG_DEFAULT_FILENAME_PATTERN
            if img_dir is None:
                raise ValueError(
                    "Either `split_df`, `img_filepaths` or `img_dir` must "
                    "be provided")
            img_filepaths = glob.glob(
                path.join(img_dir, img_filename_pattern))

    values = [
        dask.delayed(self.build_features_from_filepath)(img_filepath)
        for img_filepath in img_filepaths
    ]
    with diagnostics.ProgressBar():
        X = dask.compute(*values)

    return np.vstack(X)
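# `dask.compute(*values)` returns a tuple with one per-image feature array;
# `np.vstack` concatenates them row-wise into a single pixel-feature
# matrix. A toy sketch with made-up shapes:
import numpy as np

rows = (np.zeros((100, 27)), np.zeros((120, 27)))  # two images' features
X = np.vstack(rows)  # shape: (220, 27)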
# In[ ]:

import os

import cv2
import dask.bag as bag
import hvplot.pandas  # noqa: F401 (registers the `hvplot` accessor)
import pandas as pd
from dask import diagnostics


# get image dimensions
def get_dims(file):
    img = cv2.imread(file)
    h, w = img.shape[:2]
    return h, w


# parallelize
filepath = '../input/stage_1_test_images/'
filelist = [filepath + f for f in os.listdir(filepath)]
dimsbag = bag.from_sequence(filelist).map(get_dims)
with diagnostics.ProgressBar():
    dims = dimsbag.compute()

dim_df = pd.DataFrame(dims, columns=['height', 'width'])
sizes = dim_df.groupby(['height', 'width'
                        ]).size().reset_index().rename(columns={0: 'count'})
sizes.hvplot.scatter(x='height', y='width', size='count', xlim=(0, 1200),
                     ylim=(0, 1200), grid=True, xticks=2, yticks=2,
                     height=500, width=600).options(scaling_factor=0.1)
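# In[ ]:

# The groupby/size/rename chain above is a common way to count unique
# (height, width) pairs; `DataFrame.value_counts` (pandas >= 1.1) yields
# the same table. A toy sketch:
import pandas as pd

dim_df = pd.DataFrame({'height': [512, 512, 1024],
                       'width': [512, 512, 768]})
sizes = dim_df.value_counts(['height', 'width']).reset_index(name='count')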