def process_files(from_dir, this_file):

    stn = this_file.split('/')[-1]
    print("Processing station: ", stn)
    path = create_path(path_join(output_dir, from_dir, stn))

    for init_time, domain in combined_list:

        var_metrics = []
        for var in variables.values():

            file_names = '_'.join([var, domain, '*']) + ".txt"
            stages = glob(path_join(this_file, init_time, file_names))

            # sorting stages
            stages_idx = [s.split('_')[-1].split('.')[0] for s in stages]
            stages_idx = np.argsort(np.array(stages_idx, dtype='int32'))
            stages = [stages[idx] for idx in stages_idx]

            # computing all metrics for each variable at the same stage
            s_metrics = [
                compute_metrics(stg, metrics.values()) for stg in stages
            ]
            var_metrics.append(s_metrics)

        var_metrics = np.transpose(var_metrics, axes=(2, 1, 0))

        for name, arr in zip(metrics.keys(), var_metrics):

            sf = path_join(path, '_'.join([name, init_time, domain]) + '.txt')
            dframe = pd.DataFrame(data=arr, columns=list(variables.values()))
            dframe.to_csv(sf, index=True, header=True, sep=' ', na_rep='nan')
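# Hedged sketch (not part of the original pipeline): illustrates why the
# transpose above uses axes=(2, 1, 0). var_metrics is gathered as
# (variables, stages, metrics); the transpose yields (metrics, stages,
# variables), so each metric becomes one 2-D table written to its own file.
# The array sizes below are made up.
def _demo_metric_transpose():
    import numpy as np
    var_metrics = np.zeros((5, 25, 3))  # (variables, stages, metrics)
    per_metric = np.transpose(var_metrics, axes=(2, 1, 0))
    print(per_metric.shape)  # (3, 25, 5): one (stages x variables) table per metric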
def traverse_and_process():

    # For data augmentation
    # Used to apply an augmentor with 50% probability
    sometimes = lambda aug: va.Sometimes(0.5, aug)
    seq = va.Sequential([
        # Randomly crop the video to a size of (96 x 96)
        va.RandomCrop(size=(96, 96)),
        # Randomly rotate the video by a degree chosen from [-10, 10]
        va.RandomRotate(degrees=10),
        # Horizontally flip the video with 50% probability
        sometimes(va.HorizontalFlip())
    ])

    foldernames = []
    for root, dirs, files in os.walk(config.VIDEO_DIREC):
        if len(dirs) > 0:
            foldernames = sorted(dirs)

    for folder in tqdm(foldernames, desc='Folder',
                       bar_format='{l_bar}{bar:40}{r_bar}{bar:-10b}'):

        filenames = []
        for root, dirs, files in os.walk(f'{config.VIDEO_DIREC}/{folder}'):
            filenames = sorted(file.split(".")[0] for file in list(
                filter(lambda x: x != ".DS_Store", files)))

        for filename in tqdm(filenames[:20], desc='Class ',
                             bar_format='{l_bar}{bar:40}{r_bar}{bar:-10b}'):
            # for filename in filenames:
            for iter in tqdm(range(10), desc='Files ',
                             bar_format='{l_bar}{bar:40}{r_bar}{bar:-10b}'):

                # Check if dst folder exists
                utils.create_path(
                    f'{config.CROPPED_DIREC}/{(int(folder) - 1)}{iter}')

                # Set the paths
                src_path = f'{config.VIDEO_DIREC}/{folder}/{filename}.mp4'
                dst_path = (f'{config.CROPPED_DIREC}/{(int(folder) - 1)}{iter}/'
                            f'{filename}.npz')

                # utils.check_video_length(src_path, verbose=True)
                sequence = detect_face_from_file(src_path, verbose=False)
                # print(type(sequence), sequence.shape)

                if config.AUGMENT and iter != 0:
                    sequence = np.array(seq(sequence))

                assert sequence is not None, f'Cannot crop from {src_path}.'
                # print(sequence.shape)

                # ... = Ellipsis
                data = transform.convert_bgr2gray(
                    sequence) if config.CONVERT_GRAY else sequence[..., ::-1]
                utils.save2npz(dst_path, data=data)
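# Hedged sketch (illustration only): the `sequence[..., ::-1]` slice above
# reverses the last axis of the frame stack, turning BGR channels into RGB
# without touching the time/height/width axes. The shapes below are made up.
def _demo_bgr_to_rgb_flip():
    import numpy as np
    frames = np.zeros((2, 4, 4, 3), dtype=np.uint8)  # (time, height, width, BGR)
    frames[..., 0] = 255  # fill the blue channel
    rgb = frames[..., ::-1]
    print(rgb[0, 0, 0])  # [  0   0 255] -> blue is now the last channel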
def cluster_embed(dataset_name, app_name, embedding_name, embedding_df, labels):
    clusterer = AgglomerativeClustering(n_clusters=len(labels.unique()))
    clusterer.fit(embedding_df.values)
    clustered_labels = clusterer.labels_

    cluster_labels_df = pd.DataFrame(
        {"cluster_label": clustered_labels, "label": labels},
        index=embedding_df.index)

    path = create_path([".", "data", "clusters", dataset_name, app_name],
                       file_name=f"{embedding_name}.csv")
    cluster_labels_df.to_csv(path)
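# Hedged, self-contained sketch of what cluster_embed computes: agglomerative
# cluster assignments with as many clusters as unique ground-truth labels,
# stored next to those labels. The toy embeddings and label values are made up.
def _demo_cluster_assignments():
    import numpy as np
    import pandas as pd
    from sklearn.cluster import AgglomerativeClustering

    rng = np.random.default_rng(0)
    embedding_df = pd.DataFrame(rng.random((10, 4)),
                                index=[f"doc_{i}" for i in range(10)])
    labels = pd.Series(["praise", "bug"] * 5, index=embedding_df.index)

    clusterer = AgglomerativeClustering(n_clusters=labels.nunique())
    assignments = clusterer.fit_predict(embedding_df.values)
    print(pd.DataFrame({"cluster_label": assignments, "label": labels}))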
def sort_points_by_ref(dataset_name, app_name, embedding_name, embeddings, labels):
    # First, find the euclidean distances between every embedding
    mutual_distances = get_pairwise_dist(embeddings, metric="euclidean")

    # Get the text associated with each feedback
    text_df = get_feedback_text(dataset_name, app_name)

    # Find a place to save this visualisation
    finder_dir = create_path([".", "results", "", dataset_name, app_name],
                             file_name=embedding_name)

    # Create the visualisation
    create_nn_finder_html(mutual_distances, text_df, finder_dir)
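# Hedged sketch of the mutual-distance matrix computed above, using
# scikit-learn's pairwise_distances as a stand-in for get_pairwise_dist
# (the toy embeddings are made up).
def _demo_pairwise_distances():
    import numpy as np
    from sklearn.metrics import pairwise_distances

    embeddings = np.array([[0.0, 0.0], [3.0, 4.0], [6.0, 8.0]])
    dists = pairwise_distances(embeddings, metric="euclidean")
    print(dists)  # symmetric matrix, zeros on the diagonal, dists[0, 1] == 5.0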
def plot_files(combined_list):

    # get values from iterable:
    metric, stn_id, domain = combined_list
    print("Creating '{}' plots for station: {}, domain: {}".format(
        metric, stn_id, domain))

    # get files from model and mos at the same time:
    sorted_files = []
    for from_dir in from_dirs:
        path = path_join(output_dir, from_dir, stn_id,
                         '_'.join([metric, '*', domain]) + '*')
        sorted_files.extend(sorted(glob(path)))

    plot_stages = []
    plot_values = []
    for this_file in sorted_files:

        dframe = pd.read_csv(this_file, delimiter=' ', index_col=0)

        # create a nice string representation of forecast hours
        # (adding stages to the initialization hour)
        init = this_file.split('.')[0].split('_')[-2][:2]
        stages = [
            str(timedelta(hours=int(init) + 3.0 * off_set)).split(', ')[-1]
            for off_set in range(dframe.shape[0])
        ]
        plot_stages.append(stages)
        plot_values.append(dframe)

    for name, var in variables.items():

        # list containing values for all initializations of the same
        # variable (var) for the same domain...
        data = [values[var].values for values in plot_values]

        # finally doing some actual plotting
        fig = plot_lines(plot_stages, data, name, stn_id, domain, metric)

        fig_path = create_path(path_join(output_dir, 'plots', stn_id))
        fig_name = path_join(fig_path, '_'.join([metric, var, domain]))
        fig.savefig(fig_name + '.png', dpi=200)
        plt.close(fig)
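# Hedged sketch of the forecast-hour labels built above: timedelta renders
# values past 24 h as "1 day, 3:00:00", and split(', ')[-1] keeps only the
# clock part. The initialization hour below is just an example.
def _demo_forecast_hour_labels():
    from datetime import timedelta
    init = 18
    labels = [
        str(timedelta(hours=init + 3.0 * off_set)).split(', ')[-1]
        for off_set in range(4)
    ]
    print(labels)  # ['18:00:00', '21:00:00', '0:00:00', '3:00:00']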
def create_results_dirs(result_type, dataset_name, app_name):
    app_dir = create_path(
        [".", "results", result_type, dataset_name, app_name])
    return app_dir
def generate_config(self):
    env_file = json.load(
        open(os.path.join(str(Path.home()), ".kaggle/path.json"), "r"))
    competition_file = json.load(
        open(os.path.join(os.getcwd(), "competition.json"), "r"))
    aug_json = json.load(
        open(os.path.join(os.getcwd(), "configs/service/aug.json"), "r"))

    self.config.update(competition_file)
    self.config['augs'] = aug_json
    self.config['dataset'].update(competition_file['dataset_split'])
    self.config['siamese'] = self.config.get("siamese", False)

    if self.config['siamese'] and self.config['dataset']['resize'][0] >= 512:
        self.config["competition_data_folder"] = "protein"
        self.config["competition_img"]["type"] = {
            "test": "tif",
            "train": "tif"
        }
    elif self.config['competition_data_folder'] == "protein/protein_1024":
        self.config["competition_img"]["type"] = {
            "test": "jpg",
            "train": "jpg"
        }
    elif self.config['competition_data_folder'] == "protein/protein_512":
        self.config["competition_img"]["type"] = {
            "test": "png",
            "train": "png"
        }
    else:
        raise NotImplementedError("No such data folder")

    self.config['n_threds'] = cpu_count()
    self.config['device'] = "cuda" if torch.cuda.is_available() else "cpu"
    self.config["net_name"] = self.config['net_class'].split(".")[-1]

    self.config['out_path'] = os.path.join(env_file['output_path'],
                                           self.config['competition_name'])
    self.config['out_folder'] = os.path.join(self.config['out_path'],
                                             self.get_folder_type())
    self.config['out_folder'], counter = utils.create_path(
        self.config['out_folder'])
    self.config['visdom_env_name'] = self.config['net_name'] + f"_{counter}"
    self.config['data_folder'] = os.path.join(
        env_file['data_path'], self.config['competition_data_folder'])
    self.config['predictions_folder'] = os.path.join(
        self.config['out_folder'], "predicitons")
    self.config['weights_folder'] = os.path.join(self.config['out_folder'],
                                                 "weights")
    self.config['splits_path'] = os.path.join(
        str(Path.home()),
        ".kaggle_splits",
        self.config['competition_name'],
        "debug" if self.is_debug() else "",
    )

    if self.config['pretrained_weights'] is not None:
        self.config['pretrained_weights'] = os.path.join(
            os.path.split(self.config['out_folder'])[0],
            self.config['pretrained_weights'])

    if self.config['use_folds'] == "all":
        self.config['use_folds'] = list(
            range(self.config['dataset']['n_folds']))

    self.config['debug'] = False
    if self.is_debug():
        self.config['n_epochs'] = 2
        self.config['mode_train']['unfreeze'] = 1
        self.config['debug'] = True
        # self.config['mode_stack']['early_stopping_rounds'] = 5
        # self.config['mode_stack']["num_rounds"] = 10

    assert len(self.config['mode_train']
               ['loss']) <= 1, "Cannot be more than 1 loss type"
    assert self.config['competition_type'] in [
        'binary', 'multilabel', 'multiclass', 'segmentation'
    ]
# define pipeline pre-processor:
preprocessor = Pipeline([("features", combined_features), ("scaler", x_scaler)])

if __name__ == "__main__":

    # Create the dataset
    path = 'data/train_data/'
    ref_var = ['rain', 'mslp', 't2m', 'rh2m', 'wind']

    var_key = 'rain'
    init_time = '00'
    domain = 'd01'

    # path for saving models
    save_path = create_path(os.path.join('models', var_key))

    ref_var.remove(var_key)
    file_name = '_'.join([var_key] + ref_var + [domain, init_time]) + '.mat'
    # file_name = 't2m_d01_00.mat'

    print('Loading data from: {}'.format(file_name))
    data = loadmat(path + file_name, squeeze_me=True)

    print('Loading station clusters...')
    clusters = loadmat('data/clustered_stations_' + var_key + '.mat')

    # get reference scores:
    # (only save a trained model if its score is higher than
    # the previously computed best score for this cluster)
    scores_path = 'data/best_scores_' + var_key + '.mat'
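# Hedged sketch of the `preprocessor` pipeline defined above: `combined_features`
# is assumed to be a FeatureUnion-style transformer and `x_scaler` a scaler,
# chained so feature extraction always precedes scaling. The transformers and
# data below are made-up stand-ins.
def _demo_preprocessor_pipeline():
    import numpy as np
    from sklearn.decomposition import PCA
    from sklearn.pipeline import FeatureUnion, Pipeline
    from sklearn.preprocessing import StandardScaler

    features = FeatureUnion([("pca", PCA(n_components=2))])
    preproc = Pipeline([("features", features), ("scaler", StandardScaler())])
    print(preproc.fit_transform(np.random.rand(8, 5)).shape)  # (8, 2)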
combined_list = combine([['00', '12'], ['d01', 'd02'],
                         ['rain', 'mslp', 't2m', 'rh2m', 'wind']])

# create pool of workers:
if np.isscalar(n_process):
    n_process = max(1, int(n_process))
else:
    n_process = cpu_count()
print('Running with {:n} workers.'.format(n_process))

pool = Pool(n_process)

# running in parallel
function = partial(process_item, path_input, path_output, unique)
pool.map(function, combined_list)

pool.close()
pool.join()


if __name__ == '__main__':

    # create training dataset
    path_input = 'data/stn_vs_raw'
    path_output = create_path('data/train_data')

    # assingle_items(path_input, path_output, unique=True)

    var_key = 'rain'
    combined_items(path_input, path_output, var_key)
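# Hedged sketch of the partial() binding used above: the fixed arguments
# (paths, flags) are frozen first, so that pool.map only has to supply the
# per-item tuple from combined_list. The names and values below are made up.
def _demo_partial_binding():
    from functools import partial

    def process_item(path_in, path_out, unique, item):
        return (path_in, path_out, unique, item)

    work = partial(process_item, 'data/in', 'data/out', True)
    print(work(('00', 'd01', 'rain')))  # fixed args plus the mapped item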
def apply_regression(path_input, path_output, var, init_times=[], domains=[],
                     regressors=None, verbose=False):

    ind = var_keys.index(var)

    for init_time, domain in combine([init_times, domains]):

        # file_name patterns to search
        wildcards = [
            '*'.join([var_key, domain] + ['']) for var_key in var_keys
        ]

        print('find files with patterns: ', wildcards,
              'from initialization: ', init_time, '\n')

        stn_files = os.listdir(path_input)
        init_time += '00'

        stacked_correlations = {}
        for stn_id in stn_files:

            stn = int(stn_id.split('_')[-1])
            if verbose:
                print('Creating bias-corrected data for station: ', stn)

            path = create_path(
                os.path.join(path_output, stn_id, init_time))

            var_files = [
                sorted(
                    glob(
                        os.path.join(path_input, stn_id, init_time, wildcard)))
                for wildcard in wildcards
            ]

            # create dictionary for computed correlations (one for each stage)
            stage_correlations = {s: None for s in range(25)}

            for same_stages in zip(*var_files):

                stage = same_stages[0].split('.')[0].split('_')[-1]

                dates = []
                observations = []
                predictions = []
                for this_file in same_stages:

                    data = pd.read_table(
                        this_file,
                        names=names,
                        sep=' ',
                        converters=date_converter,
                        engine='c')

                    # remove nan model values
                    find_nans = np.isnan(data['forecast'].values)
                    indexs = [not nans for nans in find_nans]

                    # append member to existing ones
                    dates.append(data['dates'].values[indexs])
                    observations.append(data['observed'].values[indexs])
                    predictions.append(data['forecast'].values[indexs])

                dates = np.array(dates)
                if dates.shape[0] == len(var_files) and dates.ndim != 1:
                    # only if all predictors share the same dates
                    if not np.diff(dates, axis=0).all():

                        # creating predictors (some physics here): mslp, temp, relh
                        if var == 'rain':
                            x = np.stack(predictions, axis=0)
                            x[-1], x[-2], x[-3] = apply_physics(
                                x[-1], x[-2], x[-3])
                            # normalization factor for target variable
                            scale = 1.0  # mm / 3h
                        else:
                            # (temp, press, wind?) no need for extra
                            # predictions (for now) we will only use
                            # model forecasts: only simple normalization
                            x = np.expand_dims(predictions[ind], axis=0)
                            # normalization factor for target variable
                            scale = 36.0  # degrees celsius

                        x /= scale
                        x_pred = x.T

                        if hasattr(regressors, 'predict'):
                            corrected = regressors.predict(x_pred) * scale
                        elif stn_id in regressors:
                            corrected = regressors[stn_id].predict(
                                x_pred) * scale
                        # just use forecast values
                        else:
                            print(
                                'MOS is not available for this station at stage: ',
                                str2stamp(
                                    str(dates[0]),
                                    off_set=3 * int(stage),
                                    to_julian=False,
                                    as_int=False),
                                ' using forecast instead')
                            corrected = predictions[ind]

                        stacked_array = np.stack(
                            [dates[0], observations[ind], corrected], axis=-1)

                        # write stn predictions to disk
                        file_name = '_'.join([var, domain, stage])
                        file_name = os.path.join(path, file_name)
                        np.savetxt(file_name + '.txt', stacked_array)

                        stage_correlations[int(stage)] = pearson_correlation(
                            stacked_array[:, 1], stacked_array[:, 2])
                else:
                    # if the station is not suitable for the bias-correction
                    # algorithm then use the model forecast only:
                    stage_correlations[int(stage)] = pearson_correlation(
                        observations[ind], predictions[ind])

            # write pearson correlations for gridded mos
            stacked_correlations[stn] = list(stage_correlations.values())

        # write csv file to make plots:
        stacked_correlations = pd.DataFrame(data=stacked_correlations)

        file_name = '_'.join(['pearson', init_time, domain]) + '.csv'
        path = create_path(
            os.path.join(
                'test_data/processed/CorrelacionesPuntuales/PearsonT/MOS/pearson',
                var))

        print('Save file: ', file_name, 'with data shaped: ',
              stacked_correlations.values.shape, '\n')

        stacked_correlations.to_csv(
            os.path.join(path, file_name),
            sep=',',
            na_rep='nan',
            header=list(stacked_correlations.columns),
            index=False,
            mode='w')

    print('Post-processing finished. Congrats!')
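# Hedged, toy-number sketch of the NaN filtering inside apply_regression:
# rows whose forecast is NaN are dropped from dates, observations and
# predictions alike via the same boolean mask.
def _demo_nan_filtering():
    import numpy as np
    forecast = np.array([1.2, np.nan, 0.8, np.nan])
    observed = np.array([1.0, 0.5, 0.9, 0.4])
    dates = np.array([2019010100, 2019010103, 2019010106, 2019010109])
    keep = [not flag for flag in np.isnan(forecast)]
    print(dates[keep], observed[keep], forecast[keep])
    # [2019010100 2019010106] [1.  0.9] [1.2 0.8]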
if __name__ == '__main__':
    '''
    This script generates the data for each station after applying the
    regression equation found for that station, and then saves the files
    in the same fashion as the inputs: /stn_***/init/var_d0*_*.txt

    The bias-corrected series do not cover the same dates, because
    simultaneous occurrences of the predictors are not guaranteed and, of
    course, because of the NaN-filtering process (this should be seriously
    revised for better regression models)
    '''
    # create training dataset
    path_input = 'data/stn_vs_raw/'
    path_output = create_path('data/stn_vs_mos/')

    var_key = 'rain'
    init_times = ['00']  # , '12']
    domains = ['d01']  # , 'd02']:

    regressors = {'stn_308': Dummy_Regressor, 'stn_320': Dummy_Regressor}

    apply_regression(
        path_input,
        path_output,
        var_key,
        init_times=init_times,
        domains=domains,
        regressors=regressors)
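# Hedged sketch of the minimal interface apply_regression expects from the
# entries of the `regressors` dict: anything exposing a scikit-learn style
# .predict(X). IdentityRegressor is a hypothetical stand-in, not the
# project's Dummy_Regressor.
def _demo_regressor_interface():
    import numpy as np

    class IdentityRegressor:
        """Return the (normalized) forecast unchanged."""
        def predict(self, x):
            return np.asarray(x).ravel()

    regressors = {'stn_308': IdentityRegressor(), 'stn_320': IdentityRegressor()}
    print(regressors['stn_308'].predict([[0.1], [0.4]]))  # [0.1 0.4]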
        cmap=cm.plasma_r,
        norm=norm,
        orientation='vertical',
        label='Color mapping for values of ' + xlabels[-1])

    return ax.get_figure()


if __name__ == '__main__':

    # Create the dataset
    ref_var = ['rain', 'mslp', 't2m', 'rh2m']

    path = 'data/train_data/'
    path_output = create_path('outputs/test_scalers/')

    var_key = 't2m'
    init_time = '00'
    domain = 'd01'

    ref_var.remove(var_key)
    file_name = '_'.join([var_key] + ref_var + [domain, init_time]) + '.mat'

    print('Loading data...')
    data = loadmat(path + file_name, squeeze_me=True)
    # for key in ['__header__', '__version__', '__globals__']: _ = data.pop(key)

    # join all data