# Module-level imports required by the preprocessing functions below
# (assumed; the original snippet did not include its import section).
import json
import os
import pickle as pkl
import re
from datetime import timedelta
from multiprocessing import Pool
from time import time

import numpy as np
import xarray as xr

import DataUtils  # project-specific helper module (provides getFilesToProcess)

# GetData, GetDataWrapper, getStationNeighbors and aggregateProcessFiles are
# assumed to be defined elsewhere in this module / project.


def CreateDataByStationAndInit(GridSize, DateBegin, DateEnd, PredictionWindow,
                               ListParam, WithTopo, TopoListParam, isLocal,
                               n_parallel):
    """Preprocess COSMO-1 output into per-station, per-initialization samples
    on a GridSize x GridSize sub-grid around each station."""
    time_begin = time()

    if DateEnd < DateBegin:
        raise Exception('DateEnd is smaller than DateBegin.')

    assert GridSize % 2 == 1, 'Grid size must be an odd number.'

    # different paths, depending on whether we run the script locally or on a cluster node
    if isLocal:
        ADDRESSdata = '/home/n1no/Documents/ethz/master_thesis/code/project/data/cosmo-1/data_subset'  # COSMO-1 outputs
        ADDRESStopo = '/home/n1no/Documents/ethz/master_thesis/code/project/data'  # base address of topo files
        ADDRESSobst = '/home/n1no/Documents/ethz/master_thesis/code/project/data/observations/'  # base address of obs files
        DESTINATION = '/home/n1no/Documents/ethz/master_thesis/code/project/data/preprocessed_data/station_init/grid_size_' + str(GridSize)  # target directory for all generated files
    else:
        ADDRESSdata = '/mnt/data/bhendj/full/cosmo-1'  # COSMO-1 outputs
        ADDRESStopo = '/mnt/ds3lab-scratch/ninow/topo'  # base address of topo files
        ADDRESSobst = '/mnt/ds3lab-scratch/ninow/observations'  # base address of obs files
        DESTINATION = '/mnt/ds3lab-scratch/ninow/preprocessed_data/station_init/grid_size_' + str(GridSize)  # target directory for all generated files

    # create an output folder for each station, based on the station ids
    OBS = xr.open_dataset(ADDRESSobst + '/meteoswiss_t2m_20151001-20180331.nc')
    station_ids = OBS['station_id'].data
    OBS.close()

    station_paths = []
    for S in station_ids:
        # prepare an output folder for each station
        station_paths += [DESTINATION + '/Station_' + str(S)]
        if not os.path.exists(station_paths[-1]):
            os.makedirs(station_paths[-1])

    # get all COSMO-1 files that are in the given time interval and have not yet been
    # processed, i.e. do not already exist in the output folder
    folders = DataUtils.getFilesToProcess(ADDRESSdata, DESTINATION, 'StationAndInit', DateBegin, DateEnd)
    folders.sort()

    # calculate begin and end index of the array to exclude files that are not in the specified time interval
    begin, end = -1, -1
    for idx, folder in enumerate(folders):
        if folder[:-4] >= DateBegin:
            begin = idx
            break

    for idx, folder in enumerate(folders):
        if folder[:-4] <= DateEnd:
            end = idx
        else:
            break

    if begin == -1 or end == -1:
        raise Exception('Could not find start or end in array.')

    folders = folders[begin:end + 1]
    print('%s files are left to be preprocessed.' % len(folders))

    # split the folders into n_parallel approx. equal splits
    if n_parallel <= 1:
        folder_splits = [folders]
    else:
        n_folders = len(folders)
        indices = np.linspace(0, n_folders, n_parallel + 1).astype(int)
        folder_splits = [folders[indices[i]:indices[i + 1]] for i in range(n_parallel)]

    folder_splits = [s for s in folder_splits if len(s) > 0]

    # take timestamp after set-up
    time_setup = time()

    # run preprocessing in parallel for all splits and keep the async results in a list to sync them later
    with Pool(processes=n_parallel) as pool:
        process_results = []
        for idx_split, split in enumerate(folder_splits):
            print('Process %s with range [%s, %s] queued.' % (idx_split, split[0], split[-1]))

            # only calculate topo data in the first process, since it is invariant
            if idx_split == 0:
                isTopo = WithTopo
            else:
                isTopo = 0

            process_results.append(
                pool.apply_async(GetData,
                                 (idx_split, ADDRESSdata, ADDRESStopo, ADDRESSobst, DESTINATION,
                                  ListParam, TopoListParam, GridSize, isTopo, split,
                                  PredictionWindow, isLocal)))

        # force the parent process to wait on all forked child processes
        for ps_idx, ps_result in enumerate(process_results):
            # sync processes
            _ = ps_result.get()
            print('[Process %s] Synchronized after data creation.' % ps_idx)

    # take timestamp after completing all processes
    time_end = time()

    # dump preprocessing information in a descriptive JSON file
    preprocessing_information = {
        'grid_size': GridSize,
        'data_begin': DateBegin,
        'data_end': DateEnd,
        'parameters': ListParam,
        'future_hours': PredictionWindow,
        'n_processes': n_parallel,
        'time_setup': str(timedelta(seconds=(time_setup - time_begin))),
        'time_preprocessing': str(timedelta(seconds=(time_end - time_setup)))
    }

    preprocessing_information_json = json.dumps(preprocessing_information)

    with open(DESTINATION + '/setup.json', 'w') as f:
        f.write(preprocessing_information_json)

    print('Preprocessing successfully finished in %s.' % str(timedelta(seconds=(time_end - time_begin))))
def CreateBaselineData(DateBegin, DateEnd, PredictionWindow, isLocal, n_parallel):
    """Preprocess COSMO-1 output and station observations into per-station
    baseline data, including the time-invariant grid neighborhood of each station."""
    time_begin = time()

    if DateeEnd < DateBegin if False else DateEnd < DateBegin:
        raise Exception('DateEnd is smaller than DateBegin.')

    # different paths, depending on whether we run the script locally or on a cluster node
    if isLocal:
        ADDRESSdata = '/home/n1no/Documents/ethz/master_thesis/code/project/data/cosmo-1/data_subset'  # COSMO-1 outputs
        ADDRESStopo = '/home/n1no/Documents/ethz/master_thesis/code/project/data'  # base address of topo files
        ADDRESSobst = '/home/n1no/Documents/ethz/master_thesis/code/project/data/observations/'  # base address of obs files
        DESTINATION = '/home/n1no/Documents/ethz/master_thesis/code/project/data/preprocessed_data/baseline'  # target directory for all generated files
    else:
        ADDRESSdata = '/mnt/data/bhendj/full/cosmo-1'  # COSMO-1 outputs
        ADDRESStopo = '/mnt/ds3lab-scratch/ninow/topo'  # base address of topo files
        ADDRESSobst = '/mnt/ds3lab-scratch/ninow/observations'  # base address of obs files
        DESTINATION = '/mnt/ds3lab-scratch/ninow/preprocessed_data/baseline'  # target directory for all generated files

    if not os.path.exists(DESTINATION):
        os.makedirs(DESTINATION)

    # open the observation and topography data; station ids define the output folders
    OBS = xr.open_dataset(ADDRESSobst + '/meteoswiss_t2m_20151001-20180331.nc')
    TOPO = xr.open_dataset(ADDRESStopo + '/topodata.nc')

    station_ids = OBS['station_id'].data

    # extract time invariant features for each station and the corresponding sub-grid
    if not os.path.exists(DESTINATION + '/station_neighbors.pkl'):
        station_neighbors = {}

        # calculate the grid neighbors of each station in parallel
        with Pool(processes=n_parallel) as pool:
            process_results = []

            gridHeightData = TOPO.HH.data
            gridLatData = TOPO.lat.data
            gridLonData = TOPO.lon.data

            # start a new process with the work function for each station
            for idx_S, S in enumerate(station_ids):
                # station height and position, compared against the grid points by the worker
                station_height = OBS['height'].sel(station_id=S).data
                station_lat = OBS['lat'].sel(station_id=S).data
                station_lon = OBS['lon'].sel(station_id=S).data

                print('Neighborhood calculation for station %s queued.' % S)
                process_results.append(
                    pool.apply_async(getStationNeighbors,
                                     (S, gridHeightData, gridLatData, gridLonData,
                                      station_height, station_lat, station_lon)))

            # aggregate results from all processes
            for ps_idx, ps_result in enumerate(process_results):
                # sync processes
                S, neighbor_data = ps_result.get()
                station_neighbors[S] = neighbor_data
                print('[Process %s] Synchronized after data creation.' % ps_idx)

        with open(DESTINATION + '/station_neighbors.pkl', 'wb') as handle:
            pkl.dump(station_neighbors, handle, protocol=pkl.HIGHEST_PROTOCOL)

        print('Station time invariant features have been calculated and stored.')
    else:
        with open(DESTINATION + '/station_neighbors.pkl', 'rb') as handle:
            station_neighbors = pkl.load(handle)

        print('Station time invariant features have been found on disk and were therefore not created again.')

    OBS.close()
    TOPO.close()

    for S in station_ids:
        temp_output_path = DESTINATION + '/temp/station_%s' % S
        if not os.path.exists(temp_output_path):
            os.makedirs(temp_output_path)

    # get all COSMO-1 files that are in the given time interval and have not yet been
    # processed, i.e. do not already exist in the output folder
    folders = DataUtils.getFilesToProcess(ADDRESSdata, DESTINATION, 'Station', DateBegin, DateEnd)
    folders.sort()

    # calculate begin and end index of the array to exclude files that are not in the specified time interval
    begin, end = -1, -1
    for idx, folder in enumerate(folders):
        if folder[:-4] >= DateBegin:
            begin = idx
            break

    for idx, folder in enumerate(folders):
        if folder[:-4] <= DateEnd:
            end = idx
        else:
            break

    if begin == -1 or end == -1:
        raise Exception('Could not find start or end in array.')

    folders = folders[begin:end + 1]
    print('%s files are left to be preprocessed.' % len(folders))

    # split the folders into n_parallel approx. equal splits
    if n_parallel <= 1:
        folder_splits = [folders]
    else:
        n_folders = len(folders)
        indices = np.linspace(0, n_folders, n_parallel + 1).astype(int)
        folder_splits = [folders[indices[i]:indices[i + 1]] for i in range(n_parallel)]

    folder_splits = [s for s in folder_splits if len(s) > 0]

    # take timestamp after set-up
    time_setup = time()

    # run preprocessing in parallel for all splits and keep the async results in a list to sync them later
    with Pool(processes=n_parallel) as pool:
        process_results = []

        # start a new process with the work function for each data split
        for idx_split, split in enumerate(folder_splits):
            print('Process %s with range [%s, %s] queued.' % (idx_split, split[0], split[-1]))
            process_results.append(
                pool.apply_async(GetDataWrapper,
                                 (idx_split, ADDRESSdata, ADDRESStopo, ADDRESSobst, DESTINATION,
                                  split, station_neighbors, PredictionWindow, isLocal)))

        # aggregate results from all processes
        for ps_idx, ps_result in enumerate(process_results):
            # sync processes
            result = ps_result.get()
            print('[Process %s] Synchronized after data creation.' % ps_idx)

        # aggregate the per-process temporary files of each station folder in parallel
        station_folders_paths = [
            f for f in os.listdir(DESTINATION + '/temp')
            if re.match(r'^station_([0-9]+?)$', f)
        ]

        process_results = []
        for ps_idx, station_folder in enumerate(station_folders_paths):
            print('Process %s with station folder %s queued.' % (ps_idx, station_folder))
            process_results.append(
                pool.apply_async(aggregateProcessFiles, (ps_idx, DESTINATION, station_folder)))

        # aggregate results from all processes
        for ps_idx, ps_result in enumerate(process_results):
            # sync processes
            result = ps_result.get()
            print('[Process %s] Synchronized after aggregation.' % ps_idx)

    # take timestamp after completing all processes
    time_end = time()

    # dump preprocessing information in a descriptive JSON file
    preprocessing_information = {
        'data_begin': DateBegin,
        'data_end': DateEnd,
        'future_hours': PredictionWindow,
        'n_processes': n_parallel,
        'time_setup': str(timedelta(seconds=(time_setup - time_begin))),
        'time_preprocessing': str(timedelta(seconds=(time_end - time_setup)))
    }

    preprocessing_information_json = json.dumps(preprocessing_information)

    with open(DESTINATION + '/setup.json', 'w') as f:
        f.write(preprocessing_information_json)

    print('Station baseline preprocessing successfully finished in %s.' % str(timedelta(seconds=(time_end - time_begin))))
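

# Illustrative usage sketch (not part of the original module). All argument
# values below are assumptions for demonstration only: the date-string format
# must match the COSMO-1 folder naming expected by DataUtils.getFilesToProcess,
# and the parameter lists depend on the COSMO-1 / topography variables that are
# actually available.
if __name__ == '__main__':
    # preprocess per-station, per-initialization samples on a 9x9 sub-grid around each station
    CreateDataByStationAndInit(
        GridSize=9,                        # must be odd
        DateBegin='2015100100',            # assumed date format
        DateEnd='2015103121',              # assumed date format
        PredictionWindow=list(range(34)),  # assumed lead times in hours (list, so it is JSON-serializable)
        ListParam=['T_2M'],                # assumed COSMO-1 parameter list
        WithTopo=1,
        TopoListParam=['HH'],              # assumed topography parameter list
        isLocal=True,
        n_parallel=4)

    # preprocess the station baseline data for the same period
    CreateBaselineData(
        DateBegin='2015100100',
        DateEnd='2015103121',
        PredictionWindow=list(range(34)),
        isLocal=True,
        n_parallel=4)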