def _make_dataset(zip_item, simulator, dir_name):
    """Protected worker for generating the dataset in parallel.

    Generate noise-free synthetic data and save it as a pickle file.

    Parameters
    ----------
    zip_item : tuple
        Pair of (resistivity, suffix_num) produced by zipping the
        resistivity generator with the file-suffix generator.
    simulator : Simulator
        Simulator instance used for the forward simulation.
    dir_name : str or pathlib.Path
        Directory in which the pickle file is saved.

    Returns
    -------
    None
    """
    resistivity, suffix_num = zip_item
    # suppress messages printed by the forward simulation
    with contextlib.redirect_stdout(None):
        data_synthetic = simulator.make_synthetic_data(resistivity, std=0)
    # pickle dump/load is faster than numpy savez_compressed (or save)/load
    pkl_name = os.path.join(dir_name, f'raw_data_{suffix_num:0>6}.pkl')
    write_pkl(
        {
            'resistance': data_synthetic,
            'resistivity_log10': np.log10(resistivity)
        }, pkl_name)
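# Hedged usage sketch: read back one pickle written by _make_dataset and
# inspect its contents. The path is illustrative; read_pkl is the project's
# own helper.
def _example_read_raw_pickle():
    data = read_pkl(
        os.path.join('dataset', 'training', 'raw_data_000000.pkl'))
    print(data['resistance'])         # noise-free synthetic resistance
    print(data['resistivity_log10'])  # log10 of the resistivity model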
def _make_processed_dataset(filename, preprocess, root_dir,
                            sub_dir_in_processed, Tx_locations, Rx_locations,
                            nCx, nCy):
    """Protected worker for preprocessing one raw pickle file in parallel."""
    pkl_name = os.path.join(root_dir, filename)
    data = read_pkl(pkl_name)
    # check that the data is a dict containing the
    # "resistance" and "resistivity_log10" keys
    if (not isinstance(data, dict)
            or data.get('resistance') is None
            or data.get('resistivity_log10') is None):
        raise ValueError(
            'data is not a dict or does not contain the essential keys')

    # preprocess
    for k, v in preprocess.items():
        if k == 'add_noise' and v.get('perform'):
            add_noise(data['resistance'], **v.get('kwargs'))
        elif k == 'log_transform' and v.get('perform'):
            log_transform(data['resistance'], **v.get('kwargs'))
        elif k == 'to_midpoint' and v.get('perform'):
            data['resistance'] = to_midpoint(
                data['resistance'], Tx_locations, Rx_locations)
        elif k == 'to_txrx' and v.get('perform'):
            data['resistance'] = to_txrx(
                data['resistance'], Tx_locations, Rx_locations)
        elif k == 'to_section' and v.get('perform'):
            data['resistivity_log10'] = to_section(
                data['resistivity_log10'], nCx, nCy)

    # save pickle in the processed directory
    new_pkl_name = os.path.join(
        sub_dir_in_processed, re.sub(r'raw', r'processed', filename))
    write_pkl(data, new_pkl_name)
    return data
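# Hedged sketch of the `preprocess` mapping this dispatch loop consumes
# (parsed from the project's YAML config). The top-level keys match the
# branches above; the kwargs shown are illustrative assumptions, and in
# practice only one of to_midpoint / to_txrx would be enabled since each
# reshapes `resistance`.
_EXAMPLE_PREPROCESS = {
    'add_noise': {'perform': True, 'kwargs': {'ratio': 0.05}},  # hypothetical kwargs
    'log_transform': {'perform': True, 'kwargs': {}},
    'to_midpoint': {'perform': True},
    'to_txrx': {'perform': False},
    'to_section': {'perform': True},
}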
def _forward_simulation(pkl_name, simulator):
    """Protected worker: simulate resistance from a predicted resistivity."""
    data = read_pkl(pkl_name)
    # undo the section orientation and recover linear resistivity
    resistivity = np.flipud(
        np.power(10, data['predicted_resistivity_log10'])).flatten()
    # suppress messages printed by the forward simulation
    with contextlib.redirect_stdout(None):
        data['predicted_resistance'] = simulator.make_synthetic_data(
            resistivity, std=0, force=True)
    write_pkl(data, pkl_name)
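# Hedged sketch of driving _forward_simulation over a directory of saved
# prediction pickles; the glob pattern and the pool settings are
# assumptions, not the project's actual driver.
def _example_forward_all(predictions_dir, simulator):
    import glob
    pkl_list = sorted(glob.glob(os.path.join(predictions_dir, '*.pkl')))
    par = partial(_forward_simulation, simulator=simulator)
    with mp.Pool(processes=mp.cpu_count()) as pool:
        pool.map(par, pkl_list)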
def _process_resistivity(filename, save_resistivity_dir, processes,
                         to_float32, nCx, nCy):
    """Protected worker for processing one resistivity pickle in parallel."""
    raw_resistivity = read_pkl(filename)
    pkl_name = os.path.basename(filename)
    save_resistivity_pkl = os.path.join(save_resistivity_dir, pkl_name)
    for process, kwargs in processes.items():
        if process == 'to_section':
            # `kwargs` is unused here: to_section only needs the mesh shape
            raw_resistivity = to_section(raw_resistivity, nCx, nCy)
    if to_float32:
        raw_resistivity = raw_resistivity.astype('float32')
    write_pkl(raw_resistivity, save_resistivity_pkl)
def _process_resistance(filename, save_resistance_dir, processes, to_float32,
                        Tx_locations, Rx_locations, nCx, nCy):
    """Protected worker for processing one resistance pickle in parallel."""
    raw_resistance = read_pkl(filename)
    pkl_name = os.path.basename(filename)
    save_resistance_pkl = os.path.join(save_resistance_dir, pkl_name)
    for process, kwargs in processes.items():
        if process == 'add_noise':
            add_noise(raw_resistance, **kwargs)
        elif process == 'log_transform':
            log_transform(raw_resistance, **kwargs)
        elif process == 'to_midpoint':
            raw_resistance = to_midpoint(
                raw_resistance, Tx_locations, Rx_locations)
        elif process == 'to_txrx':
            raw_resistance = to_txrx(
                raw_resistance, Tx_locations, Rx_locations)
    if to_float32:
        raw_resistance = raw_resistance.astype('float32')
    write_pkl(raw_resistance, save_resistance_pkl)
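# Hedged sketch of the `processes` mapping consumed by the two workers
# above. Unlike `preprocess` in _make_processed_dataset, each value here is
# the kwargs dict itself; the kwargs shown are illustrative assumptions.
_EXAMPLE_PROCESSES = {
    'add_noise': {'ratio': 0.1},  # hypothetical kwargs for add_noise
    'log_transform': {},
    'to_midpoint': {},            # electrode locations are passed separately
}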
        par, num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(
            batch_size).prefetch(2)
    # read data => take mini-batch => prefetch
    dataset_valid = list_dataset_validation.map(par).batch(
        batch_size).prefetch(2)

    # training
    if os.path.isfile(pre_trained_weight_h5):
        model.load_weights(pre_trained_weight_h5)
    original_weights = model.get_weights()
    history = model.fit(dataset_train,
                        validation_data=dataset_valid,
                        epochs=epochs,
                        callbacks=callbacks,
                        workers=os.cpu_count())

    # check whether training actually updated the weights
    weights = model.get_weights()
    if all(np.all(w == ow) for w, ow in zip(weights, original_weights)):
        print('Weights in the template model have not changed')
    else:
        print('Weights in the template model have changed')

    # save weights
    os.makedirs(save_weights_dir, exist_ok=True)
    model.save_weights(trained_weight_h5)

    # save simulator
    save_simulator_pkl = os.path.join(save_model_dir, 'simulator.pkl')
    simulator.config['training'] = config
    write_pkl(simulator, save_simulator_pkl)
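# Self-contained toy illustration of the map => batch => prefetch pattern
# used above; this is not the project's tf_read_dataset pipeline.
def _example_tf_pipeline():
    ds = tf.data.Dataset.from_tensor_slices(tf.range(10, dtype=tf.float32))
    # map before batch so elements are transformed in parallel
    ds = ds.map(lambda x: x * 2.0,
                num_parallel_calls=tf.data.experimental.AUTOTUNE)
    # prefetch overlaps input preprocessing with model execution
    ds = ds.batch(4).prefetch(2)
    for batch in ds:
        print(batch.numpy())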
def make_dataset(config_file):
    """Generate the raw dataset and save it as pickle files.

    Parameters
    ----------
    config_file : str, pathlib.Path or dict
        Path to a YAML configuration file, or a configuration dictionary.

    Returns
    -------
    None

    References
    ----------
    https://codewithoutrules.com/2018/09/04/python-multiprocessing/
    https://zhuanlan.zhihu.com/p/75207672
    """
    # parse config
    config = read_config_file(config_file)
    save_dataset_dir = config['save_dataset_dir']
    os.makedirs(save_dataset_dir, exist_ok=True)
    save_simulator_pkl = os.path.join(save_dataset_dir, 'simulator.pkl')
    train_dir = os.path.join(save_dataset_dir, 'training')
    valid_dir = os.path.join(save_dataset_dir, 'validation')
    test_dir = os.path.join(save_dataset_dir, 'testing')
    num_examples_train = int(config['num_examples'] * config['train_ratio'])
    num_examples_valid = int(
        config['num_examples']
        * (config['train_ratio'] + config['valid_ratio'])
        - num_examples_train)
    num_examples_test = (config['num_examples'] - num_examples_train
                         - num_examples_valid)

    simulator = Simulator(config)
    # TODO: resolve this warning
    # When reading the pickle file in ipython, we receive the following
    # warning:
    # RuntimeWarning: numpy.ufunc size changed, may indicate binary
    # incompatibility. Expected 192 from C header, got 216 from PyObject
    write_pkl(simulator, save_simulator_pkl)

    for dir_name, num_examples in ((train_dir, num_examples_train),
                                   (valid_dir, num_examples_valid),
                                   (test_dir, num_examples_test)):
        if num_examples == 0:
            continue
        os.makedirs(dir_name, exist_ok=True)
        suffix_num = next_path(os.path.join(dir_name, 'raw_data_%s.pkl'),
                               only_num=True)
        par = partial(_make_dataset, simulator=simulator, dir_name=dir_name)
        resistivity_generator = simulator.get_random_resistivity_generator(
            num_examples=num_examples)
        suffix_generator = iter(range(suffix_num, suffix_num + num_examples))
        # using the "fork" start method would freeze the process
        pool = mp.get_context('spawn').Pool(processes=mp.cpu_count(),
                                            maxtasksperchild=1)
        for _ in tqdm(pool.imap_unordered(
                par, zip(resistivity_generator, suffix_generator)),
                desc=f'Generate {os.path.basename(dir_name)} data',
                total=num_examples):
            pass
        pool.close()
        pool.join()
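# Hedged usage sketch: because make_dataset spawns worker processes with
# the "spawn" start method, it must be called from an import-guarded entry
# point. The YAML file name is an assumption:
#
#     if __name__ == '__main__':
#         make_dataset('config/generate.yml')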
    }
    # use partial to bind read_dataset_info
    par = partial(tf_read_dataset, read_dataset_info=read_dataset_info)
    # read data => take mini-batch => prefetch
    dataset_test = list_dataset_testing.map(par).batch(1).prefetch(8)

    # Prediction
    print('\nPredict.')
    predict = model.predict(dataset_test, verbose=True)

    # Save
    os.makedirs(save_predictions_dir, exist_ok=True)
    for i, dataset_testing_targets in tqdm(
            enumerate(list_dataset_testing_targets.as_numpy_iterator()),
            desc="write pkl"):
        raw_resistance = read_pkl(raw_resistance_pkl_list[i])
        resistivity_log10 = read_pkl(dataset_testing_targets)
        data = {
            "synthetic_resistance": raw_resistance,
            "synthetic_resistivity_log10":
                resistivity_log10.reshape(output_shape[0:2]),
            "predicted_resistivity_log10":
                predict[i].reshape(output_shape[0:2])
        }
        # escape the dot so it matches a literal '.', e.g. '000123.pkl'
        filename = re.findall(r'\d+\.pkl', testing_resistance_pkl_list[i])[0]
        write_pkl(data, os.path.join(save_predictions_dir, filename))

    # save simulator
    simulator.config['testing'] = config
    write_pkl(simulator, simulator_pkl)
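# Hedged sketch: load one saved prediction pickle and score it against the
# synthetic target; the RMSE metric is an illustrative choice, not the
# project's evaluation procedure.
def _example_inspect_prediction(pkl_path):
    data = read_pkl(pkl_path)
    target = data['synthetic_resistivity_log10']
    pred = data['predicted_resistivity_log10']
    print('RMSE (log10 ohm-m):', np.sqrt(np.mean((pred - target) ** 2)))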
def make_processed_dataset(config_file):
    """Preprocess the raw dataset and save it to the processed directory.

    Parameters
    ----------
    config_file : str, pathlib.Path or dict
        Path to a YAML configuration file, or a configuration dictionary.

    Returns
    -------
    None
    """
    config = read_config_file(config_file)
    raw_data_dir = config['raw_data_dir']
    save_processed_data_dir = config['save_processed_data_dir']
    preprocess = config['preprocess']
    simulator_pkl = os.path.join(raw_data_dir, 'simulator.pkl')
    save_simulator_pkl = os.path.join(save_processed_data_dir,
                                      'simulator.pkl')
    do_preprocess = any(value['perform'] for value in preprocess.values())

    simulator = read_pkl(simulator_pkl)
    # read nCx and nCy
    nCx = simulator.mesh.nCx  # number of cell-center meshes in the x direction
    nCy = simulator.mesh.nCy  # number of cell-center meshes in the z (y) direction
    # read Tx_locations and Rx_locations
    Tx_locations = simulator.urf.abmn_locations[:, :4]
    Rx_locations = simulator.urf.abmn_locations[:, 4:]

    # expand simulator.config and save it
    simulator.config = {
        'generate': simulator.config,  # config for generating data
        'preprocess': config  # config for preprocessing data
    }
    os.makedirs(save_processed_data_dir, exist_ok=True)
    write_pkl(simulator, save_simulator_pkl)

    if do_preprocess:
        pattern_raw_pkl = re.compile(r'raw_data_\d{6}\.pkl')
        for root_dir, sub_dirs, files in os.walk(raw_data_dir):
            # keep only the pickle files that match the raw-data pattern
            files = list(filter(pattern_raw_pkl.match, files))
            # if no files match, continue to the next directory
            if not files:
                continue
            # mirror the raw directory layout under the processed directory
            sub_dir_in_processed = re.sub(raw_data_dir,
                                          save_processed_data_dir, root_dir)
            os.makedirs(sub_dir_in_processed, exist_ok=True)

            # preprocess the matched files in parallel
            par = partial(_make_processed_dataset,
                          preprocess=preprocess,
                          root_dir=root_dir,
                          sub_dir_in_processed=sub_dir_in_processed,
                          Tx_locations=Tx_locations,
                          Rx_locations=Rx_locations,
                          nCx=nCx, nCy=nCy)
            pool = mp.Pool(processes=mp.cpu_count(), maxtasksperchild=1)
            for data in tqdm(
                    pool.imap_unordered(par, files),
                    desc=f'Preprocess data and save to {sub_dir_in_processed}',
                    total=len(files)):
                pass
            pool.close()
            pool.join()
    # show information about the input / target tensor shapes
    try:
        print("The shape of resistance (shape of NN input data): "
              + f"{data['resistance'].shape}")
        print("The shape of resistivity (shape of NN target data): "
              + f"{data['resistivity_log10'].shape}")
        print(
            "IF YOU WANT TO GET THE RAW resistivity_log10, YOU SHOULD USE"
            + " `raw_resistivity_log10 = np.flipud(resistivity_log10).flatten()`"
        )
    except NameError:
        pass  # no pickle files were processed, so `data` was never bound
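# Hedged usage sketch; the YAML file name is an assumption:
#
#     if __name__ == '__main__':
#         make_processed_dataset('config/preprocess.yml')

# Toy check of the round trip suggested in the message above, assuming
# to_section reshapes the flattened mesh vector into an (nCy, nCx) image
# with np.flipud applied (an assumption about to_section's behavior, not a
# documented contract).
def _example_section_round_trip():
    nCx, nCy = 4, 3
    raw = np.arange(nCx * nCy, dtype=float)      # flattened mesh vector
    section = np.flipud(raw.reshape(nCy, nCx))   # assumed to_section behavior
    recovered = np.flipud(section).flatten()
    assert np.array_equal(recovered, raw)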