Example 1
def _make_dataset(zip_item, simulator, dir_name):
    """Protected function for parallel generate dataset.

    Generate noise-free synthetic data and save it as pickle.

    Parameters
    ----------
    zip_item : zip object
        A (resistivity, suffix_num) pair.
    simulator : Simulator
        Simulator used to compute the synthetic data.
    dir_name : str or pathlib.Path
        Directory in which the pickle file is saved.

    Returns
    -------
    None
    """
    resistivity, suffix_num = zip_item
    # stop printing messages
    with contextlib.redirect_stdout(None):
        data_synthetic = simulator.make_synthetic_data(resistivity, std=0)
    # pickle dump/load is faster than numpy savez_compressed (or save)/load
    pkl_name = os.path.join(dir_name, f'raw_data_{suffix_num:0>6}.pkl')
    write_pkl(
        {
            'resistance': data_synthetic,
            'resistivity_log10': np.log10(resistivity)
        }, pkl_name)
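These snippets lean on read_pkl/write_pkl helpers that are not shown here; a minimal sketch consistent with how they are called (plain pickle wrappers, per the comment above):

import pickle

def read_pkl(pkl_name):
    """Load a pickled object from pkl_name."""
    with open(pkl_name, 'rb') as f:
        return pickle.load(f)

def write_pkl(obj, pkl_name):
    """Pickle obj to pkl_name using the highest available protocol."""
    with open(pkl_name, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)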
Example 2
def _make_processed_dataset(filename, preprocess, root_dir,
                            sub_dir_in_processed, Tx_locations, Rx_locations,
                            nCx, nCy):
    pkl_name = os.path.join(root_dir, filename)
    data = read_pkl(pkl_name)
    # check that data is a dict containing the "resistance" and
    # "resistivity_log10" keys
    if (not isinstance(data, dict) or data.get('resistance') is None
            or data.get('resistivity_log10') is None):
        raise ValueError(
            'data is not a dict or is missing the "resistance"/'
            '"resistivity_log10" keys')

    # preprocess (add_noise and log_transform appear to mutate the arrays in place)
    for k, v in preprocess.items():
        if k == 'add_noise' and v.get('perform'):
            add_noise(data['resistance'], **v.get('kwargs'))
        elif k == 'log_transform' and v.get('perform'):
            log_transform(data['resistance'], **v.get('kwargs'))
        elif k == 'to_midpoint' and v.get('perform'):
            data['resistance'] = to_midpoint(data['resistance'], Tx_locations,
                                             Rx_locations)
        elif k == 'to_txrx' and v.get('perform'):
            data['resistance'] = to_txrx(data['resistance'], Tx_locations,
                                         Rx_locations)
        elif k == 'to_section' and v.get('perform'):
            data['resistivity_log10'] = to_section(data['resistivity_log10'],
                                                   nCx, nCy)

    # save pickle in processed dir
    new_pkl_name = os.path.join(sub_dir_in_processed,
                                re.sub(r'raw', r'processed', filename))
    write_pkl(data, new_pkl_name)
    return data
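The structure of the preprocess mapping is implied by the branches above; a hypothetical configuration (the step names are real, the empty kwargs are placeholders whose real options depend on add_noise/log_transform):

preprocess = {
    'add_noise':     {'perform': True, 'kwargs': {}},
    'log_transform': {'perform': True, 'kwargs': {}},
    'to_midpoint':   {'perform': True},
    'to_txrx':       {'perform': False},
    'to_section':    {'perform': True},
}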
Example 3
def _forward_simulation(pkl_name, simulator):
    data = read_pkl(pkl_name)
    resistivity = np.flipud(np.power(
        10, data['predicted_resistivity_log10'])).flatten()
    # stop printing messages
    with contextlib.redirect_stdout(None):
        data['predicted_resistance'] = simulator.make_synthetic_data(
            resistivity, std=0, force=True)
    write_pkl(data, pkl_name)
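The flipud/power-of-10 pairing above inverts the section layout produced by to_section (see the note at the end of Example 9). A self-contained numpy round trip with a hypothetical 3x4 mesh:

import numpy as np

nCy, nCx = 3, 4  # hypothetical mesh size
section_log10 = np.arange(nCy * nCx, dtype=float).reshape(nCy, nCx)
# section -> flat linear-resistivity vector, as in _forward_simulation
flat_resistivity = np.power(10, np.flipud(section_log10)).flatten()
# flat vector -> section, recovering the original array
recovered = np.flipud(np.log10(flat_resistivity).reshape(nCy, nCx))
assert np.allclose(recovered, section_log10)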
Example 4
def _process_resistivity(filename, save_resistivity_dir, processes, to_float32,
                         nCx, nCy):
    raw_resistivity = read_pkl(filename)
    pkl_name = os.path.basename(filename)
    save_resistivity_pkl = os.path.join(save_resistivity_dir, pkl_name)
    for process, kwargs in processes.items():
        if process == 'to_section':
            raw_resistivity = to_section(raw_resistivity, nCx, nCy)
    if to_float32:
        raw_resistivity = raw_resistivity.astype('float32')
    write_pkl(raw_resistivity, save_resistivity_pkl)
Example 5
def _process_resistance(filename, save_resistance_dir, processes, to_float32,
                        Tx_locations, Rx_locations, nCx, nCy):
    raw_resistance = read_pkl(filename)
    pkl_name = os.path.basename(filename)
    save_resistance_pkl = os.path.join(save_resistance_dir, pkl_name)
    for process, kwargs in processes.items():
        if process == 'add_noise':
            add_noise(raw_resistance, **kwargs)
        elif process == 'log_transform':
            log_transform(raw_resistance, **kwargs)
        elif process == 'to_midpoint':
            raw_resistance = to_midpoint(raw_resistance, Tx_locations,
                                         Rx_locations)
        elif process == 'to_txrx':
            raw_resistance = to_txrx(raw_resistance, Tx_locations,
                                     Rx_locations)
    if to_float32:
        raw_resistance = raw_resistance.astype('float32')
    write_pkl(raw_resistance, save_resistance_pkl)
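Neither worker is called in this snippet; a hypothetical fan-out following the partial + Pool pattern used elsewhere in these examples (resistance_pkl_list and the bound arguments are assumed to exist):

from functools import partial
import multiprocessing as mp

par = partial(_process_resistance,
              save_resistance_dir=save_resistance_dir,
              processes=processes,
              to_float32=True,
              Tx_locations=Tx_locations,
              Rx_locations=Rx_locations,
              nCx=nCx, nCy=nCy)
with mp.Pool(processes=mp.cpu_count(), maxtasksperchild=1) as pool:
    pool.map(par, resistance_pkl_list)  # list of raw resistance pickle paths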
Example 6
dataset_train = list_dataset_training.map(
    par, num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(
        batch_size).prefetch(2)
# read data => take mini-batch => prefetch
dataset_valid = list_dataset_validation.map(par).batch(batch_size).prefetch(2)

# training
if os.path.isfile(pre_trained_weight_h5):
    model.load_weights(pre_trained_weight_h5)
original_weights = model.get_weights()
history = model.fit(dataset_train,
                    validation_data=dataset_valid,
                    epochs=epochs,
                    callbacks=callbacks,
                    workers=os.cpu_count())

# check weights
weights = model.get_weights()
if all(np.array_equal(w, ow) for w, ow in zip(weights, original_weights)):
    print('Weights in the template model have not changed')
else:
    print('Weights in the template model have changed')

# save weights
os.makedirs(save_weights_dir, exist_ok=True)
model.save_weights(trained_weight_h5)

# save simulator
save_simulator_pkl = os.path.join(save_model_dir, 'simulator.pkl')
simulator.config['training'] = config
write_pkl(simulator, save_simulator_pkl)
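tf_read_dataset itself is not shown in these examples; one plausible shape, assuming each dataset element is a path to a processed pickle holding both arrays (as produced in Example 2), is to wrap the pickle read in tf.py_function so tf.data can schedule it:

import pickle
import tensorflow as tf

def _load_pair(path):
    # runs eagerly inside tf.py_function, so .numpy() is available
    with open(path.numpy().decode(), 'rb') as f:
        d = pickle.load(f)
    return (d['resistance'].astype('float32'),
            d['resistivity_log10'].astype('float32'))

def tf_read_dataset(path, read_dataset_info=None):
    # sketch only: read_dataset_info is accepted to match the partial binding
    x, y = tf.py_function(_load_pair, [path], [tf.float32, tf.float32])
    return x, y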
Example 7
def make_dataset(config_file):
    """Generate raw dataset and save it as pickle.

    Parameters
    ----------
    config_file : str, pathlib.Path or dict
        Path to a YAML configuration file, or a configuration dictionary.

    Returns
    -------
    None

    References
    ----------
    https://codewithoutrules.com/2018/09/04/python-multiprocessing/
    https://zhuanlan.zhihu.com/p/75207672
    """
    # parse config
    config = read_config_file(config_file)
    save_dataset_dir = config['save_dataset_dir']
    os.makedirs(save_dataset_dir, exist_ok=True)
    save_simulator_pkl = os.path.join(save_dataset_dir, 'simulator.pkl')
    train_dir = os.path.join(save_dataset_dir, 'training')
    valid_dir = os.path.join(save_dataset_dir, 'validation')
    test_dir = os.path.join(save_dataset_dir, 'testing')
    num_examples_train = int(config['num_examples'] * config['train_ratio'])
    num_examples_valid = int(config['num_examples'] *
                             (config['train_ratio'] + config['valid_ratio']) -
                             num_examples_train)
    num_examples_test = config[
        'num_examples'] - num_examples_train - num_examples_valid
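    # e.g. num_examples=1000, train_ratio=0.8, valid_ratio=0.1 gives an
    # 800/100/100 split; deriving valid from the cumulative ratio and test
    # from the remainder guarantees the three counts sum to num_examples.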

    simulator = Simulator(config)
    # TODO: resolve this warning
    # When reading the pickle file in ipython, we receive the following warning
    # RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility.
    # Expected 192 from C header, got 216 from PyObject
    write_pkl(simulator, save_simulator_pkl)
    for dir_name, num_examples in ((train_dir, num_examples_train),
                                   (valid_dir, num_examples_valid),
                                   (test_dir, num_examples_test)):
        if num_examples == 0:
            continue
        os.makedirs(dir_name, exist_ok=True)
        suffix_num = next_path(os.path.join(dir_name, 'raw_data_%s.pkl'),
                               only_num=True)

        par = partial(_make_dataset,
                      simulator=simulator,
                      dir_name=dir_name)
        resistivity_generator = simulator.get_random_resistivity_generator(
            num_examples=num_examples)
        suffix_generator = iter(range(suffix_num, suffix_num + num_examples))
        # the "fork" start method freezes the process here, so use "spawn"
        pool = mp.get_context('spawn').Pool(processes=mp.cpu_count(),
                                            maxtasksperchild=1)
        for _ in tqdm(pool.imap_unordered(
                par, zip(resistivity_generator, suffix_generator)),
                      desc=f'Generate {os.path.basename(dir_name)} data',
                      total=num_examples):
            pass
        pool.close()
        pool.join()
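Since make_dataset uses the "spawn" start method, each worker re-imports the main module; a hypothetical entry point therefore needs the usual guard:

if __name__ == '__main__':
    # 'config.yml' is a placeholder path for the YAML configuration
    make_dataset('config.yml')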
Example 8

# use partial to bind read_dataset_info to the mapping function
par = partial(tf_read_dataset, read_dataset_info=read_dataset_info)
# read data => take mini-batch => prefetch
dataset_test = list_dataset_testing.map(par).batch(1).prefetch(8)

# Prediction
print('\nPredict.')
predict = model.predict(dataset_test, verbose=True)

# Save
os.makedirs(save_predictions_dir, exist_ok=True)
for i, dataset_testing_targets in tqdm(enumerate(
        list_dataset_testing_targets.as_numpy_iterator()),
                                       desc="write pkl"):
    raw_resistance = read_pkl(raw_resistance_pkl_list[i])
    resistivity_log10 = read_pkl(dataset_testing_targets)
    data = {
        "synthetic_resistance": raw_resistance,
        "synthetic_resistivity_log10":
            resistivity_log10.reshape(output_shape[0:2]),
        "predicted_resistivity_log10": predict[i].reshape(output_shape[0:2])
    }
    filename = re.findall(r'\d+\.pkl', testing_resistance_pkl_list[i])[0]
    write_pkl(data, os.path.join(save_predictions_dir, filename))

# save simulator
simulator.config['testing'] = config
write_pkl(simulator, simulator_pkl)
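A hypothetical follow-up step: push every saved prediction through Example 3's _forward_simulation so each pickle also gains a 'predicted_resistance' entry:

from glob import glob

for pkl_name in sorted(glob(os.path.join(save_predictions_dir, '*.pkl'))):
    _forward_simulation(pkl_name, simulator)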
Example 9
def make_processed_dataset(config_file):
    """
    Preprocess raw dataset and save it to processed directory.

    Parameters
    ----------
    config_file : str, pathlib.Path or dict
        Path to a YAML configuration file, or a configuration dictionary.

    Returns
    -------
    None
    """

    config = read_config_file(config_file)
    raw_data_dir = config['raw_data_dir']
    save_processed_data_dir = config['save_processed_data_dir']
    preprocess = config['preprocess']
    simulator_pkl = os.path.join(raw_data_dir, 'simulator.pkl')
    save_simulator_pkl = os.path.join(save_processed_data_dir, 'simulator.pkl')
    do_preprocess = any(value['perform'] for value in preprocess.values())

    simulator = read_pkl(simulator_pkl)
    # read nCx and nCy
    nCx = simulator.mesh.nCx  # number of mesh cells in the x direction
    nCy = simulator.mesh.nCy  # number of mesh cells in the z (y) direction
    # read Tx_locations and Rx_locations
    Tx_locations = simulator.urf.abmn_locations[:, :4]
    Rx_locations = simulator.urf.abmn_locations[:, 4:]
    # expand simulator.config and save it
    simulator.config = {
        'generate': simulator.config,  # config for generate data
        'preprocess': config  # config for preprocess data
    }
    os.makedirs(save_processed_data_dir, exist_ok=True)
    write_pkl(simulator, save_simulator_pkl)

    if do_preprocess:
        pattern_raw_pkl = re.compile(r'raw_data_\d{6}\.pkl')

        for root_dir, sub_dirs, files in os.walk(raw_data_dir):
            # keep only the pickle files whose names match the pattern
            files = list(filter(pattern_raw_pkl.match, files))
            # skip directories that contain no matching pickle files
            if not files:
                continue
            # create the corresponding sub directory in the processed tree
            # (str.replace instead of re.sub: the path may contain regex
            # metacharacters)
            sub_dir_in_processed = root_dir.replace(raw_data_dir,
                                                    save_processed_data_dir)
            os.makedirs(sub_dir_in_processed, exist_ok=True)

            # Parallel version!
            par = partial(_make_processed_dataset,
                          preprocess=preprocess,
                          root_dir=root_dir,
                          sub_dir_in_processed=sub_dir_in_processed,
                          Tx_locations=Tx_locations,
                          Rx_locations=Rx_locations,
                          nCx=nCx,
                          nCy=nCy)
            pool = mp.Pool(processes=mp.cpu_count(), maxtasksperchild=1)
            for data in tqdm(
                    pool.imap_unordered(par, files),
                    desc=f'Preprocess data and save to {sub_dir_in_processed}',
                    total=len(files)):
                pass
            pool.close()
            pool.join()

            # A serial version of this loop would simply call
            # _make_processed_dataset(filename, ...) directly for each file.

        # show information about the input / target tensor shapes
        try:
            print("The shape of resistance (shape of NN input data): "
                  f"{data['resistance'].shape}")
            print("The shape of resistivity (shape of NN target data): "
                  f"{data['resistivity_log10'].shape}")
            print("To recover the raw resistivity_log10, use "
                  "`raw_resistivity_log10 = np.flipud(resistivity_log10).flatten()`")
        except NameError:
            pass  # no pickle files were processed