Esempio n. 1
0
    def __init__(self, config_file):
        """Set up the forward/inversion workspace from a configuration file.

        Parameters
        ----------
        config_file : str, pathlib.Path or dict
            Path to a yaml configuration file or an already-parsed
            configuration dict; it is handed to ``read_config_file`` and
            must provide at least the 'geometry_urf' entry used below.
        """
        # Heuristic ratios used later when deriving core_z_length.
        self._LINE_LENGTH_RATIO = 0.2  # for determining core_z_length
        self._DELTA_TRN_RATIO = 1.75  # for determining core_z_length
        # Lazily-populated state; filled in by the helper calls at the end.
        self._survey = None
        self._topo = None
        self._mesh = None
        self._active_idx = None
        self._problem = None
        self.config = read_config_file(config_file)

        # suitable for 2D surface survey now...
        # read urf file: electrode geometry for a 2D dipole-dipole
        # half-space survey
        self.urf = URF(self.config['geometry_urf'],
                       survey_type='dipole-dipole',
                       dimension=2,
                       space_type='half-space')

        # generate an instance of IO
        self._IO = Static.DC.IO()

        # NOTE(review): these helpers look order-dependent (survey before
        # mapping before problem) — confirm before reordering.
        self._prepare()
        self._get_unpaired_survey()
        self._get_mapping()
        self._get_problem()
Esempio n. 2
0
import os

import numpy as np
import tensorflow as tf

from erinn.metrics import r_squared
from erinn.tf_dataset import tf_read_dataset
from erinn.utils.io_utils import get_pkl_list
from erinn.utils.io_utils import read_config_file
from erinn.utils.io_utils import read_pkl
from erinn.utils.io_utils import write_pkl

FILEDIR = os.path.dirname(__file__)

# Load the training configuration that ships with the repository.
config_file = os.path.join(FILEDIR, '..', '..', 'config', 'for_training.yml')
config = read_config_file(config_file)

# Pull out the settings used below.
custom_NN = config['custom_NN']
dataset_rootdir = config['dataset_rootdir']
resistance_dirname = config['resistance_dirname']
resistivity_dirname = config['resistivity_dirname']

# Directory layout: <root>/{training,validation}/{resistance,resistivity}/<name>
training_dir = os.path.join(dataset_rootdir, 'training')
validation_dir = os.path.join(dataset_rootdir, 'validation')
training_resistance_dir = os.path.join(
    training_dir, 'resistance', resistance_dirname)
training_resistivity_dir = os.path.join(
    training_dir, 'resistivity', resistivity_dirname)
validation_resistance_dir = os.path.join(
    validation_dir, 'resistance', resistance_dirname)
validation_resistivity_dir = os.path.join(
    validation_dir, 'resistivity', resistivity_dirname)
simulator_pkl = os.path.join(dataset_rootdir, 'simulator.pkl')
Esempio n. 3
0
def _describe_resistance_processes(processes):
    """Build the save-directory name describing one resistance pipeline."""
    descriptions = []
    for process, kwargs in processes.items():
        if process == 'add_noise':
            descriptions.append('[' + '_'.join([
                f"{int(kwargs['scale']*100):0>3}%",
                kwargs['noise_type'], 'noise'
            ]) + ']')
        elif process == 'log_transform':
            descriptions.append('[log_transform]')
        elif process == 'to_midpoint':
            descriptions.append('[midpoint]')
        elif process == 'to_txrx':
            descriptions.append('[txrx]')
    return '_'.join(descriptions)


def _describe_resistivity_processes(processes):
    """Build the save-directory name describing one resistivity pipeline."""
    descriptions = []
    for process, kwargs in processes.items():
        if process == 'to_section':
            descriptions.append('[section]')
    return '_'.join(descriptions)


def _run_pool(par, pkl_list, save_dir):
    """Map `par` over `pkl_list` with a worker pool, showing progress."""
    pool = mp.Pool(processes=mp.cpu_count(), maxtasksperchild=1)
    for _ in tqdm(pool.imap_unordered(par, pkl_list),
                  desc=f'Preprocess data and save to {save_dir}',
                  total=len(pkl_list)):
        pass
    pool.close()
    pool.join()


def make_processed_dataset(config_file):
    """
    Preprocess raw dataset and save it to processed directory.

    Parameters
    ----------
    config_file : str, pathlib.Path or dict
        The path to the configured yaml file or the dictionary for
        configuration.

    Returns
    -------
    None
    """
    config = read_config_file(config_file)
    dataset_dir = config['dataset_dir']
    to_float32 = config['save_as_float32']
    preprocess_resistance = config['preprocess']['resistance']
    preprocess_resistivity = config['preprocess']['resistivity']
    simulator_pkl = os.path.join(dataset_dir, 'simulator.pkl')

    simulator = read_pkl(simulator_pkl)
    # read nCx and nCy
    nCx = simulator.mesh.nCx  # number of cell center mesh in the x direction
    nCy = simulator.mesh.nCy  # number of cell center mesh in the z (y) direction
    # read Tx_locations and Rx_locations (first/last 4 columns of abmn)
    Tx_locations = simulator.urf.abmn_locations[:, :4]
    Rx_locations = simulator.urf.abmn_locations[:, 4:]

    for sub_dir in ('training', 'validation', 'testing'):
        resistance_dir = os.path.join(dataset_dir, sub_dir, 'resistance')
        resistivity_dir = os.path.join(dataset_dir, sub_dir, 'resistivity')
        raw_resistance_list = get_pkl_list(os.path.join(resistance_dir, 'raw'))
        raw_resistivity_list = get_pkl_list(
            os.path.join(resistivity_dir, 'raw'))

        # create one output directory per configured pipeline, keeping the
        # directories aligned (by order) with the pipelines themselves
        save_resistance_dir_list = []
        for processes in preprocess_resistance.values():
            save_dir = os.path.join(
                resistance_dir, _describe_resistance_processes(processes))
            os.makedirs(save_dir, exist_ok=True)
            save_resistance_dir_list.append(save_dir)

        save_resistivity_dir_list = []
        for processes in preprocess_resistivity.values():
            save_dir = os.path.join(
                resistivity_dir, _describe_resistivity_processes(processes))
            os.makedirs(save_dir, exist_ok=True)
            save_resistivity_dir_list.append(save_dir)

        # preprocess resistance (parallel over raw pickle files)
        for processes, save_resistance_dir in zip(
                preprocess_resistance.values(), save_resistance_dir_list):
            par = partial(_process_resistance,
                          save_resistance_dir=save_resistance_dir,
                          processes=processes,
                          to_float32=to_float32,
                          Tx_locations=Tx_locations,
                          Rx_locations=Rx_locations,
                          nCx=nCx,
                          nCy=nCy)
            _run_pool(par, raw_resistance_list, save_resistance_dir)

        # preprocess resistivity (parallel over raw pickle files)
        for processes, save_resistivity_dir in zip(
                preprocess_resistivity.values(), save_resistivity_dir_list):
            par = partial(_process_resistivity,
                          save_resistivity_dir=save_resistivity_dir,
                          processes=processes,
                          to_float32=to_float32,
                          nCx=nCx,
                          nCy=nCy)
            _run_pool(par, raw_resistivity_list, save_resistivity_dir)
    print("IF YOU WANT TO GET THE RAW resistivity_log10, YOU SHOULD USE" +
          " `raw_resistivity_log10 = np.flipud(resistivity_log10).flatten()`")
Esempio n. 4
0
def make_dataset(config_file):
    """Generate raw dataset and save it as pickle.

    Parameters
    ----------
    config_file : str, pathlib.Path or dict
        Path to a yaml file for configuration or a dictionary for
        configuration.

    Returns
    -------
    None

    References
    ----------
    https://codewithoutrules.com/2018/09/04/python-multiprocessing/
    https://zhuanlan.zhihu.com/p/75207672
    """
    # parse config
    config = read_config_file(config_file)
    save_dataset_dir = config['save_dataset_dir']
    os.makedirs(save_dataset_dir, exist_ok=True)
    save_simulator_pkl = os.path.join(save_dataset_dir, 'simulator.pkl')
    train_dir = os.path.join(save_dataset_dir, 'training')
    valid_dir = os.path.join(save_dataset_dir, 'validation')
    test_dir = os.path.join(save_dataset_dir, 'testing')
    # split num_examples with cumulative ratios so rounding never loses
    # an example: test gets whatever remains after train and valid
    num_examples_train = int(config['num_examples'] * config['train_ratio'])
    num_examples_valid = int(config['num_examples'] *
                             (config['train_ratio'] + config['valid_ratio']) -
                             num_examples_train)
    num_examples_test = (config['num_examples'] -
                         num_examples_train - num_examples_valid)

    simulator = Simulator(config)
    # TODO: resolve this warning
    # When reading the pickle file in ipython, we receive the following warning
    # RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility.
    # Expected 192 from C header, got 216 from PyObject
    write_pkl(simulator, save_simulator_pkl)
    for dir_name, num_examples in ((train_dir, num_examples_train),
                                   (valid_dir, num_examples_valid),
                                   (test_dir, num_examples_test)):
        # guard clause: nothing to generate for this split
        if num_examples == 0:
            continue
        os.makedirs(dir_name, exist_ok=True)
        # continue numbering after any raw_data_*.pkl already present
        suffix_num = next_path(os.path.join(dir_name, 'raw_data_%s.pkl'),
                               only_num=True)

        par = partial(_make_dataset,
                      simulator=simulator,
                      dir_name=dir_name)
        resistivity_generator = simulator.get_random_resistivity_generator(
            num_examples=num_examples)
        suffix_generator = iter(range(suffix_num, suffix_num + num_examples))
        # use "fork" will freeze the process, so spawn fresh interpreters
        pool = mp.get_context('spawn').Pool(processes=mp.cpu_count(),
                                            maxtasksperchild=1)
        for _ in tqdm(pool.imap_unordered(
                par, zip(resistivity_generator, suffix_generator)),
                      desc=f'Generate {os.path.basename(dir_name)} data',
                      total=num_examples):
            pass
        pool.close()
        pool.join()
Esempio n. 5
0
def _get_shape_pd(config, name):
    """Build the resistivity-value distribution for one shape kind.

    `name` is 'background', 'rect' or 'circle'; every hyper-parameter is
    read from the correspondingly suffixed config key (e.g. 'pdf_rect').
    """
    return get_pd(use_hidden=config[f'use_hidden_{name}'],
                  pdf=config[f'pdf_{name}'],
                  scale=config[f'scale_{name}'],
                  a=config[f'a_{name}'],
                  b=config[f'b_{name}'],
                  hidden_for_a=(config[f'hidden_a_for_a_{name}'],
                                config[f'hidden_b_for_a_{name}']),
                  hidden_for_b=(config[f'hidden_a_for_b_{name}'],
                                config[f'hidden_b_for_b_{name}']),
                  hidden_pdf=config[f'hidden_pdf_{name}'])


def get_random_model(config_file, mesh, num_examples=None):
    """Yield random smoothed resistivity models defined on `mesh`.

    Parameters
    ----------
    config_file : str, pathlib.Path or dict
        Configuration source understood by ``read_config_file``.
    mesh : discretize mesh
        2D mesh supplying cell counts (nC, nCx, nCy) and node vectors.
    num_examples : int, optional
        Number of models to yield; defaults to config['num_examples'].

    Yields
    ------
    numpy.ndarray
        Flattened (nz * nx,) resistivity model.  The resistivity starts
        at the bottom left of the SimPEG 2d mesh.
    """
    config = read_config_file(config_file)
    x_bound = [np.nanmin(mesh.vectorNx), np.nanmax(mesh.vectorNx)]
    z_bound = [np.nanmin(mesh.vectorNy), np.nanmax(mesh.vectorNy)]
    kernel_shape = (config['z_kernel_size'], config['x_kernel_size'])
    if num_examples is None:
        num_examples = config['num_examples']

    # create the resistivity "value" probability distributions, one per
    # shape kind (background / rectangle(block) / circle)
    pd_background = _get_shape_pd(config, 'background')
    pd_rect = _get_shape_pd(config, 'rect')
    pd_circle = _get_shape_pd(config, 'circle')

    background_size = (mesh.nC, )  # one value per mesh cell

    for _ in range(num_examples):
        resistivity = get_rvs(use_hidden=config['use_hidden_background'],
                              scale=config['scale_background'],
                              pd=pd_background,
                              size=background_size)

        # generate parameters for rectangles and circles, then paint them
        # onto the background in a random order
        num_rect = rand_num_shape(config['num_rect'])
        num_circle = rand_num_shape(config['num_circle'])

        stack = rand_rect(x_bound, z_bound, config['h_range'],
                          config['w_range'], mesh, num_rect)
        stack.extend(
            rand_circle(x_bound, z_bound, config['radius_bound'], mesh,
                        num_circle))
        np.random.shuffle(stack)

        for _ in range(num_rect + num_circle):
            elem = stack.pop()  # (cell-index mask, shape tag)
            shape_size = resistivity[elem[0]].shape
            if elem[1] == 'rect':
                resistivity[elem[0]] = get_rvs(
                    use_hidden=config['use_hidden_rect'],
                    scale=config['scale_rect'],
                    pd=pd_rect,
                    size=shape_size)
            elif elem[1] == 'circle':
                resistivity[elem[0]] = get_rvs(
                    use_hidden=config['use_hidden_circle'],
                    scale=config['scale_circle'],
                    pd=pd_circle,
                    size=shape_size)
            else:
                raise NotImplementedError()

        resistivity = smooth2d(resistivity.reshape(mesh.nCy, mesh.nCx),
                               kernel_shape)  # shape is (nz, nx)
        # The resistivity starts at the bottom left of the SimPEG 2d mesh.
        yield resistivity.flatten()
Esempio n. 6
0
def make_processed_dataset(config_file):
    """
    Preprocess raw dataset and save it to processed directory.

    Parameters
    ----------
    config_file : str, pathlib.Path or dict
        The path to the configured yaml file or the dictionary for
        configuration.

    Returns
    -------
    None
    """
    config = read_config_file(config_file)
    raw_data_dir = config['raw_data_dir']
    save_processed_data_dir = config['save_processed_data_dir']
    preprocess = config['preprocess']
    simulator_pkl = os.path.join(raw_data_dir, 'simulator.pkl')
    save_simulator_pkl = os.path.join(save_processed_data_dir,
                                      'simulator.pkl')
    # run the walk only if at least one preprocess action is enabled
    do_preprocess = any(value['perform']
                        for action, value in preprocess.items())

    simulator = read_pkl(simulator_pkl)
    # read nCx and nCy
    nCx = simulator.mesh.nCx  # number of cell center mesh in the x direction
    nCy = simulator.mesh.nCy  # number of cell center mesh in the z (y) direction
    # read Tx_locations and Rx_locations (first/last 4 columns of abmn)
    Tx_locations = simulator.urf.abmn_locations[:, :4]
    Rx_locations = simulator.urf.abmn_locations[:, 4:]
    # expand simulator.config and save it
    simulator.config = {
        'generate': simulator.config,  # config for generate data
        'preprocess': config  # config for preprocess data
    }
    os.makedirs(save_processed_data_dir, exist_ok=True)
    write_pkl(simulator, save_simulator_pkl)

    if do_preprocess:
        # raw string with escaped dot: match exactly raw_data_NNNNNN.pkl
        pattern_raw_pkl = re.compile(r'raw_data_\d{6}\.pkl')

        for root_dir, sub_dirs, files in os.walk(raw_data_dir):
            # keep only the pickle files that match the pattern
            files = [f for f in files if pattern_raw_pkl.match(f)]
            # nothing to do in this directory
            if not files:
                continue
            # mirror the sub directory under the processed tree; plain
            # string replacement (not re.sub) so regex metacharacters in
            # the path cannot corrupt the result
            sub_dir_in_processed = root_dir.replace(
                raw_data_dir, save_processed_data_dir, 1)
            os.makedirs(sub_dir_in_processed, exist_ok=True)

            par = partial(_make_processed_dataset,
                          preprocess=preprocess,
                          root_dir=root_dir,
                          sub_dir_in_processed=sub_dir_in_processed,
                          Tx_locations=Tx_locations,
                          Rx_locations=Rx_locations,
                          nCx=nCx,
                          nCy=nCy)
            pool = mp.Pool(processes=mp.cpu_count(), maxtasksperchild=1)
            # keep the last returned item: its shape is reported below
            for data in tqdm(
                    pool.imap_unordered(par, files),
                    desc=f'Preprocess data and save to {sub_dir_in_processed}',
                    total=len(files)):
                pass
            pool.close()
            pool.join()

        # show information about input / target tensor shape; `data` is
        # only bound if at least one pickle file was processed, hence the
        # NameError guard
        try:
            print("The shape of resistance (shape of NN input data): " +
                  f"{data['resistance'].shape}")
            print("The shape of resistivity (shape of NN target data): " +
                  f"{data['resistivity_log10'].shape}")
            print(
                "IF YOU WANT TO GET THE RAW resistivity_log10, YOU SHOULD USE"
                +
                " `raw_resistivity_log10 = np.flipud(resistivity_log10).flatten()`"
            )
        except NameError:
            pass  # no pickle files