Example #1
def _print_info_helper(key, io):
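    """
    Print an overview for correlation, stack or stretching results: for each
    group, the number of channel combinations and, except for stretching
    results, the number of correlations.
    """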
    print2 = _get_print2()
    is_stretch = key == 'tstretch'
    fname = _get_fname(io, key)
    keys = _get_existent(fname, '/', 1)  # 1, 3, 4
    if len(keys) == 0:
        print2('None')
    for key in sorted(keys):
        keys2 = _get_existent(fname, key, 3)
        subkey = key.split('/')[-1]
        if is_stretch:
            o = '%s: %d combs' % (subkey, len(keys2))
        else:
            keys3 = _get_existent(fname, key, 4)
            o = ('%s: %d combs, %d corrs' %
                 (subkey, len(keys2), len(keys3)))
        print2(o)
Example #2
def start_correlate(io,
                    filter_inventory=None,
                    startdate='1990-01-01', enddate='2020-01-01',
                    njobs=None,
                    parallel_inner_loop=False,
                    keep_correlations=False,
                    stack='1d',
                    dataset_kwargs=None,
                    **kwargs):
    """
    Start correlation

    :param io: |io|
    :param filter_inventory: filter inventory with its select method,
        specified dict is passed to |Inventory.select|
    :param str startdate,enddate: start and end date as strings
    :param njobs: number of cores to use for the computation; days are
        processed in parallel, which may consume a lot of memory,
        default: None -- use all available cores
    :param parallel_inner_loop: run the inner loops in parallel instead of the
        outer loop (preprocessing of different stations and correlation of
        different pairs versus processing of different days).
        Useful for a dataset with many stations.
    :param dtype: data type for storing correlations
        (default: float16 - half precision)
    :param dataset_kwargs: options passed to obspyh5 and h5py when creating
        a new dataset,
        e.g. `dataset_kwargs={'compression': 'gzip'}`.
        See `create_dataset` in h5py for more options.
        By default the dtype is set to `'float16'`.
    :param keep_correlations,stack,\*\*kwargs: all other kwargs are passed to
        the `~yam.correlate.correlate()` function
    """
    if dataset_kwargs is None:
        dataset_kwargs = {}
    if filter_inventory:
        log.debug('filter inventory')
        io['inventory'] = io['inventory'].select(**filter_inventory)
    log.info('start preprocessing and correlation')
    tasks = [str(t)[:10] for t in IterTime(UTC(startdate), UTC(enddate))]
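    # tasks are days as YYYY-MM-DD strings; done_tasks collects the days that
    # already have results (date sliced out of the existing group names)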
    done_tasks = None
    if stack is not None:
        key2 = kwargs['outkey'] + '_s' + stack
        done_tasks = [t[-16:-6] for t in _get_existent(io['stack'], key2, 4)]
    if keep_correlations:
        key2 = kwargs['outkey']
        done_tasks2 = [t[-16:-6] for t in _get_existent(io['corr'], key2, 4)]
        if done_tasks is None:
            done_tasks = done_tasks2
        else:
            done_tasks = [t for t in done_tasks if t in done_tasks2]
    tasks = _todo_tasks(tasks, done_tasks)
    tasks = [UTC(t) for t in tasks]
    kwargs.update({'keep_correlations': keep_correlations, 'stack': stack})

    dir_corr = 'correlation'
    if os.path.exists(dir_corr):
        shutil.rmtree(dir_corr)
    os.makedirs(dir_corr)

    if parallel_inner_loop:
        kwargs['njobs'] = njobs
        njobs = 1
    do_work = functools.partial(correlate, io, **kwargs)
    if njobs == 1:
        log.info('do work sequentially')
        for task in tqdm.tqdm(tasks, total=len(tasks)):
            result = do_work(task)
            # _write_corr(result, io, **dataset_kwargs)
    else:
        pool = multiprocessing.Pool(njobs)
        log.info('do work parallel (%d cores)', pool._processes)
        for result in tqdm.tqdm(pool.imap_unordered(do_work, tasks),
                                total=len(tasks)):
            pass
            # _write_corr(result, io, **dataset_kwargs)
        pool.close()
        pool.join()

    log.info('finished preprocessing and correlation')
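A possible invocation of start_correlate, as a hedged sketch: it assumes io is the dict-like project configuration whose entries are read above ('inventory', 'data', 'corr', 'stack', 'stretch') and that outkey has to be supplied via **kwargs, as the function body requires; the file names, the data expression and the key 'c1' are placeholders.

import obspy

io = {'inventory': obspy.read_inventory('stations.xml'),  # placeholder StationXML file
      'data': 'data/*.mseed',   # placeholder expression for day files
      'corr': 'corr.h5',        # HDF5 file for correlations
      'stack': 'stack.h5',      # HDF5 file for stacks
      'stretch': 'stretch.h5'}  # HDF5 file for stretching results

start_correlate(io,
                filter_inventory={'channel': '*Z'},  # forwarded to Inventory.select
                startdate='2018-01-01', enddate='2018-02-01',
                njobs=4,
                keep_correlations=True,
                stack='1d',
                outkey='c1',                         # read from **kwargs in the body
                dataset_kwargs={'compression': 'gzip'})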
Example #3
def info(io, key=None, subkey='', config=None, **unused_kwargs):
    """
    Print information about yam project

    :param io: |io|
    :param key: key to print info about
        (key inside the HDF5 file, or one of `data`, `stations`,
        default: None -- print overview)
    :param subkey: only print part of the HDF5 file
    :param config: list of configuration dictionaries
    """
    print2 = _get_print2()
    data_plugin = io.get('data_plugin')
    if key is None:
        print('Stations:')
        inventory = io['inventory']
        if inventory is None:
            print2('Not found')
        else:
            stations = inventory.get_contents()['stations']
            channels = inventory.get_contents()['channels']
            print2(' '.join(st.strip().split()[0] for st in stations))
            print2('%d stations, %d channels' % (len(stations), len(channels)))
        if data_plugin:
            print('Data plugin:')
            print2('%s' % data_plugin)
        else:
            print('Raw data (expression for day files):')
            print2(io['data'])
            print2('%d files found' % len(_get_data_files(io['data'])))
        print('Config ids:')

        def get_keys(d):
            if d is None or len(d) == 0:
                return 'None'
            else:
                return ', '.join(sorted(d.keys()))
        print2('c Corr: ' + get_keys(config[0]))
        print2('s Stack: ' + get_keys(config[1]))
        print2('t Stretch: ' + get_keys(config[2]))
        print('Correlations (channel combinations, correlations calculated):')
        _print_info_helper('corr', io)
        print('Stacks:')
        _print_info_helper('stack', io)
        print('Stretching matrices:')
        _print_info_helper('tstretch', io)
    elif key == 'stations':
        print(io['inventory'])
    elif key == 'data':
        if data_plugin:
            print('Data plugin:')
            print2('%s' % data_plugin)
        else:
            print('Raw data (expression for day files):')
            print2(io['data'])
            fnames = _get_data_files(io['data'])
            print2('%d files found' % len(fnames))
            for fname in sorted(fnames):
                print2(fname)
    else:
        is_stretch = 't' in _analyze_key(key)
        fname = _get_fname(io, key)
        level = 3 if is_stretch else 4
        for line in _get_existent(fname, key + subkey, level):
            print2(line)
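A hedged usage sketch for info, reusing the hypothetical io dict from the sketch after Example #2; config is assumed to be a list of three dicts (correlation, stack and stretch configurations), matching how config[0] to config[2] are read above, and the HDF5 key and subkey are placeholders.

config = [{'1': {}}, {'1d': {}}, {'t1': {}}]  # only the keys (config ids) are printed
info(io, config=config)                       # project overview
info(io, key='stations')                      # print the full inventory
info(io, key='c1_s1d', subkey='/CX.PATCX-CX.PB01')  # placeholder key and subkey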
Example #4
def start_stretch(io, key, subkey='', njobs=None, reftrid=None,
                  starttime=None, endtime=None,
                  dataset_kwargs=None,
                  **kwargs):
    """
    Start stretching

    :param io: |io|
    :param key: key to load correlations from
    :param subkey: only use a part of the correlations
    :param njobs: number of cores to use for computation,
        default: None -- use all available cores
    :param reftrid: key to load the reference trace from, e.g. `'c1_s'`;
        it can be created by a command similar to `yam stack c1 ''`.
        Parallel processing is only possible when this parameter is specified.
    :param starttime,endtime: constrain start and end dates
    :param dataset_kwargs: options passed to obspyh5 and h5py when creating
        a new dataset,
        e.g. `dataset_kwargs={'compression': 'gzip'}`.
        See `create_dataset` in h5py for more options.
        By default the dtype is set to `'float16'`.
    :param \*\*kwargs: all other kwargs are passed to
        the `stretch_wrapper()` function
    """
    if dataset_kwargs is None:
        dataset_kwargs = {}
    fname = _get_fname(io, key)
    outkey = kwargs['outkey']
    tasks = _get_existent(fname, key + subkey, 3)
    done_tasks = [t.replace(outkey, key) for t in
                  _get_existent(io['stretch'], outkey + subkey, 3)]
    tasks = _todo_tasks(tasks, done_tasks)
    for task in tqdm.tqdm(tasks, total=len(tasks)):
        if reftrid is None:
            reftr = None
        else:
            fname_reftr = _get_fname(io, reftrid)
            group_reftr = task.replace(key, reftrid)
            reftr = obspy.read(fname_reftr, 'H5', group=group_reftr,
                               dtype=float)
            if len(reftr) != 1:
                raise NotImplementedError('Reference must be single trace')
            reftr = reftr[0]
        subtasks = [t for t in _get_existent(fname, task, 4) if
                    (starttime is None or t[-16:] >= starttime) and
                    (endtime is None or t[-16:] <= endtime)]
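        # without a reference trace all subtasks go into a single chunk (no
        # parallelization); with one, they are split into chunks of 1000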
        if reftr is None:
            subtask_chunks = [tuple(subtasks)]
        else:
            step = 1000
            subtask_chunks = [tuple(subtasks[i:i + step]) for i in
                              range(0, len(subtasks), step)]
        do_work = functools.partial(_stretch_wrapper, fname=fname,
                                    reftr=reftr, **kwargs)
        results = []
        if njobs == 1 or len(subtask_chunks) == 1:
            log.debug('do work sequentially')
            for stask in tqdm.tqdm(subtask_chunks, total=len(subtask_chunks)):
                result = do_work(stask)
                results.append(result)
        else:
            pool = multiprocessing.Pool(njobs)
            log.debug('do work parallel (%d cores)', pool._processes)
            for result in tqdm.tqdm(
                    pool.imap(do_work, subtask_chunks),
                    total=len(subtask_chunks)):
                results.append(result)
            pool.close()
            pool.join()
        result = yam.stretch.join_dicts(results)
        if result is not None:
            write_dict(result, io['stretch'], **dataset_kwargs)
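A hedged usage sketch for start_stretch: outkey has to be supplied via **kwargs (the body reads kwargs['outkey']), the remaining keyword arguments are forwarded to stretch_wrapper(), and all key names below are placeholders.

start_stretch(io, key='c1_s1d',     # placeholder key of stacked correlations
              reftrid='c1_s',       # reference trace key; enables chunked, parallel processing
              njobs=4,
              starttime='2018-01-01', endtime='2018-06-30',
              outkey='t1',          # read from **kwargs in the body
              dataset_kwargs={'compression': 'gzip'})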
Example #5
def start_stack(io, key, outkey, subkey='', njobs=None,
                starttime=None, endtime=None,
                dataset_kwargs=None,
                **kwargs):
    """
    Start stacking

    :param io: |io|
    :param key: key to load correlations from
    :param outkey: key to write stacked correlations to
    :param subkey: only use a part of the correlations
    :param njobs: number of cores to use for computation,
        default: None -- use all available cores
    :param starttime,endtime: constrain start and end dates
    :param dataset_kwargs: options passed to obspyh5 and h5py when creating
        a new dataset,
        e.g. `dataset_kwargs={'compression': 'gzip'}`.
        See `create_dataset` in h5py for more options.
        By default the dtype is set to `'float16'`.
    :param \*\*kwargs: all other kwargs are passed to
        the `yam.stack.stack()` function
    """
    if dataset_kwargs is None:
        dataset_kwargs = {}
    dataset_kwargs.setdefault('dtype', 'float16')
    fname = io['stack'] if 's' in _analyze_key(key) else io['corr']
    tasks = _get_existent(fname, key + subkey, 3)
    done_tasks = [t.replace(outkey, key) for t in
                  _get_existent(io['stack'], outkey + subkey, 3)]
    tasks = _todo_tasks(tasks, done_tasks)
    length = kwargs.get('length')
    for task in tqdm.tqdm(tasks, total=len(tasks)):
        subtasks = [t for t in _get_existent(fname, task, 4) if
                    (starttime is None or t[-16:] >= starttime) and
                    (endtime is None or t[-16:] <= endtime)]
        if length is None and njobs != 1:
            step = 1000
            subtask_chunks = [tuple(subtasks[i:i + step]) for i in
                              range(0, len(subtasks), step)]
        else:
            subtask_chunks = [subtasks]
            # TODO: parallel stacking for arbitrary stack id
#            lensec = _time2sec(length)
#            if lensec >= 30 * 24 * 3600:
#                subtask_chunks = [subtasks]
#            else:
#                subtask_chunks = []
#                for i in range(0, len(subtasks), step):
#                    chunk = subtasks[i:i + step]
#                    t1 = UTC(subtasks[i + step - 1][-16:])
#                    j = 0
#                    while i + step + j < len(subtasks):
#                        t2 = UTC(subtasks[i + step + j][-16:])
#                        # assume lensec is always larger than movesec
#                        # not ideal, may load too much data,
#                        # e.g. for a stack over 1 year
#                        if t2 - t1 <= lensec:
#                            chunk.append(subtasks[i + step + j])
#                        else:
#                            break
#                        j += 1
#                    subtask_chunks.append(chunk)
        do_work = functools.partial(_stack_wrapper, fname=fname, outkey=outkey,
                                    **kwargs)
        results = []
        if njobs == 1 or len(subtask_chunks) == 1:
            log.debug('do work sequentially')
            for stask in tqdm.tqdm(subtask_chunks, total=len(subtask_chunks)):
                result = do_work(stask)
                results.append(result)
        else:
            pool = multiprocessing.Pool(njobs)
            log.debug('do work parallel (%d cores)', pool._processes)
            for result in tqdm.tqdm(
                    pool.imap(do_work, subtask_chunks),
                    total=len(subtask_chunks)):
                results.append(result)
            pool.close()
            pool.join()
        if length is None:
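            # merge the chunk results into a single stacked trace, weighting
            # each chunk by its number of correlations (tr.stats.num)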
            for stream in results:
                assert len(stream) <= 1
            traces = [tr for stream in results for tr in stream]
            num = sum(tr.stats.num for tr in traces)
            data = np.sum([tr.data * (tr.stats.num / num) for tr in traces],
                          axis=0)
            tr_stack = obspy.Trace(data, header=traces[0].stats)
            tr_stack.stats.num = num
            tr_stack.write(io['stack'], 'H5', mode='a', **dataset_kwargs)
        else:
            for stack_stream in results:
                stack_stream.write(io['stack'], 'H5', mode='a',
                                   **dataset_kwargs)
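A hedged usage sketch for start_stack; the key and outkey names are placeholders, and length (like any other extra kwarg) is assumed to be understood by yam.stack.stack(), to which it is forwarded.

# with no 'length' given, all correlations of a combination are averaged into one stack
start_stack(io, key='c1', outkey='c1_s', njobs=4)
# placeholder daily-stack configuration; 'length' reaches yam.stack.stack() via **kwargs
start_stack(io, key='c1', outkey='c1_s1d', njobs=4, length='1d')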