Example #1
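These snippets read like pytest methods excerpted from a larger test module. To run them you would need roughly the imports below; the module path for the streaming-statistics classes is an assumption, not something the excerpt shows:

import collections

import numpy as np

# Assumed location: point this at wherever StreamingStats1D and
# StreamingStats2D actually live in your codebase.
from streaming_stats import StreamingStats1D, StreamingStats2D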
    def test_with_mask_groups(self):
        nbins = 100
        nsets = 10
        data = np.random.normal(size=(nsets, nbins))
        mask = np.random.randint(2, size=data.shape).astype(np.uint8)

        rate_stats1 = StreamingStats1D(nbins)
        # Split the sample sets (rows), not the bins, in half.
        for di, d in enumerate(data[:(nsets // 2)]):
            rate_stats1.update(d, mask[di])

        rate_stats2 = StreamingStats1D(nbins)
        # start= keeps di aligned with the corresponding mask rows.
        for di, d in enumerate(data[(nsets // 2):], start=(nsets // 2)):
            rate_stats2.update(d, mask[di])

        rate_stats3 = rate_stats1 + rate_stats2
        rate_stats1 += rate_stats2

        data_masked = np.ma.array(data, mask=mask)

        assert np.allclose(rate_stats1.mean,
                           data_masked.mean(axis=0).filled(fill_value=0.0))
        assert np.allclose(rate_stats1.var,
                           data_masked.var(axis=0).filled(fill_value=0.0))

        assert np.allclose(rate_stats3.mean,
                           data_masked.mean(axis=0).filled(fill_value=0.0))
        assert np.allclose(rate_stats3.var,
                           data_masked.var(axis=0).filled(fill_value=0.0))
Example #2
    def test_nomask(self):
        nbins = 100
        nsets = 10
        data = np.random.normal(size=(nsets, nbins))
        mask = np.zeros((nbins, ), np.uint8)

        rate_stats = StreamingStats1D(nbins)
        for d in data:
            rate_stats.update(d, mask)

        assert np.allclose(rate_stats.mean, data.mean(axis=0))
        assert np.allclose(rate_stats.var, data.var(axis=0))
Example #3
    def test_nomask_groups(self):
        nbins = 100
        nsets = 10
        data = np.random.normal(size=(nsets, nbins))
        mask = np.zeros((nbins, ), np.uint8)

        rate_stats1 = StreamingStats1D(nbins)
        # Split the sample sets (rows), not the bins, in half.
        for d in data[:(nsets // 2)]:
            rate_stats1.update(d, mask)

        rate_stats2 = StreamingStats1D(nbins)
        for d in data[(nsets // 2):]:
            rate_stats2.update(d, mask)

        rate_stats3 = rate_stats1 + rate_stats2
        rate_stats1 += rate_stats2

        assert np.allclose(rate_stats1.mean, data.mean(axis=0))
        assert np.allclose(rate_stats1.var, data.var(axis=0))

        assert np.allclose(rate_stats3.mean, data.mean(axis=0))
        assert np.allclose(rate_stats3.var, data.var(axis=0))
Example #4
    def test_with_mask(self):
        nbins = 100
        nsets = 10
        data = np.random.normal(size=(nsets, nbins))
        mask = np.random.randint(2, size=data.shape).astype(np.uint8)

        rate_stats = StreamingStats1D(nbins)
        for di, d in enumerate(data):
            rate_stats.update(d, mask[di])

        data_masked = np.ma.array(data, mask=mask)

        assert np.allclose(rate_stats.mean, data_masked.mean(axis=0).filled(fill_value=0.0))
        assert np.allclose(rate_stats.var, data_masked.var(axis=0).filled(fill_value=0.0))
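The masked comparisons above imply update semantics along these lines: a bin whose mask entry is 1 is simply skipped for that sample, so every bin carries its own count. A minimal NumPy sketch of such a per-bin Welford update (an illustration under that assumption, not the actual Cython code):

import numpy as np

def masked_welford_update(M1, M2, n, sample, mask):
    # Bins with mask == 1 contribute nothing for this sample.
    live = (mask == 0)
    n = n + live                        # per-bin counts
    safe_n = np.where(n > 0, n, 1)
    delta = np.where(live, sample - M1, 0.0)
    M1 = M1 + delta / safe_n
    M2 = M2 + delta * np.where(live, sample - M1, 0.0)
    return M1, M2, n

Bins that never receive an unmasked sample keep n == 0 and report 0.0, which is what the .filled(fill_value=0.0) comparisons above check.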
Example #5
def tuple2stats(stat_tuple):
    ndims = stat_tuple.M1.ndim
    assert ndims == 1 or ndims == 2

    if ndims == 2:
        stats = StreamingStats2D(stat_tuple.M1.shape)
    elif ndims == 1:
        stats = StreamingStats1D(stat_tuple.M1.shape[0])
    else:
        raise ValueError

    stats.M1 = stat_tuple.M1
    stats.M2 = stat_tuple.M2
    stats.n = stat_tuple.n

    return stats
Example #6
    def test_tuple2stats1D(self):
        StreamingStatsTuple = collections.namedtuple('StreamingStatsTuple', ['M1', 'M2', 'n'])
        nbins = 100
        nsets = 10
        data = np.random.normal(size=(nsets, nbins))
        mask = np.zeros((nbins,), np.uint8)

        rate_stats = StreamingStats1D(nbins)
        for d in data:
            rate_stats.update(d, mask)

        c_rate_stats = StreamingStatsTuple(rate_stats.M1, rate_stats.M2, rate_stats.n)

        rate_stats2 = tuple2stats(c_rate_stats)

        assert np.allclose(rate_stats.M1, rate_stats2.M1)
        assert np.allclose(rate_stats.M2, rate_stats2.M2)
        assert np.allclose(rate_stats.n, rate_stats2.n)
        assert np.allclose(rate_stats.mean, rate_stats2.mean)
        assert np.allclose(rate_stats.var, rate_stats2.var)
Example #7
def process_iter_chunk(bin_mapper, iter_indices, iter_data=None):
    '''Calculate the flux matrices and populations for the set of iterations
    specified by iter_indices. Optionally provide the necessary arrays via
    iter_data; otherwise the data is read from the data_manager directly.
    '''

    data_manager = westpa.rc.get_data_manager()
    system = westpa.rc.get_system_driver()

    itercount = len(iter_indices)
    nbins = bin_mapper.nbins

    flux_stats = StreamingStats2D((nbins, nbins))
    rate_stats = StreamingStats2D((nbins, nbins))
    pop_stats = StreamingStats1D(nbins)

    nomask1d = np.zeros((nbins, ), np.uint8)
    nomask2d = np.zeros((nbins, nbins), np.uint8)
    rate_mask = np.zeros((nbins, nbins), np.uint8)

    flux_matrix = np.zeros((nbins, nbins), np.float64)
    rate_matrix = np.zeros((nbins, nbins), np.float64)
    population_vector = np.zeros((nbins, ), np.float64)

    pcoord_len = system.pcoord_len
    assign = bin_mapper.assign

    for iiter, n_iter in enumerate(iter_indices):
        flux_matrix.fill(0.0)
        population_vector.fill(0.0)

        if iter_data:
            iter_group_name = 'iter_{:0{prec}d}'.format(
                int(n_iter), prec=data_manager.iter_prec)
            iter_group = iter_data[iter_group_name]
        else:
            iter_group = data_manager.get_iter_group(n_iter)

        # First, account for the flux due to recycling. We access the HDF5
        # file directly to avoid the roughly 50% overhead of creating a large
        # number of tiny NewWeightEntry objects.
        try:
            nwgroup = iter_group['new_weights']
        except KeyError:
            # no new weight data
            pass
        else:
            if iter_data:
                index = None
                weights = nwgroup['weight']
                prev_init_pcoords = nwgroup['prev_init_pcoord']
                new_init_pcoords = nwgroup['new_init_pcoord']
            else:
                index = nwgroup['index'][...]
                weights = index['weight']
                prev_init_pcoords = nwgroup['prev_init_pcoord'][...]
                new_init_pcoords = nwgroup['new_init_pcoord'][...]

            prev_init_assignments = assign(prev_init_pcoords)
            new_init_assignments = assign(new_init_pcoords)

            flux_assign(weights, prev_init_assignments, new_init_assignments,
                        flux_matrix)
            # Equivalent (but slower) pure-Python accumulation:
            #     for weight, i, j in zip(weights, prev_init_assignments, new_init_assignments):
            #         flux_matrix[i, j] += weight
            del index
            del prev_init_pcoords, new_init_pcoords, prev_init_assignments, new_init_assignments, weights

        if iter_data:
            weights = iter_group['weight']
            initial_pcoords = iter_group['initial_pcoords']
            final_pcoords = iter_group['final_pcoords']
        else:
            weights = iter_group['seg_index']['weight']
            initial_pcoords = iter_group['pcoord'][:, 0]
            final_pcoords = iter_group['pcoord'][:, pcoord_len - 1]

        initial_assignments = assign(initial_pcoords)
        final_assignments = assign(final_pcoords)

        flux_assign(weights, initial_assignments, final_assignments,
                    flux_matrix)
        pop_assign(weights, initial_assignments, population_vector)

        flux_stats.update(flux_matrix, nomask2d)
        pop_stats.update(population_vector, nomask1d)

        calc_rates(flux_matrix, population_vector, rate_matrix, rate_mask)
        rate_stats.update(rate_matrix, rate_mask)

        del weights
        del initial_assignments, final_assignments
        del initial_pcoords, final_pcoords
        del iter_group

    # Create namedtuple proxies for the Cython StreamingStats objects,
    # since their typed-memoryview attributes do not seem to return
    # cleanly from the zmq workers.
    c_flux_stats = StreamingStatsTuple(flux_stats.M1, flux_stats.M2,
                                       flux_stats.n)
    c_rate_stats = StreamingStatsTuple(rate_stats.M1, rate_stats.M2,
                                       rate_stats.n)
    c_pop_stats = StreamingStatsTuple(pop_stats.M1, pop_stats.M2, pop_stats.n)

    return c_flux_stats, c_rate_stats, c_pop_stats
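flux_assign, pop_assign, and calc_rates are accelerated helpers that the excerpt does not define. The commented-out loop above pins down flux_assign's semantics; pure-NumPy stand-ins for all three are sketched below (illustrative only, and calc_rates' exact masking convention is an assumption inferred from how rate_mask feeds rate_stats.update):

import numpy as np

def flux_assign(weights, init_assignments, final_assignments, flux_matrix):
    # Accumulate each segment's weight into flux_matrix[initial_bin, final_bin].
    np.add.at(flux_matrix, (init_assignments, final_assignments), weights)

def pop_assign(weights, init_assignments, population_vector):
    # Accumulate each segment's weight into the population of its initial bin.
    np.add.at(population_vector, init_assignments, weights)

def calc_rates(flux_matrix, population_vector, rate_matrix, rate_mask):
    # Row i of the rate matrix is the flux out of bin i divided by bin i's
    # population; rows with zero population are flagged in rate_mask so that
    # the streaming statistics skip them.
    empty = (population_vector == 0.0)
    rate_mask[...] = empty[:, None]
    safe_pop = np.where(empty, 1.0, population_vector)
    rate_matrix[...] = flux_matrix / safe_pop[:, None]
    rate_matrix[empty, :] = 0.0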
Example #8
    def calculate(self,
                  iter_start=None,
                  iter_stop=None,
                  n_blocks=1,
                  queue_size=1):
        '''Read the HDF5 file and collect flux matrices and population vectors
        for each bin for each iteration in the range [iter_start, iter_stop). Break
        the calculation into n_blocks blocks. If the calculation is broken up into
        more than one block, queue_size specifies the maximum number of tasks in
        the work queue.'''

        iter_start = iter_start or 1
        iter_stop = iter_stop or self.data_manager.current_iteration

        itercount = iter_stop - iter_start
        block_size = max(1, itercount // n_blocks)
        nbins = self.bin_mapper.nbins

        if n_blocks == 1:
            flux_stats_t, rate_stats_t, population_stats_t = process_iter_chunk(
                self.bin_mapper, list(range(iter_start, iter_stop)))

            flux_stats = tuple2stats(flux_stats_t)
            rate_stats = tuple2stats(rate_stats_t)
            population_stats = tuple2stats(population_stats_t)
        else:
            flux_stats = StreamingStats2D((nbins, nbins))
            rate_stats = StreamingStats2D((nbins, nbins))
            population_stats = StreamingStats1D(nbins)

            task_generator = self.task_generator(iter_start, iter_stop,
                                                 block_size)

            for future in self.work_manager.submit_as_completed(
                    task_generator, queue_size):
                chunk_flux_stats_t, chunk_rate_stats_t, chunk_pop_stats_t = \
                    future.get_result()

                chunk_flux_stats = tuple2stats(chunk_flux_stats_t)
                chunk_rate_stats = tuple2stats(chunk_rate_stats_t)
                chunk_pop_stats = tuple2stats(chunk_pop_stats_t)

                # Update statistics with chunked subsets
                flux_stats += chunk_flux_stats
                rate_stats += chunk_rate_stats
                population_stats += chunk_pop_stats

        self.average_flux = flux_stats.mean
        self.stderr_flux = np.nan_to_num(
            np.sqrt(flux_stats.var) / flux_stats.n)

        self.average_populations = population_stats.mean
        self.stderr_populations = np.nan_to_num(
            np.sqrt(population_stats.var) / population_stats.n)

        self.average_rate = rate_stats.mean
        self.stderr_rate = np.nan_to_num(
            np.sqrt(rate_stats.var) / rate_stats.n)

        assert not np.any(np.isinf(self.stderr_flux))
        assert not np.any(np.isinf(self.stderr_rate))
        assert not np.any(np.isinf(self.stderr_populations))
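A hypothetical driver for calculate, assuming the surrounding class (called RateAverager here; the excerpt does not show its real name or constructor) is built from the bin mapper, data manager, and work manager referenced above:

averager = RateAverager(bin_mapper, data_manager, work_manager)  # hypothetical constructor
averager.calculate(iter_start=1, iter_stop=101, n_blocks=4, queue_size=2)

print(averager.average_rate)   # (nbins, nbins) matrix of mean rates
print(averager.stderr_rate)    # matching standard-error estimates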