Beispiel #1
0
    def test_with_mask_groups(self):
        """Masked statistics accumulated in two groups and merged (via both
        ``+`` and in-place ``+=``) must agree with numpy.ma over all data."""
        nbins = 100
        nsets = 10
        data = numpy.random.normal(size=(nsets, nbins, nbins))
        mask = numpy.random.randint(2, size=data.shape).astype(numpy.uint8)

        # Split the nsets samples into two halves.  The original code sliced
        # at nbins / 2, which is a float under Python 3 (TypeError as a slice
        # bound) and lies past the end of the nsets axis anyway, so the second
        # group was empty; splitting at nsets // 2 gives two real groups.
        split = nsets // 2

        rate_stats1 = StreamingStats2D((nbins, nbins))
        for di, d in enumerate(data[:split]):
            rate_stats1.update(d, mask[di])

        rate_stats2 = StreamingStats2D((nbins, nbins))
        for di, d in enumerate(data[split:]):
            # Offset the mask index so each data set pairs with its own mask;
            # enumerate restarts at 0 for the second slice.
            rate_stats2.update(d, mask[split + di])

        rate_stats3 = rate_stats1 + rate_stats2
        rate_stats1 += rate_stats2

        data_masked = numpy.ma.array(data, mask=mask)

        assert numpy.allclose(rate_stats1.mean,
                              data_masked.mean(axis=0).filled(fill_value=0.0))
        assert numpy.allclose(rate_stats1.var,
                              data_masked.var(axis=0).filled(fill_value=0.0))

        assert numpy.allclose(rate_stats3.mean,
                              data_masked.mean(axis=0).filled(fill_value=0.0))
        assert numpy.allclose(rate_stats3.var,
                              data_masked.var(axis=0).filled(fill_value=0.0))
    def calculate(self, iter_start=None, iter_stop=None, n_blocks=1, queue_size=1):
        '''Read the HDF5 file and collect flux matrices and population vectors
        for each bin for each iteration in the range [iter_start, iter_stop). Break
        the calculation into n_blocks blocks. If the calculation is broken up into
        more than one block, queue_size specifies the maximum number of tasks in
        the work queue.

        Results are stored on the instance: ``average_flux``/``stderr_flux``,
        ``average_populations``/``stderr_populations``, and
        ``average_rate``/``stderr_rate``.
        '''

        # Default to the full range of completed iterations (iterations are
        # 1-indexed; current_iteration is taken as the exclusive stop).
        iter_start = iter_start or 1
        iter_stop = iter_stop or self.data_manager.current_iteration

        itercount = iter_stop - iter_start
        # Each block processes at least one iteration, even if n_blocks
        # exceeds the number of iterations available.
        block_size = max(1, itercount // n_blocks)
        nbins = self.bin_mapper.nbins

        if n_blocks == 1:
            # Single-block case: process everything in this process, no work
            # manager involved.  process_iter_chunk returns namedtuple proxies
            # (M1, M2, n) which tuple2stats turns back into StreamingStats
            # objects.
            flux_stats_t, rate_stats_t, population_stats_t = process_iter_chunk(self.bin_mapper, range(iter_start, iter_stop))

            flux_stats = tuple2stats(flux_stats_t)
            rate_stats = tuple2stats(rate_stats_t)
            population_stats = tuple2stats(population_stats_t)
        else:
            # Parallel case: start from empty accumulators and merge each
            # chunk's statistics as its future completes (order-independent).
            flux_stats = StreamingStats2D((nbins, nbins))
            rate_stats = StreamingStats2D((nbins, nbins))
            population_stats = StreamingStats1D(nbins)

            task_generator = self.task_generator(iter_start, iter_stop, block_size)

            for future in self.work_manager.submit_as_completed(task_generator, queue_size):
                chunk_flux_stats_t, chunk_rate_stats_t, chunk_pop_stats_t = future.get_result()

                chunk_flux_stats = tuple2stats(chunk_flux_stats_t)
                chunk_rate_stats = tuple2stats(chunk_rate_stats_t)
                chunk_pop_stats = tuple2stats(chunk_pop_stats_t)

                # Update statistics with chunked subsets
                flux_stats += chunk_flux_stats
                rate_stats += chunk_rate_stats
                population_stats += chunk_pop_stats

        # Standard error estimates: sqrt(var)/n; nan_to_num maps the 0/0
        # cases (bins never visited) to 0.0.
        self.average_flux = flux_stats.mean 
        self.stderr_flux = numpy.nan_to_num(numpy.sqrt(flux_stats.var) / flux_stats.n)

        self.average_populations = population_stats.mean 
        self.stderr_populations = numpy.nan_to_num(numpy.sqrt(population_stats.var) / population_stats.n)

        self.average_rate = rate_stats.mean 
        self.stderr_rate = numpy.nan_to_num(numpy.sqrt(rate_stats.var) / rate_stats.n)

        # ~ acts as logical negation on the numpy.bool_ returned by any():
        # fail loudly if any standard error came out infinite.
        assert ~numpy.any(numpy.isinf(self.stderr_flux))
        assert ~numpy.any(numpy.isinf(self.stderr_rate))
        assert ~numpy.any(numpy.isinf(self.stderr_populations))
    def test_nomask(self):
        """Streaming mean/variance with an all-zero (no-op) mask must match
        the plain numpy results over the sample axis."""
        nbins = 100
        nsets = 10
        samples = numpy.random.normal(size=(nsets, nbins, nbins))
        empty_mask = numpy.zeros((nbins, nbins), numpy.uint8)

        stats = StreamingStats2D((nbins, nbins))
        for sample in samples:
            stats.update(sample, empty_mask)

        assert numpy.allclose(stats.mean, samples.mean(axis=0))
        assert numpy.allclose(stats.var, samples.var(axis=0))
    def test_nomask_groups(self):
        """Unmasked statistics accumulated in two groups and merged (via both
        ``+`` and in-place ``+=``) must match numpy over the full data set."""
        nbins = 100
        nsets = 10
        data = numpy.random.normal(size=(nsets, nbins, nbins))
        mask = numpy.zeros((nbins, nbins), numpy.uint8)

        # Split the nsets samples in half.  The original slice bound, nbins / 2,
        # is a float under Python 3 (TypeError as a slice index) and lies past
        # the end of the nsets axis anyway, which left the second group empty;
        # nsets // 2 makes both groups non-trivial.
        split = nsets // 2

        rate_stats1 = StreamingStats2D((nbins, nbins))
        for d in data[:split]:
            rate_stats1.update(d, mask)

        rate_stats2 = StreamingStats2D((nbins, nbins))
        for d in data[split:]:
            rate_stats2.update(d, mask)

        rate_stats3 = rate_stats1 + rate_stats2
        rate_stats1 += rate_stats2

        assert numpy.allclose(rate_stats1.mean, data.mean(axis=0))
        assert numpy.allclose(rate_stats1.var, data.var(axis=0))

        assert numpy.allclose(rate_stats3.mean, data.mean(axis=0))
        assert numpy.allclose(rate_stats3.var, data.var(axis=0))
Beispiel #5
0
def tuple2stats(stat_tuple):
    """Reconstitute a StreamingStats object from its (M1, M2, n) tuple proxy.

    Parameters
    ----------
    stat_tuple : namedtuple with fields M1, M2, n
        Moment arrays and counts as produced by process_iter_chunk; M1's
        dimensionality (1 or 2) selects the StreamingStats class.

    Returns
    -------
    StreamingStats1D or StreamingStats2D with M1, M2 and n populated.

    Raises
    ------
    ValueError
        If M1 is neither 1- nor 2-dimensional.
    """
    ndims = stat_tuple.M1.ndim

    if ndims == 2:
        stats = StreamingStats2D(stat_tuple.M1.shape)
    elif ndims == 1:
        stats = StreamingStats1D(stat_tuple.M1.shape[0])
    else:
        # Validate with a real exception rather than an assert (asserts are
        # stripped under ``python -O``), and say what went wrong.
        raise ValueError('stat_tuple.M1 must be 1- or 2-dimensional, got ndim={}'.format(ndims))

    stats.M1 = stat_tuple.M1
    stats.M2 = stat_tuple.M2
    stats.n = stat_tuple.n

    return stats
    def test_tuple2stats2D(self):
        """Round-trip a 2D StreamingStats object through its namedtuple proxy
        and verify all moments and derived statistics survive intact."""
        StreamingStatsTuple = namedtuple('StreamingStatsTuple', ['M1', 'M2', 'n'])
        nbins = 100
        nsets = 10
        samples = numpy.random.normal(size=(nsets, nbins, nbins))
        no_mask = numpy.zeros((nbins, nbins), numpy.uint8)

        stats_orig = StreamingStats2D((nbins, nbins))
        for sample in samples:
            stats_orig.update(sample, no_mask)

        proxy = StreamingStatsTuple(stats_orig.M1, stats_orig.M2, stats_orig.n)
        stats_rebuilt = tuple2stats(proxy)

        # Raw moments, counts, and the derived mean/var must all match.
        for attr in ('M1', 'M2', 'n', 'mean', 'var'):
            assert numpy.allclose(getattr(stats_orig, attr),
                                  getattr(stats_rebuilt, attr))
Beispiel #7
0
def process_iter_chunk(bin_mapper, iter_indices, iter_data=None):
    '''Calculate the flux matrices and populations of a set of iterations specified
    by iter_indices. Optionally provide the necessary arrays to perform the calculation
    in iter_data. Otherwise get data from the data_manager directly.

    Returns a 3-tuple of StreamingStatsTuple (M1, M2, n) proxies for the flux,
    rate, and population statistics accumulated over the chunk; use
    tuple2stats() to rebuild StreamingStats objects from them.
    '''

    data_manager = westpa.rc.get_data_manager()
    system = westpa.rc.get_system_driver()

    itercount = len(iter_indices)  # not used below; kept for reference
    nbins = bin_mapper.nbins

    # Streaming accumulators over iterations in this chunk.
    flux_stats = StreamingStats2D((nbins, nbins))
    rate_stats = StreamingStats2D((nbins, nbins))
    pop_stats = StreamingStats1D(nbins)

    # All-zero masks include every bin; rate_mask is filled by calc_rates
    # each iteration (presumably marking undefined rates — TODO confirm
    # against calc_rates' contract).
    nomask1d = numpy.zeros((nbins, ), numpy.uint8)
    nomask2d = numpy.zeros((nbins, nbins), numpy.uint8)
    rate_mask = numpy.zeros((nbins, nbins), numpy.uint8)

    # Scratch arrays reused (and re-zeroed) on every iteration.
    flux_matrix = numpy.zeros((nbins, nbins), numpy.float64)
    rate_matrix = numpy.zeros((nbins, nbins), numpy.float64)
    population_vector = numpy.zeros((nbins, ), numpy.float64)

    pcoord_len = system.pcoord_len
    # Hoist the bin-assignment method out of the loop.
    assign = bin_mapper.assign

    for iiter, n_iter in enumerate(iter_indices):
        flux_matrix.fill(0.0)
        population_vector.fill(0.0)

        # Two data paths: caller-supplied arrays (iter_data) keyed by the
        # iteration group name, or direct reads via the data manager.
        if iter_data:
            iter_group_name = 'iter_{:0{prec}d}'.format(
                int(n_iter), prec=data_manager.iter_prec)
            iter_group = iter_data[iter_group_name]
        else:
            iter_group = data_manager.get_iter_group(n_iter)

        # first, account for the flux due to recycling
        # we access the hdf5 file directly to avoid nearly 50% overhead of creating a ton of
        # tiny newweightentry objects
        try:
            nwgroup = iter_group['new_weights']
        except KeyError:
            # no new weight data
            pass
        else:
            if iter_data:
                index = None
                weights = nwgroup['weight']
                prev_init_pcoords = nwgroup['prev_init_pcoord']
                new_init_pcoords = nwgroup['new_init_pcoord']
            else:
                # [...] reads the whole dataset into memory in one pass.
                index = nwgroup['index'][...]
                weights = index['weight']
                prev_init_pcoords = nwgroup['prev_init_pcoord'][...]
                new_init_pcoords = nwgroup['new_init_pcoord'][...]

            prev_init_assignments = assign(prev_init_pcoords)
            new_init_assignments = assign(new_init_pcoords)

            # Accumulate recycling flux into flux_matrix (equivalent to the
            # commented-out pure-Python loop below).
            flux_assign(weights, prev_init_assignments, new_init_assignments,
                        flux_matrix)
            #for (weight,i,j) in izip (weights, prev_init_assignments, new_init_assignments):
            #    flux_matrices[iiter,i,j] += weight
            # Drop references to the (potentially large) per-iteration arrays
            # before moving on.
            del index
            del prev_init_pcoords, new_init_pcoords, prev_init_assignments, new_init_assignments, weights

        #iter_group = data_manager.get_iter_group(n_iter)
        # Segment weights plus progress coordinates at the start and end of
        # the iteration (same two data paths as above).
        if iter_data:
            weights = iter_group['weight']
            initial_pcoords = iter_group['initial_pcoords']
            final_pcoords = iter_group['final_pcoords']
        else:
            weights = iter_group['seg_index']['weight']
            initial_pcoords = iter_group['pcoord'][:, 0]
            final_pcoords = iter_group['pcoord'][:, pcoord_len - 1]

        initial_assignments = assign(initial_pcoords)
        final_assignments = assign(final_pcoords)

        # In-iteration flux (initial bin -> final bin) and bin populations
        # (weight summed over initial assignments).
        flux_assign(weights, initial_assignments, final_assignments,
                    flux_matrix)
        pop_assign(weights, initial_assignments, population_vector)

        flux_stats.update(flux_matrix, nomask2d)
        pop_stats.update(population_vector, nomask1d)

        # Rates derived from flux and population; rate_mask output masks the
        # entries excluded from the streaming rate statistics.
        calc_rates(flux_matrix, population_vector, rate_matrix, rate_mask)
        rate_stats.update(rate_matrix, rate_mask)

        del weights
        del initial_assignments, final_assignments
        del initial_pcoords, final_pcoords
        del iter_group

    # Create namedtuple proxies for the cython StreamingStats objects
    # since the typed memoryviews class variables do not seem to return
    # cleanly from the zmq workers
    c_flux_stats = StreamingStatsTuple(flux_stats.M1, flux_stats.M2,
                                       flux_stats.n)
    c_rate_stats = StreamingStatsTuple(rate_stats.M1, rate_stats.M2,
                                       rate_stats.n)
    c_pop_stats = StreamingStatsTuple(pop_stats.M1, pop_stats.M2, pop_stats.n)

    return c_flux_stats, c_rate_stats, c_pop_stats