def sample_choice(prob_array, method="MC"):
    """sample chosen index given probability in prob_array

    prob_array - 2-d array of probabilities of being chosen, with the probabilities for one
                 agent in a row and the probabilities for the alternatives in the columns
    method - the method used to sample choice, either MC (Monte Carlo) or max_prob
             (the most probable alternative of each row); case-insensitive

    return a tuple (arange(rows), choices), where choices[i] is the column index chosen
    for row i.

    raise RuntimeError if prob_array is not 2-d, if a row does not add up to 1, or if the
    number of choices differs from the number of rows; raise ValueError for an unknown
    method.
    """
    if prob_array.ndim != 2:
        raise RuntimeError("prob_array must be a 2d array")

    rows, columns = prob_array.shape
    sum_prob_by_row = sum(prob_array, axis=1, dtype=float64)
    if not ma.allclose(sum_prob_by_row, ones((rows, ))):
        # Report only the rows that are really off (or NaN). An exact "!= 1" test would
        # also list rows that differ from 1 by nothing but floating point noise.
        strange_rows = where(~(abs(sum_prob_by_row - 1.0) <= 1.0e-5 + 1.0e-8))[0]
        raise RuntimeError(
            "prob_array must add up to 1 for each row. Abnormal rows: %s" %
            (prob_array[strange_rows, :],))

    method_name = method.lower()
    if method_name == "mc":
        cum_prob = ncumsum(prob_array, axis=1)

        # one uniform draw per row, shaped as a column so it broadcasts over alternatives
        R = uniform(0, 1, rows).reshape((rows, 1))

        match = (R < cum_prob)
        choices = argmax(
            match, axis=1)  # return the first index of 1 in each row
    elif method_name == "max_prob":
        # axis=1 is required: without it argmax returns one index into the flattened array
        choices = argmax(prob_array, axis=1)
    else:
        raise ValueError("unknown method '%s'; use 'MC' or 'max_prob'" % method)

    if choices.size != rows:
        raise RuntimeError("having problems sample choice")

    return (arange(rows), choices)
 def run(self, individual_dataset, fraction_dataset, id_name1='blockgroup_id',
         id_name2='zone_id', fraction_attribute_name='fraction'):
     """Assign every individual an id_name2 value sampled from the fractions of its id_name1.

     individual_dataset - dataset whose id_name2 attribute is set; it must already know
                          the attribute id_name1
     fraction_dataset - dataset with the attributes id_name1, id_name2 and
                        fraction_attribute_name, one entry per (id_name1, id_name2) pair
     id_name1 - name of the attribute shared by both datasets
     id_name2 - name of the attribute to be sampled; it is added to individual_dataset,
                filled with -1, when it is not known yet
     fraction_attribute_name - name of the attribute holding the sampling weights

     Individuals whose id_name1 has fractions summing to less than 1.0e-2, or does not
     appear in fraction_dataset at all, are left unchanged.
     """
     assert id_name1 in individual_dataset.get_known_attribute_names()
     fraction_id2 = fraction_dataset.get_attribute(id_name2)
     if id_name2 not in individual_dataset.get_known_attribute_names():
         # use the dtype of the ids to be assigned, so the attribute does not become float
         default_values = -1*ones(individual_dataset.size(), dtype=fraction_id2.dtype)
         individual_dataset.add_primary_attribute(default_values, id_name2)
     fraction_id1 = fraction_dataset.get_attribute(id_name1)
     all_fractions = fraction_dataset.get_attribute(fraction_attribute_name)
     individual_id1 = individual_dataset.get_attribute(id_name1)
     unique_ids = unique(fraction_id1)
     for id1 in unique_ids:
         individual_of_id1 = where(individual_id1==id1)[0]
         n = individual_of_id1.size
         logger.log_status("Processing %s %s: %s individuals" % (id_name1, id1, n) )
         if n == 0:
             continue
         is_id1 = fraction_id1==id1
         fractions = all_fractions[is_id1]
         id2 = fraction_id2[is_id1]
         ## ignore individuals in geography with sum of fractions less than 1.0e-2
         if fractions.sum() < 1.0e-2:
             continue
         if not allclose(fractions.sum(), 1.0, rtol=1.e-2):
             fractions = normalize(fractions)
         fractions_cumsum = ncumsum(fractions)
         # Scale the draws to the last cumulative value: the fractions may add up to
         # slightly less than 1, and an unscaled draw above that value would make
         # searchsorted return an index past the end of id2.
         R = random(n) * fractions_cumsum[-1]
         index = searchsorted(fractions_cumsum, R)
         individual_dataset.modify_attribute(id_name2, id2[index], index=individual_of_id1)
    def __init__(self, config):
        """Assign every household to a building of its zone and write the households out.

        config - configuration object; this method reads 'cache_directory', 'base_year'
                 and, if present, 'estimation_database_configuration' from it

        For each zone, a building is drawn for every household of that zone with a
        probability proportional to the building's "residential_units". The resulting
        building ids are stored in the household attribute 'building_id' and the dataset
        is written to the table 'households' of the output storage.

        NOTE(review): several statements below look truncated in this copy of the file
        (see the inline notes); confirm against the original before relying on them.
        """
        ss = SimulationState(new_instance=True)

        #if not os.path.exists(config['cache_directory']):  ## if cache exists, it will automatically skip
        cacher = CreateBaseyearCache()
        # NOTE(review): the right-hand side of this assignment is missing.
        cache_dir =

        # NOTE(review): the storage created from the database is overwritten by the
        # flt_storage two statements later; presumably an "else:" (and the storage type
        # argument of the first get_storage call) got lost - TODO confirm.
        if 'estimation_database_configuration' in config:
            db_server = DatabaseServer(config['estimation_database_configuration'])
            db = db_server.get_database(config['estimation_database_configuration'].database_name)
            out_storage = StorageFactory().get_storage(
                storage_location = db)
            output_cache = os.path.join(config['cache_directory'], str(config['base_year']+1))
            out_storage = StorageFactory().get_storage('flt_storage', storage_location=output_cache)

        dataset_pool = SessionConfiguration().get_dataset_pool()
        households = dataset_pool.get_dataset("household")
        buildings = dataset_pool.get_dataset("building")
        zones = dataset_pool.get_dataset("zone")
        zone_ids = zones.get_id_attribute()
        capacity_attribute_name = "residential_units"  #_of_use_id_%s" % id
        # NOTE(review): the format string is empty although two values are supplied;
        # the variable expression that belongs here appears to be missing.
        capacity_variable_name = "" % \
                                 (capacity_attribute_name, capacity_attribute_name)
        buildings.compute_variables("sanfrancisco.building.zone_id", dataset_pool=dataset_pool)
        zones.compute_variables(capacity_variable_name, dataset_pool=dataset_pool)

        building_zone_id = buildings.get_attribute('zone_id')
#        is_household_unplace = datasets['household'].get_attribute("building_id") <= 0
        is_household_unplaced = 1 #all households are unplaced
        # start with -1 (no building) for every household
        household_building_id = zeros(households.size(), dtype='int32')-1 #datasets['household'].get_attribute("building_id")
        for zone_id in zone_ids:
            capacity = zones.get_attribute_by_id(capacity_attribute_name, zone_id)
            is_household_in_this_zone = (households.get_attribute('zone_id') == zone_id)
            is_unplaced_household_in_this_zone = is_household_in_this_zone * is_household_unplaced
            is_building_in_this_zone = (building_zone_id == zone_id)
#            if not is_household_in_this_zone.sum() <= capacity:
            # NOTE(review): only a warning is printed here; the zone is still processed
            # below, including the division by capacity when capacity is 0 - presumably
            # a "continue" is missing, TODO confirm.
            if capacity == 0 or is_household_in_this_zone.sum()==0:
                print "WARNING: zone %s has %s households but only %s units" % (zone_id, is_household_in_this_zone.sum(), capacity)
            # share of the zone's capacity held by each building; 0 for buildings outside the zone
            prob = buildings.get_attribute(capacity_attribute_name) * is_building_in_this_zone / array(capacity, dtype=float64)

            # one draw per household of the zone, mapped to a building index through
            # the cumulative probabilities
            r = random(sum(is_unplaced_household_in_this_zone))
            prob_cumsum = ncumsum(prob)
            index_to_bldg = searchsorted(prob_cumsum, r)

            household_building_id[where(is_unplaced_household_in_this_zone)] = buildings.get_attribute_by_index('building_id', index_to_bldg)

#        import pdb;pdb.set_trace()
        households.set_values_of_one_attribute('building_id', household_building_id)
        households.write_dataset(out_table_name='households', out_storage=out_storage)
def sample_choice(prob_array, method="MC"):
    """sample chosen index given probability in prob_array

    prob_array - 2-d array of probabilities of being chosen, with the probabilities for one
                 agent in a row and the probabilities for the alternatives in the columns
    method - the method used to sample choice, either MC (Monte Carlo) or max_prob
             (the most probable alternative of each row); case-insensitive

    return a tuple (arange(rows), choices), where choices[i] is the column index chosen
    for row i.

    raise RuntimeError if prob_array is not 2-d, if a row does not add up to 1, or if the
    number of choices differs from the number of rows; raise ValueError for an unknown
    method.
    """
    if prob_array.ndim != 2:
        raise RuntimeError("prob_array must be a 2d array")

    rows, columns = prob_array.shape
    if not ma.allclose(sum(prob_array, axis=1, dtype=float64), ones((rows,))):
        raise RuntimeError("prob_array must add up to 1 for each row")

    method_name = method.lower()
    if method_name == "mc":
        cum_prob = ncumsum(prob_array, axis=1)

        # one uniform draw per row, shaped as a column so it broadcasts over alternatives
        R = uniform(0, 1, rows).reshape((rows, 1))

        match = R < cum_prob
        choices = argmax(match, axis=1)  # return the first index of 1 in each row
    elif method_name == "max_prob":
        # axis=1 is required: without it argmax returns one index into the flattened array
        choices = argmax(prob_array, axis=1)
    else:
        raise ValueError("unknown method '%s'; use 'MC' or 'max_prob'" % method)

    if choices.size != rows:
        raise RuntimeError("having problems sample choice")

    return (arange(rows), choices)
    def run(self, individual_dataset, fraction_dataset, id_name1='blockgroup_id',
            id_name2='zone_id', fraction_attribute_name='fraction', dataset_pool=None):
        """Assign every individual an id_name2 value sampled from the fractions of its id_name1.

        individual_dataset - dataset (or its name in dataset_pool) whose id_name2 attribute
                             is set; it must already know the attribute id_name1
        fraction_dataset - dataset (or its name in dataset_pool) with the attributes
                           id_name1, id_name2 and fraction_attribute_name
        id_name1 - name of the attribute shared by both datasets
        id_name2 - name of the attribute to be sampled; it is added to individual_dataset,
                   filled with -1, when it is not known yet
        fraction_attribute_name - name of the attribute holding the sampling weights
        dataset_pool - pool used to look up datasets given by name; defaults to the pool
                       of the current SessionConfiguration

        Individuals whose id_name1 has fractions summing to less than 1.0e-2, or does not
        appear in fraction_dataset at all, are left unchanged.
        """
        if dataset_pool is None:
            dataset_pool = SessionConfiguration().get_dataset_pool()
        if isinstance(individual_dataset, str):
            individual_dataset = dataset_pool[individual_dataset]
        if isinstance(fraction_dataset, str):
            fraction_dataset = dataset_pool[fraction_dataset]

        assert id_name1 in individual_dataset.get_known_attribute_names()
        fraction_id2 = fraction_dataset.get_attribute(id_name2)
        if id_name2 not in individual_dataset.get_known_attribute_names():
            default_values = -1*ones(individual_dataset.size(), dtype=fraction_id2.dtype)
            individual_dataset.add_primary_attribute(default_values, id_name2)
        fraction_id1 = fraction_dataset.get_attribute(id_name1)
        all_fractions = fraction_dataset.get_attribute(fraction_attribute_name)
        individual_id1 = individual_dataset.get_attribute(id_name1)
        unique_ids = unique(fraction_id1)
        for id1 in unique_ids:
            individual_of_id1 = where(individual_id1==id1)[0]
            n = individual_of_id1.size
            logger.log_status("Processing %s %s: %s individuals" % (id_name1, id1, n) )
            if n == 0:
                continue
            is_id1 = fraction_id1==id1
            fractions = all_fractions[is_id1]
            id2 = fraction_id2[is_id1]
            ## ignore individuals in geography with sum of fractions less than 1.0e-2
            if fractions.sum() < 1.0e-2:
                continue
            if not allclose(fractions.sum(), 1.0, rtol=1.e-2):
                fractions = normalize(fractions)
            fractions_cumsum = ncumsum(fractions)
            # Scale the draws to the last cumulative value: the fractions may add up to
            # slightly less than 1, and an unscaled draw above that value would make
            # searchsorted return an index past the end of id2.
            R = random(n) * fractions_cumsum[-1]
            index = searchsorted(fractions_cumsum, R)
            individual_dataset.modify_attribute(id_name2, id2[index], index=individual_of_id1)
def probsample_replace(source_array, size, prob_array, return_index=False):
    """Unequal probability sampling; with replacement case.
    Using numpy searchsorted function, suitable for large array

    source_array - the source array to sample from (converted with asarray if needed)
    size - number of samples to draw
    prob_array - the array used to weight the sample, indexed to source_array; it does
                 not need to add up to 1. If None, sample_replace is used instead
    return_index - if true, return indices to source_array instead of its elements

    raise TypeError if source_array cannot be converted to an ndarray, and ValueError
    if prob_array sums to 0.
    """
    if not isinstance(source_array, ndarray):
        try:
            source_array = asarray(source_array)
        except Exception:
            raise TypeError("source_array must be of type ndarray")

    if prob_array is None:
        return sample_replace(source_array, size, return_index=return_index)

    if prob_array.sum() == 0:
        raise ValueError("there aren't non-zero weights in prob_array")

    cum_prob = ncumsum(prob_array)

    # Draw on [0, total weight) instead of [0, 1): with weights that add up to less
    # than 1, a draw above the last cumulative value would give an index past the end.
    sample_prob = uniform(0, cum_prob[-1], size)
    sampled_index = searchsorted(cum_prob, sample_prob)
    if return_index:
        return sampled_index
    else:
        return source_array[sampled_index]
def probsample_noreplace(source_array, sample_size, prob_array=None,
                          exclude_index=None, return_index=False):
    """generate non-repeat random 1d samples from source_array of sample_size, excluding
    indices appeared in exclude_index.

    return indices to source_array if return_index is true.

    source_array - the source array to sample from
    sample_size - scalar representing the sample size
    prob_array - the array used to weight sample
    exclude_index - array representing indices should not appear in resulted array,
                    which can be used, for example, to exclude current choice from sampling,
                    indexed to source_array

    If source_array has fewer elements, or prob_array fewer non-zero weights, than
    sample_size, the sample is drawn with replacement by probsample_replace. If
    prob_array is None, sample_noreplace is used.

    raise TypeError if source_array or prob_array cannot be converted to an ndarray, and
    ValueError if the weights cannot be normalized.
    """
    if not isinstance(source_array, ndarray):
        try:
            source_array = asarray(source_array)
        except Exception:
            raise TypeError("source_array must be of type ndarray")

    pop_size = source_array.size

    if pop_size < sample_size:
        logger.log_warning("There are less or equal indices (%s) in source_array than the sample_size (%s). Use probsample_replace. " %
              (pop_size, sample_size))
        #sample_size = max
        return probsample_replace(source_array, sample_size, prob_array=prob_array, return_index=return_index)
    elif pop_size == sample_size:
        if return_index:
            return arange(source_array.size)
        else:
            return source_array

    if sample_size <= 0:
        #logger.log_warning("sample_size is %s. Nothing is sampled." % sample_size)
        return array([], dtype='i')
    if prob_array is None:
        return sample_noreplace(source_array, sample_size, return_index=return_index)

    if not isinstance(prob_array, ndarray):
        try:
            prob_array = asarray(prob_array)
        except Exception:
            raise TypeError("prob_array must be of type ndarray")

    p_array = prob_array.astype(float64) # creates a copy (not just a pointer)

    p_array_sum = p_array.sum()
    if not ma.allclose(p_array_sum, 1.0):
        p_array = p_array / p_array_sum
        if abs(1.0 - p_array_sum) > 0.01:
            # print this message only if it is a serious difference
            logger.log_warning("prob_array doesn't sum up to 1, and is normalized. Sum: %s" % p_array_sum)

    prob_array_size = nonzerocounts(prob_array)
    if prob_array_size < sample_size:
        logger.log_warning("There are less or equal non-zero weight (%s) in prob_array than the sample_size (%s). Use probsample_replace. " %
              (prob_array_size, sample_size))
        return probsample_replace(source_array, sample_size, prob_array=p_array, return_index=return_index)
    elif prob_array_size == sample_size:
        if return_index:
            return where(prob_array>0)[0]
        else:
            return source_array[prob_array>0]

    totalmass = 1.0
    to_be_sampled = sample_size
    sampled_index = array([], dtype='i')  #initialize sampled_index

    if exclude_index is not None:
        try:
            totalmass -= asarray(p_array[exclude_index]).sum()
            p_array[exclude_index] = 0
        except IndexError:
            logger.log_warning("The exclude_index (%s) is not in prob array" % (exclude_index))
            #raise IndexError, "Having problem to apply exclude_index values"

    cum_prob = ncumsum(p_array/p_array.sum()) * totalmass
    if not ma.allclose(cum_prob[-1], totalmass):
        raise ValueError("prob_array doesn't sum up to 1 even after normalization")

    while True:
        # Draw in double precision and no further than the last cumulative value: a
        # draw rounded above cum_prob[-1] would give an index past the end.
        sample_prob = uniform(0, cum_prob[-1], to_be_sampled)
        proposed_index = searchsorted(cum_prob, sample_prob) #, 0, cum_prob.size-1)

        # keep the first occurrence of every proposed index, in drawing order
        first_occurrence = numpy.unique(proposed_index, return_index=True)[1]
        valid_index = proposed_index[sort(first_occurrence)]
        sampled_index = concatenate((sampled_index, valid_index))
        if valid_index.size == to_be_sampled:
            if return_index:
                return sampled_index
            else:
                return source_array[sampled_index]

        # remove the mass of the accepted indices, so they cannot be drawn again
        totalmass -= asarray(p_array[valid_index]).sum()
        p_array[valid_index] = 0

        cum_prob = ncumsum(p_array/p_array.sum()) * totalmass
        assert ma.allclose(totalmass, cum_prob[-1])
        to_be_sampled -= valid_index.size
def prob2dsample(source_array, sample_size, prob_array=None, exclude_index=None,
                  replace=False, return_index=False):
    """generate non-repeat random 2d samples from source_array of sample_size, not including
    indices appeared in exclude_index; sample column by column, more efficient when there are more
    rows than columns in sample_size.

    return elements in source_array of shape sample_size, or indices to source_array if
    return_index is true.

    source_array - the source array to sample from
    sample_size - tuple representing the sample size with (rows, columns), non-repeat at each row
    exclude_index - array representing indices should not appear in resulted array, used to exclude current choice from sampling
    prob_array - the array used to weight sample
    replace - if true, sample each column independently, allowing repeats within a row
    return_index - if true, return indices to source_array instead of its elements

    raise TypeError if prob_array or exclude_index is not (convertible to) an ndarray, and
    ValueError if exclude_index does not have one row per sampled row.
    """
    rows, columns = sample_size
    source_array_size = source_array.size

    # Without replacement there is nothing to sample when the source has no more
    # elements than requested columns: every row gets all the elements.
    if source_array_size <= columns and not replace:
        logger.log_warning("There are less or equal indices (%s) in source_array than the sample_size (%s). Sample %s." %
              (source_array_size, columns, source_array_size))
        if return_index:
            return ones((rows,1), dtype='i') * arange(source_array_size)[newaxis,:]
        else:
            return ones((rows,1), dtype='i') * source_array[newaxis,:]

    if prob_array is None:
        prob_array = ones(source_array_size)

    if not (isinstance(prob_array, ndarray) or is_masked_array(prob_array)):
        raise TypeError("prob_array must be of type ndarray")

    p_array = prob_array

    p_array_sum = p_array.sum(dtype="float64")
    if not ma.allclose(p_array_sum, 1.0):
        if abs(1.0 - p_array_sum) > 0.01:
            # print this message only if it is a serious difference
            logger.log_warning("prob_array doesn't sum up to 1, and is normalized. Sum: %s" % p_array_sum)
        p_array = p_array / p_array_sum

    sampled_choiceset_index = zeros(sample_size, dtype="int32") - 1    #initialize output
    source_index = arange(source_array_size)

    if not replace:
        if exclude_index is not None:
            if not isinstance(exclude_index, ndarray):
                try:
                    exclude_index = asarray(exclude_index)
                except Exception:
                    raise TypeError("exclude_index must be of type ndarray")
            if exclude_index.shape[0] != rows:
                raise ValueError("exclude_index should have the same number of rows as sample_size[0]")
            if exclude_index.ndim == 1:
                exclude_index = exclude_index[:, newaxis]
        else:
            # -1 is not a valid index, so nothing is excluded; a column of zeros
            # would silently exclude source index 0 from every row
            exclude_index = zeros(shape=(sample_size[0],1), dtype="int32") - 1

        for j in range(columns):
            slots_to_be_sampled = arange(rows)
            while True:
                proposed_index = probsample_replace(source_index, slots_to_be_sampled.size, p_array)
                try:
                    exclude_array = exclude_index[slots_to_be_sampled,]
                except Exception:
                    exclude_array = None
                duplicate_indicator = find_duplicates_others(proposed_index, exclude_array)
                is_valid = duplicate_indicator==0
                valid_index = slots_to_be_sampled[is_valid]
                sampled_choiceset_index[valid_index, j] = proposed_index[is_valid]
                if nonzerocounts(duplicate_indicator) == 0:
                    break

                # only the rows that proposed an excluded index are sampled again
                slots_to_be_sampled = slots_to_be_sampled[duplicate_indicator>0]

            # indices sampled for this column must not be repeated in later columns
            exclude_index = concatenate((exclude_index, take(sampled_choiceset_index,(j,), axis=1)), axis = 1)
    else:
        for j in range(columns):
            sampled_choiceset_index[:,j] = probsample_replace(source_index, rows, p_array)

    if return_index:
        return sampled_choiceset_index
    else:
        return source_array[sampled_choiceset_index]
def prob2dsample(source_array,
                 sample_size,
                 prob_array=None,
                 exclude_index=None,
                 replace=False,
                 return_index=False):
    """generate non-repeat random 2d samples from source_array of sample_size, not including
    indices appeared in exclude_index; sample column by column, more efficient when there are more
    rows than columns in sample_size.

    return elements in source_array of shape sample_size, or indices to source_array if
    return_index is true.

    source_array - the source array to sample from
    sample_size - tuple representing the sample size with (rows, columns), non-repeat at each row
    exclude_index - array representing indices should not appear in resulted array, used to exclude current choice from sampling
    prob_array - the array used to weight sample
    replace - if true, sample each column independently, allowing repeats within a row
    return_index - if true, return indices to source_array instead of its elements

    raise TypeError if prob_array or exclude_index is not (convertible to) an ndarray, and
    ValueError if exclude_index does not have one row per sampled row.
    """
    rows, columns = sample_size
    source_array_size = source_array.size

    # Without replacement there is nothing to sample when the source has no more
    # elements than requested columns: every row gets all the elements.
    if source_array_size <= columns and not replace:
        logger.log_warning(
            "There are less or equal indices (%s) in source_array than the sample_size (%s). Sample %s."
            % (source_array_size, columns, source_array_size))

        if return_index:
            return ones(
                (rows, 1), dtype='i') * arange(source_array_size)[newaxis, :]
        else:
            return ones((rows, 1), dtype='i') * source_array[newaxis, :]

    if prob_array is None:
        prob_array = ones(source_array_size)

    if not (isinstance(prob_array, ndarray) or is_masked_array(prob_array)):
        raise TypeError("prob_array must be of type ndarray")

    p_array = prob_array

    p_array_sum = p_array.sum(dtype="float64")
    if not ma.allclose(p_array_sum, 1.0):
        if abs(1.0 - p_array_sum) > 0.01:
            # print this message only if it is a serious difference
            logger.log_warning(
                "prob_array doesn't sum up to 1, and is normalized. Sum: %s" %
                p_array_sum)
        p_array = p_array / p_array_sum

    sampled_choiceset_index = zeros(
        sample_size, dtype="int32") - 1  #initialize output
    source_index = arange(source_array_size)

    if not replace:
        if exclude_index is not None:
            if not isinstance(exclude_index, ndarray):
                try:
                    exclude_index = asarray(exclude_index)
                except Exception:
                    raise TypeError("exclude_index must be of type ndarray")
            if exclude_index.shape[0] != rows:
                raise ValueError(
                    "exclude_index should have the same number of rows as sample_size[0]")
            if exclude_index.ndim == 1:
                exclude_index = exclude_index[:, newaxis]
        else:
            # -1 is not a valid index, so nothing is excluded; a column of zeros
            # would silently exclude source index 0 from every row
            exclude_index = zeros(shape=(sample_size[0], 1), dtype="int32") - 1

        for j in range(columns):
            slots_to_be_sampled = arange(rows)
            while True:
                proposed_index = probsample_replace(
                    source_index, slots_to_be_sampled.size, p_array)
                try:
                    exclude_array = exclude_index[slots_to_be_sampled, ]
                except Exception:
                    exclude_array = None
                duplicate_indicator = find_duplicates_others(
                    proposed_index, exclude_array)
                is_valid = duplicate_indicator == 0
                valid_index = slots_to_be_sampled[is_valid]
                sampled_choiceset_index[valid_index, j] = proposed_index[
                    is_valid]
                if nonzerocounts(duplicate_indicator) == 0:
                    break

                # only the rows that proposed an excluded index are sampled again
                slots_to_be_sampled = slots_to_be_sampled[
                    duplicate_indicator > 0]

            # indices sampled for this column must not be repeated in later columns
            exclude_index = concatenate(
                (exclude_index, take(sampled_choiceset_index, (j, ), axis=1)),
                axis=1)
    else:
        for j in range(columns):
            sampled_choiceset_index[:, j] = probsample_replace(
                source_index, rows, p_array)

    if return_index:
        return sampled_choiceset_index
    else:
        return source_array[sampled_choiceset_index]
def probsample_noreplace(source_array, sample_size, prob_array=None, exclude_index=None, return_index=False):
    """generate non-repeat random 1d samples from source_array of sample_size, excluding
    indices appeared in exclude_index.

    return indices to source_array if return_index is true.

    source_array - the source array to sample from
    sample_size - scalar representing the sample size
    prob_array - the array used to weight sample
    exclude_index - array representing indices should not appear in resulted array,
                    which can be used, for example, to exclude current choice from sampling,
                    indexed to source_array

    If source_array has fewer elements, or prob_array fewer non-zero weights, than
    sample_size, the sample is drawn with replacement by probsample_replace. If
    prob_array is None, sample_noreplace is used.

    raise TypeError if source_array or prob_array cannot be converted to an ndarray, and
    ValueError if the weights cannot be normalized.
    """
    if not isinstance(source_array, ndarray):
        try:
            source_array = asarray(source_array)
        except Exception:
            raise TypeError("source_array must be of type ndarray")

    pop_size = source_array.size

    if pop_size < sample_size:
        logger.log_warning(
            "There are less or equal indices (%s) in source_array than the sample_size (%s). Use probsample_replace. "
            % (pop_size, sample_size)
        )
        # sample_size = max
        return probsample_replace(source_array, sample_size, prob_array=prob_array, return_index=return_index)
    elif pop_size == sample_size:
        if return_index:
            return arange(source_array.size)
        else:
            return source_array

    if sample_size <= 0:
        # logger.log_warning("sample_size is %s. Nothing is sampled." % sample_size)
        return array([], dtype="i")

    if prob_array is None:
        return sample_noreplace(source_array, sample_size, return_index=return_index)

    if not isinstance(prob_array, ndarray):
        try:
            prob_array = asarray(prob_array)
        except Exception:
            raise TypeError("prob_array must be of type ndarray")

    p_array = prob_array.astype(float64)  # creates a copy (not just a pointer)

    p_array_sum = p_array.sum()
    if not ma.allclose(p_array_sum, 1.0):
        p_array = p_array / p_array_sum
        if abs(1.0 - p_array_sum) > 0.01:
            # print this message only if it is a serious difference
            logger.log_warning("prob_array doesn't sum up to 1, and is normalized. Sum: %s" % p_array_sum)

    prob_array_size = nonzerocounts(prob_array)
    if prob_array_size < sample_size:
        logger.log_warning(
            "There are less or equal non-zero weight (%s) in prob_array than the sample_size (%s). Use probsample_replace. "
            % (prob_array_size, sample_size)
        )
        return probsample_replace(source_array, sample_size, prob_array=p_array, return_index=return_index)
    elif prob_array_size == sample_size:
        if return_index:
            return where(prob_array > 0)[0]
        else:
            return source_array[prob_array > 0]

    totalmass = 1.0
    to_be_sampled = sample_size
    sampled_index = array([], dtype="i")  # initialize sampled_index

    if exclude_index is not None:
        try:
            totalmass -= asarray(p_array[exclude_index]).sum()
            p_array[exclude_index] = 0
        except IndexError:
            logger.log_warning("The exclude_index (%s) is not in prob array" % (exclude_index))
            # raise IndexError, "Having problem to apply exclude_index values"

    cum_prob = ncumsum(p_array / p_array.sum()) * totalmass
    if not ma.allclose(cum_prob[-1], totalmass):
        raise ValueError("prob_array doesn't sum up to 1 even after normalization")

    while True:
        # Draw in double precision and no further than the last cumulative value: a
        # draw rounded above cum_prob[-1] would give an index past the end.
        sample_prob = uniform(0, cum_prob[-1], to_be_sampled)
        proposed_index = searchsorted(cum_prob, sample_prob)  # , 0, cum_prob.size-1)

        # keep the first occurrence of every proposed index, in drawing order
        first_occurrence = numpy.unique(proposed_index, return_index=True)[1]
        valid_index = proposed_index[sort(first_occurrence)]
        sampled_index = concatenate((sampled_index, valid_index))
        if valid_index.size == to_be_sampled:
            if return_index:
                return sampled_index
            else:
                return source_array[sampled_index]

        # remove the mass of the accepted indices, so they cannot be drawn again
        totalmass -= asarray(p_array[valid_index]).sum()
        p_array[valid_index] = 0

        cum_prob = ncumsum(p_array / p_array.sum()) * totalmass
        assert ma.allclose(totalmass, cum_prob[-1])
        to_be_sampled -= valid_index.size