Code example #1
0
 def add_attribute(self, data, name, metadata=2):
     """Store 'data' on this dataset as attribute 'name' of type 'metadata'.

     If the attribute already exists, its values are overwritten.
     'metadata' should be of type AttributeType (PRIMARY=1, COMPUTED=2).
     Increments and returns the version number of the attribute.
     """
     # Coerce plain sequences into an ndarray; masked arrays pass through.
     if not (isinstance(data, ndarray) or is_masked_array(data)):
         data = array(data)
     name = self.create_and_check_qualified_variable_name(name)
     short_name = name.get_alias()
     if short_name not in self.get_attribute_names():
         # First time this attribute is seen: register a fresh box for it.
         self.attribute_boxes[short_name] = AttributeBox(
             self, data=[], variable_name=name, type=metadata)
     else:
         # Attribute exists: mark it loaded and update its declared type.
         box = self.attribute_boxes[short_name]
         box.set_is_in_memory(True)
         box.set_type(metadata)
     if metadata == AttributeType.PRIMARY:
         self._add_to_primary_attribute_names(short_name)
     self.df[short_name] = data
     self.__increment_version(short_name)
     return self.get_version(short_name)
Code example #2
0
 def aggregate_dataset_over_ids(self, dataset, function='sum', attribute_name=None, constant=None):
     """Aggregate attribute (given by 'attribute_name') of the given 'dataset' over
     self by applying the given function (the name of a numpy function, e.g. 'sum').

     The dataset is expected to have an attribute of the same name as the
     unique identifier of self. If 'attribute_name' is not given, the argument
     'constant' must be given, which is either a scalar or a numpy array. If it
     is a scalar, for each individual to be counted the constant value is taken
     into the function; if it is a numpy array of the same size as dataset, the
     value at each individual's index is counted into the function.

     Returns the result of the pandas groupby aggregation.
     """
     workdf = dataset.df
     if attribute_name is None:
         if constant is None:
             self._raise_error(StandardError,
                               "Either 'attribute_name' or 'constant' must be given.")
         elif isinstance(constant, ndarray):
             # A per-element constant array must match the dataset's size.
             if constant.size != dataset.size():
                 self._raise_error(StandardError,
                                   "constant's size (%d) must be of the same as dataset's size (%d)"
                                   % (constant.size, dataset.size()))
             values = constant
         else:
             # Scalar constant: repeat it once for every element of the dataset.
             values = resize(array([constant]), dataset.size())
         attribute_name = '__constant__'
         workdf[attribute_name] = values
     else:
         if is_masked_array(dataset[attribute_name]):
             w = where(ma.getmask(dataset[attribute_name]))
             if len(w) > 0:
                 # Replace masked elements with NaN so they are not considered
                 # in the computation.
                 workdf[attribute_name] = ma.filled(workdf[attribute_name], NaN)
     grouped = workdf.groupby(self.get_id_name())[attribute_name]
     f = getattr(np, function)
     return grouped.aggregate(f)
Code example #3
0
def prob2dsample(source_array, sample_size, prob_array=None, exclude_index=None,
                 replace=False, return_index=False):
    """Generate non-repeating random 2d samples from source_array of shape
    sample_size, not including indices appearing in exclude_index.

    Samples column by column, which is more efficient when there are more
    rows than columns in sample_size.

    source_array - the source array to sample from
    sample_size - tuple (rows, columns); indices do not repeat within a row
    prob_array - the array used to weight the sample
    exclude_index - array of indices that must not appear in the result,
        used to exclude the current choice from sampling
    replace - when True, sample with replacement (repeats allowed per row)
    return_index - when True, return indices into source_array instead of
        the sampled values

    Returns elements of source_array (or indices, see return_index) with
    shape sample_size.
    """
    rows, columns = sample_size
    source_array_size = source_array.size

    # Degenerate case: without replacement a row cannot hold more distinct
    # indices than exist, so every row simply gets the whole source array.
    # (Honors return_index; does not trigger when sampling with replacement.)
    if source_array_size <= columns and not replace:
        logger.log_warning("There are less or equal indices (%s) in source_array than the sample_size (%s). Sample %s." %
              (source_array_size, columns, source_array_size))
        if return_index:
            return ones((rows, 1), dtype='i') * arange(source_array_size)[newaxis, :]
        else:
            return ones((rows, 1), dtype='i') * source_array[newaxis, :]

    if prob_array is None:
        prob_array = ones(source_array_size)

    if not (isinstance(prob_array, ndarray) or is_masked_array(prob_array)):
        raise TypeError("prob_array must be of type ndarray")

    p_array = prob_array

    # Normalize the weights if they do not already sum to 1.
    p_array_sum = p_array.sum(dtype="float64")
    if not ma.allclose(p_array_sum, 1.0):
        if abs(1.0 - p_array_sum) > 0.01:
            # print this message only if it is a serious difference
            logger.log_warning("prob_array doesn't sum up to 1, and is normalized. Sum: %s" % p_array_sum)
        p_array = p_array / p_array_sum

    sampled_choiceset_index = zeros(sample_size, dtype="int32") - 1    # initialize output

    if not replace:
        if exclude_index is not None:
            if not isinstance(exclude_index, ndarray):
                try:
                    exclude_index = asarray(exclude_index)
                except Exception:
                    raise TypeError("exclude_index must be of type ndarray")
            if exclude_index.shape[0] != rows:
                raise ValueError("exclude_index should have the same number of rows as sample_size[0]")
            if rank(exclude_index) == 1:
                # Promote to a column vector so it can be concatenated below.
                exclude_index = exclude_index[:, newaxis]
        else:
            exclude_index = zeros(shape=(sample_size[0], 1), dtype="int32")

        # Sample one column at a time; within a column, keep resampling any
        # slot whose proposal duplicates an excluded or already-chosen index.
        for j in range(columns):
            slots_to_be_sampled = arange(rows)
            while True:
                proposed_index = probsample_replace(arange(source_array_size), slots_to_be_sampled.size, p_array)
                try:
                    exclude_array = exclude_index[slots_to_be_sampled,]
                except Exception:
                    # Best-effort: no exclusion applies when indexing fails.
                    exclude_array = None
                duplicate_indicator = find_duplicates_others(proposed_index, exclude_array)
                valid_index = slots_to_be_sampled[duplicate_indicator == 0]
                sampled_choiceset_index[valid_index, j] = proposed_index[duplicate_indicator == 0]
                if nonzerocounts(duplicate_indicator) == 0:
                    break
                slots_to_be_sampled = slots_to_be_sampled[duplicate_indicator > 0]

            # Indices accepted for this column are excluded from later columns.
            exclude_index = concatenate((exclude_index, take(sampled_choiceset_index, (j,), axis=1)), axis=1)
    else:
        for j in range(columns):
            sampled_choiceset_index[:, j] = probsample_replace(arange(source_array_size), rows, p_array)

    if return_index:
        return sampled_choiceset_index
    else:
        return source_array[sampled_choiceset_index]
Code example #4
0
def prob2dsample(source_array,
                 sample_size,
                 prob_array=None,
                 exclude_index=None,
                 replace=False,
                 return_index=False):
    """Generate non-repeating random 2d samples from source_array of shape
    sample_size, not including indices appearing in exclude_index.

    Samples column by column, which is more efficient when there are more
    rows than columns in sample_size.

    source_array - the source array to sample from
    sample_size - tuple (rows, columns); indices do not repeat within a row
    prob_array - the array used to weight the sample
    exclude_index - array of indices that must not appear in the result,
        used to exclude the current choice from sampling
    replace - when True, sample with replacement (repeats allowed per row)
    return_index - when True, return indices into source_array instead of
        the sampled values

    Returns elements of source_array (or indices, see return_index) with
    shape sample_size.
    """
    rows, columns = sample_size
    source_array_size = source_array.size

    # Degenerate case: without replacement a row cannot hold more distinct
    # indices than exist, so every row simply gets the whole source array.
    if source_array_size <= columns and not replace:
        logger.log_warning(
            "There are less or equal indices (%s) in source_array than the sample_size (%s). Sample %s."
            % (source_array_size, columns, source_array_size))

        if return_index:
            return ones(
                (rows, 1), dtype='i') * arange(source_array_size)[newaxis, :]
        else:
            return ones((rows, 1), dtype='i') * source_array[newaxis, :]

    if prob_array is None:
        prob_array = ones(source_array_size)

    if not (isinstance(prob_array, ndarray) or is_masked_array(prob_array)):
        raise TypeError("prob_array must be of type ndarray")

    p_array = prob_array

    # Normalize the weights if they do not already sum to 1.
    p_array_sum = p_array.sum(dtype="float64")
    if not ma.allclose(p_array_sum, 1.0):
        if abs(1.0 - p_array_sum) > 0.01:
            # print this message only if it is a serious difference
            logger.log_warning(
                "prob_array doesn't sum up to 1, and is normalized. Sum: %s" %
                p_array_sum)
        p_array = p_array / p_array_sum

    sampled_choiceset_index = zeros(
        sample_size, dtype="int32") - 1  # initialize output

    if not replace:
        if exclude_index is not None:
            if not isinstance(exclude_index, ndarray):
                try:
                    exclude_index = asarray(exclude_index)
                except Exception:
                    raise TypeError("exclude_index must be of type ndarray")
            if exclude_index.shape[0] != rows:
                raise ValueError("exclude_index should have the same number of rows as sample_size[0]")
            if rank(exclude_index) == 1:
                # Promote to a column vector so it can be concatenated below.
                exclude_index = exclude_index[:, newaxis]
        else:
            exclude_index = zeros(shape=(sample_size[0], 1), dtype="int32")

        # Sample one column at a time; within a column, keep resampling any
        # slot whose proposal duplicates an excluded or already-chosen index.
        for j in range(columns):
            slots_to_be_sampled = arange(rows)
            while True:
                proposed_index = probsample_replace(
                    arange(source_array_size), slots_to_be_sampled.size,
                    p_array)
                try:
                    exclude_array = exclude_index[slots_to_be_sampled, ]
                except Exception:
                    # Best-effort: no exclusion applies when indexing fails.
                    exclude_array = None
                duplicate_indicator = find_duplicates_others(
                    proposed_index, exclude_array)
                valid_index = slots_to_be_sampled[duplicate_indicator == 0]
                sampled_choiceset_index[valid_index, j] = proposed_index[
                    duplicate_indicator == 0]
                if nonzerocounts(duplicate_indicator) == 0:
                    break

                slots_to_be_sampled = slots_to_be_sampled[
                    duplicate_indicator > 0]

            # Indices accepted for this column are excluded from later columns.
            exclude_index = concatenate(
                (exclude_index, take(sampled_choiceset_index, (j, ), axis=1)),
                axis=1)
    else:
        for j in range(columns):
            sampled_choiceset_index[:, j] = probsample_replace(
                arange(source_array_size), rows, p_array)

    if return_index:
        return sampled_choiceset_index
    else:
        return source_array[sampled_choiceset_index]