Example #1
    def run(self,
            dataset1,
            dataset2,
            index1=None,
            index2=None,
            sample_size=10,
            weight=None,
            include_chosen_choice=None,
            with_replacement=True,
            resources=None,
            dataset_pool=None):
        """
        
        
        this function samples number of sample_size (scalar value) alternatives from dataset2
        for agent set specified by dataset1.
        If index1 is not None, only samples alterantives for agents with indices in index1;
        if index2 is not None, only samples alternatives from indices in index2.
        sample_size specifies number of alternatives to be sampled for each agent.
        weight, to be used as sampling weight, is either an attribute name of dataset2, or a 1d
        array of the same length as index2 or 2d array of shape (index1.size, index2.size).

        Also refer to document of interaction_dataset"""

        if dataset_pool is None:
            sc = SessionConfiguration()
            try:
                dataset_pool = sc.get_dataset_pool()
            except:
                dataset_pool = DatasetPool(sc.package_order)

        local_resources = Resources(resources)
        local_resources.merge_if_not_None({
            "dataset1": dataset1, "dataset2": dataset2,
            "index1": index1, "index2": index2,
            "sample_size": sample_size, "weight": weight,
            "with_replacement": with_replacement,
            "include_chosen_choice": include_chosen_choice})

        local_resources.check_obligatory_keys(
            ['dataset1', 'dataset2', 'sample_size'])
        agent = local_resources["dataset1"]
        choice = local_resources["dataset2"]
        index1 = local_resources.get("index1", None)
        if index1 is None:
            index1 = arange(agent.size())
        index2 = local_resources.get("index2", None)
        if index2 is None:
            index2 = arange(choice.size())

        if index1.size == 0 or index2.size == 0:
            err_msg = "either choice size or agent size is zero, return None"
            logger.log_warning(err_msg)
            return (None, None)

        agent_category_definition = local_resources.get(
            "agent_category_definition", [])
        choice_category_definition = local_resources.get(
            "choice_category_definition", [])
        agent_filter_attribute = local_resources.get("agent_filter_attribute",
                                                     None)
        category_inflating_factor = local_resources.get(
            "category_inflating_factor", 10)

        frequency, unique_agent_category_id, unique_choice_category_id, agent_category_id, choice_category_id = \
                get_category_and_frequency(agent, agent_category_definition,
                                           choice, choice_category_definition,
                                           agent_filter_attribute, category_inflating_factor,
                                           dataset_pool=dataset_pool)

        include_chosen_choice = local_resources.get("include_chosen_choice",
                                                    False)
        chosen_choice_id = agent.get_attribute(choice.get_id_name()[0])[index1]
        chosen_choice_index = choice.try_get_id_index(
            chosen_choice_id, return_value_if_not_found=-1)
        chosen_choice_index_to_index2 = lookup(chosen_choice_index,
                                               index2,
                                               index_if_not_found=UNPLACED_ID)

        J = local_resources["sample_size"]
        if include_chosen_choice:
            J = J - 1
        local_resources.merge_with_defaults(
            {'with_replacement': with_replacement})
        with_replacement = local_resources.get("with_replacement")

        sampled_index = empty((index1.size, J), dtype=DTYPE)
        sampling_prob = empty((index1.size, J), dtype="float64")

        _digitize, _where, _normalize = digitize, where, normalize
        _ncumsum, _rand, _searchsorted = ncumsum, rand, searchsorted  #speed hack
        for i in range(unique_agent_category_id.size):
            category_id = unique_agent_category_id[i]
            agents_in_this_category = _where(
                agent_category_id[index1] == category_id)[0]
            num_agents = agents_in_this_category.size
            if num_agents == 0: continue
            #import pdb; pdb.set_trace()

            ## divide frequency by the mean frequency to avoid overflow
            weights = frequency[i, _digitize(choice_category_id[index2], unique_choice_category_id) - 1] / frequency[i, :].mean()
            prob = _normalize(weights)
            index = _searchsorted(_ncumsum(prob),
                                  _rand(num_agents * J)).reshape(-1, J)

            if not with_replacement:
                raise NotImplementedError("Sampling without replacement is not yet implemented for this sampler.")
                #    nz = nonzero(prob)[0].size
                #    if J < nz:
                #        ## number of non zero weight less than alternatives, sample with replacement
                #        logger.log_warning("There are %s non zero weights and are less than the number of alternatives proposed %s. " % (nz, J) +
                #                           "Sample with replacement instead.")
                #        continue
                #    i=0; max_iterations=200
                #    while True:
                #        index = sort(index, axis=1)
                #        where_repeats = nonzero( logical_not(diff(index, axis=1)) )
                #        num_repeats = where_repeats[0].size
                #        if num_repeats == 0: break
                #        index[where_repeats] = _searchsorted(_rand(num_repeats), prob)
                #        i += 1
                #        if i > max_iterations:
                #            logger.log_warning("weight_sampler_by_category is unable to sample %i alternatives without replacement in %i iterations; " % \
                #                               (J, max_iterations) +
                #                               "give up sampling without replacement and results may contain replacement."
                #                              )
                #            break

            sampled_index[agents_in_this_category, :] = index
            sampling_prob[agents_in_this_category, :] = prob[index]

        sampled_index = index2[sampled_index]
        is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
        #chosen_choice = -1 * ones(chosen_choice_index.size, dtype="int32")
        if include_chosen_choice:
            sampled_index = column_stack(
                (chosen_choice_index[:, newaxis], sampled_index))
            is_chosen_choice[chosen_choice_index != UNPLACED_ID, 0] = 1

            sampling_prob_for_chosen_choices = take(
                prob, chosen_choice_index_to_index2[:, newaxis])
            ## if the chosen choice is unplaced, its sampling probability is 0
            sampling_prob_for_chosen_choices[where(
                chosen_choice_index == UNPLACED_ID)[0], ] = 0.0
            sampling_prob = column_stack(
                [sampling_prob_for_chosen_choices, sampling_prob])

        #chosen_choice[where(is_chosen_choice)[0]] = where(is_chosen_choice)[1]

        interaction_dataset = self.create_interaction_dataset(
            dataset1, dataset2, index1, sampled_index)
        interaction_dataset.add_attribute(sampling_prob,
                                          '__sampling_probability')
        interaction_dataset.add_attribute(is_chosen_choice, 'chosen_choice')

        ## to get the older returns
        #sampled_index = interaction_dataset.get_2d_index()
        #chosen_choices = UNPLACED_ID * ones(index1.size, dtype="int32")
        #where_chosen = where(interaction_dataset.get_attribute("chosen_choice"))
        #chosen_choices[where_chosen[0]]=where_chosen[1]
        #return (sampled_index, chosen_choice)

        return interaction_dataset
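
The sampler above draws J alternatives per agent with the classic inverse-CDF trick: uniform random numbers are searched against the cumulative probability array (`_searchsorted(_ncumsum(prob), _rand(num_agents * J))`). Below is a minimal, self-contained sketch of that technique in plain numpy; the values are illustrative and the helpers stand in for opus_core's normalize/ncumsum, which are assumed here to be a sum-to-one normalization and a cumulative sum.

import numpy as np

weights = np.array([5.0, 1.0, 0.0, 4.0])  # sampling weights for 4 alternatives
prob = weights / weights.sum()            # normalize() equivalent: probabilities sum to 1
cdf = np.cumsum(prob)                     # ncumsum() equivalent: cumulative distribution

num_agents, J = 3, 2
# Each uniform draw lands in exactly one CDF bin; searchsorted returns that
# alternative's index, so alternatives are drawn with probability prob[i],
# with replacement, as in the loop body above.
index = np.searchsorted(cdf, np.random.rand(num_agents * J)).reshape(-1, J)
print index  # e.g. [[0 3] [3 0] [0 1]]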
Example #3
class InteractionDataset(Dataset):
    """Class serves as a holder of interaction variables."""

    def __init__(self, resources=None, dataset1=None, dataset2=None, index1=None, index2=None, dataset_name=None,
                  debug=None):
        """ Argument 'resources' is of type Resources. It is merged with arguments. It should contain:
                dataset1 - agent class
                dataset2 - class of the choice dataset
            Optional:
                index1 - 1D array, indices of dataset1
                index2 - If 2D array: row i contains indices of individuals of dataset2 that belong to
                        i-th individual of dataset1[index1].
                        If 1D array: indices of individuals of dataset2 for all individuals of dataset1[index1].
                dataset_name - subdirectory in which implementation of the interaction variables is placed (default "")
            dataset1.resources and dataset2.resources should contain key 'dataset_name' (see Dataset.get_dataset_name()).
        """
        self.resources = Resources(resources)
        self.resources.merge_if_not_None({
                "dataset1":dataset1, "dataset2":dataset2,
                "index1":index1, "index2":index2,
                "dataset_name":dataset_name, "debug":debug})
        self.attribute_boxes = {}
        self.attribute_names = []
        self.debug = self.resources.get("debug",  0)
        if not isinstance(self.debug, DebugPrinter):
            self.debug = DebugPrinter(self.debug)
        self.resources.check_obligatory_keys(["dataset1", "dataset2"])
        self.dataset1 = self.resources["dataset1"]
        self.dataset2 = self.resources["dataset2"]
        self.index1 = self.resources.get("index1", None)
        self.index2 = self.resources.get("index2", None)
        self.dataset_name = self.resources.get("dataset_name", None)
        if self.dataset_name is None:
            self.dataset_name = self.dataset1.get_dataset_name() + '_x_' + self.dataset2.get_dataset_name()
        self._primary_attribute_names = []
        self.index1_mapping = {}
        if self.index1 is not None:
            self.index1_mapping = do_id_mapping_dict_from_array(self.index1)
        self._id_names = None # for compatibility with Dataset
        self.variable_factory = VariableFactory()
        self._aliases = {} # for compatibility with Dataset

    def _ensure_id_attribute_is_loaded(self):
        pass
    
    def get_attribute(self, name):
        """ Return an array of the (by the argument name) given attribute. """
        if not isinstance(name, VariableName):
            attr_name = VariableName(name)
        else:
            attr_name = name
        alias = attr_name.get_alias()
        dataset_name = attr_name.get_dataset_name()
        if not (alias in self.get_attribute_names()):
            if dataset_name == self.get_dataset(1).dataset_name:
                index = self.get_2d_index_of_dataset1()
                return self.get_dataset(1).get_attribute_by_index(attr_name, index)
            if dataset_name == self.get_dataset(2).dataset_name:
                index = self.get_2d_index()
                return self.get_dataset(2).get_attribute_by_index(attr_name, index)
            
            if alias in self.get_dataset(1).get_known_attribute_names():
                index = self.get_2d_index_of_dataset1()
                return self.get_dataset(1).get_attribute_by_index(attr_name, index)
            if alias in self.get_dataset(2).get_known_attribute_names():
                index = self.get_2d_index()
                return self.get_dataset(2).get_attribute_by_index(attr_name, index)
            self._raise_error(NameError, "Variable %s not found!" % alias)
        return self.attribute_boxes[alias].get_data()

    def get_attribute_of_dataset(self, name, dataset_number=1):
        """ Return values of attribute given by 'name' belonging to the given dataset, 
        possibly filtred by the corresponding indes. It is a 1d array of size 
        reduced_n or reduced_m.
        """
        index = self.get_index(dataset_number)
        if index is not None:
            return self.get_dataset(dataset_number).get_attribute_by_index(name, index)
        return self.get_dataset(dataset_number).get_attribute(name)
        
    def get_id_attribute_of_dataset(self, dataset_number=1):
        """Like 'get_attribute_of_dataset' where name is the id_name of the given dataset.
        """
        index = self.get_index(dataset_number)
        if index is not None:
            return self.get_dataset(dataset_number).get_id_attribute()[index]
        return self.get_dataset(dataset_number).get_id_attribute()

    def add_primary_attribute(self, data, name):
        """ Add values given in argument 'data' to the dataset as an attribute 'name'. 
        'data' should be an array of the same size as the dataset.
        If this attribute already exists, its values are overwritten.
        The attribute is marked as a primary attribute.
        """
        if not isinstance(data, ndarray):
            data = array(data)
        if data.shape[0] != self.size()[0][0] or data.shape[1] != self.size()[0][1]:
            logger.log_warning("In add_primary_attribute: Mismatch in sizes of the argument 'data' and the InteractionDataset object.")
        self.add_attribute(data, name, metadata=AttributeType.PRIMARY)
        
    def _compute_if_needed(self, name, dataset_pool, resources=None, quiet=False, version=None):
        """ Compute variable given by the argument 'name' only if this variable
        has not been computed before.
        Check first if this variable belongs to dataset1 or dataset2.
        dataset_pool holds available datasets.
        """
        if not isinstance(name, VariableName):
            variable_name = VariableName(name)
        else:
            variable_name = name
        short_name = variable_name.get_alias()
        if (short_name in self.get_attribute_names()) and (self.are_dependent_variables_up_to_date(
                            variable_name, version=version)):
            return version #nothing to be done
        dataset_name = variable_name.get_dataset_name()
        if dataset_name == self.get_dataset_name():
            new_version = self._compute_one_variable(variable_name, dataset_pool, resources)
        else:
            owner_dataset, index = self.get_owner_dataset_and_index(dataset_name)
            if owner_dataset is None:
                self._raise_error(StandardError, "Cannot find variable '%s'\nin either dataset or in the interaction set." %
                                variable_name.get_expression())
            owner_dataset.compute_variables([variable_name], dataset_pool, resources=resources, quiet=True)
            new_version = self.add_attribute(data = owner_dataset.get_attribute_by_index(variable_name, index),
                name = variable_name, metadata = AttributeType.COMPUTED)
            attribute_box = owner_dataset._get_attribute_box(variable_name)
            variable = attribute_box.get_variable_instance()
            my_attribute_box = self._get_attribute_box(variable_name)
            my_attribute_box.set_variable_instance(variable)
        return new_version

    def get_owner_dataset_and_index(self, dataset_name):
        if dataset_name == self.dataset1.get_dataset_name():
            return (self.dataset1, self.get_2d_index_of_dataset1())
        elif dataset_name == self.dataset2.get_dataset_name():
            return (self.dataset2, self.get_2d_index())
        return (None, None)

    def are_dependent_variables_up_to_date(self, variable_name, version):
        """ Return True if the version of this variable correspond to versions of all
        dependent variables, otherwise False. That is, if any of the dependent variable
        must be recomputed, the method returns False.
        """
        short_name = variable_name.get_alias()
        if short_name in self.get_primary_attribute_names():
            return self.is_version(short_name, version)

        dataset_name = variable_name.get_dataset_name()
        if dataset_name == self.dataset1.get_dataset_name():
            owner_dataset = self.dataset1
        elif dataset_name == self.dataset2.get_dataset_name():
            owner_dataset = self.dataset2
        else:
            owner_dataset = self

        if dataset_name != owner_dataset.get_dataset_name():
            self._raise_mismatch_dataset_name_error(variable_name)
        if owner_dataset is self:
            attribute_box = owner_dataset._get_attribute_box(variable_name)
            if attribute_box is None:
                return False
            variable = attribute_box.get_variable_instance()
            res = variable.are_dependent_variables_up_to_date(version)
            return False not in res
        return owner_dataset.are_dependent_variables_up_to_date(variable_name, version)

    def _prepare_dataset_pool_for_variable(self, dataset_pool=None, resources=None):
        dataset_pool, compute_resources = Dataset._prepare_dataset_pool_for_variable(self, dataset_pool, resources)
        dataset1_name = "dataset1"
        dataset2_name = "dataset2"
        dataset1 = self.get_dataset(1)
        dataset2 = self.get_dataset(2)
        if dataset1 is not None:
            dataset1_name = dataset1.get_dataset_name()
        if dataset2 is not None:
            dataset2_name = dataset2.get_dataset_name()
        dataset_pool.add_datasets_if_not_included({dataset1_name: dataset1, dataset2_name: dataset2})
        return dataset_pool, compute_resources

    def get_n(self):
        """Return size of dataset 1.
        """
        return self.dataset1.size()

    def get_m(self):
        """Return size of dataset 2.
        """
        return self.dataset2.size()

    def get_reduced_n(self):
        if self.index1 is None:
            return self.get_n()
        if isinstance(self.index1, ndarray):
            return self.index1.shape[0]
        return self.get_n()

    def get_reduced_m(self):
        if self.index2 is None:
            return self.get_m()
        if isinstance(self.index2, ndarray):
            if self.index2.ndim == 1:
                return self.index2.shape[0]
            else:
                return self.index2.shape[1]
        return self.get_m()

    def size(self):
        return [(self.get_reduced_n(), self.get_reduced_m()), (self.get_n(), self.get_m())]

    def get_dataset(self, nr):
        if (nr == 1):
            return self.dataset1
        if (nr == 2):
            return self.dataset2
        return None

    def get_dataset_named(self, name):
        if name==self.dataset1.get_dataset_name():
            return self.dataset1
        if name==self.dataset2.get_dataset_name():
            return self.dataset2
        raise ValueError('trying to get an interaction set component named %s but it does not exist' % name)

    def get_index(self, nr):
        if (nr == 1):
            return self.index1
        if (nr == 2):
            return self.index2
        return None

    def attribute_sum(self, name):
        """Return the sum of values of the given attribute.
        """
        return (ma.ravel(self.get_attribute(name))).sum()

    def attribute_average(self, name):
        """Return the value of the given attribute averaged over the dataset.
        """
        return ma.average(ma.ravel(self.get_attribute(name)))

    def summary(self, names, resources=None):
        """Print a marginal summary of the attributes given in the list 'names'.
        """
        print "Summary\t\tsum\t\taverage"
        print "------------------------------------------------"
        if not isinstance(names,list):
            names = [names]
        for item in names:
            if not (item.get_alias() in self.get_attribute_names()):
                self.compute_variables([item], resources=resources)

            print item + "\t" + str(self.attribute_sum(item.alias))\
                     + "\t" + str(round(self.attribute_average(item.get_alias(),5)))

    def get_2d_dataset_attribute(self, name):
        """ Return a 2D array of the attribute given by 'name'. It is assumed
        to be an attribute of dataset2.
        The method serves the purpose of preparing 1D arrays for computing
        interaction operations (between dataset1 and dataset2) by transferring them to the corresponding 2D array.
        The resulting array is of size n x m, where m is either the attribute length of dataset2,
        or, if index2 is a 1D array, its length, or, if index2 is a 2D array,
        the number of columns. n is size of dataset1 or of index1 if given.
        If index2 is None, all values of the given attribute are repeated n times.
        """
        dataset = self.get_dataset(2)
        index = self.get_2d_index()
        return dataset.get_attribute_by_index(name, index)

    def get_2d_index(self):
        n = self.get_reduced_n()
        m = self.get_reduced_m()
        if self.index2 is None:
            index = indices((n,m))[1]
        elif isinstance(self.index2, ndarray):
            if self.index2.ndim == 1: # one-dim array
                index = repeat(reshape(self.index2,(1,self.index2.shape[0])), n, 0)
            else:
                index = self.index2
        else:
            self._raise_error(StandardError, "'index2' has incompatible type. It should be a numpy array or None.")
        if (index.shape[0] != n) or (index.shape[1] != m):
            self._raise_error(StandardError, "'index2' has wrong dimensions.")
        return index

    def get_2d_index_of_dataset1(self):
        n = self.get_reduced_n()
        m = self.get_reduced_m()
        index = self.get_index(1)
        if index is None:
            index = arange(n)
        return repeat(reshape(index, (index.size,1)), m, 1)

    def create_logit_data(self, coefficients, index=None):
        """It creates a data array corresponding to specified coefficients
        (=coefficients connected to a specification) as one variable per column.
        'coefficients' is of type "SpecifiedCoefficientsFor1Submodel".
        If 'index' is not None, it is considered as index (1D array) of dataset1 determining
        which individuals should be considered.
        Return a 3D array (nobservations|len(index) x nequations x nvariables).
        """
        shape = coefficients.getshape()
        neqs, nvar = shape[0:2]
        other_dims = ()
        if len(shape) > 2:
            other_dims = shape[2:]
        nparenteqs = coefficients.parent.nequations()
        if (neqs != self.get_reduced_m()) and (nparenteqs != self.get_reduced_m()):
            self._raise_error(StandardError, "create_logit_data: Mismatch in number of equations and size of dataset2.")

        if index is not None:
            nobs = index.size
        else:
            nobs = self.get_reduced_n()
            index = arange(nobs)

        variables = coefficients.get_full_variable_names()
        mapping = coefficients.get_coefficient_mapping()
        # Fill the x array from data array
        data_shape = tuple([nobs,neqs,nvar] + list(other_dims))
        try:
            x = zeros(data_shape, dtype=float32)
        except:    # in case it fails due to memory allocation error
            logger.log_warning("Not enough memory. Deleting not used attributes.",
                                tags=["memory", "logit"])
            var_names = map(lambda x: x.get_alias(), variables)
            self.dataset1.unload_not_used_attributes(var_names)
            self.dataset2.unload_not_used_attributes(var_names)
            collect()
            x = zeros(data_shape, dtype=float32)
        if (len(variables) <= 0) or (nobs <= 0):
            return x
        for ivar in range(nvar): # Iterate over variables
            if variables[ivar].is_constant_or_reserved_name():
                c = where(mapping[:,ivar] < 0, 0.0, 1)
                x[:,:,ivar] = c
            else:
                data = ma.filled(self.get_attribute(variables[ivar]),0.0)[index,]
                if neqs < nparenteqs:
                    data = take(data, coefficients.get_equations_index(), axis=1)
                if x.ndim > 3:
                    data = resize(data, tuple(list(x.shape[0:2]) + list(other_dims)))
                x[:,:,ivar] = data
        return x

    def create_logit_data_from_beta_alt(self, coefficients, index=None):
        """It creates a data array corresponding to specified coefficients
        (=coefficients connected to a specification) as one coefficient per column. (Thus there can be multiple columns
        of one variable.)
        'coefficients' is of type "SpecifiedCoefficientsFor1Submodel".
        If 'index' is not None, it is considered as index (1D array) of dataset1 determining
        which individuals should be considered.
        It puts zeros on spots where the corresponding coefficient is zero. It is meant to be used for preparing data
        for estimation.
        Return a 3D array (nobservations|len(index) x nequations x ncoefficients).
        """
        shape = coefficients.getshape()
        neqs, nvar = shape[0:2]
        other_dims = ()
        if len(shape) > 2:
            other_dims = shape[2:]
        nparenteqs = coefficients.parent.nequations()
        if (neqs != self.get_reduced_m()) and (nparenteqs != self.get_reduced_m()):
            self._raise_error(StandardError, "create_logit_data: Mismatch in number of equations and size of dataset2.")

        mapping = coefficients.get_coefmap_alt()
        ncoef = mapping.size
        if index is not None:
            nobs = index.size
        else:
            nobs = self.get_reduced_n()
            index = arange(nobs)

        variables = coefficients.get_variable_names_from_alt()

        # Fill the x array from data array
        data_shape = tuple([nobs,neqs,ncoef] + list(other_dims))
        try:
            x = zeros(data_shape, dtype=float32)
        except:    # in case it fails due to memory allocation error
            logger.log_warning("Not enough memory. Deleting not used attributes.",
                                tags=["memory", "logit"])
            self.dataset1.unload_not_used_attributes(unique(variables))
            self.dataset2.unload_not_used_attributes(unique(variables))
            collect()
            x = zeros(data_shape, dtype=float32)

        if (len(variables) <= 0) or (nobs <= 0):
            return x

        coefvalues = coefficients.get_beta_alt()
        for ivar in range(len(variables)): # Iterate over variables
            if coefficients.is_variable_constant_or_reserved_name(variables[ivar]):
                c = where(coefvalues[:,ivar] == 0, 0.0, 1)
                x[:,:,ivar] = c
            else:
                data = ma.filled(self.get_attribute(variables[ivar]),0.0)[index,]
                if neqs < nparenteqs:
                    data = take(data, coefficients.get_equations_index(), axis=1)
                if x.ndim > 3:
                    data = reshape(data, tuple(list(x.shape[0:2]) + len(other_dims)*[1]))
                    for iodim in range(len(other_dims)):
                        data = repeat(data, other_dims[iodim], axis=2+iodim)
                x[:,:,ivar] = data
                w = where(coefvalues[:,ivar] == 0)
                if x.ndim > 3:
                    x[:,w[0], ivar, w[1:]] = 0.0
                else:
                    x[:,w,ivar] = 0.0
        return x

    def modify_logit_data_for_estimation(self, data, choice, constants_positions=array([], dtype='int32')):
        """Modify the variable columns for alternative specific constants. It is set to one
        for choices where the actual choice have been made, otherwise zeros.
        'data' is a 3D array (output of create_logit_data).
        'choice' is a 1D array containing indices of the actual choices (within the sampled choice set)
            for each agent that was included in the data array.
        'constants_positions' is an array with indices of the alternative specific constants
            within the data array.
        """
        nobs, neqs, nvar = data.shape
        if where(choice<0)[0].size > 0:
            self._raise_error(StandardError, "There are no choices for some agents. Check argument 'choice'.")
        if constants_positions.size > 0:
            for const in constants_positions:
                data[:,:,const] = 0
                data[arange(nobs), choice, const] = 1
        return data
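
    # A worked illustration (hypothetical values) of modify_logit_data_for_estimation:
    # with 2 agents, 3 equations (alternatives) and one constant column at position 0,
    #   data = zeros((2, 3, 1)); choice = array([0, 2]); constants_positions = array([0])
    # the constant column is zeroed and then a 1 is set at each agent's actual choice:
    #   data[:, :, 0] -> [[1, 0, 0],
    #                     [0, 0, 1]]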

    def get_attribute_by_choice(self, name, choices, resources=None):
        """  'name' is an attribute of dataset2, 'choices' is 1D array - choices[i] represents a choice
        (index of attribute 'name' among the values index2[i,]) for individual i of dataset1[index1].
        If name is None, indices belonging to dataset2 are returned.
        The method returns 1D array - the actual values of the choices.
        """
        if choices.size != self.get_n():
            self._raise_error(StandardError, "get_attribute_by_choice: Argument 'choices' must be the same size as dataset1")
        if resources is not None:
            resources.merge_with_defaults(self.resources)
        if name is None:
            twoDattr = self.get_2d_index()
        else:
            twoDattr = self.get_2d_dataset_attribute(name)
        return take_choices(twoDattr, choices)

    def is_same_as(self, name1, name2):
        """Test equality of 2 variables. 'name1' is an attribute of dataset1, 'name2' is an attribute of 'dataset2'.
        Return a 2D array.
        """
        self.load_datasets()
        attr1 = reshape(self.get_attribute_of_dataset(name1),(self.get_reduced_n(), 1))
        return attr1 == self.get_2d_dataset_attribute(name2)

    def is_less_or_equal(self, name1, name2):
        """Test if attribute 'name1' (attr. of dataset1) is <= than attr. 'name2' (attr. 'dataset2').
        Return a 2D array.
        """
        self.load_datasets()
        attr1 = reshape(self.get_attribute_of_dataset(name1),(self.get_reduced_n(), 1))
        return attr1 <= self.get_2d_dataset_attribute(name2)

    def is_greater_or_equal(self, name1, name2):
        """est if attribute 'name1' (attr. of dataset1) is >= than attr. 'name2' (attr. 'dataset2').
        Return a 2D array.
        """
        self.load_datasets()
        attr1 = reshape(self.get_attribute_of_dataset(name1),(self.get_reduced_n(), 1))
        return attr1 >= self.get_2d_dataset_attribute(name2)

    def multiply(self, name1, name2):
        """Multiply 2 variables. 'name1' is an attribute of dataset1, 'name2' is an attribute of 'dataset2'.
        Return a 2D array.
        """
        self.load_datasets()
        attr1 = reshape(self.get_attribute_of_dataset(name1),(self.get_reduced_n(), 1))
        return attr1 * self.get_2d_dataset_attribute(name2)

    def divide(self, name1, name2):
        """ Divide variable 'name1' (attribute of dataset1) by variable 'name2' (attribute of 'dataset2').
        Return a masked 2D array.
        """
        self.load_datasets()
        attr2 = reshape(self.get_attribute_of_dataset(name2),(self.get_reduced_n(), 1))
        return self.get_2d_dataset_attribute(name1) / ma.masked_where(attr2 == 0.0, attr2.astype(float32))

    def match_agent_attribute_to_choice(self, name, dataset_pool=None):
        """ Return a tuple where the first element is a 2D array of the attribute 'name_{postfix}'. 
        It is assumed to be an attribute
        of dataset1 (possibly computed). {postfix} is created either by values of the attribute
        'name' of dataset2 (if it has any such attribute), or by the id values of dataset2.
        The second value of the resulting tuple is a list of dependent variables.
        """
        if 'name' in self.get_dataset(2).get_known_attribute_names():
            name_postfix = self.get_attribute_of_dataset('name', 2)
        else:
            name_postfix = self.get_id_attribute_of_dataset(2)
        name_postfix_alt = self.get_id_attribute_of_dataset(2)
        
        dependencies = []
        for i in range(self.get_reduced_m()):
            full_name = VariableName("%s_%s" % (name, name_postfix[i]))
            if full_name.get_dataset_name() is None:
                full_name = VariableName("%s.%s" % (self.get_dataset(1).get_dataset_name(), full_name.get_expression()))
            try:
                self.get_dataset(1).compute_variables(full_name, dataset_pool=dataset_pool)
            except:
                full_name = VariableName("%s_%s" % (name, name_postfix_alt[i]))
                if full_name.get_dataset_name() is None:
                    full_name = VariableName("%s.%s" % (self.get_dataset(1).get_dataset_name(), full_name.get_expression()))
                self.get_dataset(1).compute_variables(full_name, dataset_pool=dataset_pool)
            
            dependencies.append(full_name.get_expression())
            if i == 0:
                result = self.get_attribute(full_name)
            else:
                result[:,i] = self.get_attribute_of_dataset(full_name, 1)
        return result, dependencies
            
    def load_datasets(self):
        if self.dataset1.size() <= 0:
            self.dataset1.get_id_attribute()
        if self.dataset2.size() <= 0:
            self.dataset2.get_id_attribute()

    def get_index1_idx(self, ids):
        ids = asarray(ids)
        try:
            return array(map(lambda x: self.index1_mapping[x], ids))
        except:
            return None

    def get_dependent_datasets(self, variables):
        """Return a list of dataset names that the given variables depend on."""
        result = []
        for variable in variables:
            try:
                result = result + self.get_dataset(1).get_dependent_datasets(variables=[variable], quiet=True)
            except:
                try:
                    result = result + self.get_dataset(2).get_dependent_datasets(variables=[variable], quiet=True)
                except:
                    result = result + get_dependency_datasets(variables=[variable])
        result = get_distinct_list(result)
        for i in [1,2]: # remove dependencies on datasets of this interaction, since it is implicitly given
            dataset_name = self.get_dataset(i).get_dataset_name()
            if dataset_name in result:
                result.remove(dataset_name)
        return result

    def _raise_error(self, error, msg):
        raise error("In interaction set '%s': %s'" % (self.name(), msg))

    def name(self):
        return "%s -> %s" % (self.dataset1.get_dataset_name(),
                                            self.dataset2.get_dataset_name())

    def get_mask(self, index):
        """index is an array of size reduced_n. The method returns array of 1's and 0's
        (of size reduced_n x reduced_m) where 0's are on rows determined by index.
        """
        mask = ones((self.get_reduced_n(), self.get_reduced_m()), dtype="int32")
        for i in index:
            mask[i,:] = 0
        return mask

    def interact_attribute_with_condition(self, attribute, condition, filled_value=0.0, do_logical_not=False):
        """Creates a 2D array (reduced_n x reduced_m) with values of 'attribute' on spots where values of the 'condition'
        attribute are > 0. All other spots have 'filled_value'. 'attribute' is an attribute name of
        the second dataset; 'condition' is an attribute name of the first dataset.
        If 'do_logical_not' is True, the condition is negated.
        """
        cond_values = self.get_attribute_of_dataset(condition)
        if do_logical_not:
            cond_values = logical_not(cond_values)
        index = where(cond_values > 0)[0]
        mask = self.get_mask(index)
        return ma.filled(ma.masked_array(self.get_2d_dataset_attribute(attribute), mask=mask), filled_value)

    def create_and_check_qualified_variable_name(self, name):
        """Convert name to a VariableName if it isn't already, and add dataset_name to
        the VariableName if it is missing.  If it already has a dataset_name, make sure
        it is the same as the name of this dataset.
        """
        if isinstance(name, VariableName):
            vname = name
        else:
            vname = VariableName(name)
        if vname.get_dataset_name() is None:
            vname.set_dataset_name(self.get_dataset_name())
        else:
            self._check_dataset_name(vname)
            
        return vname
    
    def get_flatten_dataset(self):
        """Creates a new dataset that is a 1D version of this dataset. All attributes are flattened.
        Id name is a combination of the two id attributes.
        """
        storage = StorageFactory().get_storage('dict_storage')
            
        table_name = '%s_flatten' % self.get_dataset_name()
        data = {}
        for attr in self.get_known_attribute_names():
            data[attr] = self.get_attribute(attr).ravel()
            
        ids = []
        for i in [1,2]:
            id_name = self.get_dataset(i).get_id_name()[0]
            ids.append(id_name)
            if id_name not in data.keys():
                data[id_name] = self.get_attribute(id_name).ravel()
            
        storage.write_table(
                    table_name=table_name,
                    table_data=data
                )
        dataset = Dataset(in_storage=storage, id_name=ids,
                          dataset_name=table_name, in_table_name=table_name)
        return dataset
    
    def _check_dataset_name(self, vname):
        """check that name is the name of this dataset or one of its components"""
        name = vname.get_dataset_name()
        dataset_names = set([self.get_dataset_name()] + list(self.get_dataset(i).get_dataset_name() for i in [1,2]))
        if name not in dataset_names:
            raise ValueError, "When checking dataset name of '%s': different dataset names for variable and dataset or a component: '%s' <> '%s'" % (vname.get_expression(), name, dataset_names)

    def add_mnl_bias_correction_term(self, probability, sampled_index, bias_attribute_name='__mnl_bias_correction_term'):
        """Compute and add an MNL bias correction term introduced by sampling. 
        'probability' is a probability array of the whole choice set. 
        'sampled_index' is an index of elements within the 'probability' array determining the sampled set of alternatives.
        The computed term is added to the interaction set as an additional attribute,
        using the name given in 'bias_attribute_name'.
        This method is mainly to be used by Samplers classes.
        """
        lnprob = ln(probability)
        ln1minusprob = ln(1-probability)
        bias_term = ln1minusprob.sum() - \
                    take(ln1minusprob, sampled_index).sum(axis=1).reshape((self.get_reduced_n(),1)) + \
                    take(lnprob, sampled_index).sum(axis=1).reshape((self.get_reduced_n(),1)) - \
                    take(lnprob, sampled_index)       
        self.add_attribute(bias_term, bias_attribute_name)
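
The pairing logic of the class lives in get_2d_index and get_2d_index_of_dataset1: agent rows and alternative columns are expanded to a common reduced_n x reduced_m shape. A short numpy sketch of the same expansions, outside the class, with illustrative values only:

from numpy import arange, array, indices, repeat, reshape

n, m = 3, 4  # reduced_n agents x reduced_m alternatives

# index2 is None: every agent sees all m alternatives (get_2d_index).
idx2_all = indices((n, m))[1]            # rows of [0 1 2 3]

# index2 is 1D: one alternative subset repeated for each agent.
index2 = array([5, 7, 2, 9])
idx2_1d = repeat(reshape(index2, (1, index2.shape[0])), n, 0)

# get_2d_index_of_dataset1: each agent's own index repeated across the m columns.
index1 = arange(n)
idx1_2d = repeat(reshape(index1, (index1.size, 1)), m, 1)
# [[0 0 0 0], [1 1 1 1], [2 2 2 2]]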
Example #4
    def run(self,
            dataset1,
            dataset2,
            index1=None,
            index2=None,
            sample_size=10,
            weight=None,
            include_chosen_choice=False,
            with_replacement=False,
            resources=None,
            dataset_pool=None):
        """this function samples number of sample_size (scalar value) alternatives from dataset2
        for agent set specified by dataset1.
        If index1 is not None, only samples alterantives for agents with indices in index1;
        if index2 is not None, only samples alternatives from indices in index2.
        sample_size specifies number of alternatives to be sampled for each agent.
        weight, to be used as sampling weight, is either an attribute name of dataset2, or a 1d
        array of the same length as index2 or 2d array of shape (index1.size, index2.size).

        Also refer to document of interaction_dataset"""

        if dataset_pool is None:
            try:
                sc = SessionConfiguration()
                dataset_pool = sc.get_dataset_pool()
            except:
                dataset_pool = DatasetPool()

        local_resources = Resources(resources)
        local_resources.merge_if_not_None({
            "dataset1": dataset1, "dataset2": dataset2,
            "index1": index1, "index2": index2,
            "sample_size": sample_size, "weight": weight,
            "with_replacement": with_replacement,
            "include_chosen_choice": include_chosen_choice})

        local_resources.check_obligatory_keys(
            ['dataset1', 'dataset2', 'sample_size'])
        agent = local_resources["dataset1"]
        index1 = local_resources.get("index1", None)
        if index1 is None:
            index1 = arange(agent.size())
        choice = local_resources["dataset2"]
        index2 = local_resources.get("index2", None)
        if index2 is None:
            index2 = arange(choice.size())

        if index1.size == 0 or index2.size == 0:
            err_msg = "either choice size or agent size is zero, return None"
            logger.log_warning(err_msg)
            return None

        include_chosen_choice = local_resources.get("include_chosen_choice",
                                                    False)
        J = local_resources["sample_size"]
        if include_chosen_choice:
            J = J - 1

        with_replacement = local_resources.get("with_replacement")

        weight = local_resources.get("weight", None)
        if isinstance(weight, str):
            if weight in choice.get_known_attribute_names():
                weight = choice.get_attribute(weight)
                rank_of_weight = 1
            else:
                varname = VariableName(weight)
                if varname.get_dataset_name() == choice.get_dataset_name():
                    weight = choice.compute_variables(
                        weight, dataset_pool=dataset_pool)
                    rank_of_weight = 1
                elif varname.get_interaction_set_names() is not None:
                    ## weights can be an interaction variable
                    interaction_dataset = InteractionDataset(local_resources)
                    weight = interaction_dataset.compute_variables(
                        weight, dataset_pool=dataset_pool)
                    rank_of_weight = 2
                    assert (len(weight.shape) >= rank_of_weight)
                else:
                    err_msg = ("weight is neither a known attribute name "
                               "nor a simple variable from the choice dataset "
                               "nor an interaction variable: '%s'" % weight)
                    logger.log_error(err_msg)
                    raise ValueError(err_msg)
        elif isinstance(weight, ndarray):
            rank_of_weight = weight.ndim
        elif not weight:  ## weight is None or empty string
            weight = ones(index2.size)
            rank_of_weight = 1
        else:
            err_msg = "unkown weight type"
            logger.log_error(err_msg)
            raise TypeError, err_msg

        if (weight.size != index2.size) and (weight.shape[rank_of_weight - 1]
                                             != index2.size):
            if weight.shape[rank_of_weight - 1] == choice.size():
                if rank_of_weight == 1:
                    weight = take(weight, index2)
                if rank_of_weight == 2:
                    weight = take(weight, index2, axis=1)
            else:
                err_msg = "weight array size doesn't match to size of dataset2 or its index"
                logger.log_error(err_msg)
                raise ValueError, err_msg

        prob = normalize(weight)

        #chosen_choice = ones(index1.size) * UNPLACED_ID
        chosen_choice_id = agent.get_attribute(choice.get_id_name()[0])[index1]
        #index_of_placed_agent = where(greater(chosen_choice_id, UNPLACED_ID))[0]
        chosen_choice_index = choice.try_get_id_index(
            chosen_choice_id, return_value_if_not_found=UNPLACED_ID)
        chosen_choice_index_to_index2 = lookup(chosen_choice_index,
                                               index2,
                                               index_if_not_found=UNPLACED_ID)

        if rank_of_weight == 1:  # if weight_array is 1d, then each agent shares the same weight for choices
            replace = with_replacement  # may be switched to True below if there are too few non-zero weights
            non_zero_counts = nonzerocounts(weight)
            if non_zero_counts < J:
                logger.log_warning(
                    "weight array dosen't have enough non-zero counts, use sample with replacement"
                )
                replace = True
            if non_zero_counts > 0:
                sampled_index = prob2dsample(
                    index2,
                    sample_size=(index1.size, J),
                    prob_array=prob,
                    exclude_index=chosen_choice_index_to_index2,
                    replace=replace,
                    return_index=True)
            else:
                # all alternatives have a zero weight
                sampled_index = zeros((index1.size, 0), dtype=DTYPE)
            #return index2[sampled_index]

        if rank_of_weight == 2:
            sampled_index = zeros((index1.size, J), dtype=DTYPE) - 1

            for i in range(index1.size):
                replace = with_replacement  # sampling with/without replacement
                i_prob = prob[i, :]
                if nonzerocounts(i_prob) < J:
                    logger.log_warning(
                        "weight array dosen't have enough non-zero counts, use sample with replacement"
                    )
                    replace = True

                #exclude_index passed to probsample_noreplace needs to be indexed to index2
                sampled_index[i, :] = probsample_noreplace(
                    index2,
                    sample_size=J,
                    prob_array=i_prob,
                    exclude_index=chosen_choice_index_to_index2[i],
                    return_index=True)
        sampling_prob = take(prob, sampled_index)
        sampled_index_within_prob = sampled_index.copy()
        sampled_index = index2[sampled_index]
        is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
        #chosen_choice = -1 * ones(chosen_choice_index.size, dtype="int32")
        if include_chosen_choice:
            sampled_index = column_stack(
                (chosen_choice_index[:, newaxis], sampled_index))
            is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
            is_chosen_choice[chosen_choice_index != UNPLACED_ID, 0] = 1
            #chosen_choice[where(is_chosen_choice)[0]] = where(is_chosen_choice)[1]
            ## this is necessary because prob is indexed to index2, not to the choice set (as is chosen_choice_index)
            sampling_prob_for_chosen_choices = take(
                prob, chosen_choice_index_to_index2[:, newaxis])
            ## if the chosen choice is unplaced, its sampling probability is 0
            sampling_prob_for_chosen_choices[where(
                chosen_choice_index == UNPLACED_ID)[0], ] = 0.0
            sampling_prob = column_stack(
                [sampling_prob_for_chosen_choices, sampling_prob])

        interaction_dataset = self.create_interaction_dataset(
            dataset1, dataset2, index1, sampled_index)
        interaction_dataset.add_attribute(sampling_prob,
                                          '__sampling_probability')
        interaction_dataset.add_attribute(is_chosen_choice, 'chosen_choice')

        if local_resources.get("include_mnl_bias_correction_term", False):
            if include_chosen_choice:
                sampled_index_within_prob = column_stack(
                    (chosen_choice_index_to_index2[:, newaxis],
                     sampled_index_within_prob))
            interaction_dataset.add_mnl_bias_correction_term(
                prob, sampled_index_within_prob)

        ## to get the older returns
        #sampled_index = interaction_dataset.get_2d_index()
        #chosen_choices = UNPLACED_ID * ones(index1.size, dtype="int32")
        #where_chosen = where(interaction_dataset.get_attribute("chosen_choice"))
        #chosen_choices[where_chosen[0]]=where_chosen[1]
        #return (sampled_index, chosen_choice)

        return interaction_dataset
    def run(self, dataset1, dataset2, index1=None, index2=None, sample_size=10, weight=None,
            include_chosen_choice=False, with_replacement=False, resources=None, dataset_pool=None):
        
        """this function samples number of sample_size (scalar value) alternatives from dataset2
        for agent set specified by dataset1.
        If index1 is not None, only samples alterantives for agents with indices in index1;
        if index2 is not None, only samples alternatives from indices in index2.
        sample_size specifies number of alternatives to be sampled for each agent.
        weight, to be used as sampling weight, is either an attribute name of dataset2, or a 1d
        array of the same length as index2 or 2d array of shape (index1.size, index2.size).

        Also refer to document of interaction_dataset"""

        if dataset_pool is None:
            try:
                sc = SessionConfiguration()
                dataset_pool = sc.get_dataset_pool()
            except:
                dataset_pool = DatasetPool()
        
        local_resources = Resources(resources)
        local_resources.merge_if_not_None(
                {"dataset1": dataset1, "dataset2": dataset2,
                 "index1": index1, "index2": index2,
                 "sample_size": sample_size, "weight": weight,
                 "with_replacement": with_replacement,
                 "include_chosen_choice": include_chosen_choice})

        local_resources.check_obligatory_keys(['dataset1', 'dataset2', 'sample_size'])
        agent = local_resources["dataset1"]
        index1 = local_resources.get("index1", None)
        if index1 is None:
            index1 = arange(agent.size())
        choice = local_resources["dataset2"]
        index2 = local_resources.get("index2", None)
        if index2 is None:
            index2 = arange(choice.size())
            
        if index1.size == 0 or index2.size == 0:
            err_msg = "either choice size or agent size is zero, return None"
            logger.log_warning(err_msg)
            return None
        
        include_chosen_choice = local_resources.get("include_chosen_choice",  False)
        J = local_resources["sample_size"]
        if include_chosen_choice:
            J = J - 1
            
        with_replacement = local_resources.get("with_replacement")
            
        weight = local_resources.get("weight", None)
        if isinstance(weight, str):
            if weight in choice.get_known_attribute_names():
                weight = choice.get_attribute(weight)
                rank_of_weight = 1
            elif VariableName(weight).get_dataset_name() == choice.get_dataset_name():
                weight = choice.compute_variables(weight, dataset_pool=dataset_pool)
                rank_of_weight = 1
            else:
                ## weights can be an interaction variable
                interaction_dataset = InteractionDataset(local_resources)
                weight = interaction_dataset.compute_variables(weight, dataset_pool=dataset_pool)
                rank_of_weight = 2
        elif isinstance(weight, ndarray):
            rank_of_weight = weight.ndim
        elif not weight:  ## weight is None or an empty string
            weight = ones(index2.size)
            rank_of_weight = 1
        else:
            err_msg = "unknown weight type"
            logger.log_error(err_msg)
            raise TypeError(err_msg)

        if (weight.size != index2.size) and (weight.shape[rank_of_weight-1] != index2.size):
            if weight.shape[rank_of_weight-1] == choice.size():
                if rank_of_weight == 1:
                    weight = take(weight, index2)
                if rank_of_weight == 2:
                    weight = take(weight, index2, axis=1)
            else:
                err_msg = "weight array size doesn't match the size of dataset2 or its index"
                logger.log_error(err_msg)
                raise ValueError(err_msg)

        prob = normalize(weight)

        #chosen_choice = ones(index1.size) * UNPLACED_ID
        chosen_choice_id = agent.get_attribute(choice.get_id_name()[0])[index1]
        #index_of_placed_agent = where(greater(chosen_choice_id, UNPLACED_ID))[0]
        chosen_choice_index = choice.try_get_id_index(chosen_choice_id, return_value_if_not_found=UNPLACED_ID)
        chosen_choice_index_to_index2 = lookup(chosen_choice_index, index2, index_if_not_found=UNPLACED_ID)
        
        if rank_of_weight == 1: # if the weight array is 1d, all agents share the same weights over choices
            replace = with_replacement           # sampling with/without replacement
            if nonzerocounts(weight) < J:
                logger.log_warning("weight array doesn't have enough non-zero counts; sampling with replacement")
                replace = True
            sampled_index = prob2dsample( index2, sample_size=(index1.size, J),
                                        prob_array=prob, exclude_index=chosen_choice_index_to_index2,
                                        replace=replace, return_index=True )
            #return index2[sampled_index]

        if rank_of_weight == 2:
            sampled_index = zeros((index1.size,J), dtype="int32") - 1
                
            for i in range(index1.size):
                replace = with_replacement          # sampling with/without replacement
                i_prob = prob[i,:]
                if nonzerocounts(i_prob) < J:
                    logger.log_warning("weight array dosen't have enough non-zero counts, use sample with replacement")
                    replace = True

                #exclude_index passed to probsample_noreplace needs to be indexed to index2
                sampled_index[i,:] = probsample_noreplace( index2, sample_size=J, prob_array=i_prob,
                                                     exclude_index=chosen_choice_index_to_index2[i],
                                                     return_index=True )
        sampling_prob = take(prob, sampled_index)
        sampled_index = index2[sampled_index]
        is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
        #chosen_choice = -1 * ones(chosen_choice_index.size, dtype="int32")
        if include_chosen_choice:
            sampled_index = column_stack((chosen_choice_index[:,newaxis],sampled_index))
            is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
            is_chosen_choice[chosen_choice_index!=UNPLACED_ID, 0] = 1
            #chosen_choice[where(is_chosen_choice)[0]] = where(is_chosen_choice)[1]
            ## this is necessary because prob is indexed to index2, not to the choice set (as is chosen_choice_index)
            sampling_prob_for_chosen_choices = take(prob, chosen_choice_index_to_index2[:, newaxis])
            ## if the chosen choice equals UNPLACED_ID then its sampling prob is 0
            sampling_prob_for_chosen_choices[where(chosen_choice_index==UNPLACED_ID)[0],] = 0.0
            sampling_prob = column_stack([sampling_prob_for_chosen_choices, sampling_prob])
        
        interaction_dataset = self.create_interaction_dataset(dataset1, dataset2, index1, sampled_index)
        interaction_dataset.add_attribute(sampling_prob, '__sampling_probability')
        interaction_dataset.add_attribute(is_chosen_choice, 'chosen_choice')
        
        ## to get the older returns
        #sampled_index = interaction_dataset.get_2d_index()
        #chosen_choices = UNPLACED_ID * ones(index1.size, dtype="int32") 
        #where_chosen = where(interaction_dataset.get_attribute("chosen_choice"))
        #chosen_choices[where_chosen[0]]=where_chosen[1]
        #return (sampled_index, chosen_choice)
        
        return interaction_dataset
Example No. 6
    def run(self,
            dataset1,
            dataset2,
            index1=None,
            index2=None,
            stratum=None,
            weight=None,
            sample_size=1,
            sample_size_from_each_stratum=None,
            sample_size_from_chosen_stratum=None,
            sample_rate=None,
            include_chosen_choice=False,
            resources=None,
            with_replacement=False,
            dataset_pool=None,
            **kwargs):
        """this function samples number of sample_size (scalar value) alternatives from dataset2
        for agent set specified by dataset1.
        If index1 is not None, only samples alternatives for agents with indices in index1;
        if index2 is not None, only samples alternatives from indices in index2.
        sample_size specifies number of alternatives to be sampled from each stratum, and is overwritten
          by sample_size_from_each_stratum if it's not None
        weight, to be used as sampling weight, is either an attribute name of dataset2, or a 1d
        array of the same length as index2 or 2d array of shape (index1.size, index2.size).

        Also refer to document of interaction_dataset"""
        if dataset_pool is None:
            try:
                sc = SessionConfiguration()
                dataset_pool = sc.get_dataset_pool()
            except:
                dataset_pool = DatasetPool()

        local_resources = Resources(resources)
        local_resources.merge_if_not_None({
            "dataset1": dataset1,
            "dataset2": dataset2,
            "index1": index1,
            "index2": index2,
            "with_replacement": with_replacement,
            "stratum": stratum,
            "weight": weight,
            "sample_size": sample_size,
            "sample_size_from_each_stratum": sample_size_from_each_stratum,
            "sample_size_from_chosen_stratum": sample_size_from_chosen_stratum,
            "sample_rate": sample_rate,
            "include_chosen_choice": include_chosen_choice
        })

        local_resources.check_obligatory_keys(['dataset1', 'dataset2'])
        index1 = local_resources.get("index1", None)

        agent = dataset1

        if index1 is None:
            agent.get_id_attribute()
            index1 = arange(agent.size())

        choice = local_resources["dataset2"]
        index2 = local_resources.get("index2", None)

        if index2 is None:
            choice.get_id_attribute()
            index2 = arange(choice.size())

        if index1.size == 0 or index2.size == 0:
            err_msg = "either choice size or agent size is zero, return None"
            logger.log_warning(err_msg)
            return (None, None)

        include_chosen_choice = local_resources.get("include_chosen_choice",
                                                    False)
        weight = local_resources.get("weight", None)

        if isinstance(weight, str):
            choice.compute_variables(weight, resources=local_resources)
            weight = choice.get_attribute(weight)
            rank_of_weight = 1
        elif isinstance(weight, ndarray):
            rank_of_weight = weight.ndim
        elif weight is None:
            weight = ones(index2.size)
            rank_of_weight = 1
        else:
            err_msg = "unknown weight type"
            logger.log_error(err_msg)
            raise TypeError(err_msg)

        if (weight.size != index2.size) and (weight.shape[rank_of_weight - 1]
                                             != index2.size):
            if weight.shape[rank_of_weight - 1] == choice.size():
                # index along the last (choice) axis so 2d weights are handled too
                weight = take(weight, index2, axis=rank_of_weight - 1)
            else:
                err_msg = "weight array size doesn't match the size of dataset2 or its index"
                logger.log_error(err_msg)
                raise ValueError(err_msg)

        prob = normalize(weight)

        stratum = local_resources.get("stratum", None)
        if stratum is None:
            raise StandardError("'stratum' must be defined for stratified sampling.")
        if isinstance(stratum, str):
            choice.compute_variables(stratum, resources=local_resources)
            stratum = choice.get_attribute(stratum)

        #chosen_choice = ones(index1.size) * UNPLACED_ID
        chosen_choice_id = agent.get_attribute(choice.get_id_name()[0])[index1]
        #index_of_placed_agent = where(greater(chosen_choice_id, UNPLACED_ID))[0]
        chosen_choice_index = choice.try_get_id_index(
            chosen_choice_id, return_value_if_not_found=-1)
        chosen_choice_index_to_index2 = lookup(chosen_choice_index,
                                               index2,
                                               index_if_not_found=UNPLACED_ID)

        ##TODO: check all chosen strata are in selectable strata
        #i.e. chosen_choice_index is in index2
        chosen_stratum = ones(chosen_choice_index.size,
                              dtype=DTYPE) * NO_STRATUM_ID
        chosen_stratum[where(
            chosen_choice_index != -1)] = stratum[chosen_choice_index[where(
                chosen_choice_index != -1)]]
        selectable_strata = stratum[index2]
        unique_strata = unique(selectable_strata)
        unique_strata = unique_strata[where(unique_strata != NO_STRATUM_ID)]

        #        if rank_of_weight == 2:
        #            raise RuntimeError, "stratified sampling for 2d weight is unimplemented yet"

        #        sampled_index = zeros((index1.size,1)) - 1

        sample_size = local_resources.get("sample_size", None)
        sample_size_from_each_stratum = local_resources.get(
            "sample_size_from_each_stratum", None)
        if sample_size_from_each_stratum is None:
            sample_size_from_each_stratum = sample_size
        strata_sample_size = ones(unique_strata.size,
                                  dtype=DTYPE) * sample_size_from_each_stratum
        sample_rate = local_resources.get("sample_rate", None)
        if sample_rate is not None:
            raise UnImplementedError("sample_rate is not implemented yet.")
            ##TODO: to be finished
            #num_elements_in_strata = histogram(selectable_strata, unique_strata)
            #strata_sample_size = round(num_elements_in_strata * sample_rate)

        sample_size_from_chosen_stratum = local_resources.get(
            "sample_size_from_chosen_stratum", None)
        if sample_size_from_chosen_stratum is None and not include_chosen_choice:
            strata_sample_pairs = array(
                map(lambda x, y: [x, y], unique_strata, strata_sample_size))
            if rank_of_weight == 1:
                sampled_index = self._sample_by_stratum(
                    index1, index2, selectable_strata, prob,
                    chosen_choice_index_to_index2, strata_sample_pairs)
            elif rank_of_weight == 2:
                sampled_index = self._sample_by_agent_and_stratum(
                    index1, index2, selectable_strata, prob,
                    chosen_choice_index_to_index2, strata_sample_pairs)
        else:
            strata_sample_setting = zeros((index1.size, unique_strata.size, 2),
                                          dtype=DTYPE)
            for i in range(index1.size):
                agents_strata_sample_size = copy.copy(strata_sample_size)
                if sample_size_from_chosen_stratum is None:
                    ## if sample_size_from_chosen_stratum is None and include_chosen_choice is True,
                    ## sample one less from the chosen stratum
                    agents_strata_sample_size[where(
                        unique_strata == chosen_stratum[i])] -= 1
                else:
                    agents_strata_sample_size[where(
                        unique_strata ==
                        chosen_stratum[i])] = sample_size_from_chosen_stratum
                strata_sample_pairs = array(
                    map(lambda x, y: [x, y], unique_strata,
                        agents_strata_sample_size))
                strata_sample_setting[i, ...] = strata_sample_pairs

            sampled_index = self._sample_by_agent_and_stratum(
                index1, index2, selectable_strata, prob,
                chosen_choice_index_to_index2, strata_sample_setting)
        #chosen_choice = None
        is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
        if include_chosen_choice:
            sampled_index = concatenate(
                (chosen_choice_index[:, newaxis], sampled_index), axis=1)
            #chosen_choice = zeros(chosen_choice_index.shape, dtype="int32") - 1
            #chosen_choice[where(chosen_choice_index>UNPLACED_ID)] = 0 #make chosen_choice index to sampled_index, instead of choice (as chosen_choice_index does)
            #since the chosen choice index is attached to the first column, the chosen choice should be all zeros
            #for valid chosen_choice_index
            is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
            is_chosen_choice[chosen_choice_index != UNPLACED_ID, 0] = 1

            chosen_probability = zeros(
                (chosen_choice_index.size, ), dtype=float32) - 1
            for stratum_id in unique_strata:  # avoid shadowing the stratum array
                w = chosen_stratum == stratum_id
                chosen_probability[w] = (
                    prob[chosen_choice_index[w]] /
                    prob[selectable_strata == stratum_id].sum()).astype(float32)
            self._sampling_probability = concatenate(
                (chosen_probability[:, newaxis], self._sampling_probability),
                axis=1)
            self._stratum_id = concatenate(
                (chosen_stratum[:, newaxis], self._stratum_id), axis=1)

        interaction_dataset = self.create_interaction_dataset(
            dataset1, dataset2, index1, sampled_index)
        interaction_dataset.add_attribute(self._sampling_probability,
                                          '__sampling_probability')
        interaction_dataset.add_attribute(is_chosen_choice, 'chosen_choice')
        interaction_dataset.add_attribute(self._stratum_id, 'stratum_id')

        ## to get the older returns
        #sampled_index = interaction_dataset.get_2d_index()
        #chosen_choices = UNPLACED_ID * ones(index1.size, dtype="int32")
        #where_chosen = where(interaction_dataset.get_attribute("chosen_choice"))
        #chosen_choices[where_chosen[0]]=where_chosen[1]
        #return (sampled_index, chosen_choice)

        return interaction_dataset
Example No. 7
    def run(self, dataset1, dataset2, index1=None, index2=None, stratum=None, weight=None,
            sample_size=1, sample_size_from_each_stratum=None, sample_size_from_chosen_stratum=None, sample_rate=None,
            include_chosen_choice=False, resources=None, with_replacement=False, dataset_pool=None, **kwargs):
        """this function samples number of sample_size (scalar value) alternatives from dataset2
        for agent set specified by dataset1.
        If index1 is not None, only samples alternatives for agents with indices in index1;
        if index2 is not None, only samples alternatives from indices in index2.
        sample_size specifies number of alternatives to be sampled from each stratum, and is overwritten
          by sample_size_from_each_stratum if it's not None
        weight, to be used as sampling weight, is either an attribute name of dataset2, or a 1d
        array of the same length as index2 or 2d array of shape (index1.size, index2.size).

        Also refer to document of interaction_dataset"""
        if dataset_pool is None:
            try:
                sc = SessionConfiguration()
                dataset_pool = sc.get_dataset_pool()
            except:
                dataset_pool = DatasetPool()
                        
        local_resources = Resources(resources)
        local_resources.merge_if_not_None(
                {"dataset1": dataset1, "dataset2": dataset2,
                 "index1": index1, "index2": index2,
                 "with_replacement": with_replacement,
                 "stratum": stratum, "weight": weight,
                 "sample_size": sample_size,
                 "sample_size_from_each_stratum": sample_size_from_each_stratum,
                 "sample_size_from_chosen_stratum": sample_size_from_chosen_stratum,
                 "sample_rate": sample_rate,
                 "include_chosen_choice": include_chosen_choice})

        local_resources.check_obligatory_keys(['dataset1', 'dataset2'])
        index1 = local_resources.get("index1", None)

        agent = dataset1

        if index1 is None:
            agent.get_id_attribute()
            index1 = arange(agent.size())

        choice = local_resources["dataset2"]
        index2 = local_resources.get("index2", None)

        if index2 is None:
            choice.get_id_attribute()
            index2 = arange(choice.size())

        if index1.size == 0 or index2.size == 0:
            err_msg = "either choice size or agent size is zero, return None"
            logger.log_warning(err_msg)
            return (None, None)

        include_chosen_choice = local_resources.get("include_chosen_choice",  False)
        weight = local_resources.get("weight", None)

        if isinstance(weight, str):
            choice.compute_variables(weight, resources=local_resources)
            weight = choice.get_attribute(weight)
            rank_of_weight = 1
        elif isinstance(weight, ndarray):
            rank_of_weight = weight.ndim
        elif weight is None:
            weight = ones(index2.size)
            rank_of_weight = 1
        else:
            err_msg = "unknown weight type"
            logger.log_error(err_msg)
            raise TypeError(err_msg)

        if (weight.size != index2.size) and (weight.shape[rank_of_weight-1] != index2.size):
            if weight.shape[rank_of_weight-1] == choice.size():
                # index along the last (choice) axis so 2d weights are handled too
                weight = take(weight, index2, axis=rank_of_weight-1)
            else:
                err_msg = "weight array size doesn't match the size of dataset2 or its index"
                logger.log_error(err_msg)
                raise ValueError(err_msg)

        prob = normalize(weight)

        stratum = local_resources.get("stratum", None)
        if stratum is None:
            raise StandardError("'stratum' must be defined for stratified sampling.")
        if isinstance(stratum, str):
            choice.compute_variables(stratum, resources=local_resources)
            stratum = choice.get_attribute(stratum)

        #chosen_choice = ones(index1.size) * UNPLACED_ID
        chosen_choice_id = agent.get_attribute(choice.get_id_name()[0])[index1]
        #index_of_placed_agent = where(greater(chosen_choice_id, UNPLACED_ID))[0]
        chosen_choice_index = choice.try_get_id_index(chosen_choice_id, return_value_if_not_found=-1)
        chosen_choice_index_to_index2 = lookup(chosen_choice_index, index2, index_if_not_found=UNPLACED_ID)
        
        ##TODO: check all chosen strata are in selectable strata
        #i.e. chosen_choice_index is in index2
        chosen_stratum = ones(chosen_choice_index.size, dtype=DTYPE) * NO_STRATUM_ID
        chosen_stratum[where(chosen_choice_index!=-1)] = stratum[chosen_choice_index[where(chosen_choice_index!=-1)]]
        selectable_strata = stratum[index2]
        unique_strata = unique(selectable_strata)
        unique_strata = unique_strata[where(unique_strata!=NO_STRATUM_ID)]

#        if rank_of_weight == 2:
#            raise RuntimeError, "stratified sampling for 2d weight is unimplemented yet"

#        sampled_index = zeros((index1.size,1)) - 1

        sample_size = local_resources.get("sample_size", None)
        sample_size_from_each_stratum = local_resources.get("sample_size_from_each_stratum", None)
        if sample_size_from_each_stratum is None:
            sample_size_from_each_stratum = sample_size
        strata_sample_size = ones(unique_strata.size, dtype=DTYPE) * sample_size_from_each_stratum
        sample_rate = local_resources.get("sample_rate", None)
        if sample_rate is not None:
            raise UnImplementedError("sample_rate is not implemented yet.")
            ##TODO: to be finished
            #num_elements_in_strata = histogram(selectable_strata, unique_strata)
            #strata_sample_size = round(num_elements_in_strata * sample_rate)

        sample_size_from_chosen_stratum = local_resources.get("sample_size_from_chosen_stratum", None)
        if sample_size_from_chosen_stratum is None and not include_chosen_choice:
            strata_sample_pairs = array(map(lambda x,y: [x,y], unique_strata, strata_sample_size))
            if rank_of_weight == 1:
                sampled_index = self._sample_by_stratum(index1, index2, selectable_strata, prob,
                                                        chosen_choice_index_to_index2, strata_sample_pairs)
            elif rank_of_weight == 2:
                sampled_index = self._sample_by_agent_and_stratum(index1, index2, selectable_strata, prob,
                                                                  chosen_choice_index_to_index2, strata_sample_pairs)
        else:
            strata_sample_setting = zeros((index1.size,unique_strata.size,2), dtype=DTYPE)
            for i in range(index1.size):
                agents_strata_sample_size = copy.copy(strata_sample_size)
                if sample_size_from_chosen_stratum is None:
                    ## if sample_size_from_chosen_stratum is None and include_chosen_choice is True, 
                    ## sample one less from the chosen stratum
                    agents_strata_sample_size[where(unique_strata==chosen_stratum[i])] -= 1
                else:
                    agents_strata_sample_size[where(unique_strata==chosen_stratum[i])] = sample_size_from_chosen_stratum
                strata_sample_pairs = array(map(lambda x,y: [x,y], unique_strata, agents_strata_sample_size))
                strata_sample_setting[i,...] = strata_sample_pairs

            sampled_index = self._sample_by_agent_and_stratum(index1, index2, selectable_strata, prob,
                                                              chosen_choice_index_to_index2, strata_sample_setting)
        #chosen_choice = None
        is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
        if include_chosen_choice:
            sampled_index = concatenate((chosen_choice_index[:,newaxis],sampled_index), axis=1)
            #chosen_choice = zeros(chosen_choice_index.shape, dtype="int32") - 1
            #chosen_choice[where(chosen_choice_index>UNPLACED_ID)] = 0
            # make chosen_choice an index into sampled_index, instead of into choice (as
            # chosen_choice_index is); since the chosen choice index is attached as the first
            # column, chosen_choice should be all zeros for valid chosen_choice_index
            is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
            is_chosen_choice[chosen_choice_index!=UNPLACED_ID, 0] = 1
            
            chosen_probability = zeros((chosen_choice_index.size,),dtype=float32) - 1
            for stratum_id in unique_strata:  # avoid shadowing the stratum array
                w = chosen_stratum==stratum_id
                chosen_probability[w] = (prob[chosen_choice_index[w]] / prob[selectable_strata==stratum_id].sum()).astype(float32)
            self._sampling_probability = concatenate((chosen_probability[:,newaxis], self._sampling_probability), axis=1)
            self._stratum_id = concatenate((chosen_stratum[:,newaxis], self._stratum_id), axis=1)

        interaction_dataset = self.create_interaction_dataset(dataset1, dataset2, index1, sampled_index)
        interaction_dataset.add_attribute(self._sampling_probability, '__sampling_probability')
        interaction_dataset.add_attribute(is_chosen_choice, 'chosen_choice')
        interaction_dataset.add_attribute(self._stratum_id, 'stratum_id')

        ## to get the older returns
        #sampled_index = interaction_dataset.get_2d_index()
        #chosen_choices = UNPLACED_ID * ones(index1.size, dtype="int32") 
        #where_chosen = where(interaction_dataset.get_attribute("chosen_choice"))
        #chosen_choices[where_chosen[0]]=where_chosen[1]
        #return (sampled_index, chosen_choice)
        
        return interaction_dataset    
Example No. 8
    def load_dataset(self, resources=None, attributes=None, in_storage=None,
                     in_table_name=None, lowercase=None, **kwargs):

        #set defaults
        attributes_default = '*'
        lower_default = 1 # if 1, use lowercase for attribute names

        # merge arguments with dictionaries and add missing entries
        local_resources = Resources(self.resources)
        if resources is not None:
            local_resources.merge_if_not_None(resources)
        local_resources.merge_if_not_None({"attributes":attributes,
                                           "in_storage":in_storage,
                                           "in_table_name":in_table_name,
                                           "lowercase":lowercase})
        local_resources.merge_with_defaults({"attributes":attributes_default,
                                             "lowercase":lower_default,
                                            })

        # check obligatory entries
        local_resources.check_obligatory_keys(["in_storage", "in_table_name"])

        # prepare for loading
        in_storage = local_resources["in_storage"]

        if not self._is_hidden_id():
            local_resources.merge({"id_name":self._id_names})
            
        table_name = local_resources['in_table_name']
        column_names = local_resources['attributes']
        chunked_attributes = self.chunk_columns(storage=in_storage,
                                                   table_name=table_name, 
                                                   column_names=column_names,
                                                   nchunks=1)
        # flatten list
        column_names = [name for name in chunked_attributes[0]
                                if name in in_storage.get_column_names(table_name)]
        data = in_storage.load_table(table_name = table_name, 
                                             column_names = column_names)
        self.df = pd.DataFrame(data)
        self.df.set_index(self._id_names, inplace=True)
        data_computed = {}
        if table_name+".computed" in in_storage.get_table_names():
            column_names_computed = [name for name in column_names
                                if name in in_storage.get_column_names(table_name+".computed")]
            data_computed = in_storage.load_table(table_name = table_name+".computed", 
                                                 column_names = column_names_computed)
            dfcomp = pd.DataFrame(data_computed)
            dfcomp.set_index(self._id_names, inplace=True)
            # pd.concat expects a list; join the computed columns on the shared id index
            self.df = pd.concat([self.df, dfcomp], axis=1)
                      
        for attr in data:
            if not (attr in self._id_names and attr in self.attribute_boxes): #do not store id_name every time
                self.attribute_boxes[attr] = AttributeBox(self, [],
                                                variable_name=self.create_and_check_qualified_variable_name(attr),
                                                type=AttributeType.PRIMARY,
                                                is_in_memory=True,
                                                header=None,
                                                version=0)

        for attr in data_computed:
            if not (attr in self._id_names and attr in self.attribute_boxes): #do not store id_name every time
                self.attribute_boxes[attr] = AttributeBox(self, [],
                                                variable_name=self.create_and_check_qualified_variable_name(attr),
                                                type=AttributeType.COMPUTED,
                                                is_in_memory=True,
                                                header=None,
                                                version=0)
                                                                        
        self.n = self.df.shape[0]