Exemple #1
0
    def test_1d_weight_array_variant_sample_size_using_icc(self):
        """Run the stratified sampler with include_chosen_choice=True.

        Samples for every "lucky" household except the first one and checks:
        - the 2d sampled index has shape
          (index1.size, num_strata * sample_size);
        - the column flagged as chosen choice points at the gridcell given
          by each household's "grid_id" (UNPLACED_ID where none is flagged);
        - after dropping the first column, every sampled alternative is in
          the filtered gridcell index and has non-zero weight.
        """
        sample_size = 2
        # [1:] leaves the first "lucky" household out of the agent index.
        index1 = where(self.households.get_attribute("lucky"))[0][1:]
        index2 = where(self.gridcells.get_attribute("filter"))[0]
        weight = self.gridcells.get_attribute("weight")
        sample_ret = stratified_sampler().run(dataset1=self.households,
                                              dataset2=self.gridcells,
                                              index1=index1,
                                              index2=index2,
                                              stratum="stratum_id",
                                              sample_size=sample_size,
                                              weight="weight",
                                              include_chosen_choice=True)
        # get results
        sampled_index = sample_ret.get_2d_index()
        # Column position of the flagged chosen choice for each agent;
        # agents without a flagged column keep UNPLACED_ID.
        chosen_choices = UNPLACED_ID * ones(index1.size, dtype=DTYPE)
        where_chosen = where(sample_ret.get_attribute("chosen_choice"))
        chosen_choices[where_chosen[0]] = where_chosen[1]

        self.assertEqual(sampled_index.shape,
                         (index1.size, self.num_strata * sample_size))

        self.assertEqual(chosen_choices.size, index1.size)
        placed_agents_index = self.gridcells.try_get_id_index(
            self.households.get_attribute("grid_id")[index1], UNPLACED_ID)
        chosen_choice_index = UNPLACED_ID * ones(index1.shape, dtype=DTYPE)
        w = where(chosen_choices >= 0)[0]
        chosen_choice_index[w] = sampled_index[w, chosen_choices[w]].astype(int32)
        self.assertTrue(alltrue(equal(placed_agents_index, chosen_choice_index)))
        # Drop the first column before validating the sampled alternatives
        # (presumably the chosen-choice column -- confirm against the sampler).
        sampled_index = sampled_index[:, 1:]
        self.assertTrue(
            alltrue(lookup(sampled_index.ravel(),
                           index2,
                           index_if_not_found=UNPLACED_ID) != UNPLACED_ID))
        self.assertTrue(all(not_equal(weight[sampled_index], 0.0)))
    def test_1d_weight_array(self):
        """Run the weighted sampler with a 1d weight given by attribute name.

        The sampler is run once without and once with the chosen choice
        included. Each run checks that the 2d sampled index has shape
        (index1.size, sample_size) and that every sampled alternative is in
        the filtered gridcell index and has non-zero weight. With the chosen
        choice included, the flagged column must also match each household's
        "grid_id".

        This is a stochastic model, so it may legitimately fail occasionally.
        """
        sample_size = 5
        index1 = where(self.households.get_attribute("lucky"))[0]
        index2 = where(self.gridcells.get_attribute("filter"))[0]
        weight = self.gridcells.get_attribute("weight")
        for icc in [0, 1]:  # include_chosen_choice?
            sampler_ret = weighted_sampler().run(dataset1=self.households,
                                                 dataset2=self.gridcells,
                                                 index1=index1,
                                                 index2=index2,
                                                 sample_size=sample_size,
                                                 weight="weight",
                                                 include_chosen_choice=icc)
            # get results
            sampled_index = sampler_ret.get_2d_index()
            # Column position of the flagged chosen choice for each agent;
            # agents without a flagged column keep UNPLACED_ID.
            chosen_choices = UNPLACED_ID * ones(index1.size, dtype="int32")
            where_chosen = where(sampler_ret.get_attribute("chosen_choice"))
            chosen_choices[where_chosen[0]] = where_chosen[1]

            self.assertEqual(sampled_index.shape, (index1.size, sample_size))
            if icc:
                placed_agents_index = self.gridcells.try_get_id_index(
                    self.households.get_attribute("grid_id")[index1],
                    UNPLACED_ID)
                chosen_choice_index = resize(
                    array([UNPLACED_ID], dtype="int32"), index1.shape)
                w = where(chosen_choices >= 0)[0]
                # for 64 bit machines, need to coerce the type to int32 -- on a
                # 32 bit machine the astype(int32) doesn't do anything
                chosen_choice_index[w] = sampled_index[
                    w, chosen_choices[w]].astype(int32)
                self.assertTrue(
                    alltrue(equal(placed_agents_index, chosen_choice_index)))
                # Drop the first column before validating the sampled
                # alternatives (presumably the chosen-choice column --
                # confirm against the sampler).
                sampled_index = sampled_index[:, 1:]

            self.assertTrue(
                alltrue(lookup(sampled_index.ravel(),
                               index2,
                               index_if_not_found=UNPLACED_ID) != UNPLACED_ID))
            self.assertTrue(all(not_equal(weight[sampled_index], 0.0)))
Exemple #3
0
    def test_1d_weight_array_variant_sample_size(self):
        """Run the stratified sampler drawing only from the chosen stratum.

        The sampler is called with sample_size=0 and
        sample_size_from_chosen_stratum=2, once without and once with the
        chosen choice included. The sampled index is expected to have
        sample_size_from_chosen_stratum columns, plus one when the chosen
        choice is included. Every sampled alternative must be in the filtered
        gridcell index and have non-zero weight. With the chosen choice
        included, the flagged column must also match each household's
        "grid_id".
        """
        sample_size_from_chosen_stratum = 2
        index1 = where(self.households.get_attribute("lucky"))[0]
        index2 = where(self.gridcells.get_attribute("filter"))[0]
        weight = self.gridcells.get_attribute("weight")
        for icc in [0, 1]:  # include_chosen_choice?
            sample_ret = stratified_sampler().run(
                dataset1=self.households,
                dataset2=self.gridcells,
                index1=index1,
                index2=index2,
                stratum="stratum_id",
                sample_size=0,
                sample_size_from_chosen_stratum=sample_size_from_chosen_stratum,
                weight="weight",
                include_chosen_choice=icc)
            # get results
            sampled_index = sample_ret.get_2d_index()
            # Column position of the flagged chosen choice for each agent;
            # agents without a flagged column keep UNPLACED_ID.
            chosen_choices = UNPLACED_ID * ones(index1.size, dtype=DTYPE)
            where_chosen = where(sample_ret.get_attribute("chosen_choice"))
            chosen_choices[where_chosen[0]] = where_chosen[1]

            # Including the chosen choice adds one column to the result.
            expected_columns = sample_size_from_chosen_stratum
            if icc:
                expected_columns += 1
            self.assertEqual(sampled_index.shape,
                             (index1.size, expected_columns))

            if icc:
                self.assertEqual(chosen_choices.size, index1.size)
                placed_agents_index = self.gridcells.try_get_id_index(
                    self.households.get_attribute("grid_id")[index1],
                    UNPLACED_ID)
                chosen_choice_index = UNPLACED_ID * ones(index1.shape,
                                                         dtype=DTYPE)
                w = where(chosen_choices >= 0)[0]
                chosen_choice_index[w] = sampled_index[
                    w, chosen_choices[w]].astype(int32)
                self.assertTrue(
                    alltrue(equal(placed_agents_index, chosen_choice_index)))
                # Drop the first column before validating the sampled
                # alternatives (presumably the chosen-choice column --
                # confirm against the sampler).
                sampled_index = sampled_index[:, 1:]

            self.assertTrue(
                alltrue(
                    lookup(sampled_index.ravel(),
                           index2,
                           index_if_not_found=UNPLACED_ID) != UNPLACED_ID))
            self.assertTrue(all(not_equal(weight[sampled_index], 0.0)))
Exemple #4
0
    def test_1d_weight_array(self):
        """Run the stratified sampler with sample_size=1 and a 1d weight.

        The sampler is run once without and once with the chosen choice
        included. The sampled index is expected to have num_strata columns,
        plus one when the chosen choice is included. Every sampled
        alternative must be in the filtered gridcell index and have non-zero
        weight. With the chosen choice included, the flagged column must also
        match each household's "grid_id".

        This is a stochastic model, so it may legitimately fail occasionally.
        """
        index1 = where(self.households.get_attribute("lucky"))[0]
        index2 = where(self.gridcells.get_attribute("filter"))[0]
        weight = self.gridcells.get_attribute("weight")
        for icc in [0, 1]:  # include_chosen_choice?
            sample_ret = stratified_sampler().run(dataset1=self.households,
                                                  dataset2=self.gridcells,
                                                  index1=index1,
                                                  index2=index2,
                                                  stratum="stratum_id",
                                                  sample_size=1,
                                                  weight="weight",
                                                  include_chosen_choice=icc)
            # get results
            sampled_index = sample_ret.get_2d_index()
            # Column position of the flagged chosen choice for each agent;
            # agents without a flagged column keep UNPLACED_ID.
            chosen_choices = UNPLACED_ID * ones(index1.size, dtype=DTYPE)
            where_chosen = where(sample_ret.get_attribute("chosen_choice"))
            chosen_choices[where_chosen[0]] = where_chosen[1]

            # Including the chosen choice adds one column to the result.
            expected_columns = self.num_strata
            if icc:
                expected_columns += 1
            self.assertEqual(sampled_index.shape,
                             (index1.size, expected_columns))

            if icc:
                self.assertEqual(chosen_choices.size, index1.size)
                placed_agents_index = self.gridcells.try_get_id_index(
                    self.households.get_attribute("grid_id")[index1],
                    UNPLACED_ID)
                chosen_choice_index = UNPLACED_ID * ones(index1.shape,
                                                         dtype=DTYPE)
                w = where(chosen_choices >= 0)[0]
                # for 64 bit machines, need to coerce the type to int32 -- on a
                # 32 bit machine the astype(int32) doesn't do anything
                chosen_choice_index[w] = sampled_index[
                    w, chosen_choices[w]].astype(int32)
                self.assertTrue(
                    alltrue(equal(placed_agents_index, chosen_choice_index)))
                # Drop the first column before validating the sampled
                # alternatives (presumably the chosen-choice column --
                # confirm against the sampler).
                sampled_index = sampled_index[:, 1:]

            self.assertTrue(
                alltrue(
                    lookup(sampled_index.ravel(),
                           index2,
                           index_if_not_found=UNPLACED_ID) != UNPLACED_ID))
            self.assertTrue(all(not_equal(weight[sampled_index], 0.0)))
    def test_2d_weight_array(self):
        """Run the weighted sampler with a 2d (household x gridcell) weight.

        The weight matrix repeats the gridcell "weight" attribute once per
        household and adds that household's "lucky" value to its row. The
        sampler is run once without and once with the chosen choice included.
        Each run checks the shape of the sampled index, that every sampled
        alternative is in the filtered gridcell index, and that the sampled
        alternatives have non-zero weight. With the chosen choice included,
        the flagged column must also match each household's "grid_id".
        """
        sample_size = 5
        n = self.households.size()
        index1 = where(self.households.get_attribute("lucky"))[0]
        index2 = where(self.gridcells.get_attribute("filter"))[0]
        lucky = self.households.get_attribute("lucky")
        # One row per household, one column per gridcell.
        weight = repeat(self.gridcells.get_attribute("weight")[newaxis, :],
                        n,
                        axis=0)
        for i in range(n):
            weight[i, :] += lucky[i]

        for icc in [0, 1]:  # include_chosen_choice?
            sampler_ret = weighted_sampler().run(dataset1=self.households,
                                                 dataset2=self.gridcells,
                                                 index1=index1,
                                                 index2=index2,
                                                 sample_size=sample_size,
                                                 weight=weight,
                                                 include_chosen_choice=icc)

            sampled_index = sampler_ret.get_2d_index()
            # Column position of the flagged chosen choice for each agent;
            # agents without a flagged column keep UNPLACED_ID.
            chosen_choices = UNPLACED_ID * ones(index1.size, dtype="int32")
            where_chosen = where(sampler_ret.get_attribute("chosen_choice"))
            chosen_choices[where_chosen[0]] = where_chosen[1]

            self.assertEqual(sampled_index.shape, (index1.size, sample_size))

            if icc:
                placed_agents_index = self.gridcells.try_get_id_index(
                    self.households.get_attribute("grid_id")[index1],
                    UNPLACED_ID)

                chosen_choice_index = resize(
                    array([UNPLACED_ID], dtype="int32"), index1.shape)
                w = where(chosen_choices >= 0)[0]
                chosen_choice_index[w] = sampled_index[
                    w, chosen_choices[w]].astype(int32)
                self.assertTrue(
                    alltrue(equal(placed_agents_index, chosen_choice_index)))
                # Drop the first column before validating the sampled
                # alternatives (presumably the chosen-choice column --
                # confirm against the sampler).
                sampled_index = sampled_index[:, 1:]

            self.assertTrue(
                alltrue(
                    lookup(sampled_index.ravel(),
                           index2,
                           index_if_not_found=UNPLACED_ID) != UNPLACED_ID))

            # NOTE(review): j runs over range(sample_size) but is used as a
            # row (agent) position in both weight and sampled_index, so only
            # the first sample_size rows are checked. The intended bound is
            # probably the number of sampled agents -- confirm before
            # changing.
            for j in range(sample_size):
                self.assertTrue(
                    all(not_equal(weight[j, sampled_index[j, :]], 0.0)))
Exemple #6
0
    def test_1(self):
        """Run the by-category weighted sampler for the "lucky" households.

        Categories are configured through the resources: agents by
        "household.lucky", choices by "gridcell.filter+1". The sampler is run
        once without and once with the chosen choice included. Each run
        checks that the sampled index has shape (index1.size, sample_size)
        and that every sampled alternative is a valid gridcell index with
        non-zero weight. With the chosen choice included, the flagged column
        must also match each household's "grid_id".

        This is a stochastic model, so it may legitimately fail occasionally.
        """
        sample_size = 5
        index1 = where(self.households.get_attribute("lucky"))[0]
        # No index2 is passed to the sampler, so sampled alternatives are
        # validated against all gridcells below.
        weight = self.gridcells.get_attribute("weight")
        estimation_config = {
            "agent_category_definition": ["household.lucky"],
            "choice_category_definition": ["gridcell.filter+1"]
        }
        for icc in [0, 1]:  # include_chosen_choice?
            sampler_ret = weighted_sampler_by_category().run(
                dataset1=self.households,
                dataset2=self.gridcells,
                index1=index1,
                sample_size=sample_size,
                include_chosen_choice=icc,
                resources=estimation_config)
            # get results
            sampled_index = sampler_ret.get_2d_index()
            # Column position of the flagged chosen choice for each agent;
            # agents without a flagged column keep UNPLACED_ID.
            chosen_choices = UNPLACED_ID * ones(index1.size, dtype=DTYPE)
            where_chosen = where(sampler_ret.get_attribute("chosen_choice"))
            chosen_choices[where_chosen[0]] = where_chosen[1]

            self.assertEqual(sampled_index.shape, (index1.size, sample_size))
            if icc:
                placed_agents_index = self.gridcells.try_get_id_index(
                    self.households.get_attribute("grid_id")[index1],
                    UNPLACED_ID)
                chosen_choice_index = resize(array([UNPLACED_ID], dtype=DTYPE),
                                             index1.shape)
                w = where(chosen_choices >= 0)[0]
                # for 64 bit machines, need to coerce the type to int32 -- on a
                # 32 bit machine the astype(int32) doesn't do anything
                chosen_choice_index[w] = sampled_index[
                    w, chosen_choices[w]].astype(int32)
                self.assertTrue(
                    alltrue(equal(placed_agents_index, chosen_choice_index)))
                # Drop the first column before validating the sampled
                # alternatives (presumably the chosen-choice column --
                # confirm against the sampler).
                sampled_index = sampled_index[:, 1:]

            self.assertTrue(
                alltrue(
                    lookup(sampled_index.ravel(),
                           arange(self.gridcells.size()),
                           index_if_not_found=UNPLACED_ID) != UNPLACED_ID))
            self.assertTrue(all(not_equal(weight[sampled_index], 0.0)))
    def test_2d_weight_array(self):
        """Run the weighted sampler with a 2d (household x gridcell) weight.

        The weight matrix repeats the gridcell "weight" attribute once per
        household and adds that household's "lucky" value to its row. The
        sampler is run once without and once with the chosen choice included.
        Each run checks the shape of the sampled index, that every sampled
        alternative is in the filtered gridcell index, and that the sampled
        alternatives have non-zero weight. With the chosen choice included,
        the flagged column must also match each household's "grid_id".
        """
        sample_size = 5
        n = self.households.size()
        index1 = where(self.households.get_attribute("lucky"))[0]
        index2 = where(self.gridcells.get_attribute("filter"))[0]
        lucky = self.households.get_attribute("lucky")
        # One row per household, one column per gridcell.
        weight = repeat(self.gridcells.get_attribute("weight")[newaxis, :], n, axis=0)
        for i in range(n):
            weight[i, :] += lucky[i]

        for icc in [0, 1]:  # include_chosen_choice?
            sampler_ret = weighted_sampler().run(dataset1=self.households, dataset2=self.gridcells, index1=index1,
                            index2=index2, sample_size=sample_size, weight=weight, include_chosen_choice=icc)

            sampled_index = sampler_ret.get_2d_index()
            # Column position of the flagged chosen choice for each agent;
            # agents without a flagged column keep UNPLACED_ID.
            chosen_choices = UNPLACED_ID * ones(index1.size, dtype="int32")
            where_chosen = where(sampler_ret.get_attribute("chosen_choice"))
            chosen_choices[where_chosen[0]] = where_chosen[1]

            self.assertEqual(sampled_index.shape, (index1.size, sample_size))

            if icc:
                placed_agents_index = self.gridcells.try_get_id_index(
                                        self.households.get_attribute("grid_id")[index1], UNPLACED_ID)

                chosen_choice_index = resize(array([UNPLACED_ID], dtype="int32"), index1.shape)
                w = where(chosen_choices >= 0)[0]
                chosen_choice_index[w] = sampled_index[w, chosen_choices[w]].astype(int32)
                self.assertTrue(alltrue(equal(placed_agents_index, chosen_choice_index)))
                # Drop the first column before validating the sampled
                # alternatives (presumably the chosen-choice column --
                # confirm against the sampler).
                sampled_index = sampled_index[:, 1:]

            self.assertTrue(alltrue(lookup(sampled_index.ravel(), index2, index_if_not_found=UNPLACED_ID) != UNPLACED_ID))

            # NOTE(review): j runs over range(sample_size) but is used as a
            # row (agent) position in both weight and sampled_index, so only
            # the first sample_size rows are checked. The intended bound is
            # probably the number of sampled agents -- confirm before
            # changing.
            for j in range(sample_size):
                self.assertTrue(all(not_equal(weight[j, sampled_index[j, :]], 0.0)))
Exemple #8
0
    def run(self,
            dataset1,
            dataset2,
            index1=None,
            index2=None,
            sample_size=10,
            weight=None,
            include_chosen_choice=None,
            with_replacement=True,
            resources=None,
            dataset_pool=None):
        """Sample alternatives from dataset2 for agents of dataset1, by category.

        For each agent category, alternatives are drawn with probabilities
        obtained by normalizing the row of the frequency table (returned by
        get_category_and_frequency) that belongs to that category.

        Arguments that are not None are merged into a Resources object built
        from `resources`; values are then read back from it.

        dataset1 -- agent dataset.
        dataset2 -- choice dataset.
        index1 -- indices of the agents to sample for; all agents if None.
        index2 -- indices of the alternatives to sample from; all choices if
            None.
        sample_size -- number of alternatives per agent (scalar). When the
            chosen choice is included it takes one of these slots, so only
            sample_size - 1 alternatives are drawn.
        weight -- merged into the resources but not read by this method; the
            sampling weights come from the category frequency table.
        include_chosen_choice -- if true, the agent's current choice is
            put in the first column of the result.
        with_replacement -- only sampling with replacement is implemented.
        resources -- optional settings; the keys read here are
            "agent_category_definition", "choice_category_definition",
            "agent_filter_attribute" and "category_inflating_factor"
            (default 10).
        dataset_pool -- passed to get_category_and_frequency; when None it is
            taken from the SessionConfiguration, falling back to a new
            DatasetPool built from its package order.

        Returns the interaction dataset created from index1 and the sampled
        indices, with the attributes '__sampling_probability' and
        'chosen_choice' added. Returns (None, None) when index1 or index2 is
        empty.

        Raises NotImplementedError when with_replacement is false and a
        category with agents is reached.

        Also refer to document of interaction_dataset"""

        if dataset_pool is None:
            sc = SessionConfiguration()
            try:
                dataset_pool = sc.get_dataset_pool()
            except Exception:
                # No pool available from the session; build one instead.
                dataset_pool = DatasetPool(sc.package_order)

        local_resources = Resources(resources)
        local_resources.merge_if_not_None({
            "dataset1": dataset1,
            "dataset2": dataset2,
            "index1": index1,
            "index2": index2,
            "sample_size": sample_size,
            "weight": weight,
            "with_replacement": with_replacement,
            "include_chosen_choice": include_chosen_choice
        })

        local_resources.check_obligatory_keys(
            ['dataset1', 'dataset2', 'sample_size'])
        agent = local_resources["dataset1"]
        choice = local_resources["dataset2"]
        index1 = local_resources.get("index1", None)
        if index1 is None:
            index1 = arange(agent.size())
        index2 = local_resources.get("index2", None)
        if index2 is None:
            index2 = arange(choice.size())

        if index1.size == 0 or index2.size == 0:
            err_msg = "either choice size or agent size is zero, return None"
            logger.log_warning(err_msg)
            # NOTE(review): this returns a tuple while the normal path returns
            # an interaction dataset; kept for backward compatibility.
            return (None, None)

        agent_category_definition = local_resources.get(
            "agent_category_definition", [])
        choice_category_definition = local_resources.get(
            "choice_category_definition", [])
        agent_filter_attribute = local_resources.get("agent_filter_attribute",
                                                     None)
        category_inflating_factor = local_resources.get(
            "category_inflating_factor", 10)

        frequency, unique_agent_category_id, unique_choice_category_id, agent_category_id, choice_category_id = \
                get_category_and_frequency(agent, agent_category_definition,
                                           choice, choice_category_definition,
                                           agent_filter_attribute, category_inflating_factor,
                                           dataset_pool=dataset_pool)

        include_chosen_choice = local_resources.get("include_chosen_choice",
                                                    False)
        chosen_choice_id = agent.get_attribute(choice.get_id_name()[0])[index1]
        chosen_choice_index = choice.try_get_id_index(
            chosen_choice_id, return_value_if_not_found=-1)
        # Position of each chosen choice within index2 (UNPLACED_ID if absent).
        chosen_choice_index_to_index2 = lookup(chosen_choice_index,
                                               index2,
                                               index_if_not_found=UNPLACED_ID)

        J = local_resources["sample_size"]
        if include_chosen_choice:
            # The chosen choice occupies one of the sample_size columns.
            J = J - 1
        local_resources.merge_with_defaults(
            {'with_replacement': with_replacement})
        with_replacement = local_resources.get("with_replacement")

        sampled_index = empty((index1.size, J), dtype=DTYPE)
        sampling_prob = empty((index1.size, J), dtype="float64")
        # Sampling probability of each agent's chosen choice under the
        # distribution of the agent's own category; filled per category below.
        sampling_prob_for_chosen_choices = zeros((index1.size, 1),
                                                 dtype="float64")

        _digitize, _where, _normalize = digitize, where, normalize
        _ncumsum, _rand, _searchsorted = ncumsum, rand, searchsorted  #speed hack
        for i, category_id in enumerate(unique_agent_category_id):
            agents_in_this_category = _where(
                agent_category_id[index1] == category_id)[0]
            num_agents = agents_in_this_category.size
            if num_agents == 0:
                continue

            ## divide frequency by the mean frequency to avoid overflow
            weights = frequency[
                i,
                _digitize(choice_category_id[index2], unique_choice_category_id
                          ) - 1] / frequency[i, :].mean()
            prob = _normalize(weights)
            # Inverse-CDF draw: J positions within index2 per agent.
            index = _searchsorted(_ncumsum(prob),
                                  _rand(num_agents * J)).reshape(-1, J)

            if not with_replacement:
                # A resample-until-unique draft existed here but was never
                # enabled.
                raise NotImplementedError(
                    "Sample without replacement is not implemented for this sampler yet.")

            sampled_index[agents_in_this_category, :] = index
            sampling_prob[agents_in_this_category, :] = prob[index]
            if include_chosen_choice:
                # Use this category's distribution, not whichever category
                # happened to be processed last.
                # NOTE(review): chosen choices that are not in index2 map to
                # UNPLACED_ID here and so pick up an arbitrary entry of prob;
                # only agents without any chosen choice are zeroed below.
                sampling_prob_for_chosen_choices[agents_in_this_category, 0] = take(
                    prob, chosen_choice_index_to_index2[agents_in_this_category])

        # Translate positions within index2 into indices of dataset2.
        sampled_index = index2[sampled_index]
        is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
        if include_chosen_choice:
            sampled_index = column_stack(
                (chosen_choice_index[:, newaxis], sampled_index))
            is_chosen_choice[chosen_choice_index != UNPLACED_ID, 0] = 1

            ## if chosen choice chosen is unplaced has the sampling prob is 0
            sampling_prob_for_chosen_choices[where(
                chosen_choice_index == UNPLACED_ID)[0], ] = 0.0
            sampling_prob = column_stack(
                [sampling_prob_for_chosen_choices, sampling_prob])

        interaction_dataset = self.create_interaction_dataset(
            dataset1, dataset2, index1, sampled_index)
        interaction_dataset.add_attribute(sampling_prob,
                                          '__sampling_probability')
        interaction_dataset.add_attribute(is_chosen_choice, 'chosen_choice')

        # The older (sampled_index, chosen_choices) return values can be
        # rebuilt from the dataset with get_2d_index() and the
        # "chosen_choice" attribute.
        return interaction_dataset
    def run(self, dataset1, dataset2, index1=None, index2=None, sample_size=10, weight=None,
            include_chosen_choice=None, with_replacement=True, resources=None, dataset_pool=None):
        """Sample alternatives from dataset2 for agents of dataset1, by category.

        For each agent category, alternatives are drawn with probabilities
        obtained by normalizing the row of the frequency table (returned by
        get_category_and_frequency) that belongs to that category.

        Arguments that are not None are merged into a Resources object built
        from `resources`; values are then read back from it.

        dataset1 -- agent dataset.
        dataset2 -- choice dataset.
        index1 -- indices of the agents to sample for; all agents if None.
        index2 -- indices of the alternatives to sample from; all choices if
            None.
        sample_size -- number of alternatives per agent (scalar). When the
            chosen choice is included it takes one of these slots, so only
            sample_size - 1 alternatives are drawn.
        weight -- merged into the resources but not read by this method; the
            sampling weights come from the category frequency table.
        include_chosen_choice -- if true, the agent's current choice is
            put in the first column of the result.
        with_replacement -- only sampling with replacement is implemented.
        resources -- optional settings; the keys read here are
            "agent_category_definition", "choice_category_definition",
            "agent_filter_attribute" and "category_inflating_factor"
            (default 10).
        dataset_pool -- passed to get_category_and_frequency; when None it is
            taken from the SessionConfiguration, falling back to a new
            DatasetPool built from its package order.

        Returns the interaction dataset created from index1 and the sampled
        indices, with the attributes '__sampling_probability' and
        'chosen_choice' added. Returns (None, None) when index1 or index2 is
        empty.

        Raises NotImplementedError when with_replacement is false and a
        category with agents is reached.

        Also refer to document of interaction_dataset"""

        if dataset_pool is None:
            sc = SessionConfiguration()
            try:
                dataset_pool = sc.get_dataset_pool()
            except Exception:
                # No pool available from the session; build one instead.
                dataset_pool = DatasetPool(sc.package_order)

        local_resources = Resources(resources)
        local_resources.merge_if_not_None(
                {"dataset1": dataset1, "dataset2": dataset2,
                "index1": index1, "index2": index2,
                "sample_size": sample_size, "weight": weight,
                "with_replacement": with_replacement,
                "include_chosen_choice": include_chosen_choice})

        local_resources.check_obligatory_keys(['dataset1', 'dataset2', 'sample_size'])
        agent = local_resources["dataset1"]
        choice = local_resources["dataset2"]
        index1 = local_resources.get("index1", None)
        if index1 is None:
            index1 = arange(agent.size())
        index2 = local_resources.get("index2", None)
        if index2 is None:
            index2 = arange(choice.size())

        if index1.size == 0 or index2.size == 0:
            err_msg = "either choice size or agent size is zero, return None"
            logger.log_warning(err_msg)
            # NOTE(review): this returns a tuple while the normal path returns
            # an interaction dataset; kept for backward compatibility.
            return (None, None)

        agent_category_definition = local_resources.get("agent_category_definition", [])
        choice_category_definition = local_resources.get("choice_category_definition", [])
        agent_filter_attribute = local_resources.get("agent_filter_attribute", None)
        category_inflating_factor = local_resources.get("category_inflating_factor", 10)

        frequency, unique_agent_category_id, unique_choice_category_id, agent_category_id, choice_category_id = \
                get_category_and_frequency(agent, agent_category_definition,
                                           choice, choice_category_definition,
                                           agent_filter_attribute, category_inflating_factor,
                                           dataset_pool=dataset_pool)

        include_chosen_choice = local_resources.get("include_chosen_choice", False)
        chosen_choice_id = agent.get_attribute(choice.get_id_name()[0])[index1]
        chosen_choice_index = choice.try_get_id_index(chosen_choice_id, return_value_if_not_found=-1)
        # Position of each chosen choice within index2 (UNPLACED_ID if absent).
        chosen_choice_index_to_index2 = lookup(chosen_choice_index, index2, index_if_not_found=UNPLACED_ID)

        J = local_resources["sample_size"]
        if include_chosen_choice:
            # The chosen choice occupies one of the sample_size columns.
            J = J - 1
        local_resources.merge_with_defaults({'with_replacement': with_replacement})
        with_replacement = local_resources.get("with_replacement")

        sampled_index = empty((index1.size, J), dtype="int32")
        sampling_prob = empty((index1.size, J), dtype="float64")
        # Sampling probability of each agent's chosen choice under the
        # distribution of the agent's own category; filled per category below.
        sampling_prob_for_chosen_choices = zeros((index1.size, 1), dtype="float64")

        _digitize, _where, _normalize = digitize, where, normalize
        _ncumsum, _rand, _searchsorted = ncumsum, rand, searchsorted   #speed hack
        for i, category_id in enumerate(unique_agent_category_id):
            agents_in_this_category = _where(agent_category_id[index1] == category_id)[0]
            num_agents = agents_in_this_category.size
            if num_agents == 0:
                continue

            ## divide frequency by the mean frequency to avoid overflow
            weights = frequency[i, _digitize(choice_category_id[index2], unique_choice_category_id)-1] / frequency[i, :].mean()
            prob = _normalize(weights)
            # Inverse-CDF draw: J positions within index2 per agent.
            index = _searchsorted(_ncumsum(prob), _rand(num_agents * J)).reshape(-1, J)

            if not with_replacement:
                # A resample-until-unique draft existed here but was never
                # enabled.
                raise NotImplementedError(
                    "Sample without replacement is not implemented for this sampler yet.")

            sampled_index[agents_in_this_category, :] = index
            sampling_prob[agents_in_this_category, :] = prob[index]
            if include_chosen_choice:
                # Use this category's distribution, not whichever category
                # happened to be processed last.
                # NOTE(review): chosen choices that are not in index2 map to
                # UNPLACED_ID here and so pick up an arbitrary entry of prob;
                # only agents without any chosen choice are zeroed below.
                sampling_prob_for_chosen_choices[agents_in_this_category, 0] = take(
                    prob, chosen_choice_index_to_index2[agents_in_this_category])

        # Translate positions within index2 into indices of dataset2.
        sampled_index = index2[sampled_index]
        is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
        if include_chosen_choice:
            sampled_index = column_stack((chosen_choice_index[:, newaxis], sampled_index))
            is_chosen_choice[chosen_choice_index != UNPLACED_ID, 0] = 1

            ## if chosen choice chosen is unplaced has the sampling prob is 0
            sampling_prob_for_chosen_choices[where(chosen_choice_index == UNPLACED_ID)[0], ] = 0.0
            sampling_prob = column_stack([sampling_prob_for_chosen_choices, sampling_prob])

        interaction_dataset = self.create_interaction_dataset(dataset1, dataset2, index1, sampled_index)
        interaction_dataset.add_attribute(sampling_prob, '__sampling_probability')
        interaction_dataset.add_attribute(is_chosen_choice, 'chosen_choice')

        # The older (sampled_index, chosen_choices) return values can be
        # rebuilt from the dataset with get_2d_index() and the
        # "chosen_choice" attribute.
        return interaction_dataset
    def run(self,
            dataset1,
            dataset2,
            index1=None,
            index2=None,
            sample_size=10,
            weight=None,
            include_chosen_choice=False,
            with_replacement=False,
            resources=None,
            dataset_pool=None):
        """Sample sample_size (scalar value) alternatives from dataset2 for the
        agent set specified by dataset1.

        All arguments below are merged into a Resources object built from
        `resources` (only when they are not None) and read back from there.

        index1 -- if not None, only sample alternatives for agents with indices
            in index1; otherwise all agents of dataset1 are used.
        index2 -- if not None, only sample alternatives from indices in index2;
            otherwise all elements of dataset2 are candidates.
        sample_size -- number of alternatives per agent. When
            include_chosen_choice is true, one slot is reserved for the chosen
            alternative, so sample_size - 1 alternatives are drawn.
        weight -- sampling weight: an attribute/variable name of dataset2, the
            name of an interaction variable, a 1d array of the same length as
            index2 (or as dataset2), or a 2d array of shape
            (index1.size, index2.size). None or an empty string means uniform
            weights.
        include_chosen_choice -- if true, the agent's current choice is put in
            the first column of the result.
        with_replacement -- whether the 1d-weight sampling is done with
            replacement.
        dataset_pool -- pool used to compute weight variables; when None it is
            taken from the SessionConfiguration, falling back to a fresh
            DatasetPool.

        Returns None (after logging a warning) if index1 or index2 is empty.
        Otherwise returns the interaction dataset created from the sampled
        indices, with the attributes '__sampling_probability' and
        'chosen_choice' added, plus the MNL bias correction term when the
        resource "include_mnl_bias_correction_term" is true.

        Raises TypeError for an unsupported weight type and ValueError when the
        weight name cannot be resolved, the weight rank is not 1 or 2, or the
        weight size matches neither index2 nor dataset2.

        Also refer to document of interaction_dataset"""

        if dataset_pool is None:
            try:
                sc = SessionConfiguration()
                dataset_pool = sc.get_dataset_pool()
            except Exception:
                # no usable session configuration: fall back to an empty pool
                dataset_pool = DatasetPool()

        local_resources = Resources(resources)
        local_resources.merge_if_not_None({
            "dataset1": dataset1,
            "dataset2": dataset2,
            "index1": index1,
            "index2": index2,
            "sample_size": sample_size,
            "weight": weight,
            "with_replacement": with_replacement,
            "include_chosen_choice": include_chosen_choice,
        })

        local_resources.check_obligatory_keys(
            ['dataset1', 'dataset2', 'sample_size'])
        agent = local_resources["dataset1"]
        index1 = local_resources.get("index1", None)
        if index1 is None:
            index1 = arange(agent.size())
        choice = local_resources["dataset2"]
        index2 = local_resources.get("index2", None)
        if index2 is None:
            index2 = arange(choice.size())

        if index1.size == 0 or index2.size == 0:
            err_msg = "either choice size or agent size is zero, return None"
            logger.log_warning(err_msg)
            return None

        include_chosen_choice = local_resources.get("include_chosen_choice",
                                                    False)
        # J is the number of alternatives actually drawn per agent; the chosen
        # choice (if included) takes one of the sample_size slots.
        J = local_resources["sample_size"]
        if include_chosen_choice:
            J = J - 1

        with_replacement = local_resources.get("with_replacement")

        weight = local_resources.get("weight", None)
        if isinstance(weight, str):
            if weight in choice.get_known_attribute_names():
                weight = choice.get_attribute(weight)
                rank_of_weight = 1
            else:
                varname = VariableName(weight)
                if varname.get_dataset_name() == choice.get_dataset_name():
                    weight = choice.compute_variables(
                        weight, dataset_pool=dataset_pool)
                    rank_of_weight = 1
                elif varname.get_interaction_set_names() is not None:
                    ## weights can be an interaction variable
                    interaction_dataset = InteractionDataset(local_resources)
                    weight = interaction_dataset.compute_variables(
                        weight, dataset_pool=dataset_pool)
                    rank_of_weight = 2
                    assert (len(weight.shape) >= rank_of_weight)
                else:
                    err_msg = ("weight is neither a known attribute name "
                               "nor a simple variable from the choice dataset "
                               "nor an interaction variable: '%s'" % weight)
                    logger.log_error(err_msg)
                    raise ValueError(err_msg)
        elif isinstance(weight, ndarray):
            rank_of_weight = weight.ndim
        elif not weight:  ## weight is None or empty string
            weight = ones(index2.size)
            rank_of_weight = 1
        else:
            err_msg = "unkown weight type"
            logger.log_error(err_msg)
            raise TypeError(err_msg)

        # Only 1d and 2d weights have a sampling branch below; fail with a
        # clear message instead of an unbound-variable error later on.
        if rank_of_weight not in (1, 2):
            err_msg = "weight must be a 1d or 2d array, got rank %s" % rank_of_weight
            logger.log_error(err_msg)
            raise ValueError(err_msg)

        if (weight.size != index2.size) and (weight.shape[rank_of_weight - 1]
                                             != index2.size):
            if weight.shape[rank_of_weight - 1] == choice.size():
                # weight covers the whole choice set: restrict it to index2
                # along the alternatives axis (axis 0 for 1d, axis 1 for 2d)
                weight = take(weight, index2, axis=rank_of_weight - 1)
            else:
                err_msg = "weight array size doesn't match to size of dataset2 or its index"
                logger.log_error(err_msg)
                raise ValueError(err_msg)

        prob = normalize(weight)

        #chosen_choice = ones(index1.size) * UNPLACED_ID
        chosen_choice_id = agent.get_attribute(choice.get_id_name()[0])[index1]
        #index_of_placed_agent = where(greater(chosen_choice_id, UNPLACED_ID))[0]
        # chosen_choice_index is indexed to the choice dataset, while
        # chosen_choice_index_to_index2 is its position within index2
        # (UNPLACED_ID when not found).
        chosen_choice_index = choice.try_get_id_index(
            chosen_choice_id, return_value_if_not_found=UNPLACED_ID)
        chosen_choice_index_to_index2 = lookup(chosen_choice_index,
                                               index2,
                                               index_if_not_found=UNPLACED_ID)

        if rank_of_weight == 1:
            # 1d weight: every agent shares the same weights for the choices
            replace = with_replacement
            non_zero_counts = nonzerocounts(weight)
            if non_zero_counts < J:
                logger.log_warning(
                    "weight array dosen't have enough non-zero counts, use sample with replacement"
                )
                replace = True
            if non_zero_counts > 0:
                sampled_index = prob2dsample(
                    index2,
                    sample_size=(index1.size, J),
                    prob_array=prob,
                    exclude_index=chosen_choice_index_to_index2,
                    replace=replace,
                    return_index=True)
            else:
                # all alternatives have a zero weight
                sampled_index = zeros((index1.size, 0), dtype=DTYPE)
        else:
            # 2d weight: each agent has its own row of probabilities
            sampled_index = zeros((index1.size, J), dtype=DTYPE) - 1

            for i in range(index1.size):
                i_prob = prob[i, :]
                if nonzerocounts(i_prob) < J:
                    logger.log_warning(
                        "weight array dosen't have enough non-zero counts, use sample with replacement"
                    )
                # NOTE(review): despite the warning above and the
                # with_replacement argument, this branch always calls the
                # no-replacement sampler (as the original code did).

                #exclude_index passed to probsample_noreplace needs to be indexed to index2
                sampled_index[i, :] = probsample_noreplace(
                    index2,
                    sample_size=J,
                    prob_array=i_prob,
                    exclude_index=chosen_choice_index_to_index2[i],
                    return_index=True)

        # take() without an axis reads the flattened array, which for a 2d prob
        # would return the first agent's probabilities for everybody; index
        # each agent's own row instead.
        if rank_of_weight == 2:
            agent_rows = arange(index1.size)[:, newaxis]
            sampling_prob = prob[agent_rows, sampled_index]
        else:
            sampling_prob = take(prob, sampled_index)
        sampled_index_within_prob = sampled_index.copy()
        sampled_index = index2[sampled_index]

        if include_chosen_choice:
            # the chosen choice always occupies the first column
            sampled_index = column_stack(
                (chosen_choice_index[:, newaxis], sampled_index))
        is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
        if include_chosen_choice:
            is_chosen_choice[chosen_choice_index != UNPLACED_ID, 0] = 1
            ## this is necessary because prob is indexed to index2, not to the choice set (as is chosen_choice_index)
            chosen_columns = chosen_choice_index_to_index2[:, newaxis]
            if rank_of_weight == 2:
                sampling_prob_for_chosen_choices = prob[agent_rows,
                                                        chosen_columns]
            else:
                sampling_prob_for_chosen_choices = take(prob, chosen_columns)
            ## if chosen choice chosen equals unplaced_id then the sampling prob is 0
            sampling_prob_for_chosen_choices[where(
                chosen_choice_index == UNPLACED_ID)[0], ] = 0.0
            sampling_prob = column_stack(
                [sampling_prob_for_chosen_choices, sampling_prob])

        interaction_dataset = self.create_interaction_dataset(
            dataset1, dataset2, index1, sampled_index)
        interaction_dataset.add_attribute(sampling_prob,
                                          '__sampling_probability')
        interaction_dataset.add_attribute(is_chosen_choice, 'chosen_choice')

        if local_resources.get("include_mnl_bias_correction_term", False):
            if include_chosen_choice:
                sampled_index_within_prob = column_stack(
                    (chosen_choice_index_to_index2[:, newaxis],
                     sampled_index_within_prob))
            interaction_dataset.add_mnl_bias_correction_term(
                prob, sampled_index_within_prob)

        ## to get the older returns
        #sampled_index = interaction_dataset.get_2d_index()
        #chosen_choices = UNPLACED_ID * ones(index1.size, dtype="int32")
        #where_chosen = where(interaction_dataset.get_attribute("chosen_choice"))
        #chosen_choices[where_chosen[0]]=where_chosen[1]
        #return (sampled_index, chosen_choice)

        return interaction_dataset
    def run(self, dataset1, dataset2, index1=None, index2=None, sample_size=10, weight=None,
            include_chosen_choice=False, with_replacement=False, resources=None, dataset_pool=None):
        """Sample sample_size (scalar value) alternatives from dataset2 for the
        agent set specified by dataset1.

        All arguments below are merged into a Resources object built from
        `resources` (only when they are not None) and read back from there.

        If index1 is not None, only samples alternatives for agents with indices in index1;
        if index2 is not None, only samples alternatives from indices in index2.
        sample_size specifies number of alternatives to be sampled for each agent; when
        include_chosen_choice is true the chosen alternative takes one of these slots,
        so sample_size - 1 alternatives are drawn.
        weight, to be used as sampling weight, is either an attribute name of dataset2, or a 1d
        array of the same length as index2 or 2d array of shape (index1.size, index2.size).
        A weight name that is neither a known attribute nor a variable of dataset2 is
        computed as an interaction variable. None or an empty string means uniform weights.

        Returns None (after logging a warning) if index1 or index2 is empty, otherwise the
        interaction dataset created from the sampled indices, with the attributes
        '__sampling_probability' and 'chosen_choice' added.

        Raises TypeError for an unsupported weight type and ValueError when the weight rank
        is not 1 or 2 or the weight size matches neither index2 nor dataset2.

        Also refer to document of interaction_dataset"""

        if dataset_pool is None:
            try:
                sc = SessionConfiguration()
                dataset_pool = sc.get_dataset_pool()
            except Exception:
                # no usable session configuration: fall back to an empty pool
                dataset_pool = DatasetPool()

        local_resources = Resources(resources)
        local_resources.merge_if_not_None(
                {"dataset1": dataset1, "dataset2": dataset2,
                "index1": index1, "index2": index2,
                "sample_size": sample_size, "weight": weight,
                "with_replacement": with_replacement,
                "include_chosen_choice": include_chosen_choice})

        local_resources.check_obligatory_keys(['dataset1', 'dataset2', 'sample_size'])
        agent = local_resources["dataset1"]
        index1 = local_resources.get("index1", None)
        if index1 is None:
            index1 = arange(agent.size())
        choice = local_resources["dataset2"]
        index2 = local_resources.get("index2", None)
        if index2 is None:
            index2 = arange(choice.size())

        if index1.size == 0 or index2.size == 0:
            err_msg = "either choice size or agent size is zero, return None"
            logger.log_warning(err_msg)
            return None

        include_chosen_choice = local_resources.get("include_chosen_choice", False)
        # J is the number of alternatives actually drawn per agent; the chosen
        # choice (if included) takes one of the sample_size slots.
        J = local_resources["sample_size"]
        if include_chosen_choice:
            J = J - 1

        with_replacement = local_resources.get("with_replacement")

        weight = local_resources.get("weight", None)
        if isinstance(weight, str):
            if weight in choice.get_known_attribute_names():
                weight = choice.get_attribute(weight)
                rank_of_weight = 1
            elif VariableName(weight).get_dataset_name() == choice.get_dataset_name():
                weight = choice.compute_variables(weight, dataset_pool=dataset_pool)
                rank_of_weight = 1
            else:
                ## weights can be an interaction variable
                interaction_dataset = InteractionDataset(local_resources)
                weight = interaction_dataset.compute_variables(weight, dataset_pool=dataset_pool)
                rank_of_weight = 2
        elif isinstance(weight, ndarray):
            rank_of_weight = weight.ndim
        elif not weight:  ## weight is None or empty string
            weight = ones(index2.size)
            rank_of_weight = 1
        else:
            err_msg = "unkown weight type"
            logger.log_error(err_msg)
            raise TypeError(err_msg)

        # Only 1d and 2d weights have a sampling branch below; fail with a
        # clear message instead of an unbound-variable error later on.
        if rank_of_weight not in (1, 2):
            err_msg = "weight must be a 1d or 2d array, got rank %s" % rank_of_weight
            logger.log_error(err_msg)
            raise ValueError(err_msg)

        if (weight.size != index2.size) and (weight.shape[rank_of_weight-1] != index2.size):
            if weight.shape[rank_of_weight-1] == choice.size():
                # weight covers the whole choice set: restrict it to index2
                # along the alternatives axis (axis 0 for 1d, axis 1 for 2d)
                weight = take(weight, index2, axis=rank_of_weight-1)
            else:
                err_msg = "weight array size doesn't match to size of dataset2 or its index"
                logger.log_error(err_msg)
                raise ValueError(err_msg)

        prob = normalize(weight)

        #chosen_choice = ones(index1.size) * UNPLACED_ID
        chosen_choice_id = agent.get_attribute(choice.get_id_name()[0])[index1]
        #index_of_placed_agent = where(greater(chosen_choice_id, UNPLACED_ID))[0]
        # chosen_choice_index is indexed to the choice dataset, while
        # chosen_choice_index_to_index2 is its position within index2
        # (UNPLACED_ID when not found).
        chosen_choice_index = choice.try_get_id_index(chosen_choice_id, return_value_if_not_found=UNPLACED_ID)
        chosen_choice_index_to_index2 = lookup(chosen_choice_index, index2, index_if_not_found=UNPLACED_ID)

        if rank_of_weight == 1:
            # 1d weight: every agent shares the same weights for the choices
            replace = with_replacement
            if nonzerocounts(weight) < J:
                logger.log_warning("weight array dosen't have enough non-zero counts, use sample with replacement")
                replace = True
            sampled_index = prob2dsample(index2, sample_size=(index1.size, J),
                                         prob_array=prob, exclude_index=chosen_choice_index_to_index2,
                                         replace=replace, return_index=True)
        else:
            # 2d weight: each agent has its own row of probabilities
            sampled_index = zeros((index1.size, J), dtype="int32") - 1

            for i in range(index1.size):
                i_prob = prob[i, :]
                if nonzerocounts(i_prob) < J:
                    logger.log_warning("weight array dosen't have enough non-zero counts, use sample with replacement")
                # NOTE(review): despite the warning above and the
                # with_replacement argument, this branch always calls the
                # no-replacement sampler (as the original code did).

                #exclude_index passed to probsample_noreplace needs to be indexed to index2
                sampled_index[i, :] = probsample_noreplace(index2, sample_size=J, prob_array=i_prob,
                                                           exclude_index=chosen_choice_index_to_index2[i],
                                                           return_index=True)

        # take() without an axis reads the flattened array, which for a 2d prob
        # would return the first agent's probabilities for everybody; index
        # each agent's own row instead.
        if rank_of_weight == 2:
            agent_rows = arange(index1.size)[:, newaxis]
            sampling_prob = prob[agent_rows, sampled_index]
        else:
            sampling_prob = take(prob, sampled_index)
        sampled_index = index2[sampled_index]

        if include_chosen_choice:
            # the chosen choice always occupies the first column
            sampled_index = column_stack((chosen_choice_index[:, newaxis], sampled_index))
        is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
        if include_chosen_choice:
            is_chosen_choice[chosen_choice_index != UNPLACED_ID, 0] = 1
            ## this is necessary because prob is indexed to index2, not to the choice set (as is chosen_choice_index)
            chosen_columns = chosen_choice_index_to_index2[:, newaxis]
            if rank_of_weight == 2:
                sampling_prob_for_chosen_choices = prob[agent_rows, chosen_columns]
            else:
                sampling_prob_for_chosen_choices = take(prob, chosen_columns)
            ## if chosen choice chosen equals unplaced_id then the sampling prob is 0
            sampling_prob_for_chosen_choices[where(chosen_choice_index == UNPLACED_ID)[0],] = 0.0
            sampling_prob = column_stack([sampling_prob_for_chosen_choices, sampling_prob])

        interaction_dataset = self.create_interaction_dataset(dataset1, dataset2, index1, sampled_index)
        interaction_dataset.add_attribute(sampling_prob, '__sampling_probability')
        interaction_dataset.add_attribute(is_chosen_choice, 'chosen_choice')

        ## to get the older returns
        #sampled_index = interaction_dataset.get_2d_index()
        #chosen_choices = UNPLACED_ID * ones(index1.size, dtype="int32")
        #where_chosen = where(interaction_dataset.get_attribute("chosen_choice"))
        #chosen_choices[where_chosen[0]]=where_chosen[1]
        #return (sampled_index, chosen_choice)

        return interaction_dataset
Exemple #12
0
    def run(self,
            dataset1,
            dataset2,
            index1=None,
            index2=None,
            stratum=None,
            weight=None,
            sample_size=1,
            sample_size_from_each_stratum=None,
            sample_size_from_chosen_stratum=None,
            sample_rate=None,
            include_chosen_choice=False,
            resources=None,
            with_replacement=False,
            dataset_pool=None,
            **kwargs):
        """Sample alternatives from dataset2, stratum by stratum, for the agent
        set specified by dataset1.

        All named arguments are merged into a Resources object built from
        `resources` (only when they are not None) and read back from there.

        If index1 is not None, only samples alternatives for agents with indices in index1;
        if index2 is not None, only samples alternatives from indices in index2.
        stratum is the stratum id of each element of dataset2, given either as
          an array or as an attribute/variable name of dataset2; it is required.
        sample_size specifies number of alternatives to be sampled from each stratum, and is overwritten
          by sample_size_from_each_stratum if it's not None
        sample_size_from_chosen_stratum, if not None, is the number sampled from
          the stratum of the agent's chosen alternative; if it is None and
          include_chosen_choice is true, one less is sampled from that stratum.
        sample_rate is not implemented; passing a value raises NotImplementedError.
        weight, to be used as sampling weight, is either an attribute name of dataset2, or a 1d
        array of the same length as index2 or 2d array of shape (index1.size, index2.size).

        Returns the interaction dataset created from the sampled indices, with
        the attributes '__sampling_probability', 'chosen_choice' and
        'stratum_id' added. If index1 or index2 is empty, logs a warning and
        returns the tuple (None, None).

        Raises TypeError for an unsupported weight type and ValueError when
        stratum is missing, the weight size matches neither index2 nor
        dataset2, or the weight rank is unsupported.

        Also refer to document of interaction_dataset"""
        if dataset_pool is None:
            try:
                sc = SessionConfiguration()
                dataset_pool = sc.get_dataset_pool()
            except Exception:
                # no usable session configuration: fall back to an empty pool
                dataset_pool = DatasetPool()

        local_resources = Resources(resources)
        local_resources.merge_if_not_None({
            "dataset1": dataset1,
            "dataset2": dataset2,
            "index1": index1,
            "index2": index2,
            "with_replacement": with_replacement,
            "stratum": stratum,
            "weight": weight,
            "sample_size": sample_size,
            "sample_size_from_each_stratum": sample_size_from_each_stratum,
            "sample_size_from_chosen_stratum": sample_size_from_chosen_stratum,
            "sample_rate": sample_rate,
            "include_chosen_choice": include_chosen_choice,
        })

        local_resources.check_obligatory_keys(['dataset1', 'dataset2'])
        index1 = local_resources.get("index1", None)

        agent = dataset1

        if index1 is None:
            agent.get_id_attribute()
            index1 = arange(agent.size())

        choice = local_resources["dataset2"]
        index2 = local_resources.get("index2", None)

        if index2 is None:
            choice.get_id_attribute()
            index2 = arange(choice.size())

        if index1.size == 0 or index2.size == 0:
            err_msg = "either choice size or agent size is zero, return None"
            logger.log_warning(err_msg)
            # NOTE(review): a 2-tuple is returned here although the normal path
            # returns a single interaction dataset; kept as is because callers
            # may unpack it.
            return (None, None)

        include_chosen_choice = local_resources.get("include_chosen_choice",
                                                    False)
        weight = local_resources.get("weight", None)

        if isinstance(weight, str):
            choice.compute_variables(weight, resources=local_resources)
            weight = choice.get_attribute(weight)
            rank_of_weight = 1
        elif isinstance(weight, ndarray):
            rank_of_weight = weight.ndim
        elif weight is None:
            weight = ones(index2.size)
            rank_of_weight = 1
        else:
            err_msg = "unknown weight type"
            logger.log_error(err_msg)
            raise TypeError(err_msg)

        if (weight.size != index2.size) and (weight.shape[rank_of_weight - 1]
                                             != index2.size):
            if weight.shape[rank_of_weight - 1] == choice.size():
                # Restrict the weight to index2 along the alternatives axis.
                # Without an explicit axis take() flattens a 2d weight.
                weight = take(weight, index2, axis=rank_of_weight - 1)
            else:
                err_msg = "weight array size doesn't match to size of dataset2 or its index"
                logger.log_error(err_msg)
                raise ValueError(err_msg)

        prob = normalize(weight)

        stratum = local_resources.get("stratum", None)
        if stratum is None:
            # ValueError is a subclass of StandardError on Python 2, so callers
            # catching the previously raised StandardError keep working.
            raise ValueError(
                "'stratum' must be defined for stratified sampling.")
        if isinstance(stratum, str):
            choice.compute_variables(stratum, resources=local_resources)
            stratum = choice.get_attribute(stratum)

        #chosen_choice = ones(index1.size) * UNPLACED_ID
        chosen_choice_id = agent.get_attribute(choice.get_id_name()[0])[index1]
        #index_of_placed_agent = where(greater(chosen_choice_id, UNPLACED_ID))[0]
        # chosen_choice_index is indexed to the choice dataset (-1 when not
        # found); chosen_choice_index_to_index2 is its position within index2.
        chosen_choice_index = choice.try_get_id_index(
            chosen_choice_id, return_value_if_not_found=-1)
        chosen_choice_index_to_index2 = lookup(chosen_choice_index,
                                               index2,
                                               index_if_not_found=UNPLACED_ID)

        ##TODO: check all chosen strata are in selectable strata
        #i.e. chosen_choice_index is in index2
        has_choice = where(chosen_choice_index != -1)
        chosen_stratum = ones(chosen_choice_index.size,
                              dtype=DTYPE) * NO_STRATUM_ID
        chosen_stratum[has_choice] = stratum[chosen_choice_index[has_choice]]
        selectable_strata = stratum[index2]
        unique_strata = unique(selectable_strata)
        unique_strata = unique_strata[where(unique_strata != NO_STRATUM_ID)]

        #        if rank_of_weight == 2:
        #            raise RuntimeError, "stratified sampling for 2d weight is unimplemented yet"

        #        sampled_index = zeros((index1.size,1)) - 1

        sample_size = local_resources.get("sample_size", None)
        sample_size_from_each_stratum = local_resources.get(
            "sample_size_from_each_stratum", None)
        if sample_size_from_each_stratum is None:
            sample_size_from_each_stratum = sample_size
        strata_sample_size = ones(unique_strata.size,
                                  dtype=DTYPE) * sample_size_from_each_stratum
        sample_rate = local_resources.get("sample_rate", None)
        if sample_rate is not None:
            raise NotImplementedError("sample_rate is not implemented yet.")
            ##TODO: to be finished
            #num_elements_in_strata = histogram(selectable_strata, unique_strata)
            #strata_sample_size = round(num_elements_in_strata * sample_rate)

        sample_size_from_chosen_stratum = local_resources.get(
            "sample_size_from_chosen_stratum", None)
        if sample_size_from_chosen_stratum is None and not include_chosen_choice:
            # every agent uses the same (stratum id, sample size) pairs
            strata_sample_pairs = array(
                [[stratum_id, size]
                 for stratum_id, size in zip(unique_strata,
                                             strata_sample_size)])
            if rank_of_weight == 1:
                sampled_index = self._sample_by_stratum(
                    index1, index2, selectable_strata, prob,
                    chosen_choice_index_to_index2, strata_sample_pairs)
            elif rank_of_weight == 2:
                sampled_index = self._sample_by_agent_and_stratum(
                    index1, index2, selectable_strata, prob,
                    chosen_choice_index_to_index2, strata_sample_pairs)
            else:
                err_msg = "weight must be a 1d or 2d array, got rank %s" % rank_of_weight
                logger.log_error(err_msg)
                raise ValueError(err_msg)
        else:
            # per-agent settings: for agent i, [i, :, 0] holds the stratum ids
            # and [i, :, 1] the number of alternatives to draw from each
            strata_sample_setting = zeros((index1.size, unique_strata.size, 2),
                                          dtype=DTYPE)
            for i in range(index1.size):
                agents_strata_sample_size = copy.copy(strata_sample_size)
                in_chosen_stratum = where(unique_strata == chosen_stratum[i])
                if sample_size_from_chosen_stratum is None:
                    ## if sample_size_from_chosen_stratum is None and include_chosen_choice is True,
                    ## sample one less from the chosen stratum
                    agents_strata_sample_size[in_chosen_stratum] += -1
                else:
                    agents_strata_sample_size[
                        in_chosen_stratum] = sample_size_from_chosen_stratum
                strata_sample_setting[i, :, 0] = unique_strata
                strata_sample_setting[i, :, 1] = agents_strata_sample_size

            sampled_index = self._sample_by_agent_and_stratum(
                index1, index2, selectable_strata, prob,
                chosen_choice_index_to_index2, strata_sample_setting)

        # self._sampling_probability and self._stratum_id are read below but
        # not assigned in this method; presumably the _sample_by_* helpers set
        # them -- TODO confirm.
        if include_chosen_choice:
            # the chosen choice always occupies the first column
            sampled_index = concatenate(
                (chosen_choice_index[:, newaxis], sampled_index), axis=1)
        is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
        if include_chosen_choice:
            #since the chosen choice index is attached to the first column, the chosen choice should be all zeros
            #for valid chosen_choice_index
            is_chosen_choice[chosen_choice_index != UNPLACED_ID, 0] = 1

            # Probability of the chosen choice within its stratum; agents
            # whose chosen choice has no selectable stratum or is not part of
            # index2 keep the initial value -1.
            # NOTE(review): the indexing below treats prob as 1d; behaviour
            # for 2d weights is unverified.
            chosen_probability = zeros(
                (chosen_choice_index.size, ), dtype=float32) - 1
            chosen_in_index2 = chosen_choice_index_to_index2 != UNPLACED_ID
            for stratum_id in unique_strata:
                w = (chosen_stratum == stratum_id) & chosen_in_index2
                # prob is indexed to index2, not to the choice set, so the
                # chosen choice must be looked up by its position in index2
                chosen_probability[w] = (
                    prob[chosen_choice_index_to_index2[w]] /
                    prob[selectable_strata == stratum_id].sum()
                ).astype(float32)
            self._sampling_probability = concatenate(
                (chosen_probability[:, newaxis], self._sampling_probability),
                axis=1)
            self._stratum_id = concatenate(
                (chosen_stratum[:, newaxis], self._stratum_id), axis=1)

        interaction_dataset = self.create_interaction_dataset(
            dataset1, dataset2, index1, sampled_index)
        interaction_dataset.add_attribute(self._sampling_probability,
                                          '__sampling_probability')
        interaction_dataset.add_attribute(is_chosen_choice, 'chosen_choice')
        interaction_dataset.add_attribute(self._stratum_id, 'stratum_id')

        ## to get the older returns
        #sampled_index = interaction_dataset.get_2d_index()
        #chosen_choices = UNPLACED_ID * ones(index1.size, dtype="int32")
        #where_chosen = where(interaction_dataset.get_attribute("chosen_choice"))
        #chosen_choices[where_chosen[0]]=where_chosen[1]
        #return (sampled_index, chosen_choice)

        return interaction_dataset
Exemple #13
0
    def run(self, dataset1, dataset2, index1=None, index2=None, stratum=None, weight=None,
            sample_size=1, sample_size_from_each_stratum=None, sample_size_from_chosen_stratum=None, sample_rate=None,
            include_chosen_choice=False, resources=None, with_replacement=False, dataset_pool=None, **kwargs):
        """this function samples number of sample_size (scalar value) alternatives from dataset2
        for agent set specified by dataset1.
        If index1 is not None, only samples alternatives for agents with indices in index1;
        if index2 is not None, only samples alternatives from indices in index2.
        sample_size specifies number of alternatives to be sampled from each stratum, and is overwritten
          by sample_size_from_each_stratum if it's not None
        weight, to be used as sampling weight, is either an attribute name of dataset2, or a 1d
        array of the same length as index2 or 2d array of shape (index1.size, index2.size).

        Also refer to document of interaction_dataset"""
        if dataset_pool is None:
            try:
                sc = SessionConfiguration()
                dataset_pool=sc.get_dataset_pool()
            except:
                dataset_pool = DatasetPool()
                        
        local_resources = Resources(resources)
        local_resources.merge_if_not_None(
                {"dataset1": dataset1, "dataset2": dataset2,
                "index1":index1, "index2": index2,
                "with_replacement": with_replacement,
                "stratum":stratum, "weight": weight,
                "sample_size": sample_size,
                "sample_size_from_each_stratum": sample_size_from_each_stratum,
                "sample_size_from_chosen_stratum": sample_size_from_chosen_stratum,
                
                "sample_rate": sample_rate,
                "include_chosen_choice": include_chosen_choice})

        local_resources.check_obligatory_keys(['dataset1', 'dataset2'])
        index1 = local_resources.get("index1", None)

        agent = dataset1

        if index1 is None:
            agent.get_id_attribute()
            index1 = arange(agent.size())

        choice = local_resources["dataset2"]
        index2 = local_resources.get("index2", None)

        if index2 is None:
            choice.get_id_attribute()
            index2 = arange(choice.size())

        if index1.size == 0 or index2.size == 0:
            err_msg = "either choice size or agent size is zero, return None"
            logger.log_warning(err_msg)
            return (None, None)

        include_chosen_choice = local_resources.get("include_chosen_choice",  False)
        weight = local_resources.get("weight", None)

        if isinstance(weight, str):
            choice.compute_variables(weight,
                resources = local_resources )
            weight=choice.get_attribute(weight)
            rank_of_weight = 1
        elif isinstance(weight, ndarray):
            rank_of_weight = weight.ndim
        elif weight is None:
            weight = ones(index2.size)
            rank_of_weight = 1
        else:
            err_msg = "unknown weight type"
            logger.log_error(err_msg)
            raise TypeError, err_msg

        if (weight.size <> index2.size) and (weight.shape[rank_of_weight-1] <> index2.size):
            if weight.shape[rank_of_weight-1] == choice.size():
                weight = take(weight, index2)
            else:
                err_msg = "weight array size doesn't match to size of dataset2 or its index"
                logger.log_error(err_msg)
                raise ValueError, err_msg

        prob = normalize(weight)

        stratum = local_resources.get("stratum", None)
        if stratum is None:
            raise StandardError, "'stratum' must be defined for stratified sampling."
        if isinstance(stratum, str):
            choice.compute_variables(stratum,
                resources = local_resources )
            stratum=choice.get_attribute(stratum)

        #chosen_choice = ones(index1.size) * UNPLACED_ID
        chosen_choice_id = agent.get_attribute(choice.get_id_name()[0])[index1]
        #index_of_placed_agent = where(greater(chosen_choice_id, UNPLACED_ID))[0]
        chosen_choice_index = choice.try_get_id_index(chosen_choice_id, return_value_if_not_found=-1)
        chosen_choice_index_to_index2 = lookup(chosen_choice_index, index2, index_if_not_found=UNPLACED_ID)
        
        ##TODO: check all chosen strata are in selectable strata
        #i.e. chosen_choice_index is in index2
        chosen_stratum = ones(chosen_choice_index.size, dtype=DTYPE) * NO_STRATUM_ID
        chosen_stratum[where(chosen_choice_index!=-1)] = stratum[chosen_choice_index[where(chosen_choice_index!=-1)]]
        selectable_strata = stratum[index2]
        unique_strata = unique(selectable_strata)
        unique_strata = unique_strata[where(unique_strata!=NO_STRATUM_ID)]

#        if rank_of_weight == 2:
#            raise RuntimeError, "stratified sampling for 2d weight is unimplemented yet"

#        sampled_index = zeros((index1.size,1)) - 1

        sample_size = local_resources.get("sample_size", None)
        sample_size_from_each_stratum = local_resources.get("sample_size_from_each_stratum", None)
        if sample_size_from_each_stratum is None:
            sample_size_from_each_stratum = sample_size
        strata_sample_size = ones(unique_strata.size, dtype=DTYPE) * sample_size_from_each_stratum
        sample_rate = local_resources.get("sample_rate", None)
        if sample_rate is not None:
            raise UnImplementedError, "sample_rate is not implemented yet."
            ##TODO: to be finished
            #num_elements_in_strata = histogram(selectable_strata, unique_strata)
            #strata_sample_size = round(num_elements_in_strata * sample_rate)

        sample_size_from_chosen_stratum = local_resources.get("sample_size_from_chosen_stratum", None)
        if sample_size_from_chosen_stratum is None and not include_chosen_choice:
            strata_sample_pairs = array(map(lambda x,y: [x,y], unique_strata, strata_sample_size))
            if rank_of_weight == 1:
                sampled_index = self._sample_by_stratum(index1, index2, selectable_strata, prob,
                                                        chosen_choice_index_to_index2, strata_sample_pairs)
            elif rank_of_weight == 2:
                sampled_index = self._sample_by_agent_and_stratum(index1, index2, selectable_strata, prob,
                                                                  chosen_choice_index_to_index2, strata_sample_pairs)
        else:
            strata_sample_setting = zeros((index1.size,unique_strata.size,2), dtype=DTYPE)
            for i in range(index1.size):
                agents_strata_sample_size = copy.copy(strata_sample_size)
                if sample_size_from_chosen_stratum is None:
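                    ## If sample_size_from_chosen_stratum is None and include_chosen_choice is True,
                    ## sample one less from the agent's chosen stratum: the chosen choice itself is
                    ## prepended to the sampled set afterwards and fills that slot.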
                    agents_strata_sample_size[where(unique_strata==chosen_stratum[i])] += - 1
                else:
                    agents_strata_sample_size[where(unique_strata==chosen_stratum[i])] = sample_size_from_chosen_stratum
                strata_sample_pairs = array(map(lambda x,y: [x,y], unique_strata, agents_strata_sample_size))
                strata_sample_setting[i,...] = strata_sample_pairs

            sampled_index = self._sample_by_agent_and_stratum(index1, index2, selectable_strata, prob,
                                                              chosen_choice_index_to_index2, strata_sample_setting)
        #chosen_choice = None
        is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
        if include_chosen_choice:
            sampled_index = concatenate((chosen_choice_index[:,newaxis],sampled_index), axis=1)
            #chosen_choice = zeros(chosen_choice_index.shape, dtype="int32") - 1
            #chosen_choice[where(chosen_choice_index>UNPLACED_ID)] = 0 #make chosen_choice index to sampled_index, instead of choice (as chosen_choice_index does)
                                                                      #since the chosen choice index is attached to the first column, the chosen choice should be all zeros
                                                                      #for valid chosen_choice_index
            is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
            is_chosen_choice[chosen_choice_index!=UNPLACED_ID, 0] = 1
            
            chosen_probability = zeros((chosen_choice_index.size,),dtype=float32) - 1
            for stratum in unique_strata:
                w = chosen_stratum==stratum
                chosen_probability[w] = (prob[chosen_choice_index[w]] / prob[selectable_strata==stratum].sum()).astype(float32)
            self._sampling_probability = concatenate((chosen_probability[:,newaxis], self._sampling_probability), axis=1)
            self._stratum_id = concatenate((chosen_stratum[:,newaxis], self._stratum_id), axis=1)

        interaction_dataset = self.create_interaction_dataset(dataset1, dataset2, index1, sampled_index)
        interaction_dataset.add_attribute(self._sampling_probability, '__sampling_probability')
        interaction_dataset.add_attribute(is_chosen_choice, 'chosen_choice')
        interaction_dataset.add_attribute(self._stratum_id, 'stratum_id')

        ## Kept for reference: how to recover the older tuple-style return values
        ## from the interaction dataset.
        #sampled_index = interaction_dataset.get_2d_index()
        #chosen_choices = UNPLACED_ID * ones(index1.size, dtype="int32") 
        #where_chosen = where(interaction_dataset.get_attribute("chosen_choice"))
        #chosen_choices[where_chosen[0]]=where_chosen[1]
        #return (sampled_index, chosen_choice)
        
        return interaction_dataset