Example #1
0
    def compute_good_turing(self,
                            labels,
                            batch_size=stg.JOBLIB_BATCH_SIZE,
                            parallel=True):
        """
        Compute the Good-Turing estimator of the unseen-label probability mass.

        Parameters
        ----------
        labels : array_like
            Strategy labels of the training samples.
        batch_size : int, optional
            Joblib batch size used when counting frequencies.
        parallel : bool, optional
            Count frequencies in parallel. Default True.
        """
        stg.logger.info("Computing Good Turing Estimator")

        n_jobs = u.get_n_processes() if parallel else 1

        stg.logger.info("Compute frequencies")
        # Frequency of each label in the sample.
        freq = self.frequencies(labels, batch_size, n_jobs=n_jobs)

        # n1 = number of labels appearing exactly once.
        # Bug fix: the previous test `any(np.where(freq == 1)[0])` inspected
        # the *indices* of the singletons, so a singleton sitting at index 0
        # was wrongly reported as "no singletons". Test the mask directly.
        if not np.any(freq == 1):
            stg.logger.info("No labels appearing only once")
            n1 = 0
        else:
            stg.logger.info("Compute frequencies of frequencies")
            # Frequency of frequencies; its first entry counts the singletons.
            freq_freq = self.frequencies(freq,
                                         batch_size=batch_size,
                                         n_jobs=n_jobs)
            n1 = freq_freq[0]

        # Raw Good-Turing estimator: fraction of samples whose label is unique.
        self.good_turing = n1 / self.n_samples

        # Exponentially-smoothed Good-Turing estimator
        # (assumes self.alpha and self.good_turing_smooth were initialized
        # elsewhere, e.g. in __init__ -- TODO confirm).
        self.good_turing_smooth = self.alpha * n1 / self.n_samples + \
            (1 - self.alpha) * self.good_turing_smooth
Example #2
0
    def solve_parametric(self, theta,
                         batch_size=stg.JOBLIB_BATCH_SIZE,
                         parallel=True,
                         message="Solving for all theta",
                         ):
        """
        Solve the parametric problem for every row of theta.

        Parameters
        ----------
        theta : DataFrame
            Parameter values, one problem instance per row.
        batch_size : int, optional
            Joblib batch size.
        parallel : bool, optional
            Solve problems in parallel. Default True.
        message : str, optional
            Message to be printed on progress bar.

        Returns
        -------
        dict
            Results dictionary.
        """
        if parallel:
            n_jobs = u.get_n_processes()
        else:
            n_jobs = 1

        stg.logger.info(message + " (n_jobs = %d)" % n_jobs)

        # One solve per parameter row, dispatched through joblib.
        n_points = len(theta)
        tasks = (delayed(self.populate_and_solve)(theta.iloc[i])
                 for i in tqdm(range(n_points)))
        return Parallel(n_jobs=n_jobs, batch_size=batch_size)(tasks)
Example #3
0
def encode_strategies(strategies,
                      batch_size=stg.JOBLIB_BATCH_SIZE,
                      parallel=True):
    """
    Encode strategies with integer labels.

    Parameters
    ----------
    strategies : Strategies array
        Array of strategies to be encoded.
    batch_size : int, optional
        Joblib batch size used for the parallel assignment.
    parallel : bool, optional
        Assign samples to unique strategies in parallel. Default True.

    Returns
    -------
    numpy array
        Encodings for each strategy in strategies.
    Strategies array
        Array of unique strategies.
    """
    stg.logger.info("Encoding strategies")

    stg.logger.info("Getting unique set of strategies")
    start_time = time()
    unique = unique_strategies(strategies)
    end_time = time()
    stg.logger.info("Extraction time %.3f sec" % (end_time - start_time))
    n_unique_strategies = len(unique)
    stg.logger.info("Found %d unique strategies" % n_unique_strategies)

    # Map each strategy to the index of its unique representative.
    # (Removed unused local `N = len(strategies)`.)
    n_jobs = u.get_n_processes() if parallel else 1
    stg.logger.info("Assign samples to unique strategies (n_jobs = %d)" %
                    n_jobs)

    results = Parallel(n_jobs=n_jobs, batch_size=batch_size)(
        delayed(assign_to_unique_strategy)(s, unique)
        for s in tqdm(strategies))
    y = np.array(results)

    return y, unique
Example #4
0
    def assign_samples(self,
                       discarded_samples,
                       selected_strategies,
                       batch_size,
                       parallel=True):
        """
        Assign samples to strategies choosing the ones minimizing the cost.

        Parameters
        ----------
        discarded_samples : array_like
            Training-set indices of the samples whose strategy was discarded.
        selected_strategies : array_like
            Labels of the strategies that were kept.
        batch_size : int
            Joblib batch size.
        parallel : bool, optional
            Evaluate the samples in parallel. Default True.

        Returns
        -------
        numpy array
            Cost degradation for each discarded sample.
        """
        # Relabel y_train: kept samples get the position of their strategy
        # within selected_strategies; discarded ones get -1 for now.
        self.y_train = np.array([
            np.where(selected_strategies == label)[0][0]
            if label in selected_strategies else -1 for label in self.y_train
        ])

        # Degradation incurred by reassigning each discarded sample.
        degradation = np.zeros(len(discarded_samples))

        n_jobs = u.get_n_processes() if parallel else 1

        stg.logger.info("Assign samples to selected strategies (n_jobs = %d)" %
                        n_jobs)

        # Bug fix: fetch each discarded sample by its *training-set index*
        # (the entries of discarded_samples), not by its position in the
        # discarded list. Previously X_train.iloc[i] / obj_train[i] solved
        # sample i while the result was written back at discarded_samples[i].
        results = Parallel(n_jobs=n_jobs, batch_size=batch_size)(
            delayed(best_strategy)(self.X_train.iloc[sample_idx],
                                   self.obj_train[sample_idx],
                                   self.encoding, self.problem)
            for sample_idx in tqdm(discarded_samples))

        # Write the chosen strategy back at the sample's original position.
        for i, sample_idx in enumerate(discarded_samples):
            self.y_train[sample_idx], degradation[i] = results[i]

        return degradation
Example #5
0
    def choose_best(self, problem_data, labels, parallel=False,
                    batch_size=stg.JOBLIB_BATCH_SIZE, use_cache=True):
        """
        Choose best strategy between provided ones

        Parameters
        ----------
        problem_data : dict
            Data used to populate the problem before solving.
        labels : list
            Strategy labels to compare.
        parallel : bool, optional
            Perform `n_best` strategies evaluation in parallel.
            False by default.
        batch_size : int, optional
            Joblib batch size.
        use_cache : bool, optional
            Use solver cache if available. True by default.

        Returns
        -------
        dict
            Results as a dictionary.
        """
        n_best = self._learner.options['n_best']

        strategies = [self.encoding[label] for label in labels]

        # Solver caches to warm-start each candidate (None = no cache).
        cache = [None] * n_best
        if self._solver_cache and use_cache:
            cache = [self._solver_cache[label] for label in labels]

        n_jobs = u.get_n_processes(n_best) if parallel else 1

        # Solve the problem once per candidate strategy.
        results = Parallel(n_jobs=n_jobs, batch_size=batch_size)(
            delayed(self._problem.solve)(problem_data,
                                         strategy=strategies[j],
                                         cache=cache[j])
            for j in range(n_best))

        # Collect per-candidate solutions.
        # (Removed dead `x = []; time = []; ...` initializations that were
        # immediately overwritten; `time` also shadowed the time module/function.)
        x = [r["x"] for r in results]
        times = [r["time"] for r in results]
        infeas = np.array([r["infeasibility"] for r in results])
        cost = np.array([r["cost"] for r in results])

        # Pick best class between k ones
        idx_filter = np.where(infeas <= stg.INFEAS_TOL)[0]
        if len(idx_filter) > 0:
            # Case 1: Feasible points
            # -> Get solution with best cost between feasible ones
            if self._problem.sense() == Minimize:
                idx_pick = idx_filter[np.argmin(cost[idx_filter])]
            elif self._problem.sense() == Maximize:
                idx_pick = idx_filter[np.argmax(cost[idx_filter])]
            else:
                e.value_error('Objective type not understood')
        else:
            # Case 2: No feasible points
            # -> Get solution with minimum infeasibility
            idx_pick = np.argmin(infeas)

        # Store values we are interested in
        result = {}
        result['x'] = x[idx_pick]
        # Total time spent over all evaluated strategies, not just the winner.
        result['time'] = np.sum(times)
        result['strategy'] = strategies[idx_pick]
        result['cost'] = cost[idx_pick]
        result['infeasibility'] = infeas[idx_pick]

        return result
Example #6
0
    def __init__(self,
                 **options):
        """
        Initialize OptimalTrees class.

        Parameters
        ----------
        options : dict
            Learner options as a dictionary.
        """
        if not OptimalTree.is_installed():
            e.value_error("Interpretable AI not installed")

        # Bind the IAI interface and the Julia process utilities.
        from interpretableai import iai
        self.iai = iai
        from julia import Distributed
        self.nprocs = Distributed.nprocs

        self.name = stg.OPTIMAL_TREE

        # Mandatory settings (KeyError when missing).
        self.n_input = options.pop('n_input')
        self.n_classes = options.pop('n_classes')

        # Optional settings with their defaults.
        self.options = {
            'hyperplanes': options.pop('hyperplanes', False),
            'parallel': options.pop('parallel_trees', True),
            'cp': options.pop('cp', None),
            'max_depth': options.pop(
                'max_depth', octstg.DEFAULT_TRAINING_PARAMS['max_depth']),
            'minbucket': options.pop(
                'minbucket', octstg.DEFAULT_TRAINING_PARAMS['minbucket']),
            # Never evaluate more candidate strategies than classes exist.
            'n_best': min(options.pop('n_best', stg.N_BEST), self.n_classes),
            'save_svg': options.pop('save_svg', False),
            # Fraction of data used for training vs validation.
            'frac_train': options.pop('frac_train', stg.FRAC_TRAIN),
        }

        # Spawn extra Julia workers until there is one per CPU.
        n_cpus = get_n_processes()
        n_cur_procs = self.nprocs()
        if self.options['parallel'] and n_cur_procs < n_cpus:
            Distributed.addprocs(n_cpus - n_cur_procs)

        # Options forwarded to the OptimalTree classifier itself.
        self.optimaltrees_options = {
            'random_seed': 1,
            'max_depth': self.options['max_depth'],
            'minbucket': self.options['minbucket'],
        }
        if self.options['hyperplanes']:
            self.optimaltrees_options['hyperplane_config'] = \
                {'sparsity': 'all'}
        if self.options['cp']:
            self.optimaltrees_options['cp'] = self.options['cp']