Code Example #1
def check_n_jobs(n_jobs):
    """Check and adjust the number of CPUs that can work in parallel.

    Parameters
    ----------
    n_jobs : int,
      Number of parallel workers, specified according to joblib's conventions:
      If -1 is provided, all CPUs are used.
      A negative number indicates that all the CPUs except (|n_jobs| - 1) ones
      will be used.

    Returns
    -------
    n_jobs : int,
      Actual number of CPUs that will be used according to their availability.

    """
    if n_jobs == 0:  # invalid according to joblib's conventions
        raise ValueError(
            "'n_jobs == 0' is not a valid choice. "
            "Please provide a positive number of CPUs, or -1 "
            "for all CPUs, or a negative number (-i) for "
            "'all but (i-1)' CPUs (joblib conventions)."
        )
    elif n_jobs < 0:
        n_jobs = max(1, joblib.cpu_count() + n_jobs + 1)
    else:
        n_jobs = min(n_jobs, joblib.cpu_count())

    return n_jobs
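For reference, a minimal usage sketch (not part of the original example) of how check_n_jobs above maps joblib-style values onto an actual worker count. It assumes joblib is importable and simply calls the function defined in Code Example #1:

import joblib

n_cpus = joblib.cpu_count()
assert check_n_jobs(-1) == n_cpus                # -1 -> use every CPU
assert check_n_jobs(-2) == max(1, n_cpus - 1)    # -2 -> all CPUs but one
assert check_n_jobs(4) == min(4, n_cpus)         # positive values are capped at the CPU count
# check_n_jobs(0) raises ValueError: 0 has no meaning under joblib's convention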
Code Example #2
File: common_checks.py Project: meghalD/nilearn-RSA
def check_n_jobs(n_jobs):
    """Check and adjust the number of CPUs that can work in parallel.

    Parameters
    ----------
    n_jobs : int,
      Number of parallel workers, specified according to joblib's conventions:
      If -1 is provided, all CPUs are used.
      A negative number indicates that all the CPUs except (|n_jobs| - 1) ones
      will be used.

    Returns
    -------
    n_jobs : int,
      Actual number of CPUs that will be used according to their availability.

    """
    if n_jobs == 0:  # invalid according to joblib's conventions
        raise ValueError("'n_jobs == 0' is not a valid choice. "
                         "Please provide a positive number of CPUs, or -1 "
                         "for all CPUs, or a negative number (-i) for "
                         "'all but (i-1)' CPUs (joblib conventions).")
    elif n_jobs < 0:
        n_jobs = max(1, joblib.cpu_count() + n_jobs + 1)
    else:
        n_jobs = min(n_jobs, joblib.cpu_count())

    return n_jobs
Code Example #3
def get_split_scores(factory,thresholds,formula,
                     metric = None,#p.e. usability entropy
                     use_joblib = False,
                     joblib_backend = 'threading',
                     n_jobs = -1,
                     min_events_fraction_leaf = 0.,verbose = False):

    if metric == None:
        metric = penalized_usability_entropy
    if min_events_fraction_leaf <=1:
        min_events_fraction_leaf = int(min_events_fraction_leaf*sum(factory.weights))
    if verbose:
        print min_events_fraction_leaf, sum(factory.weights)

    if not use_joblib:
        scores = np.repeat(float("inf"),len(thresholds))
        for i,(feature,cut,_) in enumerate(thresholds):
            predicate =  (factory.events[:,feature] > cut)

            #skip the edge cases... (inf penalty)
            if np.all(predicate) or (not np.any(predicate)):
                #if this split does not split, fuggedaboutit
                continue 
            if min_events_fraction_leaf>0:
                #get rid of overly uneven cuts
                sum_weight = np.sum(factory.weights)
                true_weight = np.sum(factory.weights[predicate])
                false_weight = sum_weight - true_weight
                if true_weight < min_events_fraction_leaf or false_weight < min_events_fraction_leaf:
                    if verbose: print "t:",true_weight,"f:",false_weight, "discarded"
                    continue
                if verbose: print "t:",true_weight,"f:",false_weight, "passed"
            #compute score
            subFactories = factory.split_by(predicate)
            scores[i] = metric(formula,*subFactories)
    else:
        if n_jobs < 0:
            n_jobs = joblib.cpu_count() +1 - n_jobs
       
        indices = [0]+[len(thresholds)*(i+1)/n_jobs for i in range(n_jobs)]
        thresholdSections = [thresholds[indices[i]:indices[i+1]] for i in range(n_jobs)]
        
        if joblib_backend == 'threading':
            factory = [deepcopy(factory) for i in range(n_jobs)]
            formula = [deepcopy(formula) for i in range(n_jobs)]
            metric = [deepcopy(metric) for i in range(n_jobs)] #in case it has some internal data
            
            jobs = (joblib.delayed(get_split_scores)(factory[i],thresholdSection, formula[i],
                                                 metric=metric[i],use_joblib = False,
                                                 min_events_fraction_leaf = min_events_fraction_leaf,
                                                 verbose = verbose)
                                    for i,thresholdSection in enumerate(thresholdSections))
        else:
            jobs = (joblib.delayed(get_split_scores)(factory,thresholdSection, formula,
                                                 metric=metric,use_joblib = False,
                                                 min_events_fraction_leaf = min_events_fraction_leaf,
                                                 verbose = verbose)
                                    for thresholdSection in thresholdSections)
        scores = np.hstack(joblib.Parallel(n_jobs = n_jobs, backend = joblib_backend)(jobs))
    return scores
Code Example #4
File: rgf_model.py Project: fukatani/rgf_python
    def _fit_multiclass_task(self, X, y, sample_weight, params):
        if params['init_model'] is not None:
            max_digits = len(str(len(self._classes)))
            init_model_filenames = ['{}.{}'.format(params['init_model'],
                                                   str(i + 1).zfill(max_digits)) for i in range(self._n_classes)]
        ovr_list = [None] * self._n_classes
        for i, cls_num in enumerate(self._classes):
            if params['init_model'] is not None:
                params['init_model'] = init_model_filenames[i]
            self._classes_map[i] = cls_num
            ovr_list[i] = (y == cls_num).astype(int)
            self._estimators[i] = RGFExecuter(**params)

        n_jobs = self.n_jobs if self.n_jobs > 0 else cpu_count() + self.n_jobs + 1
        substantial_n_jobs = max(n_jobs, self.n_classes_)
        if substantial_n_jobs < n_jobs and self.verbose:
            print('n_jobs = {0}, but RGFClassifier uses {1} CPUs because '
                  'classes_ is {2}'.format(n_jobs, substantial_n_jobs,
                                           self.n_classes_))

        self._estimators = Parallel(n_jobs=self.n_jobs)(delayed(utils.fit_ovr_binary)(self._estimators[i],
                                                                                      X,
                                                                                      ovr_list[i],
                                                                                      sample_weight)
                                                        for i in range(self._n_classes))
Code Example #5
    def _parallel_learning(self, X, Y, w):
        n_samples = len(X)
        objective, positive_slacks = 0, 0
        verbose = max(0, self.verbose - 3)
        if self.batch_size is not None:
            raise ValueError("If n_jobs != 1, batch_size needs to" "be None")
        # generate batches of size n_jobs
        # to speed up inference
        if self.n_jobs == -1:
            n_jobs = cpu_count()
        else:
            n_jobs = self.n_jobs

        n_batches = int(np.ceil(float(len(X)) / n_jobs))
        slices = gen_even_slices(n_samples, n_batches)
        for batch in slices:
            X_b = X[batch]
            Y_b = Y[batch]
            candidate_constraints = Parallel(
                n_jobs=self.n_jobs,
                verbose=verbose)(delayed(find_constraint)(self.model, x, y, w)
                                 for x, y in zip(X_b, Y_b))
            djoint_feature = np.zeros(self.model.size_joint_feature)
            for x, y, constraint in zip(X_b, Y_b, candidate_constraints):
                y_hat, delta_joint_feature, slack, loss = constraint
                if slack > 0:
                    objective += slack
                    djoint_feature += delta_joint_feature
                    positive_slacks += 1
            w = self._solve_subgradient(djoint_feature, n_samples, w)
        return objective, positive_slacks, w
Code Example #6
    def _fit_multiclass_task(self, X, y, sample_weight, params):
        if params['init_model'] is not None:
            max_digits = len(str(len(self._classes)))
            init_model_filenames = [
                '{}.{}'.format(params['init_model'],
                               str(i + 1).zfill(max_digits))
                for i in range(self._n_classes)
            ]
        ovr_list = [None] * self._n_classes
        for i, cls_num in enumerate(self._classes):
            if params['init_model'] is not None:
                params['init_model'] = init_model_filenames[i]
            self._classes_map[i] = cls_num
            ovr_list[i] = (y == cls_num).astype(int)
            self._estimators[i] = RGFExecuter(**params)

        n_jobs = self.n_jobs if self.n_jobs > 0 else cpu_count(
        ) + self.n_jobs + 1
        substantial_n_jobs = max(n_jobs, self.n_classes_)
        if substantial_n_jobs < n_jobs and self.verbose:
            print('n_jobs = {0}, but RGFClassifier uses {1} CPUs because '
                  'classes_ is {2}'.format(n_jobs, substantial_n_jobs,
                                           self.n_classes_))

        self._estimators = Parallel(n_jobs=self.n_jobs)(
            delayed(utils.fit_ovr_binary)(self._estimators[i], X, ovr_list[i],
                                          sample_weight)
            for i in range(self._n_classes))
Code Example #7
File: subgradient_ssvm.py Project: huyng/pystruct
    def _parallel_learning(self, X, Y, w):
        n_samples = len(X)
        objective, positive_slacks = 0, 0
        verbose = max(0, self.verbose - 3)
        if self.batch_size is not None:
            raise ValueError("If n_jobs != 1, batch_size needs to" "be None")
        # generate batches of size n_jobs
        # to speed up inference
        if self.n_jobs == -1:
            n_jobs = cpu_count()
        else:
            n_jobs = self.n_jobs

        n_batches = int(np.ceil(float(len(X)) / n_jobs))
        slices = gen_even_slices(n_samples, n_batches)
        for batch in slices:
            X_b = X[batch]
            Y_b = Y[batch]
            candidate_constraints = Parallel(n_jobs=self.n_jobs, verbose=verbose)(
                delayed(find_constraint)(self.model, x, y, w) for x, y in zip(X_b, Y_b)
            )
            dpsi = np.zeros(self.model.size_psi)
            for x, y, constraint in zip(X_b, Y_b, candidate_constraints):
                y_hat, delta_psi, slack, loss = constraint
                if slack > 0:
                    objective += slack
                    dpsi += delta_psi
                    positive_slacks += 1
            w = self._solve_subgradient(dpsi, n_samples, w)
        return objective, positive_slacks, w
Code Example #8
File: task.py Project: tuxedocat/ntcir13-medweb
def define_model(n_random_search: int = 100, n_jobs: int = None) -> Model:
    # TODO: needs refinements
    rf = RandomForestClassifier(random_state=None)
    # Or Extremely Randomized Trees, but currently no big difference in terms of performance.
    # rf = ExtraTreesClassifier(random_state=None)
    _n_estimators = list(range(8, 128, 4))
    _max_depth = list(range(8, 32, 1))

    search_space = dict(
        n_estimators=_n_estimators,
        criterion=['gini', 'entropy'],
        max_features=['auto', 'log2', 0.5, None],
        max_depth=_max_depth
    )

    if n_jobs:
        ncores = n_jobs
    else:
        ncores = joblib.cpu_count()
    rfcv = model_selection.RandomizedSearchCV(estimator=rf,
                                              param_distributions=search_space,
                                              n_iter=n_random_search,
                                              n_jobs=ncores,
                                              cv=5,
                                              verbose=1
                                              )
    return rfcv
Code Example #9
def _get_n_jobs(n_jobs):
    """Get number of jobs for the computation.
    See sklearn/utils/__init__.py for more information.

    This function reimplements the logic of joblib to determine the actual
    number of jobs depending on the cpu count. If -1 all CPUs are used.
    If 1 is given, no parallel computing code is used at all, which is useful
    for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used.
    Thus for n_jobs = -2, all CPUs but one are used.
    Parameters
    ----------
    n_jobs : int
        Number of jobs stated in joblib convention.
    Returns
    -------
    n_jobs : int
        The actual number of jobs as positive integer.
    Examples
    --------
    >>> from sklearn.utils import _get_n_jobs
    >>> _get_n_jobs(4)
    4
    >>> jobs = _get_n_jobs(-2)
    >>> assert jobs == max(cpu_count() - 1, 1)
    >>> _get_n_jobs(0)
    Traceback (most recent call last):
    ...
    ValueError: Parameter n_jobs == 0 has no meaning.
    """
    if n_jobs < 0:
        return max(cpu_count() + 1 + n_jobs, 1)
    elif n_jobs == 0:
        raise ValueError('Parameter n_jobs == 0 has no meaning.')
    else:
        return n_jobs
Code Example #10
File: sklearn_base.py Project: flaviassantos/pyod
def _get_n_jobs(n_jobs):
    """Get number of jobs for the computation.
    See sklearn/utils/__init__.py for more information.

    This function reimplements the logic of joblib to determine the actual
    number of jobs depending on the cpu count. If -1 all CPUs are used.
    If 1 is given, no parallel computing code is used at all, which is useful
    for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used.
    Thus for n_jobs = -2, all CPUs but one are used.
    Parameters
    ----------
    n_jobs : int
        Number of jobs stated in joblib convention.
    Returns
    -------
    n_jobs : int
        The actual number of jobs as positive integer.
    Examples
    --------
    >>> from sklearn.utils import _get_n_jobs
    >>> _get_n_jobs(4)
    4
    >>> jobs = _get_n_jobs(-2)
    >>> assert jobs == max(cpu_count() - 1, 1)
    >>> _get_n_jobs(0)
    Traceback (most recent call last):
    ...
    ValueError: Parameter n_jobs == 0 has no meaning.
    """
    if n_jobs < 0:
        return max(cpu_count() + 1 + n_jobs, 1)
    elif n_jobs == 0:
        raise ValueError('Parameter n_jobs == 0 has no meaning.')
    else:
        return n_jobs
Code Example #11
File: taskmanager.py Project: MaximeTasset/Cytomine
    def compute_partition(self, nb_tasks, data_size):
        """
        Compute data partitioning for parallel computation :
        min(nb_tasks, data_size)

        Parameters
        ----------
        nb_tasks : int (!=0)
            If >0 : the parallelization factor.
            If <0 : nb_tasks = #cpu+nb_tasks+1 (-1 -> nb_tasks = #cpu)
        data_size : int > 0
            The size of the data to process

        Return
        ------
        triplet = (nb_tasks, counts, starts)
        nb_tasks : int
            The final parallelization factor. It is computed as
            min(#cpu/nb_tasks, data_size)
        starts : list of int
            The start indexes of the data for each parallel task
        """
        # Compute the actual number of core to use
        if nb_tasks < 0:
            cpu = cpu_count() + nb_tasks + 1
            if cpu <= 0:
                cpu = 1
            nb_tasks = min(cpu, data_size)
        else:
            if nb_tasks == 0:
                nb_tasks = 1
            nb_tasks = min(nb_tasks, data_size)

        # Compute the minimum load
        increment = data_size // nb_tasks
        starts = [
            x * y for x, y in zip([increment] * nb_tasks, range(nb_tasks))
        ]
        starts.append(data_size)

        # Distribute the extra load if necessary
        gap = data_size - increment * nb_tasks
        if gap > 0:
            # If there are leftovers, we will increase the number of objects
            # of the first cores : starts[i] = starts[i] + corrections[i]
            # The correction vector is [0]  [1,2,...,gap] [gap,...,gap] [0]
            # The first 0 is so as to start at the first element
            # The second part is to increase the number of datum of the first
            # cores by one.
            # Then we have to shifs all the remaining component to keep the
            # same number of elements...
            # Except for the last one which must correspond to the lenght of
            # the data vector
            corrections = range(gap + 1) + ([gap] * (nb_tasks - gap - 1)) + [0]

            starts = [x + y for x, y in zip(starts, corrections)]

        return nb_tasks, starts
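As a small illustration (not taken from the quoted project), the start-index arithmetic used by compute_partition above can be reproduced standalone; list(range(...)) is used so the sketch also runs on Python 3. Splitting 10 items across 4 tasks gives chunk sizes 3, 3, 2, 2:

def partition_starts(nb_tasks, data_size):
    # minimum load per task, plus the closing boundary at data_size
    increment = data_size // nb_tasks
    starts = [increment * i for i in range(nb_tasks)] + [data_size]
    # distribute the leftover elements over the first `gap` tasks
    gap = data_size - increment * nb_tasks
    if gap > 0:
        corrections = list(range(gap + 1)) + [gap] * (nb_tasks - gap - 1) + [0]
        starts = [s + c for s, c in zip(starts, corrections)]
    return starts

print(partition_starts(4, 10))   # [0, 3, 6, 8, 10] -> chunks of size 3, 3, 2, 2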
Code Example #12
File: __init__.py Project: jonaqp/sklearn-genetic
    def _fit(self, X, y):
        X, y = check_X_y(X, y, "csr")
        # Initialization
        cv = check_cv(self.cv, y, is_classifier(self.estimator))
        scorer = check_scoring(self.estimator, scoring=self.scoring)
        n_features = X.shape[1]

        estimator = clone(self.estimator)

        # Genetic Algorithm
        toolbox = base.Toolbox()

        toolbox.register("attr_bool", random.randint, 0, 1)
        toolbox.register("individual", tools.initRepeat,
                         creator.Individual, toolbox.attr_bool, n=n_features)
        toolbox.register("population", tools.initRepeat, list, toolbox.individual)
        toolbox.register("evaluate", _evalFunction, gaobject=self, estimator=estimator, X=X, y=y,
                         cv=cv, scorer=scorer, verbose=self.verbose, fit_params=self.fit_params,
                         caching=self.caching)
        toolbox.register("mate", tools.cxUniform, indpb=self.crossover_independent_proba)
        toolbox.register("mutate", tools.mutFlipBit, indpb=self.mutation_independent_proba)
        toolbox.register("select", tools.selTournament, tournsize=self.tournament_size)

        if self.n_jobs > 1:
            pool = multiprocessing.Pool(processes=self.n_jobs)
            toolbox.register("map", pool.map)
        elif self.n_jobs < 0:
            pool = multiprocessing.Pool(processes=max(cpu_count() + 1 + self.n_jobs, 1))
            toolbox.register("map", pool.map)

        pop = toolbox.population(n=self.n_population)
        hof = tools.HallOfFame(1, similar=np.array_equal)
        stats = tools.Statistics(lambda ind: ind.fitness.values)
        stats.register("avg", np.mean, axis=0)
        stats.register("std", np.std, axis=0)
        stats.register("min", np.min, axis=0)
        stats.register("max", np.max, axis=0)

        if self.verbose > 0:
            print("Selecting features with genetic algorithm.")

        algorithms.eaSimple(pop, toolbox, cxpb=self.crossover_proba, mutpb=self.mutation_proba,
                            ngen=self.n_generations, stats=stats, halloffame=hof,
                            verbose=self.verbose)
        if self.n_jobs != 1:
            pool.close()
            pool.join()

        # Set final attributes
        support_ = np.array(hof, dtype=np.bool)[0]
        self.estimator_ = clone(self.estimator)
        self.estimator_.fit(X[:, support_], y)

        self.n_features_ = support_.sum()
        self.support_ = support_

        return self
Code Example #13
def test_multi_output_classification_partial_fit_parallelism():
    sgd_linear_clf = SGDClassifier(loss='log', random_state=1, max_iter=5)
    mor = MultiOutputClassifier(sgd_linear_clf, n_jobs=-1)
    mor.partial_fit(X, y, classes)
    est1 = mor.estimators_[0]
    mor.partial_fit(X, y)
    est2 = mor.estimators_[0]
    if cpu_count() > 1:
        # parallelism requires this to be the case for a sane implementation
        assert_false(est1 is est2)
Code Example #14
def test_multi_output_classification_partial_fit_parallelism():
    sgd_linear_clf = SGDClassifier(loss='log', random_state=1, max_iter=5)
    mor = MultiOutputClassifier(sgd_linear_clf, n_jobs=-1)
    mor.partial_fit(X, y, classes)
    est1 = mor.estimators_[0]
    mor.partial_fit(X, y)
    est2 = mor.estimators_[0]
    if cpu_count() > 1:
        # parallelism requires this to be the case for a sane implementation
        assert_false(est1 is est2)
Code Example #15
    def fit(self, X, y=None, groups=None):
        """Run fit on the estimator with randomly drawn parameters.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples] or [n_samples, n_output]
            Target relative to X for classification or regression;

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.
        """

        # check if the list of parameter spaces is provided. If not, then
        # only step in manual mode can be used.

        if len(self.search_spaces_) == 0:
            raise ValueError(
                "Please provide search space using `add_spaces` first before"
                "calling fit method.")

        n_jobs = self.n_jobs

        # account for case n_jobs < 0
        if n_jobs < 0:
            n_jobs = max(1, cpu_count() + n_jobs + 1)

        for space_id in sorted(self.search_spaces_.keys()):
            elem = self.search_spaces_[space_id]

            # if not provided with search subspace, n_iter is taken as
            # self.n_iter
            if isinstance(elem, tuple):
                space, n_iter = elem
            else:
                n_iter = self.n_iter

            # do the optimization for particular search space
            while n_iter > 0:
                # when n_iter < n_jobs points left for evaluation
                n_jobs_adjusted = min(n_iter, self.n_jobs)

                self.step(X,
                          y,
                          space_id,
                          groups=groups,
                          n_jobs=n_jobs_adjusted)
                n_iter -= n_jobs
Code Example #16
File: searchcv.py Project: MechCoder/scikit-optimize
    def fit(self, X, y=None, groups=None):
        """Run fit on the estimator with randomly drawn parameters.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples] or [n_samples, n_output]
            Target relative to X for classification or regression;

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.
        """

        # check if the list of parameter spaces is provided. If not, then
        # only step in manual mode can be used.

        if len(self.search_spaces_) == 0:
            raise ValueError(
                "Please provide search space using `add_spaces` first before"
                "calling fit method."
            )

        n_jobs = self.n_jobs

        # account for case n_jobs < 0
        if n_jobs < 0:
            n_jobs = max(1, cpu_count() + n_jobs + 1)

        for space_id in sorted(self.search_spaces_.keys()):
            elem = self.search_spaces_[space_id]

            # if not provided with search subspace, n_iter is taken as
            # self.n_iter
            if isinstance(elem, tuple):
                space, n_iter = elem
            else:
                n_iter = self.n_iter

            # do the optimization for particular search space
            while n_iter > 0:
                # when n_iter < n_jobs points left for evaluation
                n_jobs_adjusted = min(n_iter, self.n_jobs)

                self.step(
                    X, y, space_id,
                    groups=groups, n_jobs=n_jobs_adjusted
                )
                n_iter -= n_jobs
Code Example #17
def _partition_X(X, n_jobs):
    """Private function used to partition X between jobs."""
    n_nodes = X.shape[1]

    # Compute the number of jobs
    n_jobs = min(cpu_count() if n_jobs == -1 else n_jobs, n_nodes)

    # Partition estimators between jobs
    n_node_per_job = (n_nodes // n_jobs) * np.ones(n_jobs, dtype=np.int)
    n_node_per_job[:n_nodes % n_jobs] += 1
    starts = np.cumsum(n_node_per_job)

    return n_jobs, [0] + starts.tolist()
Code Example #18
def _partition_X(X, n_jobs):
    """Private function used to partition X between jobs."""
    n_nodes = X.shape[1]

    # Compute the number of jobs
    n_jobs = min(cpu_count() if n_jobs == -1 else n_jobs, n_nodes)

    # Partition estimators between jobs
    n_node_per_job = (n_nodes // n_jobs) * np.ones(n_jobs, dtype=np.int)
    n_node_per_job[:n_nodes % n_jobs] += 1
    starts = np.cumsum(n_node_per_job)

    return n_jobs, [0] + starts.tolist()
Code Example #19
File: greedy.py Project: justheuristic/pruner
def try_add1_bfs(allTrees,
                 factory,
                 learning_rate,
                 loss,
                 breadth,
                 y_pred,
                 regularizer=0.,
                 use_joblib=False,
                 n_jobs=-1):
    '''
    select best tree to add (1 step)
    '''
    if factory.__class__ is BinaryClassificationFactory:
        y_sign = factory.labels_sign
        margin = y_sign * y_pred
    elif factory.__class__ is RegressionFactory:
        margin = factory.labels - y_pred
    else:
        raise Exception("Factory type not supported")

    if use_joblib:
        if n_jobs < 0:
            n_jobs = joblib.cpu_count() + 1 - n_jobs

        indices = [0] + [
            len(allTrees) * (i + 1) / n_jobs for i in range(n_jobs)
        ]
        treeSections = [
            allTrees[indices[i]:indices[i + 1]] for i in range(n_jobs)
        ]

        tasks = [
            joblib.delayed(_inthread_try_add)(treeSection, factory, loss,
                                              margin, y_pred, learning_rate,
                                              regularizer)
            for treeSection in treeSections
        ]
        _res = joblib.Parallel(n_jobs=n_jobs, backend="multiprocessing")(tasks)
        triples = reduce(lambda a, b: a + b, _res)

    else:
        triples = [
            _try_add(tree, factory, loss, margin, y_pred, learning_rate,
                     regularizer) for tree in allTrees
        ]

    triples.sort(key=lambda el: el[0])

    return ([triple[1] for triple in triples[:breadth]
             ], [triple[0] for triple in triples[:breadth]],
            [triple[2] for triple in triples[:breadth]])
Code Example #20
    def __init__(self, dataset_config, anomaly_map, feeder_df, n_jobs=1):

        self.dataset_config = dataset_config
        self.anomaly_map = anomaly_map
        self.feeder_df = feeder_df
        if n_jobs < 0:
            self.n_jobs = max(cpu_count() + 1 + n_jobs, 1)
        elif n_jobs == 0:
            raise ValueError('Parameter n_jobs == 0 has no meaning.')
        else:
            self.n_jobs = n_jobs
        low_cust = feeder_df.loc[feeder_df.CUSTOMERS < 100].index.values
        zero_len = feeder_df.loc[(feeder_df.FDR_OH == 0)
                                 & (feeder_df.FDR_UG == 0)].index.values
        self.feeder_ignore = set(list(low_cust) + list(zero_len))
Code Example #21
File: ga.py Project: albahnsen/pyea
def _partition_estimators(n_estimators, n_jobs):
    """Private function used to partition estimators between jobs."""
    # Compute the number of jobs
    if n_jobs == -1:
        n_jobs = min(cpu_count(), n_estimators)

    else:
        n_jobs = min(n_jobs, n_estimators)

    # Partition estimators between jobs
    n_estimators_per_job = (n_estimators // n_jobs) * np.ones(n_jobs,
                                                              dtype=np.int)
    n_estimators_per_job[:n_estimators % n_jobs] += 1
    starts = np.cumsum(n_estimators_per_job)

    return n_jobs, n_estimators_per_job.tolist(), [0] + starts.tolist()
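A self-contained illustration (not part of the quoted project) of the same partitioning arithmetic: splitting 10 estimators over 4 jobs yields per-job counts [3, 3, 2, 2] and start offsets [0, 3, 6, 8, 10]. Plain dtype=int is used here so the snippet also runs on NumPy versions where np.int has been removed:

import numpy as np

n_estimators, n_jobs = 10, 4
per_job = (n_estimators // n_jobs) * np.ones(n_jobs, dtype=int)
per_job[:n_estimators % n_jobs] += 1          # spread the remainder over the first jobs
starts = [0] + np.cumsum(per_job).tolist()
print(per_job.tolist())   # [3, 3, 2, 2]
print(starts)             # [0, 3, 6, 8, 10]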
Code Example #22
File: bag.py Project: orazaro/kgml
def _partition_estimators(ensemble):
    """Private function used to partition estimators between jobs."""
    # Compute the number of jobs
    if ensemble.n_jobs == -1:
        n_jobs = min(cpu_count(), ensemble.n_estimators)

    else:
        n_jobs = min(ensemble.n_jobs, ensemble.n_estimators)

    # Partition estimators between jobs
    n_estimators = (ensemble.n_estimators // n_jobs) * np.ones(n_jobs,
                                                               dtype=np.int)
    n_estimators[:ensemble.n_estimators % n_jobs] += 1
    starts = np.cumsum(n_estimators)

    return n_jobs, n_estimators.tolist(), [0] + starts.tolist()
Code Example #23
def _partition_estimators(ensemble):
    """Private function used to partition estimators between jobs."""
    # Compute the number of jobs
    if ensemble.n_jobs == -1:
        n_jobs = min(cpu_count(), ensemble.n_estimators)

    else:
        n_jobs = min(ensemble.n_jobs, ensemble.n_estimators)

    # Partition estimators between jobs
    n_estimators = (ensemble.n_estimators // n_jobs) * np.ones(n_jobs,
                                                               dtype=np.int)
    n_estimators[:ensemble.n_estimators % n_jobs] += 1
    starts = np.cumsum(n_estimators)

    return n_jobs, n_estimators.tolist(), [0] + starts.tolist()
Code Example #24
def _partition_estimators(n_estimators, n_jobs):
    """Private function used to partition estimators between jobs."""
    # Compute the number of jobs
    if n_jobs == -1:
        n_jobs = min(cpu_count(), n_estimators)

    else:
        n_jobs = min(n_jobs, n_estimators)

    # Partition estimators between jobs
    n_estimators_per_job = (n_estimators // n_jobs) * np.ones(n_jobs,
                                                              dtype=np.int)
    n_estimators_per_job[:n_estimators % n_jobs] += 1
    starts = np.cumsum(n_estimators_per_job)

    return n_jobs, n_estimators_per_job.tolist(), [0] + starts.tolist()
Code Example #25
File: greedy.py Project: mindis/pruner
def try_add1_bfs(allTrees,factory,learning_rate,
                 loss,breadth,y_pred,regularizer = 0.,
                 use_joblib = False,n_jobs = -1):
    '''
    select best tree to add (1 step)
    '''
    if factory.__class__ is BinaryClassificationFactory:
        y_sign = factory.labels_sign
        margin = y_sign*y_pred
    elif factory.__class__ is RegressionFactory:
        margin = factory.labels - y_pred
    else:
        raise Exception("Factory type not supported")

    if use_joblib:
        if n_jobs < 0:
            n_jobs = joblib.cpu_count() + 1 - n_jobs
        
        indices = [0]+[len(allTrees)*(i+1)/n_jobs for i in range(n_jobs)]
        treeSections = [allTrees[indices[i]:indices[i+1]] for i in range(n_jobs)]

        tasks = [joblib.delayed(_inthread_try_add)(
                    treeSection,
                    factory,
                    loss,
                    margin,
                    y_pred,
                    learning_rate,
                    regularizer) for treeSection in treeSections]
        _res = joblib.Parallel(n_jobs = n_jobs,
                               backend = "multiprocessing")(tasks)
        triples = reduce(lambda a,b:a+b, _res)

    else:
        triples = [_try_add(tree,factory,loss,margin,y_pred,learning_rate,regularizer) for tree in allTrees]   

    
    triples.sort(key = lambda el: el[0])
    



    return ([triple[1] for triple in triples[:breadth]],
            [triple[0] for triple in triples[:breadth]],
            [triple[2] for triple in triples[:breadth]])
Code Example #26
    def computePartition(self, nbTasks, dataSize):
        """
        Compute data partitioning for parallel computation :
        min(nbTasks, dataSize)

        Parameters
        ----------
        nbTasks : int (!=0)
            If >0 : the parallelization factor.
            If <0 : nbTasks = #cpu+nbTasks+1 (-1 -> nbTasks = #cpu)
        dataSize : int > 0
            The size of the data to process

        Return
        ------
        triplet = (nbTasks, counts, starts)
        nbTasks : int
            The final parallelization factor. It is computed as
            min(#cpu/nbTasks, dataSize)
        counts : list of int
            The number of data pieces for each parallel task
        starts : list of int
            The start indexes of the data for each parallel task
        """
        if nbTasks < 0:
            cpu = cpu_count() + nbTasks + 1
            if cpu <= 0:
                cpu = 1
            nbTasks = min(cpu, dataSize)
        else:
            if nbTasks == 0:
                nbTasks = 1
            nbTasks = min(nbTasks, dataSize)

        counts = [dataSize / nbTasks] * nbTasks

        for i in xrange(dataSize % nbTasks):
            counts[i] += 1

        starts = [0] * (nbTasks + 1)

        for i in xrange(1, nbTasks + 1):
            starts[i] = starts[i - 1] + counts[i - 1]

        return nbTasks, counts, starts
Code Example #27
File: TaskManager.py Project: jm-begon/masterthesis
    def computePartition(self, nbTasks, dataSize):
        """
        Compute data partitioning for parallel computation :
        min(nbTasks, dataSize)

        Parameters
        ----------
        nbTasks : int (!=0)
            If >0 : the parallelization factor.
            If <0 : nbTasks = #cpu+nbTasks+1 (-1 -> nbTasks = #cpu)
        dataSize : int > 0
            The size of the data to process

        Return
        ------
        triplet = (nbTasks, counts, starts)
        nbTasks : int
            The final parallelization factor. It is computed as
            min(#cpu/nbTasks, dataSize)
        counts : list of int
            The number of data pieces for each parallel task
        starts : list of int
            The start indexes of the data for each parallel task
        """
        if nbTasks < 0:
            cpu = cpu_count()+nbTasks+1
            if cpu <= 0:
                cpu = 1
            nbTasks = min(cpu, dataSize)
        else:
            if nbTasks == 0:
                nbTasks = 1
            nbTasks = min(nbTasks, dataSize)

        counts = [dataSize / nbTasks] * nbTasks

        for i in xrange(dataSize % nbTasks):
            counts[i] += 1

        starts = [0] * (nbTasks + 1)

        for i in xrange(1, nbTasks + 1):
            starts[i] = starts[i - 1] + counts[i - 1]

        return nbTasks, counts, starts
Code Example #28
def _partition_images(n_jobs, n_images):
    if n_jobs == -1:
        n_jobs = min(cpu_count(), n_images)

    else:
        n_jobs = min(n_jobs, n_images)

    counts = [n_images // n_jobs] * n_jobs

    for i in range(n_images % n_jobs):
        counts[i] += 1

    starts = [0] * (n_jobs + 1)

    for i in range(1, n_jobs + 1):
        starts[i] = starts[i - 1] + counts[i - 1]

    return n_jobs, counts, starts
Code Example #29
def _partition_clips(n_jobs, n_clips):
    if n_jobs == -1:
        n_jobs = min(cpu_count(), n_clips)

    else:
        n_jobs = min(n_jobs, n_clips)

    counts = [n_clips / n_jobs] * n_jobs

    for i in xrange(n_clips % n_jobs):
        counts[i] += 1

    starts = [0] * (n_jobs + 1)

    for i in xrange(1, n_jobs + 1):
        starts[i] = starts[i - 1] + counts[i - 1]

    return n_jobs, counts, starts
Code Example #30
File: rgf_model.py Project: opavader/rgf
    def _fit_multiclass_task(self, X, y, sample_weight, params):
        ovr_list = [None] * self._n_classes
        for i, cls_num in enumerate(self._classes):
            self._classes_map[i] = cls_num
            ovr_list[i] = (y == cls_num).astype(int)
            self._estimators[i] = RGFExecuter(**params)

        n_jobs = self.n_jobs if self.n_jobs > 0 else cpu_count(
        ) + self.n_jobs + 1
        substantial_n_jobs = max(n_jobs, self.n_classes_)
        if substantial_n_jobs < n_jobs and self.verbose:
            print('n_jobs = {0}, but RGFClassifier uses {1} CPUs because '
                  'classes_ is {2}'.format(n_jobs, substantial_n_jobs,
                                           self.n_classes_))

        self._estimators = Parallel(n_jobs=self.n_jobs)(
            delayed(utils.fit_ovr_binary)(self._estimators[i], X, ovr_list[i],
                                          sample_weight)
            for i in range(self._n_classes))
Code Example #31
def _parallel_pairwise(X, Y, func, n_jobs, **kwds):
    """Break the pairwise matrix in n_jobs even slices
    and compute them in parallel"""
    if n_jobs < 0:
        n_jobs = max(cpu_count() + 1 + n_jobs, 1)

    if Y is None:
        Y = X

    if n_jobs == 1:
        # Special case to avoid picklability checks in delayed
        return func(X, Y, **kwds)

    # TODO: in some cases, backend='threading' may be appropriate
    fd = delayed(func)
    ret = Parallel(n_jobs=n_jobs, verbose=0)(
        fd(X, Y[s], **kwds)
        for s in gen_even_slices(Y.shape[0], n_jobs))

    return np.hstack(ret)
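The slicing that _parallel_pairwise above parallelizes over comes from gen_even_slices, which carves the rows of Y into n_jobs nearly equal blocks. A short sketch of its behavior, assuming scikit-learn exposes gen_even_slices under sklearn.utils as these examples do:

import numpy as np
from sklearn.utils import gen_even_slices

Y = np.arange(10).reshape(10, 1)
for s in gen_even_slices(Y.shape[0], 3):
    print(s, Y[s].ravel().tolist())
# slice(0, 4, None) [0, 1, 2, 3]
# slice(4, 7, None) [4, 5, 6]
# slice(7, 10, None) [7, 8, 9]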
Code Example #32
File: fastrgf_model.py Project: hxy201801/rgf_python
    def _set_params_with_dependencies(self):
        if self.max_bin is None:
            if self._is_sparse_train_X:
                self._max_bin = 200
            else:
                self._max_bin = 65000
        else:
            self._max_bin = self.max_bin

        if isinstance(self.min_samples_leaf, utils.FLOATS):
            self._min_samples_leaf = ceil(self.min_samples_leaf * self._n_samples)
        else:
            self._min_samples_leaf = self.min_samples_leaf

        if self.n_jobs == -1:
            self._n_jobs = 0
        elif self.n_jobs < 0:
            self._n_jobs = cpu_count() + self.n_jobs + 1
        else:
            self._n_jobs = self.n_jobs
Code Example #33
def _partition_trees(forest):
    """Private function used to partition trees between jobs."""
    # Compute the number of jobs
    if forest.n_jobs == -1:
        n_jobs = min(cpu_count(), forest.n_estimators)

    else:
        n_jobs = min(forest.n_jobs, forest.n_estimators)

    # Partition trees between jobs
    n_trees = [forest.n_estimators // n_jobs] * n_jobs

    for i in range(forest.n_estimators % n_jobs):
        n_trees[i] += 1

    starts = [0] * (n_jobs + 1)

    for i in range(1, n_jobs + 1):
        starts[i] = starts[i - 1] + n_trees[i - 1]

    return n_jobs, n_trees, starts
Code Example #34
File: fastrgf_model.py Project: fukatani/rgf_python
    def _set_params_with_dependencies(self):
        if self.max_bin is None:
            if self._is_sparse_train_X:
                self._max_bin = 200
            else:
                self._max_bin = 65000
        else:
            self._max_bin = self.max_bin

        if isinstance(self.min_samples_leaf, utils.FLOATS):
            self._min_samples_leaf = ceil(self.min_samples_leaf * self._n_samples)
        else:
            self._min_samples_leaf = self.min_samples_leaf

        if self.n_jobs == -1:
            self._n_jobs = 0
        elif self.n_jobs < 0:
            self._n_jobs = cpu_count() + self.n_jobs + 1
        else:
            self._n_jobs = self.n_jobs

        self._set_target_and_loss()
Code Example #35
File: lda.py Project: praveenkottayi/topicModels
    def _e_step(self, X, cal_delta):
        """
        E-step

        set `cal_delta == True` when we need to run _m_step
        for inference, set it to False
        """

        # run the E-step in parallel
        if self.n_jobs == -1:
            n_jobs = cpu_count()
        else:
            n_jobs = self.n_jobs

        results = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
            delayed(_update_gamma)
            (X[idx_slice, :], self.expElogbeta, self.alpha,
             self.rng, 100, self.mean_change_tol, cal_delta)
            for idx_slice in gen_even_slices(X.shape[0], n_jobs))

        # merge result
        gammas, deltas = zip(*results)
        gamma = np.vstack(gammas)

        if cal_delta:
            # This step finishes computing the sufficient statistics for the
            # M step, so that
            # sstats[k, w] = \sum_d n_{dw} * phi_{dwk}
            # = \sum_d n_{dw} * exp{Elogtheta_{dk} + Elogbeta_{kw}} / phinorm_{dw}.
            delta_component = np.zeros(self.components_.shape)
            for delta in deltas:
                delta_component += delta
            delta_component *= self.expElogbeta
        else:
            delta_component = None

        return (gamma, delta_component)
Code Example #36
File: lda.py Project: emgong/topicModels
    def _e_step(self, X, cal_delta):
        """
        E-step

        set `cal_delta == True` when we need to run _m_step
        for inference, set it to False
        """

        # run the E-step in parallel
        if self.n_jobs == -1:
            n_jobs = cpu_count()
        else:
            n_jobs = self.n_jobs

        results = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
            delayed(_update_gamma)
            (X[idx_slice, :], self.expElogbeta, self.alpha,
             self.rng, self.max_gamma_update_iter, self.mean_change_tol, cal_delta)
            for idx_slice in gen_even_slices(X.shape[0], n_jobs))

        # merge result
        gammas, deltas = zip(*results)
        gamma = np.vstack(gammas)

        if cal_delta:
            # This step finishes computing the sufficient statistics for the
            # M step, so that
            # sstats[k, w] = \sum_d n_{dw} * phi_{dwk}
            # = \sum_d n_{dw} * exp{Elogtheta_{dk} + Elogbeta_{kw}} / phinorm_{dw}.
            delta_component = np.zeros(self.components_.shape)
            for delta in deltas:
                delta_component += delta
            delta_component *= self.expElogbeta
        else:
            delta_component = None

        return (gamma, delta_component)
Code Example #37
    def _fit(self, X, y):
        X, y = check_X_y(X, y, "csr")
        # Initialization
        cv = check_cv(self.cv, y, is_classifier(self.estimator))
        scorer = check_scoring(self.estimator, scoring=self.scoring)
        n_features = X.shape[1]

        if self.max_features is not None:
            if not isinstance(self.max_features, numbers.Integral):
                raise TypeError("'max_features' should be an integer between 1 and {} features."
                                " Got {!r} instead."
                                .format(n_features, self.max_features))
            elif self.max_features < 1 or self.max_features > n_features:
                raise ValueError("'max_features' should be between 1 and {} features."
                                 " Got {} instead."
                                 .format(n_features, self.max_features))
            max_features = self.max_features
        else:
            max_features = n_features

        if not isinstance(self.n_gen_no_change, (numbers.Integral, np.integer, type(None))):
            raise ValueError("'n_gen_no_change' should either be None or an integer."
                             " {} was passed."
                             .format(self.n_gen_no_change))

        estimator = clone(self.estimator)

        # Genetic Algorithm
        toolbox = base.Toolbox()

        toolbox.register("attr_bool", random.randint, 0, 1)
        toolbox.register("individual", tools.initRepeat,
                         creator.Individual, toolbox.attr_bool, n=n_features)
        toolbox.register("population", tools.initRepeat, list, toolbox.individual)
        toolbox.register("evaluate", _evalFunction, gaobject=self, estimator=estimator, X=X, y=y,
                         cv=cv, scorer=scorer, verbose=self.verbose, fit_params=self.fit_params,
                         max_features=max_features, caching=self.caching)
        toolbox.register("mate", tools.cxUniform, indpb=self.crossover_independent_proba)
        toolbox.register("mutate", tools.mutFlipBit, indpb=self.mutation_independent_proba)
        toolbox.register("select", tools.selTournament, tournsize=self.tournament_size)

        if self.n_jobs == 0:
            raise ValueError("n_jobs == 0 has no meaning.")
        elif self.n_jobs > 1:
            pool = multiprocessing.Pool(processes=self.n_jobs)
            toolbox.register("map", pool.map)
        elif self.n_jobs < 0:
            pool = multiprocessing.Pool(processes=max(cpu_count() + 1 + self.n_jobs, 1))
            toolbox.register("map", pool.map)

        pop = toolbox.population(n=self.n_population)
        hof = tools.HallOfFame(1, similar=np.array_equal)
        stats = tools.Statistics(lambda ind: ind.fitness.values)
        stats.register("avg", np.mean, axis=0)
        stats.register("std", np.std, axis=0)
        stats.register("min", np.min, axis=0)
        stats.register("max", np.max, axis=0)

        if self.verbose > 0:
            print("Selecting features with genetic algorithm.")

        _, log = _eaFunction(pop, toolbox, cxpb=self.crossover_proba, mutpb=self.mutation_proba,
                             ngen=self.n_generations, ngen_no_change=self.n_gen_no_change,
                             stats=stats, halloffame=hof, verbose=self.verbose)
        if self.n_jobs != 1:
            pool.close()
            pool.join()

        # Set final attributes
        support_ = np.array(hof, dtype=np.bool)[0]
        self.estimator_ = clone(self.estimator)
        self.estimator_.fit(X[:, support_], y)

        self.generation_scores_ = np.array([score for score, _ in log.select("max")])
        self.n_features_ = support_.sum()
        self.support_ = support_

        return self
Code Example #38
    def fit(self, X, Y, H_init=None, warm_start=False, initialize=True):
        """Learn parameters using subgradient descent.

        Parameters
        ----------
        X : iterable
            Training instances. Contains the structured input objects.
            No requirement on the particular form of entries of X is made.

        Y : iterable
            Training labels. Contains the structured labels for inputs in X.
            Needs to have the same length as X.

        constraints : None
            Discarded. Only for API compatibility currently.

        warm_start : boolean, default=False
            Whether to restart a previous fit.

        initialize : boolean, default=True
            Whether to initialize the model for the data.
            Leave this true except if you really know what you are doing.
        """
        if self.verbose > 0:
            print("Training latent subgradient structural SVM")
        if initialize:
            self.model.initialize(X, Y)
        self.grad_old = np.zeros(self.model.size_psi)
        if not warm_start:
            self.w = getattr(self, "w", np.random.normal(
                0, 1, size=self.model.size_psi))
            self.timestamps_ = [time()]
            self.objective_curve_ = []
            if self.learning_rate == "auto":
                self.learning_rate_ = self.C * len(X)
            else:
                self.learning_rate_ = self.learning_rate
        else:
            # hackety hack
            self.timestamps_[0] = time() - self.timestamps_[-1]
        w = self.w.copy()
        n_samples = len(X)
        try:
            # catch ctrl+c to stop training
            for iteration in xrange(self.max_iter):
                self.timestamps_.append(time() - self.timestamps_[0])
                positive_slacks = 0
                objective = 0.
                #verbose = max(0, self.verbose - 3)

                if self.n_jobs == 1:
                    # online learning
                    for x, y in zip(X, Y):
                        h = self.model.latent(x, y, w)
                        h_hat = self.model.loss_augmented_inference(
                            x, h, w, relaxed=True)
                        delta_psi = (self.model.psi(x, h)
                                     - self.model.psi(x, h_hat))
                        slack = (-np.dot(delta_psi, w)
                                 + self.model.loss(h, h_hat))
                        objective += np.maximum(slack, 0)
                        if slack > 0:
                            positive_slacks += 1
                        w = self._solve_subgradient(delta_psi, n_samples, w)
                else:
                    #generate batches of size n_jobs
                    #to speed up inference
                    if self.n_jobs == -1:
                        n_jobs = cpu_count()
                    else:
                        n_jobs = self.n_jobs

                    n_batches = int(np.ceil(float(len(X)) / n_jobs))
                    slices = gen_even_slices(n_samples, n_batches)
                    for batch in slices:
                        X_b = X[batch]
                        Y_b = Y[batch]
                        verbose = self.verbose - 1
                        candidate_constraints = Parallel(
                            n_jobs=self.n_jobs,
                            verbose=verbose)(delayed(find_constraint_latent)(
                                self.model, x, y, w)
                                for x, y in zip(X_b, Y_b))
                        dpsi = np.zeros(self.model.size_psi)
                        for x, y, constraint in zip(X_b, Y_b,
                                                    candidate_constraints):
                            y_hat, delta_psi, slack, loss = constraint
                            objective += slack
                            dpsi += delta_psi
                            if slack > 0:
                                positive_slacks += 1
                        dpsi /= float(len(X_b))
                        w = self._solve_subgradient(dpsi, n_samples, w)

                # some statistics
                objective *= self.C
                objective += np.sum(self.w ** 2) / 2.

                if positive_slacks == 0:
                    print("No additional constraints")
                    if self.break_on_no_constraints:
                        break
                if self.verbose > 0:
                    print(self)
                    print("iteration %d" % iteration)
                    print("positive slacks: %d, "
                          "objective: %f" %
                          (positive_slacks, objective))
                self.objective_curve_.append(objective)

                if self.verbose > 2:
                    print(self.w)

                self._compute_training_loss(X, Y, iteration)
                if self.logger is not None:
                    self.logger(self, iteration)

        except KeyboardInterrupt:
            pass
        self.timestamps_.append(time() - self.timestamps_[0])
        self.objective_curve_.append(self._objective(X, Y))
        if self.logger is not None:
            self.logger(self, 'final')
        if self.verbose:
            if self.objective_curve_:
                print("final objective: %f" % self.objective_curve_[-1])
            if self.verbose and self.n_jobs == 1:
                print("calls to inference: %d" % self.model.inference_calls)
        return self
Code Example #39
File: greedy.py Project: justheuristic/pruner
def wheel_up_features_bfs (initialBunch,
                           trees,
                           factory,
                           loss,
                           learning_rate=0.25,
                           nIters=100,
                           trees_sample_size=100,
                           verbose = True,
                           vali_factory = None,
                           learning_rate_decay = 1.,
                           trees_sample_increase = 0,
                           regularizer = 0.,
                           random_walk = True,
                           use_joblib = False,
                           n_jobs = -1,
                           joblib_backend = "threading",
                           copy_pred = False):
    """
    Iterative BFS over best ADD-1 results for [nTrees] iterations
    """
    allTrees = copy.copy(trees)
    
    bunch = copy.copy(initialBunch)
    pred = factory.predict(bunch)
    bestScore = loss.score(factory,pred)
    
    if vali_factory is not None:
        vali_pred = vali_factory.predict(bunch)
        vali_score = loss.score(vali_factory,vali_pred)
        vali_scores = [vali_score]
    
    
    if use_joblib:
        if n_jobs < 0:
            n_jobs = joblib.cpu_count()
                
        if joblib_backend == "threading":
            #create copies of data once to escape GIL forever
            factory = [copy.deepcopy(factory) for i in range(n_jobs)]
            losses = [copy.deepcopy(loss) for i in range(n_jobs)]

        elif joblib_backend == "multiprocessing":
            pass
        else:
            raise ValueError, "joblib_backend must be either 'threading' or 'multiprocessing'"
    
  
    if verbose:
        print "\niteration #",0," ntrees = ", len(bunch),"\nbest loss = ",bestScore
        print "learning_rate = ", learning_rate
        print "sample_size", trees_sample_size

    
    for itr in xrange(1,nIters+1):
        change_index= random.randint(0,len(bunch)-1) if random_walk else  (i-1)%len(bunch)
        trees_sample = random.sample(allTrees,trees_sample_size)+ [bunch[change_index]]
        bunch_wo = copy.copy(bunch)
        replaced_tree = bunch_wo.pop(change_index)

        if use_joblib and joblib_backend=="threading":
            #split trees into sections
            indices = [0]+[len(trees_sample)*(i+1)/n_jobs for i in range(n_jobs)]
            treeSections = [trees_sample[indices[i]:indices[i+1]] for i in range(n_jobs)]
            
            pred_wo = pred - factory[0].predict(PrunedFormula([bunch[change_index]],bias=0.))

            if copy_pred:
                pred_wo = [copy.deepcopy(pred) for i in range(n_jobs)]
            else:
                pred_wo = [pred for i in range(n_jobs)]

            #execute sections in parallel
            tasks = [joblib.delayed(try_add1_bfs)(treeSections[ithread],factory[ithread],
                                                          learning_rate,losses[ithread],
                                                          1,pred_wo[ithread],regularizer=regularizer,
                                                          use_joblib=False)
                                                for ithread in range(n_jobs)]
                                                    
            _res = joblib.Parallel(n_jobs = n_jobs,
                           backend = "threading")(tasks)
            _additions,newScores,newPreds = reduce(lambda a,b:[a[i]+b[i] for i in range(3)], _res)
            
        else:
            pred_wo = pred - factory.predict(PrunedFormula([bunch[change_index]],bias=0.))

            _additions,newScores,newPreds = try_add1_bfs(trees_sample,factory,
                                                         learning_rate,loss,
                                                          1,pred_wo,regularizer=regularizer,
                                                          use_joblib=use_joblib,n_jobs=n_jobs)
            
            
            

        learning_rate *= learning_rate_decay
        trees_sample_size = min(len(allTrees),trees_sample_size + trees_sample_increase)
            
        triples = zip(newScores,_additions,newPreds)
        triples.sort(key = lambda el: el[0])

        newBestScore = min(newScores)
        
        if newBestScore > bestScore:
            pass
        else: 
            bestScore = newBestScore
            _add = triples[0][1]
            bunch = bunch_wo
            bunch.insert(change_index,_add)
            pred = triples[0][2]

        
        
        if verbose:
            print "\niteration #",itr," ntrees = ", len(bunch),"\nbest loss = ", bestScore,"\nlast loss = ",newBestScore
            
            if vali_factory is not None:
                _add = triples[0][1]
                vali_pred_wo = vali_pred - vali_factory.predict(PrunedFormula([replaced_tree], bias=0.))
                vali_pred = vali_pred_wo + vali_factory.predict(PrunedFormula([_add],bias=0.))
                vali_score = loss.score(vali_factory,vali_pred)
                print "Validation loss:", vali_score
                vali_scores.append(vali_score)
                
                
            print "changed index",change_index
            print "learning_rate = ", learning_rate
            print "sample_size", trees_sample_size       
    if verbose>=2:
        print "Validation scores history:"
        from matplotlib import pyplot as plt
        plt.plot(np.arange(len(vali_scores)),vali_scores)
        
    return bunch
Code Example #40
    def fit(self, X, y):
        # The smaller C, the stronger the regularization.
        # The more regularization, the more sparsity.

        print('_' * 80)
        print("Cross validation: ")

        #        param_grid = [
        #        {'C': [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 100000], 'kernel': ['linear']},
        #        {'C': [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 100000], 'gamma': [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 100000], 'kernel': ['rbf']},
        #        ]
        #        param_grid = [
        #        {'C': [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 100000]},
        #        ]

        param_grid = [
            {
                'C': np.logspace(-3, 3, 7),
                'penalty': ['l1', 'l2']
            },
        ]
        print(param_grid)
        scoring = 'roc_auc'
        num_folds = 5

        #X_train_val, X_test_val, y_train_val, y_test_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

        #svc = LinearSVC(penalty="l1", dual=False, tol=1e-3)
        svc = LogisticRegression(dual=False, tol=1e-3, class_weight='auto')
        start = time()
        clf = GridSearchCV(svc,
                           param_grid=param_grid,
                           cv=num_folds,
                           scoring=scoring,
                           verbose=2,
                           n_jobs=joblib.cpu_count())
        clf.fit(X, y)
        print(clf)
        print(
            "GridSearchCV took %.2f seconds for %d candidate parameter settings."
            % (time() - start, len(clf.grid_scores_)))

        print("Grid Scores:")
        print()
        print(clf.grid_scores_)
        print()
        print("Best estimator :")
        print()
        print(clf.best_estimator_)
        print()
        print("Best score :")
        print()
        print(clf.best_score_)
        print()
        print("Best Parameters :")
        print()
        print(clf.best_params_)
        print()

        #        self.transformer_ = LinearSVC(C=clf.best_estimator_.C, penalty="l1",
        #                                      dual=False, tol=1e-3, verbose=2)
        #        X = self.transformer_.fit_transform(X, y)
        #        return LinearSVC.fit(self, X, y)

        self.transformer_ = LogisticRegression(
            C=clf.best_estimator_.C,
            penalty=clf.best_estimator_.penalty,
            dual=False,
            tol=1e-3,
            class_weight='auto')
        X = self.transformer_.fit_transform(X, y)
        return LogisticRegression.fit(self, X, y)
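For reference, a minimal self-contained sketch of the same kind of search written against the current scikit-learn API (cv_results_/best_params_ rather than the deprecated grid_scores_, and an explicit solver instead of class_weight='auto'); the synthetic data is illustrative only:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

X, y = make_classification(n_samples=300, n_features=20, random_state=0)
param_grid = {'C': np.logspace(-3, 3, 7), 'penalty': ['l1', 'l2']}
search = GridSearchCV(LogisticRegression(solver='liblinear', tol=1e-3),
                      param_grid=param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
search.fit(X, y)
print(search.best_params_, search.best_score_)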
コード例 #41
0
import datetime
from sklearn.model_selection import GridSearchCV

FILE_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append(FILE_DIR)

#prepare the logger
parser = argparse.ArgumentParser()
parser.add_argument("-p", "--profile", default="ipy_profile",
                 help="Name of IPython profile to use")
args = parser.parse_args()
profile = args.profile
logging.basicConfig(filename=os.path.join(FILE_DIR,profile+'.log'),
                    filemode='w',
                    level=logging.DEBUG)
logging.info("number of CPUs found: {0}".format(cpu_count()))
logging.info("args.profile: {0}".format(profile))

#prepare the engines
c = Client(profile=profile)
#The following command will make sure that each engine is running in
# the right working directory to access the custom function(s).
c[:].map(os.chdir, [FILE_DIR]*len(c))
logging.info("c.ids :{0}".format(str(c.ids)))
bview = c.load_balanced_view()
register_parallel_backend('ipyparallel',
                          lambda : IPythonParallelBackend(view=bview))

#Get data
digits = load_digits()
#prepare it for the custom function
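The script is truncated here; a plausible continuation, sketched under the assumption that the grid search runs inside joblib's parallel_backend context so that it dispatches to the registered 'ipyparallel' backend (the SVC estimator and its parameter grid are illustrative, not taken from the original):

from joblib import parallel_backend
from sklearn.svm import SVC

param_grid = {'C': [0.1, 1, 10], 'gamma': [1e-4, 1e-3]}
search = GridSearchCV(SVC(), param_grid, cv=3, n_jobs=len(c))

with parallel_backend('ipyparallel'):
    search.fit(digits.data, digits.target)
logging.info("best params: {0}".format(search.best_params_))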
コード例 #42
0
ファイル: doc_class.py プロジェクト: sourabhd/kdd2014
    def fit(self, X, y):
        # The smaller C, the stronger the regularization.
        # The more regularization, the more sparsity.

        print('_' * 80)
        print("Cross validation: ")

#        param_grid = [
#        {'C': [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 100000], 'kernel': ['linear']},
#        {'C': [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 100000], 'gamma': [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 100000], 'kernel': ['rbf']},
#        ]
#        param_grid = [
#        {'C': [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 100000]},
#        ]

        param_grid = [
                {'C': np.logspace(-3,3,7), 'penalty':['l1','l2']},
        ]
        print(param_grid)
        scoring = 'roc_auc'
        num_folds = 5

        #X_train_val, X_test_val, y_train_val, y_test_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

        #svc = LinearSVC(penalty="l1", dual=False, tol=1e-3)
        svc = LogisticRegression(dual=False, tol=1e-3, class_weight='auto')
        start = time()
        clf = GridSearchCV(svc, param_grid=param_grid, cv=num_folds, scoring=scoring,verbose=2,n_jobs=joblib.cpu_count())
        clf.fit(X,y)
        print(clf)
        print("GridSearchCV took %.2f seconds for %d candidate parameter settings." % (time() - start, len(clf.grid_scores_)))

        print("Grid Scores:")
        print()
        print(clf.grid_scores_)
        print()
        print("Best estimator :")
        print()
        print(clf.best_estimator_)
        print()
        print("Best score :")
        print()
        print(clf.best_score_)
        print()
        print("Best Parameters :")
        print()
        print(clf.best_params_)
        print()

#        self.transformer_ = LinearSVC(C=clf.best_estimator_.C, penalty="l1",
#                                      dual=False, tol=1e-3, verbose=2)
#        X = self.transformer_.fit_transform(X, y)
#        return LinearSVC.fit(self, X, y)

        self.transformer_ = LogisticRegression(C=clf.best_estimator_.C, penalty=clf.best_estimator_.penalty,
                                      dual=False, tol=1e-3, class_weight='auto')
        X = self.transformer_.fit_transform(X, y)
        return LogisticRegression.fit(self, X, y)
コード例 #43
0
ファイル: searchlight.py プロジェクト: sytshanli/nilearn
 def __init__(self, n_features, n_jobs=1):
     self.n_features = n_features
     if n_jobs == -1:
         n_jobs = cpu_count()
     self.n_jobs = n_jobs
コード例 #44
0
def greed_up_features_bfs (trees,
                           factory,
                           loss,
                           learning_rate,
                           breadth,
                           nTrees,
                           trees_sample_size,
                           verbose = True,
                           learning_rate_decay = 1.,
                           trees_sample_increase = 0,
                           regularizer = 0.,
                           use_joblib = False,
                           n_jobs = -1,
                           joblib_method = "threads",
                           copy_pred = False,
                           initialBunch = []):
    """
    Iterative BFS over best ADD-1 results for [nTrees] iterations
    """
    allTrees = copy.copy(trees)
    if len(initialBunch)==0:
        trees_sample = np.array(random.sample(allTrees,trees_sample_size))    
        additions,losses,preds = try_add1_bfs(trees_sample,factory,learning_rate,loss,
                                                      breadth,y_pred=factory.labels*0,regularizer = regularizer)
        bunches = [[_added] for _added in additions]                                              
    else:
        bunches = [initialBunch]
        preds = [factory.predict(initialBunch)]
        losses = [np.sum(loss(factory,preds[0]))]
    bestScore = min(losses)

    
    if use_joblib:
        if n_jobs < 0:
            n_jobs = joblib.cpu_count()
                
        if joblib_method == "threads":
            #create copies of data once to escape GIL forever
            factory = [copy.deepcopy(factory) for i in range(n_jobs)]
            loss = [copy.deepcopy(loss) for i in range(n_jobs)]

        elif joblib_method == "processes":
            pass
        else:
            raise ValueError, "joblib_method must be either 'threads' or 'processes'"
    
    

    if verbose:
        print "\niteration #",0," ntrees = ", len(bunches[0]),"\nbest loss = ",bestScore
        print "learning_rate = ", learning_rate
        print "sample_size", trees_sample_size

    
    itr = 0
    while len(bunches[0]) <nTrees:

        itr+=1
        newBunches = []    
        newScores = []
        newPreds = []
        for bunch,pred in zip(bunches,preds):
            trees_sample = np.array(random.sample(allTrees,trees_sample_size))
            
            if use_joblib and joblib_method=="threads":
                #split trees into sections
                indices = [0]+[len(trees_sample)*(i+1)/n_jobs for i in range(n_jobs)]
                treeSections = [trees_sample[indices[i]:indices[i+1]] for i in range(n_jobs)]
                if copy_pred:
                    pred = [copy.deepcopy(pred) for i in range(n_jobs)]
                else:
                    pred = [pred for i in range(n_jobs)]

                #execute sections in parallel
                tasks = [joblib.delayed(try_add1_bfs)(treeSections[ithread],factory[ithread],
                                                              learning_rate,loss[ithread],
                                                              breadth,pred[ithread],regularizer=regularizer,
                                                              use_joblib=False)
                                                    for ithread in range(n_jobs)]
                                                        
                _res = joblib.Parallel(n_jobs = n_jobs,
                               backend = "threading")(tasks)
                _additions,_losses,_preds = reduce(lambda a,b:[a[i]+b[i] for i in range(3)], _res)

                
            else:
                _additions,_losses,_preds = try_add1_bfs(trees_sample,factory,learning_rate,loss,
                                                              breadth,pred,regularizer=regularizer,
                                                              use_joblib=use_joblib,n_jobs=n_jobs)
                

            _bunches = [bunch+[_added] for _added in _additions]
            newBunches+=_bunches
            newScores += _losses
            newPreds += _preds
            
        learning_rate *= learning_rate_decay
        trees_sample_size = min(len(allTrees),trees_sample_size + trees_sample_increase)
            
        triples = zip(newScores,newBunches,newPreds)
        triples.sort(key = lambda el: el[0])
        
        newBestScore = min(newScores)
        
        if newBestScore > bestScore:
            learning_rate /=2.
            if learning_rate < 0.00001:
                break
        else: 
            bestScore = newBestScore
            bunches = [triple[1] for triple in triples[:breadth]]       
            preds = [triple[2] for triple in triples[:breadth]]       

        
        
        if verbose:
            print "\niteration #",itr," ntrees = ", len(bunches[0]),"\nbest loss = ", bestScore,"\nlast loss = ",newBestScore
            print "learning_rate = ", learning_rate
            print "sample_size", trees_sample_size       
    return bunches[0]
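A small self-contained illustration of the section splitting used in the threading branch above: the sampled trees are cut into n_jobs contiguous chunks, one per worker thread (// is used so the sketch also runs under Python 3; the original relies on Python 2 integer division):

trees_sample = list(range(10))   # stand-in for the sampled candidate trees
n_jobs = 3
indices = [0] + [len(trees_sample) * (i + 1) // n_jobs for i in range(n_jobs)]
treeSections = [trees_sample[indices[i]:indices[i + 1]] for i in range(n_jobs)]
print(treeSections)              # [[0, 1, 2], [3, 4, 5], [6, 7, 8, 9]]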
コード例 #45
0
ファイル: searchlight.py プロジェクト: pgervais/nilearn
 def __init__(self, n_features, n_jobs=1):
     self.n_features = n_features
     if n_jobs == -1:
         n_jobs = cpu_count()
     self.n_jobs = n_jobs
コード例 #46
0
    def fit(self, X, Y, H_init=None):
        """Learn parameters using subgradient descent.

        Parameters
        ----------
        X : iterable
            Training instances. Contains the structured input objects.
            No requirement on the particular form of entries of X is made.

        Y : iterable
            Training labels. Contains the structured labels for inputs in X.
            Needs to have the same length as X.

        constraints : None
            Discarded. Only for API compatibility currently.
        """
        print("Training latent subgradient structural SVM")
        self.w = getattr(self, "w", np.random.normal(
            0, .001, size=self.model.size_psi))
        #constraints = []
        self.objective_curve_ = []
        n_samples = len(X)
        try:
            # catch ctrl+c to stop training
            for iteration in xrange(self.max_iter):
                positive_slacks = 0
                objective = 0.
                #verbose = max(0, self.verbose - 3)

                if self.n_jobs == 1:
                    # online learning
                    for x, y in zip(X, Y):
                        h = self.model.latent(x, y, self.w)
                        h_hat = self.model.loss_augmented_inference(
                            x, h, self.w, relaxed=True)
                        delta_psi = (self.model.psi(x, h)
                                     - self.model.psi(x, h_hat))
                        slack = (-np.dot(delta_psi, self.w)
                                 + self.model.loss(h, h_hat))
                        objective += np.maximum(slack, 0)
                        if slack > 0:
                            positive_slacks += 1
                        self._solve_subgradient(delta_psi, n_samples)
                else:
                    #generate batches of size n_jobs
                    #to speed up inference
                    if self.n_jobs == -1:
                        n_jobs = cpu_count()
                    else:
                        n_jobs = self.n_jobs

                    n_batches = int(np.ceil(float(len(X)) / n_jobs))
                    slices = gen_even_slices(n_samples, n_batches)
                    for batch in slices:
                        X_b = X[batch]
                        Y_b = Y[batch]
                        verbose = self.verbose - 1
                        candidate_constraints = Parallel(
                            n_jobs=self.n_jobs,
                            verbose=verbose)(delayed(find_constraint_latent)(
                                self.model, x, y, self.w)
                                for x, y in zip(X_b, Y_b))
                        dpsi = np.zeros(self.model.size_psi)
                        for x, y, constraint in zip(X_b, Y_b,
                                                    candidate_constraints):
                            y_hat, delta_psi, slack, loss = constraint
                            objective += slack
                            dpsi += delta_psi
                            if slack > 0:
                                positive_slacks += 1
                        dpsi /= float(len(X_b))
                        self._solve_subgradient(dpsi, n_samples)

                # some statistics
                objective += np.sum(self.w ** 2) / self.C / 2.
                #objective /= float(n_samples)

                if positive_slacks == 0:
                    print("No additional constraints")
                    if self.break_on_no_constraints:
                        break
                if self.verbose > 0:
                    print(self)
                    print("iteration %d" % iteration)
                    print("positive slacks: %d, "
                          "objective: %f" %
                          (positive_slacks, objective))
                self.objective_curve_.append(objective)

                if self.verbose > 2:
                    print(self.w)

                self._compute_training_loss(X, Y, iteration)
                if self.logger is not None:
                    self.logger(self, iteration)

        except KeyboardInterrupt:
            pass
        print("final objective: %f" % self.objective_curve_[-1])
        print("calls to inference: %d" % self.model.inference_calls)
        return self
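A minimal, self-contained sketch of the batching pattern used in the n_jobs > 1 branch above: gen_even_slices cuts the sample indices into roughly n_jobs-sized batches, and each batch is fanned out with joblib (a trivial stand-in replaces the inference call):

import numpy as np
from sklearn.utils import gen_even_slices
from joblib import Parallel, delayed

X = np.arange(10)
n_jobs = 4
n_batches = int(np.ceil(float(len(X)) / n_jobs))
for batch in gen_even_slices(len(X), n_batches):
    results = Parallel(n_jobs=n_jobs)(delayed(abs)(int(x)) for x in X[batch])
    print(batch, results)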
コード例 #47
0
    def fit(self, X, y=None, groups=None):
        """Run fit on the estimator with randomly drawn parameters.

        Parameters
        ----------
        X : array-like or sparse matrix, shape = [n_samples, n_features]
            The training input samples.

        y : array-like, shape = [n_samples] or [n_samples, n_output]
            Target relative to X for classification or regression (class
            labels should be integers or strings).

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.
        """

        # check if space is a single dict, convert to list if so
        search_spaces = self.search_spaces
        if isinstance(search_spaces, dict):
            search_spaces = [search_spaces]

        if self.optimizer_kwargs is None:
            self.optimizer_kwargs_ = {}
        else:
            self.optimizer_kwargs_ = dict(self.optimizer_kwargs)
        random_state = check_random_state(self.random_state)
        self.optimizer_kwargs_['random_state'] = random_state

        # Instantiate optimizers for all the search spaces.
        optimizers = []
        for search_space in search_spaces:
            if isinstance(search_space, tuple):
                search_space = search_space[0]
            optimizers.append(self._make_optimizer(search_space))
        self.optimizers_ = optimizers  # will save the states of the optimizers

        self.cv_results_ = defaultdict(list)
        self.best_index_ = None
        self.multimetric_ = False

        n_jobs = self.n_jobs

        # account for case n_jobs < 0
        if n_jobs < 0:
            n_jobs = max(1, cpu_count() + n_jobs + 1)

        for search_space, optimizer in zip(search_spaces, optimizers):
            # if not provided with search subspace, n_iter is taken as
            # self.n_iter
            if isinstance(search_space, tuple):
                search_space, n_iter = search_space
            else:
                n_iter = self.n_iter

            # do the optimization for particular search space
            while n_iter > 0:
                # when n_iter < n_jobs points left for evaluation
                n_jobs_adjusted = min(n_iter, n_jobs)

                self._step(
                    X, y, search_space, optimizer,
                    groups=groups, n_jobs=n_jobs_adjusted
                )
                n_iter -= n_jobs

        # Refit the best model on the whole dataset
        if self.refit:
            self._fit_best_model(X, y)

        return self
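A tiny sketch of how the evaluation budget is consumed in the loop above: candidate points are requested in chunks of n_jobs, and the final chunk shrinks to whatever budget remains (the numbers are illustrative):

n_iter, n_jobs = 11, 4
while n_iter > 0:
    n_jobs_adjusted = min(n_iter, n_jobs)   # points evaluated in this step
    print("evaluating %d points in parallel" % n_jobs_adjusted)
    n_iter -= n_jobs
# prints 4, 4, 3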
コード例 #48
0
def permuted_ols(tested_vars,
                 target_vars,
                 confounding_vars=None,
                 model_intercept=True,
                 n_perm=10000,
                 random_state=None,
                 n_jobs=1):
    """Massively univariate group analysis with permuted OLS.

    Tested variates are independently fitted to target variates descriptors
    (e.g. brain imaging signal) according to a linear model solved with an
    Ordinary Least Squares criterion.
    Confounding variates may be included in the model.
    Permutation testing is used to assess the significance of the relationship
    between the tested variates and the target variates [1, 2]. A max-type
    procedure is used to obtain family-wise corrected p-values.

    The specific permutation scheme implemented here is the one of
    Freedman & Lane [3]. It has been demonstrated in [1] that this scheme
    conveys more sensitivity than alternative schemes. This holds for
    neuroimaging applications, as discussed in detail in [2].

    Permutations are performed on parallel computing units. Each of them
    performs a fraction of permutations on the whole dataset. Thus, the max
    F-score amongst data descriptors can be computed directly, which avoids
    storing all the computed F-scores.

    The variates should be given C-contiguous. target_vars is automatically
    converted to Fortran order to speed up computations.

    Parameters
    ----------
    tested_vars : array-like, shape=(n_samples, n_regressors)
      Explanatory variates, fitted and tested independently from each other.

    target_vars : array-like, shape=(n_samples, n_descriptors)
      fMRI data, trying to be explained by explanatory and confounding
      variates.

    confounding_vars : array-like, shape=(n_samples, n_covars)
      Confounding variates (covariates), fitted but not tested.
      If None, no confounding variate is added to the model
      (except maybe a constant column according to the value of
      `model_intercept`)

    model_intercept : bool,
      If True, a constant column is added to the confounding variates
      unless the tested variate is already the intercept.

    n_perm : int,
      Number of permutations to perform.
      Permutations are costly but the more are performed, the more precision
      one gets in the p-values estimation.

    random_state : int or None,
      Seed for random number generator, to have the same permutations
      in each computing unit.

    n_jobs : int,
      Number of parallel workers.
      If 0 is provided, all CPUs are used.
      A negative number indicates that all the CPUs except (|n_jobs| - 1) ones
      will be used.

    Returns
    -------
    pvals : array-like, shape=(n_regressors, n_descriptors)
      Negative log10 p-values associated with the significance test of the
      n_regressors explanatory variates against the n_descriptors target
      variates. Family-wise corrected p-values.

    score_orig_data : numpy.ndarray, shape=(n_regressors, n_descriptors)
      F-statistic associated with the significance test of the n_regressors
      explanatory variates against the n_descriptors target variates.
      The ranks of the scores into the h0 distribution correspond to the
      p-values.

    h0_fmax : array-like, shape=(n_perm, )
      Distribution of the (max) F-statistic under the null hypothesis
      (obtained from the permutations). Array is sorted.

    References
    ----------
    [1] Anderson, M. J. & Robinson, J. (2001).
        Permutation tests for linear models.
        Australian & New Zealand Journal of Statistics, 43(1), 75-88.
        (http://avesbiodiv.mncn.csic.es/estadistica/permut2.pdf)
    [2] Winkler, A. M. et al. (2014).
        Permutation inference for the general linear model.
        Neuroimage.
    [3] Freedman, D. & Lane, D. (1983).
        A nonstochastic interpretation of reported significance levels.
        J. Bus. Econ. Stats., 1(4), 292-298

    """
    # initialize the seed of the random generator
    rng = check_random_state(random_state)

    # check n_jobs (number of CPUs)
    if n_jobs == 0:  # invalid according to joblib's conventions
        raise ValueError("'n_jobs == 0' is not a valid choice. "
                         "Please provide a positive number of CPUs, or -1 "
                         "for all CPUs, or a negative number (-i) for "
                         "'all but (i-1)' CPUs (joblib conventions).")
    elif n_jobs < 0:
        n_jobs = max(1, joblib.cpu_count() + n_jobs + 1)
    else:
        n_jobs = min(n_jobs, joblib.cpu_count())
    # make target_vars F-ordered to speed-up computation
    if target_vars.ndim != 2:
        raise ValueError(
            "'target_vars' should be a 2D array. "
            "An array with %d dimension%s was passed" %
            (target_vars.ndim, "s" if target_vars.ndim > 1 else ""))
    target_vars = np.asfortranarray(target_vars)  # efficient for chunking
    n_descriptors = target_vars.shape[1]

    # check explanatory variates dimensions
    if tested_vars.ndim == 1:
        tested_vars = np.atleast_2d(tested_vars).T
    n_samples, n_regressors = tested_vars.shape

    # check if explanatory variates is intercept (constant) or not
    if (n_regressors == 1 and np.unique(tested_vars).size == 1):
        intercept_test = True
    else:
        intercept_test = False

    # optionally add intercept
    if model_intercept and not intercept_test:
        if confounding_vars is not None:
            confounding_vars = np.hstack(
                (confounding_vars, np.ones((n_samples, 1))))
        else:
            confounding_vars = np.ones((n_samples, 1))

    ### OLS regression on original data
    if confounding_vars is not None:
        # step 1: extract effect of covars from target vars
        covars_orthonormalized = orthonormalize_matrix(confounding_vars)
        if not covars_orthonormalized.flags['C_CONTIGUOUS']:
            # useful to developer
            warnings.warn('Confounding variates not C_CONTIGUOUS.')
            covars_orthonormalized = np.ascontiguousarray(
                covars_orthonormalized)
        targetvars_normalized = normalize_matrix_on_axis(
            target_vars).T  # faster with F-ordered target_vars_chunk
        if not targetvars_normalized.flags['C_CONTIGUOUS']:
            # useful to developer
            warnings.warn('Target variates not C_CONTIGUOUS.')
            targetvars_normalized = np.ascontiguousarray(targetvars_normalized)
        beta_targetvars_covars = np.dot(targetvars_normalized,
                                        covars_orthonormalized)
        targetvars_resid_covars = targetvars_normalized - np.dot(
            beta_targetvars_covars, covars_orthonormalized.T)
        targetvars_resid_covars = normalize_matrix_on_axis(
            targetvars_resid_covars, axis=1)
        # step 2: extract effect of covars from tested vars
        testedvars_normalized = normalize_matrix_on_axis(tested_vars.T, axis=1)
        beta_testedvars_covars = np.dot(testedvars_normalized,
                                        covars_orthonormalized)
        testedvars_resid_covars = testedvars_normalized - np.dot(
            beta_testedvars_covars, covars_orthonormalized.T)
        testedvars_resid_covars = normalize_matrix_on_axis(
            testedvars_resid_covars, axis=1).T.copy()
    else:
        targetvars_resid_covars = normalize_matrix_on_axis(target_vars).T
        testedvars_resid_covars = normalize_matrix_on_axis(tested_vars).copy()
        covars_orthonormalized = None
    # check arrays contiguousity (for the sake of code efficiency)
    if not targetvars_resid_covars.flags['C_CONTIGUOUS']:
        # useful to developer
        warnings.warn('Target variates not C_CONTIGUOUS.')
        targetvars_resid_covars = np.ascontiguousarray(targetvars_resid_covars)
    if not testedvars_resid_covars.flags['C_CONTIGUOUS']:
        # useful to developer
        warnings.warn('Tested variates not C_CONTIGUOUS.')
        testedvars_resid_covars = np.ascontiguousarray(testedvars_resid_covars)
    # step 3: original regression (= regression on residuals + adjust F score)
    # compute F score for original data
    scores_original_data = _f_score_with_covars_and_normalized_design(
        testedvars_resid_covars, targetvars_resid_covars.T,
        covars_orthonormalized)

    ### Permutations
    # parallel computing units perform a reduced number of permutations each
    if n_perm > n_jobs:
        n_perm_chunks = np.asarray([n_perm / n_jobs] * n_jobs, dtype=int)
        n_perm_chunks[-1] += n_perm % n_jobs
    elif n_perm > 0:
        n_perm_chunks = np.ones(n_perm, dtype=int)
    else:  # 0 or negative number of permutations => original data scores only
        return np.asarray([]), scores_original_data, np.asarray([])
    # actual permutations, seeded from a random integer between 0 and maximum
    # value represented by np.int32 (to have a large entropy).
    ret = joblib.Parallel(n_jobs=n_jobs)(
        joblib.delayed(_permuted_ols_on_chunk)(
            scores_original_data,
            testedvars_resid_covars,
            targetvars_resid_covars.T,
            covars_orthonormalized,
            n_perm_chunk=n_perm_chunk,
            intercept_test=intercept_test,
            random_state=rng.random_integers(np.iinfo(np.int32).max))
        for n_perm_chunk in n_perm_chunks)
    # reduce results
    scores_as_ranks_parts, h0_fmax_parts = zip(*ret)
    h0_fmax = np.hstack((h0_fmax_parts))
    scores_as_ranks = np.zeros((n_regressors, n_descriptors))
    for scores_as_ranks_part in scores_as_ranks_parts:
        scores_as_ranks += scores_as_ranks_part
    # convert ranks into p-values
    pvals = (n_perm + 1 - scores_as_ranks) / float(1 + n_perm)

    return -np.log10(pvals), scores_original_data, h0_fmax[0]
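A hedged usage sketch of the function above on small synthetic data (it assumes permuted_ols and the private helpers it calls are importable, e.g. from nilearn.mass_univariate, from which this example comes); the shapes follow the docstring, the numbers are arbitrary:

import numpy as np

rng = np.random.RandomState(42)
tested_vars = rng.randn(50, 1)      # one explanatory variate, 50 samples
target_vars = rng.randn(50, 100)    # e.g. 100 voxel signals
neg_log_pvals, orig_scores, h0 = permuted_ols(
    tested_vars, target_vars, n_perm=1000, n_jobs=2, random_state=0)
print(neg_log_pvals.shape)          # (n_regressors, n_descriptors) == (1, 100)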
コード例 #49
0
    def step(self, X, y, space_id, groups=None, n_jobs=1):
        """Generate n_jobs parameters and evaluate them in parallel.

        Having a separate function for a single search step makes it easy to
        checkpoint the parameter search and to restore it after possible
        failures.

        Parameters
        ----------
        X : array-like or sparse matrix, shape = [n_samples, n_features]
            The training input samples. Internally, it will be converted to
            ``dtype=np.float32`` and if a sparse matrix is provided
            to a sparse ``csc_matrix``.

        y : array-like, shape = [n_samples] or [n_samples, n_outputs]
            The target values (class labels) as integers or strings.

        space_id : hashable
            Identifier of parameter search space. Add search spaces with

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.

        n_jobs : int, default=1
            Number of parameters to evaluate in parallel.

        Returns
        -------
        params_dict: dictionary with parameter values.
        """

        # convert n_jobs to an int > 0 if necessary
        if n_jobs < 0:
            n_jobs = max(1, cpu_count() + n_jobs + 1)

        # use the cached optimizer for particular parameter space
        if space_id not in self.search_spaces_:
            raise ValueError("Unknown space %s" % space_id)

        # get the search space for a step
        search_space = self.search_spaces_[space_id]
        if isinstance(search_space, tuple):
            search_space, _ = search_space

        # create optimizer if not created already
        if space_id not in self.optimizer_:
            self.optimizer_[space_id] = self._make_optimizer(search_space)
        optimizer = self.optimizer_[space_id]

        # get parameter values to evaluate
        params = optimizer.ask(n_points=n_jobs)
        params_dict = [point_asdict(search_space, p) for p in params]

        # self.cv_results_ is reset at every call to _fit, keep current
        all_cv_results = self.cv_results_

        # record performances with different points
        refit = self.refit
        self.refit = False  # do not fit yet - will be fit later

        # this adds compatibility with different versions of sklearn

        self._fit(X, y, groups, params_dict)

        self.refit = refit

        # merge existing and new cv_results_
        for k in self.cv_results_:
            all_cv_results[k].extend(self.cv_results_[k])

        self.cv_results_ = all_cv_results
        self.best_index_ = np.argmax(self.cv_results_['mean_test_score'])

        # feed the point and objective back into optimizer
        local_results = self.cv_results_['mean_test_score'][-len(params):]

        # optimizer minimizes objective, hence provide negative score
        optimizer.tell(params, [-score for score in local_results])

        # fit the best model if necessary
        if self.refit:
            self._fit_best_model(X, y)
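A stripped-down sketch of the ask/tell cycle that step() wraps, using scikit-optimize's Optimizer directly on a toy 1-D search space (the quadratic stand-in replaces the cross-validated score):

from skopt import Optimizer
from skopt.space import Real

opt = Optimizer([Real(-2.0, 2.0)], random_state=0)
for _ in range(3):
    points = opt.ask(n_points=4)                  # one candidate per parallel job
    scores = [1.0 - x[0] ** 2 for x in points]    # stand-in for mean_test_score
    opt.tell(points, [-s for s in scores])        # optimizer minimizes, hence the sign flip
print(min(opt.yi))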
コード例 #50
0
ファイル: subgradient_ssvm.py プロジェクト: lfiaschi/pystruct
    def fit(self, X, Y, constraints=None):
        """Learn parameters using subgradient descent.

        Parameters
        ----------
        X : iterable
            Training instances. Contains the structured input objects.
            No requirement on the particular form of entries of X is made.

        Y : iterable
            Training labels. Contains the structured labels for inputs in X.
            Needs to have the same length as X.

        constraints : None
            Discarded. Only for API compatibility currently.
        """
        print("Training primal subgradient structural SVM")
        w = getattr(self, "w", np.zeros(self.problem.size_psi))
        #constraints = []
        loss_curve = []
        objective_curve = []
        n_samples = len(X)
        try:
            # catch ctrl+c to stop training
            for iteration in xrange(self.max_iter):
                positive_slacks = 0
                objective = 0.
                verbose = max(0, self.verbose - 3)

                if self.n_jobs == 1:
                    # online learning
                    for x, y in zip(X, Y):
                        y_hat, delta_psi, slack, loss = \
                            find_constraint(self.problem, x, y, w)
                        objective += slack
                        if slack > 0:
                            positive_slacks += 1
                        w = self._solve_subgradient(w, delta_psi, n_samples)
                else:
                    # generate batches of size n_jobs
                    # to speed up inference
                    if self.n_jobs == -1:
                        n_jobs = cpu_count()
                    else:
                        n_jobs = self.n_jobs

                    n_batches = int(np.ceil(float(len(X)) / n_jobs))
                    slices = gen_even_slices(n_samples, n_batches)
                    for batch in slices:
                        X_b = X[batch]
                        Y_b = Y[batch]
                        candidate_constraints = Parallel(
                            n_jobs=self.n_jobs,
                            verbose=verbose)(delayed(find_constraint)(
                                self.problem, x, y, w)
                                for x, y in zip(X_b, Y_b))
                        dpsi = np.zeros(self.problem.size_psi)
                        for x, y, constraint in zip(X_b, Y_b,
                                                    candidate_constraints):
                            y_hat, delta_psi, slack, loss = constraint
                            objective += slack
                            dpsi += delta_psi
                            if slack > 0:
                                positive_slacks += 1
                        dpsi /= float(len(X_b))
                        w = self._solve_subgradient(w, dpsi, n_samples)

                # some statistics
                objective /= len(X)
                objective += np.sum(w ** 2) / self.C / 2.

                if positive_slacks == 0:
                    print("No additional constraints")
                    break
                if self.verbose > 0:
                    print(self)
                    print("iteration %d" % iteration)
                    print("positive slacks: %d,"
                          "objective: %f" %
                          (positive_slacks, objective))
                objective_curve.append(objective)

                if self.verbose > 2:
                    print(w)

                self._compute_training_loss(X, Y, w, iteration)

        except KeyboardInterrupt:
            pass
        self.w = w
        self.loss_curve_ = loss_curve
        self.objective_curve_ = objective_curve
        print("final objective: %f" % objective_curve[-1])
        print("calls to inference: %d" % self.problem.inference_calls)
        return self
コード例 #51
0
ファイル: runAllSift.py プロジェクト: jgera/masterThesis
testPredictFolder = os.path.join(rusinol.resultPath, 'prediction')
if not os.path.exists(testPredictFolder):
    os.makedirs(testPredictFolder)
# xml file location
xmlName = 'segmentation.xml'
testXmlCompletePath = os.path.join(rusinol.testPath, xmlName)
# scales 
scaleList = [0.1, 0.2 , 0.3 ,0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.5]
# now we have trained our classifier 
imagePathSaveList = os.path.join(rusinol.testPath, 'imgPathList.pkl')
with open(imagePathSaveList, 'rb') as pklFile:
    imgPathList = pickle.load(pklFile)
    pklFile.close()
numOfImages = len(imgPathList)
# parallel processing
numCores = joblib.cpu_count()
if numCores == 8:
    numJobs = 5
elif numCores == 4:   
    numJobs = 2
else:
    numJobs = numCores/2    
print 'Running jobs in parallel, n_jobs =', numJobs

# parallel job scheduler    
joblib.Parallel(n_jobs = numJobs)(joblib.delayed(parallelImageProcessing)(rusinol.cellSize, \
                        rusinol.stepSize, scaleList, testPredictFolder, \
                        pixelTextProbPredictorWeights, feStandardized, scalesFromMSER, eachImgPath) for eachImgPath in imgPathList)
# exit the script
exit()
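The parallel call above follows joblib's standard fan-out shape, sketched here with a trivial stand-in function in place of parallelImageProcessing:

import joblib

def process_image(path):            # stand-in for parallelImageProcessing
    return len(path)

imgPathList = ['a.png', 'bb.png', 'ccc.png']
results = joblib.Parallel(n_jobs=2)(joblib.delayed(process_image)(p) for p in imgPathList)
print(results)                      # [5, 6, 7]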
コード例 #52
0
    def fit(self, X, Y, H_init=None, warm_start=False, initialize=True):
        """Learn parameters using subgradient descent.

        Parameters
        ----------
        X : iterable
            Training instances. Contains the structured input objects.
            No requirement on the particular form of entries of X is made.

        Y : iterable
            Training labels. Contains the structured labels for inputs in X.
            Needs to have the same length as X.

        constraints : None
            Discarded. Only for API compatibility currently.

        warm_start : boolean, default=False
            Whether to restart a previous fit.

        initialize : boolean, default=True
            Whether to initialize the model for the data.
            Leave this true except if you really know what you are doing.
        """
        if self.verbose > 0:
            print("Training latent subgradient structural SVM")
        if initialize:
            self.model.initialize(X, Y)
        self.grad_old = np.zeros(self.model.size_joint_feature)
        if not warm_start:
            self.w = getattr(self, "w", np.random.normal(
                0, 1, size=self.model.size_joint_feature))
            self.timestamps_ = [time()]
            self.objective_curve_ = []
            if self.learning_rate == "auto":
                self.learning_rate_ = self.C * len(X)
            else:
                self.learning_rate_ = self.learning_rate
        else:
            # hackety hack
            self.timestamps_[0] = time() - self.timestamps_[-1]
        w = self.w.copy()
        n_samples = len(X)
        try:
            # catch ctrl+c to stop training
            for iteration in xrange(self.max_iter):
                self.timestamps_.append(time() - self.timestamps_[0])
                positive_slacks = 0
                objective = 0.
                #verbose = max(0, self.verbose - 3)

                if self.n_jobs == 1:
                    # online learning
                    for x, y in zip(X, Y):
                        h = self.model.latent(x, y, w)
                        h_hat = self.model.loss_augmented_inference(
                            x, h, w, relaxed=True)
                        delta_joint_feature = (self.model.joint_feature(x, h)
                                     - self.model.joint_feature(x, h_hat))
                        slack = (-np.dot(delta_joint_feature, w)
                                 + self.model.loss(h, h_hat))
                        objective += np.maximum(slack, 0)
                        if slack > 0:
                            positive_slacks += 1
                        w = self._solve_subgradient(delta_joint_feature, n_samples, w)
                else:
                    #generate batches of size n_jobs
                    #to speed up inference
                    if self.n_jobs == -1:
                        n_jobs = cpu_count()
                    else:
                        n_jobs = self.n_jobs

                    n_batches = int(np.ceil(float(len(X)) / n_jobs))
                    slices = gen_even_slices(n_samples, n_batches)
                    for batch in slices:
                        X_b = X[batch]
                        Y_b = Y[batch]
                        verbose = self.verbose - 1
                        candidate_constraints = Parallel(
                            n_jobs=self.n_jobs,
                            verbose=verbose)(delayed(find_constraint_latent)(
                                self.model, x, y, w)
                                for x, y in zip(X_b, Y_b))
                        djoint_feature = np.zeros(self.model.size_joint_feature)
                        for x, y, constraint in zip(X_b, Y_b,
                                                    candidate_constraints):
                            y_hat, delta_joint_feature, slack, loss = constraint
                            objective += slack
                            djoint_feature += delta_joint_feature
                            if slack > 0:
                                positive_slacks += 1
                        djoint_feature /= float(len(X_b))
                        w = self._solve_subgradient(djoint_feature, n_samples, w)

                # some statistics
                objective *= self.C
                objective += np.sum(self.w ** 2) / 2.

                if positive_slacks == 0:
                    print("No additional constraints")
                    if self.break_on_no_constraints:
                        break
                if self.verbose > 0:
                    print(self)
                    print("iteration %d" % iteration)
                    print("positive slacks: %d, "
                          "objective: %f" %
                          (positive_slacks, objective))
                self.objective_curve_.append(objective)

                if self.verbose > 2:
                    print(self.w)

                self._compute_training_loss(X, Y, iteration)
                if self.logger is not None:
                    self.logger(self, iteration)

        except KeyboardInterrupt:
            pass
        self.timestamps_.append(time() - self.timestamps_[0])
        self.objective_curve_.append(self._objective(X, Y))
        if self.logger is not None:
            self.logger(self, 'final')
        if self.verbose:
            if self.objective_curve_:
                print("final objective: %f" % self.objective_curve_[-1])
            if self.verbose and self.n_jobs == 1:
                print("calls to inference: %d" % self.model.inference_calls)
        return self
コード例 #53
0
def permuted_ols(tested_vars, target_vars, confounding_vars=None,
                 model_intercept=True, n_perm=10000,
                 random_state=None, n_jobs=1):
    """Massively univariate group analysis with permuted OLS.

    Tested variates are independently fitted to target variates descriptors
    (e.g. brain imaging signal) according to a linear model solved with an
    Ordinary Least Squares criterion.
    Confounding variates may be included in the model.
    Permutation testing is used to assess the significance of the relationship
    between the tested variates and the target variates [1, 2]. A max-type
    procedure is used to obtain family-wise corrected p-values.

    The specific permutation scheme implemented here is the one of
    Freedman & Lane [3]. It has been demonstrated in [1] that this scheme
    conveys more sensitivity than alternative schemes. This holds for
    neuroimaging applications, as discussed in detail in [2].

    Permutations are performed on parallel computing units. Each of them
    performs a fraction of permutations on the whole dataset. Thus, the max
    F-score amongst data descriptors can be computed directly, which avoids
    storing all the computed F-scores.

    The variates should be given C-contiguous. target_vars is automatically
    converted to Fortran order to speed up computations.

    Parameters
    ----------
    tested_vars : array-like, shape=(n_samples, n_regressors)
      Explanatory variates, fitted and tested independently from each other.

    target_vars : array-like, shape=(n_samples, n_descriptors)
      fMRI data, trying to be explained by explanatory and confounding
      variates.

    confounding_vars : array-like, shape=(n_samples, n_covars)
      Confounding variates (covariates), fitted but not tested.
      If None, no confounding variate is added to the model
      (except maybe a constant column according to the value of
      `model_intercept`)

    model_intercept : bool,
      If True, a constant column is added to the confounding variates
      unless the tested variate is already the intercept.

    n_perm : int,
      Number of permutations to perform.
      Permutations are costly but the more are performed, the more precision
      one gets in the p-values estimation.

    random_state : int or None,
      Seed for random number generator, to have the same permutations
      in each computing unit.

    n_jobs : int,
      Number of parallel workers.
      If 0 is provided, all CPUs are used.
      A negative number indicates that all the CPUs except (|n_jobs| - 1) ones
      will be used.

    Returns
    -------
    pvals : array-like, shape=(n_regressors, n_descriptors)
      Negative log10 p-values associated with the significance test of the
      n_regressors explanatory variates against the n_descriptors target
      variates. Family-wise corrected p-values.

    score_orig_data : numpy.ndarray, shape=(n_regressors, n_descriptors)
      F-statistic associated with the significance test of the n_regressors
      explanatory variates against the n_descriptors target variates.
      The ranks of the scores into the h0 distribution correspond to the
      p-values.

    h0_fmax : array-like, shape=(n_perm, )
      Distribution of the (max) F-statistic under the null hypothesis
      (obtained from the permutations). Array is sorted.

    References
    ----------
    [1] Anderson, M. J. & Robinson, J. (2001).
        Permutation tests for linear models.
        Australian & New Zealand Journal of Statistics, 43(1), 75-88.
        (http://avesbiodiv.mncn.csic.es/estadistica/permut2.pdf)
    [2] Winkler, A. M. et al. (2014).
        Permutation inference for the general linear model.
        Neuroimage.
    [3] Freedman, D. & Lane, D. (1983).
        A nonstochastic interpretation of reported significance levels.
        J. Bus. Econ. Stats., 1(4), 292-298

    """
    # initialize the seed of the random generator
    rng = check_random_state(random_state)

    # check n_jobs (number of CPUs)
    if n_jobs == 0:  # invalid according to joblib's conventions
        raise ValueError("'n_jobs == 0' is not a valid choice. "
                         "Please provide a positive number of CPUs, or -1 "
                         "for all CPUs, or a negative number (-i) for "
                         "'all but (i-1)' CPUs (joblib conventions).")
    elif n_jobs < 0:
        n_jobs = max(1, joblib.cpu_count() + n_jobs + 1)
    else:
        n_jobs = min(n_jobs, joblib.cpu_count())
    # make target_vars F-ordered to speed-up computation
    if target_vars.ndim != 2:
        raise ValueError("'target_vars' should be a 2D array. "
                         "An array with %d dimension%s was passed"
                         % (target_vars.ndim,
                            "s" if target_vars.ndim > 1 else ""))
    target_vars = np.asfortranarray(target_vars)  # efficient for chunking
    n_descriptors = target_vars.shape[1]

    # check explanatory variates dimensions
    if tested_vars.ndim == 1:
        tested_vars = np.atleast_2d(tested_vars).T
    n_samples, n_regressors = tested_vars.shape

    # check if explanatory variates is intercept (constant) or not
    if (n_regressors == 1 and np.unique(tested_vars).size == 1):
        intercept_test = True
    else:
        intercept_test = False

    # optionally add intercept
    if model_intercept and not intercept_test:
        if confounding_vars is not None:
            confounding_vars = np.hstack(
                (confounding_vars, np.ones((n_samples, 1))))
        else:
            confounding_vars = np.ones((n_samples, 1))

    ### OLS regression on original data
    if confounding_vars is not None:
        # step 1: extract effect of covars from target vars
        covars_orthonormalized = orthonormalize_matrix(confounding_vars)
        if not covars_orthonormalized.flags['C_CONTIGUOUS']:
            # useful to developer
            warnings.warn('Confounding variates not C_CONTIGUOUS.')
            covars_orthonormalized = np.ascontiguousarray(
                covars_orthonormalized)
        targetvars_normalized = normalize_matrix_on_axis(
            target_vars).T  # faster with F-ordered target_vars_chunk
        if not targetvars_normalized.flags['C_CONTIGUOUS']:
            # useful to developer
            warnings.warn('Target variates not C_CONTIGUOUS.')
            targetvars_normalized = np.ascontiguousarray(targetvars_normalized)
        beta_targetvars_covars = np.dot(targetvars_normalized,
                                        covars_orthonormalized)
        targetvars_resid_covars = targetvars_normalized - np.dot(
            beta_targetvars_covars, covars_orthonormalized.T)
        targetvars_resid_covars = normalize_matrix_on_axis(
            targetvars_resid_covars, axis=1)
        # step 2: extract effect of covars from tested vars
        testedvars_normalized = normalize_matrix_on_axis(tested_vars.T, axis=1)
        beta_testedvars_covars = np.dot(testedvars_normalized,
                                        covars_orthonormalized)
        testedvars_resid_covars = testedvars_normalized - np.dot(
            beta_testedvars_covars, covars_orthonormalized.T)
        testedvars_resid_covars = normalize_matrix_on_axis(
            testedvars_resid_covars, axis=1).T.copy()
    else:
        targetvars_resid_covars = normalize_matrix_on_axis(target_vars).T
        testedvars_resid_covars = normalize_matrix_on_axis(tested_vars).copy()
        covars_orthonormalized = None
    # check arrays contiguousity (for the sake of code efficiency)
    if not targetvars_resid_covars.flags['C_CONTIGUOUS']:
        # useful to developer
        warnings.warn('Target variates not C_CONTIGUOUS.')
        targetvars_resid_covars = np.ascontiguousarray(targetvars_resid_covars)
    if not testedvars_resid_covars.flags['C_CONTIGUOUS']:
        # useful to developer
        warnings.warn('Tested variates not C_CONTIGUOUS.')
        testedvars_resid_covars = np.ascontiguousarray(testedvars_resid_covars)
    # step 3: original regression (= regression on residuals + adjust F score)
    # compute F score for original data
    scores_original_data = _f_score_with_covars_and_normalized_design(
        testedvars_resid_covars, targetvars_resid_covars.T,
        covars_orthonormalized)

    ### Permutations
    # parallel computing units perform a reduced number of permutations each
    if n_perm > n_jobs:
        n_perm_chunks = np.asarray([n_perm / n_jobs] * n_jobs, dtype=int)
        n_perm_chunks[-1] += n_perm % n_jobs
    elif n_perm > 0:
        n_perm_chunks = np.ones(n_perm, dtype=int)
    else:  # 0 or negative number of permutations => original data scores only
        return np.asarray([]), scores_original_data,  np.asarray([])
    # actual permutations, seeded from a random integer between 0 and maximum
    # value represented by np.int32 (to have a large entropy).
    ret = joblib.Parallel(n_jobs=n_jobs)(joblib.delayed(_permuted_ols_on_chunk)
          (scores_original_data, testedvars_resid_covars,
           targetvars_resid_covars.T, covars_orthonormalized,
           n_perm_chunk=n_perm_chunk, intercept_test=intercept_test,
           random_state=rng.random_integers(np.iinfo(np.int32).max))
          for n_perm_chunk in n_perm_chunks)
    # reduce results
    scores_as_ranks_parts, h0_fmax_parts = zip(*ret)
    h0_fmax = np.hstack((h0_fmax_parts))
    scores_as_ranks = np.zeros((n_regressors, n_descriptors))
    for scores_as_ranks_part in scores_as_ranks_parts:
        scores_as_ranks += scores_as_ranks_part
    # convert ranks into p-values
    pvals = (n_perm + 1 - scores_as_ranks) / float(1 + n_perm)

    return - np.log10(pvals), scores_original_data, h0_fmax[0]
コード例 #54
0
def wheel_up_features_bfs (initialBunch,
                           trees,
                           factory,
                           loss,
                           learning_rate,
                           nIters,
                           trees_sample_size,
                           verbose = True,
                           learning_rate_decay = 1.,
                           trees_sample_increase = 0,
                           regularizer = 0.,
                           random_walk = True,
                           use_joblib = False,
                           n_jobs = -1,
                           joblib_method = "threads",
                           copy_pred = False):
    """
    Iterative BFS over best ADD-1 results for [nTrees] iterations
    """
    allTrees = copy.copy(trees)
    
    bunch = copy.copy(initialBunch)
    pred = factory.predict(bunch)
    bestScore = sum(loss(factory,pred))
    
    if use_joblib:
        if n_jobs < 0:
            n_jobs = joblib.cpu_count()
                
        if joblib_method == "threads":
            #create copies of data once to escape GIL forever
            factory = [copy.deepcopy(factory) for i in range(n_jobs)]
            loss = [copy.deepcopy(loss) for i in range(n_jobs)]

        elif joblib_method == "processes":
            pass
        else:
            raise ValueError, "joblib_method must be either 'threads' or 'processes'"
    
  
    if verbose:
        print "\niteration #",0," ntrees = ", len(bunch),"\nbest loss = ",bestScore
        print "learning_rate = ", learning_rate
        print "sample_size", trees_sample_size

    
    for itr in xrange(1,nIters+1):
        change_index = random.randint(0,len(bunch)-1) if random_walk else (itr-1)%len(bunch)
        trees_sample = random.sample(allTrees,trees_sample_size)+ [bunch[change_index]]
        bunch_wo = copy.copy(bunch)
        bunch_wo.pop(change_index)

        if use_joblib and joblib_method=="threads":
            #split trees into sections
            indices = [0]+[len(trees_sample)*(i+1)/n_jobs for i in range(n_jobs)]
            treeSections = [trees_sample[indices[i]:indices[i+1]] for i in range(n_jobs)]
            
            pred_wo = pred - factory[0].predict([bunch[change_index]])

            if copy_pred:
                pred_wo = [copy.deepcopy(pred_wo) for i in range(n_jobs)]
            else:
                pred_wo = [pred_wo for i in range(n_jobs)]

            #execute sections in parallel
            tasks = [joblib.delayed(try_add1_bfs)(treeSections[ithread],factory[ithread],
                                                          learning_rate,loss[ithread],
                                                          1,pred_wo[ithread],regularizer=regularizer,
                                                          use_joblib=False)
                                                for ithread in range(n_jobs)]
                                                    
            _res = joblib.Parallel(n_jobs = n_jobs,
                           backend = "threading")(tasks)
            _additions,newScores,newPreds = reduce(lambda a,b:[a[i]+b[i] for i in range(3)], _res)
            
        else:
            pred_wo = pred - factory.predict([bunch[change_index]])

            _additions,newScores,newPreds = try_add1_bfs(trees_sample,factory,
                                                         learning_rate,loss,
                                                          1,pred_wo,regularizer=regularizer,
                                                          use_joblib=use_joblib,n_jobs=n_jobs)
        newBunches = [bunch_wo+[_added] for _added in _additions]
        

        
        learning_rate *= learning_rate_decay
        trees_sample_size = min(len(allTrees),trees_sample_size + trees_sample_increase)
            
        triples = zip(newScores,newBunches,newPreds)
        triples.sort(key = lambda el: el[0])

        newBestScore = min(newScores)
        
        if newBestScore > bestScore:
            pass
        else: 
            bestScore = newBestScore
            bunch = triples[0][1]
            bunch.insert(change_index,bunch.pop())
            pred = triples[0][2]

        
        
        if verbose:
            print "\niteration #",itr," ntrees = ", len(bunch),"\nbest loss = ", bestScore,"\nlast loss = ",newBestScore
            print "changed index",change_index
            print "learning_rate = ", learning_rate
            print "sample_size", trees_sample_size          
    return bunch
コード例 #55
0
ファイル: textclass.py プロジェクト: fnl/libfnl
    def __init__(self, *files, columns=None, ngrams=2, decap=False, patterns=None, mask=None):
        """
        Create a new data object with the following attributes:

            * instances - list of raw text instances
            * labels - array of instance labels in same order as raw text
            * features - matrix of feature vectors per text instance
            * names - array of feature names in same order as features

        Both features and names are undefined until extracted
        using some Vectorizer.

        Exclusive options for either BIO-NER vs. plain-text input:

        1. **BIO-NER** parameters: Pass a `columns` integer giving the number of disregarded
           columns, thereby declaring that the input is in BIO-NER format. In addition, the
           `ngrams` option can be set to define the n-gram size of the tokens to generate.
           All other keyword parameters will be ignored.

        2. **plain-text** keyword parameters: Set `decap=True` to lower-case the first letter of
           each plain-text line. Use a list of regex `patterns` and a replacement string `mask` to
           "mask" pattern-matched words in regular (non-`columns`) input.
        """
        try:
            if columns is None:
                inputs = [[l.strip('\r\n') for l in f] for f in files]

                if decap:
                    for i in range(len(inputs)):
                        inputs[i] = ["{}{}".format(l[0].lower(), l[1:])
                                     for l in inputs[i] if len(l)]

                if patterns and mask:
                    self.instances = []
                    splits = joblib.cpu_count()

                    for lines in inputs:
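                        # split the lines round-robin into one chunk per CPU
                        # and mask the patterns in each chunk in parallel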
                        jobs = tuple(lines[i::splits] for i in range(splits))
                        jobs = joblib.Parallel(n_jobs=splits)(
                            delayed(subAll)(patterns, mask, lines) for lines in jobs
                        )
                        self.instances.append(list(zip(lines, chain(*jobs))))
                else:
                    self.instances = [list(zip(lines, lines)) for lines in inputs]

            else:
                self.instances = []

                for f in files:
                    # FIXME: instead of two hardcoded entity masks,
                    # FIXME: this has to be dynamic or generic...
                    sentences = SentenceParser(f, ('FACTOR', 'TARGET'), id_columns=columns)

                    if not columns:
                        sentences = list(enumerate(sentences, start=1))

                    data = [(sid, asDict(s, ngrams)) for sid, s in sentences]
                    self.instances.append(data)
        except UnicodeDecodeError as e:
            import sys
            print('decoding error:', e.reason, 'in input file')
            sys.exit(1)

        # ensure the minority label(s) come first (important for the evaluation, too!)
        self.instances = sorted(self.instances, key=len)

        self.classes = len(self.instances)
        self.labels = np.concatenate([
            (np.zeros(len(data), dtype=np.uint8) + i)
            for i, data in enumerate(self.instances)
        ])
        self.ids = None
        self.raw = None
        self.features = None
        self.names = None

        if columns is None:
            self.raw, self.instances = zip(*list(chain.from_iterable(self.instances)))

            if len(self.raw) and '\t' in self.raw[0]:
                self.ids = [l.split('\t', 1)[0] for l in self.raw]
            else:
                self.ids = self.raw
        else:
            self.ids, self.instances = zip(*list(chain.from_iterable(self.instances)))
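
In the masking branch above, each file's lines are sliced round-robin (lines[i::splits]), every slice is run through the substitution helper in parallel, and the results are stitched back together. A rough, self-contained sketch of the same split-and-dispatch idea, using a hypothetical sub_all helper in place of the subAll function referenced above (its definition is not part of this snippet); the sketch also re-interleaves the chunks so each masked line ends up next to its original:

import re

import joblib
from joblib import delayed

def sub_all(patterns, mask, lines):
    # hypothetical stand-in for subAll: apply every pattern to every line
    masked = []
    for line in lines:
        for pattern in patterns:
            line = pattern.sub(mask, line)
        masked.append(line)
    return masked

lines = ["call 555-1234 now", "no digits here", "room 42"]
patterns = [re.compile(r"\d+")]
mask = "<NUM>"
splits = joblib.cpu_count()

# round-robin split: chunk i holds lines i, i+splits, i+2*splits, ...
chunks = tuple(lines[i::splits] for i in range(splits))
jobs = joblib.Parallel(n_jobs=splits)(
    delayed(sub_all)(patterns, mask, chunk) for chunk in chunks
)
# undo the round-robin split so the masked lines line up with the originals
masked = [None] * len(lines)
for i, chunk in enumerate(jobs):
    masked[i::splits] = chunk
print(list(zip(lines, masked)))
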
Code example #56
0
File: searchcv.py Project: MechCoder/scikit-optimize
    def step(self, X, y, space_id, groups=None, n_jobs=1):
        """Generate n_jobs parameters and evaluate them in parallel.

        Having a separate function for a single search step makes it easy to
        checkpoint the parameter search and to restore it after possible
        failures.

        Parameters
        ----------
        X : array-like or sparse matrix, shape = [n_samples, n_features]
            The training input samples. Internally, it will be converted to
            ``dtype=np.float32`` and if a sparse matrix is provided
            to a sparse ``csc_matrix``.

        y : array-like, shape = [n_samples] or [n_samples, n_outputs]
            The target values (class labels) as integers or strings.

        space_id : hashable
            Identifier of the parameter search space; it must already be
            registered in ``search_spaces_``.

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.

        n_jobs : int, default=1
            Number of parameters to evaluate in parallel.

        Returns
        -------
        None. The evaluated parameter settings and their scores are
        accumulated in ``cv_results_``.
        """

        # convert n_jobs to an int > 0 if necessary
        if n_jobs < 0:
            n_jobs = max(1, cpu_count() + n_jobs + 1)

        # use the cached optimizer for particular parameter space
        if space_id not in self.search_spaces_:
            raise ValueError("Unknown space %s" % space_id)

        # get the search space for a step
        search_space = self.search_spaces_[space_id]
        if isinstance(search_space, tuple):
            search_space, _ = search_space

        # create optimizer if not created already
        if space_id not in self.optimizer_:
            self.optimizer_[space_id] = self._make_optimizer(search_space)
        optimizer = self.optimizer_[space_id]

        # get parameter values to evaluate
        params = optimizer.ask(n_points=n_jobs)
        params_dict = [point_asdict(search_space, p) for p in params]

        # self.cv_results_ is reset at every call to _fit, keep current
        all_cv_results = self.cv_results_

        # record performances with different points
        refit = self.refit
        self.refit = False  # do not fit yet - will be fit later

        # this adds compatibility with different versions of sklearn

        self._fit(X, y, groups, params_dict)

        self.refit = refit

        # merge existing and new cv_results_
        for k in self.cv_results_:
            all_cv_results[k].extend(self.cv_results_[k])

        self.cv_results_ = all_cv_results
        self.best_index_ = np.argmax(self.cv_results_['mean_test_score'])

        # feed the point and objective back into optimizer
        local_results = self.cv_results_['mean_test_score'][-len(params):]

        # optimizer minimizes objective, hence provide negative score
        optimizer.tell(params, [-score for score in local_results])

        # fit the best model if necessary
        if self.refit:
            self._fit_best_model(X, y)
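
Stripped of the cross-validation machinery, step is one round of the ask/tell protocol: the optimizer proposes n_jobs candidate points, they are evaluated, and the negated scores are fed back so the minimizing surrogate model improves. A compact sketch of that loop with a plain skopt.Optimizer (assuming scikit-optimize is installed; the two Real dimensions and the toy objective are made up for illustration):

import numpy as np
from skopt import Optimizer
from skopt.space import Real

def objective(params):
    # toy score to maximize (stand-in for a cross-validated mean_test_score)
    x, y = params
    return -(x - 0.3) ** 2 - (y - 0.7) ** 2

opt = Optimizer(dimensions=[Real(0.0, 1.0), Real(0.0, 1.0)])

for _ in range(5):
    # ask for a batch of candidate points (one per parallel job)
    candidates = opt.ask(n_points=4)
    scores = [objective(c) for c in candidates]
    # the optimizer minimizes, so feed back the negated scores
    opt.tell(candidates, [-s for s in scores])

best = int(np.argmin(opt.yi))
print("best params:", opt.Xi[best], "best score:", -opt.yi[best])
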
Code example #57
0
def dict_learning_online(X, n_components=2, alpha=1, n_iter=100,
                         return_code=True, dict_init=None, callback=None,
                         batch_size=3, verbose=False, shuffle=True, n_jobs=1,
                         method='lars', iter_offset=0, random_state=None,
                         return_inner_stats=False, inner_stats=None,
                         return_n_iter=False):
    """Solves a dictionary learning matrix factorization problem online.

    Finds the best dictionary and the corresponding sparse code for
    approximating the data matrix X by solving::

        (U^*, V^*) = argmin 0.5 || X - U V ||_2^2 + alpha * || U ||_1
                     (U,V)
                     with || V_k ||_2 = 1 for all  0 <= k < n_components

    where V is the dictionary and U is the sparse code. This is
    accomplished by repeatedly iterating over mini-batches by slicing
    the input data.

    Read more in the :ref:`User Guide <DictionaryLearning>`.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        Data matrix.

    n_components : int,
        Number of dictionary atoms to extract.

    alpha : float,
        Sparsity controlling parameter.

    n_iter : int,
        Number of iterations to perform.

    return_code : boolean,
        Whether to also return the code U or just the dictionary V.

    dict_init : array of shape (n_components, n_features),
        Initial value for the dictionary for warm restart scenarios.

    callback : callable or None, optional (default: None)
        callable that gets invoked every five iterations

    batch_size : int,
        The number of samples to take in each batch.

    verbose : bool, optional (default: False)
        To control the verbosity of the procedure.

    shuffle : boolean,
        Whether to shuffle the data before splitting it in batches.

    n_jobs : int,
        Number of parallel jobs to run, or -1 to autodetect.

    method : {'lars', 'cd'}
        lars: uses the least angle regression method to solve the lasso problem
        (linear_model.lars_path)
        cd: uses the coordinate descent method to compute the
        Lasso solution (linear_model.Lasso). Lars will be faster if
        the estimated components are sparse.

    iter_offset : int, default 0
        Number of previous iterations completed on the dictionary used for
        initialization.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    return_inner_stats : boolean, optional
        Return the inner statistics A (dictionary covariance) and B
        (data approximation). Useful to restart the algorithm in an
        online setting. If return_inner_stats is True, return_code is
        ignored

    inner_stats : tuple of (A, B) ndarrays
        Inner sufficient statistics that are kept by the algorithm.
        Passing them at initialization is useful in online settings, to
        avoid losing the history of the evolution.
        A (n_components, n_components) is the dictionary covariance matrix.
        B (n_features, n_components) is the data approximation matrix

    return_n_iter : bool
        Whether or not to return the number of iterations.

    Returns
    -------
    code : array of shape (n_samples, n_components),
        the sparse code (only returned if `return_code=True`)

    dictionary : array of shape (n_components, n_features),
        the solutions to the dictionary learning problem

    n_iter : int
        Number of iterations run. Returned only if `return_n_iter` is
        set to `True`.

    See also
    --------
    dict_learning
    DictionaryLearning
    MiniBatchDictionaryLearning
    SparsePCA
    MiniBatchSparsePCA

    """
    if n_components is None:
        n_components = X.shape[1]

    if method not in ('lars', 'cd'):
        raise ValueError('Coding method not supported as a fit algorithm.')
    method = 'lasso_' + method

    t0 = time.time()
    n_samples, n_features = X.shape
    # Avoid integer division problems
    alpha = float(alpha)
    random_state = check_random_state(random_state)

    if n_jobs == -1:
        n_jobs = cpu_count()

    # Init V with SVD of X
    if dict_init is not None:
        dictionary = dict_init
    else:
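        # NOTE: this variant deviates from stock scikit-learn: instead of
        # initializing the dictionary from a randomized SVD of X (commented
        # out below), it starts from a copy of the data matrix itself.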
        # _, S, dictionary = randomized_svd(X, n_components,
        #                                   random_state=random_state)
        # dictionary = S[:, np.newaxis] * dictionary
        print("init dictionary with shape:", X.shape)
        dictionary = np.array(X)
        

    r = len(dictionary)
    if n_components <= r:
        dictionary = dictionary[:n_components, :]
    else:
        dictionary = np.r_[dictionary,
                           np.zeros((n_components - r, dictionary.shape[1]))]

    if verbose == 1:
        print('init dic:', dictionary)
        print('[dict_learning]', end=' ')

    if shuffle:
        X_train = X.copy()
        random_state.shuffle(X_train)
    else:
        X_train = X

    dictionary = check_array(dictionary.T, order='F', dtype=np.float64,
                             copy=False)
    X_train = check_array(X_train, order='C', dtype=np.float64, copy=False)

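    # generate mini-batch slices and cycle over them indefinitely, so the
    # training loop below performs exactly n_iter updates regardless of how
    # many batches a single pass over the data contains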
    batches = gen_batches(n_samples, batch_size)
    batches = itertools.cycle(batches)

    # The covariance of the dictionary
    if inner_stats is None:
        A = np.zeros((n_components, n_components))
        # The data approximation
        B = np.zeros((n_features, n_components))
    else:
        A = inner_stats[0].copy()
        B = inner_stats[1].copy()

    # If n_iter is zero, we need to return zero.
    ii = iter_offset - 1

    for ii, batch in zip(range(iter_offset, iter_offset + n_iter), batches):
        this_X = X_train[batch]
        dt = (time.time() - t0)
        if verbose == 1:
            sys.stdout.write(".")
            sys.stdout.flush()
        elif verbose:
            if verbose > 10 or ii % ceil(100. / verbose) == 0:
                print("Iteration % 3i (elapsed time: % 3is, % 4.1fmn)"
                      % (ii, dt, dt / 60))

        this_code = sparse_encode(this_X, dictionary.T, algorithm=method,
                                  alpha=alpha, n_jobs=n_jobs).T

        # Update the auxiliary variables
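        # theta/beta implement the forgetting factor of online dictionary
        # learning: the running statistics A and B are scaled down by beta
        # before the current mini-batch's contribution is added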
        if ii < batch_size - 1:
            theta = float((ii + 1) * batch_size)
        else:
            theta = float(batch_size ** 2 + ii + 1 - batch_size)
        beta = (theta + 1 - batch_size) / (theta + 1)

        A *= beta
        A += np.dot(this_code, this_code.T)
        A += np.diag((np.abs(this_code)).ravel()) * 2 * alpha

        B *= beta
        B += np.dot(this_X.T, this_code.T)
        B += 2 * alpha * np.dot(this_X.T, np.abs(this_code.T))
        # Update dictionary
        dictionary = _update_dict(dictionary, B, A, verbose=verbose,
                                  random_state=random_state)
        # XXX: Can the residuals be of any use?
        # Maybe we need a stopping criteria based on the amount of
        # modification in the dictionary
        if callback is not None:
            callback(locals())

    if return_inner_stats:
        if return_n_iter:
            return dictionary.T, (A, B), ii - iter_offset + 1
        else:
            return dictionary.T, (A, B)
    if return_code:
        if verbose > 1:
            print('Learning code...', end=' ')
        elif verbose == 1:
            print('|', end=' ')
        code = sparse_encode(X, dictionary.T, algorithm=method, alpha=alpha,
                             n_jobs=n_jobs, check_input=False)
        if verbose > 1:
            dt = (time.time() - t0)
            print('done (total time: % 3is, % 4.1fmn)' % (dt, dt / 60))
        if return_n_iter:
            return code, dictionary.T, ii - iter_offset + 1
        else:
            return code, dictionary.T

    if return_n_iter:
        return dictionary.T, ii - iter_offset + 1
    else:
        return dictionary.T
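
A minimal usage sketch for this variant, assuming the scikit-learn helpers it relies on (sparse_encode, _update_dict, gen_batches, check_array and friends) are importable as in the original module; the hyper-parameter values are purely illustrative:

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(30, 8)             # 30 samples, 8 features

# one-sample mini-batches: the diagonal penalty added to A in this variant
# only matches A's (n_components, n_components) shape when batch_size == 1
code, dictionary = dict_learning_online(
    X, n_components=5, alpha=1.0, n_iter=50,
    batch_size=1, method='lars', random_state=0)

print(code.shape)        # (30, 5)
print(dictionary.shape)  # (5, 8)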