def check_n_jobs(n_jobs):
    """Check and adjust the number of CPUs that can work in parallel.

    Parameters
    ----------
    n_jobs : int,
        Number of parallel workers, specified according to joblib's
        conventions: a positive number is the requested number of CPUs,
        -1 means all CPUs, and a negative number -i means
        "all but (i - 1)" CPUs. 0 is not a valid value.

    Returns
    -------
    n_jobs : int,
        Actual number of CPUs that will be used according to their
        availability: a positive request is capped at
        ``joblib.cpu_count()`` and a negative request never resolves to
        less than 1.

    Raises
    ------
    ValueError
        If ``n_jobs == 0``.
    """
    if n_jobs == 0:  # invalid according to joblib's conventions
        raise ValueError(
            "'n_jobs == 0' is not a valid choice. "
            "Please provide a positive number of CPUs, or -1 "
            "for all CPUs, or a negative number (-i) for "
            "'all but (i-1)' CPUs (joblib conventions)."
        )

    available = joblib.cpu_count()
    if n_jobs < 0:
        # -1 -> all CPUs, -2 -> all but one, ...; never drop below one worker.
        return max(1, available + n_jobs + 1)
    # Never report more workers than there are CPUs.
    return min(n_jobs, available)
def check_n_jobs(n_jobs):
    """Check and adjust the number of CPUs that can work in parallel.

    Parameters
    ----------
    n_jobs : int,
        Number of parallel workers, specified according to joblib's
        conventions. A positive value requests that many CPUs, -1
        requests all CPUs and a negative value -i requests all the CPUs
        except (i - 1) of them. The value 0 is rejected.

    Returns
    -------
    n_jobs : int,
        Actual number of CPUs that will be used according to their
        availability.

    Raises
    ------
    ValueError
        If ``n_jobs`` is 0, which has no meaning in joblib's conventions.
    """
    if n_jobs == 0:  # invalid according to joblib's conventions
        raise ValueError("'n_jobs == 0' is not a valid choice. "
                         "Please provide a positive number of CPUs, or -1 "
                         "for all CPUs, or a negative number (-i) for "
                         "'all but (i-1)' CPUs (joblib conventions).")

    n_cpus = joblib.cpu_count()
    if n_jobs < 0:
        # Count back from the total number of CPUs, keeping at least one.
        n_jobs = max(1, n_cpus + n_jobs + 1)
    else:
        # A positive request cannot exceed what the machine offers.
        n_jobs = min(n_jobs, n_cpus)
    return n_jobs
def _split_scores_serial(factory, thresholds, formula, metric, min_events,
                         verbose):
    """Score each candidate split in turn; unusable splits keep an ``inf`` score."""
    scores = np.repeat(float("inf"), len(thresholds))
    # The total weight does not depend on the split, so compute it once.
    total_weight = np.sum(factory.weights) if min_events > 0 else None
    for i, (feature, cut, _) in enumerate(thresholds):
        predicate = factory.events[:, feature] > cut
        # A split that sends every event to the same side is useless.
        if np.all(predicate) or not np.any(predicate):
            continue
        if min_events > 0:
            # Reject splits that leave too little weight on either side.
            true_weight = np.sum(factory.weights[predicate])
            false_weight = total_weight - true_weight
            rejected = (true_weight < min_events
                        or false_weight < min_events)
            if verbose:
                print("t: %s f: %s %s" % (
                    true_weight, false_weight,
                    "discarded" if rejected else "passed"))
            if rejected:
                continue
        sub_factories = factory.split_by(predicate)
        scores[i] = metric(formula, *sub_factories)
    return scores


def get_split_scores(factory, thresholds, formula,
                     metric=None,  # e.g. usability entropy
                     use_joblib=False,
                     joblib_backend='threading',
                     n_jobs=-1,
                     min_events_fraction_leaf=0., verbose=False):
    """Compute a score for every candidate split of ``factory``.

    Parameters
    ----------
    factory : object
        Must expose ``events`` (indexed as ``events[:, feature]``),
        ``weights`` and ``split_by(predicate)``.
    thresholds : sequence of (feature, cut, _) triples
        Candidate splits; an event goes to the "true" side when
        ``events[:, feature] > cut``.
    formula : object
        Passed through unchanged as the first argument of ``metric``.
    metric : callable or None
        Called as ``metric(formula, *factory.split_by(predicate))``.
        Defaults to ``penalized_usability_entropy``.
    use_joblib : bool
        If true, the thresholds are cut into ``n_jobs`` contiguous sections
        that are scored through ``joblib.Parallel``.
    joblib_backend : str
        joblib backend name. With ``'threading'`` every worker receives its
        own deep copy of ``factory``, ``formula`` and ``metric``.
    n_jobs : int
        Number of workers; negative values follow the joblib convention
        (-1 means all CPUs).
    min_events_fraction_leaf : float
        Minimal weight required on each side of a split. Values ``<= 1``
        are treated as a fraction of the total weight, larger values as an
        absolute weight.
    verbose : bool
        Print diagnostic information.

    Returns
    -------
    scores : ndarray
        One score per threshold; ``inf`` for splits that were skipped.
    """
    if metric is None:
        metric = penalized_usability_entropy
    if min_events_fraction_leaf <= 1:
        # Turn the fraction into an absolute weight threshold.
        min_events_fraction_leaf = int(
            min_events_fraction_leaf * sum(factory.weights))
    if verbose:
        print("%s %s" % (min_events_fraction_leaf, sum(factory.weights)))

    if not use_joblib:
        return _split_scores_serial(factory, thresholds, formula, metric,
                                    min_events_fraction_leaf, verbose)

    if n_jobs < 0:
        # joblib convention: -1 -> all CPUs, -2 -> all but one, ...
        n_jobs = max(1, joblib.cpu_count() + 1 + n_jobs)
    # Integer bounds so they are valid slice indices on Python 2 and 3.
    bounds = [len(thresholds) * i // n_jobs for i in range(n_jobs + 1)]
    sections = [thresholds[bounds[i]:bounds[i + 1]] for i in range(n_jobs)]

    # Workers get the already-resolved absolute threshold, so it is never
    # re-interpreted as a fraction.
    worker = joblib.delayed(_split_scores_serial)
    if joblib_backend == 'threading':
        # Deep copies, in case the objects carry internal state.
        jobs = [worker(deepcopy(factory), section, deepcopy(formula),
                       deepcopy(metric), min_events_fraction_leaf, verbose)
                for section in sections]
    else:
        jobs = [worker(factory, section, formula, metric,
                       min_events_fraction_leaf, verbose)
                for section in sections]
    return np.hstack(
        joblib.Parallel(n_jobs=n_jobs, backend=joblib_backend)(jobs))
def _fit_multiclass_task(self, X, y, sample_weight, params):
    """Build one ``RGFExecuter`` per class and fit them one-vs-rest in parallel.

    For every class in ``self._classes`` a binary target ``(y == class)``
    is created and the matching estimator is fitted with
    ``utils.fit_ovr_binary`` through ``Parallel``. The fitted estimators
    replace ``self._estimators``.

    Parameters
    ----------
    X, y, sample_weight
        Training data, forwarded to ``utils.fit_ovr_binary``.
    params : dict
        Keyword arguments for ``RGFExecuter``. Must contain the key
        ``'init_model'``; when it is not None each class gets its own
        file name ``'<init_model>.<zero-padded class index>'``.
        The dict is not modified.
    """
    base_init_model = params['init_model']
    if base_init_model is not None:
        max_digits = len(str(len(self._classes)))
        init_model_filenames = [
            '{}.{}'.format(base_init_model, str(i + 1).zfill(max_digits))
            for i in range(self._n_classes)
        ]

    ovr_list = [None] * self._n_classes
    for i, cls_num in enumerate(self._classes):
        # Work on a copy so the caller's dict keeps its original init_model.
        estimator_params = dict(params)
        if base_init_model is not None:
            estimator_params['init_model'] = init_model_filenames[i]
        self._classes_map[i] = cls_num
        ovr_list[i] = (y == cls_num).astype(int)
        self._estimators[i] = RGFExecuter(**estimator_params)

    n_jobs = (self.n_jobs if self.n_jobs > 0
              else cpu_count() + self.n_jobs + 1)
    # There is one task per class, so no more workers than classes are
    # useful. (The previous max() made this notice unreachable.)
    substantial_n_jobs = min(n_jobs, self.n_classes_)
    if substantial_n_jobs < n_jobs and self.verbose:
        print('n_jobs = {0}, but RGFClassifier uses {1} CPUs because '
              'classes_ is {2}'.format(n_jobs, substantial_n_jobs,
                                       self.n_classes_))

    self._estimators = Parallel(n_jobs=self.n_jobs)(
        delayed(utils.fit_ovr_binary)(self._estimators[i],
                                      X,
                                      ovr_list[i],
                                      sample_weight)
        for i in range(self._n_classes))
def _parallel_learning(self, X, Y, w):
    """Run one pass of subgradient updates with parallel constraint search.

    The samples are cut into batches of roughly ``n_jobs`` elements. For
    each batch ``find_constraint`` is evaluated for every sample through
    ``Parallel``; the joint-feature differences of the samples with a
    positive slack are summed and handed to ``self._solve_subgradient``.

    Parameters
    ----------
    X, Y : sliceable sequences of equal length
        Training inputs and labels.
    w : object
        Current weights, forwarded to ``find_constraint`` and
        ``self._solve_subgradient``.

    Returns
    -------
    objective : number
        Sum of the positive slacks.
    positive_slacks : int
        Number of samples with a positive slack.
    w : object
        Weights returned by the last ``self._solve_subgradient`` call
        (the input ``w`` when ``X`` is empty).

    Raises
    ------
    ValueError
        If ``self.batch_size`` is not None.
    """
    n_samples = len(X)
    objective, positive_slacks = 0, 0
    verbose = max(0, self.verbose - 3)
    if self.batch_size is not None:
        raise ValueError("If n_jobs != 1, batch_size needs to be None")

    # generate batches of size n_jobs to speed up inference
    if self.n_jobs < 0:
        # joblib convention: -1 -> all CPUs, -2 -> all but one, ...
        n_jobs = max(cpu_count() + 1 + self.n_jobs, 1)
    else:
        n_jobs = self.n_jobs
    n_batches = int(np.ceil(float(len(X)) / n_jobs))

    for batch in gen_even_slices(n_samples, n_batches):
        X_b = X[batch]
        Y_b = Y[batch]
        candidate_constraints = Parallel(
            n_jobs=self.n_jobs, verbose=verbose)(
                delayed(find_constraint)(self.model, x, y, w)
                for x, y in zip(X_b, Y_b))
        djoint_feature = np.zeros(self.model.size_joint_feature)
        for constraint in candidate_constraints:
            _, delta_joint_feature, slack, _ = constraint
            # Only violated constraints contribute to the update.
            if slack > 0:
                objective += slack
                djoint_feature += delta_joint_feature
                positive_slacks += 1
        w = self._solve_subgradient(djoint_feature, n_samples, w)
    return objective, positive_slacks, w
def _fit_multiclass_task(self, X, y, sample_weight, params):
    """Fit the one-vs-rest binary estimators of a multiclass problem.

    One ``RGFExecuter`` is created per entry of ``self._classes`` together
    with the binary target ``(y == class)``; all of them are then fitted
    with ``utils.fit_ovr_binary`` through ``Parallel`` and stored back in
    ``self._estimators``.

    Parameters
    ----------
    X, y, sample_weight
        Training data, forwarded to ``utils.fit_ovr_binary``.
    params : dict
        Keyword arguments for ``RGFExecuter``; the key ``'init_model'`` is
        required. If its value is not None, every class receives the file
        name ``'<init_model>.<zero-padded class index>'`` instead.
        ``params`` itself is left untouched.
    """
    init_model = params['init_model']
    init_model_filenames = None
    if init_model is not None:
        max_digits = len(str(len(self._classes)))
        init_model_filenames = [
            '{}.{}'.format(init_model, str(i + 1).zfill(max_digits))
            for i in range(self._n_classes)
        ]

    ovr_list = [None] * self._n_classes
    for i, cls_num in enumerate(self._classes):
        # Per-class copy: the caller's dict must not end up holding the
        # file name of the last class.
        class_params = dict(params)
        if init_model_filenames is not None:
            class_params['init_model'] = init_model_filenames[i]
        self._classes_map[i] = cls_num
        ovr_list[i] = (y == cls_num).astype(int)
        self._estimators[i] = RGFExecuter(**class_params)

    if self.n_jobs > 0:
        n_jobs = self.n_jobs
    else:
        n_jobs = cpu_count() + self.n_jobs + 1
    # One task per class: extra workers stay idle. min() is required for
    # the notice below to ever be printed (max() could never be smaller
    # than n_jobs).
    substantial_n_jobs = min(n_jobs, self.n_classes_)
    if substantial_n_jobs < n_jobs and self.verbose:
        print('n_jobs = {0}, but RGFClassifier uses {1} CPUs because '
              'classes_ is {2}'.format(n_jobs, substantial_n_jobs,
                                       self.n_classes_))

    self._estimators = Parallel(n_jobs=self.n_jobs)(
        delayed(utils.fit_ovr_binary)(self._estimators[i], X, ovr_list[i],
                                      sample_weight)
        for i in range(self._n_classes))
def _parallel_learning(self, X, Y, w):
    """Run one pass of subgradient updates with parallel constraint search.

    Samples are processed in batches of roughly ``n_jobs`` elements:
    ``find_constraint`` is evaluated for each sample of a batch through
    ``Parallel``, the ``delta_psi`` of the samples with a positive slack
    are accumulated, and ``self._solve_subgradient`` is called once per
    batch.

    Parameters
    ----------
    X, Y : sliceable sequences of equal length
        Training inputs and labels.
    w : object
        Current weights, forwarded to ``find_constraint`` and
        ``self._solve_subgradient``.

    Returns
    -------
    objective : number
        Sum of the positive slacks.
    positive_slacks : int
        Number of samples with a positive slack.
    w : object
        Weights returned by the last ``self._solve_subgradient`` call
        (the input ``w`` when ``X`` is empty).

    Raises
    ------
    ValueError
        If ``self.batch_size`` is not None.
    """
    n_samples = len(X)
    objective, positive_slacks = 0, 0
    verbose = max(0, self.verbose - 3)
    if self.batch_size is not None:
        raise ValueError("If n_jobs != 1, batch_size needs to be None")

    # generate batches of size n_jobs to speed up inference
    n_jobs = self.n_jobs
    if n_jobs < 0:
        # joblib convention: -1 -> all CPUs, -2 -> all but one, ...
        n_jobs = max(cpu_count() + 1 + n_jobs, 1)
    n_batches = int(np.ceil(float(len(X)) / n_jobs))
    slices = gen_even_slices(n_samples, n_batches)

    for batch in slices:
        X_b = X[batch]
        Y_b = Y[batch]
        candidate_constraints = Parallel(n_jobs=self.n_jobs,
                                         verbose=verbose)(
            delayed(find_constraint)(self.model, x, y, w)
            for x, y in zip(X_b, Y_b))
        dpsi = np.zeros(self.model.size_psi)
        for constraint in candidate_constraints:
            _, delta_psi, slack, _ = constraint
            # Only violated constraints contribute to the update.
            if slack > 0:
                objective += slack
                dpsi += delta_psi
                positive_slacks += 1
        w = self._solve_subgradient(dpsi, n_samples, w)
    return objective, positive_slacks, w
def define_model(n_random_search: int = 100, n_jobs: int = None) -> Model:
    """Build a randomized hyper-parameter search over a random forest.

    Parameters
    ----------
    n_random_search : int
        Number of parameter settings sampled (``n_iter`` of
        ``RandomizedSearchCV``).
    n_jobs : int or None
        Number of parallel jobs. A falsy value (``None`` or 0) means
        ``joblib.cpu_count()``.

    Returns
    -------
    The unfitted ``RandomizedSearchCV`` instance (5-fold CV, verbose=1).
    """
    # TODO: needs refinements
    rf = RandomForestClassifier(random_state=None)
    # Extremely Randomized Trees (ExtraTreesClassifier) are an alternative,
    # but currently show no big difference in terms of performance.

    search_space = dict(
        n_estimators=list(range(8, 128, 4)),
        criterion=['gini', 'entropy'],
        # 'sqrt' is what 'auto' meant for classifiers; 'auto' itself was
        # deprecated in scikit-learn 1.1 and removed in 1.3.
        max_features=['sqrt', 'log2', 0.5, None],
        max_depth=list(range(8, 32, 1)),
    )

    ncores = n_jobs if n_jobs else joblib.cpu_count()

    return model_selection.RandomizedSearchCV(
        estimator=rf,
        param_distributions=search_space,
        n_iter=n_random_search,
        n_jobs=ncores,
        cv=5,
        verbose=1,
    )
def _get_n_jobs(n_jobs):
    """Translate a joblib-style ``n_jobs`` into an actual worker count.

    See sklearn/utils/__init__.py for more information.

    The logic joblib applies is reproduced here: a negative value counts
    back from the number of CPUs, so -1 selects every CPU, -2 every CPU
    but one, and in general ``n_cpus + 1 + n_jobs`` workers are used (at
    least one). A positive value is returned as is; 1 means that no
    parallel computing code is used at all, which is handy for debugging.

    Parameters
    ----------
    n_jobs : int
        Number of jobs stated in joblib convention.

    Returns
    -------
    n_jobs : int
        The actual number of jobs as positive integer.

    Raises
    ------
    ValueError
        If ``n_jobs`` is 0.

    Examples
    --------
    >>> from sklearn.utils import _get_n_jobs
    >>> _get_n_jobs(4)
    4
    >>> jobs = _get_n_jobs(-2)
    >>> assert jobs == max(cpu_count() - 1, 1)
    >>> _get_n_jobs(0)
    Traceback (most recent call last):
    ...
    ValueError: Parameter n_jobs == 0 has no meaning.
    """
    if n_jobs < 0:
        # Count back from the CPU total, never going below one worker.
        workers = cpu_count() + 1 + n_jobs
        return max(workers, 1)

    if n_jobs == 0:
        raise ValueError('Parameter n_jobs == 0 has no meaning.')

    # Positive requests are honoured unchanged.
    return n_jobs
def compute_partition(self, nb_tasks, data_size):
    """
    Compute data partitioning for parallel computation.

    The data is cut into contiguous chunks whose sizes differ by at most
    one; the larger chunks come first.

    Parameters
    ----------
    nb_tasks : int
        If >0 : the parallelization factor.
        If <0 : nb_tasks = #cpu+nb_tasks+1 (-1 -> nb_tasks = #cpu),
        with a minimum of 1.
        If 0 : treated as 1.
    data_size : int > 0
        The size of the data to process

    Return
    ------
    pair = (nb_tasks, starts)
    nb_tasks : int
        The final parallelization factor: the requested (or CPU-derived)
        value capped at `data_size`.
    starts : list of int
        The start indexes of the data for each parallel task, followed by
        `data_size` (so the list has `nb_tasks + 1` entries and task i
        handles `starts[i]:starts[i + 1]`).
    """
    # Compute the actual number of core to use
    if nb_tasks < 0:
        cpu = max(cpu_count() + nb_tasks + 1, 1)
        nb_tasks = min(cpu, data_size)
    else:
        nb_tasks = min(nb_tasks if nb_tasks != 0 else 1, data_size)

    # Every task gets `increment` elements; the first `gap` tasks take one
    # leftover element each, which shifts start i by min(i, gap).
    increment, gap = divmod(data_size, nb_tasks)
    starts = [i * increment + min(i, gap) for i in range(nb_tasks + 1)]

    return nb_tasks, starts
def _fit(self, X, y):
    """Select features with a genetic algorithm and fit the final estimator.

    Each individual is a 0/1 mask over the features. Individuals are
    evaluated with ``_evalFunction`` and evolved with
    ``algorithms.eaSimple``; the best mask found becomes ``support_`` and
    a clone of ``self.estimator`` is fitted on the selected columns.

    Sets ``estimator_``, ``n_features_`` and ``support_``.

    Raises
    ------
    ValueError
        If ``self.n_jobs`` is 0.

    Returns
    -------
    self
    """
    X, y = check_X_y(X, y, "csr")
    # Initialization
    cv = check_cv(self.cv, y, is_classifier(self.estimator))
    scorer = check_scoring(self.estimator, scoring=self.scoring)
    n_features = X.shape[1]

    estimator = clone(self.estimator)

    # Genetic Algorithm
    toolbox = base.Toolbox()

    toolbox.register("attr_bool", random.randint, 0, 1)
    toolbox.register("individual", tools.initRepeat, creator.Individual,
                     toolbox.attr_bool, n=n_features)
    toolbox.register("population", tools.initRepeat, list,
                     toolbox.individual)
    toolbox.register("evaluate", _evalFunction, gaobject=self,
                     estimator=estimator, X=X, y=y, cv=cv, scorer=scorer,
                     verbose=self.verbose, fit_params=self.fit_params,
                     caching=self.caching)
    toolbox.register("mate", tools.cxUniform,
                     indpb=self.crossover_independent_proba)
    toolbox.register("mutate", tools.mutFlipBit,
                     indpb=self.mutation_independent_proba)
    toolbox.register("select", tools.selTournament,
                     tournsize=self.tournament_size)

    # n_jobs == 0 used to run the whole search and then fail with a
    # NameError on the never-created pool; reject it up front instead.
    if self.n_jobs == 0:
        raise ValueError("n_jobs == 0 has no meaning.")

    pool = None
    if self.n_jobs > 1:
        pool = multiprocessing.Pool(processes=self.n_jobs)
    elif self.n_jobs < 0:
        pool = multiprocessing.Pool(
            processes=max(cpu_count() + 1 + self.n_jobs, 1))

    try:
        if pool is not None:
            toolbox.register("map", pool.map)

        pop = toolbox.population(n=self.n_population)
        hof = tools.HallOfFame(1, similar=np.array_equal)
        stats = tools.Statistics(lambda ind: ind.fitness.values)
        stats.register("avg", np.mean, axis=0)
        stats.register("std", np.std, axis=0)
        stats.register("min", np.min, axis=0)
        stats.register("max", np.max, axis=0)

        if self.verbose > 0:
            print("Selecting features with genetic algorithm.")

        algorithms.eaSimple(pop, toolbox, cxpb=self.crossover_proba,
                            mutpb=self.mutation_proba,
                            ngen=self.n_generations, stats=stats,
                            halloffame=hof, verbose=self.verbose)
    finally:
        # Release the worker processes even if the search raised.
        if pool is not None:
            pool.close()
            pool.join()

    # Set final attributes
    # (builtin bool: the np.bool alias was removed from NumPy)
    support_ = np.array(hof, dtype=bool)[0]
    self.estimator_ = clone(self.estimator)
    self.estimator_.fit(X[:, support_], y)

    self.n_features_ = support_.sum()
    self.support_ = support_

    return self
def test_multi_output_classification_partial_fit_parallelism():
    """Two successive parallel ``partial_fit`` calls must not reuse estimator 0.

    The check only runs on machines reporting more than one CPU.
    """
    base_clf = SGDClassifier(loss='log', random_state=1, max_iter=5)
    multi_clf = MultiOutputClassifier(base_clf, n_jobs=-1)

    multi_clf.partial_fit(X, y, classes)
    first_estimator = multi_clf.estimators_[0]

    multi_clf.partial_fit(X, y)
    second_estimator = multi_clf.estimators_[0]

    if cpu_count() <= 1:
        return

    # parallelism requires this to be the case for a sane implementation
    assert_false(first_estimator is second_estimator)
def fit(self, X, y=None, groups=None):
    """Run fit on the estimator with randomly drawn parameters.

    For every registered search space, ``self.step`` is called repeatedly
    until the requested number of iterations for that space is used up.
    Each call evaluates at most ``n_jobs`` points.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vector, where n_samples in the number of samples and
        n_features is the number of features.

    y : array-like, shape = [n_samples] or [n_samples, n_output]
        Target relative to X for classification or regression;

    groups : array-like, with shape (n_samples,), optional
        Group labels for the samples used while splitting the dataset into
        train/test set.

    Raises
    ------
    ValueError
        If no search space was added, or if ``self.n_jobs`` is 0.
    """
    # check if the list of parameter spaces is provided. If not, then
    # only step in manual mode can be used.
    if not self.search_spaces_:
        raise ValueError(
            "Please provide search space using `add_spaces` first before "
            "calling fit method.")

    n_jobs = self.n_jobs

    # account for case n_jobs < 0
    if n_jobs < 0:
        n_jobs = max(1, cpu_count() + n_jobs + 1)
    elif n_jobs == 0:
        # With 0 the loop below would never make progress.
        raise ValueError("n_jobs == 0 has no meaning.")

    for space_id in sorted(self.search_spaces_.keys()):
        elem = self.search_spaces_[space_id]

        # if not provided with search subspace, n_iter is taken as
        # self.n_iter
        if isinstance(elem, tuple):
            _, n_iter = elem
        else:
            n_iter = self.n_iter

        # do the optimization for particular search space
        while n_iter > 0:
            # when n_iter < n_jobs points left for evaluation.
            # Use the resolved n_jobs: the raw self.n_jobs may be negative.
            n_jobs_adjusted = min(n_iter, n_jobs)

            self.step(X, y, space_id, groups=groups,
                      n_jobs=n_jobs_adjusted)
            n_iter -= n_jobs
def fit(self, X, y=None, groups=None):
    """Run fit on the estimator with randomly drawn parameters.

    Each registered search space is optimized in turn by calling
    ``self.step`` until its iteration budget is exhausted; a single call
    evaluates at most ``n_jobs`` points.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vector, where n_samples in the number of samples and
        n_features is the number of features.

    y : array-like, shape = [n_samples] or [n_samples, n_output]
        Target relative to X for classification or regression;

    groups : array-like, with shape (n_samples,), optional
        Group labels for the samples used while splitting the dataset into
        train/test set.

    Raises
    ------
    ValueError
        If no search space was added, or if ``self.n_jobs`` is 0.
    """
    # check if the list of parameter spaces is provided. If not, then
    # only step in manual mode can be used.
    if not self.search_spaces_:
        raise ValueError(
            "Please provide search space using `add_spaces` first before "
            "calling fit method."
        )

    # account for case n_jobs < 0
    n_jobs = self.n_jobs
    if n_jobs == 0:
        # A zero budget per step would make the loop below endless.
        raise ValueError("n_jobs == 0 has no meaning.")
    if n_jobs < 0:
        n_jobs = max(1, cpu_count() + n_jobs + 1)

    for space_id in sorted(self.search_spaces_.keys()):
        elem = self.search_spaces_[space_id]

        # if not provided with search subspace, n_iter is taken as
        # self.n_iter
        n_iter = elem[1] if isinstance(elem, tuple) else self.n_iter

        # do the optimization for particular search space
        while n_iter > 0:
            # when n_iter < n_jobs points left for evaluation; the
            # resolved n_jobs is used because self.n_jobs may be negative
            n_jobs_adjusted = min(n_iter, n_jobs)

            self.step(
                X, y, space_id,
                groups=groups, n_jobs=n_jobs_adjusted
            )
            n_iter -= n_jobs
def _partition_X(X, n_jobs):
    """Private function used to partition the columns of X between jobs.

    Parameters
    ----------
    X : object with a ``shape`` attribute
        Only ``X.shape[1]`` (the number of columns to distribute) is used.
    n_jobs : int
        Requested number of jobs; negative values follow the joblib
        convention (-1 means all CPUs). Capped at the number of columns.

    Returns
    -------
    n_jobs : int
        The number of jobs actually used.
    starts : list of int
        ``n_jobs + 1`` boundaries; job i handles columns
        ``starts[i]:starts[i + 1]``. Chunk sizes differ by at most one.
    """
    n_nodes = X.shape[1]

    # Compute the number of jobs
    if n_jobs < 0:
        n_jobs = max(cpu_count() + 1 + n_jobs, 1)
    n_jobs = min(n_jobs, n_nodes)

    # Partition columns between jobs
    # (builtin int: the np.int alias was removed from NumPy)
    n_node_per_job = np.full(n_jobs, n_nodes // n_jobs, dtype=int)
    n_node_per_job[:n_nodes % n_jobs] += 1
    starts = np.cumsum(n_node_per_job)

    return n_jobs, [0] + starts.tolist()
def try_add1_bfs(allTrees,
                 factory,
                 learning_rate,
                 loss,
                 breadth,
                 y_pred,
                 regularizer=0.,
                 use_joblib=False,
                 n_jobs=-1):
    """Select the best trees to add (1 step).

    Every tree in ``allTrees`` is evaluated with ``_try_add`` (or, with
    ``use_joblib``, in sections through ``_inthread_try_add`` on the
    joblib "multiprocessing" backend). The resulting triples are sorted by
    their first element and the ``breadth`` smallest are returned.

    Parameters
    ----------
    allTrees : sequence
        Candidate trees.
    factory : BinaryClassificationFactory or RegressionFactory
        Exactly one of these two classes (subclasses are rejected).
    learning_rate, loss, y_pred, regularizer
        Forwarded to the evaluation helpers.
    breadth : int
        Number of best candidates to keep.
    use_joblib : bool
        Evaluate the trees in parallel.
    n_jobs : int
        Number of workers; negative values follow the joblib convention
        (-1 means all CPUs).

    Returns
    -------
    tuple of three lists
        Elements 1, 0 and 2 (in that order) of the ``breadth`` best
        triples.

    Raises
    ------
    Exception
        If ``factory`` is of an unsupported type.
    """
    if factory.__class__ is BinaryClassificationFactory:
        margin = factory.labels_sign * y_pred
    elif factory.__class__ is RegressionFactory:
        margin = factory.labels - y_pred
    else:
        raise Exception("Factory type not supported")

    if use_joblib:
        if n_jobs < 0:
            # joblib convention: -1 -> all CPUs, -2 -> all but one, ...
            n_jobs = max(1, joblib.cpu_count() + 1 + n_jobs)

        # Integer bounds so they are valid slice indices on Python 2 and 3.
        indices = [len(allTrees) * i // n_jobs for i in range(n_jobs + 1)]
        treeSections = [
            allTrees[indices[i]:indices[i + 1]] for i in range(n_jobs)
        ]

        tasks = [
            joblib.delayed(_inthread_try_add)(treeSection, factory, loss,
                                              margin, y_pred, learning_rate,
                                              regularizer)
            for treeSection in treeSections
        ]
        _res = joblib.Parallel(n_jobs=n_jobs,
                               backend="multiprocessing")(tasks)
        # Flatten the per-section results into one list.
        triples = [triple for section in _res for triple in section]
    else:
        triples = [
            _try_add(tree, factory, loss, margin, y_pred, learning_rate,
                     regularizer) for tree in allTrees
        ]

    triples.sort(key=lambda el: el[0])
    best = triples[:breadth]

    return ([triple[1] for triple in best],
            [triple[0] for triple in best],
            [triple[2] for triple in best])
def __init__(self, dataset_config, anomaly_map, feeder_df, n_jobs=1):
    """Store the configuration and work out which feeders to ignore.

    Parameters
    ----------
    dataset_config, anomaly_map
        Stored unchanged on the instance.
    feeder_df : DataFrame
        Must provide the columns ``CUSTOMERS``, ``FDR_OH`` and ``FDR_UG``.
    n_jobs : int
        Number of jobs in joblib convention: a negative value resolves to
        ``cpu_count() + 1 + n_jobs`` (at least 1).

    Raises
    ------
    ValueError
        If ``n_jobs`` is 0.
    """
    self.dataset_config = dataset_config
    self.anomaly_map = anomaly_map
    self.feeder_df = feeder_df

    if n_jobs < 0:
        n_jobs = max(cpu_count() + 1 + n_jobs, 1)
    elif n_jobs == 0:
        raise ValueError('Parameter n_jobs == 0 has no meaning.')
    self.n_jobs = n_jobs

    # Feeders with fewer than 100 customers ...
    too_few_customers = feeder_df.CUSTOMERS < 100
    # ... and feeders whose FDR_OH and FDR_UG are both zero are ignored.
    no_length = (feeder_df.FDR_OH == 0) & (feeder_df.FDR_UG == 0)

    ignored = set(feeder_df.loc[too_few_customers].index.values)
    ignored.update(feeder_df.loc[no_length].index.values)
    self.feeder_ignore = ignored
def _partition_estimators(n_estimators, n_jobs):
    """Private function used to partition estimators between jobs.

    Parameters
    ----------
    n_estimators : int
        Number of estimators to distribute.
    n_jobs : int
        Requested number of jobs; negative values follow the joblib
        convention (-1 means all CPUs). Capped at ``n_estimators``.

    Returns
    -------
    n_jobs : int
        The number of jobs actually used.
    n_estimators_per_job : list of int
        Number of estimators handled by each job (sizes differ by at most
        one, larger ones first).
    starts : list of int
        ``n_jobs + 1`` boundaries; job i handles estimators
        ``starts[i]:starts[i + 1]``.
    """
    # Compute the number of jobs
    if n_jobs < 0:
        n_jobs = max(cpu_count() + 1 + n_jobs, 1)
    n_jobs = min(n_jobs, n_estimators)

    # Partition estimators between jobs
    # (builtin int: the np.int alias was removed from NumPy)
    n_estimators_per_job = np.full(n_jobs, n_estimators // n_jobs, dtype=int)
    n_estimators_per_job[:n_estimators % n_jobs] += 1
    starts = np.cumsum(n_estimators_per_job)

    return n_jobs, n_estimators_per_job.tolist(), [0] + starts.tolist()
def _partition_estimators(ensemble):
    """Private function used to partition estimators between jobs.

    Parameters
    ----------
    ensemble : object
        Must expose ``n_jobs`` and ``n_estimators``. A negative ``n_jobs``
        follows the joblib convention (-1 means all CPUs); the number of
        jobs is capped at ``n_estimators``.

    Returns
    -------
    n_jobs : int
        The number of jobs actually used.
    n_estimators : list of int
        Number of estimators handled by each job (sizes differ by at most
        one, larger ones first).
    starts : list of int
        ``n_jobs + 1`` boundaries; job i handles estimators
        ``starts[i]:starts[i + 1]``.
    """
    total = ensemble.n_estimators

    # Compute the number of jobs
    n_jobs = ensemble.n_jobs
    if n_jobs < 0:
        n_jobs = max(cpu_count() + 1 + n_jobs, 1)
    n_jobs = min(n_jobs, total)

    # Partition estimators between jobs
    # (builtin int: the np.int alias was removed from NumPy)
    n_estimators = (total // n_jobs) * np.ones(n_jobs, dtype=int)
    n_estimators[:total % n_jobs] += 1
    starts = np.cumsum(n_estimators)

    return n_jobs, n_estimators.tolist(), [0] + starts.tolist()
def try_add1_bfs(allTrees, factory, learning_rate,
                 loss, breadth, y_pred, regularizer=0.,
                 use_joblib=False, n_jobs=-1):
    """Select the best trees to add (1 step).

    All trees in ``allTrees`` are scored, either one by one with
    ``_try_add`` or, when ``use_joblib`` is set, section by section with
    ``_inthread_try_add`` on the joblib "multiprocessing" backend. The
    score triples are sorted by their first element and only the
    ``breadth`` smallest are kept.

    Parameters
    ----------
    allTrees : sequence
        Candidate trees.
    factory : BinaryClassificationFactory or RegressionFactory
        Exactly one of these two classes (subclasses are rejected).
    learning_rate, loss, y_pred, regularizer
        Forwarded to the evaluation helpers.
    breadth : int
        Number of best candidates to keep.
    use_joblib : bool
        Evaluate the trees in parallel.
    n_jobs : int
        Number of workers; negative values follow the joblib convention
        (-1 means all CPUs).

    Returns
    -------
    tuple of three lists
        Elements 1, 0 and 2 (in that order) of the ``breadth`` best
        triples.

    Raises
    ------
    Exception
        If ``factory`` is of an unsupported type.
    """
    if factory.__class__ is BinaryClassificationFactory:
        y_sign = factory.labels_sign
        margin = y_sign * y_pred
    elif factory.__class__ is RegressionFactory:
        margin = factory.labels - y_pred
    else:
        raise Exception("Factory type not supported")

    if not use_joblib:
        triples = [_try_add(tree, factory, loss, margin, y_pred,
                            learning_rate, regularizer)
                   for tree in allTrees]
    else:
        if n_jobs < 0:
            # joblib convention: -1 -> all CPUs, -2 -> all but one, ...
            n_jobs = max(1, joblib.cpu_count() + 1 + n_jobs)

        # Floor division keeps the slice bounds integral on Python 3 too.
        indices = [0] + [len(allTrees) * (i + 1) // n_jobs
                         for i in range(n_jobs)]
        treeSections = [allTrees[indices[i]:indices[i + 1]]
                        for i in range(n_jobs)]

        tasks = [joblib.delayed(_inthread_try_add)(
                     treeSection, factory, loss, margin, y_pred,
                     learning_rate, regularizer)
                 for treeSection in treeSections]
        _res = joblib.Parallel(n_jobs=n_jobs,
                               backend="multiprocessing")(tasks)
        # Concatenate the per-section lists (no reliance on the Python 2
        # builtin reduce).
        triples = []
        for section_triples in _res:
            triples.extend(section_triples)

    triples.sort(key=lambda el: el[0])
    kept = triples[:breadth]

    return ([triple[1] for triple in kept],
            [triple[0] for triple in kept],
            [triple[2] for triple in kept])
def computePartition(self, nbTasks, dataSize):
    """
    Compute data partitioning for parallel computation.

    The data is cut into contiguous chunks whose sizes differ by at most
    one; the larger chunks come first.

    Parameters
    ----------
    nbTasks : int
        If >0 : the parallelization factor.
        If <0 : nbTasks = #cpu+nbTasks+1 (-1 -> nbTasks = #cpu),
        with a minimum of 1.
        If 0 : treated as 1.
    dataSize : int > 0
        The size of the data to process

    Return
    ------
    triplet = (nbTasks, counts, starts)
    nbTasks : int
        The final parallelization factor: the requested (or CPU-derived)
        value capped at `dataSize`.
    counts : list of int
        The number of data pieces for each parallel task
    starts : list of int
        The start indexes of the data for each parallel task, followed by
        `dataSize` (`nbTasks + 1` entries in total).
    """
    if nbTasks < 0:
        cpu = max(cpu_count() + nbTasks + 1, 1)
        nbTasks = min(cpu, dataSize)
    else:
        if nbTasks == 0:
            nbTasks = 1
        nbTasks = min(nbTasks, dataSize)

    # divmod keeps the counts integral on both Python 2 and Python 3.
    base, extra = divmod(dataSize, nbTasks)
    # The first `extra` tasks take one leftover element each.
    counts = [base + 1] * extra + [base] * (nbTasks - extra)

    starts = [0]
    for count in counts:
        starts.append(starts[-1] + count)

    return nbTasks, counts, starts
def computePartition(self, nbTasks, dataSize):
    """
    Compute data partitioning for parallel computation.

    Produces contiguous chunks of nearly equal size (sizes differ by at
    most one, larger chunks first).

    Parameters
    ----------
    nbTasks : int
        If >0 : the parallelization factor.
        If <0 : nbTasks = #cpu+nbTasks+1 (-1 -> nbTasks = #cpu),
        with a minimum of 1.
        If 0 : treated as 1.
    dataSize : int > 0
        The size of the data to process

    Return
    ------
    triplet = (nbTasks, counts, starts)
    nbTasks : int
        The final parallelization factor: the requested (or CPU-derived)
        value capped at `dataSize`.
    counts : list of int
        The number of data pieces for each parallel task
    starts : list of int
        The start indexes of the data for each parallel task, followed by
        `dataSize` (`nbTasks + 1` entries in total).
    """
    if nbTasks < 0:
        cpu = cpu_count() + nbTasks + 1
        if cpu <= 0:
            cpu = 1
        nbTasks = min(cpu, dataSize)
    else:
        if nbTasks == 0:
            nbTasks = 1
        nbTasks = min(nbTasks, dataSize)

    # Floor division: plain `/` yields floats on Python 3, which are
    # unusable as counts or indexes.
    counts = [dataSize // nbTasks] * nbTasks
    # Hand out the leftover elements to the first tasks.
    for i in range(dataSize % nbTasks):
        counts[i] += 1

    starts = [0] * (nbTasks + 1)
    for i in range(1, nbTasks + 1):
        starts[i] = starts[i - 1] + counts[i - 1]

    return nbTasks, counts, starts
def _partition_images(n_jobs, n_images):
    """Distribute ``n_images`` images over the jobs as evenly as possible.

    ``n_jobs == -1`` stands for ``cpu_count()``; in every case the number
    of jobs is capped at ``n_images``.

    Returns
    -------
    (n_jobs, counts, starts)
        The number of jobs used, the number of images per job, and the
        ``n_jobs + 1`` boundaries such that job i handles images
        ``starts[i]:starts[i + 1]``.
    """
    requested = cpu_count() if n_jobs == -1 else n_jobs
    n_jobs = min(requested, n_images)

    per_job = n_images // n_jobs
    leftover = n_images % n_jobs
    # The first `leftover` jobs take one extra image each.
    counts = [per_job + (1 if job < leftover else 0)
              for job in range(n_jobs)]

    starts = [0] * (n_jobs + 1)
    for position, size in enumerate(counts, 1):
        starts[position] = starts[position - 1] + size

    return n_jobs, counts, starts
def _partition_clips(n_jobs, n_clips):
    """Distribute ``n_clips`` clips over the jobs as evenly as possible.

    ``n_jobs == -1`` stands for ``cpu_count()``; in every case the number
    of jobs is capped at ``n_clips``.

    Returns
    -------
    (n_jobs, counts, starts)
        The number of jobs used, the number of clips per job, and the
        ``n_jobs + 1`` boundaries such that job i handles clips
        ``starts[i]:starts[i + 1]``.
    """
    if n_jobs == -1:
        n_jobs = min(cpu_count(), n_clips)
    else:
        n_jobs = min(n_jobs, n_clips)

    # Floor division: plain `/` yields floats on Python 3, which are
    # unusable as counts or indexes.
    counts = [n_clips // n_jobs] * n_jobs
    # Hand out the leftover clips to the first jobs.
    for i in range(n_clips % n_jobs):
        counts[i] += 1

    starts = [0] * (n_jobs + 1)
    for i in range(1, n_jobs + 1):
        starts[i] = starts[i - 1] + counts[i - 1]

    return n_jobs, counts, starts
def _fit_multiclass_task(self, X, y, sample_weight, params):
    """Fit the one-vs-rest binary estimators of a multiclass problem.

    For every class in ``self._classes`` an ``RGFExecuter(**params)`` and
    the binary target ``(y == class)`` are created; the estimators are
    then fitted with ``utils.fit_ovr_binary`` through ``Parallel`` and
    stored back in ``self._estimators``.

    Parameters
    ----------
    X, y, sample_weight
        Training data, forwarded to ``utils.fit_ovr_binary``.
    params : dict
        Keyword arguments for ``RGFExecuter``.
    """
    ovr_list = [None] * self._n_classes
    for i, cls_num in enumerate(self._classes):
        self._classes_map[i] = cls_num
        ovr_list[i] = (y == cls_num).astype(int)
        self._estimators[i] = RGFExecuter(**params)

    if self.n_jobs > 0:
        n_jobs = self.n_jobs
    else:
        n_jobs = cpu_count() + self.n_jobs + 1
    # One task per class: extra workers stay idle. min() is required for
    # the notice below to ever be printed (max() could never be smaller
    # than n_jobs).
    substantial_n_jobs = min(n_jobs, self.n_classes_)
    if substantial_n_jobs < n_jobs and self.verbose:
        print('n_jobs = {0}, but RGFClassifier uses {1} CPUs because '
              'classes_ is {2}'.format(n_jobs, substantial_n_jobs,
                                       self.n_classes_))

    self._estimators = Parallel(n_jobs=self.n_jobs)(
        delayed(utils.fit_ovr_binary)(self._estimators[i], X, ovr_list[i],
                                      sample_weight)
        for i in range(self._n_classes))
def _parallel_pairwise(X, Y, func, n_jobs, **kwds):
    """Evaluate ``func(X, Y)`` by splitting ``Y`` into ``n_jobs`` even slices.

    Each slice of ``Y`` is processed in parallel and the partial results
    are stacked horizontally. ``Y=None`` means ``Y = X``. A negative
    ``n_jobs`` resolves to ``cpu_count() + 1 + n_jobs`` (at least 1).
    """
    if Y is None:
        Y = X

    if n_jobs < 0:
        n_jobs = max(cpu_count() + 1 + n_jobs, 1)

    # Special case to avoid picklability checks in delayed
    if n_jobs == 1:
        return func(X, Y, **kwds)

    # TODO: in some cases, backend='threading' may be appropriate
    delayed_func = delayed(func)
    runner = Parallel(n_jobs=n_jobs, verbose=0)
    blocks = runner(
        delayed_func(X, Y[rows], **kwds)
        for rows in gen_even_slices(Y.shape[0], n_jobs))

    return np.hstack(blocks)
def _set_params_with_dependencies(self):
    """Derive the private parameters that depend on other settings.

    Sets ``_max_bin``, ``_min_samples_leaf`` and ``_n_jobs`` from their
    public counterparts.
    """
    # max_bin: the default depends on whether the training data is sparse.
    if self.max_bin is not None:
        self._max_bin = self.max_bin
    elif self._is_sparse_train_X:
        self._max_bin = 200
    else:
        self._max_bin = 65000

    # A float min_samples_leaf is a fraction of the number of samples.
    if isinstance(self.min_samples_leaf, utils.FLOATS):
        self._min_samples_leaf = ceil(self.min_samples_leaf * self._n_samples)
    else:
        self._min_samples_leaf = self.min_samples_leaf

    if self.n_jobs == -1:
        # NOTE(review): presumably 0 tells the backend to use every CPU -
        # confirm against the consumer of _n_jobs.
        self._n_jobs = 0
    elif self.n_jobs < 0:
        # joblib convention (-2 -> all CPUs but one, ...). Clamp to 1 so a
        # very negative request can neither become 0 (handled as "-1"
        # above) nor a negative worker count.
        self._n_jobs = max(1, cpu_count() + self.n_jobs + 1)
    else:
        self._n_jobs = self.n_jobs
def _partition_trees(forest):
    """Private helper that spreads the trees of ``forest`` over the jobs.

    ``forest.n_jobs == -1`` stands for ``cpu_count()``; in every case the
    number of jobs is capped at ``forest.n_estimators``.

    Returns
    -------
    (n_jobs, n_trees, starts)
        The number of jobs used, the number of trees per job, and the
        ``n_jobs + 1`` boundaries such that job i handles trees
        ``starts[i]:starts[i + 1]``.
    """
    # Compute the number of jobs
    wanted = cpu_count() if forest.n_jobs == -1 else forest.n_jobs
    n_jobs = min(wanted, forest.n_estimators)

    # Partition trees between jobs: an even share for everybody, plus one
    # extra tree for each of the first `leftover` jobs.
    share = forest.n_estimators // n_jobs
    leftover = forest.n_estimators % n_jobs
    n_trees = [share + (1 if job < leftover else 0)
               for job in range(n_jobs)]

    starts = [0] * (n_jobs + 1)
    for position, size in enumerate(n_trees, 1):
        starts[position] = starts[position - 1] + size

    return n_jobs, n_trees, starts
def _set_params_with_dependencies(self):
    """Derive the private parameters that depend on other settings.

    Sets ``_max_bin``, ``_min_samples_leaf`` and ``_n_jobs`` from their
    public counterparts, then calls ``self._set_target_and_loss()``.
    """
    # max_bin: the default depends on whether the training data is sparse.
    if self.max_bin is None:
        self._max_bin = 200 if self._is_sparse_train_X else 65000
    else:
        self._max_bin = self.max_bin

    # A float min_samples_leaf is a fraction of the number of samples.
    if isinstance(self.min_samples_leaf, utils.FLOATS):
        self._min_samples_leaf = ceil(self.min_samples_leaf * self._n_samples)
    else:
        self._min_samples_leaf = self.min_samples_leaf

    if self.n_jobs == -1:
        # NOTE(review): presumably 0 tells the backend to use every CPU -
        # confirm against the consumer of _n_jobs.
        self._n_jobs = 0
    elif self.n_jobs < 0:
        # joblib convention (-2 -> all CPUs but one, ...). Clamp to 1 so a
        # very negative request can neither become 0 (handled as "-1"
        # above) nor a negative worker count.
        self._n_jobs = max(1, cpu_count() + self.n_jobs + 1)
    else:
        self._n_jobs = self.n_jobs

    self._set_target_and_loss()
def _e_step(self, X, cal_delta):
    """Run the E-step on ``X``, in parallel over even row slices.

    Parameters
    ----------
    X : 2-D indexable with a ``shape`` attribute
        Rows are split into ``n_jobs`` slices, each processed by
        ``_update_gamma``.
    cal_delta : bool
        Forwarded to ``_update_gamma``. When true, the per-slice deltas
        are summed and multiplied by ``self.expElogbeta``; when false no
        delta component is computed. (Per the original note: set it to
        True when ``_m_step`` has to run afterwards, and to False for
        inference.)

    Returns
    -------
    (gamma, delta_component)
        ``gamma`` is the vertical stack of the per-slice gammas;
        ``delta_component`` is an array shaped like ``self.components_``,
        or None when ``cal_delta`` is false.
    """
    # run the E-step in parallel
    if self.n_jobs < 0:
        # joblib convention: -1 -> all CPUs, -2 -> all but one, ...
        # (gen_even_slices needs the resolved, positive number)
        n_jobs = max(cpu_count() + 1 + self.n_jobs, 1)
    else:
        n_jobs = self.n_jobs

    # NOTE(review): the literal 100 is passed positionally to
    # _update_gamma; presumably an iteration cap - confirm.
    results = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
        delayed(_update_gamma)(X[idx_slice, :], self.expElogbeta,
                               self.alpha, self.rng, 100,
                               self.mean_change_tol, cal_delta)
        for idx_slice in gen_even_slices(X.shape[0], n_jobs))

    # merge result
    gammas, deltas = zip(*results)
    gamma = np.vstack(gammas)

    if cal_delta:
        # This step finishes computing the sufficient statistics for the
        # M step, so that
        # sstats[k, w] = \sum_d n_{dw} * phi_{dwk}
        # = \sum_d n_{dw} * exp{Elogtheta_{dk} + Elogbeta_{kw}} / phinorm_{dw}.
        delta_component = np.zeros(self.components_.shape)
        for delta in deltas:
            delta_component += delta
        delta_component *= self.expElogbeta
    else:
        delta_component = None

    return (gamma, delta_component)
def _e_step(self, X, cal_delta):
    """Run the E-step on ``X``, in parallel over even row slices.

    Parameters
    ----------
    X : 2-D indexable with a ``shape`` attribute
        Rows are split into ``n_jobs`` slices, each processed by
        ``_update_gamma``.
    cal_delta : bool
        Forwarded to ``_update_gamma``. When true, the per-slice deltas
        are summed and multiplied by ``self.expElogbeta``; when false no
        delta component is computed. (Per the original note: set it to
        True when ``_m_step`` has to run afterwards, and to False for
        inference.)

    Returns
    -------
    (gamma, delta_component)
        ``gamma`` is the vertical stack of the per-slice gammas;
        ``delta_component`` is an array shaped like ``self.components_``,
        or None when ``cal_delta`` is false.
    """
    # run the E-step in parallel
    n_jobs = self.n_jobs
    if n_jobs < 0:
        # joblib convention: -1 -> all CPUs, -2 -> all but one, ...
        # (gen_even_slices needs the resolved, positive number)
        n_jobs = max(cpu_count() + 1 + n_jobs, 1)

    results = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
        delayed(_update_gamma)(X[idx_slice, :], self.expElogbeta,
                               self.alpha, self.rng,
                               self.max_gamma_update_iter,
                               self.mean_change_tol, cal_delta)
        for idx_slice in gen_even_slices(X.shape[0], n_jobs))

    # merge result
    gammas, deltas = zip(*results)
    gamma = np.vstack(gammas)

    if not cal_delta:
        return (gamma, None)

    # This step finishes computing the sufficient statistics for the
    # M step, so that
    # sstats[k, w] = \sum_d n_{dw} * phi_{dwk}
    # = \sum_d n_{dw} * exp{Elogtheta_{dk} + Elogbeta_{kw}} / phinorm_{dw}.
    delta_component = np.zeros(self.components_.shape)
    for delta in deltas:
        delta_component += delta
    delta_component *= self.expElogbeta

    return (gamma, delta_component)
def _fit(self, X, y):
    """Select features with a genetic algorithm and fit the final estimator.

    Each individual is a 0/1 mask over the features. Individuals are
    evaluated with ``_evalFunction`` and evolved with ``_eaFunction``;
    the best mask found becomes ``support_`` and a clone of
    ``self.estimator`` is fitted on the selected columns.

    Sets ``estimator_``, ``generation_scores_`` (the "max" statistic of
    every logged generation), ``n_features_`` and ``support_``.

    Raises
    ------
    TypeError
        If ``max_features`` is neither None nor an integer.
    ValueError
        If ``max_features`` is outside ``[1, n_features]``, if
        ``n_gen_no_change`` is neither None nor an integer, or if
        ``n_jobs`` is 0.

    Returns
    -------
    self
    """
    X, y = check_X_y(X, y, "csr")
    # Initialization
    cv = check_cv(self.cv, y, is_classifier(self.estimator))
    scorer = check_scoring(self.estimator, scoring=self.scoring)
    n_features = X.shape[1]

    if self.max_features is not None:
        if not isinstance(self.max_features, numbers.Integral):
            raise TypeError("'max_features' should be an integer between 1 and {} features."
                            " Got {!r} instead."
                            .format(n_features, self.max_features))
        elif self.max_features < 1 or self.max_features > n_features:
            raise ValueError("'max_features' should be between 1 and {} features."
                             " Got {} instead."
                             .format(n_features, self.max_features))
        max_features = self.max_features
    else:
        max_features = n_features

    if not isinstance(self.n_gen_no_change, (numbers.Integral, np.integer, type(None))):
        raise ValueError("'n_gen_no_change' should either be None or an integer."
                         " {} was passed."
                         .format(self.n_gen_no_change))

    estimator = clone(self.estimator)

    # Genetic Algorithm
    toolbox = base.Toolbox()

    toolbox.register("attr_bool", random.randint, 0, 1)
    toolbox.register("individual", tools.initRepeat, creator.Individual,
                     toolbox.attr_bool, n=n_features)
    toolbox.register("population", tools.initRepeat, list,
                     toolbox.individual)
    toolbox.register("evaluate", _evalFunction, gaobject=self,
                     estimator=estimator, X=X, y=y, cv=cv, scorer=scorer,
                     verbose=self.verbose, fit_params=self.fit_params,
                     max_features=max_features, caching=self.caching)
    toolbox.register("mate", tools.cxUniform,
                     indpb=self.crossover_independent_proba)
    toolbox.register("mutate", tools.mutFlipBit,
                     indpb=self.mutation_independent_proba)
    toolbox.register("select", tools.selTournament,
                     tournsize=self.tournament_size)

    if self.n_jobs == 0:
        raise ValueError("n_jobs == 0 has no meaning.")

    pool = None
    if self.n_jobs > 1:
        pool = multiprocessing.Pool(processes=self.n_jobs)
    elif self.n_jobs < 0:
        pool = multiprocessing.Pool(
            processes=max(cpu_count() + 1 + self.n_jobs, 1))

    try:
        if pool is not None:
            toolbox.register("map", pool.map)

        pop = toolbox.population(n=self.n_population)
        hof = tools.HallOfFame(1, similar=np.array_equal)
        stats = tools.Statistics(lambda ind: ind.fitness.values)
        stats.register("avg", np.mean, axis=0)
        stats.register("std", np.std, axis=0)
        stats.register("min", np.min, axis=0)
        stats.register("max", np.max, axis=0)

        if self.verbose > 0:
            print("Selecting features with genetic algorithm.")

        _, log = _eaFunction(pop, toolbox, cxpb=self.crossover_proba,
                             mutpb=self.mutation_proba,
                             ngen=self.n_generations,
                             ngen_no_change=self.n_gen_no_change,
                             stats=stats, halloffame=hof,
                             verbose=self.verbose)
    finally:
        # Release the worker processes even if the search raised.
        if pool is not None:
            pool.close()
            pool.join()

    # Set final attributes
    # (builtin bool: the np.bool alias was removed from NumPy)
    support_ = np.array(hof, dtype=bool)[0]
    self.estimator_ = clone(self.estimator)
    self.estimator_.fit(X[:, support_], y)

    self.generation_scores_ = np.array(
        [score for score, _ in log.select("max")])
    self.n_features_ = support_.sum()
    self.support_ = support_

    return self
def fit(self, X, Y, H_init=None, warm_start=False, initialize=True):
    """Learn parameters using subgradient descent.

    Parameters
    ----------
    X : iterable
        Training instances. Contains the structured input objects.
        No requirement on the particular form of entries of X is made
        for ``n_jobs == 1``; the batched path slices ``X`` and ``Y``.

    Y : iterable
        Training labels. Contains the structured labels for inputs in X.
        Needs to have the same length as X.

    H_init : None
        Not read by this method; kept for API compatibility.

    warm_start : boolean, default=False
        Whether to restart a previous fit.

    initialize : boolean, default=True
        Whether to initialize the model for the data.
        Leave this true except if you really know what you are doing.

    Returns
    -------
    self
    """
    if self.verbose > 0:
        print("Training latent subgradient structural SVM")
    if initialize:
        self.model.initialize(X, Y)
    self.grad_old = np.zeros(self.model.size_psi)
    if not warm_start:
        self.w = getattr(self, "w", np.random.normal(
            0, 1, size=self.model.size_psi))
        self.timestamps_ = [time()]
        self.objective_curve_ = []
        if self.learning_rate == "auto":
            self.learning_rate_ = self.C * len(X)
        else:
            self.learning_rate_ = self.learning_rate
    else:
        # hackety hack: shift the start time so that elapsed times keep
        # accumulating across the previous and the current fit.
        self.timestamps_[0] = time() - self.timestamps_[-1]
    w = self.w.copy()
    n_samples = len(X)
    try:
        # catch ctrl+c to stop training
        # `range` behaves the same on Python 2 and 3 here.
        for iteration in range(self.max_iter):
            self.timestamps_.append(time() - self.timestamps_[0])
            positive_slacks = 0
            objective = 0.
            if self.n_jobs == 1:
                # online learning: one subgradient step per sample
                for x, y in zip(X, Y):
                    h = self.model.latent(x, y, w)
                    h_hat = self.model.loss_augmented_inference(
                        x, h, w, relaxed=True)
                    delta_psi = (self.model.psi(x, h)
                                 - self.model.psi(x, h_hat))
                    slack = (-np.dot(delta_psi, w)
                             + self.model.loss(h, h_hat))
                    objective += np.maximum(slack, 0)
                    if slack > 0:
                        positive_slacks += 1
                    w = self._solve_subgradient(delta_psi, n_samples, w)
            else:
                # generate batches of size n_jobs to speed up inference
                if self.n_jobs == -1:
                    n_jobs = cpu_count()
                else:
                    # was `self.j_jobs`: a typo that raised AttributeError
                    n_jobs = self.n_jobs

                n_batches = int(np.ceil(float(len(X)) / n_jobs))
                slices = gen_even_slices(n_samples, n_batches)
                for batch in slices:
                    X_b = X[batch]
                    Y_b = Y[batch]
                    verbose = self.verbose - 1
                    candidate_constraints = Parallel(
                        n_jobs=self.n_jobs,
                        verbose=verbose)(delayed(find_constraint_latent)(
                            self.model, x, y, w)
                            for x, y in zip(X_b, Y_b))
                    dpsi = np.zeros(self.model.size_psi)
                    for x, y, constraint in zip(X_b, Y_b,
                                                candidate_constraints):
                        y_hat, delta_psi, slack, loss = constraint
                        objective += slack
                        dpsi += delta_psi
                        if slack > 0:
                            positive_slacks += 1
                    # one averaged subgradient step per batch
                    dpsi /= float(len(X_b))
                    w = self._solve_subgradient(dpsi, n_samples, w)

            # some statistics
            objective *= self.C
            # NOTE(review): uses self.w rather than the local w, as in the
            # original; presumably _solve_subgradient keeps self.w up to
            # date -- confirm.
            objective += np.sum(self.w ** 2) / 2.

            if positive_slacks == 0:
                print("No additional constraints")
                if self.break_on_no_constraints:
                    break
            if self.verbose > 0:
                print(self)
                print("iteration %d" % iteration)
                print("positive slacks: %d, "
                      "objective: %f" %
                      (positive_slacks, objective))
            self.objective_curve_.append(objective)

            if self.verbose > 2:
                print(self.w)

            self._compute_training_loss(X, Y, iteration)
            if self.logger is not None:
                self.logger(self, iteration)

    except KeyboardInterrupt:
        pass
    self.timestamps_.append(time() - self.timestamps_[0])
    self.objective_curve_.append(self._objective(X, Y))
    if self.logger is not None:
        self.logger(self, 'final')
    if self.verbose:
        if self.objective_curve_:
            print("final objective: %f" % self.objective_curve_[-1])
        if self.verbose and self.n_jobs == 1:
            print("calls to inference: %d" % self.model.inference_calls)
    return self
def wheel_up_features_bfs(initialBunch,
                          trees,
                          factory,
                          loss,
                          learning_rate=0.25,
                          nIters=100,
                          trees_sample_size=100,
                          verbose=True,
                          vali_factory=None,
                          learning_rate_decay=1.,
                          trees_sample_increase=0,
                          regularizer=0.,
                          random_walk=True,
                          use_joblib=False,
                          n_jobs=-1,
                          joblib_backend="threading",
                          copy_pred=False):
    """Iteratively try to swap one tree of a bunch for a better candidate.

    Each of the ``nIters`` iterations picks one position of the bunch,
    removes that tree from the prediction, asks ``try_add1_bfs`` for the
    best single addition among a random sample of ``trees`` (plus the
    removed tree itself) and keeps the replacement when its score is not
    worse than the best score so far.

    Parameters
    ----------
    initialBunch : list
        Starting bunch of trees; it is shallow-copied, not modified.
    trees : sequence
        Pool of candidate trees that is sampled every iteration.
    factory : object
        Must provide ``predict(bunch)``. For the threading backend it is
        deep-copied once per worker.
    loss : object
        Must provide ``score(factory, pred)``. Deep-copied once per worker
        for the threading backend.
    learning_rate : float
        Forwarded to ``try_add1_bfs``; multiplied by
        ``learning_rate_decay`` after every iteration.
    nIters : int
        Number of iterations.
    trees_sample_size : int
        Number of candidates sampled per iteration; grows by
        ``trees_sample_increase`` per iteration, capped at ``len(trees)``.
    verbose : bool or int
        Truthy values print progress; ``>= 2`` additionally plots the
        validation score history when ``vali_factory`` is given.
    vali_factory : object or None
        Optional validation counterpart of ``factory``.
    regularizer : float
        Forwarded to ``try_add1_bfs``.
    random_walk : bool
        If True the replaced position is random, otherwise positions are
        visited cyclically.
    use_joblib, n_jobs, joblib_backend :
        Parallelism settings. A negative ``n_jobs`` means
        ``joblib.cpu_count()``. ``joblib_backend`` must be ``"threading"``
        or ``"multiprocessing"``.
    copy_pred : bool
        Threading backend only: give every worker its own deep copy of
        the base prediction.

    Returns
    -------
    list
        The resulting bunch.

    Raises
    ------
    ValueError
        If ``use_joblib`` is set and ``joblib_backend`` is unknown.
    """
    # builtin on Python 2, lives in functools on Python 3
    from functools import reduce

    allTrees = copy.copy(trees)

    bunch = copy.copy(initialBunch)
    pred = factory.predict(bunch)
    bestScore = loss.score(factory, pred)

    if vali_factory is not None:
        vali_pred = vali_factory.predict(bunch)
        vali_score = loss.score(vali_factory, vali_pred)
        vali_scores = [vali_score]

    if use_joblib:
        if n_jobs < 0:
            n_jobs = joblib.cpu_count()

        if joblib_backend == "threading":
            # create copies of data once to escape GIL forever
            factory = [copy.deepcopy(factory) for _ in range(n_jobs)]
            losses = [copy.deepcopy(loss) for _ in range(n_jobs)]
        elif joblib_backend == "multiprocessing":
            pass
        else:
            raise ValueError("joblib_backend must be either 'threading' "
                             "or 'multiprocessing'")

    # The print calls below take one pre-formatted string so the text is
    # the same as the original Python 2 print statements on 2 and 3.
    if verbose:
        print("\niteration # %s  ntrees =  %s \nbest loss =  %s"
              % (0, len(bunch), bestScore))
        print("learning_rate =  %s" % (learning_rate,))
        print("sample_size %s" % (trees_sample_size,))

    for itr in range(1, nIters + 1):
        # Fixed: the cyclic branch used `i`, which is undefined (or a
        # stale leaked comprehension variable on Python 2); the loop
        # counter is what makes the position cycle.
        if random_walk:
            change_index = random.randint(0, len(bunch) - 1)
        else:
            change_index = (itr - 1) % len(bunch)

        # the replaced tree is always among the candidates
        trees_sample = (random.sample(allTrees, trees_sample_size)
                        + [bunch[change_index]])
        bunch_wo = copy.copy(bunch)
        replaced_tree = bunch_wo.pop(change_index)

        if use_joblib and joblib_backend == "threading":
            # split trees into sections
            indices = [0] + [len(trees_sample) * (i + 1) // n_jobs
                             for i in range(n_jobs)]
            treeSections = [trees_sample[indices[i]:indices[i + 1]]
                            for i in range(n_jobs)]

            base_pred_wo = pred - factory[0].predict(
                PrunedFormula([bunch[change_index]], bias=0.))
            # Fixed: the per-thread list used to be built from `pred`,
            # silently discarding the prediction without the replaced
            # tree that the serial branch passes on.
            if copy_pred:
                pred_wo = [copy.deepcopy(base_pred_wo)
                           for _ in range(n_jobs)]
            else:
                pred_wo = [base_pred_wo for _ in range(n_jobs)]

            # execute sections in parallel
            tasks = [joblib.delayed(try_add1_bfs)(
                         treeSections[ithread], factory[ithread],
                         learning_rate, losses[ithread],
                         1, pred_wo[ithread], regularizer=regularizer,
                         use_joblib=False)
                     for ithread in range(n_jobs)]
            _res = joblib.Parallel(n_jobs=n_jobs, backend="threading")(tasks)
            # concatenate the (additions, scores, preds) triples
            _additions, newScores, newPreds = reduce(
                lambda a, b: [a[i] + b[i] for i in range(3)], _res)
        else:
            pred_wo = pred - factory.predict(
                PrunedFormula([bunch[change_index]], bias=0.))
            _additions, newScores, newPreds = try_add1_bfs(
                trees_sample, factory, learning_rate, loss,
                1, pred_wo, regularizer=regularizer,
                use_joblib=use_joblib, n_jobs=n_jobs)

        learning_rate *= learning_rate_decay
        trees_sample_size = min(len(allTrees),
                                trees_sample_size + trees_sample_increase)

        triples = sorted(zip(newScores, _additions, newPreds),
                         key=lambda el: el[0])
        newBestScore = min(newScores)

        if newBestScore > bestScore:
            pass
        else:
            bestScore = newBestScore
            _add = triples[0][1]
            bunch = bunch_wo
            bunch.insert(change_index, _add)
            pred = triples[0][2]

        if verbose:
            print("\niteration # %s  ntrees =  %s \nbest loss =  %s "
                  "\nlast loss =  %s"
                  % (itr, len(bunch), bestScore, newBestScore))
            if vali_factory is not None:
                # NOTE(review): the validation prediction is updated with
                # the best candidate even when it was rejected above --
                # confirm whether that is intended.
                _add = triples[0][1]
                vali_pred_wo = vali_pred - vali_factory.predict(
                    PrunedFormula([replaced_tree], bias=0.))
                vali_pred = vali_pred_wo + vali_factory.predict(
                    PrunedFormula([_add], bias=0.))
                vali_score = loss.score(vali_factory, vali_pred)
                print("Validation loss: %s" % (vali_score,))
                vali_scores.append(vali_score)

            print("changed index %s" % (change_index,))
            print("learning_rate =  %s" % (learning_rate,))
            print("sample_size %s" % (trees_sample_size,))

    # `vali_scores` only exists when a validation factory was supplied
    if verbose >= 2 and vali_factory is not None:
        print("Validation scores history:")
        from matplotlib import pyplot as plt
        plt.plot(np.arange(len(vali_scores)), vali_scores)
    return bunch
def fit(self, X, y):
    """Grid-search a logistic regression, then fit on its transformed data.

    A 5-fold, ROC-AUC scored grid search over ``C`` and ``penalty`` is run
    and reported on stdout. The best ``C``/``penalty`` configure
    ``self.transformer_``, whose ``fit_transform`` output is what this
    estimator is finally fitted on via ``LogisticRegression.fit``.
    """
    # The smaller C, the stronger the regularization.
    # The more regularization, the more sparsity.
    print('_' * 80)
    print("Cross validation: ")

    param_grid = [{'C': np.logspace(-3, 3, 7), 'penalty': ['l1', 'l2']}]
    print(param_grid)

    base_estimator = LogisticRegression(dual=False, tol=1e-3,
                                        class_weight='auto')
    start = time()
    clf = GridSearchCV(base_estimator,
                       param_grid=param_grid,
                       cv=5,
                       scoring='roc_auc',
                       verbose=2,
                       n_jobs=joblib.cpu_count())
    clf.fit(X, y)
    print(clf)

    elapsed = time() - start
    n_candidates = len(clf.grid_scores_)
    print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
          % (elapsed, n_candidates))

    # Each report section is: title, blank, value, blank. Attributes are
    # looked up lazily so output order matches a straight-line version.
    report = (("Grid Scores:", "grid_scores_"),
              ("Best estimator :", "best_estimator_"),
              ("Best score :", "best_score_"),
              ("Best Parameters :", "best_params_"))
    for title, attribute in report:
        print(title)
        print()
        print(getattr(clf, attribute))
        print()

    self.transformer_ = LogisticRegression(
        C=clf.best_estimator_.C,
        penalty=clf.best_estimator_.penalty,
        dual=False,
        tol=1e-3,
        class_weight='auto')
    transformed = self.transformer_.fit_transform(X, y)
    return LogisticRegression.fit(self, transformed, y)
import datetime
from sklearn.model_selection import GridSearchCV

# Directory of this script; added to sys.path so sibling modules import.
FILE_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append(FILE_DIR)

# Command line: the IPython profile selects the cluster and names the log.
parser = argparse.ArgumentParser()
parser.add_argument(
    "-p", "--profile",
    default="ipy_profile",
    help="Name of IPython profile to use")
args = parser.parse_args()
profile = args.profile

# prepare the logger (one log file per profile, overwritten on each run)
logging.basicConfig(
    filename=os.path.join(FILE_DIR, profile + '.log'),
    filemode='w',
    level=logging.DEBUG)
logging.info("number of CPUs found: {0}".format(cpu_count()))
logging.info("args.profile: {0}".format(profile))

# prepare the engines
c = Client(profile=profile)
# Change every engine into this script's directory so that each engine
# can access the custom function(s).
c[:].map(os.chdir, [FILE_DIR] * len(c))
logging.info("c.ids :{0}".format(str(c.ids)))
bview = c.load_balanced_view()


def _make_ipyparallel_backend():
    """Backend factory bound to the load-balanced view created above."""
    return IPythonParallelBackend(view=bview)


register_parallel_backend('ipyparallel', _make_ipyparallel_backend)

# Get data
digits = load_digits()
# prepare it for the custom function
def fit(self, X, y):
    """Select ``C`` and ``penalty`` by grid search, then fit this estimator.

    Runs a 5-fold ``GridSearchCV`` scored by ROC AUC over a logistic
    regression, prints the search results, stores a logistic regression
    built from the best ``C`` and ``penalty`` in ``self.transformer_`` and
    finally calls ``LogisticRegression.fit`` on the output of that
    transformer's ``fit_transform``.
    """
    # The smaller C, the stronger the regularization.
    # The more regularization, the more sparsity.
    print('_' * 80)
    print("Cross validation: ")

    param_grid = [
        {'C': np.logspace(-3, 3, 7), 'penalty': ['l1', 'l2']},
    ]
    print(param_grid)

    search_settings = dict(param_grid=param_grid, cv=5, scoring='roc_auc',
                           verbose=2)
    candidate = LogisticRegression(dual=False, tol=1e-3, class_weight='auto')

    start = time()
    clf = GridSearchCV(candidate, n_jobs=joblib.cpu_count(),
                       **search_settings)
    clf.fit(X, y)
    print(clf)

    seconds = time() - start
    print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
          % (seconds, len(clf.grid_scores_)))

    # title / blank / value / blank for every reported attribute; getattr
    # keeps the attribute access right before its own print.
    for heading, name in [("Grid Scores:", "grid_scores_"),
                          ("Best estimator :", "best_estimator_"),
                          ("Best score :", "best_score_"),
                          ("Best Parameters :", "best_params_")]:
        print(heading)
        print()
        print(getattr(clf, name))
        print()

    self.transformer_ = LogisticRegression(C=clf.best_estimator_.C,
                                           penalty=clf.best_estimator_.penalty,
                                           dual=False,
                                           tol=1e-3,
                                           class_weight='auto')
    X_selected = self.transformer_.fit_transform(X, y)
    return LogisticRegression.fit(self, X_selected, y)
def __init__(self, n_features, n_jobs=1):
    """Store the feature count and the resolved number of jobs.

    ``n_jobs == -1`` is replaced by ``cpu_count()``; any other value is
    stored unchanged.
    """
    self.n_features = n_features
    self.n_jobs = cpu_count() if n_jobs == -1 else n_jobs
def greed_up_features_bfs(trees,
                          factory,
                          loss,
                          learning_rate,
                          breadth,
                          nTrees,
                          trees_sample_size,
                          verbose=True,
                          learning_rate_decay=1.,
                          trees_sample_increase=0,
                          regularizer=0.,
                          use_joblib=False,
                          n_jobs=-1,
                          joblib_method="threads",
                          copy_pred=False,
                          initialBunch=[]):
    """Grow a bunch of trees greedily, keeping the ``breadth`` best bunches.

    Starting from ``initialBunch`` (or, when it is empty, from the best
    single-tree bunches found by ``try_add1_bfs``), every iteration extends
    each kept bunch by one tree chosen from a random sample of ``trees``
    and keeps the ``breadth`` best-scoring extensions. An iteration that
    does not improve on the best score halves the learning rate instead;
    the search stops when the first kept bunch has ``nTrees`` trees or the
    learning rate drops below ``1e-5``.

    Parameters
    ----------
    trees : sequence
        Pool of candidate trees that is sampled every iteration.
    factory : object
        Data holder handed to ``try_add1_bfs``; deep-copied once per
        worker when ``joblib_method == "threads"``.
    loss : callable
        Handed to ``try_add1_bfs``; called as ``loss(factory, pred)`` to
        score a non-empty ``initialBunch``. Deep-copied per worker for
        threads.
    learning_rate : float
        Forwarded to ``try_add1_bfs``; multiplied by
        ``learning_rate_decay`` every iteration and halved on iterations
        without improvement.
    breadth : int
        Number of bunches kept after each iteration.
    nTrees : int
        Target size of the returned bunch.
    trees_sample_size : int
        Candidates sampled per bunch; grows by ``trees_sample_increase``
        per iteration, capped at ``len(trees)``.
    verbose : bool
        Print progress when truthy.
    regularizer : float
        Forwarded to ``try_add1_bfs``.
    use_joblib, n_jobs, joblib_method :
        Parallelism settings. A negative ``n_jobs`` means
        ``joblib.cpu_count()``. ``joblib_method`` must be ``"threads"`` or
        ``"processes"``.
    copy_pred : bool
        Threads only: give every worker a deep copy of the prediction.
    initialBunch : list
        Optional starting bunch. The shared default list is never mutated
        by this function.

    Returns
    -------
    list
        The best bunch found (``bunches[0]``).

    Raises
    ------
    ValueError
        If ``use_joblib`` is set and ``joblib_method`` is unknown.
    """
    # builtin on Python 2, lives in functools on Python 3
    from functools import reduce

    allTrees = copy.copy(trees)

    if len(initialBunch) == 0:
        trees_sample = np.array(random.sample(allTrees, trees_sample_size))
        additions, losses, preds = try_add1_bfs(
            trees_sample, factory, learning_rate, loss,
            breadth, y_pred=factory.labels * 0, regularizer=regularizer)
        bunches = [[_added] for _added in additions]
    else:
        bunches = [initialBunch]
        preds = [factory.predict(initialBunch)]
        losses = [np.sum(loss(factory, preds[0]))]

    bestScore = min(losses)

    if use_joblib:
        if n_jobs < 0:
            n_jobs = joblib.cpu_count()

        if joblib_method == "threads":
            # create copies of data once to escape GIL forever
            factory = [copy.deepcopy(factory) for _ in range(n_jobs)]
            loss = [copy.deepcopy(loss) for _ in range(n_jobs)]
        elif joblib_method == "processes":
            pass
        else:
            raise ValueError("joblib_method must be either 'threads' "
                             "or 'processes'")

    # The print calls below take one pre-formatted string so the text is
    # the same as the original Python 2 print statements on 2 and 3.
    if verbose:
        print("\niteration # %s  ntrees =  %s \nbest loss =  %s"
              % (0, len(bunches[0]), bestScore))
        print("learning_rate =  %s" % (learning_rate,))
        print("sample_size %s" % (trees_sample_size,))

    itr = 0
    while len(bunches[0]) < nTrees:
        itr += 1
        newBunches = []
        newScores = []
        newPreds = []
        for bunch, pred in zip(bunches, preds):
            trees_sample = np.array(
                random.sample(allTrees, trees_sample_size))

            if use_joblib and joblib_method == "threads":
                # split trees into sections
                indices = [0] + [len(trees_sample) * (i + 1) // n_jobs
                                 for i in range(n_jobs)]
                treeSections = [trees_sample[indices[i]:indices[i + 1]]
                                for i in range(n_jobs)]

                if copy_pred:
                    pred = [copy.deepcopy(pred) for _ in range(n_jobs)]
                else:
                    pred = [pred for _ in range(n_jobs)]

                # execute sections in parallel
                tasks = [joblib.delayed(try_add1_bfs)(
                             treeSections[ithread], factory[ithread],
                             learning_rate, loss[ithread],
                             breadth, pred[ithread],
                             regularizer=regularizer,
                             use_joblib=False)
                         for ithread in range(n_jobs)]
                _res = joblib.Parallel(n_jobs=n_jobs,
                                       backend="threading")(tasks)
                # concatenate the (additions, losses, preds) triples
                _additions, _losses, _preds = reduce(
                    lambda a, b: [a[i] + b[i] for i in range(3)], _res)
            else:
                _additions, _losses, _preds = try_add1_bfs(
                    trees_sample, factory, learning_rate, loss,
                    breadth, pred, regularizer=regularizer,
                    use_joblib=use_joblib, n_jobs=n_jobs)

            _bunches = [bunch + [_added] for _added in _additions]
            newBunches += _bunches
            newScores += _losses
            newPreds += _preds

        learning_rate *= learning_rate_decay
        trees_sample_size = min(len(allTrees),
                                trees_sample_size + trees_sample_increase)

        triples = sorted(zip(newScores, newBunches, newPreds),
                         key=lambda el: el[0])
        newBestScore = min(newScores)

        if newBestScore > bestScore:
            # no improvement: retry with a smaller step, give up when tiny
            learning_rate /= 2.
            if learning_rate < 0.00001:
                break
        else:
            bestScore = newBestScore
            bunches = [triple[1] for triple in triples[:breadth]]
            preds = [triple[2] for triple in triples[:breadth]]

        if verbose:
            print("\niteration # %s  ntrees =  %s \nbest loss =  %s "
                  "\nlast loss =  %s"
                  % (itr, len(bunches[0]), bestScore, newBestScore))
            print("learning_rate =  %s" % (learning_rate,))
            print("sample_size %s" % (trees_sample_size,))

    return bunches[0]
def fit(self, X, Y, H_init=None):
    """Learn parameters using subgradient descent.

    Parameters
    ----------
    X : iterable
        Training instances. Contains the structured input objects.
        No requirement on the particular form of entries of X is made
        for ``n_jobs == 1``; the batched path slices ``X`` and ``Y``.

    Y : iterable
        Training labels. Contains the structured labels for inputs in X.
        Needs to have the same length as X.

    H_init : None
        Not read by this method; kept for API compatibility.

    Returns
    -------
    self
    """
    print("Training latent subgradient structural SVM")
    self.w = getattr(self, "w", np.random.normal(
        0, .001, size=self.model.size_psi))
    self.objective_curve_ = []
    n_samples = len(X)
    try:
        # catch ctrl+c to stop training
        # `range` behaves the same on Python 2 and 3 here.
        for iteration in range(self.max_iter):
            positive_slacks = 0
            objective = 0.
            if self.n_jobs == 1:
                # online learning: one subgradient step per sample
                for x, y in zip(X, Y):
                    h = self.model.latent(x, y, self.w)
                    h_hat = self.model.loss_augmented_inference(
                        x, h, self.w, relaxed=True)
                    delta_psi = (self.model.psi(x, h)
                                 - self.model.psi(x, h_hat))
                    slack = (-np.dot(delta_psi, self.w)
                             + self.model.loss(h, h_hat))
                    objective += np.maximum(slack, 0)
                    if slack > 0:
                        positive_slacks += 1
                    self._solve_subgradient(delta_psi, n_samples)
            else:
                # generate batches of size n_jobs to speed up inference
                if self.n_jobs == -1:
                    n_jobs = cpu_count()
                else:
                    # was `self.j_jobs`: a typo that raised AttributeError
                    n_jobs = self.n_jobs

                n_batches = int(np.ceil(float(len(X)) / n_jobs))
                slices = gen_even_slices(n_samples, n_batches)
                for batch in slices:
                    X_b = X[batch]
                    Y_b = Y[batch]
                    verbose = self.verbose - 1
                    candidate_constraints = Parallel(
                        n_jobs=self.n_jobs,
                        verbose=verbose)(delayed(find_constraint_latent)(
                            self.model, x, y, self.w)
                            for x, y in zip(X_b, Y_b))
                    dpsi = np.zeros(self.model.size_psi)
                    for x, y, constraint in zip(X_b, Y_b,
                                                candidate_constraints):
                        y_hat, delta_psi, slack, loss = constraint
                        objective += slack
                        dpsi += delta_psi
                        if slack > 0:
                            positive_slacks += 1
                    # one averaged subgradient step per batch
                    dpsi /= float(len(X_b))
                    self._solve_subgradient(dpsi, n_samples)

            # some statistics
            objective += np.sum(self.w ** 2) / self.C / 2.

            if positive_slacks == 0:
                print("No additional constraints")
                if self.break_on_no_constraints:
                    break
            if self.verbose > 0:
                print(self)
                print("iteration %d" % iteration)
                print("positive slacks: %d, "
                      "objective: %f" %
                      (positive_slacks, objective))
            self.objective_curve_.append(objective)

            if self.verbose > 2:
                print(self.w)

            self._compute_training_loss(X, Y, iteration)
            if self.logger is not None:
                self.logger(self, iteration)

    except KeyboardInterrupt:
        pass
    # The curve is empty when no iteration completed (max_iter == 0, an
    # early break or an early ctrl+c); indexing it used to raise
    # IndexError in that case.
    if self.objective_curve_:
        print("final objective: %f" % self.objective_curve_[-1])
    print("calls to inference: %d" % self.model.inference_calls)
    return self
def fit(self, X, y=None, groups=None):
    """Search every configured space, then optionally refit the best model.

    Parameters
    ----------
    X : array-like or sparse matrix, shape = [n_samples, n_features]
        The training input samples.

    y : array-like, shape = [n_samples] or [n_samples, n_output]
        Target relative to X for classification or regression (class
        labels should be integers or strings).

    groups : array-like, with shape (n_samples,), optional
        Group labels for the samples used while splitting the dataset into
        train/test set.

    Returns
    -------
    self
    """
    # A lone dict is shorthand for a one-element list of search spaces.
    spaces = self.search_spaces
    if isinstance(spaces, dict):
        spaces = [spaces]

    self.optimizer_kwargs_ = (
        {} if self.optimizer_kwargs is None
        else dict(self.optimizer_kwargs))
    random_state = check_random_state(self.random_state)
    self.optimizer_kwargs_['random_state'] = random_state

    # One optimizer per space; a tuple entry is (space, n_iter) and only
    # its first element describes the space.
    optimizers = [
        self._make_optimizer(entry[0] if isinstance(entry, tuple) else entry)
        for entry in spaces
    ]
    self.optimizers_ = optimizers  # will save the states of the optimizers

    self.cv_results_ = defaultdict(list)
    self.best_index_ = None
    self.multimetric_ = False

    # account for case n_jobs < 0
    workers = self.n_jobs
    if workers < 0:
        workers = max(1, cpu_count() + workers + 1)

    for entry, optimizer in zip(spaces, optimizers):
        # without an explicit per-space budget, fall back to self.n_iter
        if isinstance(entry, tuple):
            space, remaining = entry
        else:
            space, remaining = entry, self.n_iter

        # evaluate in chunks of at most `workers` points until done
        while remaining > 0:
            chunk = min(remaining, workers)
            self._step(X, y, space, optimizer, groups=groups, n_jobs=chunk)
            remaining -= workers

    # Refit the best model on the whole dataset
    if self.refit:
        self._fit_best_model(X, y)

    return self
def permuted_ols(tested_vars, target_vars, confounding_vars=None,
                 model_intercept=True, n_perm=10000,
                 random_state=None, n_jobs=1):
    """Massively univariate group analysis with permuted OLS.

    Tested variates are independently fitted to target variates descriptors
    (e.g. brain imaging signal) according to a linear model solved with an
    Ordinary Least Squares criterion.
    Confounding variates may be included in the model.
    Permutation testing is used to assess the significance of the
    relationship between the tested variates and the target variates [1, 2].
    A max-type procedure is used to obtain family-wise corrected p-values.

    The specific permutation scheme implemented here is the one of
    Freedman & Lane [3]. It has been demonstrated in [1] that this
    scheme conveys more sensitivity than alternative schemes. This holds
    for neuroimaging applications, as discussed in details in [2].

    Permutations are performed on parallel computing units. Each of them
    performs a fraction of permutations on the whole dataset. Thus, the max
    F-score amongst data descriptors can be computed directly, which avoids
    storing all the computed F-scores.

    The variates should be given C-contiguous. target_vars are
    fortran-ordered automatically to speed-up computations.

    Parameters
    ----------
    tested_vars : array-like, shape=(n_samples, n_regressors)
        Explanatory variates, fitted and tested independently from each
        others. A 1D array is treated as a single regressor.

    target_vars : array-like, shape=(n_samples, n_descriptors)
        fMRI data, trying to be explained by explanatory and confounding
        variates. Must be 2D.

    confounding_vars : array-like, shape=(n_samples, n_covars)
        Confounding variates (covariates), fitted but not tested.
        If None, no confounding variate is added to the model
        (except maybe a constant column according to the value of
        `model_intercept`)

    model_intercept : bool,
        If True, a constant column is added to the confounding variates
        unless the tested variate is already the intercept.

    n_perm : int,
        Number of permutations to perform.
        Permutations are costly but the more are performed, the more
        precision one gets in the p-values estimation. With 0 or a
        negative value only the original-data scores are computed.

    random_state : int or None,
        Seed for random number generator, to have the same permutations
        in each computing units.

    n_jobs : int,
        Number of parallel workers, following joblib's conventions:
        -1 means all CPUs and a negative number -i means all CPUs but
        (i - 1). The value is capped at the number of available CPUs.
        0 is invalid.

    Returns
    -------
    pvals : array-like, shape=(n_regressors, n_descriptors)
        Negative log10 p-values associated with the significance test of
        the n_regressors explanatory variates against the n_descriptors
        target variates. Family-wise corrected p-values.
        Empty array when ``n_perm <= 0``.

    score_orig_data : numpy.ndarray
        F-statistic associated with the significance test of the
        explanatory variates against the target variates, as returned by
        the scoring helper on the original data.
        The ranks of the scores into the h0 distribution correspond to the
        p-values.

    h0_fmax : array-like
        Distribution of the (max) F-statistic under the null hypothesis
        (obtained from the permutations). Empty array when
        ``n_perm <= 0``.
        NOTE(review): the code returns ``h0_fmax[0]`` of the horizontally
        stacked per-chunk results, presumably the first regressor's
        distribution -- confirm.

    Raises
    ------
    ValueError
        If ``n_jobs == 0`` or ``target_vars`` is not 2D.

    References
    ----------
    [1] Anderson, M. J. & Robinson, J. (2001).
        Permutation tests for linear models.
        Australian & New Zealand Journal of Statistics, 43(1), 75-88.
        (http://avesbiodiv.mncn.csic.es/estadistica/permut2.pdf)

    [2] Winkler, A. M. et al. (2014).
        Permutation inference for the general linear model. Neuroimage.

    [3] Freedman, D. & Lane, D. (1983).
        A nonstochastic interpretation of reported significance levels.
        J. Bus. Econ. Stats., 1(4), 292-298

    """
    # initialize the seed of the random generator
    rng = check_random_state(random_state)

    # check n_jobs (number of CPUs)
    if n_jobs == 0:  # invalid according to joblib's conventions
        raise ValueError("'n_jobs == 0' is not a valid choice. "
                         "Please provide a positive number of CPUs, or -1 "
                         "for all CPUs, or a negative number (-i) for "
                         "'all but (i-1)' CPUs (joblib conventions).")
    elif n_jobs < 0:
        # Fixed sign: n_jobs is negative here, so it has to be *added*;
        # subtracting it turned -1 into cpu_count() + 2 workers.
        n_jobs = max(1, joblib.cpu_count() + int(n_jobs) + 1)
    else:
        n_jobs = min(n_jobs, joblib.cpu_count())

    # make target_vars F-ordered to speed-up computation
    if target_vars.ndim != 2:
        raise ValueError(
            "'target_vars' should be a 2D array. "
            "An array with %d dimension%s was passed"
            % (target_vars.ndim, "s" if target_vars.ndim > 1 else ""))
    target_vars = np.asfortranarray(target_vars)  # efficient for chunking
    n_descriptors = target_vars.shape[1]

    # check explanatory variates dimensions
    if tested_vars.ndim == 1:
        tested_vars = np.atleast_2d(tested_vars).T
    n_samples, n_regressors = tested_vars.shape

    # check if explanatory variates is intercept (constant) or not
    if (n_regressors == 1 and np.unique(tested_vars).size == 1):
        intercept_test = True
    else:
        intercept_test = False

    # optionally add intercept
    if model_intercept and not intercept_test:
        if confounding_vars is not None:
            confounding_vars = np.hstack(
                (confounding_vars, np.ones((n_samples, 1))))
        else:
            confounding_vars = np.ones((n_samples, 1))

    ### OLS regression on original data
    if confounding_vars is not None:
        # step 1: extract effect of covars from target vars
        covars_orthonormalized = orthonormalize_matrix(confounding_vars)
        if not covars_orthonormalized.flags['C_CONTIGUOUS']:
            # useful to developer
            warnings.warn('Confounding variates not C_CONTIGUOUS.')
            covars_orthonormalized = np.ascontiguousarray(
                covars_orthonormalized)
        targetvars_normalized = normalize_matrix_on_axis(
            target_vars).T  # faster with F-ordered target_vars_chunk
        if not targetvars_normalized.flags['C_CONTIGUOUS']:
            # useful to developer
            warnings.warn('Target variates not C_CONTIGUOUS.')
            targetvars_normalized = np.ascontiguousarray(
                targetvars_normalized)
        beta_targetvars_covars = np.dot(targetvars_normalized,
                                        covars_orthonormalized)
        targetvars_resid_covars = targetvars_normalized - np.dot(
            beta_targetvars_covars, covars_orthonormalized.T)
        targetvars_resid_covars = normalize_matrix_on_axis(
            targetvars_resid_covars, axis=1)
        # step 2: extract effect of covars from tested vars
        testedvars_normalized = normalize_matrix_on_axis(tested_vars.T,
                                                         axis=1)
        beta_testedvars_covars = np.dot(testedvars_normalized,
                                        covars_orthonormalized)
        testedvars_resid_covars = testedvars_normalized - np.dot(
            beta_testedvars_covars, covars_orthonormalized.T)
        testedvars_resid_covars = normalize_matrix_on_axis(
            testedvars_resid_covars, axis=1).T.copy()
    else:
        targetvars_resid_covars = normalize_matrix_on_axis(target_vars).T
        testedvars_resid_covars = normalize_matrix_on_axis(
            tested_vars).copy()
        covars_orthonormalized = None
    # check arrays contiguousity (for the sake of code efficiency)
    if not targetvars_resid_covars.flags['C_CONTIGUOUS']:
        # useful to developer
        warnings.warn('Target variates not C_CONTIGUOUS.')
        targetvars_resid_covars = np.ascontiguousarray(
            targetvars_resid_covars)
    if not testedvars_resid_covars.flags['C_CONTIGUOUS']:
        # useful to developer
        warnings.warn('Tested variates not C_CONTIGUOUS.')
        testedvars_resid_covars = np.ascontiguousarray(
            testedvars_resid_covars)
    # step 3: original regression (= regression on residuals + adjust F score)
    # compute F score for original data
    scores_original_data = _f_score_with_covars_and_normalized_design(
        testedvars_resid_covars, targetvars_resid_covars.T,
        covars_orthonormalized)

    ### Permutations
    # parallel computing units perform a reduced number of permutations each
    if n_perm > n_jobs:
        # integer division: equal chunks, the remainder goes to the last
        n_perm_chunks = np.asarray([n_perm // n_jobs] * n_jobs, dtype=int)
        n_perm_chunks[-1] += n_perm % n_jobs
    elif n_perm > 0:
        n_perm_chunks = np.ones(n_perm, dtype=int)
    else:  # 0 or negative number of permutations => original data scores only
        return np.asarray([]), scores_original_data, np.asarray([])
    # actual permutations, seeded from a random integer between 0 and maximum
    # value represented by np.int32 (to have a large entropy).
    ret = joblib.Parallel(n_jobs=n_jobs)(
        joblib.delayed(_permuted_ols_on_chunk)(
            scores_original_data, testedvars_resid_covars,
            targetvars_resid_covars.T, covars_orthonormalized,
            n_perm_chunk=n_perm_chunk, intercept_test=intercept_test,
            random_state=rng.random_integers(np.iinfo(np.int32).max))
        for n_perm_chunk in n_perm_chunks)
    # reduce results
    scores_as_ranks_parts, h0_fmax_parts = zip(*ret)
    h0_fmax = np.hstack((h0_fmax_parts))
    scores_as_ranks = np.zeros((n_regressors, n_descriptors))
    for scores_as_ranks_part in scores_as_ranks_parts:
        scores_as_ranks += scores_as_ranks_part
    # convert ranks into p-values
    pvals = (n_perm + 1 - scores_as_ranks) / float(1 + n_perm)

    return -np.log10(pvals), scores_original_data, h0_fmax[0]
def step(self, X, y, space_id, groups=None, n_jobs=1):
    """Generate n_jobs parameters and evaluate them in parallel.

    Having a separate function for a single step for search allows to
    save easily checkpoints for the parameter search and restore from
    possible failures.

    Parameters
    ----------
    X : array-like or sparse matrix, shape = [n_samples, n_features]
        The training input samples.

    y : array-like, shape = [n_samples] or [n_samples, n_outputs]
        The target values (class labels) as integers or strings.

    space_id : hashable
        Identifier of the parameter search space; must be a key of
        ``self.search_spaces_``.

    groups : array-like, with shape (n_samples,), optional
        Group labels for the samples used while splitting the dataset into
        train/test set.

    n_jobs : int, default=1
        Number of parameters to evaluate in parallel. A negative value
        is resolved against the CPU count: -1 means all CPUs, -i all
        but (i - 1).

    Returns
    -------
    None
        Results are recorded on the instance (``cv_results_``,
        ``best_index_``, the optimizer state); nothing is returned.

    Raises
    ------
    ValueError
        If ``space_id`` is not a known search space.
    """
    # convert n_jobs to int > 0 if necessary
    if n_jobs < 0:
        n_jobs = max(1, cpu_count() + n_jobs + 1)

    # use the cached optimizer for particular parameter space
    if space_id not in self.search_spaces_:
        raise ValueError("Unknown space %s" % space_id)

    # get the search space for a step; a tuple entry carries the space
    # first and a second element that is not needed here
    search_space = self.search_spaces_[space_id]
    if isinstance(search_space, tuple):
        search_space, _ = search_space

    # create optimizer if not created already
    if space_id not in self.optimizer_:
        self.optimizer_[space_id] = self._make_optimizer(search_space)
    optimizer = self.optimizer_[space_id]

    # get parameter values to evaluate
    params = optimizer.ask(n_points=n_jobs)
    params_dict = [point_asdict(search_space, p) for p in params]

    # self.cv_results_ is reset at every call to _fit, keep current
    all_cv_results = self.cv_results_

    # record performances with different points
    refit = self.refit
    self.refit = False  # do not fit yet - will be fit later
    try:
        # this adds compatibility with different versions of sklearn
        self._fit(X, y, groups, params_dict)
    finally:
        # restore the user's setting even when the evaluation fails;
        # otherwise a failed step would leave refit switched off
        self.refit = refit

    # merge existing and new cv_results_
    for k in self.cv_results_:
        all_cv_results[k].extend(self.cv_results_[k])

    self.cv_results_ = all_cv_results
    self.best_index_ = np.argmax(self.cv_results_['mean_test_score'])

    # feed the point and objective back into optimizer
    local_results = self.cv_results_['mean_test_score'][-len(params):]

    # optimizer minimizes objective, hence provide negative score
    optimizer.tell(params, [-score for score in local_results])

    # fit the best model if necessary
    if self.refit:
        self._fit_best_model(X, y)
def fit(self, X, Y, constraints=None):
    """Learn parameters using subgradient descent.

    Parameters
    ----------
    X : iterable
        Training instances. Contains the structured input objects.
        No requirement on the particular form of entries of X is made
        for ``n_jobs == 1``; the batched path slices ``X`` and ``Y``.

    Y : iterable
        Training labels. Contains the structured labels for inputs in X.
        Needs to have the same length as X.

    constraints : None
        Discarded. Only for API compatibility currently.

    Returns
    -------
    self
    """
    print("Training primal subgradient structural SVM")
    w = getattr(self, "w", np.zeros(self.problem.size_psi))
    loss_curve = []
    objective_curve = []
    n_samples = len(X)
    try:
        # catch ctrl+c to stop training
        # `range` behaves the same on Python 2 and 3 here.
        for iteration in range(self.max_iter):
            positive_slacks = 0
            objective = 0.
            verbose = max(0, self.verbose - 3)

            if self.n_jobs == 1:
                # online learning: one subgradient step per sample
                for x, y in zip(X, Y):
                    y_hat, delta_psi, slack, loss = \
                        find_constraint(self.problem, x, y, w)
                    objective += slack
                    if slack > 0:
                        positive_slacks += 1
                    w = self._solve_subgradient(w, delta_psi, n_samples)
            else:
                # generate batches of size n_jobs
                # to speed up inference
                if self.n_jobs == -1:
                    n_jobs = cpu_count()
                else:
                    # was `self.j_jobs`: a typo that raised AttributeError
                    n_jobs = self.n_jobs

                n_batches = int(np.ceil(float(len(X)) / n_jobs))
                slices = gen_even_slices(n_samples, n_batches)
                for batch in slices:
                    X_b = X[batch]
                    Y_b = Y[batch]
                    candidate_constraints = Parallel(
                        n_jobs=self.n_jobs,
                        verbose=verbose)(delayed(find_constraint)(
                            self.problem, x, y, w)
                            for x, y in zip(X_b, Y_b))
                    dpsi = np.zeros(self.problem.size_psi)
                    for x, y, constraint in zip(X_b, Y_b,
                                                candidate_constraints):
                        y_hat, delta_psi, slack, loss = constraint
                        objective += slack
                        dpsi += delta_psi
                        if slack > 0:
                            positive_slacks += 1
                    # one averaged subgradient step per batch
                    dpsi /= float(len(X_b))
                    w = self._solve_subgradient(w, dpsi, n_samples)

            # some statistics
            objective /= len(X)
            objective += np.sum(w ** 2) / self.C / 2.
            if positive_slacks == 0:
                print("No additional constraints")
                break
            if self.verbose > 0:
                print(self)
                print("iteration %d" % iteration)
                print("positive slacks: %d,"
                      "objective: %f" %
                      (positive_slacks, objective))
            objective_curve.append(objective)

            if self.verbose > 2:
                print(w)

            self._compute_training_loss(X, Y, w, iteration)

    except KeyboardInterrupt:
        pass
    self.w = w
    self.loss_curve_ = loss_curve
    self.objective_curve_ = objective_curve
    # The curve is empty when no iteration completed (max_iter == 0, no
    # violated constraint in the first pass, or an early ctrl+c); indexing
    # it used to raise IndexError in that case.
    if objective_curve:
        print("final objective: %f" % objective_curve[-1])
    print("calls to inference: %d" % self.problem.inference_calls)
    return self
# Folder that receives the per-image predictions; created on first use.
testPredictFolder = os.path.join(rusinol.resultPath, 'prediction')
if not os.path.exists(testPredictFolder):
    os.makedirs(testPredictFolder)

# xml file location
xmlName = 'segmentation.xml'
testXmlCompletePath = os.path.join(rusinol.testPath, xmlName)

# scales
scaleList = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.5]

# now we have trained our classifier
imagePathSaveList = os.path.join(rusinol.testPath, 'imgPathList.pkl')
# the `with` block closes the file; no explicit close() is needed
with open(imagePathSaveList, 'rb') as pklFile:
    imgPathList = pickle.load(pklFile)
numOfImages = len(imgPathList)

# parallel processing: hand-picked job counts for 8 and 4 cores,
# otherwise half of the cores
numCores = joblib.cpu_count()
if numCores == 8:
    numJobs = 5
elif numCores == 4:
    numJobs = 2
else:
    # Integer division keeps n_jobs an int on Python 3, and the floor of
    # one avoids asking joblib for zero jobs on a single-core machine.
    numJobs = max(1, numCores // 2)
# one pre-formatted string: same text as the former Python 2 print statement
print('Running jobs in parallel on cores -  %s' % (numJobs,))

# parallel job scheduler: one task per image path
joblib.Parallel(n_jobs=numJobs)(
    joblib.delayed(parallelImageProcessing)(
        rusinol.cellSize, rusinol.stepSize, scaleList, testPredictFolder,
        pixelTextProbPredictorWeights, feStandardized, scalesFromMSER,
        eachImgPath)
    for eachImgPath in imgPathList)

# exit the script
exit()
def fit(self, X, Y, H_init=None, warm_start=False, initialize=True):
    """Learn parameters using subgradient descent.

    With ``n_jobs == 1`` the weights are updated after every sample
    (online learning). Otherwise the data is cut into batches of about
    ``n_jobs`` samples, inference for a batch runs in parallel and the
    weights are updated once per batch with the averaged subgradient.

    Parameters
    ----------
    X : iterable
        Training instances. Contains the structured input objects.
        The batch mode slices ``X``, so it must support slice indexing
        when ``n_jobs != 1``.

    Y : iterable
        Training labels. Contains the structured labels for inputs in X.
        Needs to have the same length as X.

    H_init : object, optional
        Not read by this method; accepted for API compatibility.

    warm_start : boolean, default=False
        Whether to restart a previous fit.

    initialize : boolean, default=True
        Whether to initialize the model for the data.
        Leave this true except if you really know what you are doing.

    Returns
    -------
    self
        The estimator, with ``timestamps_`` and ``objective_curve_``
        extended by this run.
    """
    if self.verbose > 0:
        print("Training latent subgradient structural SVM")
    if initialize:
        self.model.initialize(X, Y)
    self.grad_old = np.zeros(self.model.size_joint_feature)
    if not warm_start:
        self.w = getattr(self, "w", np.random.normal(
            0, 1, size=self.model.size_joint_feature))
        self.timestamps_ = [time()]
        self.objective_curve_ = []
        if self.learning_rate == "auto":
            self.learning_rate_ = self.C * len(X)
        else:
            self.learning_rate_ = self.learning_rate
    else:
        # hackety hack
        self.timestamps_[0] = time() - self.timestamps_[-1]
    w = self.w.copy()
    n_samples = len(X)
    try:
        # catch ctrl+c to stop training
        for iteration in xrange(self.max_iter):
            self.timestamps_.append(time() - self.timestamps_[0])
            positive_slacks = 0
            objective = 0.

            if self.n_jobs == 1:
                # online learning
                for x, y in zip(X, Y):
                    h = self.model.latent(x, y, w)
                    h_hat = self.model.loss_augmented_inference(
                        x, h, w, relaxed=True)
                    delta_joint_feature = (
                        self.model.joint_feature(x, h)
                        - self.model.joint_feature(x, h_hat))
                    slack = (-np.dot(delta_joint_feature, w)
                             + self.model.loss(h, h_hat))
                    objective += np.maximum(slack, 0)
                    if slack > 0:
                        positive_slacks += 1
                    w = self._solve_subgradient(
                        delta_joint_feature, n_samples, w)
            else:
                # generate batches of size n_jobs
                # to speed up inference
                n_jobs = self.n_jobs
                if n_jobs < 0:
                    # joblib convention: -1 means all CPUs, -2 all but
                    # one, and so on; never go below one worker.
                    n_jobs = max(1, cpu_count() + 1 + n_jobs)

                n_batches = int(np.ceil(float(len(X)) / n_jobs))
                slices = gen_even_slices(n_samples, n_batches)
                for batch in slices:
                    X_b = X[batch]
                    Y_b = Y[batch]
                    verbose = self.verbose - 1
                    candidate_constraints = Parallel(
                        n_jobs=self.n_jobs,
                        verbose=verbose)(delayed(find_constraint_latent)(
                            self.model, x, y, w)
                            for x, y in zip(X_b, Y_b))
                    djoint_feature = np.zeros(self.model.size_joint_feature)
                    for constraint in candidate_constraints:
                        y_hat, delta_joint_feature, slack, loss = constraint
                        objective += slack
                        djoint_feature += delta_joint_feature
                        if slack > 0:
                            positive_slacks += 1
                    # average the subgradient over the batch
                    djoint_feature /= float(len(X_b))
                    w = self._solve_subgradient(
                        djoint_feature, n_samples, w)

            # some statistics
            objective *= self.C
            # NOTE(review): this reads self.w rather than the local w;
            # presumably _solve_subgradient keeps self.w up to date --
            # confirm against that method.
            objective += np.sum(self.w ** 2) / 2.

            if positive_slacks == 0:
                print("No additional constraints")
                if self.break_on_no_constraints:
                    break
            if self.verbose > 0:
                print(self)
                print("iteration %d" % iteration)
                print("positive slacks: %d, "
                      "objective: %f" %
                      (positive_slacks, objective))
            self.objective_curve_.append(objective)

            if self.verbose > 2:
                print(self.w)

            self._compute_training_loss(X, Y, iteration)
            if self.logger is not None:
                self.logger(self, iteration)

    except KeyboardInterrupt:
        pass
    self.timestamps_.append(time() - self.timestamps_[0])
    self.objective_curve_.append(self._objective(X, Y))
    if self.logger is not None:
        self.logger(self, 'final')
    if self.verbose:
        if self.objective_curve_:
            print("final objective: %f" % self.objective_curve_[-1])
    if self.verbose and self.n_jobs == 1:
        print("calls to inference: %d" % self.model.inference_calls)
    return self
def permuted_ols(tested_vars, target_vars, confounding_vars=None,
                 model_intercept=True, n_perm=10000,
                 random_state=None, n_jobs=1):
    """Massively univariate group analysis with permuted OLS.

    Tested variates are independently fitted to target variates descriptors
    (e.g. brain imaging signal) according to a linear model solved with an
    Ordinary Least Squares criterion.
    Confounding variates may be included in the model.
    Permutation testing is used to assess the significance of the relationship
    between the tested variates and the target variates [1, 2].
    A max-type procedure is used to obtain family-wise corrected p-values.

    The specific permutation scheme implemented here is the one of
    Freedman & Lane [3]. It has been demonstrated in [1] that this
    scheme conveys more sensitivity than alternative schemes. This holds
    for neuroimaging applications, as discussed in details in [2].

    Permutations are performed on parallel computing units. Each of them
    performs a fraction of permutations on the whole dataset. Thus, the max
    F-score amongst data descriptors can be computed directly, which avoids
    storing all the computed F-scores.

    The variates should be given C-contiguous. target_vars are
    fortran-ordered automatically to speed-up computations.

    Parameters
    ----------
    tested_vars : array-like, shape=(n_samples, n_regressors)
      Explanatory variates, fitted and tested independently from each others.
      A 1D array is treated as a single regressor.

    target_vars : array-like, shape=(n_samples, n_descriptors)
      fMRI data, trying to be explained by explanatory and confounding
      variates. Must be 2D.

    confounding_vars : array-like, shape=(n_samples, n_covars)
      Confounding variates (covariates), fitted but not tested.
      If None, no confounding variate is added to the model
      (except maybe a constant column according to the value of
      `model_intercept`)

    model_intercept : bool,
      If True, a constant column is added to the confounding variates
      unless the tested variate is already the intercept.

    n_perm : int,
      Number of permutations to perform.
      Permutations are costly but the more are performed, the more precision
      one gets in the p-values estimation. If zero or negative, only the
      scores on the original data are computed.

    random_state : int or None,
      Seed for random number generator, to have the same permutations
      in each computing units.

    n_jobs : int,
      Number of parallel workers, following joblib's conventions:
      -1 uses all CPUs and, more generally, a negative number -i uses
      all but (i - 1) CPUs (at least one). Positive values are capped at
      the number of available CPUs. 0 is not a valid value.

    Returns
    -------
    pvals : array-like, shape=(n_regressors, n_descriptors)
      Negative log10 p-values associated with the significance test of the
      n_regressors explanatory variates against the n_descriptors target
      variates. Family-wise corrected p-values. Empty array when
      `n_perm` <= 0.

    score_orig_data : numpy.ndarray, shape=(n_regressors, n_descriptors)
      F-statistic associated with the significance test of the n_regressors
      explanatory variates against the n_descriptors target variates.
      The ranks of the scores into the h0 distribution correspond to the
      p-values.

    h0_fmax : array-like
      First row of the horizontally stacked per-chunk null distributions
      of the (max) F-statistic obtained from the permutations (described
      upstream as a sorted array of shape (n_perm, )). Empty array when
      `n_perm` <= 0.

    Raises
    ------
    ValueError
      If `n_jobs` is 0 or if `target_vars` is not a 2D array.

    References
    ----------
    [1] Anderson, M. J. & Robinson, J. (2001).
        Permutation tests for linear models.
        Australian & New Zealand Journal of Statistics, 43(1), 75-88.
        (http://avesbiodiv.mncn.csic.es/estadistica/permut2.pdf)

    [2] Winkler, A. M. et al. (2014).
        Permutation inference for the general linear model. Neuroimage.

    [3] Freedman, D. & Lane, D. (1983).
        A nonstochastic interpretation of reported significance levels.
        J. Bus. Econ. Stats., 1(4), 292-298

    """
    # initialize the seed of the random generator
    rng = check_random_state(random_state)

    # check n_jobs (number of CPUs)
    if n_jobs == 0:  # invalid according to joblib's conventions
        raise ValueError("'n_jobs == 0' is not a valid choice. "
                         "Please provide a positive number of CPUs, or -1 "
                         "for all CPUs, or a negative number (-i) for "
                         "'all but (i-1)' CPUs (joblib conventions).")
    elif n_jobs < 0:
        # n_jobs is negative, so it must be *added*: -1 -> all CPUs,
        # -2 -> all but one, ... (never fewer than one worker).
        n_jobs = max(1, joblib.cpu_count() + int(n_jobs) + 1)
    else:
        n_jobs = min(n_jobs, joblib.cpu_count())

    # make target_vars F-ordered to speed-up computation
    if target_vars.ndim != 2:
        raise ValueError("'target_vars' should be a 2D array. "
                         "An array with %d dimension%s was passed"
                         % (target_vars.ndim,
                            "s" if target_vars.ndim > 1 else ""))
    target_vars = np.asfortranarray(target_vars)  # efficient for chunking
    n_descriptors = target_vars.shape[1]

    # check explanatory variates dimensions
    if tested_vars.ndim == 1:
        tested_vars = np.atleast_2d(tested_vars).T
    n_samples, n_regressors = tested_vars.shape

    # check if explanatory variates is intercept (constant) or not
    intercept_test = (n_regressors == 1
                      and np.unique(tested_vars).size == 1)

    # optionally add intercept
    if model_intercept and not intercept_test:
        if confounding_vars is not None:
            confounding_vars = np.hstack(
                (confounding_vars, np.ones((n_samples, 1))))
        else:
            confounding_vars = np.ones((n_samples, 1))

    ### OLS regression on original data
    if confounding_vars is not None:
        # step 1: extract effect of covars from target vars
        covars_orthonormalized = orthonormalize_matrix(confounding_vars)
        if not covars_orthonormalized.flags['C_CONTIGUOUS']:
            # useful to developer
            warnings.warn('Confounding variates not C_CONTIGUOUS.')
            covars_orthonormalized = np.ascontiguousarray(
                covars_orthonormalized)
        targetvars_normalized = normalize_matrix_on_axis(
            target_vars).T  # faster with F-ordered target_vars_chunk
        if not targetvars_normalized.flags['C_CONTIGUOUS']:
            # useful to developer
            warnings.warn('Target variates not C_CONTIGUOUS.')
            targetvars_normalized = np.ascontiguousarray(targetvars_normalized)
        beta_targetvars_covars = np.dot(targetvars_normalized,
                                        covars_orthonormalized)
        targetvars_resid_covars = targetvars_normalized - np.dot(
            beta_targetvars_covars, covars_orthonormalized.T)
        targetvars_resid_covars = normalize_matrix_on_axis(
            targetvars_resid_covars, axis=1)
        # step 2: extract effect of covars from tested vars
        testedvars_normalized = normalize_matrix_on_axis(tested_vars.T, axis=1)
        beta_testedvars_covars = np.dot(testedvars_normalized,
                                        covars_orthonormalized)
        testedvars_resid_covars = testedvars_normalized - np.dot(
            beta_testedvars_covars, covars_orthonormalized.T)
        testedvars_resid_covars = normalize_matrix_on_axis(
            testedvars_resid_covars, axis=1).T.copy()
    else:
        targetvars_resid_covars = normalize_matrix_on_axis(target_vars).T
        testedvars_resid_covars = normalize_matrix_on_axis(tested_vars).copy()
        covars_orthonormalized = None

    # check arrays contiguousity (for the sake of code efficiency)
    if not targetvars_resid_covars.flags['C_CONTIGUOUS']:
        # useful to developer
        warnings.warn('Target variates not C_CONTIGUOUS.')
        targetvars_resid_covars = np.ascontiguousarray(targetvars_resid_covars)
    if not testedvars_resid_covars.flags['C_CONTIGUOUS']:
        # useful to developer
        warnings.warn('Tested variates not C_CONTIGUOUS.')
        testedvars_resid_covars = np.ascontiguousarray(testedvars_resid_covars)

    # step 3: original regression (= regression on residuals + adjust F score)
    # compute F score for original data
    scores_original_data = _f_score_with_covars_and_normalized_design(
        testedvars_resid_covars, targetvars_resid_covars.T,
        covars_orthonormalized)

    ### Permutations
    # parallel computing units perform a reduced number of permutations each
    if n_perm > n_jobs:
        # equal share per worker, the remainder goes to the last one
        n_perm_chunks = np.asarray([n_perm // n_jobs] * n_jobs, dtype=int)
        n_perm_chunks[-1] += n_perm % n_jobs
    elif n_perm > 0:
        # fewer permutations than workers: one permutation per chunk
        n_perm_chunks = np.ones(n_perm, dtype=int)
    else:  # 0 or negative number of permutations => original data scores only
        return np.asarray([]), scores_original_data, np.asarray([])

    # actual permutations, seeded from a random integer between 0 and maximum
    # value represented by np.int32 (to have a large entropy).
    ret = joblib.Parallel(n_jobs=n_jobs)(
        joblib.delayed(_permuted_ols_on_chunk)(
            scores_original_data, testedvars_resid_covars,
            targetvars_resid_covars.T, covars_orthonormalized,
            n_perm_chunk=n_perm_chunk, intercept_test=intercept_test,
            random_state=rng.random_integers(np.iinfo(np.int32).max))
        for n_perm_chunk in n_perm_chunks)

    # reduce results
    scores_as_ranks_parts, h0_fmax_parts = zip(*ret)
    h0_fmax = np.hstack((h0_fmax_parts))
    scores_as_ranks = np.zeros((n_regressors, n_descriptors))
    for scores_as_ranks_part in scores_as_ranks_parts:
        scores_as_ranks += scores_as_ranks_part

    # convert ranks into p-values
    pvals = (n_perm + 1 - scores_as_ranks) / float(1 + n_perm)

    return - np.log10(pvals), scores_original_data, h0_fmax[0]
def wheel_up_features_bfs (initialBunch, trees, factory, loss, learning_rate, nIters, trees_sample_size, verbose = True, learning_rate_decay = 1., trees_sample_increase = 0, regularizer = 0., random_walk = True, use_joblib = False, n_jobs = -1, joblib_method = "threads", copy_pred = False): """ Iterative BFS over best ADD-1 results for [nTrees] iterations """ allTrees = copy.copy(trees) bunch = copy.copy(initialBunch) pred = factory.predict(bunch) bestScore = sum(loss(factory,pred)) if use_joblib: if n_jobs < 0: n_jobs = joblib.cpu_count() if joblib_method == "threads": #create copies of data once to escape GIL forever factory = [copy.deepcopy(factory) for i in range(n_jobs)] loss = [copy.deepcopy(loss) for i in range(n_jobs)] elif joblib_method == "processes": pass else: raise ValueError, "joblib_method must be either 'threads' or 'processes'" if verbose: print "\niteration #",0," ntrees = ", len(bunch),"\nbest loss = ",bestScore print "learning_rate = ", learning_rate print "sample_size", trees_sample_size for itr in xrange(1,nIters+1): change_index= random.randint(0,len(bunch)-1) if random_walk else (i-1)%len(bunch) trees_sample = random.sample(allTrees,trees_sample_size)+ [bunch[change_index]] bunch_wo = copy.copy(bunch) bunch_wo.pop(change_index) if use_joblib and joblib_method=="threads": #split trees into sections indices = [0]+[len(trees_sample)*(i+1)/n_jobs for i in range(n_jobs)] treeSections = [trees_sample[indices[i]:indices[i+1]] for i in range(n_jobs)] pred_wo = pred - factory[0].predict([bunch[change_index]]) if copy_pred: pred_wo = [copy.deepcopy(pred) for i in range(n_jobs)] else: pred_wo = [pred for i in range(n_jobs)] #execute sections in parallel tasks = [joblib.delayed(try_add1_bfs)(treeSections[ithread],factory[ithread], learning_rate,loss[ithread], 1,pred_wo[ithread],regularizer=regularizer, use_joblib=False) for ithread in range(n_jobs)] _res = joblib.Parallel(n_jobs = n_jobs, backend = "threading")(tasks) _additions,newScores,newPreds = 
reduce(lambda a,b:[a[i]+b[i] for i in range(3)], _res) else: pred_wo = pred - factory.predict([bunch[change_index]]) _additions,newScores,newPreds = try_add1_bfs(trees_sample,factory, learning_rate,loss, 1,pred_wo,regularizer=regularizer, use_joblib=use_joblib,n_jobs=n_jobs) newBunches = [bunch_wo+[_added] for _added in _additions] learning_rate *= learning_rate_decay trees_sample_size = min(len(allTrees),trees_sample_size + trees_sample_increase) triples = zip(newScores,newBunches,newPreds) triples.sort(key = lambda el: el[0]) newBestScore = min(newScores) if newBestScore > bestScore: pass else: bestScore = newBestScore bunch = triples[0][1] bunch.insert(change_index,bunch.pop()) pred = triples[0][2] if verbose: print "\niteration #",itr," ntrees = ", len(bunch),"\nbest loss = ", bestScore,"\nlast loss = ",newBestScore print "changed index",change_index print "learning_rate = ", learning_rate print "sample_size", trees_sample_size return bunch
def __init__(self, *files, columns=None, ngrams=2, decap=False, patterns=None, mask=None):
    """
    Create a new data object with the following attributes:

    * instances - list of raw text instances
    * labels - array of instance labels in same order as raw text
    * features - matrix of feature vectors per text instance
    * names - array of feature names in same order as features

    Both features and names are undefined until extracted
    using some Vectorizer.

    Each input file is one class; classes are ordered by size so that
    the smallest class gets label 0.

    Exclusive options for either BIO-NER vs. plain-text input:

    1. **BIO-NER** parameters: Define a `columns` integer to define the
       number of disregarded columns and thereby declare that the input
       will be in BIO-NER format. In addition, the `ngrams` option can
       be set to define the ngram size of the tokens to generate.
       All other keyword parameters will be ignored.

    2. **plain-text** keyword parameters: Set `decap=True` to lower-case
       the first letter of each plain-text line (empty lines are dropped
       in that case). Use a list of regex `patterns` and a replacement
       string `mask` to "mask" pattern-matched words in regular
       (non-`columns`) input.

    The process exits with status 1 if an input file cannot be decoded.
    """
    try:
        if columns is None:
            inputs = [[l.strip('\r\n') for l in f] for f in files]

            if decap:
                inputs = [
                    ["{}{}".format(l[0].lower(), l[1:]) for l in lines if len(l)]
                    for lines in inputs
                ]

            if patterns and mask:
                self.instances = []
                splits = joblib.cpu_count()

                for lines in inputs:
                    # Hand each worker a *contiguous* chunk so that the
                    # chained results come back in input order and pair
                    # up with the raw lines below (a strided split would
                    # shuffle them).
                    size = max(1, -(-len(lines) // splits))  # ceil division
                    chunks = [lines[start:start + size]
                              for start in range(0, len(lines), size)]
                    masked = joblib.Parallel(n_jobs=splits)(
                        delayed(subAll)(patterns, mask, chunk) for chunk in chunks
                    )
                    self.instances.append(list(zip(lines, chain(*masked))))
            else:
                # no masking: the instance text is the raw line itself
                self.instances = [list(zip(lines, lines)) for lines in inputs]
        else:
            self.instances = []

            for f in files:
                # FIXME: instead of two hardcoded entity masks,
                # FIXME: this has to be dynamic or generic...
                sentences = SentenceParser(f, ('FACTOR', 'TARGET'), id_columns=columns)

                if not columns:
                    # no ID columns: number the sentences instead
                    sentences = list(enumerate(sentences, start=1))

                data = [(sid, asDict(s, ngrams)) for sid, s in sentences]
                self.instances.append(data)
    except UnicodeDecodeError as e:
        import sys
        print('decoding error:', e.reason, 'in input file')
        sys.exit(1)

    # ensure the minority label(s) come first (important for the evaluation, too!)
    self.instances = sorted(self.instances, key=len)

    self.classes = len(self.instances)
    self.labels = np.concatenate([
        (np.zeros(len(data), dtype=np.uint8) + i)
        for i, data in enumerate(self.instances)
    ])
    self.ids = None
    self.raw = None
    self.features = None
    self.names = None

    if columns is None:
        # flatten the per-class lists into parallel raw/instance tuples
        self.raw, self.instances = zip(*list(chain.from_iterable(self.instances)))

        if len(self.raw) and '\t' in self.raw[0]:
            # tab-separated input: the first field is the instance ID
            self.ids = [l.split('\t', 1)[0] for l in self.raw]
        else:
            self.ids = self.raw
    else:
        self.ids, self.instances = zip(*list(chain.from_iterable(self.instances)))
def dict_learning_online(X, n_components=2, alpha=1, n_iter=100,
                         return_code=True, dict_init=None, callback=None,
                         batch_size=3, verbose=False, shuffle=True, n_jobs=1,
                         method='lars', iter_offset=0, random_state=None,
                         return_inner_stats=False, inner_stats=None,
                         return_n_iter=False):
    """Solves a dictionary learning matrix factorization problem online.

    Finds the best dictionary and the corresponding sparse code for
    approximating the data matrix X by solving::

        (U^*, V^*) = argmin 0.5 || X - U V ||_2^2 + alpha * || U ||_1
                     (U,V)
                     with || V_k ||_2 = 1 for all  0 <= k < n_components

    where V is the dictionary and U is the sparse code. This is
    accomplished by repeatedly iterating over mini-batches by slicing
    the input data.

    NOTE(review): compared with the formulation above, the statistics
    updates in the loop below carry additional ``2 * alpha`` terms built
    from the absolute values of the code; the objective they correspond
    to is not stated in this file -- confirm with the author.

    Read more in the :ref:`User Guide <DictionaryLearning>`.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        Data matrix.

    n_components : int,
        Number of dictionary atoms to extract. If None, the number of
        features of X is used.

    alpha : float,
        Sparsity controlling parameter.

    n_iter : int,
        Number of iterations to perform.

    return_code : boolean,
        Whether to also return the code U or just the dictionary V.

    dict_init : array of shape (n_components, n_features),
        Initial value for the dictionary for warm restart scenarios.
        If None, the dictionary is initialized from the rows of X. In
        both cases the initial dictionary is truncated to, or padded
        with zero rows up to, n_components rows.

    callback : callable or None, optional (default: None)
        callable that gets invoked at the end of every iteration with
        the dict returned by ``locals()`` as its only argument

    batch_size : int,
        The number of samples to take in each batch.

    verbose : bool, optional (default: False)
        To control the verbosity of the procedure.

    shuffle : boolean,
        Whether to shuffle the data before splitting it in batches.

    n_jobs : int,
        Number of parallel jobs to run, or -1 to use the value returned
        by cpu_count().

    method : {'lars', 'cd'}
        lars: uses the least angle regression method to solve the lasso problem
        (linear_model.lars_path)
        cd: uses the coordinate descent method to compute the
        Lasso solution (linear_model.Lasso). Lars will be faster if
        the estimated components are sparse.

    iter_offset : int, default 0
        Number of previous iterations completed on the dictionary used for
        initialization.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    return_inner_stats : boolean, optional
        Return the inner statistics A (dictionary covariance) and B
        (data approximation). Useful to restart the algorithm in an
        online setting. If return_inner_stats is True, return_code is
        ignored

    inner_stats : tuple of (A, B) ndarrays
        Inner sufficient statistics that are kept by the algorithm.
        Passing them at initialization is useful in online settings, to
        avoid losing the history of the evolution.
        A (n_components, n_components) is the dictionary covariance matrix.
        B (n_features, n_components) is the data approximation matrix.
        Both are copied before being updated.

    return_n_iter : bool
        Whether or not to return the number of iterations.

    Returns
    -------
    code : array of shape (n_samples, n_components),
        the sparse code (only returned if `return_code=True` and
        `return_inner_stats=False`)

    dictionary : array of shape (n_components, n_features),
        the solutions to the dictionary learning problem

    inner_stats : tuple of (A, B) ndarrays
        Returned right after the dictionary, only if
        `return_inner_stats=True`.

    n_iter : int
        Number of iterations run. Returned only if `return_n_iter` is
        set to `True`.

    Raises
    ------
    ValueError
        If `method` is neither 'lars' nor 'cd'.

    See also
    --------
    dict_learning
    DictionaryLearning
    MiniBatchDictionaryLearning
    SparsePCA
    MiniBatchSparsePCA

    """
    if n_components is None:
        n_components = X.shape[1]

    if method not in ('lars', 'cd'):
        raise ValueError('Coding method not supported as a fit algorithm.')
    # name of the algorithm as expected by sparse_encode
    method = 'lasso_' + method

    t0 = time.time()
    n_samples, n_features = X.shape
    # Avoid integer division problems
    alpha = float(alpha)
    random_state = check_random_state(random_state)

    if n_jobs == -1:
        n_jobs = cpu_count()

    # Init V from dict_init if given, otherwise from the rows of X
    if dict_init is not None:
        dictionary = dict_init
    else:
        # Previous SVD-based initialization, currently disabled:
        # _, S, dictionary = randomized_svd(X, n_components,
        #                                   random_state=random_state)
        # dictionary = S[:, np.newaxis] * dictionary
        # NOTE: this message is printed regardless of `verbose`.
        print("init dictionary with shape:", X.shape)
        dictionary = np.array(X)
    # Truncate or zero-pad the initial dictionary to n_components rows
    r = len(dictionary)
    if n_components <= r:
        dictionary = dictionary[:n_components, :]
    else:
        dictionary = np.r_[dictionary,
                           np.zeros((n_components - r, dictionary.shape[1]))]

    if verbose == 1:
        print('init dic:', dictionary)
        print('[dict_learning]', end=' ')

    # Shuffle a copy so that the caller's X keeps its row order
    if shuffle:
        X_train = X.copy()
        random_state.shuffle(X_train)
    else:
        X_train = X

    # From here on `dictionary` is stored transposed, i.e. with shape
    # (n_features, n_components); it is transposed back wherever it is
    # passed to sparse_encode or returned.
    dictionary = check_array(dictionary.T, order='F', dtype=np.float64,
                             copy=False)
    X_train = check_array(X_train, order='C', dtype=np.float64, copy=False)

    # Endless stream of batch slices over the training rows
    batches = gen_batches(n_samples, batch_size)
    batches = itertools.cycle(batches)

    # The covariance of the dictionary
    if inner_stats is None:
        A = np.zeros((n_components, n_components))
        # The data approximation
        B = np.zeros((n_features, n_components))
    else:
        A = inner_stats[0].copy()
        B = inner_stats[1].copy()

    # If n_iter is zero, we need to return zero.
    ii = iter_offset - 1

    for ii, batch in zip(range(iter_offset, iter_offset + n_iter), batches):
        this_X = X_train[batch]
        dt = (time.time() - t0)
        if verbose == 1:
            sys.stdout.write(".")
            sys.stdout.flush()
        elif verbose:
            if verbose > 10 or ii % ceil(100. / verbose) == 0:
                print("Iteration % 3i (elapsed time: % 3is, % 4.1fmn)"
                      % (ii, dt, dt / 60))

        # Sparse code of the current batch, transposed to
        # (n_components, batch length)
        this_code = sparse_encode(this_X, dictionary.T, algorithm=method,
                                  alpha=alpha, n_jobs=n_jobs).T

        # Update the auxiliary variables
        if ii < batch_size - 1:
            theta = float((ii + 1) * batch_size)
        else:
            theta = float(batch_size ** 2 + ii + 1 - batch_size)
        # forgetting factor applied to the accumulated statistics
        beta = (theta + 1 - batch_size) / (theta + 1)

        A *= beta
        A += np.dot(this_code, this_code.T)
        # NOTE(review): np.diag of the ravelled code is a square matrix of
        # side this_code.size, which appears to match A's
        # (n_components, n_components) shape only when the batch holds a
        # single sample -- confirm the intended batch_size.
        A += np.diag((np.abs(this_code)).ravel()) * 2 * alpha
        B *= beta
        B += np.dot(this_X.T, this_code.T)
        B += 2 * alpha * np.dot(this_X.T, np.abs(this_code.T))

        # Update dictionary
        dictionary = _update_dict(dictionary, B, A, verbose=verbose,
                                  random_state=random_state)
        # XXX: Can the residuals be of any use?

        # Maybe we need a stopping criteria based on the amount of
        # modification in the dictionary
        # The callback receives every local variable of this function by
        # name, so local names here are part of the callback interface.
        if callback is not None:
            callback(locals())

    # `ii - iter_offset + 1` is the number of iterations actually run
    # (zero when the loop body never executed).
    if return_inner_stats:
        if return_n_iter:
            return dictionary.T, (A, B), ii - iter_offset + 1
        else:
            return dictionary.T, (A, B)
    if return_code:
        if verbose > 1:
            print('Learning code...', end=' ')
        elif verbose == 1:
            print('|', end=' ')
        # Encode the full, unshuffled X with the final dictionary
        code = sparse_encode(X, dictionary.T, algorithm=method, alpha=alpha,
                             n_jobs=n_jobs, check_input=False)
        if verbose > 1:
            dt = (time.time() - t0)
            print('done (total time: % 3is, % 4.1fmn)' % (dt, dt / 60))
        if return_n_iter:
            return code, dictionary.T, ii - iter_offset + 1
        else:
            return code, dictionary.T

    if return_n_iter:
        return dictionary.T, ii - iter_offset + 1
    else:
        return dictionary.T