def batch_update(parallel, data, row, col):
    """Apply one mini-batch gradient step to the factors and biases.

    Reads and updates the module-level arrays ``U``, ``V``, ``bu`` and ``bm``
    in place; with ``method == 'adagrad'`` the accumulators ``gdu``, ``gdv``,
    ``gdbu`` and ``gdbm`` are updated as well.  Any other value of the
    module-level ``method`` only computes the gradients and changes nothing.

    Parameters
    ----------
    parallel : callable
        Consumes an iterable of ``delayed`` calls and returns their results
        as an indexable sequence (presumably a ``joblib.Parallel`` instance).
    data : sequence
        Observations of the batch; ``data[k]`` is paired with ``row[k]`` and
        ``col[k]``.
    row, col : sequence of int
        Indices into ``U``/``bu`` and ``V``/``bm`` for every observation.
    """
    # NOTE(review): assumes row/col are index arrays, so these are copies and
    # every gradient below is evaluated at the pre-update parameters.
    U_ = U[row, :]
    V_ = V[col, :]
    bu_ = bu[row]
    bm_ = bm[col]

    def _gradients(grad_fn):
        # One task per observation, all sharing the same argument layout.
        return parallel(
            delayed(grad_fn)(data[i], U_[i, :], V_[i, :], bu_[i], bm_[i],
                             avg, C)
            for i in xrange(len(data)))

    du = _gradients(gred_u)
    dv = _gradients(gred_v)
    dbu = _gradients(gred_bu)
    dbm = _gradients(gred_bm)

    if method == 'sgd':
        for i in xrange(len(data)):
            U_[i, :] -= eta * du[i]
            V_[i, :] -= eta * dv[i]
            bu_[i] -= eta * dbu[i]
            bm_[i] -= eta * dbm[i]
        # Write the updated copies back into the shared parameters.
        for c, i in enumerate(row):
            U[i, :] = U_[c, :]
            bu[i] = bu_[c]
        for c, j in enumerate(col):
            V[j, :] = V_[c, :]
            bm[j] = bm_[c]
    elif method == 'adagrad':
        for c, i in enumerate(row):
            gdu[i] += np.dot(du[c], du[c])
            gdbu[i] += np.dot(dbu[c], dbu[c])
            U[i, :] -= eta * du[c] / sqrt(gdu[i] + epislon)
            bu[i] -= eta * dbu[c] / sqrt(gdbu[i] + epislon)
        for c, j in enumerate(col):
            gdv[j] += np.dot(dv[c], dv[c])
            gdbm[j] += np.dot(dbm[c], dbm[c])
            # Scale by the accumulator of column j (the original indexed it
            # with the stale row index ``i`` left over from the loop above).
            V[j, :] -= eta * dv[c] / sqrt(gdv[j] + epislon)
            bm[j] -= eta * dbm[c] / sqrt(gdbm[j] + epislon)
def executeWithStart(self, desc, function, data, *args, **kwargs):
    """Run ``function`` on partitions of ``data`` in parallel.

    ``data`` is partitioned by ``TaskSplitter.partition`` and ``function`` is
    called once per partition as
    ``function(chunk, startIndex=start, *args, **kwargs)``.

    Parameters
    ----------
    desc : str
        Text appended to the "Starting parallelization" task message.
    function : callable
        Worker; must accept a ``startIndex`` keyword argument.
    data : object
        Whatever ``TaskSplitter.partition`` accepts.
    *args, **kwargs
        Forwarded unchanged to every ``function`` call.

    Returns
    -------
    The list of per-partition results returned by the ``Parallel`` call.
    """
    # Split the work
    splitter = TaskSplitter()
    nbJobs, chunks, offsets = splitter.partition(self._nbParal, data)

    # Logging
    self.setTask(1, ("Starting parallelization : "+desc))

    # Unpacking empty ``args``/``kwargs`` is a no-op, so a single call form
    # covers every combination of extra arguments.
    runner = Parallel(n_jobs=nbJobs, temp_folder=self._tmpFolder,
                      verbose=self.verbosity)
    tasks = (delayed(function)(chunks[i], startIndex=offsets[i],
                               *args, **kwargs)
             for i in xrange(nbJobs))
    allData = runner(tasks)

    self.endTask()
    return allData
def orig_main(): if len(sys.argv) == 4: path, adjective, n_jobs = sys.argv[1:] n_jobs = int(n_jobs) print "Training the adjective %s for the phase %s" % ( adjective) loaded_features = load_adjective_phase(path) p = Parallel(n_jobs=n_jobs,verbose=10) p(delayed(orig_train_adjective_phase_classifier)(path, adjective, loaded_features)) elif len(sys.argv) == 3: path, n_jobs = sys.argv[1:] n_jobs = int(n_jobs) print "Training the all adjectives" loaded_features = load_adjective_phase(path) p = Parallel(n_jobs=n_jobs,verbose=10) p(delayed(orig_train_adjective_phase_classifier)(path, adjective, loaded_features) for adjective in adjectives) else: print "Usage:" print "%s path adjective n_jobs" % sys.argv[0] print "%s path n_jobs" % sys.argv[0] print "Path to the base directory"
def warmstart_all_parallel(x, y, x_test, y_test,
                           fname_in='results_softmax_regression_mnist',
                           fname_out='results_softmax_regression_warmstart_mnist',
                           model_type='softmax_regression',
                           w_diff_term_crit=0.0001,
                           learning_rate=0.0001,
                           regularizations = [100., 10., 1., 0.1, 0.01, 0.001, 0.],
                           n_jobs=47):
    """Warm-start one training run per (regularization, pretrained model).

    Loads the pickled pretrained models from ``fname_in``, trains every
    combination of a target regularization and a pretrained model in
    parallel, and pickles the list of results to ``fname_out``.

    Parameters
    ----------
    x, y, x_test, y_test
        Passed through unchanged to the training function.
    fname_in, fname_out : str
        Paths of the input and output pickle files.
    model_type : {'softmax_regression', 'linear_regression'}
        Selects ``tf_softmax_regression.train_softmax`` or
        ``tf_linear_regression.train``.
    w_diff_term_crit, learning_rate : float
        Forwarded to the training function.
    regularizations : sequence of float
        Target regularization strengths (only read, never mutated).
    n_jobs : int
        Number of joblib workers (previously hard-coded to 47).

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the two supported values (the
        original failed later with a NameError on ``results``).
    """
    if model_type not in ('softmax_regression', 'linear_regression'):
        raise ValueError("Unknown model_type: %r" % (model_type,))

    # NOTE(review): unpickling can execute arbitrary code -- only point
    # ``fname_in`` at trusted files.
    with open(fname_in, 'rb') as in_file:
        pretrained_models = pickle.load(in_file)

    # Same ordering as before: regularizations outer, pretrained models inner.
    grid = [(target_i, init_i)
            for target_i in range(len(regularizations))
            for init_i in range(len(pretrained_models))]

    if model_type == 'softmax_regression':
        tasks = (
            delayed(tf_softmax_regression.train_softmax)(
                x, y, x_test, y_test,
                learning_rate=learning_rate,
                max_iterations=1000000,
                w_diff_term_crit=w_diff_term_crit,
                verbose=True,
                regularization=regularizations[target_i],
                model=pretrained_models[init_i]['model'],
                regularization_initialization=pretrained_models[init_i]['regularization'],
                previous_loss_train=pretrained_models[init_i]['loss_train'],
                previous_regularization_penalty_train=pretrained_models[init_i]['regularization_penalty_train'])
            for target_i, init_i in grid)
    else:
        tasks = (
            delayed(tf_linear_regression.train)(
                x, y, x_test, y_test,
                learning_rate=learning_rate,
                max_iterations=1000000,
                w_diff_term_crit=w_diff_term_crit,
                verbose=True,
                regularization=regularizations[target_i],
                model=pretrained_models[init_i]['model'],
                regularization_initialization=pretrained_models[init_i]['regularization'])
            for target_i, init_i in grid)

    results = joblib.Parallel(n_jobs=n_jobs)(tasks)

    with open(fname_out, 'wb') as out_file:
        pickle.dump(results, out_file)
def predict_(self, X, probability=False):
    """Collect the votes of all fitted estimators and pool them.

    Parameters
    ----------
    X : dict
        Maps every key of ``self.estimators_`` to the input of that
        estimator.  When the base estimator's ``_estimator_type`` is
        ``"searchlight_ensemble"`` only element ``[0]`` of each value is used.
    probability : bool
        Forwarded to ``_vote``.

    Raises
    ------
    ValueError
        If ``X`` is not a dict.

    Notes
    -----
    The votes are stored in ``self.votes`` and their weighted average
    (weights ``self.vote_weighting``) in ``self.votes_pooled``.
    NOTE(review): no ``return`` statement is visible in this block -- confirm
    whether callers read these attributes or a return value is missing.
    """
    if not isinstance(X, dict):
        raise ValueError("X has to be a dict")

    takes_first = (self.base_estimator._estimator_type
                   == "searchlight_ensemble")

    def _input_for(roi_id):
        # Searchlight ensembles store their data as the first element.
        return X[roi_id][0] if takes_first else X[roi_id]

    voter = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                     backend="threading")
    self.votes = voter(
        delayed(_vote)(e, _input_for(roi_id), probability)
        for roi_id, e in self.estimators_.items())

    weighted = np.array(self.votes).swapaxes(0, 1).dot(self.vote_weighting)
    self.votes_pooled = weighted / sum(self.vote_weighting)
def get_split_scores(factory,thresholds,formula, metric = None,#p.e. usability entropy use_joblib = False, joblib_backend = 'threading', n_jobs = -1, min_events_fraction_leaf = 0.,verbose = False): if metric == None: metric = penalized_usability_entropy if min_events_fraction_leaf <=1: min_events_fraction_leaf = int(min_events_fraction_leaf*sum(factory.weights)) if verbose: print min_events_fraction_leaf, sum(factory.weights) if not use_joblib: scores = np.repeat(float("inf"),len(thresholds)) for i,(feature,cut,_) in enumerate(thresholds): predicate = (factory.events[:,feature] > cut) #skip the edge cases... (inf penalty) if np.all(predicate) or (not np.any(predicate)): #if this split does not split, fuggedaboutit continue if min_events_fraction_leaf>0: #get rid of too uneven a cuts sum_weight = np.sum(factory.weights) true_weight = np.sum(factory.weights[predicate]) false_weight = sum_weight - true_weight if true_weight < min_events_fraction_leaf or false_weight < min_events_fraction_leaf: if verbose: print "t:",true_weight,"f:",false_weight, "discarded" continue if verbose: print "t:",true_weight,"f:",false_weight, "passed" #compute score subFactories = factory.split_by(predicate) scores[i] = metric(formula,*subFactories) else: if n_jobs < 0: n_jobs = joblib.cpu_count() +1 - n_jobs indices = [0]+[len(thresholds)*(i+1)/n_jobs for i in range(n_jobs)] thresholdSections = [thresholds[indices[i]:indices[i+1]] for i in range(n_jobs)] if joblib_backend == 'threading': factory = [deepcopy(factory) for i in range(n_jobs)] formula = [deepcopy(formula) for i in range(n_jobs)] metric = [deepcopy(metric) for i in range(n_jobs)] #in case it has some internal data jobs = (joblib.delayed(get_split_scores)(factory[i],thresholdSection, formula[i], metric=metric[i],use_joblib = False, min_events_fraction_leaf = min_events_fraction_leaf, verbose = verbose) for i,thresholdSection in enumerate(thresholdSections)) else: jobs = (joblib.delayed(get_split_scores)(factory,thresholdSection, 
formula, metric=metric,use_joblib = False, min_events_fraction_leaf = min_events_fraction_leaf, verbose = verbose) for thresholdSection in thresholdSections) scores = np.hstack(joblib.Parallel(n_jobs = n_jobs, backend = joblib_backend)(jobs)) return scores
def fit(self, imgs, y=None, confounds=None):
    """Compute the mask and the ICA maps across subjects.

    Runs ``MultiPCA.fit`` first, then ``self.n_init`` FastICA decompositions
    of the PCA components and keeps the run whose maps have the smallest
    maximal row-wise L1 norm.  The kept maps are optionally thresholded and
    each one is sign-flipped so that its largest peak is positive.

    Parameters
    ----------
    imgs : list of Niimg-like objects
        Data on which the PCA is calculated (forwarded to ``MultiPCA.fit``).
    y : ignored by this method itself; forwarded to ``MultiPCA.fit``.
    confounds : CSV file path or 2D matrix
        Forwarded to ``MultiPCA.fit``.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If ``self.threshold`` is neither None, ``'auto'`` nor a float.
    """
    MultiPCA.fit(self, imgs, y=y, confounds=confounds)

    rng = check_random_state(self.random_state)
    seeds = rng.randint(np.iinfo(np.int32).max, size=self.n_init)

    if (LooseVersion(sklearn.__version__).version > [0, 12]):
        # random_state in fastica was added in 0.13
        runs = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(fastica)(self.components_.T, whiten=True, fun='cube',
                             random_state=seed)
            for seed in seeds)
    else:
        runs = Parallel(n_jobs=1, verbose=self.verbose)(
            delayed(fastica)(self.components_.T, whiten=True, fun='cube')
            for seed in seeds)

    # Pair every candidate with its score; ``min`` keeps the first of ties.
    scored_maps = []
    for run in runs:
        candidate = run[2].T
        scored_maps.append((candidate,
                            np.sum(np.abs(candidate), axis=1).max()))
    ica_maps, _ = min(scored_maps, key=lambda pair: pair[-1])

    # Thresholding
    requested = self.threshold
    if isinstance(requested, float):
        ratio = requested
    elif requested == 'auto':
        ratio = 1.
    elif requested is None:
        ratio = None
    else:
        raise ValueError("Threshold must be None, "
                         "'auto' or float. You provided %s." %
                         str(self.threshold))

    if ratio is not None:
        magnitudes = np.abs(ica_maps)
        cutoff = scoreatpercentile(
            magnitudes, 100. - (100. / len(ica_maps)) * ratio)
        ica_maps[magnitudes < cutoff] = 0.

    self.components_ = ica_maps

    # flip signs in each component so that the peak is positive
    for component in self.components_:
        if -component.min() > component.max():
            component *= -1

    return self
def plot_learning_curves_across_topics(n_runs, start_idx, stop_idx, estimators_dict, comment=None): """ TODO Most probably buggy """ for topic_id, data in texts_vote_lists_truths_by_topic_id.iteritems(): print 'Loading topic %s' % topic_id texts, vote_lists, truths = data n_documents = len(texts) vectorizer = TfidfVectorizer() tfidf = vectorizer.fit_transform(texts) text_similarity = cosine_similarity(tfidf) x = np.arange(start_idx, stop_idx) y_by_estimator = dict( (estimator, []) for estimator in estimators_dict.keys() ) for estimator_name, estimator_and_args in estimators_dict.iteritems(): print 'Calculating for %s' % estimator_name estimator, args, active_pars = estimator_and_args if active_pars is None: sequences = Parallel(n_jobs=4)( delayed(get_accuracy_sequence)(estimator, stop_idx, texts, vote_lists, truths, text_similarity, idx, False, *args) for idx in xrange(n_runs) ) else: sequences = Parallel(n_jobs=4)( delayed(get_accuracy_sequence_active)(estimator, stop_idx, texts, vote_lists, truths, text_similarity, active_pars, idx, False, *args) for idx in xrange(n_runs) ) good_slices = [ s[start_idx:] for s in sequences if s is not None ] if good_slices: results = np.vstack(good_slices) begin_accuracies = results[:, 0] end_accuracies = results[:, -1] begin_accuracies.dump("pickles/%s-%s-begin-accuracies--.pkl" % (topic_id, estimator_name) ) end_accuracies.dump("pickles/%s-%s-end-accuracies--.pkl" % (topic_id, estimator_name)) # We will then need to vstack and avg though all the topic accuracies for each estimator y_by_estimator[estimator_name].append( np.mean(results, axis=0) ) else: print 'Topic %s is not represented with estimator %s' % (topic_id, estimator_name) result_by_estimator = {} for estimator_name, mean_accuracy_sequences in y_by_estimator.iteritems(): if mean_accuracy_sequences: to_avg = np.vstack(mean_accuracy_sequences) result_by_estimator[estimator_name] = np.mean(to_avg, axis=0) else: print "Nope" if comment: title = 'Across topics, %s runs, %s' 
% (n_runs, comment) else: title = 'Across topics, %s runs' % topic_id plot_learning_curve(title, x, result_by_estimator, 'Votes sampled', 'Accuracy')
def train_all_parallel(x, y, x_test, y_test,
                       fname='results_softmax_regression_mnist',
                       model_type='softmax_regression',
                       w_diff_term_crit=0.0001,
                       learning_rate=0.0001,
                       regularizations = [100., 10., 1., 0.1, 0.01, 0.001, 0.],
                       n_jobs=47,
                       n_repeats=48):
    """Train ``n_repeats`` models per regularization in parallel and pickle them.

    Parameters
    ----------
    x, y, x_test, y_test
        Passed through unchanged to the training function.
    fname : str
        Path of the output pickle holding the list of results.
    model_type : {'softmax_regression', 'linear_regression'}
        Selects ``tf_softmax_regression.train_softmax`` or
        ``tf_linear_regression.train``.
    w_diff_term_crit, learning_rate : float
        Forwarded to the training function.
    regularizations : sequence of float
        Regularization strengths (only read, never mutated).
    n_jobs : int
        Number of joblib workers (previously hard-coded to 47).
    n_repeats : int
        Trainings per regularization (previously hard-coded to 48).

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the two supported values (the
        original failed later with a NameError on ``results``).
    """
    if model_type == 'softmax_regression':
        train = tf_softmax_regression.train_softmax
    elif model_type == 'linear_regression':
        train = tf_linear_regression.train
    else:
        raise ValueError("Unknown model_type: %r" % (model_type,))

    # Same task order as before: repeats outer, regularizations inner.
    tasks = (
        delayed(train)(
            x, y, x_test, y_test,
            learning_rate=learning_rate,
            max_iterations=1000000,
            regularization=regularization,
            w_diff_term_crit=w_diff_term_crit,
            verbose=True)
        for _ in range(n_repeats)
        for regularization in regularizations)
    results = joblib.Parallel(n_jobs=n_jobs)(tasks)

    with open(fname, 'wb') as out_file:
        pickle.dump(results, out_file)
def main(): """ if len(sys.argv) == 6: database, path, adjective, phase, sensor = sys.argv[1:] train_single_dataset(database, path, adjective, phase, sensor) """ if len(sys.argv) == 6: database, path, adjective, phase, n_jobs = sys.argv[1:] n_jobs = int(n_jobs) print "Training the adjectives %s and for phase %s" %( adjective, phase) p = Parallel(n_jobs=n_jobs,verbose=10) p(delayed(create_single_dataset)(database, path, adjective, phase)) if len(sys.argv) == 5: database, path, adjective, n_jobs = sys.argv[1:] n_jobs = int(n_jobs) print "Training all the phases for adjective %s" %( adjective) p = Parallel(n_jobs=n_jobs,verbose=10) p(delayed(create_single_dataset)(database, path, adjective, phase) for phase in itertools.product(phases)) # create_single_dataset(database, path, adjective, phase)) elif len(sys.argv) == 3: database, path = sys.argv[1:] #n_jobs = int(n_jobs) print "Training all combinations of adjectives and phases" #p = Parallel(n_jobs=n_jobs,verbose=10) #p(delayed(create_single_dataset)(database, path, adjective, phase) #for adjective, phase in itertools.product(adjectives, # phases)) base_directory = path untrained_directory = os.path.join(base_directory, "untrained_adjectives") hmm_feature_directory = os.path.join(base_directory, "adjective_phase_set") check_dir(hmm_feature_directory) for adj_f in os.listdir(untrained_directory): full_adj_path = os.path.join(untrained_directory, adj_f) adj_obj = cPickle.load(open(full_adj_path)) assert isinstance(adj_obj, AdjectiveClassifier) create_single_dataset(database, hmm_feature_directory, adj_obj) # create_single_dataset(database, path, adjective, "some_phase") else: print "Usage:" print "%s database path adjective phase n_jobs" % sys.argv[0] print "%s database path adjective n_jobs" % sys.argv[0] print "%s database path" % sys.argv[0] print "Files will be saved in path/adjective_phase_set"
def fit_transform(self, Z, **fit_params):
    """Fit all transformers on Z, transform it and concatenate the results.

    Every ``(name, transformer)`` of ``self.transformer_list`` is fitted and
    applied through ``_fit_transform_one`` (threading backend); the fitted
    transformers replace the stored ones.  The transformed outputs are zipped
    together and each zipped record is horizontally stacked.

    Parameters
    ----------
    Z : object
        Input handed to every transformer.
        NOTE(review): the outputs are used through ``.zip``, ``._rdd``,
        ``.first`` and ``.map``, so this is presumably a distributed
        (RDD-backed) dataset -- confirm against the callers.
    **fit_params
        Forwarded to ``_fit_transform_one``.

    Returns
    -------
    The stacked dataset: ``scipy.sparse.hstack`` is used when any item of the
    first record is sparse, ``numpy.hstack`` otherwise.
    """
    result = Parallel(n_jobs=self.n_jobs, backend="threading")(
        delayed(_fit_transform_one)(trans, name, Z,
                                    self.transformer_weights, **fit_params)
        for name, trans in self.transformer_list)

    Zs, transformers = zip(*result)
    self._update_transformer_list(transformers)

    X = reduce(lambda x, y: x.zip(y._rdd), Zs)
    # The first record decides between sparse and dense stacking.
    if any(sp.issparse(item) for item in X.first()):
        return X.map(lambda x: sp.hstack(x))
    # The original built the dense result but fell off the end without
    # returning it, so callers received None.
    return X.map(lambda x: np.hstack(x))
def t_test_accuracy(topic_id, n_runs, estimator_params_votes_per_doc_tuples):
    """Compare the accuracy of several estimators against the first one.

    For every ``(estimator, args, votes_per_doc)`` tuple, ``n_runs``
    accuracies are computed in parallel (``None`` results are dropped).  The
    first tuple is the baseline; every other one is compared to it with a
    Welch t-test (``equal_var=False``).

    Returns
    -------
    str
        ``"|"``-joined cells: the baseline mean, then for every other
        estimator its mean followed by ``*`` when ``p < 0.01`` and ``$`` when
        its mean exceeds the baseline mean (a space otherwise).
    """
    texts, vote_lists, truths = texts_vote_lists_truths_by_topic_id[topic_id]
    text_similarity = cosine_similarity(TfidfVectorizer().fit_transform(texts))

    accuracy_arrays = []
    for estimator, args, votes_per_doc in estimator_params_votes_per_doc_tuples:
        stop_idx = votes_per_doc * len(texts)
        # n_runs accuracies for this estimator
        run_results = Parallel(n_jobs=4)(
            delayed(get_accuracy_sequence)(
                estimator, stop_idx, texts, vote_lists, truths,
                text_similarity, idx, True, *args)
            for idx in xrange(n_runs))
        accuracy_arrays.append(
            np.array([acc for acc in run_results if acc is not None]))

    # Baseline
    baseline = accuracy_arrays[0]
    cells = ["%0.2f" % np.mean(baseline)]

    # T-tests
    for candidate in accuracy_arrays[1:]:
        _, pval = ttest_ind(candidate, baseline, equal_var=False)
        if pval < 0.01:
            significance = "*"
        else:
            significance = " "
        if np.mean(candidate) > np.mean(baseline):
            is_better = "$"
        else:
            is_better = " "
        cells.append("%0.2f %s %s" % (np.mean(candidate), significance,
                                      is_better))

    return "|".join(cells)
def cross_val_predict(
        estimator, X, y, loss=None, cv=8, n_jobs=1, verbose=0,
        fit_params=None, proba=False, pre_dispatch='2*n_jobs'):
    """Return out-of-fold predictions for every sample.

    Parameters
    ----------
    estimator : estimator object
        Cloned once per fold.
    X, y : data and targets, forwarded to ``_cross_val_predict``.
    loss : callable or None
        When truthy, ``-loss(y_test, y_pred)`` is recorded for every fold.
    cv : int or iterable of (train, test)
        An int builds ``cross_validation.StratifiedKFold(y, cv)``.
    n_jobs, verbose, pre_dispatch : forwarded to ``Parallel``.
    fit_params : dict or None
        Forwarded to ``_cross_val_predict`` (None becomes ``{}``).
    proba : bool
        Forwarded to ``_cross_val_predict``.

    Returns
    -------
    (y_pred, scores)
        ``y_pred`` has ``len(y)`` entries; ``scores`` is an array of negated
        fold losses when ``loss`` is given, otherwise an empty list.
    """
    folds = cross_validation.StratifiedKFold(y, cv) if isinstance(cv, int) else cv
    if fit_params is None:
        fit_params = {}

    runner = Parallel(n_jobs=n_jobs, verbose=verbose,
                      pre_dispatch=pre_dispatch)
    fold_results = runner(
        delayed(_cross_val_predict)(clone(estimator), X, y, train, test,
                                    verbose, fit_params, proba)
        for train, test in folds)

    y_pred = np.zeros(len(y))
    scores = []
    for fold_mask, fold_pred in fold_results:
        y_pred[fold_mask] = fold_pred
        if loss:
            scores.append(-loss(y[fold_mask], fold_pred))

    if loss:
        scores = np.asarray(scores)

    return np.asarray(y_pred), scores
def fit(self, X, y=None):
    """Fit every transformer on its own column selection of X.

    ``self.transformer_list`` holds ``(name, transformer)`` pairs and
    ``self.idx_list`` the matching column index for ``X[:, idx]``.  The
    fitted transformers replace the stored ones.

    Returns
    -------
    self
    """
    def _attach_columns(named_transformer, idx):
        # (name, transformer) + idx -> (name, transformer, idx)
        return (named_transformer[0], named_transformer[1], idx)

    triples = map(_attach_columns, self.transformer_list, self.idx_list)

    fitted = Parallel(n_jobs=self.n_jobs)(
        delayed(_fit_one_transformer)(trans, X[:, idx], y)
        for _, trans, idx in triples)

    self._update_transformer_list(fitted)
    return self
def predict(self, X):
    """Predict every target with its own fitted estimator.

    Parameters
    ----------
    X : (sparse) array-like
        Data; validated with ``check_array(X, accept_sparse=True)``.

    Returns
    -------
    y : numpy.ndarray
        Transposed stack of the per-estimator predictions, one column per
        entry of ``self.estimators_``.

    Raises
    ------
    ValueError
        If the base estimator has no ``predict`` method.
    """
    check_is_fitted(self, 'estimators_')
    if not hasattr(self.estimator, "predict"):
        raise ValueError("The base estimator should implement a predict method")

    X = check_array(X, accept_sparse=True)

    runner = Parallel(n_jobs=self.n_jobs)
    per_target = runner(
        delayed(parallel_helper)(fitted, 'predict', X)
        for fitted in self.estimators_)

    return np.asarray(per_target).T
def cluster(seqs, k, m):
    """Cluster sequences with DBSCAN on pairwise distances of their embedding.

    The sequences are embedded with ``embed(k, m, seqs, dispatcher)``; the
    ``cosine`` function is evaluated in parallel for every pair of rows of
    the embedding and the resulting symmetric matrix is clustered with DBSCAN
    (``eps=0.01``, ``min_samples=10``, precomputed metric).

    Returns
    -------
    dict
        Maps every DBSCAN label (``-1`` included when present) to the list of
        ``str(s.seq)`` of its members, in the order of ``seqs``.
    """
    dispatcher = Parallel(n_jobs=N_JOBS, verbose=VERBOSE,
                          pre_dispatch=PRE_DISPATCH)

    Q, R = embed(k, m, seqs, dispatcher)
    N, T = R.shape

    debug('computing pairwise distances')
    # Upper-triangle index pairs, in the order the distances are computed.
    pairs = [(i, j) for i in range(N) for j in range(i + 1, N)]
    ds = dispatcher(delayed(cosine)(R[i, :], R[j, :]) for i, j in pairs)

    D = np.zeros((N, N), dtype=float)
    for (i, j), dist in zip(pairs, ds):
        D[i, j] = dist
        D[j, i] = dist

    # cluster
    debug('clustering using dbscan')
    labels = DBSCAN(eps=0.01, min_samples=10,
                    metric='precomputed').fit(D).labels_
    labelset = set(labels)
    n_clusters_ = len(labelset) - (1 if -1 in labelset else 0)
    debug('dbscan: found %d clusters' % n_clusters_)

    # Single pass over the sequences instead of one pass per label.
    members = {}
    for label in labelset:
        members[label] = []
    for i, s in enumerate(seqs):
        members[labels[i]].append(str(s.seq))

    return members
def _find_new_constraint(self, X, Y, joint_feature_gt, constraints, check=True):
    """Run loss-augmented inference and build the resulting constraint.

    Parameters
    ----------
    X, Y : training inputs and their ground-truth labelings.
    joint_feature_gt : joint feature of the ground truth.
    constraints : existing constraints, forwarded to
        ``self._check_bad_constraint``.
    check : bool
        When true, the new constraint is validated.

    Returns
    -------
    (Y_hat, djoint_feature, loss_mean)
        The inferred labelings, the ground-truth joint feature minus the
        inferred one divided by ``len(X)``, and the mean batch loss.

    Raises
    ------
    NoConstraint
        If ``check`` is true and ``self._check_bad_constraint`` reports a bad
        constraint.
    """
    if self.n_jobs == 1:
        Y_hat = self.model.batch_loss_augmented_inference(
            X, Y, self.w, relaxed=True)
    else:
        # do inference in parallel
        verbose = max(0, self.verbose - 3)
        runner = Parallel(n_jobs=self.n_jobs, verbose=verbose)
        Y_hat = runner(
            delayed(loss_augmented_inference)(
                self.model, x, y, self.w, relaxed=True)
            for x, y in zip(X, Y))

    # compute the mean over joint_features and losses
    if getattr(self.model, 'rescale_C', False):
        inferred_feature = self.model.batch_joint_feature(X, Y_hat, Y)
    else:
        inferred_feature = self.model.batch_joint_feature(X, Y_hat)
    djoint_feature = (joint_feature_gt - inferred_feature) / len(X)

    loss_mean = np.mean(self.model.batch_loss(Y, Y_hat))
    violation = loss_mean - np.dot(self.w, djoint_feature)

    if check:
        is_bad = self._check_bad_constraint(
            violation, djoint_feature, loss_mean, constraints,
            break_on_bad=self.break_on_bad)
        if is_bad:
            raise NoConstraint

    return Y_hat, djoint_feature, loss_mean
def transform(self, traj_list):
    """Featurize every trajectory with all transformers and concatenate.

    Parameters
    ----------
    traj_list : list
        Trajectories to featurize; handed as a whole to every transformer.

    Returns
    -------
    Y : list of numpy.ndarray
        ``Y[i]`` is the horizontal stack of the outputs of all transformers
        for trajectory ``i``, in the order of ``self.transformer_list``.
    """
    per_transformer = Parallel(n_jobs=self.n_jobs)(
        delayed(sklearn.pipeline._transform_one)(
            trans, name, traj_list, self.transformer_weights)
        for name, trans in self.transformer_list)

    n_trajectories = len(per_transformer[0])
    stacked = []
    for trj_ind in range(n_trajectories):
        pieces = [features[trj_ind] for features in per_transformer]
        stacked.append(np.hstack(pieces))
    return stacked
def _intra_cluster_distances_block(X, labels, metric, n_jobs=1, **kwds):
    """Compute the intra-cluster distance values, one cluster per job.

    Parameters
    ----------
    X : array
        Indexed by rows; ``X[rows_of_cluster]`` is handed to
        ``_intra_cluster_distances_block_`` for every cluster.
    labels : array
        Label of every sample.
    metric : string or callable
        Forwarded to ``_intra_cluster_distances_block_``.
    n_jobs : int
        Number of joblib workers.
    **kwds
        Forwarded to ``_intra_cluster_distances_block_``.

    Returns
    -------
    a : array of ``labels.size`` floats
        For every sample, the value computed for it within its own cluster.
    """
    intra_dist = np.zeros(labels.size, dtype=float)

    # Row positions of every cluster, in sorted label order.
    member_rows = [np.where(labels == label)[0]
                   for label in np.unique(labels)]

    per_cluster = Parallel(n_jobs=n_jobs)(
        delayed(_intra_cluster_distances_block_)(X[rows], metric, **kwds)
        for rows in member_rows)

    for rows, cluster_values in zip(member_rows, per_cluster):
        intra_dist[rows] = cluster_values

    return intra_dist
def decode_stash_parallel(stash, penalty, label_map, num_cpus=NUM_CPUS,
                          **viterbi_args):
    """Apply Viterbi decoding over a stash in parallel.

    Parameters
    ----------
    stash : biggie.Stash
        Stash of fretboard posteriors.
    penalty : scalar
        Self-transition penalty.
    label_map : callable object
        Map from frets to string labels.
    num_cpus : int
        Number of CPUs to use in parallel.
    **viterbi_args
        Extra keyword arguments forwarded to ``decode_fretboard``.
        NOTE(review): the original accepted and documented these but never
        forwarded them; confirm that ``decode_fretboard`` accepts them.

    Returns
    -------
    annotset : dict
        Decoding results under the same keys as the input stash.
    """
    assert not __interactive__
    keys = stash.keys()
    pool = Parallel(n_jobs=num_cpus)
    decode = delayed(decode_fretboard)
    results = pool(
        decode(stash.get(k), penalty, label_map, **viterbi_args)
        for k in keys)
    return {k: r for k, r in zip(keys, results)}
def fit(self, data, Y=None):
    """Compute per-subject PCAs and derive the maps from them.

    ``data`` is copied first (``.copy()`` when available, a deep copy
    otherwise).  A PCA is computed for every element of ``data`` in parallel,
    the results are concatenated along axis 1 and passed to
    ``self._find_high_kurtosis``, whose result is stored in ``self.maps_``.
    Unless ``self.maps_only`` is set, ``self.learn_from_maps`` is then run on
    the copied data.

    Parameters
    ----------
    data : array or list of per-subject data
    Y : not used by this method.

    Returns
    -------
    self
    """
    # Arrays offer ``copy``; anything else (probably a list) is deep-copied.
    data = data.copy() if hasattr(data, 'copy') else copy.deepcopy(data)

    memory = self.memory
    if isinstance(memory, basestring):
        memory = Memory(cachedir=memory)

    runner = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)
    subject_pcas = runner(
        delayed(_subject_pca)(subject_data,
                              n_components=self.n_components, mem=memory)
        for subject_data in data)
    subject_pcas = np.concatenate(subject_pcas, axis=1)

    self.maps_ = self._find_high_kurtosis(subject_pcas, memory)
    del subject_pcas

    if not self.maps_only:
        # Relearn the time series
        self.learn_from_maps(data)

    return self
def find_bmu(self, input_matrix, njb=1):
    """Find the best matching unit (bmu) for every row of the input matrix.

    The rows are cut into ``njb`` contiguous chunks that are processed in
    parallel by ``_chunk_based_bmu_find`` against the codebook matrix.

    :param input_matrix: numpy matrix representing inputs as rows and
        features/dimension as cols
    :param njb: number of jobs to parallelize the search
    :returns: the chained chunk results as a transposed array
    """
    dlen = input_matrix.shape[0]
    # Squared norm of every codebook row.
    y2 = np.einsum("ij,ij->i", self.codebook.matrix, self.codebook.matrix)

    def _chunk(part):
        # Rows [part*dlen//njb, min((part+1)*dlen//njb, dlen)) of the input.
        first = part * dlen // njb
        last = min((part + 1) * dlen // njb, dlen)
        return input_matrix[first:last]

    runner = Parallel(n_jobs=njb, pre_dispatch="3*n_jobs")
    per_chunk = runner(
        delayed(_chunk_based_bmu_find)(_chunk(i), self.codebook.matrix, y2)
        for i in xrange(njb))

    bmu = np.asarray(list(itertools.chain.from_iterable(per_chunk))).T
    del per_chunk
    return bmu
def fit(self, X, y=None):
    """Fit the transformer.

    Parameters
    ----------
    X : Pandas ``DataFrame``
        The frame to fit; validated with ``validate_is_pd``.  Only the
        prescribed ``cols`` are used, or all of them if ``cols`` is None.

    y : None
        Passthrough for ``sklearn.pipeline.Pipeline``; not used.

    Returns
    -------
    self
        ``self.sq_nms_`` maps every fitted column name to the value returned
        by ``_sq_norm_single`` for that column.
    """
    # check on state of X and cols
    X, self.cols = validate_is_pd(X, self.cols)
    cols = _cols_if_none(X, self.cols)

    # one job per column
    runner = Parallel(n_jobs=self.n_jobs)
    norms = runner(delayed(_sq_norm_single)(X[nm]) for nm in cols)
    self.sq_nms_ = dict(zip(cols, norms))

    return self
def fit(self, X, y=None):
    """Fit the transformer.

    Parameters
    ----------
    X : Pandas ``DataFrame``
        The frame to fit; validated with ``validate_is_pd`` (with
        ``assert_all_finite=True``) and ``_validate_rows``.  Only the
        prescribed ``cols`` are used, or all of them if ``cols`` is None.

    y : None
        Passthrough for ``sklearn.pipeline.Pipeline``; not used.

    Returns
    -------
    self
        ``self.lambda_`` maps every fitted column name to the value returned
        by ``_yj_estimate_lambda_single_y`` for that column.
    """
    # check on state of X and cols -- all values need to be finite
    X, self.cols = validate_is_pd(X, self.cols, assert_all_finite=True)
    cols = _cols_if_none(X, self.cols)

    # ensure enough rows
    _validate_rows(X)

    # estimate the lambdas, one job per column
    runner = Parallel(n_jobs=self.n_jobs)
    lambdas = runner(
        delayed(_yj_estimate_lambda_single_y)(X[nm]) for nm in cols)
    self.lambda_ = dict(zip(cols, lambdas))

    return self
def permutation_test_score(estimator, X, y, groups=None, cv=None,
                           n_permutations=100, n_jobs=1, random_state=0,
                           verbose=0, scoring=None):
    """Compute cross-validated scores on permuted targets.

    Follows test 1 of [Ojala2010]_.  Unlike sklearn's function of the same
    name, only the permutation scores are returned; the p-value is left to
    the caller so that the scores can be reused.

    Parameters
    ----------
    estimator : estimator object, cloned for every permutation.
    X, y, groups : data, targets and optional groups (made indexable).
    cv : passed to ``check_cv``.
    n_permutations : int
        Number of shuffles of ``y``.
    n_jobs, verbose : forwarded to ``Parallel``.
    random_state : seed or RandomState used for the shuffles.
    scoring : passed to ``check_scoring``.

    Returns
    -------
    numpy.ndarray
        One score per permutation.

    .. [Ojala2010] Ojala and Garriga. Permutation Tests for Studying
        Classifier Performance. The Journal of Machine Learning Research
        (2010) vol. 11
    """
    X, y, groups = indexable(X, y, groups)
    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorer = check_scoring(estimator, scoring=scoring)
    rng = check_random_state(random_state)

    # The estimator is cloned per task so that all the folds are
    # independent, and so that it is pickle-able.
    tasks = (
        delayed(_permutation_test_score)(
            clone(estimator), X, _shuffle(y, groups, rng),
            groups, cv, scorer)
        for _ in range(n_permutations))
    collected = Parallel(n_jobs=n_jobs, verbose=verbose)(tasks)

    return np.array(collected)
def cross_val_predict_proba(
        estimator, X, y, scoring='roc_auc', cv=8, n_jobs=1, verbose=0,
        fit_params=None, pre_dispatch='2*n_jobs'):
    """Predict probabilities using cross-validation.

    Parameters
    ----------
    estimator : estimator object, cloned once per fold.
    X, y : data and targets, forwarded to ``_cross_val_predict``.
    scoring : str
        With ``'roc_auc'`` a ``compute_auc`` score is recorded for every fold
        whose test targets contain more than one distinct value.
    cv : int or iterable of (train, test)
        An int builds ``cross_validation.StratifiedKFold(y, cv)``.
    n_jobs, verbose, pre_dispatch : forwarded to ``Parallel``.
    fit_params : dict or None
        Forwarded to ``_cross_val_predict`` (None becomes ``{}``).

    Returns
    -------
    (y_pred, scores) : two numpy arrays
        Out-of-fold predictions (``len(y)`` entries) and the fold scores.
    """
    folds = cross_validation.StratifiedKFold(y, cv) if isinstance(cv, int) else cv
    if fit_params is None:
        fit_params = {}

    runner = Parallel(n_jobs=n_jobs, verbose=verbose,
                      pre_dispatch=pre_dispatch)
    fold_results = runner(
        delayed(_cross_val_predict)(clone(estimator), X, y, train, test,
                                    verbose, fit_params, proba=True)
        for train, test in folds)

    y_pred = np.zeros(len(y))
    scores = []
    for fold_mask, fold_pred in fold_results:
        y_pred[fold_mask] = fold_pred
        if scoring != 'roc_auc':
            continue
        y_test = y[fold_mask]
        # skip folds whose test targets hold a single distinct value
        if len(np.unique(y_test)) > 1:
            scores.append(compute_auc(y_test, fold_pred))

    return np.asarray(y_pred), np.asarray(scores)
def prepare_merge_jobs(self, results):
    """Build one delayed merge job per group of results.

    ``results`` is cut into groups by ``grouper(results, self.split_bins)``;
    every group is materialised as a list and wrapped in a joblib ``delayed``
    call to ``self.load_and_merge_results_job``.

    Returns
    -------
    list
        The delayed jobs, in group order.
    """
    return [
        joblib.delayed(self.load_and_merge_results_job)(list(group))
        for group in grouper(results, self.split_bins)
    ]
def cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv=None,
                    n_jobs=1, verbose=0, fit_params=None,
                    pre_dispatch='2*n_jobs'):
    """Evaluate one or several scores by cross-validation.

    Parameters
    ----------
    estimator : estimator object, cloned once per split.
    X, y, groups : data, targets and optional groups (made indexable).
    scoring : scoring spec or list/tuple of them
        A single spec is wrapped in a list; each entry goes through
        ``check_scoring``.
    cv : passed to ``check_cv``.
    n_jobs, verbose, pre_dispatch : forwarded to ``Parallel``.
    fit_params : forwarded to ``_fit_and_score``.

    Returns
    -------
    (scores, group_order)
        ``scores`` is the squeezed array of the ``_fit_and_score`` results.
        ``group_order`` lists, per split, the group of the first test sample
        when the cv object has a ``groups`` attribute, and is empty otherwise.
    """
    scorings = scoring if isinstance(scoring, (list, tuple)) else [scoring]

    X, y, groups = indexable(X, y, groups)
    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    splits = list(cv.split(X, y, groups))
    scorers = [check_scoring(estimator, scoring=s) for s in scorings]

    # The estimator is cloned per task so that all the folds are
    # independent, and so that it is pickle-able.
    runner = Parallel(n_jobs=n_jobs, verbose=verbose,
                      pre_dispatch=pre_dispatch)
    scores = runner(
        delayed(_fit_and_score)(clone(estimator), X, y, scorers,
                                train, test, verbose, None, fit_params)
        for train, test in splits)

    if hasattr(cv, 'groups'):
        cv_groups = np.array(cv.groups)
        group_order = [cv_groups[test].tolist()[0] for _, test in splits]
    else:
        group_order = []

    return np.squeeze(np.array(scores)), group_order
def _parallel_learning(self, X, Y, w):
    """Run one pass over the data with parallel constraint search.

    The samples are cut into even slices of roughly ``n_jobs`` samples.  For
    every slice, ``find_constraint`` is evaluated in parallel for each
    sample, the ``delta_psi`` of all constraints with positive slack are
    summed, and ``w`` is updated through ``self._solve_subgradient``.

    Parameters
    ----------
    X, Y : training inputs and labelings; must support slice indexing.
    w : current weight vector.

    Returns
    -------
    (objective, positive_slacks, w)
        Sum of the positive slacks, their count, and the updated weights.

    Raises
    ------
    ValueError
        If ``self.batch_size`` is not None.
    """
    n_samples = len(X)
    objective, positive_slacks = 0, 0
    verbose = max(0, self.verbose - 3)
    if self.batch_size is not None:
        # The original literals concatenated to "needs tobe None".
        raise ValueError("If n_jobs != 1, batch_size needs to "
                         "be None")

    # generate batches of size n_jobs to speed up inference
    if self.n_jobs == -1:
        n_jobs = cpu_count()
    else:
        n_jobs = self.n_jobs

    n_batches = int(np.ceil(float(len(X)) / n_jobs))
    slices = gen_even_slices(n_samples, n_batches)

    for batch in slices:
        X_b = X[batch]
        Y_b = Y[batch]
        candidate_constraints = Parallel(n_jobs=self.n_jobs,
                                         verbose=verbose)(
            delayed(find_constraint)(self.model, x, y, w)
            for x, y in zip(X_b, Y_b))

        dpsi = np.zeros(self.model.size_psi)
        for constraint in candidate_constraints:
            y_hat, delta_psi, slack, loss = constraint
            if slack > 0:
                objective += slack
                dpsi += delta_psi
                positive_slacks += 1

        # one subgradient step per batch
        w = self._solve_subgradient(dpsi, n_samples, w)

    return objective, positive_slacks, w
def optimal_allocation_with_skopt(t, X, Y, n=10, n_parallel=4, const_income=True):
    """Minimise the allocation objective with a scikit-optimize ask/tell loop.

    Parameters
    ----------
    t : int
        The search space has ``nn * (t - 1)`` dimensions, each ``Real(0, 1)``,
        with ``nn = 1`` when ``const_income`` is true and ``2`` otherwise.
    X, Y
        Passed, together with ``t``, to ``_fun_constant_income`` (or ``_fun``)
        to build the objective.
    n : int
        Number of objective evaluations.
    n_parallel : int
        Points asked from the optimizer and evaluated per iteration;
        values ``<= 1`` evaluate one point at a time without joblib.
    const_income : bool
        Selects the objective factory and the dimensionality (see ``t``).

    Returns
    -------
    skopt.Optimizer
        The optimizer after all evaluations have been told to it.
    """
    # every dimension lives in [0, 1]
    nn = 2
    opt_fun = _fun
    if const_income:
        nn = 1
        opt_fun = _fun_constant_income
    dimensions = [Real(0, 1)] * nn * (t - 1)

    optimizer = skopt.Optimizer(
        dimensions, base_estimator='gp', random_state=1
    )

    fun = opt_fun(t, X, Y)

    if n_parallel <= 1:
        print('not parallel')
        for i in range(n):
            suggested = optimizer.ask()
            y = fun(suggested)
            optimizer.tell(suggested, y)
            print('iteration: {}, {}, {}'.format(i, suggested, y))
    else:
        print('parallel')
        n_left = n
        for i in range(0, n, n_parallel):
            # the last batch may be smaller than n_parallel
            suggested = optimizer.ask(n_points=min(n_left, n_parallel))
            n_left -= n_parallel
            print(n_left)
            # ``Parallel()`` without ``n_jobs`` runs a single job, so the
            # batch used to be evaluated sequentially.
            # NOTE(review): ``fun`` must be picklable by the joblib backend
            # in use -- confirm for the objects returned by ``opt_fun``.
            y = Parallel(n_jobs=n_parallel)(delayed(fun)(x) for x in suggested)
            optimizer.tell(suggested, y)
            print('iteration: {}, {}, {}, {}'.format(
                i, suggested, y, action_to_zeroone(np.array(suggested))))

    print('min is', min(optimizer.yi))
    return optimizer
def compute_thresholds(epochs, method='bayesian_optimization',
                       random_state=None, picks=None, verbose='progressbar',
                       n_jobs=1):
    """Compute one rejection threshold per channel.

    Parameters
    ----------
    epochs : instance of mne.Epochs
        The epochs whose thresholds must be computed.
    method : str
        'bayesian_optimization' or 'random_search'.
    random_state : int seed, RandomState instance, or None (default)
        The seed of the pseudo random number generator to use.
    picks : ndarray, shape(n_channels,) | None
        The channels to be considered for autoreject. If None, defaults
        to data channels {'meg', 'eeg'}.
    verbose : 'tqdm', 'tqdm_notebook', 'progressbar' or False
        The verbosity of progress messages.
        If `'progressbar'`, use `mne.utils.ProgressBar`.
        If `'tqdm'`, use `tqdm.tqdm`.
        If `'tqdm_notebook'`, use `tqdm.tqdm_notebook`.
        If False, suppress all output messages.
    n_jobs : int
        The number of jobs.

    Returns
    -------
    threshes : dict
        Maps each picked channel name to the value returned by
        ``_compute_thresh`` for that channel.

    Examples
    --------
    Channel-level thresholds for all the EEG sensors can be computed with:

    >>> compute_thresholds(epochs)
    """
    if method not in ('bayesian_optimization', 'random_search'):
        raise ValueError('`method` param not recognized')

    picks = _handle_picks(epochs.info, picks)
    _check_data(epochs, picks, verbose=verbose,
                ch_constraint='data_channels')
    grouped_picks = _check_sub_picks(picks=picks, info=epochs.info)

    # Several groups of picks: handle each group on its own and merge.
    if grouped_picks is not False:
        merged = dict()
        for _, group_picks in grouped_picks:
            merged.update(
                compute_thresholds(epochs=epochs, method=method,
                                   random_state=random_state,
                                   picks=group_picks, verbose=verbose,
                                   n_jobs=n_jobs))
        return merged

    # Single group: stack the original and the interpolated epochs and
    # label them 0 / 1 for the stratified split.
    n_epochs = len(epochs)
    interpolated = clean_by_interp(epochs, picks=picks, verbose=verbose)
    # non-data channels will be duplicate
    stacked = np.concatenate((epochs.get_data(), interpolated.get_data()),
                             axis=0)
    labels = np.r_[np.zeros((n_epochs, )), np.ones((n_epochs, ))]
    splitter = StratifiedShuffleSplit(labels, n_iter=10, test_size=0.2,
                                      random_state=random_state)
    names = interpolated.ch_names

    # send output to stdout
    joblib_verbosity = 0 if verbose is False else 51
    jobs = (delayed(_compute_thresh)(stacked[:, pick], cv=splitter,
                                     method=method,
                                     random_state=random_state)
            for pick in picks)
    per_channel = Parallel(n_jobs=n_jobs, verbose=joblib_verbosity)(jobs)
    return {names[pick]: thresh
            for pick, thresh in zip(picks, per_channel)}
def permuted_ols(tested_vars, target_vars, confounding_vars=None,
                 model_intercept=True, n_perm=10000, two_sided_test=True,
                 random_state=None, n_jobs=1, verbose=0):
    """Massively univariate group analysis with permuted OLS.

    Tested variates are independently fitted to target variates descriptors
    (e.g. brain imaging signal) according to a linear model solved with an
    Ordinary Least Squares criterion.
    Confounding variates may be included in the model.
    Permutation testing is used to assess the significance of the
    relationship between the tested variates and the target variates [1, 2].
    A max-type procedure is used to obtain family-wise corrected p-values.

    The specific permutation scheme implemented here is the one of
    Freedman & Lane [3]. It has been demonstrated in [1] that this scheme
    conveys more sensitivity than alternative schemes. This holds for
    neuroimaging applications, as discussed in details in [2].

    Permutations are performed on parallel computing units. Each of them
    performs a fraction of permutations on the whole dataset. Thus, the max
    t-score amongst data descriptors can be computed directly, which avoids
    storing all the computed t-scores.

    The variates should be given C-contiguous. target_vars are
    fortran-ordered automatically to speed-up computations.

    Parameters
    ----------
    tested_vars : array-like, shape=(n_samples, n_regressors)
      Explanatory variates, fitted and tested independently from each
      others.

    target_vars : array-like, shape=(n_samples, n_descriptors)
      fMRI data, trying to be explained by explanatory and confounding
      variates.

    confounding_vars : array-like, shape=(n_samples, n_covars)
      Confounding variates (covariates), fitted but not tested.
      If None, no confounding variate is added to the model
      (except maybe a constant column according to the value of
      `model_intercept`)

    model_intercept : bool,
      If True, a constant column is added to the confounding variates
      unless the tested variate is already the intercept.

    n_perm : int,
      Number of permutations to perform.
      Permutations are costly but the more are performed, the more precision
      one gets in the p-values estimation.  If 0 or negative, only the
      scores on the original data are computed.

    two_sided_test : boolean,
      If True, performs an unsigned t-test. Both positive and negative
      effects are considered; the null hypothesis is that the effect is
      zero. If False, only positive effects are considered as relevant. The
      null hypothesis is that the effect is zero or negative.

    random_state : int or None,
      Seed for random number generator, to have the same permutations
      in each computing units.

    n_jobs : int,
      Number of parallel workers (joblib conventions).
      -1 means all CPUs; more generally a negative number -i means that all
      the CPUs except (i - 1) will be used. 0 is rejected with a ValueError.
      Positive values are capped at the number of available CPUs.

    verbose: int, optional
      verbosity level (0 means no message).

    Returns
    -------
    pvals : array-like, shape=(n_regressors, n_descriptors)
      Negative log10 p-values associated with the significance test of the
      n_regressors explanatory variates against the n_descriptors target
      variates. Family-wise corrected p-values.
      Empty array when `n_perm` <= 0.

    score_orig_data : numpy.ndarray, shape=(n_regressors, n_descriptors)
      t-statistic associated with the significance test of the n_regressors
      explanatory variates against the n_descriptors target variates.
      The ranks of the scores into the h0 distribution correspond to the
      p-values.

    h0_fmax : array-like, shape=(n_perm, )
      Distribution of the (max) t-statistic under the null hypothesis
      (obtained from the permutations). Array is sorted.
      Empty array when `n_perm` <= 0.

    Raises
    ------
    ValueError
      If `n_jobs` is 0 or if `target_vars` is not 2-dimensional.

    References
    ----------
    [1] Anderson, M. J. & Robinson, J. (2001).
        Permutation tests for linear models.
        Australian & New Zealand Journal of Statistics, 43(1), 75-88.
    [2] Winkler, A. M. et al. (2014).
        Permutation inference for the general linear model. Neuroimage.
    [3] Freedman, D. & Lane, D. (1983).
        A nonstochastic interpretation of reported significance levels.
        J. Bus. Econ. Stats., 1(4), 292-298

    """
    # initialize the seed of the random generator
    rng = check_random_state(random_state)

    # check n_jobs (number of CPUs)
    if n_jobs == 0:  # invalid according to joblib's conventions
        raise ValueError("'n_jobs == 0' is not a valid choice. "
                         "Please provide a positive number of CPUs, or -1 "
                         "for all CPUs, or a negative number (-i) for "
                         "'all but (i-1)' CPUs (joblib conventions).")
    elif n_jobs < 0:
        # -1 -> cpu_count, -2 -> cpu_count - 1, ...  The previous
        # "cpu_count - n_jobs + 1" had the sign reversed and requested
        # *more* workers than CPUs (e.g. cpu_count + 2 for -1).
        n_jobs = max(1, joblib.cpu_count() + int(n_jobs) + 1)
    else:
        n_jobs = min(n_jobs, joblib.cpu_count())

    # make target_vars F-ordered to speed-up computation
    if target_vars.ndim != 2:
        raise ValueError(
            "'target_vars' should be a 2D array. "
            "An array with %d dimension%s was passed"
            % (target_vars.ndim, "s" if target_vars.ndim > 1 else ""))
    target_vars = np.asfortranarray(target_vars)  # efficient for chunking
    n_descriptors = target_vars.shape[1]

    # check explanatory variates dimensions
    if tested_vars.ndim == 1:
        tested_vars = np.atleast_2d(tested_vars).T
    n_samples, n_regressors = tested_vars.shape

    # check if explanatory variates is intercept (constant) or not
    intercept_test = (n_regressors == 1
                      and np.unique(tested_vars).size == 1)

    # optionally add intercept
    if model_intercept and not intercept_test:
        if confounding_vars is not None:
            confounding_vars = np.hstack(
                (confounding_vars, np.ones((n_samples, 1))))
        else:
            confounding_vars = np.ones((n_samples, 1))

    ### OLS regression on original data
    if confounding_vars is not None:
        # step 1: extract effect of covars from target vars
        covars_orthonormalized = orthonormalize_matrix(confounding_vars)
        if not covars_orthonormalized.flags['C_CONTIGUOUS']:
            # useful to developer
            warnings.warn('Confounding variates not C_CONTIGUOUS.')
            covars_orthonormalized = np.ascontiguousarray(
                covars_orthonormalized)
        targetvars_normalized = normalize_matrix_on_axis(
            target_vars).T  # faster with F-ordered target_vars_chunk
        if not targetvars_normalized.flags['C_CONTIGUOUS']:
            # useful to developer
            warnings.warn('Target variates not C_CONTIGUOUS.')
            targetvars_normalized = np.ascontiguousarray(
                targetvars_normalized)
        beta_targetvars_covars = np.dot(targetvars_normalized,
                                        covars_orthonormalized)
        targetvars_resid_covars = targetvars_normalized - np.dot(
            beta_targetvars_covars, covars_orthonormalized.T)
        targetvars_resid_covars = normalize_matrix_on_axis(
            targetvars_resid_covars, axis=1)
        # step 2: extract effect of covars from tested vars
        testedvars_normalized = normalize_matrix_on_axis(tested_vars.T,
                                                         axis=1)
        beta_testedvars_covars = np.dot(testedvars_normalized,
                                        covars_orthonormalized)
        testedvars_resid_covars = testedvars_normalized - np.dot(
            beta_testedvars_covars, covars_orthonormalized.T)
        testedvars_resid_covars = normalize_matrix_on_axis(
            testedvars_resid_covars, axis=1).T.copy()
    else:
        targetvars_resid_covars = normalize_matrix_on_axis(target_vars).T
        testedvars_resid_covars = normalize_matrix_on_axis(
            tested_vars).copy()
        covars_orthonormalized = None

    # check arrays contiguousity (for the sake of code efficiency)
    if not targetvars_resid_covars.flags['C_CONTIGUOUS']:
        # useful to developer
        warnings.warn('Target variates not C_CONTIGUOUS.')
        targetvars_resid_covars = np.ascontiguousarray(
            targetvars_resid_covars)
    if not testedvars_resid_covars.flags['C_CONTIGUOUS']:
        # useful to developer
        warnings.warn('Tested variates not C_CONTIGUOUS.')
        testedvars_resid_covars = np.ascontiguousarray(
            testedvars_resid_covars)

    # step 3: original regression (= regression on residuals + adjust t-score)
    # compute t score for original data
    scores_original_data = _t_score_with_covars_and_normalized_design(
        testedvars_resid_covars, targetvars_resid_covars.T,
        covars_orthonormalized)
    if two_sided_test:
        sign_scores_original_data = np.sign(scores_original_data)
        scores_original_data = np.fabs(scores_original_data)

    ### Permutations
    # parallel computing units perform a reduced number of permutations each
    if n_perm > n_jobs:
        # Floor division keeps the chunk sizes integral on every Python
        # version; the remainder goes to the last chunk.
        n_perm_chunks = np.asarray([n_perm // n_jobs] * n_jobs, dtype=int)
        n_perm_chunks[-1] += n_perm % n_jobs
    elif n_perm > 0:
        warnings.warn('The specified number of permutations is %d and '
                      'the number of jobs to be performed in parallel has '
                      'set to %s. This is incompatible so only %d jobs will '
                      'be running. You may want to perform more permutations '
                      'in order to take the most of the available computing '
                      'ressources.' % (n_perm, n_jobs, n_perm))
        n_perm_chunks = np.ones(n_perm, dtype=int)
    else:  # 0 or negative number of permutations => original data scores only
        if two_sided_test:
            scores_original_data = (scores_original_data
                                    * sign_scores_original_data)
        # Transposed like on the regular return path below, so that the
        # scores always come back in the documented orientation.
        return np.asarray([]), scores_original_data.T, np.asarray([])

    # actual permutations, seeded from a random integer between 0 and maximum
    # value represented by np.int32 (to have a large entropy).
    # NOTE(review): RandomState.random_integers is deprecated in NumPy;
    # kept as is so that the seeds drawn for a given random_state do not
    # change.
    ret = joblib.Parallel(n_jobs=n_jobs, verbose=verbose)(
        joblib.delayed(_permuted_ols_on_chunk)(
            scores_original_data, testedvars_resid_covars,
            targetvars_resid_covars.T, covars_orthonormalized,
            n_perm_chunk=n_perm_chunk, intercept_test=intercept_test,
            two_sided_test=two_sided_test,
            random_state=rng.random_integers(np.iinfo(np.int32).max))
        for n_perm_chunk in n_perm_chunks)

    # reduce results
    scores_as_ranks_parts, h0_fmax_parts = zip(*ret)
    h0_fmax = np.hstack((h0_fmax_parts))
    scores_as_ranks = np.zeros((n_regressors, n_descriptors))
    for scores_as_ranks_part in scores_as_ranks_parts:
        scores_as_ranks += scores_as_ranks_part

    # convert ranks into p-values
    pvals = (n_perm + 1 - scores_as_ranks) / float(1 + n_perm)

    # put back sign on scores if it was removed in the case of a two-sided test
    # (useful to distinguish between positive and negative effects)
    if two_sided_test:
        scores_original_data = (scores_original_data
                                * sign_scores_original_data)

    return -np.log10(pvals), scores_original_data.T, h0_fmax[0]
# Output layout: <project_path>/CES_RESULTS/ORDER<seed>/
directory = 'CES_RESULTS'
subdirectory = 'ORDER'

assert exists(project_path)
dirnames = sorted(
    path for path in glob('%s/weka.classifiers.*' % project_path)
    if isdir(path))

# load and parse project properties
p = load_properties(project_path)
seeds = int(p['seeds'])
metric = p['metric']
RULE = p['RULE']
use_cluster = p['useCluster'] in ['Y', 'y', 'yes', 'true', 'True']
start_state = '1'  # initialize ensemble with top model

# One candidate ensemble size per (classifier directory, seed) pair.
max_num_clsf = len(dirnames) * seeds
sizes = range(1, max_num_clsf + 1)

# Make sure the result directory and one sub-directory per seed exist.
results_dir = '%s/%s/' % (project_path, directory)
if not exists(results_dir):
    makedirs(results_dir)
for o in range(seeds):
    order_dir = "%s/%s/%s%i" % (project_path, directory, subdirectory, o)
    if not exists(order_dir):
        makedirs(order_dir)

# Cartesian product of every parameter combination to evaluate.
all_parameters = list(
    product([code_dir], [project_path], sizes, range(seeds), [RULE],
            [start_state], [metric]))
Parallel(n_jobs=get_num_cores(), verbose=50)(
    delayed(CES_ens)(parameters) for parameters in all_parameters)
print("\nDone!")
def run_glm(Y, X, noise_model='ar1', bins=100, n_jobs=1, verbose=0):
    """ GLM fit for an fMRI data matrix

    Parameters
    ----------
    Y : array of shape (n_time_points, n_voxels)
        The fMRI data.

    X : array of shape (n_time_points, n_regressors)
        The design matrix.

    noise_model : {'ar1', 'ols'}, optional
        The temporal variance model. Defaults to 'ar1'.

    bins : int, optional
        Maximum number of discrete bins for the AR(1) coef histogram.

    n_jobs : int, optional
        The number of CPUs to use to do the computation. -1 means
        'all CPUs'.

    verbose : int, optional
        The verbosity level. Default is 0

    Returns
    -------
    labels : array of shape (n_voxels,),
        A map of values on voxels used to identify the corresponding model.

    results : dict,
        Keys correspond to the different labels values
        values are RegressionResults instances corresponding to the voxels.

    Raises
    ------
    ValueError
        If `noise_model` is not one of the accepted values or if `Y` and
        `X` do not have the same number of rows.
    """
    acceptable_noise_models = ['ar1', 'ols']
    if noise_model not in acceptable_noise_models:
        raise ValueError(
            "Acceptable noise models are {0}. You provided 'noise_model={1}'"
            .format(acceptable_noise_models, noise_model))

    if Y.shape[0] != X.shape[0]:
        raise ValueError(
            'The number of rows of Y should match the number of rows of X.'
            ' You provided X with shape {0} and Y with shape {1}'
            .format(X.shape, Y.shape))

    # Create the model
    ols_result = OLSModel(X).fit(Y)

    if noise_model == 'ar1':
        # compute and discretize the AR1 coefs
        ar1 = ((ols_result.resid[1:] * ols_result.resid[:-1]).sum(axis=0)
               / (ols_result.resid ** 2).sum(axis=0))
        del ols_result
        # `np.int` was only an alias of the builtin `int` and no longer
        # exists in recent NumPy releases; the builtin is equivalent.
        ar1 = (ar1 * bins).astype(int) * 1. / bins

        # Fit the AR model according to current AR(1) estimates
        labels = ar1
        # Parallelize by creating a job per ARModel
        vals = np.unique(ar1)
        ar_result = Parallel(n_jobs=n_jobs, verbose=verbose)(
            delayed(_ar_model_fit)(X, val, Y[:, labels == val])
            for val in vals)
        results = dict(zip(vals, ar_result))
        del vals
        del ar_result
    else:
        labels = np.zeros(Y.shape[1])
        results = {0.0: ols_result}

    return labels, results
def _fit(self, X, y, sample_weight, parameter_iterable):
    """Actual fitting, performing the search over parameters.

    Each point of ``self.param_grid`` is fitted once per CV fold with
    ``n_estimators`` forced to ``self.max_n_estimators``; the fitted
    ensembles are then scored by ``score_each_boost`` and the per-fold
    scores are averaged for every (grid point, number of estimators)
    combination.

    Sets ``grid_scores_``, ``best_params_``, ``best_score_`` and, when
    ``self.refit`` is true, ``best_estimator_``.  Returns ``self``.

    NOTE(review): ``parameter_iterable`` is only used for the verbose
    message; the grid that is actually searched is built from
    ``self.param_grid``.
    """
    estimator = self.estimator
    cv = self.cv
    n_samples = _num_samples(X)
    X, y, sample_weight = check_arrays(X, y, sample_weight, allow_lists=True,
                                       sparse_format='csr')
    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))
        y = np.asarray(y)
    if sample_weight is not None:
        sample_weight = np.asarray(sample_weight)
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))
    if self.verbose > 0:
        if isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print(
                "Fitting {0} folds for each of {1} candidates, totalling"
                " {2} fits".format(len(cv), n_candidates,
                                   n_candidates * len(cv)))
    base_estimator = clone(self.estimator)
    # first fit at each grid point using the maximum n_estimators
    param_grid = self.param_grid.copy()
    param_grid['n_estimators'] = [self.max_n_estimators]
    grid = ParameterGrid(param_grid)
    pre_dispatch = self.pre_dispatch
    # One job per (grid point, fold); jobs are ordered grid point first,
    # fold second, which the slicing of `out` below relies on.
    clfs = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                    pre_dispatch=pre_dispatch)(delayed(fit_grid_point)(
                        base_estimator, clf_params, X, y, sample_weight,
                        train, test, self.verbose, **self.fit_params)
                        for clf_params in grid for train, test in cv)
    # now use the already fitted ensembles but truncate to N estimators for
    # N from 1 to n_estimators_max - 1 (inclusive)
    out = Parallel(
        n_jobs=self.n_jobs, verbose=self.verbose,
        pre_dispatch=pre_dispatch)(
            delayed(score_each_boost)
            (clf, clf_params, self.min_n_estimators, X, y, sample_weight,
             self.score_func, train, test, self.verbose)
            for clf, clf_params, train, test in clfs)
    # Flatten the per-job results into one list.
    # NOTE(review): concatenating zip(...) results with operator.add needs
    # zip to return lists (Python 2 behaviour) - confirm target version.
    out = reduce(operator.add, [zip(*stage) for stage in out])
    # out is now a list of triplet: score, estimator_params, n_test_samples
    n_estimators_points = self.max_n_estimators - self.min_n_estimators + 1
    n_fits = len(out)
    n_folds = len(cv)
    grid_scores = list()
    # `block` walks over grid points, `grid_start` over the entries of one
    # grid point; the strided slice then visits that entry in every fold.
    for block in range(0, n_fits, n_folds * n_estimators_points):
        for grid_start in range(block, block + n_estimators_points):
            n_test_samples = 0
            score = 0
            all_scores = list()
            for this_score, parameters, this_n_test_samples in \
                    out[grid_start:
                        grid_start + n_folds * n_estimators_points:
                        n_estimators_points]:
                all_scores.append(this_score)
                if self.iid:
                    # weight each fold by its number of test samples
                    this_score *= this_n_test_samples
                score += this_score
                n_test_samples += this_n_test_samples
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)
            grid_scores.append(
                _CVScoreTuple(parameters, score, np.array(all_scores)))
    # Store the computed scores
    self.grid_scores_ = grid_scores
    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    self.best_params_ = best.parameters
    self.best_score_ = best.mean_validation_score
    if self.refit:
        fit_params = self.fit_params
        if sample_weight is not None:
            # copy so that self.fit_params is not modified
            fit_params = fit_params.copy()
            fit_params['sample_weight'] = sample_weight
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(
            **best.parameters)
        if y is not None:
            best_estimator.fit(X, y, **fit_params)
        else:
            best_estimator.fit(X, **fit_params)
        self.best_estimator_ = best_estimator
    return self
def fit(self, X, y):
    """Fit one estimator per (modality, roi) entry of the training set.

    ``X`` is a dict of dicts, ``X[modality][roi_id]``, indexable per
    sample.  A first stage collects out-of-fold predictions for every
    estimator over the ``LeaveOneOut()`` splits and derives ``self.priors``
    from them; a second stage refits every estimator on all samples.

    Returns
    -------
    self : object
        Returns self.
    """
    if not isinstance(X, dict):
        raise ValueError("X has to be a dict")

    self.classes_ = np.unique(y)

    # One clone of the modality's base estimator per (modality, roi) pair.
    estimators = dict()
    for modality, rois in X.items():
        for roi_id in rois:
            est = clone(self.base_estimators[modality])
            est.id = (modality, roi_id)
            estimators[est.id] = est

    y_pred = {key: np.full(len(y), np.nan) for key in estimators}

    started = time.time()
    print('Start [1]')
    for _, (train_index, test_index) in enumerate(LeaveOneOut()):
        y_train = [y[i] for i in train_index]

        fit_jobs = (
            delayed(_parallel_build_estimator)(
                est,
                np.array([X[key[0]][key[1]][i] for i in train_index]),
                y_train)
            for key, est in estimators.items())
        fitted = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                          backend="threading")(fit_jobs)
        estimators_fit = {est.id: est for est in fitted}

        vote_jobs = (
            delayed(_vote)(
                est, [X[key[0]][key[1]][i] for i in test_index], False)
            for key, est in estimators_fit.items())
        fold_votes = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                              backend="threading")(vote_jobs)

        for position, key in enumerate(estimators):
            y_pred[key][test_index] = fold_votes[position]
    print('[1] Elapsed time: %.2f secs' % (time.time() - started))

    # Per-class hit rates of the out-of-fold predictions and their
    # complements; every estimator overwrites the same four entries.
    for key in estimators:
        first, second = self.classes_[0], self.classes_[1]
        self.priors[(first, first)] = np.mean(
            y_pred[key][y == first] == first)
        self.priors[(second, first)] = 1 - self.priors[(first, first)]
        self.priors[(second, second)] = np.mean(
            y_pred[key][y == second] == second)
        self.priors[(first, second)] = 1 - self.priors[(second, second)]

    # Final fit on the full training set.
    started = time.time()
    refit_jobs = (
        delayed(_parallel_build_estimator)(est, X[key[0]][key[1]], y)
        for key, est in estimators.items())
    refitted = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                        backend="threading")(refit_jobs)
    print('[2] Elapsed time: %.2f secs' % (time.time() - started))

    self.estimators_ = {est.id: est for est in refitted}
    return self
def forward(self, features):
    """Compute predictions and losses, and back-propagate in place.

    Returns the tuple ``(inferences, weighted_loss, loss_alpha)``.  As side
    effects the method sets ``self.weights``, ``self.cate_prob`` and
    ``self.block_reward``, calls ``backward()`` on
    ``weighted_loss + loss_alpha`` and rescales ``self.log_alpha.grad`` by
    ``self.block_reward``.

    ``features`` is indexed with every name in ``self.columns`` and with
    the keys ``"label"`` and ``"pos_weights"``.
    """
    regs = 0  # only referenced by the commented-out code further down
    self.weights = self._get_weights(self.log_alpha)
    self.revised_arch_index()
    if self.args.early_fix_arch:
        if len(self.fix_arch_index.keys()) > 0:
            # Rows listed in fix_arch_index are overwritten with a one-hot
            # vector on the first index of their value list.
            for key, value_lst in self.fix_arch_index.items():
                self.weights[key, :].zero_()
                self.weights[key, value_lst[0]] = 1
    cate_prob = F.softmax(self.log_alpha, dim=-1)
    self.cate_prob = cate_prob.clone().detach()
    loss_alpha = torch.log(
        (self.weights * F.softmax(self.log_alpha, dim=-1)).sum(-1)).sum()
    self.weights.requires_grad_()
    inferences = 0
    max_index = self.weights.argmax().item()
    cur_weights = self.weights
    cur_index = 0
    # NOTE(review): `sklearn.externals.joblib` is not available in recent
    # scikit-learn releases - confirm the pinned version.
    from sklearn.externals.joblib import Parallel, delayed
    # Collect the arguments of every (name1, name2) pair first, then
    # evaluate MixedBinary on all of them with a thread pool.
    names_all = []
    for name1 in self.columns:
        for name2 in self.columns:
            if self.args.multi_operation:
                # one row of weights per pair, consumed in loop order
                cur_weights = self.weights[cur_index]
                max_index = cur_weights.argmax().item()
                cur_index += 1
            if self.args.ofm:
                name1_embedding = self.embedding_all[name1][max_index](
                    features[name1])
                name2_embedding = self.embedding_all[name2][max_index](
                    features[name2])
            else:
                name1_embedding = self.embedding_all[name1](
                    features[name1])
                name2_embedding = self.embedding_all[name2](
                    features[name2])
            names_all.append([
                name1_embedding, name2_embedding,
                cur_weights.view(-1, ), self.FC[name1 + ":" + name2]
            ])
    res = Parallel(n_jobs=8, backend="threading")(
        delayed(MixedBinary)(para1, para2, para3, para4)
        for para1, para2, para3, para4 in names_all)
    inferences = sum(res)
    # Previous sequential implementation, kept for reference:
    # for name1 in self.columns:
    #     for name2 in self.columns:
    #         if self.args.multi_operation:
    #             cur_weights = self.weights[cur_index]
    #             max_index = cur_weights.argmax().item()
    #             cur_index += 1
    #         if self.args.ofm:
    #             name1_embedding = self.embedding_all[name1][max_index](features[name1])
    #             name2_embedding = self.embedding_all[name2][max_index](features[name2])
    #         else:
    #             name1_embedding = self.embedding_all[name1](features[name1])
    #             name2_embedding = self.embedding_all[name2](features[name2])
    #         regs += self.reg * (torch.norm(name1_embedding) + torch.norm(name2_embedding))
    #         name1_embedding_trans = self.mlp_p(name1_embedding.view(-1, 1)).view(name1_embedding.size())
    #         name2_embedding_trans = self.mlp_p(name2_embedding.view(-1, 1)).view(name2_embedding.size())
    #         inferences += MixedBinary(name1_embedding_trans, name2_embedding_trans, cur_weights.view(-1,), self.FC[name1 + ":" + name2])
    loss = (inferences - features["label"])**2
    weighted_loss = torch.mean(
        torch.sum(torch.mul(features["pos_weights"], loss), dim=1))
    # Start from a zero gradient on the weights so that, after backward(),
    # weights.grad holds only the contribution of this call.
    self.weights.grad = torch.zeros_like(self.weights)
    (weighted_loss + loss_alpha).backward()
    self.block_reward = self.weights.grad.data.sum(-1)
    self.log_alpha.grad.data.mul_(self.block_reward.view(-1, 1))
    return inferences, weighted_loss, loss_alpha
# Per-metric standardisation, one scaler per column.
mapper = DataFrameMapper(
    [([col], preprocessing.StandardScaler()) for col in metric_cols])

if dn != 0 and n_draws is None:
    var_res = None
else:
    # Norm columns for variance estimation
    variance_cols = list(metric_cols) + ['interview_age']
    variance_mapper = DataFrameMapper(
        [([col], preprocessing.StandardScaler()) for col in variance_cols])
    var_df = raw_df.copy(deep=True)
    var_df.loc[:, variance_cols] = variance_mapper.fit_transform(raw_df)
    print("Estimate variance contributions for each metric", flush=True)
    variance_jobs = (
        joblib.delayed(run_variance_metric_perm)(
            pn, perms[pn], metric, var_df)
        for metric in metric_cols)
    var_res = joblib.Parallel(n_jobs=n_jobs)(variance_jobs)
#var_res = run_variance_perm(pn, perms[pn], raw_df, metric_cols)

# set up perm: reassign device serial numbers following permutation `pn`
permuted_devices = raw_df.loc[perms[pn], 'deviceserialnumber'].values
raw_df.loc[:, 'deviceserialnumber'] = permuted_devices

# draw samples
strata = ['deviceserialnumber']
balance = ['gender', 'ehi_y_ss_scoreb']
order = ['interview_age']
# paradigm=paradigm, frametimes=frametimes, # drift_model=drift_model, hrf_model=hrf_model) # ProgressReport().finish_dir(subject_output_dir) return dict(subject_id=subject_id, mask=mask_path, effects_maps=effects_maps, z_maps=z_maps, contrasts=contrasts) # first level GLM mem = Memory(os.path.join(output_dir, "cache_dir")) n_jobs = min(n_jobs, len(subject_ids)) first_levels = Parallel(n_jobs=n_jobs)( delayed(mem.cache(do_subject_glm))(subject_id) for subject_id in subject_ids) # run second-level GLM group_zmaps = group_one_sample_t_test( [subject_data["mask"] for subject_data in first_levels], [subject_data["effects_maps"] for subject_data in first_levels], first_levels[0]["contrasts"], output_dir, threshold=2.) plot_prob_atlas([zmap for zmap in group_zmaps.values() if "_minus_" in zmap], threshold=1.2, view_type="filled_contours") plt.savefig("group_zmaps.png") show()
def fit(self, subjects, y=None):
    """Compute cross-validated group-sparse precisions.

    Parameters
    ----------
    subjects : list of numpy.ndarray with shapes (n_samples, n_features)
        input subjects. Each subject is a 2D array, whose columns contain
        signals. Sample number can vary from subject to subject, but all
        subjects must have the same number of features (i.e. of columns.)

    y : ignored
        Not used by this method.

    Attributes
    ----------
    covariances_ : numpy.ndarray, shape (n_features, n_features, n_subjects)
        covariance matrices, one per subject.

    precisions_ : numpy.ndarray, shape (n_features, n_features, n_subjects)
        precision matrices, one per subject. All matrices have the same
        sparsity pattern (if a coefficient is zero for a given matrix, it
        is also zero for every other.)

    alpha_ : float
        selected value for penalization parameter.

    cv_alphas_ : list of float
        all penalization parameter values explored.

    cv_scores_ : numpy.ndarray
        one entry per explored penalization value (same order as
        cv_alphas_): the test-set score averaged over the folds.

    Returns
    -------
    self: GroupSparseCovarianceCV
        the object instance itself.
    """
    # `collections.Sequence` no longer exists on recent Python 3 releases;
    # the fallback keeps Python 2 working.
    try:
        from collections.abc import Sequence
    except ImportError:  # Python 2
        from collections import Sequence

    # Empirical covariances
    emp_covs, n_samples = \
        empirical_covariances(subjects, assume_centered=False)
    n_subjects = emp_covs.shape[2]

    # One cv generator per subject must be created, because each subject
    # can have a different number of samples from the others.
    cv = [sklearn.cross_validation.check_cv(self.cv, subjects[k], None,
                                            classifier=False)
          for k in range(n_subjects)]

    path = list()  # List of (alpha, scores, covs)
    n_alphas = self.alphas

    if isinstance(n_alphas, Sequence):
        # Explicit grid: evaluate it once, without refinement.
        alphas = list(self.alphas)
        n_alphas = len(alphas)
        n_refinements = 1
    else:
        n_refinements = self.n_refinements
        alpha_1, _ = compute_alpha_max(emp_covs, n_samples)
        alpha_0 = 1e-2 * alpha_1
        alphas = np.logspace(np.log10(alpha_0), np.log10(alpha_1),
                             n_alphas)[::-1]

    covs_init = itertools.repeat(None)
    for i in range(n_refinements):
        # Compute the cross-validated loss on the current grid
        train_test_subjs = []
        for train_test in zip(*cv):
            assert (len(train_test) == n_subjects)
            # Materialised as a list: each (train, test) pair is read by
            # the early-stopping probes AND by the parallel call below, and
            # a bare zip() can only be consumed once on Python 3.
            train_test_subjs.append(
                list(zip(*[(subject[train, :], subject[test, :])
                           for subject, (train, test)
                           in zip(subjects, train_test)])))
        if self.early_stopping:
            probes = [EarlyStopProbe(test_subjs, verbose=self.verbose)
                      for _, test_subjs in train_test_subjs]
        else:
            probes = itertools.repeat(None)

        this_path = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(group_sparse_covariance_path)(
                train_subjs, alphas, test_subjs=test_subjs,
                max_iter=self.max_iter_cv, tol=self.tol_cv,
                verbose=self.verbose, debug=self.debug,
                # Warm restart is useless with early stopping.
                precisions_init=None if self.early_stopping else prec_init,
                probe_function=probe)
            for (train_subjs, test_subjs), prec_init, probe
            in zip(train_test_subjs, covs_init, probes))

        # this_path[i] is a tuple (precisions_list, scores)
        # - scores: scores obtained with the i-th folding, for each value
        #   of alpha.
        # - precisions_list: corresponding precisions matrices, for each
        #   value of alpha.
        precisions_list, scores = zip(*this_path)
        # now scores[i][j] is the score for the i-th folding, j-th value of
        # alpha (analoguous for precisions_list)
        precisions_list = zip(*precisions_list)
        scores = [np.mean(sc) for sc in zip(*scores)]
        # scores[i] is the mean score obtained for the i-th value of alpha.

        path.extend(zip(alphas, scores, precisions_list))
        path = sorted(path, key=operator.itemgetter(0), reverse=True)

        # Find the maximum score (avoid using the built-in 'max' function
        # to have a fully-reproducible selection of the smallest alpha in
        # case of equality)
        best_score = -np.inf
        last_finite_idx = 0
        for index, (alpha, this_score, _) in enumerate(path):
            # `np.float` was an alias of the builtin `float` and is gone
            # from recent NumPy releases.
            if this_score >= .1 / np.finfo(float).eps:
                this_score = np.nan
            if np.isfinite(this_score):
                last_finite_idx = index
            if this_score >= best_score:
                best_score = this_score
                best_index = index

        # Refine the grid
        if best_index == 0:
            # We do not need to go back: we have chosen
            # the highest value of alpha for which there are
            # non-zero coefficients
            alpha_1 = path[0][0]
            alpha_0 = path[1][0]
            covs_init = path[0][2]
        elif (best_index == last_finite_idx
              and not best_index == len(path) - 1):
            # We have non-converged models on the upper bound of the
            # grid, we need to refine the grid there
            alpha_1 = path[best_index][0]
            alpha_0 = path[best_index + 1][0]
            covs_init = path[best_index][2]
        elif best_index == len(path) - 1:
            alpha_1 = path[best_index][0]
            alpha_0 = 0.01 * path[best_index][0]
            covs_init = path[best_index][2]
        else:
            alpha_1 = path[best_index - 1][0]
            alpha_0 = path[best_index + 1][0]
            covs_init = path[best_index - 1][2]
        # New grid strictly between the two bounds (both ends dropped).
        alphas = np.logspace(np.log10(alpha_1), np.log10(alpha_0),
                             len(alphas) + 2)
        alphas = alphas[1:-1]
        if n_refinements > 1:
            logger.log("[GroupSparseCovarianceCV] Done refinement "
                       "% 2i out of %i" % (i + 1, n_refinements),
                       verbose=self.verbose)

    path = list(zip(*path))
    cv_scores_ = list(path[1])
    alphas = list(path[0])

    self.cv_scores_ = np.array(cv_scores_)
    self.alpha_ = alphas[best_index]
    self.cv_alphas_ = alphas

    # Finally, fit the model with the selected alpha
    logger.log("Final optimization", verbose=self.verbose)
    self.covariances_ = emp_covs
    self.precisions_ = _group_sparse_covariance(emp_covs, n_samples,
                                                self.alpha_, tol=self.tol,
                                                max_iter=self.max_iter,
                                                verbose=self.verbose,
                                                debug=self.debug)
    return self
if not os.path.exists('/'.join( [bird_db_loc, species, subject_id, 'TextGrids'])): os.makedirs('/'.join([bird_db_loc, species, subject_id, 'TextGrids'])) # save wav if not os.path.exists(wav_location): try: urllib.request.urlretrieve(wav, wav_location) except HTTPError: print('Could not retreive ' + wav) # save textgrid if not os.path.exists(grid_location): try: urllib.request.urlretrieve( 'http://taylor0.biology.ucla.edu/birdDBQuery/Files/' + text_grid, grid_location) except HTTPError: print('Could not retreive ' + 'http://taylor0.biology.ucla.edu/birdDBQuery/Files/' + text_grid) if parallel: with Parallel(n_jobs=n_jobs, verbose=verbosity) as parallel: parallel( delayed(downloadBirdDB)(row) for idx, row in tqdm(song_db.iterrows(), total=len(song_db))) else: for idx, row in tqdm(song_db.iterrows(), total=len(song_db)): downloadBirdDB(row)
def fit(self, X, y, **fit_params):
    '''Determine the factor levels and fit one sub-pipeline per level group.

    Steps, in order:

    1. ``self.cat_trans`` is fitted on ``X`` and must produce exactly one
       column; its distinct values are the candidate levels.
    2. Levels are ranked by descending frequency. A level gets its own group
       while fewer than ``self.max_levels`` groups exist and its relative
       frequency is at least ``self.min_coverage``; every other level is
       collected into ``self.default_levels_`` and shares one extra group
       named ``self.default_name``.
    3. ``self.pre_trans`` is fitted on ``(X, y)``; its output (optionally
       joined with ``X`` and with one-hot encoded levels) is split by group.
    4. A deep copy of ``self.sub_pipe`` is fitted on each group in parallel.

    Parameters
    ----------
    X : input frame handed to ``cat_trans`` and ``pre_trans``; also
        concatenated column-wise with their output, so presumably a pandas
        DataFrame -- TODO confirm.
    y : target; must support ``groupby`` with an array of group indexes.
    **fit_params : accepted but not used anywhere in this method.

    Returns
    -------
    self
    '''
    #--- determine groups
    Xg = self.cat_trans.fit_transform(X)
    # the grouping variable must be a single column
    assert (Xg.shape[1] == 1)
    self.varname = self.cat_trans.get_feature_names()[0]
    levels, counts = np.unique(Xg.iloc[:, 0], return_counts=True)
    # order levels, counts and relative coverage by descending count
    idx = np.array(list(reversed(np.argsort(counts))))
    levels, counts, coverage = [
        np.take(x, idx)
        for x in [levels, counts, counts / np.sum(counts)]
    ]
    #--- decide which levels to take
    # regular levels with enough coverage: one group/sub-pipe per level
    self.levels_ = []
    # levels without enough coverage: all map onto the last group/sub-pipe
    self.default_levels_ = []
    # coverage of each group; the last entry belongs to the default group
    # when one exists (this list is overwritten at the end of fit)
    self.coverage_ = []
    for l, c in zip(levels, coverage):
        if len(self.levels_) < self.max_levels and c >= self.min_coverage:
            self.levels_.append(l)
            self.coverage_.append(c)
        else:
            self.default_levels_.append(l)
    #--- insert the default key if necessary
    # a default group is needed only when at least one level was rejected
    if len(self.levels_) < len(levels):
        self.default_key_ = self.default_name
    else:
        self.default_key_ = None
    if self.default_key_ is not None:
        self.levels_.append(self.default_name)
        # the default group covers whatever the regular levels do not
        self.coverage_.append(1. - sum(self.coverage_))
    logger.trace('grouping')
    #--- translate labels to group indexes
    self.lg_dict = {l: g for g, l in enumerate(self.levels_)}

    def xghelper(v):
        # regular level -> its own group index
        res = self.lg_dict.get(v)
        if res is not None:
            return res
        # low-coverage level -> index of the shared default group
        if v in self.default_levels_:
            return self.lg_dict.get(self.default_key_)
        raise Exception(
            "Unknown level '%s' encountered for variable '%s', and no default enabled"
            % (v, self.varname))

    xgroups = Xg.iloc[:, 0].apply(xghelper).values
    logger.trace("pre")
    #--- compute the pre_pipe result and split up into groups
    Xt = self.pre_trans.fit_transform(X, y)
    if not self.take_pre_only:
        # keep the raw columns in front of the pre-transformed ones
        if isinstance(Xt, pd.SparseDataFrame):
            Xt = Xt.to_dense()
        Xt = pd.concat([X, Xt], axis=1)
    if self.propagate_disc_labels:
        # append the one-hot encoded grouping column as extra features
        self.level_encoder_ = OneHotTransformer(
            sparse_output=False).fit(Xg)
        Xgt = self.level_encoder_.transform(Xg)
        Xt = pd.concat([Xt, Xgt], axis=1)
    Xtgroups = {gk: df for gk, df in Xt.groupby(xgroups)}
    ygroups = {gk: df for gk, df in y.groupby(xgroups)}
    logger.trace("segment fit")
    #--- create pipes and fit them for every group
    self.sub_pipes_ = [copy.deepcopy(self.sub_pipe) for l in self.levels_]
    # NOTE(review): the result list replaces sub_pipes_ and is later indexed
    # by group index, which relies on groupby yielding keys 0..k-1 in order
    # and on _fit_one_fittable returning the fitted pipe -- confirm.
    pls = Parallel(n_jobs=self.n_jobs)(delayed(_fit_one_fittable)(
        self.sub_pipes_[gk], Xtgroups[gk], ygroups[gk])
        for gk in Xtgroups.keys())
    self.sub_pipes_ = pls
    # final coverage: fraction of training rows that fell into each group
    self.coverage_ = np.array(
        [df.shape[0] / X.shape[0] for gk, df in Xtgroups.items()])
    return self
def k_means(X, n_clusters, init='k-means++', precompute_distances='auto',
            n_init=10, max_iter=300, verbose=False,
            tol=1e-4, random_state=None, copy_x=True, n_jobs=1,
            return_n_iter=False):
    """K-means clustering algorithm.

    Read more in the :ref:`User Guide <k_means>`.

    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
        The observations to cluster.

    n_clusters : int
        The number of clusters to form as well as the number of
        centroids to generate.

    max_iter : int, optional, default 300
        Maximum number of iterations of the k-means algorithm to run.

    n_init : int, optional, default: 10
        Number of times the k-means algorithm will be run with different
        centroid seeds. The final results will be the best output of
        n_init consecutive runs in terms of inertia.

    init : {'k-means++', 'random', or ndarray, or a callable}, optional
        Method for initialization, default to 'k-means++':

        'k-means++' : selects initial cluster centers for k-mean
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.

        'random': generate k centroids from a Gaussian with mean and
        variance estimated from the data.

        If an ndarray is passed, it should be of shape
        (n_clusters, n_features) and gives the initial centers. In that
        case a single initialization is performed regardless of n_init
        (a RuntimeWarning is emitted when n_init != 1).

        If a callable is passed, it should take arguments X, k and
        a random state and return an initialization.

    precompute_distances : {'auto', True, False}
        Precompute distances (faster but takes more memory).

        'auto' : do not precompute distances if
        n_samples * n_clusters > 12 million. This corresponds to about
        100MB overhead per job using double precision.

        True : always precompute distances

        False : never precompute distances

    tol : float, optional
        The relative increment in the results before declaring convergence.

    verbose : boolean, optional
        Verbosity mode.

    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    copy_x : boolean, optional
        When pre-computing distances it is more numerically accurate to
        center the data first. If copy_x is True, then the original data
        is not modified. If False, the original data is modified, and put
        back before the function returns, but small numerical differences
        may be introduced by subtracting and then adding the data mean.

    n_jobs : int
        The number of jobs to use for the computation. This works by
        computing each of the n_init runs in parallel.

        If -1 all CPUs are used. If 1 is given, no parallel computing code
        is used at all, which is useful for debugging. For n_jobs below -1,
        (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but
        one are used.

    return_n_iter : bool, optional
        Whether or not to return the number of iterations.

    Returns
    -------
    centroid : float ndarray with shape (k, n_features)
        Centroids found at the last iteration of k-means.

    label : integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the
        i'th observation is closest to.

    inertia : float
        The final value of the inertia criterion (sum of squared distances
        to the closest centroid for all observations in the training set).

    best_n_iter: int
        Number of iterations corresponding to the best results.
        Returned only if `return_n_iter` is set to True.

    Raises
    ------
    ValueError
        If n_init or max_iter is not strictly positive, or if
        precompute_distances is neither 'auto' nor a bool.
    """
    if n_init <= 0:
        raise ValueError("Invalid number of initializations."
                         " n_init=%d must be bigger than zero." % n_init)
    random_state = check_random_state(random_state)

    if max_iter <= 0:
        raise ValueError('Number of iterations should be a positive number,'
                         ' got %d instead' % max_iter)

    X = as_float_array(X, copy=copy_x)
    tol = _tolerance(X, tol)

    # If the distances are precomputed every job will create a matrix of shape
    # (n_clusters, n_samples). To stop KMeans from eating up memory we only
    # activate this if the created matrix is guaranteed to be under 100MB. 12
    # million entries consume a little under 100MB if they are of type double.
    if precompute_distances == 'auto':
        n_samples = X.shape[0]
        precompute_distances = (n_clusters * n_samples) < 12e6
    elif isinstance(precompute_distances, bool):
        pass
    else:
        raise ValueError("precompute_distances should be 'auto' or True/False"
                         ", but a value of %r was passed" %
                         precompute_distances)

    # Subtract the mean of X for more accurate distance computations.
    # Sparse data is never centered (that would densify it), so nothing may
    # be shifted by the mean in that case -- including explicit init centers.
    centered = not sp.issparse(X)
    if centered:
        X_mean = X.mean(axis=0)
        # The copy was already done above
        X -= X_mean

    if hasattr(init, '__array__'):
        init = check_array(init, dtype=np.float64, copy=True)
        _validate_center_shape(X, n_clusters, init)

        if centered:
            # keep the user-supplied centers in the same frame as X
            init -= X_mean
        if n_init != 1:
            warnings.warn(
                'Explicit initial center position passed: '
                'performing only one init in k-means instead of n_init=%d'
                % n_init, RuntimeWarning, stacklevel=2)
            n_init = 1

    # precompute squared norms of data points
    x_squared_norms = row_norms(X, squared=True)

    best_labels, best_inertia, best_centers = None, None, None
    if n_jobs == 1:
        # For a single thread, less memory is needed if we just store one set
        # of the best results (as opposed to one set per run per thread).
        for it in range(n_init):
            # run a k-means once
            labels, inertia, centers, n_iter_ = _kmeans_single(
                X, n_clusters, max_iter=max_iter, init=init, verbose=verbose,
                precompute_distances=precompute_distances, tol=tol,
                x_squared_norms=x_squared_norms, random_state=random_state)
            # determine if these results are the best so far
            if best_inertia is None or inertia < best_inertia:
                best_labels = labels.copy()
                best_centers = centers.copy()
                best_inertia = inertia
                best_n_iter = n_iter_
    else:
        # parallelisation of k-means runs
        seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
        results = Parallel(n_jobs=n_jobs, verbose=0)(
            delayed(_kmeans_single)(X, n_clusters, max_iter=max_iter,
                                    init=init, verbose=verbose, tol=tol,
                                    precompute_distances=precompute_distances,
                                    x_squared_norms=x_squared_norms,
                                    # Change seed to ensure variety
                                    random_state=seed)
            for seed in seeds)
        # Get results with the lowest inertia
        labels, inertia, centers, n_iters = zip(*results)
        best = np.argmin(inertia)
        best_labels = labels[best]
        best_inertia = inertia[best]
        best_centers = centers[best]
        best_n_iter = n_iters[best]

    if centered:
        # undo the centering: restore the caller's data if it was modified
        # in place, and express the centers in the original frame
        if not copy_x:
            X += X_mean
        best_centers += X_mean

    if return_n_iter:
        return best_centers, best_labels, best_inertia, best_n_iter
    else:
        return best_centers, best_labels, best_inertia
return all_args def run_grid(paths): for path in paths: os.chdir(path) os.system("grid -i grid.in") def run_dock6(paths): for path in paths: os.chdir(path) os.system("dock6 -i anchor_grow_dock.in") n_jobs = 4 total_n_paths = 500 base_path = "/Users/tud51931/projects/murA/MurA-dock-MSMs" #rec_path = "/Users/tud51931/projects/murA/MurA-MSM-mol2" #setup_working_dirs(base_path) indices_args = gen_paths(n_jobs, total_n_paths, base_path) #print indices_args #print len(indices_args) Parallel(n_jobs=n_jobs, verbose=True)(delayed(run_grid)(indices) for indices in indices_args) Parallel(n_jobs=n_jobs, verbose=True)(delayed(run_dock6)(indices) for indices in indices_args)
def monkeypatch_fit(self, X, y=None, groups=None, **fit_params):
    """Run the parameter search, optionally hoisting pipeline steps out of it.

    Drop-in replacement for the search estimator's ``fit``. Before the usual
    cross-validated evaluation of every candidate, two optional shortcuts are
    applied when the estimator is a ``Pipeline``:

    * ``self.pipeline_split_idx`` is an int: the steps before that index are
      fitted on ``X`` once (without ``y``) and ``X`` is replaced by their
      output; only the remaining step(s) take part in the search. When a
      single step remains, its ``<step>__`` prefix is stripped from fit
      parameters and candidate parameters.
    * ``self.transform_before_grid`` is truthy: the final step is popped off
      the pipeline, the remaining steps are fitted on each training fold and
      used to transform the whole ``X``; the per-fold outputs are stacked and
      the search runs on the final step alone with fold indices shifted to
      address the stacked rows.
      NOTE(review): the shifting assumes every fold's train and test indices
      together cover all rows of ``X`` -- confirm for the CV splitters used.

    Parameters
    ----------
    X : indexable
        Training data.
    y : indexable, optional
        Target.
    groups : indexable, optional
        Group labels forwarded to the CV splitter.
    **fit_params
        Parameters passed to ``fit``; ``<step>__<param>`` names are routed
        to the matching pipeline step. Names without ``__`` are left alone.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        For multi-metric scoring when ``refit`` is not a scorer key / False.
    RuntimeError
        If the steps before ``pipeline_split_idx`` cannot be fitted without
        ``y``.
    """
    if self.fit_params is not None:
        warnings.warn('"fit_params" as a constructor argument was '
                      'deprecated in version 0.19 and will be removed '
                      'in version 0.21. Pass fit parameters to the '
                      '"fit" method instead.', DeprecationWarning)
        if fit_params:
            warnings.warn('Ignoring fit_params passed as a constructor '
                          'argument in favor of keyword arguments to '
                          'the "fit" method.', RuntimeWarning)
        else:
            fit_params = self.fit_params
    # Work on a private copy: entries are popped/renamed below and the dict
    # may be the one stored on the estimator by the caller.
    fit_params = dict(fit_params)

    estimator = self.estimator
    cv = check_cv(self.cv, y, classifier=is_classifier(estimator))

    scorers, self.multimetric_ = _check_multimetric_scoring(
        self.estimator, scoring=self.scoring)

    if self.multimetric_:
        if self.refit is not False and (
                not isinstance(self.refit, six.string_types) or
                # This will work for both dict / list (tuple)
                self.refit not in scorers):
            raise ValueError("For multi-metric scoring, the parameter "
                             "refit must be set to a scorer key "
                             "to refit an estimator with the best "
                             "parameter setting on the whole data and "
                             "make the best_* attributes "
                             "available for that metric. If this is not "
                             "needed, refit should be set to False "
                             "explicitly. %r was passed." % self.refit)
        else:
            refit_metric = self.refit
    else:
        refit_metric = 'score'

    X, y, groups = indexable(X, y, groups)
    n_splits = cv.get_n_splits(X, y, groups)
    # Regenerate parameter iterable for each fit
    candidate_params = list(self._get_param_iterator())
    n_candidates = len(candidate_params)
    if self.verbose > 0:
        print("Fitting {0} folds for each of {1} candidates, totalling"
              " {2} fits".format(n_splits, n_candidates,
                                 n_candidates * n_splits))

    base_estimator = clone(self.estimator)
    pre_dispatch = self.pre_dispatch

    # ===================================================================
    # BEGIN MONKEYPATCH MODIFICATION
    # ===================================================================

    parallel_cv = cv.split(X, y, groups)

    split_idx = self.pipeline_split_idx
    # bool is excluded to keep the original exact-int semantics
    if (isinstance(split_idx, int) and not isinstance(split_idx, bool)
            and isinstance(base_estimator, Pipeline)):
        pre_pipe_steps = base_estimator.steps[:split_idx]
        new_pipe_steps = base_estimator.steps[split_idx:]
        memory = base_estimator.memory
        pre_pipe = Pipeline(pre_pipe_steps, memory)

        if len(new_pipe_steps) == 1:
            est_name, base_estimator = new_pipe_steps[0]
        else:
            est_name = None
            base_estimator = Pipeline(new_pipe_steps, memory)

        fit_params_pre_pipe = {}
        steps_pre_pipe = [tup[0] for tup in pre_pipe_steps]
        # iterate over a snapshot: entries are popped/renamed in the loop
        for pname in list(fit_params):
            step, sep, param = pname.partition('__')
            if not sep:
                continue
            if step in steps_pre_pipe:
                fit_params_pre_pipe[pname] = fit_params.pop(pname)
            elif step == est_name:
                fit_params[param] = fit_params.pop(pname)

        if est_name is not None:
            # the lone remaining estimator takes un-prefixed parameter names
            for dic in candidate_params:
                for k in list(dic):
                    step, sep, param = k.partition('__')
                    if sep and step == est_name:
                        dic[param] = dic.pop(k)

        try:
            X = pre_pipe.fit_transform(X, **fit_params_pre_pipe)
        except TypeError:
            raise RuntimeError('Pipeline before pipeline_split_idx requires '
                               'fitting to y. Please initialize with an '
                               'earlier index.')

    if self.transform_before_grid and isinstance(base_estimator, Pipeline):
        pipe = base_estimator
        est_name, base_estimator = pipe.steps.pop()
        X_cv, y_cv, parallel_cv = [], [], []
        sample_count = 0

        fit_params_est = {}
        for pname in list(fit_params):
            step, sep, param = pname.partition('__')
            if sep and step == est_name:
                fit_params_est[param] = fit_params.pop(pname)

        for dic in candidate_params:
            for k in list(dic):
                step, sep, param = k.partition('__')
                if sep and step == est_name:
                    dic[param] = dic.pop(k)

        for (train, test) in cv.split(X, y, groups):
            if y is not None:
                if isinstance(X, pd.DataFrame):
                    pipe.fit(X.iloc[train], y.iloc[train], **fit_params)
                else:
                    pipe.fit(X[train], y[train], **fit_params)
                y_cv.append(y)
            else:
                if isinstance(X, pd.DataFrame):
                    pipe.fit(X.iloc[train], **fit_params)
                else:
                    pipe.fit(X[train], **fit_params)

            X_cv.append(pipe.transform(X))

            # shift the fold indices so they address this fold's copy of X
            # inside the stacked data built below
            train = train + sample_count
            test = test + sample_count
            sample_count += len(train)
            sample_count += len(test)
            parallel_cv.append((train, test))

        if isinstance(X, pd.DataFrame):
            X = pd.concat(tuple(X_cv))
        else:
            X = np.vstack(tuple(X_cv))

        if y is not None:
            if isinstance(y, pd.Series):
                y = pd.concat(tuple(y_cv))
            else:
                y = np.hstack(tuple(y_cv))

        if 'sample_weight' in fit_params_est:
            # one copy of the weights per stacked copy of X; X_cv is used
            # because y_cv stays empty when y is None
            samp_weight = fit_params_est['sample_weight']
            fit_params_est['sample_weight'] = np.tile(samp_weight,
                                                      len(X_cv))

        fit_params = fit_params_est

    out = Parallel(
        n_jobs=self.n_jobs, verbose=self.verbose,
        pre_dispatch=pre_dispatch
    )(delayed(monkeypatch_fit_and_score)
      (clone(base_estimator), X, y, scorers, train, test, self.verbose,
       parameters, fit_params=fit_params,
       return_train_score=self.return_train_score,
       return_n_test_samples=True, return_times=True,
       return_parameters=False, error_score=self.error_score)
      for parameters, (train, test) in product(candidate_params,
                                               parallel_cv))

    # ===================================================================
    # END MONKEYPATCH MODIFICATION
    # ===================================================================

    # if one choose to see train score, "out" will contain train score info
    if self.return_train_score:
        (train_score_dicts, test_score_dicts, test_sample_counts, fit_time,
         score_time) = zip(*out)
    else:
        (test_score_dicts, test_sample_counts, fit_time,
         score_time) = zip(*out)

    # test_score_dicts and train_score dicts are lists of dictionaries and
    # we make them into dict of lists
    test_scores = _aggregate_score_dicts(test_score_dicts)
    if self.return_train_score:
        train_scores = _aggregate_score_dicts(train_score_dicts)

    # TODO: replace by a dict in 0.21
    results = (DeprecationDict() if self.return_train_score == 'warn'
               else {})

    def _store(key_name, array, weights=None, splits=False, rank=False):
        """A small helper to store the scores/times to the cv_results_"""
        # When iterated first by splits, then by parameters
        # We want `array` to have `n_candidates` rows and `n_splits` cols.
        array = np.array(array, dtype=np.float64).reshape(n_candidates,
                                                          n_splits)
        if splits:
            for split_i in range(n_splits):
                # Uses closure to alter the results
                results["split%d_%s"
                        % (split_i, key_name)] = array[:, split_i]

        array_means = np.average(array, axis=1, weights=weights)
        results['mean_%s' % key_name] = array_means
        # Weighted std is not directly available in numpy
        array_stds = np.sqrt(np.average((array -
                                         array_means[:, np.newaxis]) ** 2,
                                        axis=1, weights=weights))
        results['std_%s' % key_name] = array_stds

        if rank:
            results["rank_%s" % key_name] = np.asarray(
                rankdata(-array_means, method='min'), dtype=np.int32)

    _store('fit_time', fit_time)
    _store('score_time', score_time)
    # Use one MaskedArray and mask all the places where the param is not
    # applicable for that candidate. Use defaultdict as each candidate may
    # not contain all the params
    param_results = defaultdict(partial(MaskedArray,
                                        np.empty(n_candidates,),
                                        mask=True,
                                        dtype=object))
    for cand_i, params in enumerate(candidate_params):
        for name, value in params.items():
            # An all masked empty array gets created for the key
            # `"param_%s" % name` at the first occurence of `name`.
            # Setting the value at an index also unmasks that index
            param_results["param_%s" % name][cand_i] = value

    results.update(param_results)
    # Store a list of param dicts at the key 'params'
    results['params'] = candidate_params

    # NOTE test_sample counts (weights) remain the same for all candidates
    # (builtin int: the np.int alias no longer exists in current NumPy)
    test_sample_counts = np.array(test_sample_counts[:n_splits],
                                  dtype=int)
    for scorer_name in scorers.keys():
        # Computed the (weighted) mean and std for test scores alone
        _store('test_%s' % scorer_name, test_scores[scorer_name],
               splits=True, rank=True,
               weights=test_sample_counts if self.iid else None)
        if self.return_train_score:
            prev_keys = set(results.keys())
            _store('train_%s' % scorer_name, train_scores[scorer_name],
                   splits=True)

            if self.return_train_score == 'warn':
                for key in set(results.keys()) - prev_keys:
                    message = (
                        'You are accessing a training score ({!r}), '
                        'which will not be available by default '
                        'any more in 0.21. If you need training scores, '
                        'please set return_train_score=True').format(key)
                    # warn on key access
                    results.add_warning(key, message, FutureWarning)

    # For multi-metric evaluation, store the best_index_, best_params_ and
    # best_score_ iff refit is one of the scorer names
    # In single metric evaluation, refit_metric is "score"
    if self.refit or not self.multimetric_:
        self.best_index_ = results["rank_test_%s" % refit_metric].argmin()
        self.best_params_ = candidate_params[self.best_index_]
        self.best_score_ = results["mean_test_%s" % refit_metric][
            self.best_index_]

    if self.refit:
        self.best_estimator_ = clone(base_estimator).set_params(
            **self.best_params_)
        if y is not None:
            self.best_estimator_.fit(X, y, **fit_params)
        else:
            self.best_estimator_.fit(X, **fit_params)

    # Store the only scorer not as a dict for single metric evaluation
    self.scorer_ = scorers if self.multimetric_ else scorers['score']

    self.cv_results_ = results
    self.n_splits_ = n_splits

    return self
def transform(self, X):
    """Encode a frame with the levels learned during ``fit``.

    Each fitted column is label-encoded (one parallel job per column) and
    the encoded block is then expanded by the fitted one-hot encoder. The
    original columns are replaced by the generated indicator columns.

    Parameters
    ----------
    X : pd.DataFrame, shape=(n_samples, n_features)
        The frame to transform. It is passed through ``check_dataframe``
        first and the result of that call is what gets modified.

    Returns
    -------
    X : pd.DataFrame or np.ndarray
        The encoded data, converted by ``dataframe_or_array`` according to
        ``self.as_df``.
    """
    check_is_fitted(self, 'ohe_')
    X, _ = check_dataframe(X, cols=self.cols)

    # every column seen at fit time has to be present at transform time
    fitted_cols = self.fit_cols_
    validate_test_set_columns(fitted_cols, X.columns)

    # fitted state and options used below
    one_hot = self.ohe_
    label_encoders = self.le_
    separator = self.sep
    drop_last = self.drop_one_level

    # label-encode every column in parallel
    jobs = (
        delayed(_le_transform)(
            col=name, vec=X[name].values, le=label_encoders[name],
            handle=self.handle_unknown, sep=separator)
        for name in fitted_cols)
    encoded = Parallel(n_jobs=self.n_jobs)(jobs)

    # Cheap sequential pass: write the encoded vectors back, collect the
    # output column names in order and remember which ones get dropped.
    out_names = []
    to_drop = []
    for name, codes, class_names in encoded:
        X[name] = codes
        out_names += list(class_names)

        # when dropping one level per feature, drop its last one
        if drop_last and len(class_names) > 1:
            to_drop.append(class_names[-1])

    # expand the label codes into indicator columns, aligned on X's index
    indicators = pd.DataFrame.from_records(
        data=one_hot.transform(X[fitted_cols]), columns=out_names)
    indicators.index = X.index

    if to_drop:
        indicators = indicators.drop(to_drop, axis=1)

    # the raw columns are replaced by their encoded counterparts
    X = X.drop(fitted_cols, axis=1)

    # if nothing but the encoded columns is left, return those directly;
    # otherwise append them to the untouched columns
    if X.columns.tolist():
        X = pd.concat([X, indicators], axis=1)  # type: pd.DataFrame
        return dataframe_or_array(X, self.as_df)
    return dataframe_or_array(indicators, self.as_df)
def _evaluate_individuals(self, individuals, features, target,
                          sample_weight=None, groups=None):
    """Determine the fit of the provided individuals.

    Parameters
    ----------
    individuals: a list of DEAP individual
        One individual is a list of pipeline operators and model parameters
        that can be compiled by DEAP into a callable function
    features: numpy.ndarray {n_samples, n_features}
        A numpy matrix containing the training and testing features for the
        individual's evaluation
    target: numpy.ndarray {n_samples}
        A numpy matrix containing the training and testing target for the
        individual's evaluation
    sample_weight: array-like {n_samples}, optional
        List of sample weights to balance (or un-balance) the dataset target
        as needed
    groups: array-like {n_samples, }, optional
        Group labels for the samples used while splitting the dataset into
        train/test set

    Returns
    -------
    fitnesses_ordered: list of tuple
        One ``(operator_count, score)`` tuple per entry of ``individuals``,
        in the same order, read from ``self.evaluated_individuals_``.

    Raises
    ------
    KeyboardInterrupt
        If ``self.max_time_mins`` is set and has already elapsed.
    ValueError
        If an evaluation returns something that is not a float.
    """
    if self.max_time_mins:
        total_mins_elapsed = (
            datetime.now() - self._start_datetime).total_seconds() / 60.
        if total_mins_elapsed >= self.max_time_mins:
            raise KeyboardInterrupt(
                '{} minutes have elapsed. TPOT will close down.'.format(
                    total_mins_elapsed))

    # Check we do not evaluate twice the same individual in one pass.
    _, unique_individual_indices = np.unique(
        [str(ind) for ind in individuals], return_index=True)
    # a set makes the membership test O(1) instead of scanning the array
    unique_index_set = set(unique_individual_indices.tolist())
    unique_individuals = [
        ind for i, ind in enumerate(individuals) if i in unique_index_set
    ]

    # operator count per individual that actually gets evaluated
    operator_counts = {}

    # parallel lists: string form and compiled pipeline of each individual
    # queued for evaluation
    eval_individuals_str = []
    sklearn_pipeline_list = []

    for individual in unique_individuals:
        # Disallow certain combinations of operators because they will take
        # too long or take up too much RAM. This is a fairly hacky way to
        # prevent TPOT from getting stuck on bad pipelines and should be
        # improved in a future release
        individual_str = str(individual)
        sklearn_pipeline_str = generate_pipeline_code(
            expr_to_tree(individual, self._pset), self.operators)
        if sklearn_pipeline_str.count('PolynomialFeatures') > 1:
            if self.verbosity > 2:
                self._pbar.write(
                    'Invalid pipeline encountered. Skipping its evaluation.')
            self.evaluated_individuals_[individual_str] = (
                5000., -float('inf'))
            if not self._pbar.disable:
                self._pbar.update(1)
        # Check if the individual was evaluated before
        elif individual_str in self.evaluated_individuals_:
            if self.verbosity > 2:
                self._pbar.write(
                    'Pipeline encountered that has previously been evaluated during the '
                    'optimization process. Using the score from the previous evaluation.')
            if not self._pbar.disable:
                self._pbar.update(1)
        else:
            try:
                # Transform the tree expression into an sklearn pipeline
                sklearn_pipeline = self._toolbox.compile(expr=individual)

                # Fix random state when the operator allows
                self._set_param_recursive(sklearn_pipeline.steps,
                                          'random_state', 42)
                # Setting the seed is needed for XGBoost support because
                # XGBoost currently stores both a seed and random_state, and
                # they're not synced correctly. XGBoost will raise an
                # exception if random_state != seed.
                if 'XGB' in sklearn_pipeline_str:
                    self._set_param_recursive(sklearn_pipeline.steps,
                                              'seed', 42)

                # Count the number of pipeline operators as a measure of
                # pipeline complexity
                operator_count = self._operator_count(individual)
                operator_counts[individual_str] = max(1, operator_count)
            except Exception:
                # deliberate best effort: a pipeline that cannot be built
                # is recorded with the worst possible fitness
                self.evaluated_individuals_[individual_str] = (
                    5000., -float('inf'))
                if not self._pbar.disable:
                    self._pbar.update(1)
                continue
            eval_individuals_str.append(individual_str)
            sklearn_pipeline_list.append(sklearn_pipeline)

    # evaluate pipelines, in chunks so the progress bar advances regularly
    resulting_score_list = []
    chunk_size = self.n_jobs * 4
    if chunk_size <= 0:
        # A non-positive n_jobs (e.g. -1) would give range() a zero or
        # negative step and silently skip every evaluation; fall back to a
        # single chunk holding all pipelines.
        chunk_size = max(1, len(sklearn_pipeline_list))
    for chunk_idx in range(0, len(sklearn_pipeline_list), chunk_size):
        jobs = []
        for sklearn_pipeline in sklearn_pipeline_list[
                chunk_idx:chunk_idx + chunk_size]:
            job = delayed(_wrapped_cross_val_score)(
                sklearn_pipeline=sklearn_pipeline,
                features=features,
                target=target,
                cv=self.cv,
                scoring_function=self.scoring_function,
                sample_weight=sample_weight,
                max_eval_time_mins=self.max_eval_time_mins,
                groups=groups)
            jobs.append(job)
        parallel = Parallel(n_jobs=self.n_jobs, verbose=0,
                            pre_dispatch='2*n_jobs')
        tmp_result_score = parallel(jobs)
        # update pbar
        for val in tmp_result_score:
            if not self._pbar.disable:
                self._pbar.update(1)
            if val == 'Timeout':
                if self.verbosity > 2:
                    self._pbar.write(
                        'Skipped pipeline #{0} due to time out. '
                        'Continuing to the next pipeline.'.format(
                            self._pbar.n))
                resulting_score_list.append(-float('inf'))
            else:
                resulting_score_list.append(val)

    for resulting_score, individual_str in zip(resulting_score_list,
                                               eval_individuals_str):
        if type(resulting_score) in (float, np.float64, np.float32):
            self.evaluated_individuals_[individual_str] = (
                operator_counts[individual_str], resulting_score)
        else:
            raise ValueError('Scoring function does not return a float.')

    return [
        self.evaluated_individuals_[str(individual)]
        for individual in individuals
    ]
def fit(self, X, Y, constraints=None, warm_start=None, initialize=True):
    """Learn parameters using cutting plane method.

    Parameters
    ----------
    X : iterable
        Training instances. Contains the structured input objects.
        No requirement on the particular form of entries of X is made.

    Y : iterable
        Training labels. Contains the structured labels for inputs in X.
        Needs to have the same length as X.

    constraints : iterable
        Known constraints for warm-starts. List of same length as X.
        Each entry is itself a list of constraints for a given instance x .
        Each constraint is of the form [y_hat, delta_psi, loss], where
        y_hat is a labeling, ``delta_psi = psi(x, y) - psi(x, y_hat)``
        and loss is the loss for predicting y_hat instead of the true label
        y. When given, the curves and timestamps stored on ``self`` by a
        previous fit are extended rather than reset.

    warm_start : object, default=None
        Not used by this method; kept for interface compatibility.

    initialize : boolean, default=True
        Whether to initialize the model for the data.
        Leave this true except if you really know what you are doing.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If ``self.batch_size`` is neither -1 nor an integer >= 1.
    """
    print("Training n-slack dual structural SVM")
    cvxopt.solvers.options['show_progress'] = self.verbose > 3
    if initialize:
        self.model.initialize(X, Y)
    self.w = np.zeros(self.model.size_psi)
    n_samples = len(X)
    stopping_criterion = False
    if constraints is None:
        # fresh start
        constraints = [[] for _ in range(n_samples)]
        self.last_active = [[] for _ in range(n_samples)]
        self.objective_curve_ = []
        self.primal_objective_curve_ = []
        self.timestamps_ = [time()]
        # With w == 0 and no constraints the cutting plane objective is 0.
        # Without this, a first pass that adds no constraint would hit an
        # unbound local when the objective is recorded below.
        objective = 0.
    else:
        # warm start
        objective = self._solve_n_slack_qp(constraints, n_samples)

    # batch layout does not change between iterations: validate it once
    if self.batch_size < 1 and not self.batch_size == -1:
        raise ValueError("batch_size should be integer >= 1 or -1,"
                         "got %s." % str(self.batch_size))
    batch_size = (self.batch_size if self.batch_size != -1 else len(X))
    n_batches = int(np.ceil(float(len(X)) / batch_size))
    indices = np.arange(n_samples)

    try:
        # catch ctrl+c to stop training
        # we have to update at least once after going through the dataset
        for iteration in range(self.max_iter):
            # main loop
            self.timestamps_.append(time() - self.timestamps_[0])
            if self.verbose > 0:
                print("iteration %d" % iteration)
            if self.verbose > 2:
                print(self)
            new_constraints = 0
            # generate slices through dataset from batch_size
            # (a fresh generator is needed for every pass)
            slices = gen_even_slices(n_samples, n_batches)
            slack_sum = 0
            for batch in slices:
                new_constraints_batch = 0
                verbose = max(0, self.verbose - 3)
                X_b = X[batch]
                Y_b = Y[batch]
                indices_b = indices[batch]
                candidate_constraints = Parallel(
                    n_jobs=self.n_jobs, verbose=verbose)(
                        delayed(find_constraint)(self.model, x, y, self.w)
                        for x, y in zip(X_b, Y_b))

                # for each batch, gather new constraints
                for i, x, y, constraint in zip(indices_b, X_b, Y_b,
                                               candidate_constraints):
                    # loop over samples in batch
                    y_hat, delta_psi, slack, loss = constraint
                    slack_sum += slack

                    if self.verbose > 3:
                        print("current slack: %f" % slack)

                    if not loss > 0:
                        # can have y != y_hat but loss = 0 in latent svm.
                        # we need this here as dpsi is then != 0
                        continue

                    if self._check_bad_constraint(y_hat, slack,
                                                  constraints[i]):
                        continue

                    constraints[i].append([y_hat, delta_psi, loss])
                    new_constraints_batch += 1

                # after processing the slice, solve the qp
                if new_constraints_batch:
                    objective = self._solve_n_slack_qp(
                        constraints, n_samples)
                    new_constraints += new_constraints_batch

            self.objective_curve_.append(objective)
            self._compute_training_loss(X, Y, iteration)

            primal_objective = (self.C
                                * slack_sum
                                + np.sum(self.w ** 2) / 2)
            self.primal_objective_curve_.append(primal_objective)

            if self.verbose > 0:
                print("new constraints: %d, "
                      "cutting plane objective: %f primal objective: %f"
                      % (new_constraints, objective, primal_objective))

            if new_constraints == 0:
                print("no additional constraints")
                stopping_criterion = True

            if (iteration > 1 and self.objective_curve_[-1]
                    - self.objective_curve_[-2] < self.tol):
                print("objective converged.")
                stopping_criterion = True

            if stopping_criterion:
                if (self.switch_to is not None
                        and self.model.inference_method != self.switch_to):
                    # converged with the current inference method: remember
                    # it, switch to the other one and keep training
                    print("Switching to %s inference" %
                          str(self.switch_to))
                    self.model.inference_method_ = \
                        self.model.inference_method
                    self.model.inference_method = self.switch_to
                    stopping_criterion = False
                    continue
                else:
                    break

            if self.verbose > 5:
                print(self.w)

            if self.logger is not None:
                self.logger(self, iteration)
    except KeyboardInterrupt:
        # ctrl+c stops training early; whatever was learned so far is kept
        pass

    if self.logger is not None:
        self.logger(self, 'final')

    self.constraints_ = constraints
    if self.verbose and self.n_jobs == 1:
        print("calls to inference: %d" % self.model.inference_calls)
    return self
# featdict['behavior_timestamp_click_month_NUNIQUE'] = df_tmp['behavior_timestamp'].dt.month.nunique() # featdict['behavior_timestamp_click_day_NUNIQUE'] = df_tmp['behavior_timestamp'].dt.day.nunique() # featdict['behavior_timestamp_click_hour_NUNIQUE'] = df_tmp['behavior_timestamp'].dt.hour.nunique() # featdict['behavior_timestamp_click_minute_NUNIQUE'] = df_tmp['behavior_timestamp'].dt.minute.nunique() return featdict train_id = pd.read_csv('./train_id.csv') test_id = pd.read_csv('./test_id.csv') train_id = pd.read_csv('./train_id.csv') test_id = pd.read_csv('./test_id.csv') train_feat = Parallel(n_jobs=30)( delayed(feature_agg)(i, './train/' + id + '.hdf') for i, id in enumerate(train_id['user_id'].iloc[:])) test_feat = Parallel(n_jobs=30)( delayed(feature_agg)(i, './test/' + id + '.hdf') for i, id in enumerate(test_id['user_id'].iloc[:])) train_feat = pd.DataFrame(train_feat) test_feat = pd.DataFrame(test_feat) train_feat = pd.merge(train_feat, train_id, on='user_id', how='left') params = { 'learning_rate': 0.01, 'min_child_samples': 5, 'max_depth': -1, 'lambda_l1': 2, 'boosting': 'gbdt',
def _cpu_map(fun, param_grid, n_jobs, verbose):
    """Apply ``fun`` to every entry of ``param_grid`` through ``Parallel``.

    The pool is created with ``n_jobs`` workers, the given ``verbose`` level
    and the "threading" backend; whatever the ``Parallel`` call returns is
    handed back unchanged.
    """
    # any sklearn backend should work here
    pool = Parallel(n_jobs=n_jobs, verbose=verbose, backend="threading")
    tasks = (delayed(fun)(params) for params in param_grid)
    return pool(tasks)
def fit(self, X, y):
    """Build a Bagging ensemble of estimators from the training set (X, y).

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape = [n_samples, n_features]
        The training input samples. Sparse matrices are accepted only if
        they are supported by the base estimator.

    y : array-like, shape = [n_samples]
        The target values (class labels in classification, real numbers in
        regression).

    Returns
    -------
    self : object
        Returns self.

    Raises
    ------
    ValueError
        If ``max_samples`` or ``max_features`` resolve outside their valid
        range, if ``oob_score`` is requested together with
        ``bootstrap=False`` or ``warm_start=True``, or if ``n_estimators``
        is smaller than the number of already fitted estimators.
    """
    rng = check_random_state(self.random_state)

    # Validate the inputs; CSR and CSC sparse formats are accepted.
    X, y = check_X_y(X, y, ['csr', 'csc'])

    # Record the data dimensions and let the subclass remap the targets.
    n_rows, self.n_features_ = X.shape
    y = self._validate_y(y)

    self._validate_estimator()

    integer_types = (numbers.Integral, np.integer)

    # Integers are taken literally, anything else as a fraction of the rows.
    max_samples = (self.max_samples
                   if isinstance(self.max_samples, integer_types)
                   else int(self.max_samples * n_rows))
    if max_samples <= 0 or max_samples > n_rows:
        raise ValueError("max_samples must be in (0, n_samples]")

    # Same convention for the number of features.
    max_features = (self.max_features
                    if isinstance(self.max_features, integer_types)
                    else int(self.max_features * self.n_features_))
    if max_features <= 0 or max_features > self.n_features_:
        raise ValueError("max_features must be in (0, n_features]")

    # Out-of-bag scoring needs bootstrapping and a cold start.
    if self.oob_score and not self.bootstrap:
        raise ValueError("Out of bag estimation only available"
                         " if bootstrap=True")
    if self.oob_score and self.warm_start:
        raise ValueError("Out of bag estimate only available"
                         " if warm_start=False")

    if self.warm_start and hasattr(self, "oob_score_"):
        del self.oob_score_

    if not (self.warm_start and len(self.estimators_) != 0):
        # Free allocated memory, if any
        self.estimators_ = []
        self.estimators_samples_ = []
        self.estimators_features_ = []

    n_fitted = len(self.estimators_)
    n_more_estimators = self.n_estimators - n_fitted

    if n_more_estimators < 0:
        raise ValueError('n_estimators=%d must be larger or equal to '
                         'len(estimators_)=%d when warm_start==True'
                         % (self.n_estimators, n_fitted))
    if n_more_estimators == 0:
        warn("Warm-start fitting without increasing n_estimators does not "
             "fit new trees.")
        return self

    # Split the remaining estimators across the workers.
    n_jobs, n_estimators, starts = _partition_estimators(
        n_more_estimators, self.n_jobs)

    # On a warm start, burn the draws that seeded the existing estimators
    # so the new seeds continue the same random stream.
    if self.warm_start and n_fitted > 0:
        rng.randint(MAX_INT, size=n_fitted)
    seeds = rng.randint(MAX_INT, size=n_more_estimators)

    pool = Parallel(n_jobs=n_jobs, verbose=self.verbose)
    # TEF: changed following call to balanced procedure:
    all_results = pool(
        delayed(_parallel_build_balanced_estimators)(
            n_estimators[job],
            self,
            X,
            y,
            seeds[starts[job]:starts[job + 1]],
            verbose=self.verbose)
        for job in range(n_jobs))

    # Each worker returns (estimators, samples, features); flatten them.
    flatten = itertools.chain.from_iterable
    self.estimators_ += list(flatten(res[0] for res in all_results))
    self.estimators_samples_ += list(flatten(res[1] for res in all_results))
    self.estimators_features_ += list(
        flatten(res[2] for res in all_results))

    if self.oob_score:
        self._set_oob_score(X, y)

    return self
def grid_search_early_stopping(estimator, param_grid, verbose, scoring, cv,
                               X, y, early_stopping_rounds, eval_set_size,
                               n_jobs=1, iid=True, refit=True,
                               pre_dispatch='2*n_jobs',
                               error_score='raise'):
    """Exhaustive grid search whose fits receive early-stopping arguments.

    Adapted from the scikit-learn grid-search implementation.

    Parameters
    ----------
    estimator : estimator object
        Cloned once per (parameter setting, fold) fit.
    param_grid : dict or list of dicts
        Passed to ``ParameterGrid`` to enumerate the candidates.
    verbose : int
        ``> 0`` prints a summary and enables joblib/fit verbosity;
        ``> 1`` additionally sets ``verbose=True`` in the fit parameters.
    scoring : object
        Passed to ``check_scoring`` and ``get_xgboost_eval_metric``.
    cv : object
        Passed to ``check_cv``; the result must support ``len`` and
        iteration over ``(train, test)`` pairs.
    X, y : indexable
        Training data and targets.
    early_stopping_rounds : int
        Forwarded as the ``early_stopping_rounds`` fit parameter.
    eval_set_size : object
        Forwarded to ``fit_estimator_early_stopping`` when refitting.
    n_jobs : int, default 1
        Forwarded to ``Parallel``.
    iid : bool, default True
        If true, fold scores are weighted by the fold's test-set size;
        otherwise the plain mean over folds is used.
    refit : bool, default True
        If true, refit the best candidate with
        ``fit_estimator_early_stopping``.
    pre_dispatch : int or str, default '2*n_jobs'
        Forwarded to ``Parallel``.
    error_score : 'raise' or numeric, default 'raise'
        Forwarded to ``_fit_and_score``.

    Returns
    -------
    best_estimator : estimator or None
        The refitted best estimator, or ``None`` when ``refit`` is false.
    best_parameters : dict
        Parameters of the best-scoring candidate.
    grid_scores : list of _CVScoreTuple
        One entry per candidate.

    Raises
    ------
    ValueError
        If ``y`` and ``X`` have different lengths, or if ``refit`` is
        requested while ``y`` is ``None``.
    """
    parameter_iterable = ParameterGrid(param_grid)
    scorer_ = check_scoring(estimator, scoring=scoring)

    n_samples = _num_samples(X)
    X, y = indexable(X, y)

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    if verbose > 0:
        if isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(len(cv), n_candidates,
                                     n_candidates * len(cv)))

    base_estimator = clone(estimator)
    fit_verbosity = 2 if verbose > 0 else 0

    out = Parallel(
        n_jobs=n_jobs, verbose=fit_verbosity,
        pre_dispatch=pre_dispatch)(delayed(_fit_and_score)(
            clone(base_estimator), X, y, scorer_, train, test,
            fit_verbosity, parameters, {
                "early_stopping_rounds": early_stopping_rounds,
                "eval_metric": get_xgboost_eval_metric(scoring),
                # the held-out fold doubles as the early-stopping eval set
                "eval_set": [_safe_split(estimator, X, y, test, train)],
                "verbose": True if verbose > 1 else False
            },
            return_parameters=True, error_score=error_score)
        for parameters in parameter_iterable
        for train, test in cv)

    # Each entry of ``out`` unpacks into four values:
    # score, n_test_samples, an unused third value, parameters.
    n_fits = len(out)
    n_folds = len(cv)

    grid_scores = list()
    # Results are ordered candidate-major, so each run of n_folds entries
    # belongs to one parameter setting.
    for grid_start in range(0, n_fits, n_folds):
        n_test_samples = 0
        score = 0
        all_scores = []
        for this_score, this_n_test_samples, _, parameters in \
                out[grid_start:grid_start + n_folds]:
            all_scores.append(this_score)
            if iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        if iid:
            score /= float(n_test_samples)
        else:
            score /= float(n_folds)
        # TODO: shall we also store the test_fold_sizes?
        grid_scores.append(
            _CVScoreTuple(parameters, score, np.array(all_scores)))

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                  reverse=True)[0]

    # Bound up front so that refit=False returns None instead of raising
    # UnboundLocalError at the return statement.
    best_estimator = None
    if refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(**best.parameters)
        if y is not None:
            best_estimator, _, _ = fit_estimator_early_stopping(
                best_estimator, X, y, scoring, early_stopping_rounds,
                eval_set_size, verbose)
        else:
            raise ValueError('y is required.')

    return best_estimator, best.parameters, grid_scores
def fit(self, X, y=None, groups=None, **fit_params):
    """Run fit on the estimator with randomly drawn parameters.

    The search samples several groups of configurations, evaluates every
    surviving configuration with cross-validation in each round, and keeps
    only the best-ranked fraction of each group for the next round while
    increasing the cost parameter.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape = [n_samples] or [n_samples, n_output], optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    groups : array-like, with shape (n_samples,), optional
        Group labels for the samples used while splitting the dataset into
        train/test set.

    **fit_params : dict of string -> object
        Parameters passed to the ``fit`` method of the estimator

    Returns
    -------
    self : object
        The fitted search object; sets ``cv_results_``, ``best_index_``,
        ``n_splits_``, ``multimetric_`` and, when ``refit`` is true,
        ``best_estimator_``.
    """
    estimator = self.estimator
    cv = check_cv(self.cv, y, classifier=is_classifier(estimator))
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
    self._random_state = check_random_state(self.random_state)

    X, y, groups = indexable(X, y, groups)
    n_splits = cv.get_n_splits(X, y, groups)

    # R / Rmin: largest and smallest value of the cost parameter. Only the
    # first entry of each dict is used; Rmin defaults to 1.
    R = list(self.cost_parameter_max.values())[0]
    if self.cost_parameter_min is None:
        Rmin = 1
    else:
        Rmin = list(self.cost_parameter_min.values())[0]
    n_candidates = hyperband_num_per_run(self.eta, R, Rmin)
    log.debug(
        "Fitting %d folds for each of %d candidates, totalling "
        "%d fits.", n_splits, n_candidates, n_candidates * n_splits)
    if self.verbose > 0:
        print("Fitting {0} folds for each of {1} candidates, totalling"
              " {2} fits".format(n_splits, n_candidates,
                                 n_candidates * n_splits))

    base_estimator = clone(self.estimator)
    pre_dispatch = self.pre_dispatch

    # Materialize the splits once so every round reuses the same folds.
    cv_iter = list(cv.split(X, y, groups))
    out = []  # accumulated _fit_and_score outputs over all rounds
    smax = int(np.floor(np.log(R / Rmin) / np.log(self.eta)))
    B = (smax + 1.0) * R
    # This code is hyperband, but I have swapped the order of the
    # inner and outer loops to expose more parallelism. Fun.
    # Per value of s (from smax down to 0):
    #   Ts[ind] - the sampled parameter settings still alive
    #   ns[ind] - how many settings were sampled initially
    #   rs[ind] - the starting value of the cost parameter
    Ts = []
    ns = []
    rs = []
    for s in range(smax, -1, -1):
        ns.append(int(np.ceil(B / R * np.power(self.eta, s) / (s + 1.0))))
        rs.append(int(R / np.power(self.eta, s)))
        Ts.append(
            list(
                ParameterSampler(self.param_distributions, ns[-1],
                                 random_state=self._random_state)))
    # nums tracks the number of live settings per s; ns keeps the originals.
    nums = copy.copy(ns)

    # these are the offsets to the hyperparameter configurations for
    # each value of s in the loop above
    # they get updated as the loop over the different rounds get run
    # below
    offsets = [0] + list(np.cumsum(
        np.array(nums) * n_splits).astype(int))[:-1]

    # iterate the maximum number of times for each resource budget
    # configuration.
    # If we should skip an iteration, T will be an empty list
    for rnd in range(0, smax + 1):
        # set the costs for this round
        r_rnd = []
        for ind, s in enumerate(range(smax, -1, -1)):
            _r = int(rs[ind] * np.power(self.eta, rnd))
            r_rnd += [_r] * nums[ind]

        # run the jobs
        _jobs = []
        for parameters, _r in zip(itertools.chain.from_iterable(Ts), r_rnd):
            # copy so the sampled setting is not mutated by the cost update
            _parameters = copy.deepcopy(parameters)
            _parameters.update(
                {list(self.cost_parameter_max.keys())[0]: _r})
            for train, test in cv_iter:
                _jobs.append(
                    delayed(_fit_and_score)(
                        clone(base_estimator), X, y, self.scorer_,
                        train, test, self.verbose, _parameters,
                        fit_params=fit_params,
                        return_train_score=self.return_train_score,
                        return_n_test_samples=True,
                        return_times=True, return_parameters=True,
                        error_score=self.error_score))
        _out = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                        pre_dispatch=pre_dispatch)(_jobs)
        out += _out

        # now post-process
        new_Ts = []
        new_nums = []
        for ind, s in enumerate(range(smax, -1, -1)):
            n_i = int(np.floor(ns[ind] / np.power(self.eta, rnd)))
            num_to_keep = int(np.floor(n_i / self.eta))
            # keep for next round only if num_to_keep > 0 AND
            # the round after this round will be executed
            # in otherwords, you only need to cut the configurations
            # down by eta if you are going to test them in the next
            # round
            if num_to_keep > 0 and rnd < s:
                # slice out this group's results from the round's output
                _out_s = _out[offsets[ind]:(offsets[ind] +
                                            nums[ind] * n_splits)]
                results, _ = self._process_outputs(_out_s, n_splits)
                # boolean mask selecting the num_to_keep best-ranked settings
                sind = np.argsort(results["rank_test_score"])
                msk = np.zeros(len(results['rank_test_score']))
                msk[sind[0:num_to_keep]] = 1
                msk = msk.astype(bool)
                new_Ts.append(
                    [p for k, p in enumerate(results['params']) if msk[k]])
                new_nums.append(num_to_keep)
            else:
                new_Ts.append([])
                new_nums.append(0)
        Ts = new_Ts
        nums = new_nums
        # recompute where each group's results start in the next round
        offsets = [0] + list(
            np.cumsum(np.array(nums) * n_splits).astype(int))[:-1]

    # Summarize every fit from every round.
    results, best_index = self._process_outputs(out, n_splits)
    self.cv_results_ = results
    self.best_index_ = best_index
    self.n_splits_ = n_splits
    self.multimetric_ = False
    # Only assigned when the attribute is not already available.
    # NOTE(review): presumably a parent class may expose these as
    # properties - confirm.
    if not hasattr(self, 'best_score_'):
        self.best_score_ = results['mean_test_score'][best_index]
    if not hasattr(self, 'best_params_'):
        self.best_params_ = results['params'][best_index]
    if self.refit:
        # refit a fresh clone with the best parameters on the full data
        best_estimator = clone(self.estimator).set_params(
            **self.cv_results_['params'][self.best_index_])
        if y is not None:
            best_estimator.fit(X, y, **fit_params)
        else:
            best_estimator.fit(X, **fit_params)
        self.best_estimator_ = best_estimator
    return self
def fit(self, X, y):
    """Fit model according to X and y.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.

    y : array-like, shape = [n_samples]
        Target values.

    Returns
    -------
    self : classifier
        Returns self.

    Raises
    ------
    NotImplementedError
        If ``multiclass`` is set with a penalty other than ``"l1/l2"``.
    """
    random_state = self._get_random_state()

    # Wrap the input as a dataset object
    dataset = get_dataset(X, order="fortran")
    n_samples = dataset.get_n_samples()
    n_features = dataset.get_n_features()

    group_lasso = self.penalty == "l1/l2"

    if self.multiclass and not group_lasso:
        raise NotImplementedError(
            "True multiclass options not implemented "
            "for non group-lasso(l1/l2) penalties.")

    # Label transformers; labels are re-encoded only for the group lasso
    y, _n_classes, n_vectors = self._set_label_transformers(
        y, group_lasso, neg_label=-1)
    Y = np.asfortranarray(self.label_binarizer_.transform(y),
                          dtype=np.float64)

    # Start from zero coefficients unless a warm start can reuse old ones
    cold_start = not self.warm_start or self.coef_ is None
    if cold_start:
        self.C_init = self.C
        self.coef_ = np.zeros((n_vectors, n_features), dtype=np.float64)
        self._init_errors(Y)

    self.intercept_ = np.zeros(n_vectors, dtype=np.float64)
    indices = np.arange(n_features, dtype=np.int32)

    max_steps = self._get_max_steps()

    # Learning
    if group_lasso:
        # a single joint problem over all vectors (vector index -1)
        vinit = self.violation_init_.get(0, 0) * self.C / self.C_init
        outcome = _primal_cd(self, self.coef_, self.errors_,
                             dataset, y, Y, -1, self.multiclass,
                             indices, 12, self._get_loss(),
                             self.selection, self.permute,
                             self.termination, self.C, self.alpha,
                             self.max_iter, max_steps,
                             self.shrinking, vinit, random_state,
                             self.tol, self.callback, self.n_calls,
                             self.verbose)
        viol = outcome[0]
        if self.warm_start and not self.violation_init_:
            self.violation_init_[0] = viol

    elif self.penalty in ("l1", "l2", "nn"):
        penalty = self._get_penalty()

        # per-vector positive counts and rescaled initial violations
        n_pos = np.zeros(n_vectors)
        vinit = self.C / self.C_init * np.ones_like(n_pos)
        for k in xrange(n_vectors):
            n_pos[k] = np.sum(Y[:, k] == 1)
            vinit[k] *= self.violation_init_.get(k, 0)

        # tolerance scaled by the size of the smaller class
        minority = np.minimum(n_pos, n_samples - n_pos)
        tol = self.tol * np.maximum(minority, 1) / n_samples

        tasks = (delayed(_primal_cd)(self, self.coef_, self.errors_,
                                     dataset, y, Y, k, False,
                                     indices, penalty, self._get_loss(),
                                     self.selection, self.permute,
                                     self.termination, self.C, self.alpha,
                                     self.max_iter, max_steps,
                                     self.shrinking, vinit[k], random_state,
                                     tol[k], self.callback, self.n_calls,
                                     self.verbose)
                 for k in xrange(n_vectors))
        pool = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)
        viol, coefs, errors = zip(*pool(tasks))
        self.coef_ = np.asarray(coefs)
        self.errors_ = np.asarray(errors)

        # remember the first observed violation of each vector
        if self.warm_start:
            for k in range(n_vectors):
                if k not in self.violation_init_:
                    self.violation_init_[k] = viol[k]

    if self.debiasing:
        # refit only the coefficients that ended up non-zero
        nz = self.coef_ != 0

        if not self.warm_debiasing:
            self.coef_ = np.zeros((n_vectors, n_features),
                                  dtype=np.float64)
            self._init_errors(Y)

        indices = np.arange(n_features, dtype=np.int32)
        tasks = (delayed(_primal_cd)(self, self.coef_, self.errors_,
                                     dataset, y, Y, k, False,
                                     indices[nz[k]], 2, self._get_loss(),
                                     "cyclic", self.permute,
                                     "violation_sum", self.Cd, 1.0,
                                     self.max_iter, max_steps,
                                     False, 0, random_state, self.tol,
                                     self.callback, self.n_calls,
                                     self.verbose)
                 for k in xrange(n_vectors))
        pool = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)
        _viol, coefs, errors = zip(*pool(tasks))
        self.coef_ = np.asarray(coefs)
        self.errors_ = np.asarray(errors)

    return self
def plot_partial_dependence(gbrt, X, features, feature_names=None,
                            label=None, n_cols=3, grid_resolution=100,
                            percentiles=(0.05, 0.95), n_jobs=1,
                            verbose=0, ax=None, line_kw=None,
                            contour_kw=None, **fig_kw):
    """Partial dependence plots for ``features``.

    The ``len(features)`` plots are arranged in a grid with ``n_cols``
    columns. Two-way partial dependence plots are plotted as contour
    plots.

    Read more in the :ref:`User Guide <partial_dependence>`.

    Parameters
    ----------
    gbrt : BaseGradientBoosting
        A fitted gradient boosting model.
    X : array-like, shape=(n_samples, n_features)
        The data on which ``gbrt`` was trained.
    features : seq of tuples or ints
        If seq[i] is an int or a tuple with one int value, a one-way
        PDP is created; if seq[i] is a tuple of two ints, a two-way
        PDP is created. Strings are also accepted and are looked up in
        ``feature_names``.
    feature_names : seq of str
        Name of each feature; feature_names[i] holds
        the name of the feature with index i. Defaults to the string
        form of the feature indices.
    label : object
        The class label for which the PDPs should be computed.
        Only if gbrt is a multi-class model. Must be in ``gbrt.classes_``.
    n_cols : int
        The number of columns in the grid plot (default: 3).
    percentiles : (low, high), default=(0.05, 0.95)
        The lower and upper percentile used to create the extreme values
        for the PDP axes.
    grid_resolution : int, default=100
        The number of equally spaced points on the axes.
    n_jobs : int
        The number of CPUs to use to compute the PDs. -1 means 'all CPUs'.
        Defaults to 1.
    verbose : int
        Verbose output during PD computations. Defaults to 0.
    ax : Matplotlib axis object, default None
        An axis object whose figure is cleared and reused for the plots.
    line_kw : dict
        Dict with keywords passed to the ``ax.plot`` call.
        For one-way partial dependence plots. Defaults to
        ``{'color': 'green'}``.
    contour_kw : dict
        Dict with keywords passed to the ``ax.contourf`` call.
        For two-way partial dependence plots.
    fig_kw : dict
        Dict with keywords passed to the figure() call.
        Note that all keywords not recognized above will be automatically
        included here.

    Returns
    -------
    fig : figure
        The Matplotlib Figure object.
    axs : seq of Axis objects
        A seq of Axis objects, one for each subplot.

    Raises
    ------
    ValueError
        If ``gbrt`` has no estimators, ``label`` is missing or unknown for
        a multi-class model, ``X`` has the wrong number of columns, or an
        entry of ``features`` is malformed or out of range.

    Examples
    --------
    >>> from sklearn.datasets import make_friedman1
    >>> from sklearn.ensemble import GradientBoostingRegressor
    >>> X, y = make_friedman1()
    >>> clf = GradientBoostingRegressor(n_estimators=10).fit(X, y)
    >>> fig, axs = plot_partial_dependence(clf, X, [0, (0, 1)]) #doctest: +SKIP
    ...
    """
    # matplotlib is imported lazily, only when plotting is requested
    import matplotlib.pyplot as plt
    from matplotlib import transforms
    from matplotlib.ticker import MaxNLocator
    from matplotlib.ticker import ScalarFormatter

    # if not isinstance(gbrt, BaseGradientBoosting):
    #     raise ValueError('gbrt has to be an instance of BaseGradientBoosting')
    if gbrt.estimators_.shape[0] == 0:
        raise ValueError('Call %s.fit before partial_dependence' %
                         gbrt.__class__.__name__)

    # set label_idx for multi-class GBRT
    if hasattr(gbrt, 'classes_') and np.size(gbrt.classes_) > 2:
        if label is None:
            raise ValueError('label is not given for multi-class PDP')
        label_idx = np.searchsorted(gbrt.classes_, label)
        if gbrt.classes_[label_idx] != label:
            raise ValueError('label %s not in ``gbrt.classes_``' % str(label))
    else:
        # regression and binary classification
        label_idx = 0

    X = check_array(X, dtype=DTYPE, order='C')
    if gbrt.n_features != X.shape[1]:
        raise ValueError('X.shape[1] does not match gbrt.n_features')

    if line_kw is None:
        line_kw = {'color': 'green'}
    if contour_kw is None:
        contour_kw = {}

    # convert feature_names to list
    if feature_names is None:
        # if not feature_names use fx indices as name
        feature_names = [str(i) for i in range(gbrt.n_features)]
    elif isinstance(feature_names, np.ndarray):
        feature_names = feature_names.tolist()

    def convert_feature(fx):
        # Map a feature given by name to its index; indices pass through.
        if isinstance(fx, six.string_types):
            try:
                fx = feature_names.index(fx)
            except ValueError:
                raise ValueError('Feature %s not in feature_names' % fx)
        return fx

    # convert features into a seq of int tuples
    tmp_features = []
    for fxs in features:
        # a bare int or string is treated as a one-element tuple
        if isinstance(fxs, (numbers.Integral, ) + six.string_types):
            fxs = (fxs, )
        try:
            fxs = np.array([convert_feature(fx) for fx in fxs], dtype=np.int32)
        except TypeError:
            raise ValueError('features must be either int, str, or tuple '
                             'of int/str')
        if not (1 <= np.size(fxs) <= 2):
            raise ValueError('target features must be either one or two')

        tmp_features.append(fxs)

    features = tmp_features

    # look up the display name(s) of every requested feature
    names = []
    try:
        for fxs in features:
            l = []
            # explicit loop so "i" is bound for exception below
            for i in fxs:
                l.append(feature_names[i])
            names.append(l)
    except IndexError:
        raise ValueError('features[i] must be in [0, n_features) '
                         'but was %d' % i)

    # compute PD functions
    pd_result = Parallel(n_jobs=n_jobs, verbose=verbose)(delayed(
        partial_dependence
    )(gbrt, fxs, X=X, grid_resolution=grid_resolution, percentiles=percentiles)
        for fxs in features)

    # get global min and max values of PD grouped by plot type
    # (keyed by the number of axes: 1 for one-way, 2 for two-way)
    pdp_lim = {}
    for pdp, axes in pd_result:
        min_pd, max_pd = pdp[label_idx].min(), pdp[label_idx].max()
        n_fx = len(axes)
        old_min_pd, old_max_pd = pdp_lim.get(n_fx, (min_pd, max_pd))
        min_pd = min(min_pd, old_min_pd)
        max_pd = max(max_pd, old_max_pd)
        pdp_lim[n_fx] = (min_pd, max_pd)

    # create contour levels for two-way plots
    if 2 in pdp_lim:
        Z_level = np.linspace(*pdp_lim[2], num=8)

    if ax is None:
        fig = plt.figure(**fig_kw)
    else:
        # reuse the figure of the given axis, discarding its content
        fig = ax.get_figure()
        fig.clear()

    n_cols = min(n_cols, len(features))
    n_rows = int(np.ceil(len(features) / float(n_cols)))
    axs = []
    for i, fx, name, (pdp, axes) in zip(count(), features, names, pd_result):
        ax = fig.add_subplot(n_rows, n_cols, i + 1)

        if len(axes) == 1:
            ax.plot(axes[0], pdp[label_idx].ravel(), **line_kw)
        else:
            # make contour plot
            assert len(axes) == 2
            XX, YY = np.meshgrid(axes[0], axes[1])
            Z = pdp[label_idx].reshape(list(map(np.size, axes))).T
            CS = ax.contour(XX, YY, Z, levels=Z_level, linewidths=0.5,
                            colors='k')
            ax.contourf(XX, YY, Z, levels=Z_level, vmax=Z_level[-1],
                        vmin=Z_level[0], alpha=0.75, **contour_kw)
            ax.clabel(CS, fmt='%2.2f', colors='k', fontsize=10, inline=True)

        # plot data deciles + axes labels
        deciles = mquantiles(X[:, fx[0]], prob=np.arange(0.1, 1.0, 0.1))
        trans = transforms.blended_transform_factory(ax.transData,
                                                     ax.transAxes)
        # remember the y-limits so they can be restored after vlines
        ylim = ax.get_ylim()
        ax.vlines(deciles, [0], 0.05, transform=trans, color='k')
        ax.set_xlabel(name[0])
        ax.set_ylim(ylim)

        # prevent x-axis ticks from overlapping
        ax.xaxis.set_major_locator(MaxNLocator(nbins=6, prune='lower'))
        tick_formatter = ScalarFormatter()
        tick_formatter.set_powerlimits((-3, 4))
        ax.xaxis.set_major_formatter(tick_formatter)

        if len(axes) > 1:
            # two-way PDP - y-axis deciles + labels
            deciles = mquantiles(X[:, fx[1]], prob=np.arange(0.1, 1.0, 0.1))
            trans = transforms.blended_transform_factory(ax.transAxes,
                                                         ax.transData)
            xlim = ax.get_xlim()
            ax.hlines(deciles, [0], 0.05, transform=trans, color='k')
            ax.set_ylabel(name[1])
            # hline erases xlim
            ax.set_xlim(xlim)
        else:
            ax.set_ylabel('Partial dependence')

        # all one-way plots share the same y-range
        if len(axes) == 1:
            ax.set_ylim(pdp_lim[1])
        axs.append(ax)

    fig.subplots_adjust(bottom=0.15, top=0.7, left=0.1, right=0.95, wspace=0.4,
                        hspace=0.3)
    return fig, axs
def fit(self, X, y):
    """Fit model according to X and y.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.

    y : array-like, shape = [n_samples] or [n_samples, n_targets]
        Target values. Must expose ``shape`` and ``reshape``.

    Returns
    -------
    self : regressor
        Returns self.
    """
    rs = self._get_random_state()

    # Create dataset
    ds = get_dataset(X, order="fortran")
    n_features = ds.get_n_features()

    # One-dimensional targets are handled as a single output column.
    self.outputs_2d_ = len(y.shape) == 2
    Y = y if self.outputs_2d_ else y.reshape(-1, 1)
    Y = np.asfortranarray(Y, dtype=np.float64)
    # Empty placeholder for the ``y`` argument of _primal_cd; the targets
    # are passed through ``Y``.
    y = np.empty(0, dtype=np.int32)
    n_vectors = Y.shape[1]

    # Initialize coefficients
    if not self.warm_start or self.coef_ is None:
        self.C_init = self.C
        self.coef_ = np.zeros((n_vectors, n_features), dtype=np.float64)
        self._init_errors(Y)

    self.intercept_ = np.zeros(n_vectors, dtype=np.float64)
    indices = np.arange(n_features, dtype=np.int32)

    if self.penalty == "l1/l2":
        # a single joint problem over all outputs (vector index -1)
        vinit = self.violation_init_.get(0, 0) * self.C / self.C_init
        model = _primal_cd(self, self.coef_, self.errors_,
                           ds, y, Y, -1, False,
                           indices, 12, self._get_loss(),
                           self.selection, self.permute,
                           self.termination, self.C, self.alpha,
                           self.max_iter, self.max_steps,
                           self.shrinking, vinit, rs, self.tol,
                           self.callback, self.n_calls, self.verbose)
        viol = model[0]
        if self.warm_start and len(self.violation_init_) == 0:
            self.violation_init_[0] = viol
    else:
        penalty = self._get_penalty()
        vinit = np.asarray(
            [self.violation_init_.get(k, 0)
             for k in xrange(n_vectors)]) * self.C / self.C_init

        # one independent problem per output column
        jobs = (delayed(_primal_cd)(self, self.coef_, self.errors_,
                                    ds, y, Y, k, False,
                                    indices, penalty, self._get_loss(),
                                    self.selection, self.permute,
                                    self.termination, self.C, self.alpha,
                                    self.max_iter, self.max_steps,
                                    self.shrinking, vinit[k], rs, self.tol,
                                    self.callback, self.n_calls,
                                    self.verbose)
                for k in xrange(n_vectors))
        model = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(jobs)
        viol, coefs, errors = zip(*model)
        self.coef_ = np.asarray(coefs)
        # Store under ``errors_``, the attribute that is fed back into
        # _primal_cd above; the previous code wrote ``error_`` only.
        self.errors_ = np.asarray(errors)
        # Kept as an alias for callers that read the old attribute name.
        self.error_ = self.errors_

        # Record the first observed violation of every output. The old
        # code used a stale/undefined ``k`` and stored the whole tuple.
        if self.warm_start:
            for k in xrange(n_vectors):
                if k not in self.violation_init_:
                    self.violation_init_[k] = viol[k]

    return self
def transform_imgs(self, imgs_list, confounds=None, copy=True, n_jobs=1):
    """Prepare multi subject data in parallel

    Parameters
    ----------
    imgs_list: list of Niimg-like objects
        See http://nilearn.github.io/manipulating_visualizing/manipulating_images.html#niimg.
        List of imgs file to prepare. One item per subject.

    confounds: list of confounds, optional
        List of confounds (2D arrays or filenames pointing to CSV
        files). Must be of same length than imgs_list.

    copy: boolean, optional
        If True, guarantees that output array has no memory in common with
        input array.

    n_jobs: integer, optional
        The number of cpus to use to do the computation. -1 means
        'all cpus'.

    Returns
    -------
    region_signals: list of 2D numpy.ndarray
        List of signal for each element per subject.
        shape: list of (number of scans, number of elements)

    Raises
    ------
    ValueError
        If the object has no ``mask_img_`` attribute, i.e. was not fitted.
    """
    fitted = hasattr(self, 'mask_img_')
    if not fitted:
        message = ('It seems that %s has not been fitted. '
                   'You must call fit() before calling transform().')
        raise ValueError(message % self.__class__.__name__)

    # Without a target affine, force resampling on the first image.
    target_fov = 'first' if self.target_affine is None else None

    checked_imgs = _iter_check_niimg(
        imgs_list,
        ensure_ndim=None,
        atleast_4d=False,
        target_fov=target_fov,
        memory=self.memory,
        memory_level=self.memory_level,
        verbose=self.verbose)

    # No confounds given: pair every image with None.
    if confounds is None:
        confounds = itertools.repeat(None, len(imgs_list))

    # Ignore the mask-computing params: they are not useful and will
    # just invalidate the cache for no good reason
    # target_shape and target_affine are conveyed implicitly in mask_img
    ignored_params = ['mask_img', 'mask_args', 'mask_strategy', 'copy']
    masker_params = get_params(self.__class__, self, ignore=ignored_params)

    cached_filter = self._cache(
        filter_and_mask,
        ignore=['verbose', 'memory', 'memory_level', 'copy'])

    jobs = (
        delayed(cached_filter)(
            subject_imgs, self.mask_img_, masker_params,
            memory_level=self.memory_level,
            memory=self.memory,
            verbose=self.verbose,
            confounds=subject_confounds,
            copy=copy)
        for subject_imgs, subject_confounds in izip(checked_imgs, confounds))
    outputs = Parallel(n_jobs=n_jobs)(jobs)

    # Only the first element of each result is returned.
    return [output[0] for output in outputs]
def applyAugmentation(self):
    """Load the images and annotations, then augment them in parallel.

    ``readAndGenerateImageSegmentation`` is dispatched once for every
    ``(index, path)`` pair produced by ``enumerate(self.imagePaths)``,
    together with the output path, the generators and the labels
    extension, on a joblib pool created with ``n_jobs=-1``.
    """
    self.readImagesAndAnnotations()

    tasks = (
        delayed(readAndGenerateImageSegmentation)(
            self.outputPath,
            self.generators,
            self.labelsExtension,
            indexed_path)
        for indexed_path in enumerate(self.imagePaths))
    Parallel(n_jobs=-1)(tasks)
def fit(self, X, y):
    """Fit estimators from the training set (X, y).

    One clone of ``base_estimator`` is fitted per key of ``X``. When
    ``vote_graded`` is set, the vote weights are first estimated from
    leave-one-out predictions; otherwise every key gets the same weight.

    Parameters
    ----------
    X : dict
        Maps each ROI id to its data. For a ``'searchlight_ensemble'``
        base estimator each value is indexed as ``value[0]`` (samples)
        and ``value[1]`` (passed as ``process_mask_img``).
    y : sequence
        Target values.

    Returns
    -------
    self : object
        Returns self.

    Raises
    ------
    ValueError
        If ``X`` is not a dict.
    """
    if not isinstance(X, dict):
        raise ValueError("X has to be a dict")

    if self.base_estimator._estimator_type == 'classifier':
        self.classes_ = np.unique(y)

    self.set_random_state()

    is_searchlight = (
        self.base_estimator._estimator_type == 'searchlight_ensemble')

    def roi_samples(roi_id):
        # searchlight entries keep the samples in their first element
        entry = X[roi_id]
        return entry[0] if is_searchlight else entry

    def run_threaded(jobs):
        # every parallel section uses the same threading pool settings
        return Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                        backend="threading")(jobs)

    # One un-fitted clone per ROI, tagged with its id.
    estimators = {}
    for roi_id, roi_entry in X.items():
        member = clone(self.base_estimator)
        member.roi_id = roi_id
        if is_searchlight:
            member.set_params(process_mask_img=roi_entry[1])
        estimators[roi_id] = member

    if not self.vote_graded:
        self.vote_weighting = np.ones(len(X.keys())) / len(X.keys())
    else:
        # leave-one-out predictions per ROI, NaN until filled in
        y_pred = {roi_id: np.full(len(y), np.nan) for roi_id in X.keys()}

        for train_index, test_index in LeaveOneOut():
            y_train = [y[i] for i in train_index]

            fold_fitted = run_threaded(
                delayed(_parallel_build_estimator)(
                    member,
                    [roi_samples(roi_id)[i] for i in train_index],
                    y_train)
                for roi_id, member in estimators.items())
            fold_fitted = {member.roi_id: member for member in fold_fitted}

            fold_votes = run_threaded(
                delayed(_vote)(
                    member,
                    [roi_samples(roi_id)[i] for i in test_index],
                    False)
                for roi_id, member in fold_fitted.items())

            for position, roi_id in enumerate(X.keys()):
                y_pred[roi_id][test_index] = fold_votes[position]

        # weight of an ROI = fraction of held-out samples it got right
        self.vote_weighting = [
            np.mean(votes == np.array(y)) for votes in y_pred.values()
        ]
        # all-zero weights are replaced by tiny uniform ones
        if not np.any(self.vote_weighting):
            self.vote_weighting = 1e-10 * np.ones(len(self.vote_weighting))

    # Final fit of every ROI estimator on the full data.
    fitted = run_threaded(
        delayed(_parallel_build_estimator)(member, roi_samples(roi_id), y)
        for roi_id, member in estimators.items())
    self.estimators_ = {member.roi_id: member for member in fitted}

    return self
exp_prefix = experiment_name + '/lambda=' + str( rho) + '__' + 'beta=' + str(beta) + '__' + 'metric=' + str(metric) print("-" * 30, exp_prefix) os.makedirs('./%s/%s/%s' % (res_folder, data_prefix, exp_prefix), exist_ok=True) lml = LaundryML(data_prefix, test_prefix, exp_prefix, res_folder, k, opt, rho, beta, metric, maj_pos, min_pos, sensitve_attr, non_sensitve_attr, decision_attr) lml.run() #create_rules(data_prefix=params['data_prefix'], original_dataset_path=params['original_dataset_path']) #time.sleep(35) # searching for rationalization models for _metric in params['metrics']: for _lambdak in params['lambdas']: Parallel(n_jobs=-1)( delayed(bench)(beta=_beta, rho=_lambdak, metric=_metric) for _beta in params['betas']) # plotting results for _metric in params['metrics']: for _lambdak in params['lambdas']: for _beta in params['betas']: exp_prefix = params['experiment_name'] + '/lambda=' + str( _lambdak) + '__' + 'beta=' + str( _beta) + '__' + 'metric=' + str(_metric) print("=" * 30, exp_prefix) enumplot(params['res_folder'], params['data_prefix'], exp_prefix) #get_audit(data_prefix, exp_prefix)