def __init__(self, X, y, n_classes, batch_size):
    check_arrays(X, dtype=np.float32)
    check_arrays(y, dtype=None)
    self.X = X
    self.y = y
    self.n_classes = n_classes
    self.batch_size = batch_size
def benchmark(clf, X, y, cv=None):
    X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True)
    cv = check_cv(cv, X, y, classifier=is_classifier(clf))
    # learning_curve_ = learning_curve(clf, X_all, y_all, cv=cv)

    train_times = []
    test_times = []
    confusion_matrices = []
    confusion_matrix_indices = []
    coefs = []

    for train, test in cv:
        X_train, y_train = X[train], y[train]
        X_test, y_test = X[test], y[test]

        t0 = time()
        clf.fit(X_train, y_train)
        train_times.append(time() - t0)

        t0 = time()
        y_pred = clf.predict(X_test)
        test_times.append(time() - t0)

        confusion_matrices.append(confusion_matrix(y_test, y_pred))
        confusion_matrix_indices.append(np.array([
            [test[pred] for pred in true]
            for true in confusion_matrix_instances(y_test, y_pred)]))
        coefs.append(clf.coef_)

    return dict(
        train_times=np.array(train_times),
        test_times=np.array(test_times),
        confusion_matrices=np.array(confusion_matrices),
        confusion_matrix_indices=np.array(confusion_matrix_indices),
        coefs=np.array(coefs),
    )
def fit(self, X, y=None):
    """Fit the model to the data X.

    Parameters
    ----------
    X : {array-like, sparse matrix} shape (n_samples, n_features)
        Training data.

    Returns
    -------
    self : BernoulliRBM
        The fitted model.
    """
    X, = check_arrays(X, sparse_format='csr', dtype=np.float)
    n_samples = X.shape[0]
    rng = check_random_state(self.random_state)

    self.components_ = np.asarray(
        rng.normal(0, 0.01, (self.n_components, X.shape[1])),
        order='fortran')
    self.intercept_hidden_ = np.zeros(self.n_components, )
    self.intercept_visible_ = np.zeros(X.shape[1], )
    self.h_samples_ = np.zeros((self.batch_size, self.n_components))

    n_batches = int(np.ceil(float(n_samples) / self.batch_size))
    batch_slices = list(gen_even_slices(n_batches * self.batch_size,
                                        n_batches, n_samples))

    for iteration in xrange(1, self.n_iter + 1):
        for batch_slice in batch_slices:
            self._fit(X[batch_slice], rng)

    return self
def predict(self, X):
    """Predict regression target for X.

    The predicted regression target of an input sample is computed as the
    mean predicted regression targets of the estimators in the ensemble.

    Parameters
    ----------
    X : array-like of shape = [n_samples, n_features]
        The input samples.

    Returns
    -------
    y : array of shape = [n_samples]
        The predicted values.
    """
    # Check data
    X, = check_arrays(X)

    # Parallel loop
    n_jobs, n_estimators, starts = _partition_estimators(self)

    all_y_hat = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
        delayed(_parallel_predict_regression)(
            self.estimators_[starts[i]:starts[i + 1]],
            self.estimators_features_[starts[i]:starts[i + 1]],
            X)
        for i in range(n_jobs))

    # Reduce
    y_hat = sum(all_y_hat) / self.n_estimators

    return y_hat
def plot_vs_cut(self, y_true, proba, sample_weight=None):
    """
    Compute metric for each possible prediction threshold.

    :param y_true: array-like true labels
    :param proba: array-like of shape [n_samples, 2] with predicted probabilities
    :param sample_weight: array-like weight
    :rtype: plotting.FunctionsPlot
    """
    from .. import plotting
    y_true, proba, sample_weight = check_arrays(y_true, proba, sample_weight)
    ordered_proba, metrics_val = self.compute(y_true, proba, sample_weight)
    ind = numpy.argmax(metrics_val)
    print('Optimal cut=%1.4f, quality=%1.4f' % (ordered_proba[ind], metrics_val[ind]))

    plot_fig = plotting.FunctionsPlot(
        {self.metric.__name__: (ordered_proba, metrics_val)})
    plot_fig.xlabel = 'cut'
    plot_fig.ylabel = 'metrics ' + self.metric.__name__
    return plot_fig
def predict_proba(self, X):
    """Predict class probabilities for X.

    The predicted class probabilities of an input sample are computed as
    the predicted class probabilities of the underlying estimator. If the
    base estimator does not implement a ``predict_proba`` method, a
    ``NotImplementedError`` is raised.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape = [n_samples, n_features]
        The training input samples. Sparse matrices are accepted only if
        they are supported by the base estimator.

    Returns
    -------
    proba : array of shape = [n_samples, n_classes]
        The class probabilities of the input samples. The order of the
        classes corresponds to that in the attribute `classes_`.
    """
    # Check data
    X, = check_arrays(X)

    if not hasattr(self.estimator_, "predict_proba"):
        raise NotImplementedError(
            'Underlying estimator of class %s has no attribute '
            '``predict_proba``.' % self.estimator_.__class__.__name__)

    return self.estimator_.predict_proba(X)
def loadMultiData(filenames, weights, weight_types, isMergeFeatures=False,
                  data_size=-1):
    print "Loading data from multiple files: weights=%s, types=%s" % (
        str(weights), str(weight_types))
    datas = []
    for index in range(0, len(filenames)):
        filename = filenames[index]
        weight_type = weight_types[index]
        data = loadData(filename, weight_type=weight_type, data_size=data_size)
        datas.append(data)

    # Combine multiple matrices.
    # Does not merge features; just concatenates all feature spaces.
    if isMergeFeatures == False:
        combined = datas[0] * weights[0]
        for i in range(1, len(datas)):
            data = datas[i]
            data = data * weights[i]
            combined = sp.hstack([combined, data])
        # Convert to a CSR sparse matrix.
        combined = check_arrays(
            combined, sparse_format="csr", copy=False, dtype=np.float64)[0]
        # Changed: normalization is done in each method.
        # combined_norms = normalize(combined, 'l2', axis=1, copy=False)
        # Squared euclidean norm of each data point.
        return combined  # .astype('f') would convert to float32 to save space
    else:
        # The merging-features version is not implemented yet.
        pass
def _transform(self, X):
    """Assumes X contains only categorical features."""
    X = check_arrays(X, sparse_format='csc', allow_nans=True)[0]
    n_samples, n_features = X.shape

    indices = self.feature_indices_
    if n_features != len(indices):
        raise ValueError("X has different shape than during fitting."
                         " Expected %d, got %d." % (len(indices), n_features))

    row_indices = np.tile(np.arange(n_samples, dtype=np.int32), n_features)
    data = []
    column_indices = []

    for idx, feature in enumerate(range(n_features)):
        offset = np.sum(self.n_values[:idx + 1])

        feature_indices_idx = self.feature_indices_[idx]
        column_indices_idx = [feature_indices_idx.get(x, offset)
                              for x in X[:, idx]]
        data_idx = [1 if feature_indices_idx.get(x) is not None else 0
                    for x in X[:, idx]]

        column_indices.extend(column_indices_idx)
        data.extend(data_idx)

    out = sparse.coo_matrix((data, (row_indices, column_indices)),
                            shape=(n_samples, np.sum(self.n_values)),
                            dtype=self.dtype).tocsr()
    return out if self.sparse else out.toarray()
def transform(self, X, y=None, copy=None):
    """Perform standardization by centering and scaling.

    Parameters
    ----------
    X : array-like with shape [n_samples, n_features]
        The data used to scale along the features axis.
    """
    copy = copy if copy is not None else self.copy
    X = check_arrays(X, copy=copy, sparse_format="csc")[0]
    if warn_if_not_float(X, estimator=self):
        X = X.astype(np.float)
    if sparse.issparse(X):
        if self.center_sparse:
            for i in range(X.shape[1]):
                X.data[X.indptr[i]:X.indptr[i + 1]] -= self.mean_[i]
        elif self.with_mean:
            raise ValueError(
                "Cannot center sparse matrices: pass `with_mean=False` "
                "instead. See docstring for motivation and alternatives.")
        else:
            pass

        if self.std_ is not None:
            inplace_column_scale(X, 1 / self.std_)
    else:
        if self.with_mean:
            X -= self.mean_
        if self.with_std:
            X /= self.std_
    return X
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = check_arrays(y_true, y_pred)

    ## Note: does not handle mix 1d representation
    #if _is_1d(y_true):
    #    y_true, y_pred = _check_1d_array(y_true, y_pred)

    return round(np.mean(np.abs((y_true - y_pred) / y_true)) * 100, 2)
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = check_arrays(y_true, y_pred)

    ## Note: does not handle mix 1d representation
    #if _is_1d(y_true):
    #    y_true, y_pred = _check_1d_array(y_true, y_pred)

    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
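# --- Hedged usage sketch (not part of the original sources) ---
# A minimal sanity check of the mean_absolute_percentage_error helpers above on
# toy data, assuming numpy is imported as np and the deprecated check_arrays
# utility they rely on is importable.
if __name__ == '__main__':
    y_true = np.array([100.0, 200.0, 300.0])
    y_pred = np.array([110.0, 190.0, 330.0])
    # per-sample relative errors: 0.10, 0.05, 0.10 -> mean 0.0833, i.e. ~8.33%
    print(mean_absolute_percentage_error(y_true, y_pred))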
def fit(self, X, y=None):
    """Don't trust the documentation of this module!

    Compute the mean and std to be used for later scaling.

    Parameters
    ----------
    X : array-like or CSR matrix with shape [n_samples, n_features]
        The data used to compute the mean and standard deviation
        used for later scaling along the features axis.
    """
    X = check_arrays(X, copy=self.copy, sparse_format="csc")[0]
    if warn_if_not_float(X, estimator=self):
        X = X.astype(np.float)
    if sparse.issparse(X):
        if self.center_sparse:
            means = []
            vars = []

            # This only works for csc matrices...
            for i in range(X.shape[1]):
                if X.indptr[i] == X.indptr[i + 1]:
                    means.append(0)
                    vars.append(1)
                else:
                    vars.append(X.data[X.indptr[i]:X.indptr[i + 1]].var())
                    # If the variance is 0, all occurrences of this
                    # feature are set to 1.
                    means.append(X.data[X.indptr[i]:X.indptr[i + 1]].mean())
                    if 0.0000001 >= vars[-1] >= -0.0000001:
                        means[-1] -= 1

            self.std_ = np.sqrt(np.array(vars))
            self.std_[np.array(vars) == 0.0] = 1.0
            self.mean_ = np.array(means)

            return self
        elif self.with_mean:
            raise ValueError(
                "Cannot center sparse matrices: pass `with_mean=False` "
                "instead. See docstring for motivation and alternatives.")
        else:
            self.mean_ = None

        if self.with_std:
            var = mean_variance_axis0(X)[1]
            self.std_ = np.sqrt(var)
            self.std_[var == 0.0] = 1.0
        else:
            self.std_ = None
        return self
    else:
        self.mean_, self.std_ = _mean_and_std(
            X, axis=0, with_mean=self.with_mean, with_std=self.with_std)
        return self
def loadData(filename, weight_type="count", feature_perc=1.0, data_size=-1,
             items_subset=set()):
    """
    filename: the name of the data file
    weight_type: weight type of tokens: "count", "tf-idf", "tf"
    feature_perc: percentage of features to select according to their weighted
        values (count: #documents containing the feature; tfidf: tf-idf value)
    data_size: #data points to select; default is -1, which selects all data
    items_subset: only read lines in items_subset, which is a set of line numbers
    """
    weight_type = weight_type.lower()
    print "Loading data from %s, weight_type = %s" % (filename, weight_type)

    raw_data = []
    lines = file(filename, "r").readlines()
    i = 0
    for line in lines:
        if len(items_subset) == 0 or (i in items_subset):
            line = line.replace('\n', '')
            arr = ast.literal_eval(line.split("|")[2])  # converted to list
            raw_data.append(arr)
        if (data_size > 0 and i >= data_size):
            break
        i = i + 1

    # type of data is <class 'scipy.sparse.coo.coo_matrix'>
    data, feature_names, features_score = fit_transform(
        raw_data, weight_type=weight_type)
    #write_features("%s_features" % filename, feature_names)

    # Feature selection.
    # Note: after selection, the vocabulary of feature names changes! Not modified yet!
    if feature_perc < 1:
        n_features = data.shape[1]
        upbound = n_features * feature_perc
        dict_feat_score = {}
        for i in range(0, len(features_score)):
            dict_feat_score[i] = features_score[i]
        # Sort the dict by its values: descending (reverse=True), ascending (reverse=False).
        sorted_list = sorted(dict_feat_score.items(), key=lambda d: d[1],
                             reverse=True)
        selected_features = []
        i = 0
        for key, value in sorted_list:
            selected_features.append(key)
            i = i + 1
            if i > upbound:
                break
        data = data.tocsc()
        data = data[:, selected_features]
        print "\t Selected top %d(%f) from %d features:" % (
            len(selected_features), feature_perc, n_features)
        # print vectorizer.get_features_tfidf()
        # print vectorizer.vocabulary_

    # Convert to a CSR sparse matrix.
    data = check_arrays(
        data, sparse_format="csr", copy=False, dtype=np.float64)[0]
    # Changed: normalization is done in each method.
    # data_norms = normalize(data, 'l2', axis=1, copy=False)
    # Squared euclidean norm (l2-norm) of each data point.
    return data  # .astype('f') would convert to float32 to save space
def fit(self, X, y):
    """Fit MLP Classifier according to X, y.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.

    y : array-like, shape = [n_samples] or [n_samples, n_classes]
        Target values. It determines the problem type.

        *binary*
        If y is a vector of integers with two unique values.

        *multiclass*
        If y is a vector of integers with three or more values, or if y is a
        two-dimensional array of integers and there exists only one non-zero
        element per row.

        *multiclass-multioutput*
        If y is a two-dimensional array of integers with two unique values
        and there exists more than one non-zero element per row.

        *continuous*
        If y is a vector of floats.

        *continuous-multioutput*
        If y is a two-dimensional array of floats.

    Returns
    -------
    self : object
        Returns self.
    """
    X, = check_arrays(X, sparse_format='dense')
    n_samples, self.input_size_ = X.shape
    y = np.atleast_1d(y)

    self.type_of_target_ = type_of_target(y)
    if self.verbose > 0:
        print("The inferred type of y is %s" % self.type_of_target_)
    if self.type_of_y is not None:
        if self.type_of_y != self.type_of_target_:
            raise ValueError("Passed type of y is %s, inferred type is %s"
                             % (self.type_of_y, self.type_of_target_))

    self.check_type_implemented()
    y = self._get_output(y)
    X, y = self._scale(X, y)
    self._inst_mlp()
    self._fit_mlp(X, y)
    if self.dropout and self.type_of_target_ in ['continuous',
                                                 'continuous-multioutput']:
        self._lineregress(X, y)
    return self
def fit(self, X, y, store_covariances=False, tol=1.0e-4):
    """
    Fit the QDA model according to the given training data and parameters.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array, shape = [n_samples]
        Target values (integers)

    store_covariances : boolean
        If True the covariance matrices are computed and stored in the
        `self.covariances_` attribute.
    """
    X, y = check_arrays(X, y)
    self.classes_, y = unique(y, return_inverse=True)
    n_samples, n_features = X.shape
    n_classes = len(self.classes_)
    if n_classes < 2:
        raise ValueError('y has less than 2 classes')
    if self.priors is None:
        self.priors_ = np.bincount(y) / float(n_samples)
    else:
        self.priors_ = self.priors

    cov = None
    if store_covariances:
        cov = []
    means = []
    scalings = []
    rotations = []
    for ind in xrange(n_classes):
        Xg = X[y == ind, :]
        meang = Xg.mean(0)
        means.append(meang)
        Xgc = Xg - meang
        # Xgc = U * S * V.T
        U, S, Vt = np.linalg.svd(Xgc, full_matrices=False)
        rank = np.sum(S > tol)
        if rank < n_features:
            warnings.warn("Variables are collinear")
        S2 = (S ** 2) / (len(Xg) - 1)
        if store_covariances:
            # cov = V * (S^2 / (n-1)) * V.T
            cov.append(np.dot(S2 * Vt.T, Vt))
        scalings.append(S2)
        rotations.append(Vt.T)
    if store_covariances:
        self.covariances_ = cov
    self.means_ = np.asarray(means)
    self.scalings_ = np.asarray(scalings)
    self.rotations_ = rotations
    return self
def fit_all(self, X, y, n_shop, last_obs_plan):
    # if not warm_start - clear the estimator state
    if not self.warm_start:
        self._clear_state()

    # Check input
    X, = check_arrays(X, dtype=DTYPE, sparse_format="dense")
    y = column_or_1d(y, warn=True)
    n_samples, n_features = X.shape
    self.n_features = n_features
    random_state = check_random_state(self.random_state)
    self._check_params()

    if not self._is_initialized():
        if self.verbose:
            print 'Initializing gradient boosting...'
        # init state
        self._init_state()

        # fit initial model
        if not self.fix_history:
            idx = get_truncated_shopping_indices(n_shop)
        else:
            idx = np.arange(len(n_shop))

        # init predictions by averaging over the shopping histories
        y_pred = self.init_.predict(last_obs_plan[idx])
        print 'First training accuracy:', accuracy_score(y, y_pred.argmax(axis=1))
        begin_at_stage = 0
    else:
        # add more estimators to fitted model
        # invariant: warm_start = True
        if self.n_estimators < self.estimators_.shape[0]:
            raise ValueError('n_estimators=%d must be larger or equal to '
                             'estimators_.shape[0]=%d when '
                             'warm_start==True'
                             % (self.n_estimators, self.estimators_.shape[0]))
        begin_at_stage = self.estimators_.shape[0]
        y_pred = self.decision_function(X)
        self._resize_state()

    # fit the boosting stages
    n_stages = self._fit_stages(X, y, y_pred, random_state, begin_at_stage,
                                n_shop)
    # change shape of arrays after fit (early-stopping or additional tests)
    if n_stages != self.estimators_.shape[0]:
        self.estimators_ = self.estimators_[:n_stages]
        self.train_score_ = self.train_score_[:n_stages]
        if hasattr(self, 'oob_improvement_'):
            self.oob_improvement_ = self.oob_improvement_[:n_stages]
        if hasattr(self, '_oob_score_'):
            self._oob_score_ = self._oob_score_[:n_stages]

    return self
def inverse_transform(self, X):
    """Undo the scaling of X according to feature_range.

    Parameters
    ----------
    X : array-like with shape [n_samples, n_features]
        Input data that will be transformed.
    """
    X = check_arrays(X, sparse_format="dense", copy=self.copy)[0]
    X -= self.min_
    X /= self.scale_
    return X
def fit(self, X, y):
    self._validate()
    X, y = check_arrays(X, y, sparse_format="csc")
    n_samples, n_features = X.shape

    # discretize continuous features
    if np.issubdtype(X.dtype, float):
        X_new = X.astype(np.int)
        if np.any(X_new != X):
            raise ValueError('X could not safely be converted to integers.'
                             ' MRMR does not support continuous values.')
        X = X_new
    if np.issubdtype(y.dtype, float):
        y_new = y.astype(np.int)
        if np.any(y_new != y):
            raise ValueError('y could not safely be converted to integers.'
                             ' MRMR does not support continuous values.')
        y = y_new

    if self.k is None:
        k = n_features // 2
    else:
        k = self.k

    X_classes = np.array(list(set(X.reshape((n_samples * n_features,)))))
    y_classes = np.array(list(set(y.reshape((n_samples,)))))

    if len(X_classes) > self.warn_limit:
        print('Warning: X contains {} discrete values. MRMR may'
              ' run slow'.format(len(X_classes)))
    if len(y_classes) > self.warn_limit:
        print('Warning: y contains {} discrete values. MRMR may'
              ' run slow'.format(len(y_classes)))

    method = self.methods[self.method]
    idxs, _ = _mrmr(n_samples, n_features, y.astype(np.long),
                    X.astype(np.long), y_classes.astype(np.long),
                    X_classes.astype(np.long), y_classes.shape[0],
                    X_classes.shape[0], k, method, self.normalize)

    support_ = np.zeros(n_features, dtype=np.bool)
    ranking_ = np.ones(n_features, dtype=np.int) + k
    support_[idxs] = True
    for i, idx in enumerate(idxs, start=1):
        ranking_[idx] = i

    self.n_features_ = support_.sum()
    self.support_ = support_
    self.ranking_ = ranking_
    self.selected_ = np.argsort(self.ranking_)[:self.n_features_]

    return self
def fit(self, X):
    """Fit the model to the data X.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data.

    Returns
    -------
    self : convolutionRBM
        The fitted model.
    """
    X, = check_arrays(X, sparse_format='csr', dtype=np.float)
    n_samples = X.shape[0]
    rng = check_random_state(self.random_state)

    self.components_ = np.asarray(
        rng.normal(0, 0.001,
                   (self.n_groups, self.window_size * self.window_size)),
        order='fortran')
    self.intercept_hidden_ = np.zeros(self.n_groups)
    self.intercept_visible_ = 0
    self.h_samples_ = np.zeros((self.batch_size, self.n_components))

    n_batches = int(np.ceil(float(n_samples) / self.batch_size))
    batch_slices = list(self.gen_even_slices(n_batches * self.batch_size,
                                             n_batches, n_samples))
    verbose = self.verbose
    begin = time.time()  # needed below when verbose is set

    for iteration in xrange(1, self.n_iter + 1):
        reconstructError = 0
        for batch_slice in batch_slices:
            if not self.use_theano:
                reconstructError += self._fit(X[batch_slice], rng)
            else:
                reconstructError += self._fit_theano(X[batch_slice], rng)
        print "step:", iteration, "reconstruct Error: ", reconstructError

        if verbose:
            end = time.time()
            print("[%s] Iteration %d, pseudo-likelihood = %.2f,"
                  " time = %.2fs"
                  % (type(self).__name__, iteration,
                     self.score_samples(X).mean(), end - begin))
            begin = end

    return self
def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None):
    y_true, y_pred = check_arrays(y_true, y_pred)
    assert(beta > 0)
    if labels is None:
        labels = unique_labels(y_true, y_pred)
    else:
        labels = np.asarray(labels, dtype=np.int)

    n_labels = labels.size
    true_pos = np.zeros(n_labels, dtype=np.double)
    false_pos = np.zeros(n_labels, dtype=np.double)
    false_neg = np.zeros(n_labels, dtype=np.double)
    support = np.zeros(n_labels, dtype=np.long)

    for i, label_i in enumerate(labels):
        true_pos[i] = np.sum(y_pred[y_true == label_i] == label_i)
        false_pos[i] = np.sum(y_pred[y_true != label_i] == label_i)
        false_neg[i] = np.sum(y_pred[y_true == label_i] != label_i)
        support[i] = np.sum(y_true == label_i)

    try:
        # oddly, we may get an "invalid" rather than a "divide" error here
        old_err_settings = np.seterr(divide='ignore', invalid='ignore')

        # precision and recall
        # Micro-averaging is used
        precision = true_pos.sum() / (true_pos.sum() + false_pos.sum())
        recall = true_pos.sum() / (true_pos.sum() + false_neg.sum())

        # print false_pos
        # print false_neg

        # # handle division by 0.0 in precision and recall
        # precision[(true_pos + false_pos) == 0.0] = 0.0
        # recall[(true_pos + false_neg) == 0.0] = 0.0

        # fbeta score
        beta2 = beta ** 2
        fscore = (1 + beta2) * (precision * recall) / (
            beta2 * precision + recall)

        # handle division by 0.0 in fscore
        if (precision + recall) == 0.0:
            fscore = 0.0
        # fscore[(precision + recall) == 0.0] = 0.0
    finally:
        np.seterr(**old_err_settings)

    return precision, recall, fscore, support
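# --- Hedged illustration (not part of the original sources) ---
# The variant above micro-averages true/false positives over all labels before
# computing the F-beta score. Modern scikit-learn exposes the same idea through
# average='micro'; this sketch assumes scikit-learn is installed.
from sklearn.metrics import precision_recall_fscore_support as sk_prfs

y_true_demo = [0, 1, 2, 2, 1, 0]
y_pred_demo = [0, 2, 2, 2, 1, 1]
p, r, f, _ = sk_prfs(y_true_demo, y_pred_demo, average='micro')
# for single-label multiclass targets, micro precision == recall == F1
print(p, r, f)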
def test_check_arrays():
    # check that error is raised on different length inputs
    X = [0, 1]
    Y = np.arange(3)
    assert_raises(ValueError, check_arrays, X, Y)

    # check error for sparse matrix and array
    X = sp.csc_matrix(np.arange(4))
    assert_raises(ValueError, check_arrays, X, Y)

    # check the y=None pattern
    X = [0, 1, 2]
    X_, Y_, Z_ = check_arrays(X, Y, None)
    assert_true(Z_ is None)

    # check that lists are converted
    X_, Y_ = check_arrays(X, Y)
    assert_true(isinstance(X_, np.ndarray))
    assert_true(isinstance(Y_, np.ndarray))

    # check that Y was not copied:
    assert_true(Y_ is Y)

    # check copying
    X_, Y_ = check_arrays(X, Y, copy=True)
    assert_false(Y_ is Y)

    # check forcing dtype
    X_, Y_ = check_arrays(X, Y, dtype=np.int)
    assert_equal(X_.dtype, np.int)
    assert_equal(Y_.dtype, np.int)

    X_, Y_ = check_arrays(X, Y, dtype=np.float)
    assert_equal(X_.dtype, np.float)
    assert_equal(Y_.dtype, np.float)

    # test check_ccontiguous
    Y = np.arange(6).reshape(3, 2).copy('F')
    # if we don't specify it, it is not changed
    X_, Y_ = check_arrays(X, Y)
    assert_true(Y_.flags['F_CONTIGUOUS'])
    assert_false(Y_.flags['C_CONTIGUOUS'])

    X_, Y_ = check_arrays(X, Y, check_ccontiguous=True)
    assert_true(Y_.flags['C_CONTIGUOUS'])
    assert_false(Y_.flags['F_CONTIGUOUS'])

    # check that lists are passed through if allow_lists is true
    X_, Y_ = check_arrays(X, Y, allow_lists=True)
    assert_true(isinstance(X_, list))
def predict_log_proba(self, X):
    """Predict class log-probabilities for X.

    The predicted class log-probabilities of an input sample are computed
    as the log of the mean predicted class probabilities of the base
    estimators in the ensemble.

    Parameters
    ----------
    X : array-like of shape = [n_samples, n_features]
        The input samples.

    Returns
    -------
    p : array of shape = [n_samples, n_classes]
        The class log-probabilities of the input samples. Classes are
        ordered by arithmetical order.
    """
    if hasattr(self.base_estimator_, "predict_log_proba"):
        # Check data
        X, = check_arrays(X)

        if self.n_features_ != X.shape[1]:
            raise ValueError("Number of features of the model must "
                             "match the input. Model n_features is {0} "
                             "and input n_features is {1} "
                             "".format(self.n_features_, X.shape[1]))

        # Parallel loop
        n_jobs, n_estimators, starts = _partition_estimators(self)

        all_log_proba = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
            delayed(_parallel_predict_log_proba)(
                self.estimators_[starts[i]:starts[i + 1]],
                self.estimators_features_[starts[i]:starts[i + 1]],
                X,
                self.n_classes_)
            for i in range(n_jobs))

        # Reduce
        log_proba = all_log_proba[0]
        for j in range(1, len(all_log_proba)):
            log_proba = logaddexp(log_proba, all_log_proba[j])
        log_proba -= np.log(self.n_estimators)

        return log_proba
    else:
        return np.log(self.predict_proba(X))
def cross_validate(self, k=10):
    """Performs a k-fold cross validation of our training data.

    Args:
        k: The number of folds for cross validation.
    """
    self.scores = []

    X, y = check_arrays(self.feature_vector,
                        self.classification_vector,
                        sparse_format='csr')
    cv = cross_validation.check_cv(k, self.feature_vector,
                                   self.classification_vector,
                                   classifier=True)
    for train, test in cv:
        self.classifier1.fit(self.feature_vector[train],
                             self.classification_vector[train])
        self.classifier2.fit(self.feature_vector[train],
                             self.classification_vector[train])
        self.classifier3.fit(self.feature_vector[train],
                             self.classification_vector[train])

        classification1 = self.classifier1.predict(self.feature_vector[test])
        classification2 = self.classifier2.predict(self.feature_vector[test])
        classification3 = self.classifier3.predict(self.feature_vector[test])

        classification = []
        for predictions in zip(classification1, classification2,
                               classification3):
            neutral_count = predictions.count(0)
            positive_count = predictions.count(1)
            negative_count = predictions.count(-1)
            if (neutral_count == negative_count and
                    negative_count == positive_count):
                classification.append(predictions[0])
            elif (neutral_count > positive_count and
                    neutral_count > negative_count):
                classification.append(0)
            elif (positive_count > neutral_count and
                    positive_count > negative_count):
                classification.append(1)
            elif (negative_count > neutral_count and
                    negative_count > positive_count):
                classification.append(-1)
        classification = numpy.array(classification)

        self.scores.append(self.score_func(y[test], classification))
def fit(self, X, winnerTakeAll, plList, y=None):
    """Fit the model to the data X.

    Parameters
    ----------
    X : {array-like, sparse matrix} shape (n_samples, n_features)
        Training data.

    Returns
    -------
    self : BernoulliRBM
        The fitted model.
    """
    X, = check_arrays(X, sparse_format='csr', dtype=np.float)
    n_samples = X.shape[0]
    rng = check_random_state(self.random_state)

    self.components_ = np.asarray(
        rng.normal(0, 0.001, (self.n_components, X.shape[1])),
        order='fortran')
    self.intercept_hidden_ = np.zeros(self.n_components, )
    self.intercept_visible_ = np.zeros(X.shape[1], )
    self.h_samples_ = np.zeros((self.batch_size, self.n_components))

    n_batches = int(np.ceil(float(n_samples) / self.batch_size))
    batch_slices = list(self.gen_even_slices(n_batches * self.batch_size,
                                             n_batches, n_samples))
    verbose = self.verbose
    for iteration in xrange(self.n_iter):
        pl = 0.
        if verbose:
            begin = time.time()

        batch_index = 0
        for batch_slice in batch_slices:
            if batch_index + 1 != n_batches - 1:
                #next_batch = batch_slice
                next_h_pos_mean_hidden = self._mean_hiddens(X[batch_index + 1])
            pl_batch = self._fit(X[batch_slice], rng, winnerTakeAll)
            if verbose:
                pl += pl_batch.sum()
            #self.printOutWeight()
            batch_index = batch_index + 1

        if verbose:
            pl /= n_samples
            end = time.time()
            print("Iteration %d, pseudo-likelihood = %.2f, time = %.2fs"
                  % (iteration, pl, end - begin))
        plList[iteration] = pl
    #self.printOutWeight()
    return self
def f_regression_cov(X, y, C):
    """Univariate linear regression tests.

    Quick linear model for testing the effect of a single regressor,
    sequentially for many regressors.

    This is done in 3 steps:
    1. the regressor of interest and the data are orthogonalized
       wrt constant regressors
    2. the cross correlation between data and regressors is computed
    3. it is converted to an F score then to a p-value

    Parameters
    ----------
    X : {array-like, sparse matrix} shape = (n_samples, n_features)
        The set of regressors that will be tested sequentially.

    y : array of shape(n_samples).
        The data matrix

    C : {array-like, sparse matrix} shape = (n_samples, n_covariates)
        The set of covariates.

    Returns
    -------
    F : array, shape=(n_features,)
        F values of features.

    pval : array, shape=(n_features,)
        p-values of F-scores.
    """
    X, C, y = check_arrays(X, C, y, dtype=np.float)
    y = y.ravel()

    assert C.shape[1] < C.shape[0]
    cpinv = np.linalg.pinv(C)
    X -= np.dot(C, (np.dot(cpinv, X)))
    y -= np.dot(C, (np.dot(cpinv, y)))

    # compute the correlation
    corr = np.dot(y, X)
    corr /= np.asarray(np.sqrt(safe_sqr(X).sum(axis=0))).ravel()
    corr /= np.asarray(np.sqrt(safe_sqr(y).sum())).ravel()

    # convert to p-value
    dof = (X.shape[0] - 1 - C.shape[1]) / (1)  # (df_fm / (df_rm - df_fm))
    F = corr ** 2 / (1 - corr ** 2) * dof
    pv = stats.f.sf(F, 1, dof)
    return F, pv
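# --- Hedged usage sketch (assumed demo names, not from the original source) ---
# Calling f_regression_cov above on random data with one nuisance covariate that
# is regressed out before the per-feature F-test. Assumes numpy is imported as np
# and the function's own imports (check_arrays, safe_sqr, scipy.stats) are in scope.
if __name__ == '__main__':
    rs = np.random.RandomState(0)
    X_demo = rs.normal(size=(50, 3))
    C_demo = rs.normal(size=(50, 1))  # covariate to orthogonalize against
    y_demo = X_demo[:, 0] + 0.5 * C_demo[:, 0] + rs.normal(scale=0.1, size=50)
    F_vals, p_vals = f_regression_cov(X_demo, y_demo, C_demo)
    print(F_vals.shape, p_vals.shape)  # one F value and p-value per feature: (3,), (3,)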
def partial_fit(self, X, y=None, weights=None):
    """Update k means estimate on a single mini-batch X.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Coordinates of the data points to cluster.
    """
    X = check_arrays(X, sparse_format="csr", copy=False)[0]
    n_samples, n_features = X.shape
    if hasattr(self.init, '__array__'):
        self.init = np.ascontiguousarray(self.init, dtype=np.float64)

    if n_samples == 0:
        return self

    x_squared_norms = _squared_norms(X)
    self.random_state_ = check_random_state(self.random_state)
    if (not hasattr(self, 'counts_')
            or not hasattr(self, 'cluster_centers_')):
        # this is the first call to partial_fit on this object:
        # initialize the cluster centers
        self.cluster_centers_ = _init_centroids(
            X, self.n_clusters, self.init,
            random_state=self.random_state_,
            x_squared_norms=x_squared_norms, init_size=self.init_size,
            weights=weights)
        self.initial_cluster_centers_ = self.cluster_centers_.copy()

        self.counts_ = np.zeros(self.n_clusters, dtype=np.int32)
        random_reassign = False
    else:
        # The lower the minimum count is, the more we do random
        # reassignment, however, we don't want to do random
        # reassignment too often, to allow for building up counts
        random_reassign = self.random_state_.randint(
            10 * (1 + self.counts_.min())) == 0

    _mini_batch_step(X, x_squared_norms, self.cluster_centers_,
                     self.counts_, np.zeros(0, np.double), 0,
                     random_reassign=random_reassign,
                     random_state=self.random_state_,
                     reassignment_ratio=self.reassignment_ratio,
                     verbose=self.verbose, weights=weights,
                     sphered=self.sphered)

    if self.compute_labels:
        self.labels_, self.inertia_ = _labels_inertia(
            X, x_squared_norms, self.cluster_centers_, weights=weights)

    return self
def _check_clf_targets(y_true, y_pred):
    """Check that y_true and y_pred belong to the same classification task.

    This converts multiclass or binary types to a common shape, and raises a
    ValueError for a mix of multilabel and multiclass targets, a mix of
    multilabel formats, for the presence of continuous-valued or multioutput
    targets, or for targets of different lengths.

    Column vectors are squeezed to 1d.

    Parameters
    ----------
    y_true : array-like,

    y_pred : array-like

    Returns
    -------
    type_true : one of {'multilabel-indicator', 'multilabel-sequences', \
                        'multiclass', 'binary'}
        The type of the true target data, as output by
        ``utils.multiclass.type_of_target``

    y_true : array or indicator matrix or sequence of sequences

    y_pred : array or indicator matrix or sequence of sequences
    """
    y_true, y_pred = check_arrays(y_true, y_pred, allow_lists=True)
    type_true = type_of_target(y_true)
    type_pred = type_of_target(y_pred)

    y_type = set([type_true, type_pred])
    if y_type == set(["binary", "multiclass"]):
        y_type = set(["multiclass"])

    if len(y_type) > 1:
        raise ValueError("Can't handle mix of {0} and {1}"
                         "".format(type_true, type_pred))

    # We can't have more than one value on y_type => The set is no more needed
    y_type = y_type.pop()

    # No metrics support "multiclass-multioutput" format
    if y_type not in ["binary", "multiclass", "multilabel-indicator",
                      "multilabel-sequences"]:
        raise ValueError("{0} is not supported".format(y_type))

    if y_type in ["binary", "multiclass"]:
        y_true = column_or_1d(y_true)
        y_pred = column_or_1d(y_pred)

    return y_type, y_true, y_pred
def _fit_transform(self, X):
    self.nbrs_ = NearestNeighbors(self.n_neighbors,
                                  algorithm=self.neighbors_algorithm)

    random_state = check_random_state(self.random_state)
    X, = check_arrays(X, sparse_format='dense')
    self.nbrs_.fit(X)
    self.embedding_, self.reconstruction_error_ = \
        locally_linear_embedding(
            self.nbrs_, self.n_neighbors, self.n_components,
            eigen_solver=self.eigen_solver, tol=self.tol,
            max_iter=self.max_iter, method=self.method,
            hessian_tol=self.hessian_tol, modified_tol=self.modified_tol,
            random_state=random_state)
def fit(self, X, y=None):
    """Compute the minimum and maximum to be used for later scaling.

    Parameters
    ----------
    X : array-like, shape [n_samples, n_features]
        The data used to compute the per-feature minimum and maximum
        used for later scaling along the features axis.
    """
    X = check_arrays(X, sparse_format="csc", copy=self.copy)[0]
    warn_if_not_float(X, estimator=self)
    feature_range = self.feature_range
    if feature_range[0] >= feature_range[1]:
        raise ValueError("Minimum of desired feature range must be smaller"
                         " than maximum. Got %s." % str(feature_range))
    if sparse.issparse(X):
        data_min = []
        data_max = []
        data_range = []
        for i in range(X.shape[1]):
            if X.indptr[i] == X.indptr[i + 1]:
                data_min.append(0)
                data_max.append(0)
                data_range.append(0)
            else:
                data_min.append(X.data[X.indptr[i]:X.indptr[i + 1]].min())
                data_max.append(X.data[X.indptr[i]:X.indptr[i + 1]].max())
        data_min = np.array(data_min)
        data_max = np.array(data_max)
        data_range = data_max - data_min
    else:
        data_min = np.min(X, axis=0)
        data_range = np.max(X, axis=0) - data_min

    # Do not scale constant features
    if isinstance(data_range, np.ndarray):
        # For a sparse matrix, constant features will be set to one!
        if sparse.issparse(X):
            for i in range(len(data_min)):
                if data_range[i] == 0.0:
                    data_min[i] = data_min[i] - 1
        data_range[data_range == 0.0] = 1.0
    elif data_range == 0.:
        data_range = 1.

    self.scale_ = (feature_range[1] - feature_range[0]) / data_range
    self.min_ = feature_range[0] - data_min * self.scale_
    self.data_range = data_range
    self.data_min = data_min
    return self
def combineData(datas, weights, norm):
    """ First normalize each view, then combine """
    combined = norm_data(datas[0], norm) * weights[0]
    for i in range(1, len(datas)):
        data = norm_data(datas[i], norm) * weights[i]
        combined = sp.hstack([combined, data])

    # convert to the type: csr sparse matrix
    combined = check_arrays(
        combined, sparse_format="csr", copy=False, dtype=np.float64)[0]
    # Changed: normalization is done in each method
    # combined_norms = normalize(combined,'l2',axis=1,copy=False)
    # Squared euclidean norm of each data point.
    return combined  # .astype('f')  convert type to float32 to save space
def f_regression_nosparse(X, y, center=True):
    """Univariate linear regression tests.

    Quick linear model for testing the effect of a single regressor,
    sequentially for many regressors.

    This is done in 3 steps:
    1. the regressor of interest and the data are orthogonalized
       with respect to constant regressors
    2. the cross correlation between data and regressors is computed
    3. it is converted to an F score then to a p-value

    Parameters
    ----------
    X : {array-like, sparse matrix} shape = (n_samples, n_features)
        The set of regressors that will be tested sequentially.

    y : array of shape(n_samples).
        The data matrix

    center : bool, default=True
        If True, X and y will be centered.

    Returns
    -------
    F : array, shape=(n_features,)
        F values of features.

    pval : array, shape=(n_features,)
        p-values of F-scores.
    """
    X, y = check_arrays(X, y, dtype=np.float)
    y = y.ravel()
    if center:
        y = y - np.mean(y)
        X = X.copy('F')  # faster in fortran
        X -= X.mean(axis=0)

    # compute the correlation
    corr = np.dot(y, X)
    # XXX could use corr /= row_norms(X.T) here, but the test doesn't pass
    corr /= np.asarray(np.sqrt((X ** 2).sum(axis=0))).ravel()
    corr /= norm(y)

    # convert to p-value
    degrees_of_freedom = y.size - (2 if center else 1)
    F = corr ** 2 / (1 - corr ** 2) * degrees_of_freedom
    pv = stats.f.sf(F, 1, degrees_of_freedom)
    return F, pv
def transform(self, X):
    """Compute the hidden layer activation probabilities, P(h=1|v=X).

    Parameters
    ----------
    X : {array-like, sparse matrix} shape (n_samples, n_features)
        The data to be transformed.

    Returns
    -------
    h : array, shape (n_samples, n_components)
        Latent representations of the data.
    """
    X, = check_arrays(X, sparse_format='csr', dtype=np.float)
    return self._mean_hiddens(X)
def fit(self, X):
    """Fit SGVB to the data.

    Parameters
    ----------
    X : array-like, shape (N, n_features)
        The data that the SGVB needs to fit on

    Returns
    -------
    list_lowerbound : list of float
        list of lower bounds over time
    """
    X, = check_arrays(X, sparse_format='csr', dtype=np.float)
    [N, dimX] = X.shape
    rng = check_random_state(self.random_state)

    self._initParams(dimX, rng)
    list_lowerbound = np.array([])

    n_batches = int(np.ceil(float(N) / self.batch_size))
    batch_slices = list(gen_even_slices(n_batches * self.batch_size,
                                        n_batches, N))

    if self.verbose:
        print "Initializing gradients for AdaGrad"
    for i in xrange(10):
        self._initH(X[batch_slices[i]], rng)

    begin = time.time()
    for iteration in xrange(1, self.n_iter + 1):
        iteration_lowerbound = 0

        for batch_slice in batch_slices:
            lowerbound = self._updateParams(X[batch_slice], N, rng)
            iteration_lowerbound += lowerbound

        if self.verbose:
            end = time.time()
            print("[%s] Iteration %d, lower bound = %.2f,"
                  " time = %.2fs"
                  % (self.__class__.__name__, iteration,
                     iteration_lowerbound / N, end - begin))
            begin = end

        list_lowerbound = np.append(list_lowerbound,
                                    iteration_lowerbound / N)
    return list_lowerbound
def fit(self, X, y, sample_weight=None):
    """Fit Naive Bayes classifier according to X, y.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape = [n_samples]
        Target values.

    sample_weight : array-like, shape = [n_samples], optional
        Weights applied to individual samples (1. for unweighted).

    Returns
    -------
    self : object
        Returns self.
    """
    X, y = check_arrays(X, y, sparse_format='csr')
    X = X.astype(np.float)
    y = column_or_1d(y, warn=True)
    _, n_features = X.shape

    labelbin = LabelBinarizer()
    Y = labelbin.fit_transform(y)
    self.classes_ = labelbin.classes_
    if Y.shape[1] == 1:
        Y = np.concatenate((1 - Y, Y), axis=1)

    # convert to float to support sample weight consistently
    Y = Y.astype(np.float64)
    if sample_weight is not None:
        Y *= array2d(sample_weight).T

    class_prior = self.class_prior

    # Count raw events from data before updating the class log prior
    # and feature log probas
    n_effective_classes = Y.shape[1]
    self.class_count_ = np.zeros(n_effective_classes, dtype=np.float64)
    self.feature_count_ = np.zeros((n_effective_classes, n_features),
                                   dtype=np.float64)
    self._count(X, Y)
    self._update_feature_log_prob()
    self._update_class_log_prior(class_prior=class_prior)
    return self
def transform(self, X):
    """Scaling features of X according to feature_range.

    Parameters
    ----------
    X : array-like with shape [n_samples, n_features]
        Input data that will be transformed.
    """
    X = check_arrays(X, sparse_format="csc", copy=self.copy)[0]
    if sparse.issparse(X):
        for i in range(X.shape[1]):
            X.data[X.indptr[i]:X.indptr[i + 1]] *= self.scale_[i]
            X.data[X.indptr[i]:X.indptr[i + 1]] += self.min_[i]
    else:
        X *= self.scale_
        X += self.min_
    return X
def transform(self, X, y=None, copy=None):
    """Perform standardization by calculating percentile within trained data.

    Parameters
    ----------
    X : array-like with shape [n_samples, n_features]
        The data used to scale along the features axis.
    """
    copy = copy if copy is not None else self.copy
    X = check_arrays(X, copy=copy, sparse_format="csr")[0]
    if sp.issparse(X):
        # TODO: implement for sparse arrays
        pass
    else:
        return (self.tform_func(X) / 100)
def fit(self, X, y=None):
    """Fit the model to the data X.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data.

    Returns
    -------
    self : BernoulliRBM
        The fitted model.
    """
    X, = check_arrays(X, sparse_format='csc', dtype=np.float)
    n_samples = X.shape[0]
    rng = check_random_state(self.random_state)

    self.components_ = np.asarray(
        rng.normal(0, 0.01, (self.n_components, X.shape[1])),
        order='fortran')
    self.intercept_hidden_ = np.zeros(self.n_components, )
    self.intercept_visible_ = np.zeros(X.shape[1], )
    self.h_samples_ = np.zeros((self.batch_size, self.n_components))

    n_batches = int(np.ceil(float(n_samples) / self.batch_size))
    batch_slices = list(gen_even_slices(n_batches * self.batch_size,
                                        n_batches))
    verbose = self.verbose
    for iteration in xrange(self.n_iter):
        pl = 0.
        if verbose:
            begin = time.time()

        for batch_slice in batch_slices:
            pl_batch = self._fit(X[batch_slice], rng)
            if verbose:
                pl += pl_batch.sum()

        if verbose:
            pl /= n_samples
            end = time.time()
            print("Iteration %d, pseudo-likelihood = %.2f, time = %.2fs"
                  % (iteration, pl, end - begin))
    return self
def fit(self, X, y, mask=None):
    """Fit Gaussian Naive Bayes according to X, y.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.

    y : array-like, shape = [n_samples]
        Target values.

    mask : array-like, shape = [n_samples, n_features]
        Binary, 1 at unobserved features.

    Returns
    -------
    self : object
        Returns self.
    """
    X, y = check_arrays(X, y, sparse_format='dense')
    n_samples, n_features = X.shape

    if n_samples != y.shape[0]:
        raise ValueError("X and y have incompatible shapes")

    if mask is not None:
        mask = array2d(mask)
        X = X.copy()
        X[mask] = np.nan

    self.classes_ = unique_y = np.unique(y)
    n_classes = unique_y.shape[0]

    self.theta_ = np.zeros((n_classes, n_features))
    self.sigma_ = np.zeros((n_classes, n_features))
    self.class_prior_ = np.zeros(n_classes)
    self._n_ij = []
    epsilon = 1e-9
    for i, y_i in enumerate(unique_y):
        self.theta_[i, :] = bn.nanmean(X[y == y_i, :], axis=0)
        self.sigma_[i, :] = bn.nanvar(X[y == y_i, :], axis=0) + epsilon
        self.class_prior_[i] = np.float(np.sum(y == y_i)) / n_samples
        self._n_ij.append(-0.5 * np.sum(np.log(np.pi * self.sigma_[i, :])))
    self._logprior = np.log(self.class_prior_)
    return self
def predict_proba(self, X):
    """Predict class probabilities for X.

    The predicted class probabilities of an input sample are computed as
    the mean predicted class probabilities of the base estimators in the
    ensemble. If base estimators do not implement a ``predict_proba``
    method, then it resorts to voting and the predicted class probabilities
    of an input sample represent the proportion of estimators predicting
    each class.

    Parameters
    ----------
    X : array-like of shape = [n_samples, n_features]
        The input samples.

    Returns
    -------
    p : array of shape = [n_samples, n_classes]
        The class probabilities of the input samples. Classes are ordered
        by arithmetical order.
    """
    # Check data
    X, = check_arrays(X)

    if self.n_features_ != X.shape[1]:
        raise ValueError("Number of features of the model must "
                         "match the input. Model n_features is {0} and "
                         "input n_features is {1}."
                         "".format(self.n_features_, X.shape[1]))

    # Parallel loop
    n_jobs, n_estimators, starts = _partition_estimators(self)

    all_proba = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
        delayed(_parallel_predict_proba)(
            self.estimators_[starts[i]:starts[i + 1]],
            self.estimators_features_[starts[i]:starts[i + 1]],
            X,
            self.n_classes_)
        for i in range(n_jobs))

    # Reduce
    proba = sum(all_proba) / self.n_estimators

    return proba
def compute(self, y_true, proba, sample_weight=None):
    """
    Compute metric for each possible prediction threshold.

    :param y_true: array-like true labels
    :param proba: array-like of shape [n_samples, 2] with predicted probabilities
    :param sample_weight: array-like weight

    :rtype: tuple(array, array)
    :return: thresholds and corresponding metric values
    """
    y_true, proba, sample_weight = check_arrays(y_true, proba, sample_weight)
    pred = proba[:, self.signal_label]
    b, s, thresholds = roc_curve(y_true == self.signal_label, pred,
                                 sample_weight=sample_weight)
    metric_values = self.metric(s * self.expected_s, b * self.expected_b)
    thresholds = numpy.clip(thresholds, pred.min() - 1e-6, pred.max() + 1e-6)
    return thresholds, metric_values
def syn_counts(n_samples=50, offset=0.0, xv=(1., -0.5, 1.0),
               random_state=None):
    """Synthetic count data generator with len(xv) - 1 features.

    Returns
    -------
    X : np.array, shape=(n_samples, len(xv) - 1)
        The features

    y : np.array, shape=(n_samples,)
        The response
    """
    rs = check_random_state(random_state)
    xv, = check_arrays(xv)
    p = xv.shape[0] - 1
    X = np.c_[np.ones(n_samples),
              rs.normal(size=n_samples * p).reshape((n_samples, p))]
    xb = np.dot(X, xv)
    exb = np.exp(xb + offset)
    py = rs.poisson(lam=exb, size=n_samples)
    return X[:, 1:], py
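# --- Hedged usage sketch (assumed demo names, not from the original source) ---
# Drawing a small synthetic count dataset with syn_counts above and checking
# the shapes promised by its docstring. Assumes numpy is imported as np and the
# check_random_state / check_arrays utilities used by syn_counts are importable.
if __name__ == '__main__':
    X_demo, y_demo = syn_counts(n_samples=20, xv=(1.0, -0.5, 1.0), random_state=0)
    # xv has 3 entries, so there are len(xv) - 1 = 2 features and one count per sample
    print(X_demo.shape, y_demo.shape)  # (20, 2) (20,)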
def cross_val_predict(estimator, X, y, cv=5, n_jobs=1, refit=False,
                      predict_fun="predict"):
    X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True)
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    pred = Parallel(n_jobs=n_jobs)(
        delayed(_cross_val_predict)(
            clone(estimator), X, y, train, test, predict_fun)
        for train, test in cv)
    pred = np.concatenate(pred)

    if cv.indices:
        index = np.concatenate([test for _, test in cv])
    else:
        index = np.concatenate([np.where(test)[0] for _, test in cv])
    ## pred[index] = pred doesn't work as expected
    pred[index] = pred.copy()

    if refit:
        return pred, clone(estimator).fit(X, y)
    else:
        return pred
def decision_function(self, X):
    """Average of the decision functions of the base classifiers.

    Parameters
    ----------
    X : array-like of shape = [n_samples, n_features]
        The input samples.

    Returns
    -------
    score : array, shape = [n_samples, k]
        The decision function of the input samples. The columns correspond
        to the classes in sorted order, as they appear in the attribute
        ``classes_``. Regression and binary classification are special
        cases with ``k == 1``, otherwise ``k == n_classes``.
    """
    # Trigger an exception if not supported
    if not hasattr(self.base_estimator_, "decision_function"):
        raise NotImplementedError

    # Check data
    X, = check_arrays(X)

    if self.n_features_ != X.shape[1]:
        raise ValueError("Number of features of the model must "
                         "match the input. Model n_features is {0} and "
                         "input n_features is {1} "
                         "".format(self.n_features_, X.shape[1]))

    # Parallel loop
    n_jobs, n_estimators, starts = _partition_estimators(self)

    all_decisions = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
        delayed(_parallel_decision_function)(
            self.estimators_[starts[i]:starts[i + 1]],
            self.estimators_features_[starts[i]:starts[i + 1]],
            X)
        for i in range(n_jobs))

    # Reduce
    decisions = sum(all_decisions) / self.n_estimators

    return decisions
def predict(self, X):
    """Predict class for X.

    The predicted class of an input sample is computed as the predicted
    class of the underlying estimator.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape = [n_samples, n_features]
        The training input samples. Sparse matrices are accepted only if
        they are supported by the base estimator.

    Returns
    -------
    y : array of shape = [n_samples]
        The predicted classes.
    """
    # check_arrays returns a list of validated arrays; unpack the single
    # array before delegating to the underlying estimator.
    X, = check_arrays(X)
    return self.estimator_.predict(X)