def fit(self, X, y):
    """
    Fit the tree classifier or regressor, depending on the label type

    Arguments:
        X {pd.DataFrame} -- Dataframe containing the features
        y {pd.Series} -- Label vector
    """
    X = assert_df(X)
    y = assert_series(y)

    # Keep only the attributes that are present in the provided features
    self.attributes = [
        a for a in self.domain.attributes if a.name in X.columns.values
    ]
    self.columns = [a.name for a in self.attributes]
    s_domain = Domain(self.attributes, class_vars=self.domain.class_var)

    # Convert the pandas data into an Orange Table for the tree learner
    rows = pd.concat([X[self.columns], y], axis=1).values.tolist()
    train = Table.from_list(domain=s_domain, rows=rows)

    if isinstance(self.domain.class_var, DiscreteVariable):
        self.tree = TreeClassifier().fit_storage(train)
    else:
        self.tree = TreeRegressor().fit_storage(train)
    return self
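# The Domain/Table/DiscreteVariable calls above follow the Orange3 data API;
# assert_df/assert_series and TreeClassifier/TreeRegressor are project-local
# helpers that are not reproduced here. The standalone sketch below shows the
# same DataFrame-to-Table conversion that fit() performs, using toy data and
# assuming Orange3 is installed; names and values are illustrative only.
import pandas as pd
from Orange.data import ContinuousVariable, DiscreteVariable, Domain, Table

# Toy features and a discrete label.
X_demo = pd.DataFrame({"a": [0.1, 0.4, 0.3], "b": [1.0, 0.2, 0.8]})
y_demo = pd.Series(["yes", "no", "yes"], name="label")

# Build an Orange domain from the feature columns plus the class variable.
attributes = [ContinuousVariable(name) for name in X_demo.columns]
class_var = DiscreteVariable("label", values=("no", "yes"))
s_domain = Domain(attributes, class_vars=class_var)

# Concatenate features and label row-wise and convert to an Orange Table,
# mirroring the body of fit() above.
rows = pd.concat([X_demo, y_demo], axis=1).values.tolist()
train = Table.from_list(domain=s_domain, rows=rows)
print(train)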
def __init__(self, f_types, l_type, shape, **kwargs):
    self.f_types = assert_series(f_types)
    self.l_type = assert_l_type(l_type)
    self.shape = shape
    self._init_parameters(**kwargs)

    self.is_fitted = False
    self.names = self.f_types.index.tolist()
    # -1 marks features whose importance has not been computed yet
    self.feature_importances = {name: -1 for name in self.names}
def __init__(self, f_types, l_type, **kwargs):
    """
    Class which predicts labels for unseen samples

    Arguments:
        f_types {pd.Series} -- Series of feature types
        l_type {str} -- Type of label
    """
    self.f_types = assert_series(f_types)
    self.l_type = assert_l_type(l_type)
    self.params = {
        "knn_neighbors": kwargs.get("knn_neighbors", 6),
        "nominal_distance": kwargs.get("nominal_distance", 1),
        "distance_metric": kwargs.get("distance_metric", "partial"),
    }
def fit(self, X, y):
    """
    Fit the selector

    Arguments:
        X {pd.DataFrame} -- Dataframe containing the features
        y {pd.Series} -- Label vector
    """
    if self.is_fitted:
        print("Selector is already fitted")
        return self

    X = assert_df(X).reset_index(drop=True)
    y = assert_series(y).reset_index(drop=True)
    data = Data(X, y, self.f_types, self.l_type, X.shape)
    self.data = data.shuffle_rows()

    self.domain = None
    if self.params["eval_method"] == "tree":
        # Tree-based evaluation needs the Orange domain of the data
        self.domain = self.data.to_table().domain
    if self.params["eval_method"] == "mi":
        # MI-based evaluation works on a salted copy of the data
        self.data = self.data.add_salt()

    self._fit()
    self.is_fitted = True
    return self
def _get_mi_cc(X, y, f_types, l_type, k, dist):
    """
    Estimate mutual information for continuous label types
    and at least one continuous feature
    Checks how many samples are inside a given radius

    Arguments:
        X {pd.DataFrame} -- Dataframe containing the features
        y {pd.Series} -- Label vector
        f_types {pd.Series} -- Series of feature types
        l_type {str} -- Type of label
        k {int} -- Number of nearest neighbors
        dist {float} -- Distance value used for nominal features
    """
    nx = np.ones(X.shape[0]) * -1
    ny = np.ones(X.shape[0]) * -1

    D_x = get_dist_matrix(X, f_types, nominal_distance=dist)
    D_x.sort()

    new_y = assert_df(y)
    new_types = assert_series(l_type)
    D_y = get_dist_matrix(new_y, new_types, nominal_distance=dist)
    D_y.sort()

    for row in range(X.shape[0]):
        # Get distances inside features and labels
        dist_x = D_x[row, :]
        dist_y = D_y[row, :]

        # Update statistics if sample contains non-nan values
        radius = max(dist_x[k + 1], dist_y[k + 1])
        if not np.isinf(radius):
            nx[row] = (dist_x <= radius).sum() - 1
            ny[row] = (dist_y <= radius).sum() - 1

    # Drop samples that were skipped (infinite radius) from both counts
    mask = nx >= 0
    nx = nx[mask]
    ny = ny[mask]

    mi = digamma(len(nx)) + digamma(k) - (1 / k) - \
        digamma(np.mean(nx)) - digamma(np.mean(ny))
    return max(mi, 0)
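# The formula above is a Kraskov-style (KSG) k-NN estimator,
# psi(N) + psi(k) - 1/k - psi(mean(nx)) - psi(mean(ny)),
# where nx and ny count neighbours within a shared radius in feature and label
# space (a variant that applies psi to the mean counts rather than averaging
# psi over samples, as in the original KSG paper). The sketch below is a
# self-contained, continuous-only version for comparison: it assumes purely
# numeric X and y with Chebyshev distances and therefore skips the
# nominal-distance handling done by get_dist_matrix. Names and toy data are
# illustrative, not part of the project code.
import numpy as np
from scipy.spatial.distance import cdist
from scipy.special import digamma


def ksg_mi_continuous(X, y, k=6):
    X = np.asarray(X, dtype=float).reshape(len(X), -1)
    y = np.asarray(y, dtype=float).reshape(len(y), -1)

    # Sorted pairwise distances; column 0 is each sample's distance to itself.
    d_x = np.sort(cdist(X, X, metric="chebyshev"), axis=1)
    d_y = np.sort(cdist(y, y, metric="chebyshev"), axis=1)

    n = X.shape[0]
    nx = np.empty(n)
    ny = np.empty(n)
    for i in range(n):
        # Shared radius taken from the k-th neighbour in either space
        # (index k + 1 mirrors the indexing used in _get_mi_cc above).
        radius = max(d_x[i, k + 1], d_y[i, k + 1])
        nx[i] = (d_x[i] <= radius).sum() - 1
        ny[i] = (d_y[i] <= radius).sum() - 1

    mi = digamma(n) + digamma(k) - (1 / k) \
        - digamma(np.mean(nx)) - digamma(np.mean(ny))
    return max(mi, 0)


# Correlated toy data should yield a clearly positive estimate.
rng = np.random.default_rng(0)
x_demo = rng.normal(size=(500, 1))
y_demo = x_demo[:, 0] + 0.5 * rng.normal(size=500)
print(ksg_mi_continuous(x_demo, y_demo, k=6))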