def fit_ftdata(self, x_ft, y_ft, x_ab, y_ab):
    """Fit the model with featurized data as input.

    :param x_ft: x_featurized
    :param y_ft: y_featurized
    :param x_ab: x_inverse_featurized
    :param y_ab: y_inverse_featurized
    """
    self.clf0 = CLF(**self.params).fit(x_ft, y_ft != 0)  # causal or confounded?
    self.clf1 = CLF(**self.params).fit(x_ab, y_ab == 1)  # causal or anticausal?
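# A minimal sketch of how clf0 and clf1 are combined at prediction time,
# mirroring the expression used in the script at the end of this file; the
# helper name predict_pair_scores is hypothetical, not from the source.
def predict_pair_scores(clf0, clf1, x_ft):
    # P(pair is dependent) scaled by a signed direction estimate in [-1, 1]:
    # positive favours A->B (causal), negative favours B->A (anticausal).
    return (clf0.predict_proba(x_ft)[:, 1]
            * (2 * clf1.predict_proba(x_ft)[:, 1] - 1))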
def fit(self, x, y, is_q=False):
    '''Fit the model on the original data and labels: x has shape
    (npairs, 2) and y has shape (npairs, 1). The output is a trained CLF.

    Training data is built by 1) going through each row separately,
    2) featurizing each row twice (once for A->B and once for B->A), and
    3) vertically stacking the results. The labels are duplicated
    accordingly, covering both the A->B and B->A directions.

    Example: obj.fit(data, labels)
    '''
    train = np.vstack((
        np.array([self.featurize_row(row.iloc[0], row.iloc[1], is_q=is_q)
                  for idx, row in x.iterrows()]),   # (npairs, 3m)
        # TODO: does this really work for quantum data?
        np.array([self.featurize_row(row.iloc[1], row.iloc[0], is_q=is_q)
                  for idx, row in x.iterrows()])))  # (npairs, 3m)
    # train has shape (2 * npairs, 3m)
    labels = np.vstack((y, -y)).ravel()  # (2 * npairs,)
    verbose = 1 if self.verbose else 0
    self.clf = CLF(verbose=verbose, min_samples_leaf=self.L,
                   n_estimators=self.E, max_depth=self.max_depth,
                   n_jobs=self.njobs).fit(train, labels)
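# A self-contained sketch of the symmetrization trick above: each pair is
# featurized in both directions and the label is negated for the reversed
# copy. featurize_row here is a stand-in with made-up moment features; the
# real featurizer is a method of the class and is model-specific.
import numpy as np
import pandas as pd

def featurize_row(a, b):
    a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
    return np.array([a.mean(), a.std(), b.mean(), b.std(),
                     np.corrcoef(a, b)[0, 1]])

rng = np.random.default_rng(0)
pairs = pd.DataFrame({0: [rng.normal(size=100) for _ in range(4)],
                      1: [rng.normal(size=100) for _ in range(4)]})
y = np.array([1, -1, 1, -1])  # 1: A->B, -1: B->A

train = np.vstack((
    np.array([featurize_row(r.iloc[0], r.iloc[1]) for _, r in pairs.iterrows()]),
    np.array([featurize_row(r.iloc[1], r.iloc[0]) for _, r in pairs.iterrows()])))
labels = np.vstack((y, -y)).ravel()  # (2 * npairs,), forward block then reversed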
def fit(self, x, y):
    # CAUTION: unlike the variants above, x and y must already be
    # preprocessed (featurized arrays), not raw dataframes.
    print('training CLF ..')
    verbose = 1 if self.verbose else 0
    # FIXME: the classes here are very imbalanced
    self.clf = CLF(verbose=verbose, min_samples_leaf=self.L,
                   n_estimators=self.E, max_depth=self.max_depth,
                   n_jobs=self.njobs).fit(x, y)
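# The FIXME above flags class imbalance. If CLF is a scikit-learn ensemble
# (the parameters used here match RandomForestClassifier), one hedged
# mitigation is to reweight classes inversely to their frequency; a sketch
# under that assumption, not the source's actual fix:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=500, min_samples_leaf=10,
                             max_depth=None, n_jobs=-1,
                             class_weight='balanced')  # upweight rare labels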
def fit(self, x, y): """Train the model. args: x: pandas.Dataframe of the data y: targets """ train = np.vstack((np.array([self.featurize_row(row.iloc[0], row.iloc[1]) for idx, row in x.iterrows()]), np.array([self.featurize_row(row.iloc[1], row.iloc[0]) for idx, row in x.iterrows()]))) labels = np.vstack((y, -y)).ravel() verbose = 1 if self.verbose else 0 self.clf = CLF(verbose=verbose, min_samples_leaf=self.L, n_estimators=self.E, max_depth=self.max_depth, n_jobs=self.n_jobs).fit(train, labels)
def fit(self, x, y): """Train the model. Args: x_tr (pd.DataFrame): CEPC format dataframe containing the pairs y_tr (pd.DataFrame or np.ndarray): labels associated to the pairs """ train = np.vstack((np.array([self.featurize_row(row.iloc[0], row.iloc[1]) for idx, row in x.iterrows()]), np.array([self.featurize_row(row.iloc[1], row.iloc[0]) for idx, row in x.iterrows()]))) labels = np.vstack((y, -y)).ravel() verbose = 1 if self.verbose else 0 self.clf = CLF(verbose=verbose, min_samples_leaf=self.L, n_estimators=self.E, max_depth=self.max_depth, n_jobs=self.njobs).fit(train, labels)
y_te = np.hstack((y_te, -y_te))
d_tr = np.hstack((d_tr, d_tr))
d_te = np.hstack((d_te, d_te))

x_ab = x_tr[(y_tr == 1) | (y_tr == -1)]
y_ab = y_tr[(y_tr == 1) | (y_tr == -1)]

# NOTE: this first dict is dead code; it is immediately overwritten below.
params = {
    'random_state': 0,
    'n_estimators': E,
    'max_features': None,
    'max_depth': 50,
    'min_samples_leaf': 10,
    'verbose': 10
}
params = {
    'random_state': 0,
    'n_estimators': E,
    'min_samples_leaf': L,
    'n_jobs': 16
}

clf0 = CLF(**params).fit(x_tr, y_tr != 0)  # causal or confounded?
clf1 = CLF(**params).fit(x_ab, y_ab == 1)  # causal or anticausal?
clfd = CLF(**params).fit(x_tr, d_tr)       # dependent or independent?

p_te = clf0.predict_proba(x_te)[:, 1] * (2 * clf1.predict_proba(x_te)[:, 1] - 1)
print([score(y_te, p_te), clf0.score(x_te, y_te != 0), clfd.score(x_te, d_te)])
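# `score` is not defined in this excerpt. A sketch of one common choice for
# cause-effect pairs (a ChaLearn-style metric: the mean of the two
# directional ROC AUCs), offered as an assumption, not the source's metric:
import numpy as np
from sklearn.metrics import roc_auc_score

def score(y_true, p):
    y_true = np.asarray(y_true)
    # AUC for detecting A->B (y == 1) plus AUC for detecting B->A (y == -1),
    # scoring the backward direction with the negated predictions.
    return 0.5 * (roc_auc_score(y_true == 1, p)
                  + roc_auc_score(y_true == -1, -p))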