def _get_variable_data(self, variable, categorical, label=None, trend=None):
    ''' Extract (and cache) design matrix for variable/categorical combo. '''
    # Assign default labels (for variables passed in via split_by or
    # orthogonalize)
    if label is None:
        label = '_'.join(listify(variable))

    # Hash labels (rather than variables)
    cache_key = hash((label, categorical))

    if cache_key not in self.cache:

        n_rows = len(self.dataset.activation)

        # Handle special cases
        if variable == 'intercept':
            dm = np.ones((n_rows, 1))

        elif variable in ['subject', 'run']:
            n_vols, n_runs = self.dataset.n_vols, self.dataset.n_runs
            n_grps = self.dataset.activation['subject'].nunique()
            if variable == 'run':
                n_grps *= n_runs
            else:
                n_vols *= n_runs
            dm = np.zeros((n_rows, n_grps))
            val = 1 if trend is None else standardize(
                np.arange(n_vols) ** trend)
            for i in range(n_grps):
                dm[(n_vols * i):(n_vols * i + n_vols), i] = val

        else:
            run_dms = []
            events = self.events.copy()
            sr = 100  # Sampling rate, in Hz
            tr = self.dataset.TR
            scale = np.ceil(tr * sr)
            events['run_onset'] = (events['run_onset'] * sr).round()
            events['duration'] = (events['duration'] * sr).round()
            n_rows = int(np.ceil(self.dataset.n_vols * scale))

            if categorical:
                variable_cols = events[variable]
                if isinstance(variable, (list, tuple)):
                    variable_cols = variable_cols.stack()
                n_cols = variable_cols.nunique()
                # Map unique values onto numerical indices, and return
                # data as a DataFrame where each column is a (named) level
                # of the variable
                levels = variable_cols.unique()
                mapping = OrderedDict(zip(levels, list(range(n_cols))))
                if label is not None:
                    self.level_map[label] = mapping
                events[variable] = events[variable].replace(mapping)
            else:
                n_cols = 1

            for (sub_, run_), g in events.groupby(['subject', 'run']):
                dm = np.zeros((n_rows, n_cols))
                for i, row in g.iterrows():
                    start = int(row['run_onset'])
                    end = int(start + row['duration'])
                    if categorical:
                        for var in listify(variable):
                            dm[start:end, np.array(row[variable], dtype=int)] = 1
                    else:
                        if isinstance(variable, (tuple, list)):
                            raise ValueError(
                                "Adding a list of terms is only "
                                "supported for categorical variables "
                                "(e.g., random factors).")
                        dm[start:end, 0] = row[variable]
                dm = dm.reshape(-1, scale.astype(int), n_cols).mean(axis=1)
                run_dms.append(dm[:self.dataset.n_vols])

            dm = np.concatenate(run_dms)

        self.cache[cache_key] = dm

    dm = self.cache[cache_key]
    # NOTE: we return a copy in order to avoid in-place changes to the
    # cached design matrix (e.g., we don't want the HRF convolution to
    # overwrite what's in the cache).
    return dm.copy()
def _get_variable_data(self, variable, categorical, label=None, trend=None):
    ''' Extract (and cache) design matrix for variable/categorical combo. '''
    # Assign default labels (for variables passed in via split_by or
    # orthogonalize)
    if label is None:
        label = '_'.join(listify(variable))

    # Hash labels (rather than variables)
    cache_key = hash((label, categorical))

    if cache_key not in self.cache:

        n_rows = len(self.dataset.activation)

        # Handle special cases
        if variable == 'intercept':
            dm = np.ones((n_rows, 1))

        elif variable in ['subject', 'run']:
            n_vols, n_runs = self.dataset.n_vols, self.dataset.n_runs
            n_grps = self.dataset.activation['subject'].nunique()
            if variable == 'run':
                n_grps *= n_runs
            else:
                n_vols *= n_runs
            dm = np.zeros((n_rows, n_grps))
            val = 1 if trend is None else standardize(
                np.arange(n_vols) ** trend)
            for i in range(n_grps):
                dm[(n_vols * i):(n_vols * i + n_vols), i] = val

        else:
            run_dms = []
            events = self.events.copy()
            sr = 100  # Sampling rate, in Hz
            events['run_onset'] = (events['run_onset'] * sr).round()
            events['duration'] = (events['duration'] * sr).round()
            tr = self.dataset.TR
            scale = np.ceil(tr * sr)
            # Cast to int: numpy requires integer shapes for zeros()/reshape()
            n_rows = int(self.dataset.n_vols * scale)

            if categorical:
                variable_cols = events[variable]
                if isinstance(variable, (list, tuple)):
                    variable_cols = variable_cols.stack()
                n_cols = variable_cols.nunique()
                # Map unique values onto numerical indices, and return
                # data as a DataFrame where each column is a (named) level
                # of the variable
                levels = variable_cols.unique()
                mapping = OrderedDict(zip(levels, list(range(n_cols))))
                if label is not None:
                    self.level_map[label] = mapping
                events[variable] = events[variable].replace(mapping)
            else:
                n_cols = 1

            for (sub_, run_), g in events.groupby(['subject', 'run']):
                dm = np.zeros((n_rows, n_cols))
                for i, row in g.iterrows():
                    start = int(row['run_onset'])
                    end = int(start + row['duration'])
                    if categorical:
                        for var in listify(variable):
                            # Column indices must be integers
                            dm[start:end, np.array(row[variable], dtype=int)] = 1
                    else:
                        if isinstance(variable, (tuple, list)):
                            raise ValueError(
                                "Adding a list of terms is only "
                                "supported for categorical variables "
                                "(e.g., random factors).")
                        dm[start:end, 0] = row[variable]
                dm = dm.reshape(-1, int(scale), n_cols).mean(axis=1)
                run_dms.append(dm[:self.dataset.n_vols])

            dm = np.concatenate(run_dms)

        self.cache[cache_key] = dm

    dm = self.cache[cache_key]
    # NOTE: we return a copy in order to avoid in-place changes to the
    # cached design matrix (e.g., we don't want the HRF convolution to
    # overwrite what's in the cache).
    return dm.copy()
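# NOTE: the snippets in this collection rely on project-local helpers
# `standardize` and `listify` that are not shown here. The definitions below
# are a minimal sketch of plausible implementations (z-scoring along an axis,
# and wrapping scalars in a list), offered only so the snippets can be read
# and run in isolation -- they are assumptions, not the packages' actual code.
import numpy as np


def standardize(x, axis=0):
    # Z-score an array along the given axis, guarding against zero variance.
    x = np.asarray(x, dtype=float)
    mean = x.mean(axis=axis, keepdims=True)
    sd = x.std(axis=axis, keepdims=True)
    sd[sd == 0] = 1.0
    return (x - mean) / sd


def listify(obj):
    # Wrap non-sequence inputs in a list so callers can iterate uniformly.
    return obj if isinstance(obj, (list, tuple)) else [obj]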
def add_term(self, variable, label=None, categorical=False, random=False,
             split_by=None, yoke_random_mean=False, estimate_random_mean=False,
             dist='Normal', scale=None, trend=None, orthogonalize=None,
             convolution=None, conv_kws=None, sigma_kws=None, withhold=False,
             plot=False, **kwargs):
    '''
    Args:
        variable (str): name of the variable in the Dataset that contains the
            predictor data for the term, or a list of variable names.
        label (str): short name/label of the term; will be used as the name
            passed to PyMC. If None, the variable name is used.
        categorical (bool): if False, treat the input data as continuous; if
            True, treat the input as categorical, and assign discrete levels
            to different columns in the predictor matrix.
        random (bool): if False, model as fixed effect; if True, model as
            random effect.
        split_by (str): optional name of another variable on which to split
            the target variable. A separate hyperparameter will be included
            for each level of the split_by variable. E.g., if
            variable = 'stimulus' and split_by = 'category', the model will
            include one parameter for each individual stimulus, plus C
            additional hyperparameters for the stimulus variances (one per
            category).
        yoke_random_mean (bool): if True (and split_by is not None), the mean
            of each random effect distribution is yoked to the corresponding
            fixed-effect ('b_') parameter of the split_by variable, rather
            than fixed at 0.
        estimate_random_mean (bool): If False (default), set the mean of the
            random effect distribution to 0. If True, estimate mean
            parameters for each level of split_by (in which case the
            corresponding fixed effect parameter should be omitted, for
            identifiability reasons). If split_by=None, this is equivalent to
            estimating a fixed intercept term. Note that models parameterized
            in this way are often less numerically stable than the default
            parameterization.
        dist (str, Distribution): the PyMC3 distribution to use for the
            prior. Can be either a string (must be the name of a class in
            pymc3.distributions), or an uninitialized Distribution object.
        scale (str, bool): if 'before', scaling will be applied before
            convolving with the HRF. If 'after', scaling will be applied to
            the convolved regressor. True is treated like 'before'. If None
            (default), no scaling is done.
        trend (int): if variable is 'subject' or 'run', passing an int here
            will result in addition of an Nth-order polynomial trend instead
            of the expected intercept. E.g., when variable = 'run' and
            trend = 1, a linear trend will be added for each run.
        orthogonalize (list): list of variables to orthogonalize the target
            variable with respect to. For now, this only works for
            categorical covariates. E.g., if variable = 'condition' and
            orthogonalize = ['stimulus_category'], each level of condition
            will be residualized on all (binarized) levels of stimulus
            category.
        convolution (str): the name of the convolution function to apply to
            the input data; must be a valid function in convolutions.py. If
            None, the default convolution function set at class
            initialization is used. If 'none' is passed, no convolution at
            all is applied.
        conv_kws (dict): optional dictionary of additional keyword arguments
            to pass onto the selected convolution function.
        sigma_kws (dict): optional dictionary of keyword arguments specifying
            the parameters of the Distribution to use as the sigma for a
            random variable. Defaults to HalfCauchy with beta=1. Ignored
            unless random=True.
        withhold (bool): if True, the PyMC distribution(s) will be created
            but not added to the prediction equation. This is useful when,
            e.g., yoking the mean of one distribution to the estimated value
            of another distribution, without including the same quantity
            twice.
        plot (bool): if True, plots the resulting design matrix component.
        kwargs: optional keyword arguments passed onto the selected PyMC3
            Distribution.
    '''
    if label is None:
        label = '_'.join(listify(variable))

    # Load design matrix for requested variable
    dm = self._get_variable_data(variable, categorical, label=label,
                                 trend=trend)
    n_cols = dm.shape[1]

    # Handle random effects with nesting/crossing. Basically this splits the
    # design matrix into a separate matrix for each level of split_by,
    # stacked into a 3D array
    if split_by is not None:
        split_dm = self._get_variable_data(split_by, True)
        dm = np.einsum('ab,ac->abc', dm, split_dm)

    # Orthogonalization
    # TODO: generalize this to handle any combination of settings; right
    # now it will only work properly when both the target variable and the
    # covariates are categorical fixed effects.
    if orthogonalize is not None:
        dm = self._orthogonalize(dm, orthogonalize)

    # Scaling and HRF: apply over last dimension. If there is no split_by,
    # add a dummy 3rd dimension so the code below works in the general case
    if dm.ndim == 2:
        dm = dm[..., None]

    if plot and plot != 'convolved':
        self.plot_design_matrix(dm, variable, split_by)

    for i in range(dm.shape[-1]):

        if scale and scale != 'after':
            dm[..., i] = standardize(dm[..., i])

        # Convolve with HRF
        if variable not in ['intercept'] and convolution != 'none':
            if convolution is None:
                convolution = self.convolution
            elif not hasattr(convolution, 'shape'):
                convolution = get_convolution(convolution, conv_kws)

            # Convolve each run separately
            n_vols = self.dataset.n_vols
            n_runs = int(len(dm) / n_vols)
            for r in range(n_runs):
                start, end = r * n_vols, (r * n_vols) + n_vols
                _convolved = self._convolve(dm[start:end, :, i], convolution)
                dm[start:end, :, i] = _convolved  # np.squeeze(_convolved)

        if scale == 'after':
            dm[..., i] = standardize(dm[..., i])

    if plot and plot == 'convolved':
        self.plot_design_matrix(dm, variable, split_by)

    # Remove the dummy 3rd dimension if it was added prior to
    # scaling/convolution
    if dm.shape[-1] == 1:
        dm = dm.reshape(dm.shape[:2])

    with self.model:

        # Random effects
        if random:
            # User can pass sigma specification in sigma_kws.
            # If not provided, default to HalfCauchy with beta = 1.
            if sigma_kws is None:
                sigma_kws = {'dist': 'HalfCauchy', 'beta': 1}

            if split_by is None:
                sigma = self._build_dist('sigma_' + label, **sigma_kws)
                if estimate_random_mean:
                    mu = self._build_dist('b_' + label, dist)
                else:
                    mu = 0.
                u = self._build_dist('u_' + label, dist, mu=mu, sd=sigma,
                                     shape=n_cols, **kwargs)
                self.mu += pm.dot(dm, u)
            else:
                # id_map is essentially a crosstab, except each cell is
                # either 0 or 1
                id_map = self._get_membership_graph(variable, split_by)
                for i in range(id_map.shape[1]):
                    # Select just the factor levels that appear with the
                    # current level of split_by
                    group_items = id_map.iloc[:, i].astype(bool)
                    selected = dm[:, group_items.values, i]
                    # Add the level effects to the model
                    name = '%s_%s' % (label, id_map.columns[i])
                    sigma = self._build_dist('sigma_' + name, **sigma_kws)
                    if yoke_random_mean:
                        mu = self.dists['b_' + split_by][i]
                    elif estimate_random_mean:
                        mu = self._build_dist('b_' + name, dist)
                    else:
                        mu = 0.
                    name, size = 'u_' + name, selected.shape[1]
                    u = self._build_dist(name, dist, mu=mu, sd=sigma,
                                         shape=size, **kwargs)
                    self.mu += pm.dot(selected, u)
                    # Update the level map
                    levels = group_items[group_items].index.tolist()
                    self.level_map[name] = OrderedDict(
                        zip(levels, list(range(size))))

        # Fixed effects
        else:
            b = self._build_dist('b_' + label, dist, shape=dm.shape[-1],
                                 **kwargs)
            if split_by is not None:
                dm = np.squeeze(dm)
            if not withhold:
                self.mu += pm.dot(dm, b)
def add_term(self, variable, label=None, categorical=False, random=False,
             split_by=None, yoke_random_mean=False, estimate_random_mean=False,
             dist='Normal', scale=None, trend=None, orthogonalize=None,
             convolution=None, conv_kws=None, sigma_kws=None, withhold=False,
             plot=False, **kwargs):
    '''
    Args:
        variable (str): name of the variable in the Dataset that contains the
            predictor data for the term, or a list of variable names.
        label (str): short name/label of the term; will be used as the name
            passed to PyMC. If None, the variable name is used.
        categorical (bool): if False, treat the input data as continuous; if
            True, treat the input as categorical, and assign discrete levels
            to different columns in the predictor matrix.
        random (bool): if False, model as fixed effect; if True, model as
            random effect.
        split_by (str): optional name of another variable on which to split
            the target variable. A separate hyperparameter will be included
            for each level of the split_by variable. E.g., if
            variable = 'stimulus' and split_by = 'category', the model will
            include one parameter for each individual stimulus, plus C
            additional hyperparameters for the stimulus variances (one per
            category).
        yoke_random_mean (bool): if True (and split_by is not None), the mean
            of each random effect distribution is yoked to the corresponding
            fixed-effect ('b_') parameter of the split_by variable, rather
            than fixed at 0.
        estimate_random_mean (bool): If False (default), set the mean of the
            random effect distribution to 0. If True, estimate mean
            parameters for each level of split_by (in which case the
            corresponding fixed effect parameter should be omitted, for
            identifiability reasons). If split_by=None, this is equivalent to
            estimating a fixed intercept term. Note that models parameterized
            in this way are often less numerically stable than the default
            parameterization.
        dist (str, Distribution): the PyMC3 distribution to use for the
            prior. Can be either a string (must be the name of a class in
            pymc3.distributions), or an uninitialized Distribution object.
        scale (str, bool): if 'before', scaling will be applied before
            convolving with the HRF. If 'after', scaling will be applied to
            the convolved regressor. True is treated like 'before'. If None
            (default), no scaling is done.
        trend (int): if variable is 'subject' or 'run', passing an int here
            will result in addition of an Nth-order polynomial trend instead
            of the expected intercept. E.g., when variable = 'run' and
            trend = 1, a linear trend will be added for each run.
        orthogonalize (list): list of variables to orthogonalize the target
            variable with respect to. For now, this only works for
            categorical covariates. E.g., if variable = 'condition' and
            orthogonalize = ['stimulus_category'], each level of condition
            will be residualized on all (binarized) levels of stimulus
            category.
        convolution (str): the name of the convolution function to apply to
            the input data; must be a valid function in convolutions.py. If
            None, the default convolution function set at class
            initialization is used. If 'none' is passed, no convolution at
            all is applied.
        conv_kws (dict): optional dictionary of additional keyword arguments
            to pass onto the selected convolution function.
        sigma_kws (dict): optional dictionary of keyword arguments specifying
            the parameters of the Distribution to use as the sigma for a
            random variable. Defaults to HalfCauchy with beta=10. Ignored
            unless random=True.
        withhold (bool): if True, the PyMC distribution(s) will be created
            but not added to the prediction equation. This is useful when,
            e.g., yoking the mean of one distribution to the estimated value
            of another distribution, without including the same quantity
            twice.
        plot (bool): if True, plots the resulting design matrix component.
        kwargs: optional keyword arguments passed onto the selected PyMC3
            Distribution.
    '''
    if label is None:
        label = '_'.join(listify(variable))

    # Load design matrix for requested variable
    dm = self._get_variable_data(variable, categorical, label=label,
                                 trend=trend)
    n_cols = dm.shape[1]

    # Handle random effects with nesting/crossing. Basically this splits the
    # design matrix into a separate matrix for each level of split_by,
    # stacked into a 3D array
    if split_by is not None:
        split_dm = self._get_variable_data(split_by, True)
        dm = np.einsum('ab,ac->abc', dm, split_dm)

    # Orthogonalization
    # TODO: generalize this to handle any combination of settings; right
    # now it will only work properly when both the target variable and the
    # covariates are categorical fixed effects.
    if orthogonalize is not None:
        dm = self._orthogonalize(dm, orthogonalize)

    # Scaling and HRF: apply over last dimension. If there is no split_by,
    # add a dummy 3rd dimension so the code below works in the general case
    if dm.ndim == 2:
        dm = dm[..., None]

    for i in range(dm.shape[-1]):

        if scale and scale != 'after':
            dm[..., i] = standardize(dm[..., i])

        if plot:
            self.plot_design_matrix(dm, variable, split_by)

        # Convolve with HRF
        if variable not in ['subject', 'run', 'intercept'] and \
                convolution != 'none':
            if convolution is None:
                convolution = self.convolution
            elif not hasattr(convolution, 'shape'):
                convolution = get_convolution(convolution, conv_kws)

            _convolved = self._convolve(dm[..., i], convolution)
            dm[..., i] = _convolved  # np.squeeze(_convolved)

        if scale == 'after':
            dm[..., i] = standardize(dm[..., i])

    # Remove the dummy 3rd dimension if it was added prior to
    # scaling/convolution
    if dm.shape[-1] == 1:
        dm = dm.reshape(dm.shape[:2])

    with self.model:

        # Random effects
        if random:
            # User can pass sigma specification in sigma_kws.
            # If not provided, default to HalfCauchy with beta = 10.
            if sigma_kws is None:
                sigma_kws = {'dist': 'HalfCauchy', 'beta': 10}

            if split_by is None:
                sigma = self._build_dist('sigma_' + label, **sigma_kws)
                if estimate_random_mean:
                    mu = self._build_dist('b_' + label, dist)
                else:
                    mu = 0.
                u = self._build_dist('u_' + label, dist, mu=mu, sd=sigma,
                                     shape=n_cols, **kwargs)
                self.mu += pm.dot(dm, u)
            else:
                # id_map is essentially a crosstab, except each cell is
                # either 0 or 1
                id_map = self._get_membership_graph(variable, split_by)
                for i in range(id_map.shape[1]):
                    # Select just the factor levels that appear with the
                    # current level of split_by
                    group_items = id_map.iloc[:, i].astype(bool)
                    selected = dm[:, group_items.values, i]
                    # Add the level effects to the model
                    name = '%s_%s' % (label, id_map.columns[i])
                    sigma = self._build_dist('sigma_' + name, **sigma_kws)
                    if yoke_random_mean:
                        mu = self.dists['b_' + split_by][i]
                    elif estimate_random_mean:
                        mu = self._build_dist('b_' + name, dist)
                    else:
                        mu = 0.
                    name, size = 'u_' + name, selected.shape[1]
                    u = self._build_dist(name, dist, mu=mu, sd=sigma,
                                         shape=size, **kwargs)
                    self.mu += pm.dot(selected, u)
                    # Update the level map
                    levels = group_items[group_items].index.tolist()
                    self.level_map[name] = OrderedDict(
                        zip(levels, list(range(size))))

        # Fixed effects
        else:
            b = self._build_dist('b_' + label, dist, shape=dm.shape[-1],
                                 **kwargs)
            if split_by is not None:
                dm = np.squeeze(dm)
            if not withhold:
                self.mu += pm.dot(dm, b)
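# Illustrative call pattern for add_term. Everything here is hypothetical:
# the `model` object and the 'condition'/'subject' variable names are
# assumptions for the sketch, not taken from the original packages.
#
#     model.add_term('intercept')
#     model.add_term('condition', categorical=True, scale='before')
#     model.add_term('subject', categorical=True, random=True)
#     model.add_term('run', label='run_trend', trend=1)  # linear trend per run
#
# Fixed effects produce a 'b_<label>' parameter; random effects produce
# 'u_<label>' terms governed by a 'sigma_<label>' hyperparameter, as built
# in the method body above.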
def predict(self, prediction: np.ndarray):
    train_ozturk = np.sort(self.__beta_reduction(prediction), axis=1)
    detrended = standardize(train_ozturk, axis=1)
    u, v = self.__ozturk_function(detrended)
    return u, v
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# `standardize` (assumed to expose a scikit-learn-style fit_transform /
# transform scaler interface) and `plot_summary_statistics` are project-local
# helpers imported elsewhere in the original script.


def main(plot):
    hungary_csv = 'processed.filled.hungarian.csv'
    swiss_csv = 'processed.filled.switzerland.csv'

    df_hungary = pd.read_csv(hungary_csv)
    df_hungary.columns = [
        'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
        'exang', 'oldpeak', 'num'
    ]
    df_swiss = pd.read_csv(swiss_csv)
    df_swiss.columns = [
        'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
        'exang', 'oldpeak', 'slope', 'num'
    ]

    # Map Swiss dataset labels to binary (0 = no disease, 1 = disease) before
    # extracting the target vector, so the classifiers see a binary problem.
    df_swiss['num'] = df_swiss.num.map({0: 0, 1: 1, 2: 1, 3: 1, 4: 1})

    X_hungary = df_hungary.iloc[:, :-1].values
    Y_hungary = df_hungary.iloc[:, -1].values
    X_swiss = df_swiss.iloc[:, :-1].values
    Y_swiss = df_swiss.iloc[:, -1].values

    # Recode sex as strings for the summary-statistics plots only; this is
    # done after the feature matrices are extracted so the string labels
    # never reach the models.
    df_swiss['sex'] = df_swiss.sex.map({0: 'female', 1: 'male'})
    df_hungary['sex'] = df_hungary.sex.map({0: 'female', 1: 'male'})

    # Create plots for summary statistics of the datasets
    if plot:
        plot_summary_statistics(df_hungary)
        plot_summary_statistics(df_swiss)

    # Split into training and testing sets
    x_train_hung, x_test_hung, y_train_hung, y_test_hung = train_test_split(
        X_hungary, Y_hungary, test_size=0.2, random_state=0,
        stratify=Y_hungary)
    x_train_swiss, x_test_swiss, y_train_swiss, y_test_swiss = train_test_split(
        X_swiss, Y_swiss, test_size=0.2, random_state=0, stratify=Y_swiss)

    # Standardize the features (fit on the training data only)
    standardize_scaler = standardize()
    x_train_hung = standardize_scaler.fit_transform(x_train_hung)
    x_test_hung = standardize_scaler.transform(x_test_hung)
    x_train_swiss = standardize_scaler.fit_transform(x_train_swiss)
    x_test_swiss = standardize_scaler.transform(x_test_swiss)

    # Hyperparameter grids
    criterion = ['gini', 'entropy']
    c = [0.1, 0.5, 1, 5, 10]
    kernel = ('linear', 'poly', 'rbf', 'sigmoid')
    gamma = ('auto', 0.0001, 0.001, 0.01, 1, 'scale')
    degree = (1, 2, 3, 4, 5)
    coef0 = (0, 0.0001, 0.001, 0.01, 1)
    penalty = ['l1', 'l2']
    learning_rate = [0.05, 0.01, 0.3, 0.5]
    max_depth = [3, 5, 7, 10]

    print("Performing gridsearch for optimal SVM hyperparameters...")
    parameters = {
        'C': c,
        'kernel': kernel,
        'degree': degree,
        'coef0': coef0,
        'gamma': gamma
    }

    # Train and predict with SVM
    init_svm = SVC()
    hung_svm_clf = GridSearchCV(init_svm, parameters, cv=3, iid=True)
    hung_svm_clf = hung_svm_clf.fit(x_train_hung, y_train_hung)
    hung_svm_test_pred = hung_svm_clf.predict(x_test_hung)
    print(hung_svm_clf.best_estimator_)
    print("Testing accuracy of SVM over Hungary data: " +
          str(accuracy_score(y_test_hung, hung_svm_test_pred)))
    print("Testing precision of SVM over Hungary data: " +
          str(metrics.precision_score(y_test_hung, hung_svm_test_pred)))
    print("Testing recall of SVM over Hungary data: " +
          str(metrics.recall_score(y_test_hung, hung_svm_test_pred)))
    print("Testing F-measure of SVM over Hungary data: " +
          str(metrics.f1_score(y_test_hung, hung_svm_test_pred)))
    print()

    swiss_svm = SVC()
    swiss_svm_clf = GridSearchCV(swiss_svm, parameters, cv=3, iid=True)
    swiss_svm_clf = swiss_svm_clf.fit(x_train_swiss, y_train_swiss)
    swiss_svm_predictions = swiss_svm_clf.predict(x_test_swiss)
    print("Testing accuracy of SVM over Swiss data: " +
          str(accuracy_score(y_test_swiss, swiss_svm_predictions)))
    print("Testing precision of SVM over Swiss data: " +
          str(metrics.precision_score(y_test_swiss, swiss_svm_predictions)))
    print("Testing recall of SVM over Swiss data: " +
          str(metrics.recall_score(y_test_swiss, swiss_svm_predictions)))
    print("Testing F-measure of SVM over Swiss data: " +
          str(metrics.f1_score(y_test_swiss, swiss_svm_predictions)))
    print()

    # Train and predict with Logistic Regression
    print("Performing gridsearch for optimal Logistic Regression "
          "hyperparameters...")
    parameters = {'C': c, 'penalty': penalty}
    lr_hung = LogisticRegression(solver='liblinear')
    lr_swiss = LogisticRegression(solver='liblinear')

    lr_hung_clf = GridSearchCV(lr_hung, parameters, cv=3, iid=True)
    # Fit Hungary model
    lr_hung_clf.fit(x_train_hung, y_train_hung)
    predictions_lr_hung = lr_hung_clf.predict(x_test_hung)
    print(lr_hung_clf.best_estimator_)
    print("Testing accuracy of Logistic Regression over Hungary data: " +
          str(accuracy_score(y_test_hung, predictions_lr_hung)))
    print("Testing precision of LR over Hungary data: " +
          str(metrics.precision_score(y_test_hung, predictions_lr_hung)))
    print("Testing recall of LR over Hungary data: " +
          str(metrics.recall_score(y_test_hung, predictions_lr_hung)))
    print("Testing F-measure of LR over Hungary data: " +
          str(metrics.f1_score(y_test_hung, predictions_lr_hung)))
    print()

    lr_swiss_clf = GridSearchCV(lr_swiss, parameters, cv=3, iid=True)
    # Fit Swiss model
    lr_swiss_clf.fit(x_train_swiss, y_train_swiss)
    predictions_lr_swiss = lr_swiss_clf.predict(x_test_swiss)
    print("Testing accuracy of Logistic Regression over Swiss data: " +
          str(accuracy_score(y_test_swiss, predictions_lr_swiss)))
    print("Testing precision of Logistic Regression over Swiss data: " +
          str(metrics.precision_score(y_test_swiss, predictions_lr_swiss)))
    print("Testing recall of Logistic Regression over Swiss data: " +
          str(metrics.recall_score(y_test_swiss, predictions_lr_swiss)))
    print("Testing F-measure of Logistic Regression over Swiss data: " +
          str(metrics.f1_score(y_test_swiss, predictions_lr_swiss)))
    print()

    print("Performing gridsearch for optimal Decision Tree hyperparameters...")
    # Train and predict with Decision Tree
    parameters = {'criterion': criterion}
    hung_dt = DecisionTreeClassifier()
    hung_dt_clf = GridSearchCV(hung_dt, parameters, cv=3, iid=True)
    # Fit model
    hung_dt_clf.fit(x_train_hung, y_train_hung)
    prediction_hung_dt = hung_dt_clf.predict(x_test_hung)
    print(hung_dt_clf.best_estimator_)
    print("Testing accuracy of Decision Tree over Hungary data: " +
          str(accuracy_score(y_test_hung, prediction_hung_dt)))
    print("Testing precision of Decision Tree over Hungary data: " +
          str(metrics.precision_score(y_test_hung, prediction_hung_dt)))
    print("Testing recall of Decision Tree over Hungary data: " +
          str(metrics.recall_score(y_test_hung, prediction_hung_dt)))
    print("Testing F-measure of Decision Tree over Hungary data: " +
          str(metrics.f1_score(y_test_hung, prediction_hung_dt)))
    print()

    swiss_dt = DecisionTreeClassifier()
    swiss_dt_clf = GridSearchCV(swiss_dt, parameters, cv=3, iid=True)
    # Fit model
    swiss_dt_clf.fit(x_train_swiss, y_train_swiss)
    prediction_swiss_dt = swiss_dt_clf.predict(x_test_swiss)
    print("Testing accuracy of Decision Tree over Swiss data: " +
          str(accuracy_score(y_test_swiss, prediction_swiss_dt)))
    print("Testing precision of Decision Tree over Swiss data: " +
          str(metrics.precision_score(y_test_swiss, prediction_swiss_dt)))
    print("Testing recall of Decision Tree over Swiss data: " +
          str(metrics.recall_score(y_test_swiss, prediction_swiss_dt)))
    print("Testing F-measure of Decision Tree over Swiss data: " +
          str(metrics.f1_score(y_test_swiss, prediction_swiss_dt)))
    print()

    print("Performing gridsearch for optimal XGBoost hyperparameters...")
    # Train and predict with XGBoost
    parameters = {'learning_rate': learning_rate, 'max_depth': max_depth}
    hung_xg = XGBClassifier()
    hung_xg_clf = GridSearchCV(hung_xg, parameters, cv=3, iid=True)
    hung_xg_clf.fit(x_train_hung, y_train_hung)
    predictions_hung_xg = hung_xg_clf.predict(x_test_hung)
    print(hung_xg_clf.best_estimator_)
    print("Testing accuracy of XGBoost over Hungary Data: " +
          str(accuracy_score(y_test_hung, predictions_hung_xg)))
    print("Testing precision of XGBoost over Hungary Data: " +
          str(metrics.precision_score(y_test_hung, predictions_hung_xg)))
    print("Testing recall of XGBoost over Hungary Data: " +
          str(metrics.recall_score(y_test_hung, predictions_hung_xg)))
    print("Testing F-measure of XGBoost over Hungary Data: " +
          str(metrics.f1_score(y_test_hung, predictions_hung_xg)))
    print()

    swiss_xg = XGBClassifier()
    swiss_xg_clf = GridSearchCV(swiss_xg, parameters, cv=3, iid=True)
    swiss_xg_clf.fit(x_train_swiss, y_train_swiss)
    predictions_swiss_xg = swiss_xg_clf.predict(x_test_swiss)
    print("Testing accuracy of XGBoost over Swiss Data: " +
          str(accuracy_score(y_test_swiss, predictions_swiss_xg)))
    print("Testing precision of XGBoost over Swiss Data: " +
          str(metrics.precision_score(y_test_swiss, predictions_swiss_xg)))
    print("Testing recall of XGBoost over Swiss Data: " +
          str(metrics.recall_score(y_test_swiss, predictions_swiss_xg)))
    print("Testing F-measure of XGBoost over Swiss Data: " +
          str(metrics.f1_score(y_test_swiss, predictions_swiss_xg)))
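# The evaluation blocks above repeat the same accuracy/precision/recall/F1
# printout for every model/dataset pair. A small helper along the lines of
# the sketch below could replace each block; `report_metrics` is hypothetical
# and not part of the original script.
from sklearn import metrics


def report_metrics(model_name, dataset_name, y_true, y_pred):
    # Print the four test-set metrics used throughout main() for one model.
    for metric_name, fn in [("accuracy", metrics.accuracy_score),
                            ("precision", metrics.precision_score),
                            ("recall", metrics.recall_score),
                            ("F-measure", metrics.f1_score)]:
        print("Testing %s of %s over %s data: %s"
              % (metric_name, model_name, dataset_name, fn(y_true, y_pred)))
    print()

# Example: report_metrics('XGBoost', 'Swiss', y_test_swiss, predictions_swiss_xg)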
# Assumed context (not shown in the original snippet): `h5` is an open HDF5
# file handle, `idx` is an index array into it, `gen_d` maps label indices to
# display names, and `standardize` is a project-local helper. The imports
# below are assumptions added so the snippet is self-contained.
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import normalize

labels = np.array(h5["labels"])[idx]
mask = (labels.sum(axis=1) == 1)  # keep only single-label samples
labels = labels[mask].astype(int)
n_lab = labels.shape[1]
labels = (labels * np.arange(n_lab)[None, :]).sum(axis=1)  # one-hot -> index
l_set = np.unique(labels)

cmap = plt.cm.gist_ncar
bounds = np.linspace(0, len(l_set), len(l_set) + 1)
ticks = [gen_d[i] for i in range(n_lab)]
norm = mpl.colors.BoundaryNorm(bounds, cmap.N)

# FOR LOOP HERE
print(" -> loading data")
feat = np.array(h5["res50_avg"])[idx][mask]
print(" -> standardizing data")
normalize(feat, copy=False)
feat = standardize(feat)
print(" -> PCA-ing data")
feat = PCA(50).fit_transform(feat)
print(" -> TSNE-ing")
feat = TSNE(2).fit_transform(feat)

scat = plt.scatter(feat[:, 0], feat[:, 1], c=labels, cmap=cmap, norm=norm)
cb = plt.colorbar(scat, spacing='proportional', ticks=bounds)
cb.ax.set_yticklabels(ticks)  # weird, labels end up between colors...
plt.show()
def __oa_hidden(self, distribution_sequence):
    train_ozturk = np.sort(self.__beta_reduction(distribution_sequence), axis=1)
    detrended = standardize(train_ozturk, axis=1)
    u, v = self.__ozturk_function(detrended)
    return u, v