def _one_fit(X, y, group, train_index, test_index, estimator, scale_function=None):
    """
    Fit and score the estimator on a single cross-validation fold.

    The scaler is fitted on the training rows only and then applied to the
    test rows. The estimator is fitted in place on the scaled training data.

    param:
        X, y, group: indexable with train_index / test_index
            Features, class labels and group labels of all samples.
        train_index, test_index: indices of the training / test samples
        estimator: object with fit and predict methods
        scale_function: passed to scalingClass as `scaling`
    return:
        score: result of accuracy_score on the test samples
        score_maj: result of score_majority_vote on the test samples
    """
    # Scale: fit on the training rows, apply to the test rows
    normalizer = scalingClass(scaling=scale_function)
    train_features = normalizer.fit_transform(X[train_index])
    test_features = normalizer.transform(X[test_index])

    # Train the classifier on this fold
    estimator.fit(train_features, y[train_index])

    # Predict labels and, when the estimator offers them, per-class scores
    predictions = estimator.predict(test_features)
    fold_probas = None
    if hasattr(estimator, 'predict_proba'):
        fold_probas = estimator.predict_proba(test_features)
    elif hasattr(estimator, 'decision_function'):
        fold_probas = estimator.decision_function(test_features)
    elif hasattr(estimator, 'oob_decision_function') and estimator.oob_score:
        # NOTE(review): this treats oob_decision_function as a callable -
        # confirm against the estimators actually used here.
        fold_probas = estimator.oob_decision_function(test_features)

    true_labels = y[test_index]
    accuracy = accuracy_score(true_labels, predictions)
    # NOTE(review): `sum_rule` is referenced as a bare name, so it has to be
    # defined at module level - confirm that a string was not intended.
    majority_accuracy = score_majority_vote(
        true_labels, group[test_index],
        y_pred=predictions, probas=fold_probas,
        labels=estimator.classes_, vote_type=sum_rule)

    return accuracy, majority_accuracy
def _one_fit(X, y, group, train_index, test_index, estimator, sample_weight, scale_function=None):
    """
    Fit the estimator on one cross-validation fold and predict the test rows.

    The scaler is fitted on the training rows only and then applied to the
    test rows. The estimator is fitted in place.

    param:
        X, y: indexable with train_index / test_index
            Features and class labels of all samples.
        group: not used in this function (kept for a uniform signature)
        train_index, test_index: indices of the training / test samples
        estimator: object with fit and predict methods
        sample_weight: None or indexable with train_index
            When not None, the training subset is passed to
            estimator.fit as `sample_weight`.
        scale_function: passed to scalingClass as `scaling`
    return:
        test_index: the test indices, unchanged
        y_pred: predicted labels for the test samples
        y_probas: output of predict_proba / decision_function /
            oob_decision_function (first one available), or None
        estimator: the fitted estimator
    raise:
        AssertionError: if the classes seen by the estimator differ from
            the unique values of y.
    """
    # Normalize
    scaler = scalingClass(scaling=scale_function)
    X_train = scaler.fit_transform(X[train_index])
    X_test = scaler.transform(X[test_index])

    # Train classifier
    if sample_weight is None:
        estimator.fit(X_train, y[train_index])
    else:
        estimator.fit(X_train, y[train_index],
                      sample_weight=sample_weight[train_index])

    # Predict
    y_pred = estimator.predict(X_test)

    # np.array_equal copes with arrays of different length, which is exactly
    # the situation this check is meant to report (an elementwise `==` would
    # fail before the message could be shown). The explicit raise keeps the
    # check active under `python -O`.
    if not np.array_equal(estimator.classes_, np.unique(y)):
        raise AssertionError('Not all classes are represented in the folds.')

    if hasattr(estimator, 'predict_proba'):
        y_probas = estimator.predict_proba(X_test)
    elif hasattr(estimator, 'decision_function'):
        y_probas = estimator.decision_function(X_test)
    elif hasattr(estimator, 'oob_decision_function') and estimator.oob_score:
        # NOTE(review): this treats oob_decision_function as a callable -
        # confirm against the estimators actually used here.
        y_probas = estimator.oob_decision_function(X_test)
    else:
        y_probas = None

    return test_index, y_pred, y_probas, estimator
def _majority_vote_CV(X, y, group, estimator, splitter, scale_function=None):
    """
    Cross-validate the estimator and score every fold twice: with
    accuracy_score and with score_majority_vote.

    param:
        X, y, group: array-like
            Features, class labels and group labels; converted to numpy
            arrays before splitting.
        estimator: object with fit and predict methods (fitted in place,
            once per fold)
        splitter: object with a split(X, y, group) method yielding
            (train_index, test_index) pairs
        scale_function: passed to scalingClass as `scaling`
    return:
        scores: list with the accuracy_score result of each fold
        scores_maj: list with the score_majority_vote result of each fold
    """
    ## Majority vote
    #---------------
    X = np.array(X)
    y = np.array(y)
    group = np.array(group)

    scores = []
    scores_maj = []
    for train_index, test_index in splitter.split(X, y, group):
        # Normalize (fit on the training rows only)
        scaler = scalingClass(scaling=scale_function)
        X_train = scaler.fit_transform(X[train_index])
        X_test = scaler.transform(X[test_index])

        # Train classifier
        estimator.fit(X_train, y[train_index])

        # Predict
        y_pred = estimator.predict(X_test)

        scores.append(accuracy_score(y[test_index], y_pred))
        # y_pred is an argument of score_majority_vote, not of list.append
        scores_maj.append(
            score_majority_vote(y[test_index], group[test_index],
                                y_pred=y_pred))

    return scores, scores_maj
def cv_predict_single(X, y, splitter, estimator, group=None, scale_function=None, sample_weight=None):
    """
    Collect out-of-fold predictions of the estimator over the splits
    produced by the splitter.

    param:
        X, y: array-like
            Features and class labels; converted to numpy arrays.
        splitter: object with a split(X, y, group) method yielding
            (train_index, test_index) pairs
        estimator: object with fit and predict methods (fitted in place,
            once per fold)
        group: passed through to splitter.split
        scale_function: passed to scalingClass as `scaling`
        sample_weight: None or array-like
            When given, the training subset is passed to estimator.fit.
    return:
        pred: array shaped like y with the prediction of each sample, filled
            in at the test indices of every fold
        probas: array of shape (n_samples, n_labels) filled in the same way
            from predict_proba / decision_function / oob_decision_function
            (first one available), or None if none is available
        labels: the unique values of y
        test_folds: list with the test indices of every fold
        trained_estimators: list with one fitted estimator per fold
    raise:
        AssertionError: if the classes seen by the estimator in a fold
            differ from the unique values of y.
    """
    from copy import deepcopy

    labels = np.unique(y)
    X = np.array(X)
    y = np.array(y)
    if sample_weight is not None:
        sample_weight = np.array(sample_weight)

    pred = np.empty_like(y)
    probas = np.empty((X.shape[0], labels.shape[0]))

    test_folds = []
    trained_estimators = []
    for train_index, test_index in splitter.split(X, y, group):
        test_folds.append(test_index)

        # Normalize (fit on the training rows only)
        scaler = scalingClass(scaling=scale_function)
        X_train = scaler.fit_transform(X[train_index])
        X_test = scaler.transform(X[test_index])

        # Train classifier
        if sample_weight is not None:
            estimator.fit(X_train, y[train_index], sample_weight[train_index])
        else:
            estimator.fit(X_train, y[train_index])
        # Store a snapshot: the same estimator object is refitted on every
        # fold, so appending it directly would leave the list holding
        # n_folds references to the last fit only.
        trained_estimators.append(deepcopy(estimator))

        # Predict
        pred[test_index] = estimator.predict(X_test)

        # np.array_equal copes with arrays of different length, which is
        # exactly the situation this check is meant to report. The explicit
        # raise keeps the check active under `python -O`.
        if not np.array_equal(estimator.classes_, labels):
            raise AssertionError(
                'Not all classes are represented in the folds.')

        # NOTE(review): the assignments below assume the estimator returns
        # one column per label - confirm for decision_function outputs.
        if hasattr(estimator, 'predict_proba'):
            probas[test_index] = estimator.predict_proba(X_test)
        elif hasattr(estimator, 'decision_function'):
            probas[test_index] = estimator.decision_function(X_test)
        elif hasattr(estimator, 'oob_decision_function') and estimator.oob_score:
            probas[test_index] = estimator.oob_decision_function(X_test)
        else:
            probas = None

    return pred, probas, labels, test_folds, trained_estimators
def scaler(self, scalingtype):
    """
    Turn a scaling specification into a scaler object.

    param:
        scalingtype: str, None or object with a `fit` attribute
            A string or None is wrapped in scalingClass(scaling=...).
            An object that has a `fit` attribute is returned as it is.
    return:
        The scaler instance.
    raise:
        ValueError: for any other kind of input.
    """
    # Strings and None are delegated to scalingClass
    if scalingtype is None or isinstance(scalingtype, str):
        return scalingClass(scaling=scalingtype)

    # Anything else must already look like a scaler
    if not hasattr(scalingtype, 'fit'):
        raise ValueError(
            'Scaling parameter type not recognised. Valid parameter types '
            'include strings \'minmax_scale\', \'standardize\' and \'normalize\' '
            'and instances of scaling classes with a fit() method.')

    return scalingtype
# Scorers and the names used to label their results
scorer = ['accuracy', ClassifScorer(scorer='f1', average='macro')]
scorenames = ['accuracy', 'f1']

# Feature set
root_classif = Path(ANALYSIS_DIR) / 'classification' / 'results'
path_to_set = root_classif / 'feature_selection_results' / 'best_feature_set.csv'
feat_set = pd.read_csv(path_to_set, header=None)[0].to_numpy()

# Base estimator pipeline
# (estimator used to classify tierpsy feature vectors to modes of action)
path_to_params = root_classif / 'optimize_hyperparameters_results' / 'best_params.p'
# NOTE: pickle can execute arbitrary code while loading; only load parameter
# files that were produced by this project's own pipeline.
# The context manager closes the file handle as soon as loading is done.
with open(path_to_params, 'rb') as fid:
    params = pickle.load(fid)

estimator = LogisticRegression(**params)
pipe = Pipeline([('scaler', scalingClass(scaling=scale)), ('estimator', estimator)])

# Ensemble of binary estimators pipeline
# (estimators used to classify compounds in known-novel based on theta scores)
svc = SVC(C=1, kernel='linear', class_weight='balanced', gamma='auto')
binary_pipe = Pipeline([('scaler', StandardScaler()), ('svc', svc)])

# Results go to a `results` folder under the current working directory
saveto = Path().cwd() / 'results'
saveto.mkdir(parents=True, exist_ok=True)

#%% Read data
feat = pd.read_csv(data_file, usecols=feat_set)
y = pd.read_csv(meta_file, usecols=['MOA_group']).values.flatten()
group = pd.read_csv(meta_file, usecols=['drug_type']).values.flatten()
def k_significant_from_classifier(
        feat, y_class, estimator, k=5, scale=None,
        feat_names=None, k_to_plot=None, close_after_plotting=False,
        saveto=None, figsize=None, title=None, xlabel=None):
    """
    Finds the k most important features according to a fitted classifier.

    The estimator is fitted (in place) on the, optionally scaled, feature
    matrix and the features are ranked by the L1 norm of their coefficients
    (coef_) or by feature_importances_.

    param:
        feat: array-like, shape=(n_samples, n_features)
            The feature matrix
        y_class: array-like, shape=(n_samples)
            Vector with the class of each sample
        estimator: object
            A supervised learning estimator with a fit method that provides
            information about feature importance either through a coef_
            attribute or through a feature_importances_ attribute.
        k: integer or 'all', optional
            Number of features to select
            Default is 5.
        scale: None, str or function, optional
            If a string, the
            tierpsytools.preprocessing.scaling_class.scalingClass is used
            to scale the features (scaling=scale). Otherwise the user can
            input a function that scales features.
            Default is None (no scaling).
        feat_names: list shape=(n_features)
            The names of the features, when feat is not a dataframe
            (will be used for plotting)
        k_to_plot: None or integer, optional
            If not None, boxplots of the top k_to_plot features are plotted
            with plot_feature_boxplots. Default is None (no plot).
        close_after_plotting, saveto, figsize, xlabel:
            Passed to plot_feature_boxplots.
        title:
            Currently not used.

    return:
        top_feat: list, shape=(k,)
            The names or indexes (if feature names are not given) of the
            top features, sorted by importance.
        scores: array-like, shape=(k,)
            The scores of the top features, sorted by importance.
        support: array of booleans, shape=(n_features,)
            True for the selected features, False for the rest

    raise:
        ValueError: if k is a string other than 'all', or if the fitted
            estimator has neither coef_ nor feature_importances_.
    """
    plot = k_to_plot is not None

    if not isinstance(feat, pd.DataFrame):
        feat = pd.DataFrame(feat, columns=feat_names)

    if isinstance(k, str):
        if k == 'all':
            k = feat.shape[1]
        else:
            raise ValueError('Data type for k not recognized.')

    if scale is None:
        feat_scaled = feat
    elif isinstance(scale, str):
        # Imported here so that the package is needed only when it is used
        from tierpsytools.preprocessing.scaling_class import scalingClass
        scaler = scalingClass(scaling=scale)
        feat_scaled = scaler.fit_transform(feat)
    else:
        feat_scaled = scale(feat)

    estimator.fit(feat_scaled, y_class)

    if hasattr(estimator, 'coef_'):
        # atleast_2d: a 1-D coef_ is treated as a single row of coefficients,
        # so that the norm is still taken per feature
        scores = np.linalg.norm(
            np.atleast_2d(estimator.coef_), axis=0, ord=1)
    elif hasattr(estimator, 'feature_importances_'):
        scores = np.asarray(estimator.feature_importances_)
    else:
        raise ValueError(
            'The chosen estimator does not have a coef_ attribute' +
            ' or a feature_importances_ attribute.')

    # Indexes of the k highest scores, in descending order of score
    top_ft_ids = np.flip(np.argsort(scores))[:k]
    support = np.zeros(feat.shape[1]).astype(bool)
    support[top_ft_ids] = True
    scores = scores[top_ft_ids]

    # Plot a boxplot for each feature, showing its distribution in each class
    if plot:
        plot_feature_boxplots(
            feat.iloc[:, top_ft_ids[:k_to_plot]], y_class, scores,
            figsize=figsize, saveto=saveto, xlabel=xlabel,
            close_after_plotting=close_after_plotting)

    top_feat = feat.columns[top_ft_ids].to_list()

    return top_feat, scores, support
def k_significant_feat(
        feat, y_class, k=5, score_func='f_classif', scale=None,
        feat_names=None, plot=True, k_to_plot=None, close_after_plotting=False,
        saveto=None, figsize=None, title=None, xlabel=None):
    """
    Finds the k most significant features in the feature matrix, based on
    how well they separate the data in groups defined in y_class. It uses
    univariate statistical tests (the type of test is specified in the variable
    score_func).

    Features with zero standard deviation are dropped before the selection.

    param:
        feat: array-like, shape=(n_samples, n_features)
            The feature matrix
        y_class: array-like, shape=(n_samples)
            Vector with the class of each sample
        k: integer or 'all'
            Number of features to select
        score_func: str or function, optional
            If string 'f_classif', 'chi2', 'mutual_info_classif' then the
            function f_classif, chi2 or mutual_info_classif
            from sklearn.feature_selection will be used.
            Otherwise, the user needs to input a function that takes two
            arrays X and y, and returns a pair of arrays (scores, pvalues)
            or a single array with scores.
            Default is 'f_classif'.
        scale: None, str or function, optional
            If a string, the
            tierpsytools.preprocessing.scaling_class.scalingClass is used
            to scale the features (scaling=scale). Otherwise the user can
            input a function that scales features.
            Default is None (no scaling).
        feat_names: list shape=(n_features)
            The names of the features, when feat is an array and not a
            dataframe (will be used for plotting)
        plot: boolean
            If True, the boxplots of the chosen features will be plotted
        k_to_plot: None or integer, optional
            Number of top features to plot. When plot is True and k_to_plot
            is None, all the k selected features are plotted.
        close_after_plotting, saveto, figsize, xlabel:
            Passed to plot_feature_boxplots.
        title:
            Currently not used.

    return:
        top_feat: list
            The names of the top features, sorted by score (highest first).
        scores or (scores, pvalues):
            The scores of the top features; together with their p-values
            when the selector exposes a pvalues_ attribute.
        support: array of booleans
            True for the selected features, False for the rest. It refers
            to the features that remain after the zero-variance features
            have been dropped.

    raise:
        Exception: if k is a string other than 'all'.
        ValueError: if score_func is a string that is not recognized.
    """
    from sklearn.feature_selection import \
        SelectKBest, chi2, f_classif, mutual_info_classif

    if isinstance(feat, np.ndarray):
        feat = pd.DataFrame(feat, columns=feat_names)
    feat = feat.loc[:, feat.std() != 0]

    if isinstance(k, str):
        if k == 'all':
            k = feat.shape[1]
        else:
            raise Exception('Data type for k not recognized.')

    # Default k_to_plot only after k has been resolved to an integer;
    # otherwise k='all' would end up being used as a slice bound below.
    if plot and k_to_plot is None:
        k_to_plot = k

    # Find most significant features
    if isinstance(score_func, str):
        known_score_funcs = {
            'f_classif': f_classif,
            'chi2': chi2,
            'mutual_info_classif': mutual_info_classif,
            }
        if score_func not in known_score_funcs:
            raise ValueError(
                'score_func not recognized. Valid strings are ' +
                '\'f_classif\', \'chi2\' and \'mutual_info_classif\'.')
        score_func = known_score_funcs[score_func]

    if scale is None:
        feat_scaled = feat
    elif isinstance(scale, str):
        # Imported here so that the package is needed only when it is used
        from tierpsytools.preprocessing.scaling_class import scalingClass
        scaler = scalingClass(scaling=scale)
        feat_scaled = scaler.fit_transform(feat)
    else:
        feat_scaled = scale(feat)

    skb = SelectKBest(score_func=score_func, k=k)
    skb.fit(feat_scaled, y_class)

    support = skb.get_support()
    # Rank the features by score (highest first), leaving out nan scores
    sorted_scores = np.sort(skb.scores_)
    ids_sorted_scores = np.argsort(skb.scores_)
    top_ft_ids = np.flip(ids_sorted_scores[~np.isnan(sorted_scores)])[:k]
    scores = skb.scores_[top_ft_ids]
    if hasattr(skb, 'pvalues_'):
        pvalues = skb.pvalues_[top_ft_ids]
    else:
        pvalues = None

    # Plot a boxplot for each feature, showing its distribution in each class
    if plot:
        plot_feature_boxplots(
            feat.iloc[:, top_ft_ids[:k_to_plot]], y_class, scores,
            pvalues=pvalues, figsize=figsize, saveto=saveto, xlabel=xlabel,
            close_after_plotting=close_after_plotting)

    if pvalues is not None:
        return feat.columns[top_ft_ids].to_list(), (scores, pvalues), support
    else:
        return feat.columns[top_ft_ids].to_list(), scores, support
from sklearn.linear_model import LogisticRegression
from tierpsytools.analysis.cv_splitters import StratifiedGroupKFold
from tierpsytools.analysis.scorers import ClassifScorer
from feat_selection import main_feature_selection
from optimize_hyperparameters import main_optimize_hyperparameters
from predict_test_set import main_predict_test_set
from moaclassification import INPUT_DIR
import pdb

#%%
# Input parameters
align_blue = True
balance_classes = True
n_average = 20
# The same scaling option is used to build the scaler of the pipeline below
scale = 'rescale1'
# NOTE(review): scalingClass and Pipeline are not imported in the lines
# shown here - presumably they are imported further up in the file; confirm.
scaler = scalingClass(scaling=scale)

# Estimator parameters
estimator = LogisticRegression(penalty='elasticnet', l1_ratio=0.5, C=1,
                               solver='saga', multi_class='multinomial',
                               max_iter=500)
# Scaling is a pipeline step, followed by the estimator
pipe = Pipeline([('scaler', scaler), ('estimator', estimator)])

# CV parameters
n_folds = 4
cv = StratifiedGroupKFold(n_splits=n_folds, random_seed=724)
vote_type = 'counts'
feat, meta = filter_n_skeletons(feat, meta, min_nskel_per_video=2000, min_nskel_sum=None) # Filter rows based on percentage of nan values feat = filter_nan_inf(feat, 0.2, 1) meta = meta.loc[feat.index] # Filter features based on percentage of nan values feat = filter_nan_inf(feat, 0.05, 0) #%% Preprocess data # Impute the remainig nan values feat = impute_nan_inf(feat, groupby=None) # Scale the features (necessary if tou want to do PCA) scaler = scalingClass(scaling='standardize') feat = scaler.fit_transform(feat) #%% Check day-to-day variation # Get the PCA decomposition pca = PCA(n_components=2) Y = pca.fit_transform(feat) # Plot the samples of each day in the first two PCs plt.figure() for day in meta['date_yyyymmdd'].unique(): plt.scatter(*Y[meta['date_yyyymmdd'] == day, :].T, label=day) plt.legend()
#%% Keep only the moas of interest # mask = meta['drug_type'].isin(DRUGS) # meta = meta[mask] # feat = feat[mask] # make mapper to get drug names meta['name_'] = meta[['MOA_group', 'drug_name' ]].apply(lambda x: ' - '.join(map(str, x.values)), axis=1) namemapper = dict(meta[['drug_type', 'name_']].values) moamapper = dict(meta[['MOA_group', 'MOA_general' ]].drop_duplicates(subset=['MOA_group']).values) #%% Scale features scaler = scalingClass() feat = scaler.fit_transform(feat) sfeat = scaler.fit_transform(sfeat) #%% PCA colored by moa pca = PCA(n_components=3) Y = pca.fit_transform(feat) sY = pca.transform(sfeat) fig = plt.figure(figsize=(6, 6)) ax = fig.add_subplot(111, projection='3d') ax.set_title('original data') # ax.set_aspect('equal') for moa, c in zip(moas_to_plot, colors): mask = meta['MOA_group'].isin([moa]).values