def find_best_features_extratree(self, participants, X, calibrations, y,
                                 **kwargs):
    """Rank features by their importance in an extra-trees classifier.

    Args:
        participants: Forwarded to ``self.load_all_features``.
        X: Sample frame; reduced to the columns in ``self.add_features``.
        calibrations: Forwarded to ``self.load_all_features``.
        y: Frame holding the columns named by ``utils.target_fields()``.
            It is only read; the caller's frame is left untouched.
        **kwargs:
            fields: Passed as ``include`` when features are loaded.
                Defaults to all body plus body-orientation fields.
            load_features: When true, ``X`` is first replaced by the result
                of ``self.load_all_features``. Defaults to ``False``.

    Returns:
        A ``pd.Series`` of ``feature_importances_`` indexed by column name,
        sorted in ascending order.
    """
    from sklearn import ensemble, preprocessing

    default_fields = (utils.all_body_fields()
                      + utils.all_body_orientation_fields())
    fields = kwargs.get('fields', default_fields)
    load_features = kwargs.get('load_features', False)

    if load_features:
        X = self.load_all_features(participants, X, calibrations, y,
                                   include=fields)
    # NOTE(review): the column selection is applied whether or not features
    # were loaded (same as compute_pca) - confirm this is the intended scope.
    X = X[self.add_features]
    columns = copy.deepcopy(X.columns)

    # Treat +/-inf like NaN so both get replaced by the column mean.
    with pd.option_context('mode.use_inf_as_na', True):
        X = X.fillna(X.mean())

    # Every distinct combination of target values becomes one class label.
    # The labels live in a local Series so that no 'target' column is
    # written into the frame owned by the caller.
    target = y[utils.target_fields()]\
        .apply(lambda xs: str(tuple(xs)), axis=1)
    encoded_target = preprocessing.LabelEncoder().fit_transform(target)

    X = StandardScaler().fit_transform(X)

    model = ensemble.ExtraTreesClassifier()
    model.fit(X, encoded_target)

    feat_importances = pd.Series(model.feature_importances_, index=columns)
    return feat_importances.sort_values()
def export(filename="export", **kwargs):
    """Export the last sample of every collection to ``<filename>.csv``.

    The data comes from ``log(**kwargs)``. For each ``(pid, cid)`` group the
    final row of ``X`` is kept, joined with the participant table, merged
    with the targets and written with ``collectionId`` as index, sorted by
    the target fields.

    Args:
        filename: Output path without the ``.csv`` suffix.
        **kwargs: Forwarded unchanged to ``log``.
    """
    participants, X, y = log(**kwargs)

    # Final recorded row per (participant, collection) pair.
    last_rows = X.groupby(['pid', 'cid']).tail(1).reset_index()

    # Attach participant data; the outer join keeps rows from both sides.
    with_participants = last_rows.set_index("pid")
    with_participants = with_participants.join(participants, how='outer')
    with_participants = with_participants.reset_index()
    with_participants = with_participants.rename(
        columns={'level_0': 'participantId'})

    # Attach the targets belonging to each participant/collection.
    merged = with_participants.merge(
        y.reset_index(),
        left_on=['participantId', 'cid'],
        right_on=['pid', 'id'],
    )

    cleaned = merged.drop(columns=['id_x', 'id_y', 'pid'])
    cleaned = cleaned.rename(columns={'cid': 'collectionId'})
    cleaned = cleaned.set_index('collectionId')
    cleaned = cleaned.sort_values(by=utils.target_fields())

    exported_columns = (
        ['participantId']
        + utils.target_fields()
        + utils.participant_fields()
        + utils.all_body_fields()
        + utils.all_body_orientation_fields()
    )
    cleaned[exported_columns].to_csv('%s.csv' % filename)
    print("exported to %s.csv" % filename)
def compute_pairplot(self, **kwargs):
    """Prepare the frame used for pair plots and store it on the instance.

    The preprocessed samples get a ``'target'`` column holding the tuple of
    all target values per row; the individual target columns are dropped
    from the stored result (``self.pairplot_data``).

    Args:
        **kwargs:
            base_fields: Fields to plot. Defaults to all body fields.
            additional_fields: Extra fields appended after the target
                fields. Defaults to an empty list.
    """
    base_fields = kwargs.get('base_fields', utils.all_body_fields())
    extra_fields = kwargs.get('additional_fields', [])
    requested = base_fields + utils.target_fields() + extra_fields

    _, X, _, _ = self.preprocess(features=requested)

    # Collapse the target columns into a single tuple-valued column.
    target_values = X[utils.target_fields()]
    X['target'] = target_values.apply(lambda row: tuple(row), axis=1)

    self.pairplot_data = X.drop(columns=utils.target_fields())
def compute_correlation_matrix(self, **kwargs):
    """Compute the correlation matrix of the preprocessed samples.

    The result of ``X.corr`` is stored in ``self.correlation_matrix``.

    Args:
        **kwargs: All keyword arguments are forwarded to ``self.preprocess``.
            The following ones are also interpreted here:

            base_fields: Fields to correlate. Defaults to all body fields.
            additional_fields: Fields appended to ``base_fields``.
                Defaults to an empty list.
            corr_method: Method handed to ``DataFrame.corr``.
                Defaults to ``'pearson'``.
            include: Passed through to ``self.preprocess``; falls back to
                ``base_fields`` when absent.
    """
    additional_fields = kwargs.get('additional_fields', [])
    method = kwargs.get('corr_method', 'pearson')

    # Resolve the default lazily: an explicit base_fields needs no lookup.
    base_fields = kwargs.get('base_fields')
    if base_fields is None:
        base_fields = utils.all_body_fields()

    preprocess_kwargs = dict(kwargs)
    preprocess_kwargs['features'] = base_fields + additional_fields
    # A caller-supplied 'include' used to collide with the keyword passed
    # here and raised "got multiple values for keyword argument". Only fill
    # it in when the caller did not provide one.
    preprocess_kwargs.setdefault('include', base_fields)

    _, X, _, _ = self.preprocess(**preprocess_kwargs)
    self.correlation_matrix = X.corr(method=method)
def plot_pca_coefficients(self, **kwargs):
    """Fit a PCA and plot its component coefficients.

    The instance is reset first. Unless overridden, the PCA runs on all
    body fields, body-orientation fields and features, with
    ``n_components`` set to ``.99``.

    Args:
        **kwargs: Forwarded to ``self.compute_pca`` and, layered on top of
            the ``'pca_coefficients'`` plot configuration, to
            ``plotting.plot_pca_coefficients``.
    """
    self.reset()

    default_fields = (utils.all_body_fields()
                      + utils.all_body_orientation_fields()
                      + utils.all_features())
    kwargs.setdefault('base_fields', default_fields)
    kwargs.setdefault('n_components', .99)

    pca, field_names = self.compute_pca(**kwargs)
    components = pd.DataFrame(pca.components_, columns=field_names)

    # Explicit keyword arguments win over the stored plot configuration.
    plot_kwargs = {**self.plotConfig.getConfig('pca_coefficients'), **kwargs}
    plotting.plot_pca_coefficients(components, **plot_kwargs)
def automatic_features(self, participants, X, calibrations, y, model,
                       **kwargs):
    """Generate features with featuretools and cross-validate a model.

    A ``'cid_pid'`` column is added to ``X`` (in place) and used as index of
    the ``'collections'`` entity. Deep feature synthesis is run on that
    entity, missing values are filled with the column median, and the
    resulting matrix is handed to ``model``.

    Args:
        participants: Not used by this method.
        X: Sample frame; must provide ``'cid'`` and ``'pid'`` columns.
        calibrations: Not used by this method.
        y: Targets, passed to ``model`` unchanged.
        model: Callable invoked as ``model(feature_matrix, y)``; its result
            must offer ``better_kfold_cross_validation()``.
        **kwargs: Not used by this method.
    """
    X['cid_pid'] = X.apply(
        lambda row: "%s_%s" % (row['cid'], row['pid']), axis=1)

    numeric = ft.variable_types.Numeric
    identifier = ft.variable_types.Id

    body_types = dict.fromkeys(utils.all_body_fields(), numeric)
    # Built alongside the body types but currently not merged into the
    # variable types below.
    target_types = dict.fromkeys(utils.target_fields(), identifier)

    variable_types = {
        'cid_pid': ft.variable_types.Index,
        'id': identifier,
        'cid': identifier,
        'pid': identifier,
        'time': numeric,
    }
    variable_types.update(body_types)

    es = ft.EntitySet(id="pointing_movement")
    es = es.entity_from_dataframe(entity_id='collections',
                                  dataframe=X,
                                  index='cid_pid',
                                  variable_types=variable_types)

    # Split every field of the selected body parts into its own entity.
    selected = ['hmd', 'indexfinger']
    split_fields = utils.flatten(utils.body_field(part) for part in selected)
    for field in split_fields:
        es = es.normalize_entity(base_entity_id='collections',
                                 new_entity_id=field,
                                 index=field)

    feature_matrix, _ = ft.dfs(entityset=es,
                               entities=es,
                               target_entity="collections",
                               verbose=1)
    feature_matrix = feature_matrix.fillna(feature_matrix.median())

    modl = model(feature_matrix, y)
    modl.better_kfold_cross_validation()
def plot_extratrees(self, **kwargs):
    """Plot the extra-trees feature importances.

    Args:
        **kwargs:
            fields: Fields to evaluate. Defaults to all body fields,
                body-orientation fields and features.
            exclude_features: Passed to ``self.preprocess``. Defaults to
                an empty list.

            All keyword arguments are also forwarded to
            ``self.find_best_features_extratree`` and, layered on top of the
            ``'extratrees'`` plot configuration, to
            ``plotting.plot_extratrees``.
    """
    default_fields = (utils.all_body_fields()
                      + utils.all_body_orientation_fields()
                      + utils.all_features())
    kwargs.setdefault('fields', default_fields)
    excluded = kwargs.get('exclude_features', [])

    p, X, c, y = self.preprocess(features=kwargs['fields'],
                                 exclude_features=excluded)
    importances = self.find_best_features_extratree(p, X, c, y, **kwargs)

    # Explicit keyword arguments win over the stored plot configuration.
    plot_kwargs = {**self.plotConfig.getConfig('extratrees'), **kwargs}
    plotting.plot_extratrees(importances, **plot_kwargs)
def plot_selectKBest_chi2(self, **kwargs):
    """Plot the SelectKBest ranking; nothing is drawn for an empty ranking.

    Args:
        **kwargs:
            fields: Fields to evaluate. Defaults to all body fields,
                body-orientation fields and features.
            exclude_features: Passed to ``self.preprocess``. Defaults to
                an empty list.

            All keyword arguments are also forwarded to
            ``self.find_best_features`` and, layered on top of the
            ``'selectKBest'`` plot configuration, to
            ``plotting.plot_selectKBest_chi2``.
    """
    default_fields = (utils.all_body_fields()
                      + utils.all_body_orientation_fields()
                      + utils.all_features())
    kwargs.setdefault('fields', default_fields)
    excluded = kwargs.get('exclude_features', [])

    p, X, c, y = self.preprocess(features=kwargs['fields'],
                                 exclude_features=excluded)
    ranking = self.find_best_features(p, X, c, y, **kwargs)

    if len(ranking) > 0:
        # Explicit keyword arguments win over the stored configuration.
        plot_kwargs = {**self.plotConfig.getConfig('selectKBest'), **kwargs}
        plotting.plot_selectKBest_chi2(ranking, **plot_kwargs)
def export_SelectKBest_chi2(point_model, path):
    """Write the first ten rows of the SelectKBest ranking as a LaTeX table.

    Args:
        point_model: Model providing ``normalized`` and
            ``find_best_features``.
        path: Base directory (with trailing separator); the table is written
            to ``content/pointing_movement/import/table_chi2.tex`` below it.
    """
    top_k = 10
    out_dir = path + 'content/pointing_movement/import/'

    p, X, c, y = point_model.normalized
    all_fields = (utils.all_body_fields()
                  + utils.all_body_orientation_fields()
                  + utils.all_features())

    ranking = point_model.find_best_features(
        p, X, c, y, fields=all_fields, load_features=True)
    best = ranking.head(top_k)

    latex = pack_table(
        best.to_latex(escape=False, index=False),
        '$\\chi^2$ scores of the top %d features' % top_k,
        'tab:pointing_movement:chi2'
    )

    out_file = out_dir + 'table_chi2.tex'
    with open(out_file, 'w') as f:
        f.write(latex)
    print('exported SelectKBest chi2 latex table.')
def export_pca_components(point_model, path):
    """Write the summed absolute PCA coefficients per field as LaTeX table.

    The model is reset, a PCA is computed over all body fields,
    body-orientation fields and features, and for every field the absolute
    coefficients (rounded to two decimals) are summed over all components.
    The fields are listed in descending order of that sum.

    Args:
        point_model: Model providing ``reset`` and ``compute_pca``.
        path: Base directory (with trailing separator); the table is written
            to ``content/pointing_movement/import/table_pca_components.tex``
            below it.
    """
    path += 'content/pointing_movement/import/'
    point_model.reset()

    fs = utils.all_body_fields() +\
        utils.all_body_orientation_fields() +\
        utils.all_features()

    # compute_pca returns a (fitted PCA, field names) pair. Passing the
    # whole pair as column labels does not match the coefficient matrix,
    # so unpack it the same way plot_pca_coefficients does.
    pca, fs = point_model.compute_pca(base_fields=fs)

    components = pd.DataFrame(pca.components_, columns=fs).round(2)
    components = components.abs().sum().sort_values(ascending=False)

    path += 'table_pca_components.tex'
    table = components.to_latex(escape=False)
    # NOTE(review): the label below names the participants table of the
    # data-collection chapter - looks copied; confirm before changing it,
    # since the document may reference it.
    latex = pack_table(
        table,
        'Overview of PCA Coefficients',
        'tab:data_collection:participants'
    )
    with open(path, 'w') as f:
        f.write(latex)
    print('exported PCA components latex table.')
def compute_pca(self, **kwargs):
    """Fit a PCA on the standardized, preprocessed samples.

    Args:
        **kwargs:
            base_fields: Fields to analyse. Defaults to all body fields.
            n_components: Passed to ``PCA``. Defaults to ``'mle'``.
            include_features: Feature names added to the fields when
                ``load_features`` is true. Defaults to all features.
            exclude_features: Names removed from ``include_features``.
                Defaults to an empty list.
            load_features: Whether features are added to the base fields.
                Defaults to ``True``.

    Returns:
        A tuple ``(pca, fields)``: the fitted PCA (also stored in
        ``self.pca``) and the list of fields requested from
        ``self.preprocess``.
    """
    base_fields = kwargs.get('base_fields', utils.all_body_fields())
    n_components = kwargs.get('n_components', 'mle')
    exclude_features = kwargs.get('exclude_features', [])
    include_features = kwargs.get('include_features', utils.all_features())
    load_features = kwargs.get('load_features', True)

    fields = base_fields
    if load_features:
        excluded = set(exclude_features)
        wanted = [f for f in include_features if f not in excluded]
        # De-duplicate while keeping first-seen order (base fields first).
        # Going through a set here made the order of the returned labels
        # arbitrary and different from run to run.
        fields = list(dict.fromkeys([*base_fields, *wanted]))

    _, X, _, _ = self.preprocess(features=fields, include=fields)

    # Column means first; columns without any valid value fall back to 0.
    X = X.fillna(X.mean()).fillna(0)

    # NOTE(review): the matrix columns follow self.add_features, while the
    # returned labels follow `fields` - verify both are in the same order.
    X = X[self.add_features].values
    X = StandardScaler().fit_transform(X)

    self.pca = PCA(n_components=n_components).fit(X)
    return self.pca, fields
def analyze_all_features(self, path='./feature_analysis', **kwargs):
    """Run the complete feature analysis and store its results below *path*.

    Writes ``describe.csv``, analyses every feature individually, and
    produces the correlation-matrix, PCA-coefficient and PCA plots.
    Pair plots are only produced on request.

    Args:
        path: Output directory; created if necessary.
        **kwargs:
            save: Passed as ``save`` to the PCA plots and pair plots.
                Defaults to ``True``.
            base_pairplot: When true, the three pair plots are produced as
                well. Defaults to ``False``.

            All keyword arguments are also forwarded to
            ``self.analyze_feature``.
    """
    save = kwargs.get('save', True)
    base_pairplot = kwargs.get('base_pairplot', False)
    utils.ensure_dir_exists(path)

    participants, X, calibrations, y = self.normalized
    feature_frame = self.load_all_features(participants, X, calibrations, y)
    feature_frame = self.attach_target(feature_frame, y)
    features = utils.all_features()

    # Summary statistics of all features.
    feature_frame[features].describe().to_csv("%s/describe.csv" % path)

    # Per-feature analysis.
    for feature_name in features:
        self.analyze_feature(participants, feature_frame, calibrations, y,
                             feature_name, path=path, **kwargs)

    # Correlation matrices.
    plot_corr = self.plot_correlation_matrix
    plot_corr(force=True, save=True,
              additional_fields=features,
              save_path="%s/correlation_matrix.png" % path)
    plot_corr(force=True, save=True,
              additional_fields=features + utils.target_fields(),
              save_path="%s/correlation_matrix_with_targets.png" % path)
    plot_corr(force=True, save=True,
              base_fields=features + utils.target_fields(),
              save_path="%s/correlation_matrix_features.png" % path)
    plot_corr(force=True, save=True,
              include=list(self.feature_functions.keys()),
              base_fields=(list(self.feature_functions.keys())
                           + utils.target_fields()),
              save_path=("%s/correlation_matrix_all_features_with_targets.png"
                         % path))
    plot_corr(force=True, save=True,
              additional_fields=utils.target_fields(),
              save_path="%s/correlation_matrix_base.png" % path)

    # PCA coefficients.
    self.plot_pca_coefficients(
        force=True, save=True,
        base_fields=utils.all_body_fields(),
        save_path="%s/pca_coefficients_base.png" % path)
    self.plot_pca_coefficients(
        force=True, save=True,
        base_fields=features,
        save_path="%s/pca_coefficients_features.png" % path)

    # PCA plots: once without and once with the features.
    pca_base_path = "%s/pca_base.png" % path
    utils.ensure_dir_exists(pca_base_path, is_file=True)
    self.plot_pca(load_features=False, force=True, save=save,
                  save_path=pca_base_path)

    pca_all_path = "%s/pca_all.png" % path
    utils.ensure_dir_exists(pca_all_path, is_file=True)
    self.plot_pca(include_features=features, force=True, save=save,
                  save_path=pca_all_path)

    if not base_pairplot:
        return

    print('plotting pairplots:')
    print('\t start features')
    self.plot_pairplot(
        force=True,
        base_fields=features + utils.target_fields(),
        save=save,
        save_path="%s/pairplot_features.png" % path)
    print('\t start base')
    self.plot_pairplot(
        force=True, save=save,
        base_fields=utils.all_body_fields() + utils.target_fields(),
        save_path="%s/pairplot_base.png" % path)
    print('\t start orientations')
    self.plot_pairplot(
        force=True, save=save,
        base_fields=(utils.all_body_orientation_fields()
                     + utils.target_fields()),
        save_path="%s/pairplot_orientations.png" % path)
def find_best_features(self, participants, X, calibrations, y, **kwargs):
    """Score features with SelectKBest.

    Args:
        participants: Forwarded to ``self.load_all_features``.
        X: Sample frame; reduced to the columns in ``self.add_features``.
        calibrations: Forwarded to ``self.load_all_features``.
        y: Frame holding the target columns.
        **kwargs: Also forwarded to ``self.load_all_features``.
            fields: Passed as ``include`` when features are loaded.
                Defaults to all body plus body-orientation fields.
            load_features: When true, ``X`` is first replaced by the result
                of ``self.load_all_features``. Defaults to ``False``.
            k: Passed to ``SelectKBest``. Defaults to ``'all'``.
            regression: Selects the scoring mode, see below.
                Defaults to ``False``.
            target_fields: Target columns to use. Defaults to
                ``utils.target_fields()``.

    Returns:
        Classification mode: a frame with the columns ``'feature'`` and
        ``'p-value'`` (chi2 p-values rounded to 5 decimals), sorted by
        ascending p-value.

        Regression mode: a frame indexed by feature name (sorted) with one
        column of mutual-information scores per target field. A feature
        that was not selected for a given target holds NaN in that column.
    """
    default_fields =\
        utils.all_body_fields() + utils.all_body_orientation_fields()
    fields = kwargs.get('fields', default_fields)
    load_features = kwargs.get('load_features', False)
    k = kwargs.get('k', 'all')
    regression = kwargs.get('regression', False)
    target_fields = kwargs.get('target_fields', utils.target_fields())

    if load_features:
        X = self.load_all_features(participants, X, calibrations, y,
                                   include=fields, **kwargs)
    # NOTE(review): the column selection is applied whether or not features
    # were loaded (same as compute_pca) - confirm this is the intended scope.
    X = X[self.add_features]

    # Treat +/-inf like NaN: fill with the column mean and fall back to 0
    # for columns that have no valid value at all.
    with pd.option_context('mode.use_inf_as_na', True):
        X = X.fillna(X.mean()).fillna(0)

    # chi2 needs non-negative input, hence scaling into [0, 1].
    Xs = MinMaxScaler().fit_transform(X)
    y = y[target_fields]

    if not regression:
        # Every distinct combination of target values is one class.
        labels = y.apply(lambda xs: str(tuple(xs)), axis=1)
        selector = SelectKBest(chi2, k=k)
        selector.fit(Xs, labels)
        support = selector.get_support(indices=True)
        col_names = X.columns[support].values
        pvalues = selector.pvalues_[support]
        ranked = sorted(zip(col_names, pvalues), key=lambda t: t[1])
        return pd.DataFrame(ranked, columns=['feature', 'p-value']).round(5)

    # One name-indexed Series per target. Aligning on the feature name keeps
    # every score on its own row even when k selects different features for
    # different targets; an empty target list yields an empty frame.
    scores_by_target = {}
    for target in target_fields:
        selector = SelectKBest(mutual_info_regression, k=k)
        selector.fit(Xs, y[target])
        support = selector.get_support(indices=True)
        scores_by_target[target] = pd.Series(selector.scores_[support],
                                             index=X.columns[support])
    return pd.DataFrame(scores_by_target).sort_index()
def export_calibrations(point_model, path, big=False):
    """Export mean/std statistics of the calibration data as LaTeX tables.

    All values are multiplied by 100 before export.

    Args:
        point_model: Model providing the ``calibrations`` frame.
        path: Base directory (with trailing separator). Tables are written
            to ``content/<section>/import/`` below it; the directory is
            created if necessary.
        big: When false, the shoulder and hmd fields are exported to
            ``table_calibration.tex`` (per participant plus total) and
            ``table_calibration_total.tex`` (total only) in the
            ``pointing_movement`` section. When true, all body fields are
            exported in chunks of six fields to ``table_calibration_<i>.tex``
            in the ``appendix`` section.
    """
    section = 'appendix'
    take = utils.all_body_fields()
    note = ''
    drop = ['count', '50%', 'min', 'max']
    if not big:
        section = 'pointing_movement'
        take = utils.flatten([
            utils.body_field('rightShoulder'),
            utils.body_field('leftShoulder'),
            utils.body_field('hmd')
        ])
        note = '. See \\cref{} for all datapoints.'

    path += 'content/%s/import/' % section
    utils.ensure_dir_exists(path)

    calibrations = point_model.calibrations

    def get_stats(take, only_total=False):
        """Return mean/std of *take*, per participant plus total or total only."""
        aggs = ['mean', 'std']
        total = calibrations[take]\
            .rename(columns=utils.body_fields_to_readable())\
            .agg(aggs)
        # Flatten column by column so the values line up with the
        # (field, aggregate) pairs of the MultiIndex.
        idx = pd.MultiIndex.from_product([take, aggs])
        total = pd.DataFrame(np.ravel(total.values, order='F'), idx, [''])\
            .T\
            .rename(columns=utils.body_fields_to_readable())\
            .rename(mapper=lambda s: '\\textbf{%s}' % s, axis=1)

        if not only_total:
            stats = calibrations[['pid'] + take]\
                .rename(columns=utils.body_fields_to_readable())\
                .groupby(['pid'])\
                .describe(percentiles=[])\
                .drop(drop, axis=1, level=1)\
                .rename(mapper=lambda s: '\\textbf{%s}' % s, axis=1)
            # DataFrame.append was removed in pandas 2.0; concat is the
            # equivalent row-wise append.
            stats = pd.concat([stats, total], sort=False)
        else:
            stats = total.T
        return stats * 100

    def write_stats(path, stats):
        """Render *stats* as a LaTeX table and write it to *path*."""
        with open(path, 'w') as f:
            file_name = os.path.basename(f.name)
            column_format = 'l' * (len(stats.columns)+1)
            table = stats.to_latex(
                escape=False,
                column_format=column_format,
                float_format="{:0.2f}".format
            )
            latex = pack_table(
                table,
                'Overview of the calibration [cm]%s' % note,
                'tab:%s:%s' % (section, file_name.replace('.tex', '')),
            )
            f.write(latex)
        print('exported %s calibration latex table.' % section)

    if not big:
        stats = get_stats(take)
        spath = path + 'table_calibration.tex'
        write_stats(spath, stats)

        stats = get_stats(take, True)
        spath = path + 'table_calibration_total.tex'
        write_stats(spath, stats)
    else:
        chunk_len = 6
        for i, cs in enumerate(utils.chunks(take, chunk_len)):
            stats = get_stats(cs)
            log_path = path + 'table_calibration_%s.tex' % i
            write_stats(log_path, stats)
def normalize_height(xs, fields=None):
    """Divide the selected fields of one row by the participant's height.

    ``self`` is not a parameter of this function; it is resolved from the
    enclosing scope.

    Args:
        xs: Row whose ``name`` is the participant id used to look up the
            height in ``self.participants``. It is modified in place.
        fields: Fields to normalize. ``None`` or an empty sequence selects
            all body fields.

    Returns:
        The same row object with the selected fields divided by the height.
    """
    # None replaces the former mutable default ([]); both mean "all fields".
    if fields is None or len(fields) == 0:
        fields = utils.all_body_fields()
    pid = xs.name
    height = self.participants.loc[pid]['height']  # / 100
    xs[fields] = xs[fields] / height
    return xs