Ejemplo n.º 1
0
    def find_best_features_extratree(self, participants, X, calibrations, y,
                                     **kwargs):
        """Rank the features in ``self.add_features`` by ExtraTrees importance.

        The target columns of ``y`` are collapsed into one class label per
        row, the selected feature columns are standardized, and an
        ``ExtraTreesClassifier`` is fitted on them.

        Keyword arguments:
            fields: forwarded as ``include`` to ``load_all_features``
                (default: body fields plus body orientation fields).
            load_features: when truthy, ``X`` is first replaced by the result
                of ``load_all_features`` (default ``False``).

        Returns:
            A ``pd.Series`` of feature importances indexed by column name,
            sorted ascending.
        """
        from sklearn import ensemble, preprocessing
        default_fields =\
            utils.all_body_fields() + utils.all_body_orientation_fields()
        fields = kwargs.get('fields', default_fields)
        load_features = kwargs.get('load_features', False)

        if load_features:
            X = self.load_all_features(participants,
                                       X,
                                       calibrations,
                                       y,
                                       include=fields)

        X = X[self.add_features]
        # An Index is immutable, so a plain reference is enough to keep the
        # column names around after X becomes a bare array below.
        columns = X.columns
        # Treat +/-inf as missing explicitly; the 'mode.use_inf_as_na' option
        # used previously is deprecated in recent pandas releases.
        X = X.replace([float('inf'), float('-inf')], float('nan'))
        X = X.fillna(X.mean())

        # Build the class labels in a local Series instead of writing a
        # 'target' column into the caller's ``y`` frame.
        target = y[utils.target_fields()]\
            .apply(lambda xs: str(tuple(xs)), axis=1)
        labels = preprocessing.LabelEncoder().fit_transform(target)

        X = StandardScaler().fit_transform(X)

        model = ensemble.ExtraTreesClassifier()
        model.fit(X, labels)
        feat_importances = pd.Series(model.feature_importances_, index=columns)
        return feat_importances.sort_values()
def export(filename="export", **kwargs):
    """Export the last sample of every collection to ``<filename>.csv``.

    The data comes from ``log(**kwargs)``. The last row of each
    ``(pid, cid)`` group is joined with the participant table and merged
    with ``y``; the result is indexed by ``collectionId``, sorted by the
    target fields and restricted to the participant id, target, participant,
    body and body-orientation columns before being written.
    """
    participants, X, y = log(**kwargs)

    # Last recorded row per (participant, collection) pair.
    last_rows = X.groupby(['pid', 'cid']).tail(1)

    # Attach the participant table (outer join on the participant id).
    by_participant = last_rows.reset_index().set_index("pid")
    joined = by_participant.join(participants, how='outer').reset_index()
    joined = joined.rename(columns={'level_0': 'participantId'})

    # Bring in the rows of ``y`` for the same participant/collection.
    merged = joined.merge(
        y.reset_index(),
        left_on=['participantId', 'cid'],
        right_on=['pid', 'id'],
    )
    merged = merged.drop(columns=['id_x', 'id_y', 'pid'])

    table = merged.rename(columns={'cid': 'collectionId'})
    table = table.set_index('collectionId')
    table = table.sort_values(by=utils.target_fields())

    exported_columns = (
        ['participantId']
        + utils.target_fields()
        + utils.participant_fields()
        + utils.all_body_fields()
        + utils.all_body_orientation_fields()
    )
    table[exported_columns].to_csv('%s.csv' % filename)
    print("exported to %s.csv" % filename)
Ejemplo n.º 3
0
 def compute_pairplot(self, **kwargs):
     """Prepare the data frame used for the pairplot.

     Preprocesses the base, target and additional fields, collapses the
     target columns into one tuple-valued ``target`` column and stores the
     frame (without the individual target columns) in
     ``self.pairplot_data``.

     Keyword arguments:
         additional_fields: extra fields to load (default ``[]``).
         base_fields: fields to start from (default: all body fields).
     """
     extra = kwargs.get('additional_fields', [])
     base = kwargs.get('base_fields', utils.all_body_fields())
     requested = base + utils.target_fields() + extra
     _, X, _, _ = self.preprocess(features=requested)

     def as_tuple(row):
         # One hashable label per row, built from all target columns.
         return tuple(row)

     targets = X[utils.target_fields()]
     X['target'] = targets.apply(as_tuple, axis=1)
     self.pairplot_data = X.drop(columns=utils.target_fields())
Ejemplo n.º 4
0
 def compute_correlation_matrix(self, **kwargs):
     """Compute the correlation matrix of the preprocessed data.

     The result of ``X.corr`` is stored in ``self.correlation_matrix``.

     Keyword arguments:
         additional_fields: fields appended to ``base_fields`` to form the
             ``features`` passed to ``preprocess`` (default ``[]``).
         corr_method: ``method`` argument for ``DataFrame.corr``
             (default ``'pearson'``).
         base_fields: fields to start from (default: all body fields).
         include: forwarded to ``preprocess``; defaults to ``base_fields``
             when not supplied.
     All keyword arguments are forwarded to ``preprocess``.
     """
     additional_fields = kwargs.get('additional_fields', [])
     method = kwargs.get('corr_method', 'pearson')
     base_fields = kwargs.get('base_fields', utils.all_body_fields())

     params = dict(kwargs)
     params['features'] = base_fields + additional_fields
     # Passing ``include=`` explicitly next to ``**kwargs`` raised a
     # TypeError whenever the caller supplied its own ``include``; let the
     # caller's value win and fall back to the base fields otherwise.
     params.setdefault('include', base_fields)
     _, X, _, _ = self.preprocess(**params)
     self.correlation_matrix = X.corr(method=method)
 def plot_pca_coefficients(self, **kwargs):
     """Fit a PCA and plot its component coefficients.

     Resets the model, fills in defaults for ``base_fields`` (body fields,
     body orientation fields and all features) and ``n_components``
     (``.99``), runs ``compute_pca`` and hands the components to
     ``plotting.plot_pca_coefficients``. Options from the
     ``'pca_coefficients'`` plot config are used unless overridden by the
     keyword arguments.
     """
     self.reset()
     default_fields = (
         utils.all_body_fields()
         + utils.all_body_orientation_fields()
         + utils.all_features()
     )
     kwargs.setdefault('base_fields', default_fields)
     kwargs.setdefault('n_components', .99)

     pca, field_names = self.compute_pca(**kwargs)
     coefficients = pd.DataFrame(pca.components_, columns=field_names)

     # Explicit keyword arguments take precedence over the stored config.
     plot_options = {
         **self.plotConfig.getConfig('pca_coefficients'),
         **kwargs,
     }
     plotting.plot_pca_coefficients(coefficients, **plot_options)
Ejemplo n.º 6
0
    def automatic_features(self, participants, X, calibrations, y, model,
                           **kwargs):
        """Generate features with featuretools and cross-validate a model.

        Builds an entity set from ``X`` (indexed by a combined
        ``cid``/``pid`` key), normalizes the ``hmd`` and ``indexfinger`` body
        fields into their own entities, runs deep feature synthesis and
        evaluates ``model(feature_matrix, y)`` through
        ``better_kfold_cross_validation``.

        ``participants``, ``calibrations`` and ``**kwargs`` are accepted but
        not used here.

        NOTE(review): this appears to target the legacy featuretools API
        (``ft.variable_types``, ``entity_from_dataframe``) -- confirm the
        pinned featuretools version before upgrading.
        """
        # ``assign`` returns a new frame, so the caller's ``X`` does not
        # silently gain a 'cid_pid' column.
        X = X.assign(cid_pid=X.apply(
            lambda xs: "%s_%s" % (xs['cid'], xs['pid']), axis=1))

        body_types = {
            k: ft.variable_types.Numeric
            for k in utils.all_body_fields()
        }
        vt = {
            'cid_pid': ft.variable_types.Index,
            'id': ft.variable_types.Id,
            'cid': ft.variable_types.Id,
            'pid': ft.variable_types.Id,
            'time': ft.variable_types.Numeric,
            **body_types,
        }

        es = ft.EntitySet(id="pointing_movement")

        es = es.entity_from_dataframe(entity_id='collections',
                                      dataframe=X,
                                      index='cid_pid',
                                      variable_types=vt)

        # Each selected body field becomes its own entity.
        selected = ['hmd', 'indexfinger']
        for f in utils.flatten(map(utils.body_field, selected)):
            es = es.normalize_entity(base_entity_id='collections',
                                     new_entity_id=f,
                                     index=f)

        feature_matrix, features_defs = ft.dfs(entityset=es,
                                               entities=es,
                                               target_entity="collections",
                                               verbose=1)

        # Fill gaps with the per-column median before fitting.
        feature_matrix = feature_matrix.fillna(feature_matrix.median())
        modl = model(feature_matrix, y)
        modl.better_kfold_cross_validation()
 def plot_extratrees(self, **kwargs):
     """Plot the ExtraTrees feature importances.

     Uses ``kwargs['fields']`` (default: body fields, body orientation
     fields and all features) and ``kwargs['exclude_features']`` (default
     ``[]``) for preprocessing, ranks the features with
     ``find_best_features_extratree`` and plots the result. Options from the
     ``'extratrees'`` plot config are used unless overridden by the keyword
     arguments.
     """
     default_fields = (
         utils.all_body_fields()
         + utils.all_body_orientation_fields()
         + utils.all_features()
     )
     kwargs.setdefault('fields', default_fields)
     excluded = kwargs.get('exclude_features', [])

     participants, X, calibrations, y = self.preprocess(
         features=kwargs['fields'], exclude_features=excluded
     )
     importances = self.find_best_features_extratree(
         participants, X, calibrations, y, **kwargs
     )

     plot_options = dict(self.plotConfig.getConfig('extratrees'))
     plot_options.update(kwargs)
     plotting.plot_extratrees(importances, **plot_options)
 def plot_selectKBest_chi2(self, **kwargs):
     """Plot the features ranked by ``find_best_features``.

     Uses ``kwargs['fields']`` (default: body fields, body orientation
     fields and all features) and ``kwargs['exclude_features']`` (default
     ``[]``) for preprocessing. Nothing is plotted when the ranking is
     empty. Options from the ``'selectKBest'`` plot config are used unless
     overridden by the keyword arguments.
     """
     candidate_fields = (
         utils.all_body_fields()
         + utils.all_body_orientation_fields()
         + utils.all_features()
     )
     kwargs.setdefault('fields', candidate_fields)
     excluded = kwargs.get('exclude_features', [])

     p, X, c, y = self.preprocess(
         features=kwargs['fields'], exclude_features=excluded
     )
     ranking = self.find_best_features(p, X, c, y, **kwargs)

     if len(ranking) != 0:
         plot_options = {
             **self.plotConfig.getConfig('selectKBest'),
             **kwargs,
         }
         plotting.plot_selectKBest_chi2(ranking, **plot_options)
def export_SelectKBest_chi2(point_model, path, k=10):
    """Export the top ``k`` rows of the feature ranking as a LaTeX table.

    Args:
        point_model: model providing ``normalized`` and
            ``find_best_features``.
        path: base directory; the table is written to
            ``<path>content/pointing_movement/import/table_chi2.tex``.
        k: number of rows taken from the ranking (default ``10``).
    """
    path += 'content/pointing_movement/import/'
    # Same precaution as export_calibrations: make sure the target directory
    # exists before opening the output file.
    utils.ensure_dir_exists(path)
    p, X, c, y = point_model.normalized
    fs = utils.all_body_fields() +\
        utils.all_body_orientation_fields() +\
        utils.all_features()
    kbest = point_model.find_best_features(
        p, X, c, y, fields=fs, load_features=True
    ).head(k)
    table = kbest.to_latex(escape=False, index=False)
    # NOTE(review): the caption says "scores"; confirm that matches the
    # columns find_best_features actually returns.
    latex = pack_table(
        table,
        '$\\chi^2$ scores of the top %d features' % k,
        'tab:pointing_movement:chi2'
    )
    path += 'table_chi2.tex'
    with open(path, 'w') as f:
        f.write(latex)
    print('exported SelectKBest chi2 latex table.')
def export_pca_components(point_model, path):
    """Export the summed absolute PCA coefficients per field as LaTeX.

    Fits a PCA on body fields, body orientation fields and all features,
    rounds the component matrix to two decimals, sums the absolute
    coefficients per field and writes them, sorted descending, to
    ``<path>content/pointing_movement/import/table_pca_components.tex``.
    """
    path += 'content/pointing_movement/import/'
    # Same precaution as export_calibrations: make sure the target directory
    # exists before opening the output file.
    utils.ensure_dir_exists(path)
    point_model.reset()
    fs = utils.all_body_fields() +\
        utils.all_body_orientation_fields() +\
        utils.all_features()
    # compute_pca returns ``(pca, fields)``; using the whole tuple as column
    # labels breaks the DataFrame construction below. A bare field list is
    # still accepted in case an implementation returns only the fields.
    result = point_model.compute_pca(base_fields=fs)
    fs = result[1] if isinstance(result, tuple) else result

    components = pd.DataFrame(point_model.pca.components_, columns=fs).round(2)
    components = components.abs().sum().sort_values(ascending=False)
    path += 'table_pca_components.tex'
    table = components.to_latex(escape=False)
    # NOTE(review): the label below looks copied from another table --
    # confirm before changing, other documents may reference it.
    latex = pack_table(
        table,
        'Overview of PCA Coefficients',
        'tab:data_collection:participants'
    )
    with open(path, 'w') as f:
        f.write(latex)
    print('exported PCA components latex table.')
Ejemplo n.º 11
0
    def compute_pca(self, **kwargs):
        """Fit a PCA on the standardized feature columns.

        Keyword arguments:
            base_fields: fields to start from (default: all body fields).
            n_components: forwarded to ``PCA`` (default ``'mle'``).
            exclude_features: features removed from ``include_features``
                (default ``[]``).
            include_features: features to add when ``load_features`` is
                truthy (default: all features).
            load_features: whether the features are added to the base
                fields (default ``True``).

        Returns:
            ``(pca, fields)``: the fitted PCA (also stored in ``self.pca``)
            and the list of fields passed to ``preprocess``.

        NOTE(review): the PCA is fitted on ``X[self.add_features]``; it is
        assumed that these columns correspond to ``fields`` -- confirm
        against ``preprocess``.
        """
        base_fields = kwargs.get('base_fields', utils.all_body_fields())
        n_components = kwargs.get('n_components', 'mle')
        exclude_features = kwargs.get('exclude_features', [])
        include_features = kwargs.get('include_features', utils.all_features())
        load_features = kwargs.get('load_features', True)

        fields = base_fields
        if load_features:
            excluded = set(exclude_features)
            selected = [f for f in include_features if f not in excluded]
            # De-duplicate while keeping a stable order; going through a
            # plain ``set`` made the field order differ between runs.
            fields = list(dict.fromkeys(selected + fields))

        _, X, _, _ = self.preprocess(features=fields, include=fields)
        X = X.fillna(X.mean())
        # Columns without any valid value have a NaN mean and stay NaN.
        if X.isna().any().any():
            X = X.fillna(0)

        X = X[self.add_features].values
        X = StandardScaler().fit_transform(X)
        pca = PCA(n_components=n_components)
        self.pca = pca.fit(X)
        return self.pca, fields
Ejemplo n.º 12
0
    def analyze_all_features(self, path='./feature_analysis', **kwargs):
        """Run the per-feature analysis and write all overview plots.

        Loads every feature for the normalized data, stores a ``describe()``
        summary as ``describe.csv``, calls ``analyze_feature`` for each
        feature and then renders correlation matrices, PCA plots and
        (optionally) pairplots into *path*.

        Keyword arguments read here:
            base_pairplot: also render the pairplots when truthy
                (default ``False``).
            save: forwarded as ``save`` to the PCA and pairplot calls
                (default ``True``).
        All keyword arguments are additionally forwarded to
        ``analyze_feature``.
        """
        base_pairplot = kwargs.get('base_pairplot', False)
        save = kwargs.get('save', True)

        utils.ensure_dir_exists(path)

        participants, X, calibrations, y = self.normalized
        featureX = self.attach_target(
            self.load_all_features(participants, X, calibrations, y), y)

        features = utils.all_features()

        featureX[features].describe().to_csv("%s/describe.csv" % path)

        for key in features:
            self.analyze_feature(participants, featureX, calibrations, y,
                                 key, path=path, **kwargs)

        def correlation(**options):
            # Correlation matrices are always recomputed and saved.
            self.plot_correlation_matrix(force=True, save=True, **options)

        def coefficients(**options):
            # PCA coefficient plots are always recomputed and saved.
            self.plot_pca_coefficients(force=True, save=True, **options)

        def pca(save_path, **options):
            utils.ensure_dir_exists(save_path, is_file=True)
            self.plot_pca(**options,
                          force=True,
                          save=save,
                          save_path=save_path)

        correlation(additional_fields=features,
                    save_path="%s/correlation_matrix.png" % path)
        correlation(additional_fields=features + utils.target_fields(),
                    save_path="%s/correlation_matrix_with_targets.png" % path)
        correlation(base_fields=features + utils.target_fields(),
                    save_path="%s/correlation_matrix_features.png" % path)
        correlation(
            include=list(self.feature_functions.keys()),
            base_fields=(list(self.feature_functions.keys()) +
                         utils.target_fields()),
            save_path=("%s/correlation_matrix_all_features_with_targets.png" %
                       path))
        correlation(additional_fields=utils.target_fields(),
                    save_path="%s/correlation_matrix_base.png" % path)

        coefficients(base_fields=utils.all_body_fields(),
                     save_path="%s/pca_coefficients_base.png" % path)
        coefficients(base_fields=features,
                     save_path="%s/pca_coefficients_features.png" % path)

        pca("%s/pca_base.png" % path, load_features=False)
        pca("%s/pca_all.png" % path, include_features=features)

        if not base_pairplot:
            return

        print('plotting pairplots:')

        print('\t start features')
        self.plot_pairplot(force=True,
                           base_fields=features + utils.target_fields(),
                           save=save,
                           save_path="%s/pairplot_features.png" % path)

        print('\t start base')
        self.plot_pairplot(force=True,
                           save=save,
                           base_fields=(utils.all_body_fields() +
                                        utils.target_fields()),
                           save_path="%s/pairplot_base.png" % path)

        print('\t start orientations')
        self.plot_pairplot(force=True,
                           save=save,
                           base_fields=(utils.all_body_orientation_fields() +
                                        utils.target_fields()),
                           save_path="%s/pairplot_orientations.png" % path)
Ejemplo n.º 13
0
    def find_best_features(self, participants, X, calibrations, y, **kwargs):
        """Score the features in ``self.add_features`` with ``SelectKBest``.

        Keyword arguments:
            fields: forwarded as ``include`` to ``load_all_features``
                (default: body fields plus body orientation fields).
            load_features: when truthy, ``X`` is first replaced by the result
                of ``load_all_features`` (default ``False``).
            k: forwarded to ``SelectKBest`` (default ``'all'``).
            regression: choose the regression branch (default ``False``).
            target_fields: columns of ``y`` used as targets
                (default: ``utils.target_fields()``).

        Returns:
            Classification (default): a DataFrame with the columns
            ``feature`` and ``p-value`` (chi2 p-values rounded to five
            decimals), sorted by ascending p-value.
            Regression: a DataFrame indexed by feature name with one column
            of ``mutual_info_regression`` scores per target field. Features
            not selected for a target are NaN in that column.
        """
        default_fields =\
            utils.all_body_fields() + utils.all_body_orientation_fields()
        fields = kwargs.get('fields', default_fields)
        load_features = kwargs.get('load_features', False)
        k = kwargs.get('k', 'all')
        regression = kwargs.get('regression', False)
        target_fields = kwargs.get('target_fields', utils.target_fields())

        if load_features:
            X = self.load_all_features(participants,
                                       X,
                                       calibrations,
                                       y,
                                       include=fields,
                                       **kwargs)

        X = X[self.add_features]
        # Treat +/-inf as missing explicitly; the 'mode.use_inf_as_na' option
        # used previously is deprecated in recent pandas releases.
        X = X.replace([float('inf'), float('-inf')], float('nan'))
        X = X.fillna(X.mean())
        # Columns without any valid value have a NaN mean and stay NaN.
        if X.isna().any().any():
            X = X.fillna(0)

        Xs = MinMaxScaler().fit_transform(X)

        y = y[target_fields]
        if not regression:
            # One class label per row, built from all target columns.
            labels = y.apply(lambda xs: str(tuple(xs)), axis=1)
            selector = SelectKBest(chi2, k=k)
            selector.fit(Xs, labels)
            support = selector.get_support(indices=True)

            ranked = sorted(
                zip(X.columns[support].values, selector.pvalues_[support]),
                key=lambda pair: pair[1],
            )
            return pd.DataFrame(ranked, columns=['feature',
                                                 'p-value']).round(5)

        # Keep every target's scores attached to its own feature names. With
        # a numeric ``k`` the selected features can differ per target, and
        # labelling all columns with the last target's names mislabels them.
        scores_by_target = {}
        for target in target_fields:
            selector = SelectKBest(mutual_info_regression, k=k)
            selector.fit(Xs, y[target])
            support = selector.get_support(indices=True)
            scores_by_target[target] = pd.Series(
                selector.scores_[support],
                index=X.columns[support].values,
            ).sort_index()

        return pd.DataFrame(scores_by_target)
def export_calibrations(point_model, path, big=False):
    """Export calibration statistics as LaTeX tables.

    Args:
        point_model: model providing the ``calibrations`` data frame.
        path: base directory; tables are written below
            ``<path>content/<section>/import/``.
        big: when false (default), write a per-participant table and a
            totals table for the shoulder and hmd fields into the
            ``pointing_movement`` section. When true, write tables for all
            body fields into the ``appendix`` section, six fields per file.
    """
    section = 'appendix'
    take = utils.all_body_fields()
    note = ''
    drop = ['count', '50%', 'min', 'max']
    if not big:
        section = 'pointing_movement'
        take = utils.flatten([
            utils.body_field('rightShoulder'),
            utils.body_field('leftShoulder'),
            utils.body_field('hmd')
        ])
        note = '. See \\cref{} for all datapoints.'
    path += 'content/%s/import/' % section
    utils.ensure_dir_exists(path)
    calibrations = point_model.calibrations

    def get_stats(take, only_total=False):
        """Return mean/std statistics for ``take``, multiplied by 100."""
        aggs = ['mean', 'std']
        total = calibrations[take]\
            .rename(columns=utils.body_fields_to_readable())\
            .agg(aggs)
        # Flatten the (agg x field) table into a single row whose columns
        # are (field, agg) pairs.
        idx = pd.MultiIndex.from_product([take, aggs])
        total = pd.DataFrame(np.ravel(total.values, order='F'), idx, [''])\
            .T\
            .rename(columns=utils.body_fields_to_readable())\
            .rename(mapper=lambda s: '\\textbf{%s}' % s, axis=1)

        if not only_total:
            stats = calibrations[['pid'] + take]\
                .rename(columns=utils.body_fields_to_readable())\
                .groupby(['pid'])\
                .describe(percentiles=[])\
                .drop(drop, axis=1, level=1)\
                .rename(mapper=lambda s: '\\textbf{%s}' % s, axis=1)
            # DataFrame.append is gone in current pandas; concat is the
            # equivalent row-wise append.
            stats = pd.concat([stats, total], sort=False)
        else:
            stats = total.T
        # The table caption labels the values as [cm].
        return stats * 100

    def write_stats(path, stats):
        """Write ``stats`` as a packed LaTeX table to ``path``."""
        with open(path, 'w') as f:
            file_name = os.path.basename(f.name)
            column_format = 'l' * (len(stats.columns)+1)
            table = stats.to_latex(
                escape=False, column_format=column_format,
                float_format="{:0.2f}".format
            )
            latex = pack_table(
                table,
                'Overview of the calibration [cm]%s' % note,
                'tab:%s:%s' % (section, file_name.replace('.tex', '')),
            )
            f.write(latex)
            print('exported %s calibration latex table.' % section)

    if not big:
        stats = get_stats(take)
        spath = path + 'table_calibration.tex'
        write_stats(spath, stats)

        stats = get_stats(take, True)
        spath = path + 'table_calibration_total.tex'
        write_stats(spath, stats)
    else:
        chunk_len = 6
        for i, cs in enumerate(utils.chunks(take, chunk_len)):
            stats = get_stats(cs)
            log_path = path + 'table_calibration_%s.tex' % i
            write_stats(log_path, stats)
Ejemplo n.º 15
0
 def normalize_height(xs, fields=None):
     """Divide the given fields of row ``xs`` by the participant's height.

     ``xs.name`` is used as the participant id to look up ``height`` in
     ``self.participants`` (``self`` comes from the enclosing scope).
     ``fields`` defaults to all body fields when omitted or empty. The row
     is modified in place and returned.
     """
     # ``None`` replaces the former mutable ``[]`` default; an explicitly
     # passed empty sequence still selects all body fields.
     if fields is None or len(fields) == 0:
         fields = utils.all_body_fields()
     pid = xs.name
     height = self.participants.loc[pid]['height']
     xs[fields] = xs[fields] / height
     return xs