Esempio n. 1
0
    def get_data_with_kv_tags(self, *args, **kwargs):
        """
        Flatten the segments of the selected documents into a DataFrame, one row per segment.

        Each row holds the document's path, the requested key-value tags, the entries of the segment's 'fv'
        mapping, and the segment's 'offset_s' and 'duration'.

        :param args: either a Cursor as first argument (iterated as is), or positional arguments for self.mgc.find
        :param kwargs: keyword arguments for self.mgc.find. 'kv_tag_keys', if present, is removed first and lists
            the tag keys to extract (default: ['move_direction', 'vehicle_type'])
        :return: the DataFrame of rows, passed through reorder_columns_as with
            ['path'] + kv_tag_keys + ['offset_s', 'duration']
        """
        # 'kv_tag_keys' is meant for this method only: take it out before kwargs reach find()
        kv_tag_keys = kwargs.pop('kv_tag_keys', ['move_direction', 'vehicle_type'])

        if args and isinstance(args[0], Cursor):
            cursor = args[0]
        else:
            cursor = self.mgc.find(*args, **kwargs)

        rows = []
        for doc in cursor:
            for segment in doc[self.segment_field]:
                row = {'path': doc[self.path_field]}
                for tag_key in kv_tag_keys:
                    # a tag the document does not have shows up as None
                    row[tag_key] = doc[self.kv_tag_field].get(tag_key)
                row.update(segment['fv'])
                row['offset_s'] = segment['offset_s']
                row['duration'] = segment['duration']
                rows.append(row)

        frame = pd.DataFrame(rows)
        column_order = ['path'] + kv_tag_keys + ['offset_s', 'duration']
        return reorder_columns_as(frame, column_order)
Esempio n. 2
0
def get_info_df(store, keys=None, info=None, cols=None):
    """
    Build a DataFrame describing the entries of `store`, one row per key (the index of the result).

    The rows are built from get_info_dict(store). A 'shape' column is dropped if present; 'nrows' and 'ncols'
    are created when absent, and rows where 'ncols' is missing get both values from the stored object itself
    (len(obj) and len(obj.columns)).

    :param store: object supporting store.keys() and store[key]
    :param keys: keys to report on (all keys of the store when falsy); keys not in the store are dropped
    :param info: either a dict {column_name: function}, in which case each function is applied to store[key]
        to fill a new column, or an iterable of strings handed to daf_manip.filter_columns
    :param cols: if truthy, handed to daf_manip.filter_columns as a final column filter
    :return: the info DataFrame, indexed and sorted by key, with NaN replaced by ''
    :raises ValueError: if `info` is truthy but neither a dict nor an iterable of strings
    """
    # text type that works on both Python 2 (basestring) and Python 3 (str)
    try:
        string_types = basestring
    except NameError:
        string_types = str

    # process inputs
    if not keys:
        keys = store.keys()
    else:
        keys = util_ulist.ascertain_list(keys)
        keys = colloc.intersect(keys, store.keys())
    # get info_dict
    info_dict = get_info_dict(store)
    # make the df (.items() exists on both Python 2 and 3, unlike .iteritems())
    df = pd.DataFrame(
        [dict(v, **{'key': k}) for k, v in info_dict.items()])
    # explicit copy: the frame is modified below, so it must not be a slice of the unfiltered one
    df = df[df['key'].isin(keys)].copy()
    if 'shape' in df.columns:
        del df['shape']
    if 'ncols' not in df.columns:
        df['ncols'] = np.nan
    if 'nrows' not in df.columns:
        df['nrows'] = np.nan
    # get ncols and nrows with missing
    # (Series.nonzero() is gone from recent pandas; np.flatnonzero yields the same positions)
    missing_positions = np.flatnonzero(
        df['ncols'].isnull().values
    )  # ncols and nrows should both be missing when one is
    nrows_pos = df.columns.get_loc('nrows')
    ncols_pos = df.columns.get_loc('ncols')
    for i in missing_positions:
        d = store[df['key'].iloc[i]]
        # one indexing operation on the frame itself: chained df[col].iloc[i] = ... may write to a temporary
        df.iloc[i, nrows_pos] = len(d)
        df.iloc[i, ncols_pos] = len(d.columns)
    # clean up and return
    df = df.set_index('key')
    df = df.sort_index()
    df = daf_manip.reorder_columns_as(
        df, ['nrows', 'ncols', 'isa', 'typ', 'indexers', 'dc'])
    df = df.replace(to_replace=np.nan, value='')
    if info:
        if isinstance(info, dict):
            # add as many columns as there are keys in dict, using the values of the dict as functions applied to
            # the whole stored dataframe to get the column value
            df = pd.concat(
                [df, pd.DataFrame(columns=list(info.keys()), index=df.index)],
                axis=1)
            for key in df.index.values:
                key_data = store[key]
                for k, v in info.items():
                    # single-cell write on the frame itself (no chained assignment)
                    df.at[key, k] = v(key_data)
        elif all(isinstance(x, string_types) for x in info):
            # built-in all() over a generator: np.all(map(...)) is always truthy when map() is lazy
            df = daf_manip.filter_columns(df, info)
        else:
            raise ValueError('Unrecognized info format')
    # filter cols
    if cols:
        df = daf_manip.filter_columns(df, cols)
    return df
Esempio n. 3
0
File: mg.py Progetto: yz-/ut
 def get_data_with_tags(self, *args, **kwargs):
     """
     Flatten the segments of the selected documents into a DataFrame, one row per segment.

     Each row holds the document's path and tags, the entries of the segment's 'fv' mapping, and the segment's
     'offset_s' and 'duration'.

     :param args: either a Cursor as first argument (iterated as is), or positional arguments for self.mgc.find
     :param kwargs: keyword arguments for self.mgc.find (unused when a Cursor is given)
     :return: the DataFrame of rows, passed through reorder_columns_as with
         ['path', 'tags', 'offset_s', 'duration']
     """
     if args and isinstance(args[0], Cursor):
         cursor = args[0]
     else:
         cursor = self.mgc.find(*args, **kwargs)

     rows = []
     for doc in cursor:
         for segment in doc['segments']:
             row = {'path': doc[self.path_field], 'tags': doc[self.tag_field]}
             row.update(segment['fv'])
             row['offset_s'] = segment['offset_s']
             row['duration'] = segment['duration']
             rows.append(row)

     frame = pd.DataFrame(rows)
     return reorder_columns_as(frame, ['path', 'tags', 'offset_s', 'duration'])
Esempio n. 4
0
File: pstore.py Progetto: yz-/ut
def get_info_df(store, keys=None, info=None, cols=None):
    """
    Build a DataFrame describing the entries of `store`, one row per key (the index of the result).

    The rows are built from get_info_dict(store). A 'shape' column is dropped if present; 'nrows' and 'ncols'
    are created when absent, and rows where 'ncols' is missing get both values from the stored object itself
    (len(obj) and len(obj.columns)).

    :param store: object supporting store.keys() and store[key]
    :param keys: keys to report on (all keys of the store when falsy); keys not in the store are dropped
    :param info: either a dict {column_name: function}, in which case each function is applied to store[key]
        to fill a new column, or an iterable of strings handed to daf_manip.filter_columns
    :param cols: if truthy, handed to daf_manip.filter_columns as a final column filter
    :return: the info DataFrame, indexed and sorted by key, with NaN replaced by ''
    :raises ValueError: if `info` is truthy but neither a dict nor an iterable of strings
    """
    # text type that works on both Python 2 (basestring) and Python 3 (str)
    try:
        string_types = basestring
    except NameError:
        string_types = str

    # process inputs
    if not keys:
        keys = store.keys()
    else:
        keys = util_ulist.ascertain_list(keys)
        keys = colloc.intersect(keys, store.keys())
    # get info_dict
    info_dict = get_info_dict(store)
    # make the df (.items() exists on both Python 2 and 3, unlike .iteritems())
    df = pd.DataFrame([dict(v, **{'key': k}) for k, v in info_dict.items()])
    # explicit copy: the frame is modified below, so it must not be a slice of the unfiltered one
    df = df[df['key'].isin(keys)].copy()
    if 'shape' in df.columns:
        del df['shape']
    if 'ncols' not in df.columns:
        df['ncols'] = np.nan
    if 'nrows' not in df.columns:
        df['nrows'] = np.nan
    # get ncols and nrows with missing (ncols and nrows should both be missing when one is)
    # Series.nonzero() is gone from recent pandas; np.flatnonzero yields the same positions
    missing_positions = np.flatnonzero(df['ncols'].isnull().values)
    nrows_pos = df.columns.get_loc('nrows')
    ncols_pos = df.columns.get_loc('ncols')
    for i in missing_positions:
        d = store[df['key'].iloc[i]]
        # one indexing operation on the frame itself: chained df[col].iloc[i] = ... may write to a temporary
        df.iloc[i, nrows_pos] = len(d)
        df.iloc[i, ncols_pos] = len(d.columns)
    # clean up and return
    df = df.set_index('key')
    df = df.sort_index()
    df = daf_manip.reorder_columns_as(df, ['nrows', 'ncols', 'isa', 'typ', 'indexers', 'dc'])
    df = df.replace(to_replace=np.nan, value='')
    if info:
        if isinstance(info, dict):
            # add as many columns as there are keys in dict, using the values of the dict as functions applied to
            # the whole stored dataframe to get the column value
            df = pd.concat([df, pd.DataFrame(columns=list(info.keys()), index=df.index)], axis=1)
            for key in df.index.values:
                key_data = store[key]
                for k, v in info.items():
                    # single-cell write on the frame itself (no chained assignment)
                    df.at[key, k] = v(key_data)
        elif all(isinstance(x, string_types) for x in info):
            # built-in all() over a generator: np.all(map(...)) is always truthy when map() is lazy
            df = daf_manip.filter_columns(df, info)
        else:
            raise ValueError('Unrecognized info format')
    # filter cols
    if cols:
        df = daf_manip.filter_columns(df, cols)
    return df
Esempio n. 5
0
 def get_data_with_tags(self, *args, **kwargs):
     """
     Return a DataFrame with one row per segment of the selected documents.

     A row is made of the document's path and tags, the entries of the segment's 'fv' mapping, and the segment's
     'offset_s' and 'duration'.

     :param args: a Cursor as first argument is iterated directly; otherwise args go to self.mgc.find
     :param kwargs: keyword arguments for self.mgc.find (unused when a Cursor is given)
     :return: the DataFrame of rows, passed through reorder_columns_as with
         ['path', 'tags', 'offset_s', 'duration']
     """
     cursor_given = len(args) > 0 and isinstance(args[0], Cursor)
     documents = args[0] if cursor_given else self.mgc.find(*args, **kwargs)

     records = []
     for document in documents:
         for seg in document['segments']:
             record = {
                 'path': document[self.path_field],
                 'tags': document[self.tag_field],
             }
             record.update(seg['fv'])
             # written last, so these two win over same-named 'fv' entries
             record['offset_s'] = seg['offset_s']
             record['duration'] = seg['duration']
             records.append(record)

     leading_columns = ['path', 'tags', 'offset_s', 'duration']
     return reorder_columns_as(pd.DataFrame(records), leading_columns)
Esempio n. 6
0
File: mg.py Progetto: yz-/ut
    def get_data_with_kv_tags(self, *args, **kwargs):
        """
        Return a DataFrame with one row per segment of the selected documents, including key-value tags.

        A row is made of the document's path, one column per requested key-value tag, the entries of the
        segment's 'fv' mapping, and the segment's 'offset_s' and 'duration'.

        :param args: a Cursor as first argument is iterated directly; otherwise args go to self.mgc.find
        :param kwargs: keyword arguments for self.mgc.find. 'kv_tag_keys', if present, is removed first and lists
            the tag keys to extract (default: ['move_direction', 'vehicle_type'])
        :return: the DataFrame of rows, passed through reorder_columns_as with
            ['path'] + kv_tag_keys + ['offset_s', 'duration']
        """
        # consume our own option so that it is not forwarded to find()
        kv_tag_keys = kwargs.pop('kv_tag_keys', ['move_direction', 'vehicle_type'])

        cursor_given = len(args) > 0 and isinstance(args[0], Cursor)
        documents = args[0] if cursor_given else self.mgc.find(*args, **kwargs)

        records = []
        for document in documents:
            for seg in document[self.segment_field]:
                record = {'path': document[self.path_field]}
                # tags absent from the document are recorded as None
                record.update({
                    tag_key: document[self.kv_tag_field].get(tag_key)
                    for tag_key in kv_tag_keys
                })
                record.update(seg['fv'])
                record['offset_s'] = seg['offset_s']
                record['duration'] = seg['duration']
                records.append(record)

        table = pd.DataFrame(records)
        return reorder_columns_as(table, ['path'] + kv_tag_keys + ['offset_s', 'duration'])
Esempio n. 7
0
def test_classifiers(
        X,
        y,
        scoring=default_scorers,
        score_aggreg=default_score_aggreg,
        n_features=7,  # an int will be transformed to a list (with different num of features) of given size
        clfs=None,
        nfolds=10,
        scale=None,
        decompose=None,
        select=None,
        decompose_params={},
        print_progress=False,
        score_to_plot=None):
    """
    Tests and scores (given by SCORING and SCORE_AGGREG) several classifiers (given by clfs) with several number of
    features, returning a pandas DataFrame of the results.

    :param X: feature matrix; np.shape(X)[1] is taken as the total number of features
    :param y: labels (coerced to 6-byte strings, see note in the body)
    :param scoring: passed to score_classifier (default_scorers when falsy)
    :param score_aggreg: passed to score_classifier (default_score_aggreg when falsy)
    :param n_features: the feature counts to try. An int is taken as the number of different counts to try and is
        expanded into an evenly spaced list of counts starting at 1
    :param clfs: classifiers to test (default_classifiers when None), normalised with clfs_to_dict_clfs
    :param nfolds: passed to score_classifier
    :param scale: True means preprocessing.StandardScaler(); any other value that is not None/False is used as is
    :param decompose: True means decomposition.PCA(**decompose_params); any other value that is not None/False is
        used as is
    :param select: passed to score_classifier
    :param decompose_params: keyword arguments for the PCA created when decompose is True; also passed to
        score_classifier
    :param print_progress: if truthy, each (classifier, nfeats) combination is reported through printProgress
    :param score_to_plot: if truthy, plot_score is called on the results; True selects the name built from the
        first score aggregate and the first score
    :return: DataFrame with one row per (nfeats, classifier) combination, passed through reorder_columns_as with
        ['model', 'nfeats', 'seconds']
    """
    scoring = scoring or default_scorers
    score_aggreg = score_aggreg or default_score_aggreg

    if isinstance(n_features, int):
        # if n_features is an int, it's the number of different feature set lens to try out
        # ... so make this feature set len list
        total_n_features = np.shape(X)[1]
        # Floor division keeps the step an int on Python 2 and 3 alike. A step of 0 (more sizes requested than
        # there are features) would make range() raise, so fall back to trying every size.
        step = (total_n_features // n_features) or 1
        n_features = range(1, total_n_features + 1, step)[:n_features]
    # NOTE(review): "|S6" stores labels as 6-byte strings, so longer labels are truncated - confirm this is intended
    y = np.asarray(y, dtype="|S6")
    n_features = np.array(n_features)

    if clfs is None:
        clfs = default_classifiers

    clfs = clfs_to_dict_clfs(clfs)

    general_info_dict = dict()
    if scale is not None and scale is not False:  # preprocessing.StandardScaler(), preprocessing.MinMaxScaler()
        if scale is True:
            scale = preprocessing.StandardScaler()
        general_info_dict['scale'] = get_name(scale)
    if decompose is not None and decompose is not False:
        if decompose is True:
            decompose = decomposition.PCA(
                **decompose_params
            )  # PCA, KernelPCA, ProbabilisticPCA, RandomizedPCA, TruncatedSVD
        general_info_dict['decompose'] = get_name(decompose)

    clf_results = list()

    for nfeats in n_features:
        for clf_dict in clfs:
            # each element of clfs is used as a one-entry {name: classifier} dict;
            # list(...) makes the indexing work where keys() returns a view
            clf_name = list(clf_dict.keys())[0]
            clf = clf_dict[clf_name]
            d = dict(general_info_dict, **{
                'model': clf_name,
                'nfeats': nfeats
            })
            if print_progress:
                printProgress("{}: nfeats={}, nfolds={}".format(
                    clf_name, nfeats, nfolds))
            start_time = datetime.now()
            score_result = score_classifier(
                X,
                y,
                clf=clf,
                nfeats=nfeats,
                scoring=scoring,
                score_aggreg=score_aggreg,
                nfolds=nfolds,
                scale=scale,
                decompose=decompose,
                select=select,
                decompose_params=decompose_params)
            d['seconds'] = (datetime.now() - start_time).total_seconds()
            d.update(score_result.to_dict())

            clf_results.append(d)  # accumulate results

    clf_results = pd.DataFrame(clf_results)
    if score_to_plot:
        if score_to_plot is True:
            score_to_plot = mk_aggreg_score_name(
                score_aggreg_name=list(
                    mk_score_aggreg_dict(score_aggreg).keys())[0],
                score_name=list(mk_scoring_dict(scoring).keys())[0])
        plot_score(clf_results, score_to_plot)

    return reorder_columns_as(clf_results, ['model', 'nfeats', 'seconds'])
Esempio n. 8
0
 def order_vars(self, var_list, sort_pts=True):
     """
     Reorder the columns of self.tb according to var_list, then optionally call self.sort_pts().

     :param var_list: variable(s) to order by; passed through ascertain_list before use
     :param sort_pts: if truthy, self.sort_pts() is called after the reordering
     :return: self
     """
     ordered_vars = ascertain_list(var_list)
     self.tb = reorder_columns_as(self.tb, ordered_vars)
     if not sort_pts:
         return self
     self.sort_pts()
     return self
Esempio n. 9
0
def test_classifiers(X, y,
                 scoring=default_scorers,
                 score_aggreg=default_score_aggreg,
                 n_features=7,  # an int will be transformed to a list (with different num of features) of given size
                 clfs=None,
                 nfolds=10,
                 scale=None,
                 decompose=None,
                 select=None,
                 decompose_params={},
                 print_progress=False,
                 score_to_plot=None
                 ):
    """
    Tests and scores (given by SCORING and SCORE_AGGREG) several classifiers (given by clfs) with several number of
    features, returning a pandas DataFrame of the results.

    :param X: feature matrix; np.shape(X)[1] is taken as the total number of features
    :param y: labels (coerced to 6-byte strings, see note in the body)
    :param scoring: passed to score_classifier (default_scorers when falsy)
    :param score_aggreg: passed to score_classifier (default_score_aggreg when falsy)
    :param n_features: the feature counts to try. An int is taken as the number of different counts to try and is
        expanded into an evenly spaced list of counts starting at 1
    :param clfs: classifiers to test (default_classifiers when None), normalised with clfs_to_dict_clfs
    :param nfolds: passed to score_classifier
    :param scale: True means preprocessing.StandardScaler(); any other value that is not None/False is used as is
    :param decompose: True means decomposition.PCA(**decompose_params); any other value that is not None/False is
        used as is
    :param select: passed to score_classifier
    :param decompose_params: keyword arguments for the PCA created when decompose is True; also passed to
        score_classifier
    :param print_progress: if truthy, each (classifier, nfeats) combination is reported through printProgress
    :param score_to_plot: if truthy, plot_score is called on the results; True selects the name built from the
        first score aggregate and the first score
    :return: DataFrame with one row per (nfeats, classifier) combination, passed through reorder_columns_as with
        ['model', 'nfeats', 'seconds']
    """
    scoring = scoring or default_scorers
    score_aggreg = score_aggreg or default_score_aggreg

    if isinstance(n_features, int):  # if n_features is an int, it's the number of different feature set lens to try out
        # ... so make this feature set len list
        total_n_features = np.shape(X)[1]
        # range() needs an integer step: np.floor returns a float, which range() rejects, so use floor division.
        # A step of 0 (more sizes requested than there are features) would also make range() raise, so fall back
        # to trying every size.
        step = (total_n_features // n_features) or 1
        n_features = range(1, total_n_features + 1, step)[:n_features]
    # NOTE(review): "|S6" stores labels as 6-byte strings, so longer labels are truncated - confirm this is intended
    y = np.asarray(y, dtype="|S6")
    n_features = np.array(n_features)

    if clfs is None:
        clfs = default_classifiers

    clfs = clfs_to_dict_clfs(clfs)

    general_info_dict = dict()
    if scale is not None and scale is not False:  # preprocessing.StandardScaler(), preprocessing.MinMaxScaler()
        if scale is True:
            scale = preprocessing.StandardScaler()
        general_info_dict['scale'] = get_name(scale)
    if decompose is not None and decompose is not False:
        if decompose is True:
            decompose = decomposition.PCA(**decompose_params)  # PCA, KernelPCA, ProbabilisticPCA, RandomizedPCA, TruncatedSVD
        general_info_dict['decompose'] = get_name(decompose)

    clf_results = list()

    for nfeats in n_features:
        for clf_dict in clfs:
            # each element of clfs is used as a one-entry {name: classifier} dict;
            # list(...) makes the indexing work where keys() returns a view
            clf_name = list(clf_dict.keys())[0]
            clf = clf_dict[clf_name]
            d = dict(general_info_dict, **{'model': clf_name, 'nfeats': nfeats})
            if print_progress:
                printProgress("{}: nfeats={}, nfolds={}".format(clf_name, nfeats, nfolds))
            start_time = datetime.now()
            score_result = score_classifier(X,
                                            y,
                                            clf=clf,
                                            nfeats=nfeats,
                                            scoring=scoring,
                                            score_aggreg=score_aggreg,
                                            nfolds=nfolds,
                                            scale=scale,
                                            decompose=decompose,
                                            select=select,
                                            decompose_params=decompose_params)
            d['seconds'] = (datetime.now() - start_time).total_seconds()
            d.update(score_result.to_dict())

            clf_results.append(d)  # accumulate results

    clf_results = pd.DataFrame(clf_results)
    if score_to_plot:
        if score_to_plot is True:
            score_to_plot = mk_aggreg_score_name(
                score_aggreg_name=list(mk_score_aggreg_dict(score_aggreg).keys())[0],
                score_name=list(mk_scoring_dict(scoring).keys())[0])
        plot_score(clf_results, score_to_plot)

    return reorder_columns_as(clf_results, ['model', 'nfeats', 'seconds'])
Esempio n. 10
0
 def order_cols(self, df):
     """
     Return df with its columns arranged by daf_manip.reorder_columns_as, using self.col_order_01 as the order.

     :param df: the DataFrame whose columns are to be reordered
     :return: whatever daf_manip.reorder_columns_as returns for (df, self.col_order_01)
     """
     column_order = self.col_order_01
     reordered = daf_manip.reorder_columns_as(df, column_order)
     return reordered
Esempio n. 11
0
File: pot.py Progetto: yz-/ut
 def order_vars(self, var_list, sort_pts=True):
     """
     Put the columns of self.tb in the order given by var_list and, unless disabled, call self.sort_pts().

     :param var_list: variable(s) giving the column order; normalised with ascertain_list
     :param sort_pts: when truthy (the default), self.sort_pts() is called after the columns are reordered
     :return: self, so that calls can be chained
     """
     var_order = ascertain_list(var_list)
     reordered_tb = reorder_columns_as(self.tb, var_order)
     self.tb = reordered_tb
     if sort_pts:
         self.sort_pts()
     return self