def get_data_with_kv_tags(self, *args, **kwargs):
    """
    Build a flat DataFrame of segment feature-vectors, one row per segment,
    annotated with the document's path and selected key-value tags.

    Accepts either a Cursor as the first positional argument, or find()
    arguments that are forwarded to self.mgc.find(*args, **kwargs).

    Keyword-only (consumed from kwargs):
        kv_tag_keys: list of tag keys to pull from the document's kv-tag
            field (default: ['move_direction', 'vehicle_type']).

    Returns a DataFrame with columns ordered as
    ['path'] + kv_tag_keys + ['offset_s', 'duration'] + feature columns.
    """
    # Single pop-with-default replaces the original check/get/pop sequence;
    # the key must be removed either way so it isn't forwarded to find().
    kv_tag_keys = kwargs.pop('kv_tag_keys', ['move_direction', 'vehicle_type'])
    if len(args) > 0 and isinstance(args[0], Cursor):
        c = args[0]
    else:
        c = self.mgc.find(*args, **kwargs)
    d = list()
    for ci in c:
        for seg in ci[self.segment_field]:
            dd = {'path': ci[self.path_field]}
            for tag_key in kv_tag_keys:
                # missing tags become None so every row has the same columns
                dd[tag_key] = ci[self.kv_tag_field].get(tag_key, None)
            dd.update(seg['fv'])
            dd.update({'offset_s': seg['offset_s'],
                       'duration': seg['duration']})
            d.append(dd)
    d = reorder_columns_as(pd.DataFrame(d),
                           ['path'] + kv_tag_keys + ['offset_s', 'duration'])
    return d
def get_info_df(store, keys=None, info=None, cols=None):
    """
    Build a DataFrame summarizing the items of a key-value store of
    DataFrames (one row per key, indexed by key).

    Args:
        store: mapping-like store of DataFrames; must support .keys() and
            item access by key.
        keys: optional key or list of keys to restrict the summary to;
            intersected with store.keys().
        info: optional extras — either a dict mapping new column names to
            functions applied to each stored DataFrame, or a list of
            column names to keep.
        cols: optional final column filter.

    Raises:
        ValueError: if info is neither a dict nor a list of strings.
    """
    # process inputs
    if not keys:
        keys = store.keys()
    else:
        keys = util_ulist.ascertain_list(keys)
        keys = colloc.intersect(keys, store.keys())
    # get info_dict
    info_dict = get_info_dict(store)
    # make the df
    df = pd.DataFrame(
        [dict(v, **{'key': k}) for k, v in info_dict.iteritems()])
    df = df[df['key'].isin(keys)]
    if 'shape' in df.columns:
        del df['shape']
    if 'ncols' not in df.columns:
        df['ncols'] = np.nan
    if 'nrows' not in df.columns:
        df['nrows'] = np.nan
    # fill in missing ncols/nrows by loading the stored data;
    # ncols and nrows should both be missing when one is.
    # np.nonzero on the values replaces the deprecated Series.nonzero().
    idx = np.nonzero(df['ncols'].isnull().values)[0]
    for i in idx:
        d = store[df['key'].iloc[i]]
        # single .loc assignment instead of chained df['nrows'].iloc[i] = ...,
        # which can silently write to a temporary copy (SettingWithCopy)
        df.loc[df.index[i], 'nrows'] = len(d)
        df.loc[df.index[i], 'ncols'] = len(d.columns)
    # clean up and return
    df = df.set_index('key')
    df = df.sort_index()
    df = daf_manip.reorder_columns_as(
        df, ['nrows', 'ncols', 'isa', 'typ', 'indexers', 'dc'])
    df = df.replace(to_replace=np.nan, value='')
    if info:
        if isinstance(info, dict):
            # add as many columns as there are keys in dict, using the values
            # of the dict as functions applied to the whole stored dataframe
            # to get the column value
            df = pd.concat(
                [df, pd.DataFrame(columns=info.keys(), index=df.index)],
                axis=1)
            for key in df.index.values:
                key_data = store[key]
                for k, v in info.iteritems():
                    # row-label + column .loc assignment avoids the chained
                    # df[k].loc[key] = ... copy pitfall
                    df.loc[key, k] = v(key_data)
        elif np.all(map(lambda x: isinstance(x, basestring), info)):
            df = daf_manip.filter_columns(df, info)
        else:
            raise ValueError('Unrecognized info format')
    # filter cols
    if cols:
        df = daf_manip.filter_columns(df, cols)
    return df
def get_data_with_tags(self, *args, **kwargs):
    """
    Build a flat DataFrame of segment feature-vectors, one row per segment,
    annotated with the document's path and its tags.

    Accepts either a Cursor as the first positional argument, or find()
    arguments that are forwarded to self.mgc.find(*args, **kwargs).

    Returns a DataFrame with columns ordered as
    ['path', 'tags', 'offset_s', 'duration'] + feature columns.
    """
    if len(args) > 0 and isinstance(args[0], Cursor):
        c = args[0]
    else:
        c = self.mgc.find(*args, **kwargs)
    d = list()
    for ci in c:
        # use self.segment_field instead of the hard-coded 'segments' key,
        # for consistency with get_data_with_kv_tags
        for seg in ci[self.segment_field]:
            dd = {'path': ci[self.path_field], 'tags': ci[self.tag_field]}
            dd.update(seg['fv'])
            dd.update({'offset_s': seg['offset_s'],
                       'duration': seg['duration']})
            d.append(dd)
    d = reorder_columns_as(pd.DataFrame(d),
                           ['path', 'tags', 'offset_s', 'duration'])
    return d
def get_info_df(store, keys=None, info=None, cols=None):
    """
    Build a DataFrame summarizing the items of a key-value store of
    DataFrames (one row per key, indexed by key).

    Args:
        store: mapping-like store of DataFrames; must support .keys() and
            item access by key (presumably an HDF-like store — TODO confirm).
        keys: optional key or list of keys to restrict the summary to;
            intersected with store.keys().
        info: optional extras — either a dict mapping new column names to
            functions applied to each stored DataFrame, or a list of
            column names to keep.
        cols: optional final column filter.

    Raises:
        ValueError: if info is neither a dict nor a list of strings.
    """
    # process inputs
    if not keys:
        keys = store.keys()
    else:
        keys = util_ulist.ascertain_list(keys)
        keys = colloc.intersect(keys, store.keys())
    # get info_dict
    info_dict = get_info_dict(store)
    # make the df: one row per store key, with the key kept as a column
    df = pd.DataFrame([dict(v, **{'key': k}) for k, v in info_dict.iteritems()])
    df = df[df['key'].isin(keys)]
    if 'shape' in df.columns:
        del df['shape']
    # ensure the two size columns exist even when info_dict lacks them
    if 'ncols' not in df.columns:
        df['ncols'] = np.nan
    if 'nrows' not in df.columns:
        df['nrows'] = np.nan
    # get ncols and nrows with missing, by loading the stored data itself
    idx = df['ncols'].isnull().nonzero()[0]  # ncols and nrows should both be missing when one is
    for i in idx:
        d = store[df['key'].iloc[i]]
        # NOTE(review): chained indexing assignment — may trigger
        # SettingWithCopyWarning on newer pandas; verify it still writes through
        df['nrows'].iloc[i] = len(d)
        df['ncols'].iloc[i] = len(d.columns)
    # clean up and return
    df = df.set_index('key')
    df = df.sort_index()
    df = daf_manip.reorder_columns_as(df, ['nrows', 'ncols', 'isa', 'typ', 'indexers', 'dc'])
    # blank out NaNs for display
    df = df.replace(to_replace=np.nan, value='')
    if info:
        if isinstance(info, dict):
            # add as many columns as there are keys in dict, using the values of the dict as functions applied to
            # the whole stored dataframe to get the column value
            df = pd.concat([df, pd.DataFrame(columns=info.keys(), index=df.index)], axis=1)
            for key in df.index.values:
                key_data = store[key]
                for k, v in info.iteritems():
                    df[k].loc[key] = v(key_data)
        elif np.all(map(lambda x: isinstance(x, basestring), info)):
            # a list of strings is treated as a column selection
            df = daf_manip.filter_columns(df, info)
        else:
            raise ValueError('Unrecognized info format')
    # filter cols
    if cols:
        df = daf_manip.filter_columns(df, cols)
    return df
def get_data_with_tags(self, *args, **kwargs):
    """
    Return a DataFrame with one row per segment of each matched document,
    carrying the document's path and tags alongside the segment's
    feature-values, offset and duration.

    The first positional argument may be a ready-made Cursor; otherwise all
    arguments are forwarded to self.mgc.find().
    """
    if len(args) > 0 and isinstance(args[0], Cursor):
        cursor = args[0]
    else:
        cursor = self.mgc.find(*args, **kwargs)
    rows = []
    for doc in cursor:
        for segment in doc['segments']:
            row = {'path': doc[self.path_field], 'tags': doc[self.tag_field]}
            row.update(segment['fv'])
            row['offset_s'] = segment['offset_s']
            row['duration'] = segment['duration']
            rows.append(row)
    frame = pd.DataFrame(rows)
    return reorder_columns_as(frame, ['path', 'tags', 'offset_s', 'duration'])
def get_data_with_kv_tags(self, *args, **kwargs):
    """
    Return a DataFrame with one row per segment of each matched document,
    carrying the document's path and a chosen set of key-value tags
    alongside the segment's feature-values, offset and duration.

    The first positional argument may be a ready-made Cursor; otherwise all
    arguments are forwarded to self.mgc.find(). The 'kv_tag_keys' keyword
    (default ['move_direction', 'vehicle_type']) selects which tags to
    extract, and is stripped from kwargs before the find() call.
    """
    if 'kv_tag_keys' in kwargs:
        kv_tag_keys = kwargs.pop('kv_tag_keys')
    else:
        kv_tag_keys = ['move_direction', 'vehicle_type']
    if len(args) > 0 and isinstance(args[0], Cursor):
        cursor = args[0]
    else:
        cursor = self.mgc.find(*args, **kwargs)
    rows = []
    for doc in cursor:
        for segment in doc[self.segment_field]:
            row = {'path': doc[self.path_field]}
            for tag_key in kv_tag_keys:
                # absent tags are filled with None
                row.update({tag_key: doc[self.kv_tag_field].get(tag_key, None)})
            row.update(segment['fv'])
            row.update({'offset_s': segment['offset_s'],
                        'duration': segment['duration']})
            rows.append(row)
    column_order = ['path'] + kv_tag_keys + ['offset_s', 'duration']
    return reorder_columns_as(pd.DataFrame(rows), column_order)
def test_classifiers(X, y,
                     scoring=default_scorers,
                     score_aggreg=default_score_aggreg,
                     # an int will be transformed to a list (with different
                     # num of features) of given size
                     n_features=7,
                     clfs=None,
                     nfolds=10,
                     scale=None,
                     decompose=None,
                     select=None,
                     decompose_params=None,
                     print_progress=False,
                     score_to_plot=None):
    """
    Tests and scores (given by SCORING and SCORE_AGGREG) several classifiers
    (given by clfs) with several number of features, returning a pandas
    DataFrame of the results (columns ordered as model, nfeats, seconds, ...).
    """
    # decompose_params defaulted to None instead of {} to avoid the shared
    # mutable-default-argument pitfall; each call gets its own fresh dict
    if decompose_params is None:
        decompose_params = {}
    scoring = scoring or default_scorers
    score_aggreg = score_aggreg or default_score_aggreg
    if isinstance(n_features, int):
        # if n_features is an int, it's the number of different feature set
        # lens to try out... so make this feature set len list
        total_n_features = np.shape(X)[1]
        n_features = range(1, total_n_features + 1,
                           int(np.floor(total_n_features / n_features)))[:n_features]
    y = np.asarray(y, dtype="|S6")
    n_features = np.array(n_features)
    if clfs is None:
        clfs = default_classifiers
    clfs = clfs_to_dict_clfs(clfs)
    general_info_dict = dict()
    if scale is not None and scale is not False:
        # e.g. preprocessing.StandardScaler(), preprocessing.MinMaxScaler()
        if scale is True:
            scale = preprocessing.StandardScaler()
        general_info_dict['scale'] = get_name(scale)
    if decompose is not None and decompose is not False:
        if decompose is True:
            # PCA, KernelPCA, ProbabilisticPCA, RandomizedPCA, TruncatedSVD
            decompose = decomposition.PCA(**decompose_params)
        general_info_dict['decompose'] = get_name(decompose)
    clf_results = list()
    for i_nfeats, nfeats in enumerate(n_features):
        for i_clf, clf in enumerate(clfs):
            clf_name = clf.keys()[0]
            clf = clf[clf_name]
            d = dict(general_info_dict, **{'model': clf_name,
                                           'nfeats': nfeats})
            if print_progress:
                printProgress("{}: nfeats={}, nfolds={}".format(
                    clf_name, n_features[i_nfeats], nfolds))
            start_time = datetime.now()
            score_result = score_classifier(
                X, y, clf=clf, nfeats=nfeats,
                scoring=scoring, score_aggreg=score_aggreg,
                nfolds=nfolds, scale=scale, decompose=decompose,
                select=select, decompose_params=decompose_params)
            d.update({'seconds': (datetime.now() - start_time).total_seconds()})
            d.update(score_result.to_dict())
            clf_results.append(d)  # accumulate results
    clf_results = pd.DataFrame(clf_results)
    if score_to_plot:
        if score_to_plot is True:
            # derive the default score name from the first scorer/aggregator
            score_to_plot = mk_aggreg_score_name(
                score_aggreg_name=mk_score_aggreg_dict(score_aggreg).keys()[0],
                score_name=mk_scoring_dict(scoring).keys()[0])
        plot_score(clf_results, score_to_plot)
    return reorder_columns_as(clf_results, ['model', 'nfeats', 'seconds'])
def order_vars(self, var_list, sort_pts=True):
    """
    Reorder the columns of self.tb to match var_list (a single name or a
    list of names), optionally re-sorting the points afterwards.

    Returns self, so calls can be chained.
    """
    desired_order = ascertain_list(var_list)
    self.tb = reorder_columns_as(self.tb, desired_order)
    if sort_pts:
        self.sort_pts()
    return self
def test_classifiers(X, y,
                     scoring=default_scorers,
                     score_aggreg=default_score_aggreg,
                     # an int will be transformed to a list (with different
                     # num of features) of given size
                     n_features=7,
                     clfs=None,
                     nfolds=10,
                     scale=None,
                     decompose=None,
                     select=None,
                     decompose_params=None,
                     print_progress=False,
                     score_to_plot=None):
    """
    Tests and scores (given by SCORING and SCORE_AGGREG) several classifiers
    (given by clfs) with several number of features, returning a pandas
    DataFrame of the results (columns ordered as model, nfeats, seconds, ...).
    """
    # decompose_params defaulted to None instead of {} to avoid the shared
    # mutable-default-argument pitfall; each call gets its own fresh dict
    if decompose_params is None:
        decompose_params = {}
    scoring = scoring or default_scorers
    score_aggreg = score_aggreg or default_score_aggreg
    if isinstance(n_features, int):
        # if n_features is an int, it's the number of different feature set
        # lens to try out... so make this feature set len list.
        # int() around np.floor is required: range() rejects a float step.
        total_n_features = np.shape(X)[1]
        n_features = range(1, total_n_features + 1,
                           int(np.floor(total_n_features / n_features)))[:n_features]
    y = np.asarray(y, dtype="|S6")
    n_features = np.array(n_features)
    if clfs is None:
        clfs = default_classifiers
    clfs = clfs_to_dict_clfs(clfs)
    general_info_dict = dict()
    if scale is not None and scale is not False:
        # e.g. preprocessing.StandardScaler(), preprocessing.MinMaxScaler()
        if scale is True:
            scale = preprocessing.StandardScaler()
        general_info_dict['scale'] = get_name(scale)
    if decompose is not None and decompose is not False:
        if decompose is True:
            # PCA, KernelPCA, ProbabilisticPCA, RandomizedPCA, TruncatedSVD
            decompose = decomposition.PCA(**decompose_params)
        general_info_dict['decompose'] = get_name(decompose)
    clf_results = list()
    for i_nfeats, nfeats in enumerate(n_features):
        for i_clf, clf in enumerate(clfs):
            clf_name = clf.keys()[0]
            clf = clf[clf_name]
            d = dict(general_info_dict, **{'model': clf_name,
                                           'nfeats': nfeats})
            if print_progress:
                printProgress("{}: nfeats={}, nfolds={}".format(
                    clf_name, n_features[i_nfeats], nfolds))
            start_time = datetime.now()
            score_result = score_classifier(
                X, y, clf=clf, nfeats=nfeats,
                scoring=scoring, score_aggreg=score_aggreg,
                nfolds=nfolds, scale=scale, decompose=decompose,
                select=select, decompose_params=decompose_params)
            d.update({'seconds': (datetime.now() - start_time).total_seconds()})
            d.update(score_result.to_dict())
            clf_results.append(d)  # accumulate results
    clf_results = pd.DataFrame(clf_results)
    if score_to_plot:
        if score_to_plot is True:
            # derive the default score name from the first scorer/aggregator
            score_to_plot = mk_aggreg_score_name(
                score_aggreg_name=mk_score_aggreg_dict(score_aggreg).keys()[0],
                score_name=mk_scoring_dict(scoring).keys()[0])
        plot_score(clf_results, score_to_plot)
    return reorder_columns_as(clf_results, ['model', 'nfeats', 'seconds'])
def order_cols(self, df):
    """Return df with its columns reordered to match self.col_order_01."""
    desired_order = self.col_order_01
    return daf_manip.reorder_columns_as(df, desired_order)