def get_catch22_features(series):
    """Compute the catch22 feature set for one time series.

    Parameters
    ----------
    series : sequence of numbers
        The time series handed straight to ``catch22.catch22_all``.

    Returns
    -------
    dict
        The unmodified result of ``catch22.catch22_all``: feature names under
        ``'names'`` and the matching feature values under ``'values'``.
    """
    # Function-scope import: the package is only looked up when this runs.
    from catch22 import catch22_all

    return catch22_all(series)
def featurize(data, sample_rate):
    """
    Replace each multivariate time series in ``data`` with a feature matrix.

    For every matrix in the list, a number of scalar "features" are computed
    from each univariate time series (row) and the matrix is replaced, in
    place, by the resulting feature matrix: position (i, j) holds the j-th
    feature computed on the i-th channel. The features are the catch22 values
    followed by whatever ``mean_var_features`` returns for that channel.

    An empty ``data`` list is left untouched. Each matrix is reshaped using
    its own number of rows, so matrices with differing channel counts are
    handled independently.

    Parameters
    ----------
    data : list(numpy.ndarray(float))
        List of matrices of data for individual subjects, size channels by
        time series length. Modified in place.
    sample_rate : int
        Sample rate in Hz, forwarded to ``mean_var_features``.

    Returns
    -------
    None
    """
    # Fixed argument forwarded to mean_var_features for every channel.
    tau = 5

    for i, matrix in enumerate(data):
        # flat list of all features of all channels of this matrix
        features = []
        for j, channel in enumerate(matrix):
            # this print statement can be removed, but lets you know that
            # things are moving
            print("working on: " + str((i, j)))
            features += catch22.catch22_all(channel.tolist())['values']
            features += mean_var_features(channel, tau, sample_rate)

        # one row per channel of *this* matrix (not of the first matrix)
        data[i] = np.array(features).reshape(matrix.shape[0], -1)
def predict_proba(self, X, check_input=True):
    """Predict class probabilities from catch22 features of ``X``.

    Parameters
    ----------
    X : nested pandas DataFrame
        One row per instance; each cell holds one series. A single column is
        treated as univariate, several columns as multivariate.
    check_input : bool, default=True
        When true the unpacked data is validated with ``check_array``;
        otherwise it is only converted to a float64 array.

    Returns
    -------
    The result of ``self.bagging_classifier_.predict_proba`` on the catch22
    feature matrix (NaN and infinite features replaced by 0).

    Raises
    ------
    ValueError
        If the unpacked data is not 2- or 3-dimensional, or its shape does
        not match ``self.n_timestep_`` / ``self.n_dims_``.
    """
    # Unpack the nested frame into plain (nested) lists.
    if len(X.iloc[0]) == 1:  # UNI
        X = [np.array(X.iloc[i].iloc[0]).tolist() for i in range(len(X))]
    else:  # MULTI
        X = [
            [np.array(X.iloc[i].iloc[j]).tolist()
             for j in range(len(X.iloc[i]))]
            for i in range(len(X))
        ]

    if check_input:
        X = check_array(X, dtype=np.float64, allow_nd=True, order="C")
    else:
        # X is a Python list at this point; the checks below need an array.
        X = np.asarray(X, dtype=np.float64)

    if X.ndim < 2 or X.ndim > 3:
        raise ValueError("illegal input dimensions X.ndim ({})".format(
            X.ndim))

    if self.n_dims_ > 1 and X.ndim != 3:
        raise ValueError("illegal input dimensions X.ndim != 3")

    if X.shape[-1] != self.n_timestep_:
        raise ValueError("illegal input shape ({} != {})".format(
            X.shape[-1], self.n_timestep_))

    if X.ndim > 2 and X.shape[1] != self.n_dims_:
        # Uses the fitted attribute ``n_dims_``; the message is now
        # well-formed (closing parenthesis added).
        raise ValueError("illegal input shape ({} != {})".format(
            X.shape[1], self.n_dims_))

    if X.dtype != np.float64 or not X.flags.contiguous:
        X = np.ascontiguousarray(X, dtype=np.float64)

    # One flat row per instance: all dimensions concatenated.
    X = X.reshape(X.shape[0], self.n_dims_ * self.n_timestep_)

    # compute catch22 features
    X_catch22 = np.array([catch22_all(series)['values'] for series in X])

    # replace the rare nans (and infinities)
    X_catch22[np.logical_or(np.isnan(X_catch22), np.isinf(X_catch22))] = 0

    return self.bagging_classifier_.predict_proba(X_catch22)
def predict_proba(self, X):
    """Predict class probabilities from catch22 features of ``X``.

    Parameters
    ----------
    X : input accepted by ``check_X`` (coerced to a numpy array)

    Returns
    -------
    The result of ``self.classifier.predict_proba`` on the feature matrix,
    after NaN and infinite feature values have been set to 0.
    """
    self.check_is_fitted()
    X = check_X(X, enforce_univariate=False, coerce_to_numpy=True)

    # Collapse everything after the instance axis into a single series.
    flattened = np.reshape(X, (X.shape[0], -1))

    X_c22 = np.array([catch22_all(row)["values"] for row in flattened])
    # In-place cleanup: NaN, +inf and -inf all become 0.
    np.nan_to_num(X_c22, copy=False, nan=0, posinf=0, neginf=0)

    return self.classifier.predict_proba(X_c22)
def feature_channel(channel, fsamp=256, band=range(0, 45)):
    """Turn the time series of one channel into a list of features.

    The result is the catch22 value list with two extra entries appended:
    the mean of the binned power and the power-weighted mean bin index.

    Args:
        channel: 1-D array-like time series.
        fsamp: sampling rate in Hz, passed to ``pyeeg.bin_power`` as ``Fs``.
        band: bin boundaries passed to ``pyeeg.bin_power`` as ``Band``.

    Returns:
        list: the catch22 values followed by the two power features.
    """
    # catch22 values; the two power features are appended to this list.
    features = catch22.catch22_all(channel)['values']

    # First element of the bin_power result is used as the per-bin power.
    band_power = pyeeg.bin_power(channel, Band=band, Fs=fsamp)[0]
    mean_power = np.mean(band_power)

    # Normalise the power to weights over the bin indices and take the
    # weighted mean index.
    weights = np.array(band_power) / np.sum(band_power)
    bin_index = np.arange(0, len(band_power))
    weighted_index = np.sum(bin_index * weights)

    features.extend([mean_power, weighted_index])
    return features
def fit(self, X, y):
    """Fit a random forest on catch22 features of the training series.

    Parameters
    ----------
    X : input accepted by ``check_X`` (coerced to a numpy array)
        Training time series, one instance per first-axis entry.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X = check_X(X, enforce_univariate=False, coerce_to_numpy=True)
    # Collapse everything after the instance axis into a single series.
    flattened = np.reshape(X, (X.shape[0], -1))

    labels_2d = np.asarray(y).reshape(-1, 1)
    self.classes_ = class_distribution(labels_2d)[0][0]

    feature_rows = [catch22_all(row)["values"] for row in flattened]

    self.classifier = RandomForestClassifier(
        n_jobs=self.n_jobs,
        n_estimators=self.n_estimators,
        random_state=self.random_state,
    )

    X_c22 = np.array(feature_rows)
    # In-place cleanup: NaN, +inf and -inf all become 0.
    np.nan_to_num(X_c22, copy=False, nan=0, posinf=0, neginf=0)

    self.classifier.fit(X_c22, y)
    self._is_fitted = True
    return self
def transform(self, X, y=None):
    """Transform the input series into their catch22 features.

    Parameters
    ----------
    X : pandas DataFrame, input time series
    y : array_like, target values (optional, ignored)

    Returns
    -------
    pandas DataFrame with one row of catch22 values per input instance
    """
    self.check_is_fitted()
    X = check_X(X, enforce_univariate=False, coerce_to_numpy=True)

    # Collapse everything after the instance axis into a single series.
    flattened = np.reshape(X, (X.shape[0], -1))

    return pd.DataFrame(
        [catch22.catch22_all(row)["values"] for row in flattened]
    )
def fit(self, X, y, sample_weight=None, check_input=True):
    """Fit a random catch22 feature forest classifier.

    Parameters
    ----------
    X : nested pandas DataFrame
        One row per instance; each cell holds one series. A single column is
        treated as univariate, several columns as multivariate.
    y : array-like
        Class labels: either one label per sample (1-d) or an indicator
        matrix with a single non-zero entry per row.
    sample_weight : optional
        Forwarded to ``BaggingClassifier.fit``.
    check_input : bool, default=True
        When true, ``X`` and ``y`` are validated with ``check_array``;
        otherwise they are only converted to numpy arrays.

    Returns
    -------
    self : object

    Raises
    ------
    ValueError
        If the unpacked data is not 2- or 3-dimensional, or the number of
        labels does not match the number of samples.
    """
    # Unpack the nested frame into plain (nested) lists.
    if len(X.iloc[0]) == 1:  # UNI
        X2 = [np.array(X.iloc[i].iloc[0]).tolist() for i in range(len(X))]
    else:  # MULTI
        X2 = [
            [np.array(X.iloc[i].iloc[j]).tolist()
             for j in range(len(X.iloc[i]))]
            for i in range(len(X))
        ]

    random_state = check_random_state(self.random_state)

    if check_input:
        X = check_array(X2, dtype=np.float64, allow_nd=True, order="C")
        y = check_array(y, ensure_2d=False)
    else:
        # Still use the unpacked data: the raw nested frame cannot be
        # reshaped or featurised below.
        X = np.asarray(X2, dtype=np.float64)
        y = np.asarray(y)

    if X.ndim < 2 or X.ndim > 3:
        raise ValueError("illegal input dimension")

    n_samples = X.shape[0]
    self.n_timestep_ = X.shape[-1]
    n_dims = X.shape[1] if X.ndim > 2 else 1
    self.n_dims_ = n_dims

    if y.ndim == 1:
        self.classes_, y = np.unique(y, return_inverse=True)
    else:
        # Indicator matrix: the column index of each non-zero is the label.
        _, y = np.nonzero(y)
        if len(y) != n_samples:
            raise ValueError("Single label per sample expected.")
        self.classes_ = np.unique(y)

    if len(y) != n_samples:
        raise ValueError("Number of labels={} does not match "
                         "number of samples={}".format(len(y), n_samples))

    if X.dtype != np.float64 or not X.flags.contiguous:
        X = np.ascontiguousarray(X, dtype=np.float64)

    if not y.flags.contiguous:
        y = np.ascontiguousarray(y, dtype=np.intp)

    tree_classifier = treeClassifier(
        max_depth=self.max_depth,
        min_samples_split=self.min_samples_split,
        metric=self.metric,
        metric_params=self.metric_params,
        random_state=random_state,
    )

    if n_dims > 1:
        tree_classifier.force_dim = n_dims

    # NOTE(review): newer scikit-learn releases rename ``base_estimator`` to
    # ``estimator`` - confirm against the version this project pins.
    self.bagging_classifier_ = BaggingClassifier(
        base_estimator=tree_classifier,
        bootstrap=self.bootstrap,
        n_jobs=self.n_jobs,
        n_estimators=self.n_estimators,
        random_state=self.random_state,
    )

    # One flat row per instance: all dimensions concatenated.
    X = X.reshape(n_samples, n_dims * self.n_timestep_)

    # compute catch22 features
    X_catch22 = np.array([catch22_all(series)['values'] for series in X])

    # Replace NaN / infinite features with 0 so training sees the same
    # cleaned representation as prediction time.
    X_catch22[np.logical_or(np.isnan(X_catch22), np.isinf(X_catch22))] = 0

    self.bagging_classifier_.fit(X_catch22, y, sample_weight=sample_weight)
    return self
def _get_features(ts: Dict[str, Any]) -> Dict[str, Any]:
    """Map each catch22 feature name to its value for ``ts["target"]``."""
    result = catch22.catch22_all(ts["target"])
    names = result["names"]
    values = result["values"]
    return {name: value for name, value in zip(names, values)}
d = pd.read_csv( "/Users/trenthenderson/Documents/R/google-trends/data/31_Aug_data.csv") #%% #---------------- HCTSA ------------------------------- searches = d.interest_over_time_keyword.unique() search_data = [] for s in searches: tmp1 = d[d['interest_over_time_keyword'] == s] tmp1 = tmp1.dropna() tmp2 = tmp1[['interest_over_time_hits']] tmp2 = tmp2.to_numpy() results = pd.DataFrame.from_dict(catch22_all(tmp2)) results['keyword'] = s search_data.append(results) search_data = pd.concat(search_data) #%% #---------------- VISUALISATION ----------------------- # Standardise values heat_data = search_data.assign(values=search_data.groupby('names').transform( lambda x: (x - x.mean()) / x.std())) heat_data = pd.pivot_table(heat_data,
import catch22

# Print every catch22 feature for each of the bundled test series.
for dataFile in ['../testData/test.txt', '../testData/test2.txt']:
    # Announce which file the following feature values belong to.
    print('\n' + dataFile)

    # The context manager closes the file. split() without an argument
    # tolerates repeated spaces and blank lines, which split(' ') would turn
    # into empty strings that float() rejects.
    with open(dataFile) as handle:
        data = [line.split() for line in handle]

    # All numbers of the file, in reading order, as one flat series.
    flat_data = [float(item) for sublist in data for item in sublist]

    catchOut = catch22.catch22_all(flat_data)

    featureNames = catchOut['names']
    featureValues = catchOut['values']

    for featureName, featureValue in zip(featureNames, featureValues):
        print('%s : %1.6f' % (featureName, featureValue))
def series_features(series):
    """Return a dict mapping each catch22 feature name to its value."""
    result = catch22_all(series)
    pairs = zip(result['names'], result['values'])
    return {name: value for name, value in pairs}
def extract_stats(temp_list):
    """Return the catch22 feature values computed for ``temp_list``.

    The values are looked up by their ``'values'`` key rather than by their
    position in the result dict, so the function keeps working if the
    dictionary gains or reorders entries.
    """
    return catch22_all(temp_list)['values']
return d6 #%% # Make dataframes for each age group did_recover = the_extractor(d4, 1) did_not_recover = the_extractor(d4, 0) #%% # Apply catch22 did_recover_output = pd.DataFrame.from_dict(catch22_all(did_recover)) did_recover_output['category'] = 1 did_not_recover_output = pd.DataFrame.from_dict(catch22_all(did_not_recover)) did_not_recover_output['category'] = 0 #%% # Bind all together final = did_recover_output final = final.append(did_not_recover_output) # Recode binary to words final['category_word'] = [
# Build one input series per age group.
# NOTE(review): `the_extractor` and `d1` are defined outside this view;
# presumably the second argument selects the rows of that age band - confirm.
age_15 = the_extractor(d1, "15-19")
age_20 = the_extractor(d1, "20-24")
age_25 = the_extractor(d1, "25-29")
age_30 = the_extractor(d1, "30-34")
age_35 = the_extractor(d1, "35-39")
age_40 = the_extractor(d1, "40-44")
age_45 = the_extractor(d1, "45-49")

#%%

# Run catch22 on each group and tag the resulting frame with its age band.
age_15_output = pd.DataFrame.from_dict(
    catch22_all(age_15)).assign(age_group='15-19')

age_20_output = pd.DataFrame.from_dict(
    catch22_all(age_20)).assign(age_group='20-24')

age_25_output = pd.DataFrame.from_dict(
    catch22_all(age_25)).assign(age_group='25-29')

age_30_output = pd.DataFrame.from_dict(
    catch22_all(age_30)).assign(age_group='30-34')

age_35_output = pd.DataFrame.from_dict(
    catch22_all(age_35)).assign(age_group='35-39')

age_40_output = pd.DataFrame.from_dict(catch22_all(age_40))