def get_lt(arr):
    params = [{'attr': 'pvalue'}, {'attr': 'rvalue'},
              {'attr': 'slope'}, {'attr': 'stderr'}]
    res = np.array([item[1] for item in linear_trend(arr, params)])
    return np.nan_to_num(res)
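# A quick sanity check of get_lt; the two imports are assumptions about where
# the names above come from (numpy and tsfresh's calculator module):
import numpy as np
from tsfresh.feature_extraction.feature_calculators import linear_trend

print(get_lt(np.array([1.0, 2.1, 2.9, 4.2, 5.0])))
# -> [pvalue, rvalue, slope, stderr], with any NaN replaced by 0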
def extract_features(data):
    day = 24 * 60  # minutes per day, used as the lag horizon below
    return list(numpy.nan_to_num(numpy.array([
        feature.symmetry_looking(data, [{'r': 0.3}])[0][1],
        feature.variance_larger_than_standard_deviation(data).bool(),
        feature.ratio_beyond_r_sigma(data, 2),
        feature.has_duplicate_max(data),
        feature.has_duplicate_min(data),
        feature.has_duplicate(data),
        feature.agg_autocorrelation(numpy.array(data.value),
                                    [{'f_agg': 'mean', 'maxlag': day}])[0][1],
        feature.partial_autocorrelation(data, [{'lag': day}])[0][1],
        feature.abs_energy(numpy.array(data.value)),
        feature.mean_change(data),
        feature.mean_second_derivative_central(data),
        feature.median(data),
        float(feature.mean(data)),
        float(feature.standard_deviation(data)),
        float(feature.longest_strike_below_mean(data)),
        float(feature.longest_strike_above_mean(data)),
        int(feature.number_peaks(data, 10)),
        feature.linear_trend(numpy.array(data.value), [{'attr': 'rvalue'}])[0][1],
        feature.c3(data, day),
        float(feature.maximum(data)),
        float(feature.minimum(data)),
    ])))
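# A minimal smoke test for extract_features; `data.value` and `.bool()` above
# imply `data` is a pandas DataFrame with a 'value' column, and the day-long
# lags need at least a few days of one-minute samples (both are assumptions):
import numpy
import pandas
from tsfresh.feature_extraction import feature_calculators as feature

data = pandas.DataFrame({'value': numpy.random.randn(3 * 24 * 60)})
print(len(extract_features(data)))  # 21 feature values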
def function(x):
    # `self` is captured from the enclosing scope; `self.attr` names the
    # linregress attribute to extract (e.g. 'slope' or 'pvalue').
    param = [{'attr': self.attr}]
    return list(linear_trend(x, param))[0][1]
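# One plausible wiring for the closure above, shown as a small class;
# the LinearTrendAttr name is hypothetical:
from tsfresh.feature_extraction.feature_calculators import linear_trend

class LinearTrendAttr:
    def __init__(self, attr):
        self.attr = attr  # 'pvalue', 'rvalue', 'intercept', 'slope' or 'stderr'

    def function(self, x):
        param = [{'attr': self.attr}]
        return list(linear_trend(x, param))[0][1]

# slope = LinearTrendAttr('slope').function(series)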
def features(self, x, prefix):
    feature_dict = dict()

    # numpy summary statistics
    feature_dict[prefix + '_mean'] = np.mean(x)
    feature_dict[prefix + '_max'] = np.max(x)
    feature_dict[prefix + '_min'] = np.min(x)
    feature_dict[prefix + '_std'] = np.std(x)
    feature_dict[prefix + '_var'] = np.var(x)
    feature_dict[prefix + '_ptp'] = np.ptp(x)
    for p in range(10, 100, 10):
        feature_dict[prefix + '_percentile_' + str(p)] = np.percentile(x, p)

    # scipy statistics
    feature_dict[prefix + '_skew'] = sp.stats.skew(x)
    feature_dict[prefix + '_kurtosis'] = sp.stats.kurtosis(x)
    for k in range(1, 5):
        feature_dict[prefix + '_kstat_' + str(k)] = sp.stats.kstat(x, k)
    for m in range(1, 5):
        feature_dict[prefix + '_moment_' + str(m)] = sp.stats.moment(x, m)

    # tsfresh calculators
    feature_dict[prefix + '_abs_energy'] = feature_calculators.abs_energy(x)
    feature_dict[prefix + '_abs_sum_of_changes'] = feature_calculators.absolute_sum_of_changes(x)
    feature_dict[prefix + '_count_above_mean'] = feature_calculators.count_above_mean(x)
    feature_dict[prefix + '_count_below_mean'] = feature_calculators.count_below_mean(x)
    feature_dict[prefix + '_mean_abs_change'] = feature_calculators.mean_abs_change(x)
    feature_dict[prefix + '_mean_change'] = feature_calculators.mean_change(x)
    feature_dict[prefix + '_var_larger_than_std_dev'] = \
        feature_calculators.variance_larger_than_standard_deviation(x)

    # counts of samples inside fixed amplitude bands of width 1000
    feature_dict[prefix + '_range_minf_m4000'] = feature_calculators.range_count(x, -np.inf, -4000)
    feature_dict[prefix + '_range_m4000_m3000'] = feature_calculators.range_count(x, -4000, -3000)
    feature_dict[prefix + '_range_m3000_m2000'] = feature_calculators.range_count(x, -3000, -2000)
    feature_dict[prefix + '_range_m2000_m1000'] = feature_calculators.range_count(x, -2000, -1000)
    feature_dict[prefix + '_range_m1000_0'] = feature_calculators.range_count(x, -1000, 0)
    feature_dict[prefix + '_range_0_p1000'] = feature_calculators.range_count(x, 0, 1000)
    feature_dict[prefix + '_range_p1000_p2000'] = feature_calculators.range_count(x, 1000, 2000)
    feature_dict[prefix + '_range_p2000_p3000'] = feature_calculators.range_count(x, 2000, 3000)
    feature_dict[prefix + '_range_p3000_p4000'] = feature_calculators.range_count(x, 3000, 4000)
    feature_dict[prefix + '_range_p4000_pinf'] = feature_calculators.range_count(x, 4000, np.inf)

    feature_dict[prefix + '_ratio_unique_values'] = \
        feature_calculators.ratio_value_number_to_time_series_length(x)
    feature_dict[prefix + '_first_loc_min'] = feature_calculators.first_location_of_minimum(x)
    feature_dict[prefix + '_first_loc_max'] = feature_calculators.first_location_of_maximum(x)
    feature_dict[prefix + '_last_loc_min'] = feature_calculators.last_location_of_minimum(x)
    feature_dict[prefix + '_last_loc_max'] = feature_calculators.last_location_of_maximum(x)

    for lag in [10, 100, 1000]:
        feature_dict[prefix + '_time_rev_asym_stat_' + str(lag)] = \
            feature_calculators.time_reversal_asymmetry_statistic(x, lag)
    for lag in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 50, 100, 1000]:
        feature_dict[prefix + '_autocorrelation_' + str(lag)] = \
            feature_calculators.autocorrelation(x, lag)
    for lag in [1, 2, 3, 4, 5, 10, 100]:
        feature_dict[prefix + '_c3_' + str(lag)] = feature_calculators.c3(x, lag)

    # real, imaginary and angle parts of the first 33 FFT coefficients
    for c in range(1, 34):
        for attr, short in [('real', 'real'), ('imag', 'imag'), ('angle', 'ang')]:
            feature_dict[prefix + '_fft_{0}_{1}'.format(c, short)] = list(
                feature_calculators.fft_coefficient(x, [{'coeff': c, 'attr': attr}]))[0][1]

    feature_dict[prefix + '_long_strk_above_mean'] = \
        feature_calculators.longest_strike_above_mean(x)
    feature_dict[prefix + '_long_strk_below_mean'] = \
        feature_calculators.longest_strike_below_mean(x)
    feature_dict[prefix + '_cid_ce_0'] = feature_calculators.cid_ce(x, 0)
    feature_dict[prefix + '_cid_ce_1'] = feature_calculators.cid_ce(x, 1)

    for bins in [5, 10, 20, 50, 80, 100]:
        feature_dict[prefix + '_binned_entropy_' + str(bins)] = \
            feature_calculators.binned_entropy(x, bins)

    feature_dict[prefix + '_num_crossing_0'] = feature_calculators.number_crossing_m(x, 0)
    for n in [1, 3, 5, 10, 50, 100, 500]:
        feature_dict[prefix + '_num_peaks_' + str(n)] = feature_calculators.number_peaks(x, n)

    for coeff in [1, 2, 5, 8, 10, 50, 100]:
        feature_dict[prefix + '_spkt_welch_density_' + str(coeff)] = list(
            feature_calculators.spkt_welch_density(x, [{'coeff': coeff}]))[0][1]

    # lags 10 and 100 recompute the same values as above; kept for parity
    for lag in [1, 2, 3, 4, 10, 100]:
        feature_dict[prefix + '_time_rev_asym_stat_' + str(lag)] = \
            feature_calculators.time_reversal_asymmetry_statistic(x, lag)

    for r in range(20):
        feature_dict[prefix + '_symmetry_looking_' + str(r)] = \
            feature_calculators.symmetry_looking(x, [{'r': r * 0.05}])[0][1]
    for r in range(1, 20):
        feature_dict[prefix + '_large_standard_deviation_' + str(r)] = \
            feature_calculators.large_standard_deviation(x, r * 0.05)
    for r in range(1, 10):
        feature_dict[prefix + '_quantile_' + str(r)] = feature_calculators.quantile(x, r * 0.1)
    for agg in ['mean', 'median', 'var']:
        feature_dict[prefix + '_agg_autocorr_' + agg] = \
            feature_calculators.agg_autocorrelation(x, [{'f_agg': agg, 'maxlag': 40}])[0][-1]
    # for r in range(1, 6):
    #     feature_dict[prefix + '_number_cwt_peaks_' + str(r)] = \
    #         feature_calculators.number_cwt_peaks(x, r)
    for r in range(1, 10):
        # q must be a quantile in (0, 1); the original passed the bare integer r
        feature_dict[prefix + '_index_mass_quantile_' + str(r)] = \
            feature_calculators.index_mass_quantile(x, [{'q': r * 0.1}])[0][1]
    # for ql in [0., .2, .4, .6, .8]:
    #     for qh in [.2, .4, .6, .8, 1.]:
    #         if ql < qh:
    #             for b in [False, True]:
    #                 for f in ["mean", "var"]:
    #                     feature_dict[prefix + '_change_quantiles_' + str(ql) + '_' + str(qh)
    #                                  + '_' + str(b) + '_' + str(f)] = \
    #                         feature_calculators.change_quantiles(x, ql, qh, b, f)
    # for r in [.1, .3, .5, .7, .9]:
    #     feature_dict[prefix + '_approximate_entropy_' + str(r)] = \
    #         feature_calculators.approximate_entropy(x, 2, r)

    feature_dict[prefix + '_max_langevin_fixed_point'] = \
        feature_calculators.max_langevin_fixed_point(x, 3, 30)

    for attr in ['pvalue', 'rvalue', 'intercept', 'slope', 'stderr']:
        feature_dict[prefix + '_linear_trend_' + attr] = \
            feature_calculators.linear_trend(x, [{'attr': attr}])[0][1]
    for attr in ['pvalue', 'teststat', 'usedlag']:
        feature_dict[prefix + '_augmented_dickey_fuller_' + attr] = \
            feature_calculators.augmented_dickey_fuller(x, [{'attr': attr}])[0][1]
    for r in [0.5, 1, 1.5, 2, 2.5, 3, 5, 6, 7, 10]:
        feature_dict[prefix + '_ratio_beyond_r_sigma_' + str(r)] = \
            feature_calculators.ratio_beyond_r_sigma(x, r)

    # Further calculators left disabled for runtime reasons:
    # for attr in ["pvalue", "rvalue", "intercept", "slope", "stderr"]:
    #     feature_dict[prefix + '_linear_trend_timewise_' + attr] = \
    #         feature_calculators.linear_trend_timewise(x, [{'attr': attr}])[0][1]
    # for attr in ["rvalue", "intercept", "slope", "stderr"]:
    #     for i in [5, 10, 50]:
    #         for f in ["max", "min", "mean", "var"]:
    #             feature_dict[prefix + '_agg_linear_trend_' + attr + '_' + str(i) + '_' + f] = \
    #                 feature_calculators.agg_linear_trend(
    #                     x, [{'attr': attr, 'chunk_len': i, 'f_agg': f}])[0][-1]
    # for width in [2, 5, 10, 20]:
    #     for coeff in range(15):
    #         for w in [2, 5, 10, 20]:
    #             feature_dict[prefix + '_cwt_coefficients_' + str(width) + '_' + str(coeff)
    #                          + '_' + str(w)] = list(feature_calculators.cwt_coefficients(
    #                 x, [{'widths': width, 'coeff': coeff, 'w': w}]))[0][1]
    # for r in range(10):
    #     feature_dict[prefix + '_partial_autocorr_' + str(r)] = \
    #         feature_calculators.partial_autocorrelation(x, [{'lag': r}])[0][1]
    # Parameter templates kept from the original for reference:
    # "ar_coefficient": [{"coeff": coeff, "k": k} for coeff in range(5) for k in [10]],
    # "fft_coefficient": [{"coeff": k, "attr": a}
    #                     for a, k in product(["real", "imag", "abs", "angle"], range(100))],
    # "fft_aggregated": [{"aggtype": s} for s in ["centroid", "variance", "skew", "kurtosis"]],
    # "value_count": [{"value": value} for value in [0, 1, -1]],
    # "range_count": [{"min": -1, "max": 1}, {"min": 1e12, "max": 0}, {"min": 0, "max": 1e12}],
    # "friedrich_coefficients":
    #     (lambda m: [{"coeff": coeff, "m": m, "r": 30} for coeff in range(m + 1)])(3),
    # "energy_ratio_by_chunks": [{"num_segments": 10, "segment_focus": i} for i in range(10)],

    return feature_dict
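# Smoke test for features(); `self` is unused in the body, so it can be called
# as a plain function here. The imports are assumptions about the enclosing
# module (augmented_dickey_fuller additionally needs statsmodels installed):
import numpy as np
import scipy as sp
import scipy.stats
from tsfresh.feature_extraction import feature_calculators

x = np.random.randn(5000)
feats = features(None, x, prefix='seg')
print(len(feats), feats['seg_mean'])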
def lin_reg(x):
    # linear_trend returns (name, value) pairs; list() makes the result
    # indexable under Python 3, where zip objects cannot be subscripted.
    lr = list(ts.linear_trend(x, param=[{'attr': 'slope'}, {'attr': 'intercept'}]))
    return {'lr_slope': lr[0][1], 'lr_intercept': lr[1][1]}
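# Example call, assuming `ts` is bound to tsfresh's calculator module:
import numpy as np
from tsfresh.feature_extraction import feature_calculators as ts

print(lin_reg(np.array([3.0, 4.5, 6.1, 7.4])))
# -> {'lr_slope': ..., 'lr_intercept': ...}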
def generate_time_series_feats(x_dataset, dataset_name="raw", test=False):
    make_dir_if_not_exists(os.path.join(FEATURES_PATH, 'tsfeats'))
    time_length = x_dataset.shape[1]

    # Scalar features: each call maps a series to a single value.
    features_function_dict = {
        "mean": mean,
        "median": median,
        "length": length,
        "minimum": minimum,
        "maximum": maximum,
        "variance": variance,
        "skewness": skewness,
        "kurtosis": kurtosis,
        "sum_values": sum_values,
        "abs_energy": abs_energy,
        "mean_change": mean_change,
        "mean_abs_change": mean_abs_change,
        "count_below_mean": count_below_mean,
        "count_above_mean": count_above_mean,
        "has_duplicate_min": has_duplicate_min,
        "has_duplicate_max": has_duplicate_max,
        "standard_deviation": standard_deviation,
        "absolute_sum_of_changes": absolute_sum_of_changes,
        "first_location_of_minimum": first_location_of_minimum,
        "first_location_of_maximum": first_location_of_maximum,
        "last_location_of_minimum": last_location_of_minimum,
        "last_location_of_maximum": last_location_of_maximum,
        "longest_strike_below_mean": longest_strike_below_mean,
        "longest_strike_above_mean": longest_strike_above_mean,
        "sum_of_reoccurring_values": sum_of_reoccurring_values,
        "sum_of_reoccurring_data_points": sum_of_reoccurring_data_points,
        "variance_larger_than_standard_deviation": variance_larger_than_standard_deviation,
        "ratio_value_number_to_time_series_length": ratio_value_number_to_time_series_length,
        "percentage_of_reoccurring_values_to_all_values":
            percentage_of_reoccurring_values_to_all_values,
        "percentage_of_reoccurring_datapoints_to_all_datapoints":
            percentage_of_reoccurring_datapoints_to_all_datapoints,
        "binned_entropy_max300": lambda x: binned_entropy(x, 300),
        "binned_entropy_max400": lambda x: binned_entropy(x, 400),
        "cid_ce_true": lambda x: cid_ce(x, True),
        "cid_ce_false": lambda x: cid_ce(x, False),
    }

    for feature_name, function_call in features_function_dict.iteritems():
        print "{:.<70s}".format("- Processing feature: %s" % feature_name),
        feature_name = 'tsfeats/%s_%s' % (dataset_name, feature_name)
        if not features_exists(feature_name, test):
            feats = x_dataset.apply(function_call, axis=1, raw=True).values
            save_features(feats, feature_name, test)
            print("Done")
        else:
            print("Already generated")

    # Parameterised features: each call returns several values per series.
    ar_param_k100 = [{"coeff": i, "k": 100} for i in range(100 + 1)]
    ar_param_k500 = [{"coeff": i, "k": 500} for i in range(500 + 1)]
    agg50_mean_linear_trend = [{"attr": val, "chunk_len": 50, "f_agg": "mean"}
                               for val in ("pvalue", "rvalue", "intercept", "slope", "stderr")]
    aug_dickey_fuler_params = [{"attr": "teststat"}, {"attr": "pvalue"}, {"attr": "usedlag"}]
    energy_ratio_num10_focus5 = [{"num_segments": 10, "segment_focus": 5}]
    fft_aggr_spectrum = [{"aggtype": "centroid"}, {"aggtype": "variance"},
                         {"aggtype": "skew"}, {"aggtype": "kurtosis"}]
    fft_coefficient_real = [{"coeff": i, "attr": "real"} for i in range((time_length + 1) // 2)]
    fft_coefficient_imag = [{"coeff": i, "attr": "imag"} for i in range((time_length + 1) // 2)]
    fft_coefficient_abs = [{"coeff": i, "attr": "abs"} for i in range((time_length + 1) // 2)]
    fft_coefficient_angle = [{"coeff": i, "attr": "angle"} for i in range((time_length + 1) // 2)]
    linear_trend_params = [{"attr": val}
                           for val in ("pvalue", "rvalue", "intercept", "slope", "stderr")]

    other_feats_dict = {
        "ar_coeff100": lambda x: dict(ar_coefficient(x, ar_param_k100)),
        "ar_coeff500": lambda x: dict(ar_coefficient(x, ar_param_k500)),
        "agg50_mean_lin_trend": lambda x: dict(agg_linear_trend(x, agg50_mean_linear_trend)),
        "aug_dickey_fuler": lambda x: dict(augmented_dickey_fuller(x, aug_dickey_fuler_params)),
        "energy_ratio_num10_focus5":
            lambda x: dict(energy_ratio_by_chunks(x, energy_ratio_num10_focus5)),
        "fft_aggr_spectrum": lambda x: dict(fft_aggregated(x, fft_aggr_spectrum)),
        "fft_coeff_real": lambda x: dict(fft_coefficient(x, fft_coefficient_real)),
        "fft_coeff_imag": lambda x: dict(fft_coefficient(x, fft_coefficient_imag)),
        "fft_coeff_abs": lambda x: dict(fft_coefficient(x, fft_coefficient_abs)),
        "fft_coeff_angle": lambda x: dict(fft_coefficient(x, fft_coefficient_angle)),
        "linear_trend": lambda x: dict(linear_trend(x, linear_trend_params)),
    }

    for feature_name, function_call in other_feats_dict.iteritems():
        print "{:.<70s}".format("- Processing features: %s" % feature_name),
        feature_name = 'tsfeats/%s_%s' % (dataset_name, feature_name)
        if not features_exists(feature_name, test):
            feats_dict = x_dataset.apply(function_call, axis=1, raw=True).values.tolist()
            feats = pd.DataFrame.from_dict(feats_dict)
            save_features(feats.values, feature_name, test)
            print("Done")
        else:
            print("Already generated")

    # Auto-correlations as features
    print("- Processing Auto-correlation features...")
    corr_dataset = x_dataset.apply(autocorrelation_all, axis=1, raw=True)
    save_features(corr_dataset.values, '%s_auto_correlation_all' % dataset_name, test)

    print("- Processing ARIMA(5,5,1) features...")
    arima_features = parallelize_row(x_dataset.values, generate_arima_feats, n_jobs=2)
    assert arima_features.shape[0] == x_dataset.shape[0]  # row counts must match
    save_features(arima_features, '%s_arima_5_5_1' % dataset_name, test)
}])  # Fourier transform

number_peaks = feature_calculators.number_peaks(data[:1000], 50)  # number of peaks
index_mass_quantile = feature_calculators.index_mass_quantile(
    data[:1000], [{'q': 0.5}, {'q': 0.1}])  # percentile processing
linear_trend = feature_calculators.linear_trend(
    range_data,
    [{'attr': 'slope'}, {'attr': 'intercept'}, {'attr': 'rvalue'}])
# Simple trend analysis; for the available attr values see
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.linregress.html
autocorrelation = feature_calculators.autocorrelation(data, 100)  # autocorrelation at lag 100

plt.plot(fft_aggregated)
plt.show()
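# These calculators return (name, value) pairs; a small sketch of collecting
# them into a dict, with `range_data` as a hypothetical stand-in series:
import numpy as np
from tsfresh.feature_extraction import feature_calculators

range_data = np.random.randn(500)
trend = dict(feature_calculators.linear_trend(
    range_data, [{'attr': 'slope'}, {'attr': 'intercept'}, {'attr': 'rvalue'}]))
print(trend)  # one entry per requested attr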