def generate_seasonal_time_series_data_with_trend(
    n_samples=1, n_obs=100, order=0, sp=1, model='additive'
):
    """Helper function to generate time series/panel data with a polynomial
    trend and a seasonal component."""
    if sp == 1:
        return generate_time_series_data_with_trend(
            n_instances=n_samples, n_timepoints=n_obs, order=order
        )
    samples = []
    for _ in range(n_samples):
        s = generate_polynomial_series(n_obs, order)
        # Inject a simple seasonal effect at every sp-th observation
        if model == 'additive':
            s[::sp] = s[::sp] + 0.1
        else:
            s[::sp] = s[::sp] * 1.1
        index = np.arange(n_obs)
        y = pd.Series(s, index=index)
        samples.append(y)
    X = pd.DataFrame(samples)
    assert X.shape == (n_samples, n_obs)
    return detabularise(X)
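# The helpers in this section call generate_polynomial_series, which is not
# shown here. Below is a minimal sketch consistent with how it is called
# (series length, polynomial order, optional fixed coefficients); the random
# coefficient draw mirrors the scale used elsewhere in these helpers, but the
# exact implementation is an assumption, not the original function.
import numpy as np

def generate_polynomial_series(n, order=0, coefs=None):
    if coefs is None:
        coefs = np.random.normal(scale=0.01, size=(order + 1, 1))
    # Vandermonde design matrix with columns [1, t, t^2, ...]
    x = np.vander(np.arange(n), N=order + 1, increasing=True)
    return (x @ coefs).ravel()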
def transform(self, X, y=None):
    """Transform X, transforming univariate time series using sklearn's PCA.

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_samples, 1]
        Nested DataFrame with univariate time series in cells.

    Returns
    -------
    Xt : pandas DataFrame
        Transformed pandas DataFrame with the same number of rows and the
        (potentially reduced) PCA-transformed column. Time indices of the
        original column are replaced with 0:(n_components - 1).
    """
    self.check_is_fitted()
    X = check_X(X, enforce_univariate=True)

    # Transform X using the fitted PCA
    Xtab = tabularize(X)
    Xpca = pd.DataFrame(
        data=self.pca.transform(Xtab),
        index=Xtab.index,
        columns=Xtab.columns[:self.pca.n_components_],
    )

    # Back-transform into the nested time series data format
    Xt = detabularise(Xpca, index=X.index)
    Xt.columns = X.columns
    return Xt
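# Usage sketch for the transform above. PCATransformer is assumed to be the
# (older) sktime wrapper around sklearn.decomposition.PCA to which this
# method belongs; the nested-DataFrame construction below is illustrative
# only, not taken from the original source.
import numpy as np
import pandas as pd

X = pd.DataFrame({'dim_0': [pd.Series(np.random.randn(50)) for _ in range(10)]})
t = PCATransformer(n_components=3)
t.fit(X)
Xt = t.transform(X)  # nested DataFrame; each cell holds a length-3 series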
def generate_time_series_data_with_trend(n_samples=1, n_obs=100, order=0, coefs=None):
    """Helper function to generate time series/panel data with polynomial trend"""
    samples = []
    for _ in range(n_samples):
        s = generate_polynomial_series(n_obs, order=order, coefs=coefs)
        index = np.arange(n_obs)
        y = pd.Series(s, index=index)
        samples.append(y)
    X = pd.DataFrame(samples)
    assert X.shape == (n_samples, n_obs)
    return detabularise(X)
def generate_time_series_data_with_trend(
    n_instances=1, n_timepoints=100, order=0, coefs=None, noise=False
):
    """Helper function to generate time series/panel data with polynomial trend"""
    samples = []
    for _ in range(n_instances):
        s = generate_polynomial_series(n_timepoints, order=order, coefs=coefs)
        if noise:
            s = s + np.random.normal(size=n_timepoints)
        index = np.arange(n_timepoints)
        y = pd.Series(s, index=index)
        samples.append(y)
    X = pd.DataFrame(samples)
    assert X.shape == (n_instances, n_timepoints)
    return detabularise(X)
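# Example use of the generator above (assumes the helpers in this section are
# importable): a small panel with a linear trend plus Gaussian noise, returned
# by detabularise as a nested DataFrame with one column of pd.Series.
X = generate_time_series_data_with_trend(n_instances=5, n_timepoints=20, order=1, noise=True)
print(X.shape)       # (5, 1)
print(X.iloc[0, 0])  # one length-20 series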
## SASTEnsemble
clf = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10))
saste_fixed = SASTEnsemble(cand_length_list=combination_list,
                           nb_inst_per_class=1,
                           random_state=None,
                           classifier=clf,
                           n_jobs=-1)

start = time.time()
saste_fixed.fit(X_train, y_train)
saste_fixed.score(X_test, y_test)
saste_fixed_time = time.time() - start

## ShapeletTransform
# sktime expects nested DataFrames, so convert the tabular arrays first
X_train_sktime = detabularise(pd.DataFrame(X_train))
X_test_sktime = detabularise(pd.DataFrame(X_test))

clf = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10))
stc = ShapeletTransform(min_shapelet_length=min_shp_length,
                        max_shapelet_length=np.inf)

start = time.time()
stc.fit(X_train_sktime, y_train)
X_train_transformed = stc.transform(X_train_sktime)
clf.fit(X_train_transformed, y_train)
X_test_transformed = stc.transform(X_test_sktime)
clf.score(X_test_transformed, y_test)
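# Hypothetical follow-up, not in the original snippet: capture the shapelet
# pipeline's wall-clock time the same way as for SASTEnsemble, so that the
# two timings are directly comparable.
stc_time = time.time() - start
print('SASTEnsemble: {:.1f}s, ShapeletTransform: {:.1f}s'.format(
    saste_fixed_time, stc_time))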
def build_model(dataset, pipeline, experiment, current_target='class', test_size=0.3):
    models_dir = './results/{}_{}_{}/models/'.format(dataset, pipeline, experiment)
    reports_dir = './results/{}_{}_{}/reports/'.format(dataset, pipeline, experiment)
    experiment_index_file = './results/{}_{}_{}/index.json'.format(
        dataset, pipeline, experiment)
    log_file = './results/{}_{}_{}/model_build.log'.format(
        dataset, pipeline, experiment)
    scoring = make_scorer(precision_score, zero_division=1, average='micro')
    os.makedirs(models_dir, exist_ok=True)
    os.makedirs(reports_dir, exist_ok=True)

    # Set up logging
    logger.setup(filename=log_file, filemode='w', root_level=logging.DEBUG,
                 log_level=logging.DEBUG, logger='build_model')

    # A dataset name of the form 'name.index' selects a specific index
    index_name = 'index'
    if '.' in dataset:
        splits = dataset.split('.')
        dataset = splits[0]
        index_name = splits[1]

    # Load the dataset index
    dataset_index = load_dataset(dataset, return_index=True, index_name=index_name)

    logger.info('Start experiment: {} using {} on {} with target {}'.format(
        experiment, pipeline, dataset, current_target))
    reports = ReportCollection(dataset, pipeline, experiment)

    # Only the BTC symbol is processed here
    for _sym, data in {'BTC': dataset_index['BTC']}.items():
        try:
            logger.info('Start processing: {}'.format(_sym))
            features = pd.read_csv(data['csv'], sep=',', encoding='utf-8',
                                   index_col='Date', parse_dates=True)
            targets = pd.read_csv(data['target_csv'], sep=',', encoding='utf-8',
                                  index_col='Date', parse_dates=True)

            # Drop columns whose values are all NaN as well as rows with any
            # NaN value, then replace infinities with NaN so that they can
            # later be imputed to a finite value
            features = features.dropna(axis='columns', how='all') \
                               .dropna() \
                               .replace([np.inf, -np.inf], np.nan)
            target = targets.loc[features.index][current_target]

            # Chronological train/test split (no shuffling for time series)
            all_size = features.shape[0]
            train_size = int(all_size * (1 - test_size))
            features = detabularise(
                features[[c for c in features.columns if 'close' in c]])
            X_train = features.iloc[0:train_size]
            y_train = target.iloc[0:train_size]
            X_test = features.iloc[train_size:all_size]
            y_test = target.iloc[train_size:all_size]

            logger.info('Start fit')
            clf = ShapeletTransformClassifier(time_contract_in_mins=5)
            clf.fit(X_train, y_train)
            print('{} Score: {}'.format(_sym, clf.score(X_test, y_test)))
            pred = clf.predict(X_test)
            print(classification_report(y_test, pred))
            logger.info('End fit')
            logger.info('--- {} end ---'.format(_sym))
        except Exception as e:
            logger.error(
                'Exception while building model pipeline: {} dataset: {} symbol: {}\nException:\n{}'
                .format(pipeline, dataset, _sym, e))
            traceback.print_exc()
    return reports
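# Hypothetical invocation of build_model. The dataset name, index name and
# pipeline/experiment labels below are illustrative only and must match an
# index that load_dataset can actually resolve.
reports = build_model('crypto.ohlcv', pipeline='shapelet', experiment='exp1',
                      current_target='class', test_size=0.3)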