Example #1
def generate_seasonal_time_series_data_with_trend(n_samples=1,
                                                  n_obs=100,
                                                  order=0,
                                                  sp=1,
                                                  model='additive'):
    """Helper function to generate time series/panel data with polynomial
    trend and seasonal component"""
    if sp == 1:
        return generate_time_series_data_with_trend(n_instances=n_samples,
                                                    n_timepoints=n_obs,
                                                    order=order)

    samples = []
    for i in range(n_samples):
        # coefs = np.random.normal(scale=0.01, size=(order + 1, 1))
        s = generate_polynomial_series(n_obs, order)

        # Inject a simple seasonal effect at every sp-th observation
        if model == 'additive':
            s[::sp] = s[::sp] + 0.1
        else:
            s[::sp] = s[::sp] * 1.1

        index = np.arange(n_obs)
        y = pd.Series(s, index=index)
        samples.append(y)

    X = pd.DataFrame(samples)
    assert X.shape == (n_samples, n_obs)
    return detabularise(X)
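A minimal usage sketch for the helper above. It assumes the function and its dependencies (numpy, pandas, generate_polynomial_series, detabularise) are already importable; the parameter values are only illustrative.

X = generate_seasonal_time_series_data_with_trend(n_samples=5,
                                                  n_obs=48,
                                                  order=1,
                                                  sp=12,
                                                  model='additive')
# detabularise returns sktime's nested format: one row per sample and a
# single column whose cells each hold a pd.Series of length n_obs.
print(X.shape)             # expected: (5, 1)
print(len(X.iloc[0, 0]))   # expected: 48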
Example #2
    def transform(self, X, y=None):
        """
        Transform X, transforms univariate time-series using sklearn's PCA
        class

        Parameters
        ----------
        X : nested pandas DataFrame of shape [n_samples, 1]
            Nested dataframe with univariate time-series in cells.

        Returns
        -------
        Xt : pandas DataFrame
          Transformed pandas DataFrame with the same number of rows and the
          (potentially reduced) PCA transformed
          column. Time indices of the original column are replaced with 0:(
          n_components - 1).
        """
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=True)

        # Transform X using the fitted PCA
        Xtab = tabularize(X)
        Xpca = pd.DataFrame(data=self.pca.transform(Xtab),
                            index=Xtab.index,
                            columns=Xtab.columns[:self.pca.n_components_])

        # Back-transform into time series data format
        Xt = detabularise(Xpca, index=X.index)
        Xt.columns = X.columns
        return Xt
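These examples move between two layouts: the wide tabular layout sklearn's PCA expects and sktime's nested layout with one pd.Series per cell. The sketch below is a simplified, self-contained stand-in for that round trip; the names tabularize_simple and detabularise_simple are illustrative, not the library's own implementations.

import pandas as pd

def tabularize_simple(X):
    # Expand a nested DataFrame (single column, one pd.Series per cell)
    # into a wide 2-D DataFrame with one column per time point.
    return pd.DataFrame([row.iloc[0].to_numpy() for _, row in X.iterrows()],
                        index=X.index)

def detabularise_simple(X, index=None):
    # Collapse a wide 2-D DataFrame back into the nested format:
    # a single column whose cells hold each row as a pd.Series.
    nested = pd.DataFrame({0: [pd.Series(row.to_numpy())
                               for _, row in X.iterrows()]})
    nested.index = X.index if index is None else index
    return nested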
Example #3
def generate_time_series_data_with_trend(n_samples=1, n_obs=100, order=0, coefs=None):
    """Helper function to generate time series/panel data with polynomial trend"""
    samples = []
    for i in range(n_samples):
        s = generate_polynomial_series(n_obs, order=order, coefs=coefs)

        index = np.arange(n_obs)
        y = pd.Series(s, index=index)
        samples.append(y)

    X = pd.DataFrame(samples)
    assert X.shape == (n_samples, n_obs)
    return detabularise(X)
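Every example calls generate_polynomial_series, which is not shown on this page. A plausible minimal sketch, inferred only from how it is called above (the random coefficient scale mirrors the commented-out line in Example #1), might look like this:

import numpy as np

def generate_polynomial_series(n, order=0, coefs=None):
    # Hypothetical stand-in: evaluate a polynomial of the given order over
    # the time points 0..n-1, drawing small random coefficients when none
    # are supplied.
    if coefs is None:
        coefs = np.random.normal(scale=0.01, size=(order + 1, 1))
    x = np.vander(np.arange(n), N=order + 1)
    return x.dot(coefs).ravel()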
Example #4
def generate_time_series_data_with_trend(
    n_instances=1, n_timepoints=100, order=0, coefs=None, noise=False
):
    """Helper function to generate time series/panel data with polynomial
    trend"""
    samples = []
    for _ in range(n_instances):
        s = generate_polynomial_series(n_timepoints, order=order, coefs=coefs)

        if noise:
            s = s + np.random.normal(size=n_timepoints)

        index = np.arange(n_timepoints)
        y = pd.Series(s, index=index)

        samples.append(y)

    X = pd.DataFrame(samples)
    assert X.shape == (n_instances, n_timepoints)
    return detabularise(X)
Example #5
        clf = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10))
        saste_fixed = SASTEnsemble(cand_length_list=combination_list,
                                   nb_inst_per_class=1,
                                   random_state=None,
                                   classifier=clf,
                                   n_jobs=-1)

        start = time.time()

        saste_fixed.fit(X_train, y_train)
        saste_fixed.score(X_test, y_test)

        saste_fixed_time = time.time() - start

        ## ShapeletTransform
        X_train_sktime = detabularise(pd.DataFrame(X_train))
        X_test_sktime = detabularise(pd.DataFrame(X_test))

        clf = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10))

        stc = ShapeletTransform(min_shapelet_length=min_shp_length,
                                max_shapelet_length=np.inf)

        start = time.time()

        stc.fit(X_train_sktime, y_train)
        X_train_transformed = stc.transform(X_train_sktime)
        clf.fit(X_train_transformed, y_train)
        X_test_transformed = stc.transform(X_test_sktime)
        clf.score(X_test_transformed, y_test)
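The snippet ends before the shapelet-transform timing is captured; mirroring the SASTEnsemble branch above, the continuation would presumably look roughly like this (the variable name stc_time is an assumption):

        stc_time = time.time() - start
        # Compare total fit+score wall-clock time of the two approaches
        print('SASTEnsemble: {:.2f}s  ShapeletTransform: {:.2f}s'.format(
            saste_fixed_time, stc_time))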
Example #6
def build_model(dataset,
                pipeline,
                experiment,
                current_target='class',
                test_size=0.3):
    models_dir = './results/{}_{}_{}/models/'.format(dataset, pipeline,
                                                     experiment)
    reports_dir = './results/{}_{}_{}/reports/'.format(dataset, pipeline,
                                                       experiment)
    experiment_index_file = './results/{}_{}_{}/index.json'.format(
        dataset, pipeline, experiment)
    log_file = './results/{}_{}_{}/model_build.log'.format(
        dataset, pipeline, experiment)

    scoring = make_scorer(precision_score, zero_division=1, average='micro')
    os.makedirs(models_dir, exist_ok=True)
    os.makedirs(reports_dir, exist_ok=True)
    # Setup logging
    logger.setup(filename=log_file,
                 filemode='w',
                 root_level=logging.DEBUG,
                 log_level=logging.DEBUG,
                 logger='build_model')
    index_name = 'index'
    if '.' in dataset:
        splits = dataset.split(".")
        dataset = splits[0]
        index_name = splits[1]
    # Load the dataset index
    dataset_index = load_dataset(dataset,
                                 return_index=True,
                                 index_name=index_name)
    # Dynamically import the pipeline we want to use for building the model
    logger.info('Start experiment: {} using {} on {} with target {}'.format(
        experiment, pipeline, dataset, current_target))
    reports = ReportCollection(dataset, pipeline, experiment)
    for _sym, data in {'BTC': dataset_index['BTC']}.items():
        try:
            logger.info('Start processing: {}'.format(_sym))
            features = pd.read_csv(data['csv'],
                                   sep=',',
                                   encoding='utf-8',
                                   index_col='Date',
                                   parse_dates=True)
            targets = pd.read_csv(data['target_csv'],
                                  sep=',',
                                  encoding='utf-8',
                                  index_col='Date',
                                  parse_dates=True)

            # Drop columns whose values are all NaN, as well as rows with ANY nan value, then
            # replace infinity values with nan so that they can later be imputed to a finite value
            features = features.dropna(
                axis='columns', how='all').dropna().replace([np.inf, -np.inf],
                                                            np.nan)
            target = targets.loc[features.index][current_target]

            #X_train, X_test, y_train, y_test = train_test_split(features, target, shuffle=False, test_size=test_size)

            # Manual chronological split (keeps time order, no shuffling)
            all_size = features.shape[0]
            train_size = int(all_size * (1 - test_size))
            features = detabularise(
                features[[c for c in features.columns if 'close' in c]])
            X_train = features.iloc[0:train_size]
            y_train = target.iloc[0:train_size]
            X_test = features.iloc[train_size:all_size]
            y_test = target.iloc[train_size:all_size]
            # Fit a shapelet-transform classifier on the chronological split
            logger.info("Start fitting ShapeletTransformClassifier")
            clf = ShapeletTransformClassifier(time_contract_in_mins=5)
            clf.fit(X_train, y_train)
            print('{} Score: {}'.format(_sym, clf.score(X_test, y_test)))
            pred = clf.predict(X_test)
            print(classification_report(y_test, pred))
            logger.info("End Grid search")

            logger.info("--- {} end ---".format(_sym))
        except Exception as e:
            logger.error(
                "Exception while building model pipeline: {} dataset: {} symbol: {}\nException:\n{}"
                .format(pipeline, dataset, _sym, e))
            traceback.print_exc()
    return reports
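A sketch of how build_model might be invoked; the dataset, pipeline and experiment names below are placeholders, not values taken from the source project.

if __name__ == '__main__':
    # Placeholder arguments: the real values depend on the project's
    # dataset index and the directory layout under ./results/.
    reports = build_model(dataset='ohlcv.hourly',
                          pipeline='shapelet',
                          experiment='baseline',
                          current_target='class',
                          test_size=0.3)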