Exemple #1
0
def save_adult_dataset_for_cpp_benchmarks():
    """Fetch the adult training set and dump it for the C++ benchmarks.

    The features and labels returned by ``fetch_tick_dataset`` are each
    written with ``serialize_array`` (C++ cereal serialized files) into
    ``../../tools/benchmark/data``, resolved relative to the directory of
    this file. The directory is created when it does not exist yet.
    """
    this_dir = os.path.dirname(os.path.abspath(__file__))
    target_dir = os.path.join(this_dir, '../../tools/benchmark/data')
    os.makedirs(target_dir, exist_ok=True)

    features, labels = fetch_tick_dataset('binary/adult/adult.trn.bz2')

    # Labels are written first, then features
    outputs = (
        (labels, 'adult.labels.cereal'),
        (features, 'adult.features.cereal'),
    )
    for array, file_name in outputs:
        serialize_array(array, os.path.join(target_dir, file_name))
Exemple #2
0
def fetch_hawkes_bund_data():
    """Fetch the bund dataset formatted for Hawkes processes.

    The data comes from
    https://github.com/X-DataInitiative/tick-datasets/tree/master/hawkes/bund

    It is meant to be fitted with Hawkes processes. For each day it holds
    4 time series:

        1. Mid-price movement up
        2. Mid-price movement down
        3. Buyer initiated trades that do not move the mid-price
        4. Seller initiated trades that do not move the mid-price

    Returns
    -------
    output : `list` of `list` of `np.ndarray`, dim=(20, 4, _)
        List of 20 days of 4 timestamps data.
    """
    fetched_pairs = fetch_tick_dataset('hawkes/bund/bund.npz')

    daily_timestamps = []
    # Each fetched item is a pair; only its second element is kept
    for _unused_key, day_timestamps in fetched_pairs:
        daily_timestamps.append(day_timestamps)
    return daily_timestamps
Exemple #3
0
  not the case in ``tick``. Note that this difference can be reduced by tuning the
  ``intercept_scaling`` parameter from ``scikit-learn``'s
  ``LogisticRegression``
* In this example, the computational time of ``tick`` is better than ``scikit``'s
"""
import numpy as np
from time import time
import matplotlib.pyplot as plt

from sklearn.metrics import roc_curve, auc
from sklearn.linear_model import LogisticRegression as LogRegScikit

from tick.dataset import fetch_tick_dataset
from tick.inference import LogisticRegression as LogRegTick

# Train and test splits of the adult dataset. Below, element 0 of a split is
# passed to the learners as features and element 1 as labels.
train_set = fetch_tick_dataset('binary/adult/adult.trn.bz2')
test_set = fetch_tick_dataset('binary/adult/adult.tst.bz2')

clf_tick = LogRegTick(C=1e5, penalty='l1', tol=1e-8)
# The solver is given explicitly: scikit-learn >= 0.22 defaults to 'lbfgs',
# which rejects penalty='l1' with a ValueError. 'liblinear' was the former
# default, so behavior on older releases is unchanged.
clf_scikit = LogRegScikit(penalty='l1', tol=1e-8, solver='liblinear')

# Wall-clock time of each fit, measured on the same training data
t1 = time()
clf_tick.fit(train_set[0], train_set[1])
t_tick = time() - t1

t1 = time()
clf_scikit.fit(train_set[0], train_set[1])
t_scikit = time() - t1

# Predicted probabilities of both learners on the test features
pred_tick = clf_tick.predict_proba(test_set[0])
pred_scikit = clf_scikit.predict_proba(test_set[0])
Exemple #4
0
In this example we compare the convergence speed of our learners given the
float precision used.

In both cases the convergence speed in terms of number of iterations
(on the left) is similar up to float 32 precision.
But in terms of running time (on the right), we can see that using
float 32 instead of float 64 leads to faster convergence up to
float 32 precision.
"""
import matplotlib.pyplot as plt

from tick.dataset import fetch_tick_dataset
from tick.linear_model import LogisticRegression
from tick.plot import plot_history

# Training split of the adult dataset
X, y = fetch_tick_dataset('binary/adult/adult.trn.bz2')
# Densify the features: the effect is more visible with dense matrices
X = X.toarray()

# Shared settings, so the learners only differ by the data they are given
seed = 7108
max_iter = 50

# Learner fitted on the data at its loaded precision (the "64" in its name
# presumably refers to float64; the dtype is not checked here)
learner_64 = LogisticRegression(
    tol=0, max_iter=max_iter, record_every=2, random_state=seed)
learner_64.fit(X, y)

# float32 versions of the same features and labels
X_32 = X.astype('float32')
y_32 = y.astype('float32')
learner_32 = LogisticRegression(tol=0,
                                max_iter=max_iter,
                                record_every=2,