def save_adult_dataset_for_cpp_benchmarks():
    """Fetch the adult training set and dump it for the C++ benchmarks.

    The labels and the features are written as two separate C++ cereal
    serialized files, ``adult.labels.cereal`` and ``adult.features.cereal``.
    Both files go to the ``../../tools/benchmark/data`` directory, resolved
    relative to the directory of this module. That directory is created
    when it does not exist yet.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    target_dir = os.path.join(module_dir, '../../tools/benchmark/data')
    os.makedirs(target_dir, exist_ok=True)

    features, labels = fetch_tick_dataset('binary/adult/adult.trn.bz2')

    # Labels are written first, then features, each one in its own file
    outputs = (
        (labels, 'adult.labels.cereal'),
        (features, 'adult.features.cereal'),
    )
    for array, file_name in outputs:
        serialize_array(array, os.path.join(target_dir, file_name))
def fetch_hawkes_bund_data():
    """Load the Hawkes formatted bund dataset.

    The data comes from
    https://github.com/X-DataInitiative/tick-datasets/tree/master/hawkes/bund
    and is meant to be fitted with Hawkes processes. For each day it
    contains 4 time series representing:

    1. Mid-price movement up
    2. Mid-price movement down
    3. Buyer initiated trades that do not move the mid-price
    4. Seller initiated trades that do not move the mid-price

    Returns
    -------
    output : `list` of `list` of `np.ndarray`, dim=(20, 4, _)
        List of 20 days of 4 timestamps data.
    """
    bund_entries = fetch_tick_dataset('hawkes/bund/bund.npz')

    daily_timestamps = []
    for entry in bund_entries:
        # Each entry is unpacked as a pair; only its second element is kept
        _, timestamps = entry
        daily_timestamps.append(timestamps)
    return daily_timestamps
not the case in `tick`. Note that this difference can be reduced by tuning the ``intercept_scaling`` parameter from ``scikit-learn``'s ``LogisticRegression`` * In this example, the computational time of ``tick`` is better than ``scikit``'s """ import numpy as np from time import time import matplotlib.pyplot as plt from sklearn.metrics import roc_curve, auc from sklearn.linear_model import LogisticRegression as LogRegScikit from tick.dataset import fetch_tick_dataset from tick.inference import LogisticRegression as LogRegTick train_set = fetch_tick_dataset('binary/adult/adult.trn.bz2') test_set = fetch_tick_dataset('binary/adult/adult.tst.bz2') clf_tick = LogRegTick(C=1e5, penalty='l1', tol=1e-8) clf_scikit = LogRegScikit(penalty='l1', tol=1e-8) t1 = time() clf_tick.fit(train_set[0], train_set[1]) t_tick = time() - t1 t1 = time() clf_scikit.fit(train_set[0], train_set[1]) t_scikit = time() - t1 pred_tick = clf_tick.predict_proba(test_set[0]) pred_scikit = clf_scikit.predict_proba(test_set[0])
In this example we compare the convergence speed of our learners given the float precision used. In both cases, the convergence speed in terms of number of iterations (on the left) is similar up to float 32 precision. But compared to the running time (on the right), we can see that using float 32 instead of float 64 leads to faster convergence up to float 32 precision. """ import matplotlib.pyplot as plt from tick.dataset import fetch_tick_dataset from tick.linear_model import LogisticRegression from tick.plot import plot_history X, y = fetch_tick_dataset('binary/adult/adult.trn.bz2') X = X.toarray() # It is more visible with dense matrices max_iter = 50 seed = 7108 learner_64 = LogisticRegression(tol=0, max_iter=max_iter, record_every=2, random_state=seed) learner_64.fit(X, y) X_32, y_32 = X.astype('float32'), y.astype('float32') learner_32 = LogisticRegression(tol=0, max_iter=max_iter, record_every=2,