Beispiel #1
0
    def _read_stock(self):
        """Load the GDF train/test frames and attach queue-imbalance columns.

        The GDF file name is built from ``self.gdf_filename_pattern`` using
        ``self.stock``, ``self.r`` and ``self.s`` and is read from
        ``self.data_dir``; the per-stock file is read from
        ``'../gaussian_filter/data'``. For both the train and the test frame
        the ``queue_imbalance`` column of the per-stock data is copied over,
        ``prev_queue_imbalance`` is added as that column shifted by one row,
        and ``dropna`` is applied in place.

        :return: ``(df, df_test)``, or ``None`` when the GDF load result is
            ``None`` or does not contain exactly two elements.
        """
        gdf_filename = self.gdf_filename_pattern.format(
            self.stock, self.r, self.s)
        reg_filename = '{}'.format(self.stock)
        logger.debug('Will read %s and %s', gdf_filename, reg_filename)

        loaded = lob.load_prepared_data(gdf_filename,
                                        data_dir=self.data_dir,
                                        cv=False,
                                        length=self.data_length)
        # Bail out early when the GDF data is missing or malformed.
        if loaded is None or len(loaded) != 2:
            return None
        df, df_test = loaded

        reg_train, reg_test = lob.load_prepared_data(
            reg_filename,
            data_dir='../gaussian_filter/data',
            cv=False,
            length=self.data_length)

        # Train first, then test: identical enrichment for both frames.
        for frame, reg_frame in ((df, reg_train), (df_test, reg_test)):
            frame['queue_imbalance'] = reg_frame['queue_imbalance']
            frame['prev_queue_imbalance'] = frame['queue_imbalance'].shift()
            # shift() leaves a NaN in the first row, which is removed here.
            frame.dropna(inplace=True)
        return df, df_test
Beispiel #2
0
 def _read_stock(self):
     """Load the GDF train/test frames and enrich them with order-book columns.

     The GDF file name is built from ``self.gdf_filename_pattern`` using
     ``self.stock``, ``self.r`` and ``self.s`` and is read from
     ``self.data_dir``; the per-stock file is read from
     ``self.reg_data_dir``. For both the train and the test frame the
     ``datetime`` (taken from the ``'Unnamed: 0'`` column), ``bid_price``,
     ``ask_price`` and ``queue_imbalance`` columns are copied over,
     ``prev_queue_imbalance`` is added as ``queue_imbalance`` shifted by one
     row, the index is replaced by the parsed ``datetime`` values and
     ``dropna`` is applied in place.

     :return: ``(df, df_test)``; two empty ``DataFrame`` objects when the GDF
         load result is ``None`` or does not contain exactly two elements.
     """
     gdf_filename = self.gdf_filename_pattern.format(
         self.stock, self.r, self.s)
     reg_filename = '{}'.format(self.stock)
     logger.debug('Will read %s and %s', gdf_filename, reg_filename)
     d = lob.load_prepared_data(gdf_filename,
                                data_dir=self.data_dir,
                                length=self.data_length)
     # A failed load may yield None; len(None) would raise TypeError, so it
     # is treated the same way as a result of the wrong size.
     if d is None or len(d) != 2:
         return pd.DataFrame(), pd.DataFrame()
     df, df_test = d

     df_reg, df_reg_test = lob.load_prepared_data(
         reg_filename, data_dir=self.reg_data_dir, length=self.data_length)

     # Train first, then test: identical enrichment for both frames.
     for frame, reg_frame in ((df, df_reg), (df_test, df_reg_test)):
         # NOTE(review): 'Unnamed: 0' presumably holds the timestamps that
         # were the CSV index when the file was written - confirm.
         frame['datetime'] = reg_frame['Unnamed: 0']
         for column in ('bid_price', 'ask_price', 'queue_imbalance'):
             frame[column] = reg_frame[column]
         frame['prev_queue_imbalance'] = frame['queue_imbalance'].shift()
         frame.index = pd.to_datetime(frame['datetime'])
         # shift() leaves a NaN in the first row, which is removed here.
         frame.dropna(inplace=True)
     return df, df_test
Beispiel #3
0
    def _read_stocks(self):
        """Load GDF train/test frames for every stock in ``self.stocks``.

        For each stock the GDF file (named via ``self.gdf_filename_pattern``
        with the stock, ``self.r`` and ``self.s``) is read from
        ``self.data_dir`` and the per-stock file is read from
        ``'../gaussian_filter/data'``. Once everything is loaded, each GDF
        frame receives the ``queue_imbalance`` column of its per-stock
        counterpart plus ``prev_queue_imbalance`` (the same column shifted by
        one row), and ``dropna`` is applied in place.

        :return: ``(dfs, dfs_test)``, two dicts keyed by stock.
        """
        dfs, dfs_test = {}, {}
        dfs_reg, dfs_reg_test = {}, {}

        # Pass 1: read every file before touching any frame.
        for stock in self.stocks:
            gdf_filename = self.gdf_filename_pattern.format(
                stock, self.r, self.s)
            reg_filename = '{}'.format(stock)

            gdf_train, gdf_test = lob.load_prepared_data(
                gdf_filename,
                data_dir=self.data_dir,
                cv=False,
                length=self.data_length)
            dfs[stock] = gdf_train
            dfs_test[stock] = gdf_test

            reg_train, reg_test = lob.load_prepared_data(
                reg_filename,
                data_dir='../gaussian_filter/data',
                cv=False,
                length=self.data_length)
            dfs_reg[stock] = reg_train
            dfs_reg_test[stock] = reg_test

        # Pass 2: enrich the train frame, then the test frame, of each stock.
        for stock in self.stocks:
            pairs = ((dfs[stock], dfs_reg[stock]),
                     (dfs_test[stock], dfs_reg_test[stock]))
            for frame, reg_frame in pairs:
                frame['queue_imbalance'] = reg_frame['queue_imbalance']
                frame['prev_queue_imbalance'] = frame[
                    'queue_imbalance'].shift()
                frame.dropna(inplace=True)
        return dfs, dfs_test
def main(r=0.02, s=0.2, n=15, K=50):
    """Fit an SVM on GDF-transformed order data and print train/test ROC AUC.

    A ``K x 3`` parameter array is built whose columns are ``r`` repeated,
    evenly spaced centres scaled by 0.1 (zero excluded), and ``s`` repeated.
    For every stock the prepared data is loaded from ``'data/'``, transformed
    with ``transform_to_orders`` and classified on the first ``n`` ``gdf_*``
    columns. Any failure for one stock is printed and the next stock is
    processed.

    :param r: value used for the first column of the parameter array.
    :param s: value used for the third column of the parameter array.
    :param n: number of ``gdf_*`` feature columns fed to the classifier.
    :param K: number of rows of the parameter array.
    :return: None
    """
    print('*****************************************************')
    print('r', r, 's', s)
    gdf_columns = ['gdf_' + str(i) for i in range(0, n)]
    gdfs_r = r * np.ones(K)
    gdfs_m = 0.1000 * np.hstack([np.arange(- K // 2, 0), np.arange(1, K // 2 + 1)])
    gdfs_s = s * np.ones(K)
    gdfs = np.vstack([gdfs_r, gdfs_m, gdfs_s]).T

    data_length = 5050

    stocks = ['9062', '9063', '9064', '9065']

    # The loop variable is deliberately not called ``s`` so that it does not
    # shadow the ``s`` parameter.
    for stock in stocks:
        try:
            d, _d_cv, d_test = lob.load_prepared_data(
                stock, data_dir='data/', cv=True, length=data_length)
            dfs = transform_to_orders(d, n, gdfs)
            clf = svm_classification(dfs, gdf_columns)
            predictions = clf.predict(dfs.loc[:, gdf_columns])
            # roc_auc_score expects (y_true, y_score): labels come first.
            print('train', stock,
                  roc_auc_score(dfs['mid_price_indicator'], predictions))

            dfs_test = transform_to_orders(d_test, n, gdfs)
            predictions = clf.predict(dfs_test.loc[:, gdf_columns])
            print('test ', stock,
                  roc_auc_score(dfs_test['mid_price_indicator'], predictions))
        except Exception as e:
            # Best effort per stock: report and carry on with the next one.
            print(e)
Beispiel #5
0
    def write_svm_gdf(self, K=None, Kn=None, rr=None, ss=None):
        """Grid-search the RBF SVM on GDF data and persist the scores.

        Previously stored results are read from
        ``<self.results_dir>/res_gdf_svm_<stock>_<Kn>.csv`` when that file
        exists. For every ``(r, s)`` pair the matching GDF file is loaded
        from ``self.data_dir`` and every ``(C, gamma)`` combination that
        ``self.is_in_results`` does not report as already present is
        evaluated with ``self.perform_gdf_svm``. The results gathered during
        this call are dumped to ``new_res_gdf_svm_<stock>_<Kn>.csv`` after
        each evaluation, and the merged table is written back to the file the
        stored results were read from.

        :param K: value used in the GDF file name and passed to the scorer.
        :param Kn: value used in the result file names and passed to the
            scorer.
        :param rr: iterable of ``r`` values (required, must be iterable).
        :param ss: iterable of ``s`` values (required, must be iterable).
        :return: DataFrame holding the stored and the newly computed results.
        """
        results = []
        results_filename = '{}/res_gdf_svm_{}_{}.csv'.format(
            self.results_dir, self.stock, Kn)
        new_results_filename = '{}/new_res_gdf_svm_{}_{}.csv'.format(
            self.results_dir, self.stock, Kn)

        try:
            df_gdf_res = pd.read_csv(results_filename)
            print('Results read from file')
        except FileNotFoundError:
            print('Results file does not exist yet')
            df_gdf_res = pd.DataFrame(columns=[
                'svm', 'c', 'gamma', 'roc_cv_score', 'roc_train_score', 'K',
                'Kn', 'r', 's'
            ])

        for r in rr:
            for s in ss:
                filename = 'gdf_{}_len{}_r{}_s{}_K{}{}'.format(
                    self.stock, self.data_length, r, s, K, self.suffix)
                dfs, dfs_cv, _dfs_test = lob.load_prepared_data(
                    filename, data_dir=self.data_dir, cv=True,
                    length=None)  # we don't care about length here

                for C in [1, 10, 100, 1000, 10000]:
                    for gamma in [1, 10, 100, 1000, 10000]:
                        params = {
                            'c': C,
                            'gamma': gamma,
                            'r': r,
                            's': s,
                            'K': K,
                            'Kn': Kn,
                            'svm': 'rbf'
                        }
                        # Skip combinations that were computed earlier.
                        if self.is_in_results(df_gdf_res, params):
                            continue
                        res = self.perform_gdf_svm(dfs,
                                                   dfs_cv,
                                                   C=C,
                                                   gamma=gamma,
                                                   r=r,
                                                   s=s,
                                                   K=K,
                                                   Kn=Kn)
                        results.append(res)
                        # Checkpoint so an interrupted run keeps its work.
                        pd.DataFrame(results).to_csv(new_results_filename)

        # One concat instead of DataFrame.append per row: append copies the
        # whole frame each time and no longer exists in pandas >= 2.0.
        if results:
            df_gdf_res = pd.concat([df_gdf_res, pd.DataFrame(results)],
                                   ignore_index=True)
        # Write to the same location the stored results are read from, so
        # the next call actually finds them.
        # NOTE(review): the index is written as a column and read back as
        # 'Unnamed: 0' on the next call - confirm whether that is wanted.
        df_gdf_res.to_csv(results_filename)
        return df_gdf_res
Beispiel #6
0
    def _read_stock(self):
        """Load the per-stock train/test frames and index them by datetime.

        The file named after ``self.stock`` is read from ``self.data_dir``.
        Both frames get a ``datetime`` column copied from ``'Unnamed: 0'``, a
        ``prev_queue_imbalance`` column (``queue_imbalance`` shifted by one
        row) and an index made of the parsed ``datetime`` values; ``dropna``
        is then applied in place.

        :return: ``(df, df_test)``.
        """
        reg_filename = '{}'.format(self.stock)
        logger.debug('Will read %s', reg_filename)
        df, df_test = lob.load_prepared_data(
            reg_filename, data_dir=self.data_dir, length=self.data_length)

        # Train first, then test: identical preparation for both frames.
        for frame in (df, df_test):
            frame['datetime'] = frame['Unnamed: 0']
            frame['prev_queue_imbalance'] = frame['queue_imbalance'].shift()
            frame.index = pd.to_datetime(frame['datetime'])
            # shift() leaves a NaN in the first row, which is removed here.
            frame.dropna(inplace=True)
        return df, df_test
def main(stock):
    """Score every ``(r, s)`` GDF data set of ``stock`` and store the results.

    For each combination of ``r`` and ``s`` the GDF file is loaded from
    ``data_gdf`` and passed to ``svm_classification`` together with the
    ``gdf_0`` .. ``gdf_49`` column names. One result row is collected per
    combination (parameters only when scoring raised). The rows collected so
    far are written to the ``_partial`` CSV after each combination and the
    full table is written to the final CSV in ``data_res_logistic`` at the
    end.

    :param stock: stock identifier used in the input and output file names.
    :return: None
    """
    K = 50
    length = 15000
    rr = [0.01, 0.05, 0.1, 0.5, 1.0]
    ss = [0.01, 0.05, 0.1, 0.5, 1.0]
    gdf_data_dir = 'data_gdf'
    results_dir = 'data_res_logistic'
    gdf_start, gdf_end = 0, 50
    algorithm = 'logistic'

    # Both output files share the same stem and differ only in the suffix.
    stem = 'res_log_{}_len{}_K{}-{}'.format(stock, length, gdf_start, gdf_end)
    results_filename = os.path.join(results_dir, stem + '.csv')
    results_partial_filename = os.path.join(results_dir,
                                            stem + '_partial.csv')

    # The feature column names do not depend on r or s.
    gdf_columns = ['gdf_' + str(i) for i in range(gdf_start, gdf_end)]

    results = []
    for r in rr:
        for s in ss:
            gdf_filename = 'gdf_{}_len{}_r{}_s{}_K{}'.format(
                stock, length, r, s, K)
            dfs, dfs_test = lob.load_prepared_data(gdf_filename,
                                                   data_dir=gdf_data_dir,
                                                   cv=False,
                                                   length=length)

            res = {'r': r, 's': s, 'stock': stock, 'K': K, 'method': algorithm}
            print('********************************************')
            print(res)

            # Falls back to the bare parameters when scoring fails.
            row = res
            try:
                scores = svm_classification(dfs, gdf_columns)
                print(res, scores)
                row = {**res, **scores}
            except Exception as e:
                print('Exception', e, res)
            results.append(row)

            pd.DataFrame(results).to_csv(results_partial_filename)
    pd.DataFrame(results).to_csv(results_filename)
Beispiel #8
0
def main():
    """Run sigmoid, RBF and linear SVMs on the validation data of each stock.

    Iterates the module-level ``stocks``; stocks whose training frame is
    ``None`` are skipped. For every ``c`` in ``cs`` and ``gamma`` in
    ``gammas`` a sigmoid SVM is evaluated per ``coef0`` in ``coef0s``,
    followed by a one-second pause and one RBF SVM; after all gammas of a
    ``c`` there is another one-second pause and one linear SVM.

    :return: None
    """
    for stock in stocks:
        train, validation, _test = lob.load_prepared_data(stock,
                                                          cv=True,
                                                          length=data_length)
        if train is None:
            continue

        for cost in cs:
            for gamma in gammas:
                for coef0 in coef0s:
                    sigmoid_model = SVMSigmoid(stock,
                                               train,
                                               c=cost,
                                               coef0=coef0,
                                               gamma=gamma,
                                               data_length=data_length)
                    sigmoid_model.predict(validation, 'cv', check=False)

                sleep(1)
                rbf_model = SVMRbf(stock,
                                   train,
                                   c=cost,
                                   gamma=gamma,
                                   data_length=data_length)
                rbf_model.predict(validation, 'cv', check=False)

            sleep(1)
            linear_model = SVMLinear(stock,
                                     train,
                                     c=cost,
                                     data_length=data_length)
            linear_model.predict(validation, 'cv', check=False)
def main(stock):
    """Grid-search ``C`` and ``gamma`` per ``(r, s)`` and store one CSV each.

    For each combination of ``r`` and ``s`` whose result file does not yet
    exist in ``data_res_gdf_feature_scaling``, the feature-scaled GDF file is
    loaded from ``data_gdf_feature_scaling`` and ``svm_classification`` is
    called on columns ``gdf_24`` and ``gdf_25`` for every ``(C, gamma)``
    pair. One row is collected per pair (parameters only when scoring
    raised); the rows so far are written to the ``_partial`` CSV after each
    pair and the complete table to the final CSV.

    :param stock: stock identifier used in the input and output file names.
    :return: None
    """
    K = 50
    length = 15000
    rr = [0.01, 0.05, 0.1, 0.5, 1.0]
    ss = [0.01, 0.05, 0.1, 0.5, 1.0]
    gdf_data_dir = 'data_gdf_feature_scaling'
    results_dir = 'data_res_gdf_feature_scaling'
    gdf_start, gdf_end = 24, 26
    algorithm = 'svm_rbf'
    grid = [1, 10, 100, 1000, 10000]

    # The feature column names do not depend on r or s.
    gdf_columns = ['gdf_' + str(i) for i in range(gdf_start, gdf_end)]

    for r in rr:
        for s in ss:
            # Both output files share the same stem.
            stem = 'res_{}_len{}_r{}_s{}_K{}-{}'.format(
                stock, length, r, s, gdf_start, gdf_end)
            results_filename = os.path.join(results_dir, stem + '.csv')
            results_partial_filename = os.path.join(results_dir,
                                                    stem + '_partial.csv')
            gdf_filename = 'gdf_{}_r{}_s{}_K{}_feature_scaling'.format(
                stock, r, s, K)

            # Finished combinations are not recomputed.
            if os.path.exists(results_filename):
                print('Exists ', results_filename)
                continue
            print('Will create ', results_filename)

            dfs, dfs_test = lob.load_prepared_data(gdf_filename,
                                                   data_dir=gdf_data_dir,
                                                   cv=False,
                                                   length=length)

            results = []
            for C in grid:
                for gamma in grid:
                    res = {
                        'C': C,
                        'gamma': gamma,
                        'r': r,
                        's': s,
                        'stock': stock,
                        'K': K,
                        'method': algorithm
                    }
                    print('********************************************')
                    print(res)

                    # Falls back to the bare parameters when scoring fails.
                    row = res
                    try:
                        scores = svm_classification(dfs,
                                                    gdf_columns,
                                                    C=C,
                                                    gamma=gamma)
                        print(res, scores)
                        row = {**res, **scores}
                    except Exception as e:
                        print('Exception', e, res)
                    results.append(row)

                    pd.DataFrame(results).to_csv(results_partial_filename)
            pd.DataFrame(results).to_csv(results_filename)
Beispiel #10
0
def main(stock):
    """Grid-search ``C`` and ``gamma`` per ``(r, s)`` and store ROC AUC scores.

    For each combination of ``r`` and ``s`` whose result CSV does not yet
    exist in the working directory, the GDF file is loaded from
    ``data_gdf_/`` and a classifier is fitted with ``svm_classification`` on
    columns ``gdf_0`` .. ``gdf_49`` for every ``(C, gamma)`` pair. The ROC
    AUC on the training and on the validation frame is stored under
    ``roc_train`` and ``roc_cv``. When a score cannot be computed the error
    is printed, the rows collected so far are dumped to the ``_partial``
    file, and the row is kept without that score.

    :param stock: stock identifier used in the input and output file names.
    :return: None
    """
    K = 50
    length = 15000
    rr = [0.01, 0.05, 0.1, 0.5, 1.0]
    ss = [0.01, 0.05, 0.1, 0.5, 1.0]
    # The feature column names do not depend on r or s.
    gdf_columns = ['gdf_' + str(i) for i in range(0, 50)]

    for r in rr:
        for s in ss:
            results_filename = 'res_{}_len{}_r{}_s{}_K{}.csv'.format(
                stock, length, r, s, K)
            partial_filename = results_filename + '_partial'

            # Finished combinations are not recomputed.
            if os.path.exists(results_filename):
                print('Exists ', results_filename)
                continue
            print('Will create ', results_filename)

            filename = 'gdf_{}_len{}_r{}_s{}_K{}'.format(
                stock, length, r, s, K)
            dfs, dfs_cv, _dfs_test = lob.load_prepared_data(
                filename, data_dir='data_gdf_/', cv=True, length=length)

            results = []
            for C in [1, 10, 100, 1000, 10000]:
                for gamma in [1, 10, 100, 1000, 10000]:
                    res = {
                        'C': C,
                        'gamma': gamma,
                        'r': r,
                        's': s,
                        'stock': stock,
                        'K': K,
                    }

                    print('********************************************')
                    print('C', C, 'gamma', gamma)
                    clf = svm_classification(dfs,
                                             gdf_columns,
                                             C=C,
                                             gamma=gamma)

                    # Same scoring for the training and the validation frame.
                    for label, key, frame in (('train', 'roc_train', dfs),
                                              ('test ', 'roc_cv', dfs_cv)):
                        predictions = clf.predict(frame.loc[:, gdf_columns])
                        try:
                            # roc_auc_score expects (y_true, y_score):
                            # labels come first.
                            score = roc_auc_score(
                                frame['mid_price_indicator'], predictions)
                            res[key] = score
                            print(label, s, score)
                        except Exception as e:
                            print(e)
                            # Keep what has been collected so far.
                            pd.DataFrame(results).to_csv(partial_filename)
                    results.append(res)
            pd.DataFrame(results).to_csv(results_filename)
Beispiel #11
0
    '13113', '2290', '9269', '12059', '3879', '1229', '4695', '5836', '10484',
    '2890', '1694', '1080', '3107', '11038', '12417', '9266', '4320', '3022',
    '3388', '8080', '1431', '12255', '7843', '11714', '4575', '2028', '11946',
    '2813', '11869'
]

i = 0
data_length = 10000
rocs_areas = {}
plt.figure()
for s in stocks:
    try:
        print('for', s)
        d, d_cv, d_test = lob.load_prepared_data(
            s,
            data_dir='../queue_imbalance/data/prepared/',
            cv=True,
            length=data_length)

        print('performing regressions', s)
        reg = lob.logistic_regression(d, 0, len(d))

        print('performing predictions', s)
        score = lob.plot_roc(d_test, reg, stock=s)
        rocs_areas[s] = score
        print('{} (area = {})'.format(s, score))
        i += 1
        # if i % 10 == 0:
        #     plt.savefig('plots_cv_{}.png'.format(i))
        #     plt.figure()
    except Exception as e: