def main(stock, r=0.1, s=0.1):
    try:
        result_dir = 'res_mlp_pca_gdf_que_prev10_split_15000'
        data_length = 15000
        svm_gdf_res = gdf_pca.SvmGdfResults(
            stock,
            r=r,
            s=s,
            data_length=data_length,
            gdf_filename_pattern='gdf_{}_r{}_s{}_K50',
            data_dir='../gaussian_filter/data_gdf_whole/')
        results = []
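        # Grid over the L2 penalty (alpha) and hidden-layer topology;
        # activation and solver are fixed to tanh/adam below.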
        for alpha in [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 2]:
            for hidden_layer_size in [(8, 16), (16, 16), (16, 8), (8, 8)]:
                activation = 'tanh'
                solver = 'adam'
                clf = MLPClassifier(solver=solver,
                                    alpha=alpha,
                                    activation=activation,
                                    hidden_layer_sizes=hidden_layer_size,
                                    random_state=1)
                scores = svm_gdf_res.train_clf(
                    clf, feature_name='pca_gdf_que_prev_split10', method='mlp')
                results.append({
                    **scores, 'alpha': alpha,
                    'solver': solver,
                    'hidden_layer_sizes': hidden_layer_size
                })
        pd.DataFrame(results).to_csv(
            os.path.join(
                result_dir, 'mlp_pca_gdf_{}_len{}_r{}_s{}.csv'.format(
                    stock, data_length, r, s)))
    except Exception as e:
        print(e)
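# A minimal invocation sketch (assumption: the stock identifier and r/s values
# below are illustrative only and are not taken from the original module).
if __name__ == '__main__':
    main('9061', r=0.1, s=0.1)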
Example #2
def train_lstm(res):
    gdf_filename_pattern = 'gdf_{}_r{}_s{}_K50'
    data_length = 24000
    r = res['r'].values[0]
    s = res['s'].values[0]
    feature = res['features'].values[0]
    n_steps = int(res['n_steps'].values[0])
    stock = str(int(res['stock'].values[0]))
    arch = res['arch'].values[0]
    print(stock, n_steps, feature)
    gdf_dfs = gdf_pca.SvmGdfResults(stock,
                                    r=r,
                                    s=s,
                                    data_length=data_length,
                                    gdf_filename_pattern=gdf_filename_pattern)
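    # Class weights offset any imbalance between the two mid-price direction
    # labels; they are passed to train_lstm via class_weight below.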
    weights = gdf_dfs.get_classes_weights()

    epochs = 50
    batch_size = 512

    filename = os.path.join(
        'res_lstm_predictions',
        f'pred_lstm_iter_{stock}_len{data_length}_r{r}_s{s}.csv')
    if os.path.exists(filename):
        print(f'Exists {filename}.')
        return None

    get_model_func = get_model(arch)
    _, m = gdf_dfs.train_lstm(get_model_func,  # discard the returned scores; keep the fitted model
                              feature_name=feature,
                              should_return_model=True,
                              fit_kwargs={
                                  'epochs': epochs,
                                  'batch_size': batch_size,
                                  'verbose': 0,
                                  'shuffle': False
                              },
                              should_validate=False,
                              compile_kwargs={
                                  'loss': 'binary_crossentropy',
                                  'optimizer': 'adam',
                                  'metrics': [matthews_correlation, auc_roc]
                              },
                              class_weight=weights,
                              n_steps=n_steps)
    test_x, test_y = gdf_dfs.get_test_set(feature_name=feature,
                                          n_steps=n_steps)
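    # Persist per-observation predictions; the McNemar comparison below reads
    # this CSV back to contrast the LSTM with a logistic-regression baseline.
    # Note: Sequential.predict_classes was removed in newer Keras releases;
    # (m.predict(test_x) > 0.5).astype('int32') is the modern equivalent here.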
    pred = m.predict_classes(test_x)
    df_scores = pd.DataFrame()
    df_scores['pred'] = pred.ravel()
    df_scores['actual'] = test_y
    df_scores.to_csv(filename)
    return None
def main(stock, r=0.1, s=0.1):
    result_dir = 'res_mlp_pca'

    data_length = 24000
    svm_gdf_res = gdf_pca.SvmGdfResults(
        stock, r=r, s=s, data_length=data_length, gdf_filename_pattern='gdf_{}_r{}_s{}_K50')
    feature_name = 'pca_n_gdf_que'

    weights = svm_gdf_res.get_classes_weights()
    epochs = 50
    batch_size = 512
    filename = os.path.join(result_dir, 'mlp_pca_gdf_n_{}_len{}_r{}_s{}.csv'.format(stock, data_length, r, s))
    if os.path.exists(filename):
        print(f'Exists {filename}')
        return
    filename_partial = os.path.join(
        result_dir, 'mlp_pca_n_gdf_{}_len{}_r{}_s{}.csv_partial'.format(stock, data_length, r, s))
    df_partial = pd.DataFrame()
    if os.path.exists(filename_partial):
        print(f'Reading partial file {filename_partial}')
        df_partial = pd.read_csv(filename_partial)
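    # The partial CSV lets an interrupted grid search resume: hidden-layer
    # sizes already scored are skipped in the loop below. `models` (a mapping
    # from hidden-layer description to a model-builder callable) is not
    # defined in this snippet and is assumed to live elsewhere in the module.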
    for k, get_m in models.items():
        if np.any(df_partial):
            print(filename_partial)
            row = df_partial[df_partial['hidden_layer_sizes'] == k]
            if np.any(row):
                print(f'Read result for hidden layer {k} in {filename_partial}')
                continue
        print(f'Training {stock} {r} {s} {k}')

        plot_name = f'plot_mlp/{stock}_mlp_pca_gdf_n_r{r}_s{s}'
        score = svm_gdf_res.train_mlp(
            get_m, feature_name=feature_name, method='mlp',
            fit_kwargs={'epochs': epochs, 'batch_size': batch_size, 'verbose': 0, 'shuffle': False},
            compile_kwargs={'loss': 'binary_crossentropy', 'optimizer': 'adam',
                            'metrics': [auc_roc, matthews_correlation, 'acc']},
            plot_name=plot_name, class_weight=weights)
        score = {**score, 'r': r, 's': s,
                 'epochs': epochs, 'batch_size': batch_size}
        score = {'solver': 'adam', 'hidden_layer_sizes': k,
                 'learning_rate': 0.001, **score}
        df_partial = pd.concat([df_partial, pd.DataFrame([score])],
                               ignore_index=True)
        df_partial.to_csv(filename_partial)
    df_partial.to_csv(filename)
    return True
def train_lstm(res):
    data_length = 24000
    r = res['r'].values[0]
    s = res['s'].values[0]
    feature = res['features'].values[0]
    n_steps = int(res['n_steps'].values[0])
    unit = res['unit'].values[0]
    stock = str(int(res['stock'].values[0]))
    arch = res['arch'].values[0]

    gdf_dfs = gdf_pca.SvmGdfResults(
        stock, r=r, s=s, data_length=data_length, gdf_filename_pattern='gdf_{}_r{}_s{}_K50')
    weights = gdf_dfs.get_classes_weights()

    epochs = 50
    batch_size = 512

    filename = os.path.join('res_lstm_iter', f'res_lstm_iter_{stock}_len{data_length}_r{r}_s{s}.csv')
    partial_filename = filename + '_partial'

    # if os.path.exists(filename):
    #     print(f'Exists {filename}.')
    #     return None

    df_partial = pd.DataFrame()
    if os.path.exists(partial_filename):
        df_partial = pd.read_csv(partial_filename)

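    # Each invocation trains and scores at most one run; results accumulate in
    # the partial CSV until 30 iterations have been collected for the stock.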
    if len(df_partial) < 30:
        logger.info('Iteration %s stock %s', len(df_partial), stock)
        get_model = get_lstm_model_for_arch(arch)
        try:
            score = gdf_dfs.train_lstm(
                get_model, feature_name=feature,
                fit_kwargs={'epochs': epochs, 'batch_size': batch_size, 'verbose': 0, 'shuffle': False},
                compile_kwargs={'loss': 'binary_crossentropy', 'optimizer': 'adam',
                                'metrics': [matthews_correlation, auc_roc]}, class_weight=weights, n_steps=n_steps)
            score = {**score, 'r': r, 's': s, 'unit': unit, 'arch': arch,
                     'epochs': epochs, 'batch_size': batch_size, 'n_steps': n_steps}
            df_partial = pd.concat([df_partial, pd.DataFrame([score])],
                                   ignore_index=True)
            df_partial.to_csv(partial_filename)
            logger.info('Done %s stock %s', len(df_partial), stock)
        except Exception as e:
            logger.error('%s: iter %s  %s', stock, len(df_partial), e)
            raise Exception(stock, e)
    df_partial.to_csv(filename)
    return None
Example #5
def perform_mcnemar(res):
    gdf_filename_pattern = 'gdf_{}_r{}_s{}_K50'
    data_length = 24000
    r = res['r'].values[0]
    s = res['s'].values[0]
    stock = str(int(res['stock'].values[0]))
    gdf_dfs = gdf_pca.SvmGdfResults(
        stock, r=r, s=s, data_length=data_length, gdf_filename_pattern=gdf_filename_pattern)

    df = gdf_dfs.df
    df_test = gdf_dfs.df_test
    df_lstm = pd.read_csv(f'res_lstm_predictions/pred_lstm_iter_{stock}_len{data_length}_r{r}_s{s}.csv')

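    # Baseline: logistic regression on queue imbalance alone. Its test-set
    # predictions are trimmed below so they align with the (shorter) window
    # covered by the stored LSTM predictions.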
    reg = LogisticRegression(class_weight=get_classes_weights(df))
    reg.fit(df[['queue_imbalance']], df['mid_price_indicator'])
    log_pred = reg.predict(df_test[['queue_imbalance']])

    df_all = pd.DataFrame()
    df_all['pred_log'] = log_pred[(len(log_pred) - len(df_lstm)):]
    df_all['pred_lstm'] = df_lstm['pred'].values
    df_all['actual'] = df_test['mid_price_indicator'].values[(len(log_pred) - len(df_lstm)):]

    df_all['correct_lstm'] = (df_all['pred_lstm'] == df_all['actual']).astype(np.int64)
    df_all['correct_log'] = (df_all['pred_log'] == df_all['actual']).astype(np.int64)

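    # 2x2 contingency table of correct/incorrect classifications for the two
    # models; McNemar's chi-squared test (with continuity correction) checks
    # whether their error rates differ significantly.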
    table = pd.crosstab(df_all['correct_lstm'], df_all['correct_log'])
    mcnemar_res = mcnemar(table, exact=False, correction=True)

    df_mcnemar = pd.DataFrame()
    df_mcnemar['pvalue'] = [mcnemar_res.pvalue]
    df_mcnemar['statistic'] = [mcnemar_res.statistic]
    df_mcnemar['TN'] = [table[0][0]]
    df_mcnemar['FN'] = [table[0][1]]
    df_mcnemar['FP'] = [table[1][0]]
    df_mcnemar['TP'] = [table[1][1]]
    df_mcnemar['stock'] = stock
    df_mcnemar.to_csv(f'res_lstm_predictions/mcnemar_lstm_log_{stock}_len{data_length}_r{r}_s{s}.csv')
    return mcnemar_res
Example #6
def main(stock, r=0.1, s=0.1):
    result_dir = 'res_mlp_pca'

    data_length = 24000
    svm_gdf_res = gdf_pca.SvmGdfResults(
        stock,
        r=r,
        s=s,
        data_length=data_length,
        gdf_filename_pattern='gdf_{}_r{}_s{}_K50')
    feature_name = 'pca_n_gdf_que'
    n = svm_gdf_res.get_pca(feature_name).n_components
    hidden_layer_sizes = [(n, ), (n, n), (2 * n, n), (2 * n, 2 * n),
                          (n, 2 * n), (n, n, n)]

    weights = svm_gdf_res.get_classes_weights()
    epochs = 10
    batch_size = 300
    filename = os.path.join(
        result_dir,
        'mlp_pca_gdf_{}_len{}_r{}_s{}.csv'.format(stock, data_length, r, s))
    if os.path.exists(filename):
        print(f'Exists {filename}')
        return
    filename_partial = os.path.join(
        result_dir, 'mlp_pca_gdf_{}_len{}_r{}_s{}.csv_partial'.format(
            stock, data_length, r, s))
    df_partial = pd.DataFrame()
    if os.path.exists(filename_partial):
        print(f'Reading partial file {filename_partial}')
        df_partial = pd.read_csv(filename_partial)
    for hidden_layer_size in hidden_layer_sizes:
        # Learning-rate grid reduced to a single value here;
        # previously: [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0].
        for learning_rate in [0.001]:
            if np.any(df_partial):
                print(filename_partial)
                row = df_partial[df_partial['hidden_layer_sizes'] ==
                                 hidden_layer_size]
                if np.any(row):
                    print(row)
                    row = df_partial[
                        (df_partial['hidden_layer_sizes'] == hidden_layer_size)
                        & (df_partial['learning_rate'] == learning_rate)]
                    print(row)
                    if np.any(row):
                        print(f'Read result for hidden layer {hidden_layer_size} '
                              f'lr {learning_rate} in {filename_partial}')
                        continue
            print(f'Training {stock} {r} {s} {hidden_layer_size} {learning_rate}')
            solver = optimizers.Adam(lr=learning_rate)
            model = Sequential()
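            # Hidden Dense layers below rely on Keras' default (linear)
            # activation; only the output layer gets an explicit sigmoid.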
            if isinstance(hidden_layer_size, int):
                model.add(Dense(hidden_layer_size))
            else:
                for h in hidden_layer_size:
                    model.add(Dense(h))
            model.add(Dense(1, activation='sigmoid'))

            plot_name = f'plot_mlp/{stock}_mlp_pca_n_r{r}_s{s}'
            score = svm_gdf_res.train_mlp(model,
                                          feature_name=feature_name,
                                          method='mlp',
                                          fit_kwargs={
                                              'epochs': epochs,
                                              'batch_size': batch_size,
                                              'verbose': 0,
                                              'shuffle': False
                                          },
                                          compile_kwargs={
                                              'loss': 'binary_crossentropy',
                                              'optimizer': solver,
                                              'metrics': [auc_roc, 'acc']
                                          },
                                          plot_name=plot_name,
                                          class_weight=weights)
            score = {
                **score, 'r': r,
                's': s,
                'arch': model.to_json(),
                'epochs': epochs,
                'batch_size': batch_size
            }
            score = {
                'solver': solver,
                'hidden_layer_sizes': hidden_layer_size,
                'learning_rate': learning_rate,
                **score
            }
            df_partial = pd.concat([df_partial, pd.DataFrame([score])],
                                   ignore_index=True)
            #  df_partial.drop(columns=[[c for c in df_partial.columns if 'Unnamed' in c]], inplace=True)
            df_partial.to_csv(filename_partial)
    df_partial.to_csv(filename)
    return True
def train_lstm(stock,
               r,
               s,
               data_length,
               units=None,
               kernel_regularizations=None):
    import tensorflow as tf
    auc_roc = as_keras_metric(tf.metrics.auc)
    r = float(r)
    s = float(s)
    data_length = int(data_length)
    print('running', stock, r, s, data_length)
    gdf_filename_pattern = 'gdf_{}_r{}_s{}_K50'
    gdf_dfs = gdf_pca.SvmGdfResults(str(stock),
                                    r=r,
                                    s=s,
                                    data_length=data_length,
                                    gdf_filename_pattern=gdf_filename_pattern)

    weights = gdf_dfs.get_classes_weights()
    feature = 'pca_n_gdf_que'  # alternative feature set: 'pca_n_gdf_que_prev'
    epochs = 50
    batch_size = 512
    n_steps = 1
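    # n_steps is the sequence length fed to the recurrent model; the input
    # shape used below is (n_steps, number of PCA components).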

    filename = os.path.join(
        'res_gru',
        f'res_gru_pca_n_one_layer_{stock}_len{data_length}_r{r}_s{s}.csv')
    # if os.path.exists(filename):
    #     logger.info('Exists %s', filename)
    #     return
    partial_filename = filename + '_partial'

    df_partial = pd.DataFrame()
    if os.path.exists(partial_filename):
        df_partial = pd.read_csv(partial_filename)
        if 'kernel_reg' not in df_partial.columns:
            print('Kernel reg not in columns!')
            df_partial['kernel_reg'] = np.zeros(len(df_partial)).astype(float)
        df_partial.drop(
            columns=[c for c in df_partial.columns if 'Unnamed' in c],
            inplace=True)

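    # Grid over recurrent unit counts and kernel regularisation strengths;
    # combinations already present in the partial CSV are skipped. Note that
    # `units` and `kernel_regularizations` must be supplied by the caller, as
    # the None defaults are not iterable.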
    for unit in units:
        unit_str = f'({unit}: tanh, 1)'
        for kernel_reg in kernel_regularizations:

            if np.any(df_partial):
                row = df_partial[(df_partial['unit'] == unit_str)
                                 & (df_partial['kernel_reg'] == kernel_reg)
                                 & (df_partial['n_steps'] == n_steps)]
                if np.any(row):
                    print(
                        f'Already calculated {stock} {unit_str} {kernel_reg}')
                    continue
            print(f'Will train {stock} r{r} s{s} {unit_str} {kernel_reg}')
            pca = gdf_dfs.get_pca(feature)
            get_model = get_model_func(unit,
                                       input_shape=(n_steps,
                                                    pca.n_components_))
            plot_name = f'plot_lstm/{stock}_one_layer_u{unit}_kr{kernel_reg}_pca_n_r{r}_s{s}'
            score = gdf_dfs.train_lstm(get_model,
                                       feature_name=feature,
                                       method='gru',
                                       fit_kwargs={
                                           'epochs': epochs,
                                           'batch_size': batch_size,
                                           'verbose': 0,
                                           'shuffle': False
                                       },
                                       compile_kwargs={
                                           'loss': 'binary_crossentropy',
                                           'optimizer': 'adam',
                                           'metrics': [matthews_correlation, auc_roc]
                                       },
                                       plot_name=plot_name,
                                       class_weight=weights,
                                       n_steps=n_steps)
            score = {
                **score, 'r': r,
                's': s,
                'unit': unit_str,
                'kernel_reg': kernel_reg,
                'epochs': epochs,
                'batch_size': batch_size,
                'n_steps': n_steps
            }
            df_partial = pd.concat([df_partial, pd.DataFrame([score])],
                                   ignore_index=True)
            df_partial.to_csv(partial_filename)
    df_partial.drop(columns=[c for c in df_partial.columns if 'Unnamed' in c],
                    inplace=True)
    df_partial.to_csv(filename)
    return True
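# A minimal invocation sketch (assumption: the stock identifier and the unit /
# regularisation grids below are illustrative, not values from the original code).
if __name__ == '__main__':
    train_lstm('9061', 0.1, 0.1, 24000,
               units=[4, 8, 16],
               kernel_regularizations=[0.0, 0.001, 0.01])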