Code example #1
# Shared imports for the examples in this section; prepare_train_test,
# get_metrics, get_decision_boundaries and get_features are helpers defined
# elsewhere in the project.
import numpy as np
import pandas as pd
from kneed import KneeLocator
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor


def knn_optimize(input_file, transforms, parameters, algorithmType):
    train_data_set = prepare_train_test(input_file, transforms, parameters)
    res_data_set = []
    for df_train, df_test, df_train_labels, df_test_labels, _ in train_data_set:
        label = parameters['trainLabel']
        df_train_target = df_train_labels[label]
        df_test_target = df_test_labels[label]

        # Sweep the candidate k values, recording the test accuracy for each.
        Y = []
        X = []
        for k in parameters['n_neighbors']:
            classifier = KNeighborsClassifier(n_neighbors=k,
                                              p=parameters['P'][0],
                                              metric=parameters['metric'][0])
            classifier.fit(df_train, df_train_target)
            score = classifier.score(df_test, df_test_target)
            X.append(k)
            Y.append(score)
        # Locate the elbow of the accuracy curve; kn.knee is None when no
        # knee is found.
        kn = KneeLocator(X, Y, S=1.2, curve='convex', direction='decreasing')
        n_neighbors = int(round(kn.knee))
        res_data_set.append([
            np.array([X, Y]).T, {
                'n_neighbors': n_neighbors,
                'P': parameters['P'][0],
                'metric': parameters['metric'][0]
            }
        ])
    return res_data_set
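
As a standalone illustration of the elbow-search pattern used above, the sketch below sweeps k on synthetic data and hands the resulting error curve to KneeLocator. Everything here (make_classification, train_test_split, the k range) is an assumption standing in for prepare_train_test and the project's parameters; it is a minimal sketch, not the project pipeline.

from kneed import KneeLocator
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Synthetic stand-in for the prepared train/test split.
X, y = make_classification(n_samples=400, n_features=8, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

ks, errors = [], []
for k in range(1, 31):
    clf = KNeighborsClassifier(n_neighbors=k, p=2, metric='minkowski')
    clf.fit(X_train, y_train)
    ks.append(k)
    errors.append(1 - clf.score(X_test, y_test))  # error rate, not accuracy

# A convex, decreasing curve matches the KneeLocator settings used above;
# kn.knee is None when no knee is detected.
kn = KneeLocator(ks, errors, S=1.2, curve='convex', direction='decreasing')
print('suggested n_neighbors:', kn.knee)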
Code example #2
def pca_analyse(input_file, transforms, parameters):
    train_data_set = prepare_train_test(input_file, transforms, parameters)
    res_data_set = []
    for [df_train, _] in train_data_set:
        # Keep as many components as there are features so the full variance
        # spectrum is reported.
        k = df_train.shape[1]
        pca = PCA(n_components=k)
        pca.fit(df_train)

        metrics = np.array(
            [pca.explained_variance_ratio_ * 100, pca.singular_values_])
        res_data_set.append([metrics, metrics.T])
    return res_data_set
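
The same variance/singular-value report can be reproduced on any numeric matrix; a minimal sketch, with load_iris standing in (as an assumption) for the prepared training frame:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA

X = load_iris().data
pca = PCA(n_components=X.shape[1])  # keep every component, as above
pca.fit(X)

# One row per component: explained variance (%) paired with its singular value.
metrics = np.array([pca.explained_variance_ratio_ * 100, pca.singular_values_])
print(metrics.T)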
Code example #3
def grid_optimize(input_file, transforms, parameters, algorithmType):
    train_data_set = prepare_train_test(input_file, transforms, parameters)
    res_data_set = []
    for df_train, _, df_train_labels, _, _ in train_data_set:
        label = parameters['trainLabel']
        df_train_target = df_train_labels[label]

        params = []
        main_param = ''
        if algorithmType == 3:
            classifier = LogisticRegression()
            main_param = 'C'
            params = ['C', 'solver', 'penalty']
        elif algorithmType == 4:
            if parameters.get('useSVR', False):
                classifier = SVR()
            else:
                classifier = SVC(random_state=parameters['random_state'][0])
            main_param = 'C'
            params = ['C', 'gamma', 'kernel', 'degree']
        elif algorithmType == 7:
            main_param = 'max_depth'
            params = ['max_depth', 'random_state', 'criterion']
            if parameters['regression']:
                classifier = DecisionTreeRegressor()
            else:
                classifier = DecisionTreeClassifier()
        elif algorithmType == 8:
            main_param = 'max_depth'
            params = ['max_depth', 'random_state', 'criterion']
            if parameters['regression']:
                classifier = RandomForestRegressor()
            else:
                classifier = RandomForestClassifier()
        else:
            continue

        param_grid = {p: parameters[p] for p in params}
        gridCV = GridSearchCV(classifier, param_grid=param_grid)
        gridCV.fit(df_train, df_train_target)

        # Pair each candidate's main parameter value with its mean CV score,
        # skipping NaN scores.
        result = []
        for idx, val in enumerate(gridCV.cv_results_['mean_test_score']):
            if not np.isnan(val):
                p = gridCV.cv_results_['params'][idx]
                result.append([p[main_param], val])
        res_data_set.append([result, gridCV.best_params_])
    return res_data_set
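
A self-contained version of the grid-search step for the algorithmType == 4 (SVC) branch; the dataset and the parameter grid below are illustrative assumptions, not the project's values:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

X, y = make_classification(n_samples=200, random_state=0)
param_grid = {'C': [0.1, 1, 10],
              'gamma': ['scale', 'auto'],
              'kernel': ['rbf', 'linear']}
gridCV = GridSearchCV(SVC(random_state=0), param_grid=param_grid)
gridCV.fit(X, y)

# Pair each candidate's main parameter ('C') with its mean CV score,
# skipping NaN scores, as in the loop above.
result = [[p['C'], s]
          for p, s in zip(gridCV.cv_results_['params'],
                          gridCV.cv_results_['mean_test_score'])
          if not np.isnan(s)]
print(result[:3], gridCV.best_params_)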
Code example #4
def lda_analyse(input_file, transforms, parameters):
    train_data_set = prepare_train_test(input_file, transforms, parameters)
    res_data_set = []
    for [df_train, _, df_train_target, _] in train_data_set:
        # LDA is supervised, so the projection below is fitted against the
        # target label as well as the features.
        k = parameters['n_components']
        lda = LinearDiscriminantAnalysis(n_components=k)
        df_new = lda.fit_transform(df_train,
                                   df_train_target[parameters['trainLabel']])
        metrics = np.array([lda.explained_variance_ratio_ * 100])

        date_index = input_file.loc[df_train.index, 'Date']
        res_data_set.append(
            [np.concatenate(([date_index], df_new.T), axis=0), metrics.T])
    return res_data_set
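
Because LDA is supervised, fit_transform takes the labels as well as the features; a minimal sketch with load_iris assumed in place of the project data (iris allows at most two discriminant components):

import numpy as np
from sklearn.datasets import load_iris
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

X, y = load_iris(return_X_y=True)
lda = LinearDiscriminantAnalysis(n_components=2)  # must be <= n_classes - 1
X_new = lda.fit_transform(X, y)

metrics = np.array([lda.explained_variance_ratio_ * 100])
print(X_new.shape, metrics.T)  # (150, 2) and variance % per component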
Code example #5
def kmean_clustering(input_file, transforms, parameters, optimize):
    df0 = pd.DataFrame(index=input_file.index)
    if 'Open' in input_file:
        # Forward open-to-open price change, used below to score clusters.
        df0['Ret'] = input_file.Open.shift(
            -2, fill_value=0) - input_file.Open.shift(-1, fill_value=0)

    train_data_set = prepare_train_test(input_file, transforms, parameters)
    res_data_set = []
    for df_train, df_test in train_data_set:
        noSplit = False
        if df_test is None:
            df_test = df_train
            noSplit = True
        df0_train = df0.loc[df_train.index, :]
        df0_test = df0.loc[df_test.index, :]

        n_clusters = parameters['n_clusters']
        random_state = parameters['random_state']
        init = parameters['init']

        if not optimize:
            k_means = KMeans(n_clusters=n_clusters,
                             random_state=random_state,
                             init=init)

        Y = []
        X = []
        if optimize:
            # Elbow method: fit KMeans for each candidate k, record its
            # inertia, and min-max normalise the curve before knee detection.
            maxY = None
            minY = None
            for k in n_clusters:
                k_means = KMeans(n_clusters=k,
                                 random_state=random_state[0],
                                 init=init[0])
                k_means.fit(df_train)
                Y.append(float(k_means.inertia_))
                if maxY is None or k_means.inertia_ > maxY:
                    maxY = k_means.inertia_
                if minY is None or k_means.inertia_ < minY:
                    minY = k_means.inertia_
                X.append(k)
            Y = [(y - minY) / (maxY - minY) for y in Y]
            kn = KneeLocator(X,
                             Y,
                             S=1.2,
                             curve='convex',
                             direction='decreasing')
            # KMeans requires an integer cluster count; kn.knee is None when
            # no elbow is detected.
            n_clusters = int(round(kn.knee))
            k_means = KMeans(n_clusters=n_clusters,
                             random_state=random_state[0],
                             init=init[0])

        k_means.fit(df_train)
        df_train['Tar'] = k_means.predict(df_train)
        if not noSplit:
            df_test['Tar'] = k_means.predict(df_test)
        graph = []

        if 'Open' in input_file:
            df0_test_ = df0_test['Ret'][df0_test.index < df0.shape[0] - 2]
            graph.extend([
                np.cumsum(
                    np.insert(df0_test_.loc[df_test['Tar'] == c].to_numpy(), 0,
                              0)) for c in range(0, n_clusters)
            ])
        # graph.extend([np.insert(df0_test_.loc[df_test['Tar'] == c].to_numpy(), 0, 0) for c in range(0, n_clusters)])
        # graph = [df_train[col].to_numpy() for col in df_test]
        # graph = [df_test.loc[df_test['Tar'] == c].to_numpy() for c in range(0, n_clusters)]
        if 'Open' in input_file:
            metrics = [[
                df0_train['Ret'][df0_train.index < df0.shape[0] -
                                 2].loc[df_train['Tar'] == c].sum(),
                df0_test_.loc[df_test['Tar'] == c].sum()
            ] for c in range(0, n_clusters)]
        else:
            metrics = []

        test_features = []
        if df_test.shape[1] == 3:
            for c in range(0, n_clusters):
                df_test_1 = df_test[df_test['Tar'] == c]
                test_features.append(
                    [df_test_1.values[:, 0], df_test_1.values[:, 1]])
        train_features = []
        if df_train.shape[1] == 3:
            for c in range(0, n_clusters):
                df_train_1 = df_train[df_train['Tar'] == c]
                train_features.append(
                    [df_train_1.values[:, 0], df_train_1.values[:, 1]])

        if optimize:
            res_data_set.append([
                np.array([X, Y]).T, {
                    'n_clusters': n_clusters,
                    'init': init[0],
                    'random_state': random_state[0]
                }, [train_features, test_features]
            ])
        else:
            res_data_set.append(
                [graph, metrics, [train_features, test_features]])
    return res_data_set
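
The optimize branch above is the classic elbow method over a min-max normalised inertia curve. A standalone sketch on synthetic blobs (make_blobs and the k range are assumptions standing in for the project's data and parameters):

from kneed import KneeLocator
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=300, centers=4, random_state=0)

ks, inertias = [], []
for k in range(1, 11):
    km = KMeans(n_clusters=k, random_state=0, init='k-means++', n_init=10)
    km.fit(X)
    ks.append(k)
    inertias.append(float(km.inertia_))

# Min-max normalise the curve, as kmean_clustering does, then find the elbow.
lo, hi = min(inertias), max(inertias)
norm = [(y - lo) / (hi - lo) for y in inertias]
kn = KneeLocator(ks, norm, S=1.2, curve='convex', direction='decreasing')
print('suggested n_clusters:', kn.knee)  # ~4 for this data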
Code example #6
def knn_classifier(input_file, transforms, parameters, algorithmType):
    input_file["Date"] = input_file["Date"] + " " + input_file["Time"]
    train_data_set = prepare_train_test(input_file, transforms, parameters)
    res_data_set = []

    for X_train, X_test, y_train, y_test, trained_params in train_data_set:
        label = parameters['trainLabel']

        if algorithmType == 1:
            classifier = KNeighborsClassifier(
                n_neighbors=parameters['n_neighbors'],
                p=parameters['P'],
                metric=parameters['metric'])
        elif algorithmType == 2:
            classifier = LinearRegression()
        elif algorithmType == 3:
            classifier = LogisticRegression(
                random_state=0,
                solver=parameters.get('solver', 'lbfgs'),
                penalty=parameters.get('penalty', 'l2'))
        elif algorithmType == 4:
            if parameters.get('useSVR', False):
                model = SVR(gamma=parameters.get('gamma', 'auto'),
                            kernel=parameters.get('kernel', 'rbf'),
                            degree=parameters.get('degree', 3))
            else:
                model = SVC(gamma=parameters.get('gamma', 'auto'),
                            kernel=parameters.get('kernel', 'rbf'),
                            degree=parameters.get('degree', 3))
            classifier = model  #make_pipeline(StandardScaler(), model)
        elif algorithmType == 6:
            classifier = LinearDiscriminantAnalysis()
        elif algorithmType == 7:
            if not parameters.get('regression', False):
                classifier = DecisionTreeClassifier(
                    max_depth=parameters.get('max_depth', 2),
                    random_state=parameters.get('random_state', 0),
                    criterion=parameters.get('criterion', 'gini'))
            else:
                classifier = DecisionTreeRegressor(
                    max_depth=parameters.get('max_depth', 2),
                    random_state=parameters.get('random_state', 0),
                    criterion=parameters.get('criterion', 'mse'))
        elif algorithmType == 8:
            if not parameters.get('regression', False):
                classifier = RandomForestClassifier(
                    max_depth=parameters.get('max_depth', 2),
                    random_state=parameters.get('random_state', 0),
                    n_estimators=parameters.get('n_estimators', 100),
                    criterion=parameters.get('criterion', 'gini'))
            else:
                classifier = RandomForestRegressor(
                    max_depth=parameters.get('max_depth', 2),
                    random_state=parameters.get('random_state', 0),
                    n_estimators=parameters.get('n_estimators', 100),
                    criterion=parameters.get('criterion', 'mse'))
        elif algorithmType == 9:
            layers = parameters.get('hidden_layer_sizes', '5,2').split(',')
            hidden_layers = []
            for layer in layers:
                hidden_layers.append(int(layer))
            batch_size = parameters.get('batch_size', 'auto')
            batch_size = int(batch_size) if batch_size != 'auto' else 'auto'
            if not parameters.get('regression', False):
                classifier = MLPClassifier(
                    hidden_layer_sizes=hidden_layers,
                    solver=parameters.get('solver', 'sgd'),
                    alpha=parameters.get('alpha', 0.00001),
                    random_state=parameters.get('random_state', 0),
                    learning_rate_init=parameters.get('learning_rate_init',
                                                      0.001),
                    learning_rate=parameters.get('learning_rate', 'constant'),
                    batch_size=batch_size,
                    max_iter=500)
            else:
                classifier = MLPRegressor(
                    hidden_layer_sizes=hidden_layers,
                    solver=parameters.get('solver', 'sgd'),
                    alpha=parameters.get('alpha', 0.00001),
                    random_state=parameters.get('random_state', 0),
                    learning_rate_init=parameters.get('learning_rate_init',
                                                      0.001),
                    learning_rate=parameters.get('learning_rate', 'constant'),
                    batch_size=batch_size,
                    max_iter=500)
        else:
            # Any other algorithmType would leave `classifier` undefined.
            continue

        y_train = y_train[label]
        y_test = y_test[label]
        is_regression = (algorithmType == 2
                         or (algorithmType == 4
                             and parameters.get('useSVR', False))
                         or parameters.get('regression', False))

        if label != 'triple_barrier' and not is_regression:
            y_train = y_train.astype('int').astype('category')
            y_test = y_test.fillna(0).astype('int').astype('category')

        # Drop the label from the inputs when it is not selected as a feature.
        for idx, col in enumerate(parameters['features']):
            if col == label and parameters['inputFilters'][idx] is False:
                del X_train[label]
                del X_test[label]
                break

        classifier.fit(X_train, y_train)

        # if algorithmType == 3 or algorithmType == 4:
        #   df_train = df_train.astype('int').astype('category')
        #   df_test = df_test.astype('int').astype('category')
        p_train = classifier.predict(X_train)
        p_test = classifier.predict(X_test)

        y_test1 = y_test.dropna()
        df_test_score, df_test_cm = get_metrics(y_test1, p_test[:len(y_test1)],
                                                is_regression, algorithmType)
        df_train_score, _ = get_metrics(y_train, p_train, is_regression,
                                        algorithmType)

        date_index = input_file.loc[y_test.index, "Date"]
        date_index = date_index.dropna()
        res = [np.array(date_index)]

        y_test = y_test.dropna().to_numpy()
        res.append(y_test)
        res.append(p_test)

        contours, features = [], []
        if not is_regression and X_test.shape[1] == 2:
            X_train = pd.DataFrame(index=X_train.index)
            for col in input_file:
                if col == 'No' or X_train.shape[1] >= 2:
                    continue
                X_train[col] = input_file.loc[X_train.index, col]

            X_test = pd.DataFrame(index=X_test.index)
            for col in input_file:
                if col == 'No' or X_test.shape[1] >= 2:
                    continue
                X_test[col] = input_file.loc[X_test.index, col]
            contours_train, features_train = get_decision_boundaries(
                classifier, X_train, y_train, 200, transforms, trained_params,
                algorithmType, parameters)
            features_test = get_features(X_test, y_test)
            features = np.array([features_train, features_test])
            contours = contours_train
        if is_regression and X_test.shape[1] == 1:
            features = np.array(
                [[X_train[X_train.columns[0]].values, y_train.values],
                 [X_test[X_test.columns[0]].values, y_test]])
            contours = np.array([[X_train[X_train.columns[0]].values, p_train],
                                 [X_test[X_test.columns[0]].values, p_test]])

        res = np.array(res)
        res_data = [
            res,
            np.array([df_train_score, df_test_score]).T, df_test_cm, contours,
            features
        ]
        res_data_set.append(res_data)
    return res_data_set
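
To exercise a single branch of the dispatcher end to end, the sketch below reproduces the algorithmType == 1 (KNN) path on synthetic data; the parameters dict, the dataset, and the accuracy computation are assumptions replacing prepare_train_test and get_metrics:

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Hypothetical parameters matching the keys read by the KNN branch above.
parameters = {'n_neighbors': 5, 'P': 2, 'metric': 'minkowski'}

X, y = make_classification(n_samples=300, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

clf = KNeighborsClassifier(n_neighbors=parameters['n_neighbors'],
                           p=parameters['P'],
                           metric=parameters['metric'])
clf.fit(X_train, y_train)
p_train = clf.predict(X_train)
p_test = clf.predict(X_test)
print('train acc:', (p_train == y_train).mean(),
      'test acc:', (p_test == y_test).mean())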