import numpy as np
import pandas as pd
from kneed import KneeLocator
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

# prepare_train_test, get_metrics, get_decision_boundaries and get_features
# are project helpers defined elsewhere in this codebase.


def knn_optimize(input_file, transforms, parameters, algorithmType):
    """Score KNN over a range of k and pick the elbow of the accuracy curve."""
    train_data_set = prepare_train_test(input_file, transforms, parameters)
    res_data_set = []
    for df_train, df_test, df_train_labels, df_test_labels, _ in train_data_set:
        label = parameters['trainLabel']
        df_train_target = df_train_labels[label]
        df_test_target = df_test_labels[label]
        X, Y = [], []
        for k in parameters['n_neighbors']:
            classifier = KNeighborsClassifier(n_neighbors=k,
                                              p=parameters['P'][0],
                                              metric=parameters['metric'][0])
            classifier.fit(df_train, df_train_target)
            # Test-set accuracy for this k.
            score = classifier.score(df_test, df_test_target)
            X.append(k)
            Y.append(score)
        # Locate the elbow of the k-vs-accuracy curve.
        kn = KneeLocator(X, Y, S=1.2, curve='convex', direction='decreasing')
        n_neighbors = int(round(kn.knee))
        res_data_set.append([
            np.array([X, Y]).T,
            {
                'n_neighbors': n_neighbors,
                'P': parameters['P'][0],
                'metric': parameters['metric'][0]
            }
        ])
    return res_data_set
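# A minimal, self-contained sketch of the elbow detection used above, assuming
# only numpy-free synthetic data and the kneed package; the accuracy curve here
# is made up for illustration, not produced by prepare_train_test.
def _demo_knee_detection():
    from kneed import KneeLocator
    ks = list(range(1, 21))
    # A hypothetical convex, decreasing score curve.
    scores = [1.0 / (k ** 0.5) for k in ks]
    kn = KneeLocator(ks, scores, S=1.2, curve='convex', direction='decreasing')
    # The k at the elbow of the curve.
    return int(round(kn.knee))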
def pca_analyse(input_file, transforms, parameters):
    """Fit a full-rank PCA and report explained variance and singular values."""
    train_data_set = prepare_train_test(input_file, transforms, parameters)
    res_data_set = []
    for df_train, _ in train_data_set:
        # Keep as many components as there are features.
        k = df_train.shape[1]
        pca = PCA(n_components=k)
        pca.fit(df_train)
        metrics = np.array(
            [pca.explained_variance_ratio_ * 100, pca.singular_values_])
        res_data_set.append([metrics, metrics.T])
    return res_data_set
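# A minimal sketch of reading PCA's explained variance, assuming random data;
# it counts how many components cover 95% of the variance, a common follow-up
# to the full-rank fit performed by pca_analyse. The 95% threshold is an
# illustrative choice, not something the function above enforces.
def _demo_pca_variance():
    import numpy as np
    from sklearn.decomposition import PCA
    rng = np.random.default_rng(0)
    data = rng.normal(size=(200, 6))
    pca = PCA(n_components=6).fit(data)
    cumulative = np.cumsum(pca.explained_variance_ratio_ * 100)
    # First component count at which cumulative variance reaches 95%.
    return int(np.searchsorted(cumulative, 95)) + 1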
def grid_optimize(input_file, transforms, parameters, algorithmType):
    """Run GridSearchCV for the selected algorithm and collect mean test scores."""
    train_data_set = prepare_train_test(input_file, transforms, parameters)
    res_data_set = []
    for df_train, _, df_train_labels, _, _ in train_data_set:
        label = parameters['trainLabel']
        df_train_target = df_train_labels[label]
        params = []
        main_param = ''
        if algorithmType == 3:
            classifier = LogisticRegression()
            main_param = 'C'
            params = ['C', 'solver', 'penalty']
        elif algorithmType == 4:
            if parameters.get('useSVR', False):
                classifier = SVR()
            else:
                classifier = SVC(random_state=parameters['random_state'][0])
            main_param = 'C'
            params = ['C', 'gamma', 'kernel', 'degree']
        elif algorithmType == 7:
            main_param = 'max_depth'
            params = ['max_depth', 'random_state', 'criterion']
            if parameters['regression']:
                classifier = DecisionTreeRegressor()
            else:
                classifier = DecisionTreeClassifier()
        elif algorithmType == 8:
            main_param = 'max_depth'
            params = ['max_depth', 'random_state', 'criterion']
            if parameters['regression']:
                classifier = RandomForestRegressor()
            else:
                classifier = RandomForestClassifier()
        else:
            continue
        param_grid = {p: parameters[p] for p in params}
        gridCV = GridSearchCV(classifier, param_grid=param_grid)
        gridCV.fit(df_train, df_train_target)
        result = []
        for idx, val in enumerate(gridCV.cv_results_['mean_test_score']):
            if not np.isnan(val):
                # Pair the main parameter's value with its mean CV score.
                p = gridCV.cv_results_['params'][idx]
                result.append([p[main_param], val])
        res_data_set.append([result, gridCV.best_params_])
    return res_data_set
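# A minimal, self-contained sketch of the grid-search pattern above, assuming
# scikit-learn's built-in iris data; the parameter grid is illustrative only.
def _demo_grid_search():
    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV
    X, y = load_iris(return_X_y=True)
    grid = GridSearchCV(LogisticRegression(max_iter=500),
                        param_grid={'C': [0.01, 0.1, 1.0, 10.0]})
    grid.fit(X, y)
    # Pair each C with its mean cross-validated score, as grid_optimize does.
    pairs = [[p['C'], s] for p, s in zip(grid.cv_results_['params'],
                                         grid.cv_results_['mean_test_score'])]
    return pairs, grid.best_params_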
def lda_analyse(input_file, transforms, parameters):
    """Project the training data with LDA and report explained variance."""
    train_data_set = prepare_train_test(input_file, transforms, parameters)
    res_data_set = []
    for df_train, _, df_train_target, _ in train_data_set:
        k = parameters['n_components']
        lda = LinearDiscriminantAnalysis(n_components=k)
        df_new = lda.fit_transform(df_train,
                                   df_train_target[parameters['trainLabel']])
        metrics = np.array([lda.explained_variance_ratio_ * 100])
        # Re-attach the dates of the training rows to the projected data.
        date_index = input_file.loc[df_train.index, 'Date']
        res_data_set.append(
            [np.concatenate(([date_index], df_new.T), axis=0), metrics.T])
    return res_data_set
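# A minimal sketch of the LDA projection, assuming synthetic data; note that
# n_components must be at most min(n_features, n_classes - 1), which bounds
# the 'n_components' parameter accepted by lda_analyse.
def _demo_lda_projection():
    from sklearn.datasets import make_classification
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    X, y = make_classification(n_samples=300, n_features=5, n_informative=3,
                               n_classes=3, random_state=0)
    lda = LinearDiscriminantAnalysis(n_components=2)  # <= min(5, 3 - 1)
    projected = lda.fit_transform(X, y)
    return projected.shape, lda.explained_variance_ratio_ * 100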
def kmean_clustering(input_file, transforms, parameters, optimize):
    """Cluster with KMeans; optionally pick n_clusters via the elbow method."""
    df0 = pd.DataFrame(index=input_file.index)
    if 'Open' in input_file:
        # One-bar forward return, used to score each cluster.
        df0['Ret'] = input_file.Open.shift(-2, fill_value=0) \
            - input_file.Open.shift(-1, fill_value=0)
    train_data_set = prepare_train_test(input_file, transforms, parameters)
    res_data_set = []
    for df_train, df_test in train_data_set:
        noSplit = False
        if df_test is None:
            df_test = df_train
            noSplit = True
        df0_train = df0.loc[df_train.index, :]
        df0_test = df0.loc[df_test.index, :]
        n_clusters = parameters['n_clusters']
        random_state = parameters['random_state']
        init = parameters['init']
        if not optimize:
            k_means = KMeans(n_clusters=n_clusters,
                             random_state=random_state,
                             init=init)
        X, Y = [], []
        if optimize:
            maxY = None
            minY = None
            for k in n_clusters:
                k_means = KMeans(n_clusters=k,
                                 random_state=random_state[0],
                                 init=init[0])
                k_means.fit(df_train)
                Y.append(float(k_means.inertia_))
                if maxY is None or k_means.inertia_ > maxY:
                    maxY = k_means.inertia_
                if minY is None or k_means.inertia_ < minY:
                    minY = k_means.inertia_
                X.append(k)
            # Min-max normalise the inertia curve before locating the elbow.
            Y = [(y - minY) / (maxY - minY) for y in Y]
            kn = KneeLocator(X, Y, S=1.2, curve='convex',
                             direction='decreasing')
            n_clusters = int(round(kn.knee))
            k_means = KMeans(n_clusters=n_clusters,
                             random_state=random_state[0],
                             init=init[0])
        k_means.fit(df_train)
        df_train['Tar'] = k_means.predict(df_train)
        if not noSplit:
            df_test['Tar'] = k_means.predict(df_test)
        graph = []
        if 'Open' in input_file:
            # Drop the last two rows, whose forward returns are zero padding.
            df0_test_ = df0_test['Ret'][df0_test.index < df0.shape[0] - 2]
            # Cumulative return per cluster on the test set.
            graph.extend([
                np.cumsum(
                    np.insert(df0_test_.loc[df_test['Tar'] == c].to_numpy(),
                              0, 0)) for c in range(0, n_clusters)
            ])
            # Total train/test return per cluster.
            metrics = [[
                df0_train['Ret'][df0_train.index < df0.shape[0] - 2]
                .loc[df_train['Tar'] == c].sum(),
                df0_test_.loc[df_test['Tar'] == c].sum()
            ] for c in range(0, n_clusters)]
        else:
            metrics = []
        test_features = []
        if df_test.shape[1] == 3:
            for c in range(0, n_clusters):
                df_test_1 = df_test[df_test['Tar'] == c]
                test_features.append(
                    [df_test_1.values[:, 0], df_test_1.values[:, 1]])
        train_features = []
        if df_train.shape[1] == 3:
            for c in range(0, n_clusters):
                df_train_1 = df_train[df_train['Tar'] == c]
                train_features.append(
                    [df_train_1.values[:, 0], df_train_1.values[:, 1]])
        if optimize:
            res_data_set.append([
                np.array([X, Y]).T,
                {
                    'n_clusters': n_clusters,
                    'init': init[0],
                    'random_state': random_state[0]
                },
                [train_features, test_features]
            ])
        else:
            res_data_set.append(
                [graph, metrics, [train_features, test_features]])
    return res_data_set
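# A minimal sketch of the elbow search in kmean_clustering's optimize branch,
# assuming blob data from scikit-learn; the inertia curve is min-max
# normalised before KneeLocator runs, as above.
def _demo_kmeans_elbow():
    from kneed import KneeLocator
    from sklearn.cluster import KMeans
    from sklearn.datasets import make_blobs
    data, _ = make_blobs(n_samples=300, centers=4, random_state=0)
    ks = list(range(2, 11))
    inertias = [KMeans(n_clusters=k, random_state=0, n_init=10)
                .fit(data).inertia_ for k in ks]
    lo, hi = min(inertias), max(inertias)
    normalised = [(v - lo) / (hi - lo) for v in inertias]
    kn = KneeLocator(ks, normalised, S=1.2, curve='convex',
                     direction='decreasing')
    return int(round(kn.knee))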
def knn_classifier(input_file, transforms, parameters, algorithmType):
    """Fit the selected classifier/regressor, then collect scores, predictions
    and (for low-dimensional feature spaces) plotting data."""
    input_file["Date"] = input_file["Date"] + " " + input_file["Time"]
    train_data_set = prepare_train_test(input_file, transforms, parameters)
    res_data_set = []
    for X_train, X_test, y_train, y_test, trained_params in train_data_set:
        label = parameters['trainLabel']
        if algorithmType == 1:
            classifier = KNeighborsClassifier(
                n_neighbors=parameters['n_neighbors'],
                p=parameters['P'],
                metric=parameters['metric'])
        elif algorithmType == 2:
            classifier = LinearRegression()
        elif algorithmType == 3:
            classifier = LogisticRegression(
                random_state=0,
                solver=parameters.get('solver', 'lbfgs'),
                penalty=parameters.get('penalty', 'l2'))
        elif algorithmType == 4:
            if parameters.get('useSVR', False):
                classifier = SVR(gamma=parameters.get('gamma', 'auto'),
                                 kernel=parameters.get('kernel', 'rbf'),
                                 degree=parameters.get('degree', 3))
            else:
                classifier = SVC(gamma=parameters.get('gamma', 'auto'),
                                 kernel=parameters.get('kernel', 'rbf'),
                                 degree=parameters.get('degree', 3))
        elif algorithmType == 6:
            classifier = LinearDiscriminantAnalysis()
        elif algorithmType == 7:
            if not parameters.get('regression', False):
                classifier = DecisionTreeClassifier(
                    max_depth=parameters.get('max_depth', 2),
                    random_state=parameters.get('random_state', 0),
                    criterion=parameters.get('criterion', 'gini'))
            else:
                classifier = DecisionTreeRegressor(
                    max_depth=parameters.get('max_depth', 2),
                    random_state=parameters.get('random_state', 0),
                    criterion=parameters.get('criterion', 'mse'))
        elif algorithmType == 8:
            if not parameters.get('regression', False):
                classifier = RandomForestClassifier(
                    max_depth=parameters.get('max_depth', 2),
                    random_state=parameters.get('random_state', 0),
                    n_estimators=parameters.get('n_estimators', 100),
                    criterion=parameters.get('criterion', 'gini'))
            else:
                classifier = RandomForestRegressor(
                    max_depth=parameters.get('max_depth', 2),
                    random_state=parameters.get('random_state', 0),
                    n_estimators=parameters.get('n_estimators', 100),
                    criterion=parameters.get('criterion', 'mse'))
        elif algorithmType == 9:
            # Hidden layer sizes arrive as a comma-separated string, e.g. '5,2'.
            hidden_layers = [
                int(layer) for layer in
                parameters.get('hidden_layer_sizes', '5,2').split(',')
            ]
            batch_size = parameters.get('batch_size', 'auto')
            if batch_size != 'auto':
                batch_size = int(batch_size)
            mlp_kwargs = dict(
                hidden_layer_sizes=hidden_layers,
                solver=parameters.get('solver', 'sgd'),
                alpha=parameters.get('alpha', 0.00001),
                random_state=parameters.get('random_state', 0),
                learning_rate_init=parameters.get('learning_rate_init', 0.001),
                learning_rate=parameters.get('learning_rate', 'constant'),
                batch_size=batch_size,
                max_iter=500)
            if not parameters.get('regression', False):
                classifier = MLPClassifier(**mlp_kwargs)
            else:
                classifier = MLPRegressor(**mlp_kwargs)
        y_train = y_train[label]
        y_test = y_test[label]
        is_regression = (algorithmType == 2
                         or (algorithmType == 4
                             and parameters.get('useSVR', False))
                         or parameters.get('regression', False))
        if label != 'triple_barrier' and not is_regression:
            # Classifiers expect discrete targets.
            y_train = y_train.astype('int').astype('category')
            y_test = y_test.fillna(0).astype('int').astype('category')
        # Drop the label column from the features when its input filter is off.
        for idx, col in enumerate(parameters['features']):
            if col == label and parameters['inputFilters'][idx] is False:
                del X_train[label]
                del X_test[label]
                break
        classifier.fit(X_train, y_train)
        p_train = classifier.predict(X_train)
        p_test = classifier.predict(X_test)
        y_test1 = y_test.dropna()
        df_test_score, df_test_cm = get_metrics(y_test1,
                                                p_test[:len(y_test1)],
                                                is_regression, algorithmType)
        df_train_score, _ = get_metrics(y_train, p_train, is_regression,
                                        algorithmType)
        date_index = input_file.loc[y_test.index, "Date"].dropna()
        res = [np.array(date_index)]
        y_test = y_test.dropna().to_numpy()
        res.append(y_test)
        res.append(p_test)
        contours, features = [], []
        if not is_regression and X_test.shape[1] == 2:
            # Rebuild the two feature columns from the raw input so the
            # decision boundaries are drawn in the original feature space.
            X_train = pd.DataFrame(index=X_train.index)
            for col in input_file:
                if col == 'No' or X_train.shape[1] >= 2:
                    continue
                X_train[col] = input_file.loc[X_train.index, col]
            X_test = pd.DataFrame(index=X_test.index)
            for col in input_file:
                if col == 'No' or X_test.shape[1] >= 2:
                    continue
                X_test[col] = input_file.loc[X_test.index, col]
            contours_train, features_train = get_decision_boundaries(
                classifier, X_train, y_train, 200, transforms, trained_params,
                algorithmType, parameters)
            features_test = get_features(X_test, y_test)
            features = np.array([features_train, features_test])
            contours = contours_train
        if is_regression and X_test.shape[1] == 1:
            features = np.array(
                [[X_train[X_train.columns[0]].values, y_train.values],
                 [X_test[X_test.columns[0]].values, y_test]])
            contours = np.array(
                [[X_train[X_train.columns[0]].values, p_train],
                 [X_test[X_test.columns[0]].values, p_test]])
        res = np.array(res)
        res_data_set.append([
            res,
            np.array([df_train_score, df_test_score]).T, df_test_cm, contours,
            features
        ])
    return res_data_set
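# A minimal sketch of how a decision boundary can be rasterised for two
# features, assuming an already-fitted classifier; get_decision_boundaries
# (defined elsewhere in this codebase) is more involved, so this only
# illustrates the underlying meshgrid-and-predict idea.
def _demo_decision_grid(classifier, X_2d, resolution=200):
    import numpy as np
    x_min, x_max = X_2d[:, 0].min(), X_2d[:, 0].max()
    y_min, y_max = X_2d[:, 1].min(), X_2d[:, 1].max()
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, resolution),
                         np.linspace(y_min, y_max, resolution))
    # Predict a class for every grid point, then reshape back to the grid.
    labels = classifier.predict(np.c_[xx.ravel(), yy.ravel()])
    return xx, yy, labels.reshape(xx.shape)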