def feature_extraction(data_df, inputs):
    """Transform the existing features into a lower-dimensional space.

    Feature extraction (as opposed to selection) maps the data onto new
    axes via principal component analysis -- dimensionality reduction
    that can help linearly separate the data.
    """
    factor_cols = tuple(inputs)
    timed_pca = timeme(principal_component_analysis)
    timed_pca(data_df, factor_cols)
def feature_selection(train_df, inputs):
    """Rank input features to find the most telling variables.

    Feature selection: select a subset of the existing features without a
    transformation. Use this to limit down the factors we learn on.
    Currently runs (timed) random-forest impurity-based importance and
    logistic-regression weight-based importance, then prints both rankings.
    Sequential Backward Selection over several estimators is retained
    below but disabled.
    """
    # NOTE(review): `ests` is only consumed by the disabled SBS loop below;
    # kept so that loop can be re-enabled without re-building the list.
    ests = []
    ests.append([
        DecisionTreeClassifier(criterion='entropy', max_depth=3,
                               random_state=0),
        'DecTree'
    ])
    ests.append([
        RandomForestClassifier(criterion='entropy', n_estimators=3,
                               random_state=1, n_jobs=3),
        'RandForest'
    ])
    # ests.append([SVC(kernel='linear', C=100, random_state=0), 'SVC'])
    # liblinear is required for penalty='l1' (it was scikit-learn's historical
    # default solver; newer defaults such as lbfgs reject an L1 penalty)
    ests.append([
        LogisticRegression(C=100, random_state=0, penalty='l1',
                           solver='liblinear'),
        'LogRegr'
    ])
    # ests.append([AdalineSGD(n_iter=15, eta=0.001, random_state=1),
    #              'AdalineSGD'])
    # ests.append([AdalineGD(n_iter=20, eta=0.001), 'AdalineGD'])
    ests.append([KNeighborsClassifier(n_neighbors=3), 'Kmeans'])

    ranks = []
    # Sequential Backward Selection -- disabled
    # for ind_est in ests:
    #     print("running for {}".format(ind_est[1]))
    #     ranks.append([ind_est[1], timeme(sbs_run)(train_df, tuple(inputs),
    #                   est=ind_est[0], name=ind_est[1])])

    # Random Forest Feature Selection - using a random forest to identify
    # which factors decrease impurity the most
    ranks.append([
        timeme(random_forest_feature_importance)(train_df, tuple(inputs)),
        'RandForestFeats'
    ])

    # Logistic Regression Feature Selection - logistic regression
    # should expose the important variables through its weights
    ranks.append([
        timeme(logistic_regression_feature_importance)(train_df,
                                                       tuple(inputs)),
        "LogRegrWgts"
    ])

    # Each entry is [result, name]
    for rank in ranks:
        print("Ranks for {}".format(rank[1]))
        print(rank[0])
def run_analysis(asof='20190703'):
    """Run thru the whole analysis process.

    Pipeline: screener filter -> momentum check -> big-vs-small filter ->
    ignore list -> equity valuation, printing the surviving stock count
    after each stage.

    Parameters:
        asof (str): 'YYYYMMDD' date used for the momentum and big-vs-small
            checks. Defaults to '20190703', the previously hard-coded value,
            so existing callers are unaffected.
    """
    # Run the filter
    print("Running Screener:")
    ticks = timeme(run_filter)()
    print("{} stocks thru filter\n".format(len(ticks)))

    # check recent momentum returns
    print("Running Momentum Check:")
    ticks = timeme(check_momentum)(asof, ticks)
    print("{} stocks thru momentum checks\n".format(len(ticks)))

    # check recent big vs small results
    print("Running Big vs. Small Filter:")
    ticks = timeme(check_big_v_small)(asof, ticks.reset_index())
    print("{} stocks thru big vs small filter\n".format(len(ticks)))

    # remove ticks that should be ignored
    print("Ignoring Certain Symbols:")
    ticks = timeme(ignore_ticks)(ticks.reset_index())
    print("{} stocks thru ignore filter\n".format(len(ticks)))

    # Run equity valuation
    print("Running Equity Valuation on: {}".format(
        ticks.index.levels[0].values))
    timeme(run_eq_valuation)(ticks)
    print()
def model_evaluation(data_df, inputs):
    """
    Evaluate the performance of your model thru different techniques
    """
    # NOTE(review): dead code -- this definition is shadowed by the identical
    # redefinition of model_evaluation later in this module; only the later
    # one takes effect at import time.
    timeme(kfold_cross_validation)(data_df, tuple(inputs))
def model_evaluation(df, inputs):
    """Evaluate model performance via (timed) k-fold cross-validation.

    NOTE(review): this duplicates an earlier ``model_evaluation`` definition
    in this module; being defined later, this one shadows it and is the
    definition callers actually get. The earlier copy should be removed.
    """
    timeme(kfold_cross_validation)(df, tuple(inputs))