# Set up the model date windows for this pass: each window runs from its
# start date for the configured number of months.
oos_end_date = isOosDates[5]
modelStartDate = is_start_date
modelEndDate = modelStartDate + relativedelta(months=is_months)
oosModelStartDate = oos_start_date
oosModelEndDate = oosModelStartDate + relativedelta(months=oos_months)

# Correlation study on a copy of the data spanning modelStartDate through
# oosModelEndDate, so the original dataSet is left untouched.
corrData = dataSet[modelStartDate:oosModelEndDate].copy()

# Exclude every feature already marked 'Drop' in feature_dict, plus the
# OHLC/volume and gainAhead/beLong columns listed below, before correlating.
col_vals = [k for k, v in feature_dict.items() if v == 'Drop']
to_drop = ['Open', 'High', 'Low', 'gainAhead', 'Close', 'beLong', 'Volume']
col_vals.extend(to_drop)
corrData = dSet.drop_columns(corrData, col_vals)
plotIt.correlation_matrix(corrData)

# Create correlation matrix
corr_matrix = corrData.corr()

# Select the strict upper triangle (k=1 skips the diagonal) so each feature
# pair is inspected once; everything else becomes NaN and never passes the
# threshold test. The builtin ``bool`` is used for the mask dtype because the
# ``np.bool`` alias was removed in NumPy 1.24 and raises AttributeError there.
upper = corr_matrix.where(
    np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find feature columns whose correlation with an earlier column exceeds 0.85.
# NOTE(review): only positive correlations are tested; a strongly negative
# correlation (e.g. -0.95) is not flagged. Confirm that is intended.
to_drop = [column for column in upper.columns if any(upper[column] > 0.85)]
print(to_drop)

# Record the highly correlated features as dropped for later steps.
for x in to_drop:
    feature_dict[x] = 'Drop'

# initialize dataframes for trade analysis
tradesDataFull = pd.DataFrame()
valDataFull = pd.DataFrame()
# Save the dataset used for analysis as a pickle in the system directory.
# TODO: this should be a function.
print("====Saving dataSet====\n")
file_title = "raw-features-" + system_name + ".pkl"
file_name = os.path.join(system_directory, file_title)
dataSet2.to_pickle(file_name)

# Examine correlations of features.
# Get columns to drop from feature_dict
col_vals = [k for k, v in feature_dict.items() if v == 'Drop']

# And set OHLC, etc., to Drop for cleaner correlation analysis
to_drop = ['Open', 'High', 'Low', 'gainAhead', 'Close', 'beLong', 'AdjClose']
col_vals.extend(to_drop)
mmData = dSet.drop_columns(dataSet2, col_vals)
plotIt.correlation_matrix(mmData)

# Create correlation matrix
corr_matrix = mmData.corr()

# Select the strict upper triangle (k=1 skips the diagonal) so each feature
# pair is inspected once; everything else becomes NaN and never passes the
# threshold test. The builtin ``bool`` is used for the mask dtype because the
# ``np.bool`` alias was removed in NumPy 1.24 and raises AttributeError there.
upper = corr_matrix.where(
    np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find feature columns whose correlation with an earlier column exceeds 0.7.
# NOTE(review): earlier comments here said 0.85, but the code has used 0.7;
# the code value is kept. Confirm which threshold is intended.
# NOTE(review): only positive correlations are tested; strongly negative
# correlations are not flagged.
to_drop = [column for column in upper.columns if any(upper[column] > 0.7)]
print('Column(s) to drop: %s' % to_drop)

# Mark every flagged column as 'Drop' in the feature dict; an empty list
# simply leaves feature_dict unchanged.
for x in to_drop:
    feature_dict[x] = 'Drop'