def create_ml_features_df(data_frame):
    """Return ``data_frame`` enriched with ticker-independent ML features.

    New columns:
        - 'Days Since Trading': days since the last trading day
        - 'Days Until Trading': days until the next trading day
        - the date-part columns generated by
          fastai.structured.add_datepart for the 'Date' field

    Args:
        data_frame: pandas DataFrame the features are derived from.
            NOTE(review): the index is presumably named 'Date', since it is
            reset into a 'Date' column below -- confirm against callers.

    Returns:
        A DataFrame with the feature columns added, indexed by 'Date'.
    """
    assert isinstance(data_frame, pd.DataFrame), (
        "data_frame must be pandas.DataFrame object"
    )

    features = create_days_since_valid_date(data_frame, 'Days Since Trading')

    # "Days until" is "days since" measured backwards in time: flip the
    # index order, reuse the same helper, then restore chronological order.
    features.sort_index(ascending=False, inplace=True)
    features = create_days_since_valid_date(features, 'Days Until Trading')
    features.sort_index(ascending=True, inplace=True)

    # add_datepart works on a column, so temporarily move the index into a
    # 'Date' column; drop=False keeps that column so it can become the index
    # again afterwards.
    features = features.reset_index()
    add_datepart(features, 'Date', drop=False)
    return features.set_index('Date')
def divide_dataframe(df, test_size=0.4, y_fld='Actions'):
    '''
    Preprocess a dataset with fastai and split it into training and test data.

    The split is sequential (``shuffle=False``): the leading rows form the
    training set and the trailing ``test_size`` fraction forms the test set.

    Note: ``add_datepart`` is called for its in-place effect, so the ``df``
    passed in by the caller is modified.

    :param df: dataset we want to split into training and test data; must
        contain a 'Date' column and the target column named by ``y_fld``
    :param test_size: size of the test set, forwarded to
        ``train_test_split`` (default 0.4)
    :param y_fld: name of the target column handed to ``proc_df``
        (default 'Actions')
    :return: dict with the keys 'X train', 'X test', 'y train' and 'y test'
    '''
    # Converting date into int with fastai
    add_datepart(df, 'Date')

    # Preprocess with fastai; the two trailing return values are not needed
    # here, so they are bound to throwaway names.
    X, y, _nas, _mapper = proc_df(df, y_fld=y_fld, do_scale=True)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, shuffle=False
    )

    return {
        'X train': X_train,
        'X test': X_test,
        'y train': y_train,
        'y test': y_test,
    }
# Order the rows by index; sort_index returns a new frame here, so `df`
# itself is not reordered.
data = df.sort_index(ascending=True, axis=0)

# Creating a separate dataset holding only 'Date' and 'Close', re-indexed
# 0..n-1. Selecting the columns in one step replaces a row-by-row copy that
# relied on chained assignment (new_data['Date'][i] = ...), which pandas
# warns about and which does not write through under Copy-on-Write.
new_data = data[['Date', 'Close']].reset_index(drop=True)
print(new_data)

# Create features
from fastai.structured import add_datepart
add_datepart(new_data, 'Date')
new_data.drop('Elapsed', axis=1, inplace=True)  # elapsed will be the time stamp
print(new_data)

# Flag rows whose 'Dayofweek' is 0 or 4 (Monday / Friday) with 1, all other
# days with 0.
new_data['mon_fri'] = new_data['Dayofweek'].isin([0, 4]).astype('int64')

# Split into train and validation: the first 987 rows train the model, the
# remaining rows validate it.
train = new_data[:987]
valid = new_data[987:]

x_train = train.drop('Close', axis=1)
import pandas as pd
import numpy as np
from fastai.structured import add_datepart
from sklearn import neighbors
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler

import db_functions

# Scale every feature into the [0, 1] range.
scaler = MinMaxScaler(feature_range=(0, 1))

# Load the dataset and derive date-part features from 'Date'; add_datepart
# modifies `data` in place.
data = db_functions.getDb()
add_datepart(data, 'Date')
data.drop('Elapsed', axis=1, inplace=True)

# Sequential split: the first 987 rows train the model, the rest validate it.
train = data[:987]
valid = data[987:]

x_train = train.drop('Close', axis=1)
y_train = train['Close']
x_valid = valid.drop('Close', axis=1)
y_valid = valid['Close']

# Fit the scaler on the training features only, then apply that same fitted
# transformation to the validation features. Refitting on the validation set
# would scale the two sets with different min/max values and leak validation
# statistics into preprocessing. Validation values may therefore fall
# slightly outside [0, 1]; that is expected.
x_train_scaled = scaler.fit_transform(x_train)
x_train = pd.DataFrame(x_train_scaled)
x_valid_scaled = scaler.transform(x_valid)
x_valid = pd.DataFrame(x_valid_scaled)

# Grid-search the number of neighbours with 5-fold cross-validation.
params = {'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9]}
knn = neighbors.KNeighborsRegressor()
model = GridSearchCV(knn, params, cv=5)
newdf.head(5)


# In[ ]:


# Apart from this, we can add our own set of features that we believe would
# be relevant for the predictions. For instance, my hypothesis is that the
# first and last days of the week could affect the closing price of the stock
# far more than the other days. So I have created a feature that identifies
# whether a given day is Monday/Friday or Tuesday/Wednesday/Thursday. This can
# be done using the following lines of code:


# In[ ]:


# Create features
from fastai.structured import add_datepart
add_datepart(newdf, 'Date')
newdf.drop('Elapsed', axis=1, inplace=True)  # elapsed will be the time stamp


# In[ ]:


# Create a new column 'mon-fri': 1 when 'Dayofweek' is 0 or 4 (Monday or
# Friday), 0 otherwise. The column is computed in one vectorized step instead
# of a per-row chained assignment (newdf['mon-fri'][i] = ...), which pandas
# warns about and which does not write through under Copy-on-Write.
import sys
newdf['mon-fri'] = newdf['Dayofweek'].isin([0, 4]).astype('int64')