import numpy as np
from data_cleaning import DataCleaner
from features_engineering import FeatureExtractor
from model_selection import ModelSelector
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Switch matplotlib to interactive mode for the whole session.
plt.interactive(True)

if __name__ == '__main__':
    # Run the cleaning step; DataCleaner is built with its default arguments.
    dc = DataCleaner()
    data = dc.clean()

    # Debug transformations
    # data.to_csv('./data/debug.csv', index=False, encoding='latin1')
    # assert False

    # Detach the response column; pop() removes 'Target' from `data` in
    # place, so only the remaining columns are passed on as predictors.
    target = data.pop('Target')

    # Hold out part of the rows for evaluation. No test_size or random_state
    # is given, so the library defaults apply and the split is unseeded.
    splits = train_test_split(data, target)
    data_train, data_test, target_train, target_test = splits

    # Run both partitions through the same FeatureExtractor instance.
    featurizer = FeatureExtractor()
    X_train = featurizer.featurize(data_train)
    X_test = featurizer.featurize(data_test)

    # Convert the training target to a numpy array.
    y_train = np.array(target_train)
# Create the sub_area categorical with all levels shared between train and
# test to avoid errors: the test rows are tagged with a sentinel price,
# stacked under the training rows, converted together, and split apart again.
test['price_doc'] = -99
merged = pd.concat([train, test], axis=0)
merged = merged.merge(gps, how='left', on='sub_area')
merged['sub_area'] = merged.sub_area.astype('category')

# Evaluate the sentinel comparison once and reuse it for both halves.
is_test_row = merged.price_doc == -99
train = merged[~is_test_row]
# drop() returns a new frame, so the sentinel column is removed without
# mutating a filtered view of `merged` in place.
test = merged[is_test_row].drop('price_doc', axis=1)

# Attach the macro table to the training rows by timestamp; columns present
# in both frames get the '_train' / '_macro' suffixes.
macro = pd.read_csv('data/macro.csv', parse_dates=['timestamp'])
train = train.merge(macro, how='left', on='timestamp',
                    suffixes=('_train', '_macro'))

# Clean
dc = DataCleaner(data=train, sample_rate=0.3)
data, y = dc.clean()
# Model the target on a log(y + 1) scale.
y = np.log(np.array(y) + 1)

# Train / test split (seeded so the partition is reproducible)
data_train, data_test, y_train, y_test = train_test_split(
    data, y, random_state=77)
house_ids_test = data_test.id

# Featurize training data set
feat_train = Featurizer()
X_train = feat_train.featurize(data_train)

# Grid search tune all estimators
ms = ModelSelector()
# A single parenthesised argument prints identically on Python 2 and is valid
# on Python 3. `!s` converts the shape tuple to text first, because applying
# the `s` format spec directly to a tuple raises TypeError on Python 3.
print(' # {:s} | X_train shape: {!s}'.format(now(), X_train.shape))
print(' # {:s} | y_train size: {:d}'.format(now(), y_train.shape[0]))
# Per-sub_area lookup table, left-joined onto the data on 'sub_area' below.
gps = pd.read_csv('./data/Longitud_Latitud.csv')

# Create the sub_area categorical with all levels shared between train and
# test to avoid errors: the test rows are tagged with a sentinel price,
# stacked under the training rows, and converted together.
test['price_doc'] = -99
merged = pd.concat([train, test], axis=0)
merged = merged.merge(gps, how='left', on='sub_area')
merged['sub_area'] = merged.sub_area.astype('category')

# Keep only the rows that do not carry the sentinel, i.e. the training rows.
train = merged[merged.price_doc != -99]
# Attach the macro table by timestamp; columns present in both frames get the
# '_train' / '_macro' suffixes.
train = train.merge(macro, how='left', on='timestamp',
                    suffixes=('_train', '_macro'))

dc = DataCleaner(data=train)
train, y = dc.clean()
# Model the target on a log(y + 1) scale.
y = np.log(np.array(y) + 1)

# Featurize training data set
feat_train = Featurizer()
train = feat_train.featurize(train)
# A single parenthesised argument prints the same text on Python 2 as the
# former `print a, b` statement and is also valid on Python 3.
print('train shape {!s}'.format(train.shape))

# # Remove all categorical variables for now
# mask = ~(train.dtypes == 'object').values
# train = train.iloc[:, mask]
# print 'train shape with only numerical features', train.shape

# Print NAs proportions for features with NA values