from pipeline import Pipe
from model import xgboost_model, lightgbm_model
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in newer scikit-learn
import pandas as pd
import xgboost as xgb
import lightgbm as lgb


if __name__ == '__main__':
    # Input files
    train_path = 'data/train_test.csv'
    val_path = 'data/train_ensemble.csv'
    test_path = 'data/test_clean.csv'

    # Set to False to reuse previously generated feature files
    firsttime = True
    if firsttime:
        # Build features on the fly and cache them to CSV
        training_pipe = Pipe(train_path, val_path)
        train = training_pipe.make('pickle/train')
        train.to_csv('train_feature.csv', index=False)

        testing_pipe = Pipe(val_path, test_path)
        test = testing_pipe.make('pickle/test')
        test.to_csv('test_features.csv', index=False)
    else:
        # Reuse the cached feature files
        train = pd.read_csv('train_feature.csv')
        test = pd.read_csv('test_features.csv')

    # Prepare feature matrices and the target vector
    featlist = train.columns.tolist()
    featlist.remove('is_listened')
    X = train[featlist].values        # .as_matrix() was removed in pandas 1.0
    y = train['is_listened'].values
    test_X = test[featlist].values
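    # ------------------------------------------------------------------
    # Hedged sketch (not part of the original script): the imports above
    # bring in train_test_split and the xgboost/lightgbm helpers, but the
    # snippet stops after building X, y and test_X. One plausible next step
    # is a hold-out split plus a baseline XGBoost fit. The hyperparameters
    # and the use of xgb.XGBClassifier (rather than the project's own
    # xgboost_model helper, whose signature is not shown) are assumptions.
    # ------------------------------------------------------------------
    from sklearn.metrics import roc_auc_score  # assumed evaluation metric

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42)

    clf = xgb.XGBClassifier(n_estimators=300, max_depth=7, learning_rate=0.1)
    clf.fit(X_train, y_train)

    # Evaluate on the hold-out split, then score the test matrix built above
    val_pred = clf.predict_proba(X_val)[:, 1]
    print('validation AUC:', roc_auc_score(y_val, val_pred))
    test_pred = clf.predict_proba(test_X)[:, 1]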
from pipeline import Pipe
import pandas as pd


if __name__ == '__main__':
    # Input files
    train_path = 'data/archive/train_clean.csv'
    test_path = 'data/archive/test_clean.csv'

    # Intermediate files
    TRAIN_PATH_INTERMEDIATE = 'data/archive/train_intermediate.csv'
    TEST_PATH_INTERMEDIATE = 'data/archive/test_intermediate.csv'

    # Set boolean to build features on the fly or reuse pre-made feature files
    firsttime = True
    if firsttime:
        training_pipe = Pipe(train_path, train_path)
        train = training_pipe.make('train_intermediate')
        train.to_csv(TRAIN_PATH_INTERMEDIATE, index=False)

        testing_pipe = Pipe(train_path, test_path)
        test = testing_pipe.make('test_intermediate')
        test.to_csv(TEST_PATH_INTERMEDIATE, index=False)
    else:
        # Reuse the intermediate feature files written on a previous run
        train = pd.read_csv(TRAIN_PATH_INTERMEDIATE)
        test = pd.read_csv(TEST_PATH_INTERMEDIATE)

    # Prepare train and test datasets for the ensemble layer
    featlist = train.columns.tolist()
    featlist.remove('is_listened')
    X = train[featlist].values        # .as_matrix() was removed in pandas 1.0
    y = train['is_listened'].values
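    # ------------------------------------------------------------------
    # Hedged sketch (not part of the original script): the comment above
    # says these matrices feed an ensemble layer, but that layer is not
    # shown here. A simple version fits XGBoost and LightGBM base models
    # and averages their predicted probabilities. The scikit-learn wrappers,
    # hyperparameters and output file name below are assumptions, not the
    # project's xgboost_model / lightgbm_model code.
    # ------------------------------------------------------------------
    import xgboost as xgb
    import lightgbm as lgb

    test_X = test[featlist].values  # assumed: test shares the train feature order

    xgb_clf = xgb.XGBClassifier(n_estimators=300, max_depth=7, learning_rate=0.1)
    lgb_clf = lgb.LGBMClassifier(n_estimators=300, num_leaves=63, learning_rate=0.1)

    xgb_clf.fit(X, y)
    lgb_clf.fit(X, y)

    # Blend the two base models with a simple average of their probabilities
    blend = (xgb_clf.predict_proba(test_X)[:, 1] +
             lgb_clf.predict_proba(test_X)[:, 1]) / 2.0

    # Assumed output format; the real submission file likely also needs the
    # test sample identifiers alongside the predicted probabilities.
    pd.DataFrame({'is_listened': blend}).to_csv('submission.csv', index=False)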