Used to generate trained GB models with different train-test splits. """ np.random.seed(100) df = pickle.load(open('../../data/pairs_pdos.pkl')) if True: X,y = train_prep_pdos(df,include_WF=True,dE=0.1) model_type = 'pdos' else: X,y = train_prep(df,include_WF=True) model_type = 'moments' if False: X_train, X_dev, X_test, y_train, y_dev, y_test = split_by_cols(df,X,y,['comp','ads_a','ads_b']) split_type = 'comp_rxn' elif True: X_train, X_dev, X_test, y_train, y_dev, y_test = split_by_cols(df,X,y,['comp']) split_type = 'comp' elif False: X_train, X_dev, X_test, y_train, y_dev, y_test = split_by_cols(df,X,y,['ads_a','ads_b']) split_type = 'rxn' else: X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3) X_dev, X_test, y_dev, y_test = train_test_split(X_test,y_test,test_size=0.5) split_type = 'random' model = CatBoostRegressor(loss_function='MAE',iterations=1.5e4) model.fit(X_train,y_train,eval_set=(X_dev,y_dev)) model.save_model('%s_%s.cbm'%(model_type,split_type))
return mae if __name__ == '__main__': df = pickle.load(open('data/pairs_pdos.pkl')) features = 'moments' # #pdos,'moments' bayes = True #Feature Selection if features == 'moments': X, y = train_prep(df) elif features == 'pdos': X, y = train_prep_pdos(df, stack=False, include_WF=False, dE=0.1) X_train, X_dev, X_test, y_train, y_dev, y_test, groups = split_by_cols( df, X, y, ['comp', 'ads_a', 'ads_b'], ret_groups=True) rf = ensemble.RandomForestRegressor(n_estimators=100) group_kfold = GroupKFold(n_splits=3) #print(X_train.shape[1]),np.sqrt(X_train.shape[1]) if bayes: random_grid = { #'n_estimators': (5,100), 'max_features': (int(np.sqrt(X_train.shape[1])), X_train.shape[1]), 'max_depth': (5, 50), 'min_samples_split': (2, 10), 'min_samples_leaf': (2, 5), 'bootstrap': [True, False] }
# Pickles are binary: open with 'rb' and let the context manager close the
# handle. NOTE: pickle.load can execute arbitrary code -- only load trusted
# files.
with open('../../data/pairs_pdos.pkl', 'rb') as pkl_file:
    df = pickle.load(pkl_file)

X, y = train_prep_pdos(df, include_WF=False, stack=True)

# (split name, columns passed to split_by_cols); None selects a plain random
# split. Kept as an ordered sequence so the splits are produced in this order.
split_specs = (
    ('comp', ['comp']),
    ('rxn', ['ads_a', 'ads_b']),
    ('comp_rxn', ['comp', 'ads_a', 'ads_b']),
    ('random', None),
)

for split_type, cols in split_specs:
    # Parenthesised single-argument form prints identically under the
    # Python 2 print statement and the Python 3 print function.
    print(split_type)

    if cols is not None:
        X_train, X_dev, X_test, y_train, y_dev, y_test = split_by_cols(
            df, X, y, cols)
    else:
        # Hold out 30% of the rows, then halve the hold-out into dev and test.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3)
        X_dev, X_test, y_dev, y_test = train_test_split(
            X_test, y_test, test_size=0.5)

    # Write each array to '<X|y>_<split_type>_<train|dev|test>.npy',
    # in the same order as before (X files first, then y files).
    arrays = (
        ('X', 'train', X_train),
        ('X', 'dev', X_dev),
        ('X', 'test', X_test),
        ('y', 'train', y_train),
        ('y', 'dev', y_dev),
        ('y', 'test', y_test),
    )
    for prefix, subset, values in arrays:
        np.save('%s_%s_%s.npy' % (prefix, split_type, subset), values)