Python split_by_cols Examples

Programming Language: Python

Namespace/Package Name: ML_prep

Method/Function: split_by_cols

Examples at hotexamples.com: 3

Python split_by_cols - 3 examples found. These are the top-rated real-world Python examples of ML_prep.split_by_cols extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

File: boosting.py Project: colinfd/ChemLearn

Used to generate trained GB models with different train-test splits.
"""

# Seed NumPy's global RNG before any data preparation or splitting
# (presumably so the train/dev/test splits below are repeatable -- confirm
# that split_by_cols draws from np.random).
np.random.seed(100)

# Open in binary mode and close the handle deterministically.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('../../data/pairs_pdos.pkl', 'rb') as pkl_file:
    df = pickle.load(pkl_file)

# --- Run configuration (replaces the former `if True` / `if False` toggles) ---
# Feature representation: 'pdos' or 'moments'.
model_type = 'pdos'
# Split strategy: 'comp_rxn', 'comp', 'rxn' or 'random'.
split_type = 'comp'

if model_type == 'pdos':
    X, y = train_prep_pdos(df, include_WF=True, dE=0.1)
elif model_type == 'moments':
    X, y = train_prep(df, include_WF=True)
else:
    raise ValueError('unknown model_type: %r' % (model_type,))

# Columns of `df` handed to split_by_cols for each strategy; None selects a
# plain random split instead. An unknown split_type raises KeyError here.
split_columns = {
    'comp_rxn': ['comp', 'ads_a', 'ads_b'],
    'comp': ['comp'],
    'rxn': ['ads_a', 'ads_b'],
    'random': None,
}[split_type]

if split_columns is not None:
    X_train, X_dev, X_test, y_train, y_dev, y_test = split_by_cols(
        df, X, y, split_columns)
else:
    # 70% train; the remaining 30% is halved into dev and test.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    X_dev, X_test, y_dev, y_test = train_test_split(
        X_test, y_test, test_size=0.5)

# `iterations` is an integer count; the original float literal 1.5e4 is
# written as the int it denotes.
model = CatBoostRegressor(loss_function='MAE', iterations=15000)
# The dev split is passed as the evaluation set during fitting.
model.fit(X_train, y_train, eval_set=(X_dev, y_dev))
# Output file name encodes both choices, e.g. 'pdos_comp.cbm'.
model.save_model('%s_%s.cbm' % (model_type, split_type))

Example #2

Show file

    return mae


if __name__ == '__main__':
    # Open in binary mode and close the handle deterministically.
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted
    # files.
    with open('data/pairs_pdos.pkl', 'rb') as pkl_file:
        df = pickle.load(pkl_file)

    features = 'moments'  # one of: 'pdos', 'moments'
    bayes = True

    # Feature selection: build the feature matrix and targets.
    if features == 'moments':
        X, y = train_prep(df)
    elif features == 'pdos':
        X, y = train_prep_pdos(df, stack=False, include_WF=False, dE=0.1)
    else:
        # Fail here rather than with a NameError on X further down.
        raise ValueError('unknown features option: %r' % (features,))

    # ret_groups=True makes split_by_cols return one extra value (`groups`)
    # after the six split arrays.
    X_train, X_dev, X_test, y_train, y_dev, y_test, groups = split_by_cols(
        df, X, y, ['comp', 'ads_a', 'ads_b'], ret_groups=True)

    rf = ensemble.RandomForestRegressor(n_estimators=100)

    # NOTE(review): presumably combined with `groups` in the search code that
    # follows this excerpt -- confirm.
    group_kfold = GroupKFold(n_splits=3)

    if bayes:
        # Hyperparameter candidates keyed by RandomForestRegressor argument
        # name. Tuples are presumably (low, high) bounds -- confirm against
        # the code that consumes random_grid.
        random_grid = {  #'n_estimators': (5,100),
            'max_features': (int(np.sqrt(X_train.shape[1])), X_train.shape[1]),
            'max_depth': (5, 50),
            'min_samples_split': (2, 10),
            'min_samples_leaf': (2, 5),
            'bootstrap': [True, False]
        }

Example #3

Show file

# Open in binary mode and close the handle deterministically.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('../../data/pairs_pdos.pkl', 'rb') as pkl_file:
    df = pickle.load(pkl_file)
X, y = train_prep_pdos(df, include_WF=False, stack=True)

# Columns of `df` handed to split_by_cols for each split type; None selects a
# plain random split instead.
split_columns = {
    'comp': ['comp'],
    'rxn': ['ads_a', 'ads_b'],
    'comp_rxn': ['comp', 'ads_a', 'ads_b'],
    'random': None,
}

for split_type in ['comp', 'rxn', 'comp_rxn', 'random']:
    # Single-argument call form prints identically on Python 2 and Python 3.
    print(split_type)
    cols = split_columns[split_type]

    if cols is not None:
        X_train, X_dev, X_test, y_train, y_dev, y_test = split_by_cols(
            df, X, y, cols)
    else:
        # 70% train; the remaining 30% is halved into dev and test.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3)
        X_dev, X_test, y_dev, y_test = train_test_split(
            X_test, y_test, test_size=0.5)

    # Written in the same order and under the same names as before:
    # '<X|y>_<split_type>_<train|dev|test>.npy'.
    outputs = (
        ('X', 'train', X_train),
        ('X', 'dev', X_dev),
        ('X', 'test', X_test),
        ('y', 'train', y_train),
        ('y', 'dev', y_dev),
        ('y', 'test', y_test),
    )
    for prefix, subset, array in outputs:
        np.save('%s_%s_%s.npy' % (prefix, split_type, subset), array)